2004-03-09 21:29:16

by Chris Mason

[permalink] [raw]
Subject: [PATCH] lockfs patch for 2.6

Hello everyone,

In order to get consistent snapshots with the device mapper code, you
need to sync and lock down any filesystems sitting on top of the
device. This isn't as critical in the 2.6 code since it can do writable
snapshots, but it is still nice to have things properly synced and
consistent.

I've had various forms of this against 2.4, the ugly part was always the
locking to make sure a new FS wasn't mounted on the source while the
snapshot was being setup. Here's my 2.6 version, with the DM code
contributed by Kevin Corry. The basic idea is to add a semaphore to the
block device that gets used to make sure there are no new mounts.

This needs the rest of Joe's device mapper updates to be useful, but I
wanted to send here for comments on locking and other issues:

-chris

Index: linux.dm/fs/block_dev.c
===================================================================
--- linux.dm.orig/fs/block_dev.c 2004-02-27 15:47:10.796588369 -0500
+++ linux.dm/fs/block_dev.c 2004-02-27 15:48:41.512740351 -0500
@@ -242,6 +242,7 @@
{
memset(bdev, 0, sizeof(*bdev));
sema_init(&bdev->bd_sem, 1);
+ sema_init(&bdev->bd_mount_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
inode_init_once(&ei->vfs_inode);
Index: linux.dm/fs/super.c
===================================================================
--- linux.dm.orig/fs/super.c 2004-02-27 15:47:11.065508887 -0500
+++ linux.dm/fs/super.c 2004-02-27 15:48:41.521737673 -0500
@@ -292,6 +292,62 @@
}

/*
+ * triggered by the device mapper code to lock a filesystem and force
+ * it into a consistent state.
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until unlockfs is called. If a super is found on this
+ * block device, we hould a read lock on the s->s_umount sem to make sure
+ * nobody unmounts until the snapshot creation is done
+ */
+void sync_super_lockfs(struct block_device *bdev)
+{
+ struct super_block *sb;
+ down(&bdev->bd_mount_sem);
+ sb = get_super(bdev);
+ if (sb) {
+ lock_super(sb);
+ if (sb->s_dirt && sb->s_op->write_super)
+ sb->s_op->write_super(sb);
+ if (sb->s_op->write_super_lockfs)
+ sb->s_op->write_super_lockfs(sb);
+ unlock_super(sb);
+ }
+ /* unlockfs releases s->s_umount and bd_mount_sem */
+}
+
+void unlockfs(struct block_device *bdev)
+{
+ struct list_head *p;
+ /*
+ * copied from get_super, but we need to
+ * do special things since lockfs left the
+ * s_umount sem held
+ */
+ spin_lock(&sb_lock);
+ list_for_each(p, &super_blocks) {
+ struct super_block *s = sb_entry(p);
+ /*
+ * if there is a super for this block device
+ * in the list, get_super must have found it
+ * during sync_super_lockfs, so our drop_super
+ * will drop the reference created there.
+ */
+ if (s->s_bdev == bdev && s->s_root) {
+ spin_unlock(&sb_lock);
+ if (s->s_op->unlockfs)
+ s->s_op->unlockfs(s);
+ drop_super(s);
+ goto unlock;
+ }
+ }
+ spin_unlock(&sb_lock);
+unlock:
+ up(&bdev->bd_mount_sem);
+}
+EXPORT_SYMBOL(unlockfs);
+
+/*
* Note: check the dirty flag before waiting, so we don't
* hold up the sync while mounting a device. (The newly
* mounted device won't need syncing.)
@@ -612,7 +668,14 @@
if (IS_ERR(bdev))
return (struct super_block *)bdev;

+ /*
+ * once the super is inserted into the list by sget, s_umount
+ * will protect the lockfs code from trying to start a snapshot
+ * while we are mounting
+ */
+ down(&bdev->bd_mount_sem);
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
+ up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto out;

Index: linux.dm/fs/buffer.c
===================================================================
--- linux.dm.orig/fs/buffer.c 2004-02-27 15:47:36.139106189 -0500
+++ linux.dm/fs/buffer.c 2004-02-27 15:48:41.516739161 -0500
@@ -260,6 +260,17 @@
return sync_blockdev(bdev);
}

+int fsync_bdev_lockfs(struct block_device *bdev)
+{
+ int res;
+ res = fsync_bdev(bdev);
+ if (res)
+ return res;
+ sync_super_lockfs(bdev);
+ return sync_blockdev(bdev);
+}
+EXPORT_SYMBOL(fsync_bdev_lockfs);
+
/*
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
Index: linux.dm/drivers/md/dm.c
===================================================================
--- linux.dm.orig/drivers/md/dm.c 2004-02-27 15:48:38.171735120 -0500
+++ linux.dm/drivers/md/dm.c 2004-02-27 15:48:41.510740946 -0500
@@ -12,6 +12,7 @@
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
+#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>

@@ -51,6 +52,7 @@
*/
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
+#define DMF_FS_LOCKED 2

struct mapped_device {
struct rw_semaphore lock;
@@ -844,6 +846,24 @@
return 0;
}

+static void __lock_disk(struct gendisk *disk)
+{
+ struct block_device *bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ fsync_bdev_lockfs(bdev);
+ bdput(bdev);
+ }
+}
+
+static void __unlock_disk(struct gendisk *disk)
+{
+ struct block_device *bdev = bdget_disk(disk, 0);
+ if (bdev) {
+ unlockfs(bdev);
+ bdput(bdev);
+ }
+}
+
/*
* We need to be able to change a mapping table under a mounted
* filesystem. For example we might want to move some data in
@@ -855,12 +875,23 @@
{
DECLARE_WAITQUEUE(wait, current);

- down_write(&md->lock);
+ /* Flush I/O to the device. */
+ down_read(&md->lock);
+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ up_read(&md->lock);
+ return -EINVAL;
+ }
+
+ if (!test_and_set_bit(DMF_FS_LOCKED, &md->flags)) {
+ __lock_disk(md->disk);
+ }
+ up_read(&md->lock);

/*
* First we set the BLOCK_IO flag so no more ios will be
* mapped.
*/
+ down_write(&md->lock);
if (test_bit(DMF_BLOCK_IO, &md->flags)) {
up_write(&md->lock);
return -EINVAL;
@@ -910,11 +941,13 @@
dm_table_resume_targets(md->map);
clear_bit(DMF_SUSPENDED, &md->flags);
clear_bit(DMF_BLOCK_IO, &md->flags);
+ clear_bit(DMF_FS_LOCKED, &md->flags);

def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->lock);

+ __unlock_disk(md->disk);
blk_run_queues();

return 0;
Index: linux.dm/include/linux/fs.h
===================================================================
--- linux.dm.orig/include/linux/fs.h 2004-02-27 15:47:33.193980650 -0500
+++ linux.dm/include/linux/fs.h 2004-02-27 15:48:41.532734400 -0500
@@ -343,6 +343,7 @@
struct inode * bd_inode; /* will die */
int bd_openers;
struct semaphore bd_sem; /* open/close mutex */
+ struct semaphore bd_mount_sem; /* mount mutex */
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;
@@ -1218,6 +1219,7 @@
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
extern void sync_supers(void);
+extern void sync_super_lockfs(struct block_device *);
extern void sync_filesystems(int wait);
extern void emergency_sync(void);
extern void emergency_remount(void);
Index: linux.dm/include/linux/buffer_head.h
===================================================================
--- linux.dm.orig/include/linux/buffer_head.h 2004-02-05 16:56:30.000000000 -0500
+++ linux.dm/include/linux/buffer_head.h 2004-02-27 15:48:41.530734995 -0500
@@ -164,6 +164,8 @@
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
void wake_up_buffer(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
+int fsync_bdev_lockfs(struct block_device *);
+void unlockfs(struct block_device *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
struct buffer_head *__find_get_block(struct block_device *, sector_t, int);





2004-03-12 09:32:34

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Tue, Mar 09, 2004 at 04:31:25PM -0500, Chris Mason wrote:
> /*
> + * triggered by the device mapper code to lock a filesystem and force
> + * it into a consistent state.
> + *
> + * This takes the block device bd_mount_sem to make sure no new mounts
> + * happen on bdev until unlockfs is called. If a super is found on this
> + * block device, we hould a read lock on the s->s_umount sem to make sure
> + * nobody unmounts until the snapshot creation is done
> + */
> +void sync_super_lockfs(struct block_device *bdev)
> +{
> + struct super_block *sb;
> + down(&bdev->bd_mount_sem);
> + sb = get_super(bdev);
> + if (sb) {
> + lock_super(sb);
> + if (sb->s_dirt && sb->s_op->write_super)
> + sb->s_op->write_super(sb);
> + if (sb->s_op->write_super_lockfs)
> + sb->s_op->write_super_lockfs(sb);

Can we please rename write_super_lockfs to a sane name?

freeze_fs/thaw_fs sounds like a good name.

> +void unlockfs(struct block_device *bdev)
> +{
> + struct list_head *p;
> + /*
> + * copied from get_super, but we need to
> + * do special things since lockfs left the
> + * s_umount sem held
> + */
> + spin_lock(&sb_lock);
> + list_for_each(p, &super_blocks) {
> + struct super_block *s = sb_entry(p);
> + /*
> + * if there is a super for this block device
> + * in the list, get_super must have found it
> + * during sync_super_lockfs, so our drop_super
> + * will drop the reference created there.
> + */
> + if (s->s_bdev == bdev && s->s_root) {
> + spin_unlock(&sb_lock);
> + if (s->s_op->unlockfs)
> + s->s_op->unlockfs(s);
> + drop_super(s);
> + goto unlock;
> + }
> + }
> + spin_unlock(&sb_lock);
> +unlock:
> + up(&bdev->bd_mount_sem);
> +}
> +EXPORT_SYMBOL(unlockfs);

This looks ugly. What about returning the superblock from the freeze
routine so you can simply pass it into the thaw routine?

> ===================================================================
> --- linux.dm.orig/fs/buffer.c 2004-02-27 15:47:36.139106189 -0500
> +++ linux.dm/fs/buffer.c 2004-02-27 15:48:41.516739161 -0500
> @@ -260,6 +260,17 @@
> return sync_blockdev(bdev);
> }
>
> +int fsync_bdev_lockfs(struct block_device *bdev)
> +{
> + int res;
> + res = fsync_bdev(bdev);
> + if (res)
> + return res;
> + sync_super_lockfs(bdev);
> + return sync_blockdev(bdev);
> +}
> +EXPORT_SYMBOL(fsync_bdev_lockfs);

This looks grossly misnamed again. And why do you need to have
sync_super_locks splitted out? Calling it on it's own doesn't make much
sense.

> --- linux.dm.orig/include/linux/buffer_head.h 2004-02-05 16:56:30.000000000 -0500
> +++ linux.dm/include/linux/buffer_head.h 2004-02-27 15:48:41.530734995 -0500
> @@ -164,6 +164,8 @@
> wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
> void wake_up_buffer(struct buffer_head *bh);
> int fsync_bdev(struct block_device *);
> +int fsync_bdev_lockfs(struct block_device *);
> +void unlockfs(struct block_device *);

Again rather misplaced. Even a fs not using bufferheads at all would
benefit from the interface.

2004-03-12 15:48:45

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Fri, 2004-03-12 at 04:31, Christoph Hellwig wrote:

> Can we please rename write_super_lockfs to a sane name?
>
> freeze_fs/thaw_fs sounds like a good name.
>
Sure.

> This looks ugly. What about returning the superblock from the freeze
> routine so you can simply pass it into the thaw routine?
>
I like it, will do.

>
> This looks grossly misnamed again. And why do you need to have
> sync_super_locks splitted out? Calling it on it's own doesn't make much
> sense.
>

Would you like this better:

device mapper code:
fsync_bdev(bdev);
s = freeze_fs(bdev);
< create snap shot >
thaw_fs(bdev, s);

thaw_fs needs the bdev so it can up the bdev mount semaphore.

-chris


2004-03-12 15:52:02

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Fri, Mar 12, 2004 at 10:50:53AM -0500, Chris Mason wrote:
> Would you like this better:
>
> device mapper code:
> fsync_bdev(bdev);
> s = freeze_fs(bdev);
> < create snap shot >
> thaw_fs(bdev, s);

Hmm, I actually thought about moving the fsync_bdev into
freeze_fs, but having it separate might more sense indeed.

> thaw_fs needs the bdev so it can up the bdev mount semaphore.

sb->s_bdev should do it aswell

2004-03-13 13:14:52

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Tue, Mar 09, 2004 at 04:31:25PM -0500, Chris Mason wrote:
> Hello everyone,
>
> In order to get consistent snapshots with the device mapper code, you
> need to sync and lock down any filesystems sitting on top of the
> device. This isn't as critical in the 2.6 code since it can do writable
> snapshots, but it is still nice to have things properly synced and
> consistent.
>
> I've had various forms of this against 2.4, the ugly part was always the
> locking to make sure a new FS wasn't mounted on the source while the
> snapshot was being setup. Here's my 2.6 version, with the DM code
> contributed by Kevin Corry. The basic idea is to add a semaphore to the
> block device that gets used to make sure there are no new mounts.

Okay, I actually took a look at the XFS freeze code and it seems the
current infrastructure doesn't suite XFS very well.

What XFS currently does when freezing is

1. set a flag in the mount structure that blocks all new writes
2. flush all file data
3. set a flag blocking all new transactions
4. flush any dirty inode state into buffers
5. push out all buffers to disk
6. mark the filesystem clean

Now how does this fit into generic freeze/thaw fs structure?

1. should probably move into the VFS (generic_file_write)

2,4,5 basically is fsync_bdev except that we have no chance to block
transaction that way. So either we need two calls into the fs or have
some trivial state in the superblock that tells xfs to block transaction,
basically an enum { FS_UNFROZEN, FS_FROZEN_WRITE, FS_FROZEN_FULL };

and a function fs_check_frozen similar to xfs_check_frozen that makes the
caller block until the fs is unfrozen.

Doing it that way would get rid of lots of mess in XFS so I'm all for it :)

2004-03-13 15:18:17

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sat, 2004-03-13 at 08:14, Christoph Hellwig wrote:
> On Tue, Mar 09, 2004 at 04:31:25PM -0500, Chris Mason wrote:
> > Hello everyone,
> >
> > In order to get consistent snapshots with the device mapper code, you
> > need to sync and lock down any filesystems sitting on top of the
> > device. This isn't as critical in the 2.6 code since it can do writable
> > snapshots, but it is still nice to have things properly synced and
> > consistent.
> >
> > I've had various forms of this against 2.4, the ugly part was always the
> > locking to make sure a new FS wasn't mounted on the source while the
> > snapshot was being setup. Here's my 2.6 version, with the DM code
> > contributed by Kevin Corry. The basic idea is to add a semaphore to the
> > block device that gets used to make sure there are no new mounts.
>
> Okay, I actually took a look at the XFS freeze code and it seems the
> current infrastructure doesn't suite XFS very well.
>
> What XFS currently does when freezing is
>
> 1. set a flag in the mount structure that blocks all new writes
> 2. flush all file data
> 3. set a flag blocking all new transactions
> 4. flush any dirty inode state into buffers
> 5. push out all buffers to disk
> 6. mark the filesystem clean
>
> Now how does this fit into generic freeze/thaw fs structure?
>
> 1. should probably move into the VFS (generic_file_write)
>
> 2,4,5 basically is fsync_bdev except that we have no chance to block
> transaction that way. So either we need two calls into the fs or have
> some trivial state in the superblock that tells xfs to block transaction,
> basically an enum { FS_UNFROZEN, FS_FROZEN_WRITE, FS_FROZEN_FULL };
>
> and a function fs_check_frozen similar to xfs_check_frozen that makes the
> caller block until the fs is unfrozen.
>
> Doing it that way would get rid of lots of mess in XFS so I'm all for it :)

This is basically how reiserfs does it. Various critical spots (like
the code to start a transaction) check to see if the FS is frozen.

I'll rework the patch as we've discussed on Friday, if you need it
broken up differently, please let me know.

-chris


2004-03-13 16:34:05

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sat, Mar 13, 2004 at 10:20:14AM -0500, Chris Mason wrote:
> This is basically how reiserfs does it. Various critical spots (like
> the code to start a transaction) check to see if the FS is frozen.
>
> I'll rework the patch as we've discussed on Friday, if you need it
> broken up differently, please let me know.

Here's the reworked patch I have, ignoring dm but converting the xfs-internal
interfaces that do freezing. But without an interface change we still need
to do all the flushing a second time inside XFS which I absoutely dislike.

Let me think about how to do this nicer.

--- 1.155/fs/block_dev.c Fri Mar 12 10:33:01 2004
+++ edited/fs/block_dev.c Sat Mar 13 15:28:49 2004
@@ -251,6 +251,7 @@
{
memset(bdev, 0, sizeof(*bdev));
sema_init(&bdev->bd_sem, 1);
+ sema_init(&bdev->bd_mount_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
inode_init_once(&ei->vfs_inode);
===== fs/buffer.c 1.224 vs edited =====
--- 1.224/fs/buffer.c Sun Mar 7 08:16:11 2004
+++ edited/fs/buffer.c Sat Mar 13 17:59:13 2004
@@ -260,6 +260,50 @@
}

/*
+ * triggered by the device mapper code to lock a filesystem and force
+ * it into a consistent state.
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until unlockfs is called. If a super is found on this
+ * block device, we hould a read lock on the s->s_umount sem to make sure
+ * nobody unmounts until the snapshot creation is done
+ */
+struct super_block *freeze_bdev(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ down(&bdev->bd_mount_sem);
+ sb = get_super(bdev);
+ if (sb && !(sb->s_flags & MS_RDONLY)) {
+ fsync_super(sb);
+ lock_super(sb);
+ if (sb->s_dirt && sb->s_op->write_super)
+ sb->s_op->write_super(sb);
+ if (sb->s_op->write_super_lockfs)
+ sb->s_op->write_super_lockfs(sb);
+ unlock_super(sb);
+ }
+
+ sync_blockdev(bdev);
+ return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+{
+ if (sb) {
+ BUG_ON(sb->s_bdev != bdev);
+
+ if (sb->s_op->unlockfs)
+ sb->s_op->unlockfs(sb);
+ drop_super(sb);
+ }
+
+ up(&bdev->bd_mount_sem);
+}
+EXPORT_SYMBOL(thaw_bdev);
+
+/*
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/
===== fs/super.c 1.115 vs edited =====
--- 1.115/fs/super.c Fri Mar 12 10:30:24 2004
+++ edited/fs/super.c Sat Mar 13 15:30:35 2004
@@ -621,7 +621,14 @@
if (IS_ERR(bdev))
return (struct super_block *)bdev;

+ /*
+ * once the super is inserted into the list by sget, s_umount
+ * will protect the lockfs code from trying to start a snapshot
+ * while we are mounting
+ */
+ down(&bdev->bd_mount_sem);
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
+ up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto out;

===== fs/xfs/xfs_fsops.c 1.11 vs edited =====
--- 1.11/fs/xfs/xfs_fsops.c Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/xfs_fsops.c Sat Mar 13 18:12:11 2004
@@ -582,15 +582,14 @@
}

int
-xfs_fs_freeze(
- xfs_mount_t *mp)
+xfs_freeze(
+ bhv_desc_t *bdp)
{
- vfs_t *vfsp;
+ vfs_t *vfsp = bhvtovfs(bdp);
+ xfs_mount_t *mp = XFS_BHVTOM(bdp);
/*REFERENCED*/
int error;

- vfsp = XFS_MTOVFS(mp);
-
/* Stop new writers */
xfs_start_freeze(mp, XFS_FREEZE_WRITE);

@@ -619,12 +618,11 @@
return 0;
}

-int
-xfs_fs_thaw(
- xfs_mount_t *mp)
+void
+xfs_thaw(
+ bhv_desc_t *bdp)
{
- xfs_finish_freeze(mp);
- return 0;
+ xfs_finish_freeze(XFS_BHVTOM(bdp));
}

int
@@ -632,13 +630,21 @@
xfs_mount_t *mp,
__uint32_t inflags)
{
- switch (inflags)
- {
- case XFS_FSOP_GOING_FLAGS_DEFAULT:
- xfs_fs_freeze(mp);
- xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
- xfs_fs_thaw(mp);
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (inflags) {
+ case XFS_FSOP_GOING_FLAGS_DEFAULT: {
+ struct vfs *vfsp = XFS_MTOVFS(mp);
+ struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
+
+ if (sb) {
+ xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+ thaw_bdev(sb->s_bdev, sb);
+ }
+
break;
+ }
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
break;
===== fs/xfs/xfs_fsops.h 1.3 vs edited =====
--- 1.3/fs/xfs/xfs_fsops.h Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/xfs_fsops.h Sat Mar 13 18:08:01 2004
@@ -60,12 +60,12 @@
xfs_fsop_resblks_t *outval);

int
-xfs_fs_freeze(
- xfs_mount_t *mp);
+xfs_freeze(
+ struct bhv_desc *bdp);

-int
-xfs_fs_thaw(
- xfs_mount_t *mp);
+void
+xfs_thaw(
+ struct bhv_desc *bdp);

int
xfs_fs_goingdown(
===== fs/xfs/linux/xfs_ioctl.c 1.21 vs edited =====
--- 1.21/fs/xfs/linux/xfs_ioctl.c Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/linux/xfs_ioctl.c Sat Mar 13 18:23:23 2004
@@ -825,13 +825,14 @@
case XFS_IOC_FREEZE:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- xfs_fs_freeze(mp);
+
+ freeze_bdev(inode->i_sb->s_bdev);
return 0;

case XFS_IOC_THAW:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- xfs_fs_thaw(mp);
+ thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
return 0;

case XFS_IOC_GOINGDOWN: {
===== fs/xfs/linux/xfs_super.c 1.70 vs edited =====
--- 1.70/fs/xfs/linux/xfs_super.c Sat Mar 6 04:46:51 2004
+++ edited/fs/xfs/linux/xfs_super.c Sat Mar 13 17:59:56 2004
@@ -590,14 +590,9 @@
struct super_block *sb)
{
vfs_t *vfsp = LINVFS_GET_VFS(sb);
- vnode_t *vp;
int error;

- if (sb->s_flags & MS_RDONLY)
- return;
- VFS_ROOT(vfsp, &vp, error);
- VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_FREEZE, 0, error);
- VN_RELE(vp);
+ VFS_FREEZE(vfsp, error);
}

STATIC void
@@ -605,12 +600,8 @@
struct super_block *sb)
{
vfs_t *vfsp = LINVFS_GET_VFS(sb);
- vnode_t *vp;
- int error;

- VFS_ROOT(vfsp, &vp, error);
- VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_THAW, 0, error);
- VN_RELE(vp);
+ VFS_THAW(vfsp);
}

STATIC struct dentry *
===== fs/xfs/linux/xfs_vfs.c 1.11 vs edited =====
--- 1.11/fs/xfs/linux/xfs_vfs.c Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/linux/xfs_vfs.c Sat Mar 13 17:57:06 2004
@@ -230,6 +230,30 @@
((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
}

+int
+vfs_freeze(
+ struct bhv_desc *bdp)
+{
+ struct bhv_desc *next = bdp;
+
+ ASSERT(next);
+ while (! (bhvtovfsops(next))->vfs_freeze)
+ next = BHV_NEXT(next);
+ return ((*bhvtovfsops(next)->vfs_freeze)(next));
+}
+
+void
+vfs_thaw(
+ struct bhv_desc *bdp)
+{
+ struct bhv_desc *next = bdp;
+
+ ASSERT(next);
+ while (! (bhvtovfsops(next))->vfs_thaw)
+ next = BHV_NEXT(next);
+ ((*bhvtovfsops(next)->vfs_thaw)(next));
+}
+
vfs_t *
vfs_allocate( void )
{
===== fs/xfs/linux/xfs_vfs.h 1.18 vs edited =====
--- 1.18/fs/xfs/linux/xfs_vfs.h Fri Jan 9 06:59:58 2004
+++ edited/fs/xfs/linux/xfs_vfs.h Sat Mar 13 17:55:33 2004
@@ -112,6 +112,8 @@
typedef void (*vfs_init_vnode_t)(bhv_desc_t *,
struct vnode *, bhv_desc_t *, int);
typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
+typedef int (*vfs_freeze_t)(bhv_desc_t *);
+typedef void (*vfs_thaw_t)(bhv_desc_t *);

typedef struct vfsops {
bhv_position_t vf_position; /* behavior chain position */
@@ -128,6 +130,8 @@
vfs_quotactl_t vfs_quotactl; /* disk quota */
vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */
vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */
+ vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */
+ vfs_thaw_t vfs_thaw; /* .. and release it again */
} vfsops_t;

/*
@@ -147,6 +151,8 @@
#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
+#define VFS_FREEZE(v, rv) ((rv) = vfs_freeze(VHEAD(v)))
+#define VFS_THAW(v) ( vfs_thaw(VHEAD(v)) )

/*
* PVFS's. Operates on behavior descriptor pointers.
@@ -164,6 +170,8 @@
#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p))
#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) )
#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) )
+#define PVFS_FREEZE(b, rv) ((rv) = vfs_freeze(b))
+#define PVFS_THAW(b) ( vfs_thaw(b) )

extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
@@ -178,6 +186,8 @@
extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
+extern int vfs_freeze(bhv_desc_t *);
+extern void vfs_thaw(bhv_desc_t *);

typedef struct bhv_vfsops {
struct vfsops bhv_common;
===== include/linux/buffer_head.h 1.46 vs edited =====
--- 1.46/include/linux/buffer_head.h Tue Jan 20 00:38:11 2004
+++ edited/include/linux/buffer_head.h Sat Mar 13 15:35:32 2004
@@ -164,6 +164,8 @@
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
void wake_up_buffer(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *);
+void thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
===== include/linux/fs.h 1.290 vs edited =====
--- 1.290/include/linux/fs.h Fri Mar 12 10:32:59 2004
+++ edited/include/linux/fs.h Sat Mar 13 15:35:44 2004
@@ -344,6 +344,7 @@
struct inode * bd_inode; /* will die */
int bd_openers;
struct semaphore bd_sem; /* open/close mutex */
+ struct semaphore bd_mount_sem; /* mount mutex */
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;

2004-03-14 14:00:54

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sat, Mar 13, 2004 at 04:33:46PM +0000, Christoph Hellwig wrote:
> Here's the reworked patch I have, ignoring dm but converting the xfs-internal
> interfaces that do freezing. But without an interface change we still need
> to do all the flushing a second time inside XFS which I absoutely dislike.
>
> Let me think about how to do this nicer.

Okay, here's a new patch that basically moves all the XFS freezing
infrastructure to the VFS, leavin only writing of the log record and some
transaction count handing inside XFS.

I've given it some heavy testing with xfs_freeze, dm is still not updated.

P.S. this patch now kills 16 lines of kernel code summarized :)

--- 1.155/fs/block_dev.c Fri Mar 12 10:33:01 2004
+++ edited/fs/block_dev.c Sat Mar 13 15:28:49 2004
@@ -251,6 +251,7 @@
{
memset(bdev, 0, sizeof(*bdev));
sema_init(&bdev->bd_sem, 1);
+ sema_init(&bdev->bd_mount_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
inode_init_once(&ei->vfs_inode);
===== fs/buffer.c 1.224 vs edited =====
--- 1.224/fs/buffer.c Sun Mar 7 08:16:11 2004
+++ edited/fs/buffer.c Sun Mar 14 15:37:00 2004
@@ -260,6 +260,69 @@
}

/*
+ * triggered by the device mapper code to lock a filesystem and force
+ * it into a consistent state.
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until unlockfs is called. If a super is found on this
+ * block device, we hould a read lock on the s->s_umount sem to make sure
+ * nobody unmounts until the snapshot creation is done
+ */
+struct super_block *freeze_bdev(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ down(&bdev->bd_mount_sem);
+ sb = get_super(bdev);
+ if (sb && !(sb->s_flags & MS_RDONLY)) {
+ sb->s_frozen = SB_FREEZE_WRITE;
+ wmb();
+
+ sync_inodes_sb(sb, 0);
+ DQUOT_SYNC(sb);
+
+ sb->s_frozen = SB_FREEZE_TRANS;
+ wmb();
+
+ lock_super(sb);
+ if (sb->s_dirt && sb->s_op->write_super)
+ sb->s_op->write_super(sb);
+ unlock_super(sb);
+
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+
+ sync_blockdev(sb->s_bdev);
+ sync_inodes_sb(sb, 1);
+ sync_blockdev(sb->s_bdev);
+
+ if (sb->s_op->write_super_lockfs)
+ sb->s_op->write_super_lockfs(sb);
+ }
+
+ sync_blockdev(bdev);
+ return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+{
+ if (sb) {
+ BUG_ON(sb->s_bdev != bdev);
+
+ if (sb->s_op->unlockfs)
+ sb->s_op->unlockfs(sb);
+ sb->s_frozen = SB_UNFROZEN;
+ wmb();
+ wake_up(&sb->s_wait_unfrozen);
+ drop_super(sb);
+ }
+
+ up(&bdev->bd_mount_sem);
+}
+EXPORT_SYMBOL(thaw_bdev);
+
+/*
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/
===== fs/super.c 1.115 vs edited =====
--- 1.115/fs/super.c Fri Mar 12 10:30:24 2004
+++ edited/fs/super.c Sun Mar 14 14:24:56 2004
@@ -77,6 +77,7 @@
sema_init(&s->s_dquot.dqio_sem, 1);
sema_init(&s->s_dquot.dqonoff_sem, 1);
init_rwsem(&s->s_dquot.dqptr_sem);
+ init_waitqueue_head(&s->s_wait_unfrozen);
s->s_maxbytes = MAX_NON_LFS;
s->dq_op = sb_dquot_ops;
s->s_qcop = sb_quotactl_ops;
@@ -621,7 +622,14 @@
if (IS_ERR(bdev))
return (struct super_block *)bdev;

+ /*
+ * once the super is inserted into the list by sget, s_umount
+ * will protect the lockfs code from trying to start a snapshot
+ * while we are mounting
+ */
+ down(&bdev->bd_mount_sem);
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
+ up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto out;

===== fs/xfs/xfs_fsops.c 1.11 vs edited =====
--- 1.11/fs/xfs/xfs_fsops.c Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/xfs_fsops.c Sun Mar 14 14:03:01 2004
@@ -582,63 +582,25 @@
}

int
-xfs_fs_freeze(
- xfs_mount_t *mp)
-{
- vfs_t *vfsp;
- /*REFERENCED*/
- int error;
-
- vfsp = XFS_MTOVFS(mp);
-
- /* Stop new writers */
- xfs_start_freeze(mp, XFS_FREEZE_WRITE);
-
- /* Flush the refcache */
- xfs_refcache_purge_mp(mp);
-
- /* Flush delalloc and delwri data */
- VFS_SYNC(vfsp, SYNC_DELWRI|SYNC_WAIT, NULL, error);
-
- /* Pause transaction subsystem */
- xfs_start_freeze(mp, XFS_FREEZE_TRANS);
-
- /* Flush any remaining inodes into buffers */
- VFS_SYNC(vfsp, SYNC_ATTR|SYNC_WAIT, NULL, error);
-
- /* Push all buffers out to disk */
- xfs_binval(mp->m_ddev_targp);
- if (mp->m_rtdev_targp) {
- xfs_binval(mp->m_rtdev_targp);
- }
-
- /* Push the superblock and write an unmount record */
- xfs_log_unmount_write(mp);
- xfs_unmountfs_writesb(mp);
-
- return 0;
-}
-
-int
-xfs_fs_thaw(
- xfs_mount_t *mp)
-{
- xfs_finish_freeze(mp);
- return 0;
-}
-
-int
xfs_fs_goingdown(
xfs_mount_t *mp,
__uint32_t inflags)
{
- switch (inflags)
- {
- case XFS_FSOP_GOING_FLAGS_DEFAULT:
- xfs_fs_freeze(mp);
- xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
- xfs_fs_thaw(mp);
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (inflags) {
+ case XFS_FSOP_GOING_FLAGS_DEFAULT: {
+ struct vfs *vfsp = XFS_MTOVFS(mp);
+ struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
+
+ if (sb) {
+ xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+ thaw_bdev(sb->s_bdev, sb);
+ }
+
break;
+ }
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
break;
===== fs/xfs/xfs_fsops.h 1.3 vs edited =====
--- 1.3/fs/xfs/xfs_fsops.h Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/xfs_fsops.h Sun Mar 14 14:03:33 2004
@@ -60,14 +60,6 @@
xfs_fsop_resblks_t *outval);

int
-xfs_fs_freeze(
- xfs_mount_t *mp);
-
-int
-xfs_fs_thaw(
- xfs_mount_t *mp);
-
-int
xfs_fs_goingdown(
xfs_mount_t *mp,
__uint32_t inflags);
===== fs/xfs/xfs_log.c 1.34 vs edited =====
--- 1.34/fs/xfs/xfs_log.c Sat Mar 6 04:16:30 2004
+++ edited/fs/xfs/xfs_log.c Sun Mar 14 13:59:50 2004
@@ -820,7 +820,7 @@
xlog_t *log = mp->m_log;
vfs_t *vfsp = XFS_MTOVFS(mp);

- if (mp->m_frozen || XFS_FORCED_SHUTDOWN(mp) ||
+ if (vfsp->vfs_super->s_frozen || XFS_FORCED_SHUTDOWN(mp) ||
(vfsp->vfs_flag & VFS_RDONLY))
return 0;

===== fs/xfs/xfs_mount.c 1.41 vs edited =====
--- 1.41/fs/xfs/xfs_mount.c Fri Feb 27 07:54:47 2004
+++ edited/fs/xfs/xfs_mount.c Sun Mar 14 14:01:45 2004
@@ -139,9 +139,6 @@
*/
xfs_trans_ail_init(mp);

- /* Init freeze sync structures */
- spinlock_init(&mp->m_freeze_lock, "xfs_freeze");
- init_sv(&mp->m_wait_unfreeze, SV_DEFAULT, "xfs_freeze", 0);
atomic_set(&mp->m_active_trans, 0);

return mp;
@@ -191,8 +188,6 @@
VFS_REMOVEBHV(vfsp, &mp->m_bhv);
}

- spinlock_destroy(&mp->m_freeze_lock);
- sv_destroy(&mp->m_wait_unfreeze);
kmem_free(mp, sizeof(xfs_mount_t));
}

@@ -1584,60 +1579,4 @@
}
xfs_mod_sb(tp, fields);
xfs_trans_commit(tp, 0, NULL);
-}
-
-/* Functions to lock access out of the filesystem for forced
- * shutdown or snapshot.
- */
-
-void
-xfs_start_freeze(
- xfs_mount_t *mp,
- int level)
-{
- unsigned long s = mutex_spinlock(&mp->m_freeze_lock);
-
- mp->m_frozen = level;
- mutex_spinunlock(&mp->m_freeze_lock, s);
-
- if (level == XFS_FREEZE_TRANS) {
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
- }
-}
-
-void
-xfs_finish_freeze(
- xfs_mount_t *mp)
-{
- unsigned long s = mutex_spinlock(&mp->m_freeze_lock);
-
- if (mp->m_frozen) {
- mp->m_frozen = 0;
- sv_broadcast(&mp->m_wait_unfreeze);
- }
-
- mutex_spinunlock(&mp->m_freeze_lock, s);
-}
-
-void
-xfs_check_frozen(
- xfs_mount_t *mp,
- bhv_desc_t *bdp,
- int level)
-{
- unsigned long s;
-
- if (mp->m_frozen) {
- s = mutex_spinlock(&mp->m_freeze_lock);
-
- if (mp->m_frozen < level) {
- mutex_spinunlock(&mp->m_freeze_lock, s);
- } else {
- sv_wait(&mp->m_wait_unfreeze, 0, &mp->m_freeze_lock, s);
- }
- }
-
- if (level == XFS_FREEZE_TRANS)
- atomic_inc(&mp->m_active_trans);
}
===== fs/xfs/xfs_mount.h 1.25 vs edited =====
--- 1.25/fs/xfs/xfs_mount.h Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/xfs_mount.h Sun Mar 14 14:02:01 2004
@@ -379,10 +379,6 @@
struct xfs_dmops m_dm_ops; /* vector of DMI ops */
struct xfs_qmops m_qm_ops; /* vector of XQM ops */
struct xfs_ioops m_io_ops; /* vector of I/O ops */
- lock_t m_freeze_lock; /* Lock for m_frozen */
- uint m_frozen; /* FS frozen for shutdown or
- * snapshot */
- sv_t m_wait_unfreeze;/* waiting to unfreeze */
atomic_t m_active_trans; /* number trans frozen */
} xfs_mount_t;

@@ -557,16 +553,6 @@
extern void xfs_initialize_perag(xfs_mount_t *, int);
extern void xfs_xlatesb(void *, struct xfs_sb *, int, xfs_arch_t,
__int64_t);
-
-/*
- * Flags for freeze operations.
- */
-#define XFS_FREEZE_WRITE 1
-#define XFS_FREEZE_TRANS 2
-
-extern void xfs_start_freeze(xfs_mount_t *, int);
-extern void xfs_finish_freeze(xfs_mount_t *);
-extern void xfs_check_frozen(xfs_mount_t *, bhv_desc_t *, int);

extern struct vfsops xfs_vfsops;
extern struct vnodeops xfs_vnodeops;
===== fs/xfs/xfs_trans.c 1.14 vs edited =====
--- 1.14/fs/xfs/xfs_trans.c Fri Jan 9 00:18:09 2004
+++ edited/fs/xfs/xfs_trans.c Sun Mar 14 14:01:41 2004
@@ -131,7 +131,9 @@
xfs_mount_t *mp,
uint type)
{
- xfs_check_frozen(mp, NULL, XFS_FREEZE_TRANS);
+ vfs_check_frozen(XFS_MTOVFS(mp)->vfs_super, SB_FREEZE_TRANS);
+ atomic_inc(&mp->m_active_trans);
+
return (_xfs_trans_alloc(mp, type));

}
===== fs/xfs/xfs_vfsops.c 1.57 vs edited =====
--- 1.57/fs/xfs/xfs_vfsops.c Wed Mar 3 05:52:58 2004
+++ edited/fs/xfs/xfs_vfsops.c Sun Mar 14 14:42:21 2004
@@ -1544,6 +1544,11 @@
}
}

+ if (XFS_MTOVFS(mp)->vfs_super->s_frozen == SB_FREEZE_TRANS) {
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+ }
+
return XFS_ERROR(last_error);
}

@@ -1853,6 +1858,17 @@
return 0;
}

+STATIC void
+xfs_freeze(
+ bhv_desc_t *bdp)
+{
+ xfs_mount_t *mp = XFS_BHVTOM(bdp);
+
+ /* Push the superblock and write an unmount record */
+ xfs_log_unmount_write(mp);
+ xfs_unmountfs_writesb(mp);
+}
+

vfsops_t xfs_vfsops = {
BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
@@ -1869,4 +1885,5 @@
.vfs_quotactl = (vfs_quotactl_t)fs_nosys,
.vfs_init_vnode = xfs_initialize_vnode,
.vfs_force_shutdown = xfs_do_force_shutdown,
+ .vfs_freeze = xfs_freeze,
};
===== fs/xfs/linux/xfs_ioctl.c 1.21 vs edited =====
--- 1.21/fs/xfs/linux/xfs_ioctl.c Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/linux/xfs_ioctl.c Sat Mar 13 18:23:23 2004
@@ -825,13 +825,14 @@
case XFS_IOC_FREEZE:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- xfs_fs_freeze(mp);
+
+ freeze_bdev(inode->i_sb->s_bdev);
return 0;

case XFS_IOC_THAW:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- xfs_fs_thaw(mp);
+ thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
return 0;

case XFS_IOC_GOINGDOWN: {
===== fs/xfs/linux/xfs_lrw.c 1.40 vs edited =====
--- 1.40/fs/xfs/linux/xfs_lrw.c Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/linux/xfs_lrw.c Sun Mar 14 14:47:00 2004
@@ -682,8 +682,6 @@
io = &xip->i_iocore;
mp = io->io_mount;

- xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);
-
if (XFS_FORCED_SHUTDOWN(mp)) {
return -EIO;
}
===== fs/xfs/linux/xfs_super.c 1.70 vs edited =====
--- 1.70/fs/xfs/linux/xfs_super.c Sat Mar 6 04:46:51 2004
+++ edited/fs/xfs/linux/xfs_super.c Sun Mar 14 14:43:46 2004
@@ -589,28 +589,7 @@
linvfs_freeze_fs(
struct super_block *sb)
{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- vnode_t *vp;
- int error;
-
- if (sb->s_flags & MS_RDONLY)
- return;
- VFS_ROOT(vfsp, &vp, error);
- VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_FREEZE, 0, error);
- VN_RELE(vp);
-}
-
-STATIC void
-linvfs_unfreeze_fs(
- struct super_block *sb)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- vnode_t *vp;
- int error;
-
- VFS_ROOT(vfsp, &vp, error);
- VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_THAW, 0, error);
- VN_RELE(vp);
+ VFS_FREEZE(LINVFS_GET_VFS(sb));
}

STATIC struct dentry *
@@ -850,7 +829,6 @@
.write_super = linvfs_write_super,
.sync_fs = linvfs_sync_super,
.write_super_lockfs = linvfs_freeze_fs,
- .unlockfs = linvfs_unfreeze_fs,
.statfs = linvfs_statfs,
.remount_fs = linvfs_remount,
.show_options = linvfs_show_options,
===== fs/xfs/linux/xfs_vfs.c 1.11 vs edited =====
--- 1.11/fs/xfs/linux/xfs_vfs.c Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/linux/xfs_vfs.c Sun Mar 14 14:43:15 2004
@@ -230,6 +230,18 @@
((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
}

+void
+vfs_freeze(
+ struct bhv_desc *bdp)
+{
+ struct bhv_desc *next = bdp;
+
+ ASSERT(next);
+ while (! (bhvtovfsops(next))->vfs_freeze)
+ next = BHV_NEXT(next);
+ ((*bhvtovfsops(next)->vfs_freeze)(next));
+}
+
vfs_t *
vfs_allocate( void )
{
===== fs/xfs/linux/xfs_vfs.h 1.18 vs edited =====
--- 1.18/fs/xfs/linux/xfs_vfs.h Fri Jan 9 06:59:58 2004
+++ edited/fs/xfs/linux/xfs_vfs.h Sun Mar 14 14:43:07 2004
@@ -112,6 +112,7 @@
typedef void (*vfs_init_vnode_t)(bhv_desc_t *,
struct vnode *, bhv_desc_t *, int);
typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
+typedef void (*vfs_freeze_t)(bhv_desc_t *);

typedef struct vfsops {
bhv_position_t vf_position; /* behavior chain position */
@@ -128,6 +129,7 @@
vfs_quotactl_t vfs_quotactl; /* disk quota */
vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */
vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */
+ vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */
} vfsops_t;

/*
@@ -147,6 +149,7 @@
#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
+#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) )

/*
* PVFS's. Operates on behavior descriptor pointers.
@@ -164,6 +167,7 @@
#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p))
#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) )
#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) )
+#define PVFS_FREEZE(b) ( vfs_freeze(b) )

extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
@@ -178,6 +182,7 @@
extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
+extern void vfs_freeze(bhv_desc_t *);

typedef struct bhv_vfsops {
struct vfsops bhv_common;
===== include/linux/buffer_head.h 1.46 vs edited =====
--- 1.46/include/linux/buffer_head.h Tue Jan 20 00:38:11 2004
+++ edited/include/linux/buffer_head.h Sat Mar 13 15:35:32 2004
@@ -164,6 +164,8 @@
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
void wake_up_buffer(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *);
+void thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
===== include/linux/fs.h 1.290 vs edited =====
--- 1.290/include/linux/fs.h Fri Mar 12 10:32:59 2004
+++ edited/include/linux/fs.h Sun Mar 14 14:08:14 2004
@@ -344,6 +344,7 @@
struct inode * bd_inode; /* will die */
int bd_openers;
struct semaphore bd_sem; /* open/close mutex */
+ struct semaphore bd_mount_sem; /* mount mutex */
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;
@@ -712,6 +713,9 @@
struct list_head s_instances;
struct quota_info s_dquot; /* Diskquota specific options */

+ int s_frozen;
+ wait_queue_head_t s_wait_unfrozen;
+
char s_id[32]; /* Informational name */

struct kobject kobj; /* anchor for sysfs */
@@ -723,6 +727,18 @@
*/
struct semaphore s_vfs_rename_sem; /* Kludge */
};
+
+/*
+ * Snapshotting support.
+ */
+enum {
+ SB_UNFROZEN = 0,
+ SB_FREEZE_WRITE = 1,
+ SB_FREEZE_TRANS = 2,
+};
+
+#define vfs_check_frozen(sb, level) \
+ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))

/*
* Superblock locking.
===== mm/filemap.c 1.225 vs edited =====
--- 1.225/mm/filemap.c Mon Mar 8 15:21:17 2004
+++ edited/mm/filemap.c Sun Mar 14 14:16:40 2004
@@ -1746,6 +1746,8 @@
unsigned long seg;
char __user *buf;

+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];

2004-03-14 15:21:21

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sun, 2004-03-14 at 09:00, Christoph Hellwig wrote:
> On Sat, Mar 13, 2004 at 04:33:46PM +0000, Christoph Hellwig wrote:
> > Here's the reworked patch I have, ignoring dm but converting the xfs-internal
> > interfaces that do freezing. But without an interface change we still need
> > to do all the flushing a second time inside XFS which I absoutely dislike.
> >
> > Let me think about how to do this nicer.
>
> Okay, here's a new patch that basically moves all the XFS freezing
> infrastructure to the VFS, leavin only writing of the log record and some
> transaction count handing inside XFS.
>
> I've given it some heavy testing with xfs_freeze, dm is still not updated.
>
> P.S. this patch now kills 16 lines of kernel code summarized :)
>

It looks good, I'll give it a try.

-chris


2004-03-14 18:44:43

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

Christoph Hellwig <[email protected]> wrote:
>
> + * This takes the block device bd_mount_sem to make sure no new mounts
> + * happen on bdev until unlockfs is called. If a super is found on this
> + * block device, we hould a read lock on the s->s_umount sem to make sure
> + * nobody unmounts until the snapshot creation is done
> + */
> +struct super_block *freeze_bdev(struct block_device *bdev)

I think you do need s_umount, as the comments say. But this patch doesn't
touch it.

2004-03-14 18:53:01

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sun, Mar 14, 2004 at 10:44:39AM -0800, Andrew Morton wrote:
> > + * This takes the block device bd_mount_sem to make sure no new mounts
> > + * happen on bdev until unlockfs is called. If a super is found on this
> > + * block device, we hould a read lock on the s->s_umount sem to make sure
> > + * nobody unmounts until the snapshot creation is done
> > + */
> > +struct super_block *freeze_bdev(struct block_device *bdev)
>
> I think you do need s_umount, as the comments say. But this patch doesn't
> touch it.

get_super takes it for us.

2004-03-14 18:54:26

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sun, 2004-03-14 at 13:44, Andrew Morton wrote:
> Christoph Hellwig <[email protected]> wrote:
> >
> > + * This takes the block device bd_mount_sem to make sure no new mounts
> > + * happen on bdev until unlockfs is called. If a super is found on this
> > + * block device, we hould a read lock on the s->s_umount sem to make sure
> > + * nobody unmounts until the snapshot creation is done
> > + */
> > +struct super_block *freeze_bdev(struct block_device *bdev)
>
> I think you do need s_umount, as the comments say. But this patch doesn't
> touch it.

get_super gives us a read lock on it.

-chris


2004-03-26 10:25:54

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Sun, Mar 14, 2004 at 10:23:31AM -0500, Chris Mason wrote:
> > P.S. this patch now kills 16 lines of kernel code summarized :)
> >
>
> It looks good, I'll give it a try.

ping? I've seen you merged the old patch into the suse tree, and having
shipping distros with incompatible APIs doesn't sound exactly like a good
idea..

2004-03-26 13:26:09

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Fri, 2004-03-26 at 05:25, Christoph Hellwig wrote:
> On Sun, Mar 14, 2004 at 10:23:31AM -0500, Chris Mason wrote:
> > > P.S. this patch now kills 16 lines of kernel code summarized :)
> > >
> >
> > It looks good, I'll give it a try.
>
> ping? I've seen you merged the old patch into the suse tree, and having
> shipping distros with incompatible APIs doesn't sound exactly like a good
> idea..

I'm replacing the old one in the suse tree, I just got sucked into to a
few critical tasks. Yours is the way to go.

-chris


2004-04-01 20:36:47

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Fri, 2004-03-26 at 05:25, Christoph Hellwig wrote:
> On Sun, Mar 14, 2004 at 10:23:31AM -0500, Chris Mason wrote:
> > > P.S. this patch now kills 16 lines of kernel code summarized :)
> > >
> >
> > It looks good, I'll give it a try.
>
> ping? I've seen you merged the old patch into the suse tree, and having
> shipping distros with incompatible APIs doesn't sound exactly like a good
> idea..

Christoph's vfs patch looks good, I've stripped out the XFS bits (FS
parts should probably be in different patches) and made one small
change. freeze/thaw now check to make sure bdev != NULL.

On the device mapper side, I had to add a struct super_block pointer to
the dm struct and changed it to pin the bdev struct during the freeze.
Before, it could go away due to bdput, leading to deadlock when someone
else got a bdev struct with bd_mount_sem held.

New patches attached, along with an incremental to clearly show my
changes to the dm patch.

-chris


Attachments:
vfs-lockfs-2 (5.56 kB)
dm-09-dm-lockfs (3.14 kB)
dm-incr (851.00 B)
Download all attachments

2004-04-01 22:33:37

by Kevin Corry

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Thursday 01 April 2004 2:35 pm, Chris Mason wrote:
> On Fri, 2004-03-26 at 05:25, Christoph Hellwig wrote:
> > On Sun, Mar 14, 2004 at 10:23:31AM -0500, Chris Mason wrote:
> > > > P.S. this patch now kills 16 lines of kernel code summarized :)
> > >
> > > It looks good, I'll give it a try.
> >
> > ping? I've seen you merged the old patch into the suse tree, and having
> > shipping distros with incompatible APIs doesn't sound exactly like a good
> > idea..
>
> Christoph's vfs patch looks good, I've stripped out the XFS bits (FS
> parts should probably be in different patches) and made one small
> change. freeze/thaw now check to make sure bdev != NULL.
>
> On the device mapper side, I had to add a struct super_block pointer to
> the dm struct and changed it to pin the bdev struct during the freeze.
> Before, it could go away due to bdput, leading to deadlock when someone
> else got a bdev struct with bd_mount_sem held.
>
> New patches attached, along with an incremental to clearly show my
> changes to the dm patch.
>
> -chris

Thanks Chris! I'll test the new VFS-lock code in the morning.

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2004-04-02 20:01:19

by Kevin Corry

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Thursday 01 April 2004 2:35 pm, Chris Mason wrote:
> On Fri, 2004-03-26 at 05:25, Christoph Hellwig wrote:
> > On Sun, Mar 14, 2004 at 10:23:31AM -0500, Chris Mason wrote:
> > > > P.S. this patch now kills 16 lines of kernel code summarized :)
> > >
> > > It looks good, I'll give it a try.
> >
> > ping? I've seen you merged the old patch into the suse tree, and having
> > shipping distros with incompatible APIs doesn't sound exactly like a good
> > idea..
>
> Christoph's vfs patch looks good, I've stripped out the XFS bits (FS
> parts should probably be in different patches) and made one small
> change. freeze/thaw now check to make sure bdev != NULL.

Does this mean there are patches required for XFS to work properly with
this new VFS-lock patch? I'm getting hangs when suspending a DM device
that contains an XFS filesystem with active I/O. Ext3, Reiser, and JFS
seem to behave as expected.

Backtrace from the "cp" command on the XFS filesystem:

kernel: cp D C1201BE0 0 784 727 (NOTLB)
kernel: ccf09cd0 00000086 00000000 c1201be0 c1202540 00000017 00000017 ccf09d78
kernel: 00000018 c1201be0 0001d058 bcc411f2 00000040 cf1f2348 cfd3b590 cfd3b79c
kernel: ccf09000 ccf09d68 00000001 ccf09de4 c0148644 0000d0e4 c2c21000 00000000
kernel: Call Trace:
kernel: [<c0148644>] generic_file_aio_write_nolock+0xe4/0xe20
kernel: [<c0124e20>] autoremove_wake_function+0x0/0x50
kernel: [<c01cb811>] pathrelse+0x31/0x50
kernel: [<c0124e20>] autoremove_wake_function+0x0/0x50
kernel: [<c01d722c>] do_journal_end+0x88c/0xbf0
kernel: [<c0121e80>] default_wake_function+0x0/0x20
kernel: [<c01d5c3c>] journal_end+0x9c/0xc0
kernel: [<c02a30c8>] xfs_ichgtime+0xf8/0xfa
kernel: [<c02cfdc4>] xfs_write+0x264/0x7d0
kernel: [<c01932ac>] __mark_inode_dirty+0x17c/0x190
kernel: [<c01471d8>] do_generic_mapping_read+0x128/0x3e0
kernel: [<c018bfa1>] update_atime+0xd1/0xe0
kernel: [<c02caff0>] linvfs_write+0xb0/0x120
kernel: [<c016ba57>] do_sync_write+0x87/0xc0
kernel: [<c0177ecf>] cp_new_stat64+0x10f/0x130
kernel: [<c016bb3a>] vfs_write+0xaa/0x130
kernel: [<c016bc5f>] sys_write+0x3f/0x60
kernel: [<c043f1cf>] syscall_call+0x7/0xb

Backtrace from the "dmsetup resume" call on that device:

kernel: dm D C1201BE0 0 788 758 (NOTLB)
kernel: c2f21c58 00000082 00000000 c1201be0 c1202540 00000000 00000000 00000000
kernel: 00000010 c1201be0 00035e9e fd17af8b 00000040 cf85d968 00000000 00000010
kernel: c0fc210c cf85d7b0 fffeffff c2f21c88 c02e050c c0fa32fc c2f21c98 c0fc2118
kernel: Call Trace:
kernel: [<c02e050c>] rwsem_down_read_failed+0xbc/0x1b0
kernel: [<c029d5ec>] .text.lock.xfs_iget+0x71/0x145
kernel: [<c02bd7bc>] xfs_sync_inodes+0x28c/0xaa0
kernel: [<c02be2aa>] xfs_syncsub+0x2da/0x2f0
kernel: [<c02bd526>] xfs_sync+0x26/0x30
kernel: [<c02d19d1>] vfs_sync+0x41/0x50
kernel: [<c0296ded>] xfs_fs_freeze+0x3d/0xb0
kernel: [<c02cd2ac>] xfs_ioctl+0x7bc/0xa60
kernel: [<c0154d93>] pagevec_lookup_tag+0x33/0x40
kernel: [<c0146409>] wait_on_page_writeback_range+0x79/0x130
kernel: [<c018b308>] igrab+0x88/0xa0
kernel: [<c02d2794>] vn_hold+0x44/0x90
kernel: [<c02bd32f>] xfs_root+0x1f/0x30
kernel: [<c02d11e6>] linvfs_freeze_fs+0x66/0x80
kernel: [<c016d686>] freeze_bdev+0x116/0x140
kernel: [<c03a747a>] __lock_fs+0x5a/0xb0
kernel: [<c03a75db>] dm_suspend+0x7b/0x210
kernel: [<c0121e80>] default_wake_function+0x0/0x20
kernel: [<c0121e80>] default_wake_function+0x0/0x20
kernel: [<c03aa164>] __get_name_cell+0x14/0x70
kernel: [<c03ab26c>] do_resume+0x16c/0x1b0
kernel: [<c03ac2c6>] ctl_ioctl+0xe6/0x180
kernel: [<c03ab2b0>] dev_suspend+0x0/0x20
kernel: [<c0181b82>] sys_ioctl+0x152/0x300
kernel: [<c043f1cf>] syscall_call+0x7/0xb


--
Kevin Corry
[email protected]
http://evms.sourceforge.net/

2004-04-02 20:03:44

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Fri, Apr 02, 2004 at 02:00:18PM -0600, Kevin Corry wrote:
> > Christoph's vfs patch looks good, I've stripped out the XFS bits (FS
> > parts should probably be in different patches) and made one small
> > change. freeze/thaw now check to make sure bdev != NULL.
>
> Does this mean there are patches required for XFS to work properly with
> this new VFS-lock patch? I'm getting hangs when suspending a DM device
> that contains an XFS filesystem with active I/O. Ext3, Reiser, and JFS
> seem to behave as expected.

Yes. Here's the patch I sent out earlier (xfs bits + common code):

--- 1.155/fs/block_dev.c Fri Mar 12 10:33:01 2004
+++ edited/fs/block_dev.c Sat Mar 13 15:28:49 2004
@@ -251,6 +251,7 @@
{
memset(bdev, 0, sizeof(*bdev));
sema_init(&bdev->bd_sem, 1);
+ sema_init(&bdev->bd_mount_sem, 1);
INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
inode_init_once(&ei->vfs_inode);
===== fs/buffer.c 1.224 vs edited =====
--- 1.224/fs/buffer.c Sun Mar 7 08:16:11 2004
+++ edited/fs/buffer.c Sun Mar 14 15:37:00 2004
@@ -260,6 +260,69 @@
}

/*
+ * triggered by the device mapper code to lock a filesystem and force
+ * it into a consistent state.
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until unlockfs is called. If a super is found on this
+ * block device, we hould a read lock on the s->s_umount sem to make sure
+ * nobody unmounts until the snapshot creation is done
+ */
+struct super_block *freeze_bdev(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ down(&bdev->bd_mount_sem);
+ sb = get_super(bdev);
+ if (sb && !(sb->s_flags & MS_RDONLY)) {
+ sb->s_frozen = SB_FREEZE_WRITE;
+ wmb();
+
+ sync_inodes_sb(sb, 0);
+ DQUOT_SYNC(sb);
+
+ sb->s_frozen = SB_FREEZE_TRANS;
+ wmb();
+
+ lock_super(sb);
+ if (sb->s_dirt && sb->s_op->write_super)
+ sb->s_op->write_super(sb);
+ unlock_super(sb);
+
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, 1);
+
+ sync_blockdev(sb->s_bdev);
+ sync_inodes_sb(sb, 1);
+ sync_blockdev(sb->s_bdev);
+
+ if (sb->s_op->write_super_lockfs)
+ sb->s_op->write_super_lockfs(sb);
+ }
+
+ sync_blockdev(bdev);
+ return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+{
+ if (sb) {
+ BUG_ON(sb->s_bdev != bdev);
+
+ if (sb->s_op->unlockfs)
+ sb->s_op->unlockfs(sb);
+ sb->s_frozen = SB_UNFROZEN;
+ wmb();
+ wake_up(&sb->s_wait_unfrozen);
+ drop_super(sb);
+ }
+
+ up(&bdev->bd_mount_sem);
+}
+EXPORT_SYMBOL(thaw_bdev);
+
+/*
* sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/
===== fs/super.c 1.115 vs edited =====
--- 1.115/fs/super.c Fri Mar 12 10:30:24 2004
+++ edited/fs/super.c Sun Mar 14 14:24:56 2004
@@ -77,6 +77,7 @@
sema_init(&s->s_dquot.dqio_sem, 1);
sema_init(&s->s_dquot.dqonoff_sem, 1);
init_rwsem(&s->s_dquot.dqptr_sem);
+ init_waitqueue_head(&s->s_wait_unfrozen);
s->s_maxbytes = MAX_NON_LFS;
s->dq_op = sb_dquot_ops;
s->s_qcop = sb_quotactl_ops;
@@ -621,7 +622,14 @@
if (IS_ERR(bdev))
return (struct super_block *)bdev;

+ /*
+ * once the super is inserted into the list by sget, s_umount
+ * will protect the lockfs code from trying to start a snapshot
+ * while we are mounting
+ */
+ down(&bdev->bd_mount_sem);
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
+ up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto out;

===== fs/xfs/xfs_fsops.c 1.11 vs edited =====
--- 1.11/fs/xfs/xfs_fsops.c Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/xfs_fsops.c Sun Mar 14 14:03:01 2004
@@ -582,63 +582,25 @@
}

int
-xfs_fs_freeze(
- xfs_mount_t *mp)
-{
- vfs_t *vfsp;
- /*REFERENCED*/
- int error;
-
- vfsp = XFS_MTOVFS(mp);
-
- /* Stop new writers */
- xfs_start_freeze(mp, XFS_FREEZE_WRITE);
-
- /* Flush the refcache */
- xfs_refcache_purge_mp(mp);
-
- /* Flush delalloc and delwri data */
- VFS_SYNC(vfsp, SYNC_DELWRI|SYNC_WAIT, NULL, error);
-
- /* Pause transaction subsystem */
- xfs_start_freeze(mp, XFS_FREEZE_TRANS);
-
- /* Flush any remaining inodes into buffers */
- VFS_SYNC(vfsp, SYNC_ATTR|SYNC_WAIT, NULL, error);
-
- /* Push all buffers out to disk */
- xfs_binval(mp->m_ddev_targp);
- if (mp->m_rtdev_targp) {
- xfs_binval(mp->m_rtdev_targp);
- }
-
- /* Push the superblock and write an unmount record */
- xfs_log_unmount_write(mp);
- xfs_unmountfs_writesb(mp);
-
- return 0;
-}
-
-int
-xfs_fs_thaw(
- xfs_mount_t *mp)
-{
- xfs_finish_freeze(mp);
- return 0;
-}
-
-int
xfs_fs_goingdown(
xfs_mount_t *mp,
__uint32_t inflags)
{
- switch (inflags)
- {
- case XFS_FSOP_GOING_FLAGS_DEFAULT:
- xfs_fs_freeze(mp);
- xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
- xfs_fs_thaw(mp);
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (inflags) {
+ case XFS_FSOP_GOING_FLAGS_DEFAULT: {
+ struct vfs *vfsp = XFS_MTOVFS(mp);
+ struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
+
+ if (sb) {
+ xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+ thaw_bdev(sb->s_bdev, sb);
+ }
+
break;
+ }
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
break;
===== fs/xfs/xfs_fsops.h 1.3 vs edited =====
--- 1.3/fs/xfs/xfs_fsops.h Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/xfs_fsops.h Sun Mar 14 14:03:33 2004
@@ -60,14 +60,6 @@
xfs_fsop_resblks_t *outval);

int
-xfs_fs_freeze(
- xfs_mount_t *mp);
-
-int
-xfs_fs_thaw(
- xfs_mount_t *mp);
-
-int
xfs_fs_goingdown(
xfs_mount_t *mp,
__uint32_t inflags);
===== fs/xfs/xfs_log.c 1.34 vs edited =====
--- 1.34/fs/xfs/xfs_log.c Sat Mar 6 04:16:30 2004
+++ edited/fs/xfs/xfs_log.c Sun Mar 14 13:59:50 2004
@@ -820,7 +820,7 @@
xlog_t *log = mp->m_log;
vfs_t *vfsp = XFS_MTOVFS(mp);

- if (mp->m_frozen || XFS_FORCED_SHUTDOWN(mp) ||
+ if (vfsp->vfs_super->s_frozen || XFS_FORCED_SHUTDOWN(mp) ||
(vfsp->vfs_flag & VFS_RDONLY))
return 0;

===== fs/xfs/xfs_mount.c 1.41 vs edited =====
--- 1.41/fs/xfs/xfs_mount.c Fri Feb 27 07:54:47 2004
+++ edited/fs/xfs/xfs_mount.c Sun Mar 14 14:01:45 2004
@@ -139,9 +139,6 @@
*/
xfs_trans_ail_init(mp);

- /* Init freeze sync structures */
- spinlock_init(&mp->m_freeze_lock, "xfs_freeze");
- init_sv(&mp->m_wait_unfreeze, SV_DEFAULT, "xfs_freeze", 0);
atomic_set(&mp->m_active_trans, 0);

return mp;
@@ -191,8 +188,6 @@
VFS_REMOVEBHV(vfsp, &mp->m_bhv);
}

- spinlock_destroy(&mp->m_freeze_lock);
- sv_destroy(&mp->m_wait_unfreeze);
kmem_free(mp, sizeof(xfs_mount_t));
}

@@ -1584,60 +1579,4 @@
}
xfs_mod_sb(tp, fields);
xfs_trans_commit(tp, 0, NULL);
-}
-
-/* Functions to lock access out of the filesystem for forced
- * shutdown or snapshot.
- */
-
-void
-xfs_start_freeze(
- xfs_mount_t *mp,
- int level)
-{
- unsigned long s = mutex_spinlock(&mp->m_freeze_lock);
-
- mp->m_frozen = level;
- mutex_spinunlock(&mp->m_freeze_lock, s);
-
- if (level == XFS_FREEZE_TRANS) {
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
- }
-}
-
-void
-xfs_finish_freeze(
- xfs_mount_t *mp)
-{
- unsigned long s = mutex_spinlock(&mp->m_freeze_lock);
-
- if (mp->m_frozen) {
- mp->m_frozen = 0;
- sv_broadcast(&mp->m_wait_unfreeze);
- }
-
- mutex_spinunlock(&mp->m_freeze_lock, s);
-}
-
-void
-xfs_check_frozen(
- xfs_mount_t *mp,
- bhv_desc_t *bdp,
- int level)
-{
- unsigned long s;
-
- if (mp->m_frozen) {
- s = mutex_spinlock(&mp->m_freeze_lock);
-
- if (mp->m_frozen < level) {
- mutex_spinunlock(&mp->m_freeze_lock, s);
- } else {
- sv_wait(&mp->m_wait_unfreeze, 0, &mp->m_freeze_lock, s);
- }
- }
-
- if (level == XFS_FREEZE_TRANS)
- atomic_inc(&mp->m_active_trans);
}
===== fs/xfs/xfs_mount.h 1.25 vs edited =====
--- 1.25/fs/xfs/xfs_mount.h Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/xfs_mount.h Sun Mar 14 14:02:01 2004
@@ -379,10 +379,6 @@
struct xfs_dmops m_dm_ops; /* vector of DMI ops */
struct xfs_qmops m_qm_ops; /* vector of XQM ops */
struct xfs_ioops m_io_ops; /* vector of I/O ops */
- lock_t m_freeze_lock; /* Lock for m_frozen */
- uint m_frozen; /* FS frozen for shutdown or
- * snapshot */
- sv_t m_wait_unfreeze;/* waiting to unfreeze */
atomic_t m_active_trans; /* number trans frozen */
} xfs_mount_t;

@@ -557,16 +553,6 @@
extern void xfs_initialize_perag(xfs_mount_t *, int);
extern void xfs_xlatesb(void *, struct xfs_sb *, int, xfs_arch_t,
__int64_t);
-
-/*
- * Flags for freeze operations.
- */
-#define XFS_FREEZE_WRITE 1
-#define XFS_FREEZE_TRANS 2
-
-extern void xfs_start_freeze(xfs_mount_t *, int);
-extern void xfs_finish_freeze(xfs_mount_t *);
-extern void xfs_check_frozen(xfs_mount_t *, bhv_desc_t *, int);

extern struct vfsops xfs_vfsops;
extern struct vnodeops xfs_vnodeops;
===== fs/xfs/xfs_trans.c 1.14 vs edited =====
--- 1.14/fs/xfs/xfs_trans.c Fri Jan 9 00:18:09 2004
+++ edited/fs/xfs/xfs_trans.c Sun Mar 14 14:01:41 2004
@@ -131,7 +131,9 @@
xfs_mount_t *mp,
uint type)
{
- xfs_check_frozen(mp, NULL, XFS_FREEZE_TRANS);
+ vfs_check_frozen(XFS_MTOVFS(mp)->vfs_super, SB_FREEZE_TRANS);
+ atomic_inc(&mp->m_active_trans);
+
return (_xfs_trans_alloc(mp, type));

}
===== fs/xfs/xfs_vfsops.c 1.57 vs edited =====
--- 1.57/fs/xfs/xfs_vfsops.c Wed Mar 3 05:52:58 2004
+++ edited/fs/xfs/xfs_vfsops.c Sun Mar 14 14:42:21 2004
@@ -1544,6 +1544,11 @@
}
}

+ if (XFS_MTOVFS(mp)->vfs_super->s_frozen == SB_FREEZE_TRANS) {
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+ }
+
return XFS_ERROR(last_error);
}

@@ -1853,6 +1858,17 @@
return 0;
}

+STATIC void
+xfs_freeze(
+ bhv_desc_t *bdp)
+{
+ xfs_mount_t *mp = XFS_BHVTOM(bdp);
+
+ /* Push the superblock and write an unmount record */
+ xfs_log_unmount_write(mp);
+ xfs_unmountfs_writesb(mp);
+}
+

vfsops_t xfs_vfsops = {
BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
@@ -1869,4 +1885,5 @@
.vfs_quotactl = (vfs_quotactl_t)fs_nosys,
.vfs_init_vnode = xfs_initialize_vnode,
.vfs_force_shutdown = xfs_do_force_shutdown,
+ .vfs_freeze = xfs_freeze,
};
===== fs/xfs/linux/xfs_ioctl.c 1.21 vs edited =====
--- 1.21/fs/xfs/linux/xfs_ioctl.c Fri Feb 27 07:28:05 2004
+++ edited/fs/xfs/linux/xfs_ioctl.c Sat Mar 13 18:23:23 2004
@@ -825,13 +825,14 @@
case XFS_IOC_FREEZE:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- xfs_fs_freeze(mp);
+
+ freeze_bdev(inode->i_sb->s_bdev);
return 0;

case XFS_IOC_THAW:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- xfs_fs_thaw(mp);
+ thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
return 0;

case XFS_IOC_GOINGDOWN: {
===== fs/xfs/linux/xfs_lrw.c 1.40 vs edited =====
--- 1.40/fs/xfs/linux/xfs_lrw.c Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/linux/xfs_lrw.c Sun Mar 14 14:47:00 2004
@@ -682,8 +682,6 @@
io = &xip->i_iocore;
mp = io->io_mount;

- xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);
-
if (XFS_FORCED_SHUTDOWN(mp)) {
return -EIO;
}
===== fs/xfs/linux/xfs_super.c 1.70 vs edited =====
--- 1.70/fs/xfs/linux/xfs_super.c Sat Mar 6 04:46:51 2004
+++ edited/fs/xfs/linux/xfs_super.c Sun Mar 14 14:43:46 2004
@@ -589,28 +589,7 @@
linvfs_freeze_fs(
struct super_block *sb)
{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- vnode_t *vp;
- int error;
-
- if (sb->s_flags & MS_RDONLY)
- return;
- VFS_ROOT(vfsp, &vp, error);
- VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_FREEZE, 0, error);
- VN_RELE(vp);
-}
-
-STATIC void
-linvfs_unfreeze_fs(
- struct super_block *sb)
-{
- vfs_t *vfsp = LINVFS_GET_VFS(sb);
- vnode_t *vp;
- int error;
-
- VFS_ROOT(vfsp, &vp, error);
- VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_THAW, 0, error);
- VN_RELE(vp);
+ VFS_FREEZE(LINVFS_GET_VFS(sb));
}

STATIC struct dentry *
@@ -850,7 +829,6 @@
.write_super = linvfs_write_super,
.sync_fs = linvfs_sync_super,
.write_super_lockfs = linvfs_freeze_fs,
- .unlockfs = linvfs_unfreeze_fs,
.statfs = linvfs_statfs,
.remount_fs = linvfs_remount,
.show_options = linvfs_show_options,
===== fs/xfs/linux/xfs_vfs.c 1.11 vs edited =====
--- 1.11/fs/xfs/linux/xfs_vfs.c Wed Mar 3 05:52:57 2004
+++ edited/fs/xfs/linux/xfs_vfs.c Sun Mar 14 14:43:15 2004
@@ -230,6 +230,18 @@
((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
}

+void
+vfs_freeze(
+ struct bhv_desc *bdp)
+{
+ struct bhv_desc *next = bdp;
+
+ ASSERT(next);
+ while (! (bhvtovfsops(next))->vfs_freeze)
+ next = BHV_NEXT(next);
+ ((*bhvtovfsops(next)->vfs_freeze)(next));
+}
+
vfs_t *
vfs_allocate( void )
{
===== fs/xfs/linux/xfs_vfs.h 1.18 vs edited =====
--- 1.18/fs/xfs/linux/xfs_vfs.h Fri Jan 9 06:59:58 2004
+++ edited/fs/xfs/linux/xfs_vfs.h Sun Mar 14 14:43:07 2004
@@ -112,6 +112,7 @@
typedef void (*vfs_init_vnode_t)(bhv_desc_t *,
struct vnode *, bhv_desc_t *, int);
typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
+typedef void (*vfs_freeze_t)(bhv_desc_t *);

typedef struct vfsops {
bhv_position_t vf_position; /* behavior chain position */
@@ -128,6 +129,7 @@
vfs_quotactl_t vfs_quotactl; /* disk quota */
vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */
vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */
+ vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */
} vfsops_t;

/*
@@ -147,6 +149,7 @@
#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
+#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) )

/*
* PVFS's. Operates on behavior descriptor pointers.
@@ -164,6 +167,7 @@
#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p))
#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) )
#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) )
+#define PVFS_FREEZE(b) ( vfs_freeze(b) )

extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
@@ -178,6 +182,7 @@
extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
+extern void vfs_freeze(bhv_desc_t *);

typedef struct bhv_vfsops {
struct vfsops bhv_common;
===== include/linux/buffer_head.h 1.46 vs edited =====
--- 1.46/include/linux/buffer_head.h Tue Jan 20 00:38:11 2004
+++ edited/include/linux/buffer_head.h Sat Mar 13 15:35:32 2004
@@ -164,6 +164,8 @@
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
void wake_up_buffer(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *);
+void thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
===== include/linux/fs.h 1.290 vs edited =====
--- 1.290/include/linux/fs.h Fri Mar 12 10:32:59 2004
+++ edited/include/linux/fs.h Sun Mar 14 14:08:14 2004
@@ -344,6 +344,7 @@
struct inode * bd_inode; /* will die */
int bd_openers;
struct semaphore bd_sem; /* open/close mutex */
+ struct semaphore bd_mount_sem; /* mount mutex */
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;
@@ -712,6 +713,9 @@
struct list_head s_instances;
struct quota_info s_dquot; /* Diskquota specific options */

+ int s_frozen;
+ wait_queue_head_t s_wait_unfrozen;
+
char s_id[32]; /* Informational name */

struct kobject kobj; /* anchor for sysfs */
@@ -723,6 +727,18 @@
*/
struct semaphore s_vfs_rename_sem; /* Kludge */
};
+
+/*
+ * Snapshotting support.
+ */
+enum {
+ SB_UNFROZEN = 0,
+ SB_FREEZE_WRITE = 1,
+ SB_FREEZE_TRANS = 2,
+};
+
+#define vfs_check_frozen(sb, level) \
+ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))

/*
* Superblock locking.
===== mm/filemap.c 1.225 vs edited =====
--- 1.225/mm/filemap.c Mon Mar 8 15:21:17 2004
+++ edited/mm/filemap.c Sun Mar 14 14:16:40 2004
@@ -1746,6 +1746,8 @@
unsigned long seg;
char __user *buf;

+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];

2004-04-02 20:27:08

by Kevin Corry

[permalink] [raw]
Subject: Re: [PATCH] lockfs patch for 2.6

On Friday 02 April 2004 2:02 pm, Christoph Hellwig wrote:
> On Fri, Apr 02, 2004 at 02:00:18PM -0600, Kevin Corry wrote:
> > > Christoph's vfs patch looks good, I've stripped out the XFS bits (FS
> > > parts should probably be in different patches) and made one small
> > > change. freeze/thaw now check to make sure bdev != NULL.
> >
> > Does this mean there are patches required for XFS to work properly with
> > this new VFS-lock patch? I'm getting hangs when suspending a DM device
> > that contains an XFS filesystem with active I/O. Ext3, Reiser, and JFS
> > seem to behave as expected.
>
> Yes. Here's the patch I sent out earlier (xfs bits + common code):

Thanks. Sorry I missed your patch the first time around. XFS works as expected
now.

--
Kevin Corry
[email protected]
http://evms.sourceforge.net/