2008-01-25 11:01:10

by Takashi Sato

[permalink] [raw]
Subject: [RFC] ext3 freeze feature

Hi,

Currently, ext3 doesn't have the freeze feature which suspends write
requests. So, we cannot get a backup which keeps the filesystem's
consistency with the storage device's features (snapshot, replication)
while it is mounted.
In many cases, commercial filesystems (e.g. VxFS) have a freeze
feature, and it is used to get a consistent backup.

So I am planning on implementing the ioctl of the freeze feature for ext3.
I think we can get the consistent backup with the following steps.
1. Freeze the filesystem with ioctl.
2. Separate the replication volume or get the snapshot
with the storage device's feature.
3. Unfreeze the filesystem with ioctl.
4. Get the backup from the separated replication volume
or the snapshot.

The usage of the ioctl is as below.
int ioctl(int fd, int cmd, long *timeval)
fd: The file descriptor of the mountpoint.
cmd: EXT3_IOC_FREEZE for the freeze or EXT3_IOC_THAW for the unfreeze.
timeval: The timeout value expressed in seconds.
If it's 0, the timeout isn't set.
Return value: 0 if the operation succeeds. Otherwise, -1.

I have made sure that write requests were suspended with the experimental
patch for this feature and attached it in this mail.

The main points of the implementation are as follows.
- Add calls of the freeze function (freeze_bdev) and
the unfreeze function (thaw_bdev) in ext3_ioctl().

- ext3_freeze_timeout() which calls the unfreeze function (thaw_bdev)
is registered to the delayed work queue to unfreeze the filesystem
automatically after the lapse of the specified time.

Any comments are very welcome.

Signed-off-by: Takashi Sato <[email protected]>
---
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/ext3/ioctl.c linux-2.6.24-rc8-freeze/fs/ext3/ioctl.c
--- linux-2.6.24-rc8/fs/ext3/ioctl.c 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/fs/ext3/ioctl.c 2008-01-22 18:20:33.000000000 +0900
@@ -254,6 +254,42 @@ flags_err:
return err;
}

+ case EXT3_IOC_FREEZE: {
+ long timeout_sec;
+ long timeout_msec;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (inode->i_sb->s_frozen != SB_UNFROZEN)
+ return -EINVAL;
+ /* arg(sec) to tick value */
+ get_user(timeout_sec, (long __user *) arg);
+ timeout_msec = timeout_sec * 1000;
+ if (timeout_msec < 0)
+ return -EINVAL;
+
+ /* Freeze */
+ freeze_bdev(inode->i_sb->s_bdev);
+
+ /* set up unfreeze timer */
+ if (timeout_msec > 0)
+ ext3_add_freeze_timeout(EXT3_SB(inode->i_sb),
+ timeout_msec);
+ return 0;
+ }
+ case EXT3_IOC_THAW: {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (inode->i_sb->s_frozen == SB_UNFROZEN)
+ return -EINVAL;
+
+ /* delete unfreeze timer */
+ ext3_del_freeze_timeout(EXT3_SB(inode->i_sb));
+
+ /* Unfreeze */
+ thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
+
+ return 0;
+ }

default:
return -ENOTTY;
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/ext3/super.c linux-2.6.24-rc8-freeze/fs/ext3/super.c
--- linux-2.6.24-rc8/fs/ext3/super.c 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/fs/ext3/super.c 2008-01-22 18:20:33.000000000 +0900
@@ -63,6 +63,7 @@ static int ext3_statfs (struct dentry *
static void ext3_unlockfs(struct super_block *sb);
static void ext3_write_super (struct super_block * sb);
static void ext3_write_super_lockfs(struct super_block *sb);
+static void ext3_freeze_timeout(struct work_struct *work);

/*
* Wrappers for journal_start/end.
@@ -323,6 +324,44 @@ void ext3_update_dynamic_rev(struct supe
}

/*
+ * ext3_add_freeze_timeout - Add timeout for ext3 freeze.
+ *
+ * @sbi : ext3 super block
+ * @timeout_msec : timeout period
+ *
+ * Add the delayed work for ext3 freeze timeout
+ * to the delayed work queue.
+ */
+void ext3_add_freeze_timeout(struct ext3_sb_info *sbi,
+ long timeout_msec)
+{
+ s64 timeout_jiffies = msecs_to_jiffies(timeout_msec);
+
+ /*
+ * setup freeze timeout function
+ */
+ INIT_DELAYED_WORK(&sbi->s_freeze_timeout, ext3_freeze_timeout);
+
+ /* set delayed work queue */
+ cancel_delayed_work(&sbi->s_freeze_timeout);
+ schedule_delayed_work(&sbi->s_freeze_timeout, timeout_jiffies);
+}
+
+/*
+ * ext3_del_freeze_timeout - Delete timeout for ext3 freeze.
+ *
+ * @sbi : ext3 super block
+ *
+ * Delete the delayed work for ext3 freeze timeout
+ * from the delayed work queue.
+ */
+void ext3_del_freeze_timeout(struct ext3_sb_info *sbi)
+{
+ if (delayed_work_pending(&sbi->s_freeze_timeout))
+ cancel_delayed_work(&sbi->s_freeze_timeout);
+}
+
+/*
* Open the external journal device
*/
static struct block_device *ext3_blkdev_get(dev_t dev)
@@ -2367,10 +2406,31 @@ static void ext3_unlockfs(struct super_b
EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
unlock_super(sb);
- journal_unlock_updates(EXT3_SB(sb)->s_journal);
+ journal_unlock_updates_if_needed(EXT3_SB(sb)->s_journal);
}
}

+/*
+ * ext3_freeze_timeout - Thaw the filesystem.
+ *
+ * @work : work queue (delayed_work.work)
+ *
+ * Called by the delayed work when elapsing the timeout period.
+ * Thaw the filesystem.
+ */
+static void ext3_freeze_timeout(struct work_struct *work)
+{
+ struct ext3_sb_info *sbi = container_of(work,
+ struct ext3_sb_info,
+ s_freeze_timeout.work);
+ struct super_block *sb = get_super_block(sbi);
+
+ BUG_ON(sb == NULL);
+
+ if (sb->s_frozen != SB_UNFROZEN)
+ thaw_bdev(sb->s_bdev, sb);
+}
+
static int ext3_remount (struct super_block * sb, int * flags, char * data)
{
struct ext3_super_block * es;
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/jbd/journal.c linux-2.6.24-rc8-freeze/fs/jbd/journal.c
--- linux-2.6.24-rc8/fs/jbd/journal.c 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/fs/jbd/journal.c 2008-01-22 18:20:33.000000000 +0900
@@ -46,6 +46,7 @@ EXPORT_SYMBOL(journal_extend);
EXPORT_SYMBOL(journal_stop);
EXPORT_SYMBOL(journal_lock_updates);
EXPORT_SYMBOL(journal_unlock_updates);
+EXPORT_SYMBOL(journal_unlock_updates_if_needed);
EXPORT_SYMBOL(journal_get_write_access);
EXPORT_SYMBOL(journal_get_create_access);
EXPORT_SYMBOL(journal_get_undo_access);
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/jbd/transaction.c linux-2.6.24-rc8-freeze/fs/jbd/transaction.c
--- linux-2.6.24-rc8/fs/jbd/transaction.c 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/fs/jbd/transaction.c 2008-01-22 18:20:33.000000000 +0900
@@ -485,6 +485,29 @@ void journal_unlock_updates (journal_t *
wake_up(&journal->j_wait_transaction_locked);
}

+/**
+ * journal_unlock_updates_if_needed - release barrier if needed.
+ *
+ * @journal: Journal to release the barrier on.
+ *
+ * Release a transaction barrier obtained if barrier count is not 0.
+ * Should be called without the journal lock held.
+ */
+void journal_unlock_updates_if_needed(journal_t *journal)
+{
+ spin_lock(&journal->j_state_lock);
+
+ if (!journal->j_barrier_count) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ --journal->j_barrier_count;
+ spin_unlock(&journal->j_state_lock);
+ mutex_unlock(&journal->j_barrier);
+ wake_up(&journal->j_wait_transaction_locked);
+}
+
/*
* Report any unexpected dirty buffers which turn up. Normally those
* indicate an error, but they can occur if the user is running (say)
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/super.c linux-2.6.24-rc8-freeze/fs/super.c
--- linux-2.6.24-rc8/fs/super.c 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/fs/super.c 2008-01-22 18:20:33.000000000 +0900
@@ -950,3 +950,30 @@ struct vfsmount *kern_mount_data(struct
}

EXPORT_SYMBOL_GPL(kern_mount_data);
+
+/**
+ * get_super_block - get super_block
+ * @s_fs_info : filesystem dependent information
+ * (super_block.s_fs_info)
+ *
+ * Get super_block which holds s_fs_info from super_blocks.
+ * get_super_block() returns a pointer of super block or
+ * %NULL if it have failed.
+ */
+struct super_block *get_super_block(void *s_fs_info)
+{
+ struct super_block *sb;
+
+ spin_lock(&sb_lock);
+ sb = sb_entry(super_blocks.prev);
+ for (; sb != sb_entry(&super_blocks);
+ sb = sb_entry(sb->s_list.prev)) {
+ if (sb->s_fs_info == s_fs_info) {
+ spin_unlock(&sb_lock);
+ return sb;
+ }
+ }
+ spin_unlock(&sb_lock);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(get_super_block);
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/ext3_fs.h linux-2.6.24-rc8-freeze/include/linux/ext3_fs.h
--- linux-2.6.24-rc8/include/linux/ext3_fs.h 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/include/linux/ext3_fs.h 2008-01-22 18:20:33.000000000 +0900
@@ -225,6 +225,8 @@ struct ext3_new_group_data {
#endif
#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT3_IOC_FREEZE _IOW('f', 9, long)
+#define EXT3_IOC_THAW _IOW('f', 10, long)

/*
* ioctl commands in 32 bit emulation
@@ -864,6 +866,9 @@ extern void ext3_abort (struct super_blo
extern void ext3_warning (struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void ext3_update_dynamic_rev (struct super_block *sb);
+extern void ext3_add_freeze_timeout(struct ext3_sb_info *sbi,
+ long timeout_msec);
+extern void ext3_del_freeze_timeout(struct ext3_sb_info *sbi);

#define ext3_std_error(sb, errno) \
do { \
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/ext3_fs_sb.h linux-2.6.24-rc8-freeze/include/linux/ext3_fs_sb.h
--- linux-2.6.24-rc8/include/linux/ext3_fs_sb.h 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/include/linux/ext3_fs_sb.h 2008-01-22 18:20:33.000000000 +0900
@@ -81,6 +81,8 @@ struct ext3_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+ /* Delayed work for freeze */
+ struct delayed_work s_freeze_timeout;
};

#endif /* _LINUX_EXT3_FS_SB */
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/fs.h linux-2.6.24-rc8-freeze/include/linux/fs.h
--- linux-2.6.24-rc8/include/linux/fs.h 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/include/linux/fs.h 2008-01-22 18:20:33.000000000 +0900
@@ -2095,6 +2095,7 @@ struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos);

+extern struct super_block *get_super_block(void *s_fs_info);

#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */
diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/jbd.h linux-2.6.24-rc8-freeze/include/linux/jbd.h
--- linux-2.6.24-rc8/include/linux/jbd.h 2008-01-16 13:22:48.000000000 +0900
+++ linux-2.6.24-rc8-freeze/include/linux/jbd.h 2008-01-22 18:20:33.000000000 +0900
@@ -905,6 +905,7 @@ extern int journal_stop(handle_t *);
extern int journal_flush (journal_t *);
extern void journal_lock_updates (journal_t *);
extern void journal_unlock_updates (journal_t *);
+extern void journal_unlock_updates_if_needed(journal_t *);

extern journal_t * journal_init_dev(struct block_device *bdev,
struct block_device *fs_dev,


2008-01-25 11:17:34

by Pekka Enberg

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/ext3_fs_sb.h linux-2.6.24-rc8-freeze/include/linux/ext3_fs_sb.h
> --- linux-2.6.24-rc8/include/linux/ext3_fs_sb.h 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/include/linux/ext3_fs_sb.h 2008-01-22 18:20:33.000000000 +0900
> @@ -81,6 +81,8 @@ struct ext3_sb_info {
> char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
> int s_jquota_fmt; /* Format of quota to use */
> #endif
> + /* Delayed work for freeze */
> + struct delayed_work s_freeze_timeout;

Why not put this in struct super_block? Then you don't need this

> +/**
> + * get_super_block - get super_block
> + * @s_fs_info : filesystem dependent information
> + * (super_block.s_fs_info)
> + *
> + * Get super_block which holds s_fs_info from super_blocks.
> + * get_super_block() returns a pointer of super block or
> + * %NULL if it have failed.
> + */
> +struct super_block *get_super_block(void *s_fs_info)
> +{

And these can be put to generic code:

> /*
> + * ext3_add_freeze_timeout - Add timeout for ext3 freeze.
> + *
> + * @sbi : ext3 super block
> + * @timeout_msec : timeout period
> + *
> + * Add the delayed work for ext3 freeze timeout
> + * to the delayed work queue.
> + */
> +void ext3_add_freeze_timeout(struct ext3_sb_info *sbi,
> + long timeout_msec)
> +{
> + s64 timeout_jiffies = msecs_to_jiffies(timeout_msec);
> +
> + /*
> + * setup freeze timeout function
> + */
> + INIT_DELAYED_WORK(&sbi->s_freeze_timeout, ext3_freeze_timeout);
> +
> + /* set delayed work queue */
> + cancel_delayed_work(&sbi->s_freeze_timeout);
> + schedule_delayed_work(&sbi->s_freeze_timeout, timeout_jiffies);
> +}
> +
> +/*
> + * ext3_del_freeze_timeout - Delete timeout for ext3 freeze.
> + *
> + * @sbi : ext3 super block
> + *
> + * Delete the delayed work for ext3 freeze timeout
> + * from the delayed work queue.
> + */
> +void ext3_del_freeze_timeout(struct ext3_sb_info *sbi)
> +{
> + if (delayed_work_pending(&sbi->s_freeze_timeout))
> + cancel_delayed_work(&sbi->s_freeze_timeout);
> +}

> +/*
> + * ext3_freeze_timeout - Thaw the filesystem.
> + *
> + * @work : work queue (delayed_work.work)
> + *
> + * Called by the delayed work when elapsing the timeout period.
> + * Thaw the filesystem.
> + */
> +static void ext3_freeze_timeout(struct work_struct *work)
> +{
> + struct ext3_sb_info *sbi = container_of(work,
> + struct ext3_sb_info,
> + s_freeze_timeout.work);
> + struct super_block *sb = get_super_block(sbi);
> +
> + BUG_ON(sb == NULL);
> +
> + if (sb->s_frozen != SB_UNFROZEN)
> + thaw_bdev(sb->s_bdev, sb);
> +}
> +

I am also wondering whether we should have system call(s) for these:

On Jan 25, 2008 12:59 PM, Takashi Sato <[email protected]> wrote:
> + case EXT3_IOC_FREEZE: {

> + case EXT3_IOC_THAW: {

And just convert XFS to use them too?

Pekka

2008-01-25 12:20:21

by Dmitri Monakhov

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On 19:59 Fri 25 Jan , Takashi Sato wrote:
> Hi,
>
> Currently, ext3 doesn't have the freeze feature which suspends write
> requests. So, we cannot get a backup which keeps the filesystem's
> consistency with the storage device's features (snapshot, replication)
> while it is mounted.
> In many case, a commercial filesystems (e.g. VxFS) has the freeze
> feature and it would be used to get the consistent backup.
First of all, Linux already has at least one open-source (dm-snap)
and several commercial snapshot solutions. In fact, dm-snap is
not perfect:
a) bitmap loading is not supported (this is useful for freezing
only used blocks), which causes a significant slowdown even for new writes
b) non-patched dm-snap code has a significant performance slowdown for all
rewrite requests.
c) IMHO the memory footprint is too big.

BUT, it works well for most file-systems.
>
> So I am planning on implementing the ioctl of the freeze feature for ext3.
> I think we can get the consistent backup with the following steps.
> 1. Freeze the filesystem with ioctl.
So you plan to do it from userspace.. well good luck with it :)

> 2. Separate the replication volume or get the snapshot
> with the storage device's feature.
> 3. Unfreeze the filesystem with ioctl.

You have to realize that the delay between stages 1 and 3 has to be minimal.
For example, dm-snap performs it only for explicit journal flushing.
From my experience, if the delay is more than 4-5 seconds the whole system
becomes unstable.
BTW: you always have to remember that while locking ext3 via freeze_bdev,
sb->ext3_write_super_lockfs() will be called, which is implemented as a "simple"
journal lock. This means that some bios may still reach the original device
even after the file system was locked (I've observed this in a real-life
situation).

> 4. Get the backup from the separated replication volume
> or the snapshot.
>
> The usage of the ioctl is as below.
> int ioctl(int fd, int cmd, long *timeval)
> fd: The file descriptor of the mountpoint.
> cmd: EXT3_IOC_FREEZE for the freeze or EXT3_IOC_THAW for the unfreeze.
> timeval: The timeout value expressed in seconds.
> If it's 0, the timeout isn't set.
> Return value: 0 if the operation succeeds. Otherwise, -1.
>
> I have made sure that write requests were suspended with the experimental
> patch for this feature and attached it in this mail.
>
> The points of the implementation are followings.
> - Add calls of the freeze function (freeze_bdev) and
> the unfreeze function (thaw_bdev) in ext3_ioctl().
>
> - ext3_freeze_timeout() which calls the unfreeze function (thaw_bdev)
> is registered to the delayed work queue to unfreeze the filesystem
> automatically after the lapse of the specified time.
>
> Any comments are very welcome.
>
> Signed-off-by: Takashi Sato <[email protected]>
> ---
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/ext3/ioctl.c linux-2.6.24-rc8-freeze/fs/ext3/ioctl.c
> --- linux-2.6.24-rc8/fs/ext3/ioctl.c 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/fs/ext3/ioctl.c 2008-01-22 18:20:33.000000000 +0900
> @@ -254,6 +254,42 @@ flags_err:
> return err;
> }
>
> + case EXT3_IOC_FREEZE: {
> + long timeout_sec;
> + long timeout_msec;
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> + if (inode->i_sb->s_frozen != SB_UNFROZEN)

> + return -EINVAL
WOW, extending the timeout is not supported!?
So you want to say that the caller has to set the timer to the maximal possible
timeout from the very beginning.
IMHO it is better to use a heartbeat timer approach, for example:
each second the caller extends its timeout for two seconds. With this approach,
even if the caller is killed for any reason, its timeout will expire within
two seconds.

if (inode->i_sb->s_frozen == SB_FROZEN)
/* extending timeout */
......


> + /* arg(sec) to tick value */
> + get_user(timeout_sec, (long __user *) arg);
> + timeout_msec = timeout_sec * 1000;
> + if (timeout_msec < 0)
> + return -EINVAL;
> +
> + /* Freeze */
> + freeze_bdev(inode->i_sb->s_bdev);
> +
> + /* set up unfreeze timer */
> + if (timeout_msec > 0)
> + ext3_add_freeze_timeout(EXT3_SB(inode->i_sb),
> + timeout_msec);
> + return 0;
> + }
> + case EXT3_IOC_THAW: {
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> + if (inode->i_sb->s_frozen == SB_UNFROZEN)
> + return -EINVAL;
> +
> + /* delete unfreeze timer */
> + ext3_del_freeze_timeout(EXT3_SB(inode->i_sb));
> +
> + /* Unfreeze */
> + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
> +
> + return 0;
> + }
>
> default:
> return -ENOTTY;
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/ext3/super.c linux-2.6.24-rc8-freeze/fs/ext3/super.c
> --- linux-2.6.24-rc8/fs/ext3/super.c 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/fs/ext3/super.c 2008-01-22 18:20:33.000000000 +0900
> @@ -63,6 +63,7 @@ static int ext3_statfs (struct dentry *
> static void ext3_unlockfs(struct super_block *sb);
> static void ext3_write_super (struct super_block * sb);
> static void ext3_write_super_lockfs(struct super_block *sb);
> +static void ext3_freeze_timeout(struct work_struct *work);
>
> /*
> * Wrappers for journal_start/end.
> @@ -323,6 +324,44 @@ void ext3_update_dynamic_rev(struct supe
> }
>
> /*
> + * ext3_add_freeze_timeout - Add timeout for ext3 freeze.
> + *
> + * @sbi : ext3 super block
> + * @timeout_msec : timeout period
> + *
> + * Add the delayed work for ext3 freeze timeout
> + * to the delayed work queue.
> + */
> +void ext3_add_freeze_timeout(struct ext3_sb_info *sbi,
> + long timeout_msec)
> +{
> + s64 timeout_jiffies = msecs_to_jiffies(timeout_msec);
> +
> + /*
> + * setup freeze timeout function
> + */
> + INIT_DELAYED_WORK(&sbi->s_freeze_timeout, ext3_freeze_timeout);
> +
> + /* set delayed work queue */
> + cancel_delayed_work(&sbi->s_freeze_timeout);
> + schedule_delayed_work(&sbi->s_freeze_timeout, timeout_jiffies);
> +}
> +
> +/*
> + * ext3_del_freeze_timeout - Delete timeout for ext3 freeze.
> + *
> + * @sbi : ext3 super block
> + *
> + * Delete the delayed work for ext3 freeze timeout
> + * from the delayed work queue.
> + */
> +void ext3_del_freeze_timeout(struct ext3_sb_info *sbi)
> +{
> + if (delayed_work_pending(&sbi->s_freeze_timeout))
> + cancel_delayed_work(&sbi->s_freeze_timeout);
> +}
> +
> +/*
> * Open the external journal device
> */
> static struct block_device *ext3_blkdev_get(dev_t dev)
> @@ -2367,10 +2406,31 @@ static void ext3_unlockfs(struct super_b
> EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
> ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
> unlock_super(sb);
> - journal_unlock_updates(EXT3_SB(sb)->s_journal);
> + journal_unlock_updates_if_needed(EXT3_SB(sb)->s_journal);
> }
> }
>
> +/*
> + * ext3_freeze_timeout - Thaw the filesystem.
> + *
> + * @work : work queue (delayed_work.work)
> + *
> + * Called by the delayed work when elapsing the timeout period.
> + * Thaw the filesystem.
> + */
> +static void ext3_freeze_timeout(struct work_struct *work)
> +{
> + struct ext3_sb_info *sbi = container_of(work,
> + struct ext3_sb_info,
> + s_freeze_timeout.work);
> + struct super_block *sb = get_super_block(sbi);
> +
> + BUG_ON(sb == NULL);
> +
> + if (sb->s_frozen != SB_UNFROZEN)
> + thaw_bdev(sb->s_bdev, sb);
> +}
> +
> static int ext3_remount (struct super_block * sb, int * flags, char * data)
> {
> struct ext3_super_block * es;
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/jbd/journal.c linux-2.6.24-rc8-freeze/fs/jbd/journal.c
> --- linux-2.6.24-rc8/fs/jbd/journal.c 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/fs/jbd/journal.c 2008-01-22 18:20:33.000000000 +0900
> @@ -46,6 +46,7 @@ EXPORT_SYMBOL(journal_extend);
> EXPORT_SYMBOL(journal_stop);
> EXPORT_SYMBOL(journal_lock_updates);
> EXPORT_SYMBOL(journal_unlock_updates);
> +EXPORT_SYMBOL(journal_unlock_updates_if_needed);
> EXPORT_SYMBOL(journal_get_write_access);
> EXPORT_SYMBOL(journal_get_create_access);
> EXPORT_SYMBOL(journal_get_undo_access);
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/jbd/transaction.c linux-2.6.24-rc8-freeze/fs/jbd/transaction.c
> --- linux-2.6.24-rc8/fs/jbd/transaction.c 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/fs/jbd/transaction.c 2008-01-22 18:20:33.000000000 +0900
> @@ -485,6 +485,29 @@ void journal_unlock_updates (journal_t *
> wake_up(&journal->j_wait_transaction_locked);
> }
>
> +/**
> + * journal_unlock_updates_if_needed - release barrier if needed.
> + *
> + * @journal: Journal to release the barrier on.
> + *
> + * Release a transaction barrier obtained if barrier count is not 0.
> + * Should be called without the journal lock held.
> + */
> +void journal_unlock_updates_if_needed(journal_t *journal)
> +{
> + spin_lock(&journal->j_state_lock);
> +
> + if (!journal->j_barrier_count) {
> + spin_unlock(&journal->j_state_lock);
> + return;
> + }
> +
> + --journal->j_barrier_count;
> + spin_unlock(&journal->j_state_lock);
> + mutex_unlock(&journal->j_barrier);
> + wake_up(&journal->j_wait_transaction_locked);
> +}
> +
> /*
> * Report any unexpected dirty buffers which turn up. Normally those
> * indicate an error, but they can occur if the user is running (say)
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/fs/super.c linux-2.6.24-rc8-freeze/fs/super.c
> --- linux-2.6.24-rc8/fs/super.c 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/fs/super.c 2008-01-22 18:20:33.000000000 +0900
> @@ -950,3 +950,30 @@ struct vfsmount *kern_mount_data(struct
> }
>
> EXPORT_SYMBOL_GPL(kern_mount_data);
> +
> +/**
> + * get_super_block - get super_block
> + * @s_fs_info : filesystem dependent information
> + * (super_block.s_fs_info)
> + *
> + * Get super_block which holds s_fs_info from super_blocks.
> + * get_super_block() returns a pointer of super block or
> + * %NULL if it have failed.
> + */
> +struct super_block *get_super_block(void *s_fs_info)
> +{
> + struct super_block *sb;
> +
> + spin_lock(&sb_lock);
> + sb = sb_entry(super_blocks.prev);
> + for (; sb != sb_entry(&super_blocks);
> + sb = sb_entry(sb->s_list.prev)) {
> + if (sb->s_fs_info == s_fs_info) {
> + spin_unlock(&sb_lock);
> + return sb;
> + }
> + }
> + spin_unlock(&sb_lock);
> + return NULL;
> +}
> +EXPORT_SYMBOL_GPL(get_super_block);
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/ext3_fs.h linux-2.6.24-rc8-freeze/include/linux/ext3_fs.h
> --- linux-2.6.24-rc8/include/linux/ext3_fs.h 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/include/linux/ext3_fs.h 2008-01-22 18:20:33.000000000 +0900
> @@ -225,6 +225,8 @@ struct ext3_new_group_data {
> #endif
> #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
> #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
> +#define EXT3_IOC_FREEZE _IOW('f', 9, long)
> +#define EXT3_IOC_THAW _IOW('f', 10, long)
>
> /*
> * ioctl commands in 32 bit emulation
> @@ -864,6 +866,9 @@ extern void ext3_abort (struct super_blo
> extern void ext3_warning (struct super_block *, const char *, const char *, ...)
> __attribute__ ((format (printf, 3, 4)));
> extern void ext3_update_dynamic_rev (struct super_block *sb);
> +extern void ext3_add_freeze_timeout(struct ext3_sb_info *sbi,
> + long timeout_msec);
> +extern void ext3_del_freeze_timeout(struct ext3_sb_info *sbi);
>
> #define ext3_std_error(sb, errno) \
> do { \
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/ext3_fs_sb.h linux-2.6.24-rc8-freeze/include/linux/ext3_fs_sb.h
> --- linux-2.6.24-rc8/include/linux/ext3_fs_sb.h 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/include/linux/ext3_fs_sb.h 2008-01-22 18:20:33.000000000 +0900
> @@ -81,6 +81,8 @@ struct ext3_sb_info {
> char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
> int s_jquota_fmt; /* Format of quota to use */
> #endif
> + /* Delayed work for freeze */
> + struct delayed_work s_freeze_timeout;
> };
>
> #endif /* _LINUX_EXT3_FS_SB */
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/fs.h linux-2.6.24-rc8-freeze/include/linux/fs.h
> --- linux-2.6.24-rc8/include/linux/fs.h 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/include/linux/fs.h 2008-01-22 18:20:33.000000000 +0900
> @@ -2095,6 +2095,7 @@ struct ctl_table;
> int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
> void __user *buffer, size_t *lenp, loff_t *ppos);
>
> +extern struct super_block *get_super_block(void *s_fs_info);
>
> #endif /* __KERNEL__ */
> #endif /* _LINUX_FS_H */
> diff -uprN -X linux-2.6.24-rc8/Documentation/dontdiff linux-2.6.24-rc8/include/linux/jbd.h linux-2.6.24-rc8-freeze/include/linux/jbd.h
> --- linux-2.6.24-rc8/include/linux/jbd.h 2008-01-16 13:22:48.000000000 +0900
> +++ linux-2.6.24-rc8-freeze/include/linux/jbd.h 2008-01-22 18:20:33.000000000 +0900
> @@ -905,6 +905,7 @@ extern int journal_stop(handle_t *);
> extern int journal_flush (journal_t *);
> extern void journal_lock_updates (journal_t *);
> extern void journal_unlock_updates (journal_t *);
> +extern void journal_unlock_updates_if_needed(journal_t *);
>
> extern journal_t * journal_init_dev(struct block_device *bdev,
> struct block_device *fs_dev,
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2008-01-25 12:44:34

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

> I am also wondering whether we should have system call(s) for these:
>
> On Jan 25, 2008 12:59 PM, Takashi Sato <[email protected]> wrote:
>> + case EXT3_IOC_FREEZE: {
>
>> + case EXT3_IOC_THAW: {
>
> And just convert XFS to use them too?

I think it is reasonable to implement it as a generic system call, as you said.
Do the XFS folks think so?

Cheers, Takashi

2008-01-25 13:33:37

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Jan 25, 2008 at 03:18:51PM +0300, Dmitri Monakhov wrote:
> First of all Linux already have at least one open-source(dm-snap),
> and several commercial snapshot solutions.

Yes, but it requires that the filesystem be stored under LVM. Unlike
what EVMS v1 allowed us to do, we can't currently take a snapshot of a
bare block device. This patch could potentially be useful for systems
which aren't using LVM, however....

> You have to realize what delay between 1-3 stages have to be minimal.
> for example dm-snap perform it only for explicit journal flushing.
> From my experience if delay is more than 4-5 seconds whole system becomes
> unstable.

That's the problem. You can't afford to freeze for very long.

What you *could* do is to start putting processes to sleep if they
attempt to write to the frozen filesystem, and then detect the
deadlock case where the process holding the file descriptor used to
freeze the filesystem gets frozen because it attempted to write to the
filesystem --- at which point it gets some kind of signal (which
defaults to killing the process), and the filesystem is unfrozen and
as part of the unfreeze you wake up all of the processes that were put
to sleep for touching the frozen filesystem.

The other approach would be to say, "oh well, the freeze ioctl is
inherently dangerous, and root is allowed to shoot himself in the foot, so
who cares". :-)

But it was this concern which is why ext3 never exported freeze
functionality to userspace, even though other commercial filesystems
do support this. It wasn't that it wasn't considered, but the concern
about whether or not it was sufficiently safe to make available.

And I do agree that we probably should just implement this in
filesystem independent way, in which case all of the filesystems that
support this already have super_operations functions
write_super_lockfs() and unlockfs().

So if this is done using a new system call, there should be no
filesystem-specific changes needed, and all filesystems which support
those super_operations method functions would be able to provide this
functionality to the new system call.

- Ted

P.S. Oh yeah, it should be noted that freezing at the filesystem
layer does *not* guarantee that changes to the block device aren't
happening via mmap()'ed files. The LVM needs to freeze writes at the
block device level if it wants to guarantee a completely stable
snapshot image. So the proposed patch doesn't quite give you those
guarantees, if that was the intended goal.

2008-01-25 16:34:25

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Theodore Tso wrote:
> The other approach would be to say, "oh well, the freeze ioctl is
> inherently dangerous, and root is allowed to himself in the foot, so
> who cares". :-)

I tend to agree. Either you need your fs frozen, or not, and if you do,
be prepared for the consequences.

> But it was this concern which is why ext3 never exported freeze
> functionality to userspace, even though other commercial filesystems
> do support this. It wasn't that it wasn't considered, but the concern
> about whether or not it was sufficiently safe to make available.

What's the safety concern; that the admin will forget to unfreeze?

> And I do agree that we probably should just implement this in
> filesystem independent way, in which case all of the filesystems that
> support this already have super_operations functions
> write_super_lockfs() and unlockfs().

That's what I was thinking; can't the path to freeze_bdev just be
elevated out of dm-ioctl.c to fs/ioctl.c and exposed, such that any
filesystem which implements .write_super_lockfs can be frozen? This is
essentially what the xfs_freeze userspace does via
xfs_ioctl/XFS_IOC_FREEZE - which, AFAIK, isn't used much now that the
lvm hooks are in place.

I'm also not sure I see the point of the timeout in the original patch;
either you are done snapshotting and ready to unfreeze, or you're not;
1, or 2, or 3 seconds doesn't really matter. When you're done, you're
done, and you can only unfreeze then. Shouldn't this be done
programmatically, and not with some pre-determined timeout?

-Eric

2008-01-25 16:42:29

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Jan 25, 2008 at 10:34:25AM -0600, Eric Sandeen wrote:
> > But it was this concern which is why ext3 never exported freeze
> > functionality to userspace, even though other commercial filesystems
> > do support this. It wasn't that it wasn't considered, but the concern
> > about whether or not it was sufficiently safe to make available.
>
> What's the safety concern; that the admin will forget to unfreeze?

That the admin would manage to deadlock him/herself and wedge up the
whole system...

> I'm also not sure I see the point of the timeout in the original patch;
> either you are done snapshotting and ready to unfreeze, or you're not;
> 1, or 2, or 3 seconds doesn't really matter. When you're done, you're
> done, and you can only unfreeze then. Shouldn't this be done
> programmatically, and not with some pre-determined timeout?

This is only a guess, but I suspect it was a fail-safe in case the
admin did manage to deadlock him/herself.

I would think a better approach would be to make the filesystem
unfreeze if the file descriptor that was used to freeze the filesystem
is closed, and then have explicit deadlock detection that kills the
process doing the freeze, at which point the filesystem unlocks and
the system can recover.

- Ted

2008-01-26 05:17:51

by David Chinner

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Jan 25, 2008 at 09:42:30PM +0900, Takashi Sato wrote:
> >I am also wondering whether we should have system call(s) for these:
> >
> >On Jan 25, 2008 12:59 PM, Takashi Sato <[email protected]> wrote:
> >>+ case EXT3_IOC_FREEZE: {
> >
> >>+ case EXT3_IOC_THAW: {
> >
> >And just convert XFS to use them too?
>
> I think it is reasonable to implement it as the generic system call, as you
> said. Does XFS folks think so?

Sure.

Note that we can't immediately remove the XFS ioctls otherwise
we'd break userspace utilities that use them....

Cheers,

Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group

2008-01-26 05:35:26

by David Chinner

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Jan 25, 2008 at 07:59:38PM +0900, Takashi Sato wrote:
> The points of the implementation are followings.
> - Add calls of the freeze function (freeze_bdev) and
> the unfreeze function (thaw_bdev) in ext3_ioctl().
>
> - ext3_freeze_timeout() which calls the unfreeze function (thaw_bdev)
> is registered to the delayed work queue to unfreeze the filesystem
> automatically after the lapse of the specified time.

Seems like pointless complexity to me - what happens if a
timeout occurs while the filesystem is still freezing?

It's not uncommon for a freeze to take minutes if memory
is full of dirty data that needs to be flushed out, esp. if
dm-snap is doing COWs for every write issued....

> + case EXT3_IOC_FREEZE: {
....
> + if (inode->i_sb->s_frozen != SB_UNFROZEN)
> + return -EINVAL;
....
> + freeze_bdev(inode->i_sb->s_bdev);
....
> + case EXT3_IOC_THAW: {
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> + if (inode->i_sb->s_frozen == SB_UNFROZEN)
> + return -EINVAL;
.....
> + /* Unfreeze */
> + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);

That's inherently unsafe - you can have multiple unfreezes
running in parallel which seriously screws with the bdev semaphore
count that is used to lock the device due to doing multiple up()s
for every down.

Your timeout thingy guarantees that at some point you will get
multiple up()s occurring due to the timer firing racing with
a thaw ioctl.

If this interface is to be more widely exported, then it needs
a complete revamp of how the bdev is locked while it is frozen so
that there is no chance of a double up() ever occurring on the
bd_mount_sem due to racing thaws.....

Cheers,

Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group

2008-01-26 05:39:28

by David Chinner

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Sat, Jan 26, 2008 at 04:35:26PM +1100, David Chinner wrote:
> On Fri, Jan 25, 2008 at 07:59:38PM +0900, Takashi Sato wrote:
> > The points of the implementation are followings.
> > - Add calls of the freeze function (freeze_bdev) and
> > the unfreeze function (thaw_bdev) in ext3_ioctl().
> >
> > - ext3_freeze_timeout() which calls the unfreeze function (thaw_bdev)
> > is registered to the delayed work queue to unfreeze the filesystem
> > automatically after the lapse of the specified time.
>
> Seems like pointless complexity to me - what happens if a
> timeout occurs while the filsystem is still freezing?
>
> It's not uncommon for a freeze to take minutes if memory
> is full of dirty data that needs to be flushed out, esp. if
> dm-snap is doing COWs for every write issued....

Sorry, ignore this bit - I just realised the timer is set
up after the freeze has occurred....

Still, that makes it potentially dangerous to whatever is being
done while the filesystem is frozen....

Cheers,

Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group

2008-01-26 19:11:00

by [email protected]

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Jan 25, 2008 at 09:42:30PM +0900, Takashi Sato wrote:
> Hi,
>
> >I am also wondering whether we should have system call(s) for these:
> >
> >On Jan 25, 2008 12:59 PM, Takashi Sato <[email protected]> wrote:
> >>+ case EXT3_IOC_FREEZE: {
> >
> >>+ case EXT3_IOC_THAW: {
> >
> >And just convert XFS to use them too?
>
> I think it is reasonable to implement it as the generic system call, as you
> said.
> Does XFS folks think so?

Given that XFS has implemented the ioctls for such a long time it might
make more sense to simply move the ioctl implementation to fs/ioctl.c
so it applies to all filesystem. No need to add a new syscall when the
equivalent-functionality ioctls have to be supported forever anyway.

2008-01-28 13:07:38

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

Thank you for your comments.

> That's inherently unsafe - you can have multiple unfreezes
> running in parallel which seriously screws with the bdev semaphore
> count that is used to lock the device due to doing multiple up()s
> for every down.
>
> Your timeout thingy guarantee that at some point you will get
> multiple up()s occuring due to the timer firing racing with
> a thaw ioctl.
>
> If this interface is to be more widely exported, then it needs
> a complete revamp of the bdev is locked while it is frozen so
> that there is no chance of a double up() ever occuring on the
> bd_mount_sem due to racing thaws.....

My patch has the race condition as you said.
I will fix it.

Cheers, Takashi


2008-01-28 13:13:05

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

> What you *could* do is to start putting processes to sleep if they
> attempt to write to the frozen filesystem, and then detect the
> deadlock case where the process holding the file descriptor used to
> freeze the filesystem gets frozen because it attempted to write to the
> filesystem --- at which point it gets some kind of signal (which
> defaults to killing the process), and the filesystem is unfrozen and
> as part of the unfreeze you wake up all of the processes that were put
> to sleep for touching the frozen filesystem.

I don't think close() usually writes to the journal, so I don't see how
the deadlock would occur. Is there a special case in which close() writes
to the journal when the process receives a signal?

Cheers, Takashi


2008-01-31 08:53:16

by Daniel Phillips

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Friday 25 January 2008 05:33, Theodore Tso wrote:
> and then detect the
> deadlock case where the process holding the file descriptor used to
> freeze the filesystem gets frozen because it attempted to write to the
> filesystem --- at which point it gets some kind of signal (which
> defaults to killing the process), and the filesystem is unfrozen and
> as part of the unfreeze you wake up all of the processes that were put
> to sleep for touching the frozen filesystem.

Hi Ted,

There are a few holes:

* The process may try to handle the signal and end up blocking on
the filesystem again.

* The process might pass the fd to another process by forking or
fd passing.

* The process holding the fd might be trying to take a lock held
by another process that is blocked on the filesystem, and infinite
variations on that theme.

Remembering the task that did the ioctl might work out better than
remembering the fd. Or just not try to be so fancy and rely on the
application to take appropriate measures to ensure it will not access
the filesystem, such as memlocking and not execing.

The freezer also needs to run in PF_MEMALLOC mode or similar
unless it can be sure it will not cause pageout to the frozen filesystem
under low memory conditions.

Regards,

Daniel

2008-02-01 03:04:25

by Kazuto Miyoshi

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature


Sato-san,

> > What you *could* do is to start putting processes to sleep if they
> > attempt to write to the frozen filesystem, and then detect the
> > deadlock case where the process holding the file descriptor used to
> > freeze the filesystem gets frozen because it attempted to write to the
> > filesystem --- at which point it gets some kind of signal (which
> > defaults to killing the process), and the filesystem is unfrozen and
> > as part of the unfreeze you wake up all of the processes that were put
> > to sleep for touching the frozen filesystem.
>
> I don't think close() usually writes to journal and the deadlock occurs.
> Is there the special case which close() writes to journal in case of
> getting signal?

I am afraid that Ted-san is concerned about the fact that the freeze
program can touch the target filesystem by mistake (rather than just
close() and its journal behavior).

# cd /mnt
# freeze /mnt > ./logfile

This is more unclear to admins than "rm -rf /" case.
So we need to implement some bail-out mechanism as he pointed out,
such that if kernel noticed that the freezer is trying to touch
the target filesystem, kernel would kill the freezer and
automatically unfreeze the filesystem again.

Regards,

2008-02-02 13:52:07

by Pavel Machek

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri 2008-01-25 11:42:29, Theodore Tso wrote:
> On Fri, Jan 25, 2008 at 10:34:25AM -0600, Eric Sandeen wrote:
> > > But it was this concern which is why ext3 never exported freeze
> > > functionality to userspace, even though other commercial filesystems
> > > do support this. It wasn't that it wasn't considered, but the concern
> > > about whether or not it was sufficiently safe to make available.
> >
> > What's the safety concern; that the admin will forget to unfreeze?
>
> That the admin would manage to deadlock him/herself and wedge up the
> whole system...
>
> > I'm also not sure I see the point of the timeout in the original patch;
> > either you are done snapshotting and ready to unfreeze, or you're not;
> > 1, or 2, or 3 seconds doesn't really matter. When you're done, you're
> > done, and you can only unfreeze then. Shouldn't this be done
> > programmatically, and not with some pre-determined timeout?
>
> This is only a guess, but I suspect it was a fail-safe in case the
> admin did manage to deadlock him/herself.
>
> I would think a better approach would be to make the filesystem
> unfreeze if the file descriptor that was used to freeze the filesystem
> is closed, and then have explicit deadlock detection that kills the
> process doing the freeze, at which point the filesystem unlocks and
> the system can recover.

Hmm, not sure that works.

I have shell I used to freeze the ext3. Then it is pushed out by dirty
data waiting to be written to that ext3. Deadlock, with file
descriptor still open, and very hard to detect.

Ok, OOM killer will eventually hit the shell, close the fd and
unfreeze, but that is probably not what you want.

--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2008-02-07 01:06:59

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

> What you *could* do is to start putting processes to sleep if they
> attempt to write to the frozen filesystem, and then detect the
> deadlock case where the process holding the file descriptor used to
> freeze the filesystem gets frozen because it attempted to write to the
> filesystem --- at which point it gets some kind of signal (which
> defaults to killing the process), and the filesystem is unfrozen and
> as part of the unfreeze you wake up all of the processes that were put
> to sleep for touching the frozen filesystem.
>
> The other approach would be to say, "oh well, the freeze ioctl is
> inherently dangerous, and root is allowed to himself in the foot, so
> who cares". :-)

Currently the XFS freezer doesn't solve a deadlock automatically
and we rely on administrators for ensuring that the freezer will not
access the filesystem.
And even if the wrong freezer causes a deadlock, it can be solved
by other unfreeze process(unfreeze command).

So I don't think the freezer itself needs to solve the deadlock.
I think the timeout is effective for an unexpected deadlock
and the timeout extending feature is very useful
as Dmitri proposed.

Cheers, Takashi

2008-02-08 10:48:25

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

Ted wrote:
> And I do agree that we probably should just implement this in
> filesystem independent way, in which case all of the filesystems that
> support this already have super_operations functions
> write_super_lockfs() and unlockfs().
>
> So if this is done using a new system call, there should be no
> filesystem-specific changes needed, and all filesystems which support
> those super_operations method functions would be able to provide this
> functionality to the new system call.

OK I would like to implement the freeze feature on VFS
as the filesystem independent ioctl so that it can be
available on filesystems that have already had write_super_lockfs()
and unlockfs().
The usage for the freeze ioctl is the following.
int ioctl(int fd, int FIFREEZE, long *timeval);
fd:file descriptor of mountpoint
FIFREEZE:request code for freeze
timeval:timeout period (second)

And the unfreeze ioctl is the following.
int ioctl(int fd, int FITHAW, NULL);
fd:file descriptor of mountpoint
FITHAW:request code for unfreeze

I think we need the timeout feature which thaws the filesystem
after lapse of specified time for a fail-safe in case the freezer
accesses the frozen filesystem and causes a deadlock.
I intend to implement the timeout feature on VFS.
(This is realized by registering the delayed work which calls
thaw_bdev() to the delayed work queue.)

Any comments are very welcome.

Cheers, Takashi

2008-02-08 13:27:12

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Feb 08, 2008 19:48 +0900, Takashi Sato wrote:
> OK I would like to implement the freeze feature on VFS
> as the filesystem independent ioctl so that it can be
> available on filesystems that have already had write_super_lockfs()
> and unlockfs().
> The usage for the freeze ioctl is the following.
> int ioctl(int fd, int FIFREEZE, long *timeval);
> fd:file descriptor of mountpoint
> FIFREEZE:request cord for freeze
> timeval:timeout period (second)
>
> And the unfreeze ioctl is the following.
> int ioctl(int fd, int FITHAW, NULL);
> fd:file descriptor of mountpoint
> FITHAW:Request cord for unfreeze

You may as well make the common ioctl the same as the XFS version,
both by number and parameters, so that applications which already
understand the XFS ioctl will work on other filesystems.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-02-08 14:59:11

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Feb 08, 2008 at 08:26:57AM -0500, Andreas Dilger wrote:
> You may as well make the common ioctl the same as the XFS version,
> both by number and parameters, so that applications which already
> understand the XFS ioctl will work on other filesystems.

Yes. In fact you should be able to lift the implementations of
XFS_IOC_FREEZE and XFS_IOC_THAW to generic code, there's nothing
XFS-specific in there.


2008-02-13 08:25:10

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

> P.S. Oh yeah, it should be noted that freezing at the filesystem
> layer does *not* guarantee that changes to the block device aren't
> happening via mmap()'ed files. The LVM needs to freeze writes the
> block device level if it wants to guarantee a completely stable
> snapshot image. So the proposed patch doens't quite give you those
> guarantees, if that was the intended goal.

I don't think a mmap()'ed file is written to a block device while a filesystem
is frozen. pdflush starts the writing procedure of the mmap()'ed file's
data and calls ext3_ordered_writepage. ext3_ordered_writepage calls
ext3_journal_start to get the journal handle. As a result, the process
waits for unfreeze in start_this_handle.
pdflush
: :
ext3_ordered_writepage
ext3_journal_start
ext3_journal_start_sb
journal_start
start_this_handle <--- wait here

I actually tried freezing the filesystem after updating the mmap()'ed
file's data. But, the writing to the block device didn't happen.
(It happened right after unfreeze.)

I don't think the freeze feature on the block device level is needed
because the writing for the mmap()'ed file is suspended on
the frozen filesystem.

Any comments are very welcome.

Cheers, Takashi

2008-02-15 11:51:59

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Hi,

Christoph Hellwig wrote:
> On Fri, Feb 08, 2008 at 08:26:57AM -0500, Andreas Dilger wrote:
>> You may as well make the common ioctl the same as the XFS version,
>> both by number and parameters, so that applications which already
>> understand the XFS ioctl will work on other filesystems.
>
> Yes. In facy you should be able to lift the implementations of
> XFS_IOC_FREEZE and XFS_IOC_THAW to generic code, there's nothing
> XFS-specific in there.

According to Documentation/ioctl-number.txt,
XFS_IOC_XXXs (_IOWR('X', aa, bb)) are defined for XFS like below.
From Documentation/ioctl-number.txt:
----------------------------------------------------------------------------
Code Seq# Include File Comments
========================================================
: :
'X' all linux/xfs_fs.h
----------------------------------------------------------------------------
So XFS_IOC_FREEZE and XFS_IOC_THAW cannot be lifted to generic code simply.
I think we should create new generic numbers for freeze and thaw
like FIBMAP as followings.
linux/fs.h:
#define FIFREEZE _IO(0x00,3)
#define FITHAW _IO(0x00,4)

And xfs_freeze calls XFS_IOC_FREEZE with a magic number 1, but what is 1?
Instead, I'd like the sec to timeout on freeze API in order to thaw
the filesystem automatically. It can prevent a filesystem from staying
frozen forever.
(Because a freezer may cause a deadlock by accessing the frozen filesystem.)

Any comments are very welcome.

Cheers, Takashi

2008-02-15 14:24:24

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

Takashi Sato wrote:
> Hi,
>
> Christoph Hellwig wrote:
>> On Fri, Feb 08, 2008 at 08:26:57AM -0500, Andreas Dilger wrote:
>>> You may as well make the common ioctl the same as the XFS version,
>>> both by number and parameters, so that applications which already
>>> understand the XFS ioctl will work on other filesystems.
>> Yes. In facy you should be able to lift the implementations of
>> XFS_IOC_FREEZE and XFS_IOC_THAW to generic code, there's nothing
>> XFS-specific in there.
>
> According to Documentation/ioctl-number.txt,
> XFS_IOC_XXXs (_IOWR('X', aa, bb)) are defined for XFS like below.
> From Documentation/ioctl-number.txt:
> ----------------------------------------------------------------------------
> Code Seq# Include File Comments
> ========================================================
> : :
> 'X' all linux/xfs_fs.h
> ----------------------------------------------------------------------------

It also says:

'f' 00-1F linux/ext2_fs.h

and yet include/linux/fs.h has:

#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)

as generic vfs ioctls. These ioctls started out as
EXT2_IOC_SETFLAGS/EXT2_IOC_GETFLAGS but they were generically useful,
other filesystems picked them up, and they were "elevated" to the vfs.

> So XFS_IOC_FREEZE and XFS_IOC_THAW cannot be lifted to generic code simply.

It would be a simple matter of changing the documentation, I think.

> I think we should create new generic numbers for freeze and thaw
> like FIBMAP as followings.
> linux/fs.h:
> #define FIFREEZE _IO(0x00,3)
> #define FITHAW _IO(0x00,4)
>
> And xfs_freeze calls XFS_IOC_FREEZE with a magic number 1, but what is 1?

Looks like it's called "level" but it's probably a holdover, it doesn't
look like it's used.

> Instead, I'd like the sec to timeout on freeze API in order to thaw
> the filesystem automatically. It can prevent a filesystem from staying
> frozen forever.
> (Because a freezer may cause a deadlock by accessing the frozen filesystem.)

I'm still not very comfortable with the timeout; if you un-freeze on a
timer, how do you know that the work for which you needed the filesystem
frozen is complete? How would you know if your snapshot was good if
there's a possibility that the fs unfroze while it was being taken?

Thanks,
-Eric

> Any comments are very welcome.
>
> Cheers, Takashi
>

2008-02-16 13:25:07

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature

On Fri, Feb 15, 2008 at 08:51:15PM +0900, Takashi Sato wrote:
> So XFS_IOC_FREEZE and XFS_IOC_THAW cannot be lifted to generic code simply.
> I think we should create new generic numbers for freeze and thaw

Actually we've lifted specific ioctls to the generic layer before all
the time in drivers. That's the only way to make functionality that was
specific to a single driver (or in this case filesystem) generic. If
the numbering issue confuses you, make sure to add a big comment
describing it.

> And xfs_freeze calls XFS_IOC_FREEZE with a magic number 1, but what is 1?

As Eric said it's ignored.

> Instead, I'd like the sec to timeout on freeze API in order to thaw
> the filesystem automatically. It can prevent a filesystem from staying
> frozen forever.
> (Because a freezer may cause a deadlock by accessing the frozen filesystem.)

Timeout based locking is generally a horrible idea, there's a reason
we don't have any primitives for that in the kernel :)

2008-02-19 11:28:20

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature


Hi,

>#define FS_IOC_GETFLAGS _IOR('f', 1, long)
>#define FS_IOC_SETFLAGS _IOW('f', 2, long)
>
>as generic vfs ioctls. These ioctls started out as
>EXT2_IOC_SETFLAGS/EXT2_IOC_GETFLAGS but they were generically useful,
>other filesystems picked them up, and they were "elevated" to the vfs.

Thank you for good information.
I will elevate XFS_IOC_FREEZE and XFS_IOC_THAW to the VFS.

>> And xfs_freeze calls XFS_IOC_FREEZE with a magic number 1, but what is 1?
>
>Looks like it's called "level" but it's probably a holdover, it doesn't
>look like it's used.

I see.

>> Instead, I'd like the sec to timeout on freeze API in order to thaw
>> the filesystem automatically. It can prevent a filesystem from staying
>> frozen forever.
>> (Because a freezer may cause a deadlock by accessing the frozen filesystem.)
>
>I'm still not very comfortable with the timeout; if you un-freeze on a
>timer, how do you know that the work for which you needed the fileystem
>frozen is complete? How would you know if your snapshot was good if
>there's a possibility that the fs unfroze while it was being taken?

My following freeze ioctl never performs the timeout when 0 is specified
as timeval. So, existent applications which don't expect the timeout
can stay frozen with 0.
int ioctl(int fd, int FIFREEZE, long *timeval);
fd:file descriptor of mountpoint
FIFREEZE:request code for freeze
timeval:timeout period (second)

And how about adding the new ioctl to reset the timeval like below?
(Dmitri proposed this idea before.)
int ioctl(int fd, int FIFREEZE_RESET_TIMEOUT, long *timeval);
fd:file descriptor of mountpoint
FIFREEZE_RESET_TIMEOUT:request code for reset of timeout period
timeval:new timeout period
This is useful for the application to set the timeval more accurately.
For example, the freezer resets the timeval to 10 seconds every 5
seconds. In this approach, even if the freezer causes a deadlock
by accessing the frozen filesystem, it will be solved by the timeout
in 10 seconds and the freezer can recognize that at the next reset
of timeval.

Any comments are very welcome.

Cheers, Takashi

2008-02-26 08:20:20

by Takashi Sato

[permalink] [raw]
Subject: [RFC] ext3 freeze feature ver 0.2

Hi,

Takashi Sato wrote:
>>> Instead, I'd like the sec to timeout on freeze API in order to thaw
>>> the filesystem automatically. It can prevent a filesystem from staying
>>> frozen forever.
>>> (Because a freezer may cause a deadlock by accessing the frozen filesystem.)
>>
>>I'm still not very comfortable with the timeout; if you un-freeze on a
>>timer, how do you know that the work for which you needed the fileystem
>>frozen is complete? How would you know if your snapshot was good if
>>there's a possibility that the fs unfroze while it was being taken?
>
>And how about adding the new ioctl to reset the timeval like below?
>(Dmitri proposed this idea before.)
> int ioctl(int fd, int FIFREEZE_RESET_TIMEOUT, long *timeval);
> fd:file descriptor of mountpoint
> FIFREEZE_RESET_TIMEOUT:request code for reset of timeout period
> timeval:new timeout period
>This is useful for the application to set the timeval more accurately.
>For example, the freezer resets the timeval to 10 seconds every 5
>seconds. In this approach, even if the freezer causes a deadlock
>by accessing the frozen filesystem, it will be solved by the timeout
>in 10 seconds and the freezer can recognize that at the next reset
>of timeval.

I have improved the following two points in my ext3 freeze feature.
o Add the new ioctl to reset the timeout period as above
The usage is as below.
int ioctl(int fd, int FIFREEZE_RESET_TIMEOUT, long *timeval);
fd:file descriptor of mountpoint
FIFREEZE_RESET_TIMEOUT:request code for reset of timeout period
timeval:new timeout period
Return value: 0 if the operation succeeds. Otherwise, -1
Error number: If the filesystem has already been unfrozen,
it sets EINVAL to errno.
I have made sure the following two results with this ioctl.
- After the deadlock occurred by accessing the frozen filesystem,
it could be solved by the reset timeout.
- And the freezer could recognize that from the error number (EINVAL)
at the next reset of timeval.

o Elevate XFS ioctl numbers (XFS_IOC_FREEZE and XFS_IOC_THAW) to the VFS
As Andreas Dilger and Christoph Hellwig advised me, I have elevated
them to include/linux/fs.h as below.
#define FIFREEZE _IOWR('X', 119, int)
#define FITHAW _IOWR('X', 120, int)
The ioctl numbers used by XFS applications don't need to be changed.
But my following ioctl for the freeze needs the parameter
as the timeout period. So if XFS applications don't want the timeout
feature as the current implementation, the parameter needs to be
changed 1 (level?) into 0.

I haven't changed the following ioctls from the previous version.
int ioctl(int fd, int cmd, long *timeval)
fd: The file descriptor of the mountpoint
cmd: FIFREEZE for the freeze or FITHAW for the unfreeze
timeval: The timeout value expressed in seconds
If it's 0, the timeout isn't set.
Return value: 0 if the operation succeeds. Otherwise, -1

Any comments are very welcome.

Cheers, Takashi

Signed-off-by: Takashi Sato <[email protected]>
---
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/drivers/md/dm.c linux-2.6.25-rc3-freeze/drivers/md/dm.c
--- linux-2.6.25-rc3.org/drivers/md/dm.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/drivers/md/dm.c 2008-02-25 10:50:04.000000000 +0900
@@ -1407,7 +1407,7 @@ static int lock_fs(struct mapped_device

WARN_ON(md->frozen_sb);

- md->frozen_sb = freeze_bdev(md->suspended_bdev);
+ md->frozen_sb = freeze_bdev(md->suspended_bdev, 0);
if (IS_ERR(md->frozen_sb)) {
r = PTR_ERR(md->frozen_sb);
md->frozen_sb = NULL;
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/block_dev.c linux-2.6.25-rc3-freeze/fs/block_dev.c
--- linux-2.6.25-rc3.org/fs/block_dev.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/block_dev.c 2008-02-25 10:50:04.000000000 +0900
@@ -284,6 +284,11 @@ static void init_once(struct kmem_cache
INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
inode_init_once(&ei->vfs_inode);
+
+ /* Initialize semaphore for freeze. */
+ sema_init(&bdev->bd_freeze_sem, 1);
+ /* Setup freeze timeout function. */
+ INIT_DELAYED_WORK(&bdev->bd_freeze_timeout, freeze_timeout);
}

static inline void __bd_forget(struct inode *inode)
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/buffer.c linux-2.6.25-rc3-freeze/fs/buffer.c
--- linux-2.6.25-rc3.org/fs/buffer.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/buffer.c 2008-02-25 10:50:04.000000000 +0900
@@ -190,17 +190,33 @@ int fsync_bdev(struct block_device *bdev

/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
- * @bdev: blockdevice to lock
+ * @bdev: blockdevice to lock
+ * @timeout_msec: timeout period
*
* This takes the block device bd_mount_sem to make sure no new mounts
* happen on bdev until thaw_bdev() is called.
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
+ * If timeout_msec is bigger than 0, this registers the delayed work for
+ * timeout of the freeze feature.
*/
-struct super_block *freeze_bdev(struct block_device *bdev)
+struct super_block *freeze_bdev(struct block_device *bdev, long timeout_msec)
{
struct super_block *sb;

+ down(&bdev->bd_freeze_sem);
+ sb = get_super_without_lock(bdev);
+
+ /* If super_block has been already frozen, return. */
+ if (sb && sb->s_frozen != SB_UNFROZEN) {
+ put_super(sb);
+ up(&bdev->bd_freeze_sem);
+ return sb;
+ }
+
+ if (sb)
+ put_super(sb);
+
down(&bdev->bd_mount_sem);
sb = get_super(bdev);
if (sb && !(sb->s_flags & MS_RDONLY)) {
@@ -219,6 +235,13 @@ struct super_block *freeze_bdev(struct b
}

sync_blockdev(bdev);
+
+ /* Setup unfreeze timer. */
+ if (timeout_msec > 0)
+ add_freeze_timeout(bdev, timeout_msec);
+
+ up(&bdev->bd_freeze_sem);
+
return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);
@@ -232,6 +255,16 @@ EXPORT_SYMBOL(freeze_bdev);
*/
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
+ down(&bdev->bd_freeze_sem);
+
+ if (sb && sb->s_frozen == SB_UNFROZEN) {
+ up(&bdev->bd_freeze_sem);
+ return;
+ }
+
+ /* Delete unfreeze timer. */
+ del_freeze_timeout(bdev);
+
if (sb) {
BUG_ON(sb->s_bdev != bdev);

@@ -244,6 +277,8 @@ void thaw_bdev(struct block_device *bdev
}

up(&bdev->bd_mount_sem);
+
+ up(&bdev->bd_freeze_sem);
}
EXPORT_SYMBOL(thaw_bdev);

diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/ioctl.c linux-2.6.25-rc3-freeze/fs/ioctl.c
--- linux-2.6.25-rc3.org/fs/ioctl.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/ioctl.c 2008-02-26 16:40:34.000000000 +0900
@@ -13,6 +13,7 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/buffer_head.h>

#include <asm/ioctls.h>

@@ -181,6 +182,95 @@ int do_vfs_ioctl(struct file *filp, unsi
} else
error = -ENOTTY;
break;
+
+ case FIFREEZE: {
+ long timeout_sec;
+ long timeout_msec;
+ struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ error = -EPERM;
+ break;
+ }
+
+ /* If filesystem doesn't support freeze feature, return. */
+ if (sb->s_op->write_super_lockfs == NULL) {
+ error = -EINVAL;
+ break;
+ }
+
+ /* arg(sec) to tick value. */
+ error = get_user(timeout_sec, (long __user *) arg);
+ if (error != 0)
+ break;
+ timeout_msec = timeout_sec * 1000;
+ if (timeout_msec < 0) {
+ error = -EINVAL;
+ break;
+ }
+
+ /* Freeze. */
+ freeze_bdev(sb->s_bdev, timeout_msec);
+
+ break;
+ }
+
+ case FITHAW: {
+ struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ error = -EPERM;
+ break;
+ }
+
+ /* If filesystem doesn't support freeze feature, return. */
+ if (sb->s_op->unlockfs == NULL) {
+ error = -EINVAL;
+ break;
+ }
+
+ /* Thaw. */
+ thaw_bdev(sb->s_bdev, sb);
+ break;
+ }
+
+ case FIFREEZE_RESET_TIMEOUT: {
+ long timeout_sec;
+ long timeout_msec;
+ struct super_block *sb
+ = filp->f_path.dentry->d_inode->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ error = -EPERM;
+ break;
+ }
+
+ /* arg(sec) to tick value */
+ error = get_user(timeout_sec, (long __user *) arg);
+ if (error)
+ break;
+ timeout_msec = timeout_sec * 1000;
+ if (timeout_msec < 0) {
+ error = -EINVAL;
+ break;
+ }
+
+ if (sb) {
+ down(&sb->s_bdev->bd_freeze_sem);
+ if (sb->s_frozen == SB_UNFROZEN) {
+ up(&sb->s_bdev->bd_freeze_sem);
+ error = -EINVAL;
+ break;
+ }
+ /* setup unfreeze timer */
+ if (timeout_msec > 0)
+ add_freeze_timeout(sb->s_bdev,
+ timeout_msec);
+ up(&sb->s_bdev->bd_freeze_sem);
+ }
+ break;
+ }
+
default:
if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/super.c linux-2.6.25-rc3-freeze/fs/super.c
--- linux-2.6.25-rc3.org/fs/super.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/super.c 2008-02-25 10:50:04.000000000 +0900
@@ -154,7 +154,7 @@ int __put_super_and_need_restart(struct
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -507,6 +507,36 @@ rescan:

EXPORT_SYMBOL(get_super);

+/*
+ * get_super_without_lock - Get super_block from block_device without lock.
+ * @bdev: block device struct
+ *
+ * Scan the superblock list and finds the superblock of the file system
+ * mounted on the block device given. This doesn't lock anyone.
+ * %NULL is returned if no match is found.
+ */
+struct super_block *get_super_without_lock(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ if (!bdev)
+ return NULL;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdev == bdev) {
+ if (sb->s_root) {
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ return sb;
+ }
+ }
+ }
+ spin_unlock(&sb_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(get_super_without_lock);
+
struct super_block * user_get_super(dev_t dev)
{
struct super_block *sb;
@@ -952,3 +982,56 @@ struct vfsmount *kern_mount_data(struct
}

EXPORT_SYMBOL_GPL(kern_mount_data);
+
+/*
+ * freeze_timeout - Thaw the filesystem.
+ *
+ * @work: work queue (delayed_work.work)
+ *
+ * Called by the delayed work when elapsing the timeout period.
+ * Thaw the filesystem.
+ */
+void freeze_timeout(struct work_struct *work)
+{
+ struct block_device *bd = container_of(work,
+ struct block_device, bd_freeze_timeout.work);
+
+ struct super_block *sb = get_super_without_lock(bd);
+
+ BUG_ON(sb == NULL);
+
+ thaw_bdev(bd, sb);
+
+ put_super(sb);
+}
+EXPORT_SYMBOL_GPL(freeze_timeout);
+
+/*
+ * add_freeze_timeout - Add timeout for freeze.
+ *
+ * @bdev: block device struct
+ * @timeout_msec: timeout period
+ *
+ * Add the delayed work for freeze timeout to the delayed work queue.
+ */
+void add_freeze_timeout(struct block_device *bdev, long timeout_msec)
+{
+ s64 timeout_jiffies = msecs_to_jiffies(timeout_msec);
+
+ /* Set delayed work queue */
+ cancel_delayed_work(&bdev->bd_freeze_timeout);
+ schedule_delayed_work(&bdev->bd_freeze_timeout, timeout_jiffies);
+}
+
+/*
+ * del_freeze_timeout - Delete timeout for freeze.
+ *
+ * @bdev: block device struct
+ *
+ * Delete the delayed work for freeze timeout from the delayed work queue.
+ */
+void del_freeze_timeout(struct block_device *bdev)
+{
+ if (delayed_work_pending(&bdev->bd_freeze_timeout))
+ cancel_delayed_work(&bdev->bd_freeze_timeout);
+}
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/xfs/linux-2.6/xfs_ioctl.c linux-2.6.25-rc3-freeze/fs/xfs/linux-2.6/xfs_ioctl.c
--- linux-2.6.25-rc3.org/fs/xfs/linux-2.6/xfs_ioctl.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/xfs/linux-2.6/xfs_ioctl.c 2008-02-25 10:50:04.000000000 +0900
@@ -911,7 +911,7 @@ xfs_ioctl(
return -EPERM;

if (inode->i_sb->s_frozen == SB_UNFROZEN)
- freeze_bdev(inode->i_sb->s_bdev);
+ freeze_bdev(inode->i_sb->s_bdev, 0);
return 0;

case XFS_IOC_THAW:
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/xfs/xfs_fsops.c linux-2.6.25-rc3-freeze/fs/xfs/xfs_fsops.c
--- linux-2.6.25-rc3.org/fs/xfs/xfs_fsops.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/xfs/xfs_fsops.c 2008-02-25 10:50:04.000000000 +0900
@@ -623,7 +623,7 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
+ struct super_block *sb = freeze_bdev(mp->m_super->s_bdev, 0);

if (sb && !IS_ERR(sb)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/include/linux/buffer_head.h linux-2.6.25-rc3-freeze/include/linux/buffer_head.h
--- linux-2.6.25-rc3.org/include/linux/buffer_head.h 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/include/linux/buffer_head.h 2008-02-25 10:50:04.000000000 +0900
@@ -170,7 +170,7 @@ int sync_blockdev(struct block_device *b
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
-struct super_block *freeze_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *, long timeout_msec);
void thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/include/linux/fs.h linux-2.6.25-rc3-freeze/include/linux/fs.h
--- linux-2.6.25-rc3.org/include/linux/fs.h 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/include/linux/fs.h 2008-02-25 10:50:04.000000000 +0900
@@ -8,6 +8,7 @@

#include <linux/limits.h>
#include <linux/ioctl.h>
+#include <linux/workqueue.h>

/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -223,6 +224,9 @@ extern int dir_notify_enable;
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
+#define FIFREEZE _IOWR('X', 119, int) /* Freeze */
+#define FITHAW _IOWR('X', 120, int) /* Thaw */
+#define FIFREEZE_RESET_TIMEOUT _IO(0x00, 3) /* Reset freeze timeout */

#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
@@ -548,6 +552,11 @@ struct block_device {
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
+
+ /* Delayed work for freeze */
+ struct delayed_work bd_freeze_timeout;
+ /* Semaphore for freeze */
+ struct semaphore bd_freeze_sem;
};

/*
@@ -1926,7 +1935,9 @@ extern int do_vfs_ioctl(struct file *fil
extern void get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
+extern void put_super(struct super_block *sb);
extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_super_without_lock(struct block_device *);
extern struct super_block *user_get_super(dev_t);
extern void drop_super(struct super_block *sb);

@@ -2070,7 +2081,6 @@ ssize_t simple_attr_read(struct file *fi
ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos);

-
#ifdef CONFIG_SECURITY
static inline char *alloc_secdata(void)
{
@@ -2097,5 +2107,9 @@ int proc_nr_files(struct ctl_table *tabl

int get_filesystem_list(char * buf);

+extern void add_freeze_timeout(struct block_device *bdev, long timeout_msec);
+extern void del_freeze_timeout(struct block_device *bdev);
+extern void freeze_timeout(struct work_struct *work);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */

2008-02-26 16:39:05

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature ver 0.2

Takashi Sato wrote:

> o Elevate XFS ioctl numbers (XFS_IOC_FREEZE and XFS_IOC_THAW) to the VFS
> As Andreas Dilger and Christoph Hellwig advised me, I have elevated
> them to include/linux/fs.h as below.
> #define FIFREEZE _IOWR('X', 119, int)
> #define FITHAW _IOWR('X', 120, int)
> The ioctl numbers used by XFS applications don't need to be changed.
> But my following ioctl for the freeze needs the parameter
> as the timeout period. So if XFS applications don't want the timeout
> feature as the current implementation, the parameter needs to be
> changed 1 (level?) into 0.

So, existing xfs applications calling the xfs ioctl now will behave
differently, right? We can only keep the same ioctl number if the
calling semantics are the same. Keeping the same number but changing
the semantics is harmful, IMHO....

-Eric

2008-02-26 17:08:19

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature ver 0.2

On Feb 26, 2008 08:39 -0800, Eric Sandeen wrote:
> Takashi Sato wrote:
>
> > o Elevate XFS ioctl numbers (XFS_IOC_FREEZE and XFS_IOC_THAW) to the VFS
> > As Andreas Dilger and Christoph Hellwig advised me, I have elevated
> > them to include/linux/fs.h as below.
> > #define FIFREEZE _IOWR('X', 119, int)
> > #define FITHAW _IOWR('X', 120, int)
> > The ioctl numbers used by XFS applications don't need to be changed.
> > But my following ioctl for the freeze needs the parameter
> > as the timeout period. So if XFS applications don't want the timeout
> > feature as the current implementation, the parameter needs to be
> > changed 1 (level?) into 0.
>
> So, existing xfs applications calling the xfs ioctl now will behave
> differently, right? We can only keep the same ioctl number if the
> calling semantics are the same. Keeping the same number but changing
> the semantics is harmful, IMHO....

Do we know what this parameter was supposed to mean?

We could special case "1" if needed to keep compatibility (documenting
this clearly), either making it == 0, or some very long timeout (1h
or whatever). A relatively minor wart I think.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-02-27 08:31:52

by Takashi Sato

[permalink] [raw]
Subject: Re: [RFC] ext3 freeze feature ver 0.2

Hi,

Andreas Dilger wrote:
>> > o Elevate XFS ioctl numbers (XFS_IOC_FREEZE and XFS_IOC_THAW) to the VFS
>> > As Andreas Dilger and Christoph Hellwig advised me, I have elevated
>> > them to include/linux/fs.h as below.
>> > #define FIFREEZE _IOWR('X', 119, int)
>> > #define FITHAW _IOWR('X', 120, int)
>> > The ioctl numbers used by XFS applications don't need to be changed.
>> > But my following ioctl for the freeze needs the parameter
>> > as the timeout period. So if XFS applications don't want the timeout
>> > feature as the current implementation, the parameter needs to be
>> > changed 1 (level?) into 0.
>>
>> So, existing xfs applications calling the xfs ioctl now will behave
>> differently, right? We can only keep the same ioctl number if the
>> calling semantics are the same. Keeping the same number but changing
>> the semantics is harmful, IMHO....
>
>Do we know what this parameter was supposed to mean?
>
>We could special case "1" if needed to keep compatibility (documenting
>this clearly), either making it == 0, or some very long timeout (1h
>or whatever). A relatively minor wart I think.

I agree.

Because the original xfs_freeze doesn't have the timeout feature,
I think my freeze ioctl should not set a timeout when 1 is specified
as the ioctl's parameter.
So I have modified my freeze ioctl so that it does not set the timeout
when 1 or 0 is specified, as shown below.
(I have attached the modified patch to this mail.)
int ioctl(int fd, int FIFREEZE, long *timeval)
fd: The file descriptor of the mountpoint
FIFREEZE: The request code for the freeze
timeval: The timeout value expressed in seconds
If it's 0 or 1, the timeout isn't set.
Return value: 0 if the operation succeeds. Otherwise, -1

I have tested my attached patch and confirmed that the original
xfs_freeze could freeze the filesystem without the timeout.

Any comments are very welcome.

I haven't changed the following ioctls from the previous version.
o Reset the timeout period
int ioctl(int fd, int FIFREEZE_RESET_TIMEOUT, long *timeval)
fd:file descriptor of mountpoint
FIFREEZE_RESET_TIMEOUT:request code for reset of timeout period
timeval:new timeout period
Return value: 0 if the operation succeeds. Otherwise, -1
Error number: If the filesystem has already been unfrozen,
it sets EINVAL to errno.

o Unfreeze the filesystem
int ioctl(int fd, int FITHAW, long *timeval)
fd: The file descriptor of the mountpoint
FITHAW: request code for unfreeze
timeval: Ignored
Return value: 0 if the operation succeeds. Otherwise, -1

Cheers, Takashi

Signed-off-by: Takashi Sato <[email protected]>
---
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/drivers/md/dm.c linux-2.6.25-rc3-freeze/drivers/md/dm.c
--- linux-2.6.25-rc3.org/drivers/md/dm.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/drivers/md/dm.c 2008-02-25 10:50:04.000000000 +0900
@@ -1407,7 +1407,7 @@ static int lock_fs(struct mapped_device

WARN_ON(md->frozen_sb);

- md->frozen_sb = freeze_bdev(md->suspended_bdev);
+ md->frozen_sb = freeze_bdev(md->suspended_bdev, 0);
if (IS_ERR(md->frozen_sb)) {
r = PTR_ERR(md->frozen_sb);
md->frozen_sb = NULL;
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/block_dev.c linux-2.6.25-rc3-freeze/fs/block_dev.c
--- linux-2.6.25-rc3.org/fs/block_dev.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/block_dev.c 2008-02-25 10:50:04.000000000 +0900
@@ -284,6 +284,11 @@ static void init_once(struct kmem_cache
INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
inode_init_once(&ei->vfs_inode);
+
+ /* Initialize semaphore for freeze. */
+ sema_init(&bdev->bd_freeze_sem, 1);
+ /* Setup freeze timeout function. */
+ INIT_DELAYED_WORK(&bdev->bd_freeze_timeout, freeze_timeout);
}

static inline void __bd_forget(struct inode *inode)
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/buffer.c linux-2.6.25-rc3-freeze/fs/buffer.c
--- linux-2.6.25-rc3.org/fs/buffer.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/buffer.c 2008-02-25 10:50:04.000000000 +0900
@@ -190,17 +190,33 @@ int fsync_bdev(struct block_device *bdev

/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
- * @bdev: blockdevice to lock
+ * @bdev: blockdevice to lock
+ * @timeout_msec: timeout period
*
* This takes the block device bd_mount_sem to make sure no new mounts
* happen on bdev until thaw_bdev() is called.
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
+ * If timeout_msec is bigger than 0, this registers the delayed work for
+ * timeout of the freeze feature.
*/
-struct super_block *freeze_bdev(struct block_device *bdev)
+struct super_block *freeze_bdev(struct block_device *bdev, long timeout_msec)
{
struct super_block *sb;

+ down(&bdev->bd_freeze_sem);
+ sb = get_super_without_lock(bdev);
+
+ /* If super_block has been already frozen, return. */
+ if (sb && sb->s_frozen != SB_UNFROZEN) {
+ put_super(sb);
+ up(&bdev->bd_freeze_sem);
+ return sb;
+ }
+
+ if (sb)
+ put_super(sb);
+
down(&bdev->bd_mount_sem);
sb = get_super(bdev);
if (sb && !(sb->s_flags & MS_RDONLY)) {
@@ -219,6 +235,13 @@ struct super_block *freeze_bdev(struct b
}

sync_blockdev(bdev);
+
+ /* Setup unfreeze timer. */
+ if (timeout_msec > 0)
+ add_freeze_timeout(bdev, timeout_msec);
+
+ up(&bdev->bd_freeze_sem);
+
return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);
@@ -232,6 +255,16 @@ EXPORT_SYMBOL(freeze_bdev);
*/
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
+ down(&bdev->bd_freeze_sem);
+
+ if (sb && sb->s_frozen == SB_UNFROZEN) {
+ up(&bdev->bd_freeze_sem);
+ return;
+ }
+
+ /* Delete unfreeze timer. */
+ del_freeze_timeout(bdev);
+
if (sb) {
BUG_ON(sb->s_bdev != bdev);

@@ -244,6 +277,8 @@ void thaw_bdev(struct block_device *bdev
}

up(&bdev->bd_mount_sem);
+
+ up(&bdev->bd_freeze_sem);
}
EXPORT_SYMBOL(thaw_bdev);

diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/ioctl.c linux-2.6.25-rc3-freeze/fs/ioctl.c
--- linux-2.6.25-rc3.org/fs/ioctl.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/ioctl.c 2008-02-27 10:30:30.000000000 +0900
@@ -13,6 +13,7 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/buffer_head.h>

#include <asm/ioctls.h>

@@ -181,6 +182,102 @@ int do_vfs_ioctl(struct file *filp, unsi
} else
error = -ENOTTY;
break;
+
+ case FIFREEZE: {
+ long timeout_sec;
+ long timeout_msec;
+ struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ error = -EPERM;
+ break;
+ }
+
+ /* If filesystem doesn't support freeze feature, return. */
+ if (sb->s_op->write_super_lockfs == NULL) {
+ error = -EINVAL;
+ break;
+ }
+
+ /* arg(sec) to tick value. */
+ error = get_user(timeout_sec, (long __user *) arg);
+ if (error != 0)
+ break;
+ /*
+ * If 1 is specified as the timeout period,
+ * it will be changed into 0 to keep the compatibility
+ * of XFS application(xfs_freeze).
+ */
+ if (timeout_sec < 0) {
+ error = -EINVAL;
+ break;
+ } else if (timeout_sec < 2) {
+ timeout_sec = 0;
+ }
+
+ timeout_msec = timeout_sec * 1000;
+ /* overflow case */
+ if (timeout_msec < 0) {
+ error = -EINVAL;
+ break;
+ }
+
+ /* Freeze. */
+ freeze_bdev(sb->s_bdev, timeout_msec);
+
+ break;
+ }
+
+ case FITHAW: {
+ struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ error = -EPERM;
+ break;
+ }
+
+ /* Thaw. */
+ thaw_bdev(sb->s_bdev, sb);
+ break;
+ }
+
+ case FIFREEZE_RESET_TIMEOUT: {
+ long timeout_sec;
+ long timeout_msec;
+ struct super_block *sb
+ = filp->f_path.dentry->d_inode->i_sb;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ error = -EPERM;
+ break;
+ }
+
+ /* arg(sec) to tick value */
+ error = get_user(timeout_sec, (long __user *) arg);
+ if (error)
+ break;
+ timeout_msec = timeout_sec * 1000;
+ if (timeout_msec < 0) {
+ error = -EINVAL;
+ break;
+ }
+
+ if (sb) {
+ down(&sb->s_bdev->bd_freeze_sem);
+ if (sb->s_frozen == SB_UNFROZEN) {
+ up(&sb->s_bdev->bd_freeze_sem);
+ error = -EINVAL;
+ break;
+ }
+ /* setup unfreeze timer */
+ if (timeout_msec > 0)
+ add_freeze_timeout(sb->s_bdev,
+ timeout_msec);
+ up(&sb->s_bdev->bd_freeze_sem);
+ }
+ break;
+ }
+
default:
if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/super.c linux-2.6.25-rc3-freeze/fs/super.c
--- linux-2.6.25-rc3.org/fs/super.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/super.c 2008-02-25 10:50:04.000000000 +0900
@@ -154,7 +154,7 @@ int __put_super_and_need_restart(struct
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -507,6 +507,36 @@ rescan:

EXPORT_SYMBOL(get_super);

+/*
+ * get_super_without_lock - Get super_block from block_device without lock.
+ * @bdev: block device struct
+ *
+ * Scan the superblock list and finds the superblock of the file system
+ * mounted on the block device given. This doesn't lock anyone.
+ * %NULL is returned if no match is found.
+ */
+struct super_block *get_super_without_lock(struct block_device *bdev)
+{
+ struct super_block *sb;
+
+ if (!bdev)
+ return NULL;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdev == bdev) {
+ if (sb->s_root) {
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ return sb;
+ }
+ }
+ }
+ spin_unlock(&sb_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(get_super_without_lock);
+
struct super_block * user_get_super(dev_t dev)
{
struct super_block *sb;
@@ -952,3 +982,56 @@ struct vfsmount *kern_mount_data(struct
}

EXPORT_SYMBOL_GPL(kern_mount_data);
+
+/*
+ * freeze_timeout - Thaw the filesystem.
+ *
+ * @work: work queue (delayed_work.work)
+ *
+ * Called by the delayed work when elapsing the timeout period.
+ * Thaw the filesystem.
+ */
+void freeze_timeout(struct work_struct *work)
+{
+ struct block_device *bd = container_of(work,
+ struct block_device, bd_freeze_timeout.work);
+
+ struct super_block *sb = get_super_without_lock(bd);
+
+ BUG_ON(sb == NULL);
+
+ thaw_bdev(bd, sb);
+
+ put_super(sb);
+}
+EXPORT_SYMBOL_GPL(freeze_timeout);
+
+/*
+ * add_freeze_timeout - Add timeout for freeze.
+ *
+ * @bdev: block device struct
+ * @timeout_msec: timeout period
+ *
+ * Add the delayed work for freeze timeout to the delayed work queue.
+ */
+void add_freeze_timeout(struct block_device *bdev, long timeout_msec)
+{
+ s64 timeout_jiffies = msecs_to_jiffies(timeout_msec);
+
+ /* Set delayed work queue */
+ cancel_delayed_work(&bdev->bd_freeze_timeout);
+ schedule_delayed_work(&bdev->bd_freeze_timeout, timeout_jiffies);
+}
+
+/*
+ * del_freeze_timeout - Delete timeout for freeze.
+ *
+ * @bdev: block device struct
+ *
+ * Delete the delayed work for freeze timeout from the delayed work queue.
+ */
+void del_freeze_timeout(struct block_device *bdev)
+{
+ if (delayed_work_pending(&bdev->bd_freeze_timeout))
+ cancel_delayed_work(&bdev->bd_freeze_timeout);
+}
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/xfs/linux-2.6/xfs_ioctl.c linux-2.6.25-rc3-freeze/fs/xfs/linux-2.6/xfs_ioctl.c
--- linux-2.6.25-rc3.org/fs/xfs/linux-2.6/xfs_ioctl.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/xfs/linux-2.6/xfs_ioctl.c 2008-02-25 10:50:04.000000000 +0900
@@ -911,7 +911,7 @@ xfs_ioctl(
return -EPERM;

if (inode->i_sb->s_frozen == SB_UNFROZEN)
- freeze_bdev(inode->i_sb->s_bdev);
+ freeze_bdev(inode->i_sb->s_bdev, 0);
return 0;

case XFS_IOC_THAW:
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/fs/xfs/xfs_fsops.c linux-2.6.25-rc3-freeze/fs/xfs/xfs_fsops.c
--- linux-2.6.25-rc3.org/fs/xfs/xfs_fsops.c 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/fs/xfs/xfs_fsops.c 2008-02-25 10:50:04.000000000 +0900
@@ -623,7 +623,7 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
+ struct super_block *sb = freeze_bdev(mp->m_super->s_bdev, 0);

if (sb && !IS_ERR(sb)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/include/linux/buffer_head.h linux-2.6.25-rc3-freeze/include/linux/buffer_head.h
--- linux-2.6.25-rc3.org/include/linux/buffer_head.h 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/include/linux/buffer_head.h 2008-02-25 10:50:04.000000000 +0900
@@ -170,7 +170,7 @@ int sync_blockdev(struct block_device *b
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
-struct super_block *freeze_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *, long timeout_msec);
void thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
diff -uprN -X /home/sho/pub/MC/freeze-set/dontdiff linux-2.6.25-rc3.org/include/linux/fs.h linux-2.6.25-rc3-freeze/include/linux/fs.h
--- linux-2.6.25-rc3.org/include/linux/fs.h 2008-02-25 06:25:54.000000000 +0900
+++ linux-2.6.25-rc3-freeze/include/linux/fs.h 2008-02-25 10:50:04.000000000 +0900
@@ -8,6 +8,7 @@

#include <linux/limits.h>
#include <linux/ioctl.h>
+#include <linux/workqueue.h>

/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -223,6 +224,9 @@ extern int dir_notify_enable;
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
+#define FIFREEZE _IOWR('X', 119, int) /* Freeze */
+#define FITHAW _IOWR('X', 120, int) /* Thaw */
+#define FIFREEZE_RESET_TIMEOUT _IO(0x00, 3) /* Reset freeze timeout */

#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
@@ -548,6 +552,11 @@ struct block_device {
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
+
+ /* Delayed work for freeze */
+ struct delayed_work bd_freeze_timeout;
+ /* Semaphore for freeze */
+ struct semaphore bd_freeze_sem;
};

/*
@@ -1926,7 +1935,9 @@ extern int do_vfs_ioctl(struct file *fil
extern void get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
+extern void put_super(struct super_block *sb);
extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_super_without_lock(struct block_device *);
extern struct super_block *user_get_super(dev_t);
extern void drop_super(struct super_block *sb);

@@ -2070,7 +2081,6 @@ ssize_t simple_attr_read(struct file *fi
ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos);

-
#ifdef CONFIG_SECURITY
static inline char *alloc_secdata(void)
{
@@ -2097,5 +2107,9 @@ int proc_nr_files(struct ctl_table *tabl

int get_filesystem_list(char * buf);

+extern void add_freeze_timeout(struct block_device *bdev, long timeout_msec);
+extern void del_freeze_timeout(struct block_device *bdev);
+extern void freeze_timeout(struct work_struct *work);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */


2008-03-07 09:14:15

by Takashi Sato

[permalink] [raw]
Subject: [RFC] freeze feature ver 1.0

Hi,

I have re-based my freeze patch from linux-2.6.25-rc3 to
linux-2.6.25-rc4.
There is no functional change from the previous version.
All of the comments from the mailing list have already been reflected in this patch.
The ioctls for the freeze feature are below.

o Freeze the filesystem
int ioctl(int fd, int FIFREEZE, long *timeval)
fd: The file descriptor of the mountpoint
FIFREEZE: request code for the freeze
timeval: the timeout period in seconds
If it's 0 or 1, the timeout isn't set.
This special case of "1" is implemented to keep
the compatibility with XFS applications.
Return value: 0 if the operation succeeds. Otherwise, -1

o Reset the timeout period
This is useful for the application to set the timeval more accurately.
For example, the freezer resets the timeval to 10 seconds every 5
seconds. In this approach, even if the freezer causes a deadlock
by accessing the frozen filesystem, it will be solved by the timeout
in 10 seconds and the freezer can recognize that at the next reset
of timeval.
int ioctl(int fd, int FIFREEZE_RESET_TIMEOUT, long *timeval)
fd:file descriptor of mountpoint
FIFREEZE_RESET_TIMEOUT: request code for reset of timeout period
timeval: new timeout period in seconds
Return value: 0 if the operation succeeds. Otherwise, -1
Error number: If the filesystem has already been unfrozen,
errno is set to EINVAL.

o Unfreeze the filesystem
int ioctl(int fd, int FITHAW, long *timeval)
fd: The file descriptor of the mountpoint
FITHAW: request code for unfreeze
timeval: Ignored
Return value: 0 if the operation succeeds. Otherwise, -1

Any comments are very welcome.

Cheers, Takashi

Signed-off-by: Takashi Sato <[email protected]>
---
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/drivers/md/dm.c linux-2.6.25-rc4-freeze/drivers/md/dm.c
--- linux-2.6.25-rc4/drivers/md/dm.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/drivers/md/dm.c 2008-03-07 20:34:43.000000000 +0900
@@ -1407,7 +1407,7 @@ static int lock_fs(struct mapped_device

WARN_ON(md->frozen_sb);

- md->frozen_sb = freeze_bdev(md->suspended_bdev);
+ md->frozen_sb = freeze_bdev(md->suspended_bdev, 0);
if (IS_ERR(md->frozen_sb)) {
r = PTR_ERR(md->frozen_sb);
md->frozen_sb = NULL;
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/fs/block_dev.c linux-2.6.25-rc4-freeze/fs/block_dev.c
--- linux-2.6.25-rc4/fs/block_dev.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/fs/block_dev.c 2008-03-07 20:34:43.000000000 +0900
@@ -284,6 +284,11 @@ static void init_once(struct kmem_cache
INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
inode_init_once(&ei->vfs_inode);
+
+ /* Initialize semaphore for freeze. */
+ sema_init(&bdev->bd_freeze_sem, 1);
+ /* Setup freeze timeout function. */
+ INIT_DELAYED_WORK(&bdev->bd_freeze_timeout, freeze_timeout);
}

static inline void __bd_forget(struct inode *inode)
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/fs/buffer.c linux-2.6.25-rc4-freeze/fs/buffer.c
--- linux-2.6.25-rc4/fs/buffer.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/fs/buffer.c 2008-03-07 20:34:43.000000000 +0900
@@ -190,17 +190,33 @@ int fsync_bdev(struct block_device *bdev

/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
- * @bdev: blockdevice to lock
+ * @bdev: blockdevice to lock
+ * @timeout_msec: timeout period
*
* This takes the block device bd_mount_sem to make sure no new mounts
* happen on bdev until thaw_bdev() is called.
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
+ * If timeout_msec is bigger than 0, this registers the delayed work for
+ * timeout of the freeze feature.
*/
-struct super_block *freeze_bdev(struct block_device *bdev)
+struct super_block *freeze_bdev(struct block_device *bdev, long timeout_msec)
{
struct super_block *sb;

+ down(&bdev->bd_freeze_sem);
+ sb = get_super_without_lock(bdev);
+
+ /* If super_block has been already frozen, return. */
+ if (sb && sb->s_frozen != SB_UNFROZEN) {
+ put_super(sb);
+ up(&bdev->bd_freeze_sem);
+ return sb;
+ }
+
+ if (sb)
+ put_super(sb);
+
down(&bdev->bd_mount_sem);
sb = get_super(bdev);
if (sb && !(sb->s_flags & MS_RDONLY)) {
@@ -219,6 +235,13 @@ struct super_block *freeze_bdev(struct b
}

sync_blockdev(bdev);
+
+ /* Setup unfreeze timer. */
+ if (timeout_msec > 0)
+ add_freeze_timeout(bdev, timeout_msec);
+
+ up(&bdev->bd_freeze_sem);
+
return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);
@@ -232,6 +255,16 @@ EXPORT_SYMBOL(freeze_bdev);
*/
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
+ down(&bdev->bd_freeze_sem);
+
+ if (sb && sb->s_frozen == SB_UNFROZEN) {
+ up(&bdev->bd_freeze_sem);
+ return;
+ }
+
+ /* Delete unfreeze timer. */
+ del_freeze_timeout(bdev);
+
if (sb) {
BUG_ON(sb->s_bdev != bdev);

@@ -244,6 +277,8 @@ void thaw_bdev(struct block_device *bdev
}

up(&bdev->bd_mount_sem);
+
+ up(&bdev->bd_freeze_sem);
}
EXPORT_SYMBOL(thaw_bdev);

diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/fs/ioctl.c linux-2.6.25-rc4-freeze/fs/ioctl.c
--- linux-2.6.25-rc4/fs/ioctl.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/fs/ioctl.c 2008-03-07 20:40:03.000000000 +0900
@@ -13,6 +13,7 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/buffer_head.h>

#include <asm/ioctls.h>

@@ -181,6 +182,102 @@ int do_vfs_ioctl(struct file *filp, unsi
} else
error = -ENOTTY;
break;
+
+	case FIFREEZE: {
+		long timeout_sec;
+		long timeout_msec;
+		struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+		if (!capable(CAP_SYS_ADMIN)) {
+			error = -EPERM;
+			break;
+		}
+
+		/* Freezing needs fs support and a backing block device. */
+		if (sb->s_op->write_super_lockfs == NULL || sb->s_bdev == NULL) {
+			error = -EINVAL;
+			break;
+		}
+
+		/* arg points to the timeout period in seconds. */
+		error = get_user(timeout_sec, (long __user *) arg);
+		if (error != 0)
+			break;
+		/*
+		 * Negative periods, and periods too large to be expressed
+		 * in milliseconds, are rejected.  The bound is tested before
+		 * multiplying because signed overflow is undefined.
+		 */
+		if (timeout_sec < 0 || timeout_sec > LONG_MAX / 1000) {
+			error = -EINVAL;
+			break;
+		}
+
+		/*
+		 * A period of 1 is treated as 0 (no timeout) to stay
+		 * compatible with the XFS application (xfs_freeze).
+		 */
+		if (timeout_sec < 2)
+			timeout_sec = 0;
+
+		timeout_msec = timeout_sec * 1000;
+
+		/* Freeze; the timer is armed only when timeout_msec > 0. */
+		freeze_bdev(sb->s_bdev, timeout_msec);
+
+		break;
+	}
+
+	case FITHAW: {
+		struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+		/* thaw_bdev() locks via the bdev, so one must exist. */
+		if (!capable(CAP_SYS_ADMIN))
+			error = -EPERM;
+		else if (sb->s_bdev == NULL)
+			error = -EINVAL;
+		else
+			thaw_bdev(sb->s_bdev, sb);
+		break;
+	}
+
+	case FIFREEZE_RESET_TIMEOUT: {
+		long timeout_sec;
+		long timeout_msec;
+		struct super_block *sb
+			= filp->f_path.dentry->d_inode->i_sb;
+
+		if (!capable(CAP_SYS_ADMIN)) {
+			error = -EPERM;
+			break;
+		}
+
+		/* arg points to the new timeout period in seconds. */
+		error = get_user(timeout_sec, (long __user *) arg);
+		if (error)
+			break;
+		/* Test the bound first: signed overflow is undefined. */
+		if (timeout_sec < 0 || timeout_sec > LONG_MAX / 1000) {
+			error = -EINVAL;
+			break;
+		}
+		timeout_msec = timeout_sec * 1000;
+
+		/* Without a backing block device there is no timer to reset. */
+		if (sb->s_bdev == NULL) {
+			error = -EINVAL;
+			break;
+		}
+		/* bd_freeze_sem serializes against freeze_bdev/thaw_bdev. */
+		down(&sb->s_bdev->bd_freeze_sem);
+		if (sb->s_frozen == SB_UNFROZEN)
+			error = -EINVAL;
+		else if (timeout_msec > 0)
+			add_freeze_timeout(sb->s_bdev, timeout_msec);
+		up(&sb->s_bdev->bd_freeze_sem);
+		break;
+	}
+
default:
if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/fs/super.c linux-2.6.25-rc4-freeze/fs/super.c
--- linux-2.6.25-rc4/fs/super.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/fs/super.c 2008-03-07 20:36:25.000000000 +0900
@@ -154,7 +154,7 @@ int __put_super_and_need_restart(struct
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -507,6 +507,36 @@ rescan:

EXPORT_SYMBOL(get_super);

+/*
+ * get_super_without_lock - look up the superblock mounted on a block device
+ * @bdev: block device to search for (may be NULL)
+ *
+ * Walks super_blocks under sb_lock and returns the first superblock that
+ * sits on @bdev and has a root dentry, with its s_count raised by one.
+ * Only sb_lock is held, and only during the walk. %NULL if no match.
+ */
+struct super_block *get_super_without_lock(struct block_device *bdev)
+{
+	struct super_block *pos;
+	struct super_block *found = NULL;
+
+	if (bdev == NULL)
+		return NULL;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry(pos, &super_blocks, s_list) {
+		if (pos->s_bdev != bdev || pos->s_root == NULL)
+			continue;
+		/* Take the reference before sb_lock is dropped. */
+		pos->s_count++;
+		found = pos;
+		break;
+	}
+	spin_unlock(&sb_lock);
+	return found;
+}
+EXPORT_SYMBOL(get_super_without_lock);
+
struct super_block * user_get_super(dev_t dev)
{
struct super_block *sb;
@@ -952,3 +982,55 @@ struct vfsmount *kern_mount_data(struct
}

EXPORT_SYMBOL_GPL(kern_mount_data);
+
+/*
+ * freeze_timeout - delayed-work handler for the freeze timeout
+ *
+ * @work: work_struct embedded in a block_device's bd_freeze_timeout
+ *
+ * Recovers the owning block_device from @work, looks up its superblock
+ * and passes both to thaw_bdev(); the lookup reference is dropped after.
+ */
+void freeze_timeout(struct work_struct *work)
+{
+	struct block_device *bdev;
+	struct super_block *sb;
+
+	bdev = container_of(work, struct block_device,
+			    bd_freeze_timeout.work);
+	sb = get_super_without_lock(bdev);
+	thaw_bdev(bdev, sb);
+	if (sb != NULL)
+		put_super(sb);
+}
+EXPORT_SYMBOL_GPL(freeze_timeout);
+
+/*
+ * add_freeze_timeout - (re)arm the freeze timeout of a block device
+ *
+ * @bdev: block device struct
+ * @timeout_msec: timeout period in milliseconds, clamped to INT_MAX
+ *
+ * Cancels any queued bd_freeze_timeout work and queues it again.
+ */
+void add_freeze_timeout(struct block_device *bdev, long timeout_msec)
+{
+	/* msecs_to_jiffies() takes an unsigned int; don't let it truncate. */
+	unsigned long delay = msecs_to_jiffies(min_t(long, timeout_msec, INT_MAX));
+
+	cancel_delayed_work(&bdev->bd_freeze_timeout);
+	schedule_delayed_work(&bdev->bd_freeze_timeout, delay);
+}
+
+/*
+ * del_freeze_timeout - disarm the freeze timeout of a block device
+ * @bdev: block device struct
+ * Cancels the bd_freeze_timeout delayed work, but only while it is pending.
+ */
+void del_freeze_timeout(struct block_device *bdev)
+{
+	struct delayed_work *timer = &bdev->bd_freeze_timeout;
+
+	if (delayed_work_pending(timer))
+		cancel_delayed_work(timer);
+}
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/fs/xfs/linux-2.6/xfs_ioctl.c linux-2.6.25-rc4-freeze/fs/xfs/linux-2.6/xfs_ioctl.c
--- linux-2.6.25-rc4/fs/xfs/linux-2.6/xfs_ioctl.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/fs/xfs/linux-2.6/xfs_ioctl.c 2008-03-07 20:34:43.000000000 +0900
@@ -911,7 +911,7 @@ xfs_ioctl(
return -EPERM;

if (inode->i_sb->s_frozen == SB_UNFROZEN)
- freeze_bdev(inode->i_sb->s_bdev);
+ freeze_bdev(inode->i_sb->s_bdev, 0);
return 0;

case XFS_IOC_THAW:
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/fs/xfs/xfs_fsops.c linux-2.6.25-rc4-freeze/fs/xfs/xfs_fsops.c
--- linux-2.6.25-rc4/fs/xfs/xfs_fsops.c 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/fs/xfs/xfs_fsops.c 2008-03-07 20:34:43.000000000 +0900
@@ -623,7 +623,7 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
+ struct super_block *sb = freeze_bdev(mp->m_super->s_bdev, 0);

if (sb && !IS_ERR(sb)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/include/linux/buffer_head.h linux-2.6.25-rc4-freeze/include/linux/buffer_head.h
--- linux-2.6.25-rc4/include/linux/buffer_head.h 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/include/linux/buffer_head.h 2008-03-07 20:34:43.000000000 +0900
@@ -170,7 +170,7 @@ int sync_blockdev(struct block_device *b
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
-struct super_block *freeze_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *, long timeout_msec);
void thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
diff -uprN -X linux-2.6.25-rc4-freeze/Documentation/dontdiff linux-2.6.25-rc4/include/linux/fs.h linux-2.6.25-rc4-freeze/include/linux/fs.h
--- linux-2.6.25-rc4/include/linux/fs.h 2008-03-05 13:33:54.000000000 +0900
+++ linux-2.6.25-rc4-freeze/include/linux/fs.h 2008-03-07 20:34:43.000000000 +0900
@@ -8,6 +8,7 @@

#include <linux/limits.h>
#include <linux/ioctl.h>
+#include <linux/workqueue.h>

/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -223,6 +224,9 @@ extern int dir_notify_enable;
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
+#define FIFREEZE _IOWR('X', 119, int) /* Freeze */
+#define FITHAW _IOWR('X', 120, int) /* Thaw */
+#define FIFREEZE_RESET_TIMEOUT _IO(0x00, 3) /* Reset freeze timeout */

#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
@@ -548,6 +552,11 @@ struct block_device {
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
+
+ /* Delayed work for freeze */
+ struct delayed_work bd_freeze_timeout;
+ /* Semaphore for freeze */
+ struct semaphore bd_freeze_sem;
};

/*
@@ -1926,7 +1935,9 @@ extern int do_vfs_ioctl(struct file *fil
extern void get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
+extern void put_super(struct super_block *sb);
extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_super_without_lock(struct block_device *);
extern struct super_block *user_get_super(dev_t);
extern void drop_super(struct super_block *sb);

@@ -2097,5 +2108,9 @@ int proc_nr_files(struct ctl_table *tabl

int get_filesystem_list(char * buf);

+extern void add_freeze_timeout(struct block_device *bdev, long timeout_msec);
+extern void del_freeze_timeout(struct block_device *bdev);
+extern void freeze_timeout(struct work_struct *work);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */