The timeout feature is added to freeze ioctl. And new ioctl
to reset the timeout period is added.
o Freeze the filesystem
int ioctl(int fd, int FIFREEZE, long *timeout_sec)
fd: The file descriptor of the mountpoint
FIFREEZE: request code for the freeze
timeout_sec: the timeout period in seconds
If it's 0 or 1, the timeout isn't set.
This special case of "1" is implemented to keep
the compatibility with XFS applications.
Return value: 0 if the operation succeeds. Otherwise, -1
o Reset the timeout period
int ioctl(int fd, int FIFREEZE_RESET_TIMEOUT, long *timeout_sec)
fd:file descriptor of mountpoint
FIFREEZE_RESET_TIMEOUT: request code for reset of timeout period
timeout_sec: new timeout period in seconds
Return value: 0 if the operation succeeds. Otherwise, -1
Error number: If the filesystem has already been unfrozen,
errno is set to EINVAL.
Signed-off-by: Takashi Sato <[email protected]>
Signed-off-by: Masayuki Hamaguchi <[email protected]>
---
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/drivers/md/dm.c linux-2.6.26-rc7-timeout/
drivers/md/dm.c
--- linux-2.6.26-rc7-xfs/drivers/md/dm.c 2008-06-25 12:06:59.000000000 +0900
+++ linux-2.6.26-rc7-timeout/drivers/md/dm.c 2008-06-25 16:28:40.000000000 +0900
@@ -1407,7 +1407,7 @@ static int lock_fs(struct mapped_device
WARN_ON(md->frozen_sb);
- md->frozen_sb = freeze_bdev(md->suspended_bdev);
+ md->frozen_sb = freeze_bdev(md->suspended_bdev, 0);
if (IS_ERR(md->frozen_sb)) {
r = PTR_ERR(md->frozen_sb);
md->frozen_sb = NULL;
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/fs/block_dev.c linux-2.6.26-rc7-timeout/f
s/block_dev.c
--- linux-2.6.26-rc7-xfs/fs/block_dev.c 2008-06-25 12:07:19.000000000 +0900
+++ linux-2.6.26-rc7-timeout/fs/block_dev.c 2008-06-25 16:30:49.000000000 +0900
@@ -285,6 +285,8 @@ static void init_once(struct kmem_cache
INIT_LIST_HEAD(&bdev->bd_holder_list);
#endif
inode_init_once(&ei->vfs_inode);
+ /* Setup freeze timeout function. */
+ INIT_DELAYED_WORK(&bdev->bd_freeze_timeout, freeze_timeout);
}
static inline void __bd_forget(struct inode *inode)
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/fs/buffer.c linux-2.6.26-rc7-timeout/fs/b
uffer.c
--- linux-2.6.26-rc7-xfs/fs/buffer.c 2008-06-30 12:59:53.000000000 +0900
+++ linux-2.6.26-rc7-timeout/fs/buffer.c 2008-06-27 10:59:55.000000000 +0900
@@ -190,14 +190,18 @@ int fsync_bdev(struct block_device *bdev
/**
* freeze_bdev -- lock a filesystem and force it into a consistent state
- * @bdev: blockdevice to lock
+ * @bdev: blockdevice to lock
+ * @timeout_msec: timeout period
*
* This takes the block device bd_mount_sem to make sure no new mounts
* happen on bdev until thaw_bdev() is called.
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
+ * If timeout_msec is bigger than 0, this registers the delayed work for
+ * timeout of the freeze feature.
*/
-struct super_block *freeze_bdev(struct block_device *bdev)
+struct super_block *freeze_bdev(struct block_device *bdev,
+ unsigned int timeout_msec)
{
struct super_block *sb;
@@ -234,6 +238,10 @@ struct super_block *freeze_bdev(struct b
}
sync_blockdev(bdev);
+ /* Setup unfreeze timer. */
+ if (timeout_msec > 0)
+ add_freeze_timeout(bdev, timeout_msec);
+
clear_bit(BD_FREEZE_OP, &bdev->bd_state);
return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
@@ -258,6 +266,9 @@ int thaw_bdev(struct block_device *bdev,
return 0;
}
+ /* Delete unfreeze timer. */
+ del_freeze_timeout(bdev);
+
if (sb) {
BUG_ON(sb->s_bdev != bdev);
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/fs/ioctl.c linux-2.6.26-rc7-timeout/fs/io
ctl.c
--- linux-2.6.26-rc7-xfs/fs/ioctl.c 2008-06-25 12:07:20.000000000 +0900
+++ linux-2.6.26-rc7-timeout/fs/ioctl.c 2008-06-27 08:49:58.000000000 +0900
@@ -145,22 +145,47 @@ static int ioctl_fioasync(unsigned int f
* ioctl_freeze - Freeze the filesystem.
*
* @filp: target file
+ * @argp: timeout value(sec)
*
* Call freeze_bdev() to freeze the filesystem.
*/
-static int ioctl_freeze(struct file *filp)
+static int ioctl_freeze(struct file *filp, int __user *argp)
{
+ int timeout_sec;
+ unsigned int timeout_msec;
struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- /* If filesystem doesn't support freeze feature, return. */
+ /* If filesystem doesn't support freeze feature, return EOPNOTSUPP. */
if (sb->s_op->write_super_lockfs == NULL)
return -EOPNOTSUPP;
+ /* If a regular file or a directory isn't specified, return EINVAL. */
+ if (sb->s_bdev == NULL)
+ return -EINVAL;
+
+ /* arg(sec) to tick value. */
+ error = get_user(timeout_sec, argp);
+ if (error != 0)
+ return error;
+
+ if (timeout_sec < 0 || timeout_sec > UINT_MAX/1000)
+ return -EINVAL;
+
+ /*
+ * If 1 is specified as the timeout period it is changed into 0
+ * to retain compatibility with XFS's xfs_freeze.
+ */
+ if (timeout_sec == 1)
+ timeout_sec = 0;
+
+ timeout_msec = timeout_sec * 1000;
+
/* Freeze */
- sb = freeze_bdev(sb->s_bdev);
+ sb = freeze_bdev(sb->s_bdev, timeout_msec);
if (IS_ERR(sb))
return PTR_ERR(sb);
return 0;
@@ -180,11 +205,61 @@ static int ioctl_thaw(struct file *filp)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ /* If a regular file or a directory isn't specified, return EINVAL. */
+ if (sb->s_bdev == NULL)
+ return -EINVAL;
+
/* Thaw */
return thaw_bdev(sb->s_bdev, sb);
}
/*
+ * ioctl_freeze_reset_timeout - Reset timeout for freeze.
+ *
+ * @filp: target file
+ * @argp: timeout value(sec)
+ *
+ * Reset timeout for freeze.
+ */
+static int
+ioctl_freeze_reset_timeout(struct file *filp, int __user *argp)
+{
+ int timeout_sec;
+ unsigned int timeout_msec;
+ struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* If a regular file or a directory isn't specified, return EINVAL. */
+ if (sb->s_bdev == NULL)
+ return -EINVAL;
+
+ /* arg(sec) to tick value */
+ error = get_user(timeout_sec, argp);
+ if (error)
+ return error;
+
+ if (timeout_sec <= 0 || timeout_sec > UINT_MAX/1000)
+ return -EINVAL;
+
+ timeout_msec = timeout_sec * 1000;
+
+ if (test_and_set_bit(BD_FREEZE_OP, &sb->s_bdev->bd_state))
+ return -EBUSY;
+ if (sb->s_frozen == SB_UNFROZEN) {
+ clear_bit(BD_FREEZE_OP, &sb->s_bdev->bd_state);
+ return -EINVAL;
+ }
+ /* setup unfreeze timer */
+ add_freeze_timeout(sb->s_bdev, timeout_msec);
+ clear_bit(BD_FREEZE_OP, &sb->s_bdev->bd_state);
+
+ return 0;
+}
+
+/*
* When you add any new common ioctls to the switches above and below
* please update compat_sys_ioctl() too.
*
@@ -227,13 +302,17 @@ int do_vfs_ioctl(struct file *filp, unsi
break;
case FIFREEZE:
- error = ioctl_freeze(filp);
+ error = ioctl_freeze(filp, argp);
break;
case FITHAW:
error = ioctl_thaw(filp);
break;
+ case FIFREEZE_RESET_TIMEOUT:
+ error = ioctl_freeze_reset_timeout(filp, argp);
+ break;
+
default:
if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/fs/super.c linux-2.6.26-rc7-timeout/fs/su
per.c
--- linux-2.6.26-rc7-xfs/fs/super.c 2008-06-30 16:41:48.000000000 +0900
+++ linux-2.6.26-rc7-timeout/fs/super.c 2008-06-30 17:30:38.000000000 +0900
@@ -980,3 +980,60 @@ struct vfsmount *kern_mount_data(struct
}
EXPORT_SYMBOL_GPL(kern_mount_data);
+
+/*
+ * freeze_timeout - Thaw the filesystem.
+ *
+ * @work: work queue (delayed_work.work)
+ *
+ * Called by the delayed work when elapsing the timeout period.
+ * Thaw the filesystem.
+ */
+void freeze_timeout(struct work_struct *work)
+{
+ struct block_device *bd = container_of(work,
+ struct block_device, bd_freeze_timeout.work);
+ struct super_block *sb = get_super(bd);
+
+ thaw_bdev(bd, sb);
+
+ if (sb)
+ drop_super(sb);
+}
+EXPORT_SYMBOL_GPL(freeze_timeout);
+
+/*
+ * add_freeze_timeout - Add timeout for freeze.
+ *
+ * @bdev: block device struct
+ * @timeout_msec: timeout period
+ *
+ * Add the delayed work for freeze timeout to the delayed work queue.
+ */
+void add_freeze_timeout(struct block_device *bdev, unsigned int timeout_msec)
+{
+ s64 timeout_jiffies = msecs_to_jiffies(timeout_msec);
+
+ /* Set delayed work queue */
+ cancel_delayed_work_sync(&bdev->bd_freeze_timeout);
+ schedule_delayed_work(&bdev->bd_freeze_timeout, timeout_jiffies);
+}
+
+/*
+ * del_freeze_timeout - Delete timeout for freeze.
+ *
+ * @bdev: block device struct
+ *
+ * Delete the delayed work for freeze timeout from the delayed work queue.
+ */
+void del_freeze_timeout(struct block_device *bdev)
+{
+ /*
+ * It's possible that the delayed work task (freeze_timeout()) calls
+ * del_freeze_timeout(). If the delayed work task calls
+ * cancel_delayed_work_sync((), the deadlock will occur.
+ * So we need this check (delayed_work_pending()).
+ */
+ if (delayed_work_pending(&bdev->bd_freeze_timeout))
+ cancel_delayed_work_sync(&bdev->bd_freeze_timeout);
+}
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/fs/xfs/xfs_fsops.c linux-2.6.26-rc7-timeo
ut/fs/xfs/xfs_fsops.c
--- linux-2.6.26-rc7-xfs/fs/xfs/xfs_fsops.c 2008-06-25 12:07:19.000000000 +0900
+++ linux-2.6.26-rc7-timeout/fs/xfs/xfs_fsops.c 2008-06-25 16:30:48.000000000 +0900
@@ -619,7 +619,7 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
+ struct super_block *sb = freeze_bdev(mp->m_super->s_bdev, 0);
if (sb && !IS_ERR(sb)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/include/linux/buffer_head.h linux-2.6.26-
rc7-timeout/include/linux/buffer_head.h
--- linux-2.6.26-rc7-xfs/include/linux/buffer_head.h 2008-06-25 12:07:24.000000000 +0900
+++ linux-2.6.26-rc7-timeout/include/linux/buffer_head.h 2008-06-27 11:11:38.000000000 +0900
@@ -170,7 +170,8 @@ int sync_blockdev(struct block_device *b
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
-struct super_block *freeze_bdev(struct block_device *);
+struct super_block *freeze_bdev(struct block_device *,
+ unsigned int timeout_msec);
int thaw_bdev(struct block_device *, struct super_block *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
diff -uprN -X linux-2.6.26-rc7.org/Documentation/dontdiff linux-2.6.26-rc7-xfs/include/linux/fs.h linux-2.6.26-rc7-timeo
ut/include/linux/fs.h
--- linux-2.6.26-rc7-xfs/include/linux/fs.h 2008-06-30 16:42:11.000000000 +0900
+++ linux-2.6.26-rc7-timeout/include/linux/fs.h 2008-06-30 16:43:13.000000000 +0900
@@ -8,6 +8,7 @@
#include <linux/limits.h>
#include <linux/ioctl.h>
+#include <linux/workqueue.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -225,6 +226,7 @@ extern int dir_notify_enable;
#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
#define FIFREEZE _IOWR('X', 119, int) /* Freeze */
#define FITHAW _IOWR('X', 120, int) /* Thaw */
+#define FIFREEZE_RESET_TIMEOUT _IO(0x00, 3) /* Reset freeze timeout */
#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
@@ -559,6 +561,8 @@ struct block_device {
/* State of the block device. (Used by freeze feature) */
unsigned long bd_state;
+ /* Delayed work for freeze */
+ struct delayed_work bd_freeze_timeout;
};
/*
@@ -2147,5 +2151,10 @@ int proc_nr_files(struct ctl_table *tabl
int get_filesystem_list(char * buf);
+extern void add_freeze_timeout(struct block_device *bdev,
+ unsigned int timeout_msec);
+extern void del_freeze_timeout(struct block_device *bdev);
+extern void freeze_timeout(struct work_struct *work);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_FS_H */
I still disagree with this whole patch. There is not reason to let
the freeze request timeout - an auto-unfreezing will only confuse the
hell out of the caller. The only reason where the current XFS freeze
call can hang and this would be theoretically useful is when the
filesystem is already frozen by someone else, but this should be fixed
by refusing to do the second freeze, as suggested in my comment to patch
1.
On Tue, Jul 01, 2008 at 04:10:26AM -0400, Christoph Hellwig wrote:
> I still disagree with this whole patch.
Same here - if you want a timeout, what stops you from implementing it in a
userspace process? If your concern is that the process might die without
thawing the filesystem, take a look at the userspace LVM/multipath code for
ideas - lock into memory, disable OOM killer, run from ramdisk etc.
In practice, those techniques seem to be good enough.
> call can hang and this would be theoretically useful is when the
> filesystem is already frozen by someone else, but this should be fixed
> by refusing to do the second freeze, as suggested in my comment to patch
> 1.
Similarly if a device-mapper device is involved, how should the following
sequence behave - A, B or C?
1. dmsetup suspend (freezes)
2. FIFREEZE
3. FITHAW
4. dmsetup resume (thaws)
A:
1 succeeds, freezes
2 succeeds, remains frozen
3 succeeds, remains frozen
4 succeeds, thaws
B:
1 succeeds, freezes
2 fails, remains frozen
3 shouldn't be called because 2 failed but if it is: succeeds, thaws
4 succeeds (already thawed, but still does the device-mapper parts)
C:
1 succeeds, freezes
2 fails, remains frozen
3 fails (because device-mapper owns the freeze/thaw), remains frozen
4 succeeds, thaws
Alasdair
--
[email protected]
Hi Christoph and Alasdair,
> On Tue, Jul 01, 2008 at 04:10:26AM -0400, Christoph Hellwig wrote:
>> I still disagree with this whole patch.
>
> Same here - if you want a timeout, what stops you from implementing it in a
> userspace process? If your concern is that the process might die without
> thawing the filesystem, take a look at the userspace LVM/multipath code for
> ideas - lock into memory, disable OOM killer, run from ramdisk etc.
> In practice, those techniques seem to be good enough.
If the freezer accesses the frozen filesystem and causes a deadlock,
the above ideas can't solve it. The timeout is useful to solve such a deadlock.
If you don't need the timeout, you can disable it by specifying "0" as the
timeout period.
> Similarly if a device-mapper device is involved, how should the following
> sequence behave - A, B or C?
>
> 1. dmsetup suspend (freezes)
> 2. FIFREEZE
> 3. FITHAW
> 4. dmsetup resume (thaws)
[...]
> C:
> 1 succeeds, freezes
> 2 fails, remains frozen
> 3 fails (because device-mapper owns the freeze/thaw), remains frozen
> 4 succeeds, thaws
I think C is appropriate and the following change makes it possible.
How do you think?
1. Add the new bit flag(BD_FREEZE_DM) in block_device.bd_state.
It means that the volume is frozen by the device-mapper.
2. Operate and check this bit flag as followings.
- Bit operations in the device-mapper's freeze/thaw
FREEZE:
dm_suspend(): set BD_FREEZE_DM
freeze_bdev():set BD_FREEZE_OP
THAW:
thaw_bdev(): clear BD_FREEZE_OP
dm_resume(): clear BD_FREEZE_DM
- Checks in FIFREEZE/FITHAW
FREEZE:
ioctl_freeze(): Not need to check BD_FREEZE_DM
freeze_bdev():set BD_FREEZE_OP
THAW:
ioctl_thaw(): If BD_FREEZE_DM is set, fail, otherwise, call thaw_bdev()
thaw_bdev(): clear BD_FREEZE_OP
Cheers, Takashi
On Thu, Jul 03, 2008 at 09:11:05PM +0900, Takashi Sato wrote:
> If the freezer accesses the frozen filesystem and causes a deadlock,
> the above ideas can't solve it
But you could also say that if the 'freezer' process accesses the frozen
filesystem and deadlocks then that's just a bug and that userspace code
should be fixed and there's no need to introduce the complexity of a
timeout parameter.
> >Similarly if a device-mapper device is involved, how should the following
> >sequence behave - A, B or C?
> >
> >1. dmsetup suspend (freezes)
> >2. FIFREEZE
> >3. FITHAW
> >4. dmsetup resume (thaws)
> [...]
> >C:
> > 1 succeeds, freezes
> > 2 fails, remains frozen
> > 3 fails (because device-mapper owns the freeze/thaw), remains frozen
> > 4 succeeds, thaws
>
> I think C is appropriate and the following change makes it possible.
> How do you think?
The point I'm trying to make here is:
Under what real-world circumstances might multiple concurrent freezing
attempts occur, and which of A, B or C (or other variations) would be
the most appropriate way of handling such situations?
A common example is people running xfs_freeze followed by an lvm command
which also attempts to freeze the filesystem.
I can see a case for B or C, but personally I prefer A:
> > 1 succeeds, freezes
> > 2 succeeds, remains frozen
> > 3 succeeds, remains frozen
> > 4 succeeds, thaws
Alasdair
--
[email protected]
Takashi Sato wrote:
> Hi Christoph and Alasdair,
>
>> On Tue, Jul 01, 2008 at 04:10:26AM -0400, Christoph Hellwig wrote:
>>> I still disagree with this whole patch.
>> Same here - if you want a timeout, what stops you from implementing it in a
>> userspace process? If your concern is that the process might die without
>> thawing the filesystem, take a look at the userspace LVM/multipath code for
>> ideas - lock into memory, disable OOM killer, run from ramdisk etc.
>> In practice, those techniques seem to be good enough.
>
> If the freezer accesses the frozen filesystem and causes a deadlock,
> the above ideas can't solve it. The timeout is useful to solve such a deadlock.
> If you don't need the timeout, you can disable it by specifying "0" as the
> timeout period.
>
>> Similarly if a device-mapper device is involved, how should the following
>> sequence behave - A, B or C?
>>
>> 1. dmsetup suspend (freezes)
>> 2. FIFREEZE
>> 3. FITHAW
>> 4. dmsetup resume (thaws)
> [...]
>> C:
>> 1 succeeds, freezes
>> 2 fails, remains frozen
>> 3 fails (because device-mapper owns the freeze/thaw), remains frozen
>> 4 succeeds, thaws
>
> I think C is appropriate and the following change makes it possible.
> How do you think?
>
> 1. Add the new bit flag(BD_FREEZE_DM) in block_device.bd_state.
> It means that the volume is frozen by the device-mapper.
Will we add a new bit/flag for every possible subysstem that may call
freeze/thaw? This seems odd to me.
They are different paths to the same underlying mechanism; it should not
matter if it is an existing freeze from DM or via FIFREEZE or via the
xfs ioctl, or any other mechanism should it? I don't think this generic
interface should use any flag named *_DM, personally.
It seems that nested freeze requests must be handled in a generic way
regardless of what initiates any of the requests?
Refcounting freezes as Alasdair suggests seems to make sense to me, i.e.
freeze, freeze, thaw, thaw leads to:
>> > > 1 (freeze) succeeds, freezes (frozen++)
>> > > 2 (freeze) succeeds, remains frozen (frozen++)
>> > > 3 (thaw) succeeds, remains frozen (frozen--)
>> > > 4 (thaw) succeeds, thaws (frozen--)
that way each caller of freeze is guaranteed that the fs is frozen at
least until they call thaw?
Thanks,
-Eric
On Thu, Jul 03, 2008 at 01:47:10PM +0100, Alasdair G Kergon wrote:
> On Thu, Jul 03, 2008 at 09:11:05PM +0900, Takashi Sato wrote:
> > If the freezer accesses the frozen filesystem and causes a deadlock,
> > the above ideas can't solve it
>
> But you could also say that if the 'freezer' process accesses the frozen
> filesystem and deadlocks then that's just a bug and that userspace code
> should be fixed and there's no need to introduce the complexity of a
> timeout parameter.
Seconded - that was also my primary objection to the timeout code.
> The point I'm trying to make here is:
> Under what real-world circumstances might multiple concurrent freezing
> attempts occur, and which of A, B or C (or other variations) would be
> the most appropriate way of handling such situations?
>
> A common example is people running xfs_freeze followed by an lvm command
> which also attempts to freeze the filesystem.
Yes, I've seen that reported a number of times.
> I can see a case for B or C, but personally I prefer A:
>
> > > 1 succeeds, freezes
> > > 2 succeeds, remains frozen
> > > 3 succeeds, remains frozen
> > > 4 succeeds, thaws
Agreed, though I'd modify the definition of that case to be "remain
frozen until the last thaw occurs". That has the advantage that
it's relatively simple to implement with just a counter...
Cheers,
Dave.
--
Dave Chinner
[email protected]
Hi Alasdair, Eric and Dave,
> On Thu, Jul 03, 2008 at 01:47:10PM +0100, Alasdair G Kergon wrote:
>> On Thu, Jul 03, 2008 at 09:11:05PM +0900, Takashi Sato wrote:
>> > If the freezer accesses the frozen filesystem and causes a deadlock,
>> > the above ideas can't solve it
>>
>> But you could also say that if the 'freezer' process accesses the frozen
>> filesystem and deadlocks then that's just a bug and that userspace code
>> should be fixed and there's no need to introduce the complexity of a
>> timeout parameter.
>
> Seconded - that was also my primary objection to the timeout code.
I will consider removing the timeout.
>> The point I'm trying to make here is:
>> Under what real-world circumstances might multiple concurrent freezing
>> attempts occur, and which of A, B or C (or other variations) would be
>> the most appropriate way of handling such situations?
>>
>> A common example is people running xfs_freeze followed by an lvm command
>> which also attempts to freeze the filesystem.
>
> Yes, I've seen that reported a number of times.
>
>> I can see a case for B or C, but personally I prefer A:
>>
>> > > 1 succeeds, freezes
>> > > 2 succeeds, remains frozen
>> > > 3 succeeds, remains frozen
>> > > 4 succeeds, thaws
>
> Agreed, though I'd modify the definition of that case to be "remain
> frozen until the last thaw occurs". That has the advantage that
> it's relatively simple to implement with just a counter...
I agree this idea.
But I have one concern. When device-mapper's freeze follows FIFREEZE,
can device-mapper freeze only device-mapper's part correctly?
And when device-mapper's thaw follows FITHAW,
can device-mapper thaw only device-mapper's part?
Cheers, Takashi
Hi!
> I still disagree with this whole patch. There is not reason to let
> the freeze request timeout - an auto-unfreezing will only confuse the
> hell out of the caller. The only reason where the current XFS freeze
> call can hang and this would be theoretically useful is when the
What happens when someone dirties so much data that vm swaps out
whatever process that frozen the filesystem?
I though that was why the timeout was there...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Mon, Jul 07, 2008 at 01:07:31PM +0200, Pavel Machek wrote:
> Hi!
>
> > I still disagree with this whole patch. There is not reason to let
> > the freeze request timeout - an auto-unfreezing will only confuse the
> > hell out of the caller. The only reason where the current XFS freeze
> > call can hang and this would be theoretically useful is when the
>
> What happens when someone dirties so much data that vm swaps out
> whatever process that frozen the filesystem?
a) you can't dirty a frozen filesystem - by definition a frozen
filesystem is a *clean filesystem* and *cannot be dirtied*.
b) Swap doesn't write through the filesystem
c) you can still read from a frozen filesystem to page your
executableѕ in.
d) if dirtying another unfrozen filesystem swaps out your
application so it can't run, then there's a major VM bug.
Regardless, until the app completes it is relying on the
filesystem being frozen, so it better remain frozen....
> I though that was why the timeout was there...
Not that I know of.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed 2008-07-09 09:10:27, Dave Chinner wrote:
> On Mon, Jul 07, 2008 at 01:07:31PM +0200, Pavel Machek wrote:
> > Hi!
> >
> > > I still disagree with this whole patch. There is not reason to let
> > > the freeze request timeout - an auto-unfreezing will only confuse the
> > > hell out of the caller. The only reason where the current XFS freeze
> > > call can hang and this would be theoretically useful is when the
> >
> > What happens when someone dirties so much data that vm swaps out
> > whatever process that frozen the filesystem?
>
> a) you can't dirty a frozen filesystem - by definition a frozen
> filesystem is a *clean filesystem* and *cannot be dirtied*.
Can you stop me?
mmap("/some/huge_file", MAP_SHARED);
then write to memory mapping?
> b) Swap doesn't write through the filesystem
> c) you can still read from a frozen filesystem to page your
> executable?? in.
atime modification should mean dirty data, right? And dirty data mean
memory pressure, right?
> d) if dirtying another unfrozen filesystem swaps out your
~~~~~~~
> application so it can't run, then there's a major VM bug.
> Regardless, until the app completes it is relying on the
> filesystem being frozen, so it better remain frozen....
Agreed. With emphasis on "another".
> > I though that was why the timeout was there...
>
> Not that I know of.
Ok, lets see how you deal with mmap.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Wed, Jul 09, 2008 at 01:20:31AM +0200, Pavel Machek wrote:
> On Wed 2008-07-09 09:10:27, Dave Chinner wrote:
> > On Mon, Jul 07, 2008 at 01:07:31PM +0200, Pavel Machek wrote:
> > > Hi!
> > >
> > > > I still disagree with this whole patch. There is not reason to let
> > > > the freeze request timeout - an auto-unfreezing will only confuse the
> > > > hell out of the caller. The only reason where the current XFS freeze
> > > > call can hang and this would be theoretically useful is when the
> > >
> > > What happens when someone dirties so much data that vm swaps out
> > > whatever process that frozen the filesystem?
> >
> > a) you can't dirty a frozen filesystem - by definition a frozen
> > filesystem is a *clean filesystem* and *cannot be dirtied*.
>
> Can you stop me?
>
> mmap("/some/huge_file", MAP_SHARED);
>
> then write to memory mapping?
Sure - we can put a hook in ->page_mkwrite() to prevent it. We
don't right now because nobody in the real world really cares if one
half of a concurrent user data change is in the old snapshot or the
new one......
> > b) Swap doesn't write through the filesystem
> > c) you can still read from a frozen filesystem to page your
> > executable?? in.
>
> atime modification should mean dirty data, right?
Metadata, not data. If that's really a problem (and it never has
been for XFS because we always allow in memory changes to atime)
then touch_atime could be easily changed to avoid this...
> And dirty data mean
> memory pressure, right?
If you walk enough inodes while the filesystem is frozen, it
theoretically could happen. Typically a filesystem is only for a
few seconds at a time so in the real world this has never, ever been
a problem.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Jul 09, 2008 at 10:52:54AM +1000, Dave Chinner wrote:
> If you walk enough inodes while the filesystem is frozen, it
> theoretically could happen. Typically a filesystem is only for a
> few seconds at a time so in the real world this has never, ever been
> a problem.
I had argued for the timeout (and so it's mostly my fault that
Takashi-San included it as a feature) mainly because I was (and still
amm) deeply paranoid about the competence of the application
programers who might use this feature. I could see them screwing up
leaving it locked forever --- perhaps when their program core dumps or
when the user types ^C and they forgot to install a signal handler,
leaving the filesystem frozen forever.
In the meantime, user applications that try to create files on that
filesystem, or write to files already opened when the filesystem are
frozen will accumulate dirty pages in the page cache, until the system
finally falls over.
Think about some of the evil perpetrated by hal and the userspace
suspend-resume scripts (and how much complexity with random XML
fragments getting parsed by various dbus plugins), and tell me with a
straight face that you would trust these modern-day desktop
application writers with this interface. Because they *will* find
some interesting way to (ab)use it.....
Also, I didn't think the extra code complexity to implements timeouts
was *that* bad --- it seemed fairly small for the functionality.
Best regards,
- Ted
On Tue, Jul 08, 2008 at 09:09:22PM -0400, Theodore Tso wrote:
> I had argued for the timeout (and so it's mostly my fault that
> Takashi-San included it as a feature) mainly because I was (and still
> amm) deeply paranoid about the competence of the application
> programers who might use this feature. I could see them screwing up
> leaving it locked forever --- perhaps when their program core dumps or
> when the user types ^C and they forgot to install a signal handler,
> leaving the filesystem frozen forever.
>
> In the meantime, user applications that try to create files on that
> filesystem, or write to files already opened when the filesystem are
> frozen will accumulate dirty pages in the page cache, until the system
> finally falls over.
>
> Think about some of the evil perpetrated by hal and the userspace
> suspend-resume scripts (and how much complexity with random XML
> fragments getting parsed by various dbus plugins), and tell me with a
> straight face that you would trust these modern-day desktop
> application writers with this interface. Because they *will* find
> some interesting way to (ab)use it.....
>
> Also, I didn't think the extra code complexity to implements timeouts
> was *that* bad --- it seemed fairly small for the functionality.
Just as an extra point of reference, VxFS supports a freeze/thaw by
ioctl very similar to this including a timeout in seconds. This
means someone else thought it was a useful feature.
http://sfdoccentral.symantec.com/sf/5.0/linux/manpages/vxfs/vxfsio_7.html
Brad Boyer
[email protected]
On Tue, 8 Jul 2008, Theodore Tso wrote:
> On Wed, Jul 09, 2008 at 10:52:54AM +1000, Dave Chinner wrote:
> > If you walk enough inodes while the filesystem is frozen, it
> > theoretically could happen. Typically a filesystem is only for a
> > few seconds at a time so in the real world this has never, ever been
> > a problem.
>
> I had argued for the timeout (and so it's mostly my fault that
> Takashi-San included it as a feature) mainly because I was (and still
> amm) deeply paranoid about the competence of the application
> programers who might use this feature. I could see them screwing up
> leaving it locked forever --- perhaps when their program core dumps or
> when the user types ^C and they forgot to install a signal handler,
> leaving the filesystem frozen forever.
A much better way to deal with that would be to limit the freeze to
the lifetime of the process somehow. E.g. the freeze ioctl sets a bit
in struct file (I'm sure we have some available) and release on the
file checks this bit and thaws the filesystem.
This would mean that freeze and thaw will have to be done on the same
file descriptor, but this isn't unreasonable to expect, is it?
Miklos
On Wed, Jul 09, 2008 at 08:13:21AM +0200, Miklos Szeredi wrote:
> This would mean that freeze and thaw will have to be done on the same
> file descriptor, but this isn't unreasonable to expect, is it?
It is certainly not the current use case, where you run one command
to freeze the filesystem and another one to unfreeze it.
On Wed, 9 Jul 2008, Christoph Hellwig wrote:
> On Wed, Jul 09, 2008 at 08:13:21AM +0200, Miklos Szeredi wrote:
> > This would mean that freeze and thaw will have to be done on the same
> > file descriptor, but this isn't unreasonable to expect, is it?
>
> It is certainly not the current use case, where you run one command
> to freeze the filesystem and another one to unfreeze it.
So instead of
freeze_fs mountpoint
backup-command
unfreeze_fs mountpoint
the user would have do to
run_freezed mountpoint backup-command
I find the second one nicer, regardless of any reliability issues.
Miklos
On Wed, 09 Jul 2008 08:22:56 +0200
Miklos Szeredi <[email protected]> wrote:
> On Wed, 9 Jul 2008, Christoph Hellwig wrote:
> > On Wed, Jul 09, 2008 at 08:13:21AM +0200, Miklos Szeredi wrote:
> > > This would mean that freeze and thaw will have to be done on the
> > > same file descriptor, but this isn't unreasonable to expect, is
> > > it?
> >
> > It is certainly not the current use case, where you run one command
> > to freeze the filesystem and another one to unfreeze it.
>
> So instead of
>
> freeze_fs mountpoint
> backup-command
> unfreeze_fs mountpoint
>
> the user would have do to
>
> run_freezed mountpoint backup-command
>
> I find the second one nicer, regardless of any reliability issues.
nah he needs to do
make_snapshot ; backup-command ; unref_snapshot.
freezing isn't the right solution for the backup problem ;)
--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On Tue, 8 Jul 2008, Arjan van de Ven wrote:
> nah he needs to do
>
> make_snapshot ; backup-command ; unref_snapshot.
>
> freezing isn't the right solution for the backup problem ;)
Confused, what's freezing _is_ for then? Patch description says:
Currently, ext3 in mainline Linux doesn't have the freeze feature which
suspends write requests. So, we cannot take a backup which keeps
the filesystem's consistency with the storage device's features
(snapshot and replication) while it is mounted.
In many case, a commercial filesystem (e.g. VxFS) has
the freeze feature and it would be used to get the consistent backup.
If Linux's standard filesytem ext3 has the freeze feature, we can do it
without a commercial filesystem.
Miklos
On Wed, 09 Jul 2008 08:48:42 +0200
Miklos Szeredi <[email protected]> wrote:
> On Tue, 8 Jul 2008, Arjan van de Ven wrote:
> > nah he needs to do
> >
> > make_snapshot ; backup-command ; unref_snapshot.
> >
> > freezing isn't the right solution for the backup problem ;)
>
> Confused, what's freezing _is_ for then? Patch description says:
>
> Currently, ext3 in mainline Linux doesn't have the freeze feature
> which suspends write requests. So, we cannot take a backup which
> keeps the filesystem's consistency with the storage device's features
> (snapshot and replication) while it is mounted.
I tihnk the idea there is
freeze . do the snapshot op . unfreeze . make backup of snapshot
one can argue about the need of doing the first 3 steps via a userland
loop; it sure sounds like one needs to be really careful to not do any
writes to the fs from the app that does snapshots (and that includes
doing any syscalls in the kernel that allocate memory.. just because
that already could cause unrelated data to be written from inside the
app. Not fun.)
--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On Tue, Jul 08, 2008 at 11:41:20PM -0700, Arjan van de Ven wrote:
> On Wed, 09 Jul 2008 08:22:56 +0200
> Miklos Szeredi <[email protected]> wrote:
> > > On Wed, Jul 09, 2008 at 08:13:21AM +0200, Miklos Szeredi wrote:
> > > > This would mean that freeze and thaw will have to be done on the
> > > > same file descriptor, but this isn't unreasonable to expect, is
> > > > it?
> > >
> > > It is certainly not the current use case, where you run one command
> > > to freeze the filesystem and another one to unfreeze it.
> >
> > So instead of
> >
> > freeze_fs mountpoint
> > backup-command
> > unfreeze_fs mountpoint
> >
> > the user would have do to
> >
> > run_freezed mountpoint backup-command
> >
> > I find the second one nicer, regardless of any reliability issues.
>
> nah he needs to do
>
> make_snapshot ; backup-command ; unref_snapshot.
>
> freezing isn't the right solution for the backup problem ;)
You're forgetting that to take a snapshot you generally need to
freeze the filesystem. ;) i.e:
freeze; make_snapshot; unfreeze
backup-command
unref_snapshot
Yes, dm_snapshot does the freeze/snapshot/unfreeze for you, but the
point is there is a freeze in the example you gave.
The argument against Miklos' version is that there may be multiple
commands to execute while the fs is frozen.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, 8 Jul 2008, Arjan van de Ven wrote:
> I tihnk the idea there is
>
> freeze . do the snapshot op . unfreeze . make backup of snapshot
Ah, so then my proposal would become
run_frozen mountpoint do-snapshot
do-backup
release-snapshot
and if they are afraid of deadlocks they can just implement the
timeout in userspace:
run_frozen -t timeout mountpoint do-snapshot
'run_frozen' can be a trivial 30 line app, that can be guaranteed not
to deadlock.
> one can argue about the need of doing the first 3 steps via a userland
> loop; it sure sounds like one needs to be really careful to not do any
> writes to the fs from the app that does snapshots (and that includes
> doing any syscalls in the kernel that allocate memory.. just because
> that already could cause unrelated data to be written from inside the
> app. Not fun.)
Userland always has to be careful when messing with raw devices. That
alone is not a reason to put the snapshotting facility in kernel IMO.
Miklos
On Tue, Jul 08, 2008 at 11:55:02PM -0700, Arjan van de Ven wrote:
> On Wed, 09 Jul 2008 08:48:42 +0200
> Miklos Szeredi <[email protected]> wrote:
>
> > On Tue, 8 Jul 2008, Arjan van de Ven wrote:
> > > nah he needs to do
> > >
> > > make_snapshot ; backup-command ; unref_snapshot.
> > >
> > > freezing isn't the right solution for the backup problem ;)
> >
> > Confused, what's freezing _is_ for then? Patch description says:
> >
> > Currently, ext3 in mainline Linux doesn't have the freeze feature
> > which suspends write requests. So, we cannot take a backup which
> > keeps the filesystem's consistency with the storage device's features
> > (snapshot and replication) while it is mounted.
>
> I tihnk the idea there is
>
> freeze . do the snapshot op . unfreeze . make backup of snapshot
>
> one can argue about the need of doing the first 3 steps via a userland
> loop; it sure sounds like one needs to be really careful to not do any
> writes to the fs from the app that does snapshots (and that includes
> doing any syscalls in the kernel that allocate memory.. just because
> that already could cause unrelated data to be written from inside the
> app. Not fun.)
Bloody hell! Doesn't *anyone* understand that a frozen filesystem is
*clean*? That the process of freezing it ensures all dirty data and
metadata is written out before the freeze completes? And that once
frozen, it can't be dirtied until unfrozen?
That's 3 (or is it 4 - maybe 5 now that I think about it) different
ppl in 24 hours that have made this same broken argument about
being unable to write back dirty data on a frozen filesystem......
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, 9 Jul 2008, Dave Chinner wrote:
> The argument against Miklos' version is that there may be multiple
> commands to execute while the fs is frozen.
Which is what a shell is for ;)
Miklos
On Wed, Jul 09, 2008 at 09:13:34AM +0200, Miklos Szeredi wrote:
> On Wed, 9 Jul 2008, Dave Chinner wrote:
> > The argument against Miklos' version is that there may be multiple
> > commands to execute while the fs is frozen.
>
> Which is what a shell is for ;)
Yeah, weĺl, with your method I ca't tell a user to:
# xfs_freeze -f /mntpt
# xfs_db -r -c 'sb 0' -c 'p rootino' /dev/foo
128
# xfs_db -r -c 'ino 128' -c 'p core.format' /dev/foo
core.format = 2 (extents)
# xfs_db .....
.....
# xfs_freeze -u /mntpt
i.e. using the freeze to force all metadata to disk and
prevent it from changing while doing interactive debugging
of some problem.
Yes, a one-shot freeze/unfreeze mechanism works for some
use cases. The point is that it does not work for them all.....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, 9 Jul 2008, Dave Chinner wrote:
> On Wed, Jul 09, 2008 at 09:13:34AM +0200, Miklos Szeredi wrote:
> > On Wed, 9 Jul 2008, Dave Chinner wrote:
> > > The argument against Miklos' version is that there may be multiple
> > > commands to execute while the fs is frozen.
> >
> > Which is what a shell is for ;)
>
> Yeah, weĺl, with your method I ca't tell a user to:
>
> # xfs_freeze -f /mntpt
> # xfs_db -r -c 'sb 0' -c 'p rootino' /dev/foo
> 128
> # xfs_db -r -c 'ino 128' -c 'p core.format' /dev/foo
> core.format = 2 (extents)
> # xfs_db .....
> .....
> # xfs_freeze -u /mntpt
>
> i.e. using the freeze to force all metadata to disk and
> prevent it from changing while doing interactive debugging
> of some problem.
# run_freeze /mntpt /bin/bash
# ...
# ^D
It's the same, no?
Miklos
>
> Bloody hell! Doesn't *anyone* understand that a frozen filesystem is
> *clean*? That the process of freezing it ensures all dirty data and
> metadata is written out before the freeze completes? And that once
> frozen, it can't be dirtied until unfrozen?
What do you mean by "it can't be diritied until unfrozen". What
happens if I have a kernel compilation happening on a filesystem which
I am trying to freeze? Does
(a) the freeze fail (because the checks equivalent to what happens
when you remount a filesystem read-only happen)?
(b) The process gets a kill -9 when it tries to write a file on the
frozen filesystem?
(c) The process gets a kill -STOP when it tries to write
to a file on the frozen filesystem?
(d) The process won't fail, but just continue to run, filling the page
cache with dirty pages that can't be written out because the
filesystem is frozen?
If the answer is (b) or (c), and if you don't have a timeout, and the
backup process which has frozen the filesystem tries to write to the
filesystem, hilarity will ensue....
> That's 3 (or is it 4 - maybe 5 now that I think about it) different
> ppl in 24 hours that have made this same broken argument about
> being unable to write back dirty data on a frozen filesystem......
It's not a question of writing back dirty data, it's the fact that you
*can't*, leading to the page cache filling up wirth dirty data,
leading eventually to the OOM killer running --- and since the last
time I tried suggesting that if the process holding the file
descriptor freezing the filesystem, that idea got shot down (I see
it's been suggested again), if that happens, there is going to be no
other recovery path other than the Big Red Button.
- Ted
On Wed, Jul 09, 2008 at 10:11:01AM +0200, Miklos Szeredi wrote:
> On Wed, 9 Jul 2008, Dave Chinner wrote:
> > On Wed, Jul 09, 2008 at 09:13:34AM +0200, Miklos Szeredi wrote:
> > > On Wed, 9 Jul 2008, Dave Chinner wrote:
> > > > The argument against Miklos' version is that there may be multiple
> > > > commands to execute while the fs is frozen.
> > >
> > > Which is what a shell is for ;)
> >
> > Yeah, weĺl, with your method I ca't tell a user to:
> >
> > # xfs_freeze -f /mntpt
> > # xfs_db -r -c 'sb 0' -c 'p rootino' /dev/foo
> > 128
> > # xfs_db -r -c 'ino 128' -c 'p core.format' /dev/foo
> > core.format = 2 (extents)
> > # xfs_db .....
> > .....
> > # xfs_freeze -u /mntpt
> >
> > i.e. using the freeze to force all metadata to disk and
> > prevent it from changing while doing interactive debugging
> > of some problem.
>
> # run_freeze /mntpt /bin/bash
> # ...
> # ^D
>
> It's the same, no?
For that case, yeah. But it's a horrible hack - if that's the
best we can come up with for this freeze/unfreeze then we've
already lost.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Jul 09, 2008 at 07:09:00AM -0400, Theodore Tso wrote:
> >
> > Bloody hell! Doesn't *anyone* understand that a frozen filesystem is
> > *clean*? That the process of freezing it ensures all dirty data and
> > metadata is written out before the freeze completes? And that once
> > frozen, it can't be dirtied until unfrozen?
>
> What do you mean by "it can't be diritied until unfrozen". What
> happens if I have a kernel compilation happening on a filesystem which
> I am trying to freeze? Does
> (a) the freeze fail (because the checks equivalent to what happens
> when you remount a filesystem read-only happen)?
>
> (b) The process gets a kill -9 when it tries to write a file on the
> frozen filesystem?
>
> (c) The process gets a kill -STOP when it tries to write
> to a file on the frozen filesystem?
>
> (d) The process won't fail, but just continue to run, filling the page
> cache with dirty pages that can't be written out because the
> filesystem is frozen?
(e) none of the above. The kernel compilation will appear to pause
until the filesystem is unfrozen. No other visible effect should
occur. It will get blocked in a write or filesystem transaction
because the fs is frozen.
Look at vfs_check_frozen() - any call to that will block if the
filesystem is frozen or being frozen. The generic hook is in
__generic_file_aio_write_nolock() and various other filesystems have
calls in their specific write paths (fuse, ntfs, ocfs2, xfs, xip) to
do this.
For all other modifications, filesystem specific methods of
blocking transactions are used. XFS uses vfs_check_frozen() in
xfs_trans_alloc(), ext3 (and probably ocfs2) do it via
their ->write_super_lockfs method calling journal_lock_updates(),
ext4 via jbd2_lock_updates() and so on....
When the filesystem is unfrozen the journal is unlocked and
anything sleeping on the vfs_check_frozen() waitqueue is
woken.....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Jul 09, 2008 at 09:49:58PM +1000, Dave Chinner wrote:
> (e) none of the above. The kernel compilation will appear to pause
> until the filesystem is unfrozen. No other visible effect should
> occur. It will get blocked in a write or filesystem transaction
> because the fs is frozen.
So if the process which froze the filesystem accidentally tries
writing to a log file (or database file containing the backup
information, or whatever) that happens to be on the filesystem that is
frozen, that process will get blocked and you end up in a deadlock;
did I get that right?
- Ted
On Wed, 2008-07-09 at 08:24 -0400, Theodore Tso wrote:
> On Wed, Jul 09, 2008 at 09:49:58PM +1000, Dave Chinner wrote:
> > (e) none of the above. The kernel compilation will appear to pause
> > until the filesystem is unfrozen. No other visible effect should
> > occur. It will get blocked in a write or filesystem transaction
> > because the fs is frozen.
>
> So if the process which froze the filesystem accidentally tries
> writing to a log file (or database file containing the backup
> information, or whatever) that happens to be on the filesystem that is
> frozen, that process will get blocked and you end up in a deadlock;
> did I get that right?
Where do you see the deadlock?
The process doesn't have a lock on filesystem or something. You can
always unfreeze from another process.
Regards,
Olaf
--
Olaf Frączyk <[email protected]>
On Wed, 9 Jul 2008 17:13:46 +1000
Dave Chinner <[email protected]> wrote:
> > one can argue about the need of doing the first 3 steps via a
> > userland loop; it sure sounds like one needs to be really careful
> > to not do any writes to the fs from the app that does snapshots
> > (and that includes doing any syscalls in the kernel that allocate
> > memory.. just because that already could cause unrelated data to be
> > written from inside the app. Not fun.)
>
> Bloody hell! Doesn't *anyone* understand that a frozen filesystem is
> *clean*? That the process of freezing it ensures all dirty data and
> metadata is written out before the freeze completes? And that once
> frozen, it can't be dirtied until unfrozen?
>
> That's 3 (or is it 4 - maybe 5 now that I think about it) different
> ppl in 24 hours that have made this same broken argument about
> being unable to write back dirty data on a frozen filesystem......
unless you also freeze the system as a whole (in a 'refrigerator
suspend' way).. the "clean" part is just about a nanosecond long. After
that stuff gets dirty again (you're doing that sendfile to receive more
packets from the FTP upload etc etc).
Sure you can pause those. But there's a real risk that you end up
pausing the app that you want to unfreeze the fs (via the memory
allocation->direct reclaim path). And no, mlock doesn't help.
Especially with delayed allocation, where data writes will cause
metadata activity, this is not just theory.
--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On Wed, 9 Jul 2008 21:49:58 +1000
Dave Chinner <[email protected]> wrote:
>
> (e) none of the above. The kernel compilation will appear to pause
> until the filesystem is unfrozen. No other visible effect should
> occur. It will get blocked in a write or filesystem transaction
> because the fs is frozen.
>
> Look at vfs_check_frozen() - any call to that will block if the
> filesystem is frozen or being frozen. The generic hook is in
> __generic_file_aio_write_nolock() and various other filesystems have
> calls in their specific write paths (fuse, ntfs, ocfs2, xfs, xip) to
> do this.
yeah and mmap doesn't happen
>
> For all other modifications, filesystem specific methods of
> blocking transactions are used. XFS uses vfs_check_frozen() in
> xfs_trans_alloc(), ext3 (and probably ocfs2) do it via
> their ->write_super_lockfs method calling journal_lock_updates(),
> ext4 via jbd2_lock_updates() and so on....
and what if it's the process that you need to unfreeze the fs later?
Good luck.
--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
Jumping into the battle...
Advfs implemented freezefs and thawfs in 2001 so here is
the design rational from a commercial unix view.
Note - We already had built-in snapshots for local disk
consistent backups so some choices might be different on Linux.
NEED - provide way for SAN and hardware raid storage to do
its snapshot/copy function while the system was in-use and
get an image that could mount cleanly. Without freeze, at
a minimum we usually needed filesystem metadata recovery
to run, worst case is completely unusable snapshits :)
freezefs() is single-level:
ENOTSUPPOTED - by any other fs
EOK - done
EINPROGRESS
EALREADY
As implemented, freezefs only ensures the metadata is
consistent so the filesystem copy can mount anywhere.
This means ONLY SOME metadata (or no metadata) is flushed and
then all metadata updates are stopped. User/kernel writes
to already allocated file pages WILL go to a frozen disk.
It also means writers that need storage allocation (not
delaloc or existing) and things that semantically must
force on-disk updates will hang during the freeze.
These semantics meet the need and has the advantage of the
best perfomance. The design specification for freezefs
provided flags on the api to add more consistency options
later if they were desired:
- flush all dirty metadata
- flush all existing dirty file data
- prevent new dirty file data to disk
but they would all add to the "kill the system" problem.
freezefs has the timeout argument and the default timeout
is a system config parameter:
> 0 specifies the timeout value
= 0 uses the default timeout
< 0 disable timeout
A program could call the freezefs/thawfs api, but the
only current use is the separate commands
# freezefs
# [do your hardware raid stuff]
# thawfs
This is either operator driven or script/cron driven
because hardware raid providers (especially our own)
are really unfriendly and not helpful.
NUMBER ONE RULE - freeze must not hang/crash the system
because that defeats the customer reason for wanting it.
WHY A TIMEOUT - need a way for operator to abort the
freeze because with a frozen filesystem they may not
even be able to do a login to thaw it!
Users get pissed when the system is hung for a long
time and our experience with SAN devices is that their
response to commands is often unreasonably long.
In addition to the user controllable timeout mechanism,
we internally implement AUTO-THAW in the filesystem
whenever necessary to prevent a kernel hang/crash.
If an AUTO-THAW occurs, we post to the log and an
event manager so the user knows the snapshot is bad.
jim
On Wed, 09 Jul 2008 14:59:20 +0200
Olaf Frączyk <[email protected]> wrote:
> On Wed, 2008-07-09 at 08:24 -0400, Theodore Tso wrote:
> > On Wed, Jul 09, 2008 at 09:49:58PM +1000, Dave Chinner wrote:
> > > (e) none of the above. The kernel compilation will appear to
> > > pause until the filesystem is unfrozen. No other visible effect
> > > should occur. It will get blocked in a write or filesystem
> > > transaction because the fs is frozen.
> >
> > So if the process which froze the filesystem accidentally tries
> > writing to a log file (or database file containing the backup
> > information, or whatever) that happens to be on the filesystem that
> > is frozen, that process will get blocked and you end up in a
> > deadlock; did I get that right?
> Where do you see the deadlock?
> The process doesn't have a lock on filesystem or something. You can
> always unfreeze from another process.
>
if it's one of your main filesystems... good luck starting a shell
without writing a single thing to disk... FAIL.
Oops..
> WHY A TIMEOUT - need a way for operator to abort the
> freeze
instead of "abort" I should have said "limit" because
it is really proactive control (so they will not get
called in the middle of the night by pissed users).
I forgot to make it clear that TIMEOUT is the same as AUTO-THAW
in logging errors so the adnin knows they have a bad snapshot and
can do it again.
I also forgot to say that our customers all say they can deal
with retrying a snapshot, but not with unknown bad snapshots
and most definitely not with killing their 24/7 operations!
jim
On Wed 2008-07-09 09:08:07, Miklos Szeredi wrote:
> On Tue, 8 Jul 2008, Arjan van de Ven wrote:
> > I tihnk the idea there is
> >
> > freeze . do the snapshot op . unfreeze . make backup of snapshot
>
> Ah, so then my proposal would become
>
> run_frozen mountpoint do-snapshot
> do-backup
> release-snapshot
>
> and if they are afraid of deadlocks they can just implement the
> timeout in userspace:
>
> run_frozen -t timeout mountpoint do-snapshot
>
> 'run_frozen' can be a trivial 30 line app, that can be guaranteed not
> to deadlock.
Userland apps can be swapped out and need kernel memory allocations
during syscalls.
I bet even sleep(30) uses kmalloc internally.
So yes, even trivial applications can deadlock.
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Hi!
> > > > > I still disagree with this whole patch. There is not reason to let
> > > > > the freeze request timeout - an auto-unfreezing will only confuse the
> > > > > hell out of the caller. The only reason where the current XFS freeze
> > > > > call can hang and this would be theoretically useful is when the
> > > >
> > > > What happens when someone dirties so much data that vm swaps out
> > > > whatever process that frozen the filesystem?
> > >
> > > a) you can't dirty a frozen filesystem - by definition a frozen
> > > filesystem is a *clean filesystem* and *cannot be dirtied*.
> >
> > Can you stop me?
> >
> > mmap("/some/huge_file", MAP_SHARED);
> >
> > then write to memory mapping?
>
> Sure - we can put a hook in ->page_mkwrite() to prevent it. We
> don't right now because nobody in the real world really cares if one
> half of a concurrent user data change is in the old snapshot or the
> new one......
>
> > > b) Swap doesn't write through the filesystem
> > > c) you can still read from a frozen filesystem to page your
> > > executable?? in.
> >
> > atime modification should mean dirty data, right?
>
> Metadata, not data. If that's really a problem (and it never has
> been for XFS because we always allow in memory changes to atime)
> then touch_atime could be easily changed to avoid this...
>
> > And dirty data mean
> > memory pressure, right?
>
> If you walk enough inodes while the filesystem is frozen, it
> theoretically could happen. Typically a filesystem is only for a
> few seconds at a time so in the real world this has never, ever been
> a problem.
So we have freezing interface that does not really freeze, and
that can break the system when filesystem is frozen for too long...
:-(.
Maybe you could use process freezer -- cgroup people are adding
userspace interface to that -- to solve those... but that would mean
stopping everyone but thread doing freezing...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Hi!
> NEED - provide way for SAN and hardware raid storage to do
> its snapshot/copy function while the system was in-use and
> get an image that could mount cleanly. Without freeze, at
> a minimum we usually needed filesystem metadata recovery
> to run, worst case is completely unusable snapshits :)
>
> freezefs() is single-level:
>
> ENOTSUPPOTED - by any other fs
> EOK - done
> EINPROGRESS
> EALREADY
>
> As implemented, freezefs only ensures the metadata is
> consistent so the filesystem copy can mount anywhere.
>
> This means ONLY SOME metadata (or no metadata) is flushed and
> then all metadata updates are stopped. User/kernel writes
> to already allocated file pages WILL go to a frozen disk.
That's the difference here. They do write file data, and thus avoid
mmap()-writes problem.
...and they _still_ provide auto-thaw.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Pavel Machek wrote:
>>This means ONLY SOME metadata (or no metadata) is flushed and
>>then all metadata updates are stopped. User/kernel writes
>>to already allocated file pages WILL go to a frozen disk.
>
> That's the difference here. They do write file data, and thus avoid
> mmap()-writes problem.
>
> ...and they _still_ provide auto-thaw.
> Pavel
One of the hardest things to make people understand is that
stopping file data writes in the filesystem during a freeze
is not just dangerous, it is also __worthless__ unless you
have a complete "user environment freeze" mechanism.
In a real 24/7 environment, the DB and application stack
may be poorly glued together stuff from multiple vendors.
And unless each independent component has a freeze and they
can all be coordinated, the data in the pipeline is never
stable enough to say "if you stop all writes to disk and
take a snapshot, this is the same as an orderly shutdown,
backup, restore, and startup".
If you need to stop applications before a freeze, there
is no reason to implement "stop writing file data to disk".
The only real way to make it work (and what the smart apps
do) is to have application "checkpoint" commands so they
can roll-back to a stable point from the snapshot while
allowing new user activity to proceed.
People who don't have checkpoints or some other way to
make their environment stable with a transitioning snapshot
must stop all user activity before snapshotting and have
maintenance windows defined to do that.
jim
On Sun 2008-07-13 13:15:43, jim owens wrote:
> Pavel Machek wrote:
>
>>> This means ONLY SOME metadata (or no metadata) is flushed and
>>> then all metadata updates are stopped. User/kernel writes
>>> to already allocated file pages WILL go to a frozen disk.
>>
>> That's the difference here. They do write file data, and thus avoid
>> mmap()-writes problem.
>>
>> ...and they _still_ provide auto-thaw.
>> Pavel
>
> One of the hardest things to make people understand is that
> stopping file data writes in the filesystem during a freeze
> is not just dangerous, it is also __worthless__ unless you
> have a complete "user environment freeze" mechanism.
Eh?
> And unless each independent component has a freeze and they
> can all be coordinated, the data in the pipeline is never
> stable enough to say "if you stop all writes to disk and
> take a snapshot, this is the same as an orderly shutdown,
> backup, restore, and startup".
If you also freeze data writes, at least snapshot is not worse than
_unexpected_ shutdown.
And apps should already be ready for unexpected shutdowns (using fsync
etc).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Hi,
jim owens wrote:
> In addition to the user controllable timeout mechanism,
> we internally implement AUTO-THAW in the filesystem
> whenever necessary to prevent a kernel hang/crash.
>
> If an AUTO-THAW occurs, we post to the log and an
> event manager so the user knows the snapshot is bad.
I am interested in AUTO-THAW.
What is the difference between the timeout and AUTO-THAW?
When the kernel detects a deadlock, does it occur to solve it?
Cheers, Takashi
Pavel Machek wrote:
> If you also freeze data writes, at least snapshot is not worse than
> _unexpected_ shutdown.
True. But the key point is also that stopping file writes is
__no better__ than an unexpected shutdown for applications.
Once kernel people accept that it is identical to an unknown
shutdown state, they realize that as you said...
> And apps should already be ready for unexpected shutdowns (using fsync
> etc).
The people writing the code outside the kernel are the ones
responsible for the logic of handling "we don't know what
application writes made it to disk".
Remember that immediately after fsync(), the next write
can make the file inconsistent. For example, look at this
simple sales processing database type sequence:
write(sale_last_name_file, "Mackek"
write(sale_first_name_file, "Pavel")
fsync(sale_last_name_file)
fsync(sale_first_name_file)
...
write(sale_last_name_file, "Owens")
write(sale_first_name_file, "Jim")
fsync(sale_last_name_file)
FREEZEFS [defined to NOT allow file data writes]
So if we restart from that snapshot, the sales order
thinks the customer is "Pavel Owens"... unless there
is some mechanism such as seqence numbers that tie
the files together. And if they have such a mechanism
then it doesn't matter if we allow even more writes
such as:
write(sale_first_name_file, "Ted")
There just is no way inside the kernel to know a
freeze was triggered at a stable application point
such that data before the freeze must be on disk
and data after the freez must not go to disk.
This issue is not unique to freeze, it is also
present with filesystems that have snapshots and
cow/versioning built in.
jim
Takashi Sato wrote:
> What is the difference between the timeout and AUTO-THAW?
> When the kernel detects a deadlock, does it occur to solve it?
TIMEOUT is a user-specified limit for the freeze. It is
not a deadlock preventer or deadlock breaker. The reason
it exists is:
- middle of the night (low but not zero users)
- cron triggers freeze and hardware snapshot
- san is overloaded by tape copy traffic so
hardware will take 2 hours to ack snapshot done
- user "company president" tries to create a report
needed for an AM meeting with bankers
- with so few users, system will just patiently
wait for hardware to finish
- after 10 minutes "company president" pages
admin, admin's boss, and "IT vice president"
in a real unhappy mood
AUTO-THAW is simply a name for the effect of all deadlock
preventer and deadlock breaker code that the kernel has
in the freeze implementation paths... if that code would
unfreeze the filesystem. We also implemented deadlock
preventer code that does not thaw the freeze.
None of the AUTO-THAW code is there to stop a stupid
userspace program caller of freeze. It handles things
like "a system in our cluster is going down so we
must have this filesystem unfrozen or the whole
cluster will crash". In places where there could be
a kernel deadlock we made it "lock-only-if-non-blocking"
and if we could not wait to retry later, the failure
to lock would trigger an immediate unfreeze.
Deadlock prevention needs code in critical paths in more
than just filesystems. Sometimes this is as simple as
an "I can't wait on freeze" flag added to a vm-filesystem
interface.
Timers just don't work for keeping the kernel alive
because they don't trigger on resource exhaustion.
jim