2005-12-24 20:04:23

by OGAWA Hirofumi

[permalink] [raw]
Subject: [EXPERIMENT] Add new "flush" option

This adds new "flush" option for hotplug devices.

Current implementation of "flush" option does,

- synchronizing data pages at ->release() (last close(2))
- if user's work seems to be done (fs is not active), all
metadata syncs by pdflush()

This option would provide kind of sane progress, and dirty buffers is
flushed more frequently (if fs is not active). This option doesn't
provide any robustness (robustness is provided by other options), but
probably the option is proper for hotplug devices.

(Please don't assume that dirty buffers is synchronized at any point.
This implementation will be changed easily.)
--
OGAWA Hirofumi <[email protected]>


Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/Makefile | 2 -
fs/flush.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/fs-writeback.c | 1
fs/namespace.c | 1
fs/super.c | 3 +
include/linux/fs.h | 21 ++++++++++-
6 files changed, 119 insertions(+), 3 deletions(-)

diff -puN fs/Makefile~add-flush_option fs/Makefile
--- linux-2.6/fs/Makefile~add-flush_option 2005-12-24 12:46:59.000000000 +0900
+++ linux-2.6-hirofumi/fs/Makefile 2005-12-24 12:46:59.000000000 +0900
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
- ioprio.o pnode.o
+ ioprio.o pnode.o flush.o

obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff -puN /dev/null fs/flush.c
--- /dev/null 2005-12-25 04:55:31.716482250 +0900
+++ linux-2.6-hirofumi/fs/flush.c 2005-12-24 12:46:59.000000000 +0900
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2005, OGAWA Hirofumi
+ * Released under GPL v2.
+ */
+
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+
+#define FLUSH_INITAL_DELAY HZ
+#define FLUSH_DELAY (HZ / 2)
+
+int fs_flush_sync_fdata(struct inode *inode, struct file *filp)
+{
+ int err = 0;
+
+ if (IS_FLUSH(inode) && filp->f_mode & FMODE_WRITE) {
+ current->flags |= PF_SYNCWRITE;
+ err = filemap_write_and_wait(inode->i_mapping);
+ current->flags &= ~PF_SYNCWRITE;
+ }
+ return err;
+}
+EXPORT_SYMBOL(fs_flush_sync_fdata);
+
+static void fs_flush_pdflush_handler(unsigned long arg)
+{
+ struct super_block *sb = (struct super_block *)arg;
+ fsync_super(sb);
+ up_read(&sb->s_umount);
+}
+
+static void fs_flush_timer(unsigned long data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct backing_dev_info *bdi = blk_get_backing_dev_info(sb->s_bdev);
+ unsigned long last_flush_jiff;
+ int err;
+
+ if (bdi_write_congested(bdi)) {
+ mod_timer(&sb->flush_timer, jiffies + (HZ / 10));
+ return;
+ }
+
+ last_flush_jiff = sb->last_flush_jiff;
+
+ if (!time_after_eq(jiffies, last_flush_jiff + FLUSH_DELAY)) {
+ mod_timer(&sb->flush_timer, last_flush_jiff + FLUSH_DELAY);
+ return;
+ }
+
+ if (down_read_trylock(&sb->s_umount)) {
+ if (sb->s_root) {
+ err = pdflush_operation(fs_flush_pdflush_handler, data);
+ if (!err)
+ return;
+ mod_timer(&sb->flush_timer, jiffies + FLUSH_DELAY);
+ }
+ up_read(&sb->s_umount);
+ }
+}
+
+void __fs_mark_flush(struct super_block *sb)
+{
+ sb->last_flush_jiff = jiffies;
+ /*
+ * make sure by smb_wmb() that dirty buffers before here is
+ * processed at the timer routine.
+ */
+ smp_wmb();
+
+ if (!timer_pending(&sb->flush_timer))
+ mod_timer(&sb->flush_timer, jiffies + FLUSH_INITAL_DELAY);
+}
+EXPORT_SYMBOL(__fs_mark_flush);
+
+/*
+ * caller must take down_write(sb->s_umount), otherwise pdflush
+ * handler may be run after this del_timer_sync.
+ */
+void fs_flush_stop(struct super_block *sb)
+{
+ sb->s_flags &= ~MS_FLUSH;
+ del_timer_sync(&sb->flush_timer);
+}
+
+void fs_flush_init(struct super_block *sb)
+{
+ init_timer(&sb->flush_timer);
+ sb->flush_timer.data = (unsigned long)sb;
+ sb->flush_timer.function = fs_flush_timer;
+ sb->last_flush_jiff = 0;
+}
diff -puN fs/fs-writeback.c~add-flush_option fs/fs-writeback.c
--- linux-2.6/fs/fs-writeback.c~add-flush_option 2005-12-24 12:46:59.000000000 +0900
+++ linux-2.6-hirofumi/fs/fs-writeback.c 2005-12-24 12:46:59.000000000 +0900
@@ -63,6 +63,7 @@ void __mark_inode_dirty(struct inode *in
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode);
+ fs_mark_flush(sb);
}

/*
diff -puN fs/namespace.c~add-flush_option fs/namespace.c
--- linux-2.6/fs/namespace.c~add-flush_option 2005-12-24 12:46:59.000000000 +0900
+++ linux-2.6-hirofumi/fs/namespace.c 2005-12-24 12:46:59.000000000 +0900
@@ -354,6 +354,7 @@ static int show_vfsmnt(struct seq_file *
} fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
+ { MS_FLUSH, ",flush" },
{ MS_MANDLOCK, ",mand" },
{ MS_NOATIME, ",noatime" },
{ MS_NODIRATIME, ",nodiratime" },
diff -puN fs/super.c~add-flush_option fs/super.c
--- linux-2.6/fs/super.c~add-flush_option 2005-12-24 12:46:59.000000000 +0900
+++ linux-2.6-hirofumi/fs/super.c 2005-12-24 12:46:59.000000000 +0900
@@ -86,6 +86,7 @@ static struct super_block *alloc_super(v
s->s_qcop = sb_quotactl_ops;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
+ fs_flush_init(s);
}
out:
return s;
@@ -230,6 +231,7 @@ void generic_shutdown_super(struct super

if (root) {
sb->s_root = NULL;
+ fs_flush_stop(sb);
shrink_dcache_parent(root);
shrink_dcache_anon(&sb->s_anon);
dput(root);
@@ -536,6 +538,7 @@ int do_remount_sb(struct super_block *sb
return -EACCES;
if (flags & MS_RDONLY)
acct_auto_close(sb);
+ fs_flush_stop(sb);
shrink_dcache_sb(sb);
fsync_super(sb);

diff -puN include/linux/fs.h~add-flush_option include/linux/fs.h
--- linux-2.6/include/linux/fs.h~add-flush_option 2005-12-24 12:46:59.000000000 +0900
+++ linux-2.6-hirofumi/include/linux/fs.h 2005-12-24 12:46:59.000000000 +0900
@@ -98,6 +98,7 @@ extern int dir_notify_enable;
#define MS_REMOUNT 32 /* Alter flags of a mounted FS */
#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#define MS_FLUSH 256 /* Frequently flushes for hotplug devices */
#define MS_NOATIME 1024 /* Do not update access times. */
#define MS_NODIRATIME 2048 /* Do not update directory access times */
#define MS_BIND 4096
@@ -115,8 +116,8 @@ extern int dir_notify_enable;
/*
* Superblock flags that can be altered by MS_REMOUNT
*/
-#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\
- MS_NODIRATIME)
+#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_FLUSH| \
+ MS_NOATIME|MS_NODIRATIME)

/*
* Old magic mount flag and mask
@@ -157,6 +158,7 @@ extern int dir_notify_enable;
((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
+#define IS_FLUSH(inode) __IS_FLG(inode, MS_FLUSH)
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)

#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
@@ -222,6 +224,7 @@ extern int dir_notify_enable;
#include <linux/prio_tree.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/timer.h>

#include <asm/atomic.h>
#include <asm/semaphore.h>
@@ -826,6 +829,9 @@ struct super_block {
/* Granuality of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;
+
+ struct timer_list flush_timer;
+ unsigned long last_flush_jiff;
};

extern struct timespec current_fs_time(struct super_block *sb);
@@ -1099,6 +1105,17 @@ static inline void file_accessed(struct
touch_atime(file->f_vfsmnt, file->f_dentry);
}

+int fs_flush_sync_fdata(struct inode *inode, struct file *filp);
+void __fs_mark_flush(struct super_block *sb);
+void fs_flush_stop(struct super_block *sb);
+void fs_flush_init(struct super_block *sb);
+
+static inline void fs_mark_flush(struct super_block *sb)
+{
+ if (sb->s_flags & MS_FLUSH)
+ __fs_mark_flush(sb);
+}
+
int sync_inode(struct inode *inode, struct writeback_control *wbc);

/**
_




Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/fat/file.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff -puN fs/fat/file.c~fat_flush-support fs/fat/file.c
--- linux-2.6/fs/fat/file.c~fat_flush-support 2005-12-25 04:58:16.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/file.c 2005-12-25 04:58:16.000000000 +0900
@@ -119,6 +119,7 @@ struct file_operations fat_file_operatio
.writev = generic_file_writev,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
+ .release = fs_flush_sync_fdata,
.mmap = generic_file_mmap,
.ioctl = fat_generic_ioctl,
.fsync = file_fsync,
@@ -272,7 +273,8 @@ static int fat_free(struct inode *inode,

void fat_truncate(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
const unsigned int cluster_size = sbi->cluster_size;
int nr_clusters;

@@ -288,6 +290,7 @@ void fat_truncate(struct inode *inode)
lock_kernel();
fat_free(inode, nr_clusters);
unlock_kernel();
+ fs_mark_flush(sb);
}

struct inode_operations fat_file_inode_operations = {
_




Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/fat/inode.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff -puN fs/fat/inode.c~fat_flush-debug fs/fat/inode.c
--- linux-2.6/fs/fat/inode.c~fat_flush-debug 2005-12-25 04:58:19.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/inode.c 2005-12-25 04:58:19.000000000 +0900
@@ -855,7 +855,7 @@ static int fat_show_options(struct seq_f
enum {
Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid,
Opt_umask, Opt_dmask, Opt_fmask, Opt_codepage, Opt_nocase,
- Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable,
+ Opt_flush, Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable,
Opt_dots, Opt_nodots,
Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
@@ -877,6 +877,7 @@ static match_table_t fat_tokens = {
{Opt_fmask, "fmask=%o"},
{Opt_codepage, "codepage=%u"},
{Opt_nocase, "nocase"},
+ {Opt_flush, "flush"},
{Opt_quiet, "quiet"},
{Opt_showexec, "showexec"},
{Opt_debug, "debug"},
@@ -991,6 +992,9 @@ static int parse_options(char *options,
| VFAT_SFN_CREATE_WIN95;
}
break;
+ case Opt_flush:
+ *debug = 1;
+ break;
case Opt_quiet:
opts->quiet = 1;
break;
@@ -1182,6 +1186,8 @@ int fat_fill_super(struct super_block *s
error = parse_options(data, isvfat, silent, &debug, &sbi->options);
if (error)
goto out_fail;
+ if (debug)
+ sb->s_flags |= MS_FLUSH;

error = -EIO;
sb_min_blocksize(sb, 512);
_


2005-12-25 12:19:27

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXPERIMENT] Add new "flush" option

OGAWA Hirofumi <[email protected]> wrote:
>
> This adds new "flush" option for hotplug devices.
>
> Current implementation of "flush" option does,
>
> - synchronizing data pages at ->release() (last close(2))
> - if user's work seems to be done (fs is not active), all
> metadata syncs by pdflush()
>
> This option would provide kind of sane progress, and dirty buffers is
> flushed more frequently (if fs is not active). This option doesn't
> provide any robustness (robustness is provided by other options), but
> probably the option is proper for hotplug devices.
>
> (Please don't assume that dirty buffers is synchronized at any point.
> This implementation will be changed easily.)
>
> ...
>
> +
> +#define FLUSH_INITAL_DELAY HZ
> +#define FLUSH_DELAY (HZ / 2)
> +
> +int fs_flush_sync_fdata(struct inode *inode, struct file *filp)
> +{
> + int err = 0;
> +
> + if (IS_FLUSH(inode) && filp->f_mode & FMODE_WRITE) {
> + current->flags |= PF_SYNCWRITE;
> + err = filemap_write_and_wait(inode->i_mapping);
> + current->flags &= ~PF_SYNCWRITE;
> + }
> + return err;
> +}

You can use filp->f_mapping here, remove the inode* argument.

> +EXPORT_SYMBOL(fs_flush_sync_fdata);
> +
> +static void fs_flush_pdflush_handler(unsigned long arg)
> +{
> + struct super_block *sb = (struct super_block *)arg;
> + fsync_super(sb);
> + up_read(&sb->s_umount);
> +}
> +
> +static void fs_flush_timer(unsigned long data)
> +{
> + struct super_block *sb = (struct super_block *)data;
> + struct backing_dev_info *bdi = blk_get_backing_dev_info(sb->s_bdev);
> + unsigned long last_flush_jiff;
> + int err;
> +
> + if (bdi_write_congested(bdi)) {
> + mod_timer(&sb->flush_timer, jiffies + (HZ / 10));
> + return;
> + }

The bdi_write_congested() test probably isn't doing anything useful: it
only returns true if there's really heavy writeout in progress. Possibly
you could look at the disk queue accounting stats, work out how much I/O
has been happening lately.

> + last_flush_jiff = sb->last_flush_jiff;
> +
> + if (!time_after_eq(jiffies, last_flush_jiff + FLUSH_DELAY)) {
> + mod_timer(&sb->flush_timer, last_flush_jiff + FLUSH_DELAY);
> + return;
> + }
> +
> + if (down_read_trylock(&sb->s_umount)) {
> + if (sb->s_root) {
> + err = pdflush_operation(fs_flush_pdflush_handler, data);
> + if (!err)
> + return;
> + mod_timer(&sb->flush_timer, jiffies + FLUSH_DELAY);
> + }
> + up_read(&sb->s_umount);
> + }
> +}
>
> +void __fs_mark_flush(struct super_block *sb)
> +{
> + sb->last_flush_jiff = jiffies;
> + /*
> + * make sure by smb_wmb() that dirty buffers before here is
> + * processed at the timer routine.
> + */
> + smp_wmb();
> +
> + if (!timer_pending(&sb->flush_timer))
> + mod_timer(&sb->flush_timer, jiffies + FLUSH_INITAL_DELAY);
> +}
> +EXPORT_SYMBOL(__fs_mark_flush);
> +
> +/*
> + * caller must take down_write(sb->s_umount), otherwise pdflush
> + * handler may be run after this del_timer_sync.
> + */
> +void fs_flush_stop(struct super_block *sb)
> +{
> + sb->s_flags &= ~MS_FLUSH;
> + del_timer_sync(&sb->flush_timer);
> +}
> +
> +void fs_flush_init(struct super_block *sb)
> +{
> + init_timer(&sb->flush_timer);
> + sb->flush_timer.data = (unsigned long)sb;
> + sb->flush_timer.function = fs_flush_timer;
> + sb->last_flush_jiff = 0;
> +}

The superblock lifetime handling all looks OK to me.


However I wonder if all this code would become simpler if we were to just
tweak writeback_inodes() a bit: if the superblock was mounted with MS_FLUSH
then temporarily set wbc->sync_mode to WB_SYNC_ALL.

If we do that, the regular wb_kupdate() will sync all the IS_FLUSH
filesystems for us at dirty_writeback_centisecs intervals. That's a bit
less flexible, but probably less code.

(Alternatively, add a new writeback_control.ms_flush_only boolean. Set
that, then reuse some of the code in mm/page-writeback.c in some manner).

You abandoned the flush-file-in-release() idea? That seemed faily neat.
What was the thinking here?

2005-12-25 14:23:17

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [EXPERIMENT] Add new "flush" option

Andrew Morton <[email protected]> writes:

>> + if (bdi_write_congested(bdi)) {
>> + mod_timer(&sb->flush_timer, jiffies + (HZ / 10));
>> + return;
>> + }
>
> The bdi_write_congested() test probably isn't doing anything useful: it
> only returns true if there's really heavy writeout in progress. Possibly
> you could look at the disk queue accounting stats, work out how much I/O
> has been happening lately.

Umm... If queue is too busy, we can't flush immediately. So, this code
is delaying flush, and we can wait the more user's request by it, I think.

> However I wonder if all this code would become simpler if we were to just
> tweak writeback_inodes() a bit: if the superblock was mounted with MS_FLUSH
> then temporarily set wbc->sync_mode to WB_SYNC_ALL.
>
> If we do that, the regular wb_kupdate() will sync all the IS_FLUSH
> filesystems for us at dirty_writeback_centisecs intervals. That's a bit
> less flexible, but probably less code.
>
> (Alternatively, add a new writeback_control.ms_flush_only boolean. Set
> that, then reuse some of the code in mm/page-writeback.c in some manner).

Yes, but probably we need to change handling of bd_inode (sb->s_bdev).
The buffer_heads seems to be synced via block_super. And DQUOT_SYNC(sb)
may be needed.

I think it will need big change (and cleanup) after all. (That's no
problem itself I think, but, I would need to make more time)

> You abandoned the flush-file-in-release() idea? That seemed faily neat.
> What was the thinking here?

I'm still doing it in the following. I thought the other filesystems
may select other flushing strategy on ->release(), so I didn't do it in VFS.

diff -puN fs/fat/file.c~fat_flush-support fs/fat/file.c
--- linux-2.6/fs/fat/file.c~fat_flush-support 2005-12-25 04:58:16.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/file.c 2005-12-25 04:58:16.000000000 +0900
@@ -119,6 +119,7 @@ struct file_operations fat_file_operatio
.writev = generic_file_writev,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
+ .release = fs_flush_sync_fdata,
.mmap = generic_file_mmap,
.ioctl = fat_generic_ioctl,
.fsync = file_fsync,
--
OGAWA Hirofumi <[email protected]>

2005-12-25 16:03:04

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [EXPERIMENT] Add new "flush" option

I was missing the following part. Sorry.

Andrew Morton <[email protected]> writes:

> You can use filp->f_mapping here, remove the inode* argument.

Ah, I see. However, it is intended to use as ->release() handler.
So, I updated the patch as following.

+int fs_flush_sync_fdata(struct inode *inode, struct file *filp)
+{
+ int err = 0;
+
+ if (IS_FLUSH(inode) && filp->f_mode & FMODE_WRITE) {
+ current->flags |= PF_SYNCWRITE;
+ err = filemap_write_and_wait(filp->f_mapping);
+ current->flags &= ~PF_SYNCWRITE;
+ }
+ return err;
+}
--
OGAWA Hirofumi <[email protected]>

2005-12-25 23:05:09

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXPERIMENT] Add new "flush" option

OGAWA Hirofumi <[email protected]> wrote:
>
> Andrew Morton <[email protected]> writes:
>
> >> + if (bdi_write_congested(bdi)) {
> >> + mod_timer(&sb->flush_timer, jiffies + (HZ / 10));
> >> + return;
> >> + }
> >
> > The bdi_write_congested() test probably isn't doing anything useful: it
> > only returns true if there's really heavy writeout in progress. Possibly
> > you could look at the disk queue accounting stats, work out how much I/O
> > has been happening lately.
>
> Umm... If queue is too busy, we can't flush immediately. So, this code
> is delaying flush, and we can wait the more user's request by it, I think.

The thing is that bdi_write_congested() will only return true when the
queue is under really really heavy writeout. Most of the time it just
won't trigger, so this code isn't doing anything very useful.

If you indeed want to implement something like "only sync the device when
it is otherwise idle" then some different approach will be needed. One
which

a) detects when there is light writeout happening and which

b) detects when there are reads happening too.

I don't know whether any of this is particularly useful, really. So I'd
suggest that we just remove the bdi_write_contested() test and leave it at
that.

I could be wrong, of course - if you have some testcase in which the
bdi_write_congested() test makes some perceptible difference then I'd be
interested in hearing about it. If you put a printk in there, does it
trigger much?

2005-12-26 16:37:45

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [EXPERIMENT] Add new "flush" option

Andrew Morton <[email protected]> writes:

>> Umm... If queue is too busy, we can't flush immediately. So, this code
>> is delaying flush, and we can wait the more user's request by it, I think.
>
> The thing is that bdi_write_congested() will only return true when the
> queue is under really really heavy writeout. Most of the time it just
> won't trigger, so this code isn't doing anything very useful.
>
> If you indeed want to implement something like "only sync the device when
> it is otherwise idle" then some different approach will be needed. One
> which
>
> a) detects when there is light writeout happening and which
>
> b) detects when there are reads happening too.
>
> I don't know whether any of this is particularly useful, really. So I'd
> suggest that we just remove the bdi_write_contested() test and leave it at
> that.

OK, I'm not sure whether this part is really useful or not, so I'll
remove this.

Also, for improving this patch overall, I'd like to hear feedback from
real user of hotplug devices.

> I could be wrong, of course - if you have some testcase in which the
> bdi_write_congested() test makes some perceptible difference then I'd be
> interested in hearing about it. If you put a printk in there, does it
> trigger much?

I didn't trigger this on my test. However queue->nr_requests is
configurable (/sys/block/xxx/queue/nr_requests), so if user changes
it, I guess it may be triggered.

But also I don't know whether it is useful or not.

Thanks.
--
OGAWA Hirofumi <[email protected]>

2005-12-26 19:00:27

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [EXPERIMENT] Add new "flush" option

Hi,

Here is updated patch. The changes is

- s/EXPORT_SYMBOL/&_GPL/
- Use filp->f_mapping in fs_flush_sync_fdata().
- Remove bdi_write_congested() check


Please try this if you have interests/complaints.

These patches is including the debug patch to try "-o flush" without
changeing mount command.

e.g.
# mount -t vfat /dev/xxx /mnt -o flush

Thanks.
--
OGAWA Hirofumi <[email protected]>


This adds new "flush" option for hotplug devices.

Current implementation of "flush" option does,

- synchronizing data pages at ->release() (last close(2))
- if user's work seems to be done (fs is not active), all
metadata syncs by pdflush()

This option would provide kind of sane progress, and dirty buffers is
flushed more frequently (if fs is not active). This option doesn't
provide any robustness (robustness is provided by other options), but
probably the option is proper for hotplug devices.

(Please don't assume that dirty buffers is synchronized at any point.
This implementation will be changed easily.)

Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/Makefile | 2 -
fs/flush.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/fs-writeback.c | 1
fs/namespace.c | 1
fs/super.c | 3 +
include/linux/fs.h | 21 +++++++++++-
6 files changed, 112 insertions(+), 3 deletions(-)

diff -puN fs/Makefile~add-flush_option fs/Makefile
--- linux-2.6/fs/Makefile~add-flush_option 2005-12-26 00:45:16.000000000 +0900
+++ linux-2.6-hirofumi/fs/Makefile 2005-12-26 00:45:16.000000000 +0900
@@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
- ioprio.o pnode.o
+ ioprio.o pnode.o flush.o

obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff -puN /dev/null fs/flush.c
--- /dev/null 2005-12-26 10:53:17.220513750 +0900
+++ linux-2.6-hirofumi/fs/flush.c 2005-12-27 01:41:43.000000000 +0900
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2005, OGAWA Hirofumi
+ * Released under GPL v2.
+ */
+
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+
+#define FLUSH_INITAL_DELAY HZ
+#define FLUSH_DELAY (HZ / 2)
+
+int fs_flush_sync_fdata(struct inode *inode, struct file *filp)
+{
+ int err = 0;
+
+ if (IS_FLUSH(inode) && filp->f_mode & FMODE_WRITE) {
+ current->flags |= PF_SYNCWRITE;
+ err = filemap_write_and_wait(filp->f_mapping);
+ current->flags &= ~PF_SYNCWRITE;
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(fs_flush_sync_fdata);
+
+static void fs_flush_pdflush_handler(unsigned long arg)
+{
+ struct super_block *sb = (struct super_block *)arg;
+ fsync_super(sb);
+ up_read(&sb->s_umount);
+}
+
+static void fs_flush_timer(unsigned long data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ unsigned long last_flush_jiff;
+ int err;
+
+ last_flush_jiff = sb->last_flush_jiff;
+
+ if (!time_after_eq(jiffies, last_flush_jiff + FLUSH_DELAY)) {
+ mod_timer(&sb->flush_timer, last_flush_jiff + FLUSH_DELAY);
+ return;
+ }
+
+ if (down_read_trylock(&sb->s_umount)) {
+ if (sb->s_root) {
+ err = pdflush_operation(fs_flush_pdflush_handler, data);
+ if (!err)
+ return;
+ mod_timer(&sb->flush_timer, jiffies + FLUSH_DELAY);
+ }
+ up_read(&sb->s_umount);
+ }
+}
+
+void __fs_mark_flush(struct super_block *sb)
+{
+ sb->last_flush_jiff = jiffies;
+ /*
+ * make sure by smb_wmb() that dirty buffers before here is
+ * processed at the timer routine.
+ */
+ smp_wmb();
+
+ if (!timer_pending(&sb->flush_timer))
+ mod_timer(&sb->flush_timer, jiffies + FLUSH_INITAL_DELAY);
+}
+EXPORT_SYMBOL_GPL(__fs_mark_flush);
+
+/*
+ * caller must take down_write(sb->s_umount), otherwise pdflush
+ * handler may be run after this del_timer_sync.
+ */
+void fs_flush_stop(struct super_block *sb)
+{
+ sb->s_flags &= ~MS_FLUSH;
+ del_timer_sync(&sb->flush_timer);
+}
+
+void fs_flush_init(struct super_block *sb)
+{
+ init_timer(&sb->flush_timer);
+ sb->flush_timer.data = (unsigned long)sb;
+ sb->flush_timer.function = fs_flush_timer;
+ sb->last_flush_jiff = 0;
+}
diff -puN fs/fs-writeback.c~add-flush_option fs/fs-writeback.c
--- linux-2.6/fs/fs-writeback.c~add-flush_option 2005-12-26 00:45:16.000000000 +0900
+++ linux-2.6-hirofumi/fs/fs-writeback.c 2005-12-26 00:45:16.000000000 +0900
@@ -63,6 +63,7 @@ void __mark_inode_dirty(struct inode *in
if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
if (sb->s_op->dirty_inode)
sb->s_op->dirty_inode(inode);
+ fs_mark_flush(sb);
}

/*
diff -puN fs/namespace.c~add-flush_option fs/namespace.c
--- linux-2.6/fs/namespace.c~add-flush_option 2005-12-26 00:45:16.000000000 +0900
+++ linux-2.6-hirofumi/fs/namespace.c 2005-12-26 00:45:16.000000000 +0900
@@ -354,6 +354,7 @@ static int show_vfsmnt(struct seq_file *
} fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
+ { MS_FLUSH, ",flush" },
{ MS_MANDLOCK, ",mand" },
{ MS_NOATIME, ",noatime" },
{ MS_NODIRATIME, ",nodiratime" },
diff -puN fs/super.c~add-flush_option fs/super.c
--- linux-2.6/fs/super.c~add-flush_option 2005-12-26 00:45:16.000000000 +0900
+++ linux-2.6-hirofumi/fs/super.c 2005-12-26 00:45:16.000000000 +0900
@@ -86,6 +86,7 @@ static struct super_block *alloc_super(v
s->s_qcop = sb_quotactl_ops;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
+ fs_flush_init(s);
}
out:
return s;
@@ -230,6 +231,7 @@ void generic_shutdown_super(struct super

if (root) {
sb->s_root = NULL;
+ fs_flush_stop(sb);
shrink_dcache_parent(root);
shrink_dcache_anon(&sb->s_anon);
dput(root);
@@ -536,6 +538,7 @@ int do_remount_sb(struct super_block *sb
return -EACCES;
if (flags & MS_RDONLY)
acct_auto_close(sb);
+ fs_flush_stop(sb);
shrink_dcache_sb(sb);
fsync_super(sb);

diff -puN include/linux/fs.h~add-flush_option include/linux/fs.h
--- linux-2.6/include/linux/fs.h~add-flush_option 2005-12-26 00:45:16.000000000 +0900
+++ linux-2.6-hirofumi/include/linux/fs.h 2005-12-26 00:45:16.000000000 +0900
@@ -98,6 +98,7 @@ extern int dir_notify_enable;
#define MS_REMOUNT 32 /* Alter flags of a mounted FS */
#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#define MS_FLUSH 256 /* Frequently flushes for hotplug devices */
#define MS_NOATIME 1024 /* Do not update access times. */
#define MS_NODIRATIME 2048 /* Do not update directory access times */
#define MS_BIND 4096
@@ -115,8 +116,8 @@ extern int dir_notify_enable;
/*
* Superblock flags that can be altered by MS_REMOUNT
*/
-#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\
- MS_NODIRATIME)
+#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_FLUSH| \
+ MS_NOATIME|MS_NODIRATIME)

/*
* Old magic mount flag and mask
@@ -157,6 +158,7 @@ extern int dir_notify_enable;
((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
+#define IS_FLUSH(inode) __IS_FLG(inode, MS_FLUSH)
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)

#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
@@ -222,6 +224,7 @@ extern int dir_notify_enable;
#include <linux/prio_tree.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/timer.h>

#include <asm/atomic.h>
#include <asm/semaphore.h>
@@ -826,6 +829,9 @@ struct super_block {
/* Granuality of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;
+
+ struct timer_list flush_timer;
+ unsigned long last_flush_jiff;
};

extern struct timespec current_fs_time(struct super_block *sb);
@@ -1099,6 +1105,17 @@ static inline void file_accessed(struct
touch_atime(file->f_vfsmnt, file->f_dentry);
}

+int fs_flush_sync_fdata(struct inode *inode, struct file *filp);
+void __fs_mark_flush(struct super_block *sb);
+void fs_flush_stop(struct super_block *sb);
+void fs_flush_init(struct super_block *sb);
+
+static inline void fs_mark_flush(struct super_block *sb)
+{
+ if (sb->s_flags & MS_FLUSH)
+ __fs_mark_flush(sb);
+}
+
int sync_inode(struct inode *inode, struct writeback_control *wbc);

/**
_



Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/fat/file.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff -puN fs/fat/file.c~fat_flush-support fs/fat/file.c
--- linux-2.6/fs/fat/file.c~fat_flush-support 2005-12-26 00:55:31.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/file.c 2005-12-26 00:55:31.000000000 +0900
@@ -119,6 +119,7 @@ struct file_operations fat_file_operatio
.writev = generic_file_writev,
.aio_read = generic_file_aio_read,
.aio_write = generic_file_aio_write,
+ .release = fs_flush_sync_fdata,
.mmap = generic_file_mmap,
.ioctl = fat_generic_ioctl,
.fsync = file_fsync,
@@ -272,7 +273,8 @@ static int fat_free(struct inode *inode,

void fat_truncate(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
const unsigned int cluster_size = sbi->cluster_size;
int nr_clusters;

@@ -288,6 +290,7 @@ void fat_truncate(struct inode *inode)
lock_kernel();
fat_free(inode, nr_clusters);
unlock_kernel();
+ fs_mark_flush(sb);
}

struct inode_operations fat_file_inode_operations = {
_



Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/fat/inode.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff -puN fs/fat/inode.c~fat_flush-debug fs/fat/inode.c
--- linux-2.6/fs/fat/inode.c~fat_flush-debug 2005-12-26 00:55:35.000000000 +0900
+++ linux-2.6-hirofumi/fs/fat/inode.c 2005-12-26 00:55:35.000000000 +0900
@@ -855,7 +855,7 @@ static int fat_show_options(struct seq_f
enum {
Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid,
Opt_umask, Opt_dmask, Opt_fmask, Opt_codepage, Opt_nocase,
- Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable,
+ Opt_flush, Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable,
Opt_dots, Opt_nodots,
Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
@@ -877,6 +877,7 @@ static match_table_t fat_tokens = {
{Opt_fmask, "fmask=%o"},
{Opt_codepage, "codepage=%u"},
{Opt_nocase, "nocase"},
+ {Opt_flush, "flush"},
{Opt_quiet, "quiet"},
{Opt_showexec, "showexec"},
{Opt_debug, "debug"},
@@ -991,6 +992,9 @@ static int parse_options(char *options,
| VFAT_SFN_CREATE_WIN95;
}
break;
+ case Opt_flush:
+ *debug = 1;
+ break;
case Opt_quiet:
opts->quiet = 1;
break;
@@ -1182,6 +1186,8 @@ int fat_fill_super(struct super_block *s
error = parse_options(data, isvfat, silent, &debug, &sbi->options);
if (error)
goto out_fail;
+ if (debug)
+ sb->s_flags |= MS_FLUSH;

error = -EIO;
sb_min_blocksize(sb, 512);
_