Add a new mount option which enables a new "lazytime" mode. This mode
causes atime, mtime, and ctime updates to only be made to the
in-memory version of the inode. The on-disk times will only get
updated when (a) when the inode table block for the inode needs to be
updated for some non-time related change involving any inode in the
block, (b) if userspace calls fsync(), or (c) the refcount on an
undeleted inode goes to zero (in most cases, when the last file
descriptor assoicated with the inode is closed).
This is legal according to POSIX because there are no guarantees after
a crash unless userspace explicitly requests via a fsync(2) call. So
in fact, this a better way of reducing the disk traffic resulting from
atime is use lazytime instead of relatime or noatime. Enabling
lazytime and disabling the default realtime will result in fewer extra
disk writes, and has the benefits of being POSIX-compliant --- since
either noatime and relatime violates POSIX.
The lazytime mode reduces pressure on the journal spinlocks, since
time updates resulting from calls to file_update_time() are almost
always done using separate jbd2 handles. For allocating writes, the
inode will need to be updated anyway when i_blocks change, and so the
mtime updates will be folded into jbd2 handle in the ext4 write path.
In addition, for workloads feature a large number of random write to a
preallocated file, the lazytime mount option significantly reduces
writes to the inode table. The repeated 4k writes to a single block
will result in undesirable stress on flash devices and SMR disk
drives. Even on conventional HDD's, the repeated writes to the inode
table block will trigger Adjacent Track Interference (ATI) remediation
latencies, which very negatively impact 99.9 percentile latencies ---
which is a very big deal for web serving tiers (for example).
n.b.: because of the many wins of this mode, we may want to enable
lazytime updates by default in the future. If you know of use cases
where having inaccurate mtime values after a crash would be extremely
problematic, please us know at [email protected].
Google-Bug-Id: 18297052
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/ext4.h | 3 +++
fs/ext4/file.c | 1 +
fs/ext4/fsync.c | 3 +++
fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/namei.c | 2 ++
fs/ext4/super.c | 14 ++++++++++++
fs/ext4/symlink.c | 2 ++
fs/inode.c | 36 +++++++++++++++++++++++++++++++
include/linux/fs.h | 2 ++
9 files changed, 124 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c55a1fa..494c504 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -970,6 +970,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#define EXT4_MOUNT_LAZYTIME 0x00200 /* Update the time lazily */
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -1407,6 +1408,7 @@ enum {
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
+ EXT4_STATE_DIRTY_TIME, /* the time needs to be updated */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2114,6 +2116,7 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
+extern int ext4_update_time(struct inode *, struct timespec *, int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int ext4_sync_inode(handle_t *, struct inode *);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8..2cf6aaf 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -603,6 +603,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
+ .update_time = ext4_update_time,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f..ba05c83 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -116,6 +116,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
+
+ if (!datasync && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME))
+ ext4_dirty_inode(inode, 0);
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3356ab5..1b5e4bd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4163,6 +4163,46 @@ static int ext4_inode_blocks_set(handle_t *handle,
}
/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+ unsigned long orig_ino, char *buf)
+{
+ struct ext4_inode_info *ei;
+ struct ext4_inode *raw_inode;
+ unsigned long ino;
+ struct inode *inode;
+ int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int inode_size = EXT4_INODE_SIZE(sb);
+
+ ino = orig_ino & ~(inodes_per_block - 1);
+ for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+ if (ino == orig_ino)
+ continue;
+ inode = find_active_inode_nowait(sb, ino);
+ if (!inode ||
+ !ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
+ iput(inode);
+ continue;
+ }
+ raw_inode = (struct ext4_inode *) buf;
+ ei = EXT4_I(inode);
+
+ smp_mb__before_spinlock();
+ spin_lock(&ei->i_raw_lock);
+ ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
+ EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+ ext4_inode_csum_set(inode, raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+ iput(inode);
+ }
+}
+
+/*
* Post the struct inode info into an on-disk inode location in the
* buffer-cache. This gobbles the caller's reference to the
* buffer_head in the inode location struct.
@@ -4182,7 +4222,9 @@ static int ext4_do_update_inode(handle_t *handle,
uid_t i_uid;
gid_t i_gid;
+ smp_mb__before_spinlock();
spin_lock(&ei->i_raw_lock);
+ ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
/* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
@@ -4273,8 +4315,8 @@ static int ext4_do_update_inode(handle_t *handle,
}
ext4_inode_csum_set(inode, raw_inode, ei);
-
spin_unlock(&ei->i_raw_lock);
+ ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4622,6 +4664,24 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+int ext4_update_time(struct inode *inode, struct timespec *time, int flags)
+{
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_VERSION)
+ inode_inc_iversion(inode);
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+ if (test_opt(inode->i_sb, LAZYTIME)) {
+ smp_wmb();
+ ext4_set_inode_state(inode, EXT4_STATE_DIRTY_TIME);
+ } else
+ mark_inode_dirty_sync(inode);
+ return 0;
+}
+
static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
int pextents)
{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4262118..f782040 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3532,6 +3532,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.tmpfile = ext4_tmpfile,
.rename2 = ext4_rename2,
.setattr = ext4_setattr,
+ .update_time = ext4_update_time,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
@@ -3545,6 +3546,7 @@ const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
+ .update_time = ext4_update_time,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c9e686..16c9983 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -910,6 +910,14 @@ static int ext4_drop_inode(struct inode *inode)
int drop = generic_drop_inode(inode);
trace_ext4_drop_inode(inode, drop);
+ if (!drop && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
+ atomic_inc(&inode->i_count);
+ spin_unlock(&inode->i_lock);
+ ext4_dirty_inode(inode, 0);
+ spin_lock(&inode->i_lock);
+ if (atomic_dec_and_test(&inode->i_count))
+ drop = generic_drop_inode(inode);
+ }
return drop;
}
@@ -1142,6 +1150,7 @@ enum {
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
@@ -1204,6 +1213,8 @@ static const match_table_t tokens = {
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_removed, "mblk_io_submit"},
{Opt_removed, "nomblk_io_submit"},
@@ -1361,6 +1372,8 @@ static const struct mount_opts {
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
MOPT_EXT4_ONLY | MOPT_CLEAR},
+ {Opt_lazytime, EXT4_MOUNT_LAZYTIME, MOPT_SET},
+ {Opt_nolazytime, EXT4_MOUNT_LAZYTIME, MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_SET},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -3514,6 +3527,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+ set_opt(sb, LAZYTIME);
set_opt(sb, INIT_INODE_TABLE);
if (def_mount_opts & EXT4_DEFM_DEBUG)
set_opt(sb, DEBUG);
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ff37119..7c92b93 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,6 +35,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.setattr = ext4_setattr,
+ .update_time = ext4_update_time,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
.setattr = ext4_setattr,
+ .update_time = ext4_update_time,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
diff --git a/fs/inode.c b/fs/inode.c
index 26753ba..cde073a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1280,6 +1280,42 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
}
EXPORT_SYMBOL(ilookup);
+/**
+ * find_active_inode_nowait - find an active inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * Search for an active inode @ino in the inode cache, and if the
+ * inode is in the cache, the inode is returned with an incremented
+ * reference count. If the inode is being freed or is newly
+ * initialized, return nothing instead of trying to wait for the inode
+ * initialization or destruction to be complete.
+ */
+struct inode *find_active_inode_nowait(struct super_block *sb,
+ unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode, *ret_inode = NULL;
+
+ spin_lock(&inode_hash_lock);
+ hlist_for_each_entry(inode, head, i_hash) {
+ if ((inode->i_ino != ino) ||
+ (inode->i_sb != sb))
+ continue;
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) == 0) {
+ __iget(inode);
+ ret_inode = inode;
+ }
+ spin_unlock(&inode->i_lock);
+ goto out;
+ }
+out:
+ spin_unlock(&inode_hash_lock);
+ return ret_inode;
+}
+EXPORT_SYMBOL(find_active_inode_nowait);
+
int insert_inode_locked(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e..b5e6b6b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2410,6 +2410,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern struct inode *find_active_inode_nowait(struct super_block *,
+ unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
--
2.1.0
Theodore Ts'o <[email protected]> writes:
> Add a new mount option which enables a new "lazytime" mode. This mode
> causes atime, mtime, and ctime updates to only be made to the
> in-memory version of the inode. The on-disk times will only get
> updated when (a) when the inode table block for the inode needs to be
> updated for some non-time related change involving any inode in the
> block, (b) if userspace calls fsync(), or (c) the refcount on an
> undeleted inode goes to zero (in most cases, when the last file
> descriptor assoicated with the inode is closed).
>
> This is legal according to POSIX because there are no guarantees after
> a crash unless userspace explicitly requests via a fsync(2) call. So
> in fact, this a better way of reducing the disk traffic resulting from
> atime is use lazytime instead of relatime or noatime. Enabling
> lazytime and disabling the default realtime will result in fewer extra
> disk writes, and has the benefits of being POSIX-compliant --- since
> either noatime and relatime violates POSIX.
Indeed, in fact even current implementation can not guarantee correct
mtime in case of power-fail for example:
DIO_WRITE_TASK
->file_update_time:
journal_start
save mtime in journal in tid:B
journal_stop
->direct_IO(): modify files's content which may be flushed to non
volatile storage.
POWER_FAILURE
NEW_MOUNT: journal replay will find that tid:B has no commit record and
ignore it. As result file has old mtime, but new content
>
> The lazytime mode reduces pressure on the journal spinlocks, since
> time updates resulting from calls to file_update_time() are almost
> always done using separate jbd2 handles. For allocating writes, the
> inode will need to be updated anyway when i_blocks change, and so the
> mtime updates will be folded into jbd2 handle in the ext4 write path.
>
> In addition, for workloads feature a large number of random write to a
> preallocated file, the lazytime mount option significantly reduces
> writes to the inode table. The repeated 4k writes to a single block
> will result in undesirable stress on flash devices and SMR disk
> drives. Even on conventional HDD's, the repeated writes to the inode
> table block will trigger Adjacent Track Interference (ATI) remediation
> latencies, which very negatively impact 99.9 percentile latencies ---
> which is a very big deal for web serving tiers (for example).
Also sync mtime updates is a great pain for AIO submitter
because AIO submission may be blocked for a seconds (up to 5 second in my case)
if inode is part of current committing transaction see: do_get_write_access
>
> n.b.: because of the many wins of this mode, we may want to enable
> lazytime updates by default in the future. If you know of use cases
> where having inaccurate mtime values after a crash would be extremely
> problematic, please us know at [email protected].
>
> Google-Bug-Id: 18297052
Yeah we also has ticket for that :)
https://jira.sw.ru/browse/PSBM-20411
>
> Signed-off-by: Theodore Ts'o <[email protected]>
Patch looks good with minor nodes, please see below.
> ---
> fs/ext4/ext4.h | 3 +++
> fs/ext4/file.c | 1 +
> fs/ext4/fsync.c | 3 +++
> fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/namei.c | 2 ++
> fs/ext4/super.c | 14 ++++++++++++
> fs/ext4/symlink.c | 2 ++
> fs/inode.c | 36 +++++++++++++++++++++++++++++++
> include/linux/fs.h | 2 ++
> 9 files changed, 124 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index c55a1fa..494c504 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -970,6 +970,7 @@ struct ext4_inode_info {
> #define EXT4_MOUNT_ERRORS_MASK 0x00070
> #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
> #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
> +#define EXT4_MOUNT_LAZYTIME 0x00200 /* Update the time lazily */
> #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
> #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
> #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
> @@ -1407,6 +1408,7 @@ enum {
> EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
> EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
> EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
> + EXT4_STATE_DIRTY_TIME, /* the time needs to be updated */
> };
>
> #define EXT4_INODE_BIT_FNS(name, field, offset) \
> @@ -2114,6 +2116,7 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
> extern int ext4_setattr(struct dentry *, struct iattr *);
> extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
> struct kstat *stat);
> +extern int ext4_update_time(struct inode *, struct timespec *, int);
> extern void ext4_evict_inode(struct inode *);
> extern void ext4_clear_inode(struct inode *);
> extern int ext4_sync_inode(handle_t *, struct inode *);
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 8131be8..2cf6aaf 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -603,6 +603,7 @@ const struct file_operations ext4_file_operations = {
> const struct inode_operations ext4_file_inode_operations = {
> .setattr = ext4_setattr,
> .getattr = ext4_getattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index a8bc47f..ba05c83 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -116,6 +116,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> if (ret)
> return ret;
> +
> + if (!datasync && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME))
> + ext4_dirty_inode(inode, 0);
> /*
> * data=writeback,ordered:
> * The caller's filemap_fdatawrite()/wait will sync the data.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3356ab5..1b5e4bd 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4163,6 +4163,46 @@ static int ext4_inode_blocks_set(handle_t *handle,
> }
>
> /*
> + * Opportunistically update the other time fields for other inodes in
> + * the same inode table block.
> + */
> +static void ext4_update_other_inodes_time(struct super_block *sb,
> + unsigned long orig_ino, char *buf)
> +{
> + struct ext4_inode_info *ei;
> + struct ext4_inode *raw_inode;
> + unsigned long ino;
> + struct inode *inode;
> + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> + int inode_size = EXT4_INODE_SIZE(sb);
> +
> + ino = orig_ino & ~(inodes_per_block - 1);
> + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> + if (ino == orig_ino)
> + continue;
> + inode = find_active_inode_nowait(sb, ino);
> + if (!inode ||
> + !ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
> + iput(inode);
> + continue;
> + }
> + raw_inode = (struct ext4_inode *) buf;
> + ei = EXT4_I(inode);
> +
> + smp_mb__before_spinlock();
> + spin_lock(&ei->i_raw_lock);
> + ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
> + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
> + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
> + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
> + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
> + ext4_inode_csum_set(inode, raw_inode, ei);
> + spin_unlock(&ei->i_raw_lock);
> + iput(inode);
> + }
> +}
> +
> +/*
> * Post the struct inode info into an on-disk inode location in the
> * buffer-cache. This gobbles the caller's reference to the
> * buffer_head in the inode location struct.
> @@ -4182,7 +4222,9 @@ static int ext4_do_update_inode(handle_t *handle,
> uid_t i_uid;
> gid_t i_gid;
>
> + smp_mb__before_spinlock();
> spin_lock(&ei->i_raw_lock);
> + ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
>
> /* For fields not tracked in the in-memory inode,
> * initialise them to zero for new inodes. */
> @@ -4273,8 +4315,8 @@ static int ext4_do_update_inode(handle_t *handle,
> }
>
> ext4_inode_csum_set(inode, raw_inode, ei);
> -
> spin_unlock(&ei->i_raw_lock);
> + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data);
>
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> rc = ext4_handle_dirty_metadata(handle, NULL, bh);
> @@ -4622,6 +4664,24 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
> return 0;
> }
>
> +int ext4_update_time(struct inode *inode, struct timespec *time, int flags)
> +{
> + if (flags & S_ATIME)
> + inode->i_atime = *time;
> + if (flags & S_VERSION)
> + inode_inc_iversion(inode);
> + if (flags & S_CTIME)
> + inode->i_ctime = *time;
> + if (flags & S_MTIME)
> + inode->i_mtime = *time;
> + if (test_opt(inode->i_sb, LAZYTIME)) {
> + smp_wmb();
> + ext4_set_inode_state(inode, EXT4_STATE_DIRTY_TIME);
Since we want update all in-memory data we also have to explicitly update inode->i_version
Which was previously updated implicitly here:
mark_inode_dirty_sync()
->__mark_inode_dirty
->ext4_dirty_inode
->ext4_mark_inode_dirty
->ext4_mark_iloc_dirty
->inode_inc_iversion(inode);
> + } else
> + mark_inode_dirty_sync(inode);
> + return 0;
> +}
> +
> static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
> int pextents)
> {
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 4262118..f782040 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -3532,6 +3532,7 @@ const struct inode_operations ext4_dir_inode_operations = {
> .tmpfile = ext4_tmpfile,
> .rename2 = ext4_rename2,
> .setattr = ext4_setattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> @@ -3545,6 +3546,7 @@ const struct inode_operations ext4_special_inode_operations = {
> .setattr = ext4_setattr,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> + .update_time = ext4_update_time,
> .listxattr = ext4_listxattr,
> .removexattr = generic_removexattr,
> .get_acl = ext4_get_acl,
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 2c9e686..16c9983 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -910,6 +910,14 @@ static int ext4_drop_inode(struct inode *inode)
> int drop = generic_drop_inode(inode);
>
> trace_ext4_drop_inode(inode, drop);
> + if (!drop && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
> + atomic_inc(&inode->i_count);
> + spin_unlock(&inode->i_lock);
> + ext4_dirty_inode(inode, 0);
> + spin_lock(&inode->i_lock);
> + if (atomic_dec_and_test(&inode->i_count))
> + drop = generic_drop_inode(inode);
> + }
> return drop;
> }
>
> @@ -1142,6 +1150,7 @@ enum {
> Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
> Opt_usrquota, Opt_grpquota, Opt_i_version,
> Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
> + Opt_lazytime, Opt_nolazytime,
> Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
> Opt_inode_readahead_blks, Opt_journal_ioprio,
> Opt_dioread_nolock, Opt_dioread_lock,
> @@ -1204,6 +1213,8 @@ static const match_table_t tokens = {
> {Opt_i_version, "i_version"},
> {Opt_stripe, "stripe=%u"},
> {Opt_delalloc, "delalloc"},
> + {Opt_lazytime, "lazytime"},
> + {Opt_nolazytime, "nolazytime"},
> {Opt_nodelalloc, "nodelalloc"},
> {Opt_removed, "mblk_io_submit"},
> {Opt_removed, "nomblk_io_submit"},
> @@ -1361,6 +1372,8 @@ static const struct mount_opts {
> MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
> {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
> MOPT_EXT4_ONLY | MOPT_CLEAR},
> + {Opt_lazytime, EXT4_MOUNT_LAZYTIME, MOPT_SET},
> + {Opt_nolazytime, EXT4_MOUNT_LAZYTIME, MOPT_CLEAR},
> {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
> MOPT_EXT4_ONLY | MOPT_SET},
> {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
> @@ -3514,6 +3527,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>
> /* Set defaults before we parse the mount options */
> def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
> + set_opt(sb, LAZYTIME);
> set_opt(sb, INIT_INODE_TABLE);
> if (def_mount_opts & EXT4_DEFM_DEBUG)
> set_opt(sb, DEBUG);
> diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
> index ff37119..7c92b93 100644
> --- a/fs/ext4/symlink.c
> +++ b/fs/ext4/symlink.c
> @@ -35,6 +35,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
> .follow_link = page_follow_link_light,
> .put_link = page_put_link,
> .setattr = ext4_setattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> @@ -45,6 +46,7 @@ const struct inode_operations ext4_fast_symlink_inode_operations = {
> .readlink = generic_readlink,
> .follow_link = ext4_follow_link,
> .setattr = ext4_setattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> diff --git a/fs/inode.c b/fs/inode.c
> index 26753ba..cde073a 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1280,6 +1280,42 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
> }
> EXPORT_SYMBOL(ilookup);
>
> +/**
> + * find_active_inode_nowait - find an active inode in the inode cache
> + * @sb: super block of file system to search
> + * @ino: inode number to search for
> + *
> + * Search for an active inode @ino in the inode cache, and if the
> + * inode is in the cache, the inode is returned with an incremented
> + * reference count. If the inode is being freed or is newly
> + * initialized, return nothing instead of trying to wait for the inode
> + * initialization or destruction to be complete.
> + */
> +struct inode *find_active_inode_nowait(struct super_block *sb,
> + unsigned long ino)
> +{
> + struct hlist_head *head = inode_hashtable + hash(sb, ino);
> + struct inode *inode, *ret_inode = NULL;
> +
> + spin_lock(&inode_hash_lock);
> + hlist_for_each_entry(inode, head, i_hash) {
> + if ((inode->i_ino != ino) ||
> + (inode->i_sb != sb))
> + continue;
> + spin_lock(&inode->i_lock);
> + if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) == 0) {
> + __iget(inode);
> + ret_inode = inode;
> + }
> + spin_unlock(&inode->i_lock);
> + goto out;
> + }
> +out:
> + spin_unlock(&inode_hash_lock);
> + return ret_inode;
> +}
> +EXPORT_SYMBOL(find_active_inode_nowait);
> +
> int insert_inode_locked(struct inode *inode)
> {
> struct super_block *sb = inode->i_sb;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9ab779e..b5e6b6b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2410,6 +2410,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
>
> extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
> extern struct inode * iget_locked(struct super_block *, unsigned long);
> +extern struct inode *find_active_inode_nowait(struct super_block *,
> + unsigned long);
> extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
> extern int insert_inode_locked(struct inode *);
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
[cc linux-fsdevel]
On Tue, Nov 11, 2014 at 11:07:07PM -0500, Theodore Ts'o wrote:
> Add a new mount option which enables a new "lazytime" mode. This mode
> causes atime, mtime, and ctime updates to only be made to the
> in-memory version of the inode. The on-disk times will only get
> updated when (a) when the inode table block for the inode needs to be
> updated for some non-time related change involving any inode in the
> block, (b) if userspace calls fsync(), or (c) the refcount on an
> undeleted inode goes to zero (in most cases, when the last file
> descriptor assoicated with the inode is closed).
>
> This is legal according to POSIX because there are no guarantees after
> a crash unless userspace explicitly requests via a fsync(2) call. So
> in fact, this a better way of reducing the disk traffic resulting from
> atime is use lazytime instead of relatime or noatime. Enabling
> lazytime and disabling the default realtime will result in fewer extra
> disk writes, and has the benefits of being POSIX-compliant --- since
> either noatime and relatime violates POSIX.
>
> The lazytime mode reduces pressure on the journal spinlocks, since
> time updates resulting from calls to file_update_time() are almost
> always done using separate jbd2 handles. For allocating writes, the
> inode will need to be updated anyway when i_blocks change, and so the
> mtime updates will be folded into jbd2 handle in the ext4 write path.
>
> In addition, for workloads feature a large number of random write to a
> preallocated file, the lazytime mount option significantly reduces
> writes to the inode table. The repeated 4k writes to a single block
> will result in undesirable stress on flash devices and SMR disk
> drives. Even on conventional HDD's, the repeated writes to the inode
> table block will trigger Adjacent Track Interference (ATI) remediation
> latencies, which very negatively impact 99.9 percentile latencies ---
> which is a very big deal for web serving tiers (for example).
>
> n.b.: because of the many wins of this mode, we may want to enable
> lazytime updates by default in the future. If you know of use cases
> where having inaccurate mtime values after a crash would be extremely
> problematic, please us know at [email protected].
I think this needs to a VFS level inode timestamp update option.
The games ext4 is playing with reference counts inside .drop_inode are
pretty nasty and could be avoided if this is implemented at the VFs
level..
I think that the "lazy time update" status should really be tracked
in the inode->i_state field. Something like lazytime updates do not
call ->update_inode, nor do they mark the inode dirty, but they do
update the inode->i_[acm]time fields and set a TIMEDIRTY state flag.
Then when the filesystem next logs or writes the inode it can
log those fields and clear the TIMEDIRTY flag, or if iput_final()
sees that flag it can call ->update_time directly to get the
filesystem to run a transaction to update the timestamps before the
inode is evicted from cache.
That way the same update semantics can be easily implemented on all
filesystems, and filesystems that already implement update_time
should work without any modification at all...
Cheers,
Dave.
>
> Google-Bug-Id: 18297052
>
> Signed-off-by: Theodore Ts'o <[email protected]>
> ---
> fs/ext4/ext4.h | 3 +++
> fs/ext4/file.c | 1 +
> fs/ext4/fsync.c | 3 +++
> fs/ext4/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/namei.c | 2 ++
> fs/ext4/super.c | 14 ++++++++++++
> fs/ext4/symlink.c | 2 ++
> fs/inode.c | 36 +++++++++++++++++++++++++++++++
> include/linux/fs.h | 2 ++
> 9 files changed, 124 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index c55a1fa..494c504 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -970,6 +970,7 @@ struct ext4_inode_info {
> #define EXT4_MOUNT_ERRORS_MASK 0x00070
> #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
> #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
> +#define EXT4_MOUNT_LAZYTIME 0x00200 /* Update the time lazily */
> #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
> #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
> #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
> @@ -1407,6 +1408,7 @@ enum {
> EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
> EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
> EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
> + EXT4_STATE_DIRTY_TIME, /* the time needs to be updated */
> };
>
> #define EXT4_INODE_BIT_FNS(name, field, offset) \
> @@ -2114,6 +2116,7 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
> extern int ext4_setattr(struct dentry *, struct iattr *);
> extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
> struct kstat *stat);
> +extern int ext4_update_time(struct inode *, struct timespec *, int);
> extern void ext4_evict_inode(struct inode *);
> extern void ext4_clear_inode(struct inode *);
> extern int ext4_sync_inode(handle_t *, struct inode *);
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 8131be8..2cf6aaf 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -603,6 +603,7 @@ const struct file_operations ext4_file_operations = {
> const struct inode_operations ext4_file_inode_operations = {
> .setattr = ext4_setattr,
> .getattr = ext4_getattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index a8bc47f..ba05c83 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -116,6 +116,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> if (ret)
> return ret;
> +
> + if (!datasync && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME))
> + ext4_dirty_inode(inode, 0);
> /*
> * data=writeback,ordered:
> * The caller's filemap_fdatawrite()/wait will sync the data.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3356ab5..1b5e4bd 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4163,6 +4163,46 @@ static int ext4_inode_blocks_set(handle_t *handle,
> }
>
> /*
> + * Opportunistically update the other time fields for other inodes in
> + * the same inode table block.
> + */
> +static void ext4_update_other_inodes_time(struct super_block *sb,
> + unsigned long orig_ino, char *buf)
> +{
> + struct ext4_inode_info *ei;
> + struct ext4_inode *raw_inode;
> + unsigned long ino;
> + struct inode *inode;
> + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
> + int inode_size = EXT4_INODE_SIZE(sb);
> +
> + ino = orig_ino & ~(inodes_per_block - 1);
> + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
> + if (ino == orig_ino)
> + continue;
> + inode = find_active_inode_nowait(sb, ino);
> + if (!inode ||
> + !ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
> + iput(inode);
> + continue;
> + }
> + raw_inode = (struct ext4_inode *) buf;
> + ei = EXT4_I(inode);
> +
> + smp_mb__before_spinlock();
> + spin_lock(&ei->i_raw_lock);
> + ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
> + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
> + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
> + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
> + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
> + ext4_inode_csum_set(inode, raw_inode, ei);
> + spin_unlock(&ei->i_raw_lock);
> + iput(inode);
> + }
> +}
> +
> +/*
> * Post the struct inode info into an on-disk inode location in the
> * buffer-cache. This gobbles the caller's reference to the
> * buffer_head in the inode location struct.
> @@ -4182,7 +4222,9 @@ static int ext4_do_update_inode(handle_t *handle,
> uid_t i_uid;
> gid_t i_gid;
>
> + smp_mb__before_spinlock();
> spin_lock(&ei->i_raw_lock);
> + ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
>
> /* For fields not tracked in the in-memory inode,
> * initialise them to zero for new inodes. */
> @@ -4273,8 +4315,8 @@ static int ext4_do_update_inode(handle_t *handle,
> }
>
> ext4_inode_csum_set(inode, raw_inode, ei);
> -
> spin_unlock(&ei->i_raw_lock);
> + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data);
>
> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
> rc = ext4_handle_dirty_metadata(handle, NULL, bh);
> @@ -4622,6 +4664,24 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
> return 0;
> }
>
> +int ext4_update_time(struct inode *inode, struct timespec *time, int flags)
> +{
> + if (flags & S_ATIME)
> + inode->i_atime = *time;
> + if (flags & S_VERSION)
> + inode_inc_iversion(inode);
> + if (flags & S_CTIME)
> + inode->i_ctime = *time;
> + if (flags & S_MTIME)
> + inode->i_mtime = *time;
> + if (test_opt(inode->i_sb, LAZYTIME)) {
> + smp_wmb();
> + ext4_set_inode_state(inode, EXT4_STATE_DIRTY_TIME);
> + } else
> + mark_inode_dirty_sync(inode);
> + return 0;
> +}
> +
> static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
> int pextents)
> {
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 4262118..f782040 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -3532,6 +3532,7 @@ const struct inode_operations ext4_dir_inode_operations = {
> .tmpfile = ext4_tmpfile,
> .rename2 = ext4_rename2,
> .setattr = ext4_setattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> @@ -3545,6 +3546,7 @@ const struct inode_operations ext4_special_inode_operations = {
> .setattr = ext4_setattr,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> + .update_time = ext4_update_time,
> .listxattr = ext4_listxattr,
> .removexattr = generic_removexattr,
> .get_acl = ext4_get_acl,
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 2c9e686..16c9983 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -910,6 +910,14 @@ static int ext4_drop_inode(struct inode *inode)
> int drop = generic_drop_inode(inode);
>
> trace_ext4_drop_inode(inode, drop);
> + if (!drop && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
> + atomic_inc(&inode->i_count);
> + spin_unlock(&inode->i_lock);
> + ext4_dirty_inode(inode, 0);
> + spin_lock(&inode->i_lock);
> + if (atomic_dec_and_test(&inode->i_count))
> + drop = generic_drop_inode(inode);
> + }
> return drop;
> }
>
> @@ -1142,6 +1150,7 @@ enum {
> Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
> Opt_usrquota, Opt_grpquota, Opt_i_version,
> Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
> + Opt_lazytime, Opt_nolazytime,
> Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
> Opt_inode_readahead_blks, Opt_journal_ioprio,
> Opt_dioread_nolock, Opt_dioread_lock,
> @@ -1204,6 +1213,8 @@ static const match_table_t tokens = {
> {Opt_i_version, "i_version"},
> {Opt_stripe, "stripe=%u"},
> {Opt_delalloc, "delalloc"},
> + {Opt_lazytime, "lazytime"},
> + {Opt_nolazytime, "nolazytime"},
> {Opt_nodelalloc, "nodelalloc"},
> {Opt_removed, "mblk_io_submit"},
> {Opt_removed, "nomblk_io_submit"},
> @@ -1361,6 +1372,8 @@ static const struct mount_opts {
> MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
> {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
> MOPT_EXT4_ONLY | MOPT_CLEAR},
> + {Opt_lazytime, EXT4_MOUNT_LAZYTIME, MOPT_SET},
> + {Opt_nolazytime, EXT4_MOUNT_LAZYTIME, MOPT_CLEAR},
> {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
> MOPT_EXT4_ONLY | MOPT_SET},
> {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
> @@ -3514,6 +3527,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>
> /* Set defaults before we parse the mount options */
> def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
> + set_opt(sb, LAZYTIME);
> set_opt(sb, INIT_INODE_TABLE);
> if (def_mount_opts & EXT4_DEFM_DEBUG)
> set_opt(sb, DEBUG);
> diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
> index ff37119..7c92b93 100644
> --- a/fs/ext4/symlink.c
> +++ b/fs/ext4/symlink.c
> @@ -35,6 +35,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
> .follow_link = page_follow_link_light,
> .put_link = page_put_link,
> .setattr = ext4_setattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> @@ -45,6 +46,7 @@ const struct inode_operations ext4_fast_symlink_inode_operations = {
> .readlink = generic_readlink,
> .follow_link = ext4_follow_link,
> .setattr = ext4_setattr,
> + .update_time = ext4_update_time,
> .setxattr = generic_setxattr,
> .getxattr = generic_getxattr,
> .listxattr = ext4_listxattr,
> diff --git a/fs/inode.c b/fs/inode.c
> index 26753ba..cde073a 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1280,6 +1280,42 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
> }
> EXPORT_SYMBOL(ilookup);
>
> +/**
> + * find_active_inode_nowait - find an active inode in the inode cache
> + * @sb: super block of file system to search
> + * @ino: inode number to search for
> + *
> + * Search for an active inode @ino in the inode cache, and if the
> + * inode is in the cache, the inode is returned with an incremented
> + * reference count. If the inode is being freed or is newly
> + * initialized, return nothing instead of trying to wait for the inode
> + * initialization or destruction to be complete.
> + */
> +struct inode *find_active_inode_nowait(struct super_block *sb,
> + unsigned long ino)
> +{
> + struct hlist_head *head = inode_hashtable + hash(sb, ino);
> + struct inode *inode, *ret_inode = NULL;
> +
> + spin_lock(&inode_hash_lock);
> + hlist_for_each_entry(inode, head, i_hash) {
> + if ((inode->i_ino != ino) ||
> + (inode->i_sb != sb))
> + continue;
> + spin_lock(&inode->i_lock);
> + if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) == 0) {
> + __iget(inode);
> + ret_inode = inode;
> + }
> + spin_unlock(&inode->i_lock);
> + goto out;
> + }
> +out:
> + spin_unlock(&inode_hash_lock);
> + return ret_inode;
> +}
> +EXPORT_SYMBOL(find_active_inode_nowait);
> +
> int insert_inode_locked(struct inode *inode)
> {
> struct super_block *sb = inode->i_sb;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9ab779e..b5e6b6b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2410,6 +2410,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
>
> extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
> extern struct inode * iget_locked(struct super_block *, unsigned long);
> +extern struct inode *find_active_inode_nowait(struct super_block *,
> + unsigned long);
> extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
> extern int insert_inode_locked(struct inode *);
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Dave Chinner
[email protected]
On 11/13/2014 08:41 AM, Dave Chinner wrote:
<>
>
> I think this needs to a VFS level inode timestamp update option.
> The games ext4 is playing with reference counts inside .drop_inode are
> pretty nasty and could be avoided if this is implemented at the VFs
> level..
>
> I think that the "lazy time update" status should really be tracked
> in the inode->i_state field. Something like lazytime updates do not
> call ->update_inode, nor do they mark the inode dirty, but they do
> update the inode->i_[acm]time fields and set a TIMEDIRTY state flag.
>
> Then when the filesystem next logs or writes the inode it can
> log those fields and clear the TIMEDIRTY flag, or if iput_final()
> sees that flag it can call ->update_time directly to get the
> filesystem to run a transaction to update the timestamps before the
> inode is evicted from cache.
>
> That way the same update semantics can be easily implemented on all
> filesystems, and filesystems that already implement update_time
> should work without any modification at all...
>
+1 Like from Boaz. I think Dave is very much right!
> Cheers,
Indeed
Boaz
> Dave.
<>
On Wed, Nov 12, 2014 at 04:47:42PM +0300, Dmitry Monakhov wrote:
> Also sync mtime updates is a great pain for AIO submitter
> because AIO submission may be blocked for a seconds (up to 5 second in my case)
> if inode is part of current committing transaction see: do_get_write_access
5 seconds?!? So you're seeing cases where the jbd2 layer is taking
that long to close a commit? It might be worth looking at that so we
can understand why that is happening, and to see if there's anything
we might do to improve things on that front. Even if we can get rid
of most of the mtime updates, there will be other cases where a commit
that takes a long time to complete will cause all sorts of other very
nasty latencies on the entire system.
> Yeah we also has ticket for that :)
> https://jira.sw.ru/browse/PSBM-20411
Is this supposed to be a URL to publically visible web page?
Host jira.sw.ru not found: 3(NXDOMAIN)
> > + if (flags & S_VERSION)
> > + inode_inc_iversion(inode);
....
> Since we want update all in-memory data we also have to explicitly update inode->i_version
> Which was previously updated implicitly here:
> mark_inode_dirty_sync()
> ->__mark_inode_dirty
> ->ext4_dirty_inode
> ->ext4_mark_inode_dirty
> ->ext4_mark_iloc_dirty
> ->inode_inc_iversion(inode);
It's not necessary to add a anothre call to inode_inc_version() since
we already incremented the i_version if S_VERSION is set, and
S_VERSIOn gets set when it's necessary to handle incrementing
i_Version.
The inode_inc_iversion() in mark4_ext4_iloc_dirty() is probably not
necessary, since we already should be incrementing i_version whenever
ctime and mtime gets updated. The inode_inc_iversion() there is more
of a "belt and suspenders" safety thing, on the theory that the extra
bump in i_version won't hurt anything.
Cheers,
- Ted
On Thu, Nov 13, 2014 at 05:41:50PM +1100, Dave Chinner wrote:
>
> I think this needs to a VFS level inode timestamp update option.
> The games ext4 is playing with reference counts inside .drop_inode are
> pretty nasty and could be avoided if this is implemented at the VFs
> level..
I'm happy to implement this at the VFS level, assuming that there are
no objections from other file system developers. I do need to note
that one potential downside of this feature is that if an inode stays
in the inode cache for potentially a long, long time, and the file is
a preallocated file which is updated using random DIO or AIO writes
(for example, enterprise database files on a long-running server), and
the system crashes, the mtime in memory could potentially out of synch
for days, weeks, months, etc. I'm personally not bothered by this,
but I could imagine that some other folks might.
One other thing we could do at the VFS layer is to change the default
from relatime (which is not POSIX compliant) to enabling atime update
plus lazytime enabled (which is POSIX compliant). Would there be
consensus in making such a change in the default?
> I think that the "lazy time update" status should really be tracked
> in the inode->i_state field. Something like lazytime updates do not
> call ->update_inode, nor do they mark the inode dirty, but they do
> update the inode->i_[acm]time fields and set a TIMEDIRTY state flag.
It looks like the only file systems that have an update_inode today is
btrfs and xfs, and it looks like this change should be fine for both
of them, so sure, that sounds workable.
- Ted
On Thu, Nov 13, 2014 at 11:35:11AM -0500, Theodore Ts'o wrote:
> On Thu, Nov 13, 2014 at 05:41:50PM +1100, Dave Chinner wrote:
> >
> > I think this needs to a VFS level inode timestamp update option.
> > The games ext4 is playing with reference counts inside .drop_inode are
> > pretty nasty and could be avoided if this is implemented at the VFs
> > level..
>
> I'm happy to implement this at the VFS level, assuming that there are
> no objections from other file system developers. I do need to note
> that one potential downside of this feature is that if an inode stays
> in the inode cache for potentially a long, long time, and the file is
> a preallocated file which is updated using random DIO or AIO writes
> (for example, enterprise database files on a long-running server), and
> the system crashes, the mtime in memory could potentially out of synch
> for days, weeks, months, etc. I'm personally not bothered by this,
> but I could imagine that some other folks might.
I really don't care what the behaviour is, as long as it's
*consistent across all filesystems*.
However, we'd be fools to ignore the development of relatime, which
in it's original form never updated the atime until m/ctime updated.
3 years after it was introduced, relatime was changed to limit the
update delay to 24 hours (before it was made the default) so that
applications that required regular updates to timestamps didn't
silently stop working.
So perhaps what we should simply define "lazytime" policy to be
"only update timestamps once a day or whenever the inode is
otherwise modified, whichever comes first".
> One other thing we could do at the VFS layer is to change the default
> from relatime (which is not POSIX compliant) to enabling atime update
> plus lazytime enabled (which is POSIX compliant). Would there be
> consensus in making such a change in the default?
lazytime isn't POSIX compliant as it is implemented in the patch.
sync() won't write back inodes with lazy timestamp updates as they
are not tracked in dirty lists or the ext4 journal, therefore a
crash after a sync() can still lose timestamp updates from before
the sync() ran.
w.r.t. default behaviour, like relatime, I think this it will be a
couple of years before we can consider it to be a default. We'll
need to shake out it's impact on the Real World first....
> > I think that the "lazy time update" status should really be tracked
> > in the inode->i_state field. Something like lazytime updates do not
> > call ->update_inode, nor do they mark the inode dirty, but they do
> > update the inode->i_[acm]time fields and set a TIMEDIRTY state flag.
>
> It looks like the only file systems that have an update_inode today is
> btrfs and xfs, and it looks like this change should be fine for both
> of them, so sure, that sounds workable.
For those that don't implement ->update_time, just calling
write_inode_now() if the TIMEDIRTY flag set in iput_final()
should end up doing the right thing, too...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Fri, Nov 14, 2014 at 07:48:32AM +1100, Dave Chinner wrote:
> However, we'd be fools to ignore the development of relatime, which
> in it's original form never updated the atime until m/ctime updated.
> 3 years after it was introduced, relatime was changed to limit the
> update delay to 24 hours (before it was made the default) so that
> applications that required regular updates to timestamps didn't
> silently stop working.
>
> So perhaps what we should simply define "lazytime" policy to be
> "only update timestamps once a day or whenever the inode is
> otherwise modified, whichever comes first".
Yes, that's reasonable thing to do. Right after I hit send on my last
email I thought about adding a tunable that would do this sort of time
delay, but making it be a hard-coded 24 hours is simpler.
> lazytime isn't POSIX compliant as it is implemented in the patch.
> sync() won't write back inodes with lazy timestamp updates as they
> are not tracked in dirty lists or the ext4 journal, therefore a
> crash after a sync() can still lose timestamp updates from before
> the sync() ran.
Good point, I had fixed it for fsync(), but not for sync();
fortunately it's easy enough to make sure this gets fixed if we
implement it at the VFS Layer.
> w.r.t. default behaviour, like relatime, I think this it will be a
> couple of years before we can consider it to be a default. We'll
> need to shake out it's impact on the Real World first....
Fair enough, although that will only work if distributions actually
enable it by default so we get that real world experience. Otherwise,
we'll not gain any experience wrt to Real World impact.
What I'm thinking about doing is adding a way to enable lazytime via a
flag in the ext4 superblock, which in turn can be set by tune2fs or
mke2fs (if so enabled by /etc/mke2fs.conf), so that we _can_ get this
real world experience.
Cheers,
- Ted
On Thu, Nov 13, 2014 at 04:34:50PM -0500, Theodore Ts'o wrote:
> On Fri, Nov 14, 2014 at 07:48:32AM +1100, Dave Chinner wrote:
> > However, we'd be fools to ignore the development of relatime, which
> > in it's original form never updated the atime until m/ctime updated.
> > 3 years after it was introduced, relatime was changed to limit the
> > update delay to 24 hours (before it was made the default) so that
> > applications that required regular updates to timestamps didn't
> > silently stop working.
> >
> > So perhaps what we should simply define "lazytime" policy to be
> > "only update timestamps once a day or whenever the inode is
> > otherwise modified, whichever comes first".
>
> Yes, that's reasonable thing to do. Right after I hit send on my last
> email I thought about adding a tunable that would do this sort of time
> delay, but making it be a hard-coded 24 hours is simpler.
*nod*
> > lazytime isn't POSIX compliant as it is implemented in the patch.
> > sync() won't write back inodes with lazy timestamp updates as they
> > are not tracked in dirty lists or the ext4 journal, therefore a
> > crash after a sync() can still lose timestamp updates from before
> > the sync() ran.
>
> Good point, I had fixed it for fsync(), but not for sync();
> fortunately it's easy enough to make sure this gets fixed if we
> implement it at the VFS Layer.
Yup, though it probably requires adding those inodes to a special
dirty list so we can find them easily when needed. That would also
make a periodic writeback scan simple.
I suspect this could be easily integrated into the existing
mark_inode_dirty() infrastructure as I_DIRTY_TIME,
mark_inode_dirty_time() and putting them onto the wb->b_dirty list
with an expiry time 24 hours in the future....
The only difference is that we'd need to call ->update_time() if it
existed rather than write_inode() so that we log the timestamp
changes rather than just write the unlogged changes to the inode
(especially as XFS doesn't implement ->write_inode).
> > w.r.t. default behaviour, like relatime, I think this it will be a
> > couple of years before we can consider it to be a default. We'll
> > need to shake out it's impact on the Real World first....
>
> Fair enough, although that will only work if distributions actually
> enable it by default so we get that real world experience. Otherwise,
> we'll not gain any experience wrt to Real World impact.
I think there'll be enough interest in this for there to be plenty
of real world testing done.
Besides, my Magic Crystal Ball(tm) says this:
"The racer-boys who run $FASTEST_DISTRO_OF_THE_MONTH and infest
racer-boy tech forums always tune atime for maximum warp speed. One
of them will make the connection between atime updates and lazytime
and so turn on lazytime. They will then make a forum post saying how
their bonnie++ test runs 3.4% faster but only when they stand on
their head, go cross-eyed and stick their tongue out and so everyone
should do that it because it's Clearly Better.
Me Too posts will rapidly spread across racer forums around the
world and Google will then start returning them as the first hits
when someone searches for lazytime. We'll see a cargo cult of users
doing headstands and using "-o noatime,nodiratime,lazytime" because
that's what the first guy did and everyone who has tried it since
has agreed that it was Clearly Better.
In ten years time we'll still be telling people that bonnie++
numbers are meaningless, nodiratime is redundant when you specific
noatime, that both are redundant when you specific lazytime and that
standing on your head making faces just makes you look like an idiot
and gives you a headache.
Meanwhile Eric will be sitting in the corner muttering quietly
to himself about how there *still* isn't a Sed For Google API that
would let him revise history to stop people finding old posts about
the performance benefits of standing on your head, making faces and
using noatime,nodiratime,lazytime..."
:)
> What I'm thinking about doing is adding a way to enable lazytime via a
> flag in the ext4 superblock, which in turn can be set by tune2fs or
> mke2fs (if so enabled by /etc/mke2fs.conf), so that we _can_ get this
> real world experience.
If you want, but I don't see the point - it's easy enough to add it
to /etc/fstab or a config section in xfstests without needing to
modify mkfs binaries.... ;)
Cheers,
Dave.
--
Dave Chinner
[email protected]
Theodore Ts'o <[email protected]> writes:
> On Wed, Nov 12, 2014 at 04:47:42PM +0300, Dmitry Monakhov wrote:
>> Also sync mtime updates is a great pain for AIO submitter
>> because AIO submission may be blocked for a seconds (up to 5 second in my case)
>> if inode is part of current committing transaction see: do_get_write_access
>
> 5 seconds?!? So you're seeing cases where the jbd2 layer is taking
> that long to close a commit? It might be worth looking at that so we
> can understand why that is happening, and to see if there's anything
> we might do to improve things on that front. Even if we can get rid
> of most of the mtime updates, there will be other cases where a commit
> that takes a long time to complete will cause all sorts of other very
> nasty latencies on the entire system.
Our chunk server workload is quite generic
submit_task: performs aio-dio requests in to multiple chunk files from
several threads, this task should not block for too long.
sync_task: performs fsync/fdatasync on demand for modified chunk files before
we can ACK write-op to user, this task may block
Here is chunk server simulation load:
#TEST_CASE assumes that target fs is mounted to /mnt
# Performs random aio-dio write bsz:64k to preallocated files (size:128M) threads:32
# and performs fdatasync each 32'th write operation
$ fio ./aio-dio.fio
# Measure AIO-DIO write submission latency
$ dd if=/dev/zero of=/mnt/f bs=1M count=1
$ ioping -A -C -D -WWW /mnt/f
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=1 time=410 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=2 time=430 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=3 time=370 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=4 time=400 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=5 time=1.9 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=6 time=4.2 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=7 time=3.8 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=8 time=3.7 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=9 time=4.1 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=10 time=1.9 s
>
>> Yeah we also has ticket for that :)
>> https://jira.sw.ru/browse/PSBM-20411
>
> Is this supposed to be a URL to publically visible web page?
>
> Host jira.sw.ru not found: 3(NXDOMAIN)
Ohh, unfortunetly this host is not visiable from outside.
>
>> > + if (flags & S_VERSION)
>> > + inode_inc_iversion(inode);
> ....
>> Since we want update all in-memory data we also have to explicitly update inode->i_version
>> Which was previously updated implicitly here:
>> mark_inode_dirty_sync()
>> ->__mark_inode_dirty
>> ->ext4_dirty_inode
>> ->ext4_mark_inode_dirty
>> ->ext4_mark_iloc_dirty
>> ->inode_inc_iversion(inode);
>
> It's not necessary to add a anothre call to inode_inc_version() since
> we already incremented the i_version if S_VERSION is set, and
> S_VERSIOn gets set when it's necessary to handle incrementing
> i_Version.
>
> The inode_inc_iversion() in mark4_ext4_iloc_dirty() is probably not
> necessary, since we already should be incrementing i_version whenever
> ctime and mtime gets updated. The inode_inc_iversion() there is more
> of a "belt and suspenders" safety thing, on the theory that the extra
> bump in i_version won't hurt anything.
>
> Cheers,
>
> - Ted
Theodore Ts'o <[email protected]> writes:
> On Wed, Nov 12, 2014 at 04:47:42PM +0300, Dmitry Monakhov wrote:
>> Also sync mtime updates is a great pain for AIO submitter
>> because AIO submission may be blocked for a seconds (up to 5 second in my case)
>> if inode is part of current committing transaction see: do_get_write_access
>
> 5 seconds?!? So you're seeing cases where the jbd2 layer is taking
> that long to close a commit? It might be worth looking at that so we
> can understand why that is happening, and to see if there's anything
> we might do to improve things on that front. Even if we can get rid
> of most of the mtime updates, there will be other cases where a commit
> that takes a long time to complete will cause all sorts of other very
> nasty latencies on the entire system.
Our chunk server workload is quite generic
submit_task: performs aio-dio requests in to multiple chunk files from
several threads, this task should not block for too long.
sync_task: performs fsync/fdatasync on demand for modified chunk files before
we can ACK write-op to user, this task may block
Here is chunk server simulation load:
#TEST_CASE assumes that target fs is mounted to /mnt
# Performs random aio-dio write bsz:64k to preallocated files (size:128M) threads:32
# and performs fdatasync each 32'th write operation
$ fio ./aio-dio.fio
# Measure AIO-DIO write submission latency
$ dd if=/dev/zero of=/mnt/f bs=1M count=1
$ ioping -A -C -D -WWW /mnt/f
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=1 time=410 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=2 time=430 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=3 time=370 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=4 time=400 us
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=5 time=1.9 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=6 time=4.2 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=7 time=3.8 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=8 time=3.7 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=9 time=4.1 s
4.0 KiB from /mnt/f (ext4 /dev/mapper/vzvg-scratch_dev): request=10 time=1.9 s
>
>> Yeah we also has ticket for that :)
>> https://jira.sw.ru/browse/PSBM-20411
>
> Is this supposed to be a URL to publically visible web page?
>
> Host jira.sw.ru not found: 3(NXDOMAIN)
Ohh, unfortunetly this host is not visiable from outside.