2005-01-16 03:38:38

by Andreas Gruenbacher

[permalink] [raw]
Subject: [RFC] Ext3 nanosecond timestamps in big inodes

Hello,

this is a spin-off of an old patch by Alex Tomas <[email protected]>:
Alex originally had nanosecond timestamps in his original patch; here is
a rejuvenated version. Please tell me what you think. Alex also added a
create timestamp in his original patch. Do we actually need that?

Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits
currently are zeroed out implicitly. We could later use them remaining two
bits for years beyond 2038.

Index: linux-2.6.11-rc1-mm1/fs/ext3/inode.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/inode.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/inode.c
@@ -628,7 +628,7 @@ static int ext3_splice_branch(handle_t *

/* We are done with atomic stuff, now do the rest of housekeeping */

- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = ext3_current_time(inode);
ext3_mark_inode_dirty(handle, inode);

/* had we spliced it onto indirect block? */
@@ -2201,7 +2201,7 @@ do_indirects:
;
}
up(&ei->truncate_sem);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_ctime = ext3_current_time(inode);
ext3_mark_inode_dirty(handle, inode);

/* In a multi-transaction truncate, we only make the final
@@ -2501,6 +2501,14 @@ void ext3_read_inode(struct inode * inod
ei->i_extra_isize;
if (le32_to_cpu(*magic))
ei->i_state |= EXT3_STATE_XATTR;
+ if (ei->i_extra_isize >= 2*sizeof(__le16) + 3*sizeof(__le32)) {
+ inode->i_atime.tv_nsec =
+ le32_to_cpu(raw_inode->i_atime_nsec);
+ inode->i_ctime.tv_nsec =
+ le32_to_cpu(raw_inode->i_ctime_nsec);
+ inode->i_mtime.tv_nsec =
+ le32_to_cpu(raw_inode->i_mtime_nsec);
+ }
}

if (S_ISREG(inode->i_mode)) {
@@ -2638,8 +2646,17 @@ static int ext3_do_update_inode(handle_t
} else for (block = 0; block < EXT3_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];

- if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
+ if (ei->i_extra_isize) {
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (ei->i_extra_isize > 2*sizeof(__le16) + 3*sizeof(__le32)) {
+ raw_inode->i_atime_nsec =
+ cpu_to_le32(inode->i_atime.tv_nsec);
+ raw_inode->i_ctime_nsec =
+ cpu_to_le32(inode->i_ctime.tv_nsec);
+ raw_inode->i_mtime_nsec =
+ cpu_to_le32(inode->i_mtime.tv_nsec);
+ }
+ }

BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
rc = ext3_journal_dirty_metadata(handle, bh);
Index: linux-2.6.11-rc1-mm1/fs/ext3/super.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/super.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/super.c
@@ -1402,6 +1402,8 @@ static int ext3_fill_super (struct super
sbi->s_inode_size);
goto failed_mount;
}
+ if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE)
+ sb->s_time_gran = 1;
}
sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
le32_to_cpu(es->s_log_frag_size);
Index: linux-2.6.11-rc1-mm1/fs/ext3/ialloc.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/ialloc.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/ialloc.c
@@ -558,7 +558,7 @@ got:
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blksize = PAGE_SIZE;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = ext3_current_time(inode);

memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_next_alloc_block = 0;
Index: linux-2.6.11-rc1-mm1/fs/ext3/xattr.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/xattr.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/xattr.c
@@ -1001,7 +1001,7 @@ ext3_xattr_set_handle(handle_t *handle,
}
}
if (!error) {
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = ext3_current_time(inode);
error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
/*
* The bh is consumed by ext3_mark_iloc_dirty, even with
Index: linux-2.6.11-rc1-mm1/fs/ext3/ioctl.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/ioctl.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/ioctl.c
@@ -87,7 +87,7 @@ int ext3_ioctl (struct inode * inode, st
ei->i_flags = flags;

ext3_set_inode_flags(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = ext3_current_time(inode);

err = ext3_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -121,7 +121,7 @@ flags_err:
return PTR_ERR(handle);
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = ext3_current_time(inode);
inode->i_generation = generation;
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
Index: linux-2.6.11-rc1-mm1/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.11-rc1-mm1.orig/include/linux/ext3_fs.h
+++ linux-2.6.11-rc1-mm1/include/linux/ext3_fs.h
@@ -245,9 +245,9 @@ struct ext3_inode {
__le16 i_mode; /* File mode */
__le16 i_uid; /* Low 16 bits of Owner Uid */
__le32 i_size; /* Size in bytes */
- __le32 i_atime; /* Access time */
- __le32 i_ctime; /* Creation time */
- __le32 i_mtime; /* Modification time */
+ __le32 i_atime; /* Access time (seconds) */
+ __le32 i_ctime; /* Change time (seconds) */
+ __le32 i_mtime; /* Modification time (seconds) */
__le32 i_dtime; /* Deletion Time */
__le16 i_gid; /* Low 16 bits of Group Id */
__le16 i_links_count; /* Links count */
@@ -295,6 +295,9 @@ struct ext3_inode {
} osd2; /* OS dependent 2 */
__le16 i_extra_isize;
__le16 i_pad1;
+ __le32 i_atime_nsec; /* Access time (nanoseconds) */
+ __le32 i_ctime_nsec; /* Change time (nanoseconds) */
+ __le32 i_mtime_nsec; /* Modification time (nanoseconds) */
};

#define i_size_high i_dir_acl
@@ -476,6 +479,11 @@ static inline struct ext3_inode_info *EX
{
return container_of(inode, struct ext3_inode_info, vfs_inode);
}
+static inline struct timespec ext3_current_time(struct inode *inode)
+{
+ return (inode->i_sb->s_time_gran == 1) ?
+ CURRENT_TIME : CURRENT_TIME_SEC;
+}
#else
/* Assume that user mode programs are passing in an ext3fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
Index: linux-2.6.11-rc1-mm1/fs/ext3/namei.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/namei.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/namei.c
@@ -1256,7 +1256,7 @@ static int add_dirent_to_buf(handle_t *h
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ dir->i_mtime = dir->i_ctime = ext3_current_time(dir);
ext3_update_dx_flag(dir);
dir->i_version++;
ext3_mark_inode_dirty(handle, dir);
@@ -2034,7 +2034,7 @@ static int ext3_rmdir (struct inode * di
* recovery. */
inode->i_size = 0;
ext3_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = ext3_current_time(inode);
ext3_mark_inode_dirty(handle, inode);
dir->i_nlink--;
ext3_update_dx_flag(dir);
@@ -2084,7 +2084,7 @@ static int ext3_unlink(struct inode * di
retval = ext3_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
- dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ dir->i_ctime = dir->i_mtime = ext3_current_time(dir);
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
inode->i_nlink--;
@@ -2174,7 +2174,7 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;

- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = ext3_current_time(inode);
ext3_inc_count(handle, inode);
atomic_inc(&inode->i_count);

@@ -2275,7 +2275,7 @@ static int ext3_rename (struct inode * o
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = CURRENT_TIME_SEC;
+ old_inode->i_ctime = ext3_current_time(old_inode);
ext3_mark_inode_dirty(handle, old_inode);

/*
@@ -2308,9 +2308,9 @@ static int ext3_rename (struct inode * o

if (new_inode) {
new_inode->i_nlink--;
- new_inode->i_ctime = CURRENT_TIME_SEC;
+ new_inode->i_ctime = ext3_current_time(new_inode);
}
- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+ old_dir->i_ctime = old_dir->i_mtime = ext3_current_time(old_dir);
ext3_update_dx_flag(old_dir);
if (dir_bh) {
BUFFER_TRACE(dir_bh, "get_write_access");

Regards,
--
Andreas Gruenbacher <[email protected]>
SUSE Labs, SUSE LINUX PRODUCTS GMBH


2005-01-16 04:38:07

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC] Ext3 nanosecond timestamps in big inodes

Andreas Gruenbacher <[email protected]> writes:

> this is a spin-off of an old patch by Alex Tomas <[email protected]>:
> Alex originally had nanosecond timestamps in his original patch; here is
> a rejuvenated version. Please tell me what you think. Alex also added a
> create timestamp in his original patch. Do we actually need that?
>
> Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits
> currently are zeroed out implicitly. We could later use them remaining two
> bits for years beyond 2038.

Looks good. Just two suggestions:

- Provide an mount option to turn it off because there may be
performance regressions in some workload because inodes will be
flushed more often.
[I actually considered doing this generally at the VFS level
when doing the s_time_gran patch, but it needed some more changes
that I didn't want to do at that time. Doing it in the FS as
interim solution would be fine too]

- Use the 2 bits for additionals years right now on 64bit
hosts. No need to keep the y2038 issue around longer than necessary.

-Andi

2005-01-16 21:03:43

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] Ext3 nanosecond timestamps in big inodes

On Jan 14, 2005 22:16 +0100, Andreas Gruenbacher wrote:
> this is a spin-off of an old patch by Alex Tomas <[email protected]>:
> Alex originally had nanosecond timestamps in his original patch; here is
> a rejuvenated version. Please tell me what you think. Alex also added a
> create timestamp in his original patch. Do we actually need that?

I think the create timestamp can't hurt things and might be useful in some
cases (e.g. CIFS could use this for exporting, and Lustre will also). It
would also be useful for some disk "forensics" (term used loosely) unlike
the ctime. I haven't ever been very keen on nsecond atimes, but it can't
kill us (atime is already flushed at most 1/sec I think).

> Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits
> currently are zeroed out implicitly. We could later use them remaining two
> bits for years beyond 2038.

My thought has always been to use, say, 24 bits for "nseconds>>6" and 8 bits
for "seconds<<32" or something close to this (a few bits either way) to give
us more range on the high end. In the past we talked of usecond timestamps
(which would have only been 20 bits, leaving 12 bits for seconds msb),
but the kernel ended up using nseconds for the internal timestamps.

> @@ -2501,6 +2501,14 @@ void ext3_read_inode(struct inode * inod
> + if (ei->i_extra_isize >= 2*sizeof(__le16) + 3*sizeof(__le32)) {
> + inode->i_atime.tv_nsec =
> + le32_to_cpu(raw_inode->i_atime_nsec);
> + inode->i_ctime.tv_nsec =
> + le32_to_cpu(raw_inode->i_ctime_nsec);
> + inode->i_mtime.tv_nsec =
> + le32_to_cpu(raw_inode->i_mtime_nsec);
> + }

You may as well put a macro here to convert from the raw timestamp
(probably shouldn't just be called "nsec" if we are also encoding more
info there, maybe msb_and_ns?) so we can mask (and shift) as necessary.

> @@ -2638,8 +2646,17 @@ static int ext3_do_update_inode(handle_t
> - if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
> + if (ei->i_extra_isize) {

I don't think this is right. This means we would only ever store nsec
timestamps if there was already other data in the large inode. Since
writing that part of the inode is free (we will be writing a full sector
anyways), no reason not to save it.

> raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> + if (ei->i_extra_isize > 2*sizeof(__le16) + 3*sizeof(__le32)) {
> + raw_inode->i_atime_nsec =
> + cpu_to_le32(inode->i_atime.tv_nsec);
> + raw_inode->i_ctime_nsec =
> + cpu_to_le32(inode->i_ctime.tv_nsec);
> + raw_inode->i_mtime_nsec =
> + cpu_to_le32(inode->i_mtime.tv_nsec);
> + }
> + }

Macro here also to convert from nsec to the ext3-internal "msb_and_ns" format.

> +static inline struct timespec ext3_current_time(struct inode *inode)
> +{
> + return (inode->i_sb->s_time_gran == 1) ?
> + CURRENT_TIME : CURRENT_TIME_SEC;
> +}

If "s_time_gran" (I haven't seen this before but it doesn't appear to
be a part of your patch) had some useful meaning we could use it to e.g.
shift the nsec part of the timestamps as Andy requested so as not to make
the timestamps change too often.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://members.shaw.ca/adilger/ http://members.shaw.ca/golinux/


Attachments:
(No filename) (3.21 kB)
(No filename) (189.00 B)
Download all attachments

2005-01-16 22:36:51

by Andreas Gruenbacher

[permalink] [raw]
Subject: Re: [RFC] Ext3 nanosecond timestamps in big inodes

On Sunday 16 January 2005 05:37, Andi Kleen wrote:
> Andreas Gruenbacher <[email protected]> writes:
> > this is a spin-off of an old patch by Alex Tomas <[email protected]>:
> > Alex originally had nanosecond timestamps in his original patch; here is
> > a rejuvenated version. Please tell me what you think. Alex also added a
> > create timestamp in his original patch. Do we actually need that?
> >
> > Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits
> > currently are zeroed out implicitly. We could later use them remaining
> > two bits for years beyond 2038.
>
> Looks good. Just two suggestions:
>
> - Provide an mount option to turn it off because there may be
> performance regressions in some workload because inodes will be
> flushed more often.
> [I actually considered doing this generally at the VFS level
> when doing the s_time_gran patch, but it needed some more changes
> that I didn't want to do at that time. Doing it in the FS as
> interim solution would be fine too]

I think I agree, also with doing it at the VFS layer. We must make sure that
eventually the most recent timestamp makes it to disk (unless the machine
crashes, in which case slightly out-of-date timestamps probably can be
tolerated).

> - Use the 2 bits for additionals years right now on 64bit
> hosts. No need to keep the y2038 issue around longer than necessary.

Yes. Actually zeroing out the upper two bits is wrong for timestamps before
the epoch if we use plain two's complement representation. The two bits give
us an extended range of about [1697 .. 2242] instead of [1901 .. 2038].

--
Andreas Gruenbacher <[email protected]>
SUSE Labs, SUSE LINUX PRODUCTS GMBH

2005-01-17 09:49:44

by Anton Altaparmakov

[permalink] [raw]
Subject: Re: [RFC] Ext3 nanosecond timestamps in big inodes

On Sat, 2005-01-15 at 22:46 -0700, Andreas Dilger wrote:
> On Jan 14, 2005 22:16 +0100, Andreas Gruenbacher wrote:
> > +static inline struct timespec ext3_current_time(struct inode *inode)
> > +{
> > + return (inode->i_sb->s_time_gran == 1) ?
> > + CURRENT_TIME : CURRENT_TIME_SEC;
> > +}
>
> If "s_time_gran" (I haven't seen this before but it doesn't appear to
> be a part of your patch) had some useful meaning we could use it to e.g.
> shift the nsec part of the timestamps as Andy requested so as not to make
> the timestamps change too often.

sb->s_time_gran is the granularity used for the time in each fs in
nanoseconds. So, for example in NTFS it is set to 100 as NTFS stores
time as 100ns intervals. This means the kernel time can be rounded
appropriately when the fs inode times are being updated. Without this
you can see inode time jumping backwards in time if the inode is thrown
out of memory and then read in again and in the process it had some of
the time bits truncated...

See the original post of the patch from Andi Kleen for details:

http://marc.theaimsgroup.com/?l=linux-kernel&m=110134111125012&w=2

Best regards,

Anton
--
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/