As I had brought up during one of the lightning talks at the Linux
Storage and Filesystem workshop, I am interested in introducing two new
open flags, O_HOT and O_COLD. These flags are passed down to the
individual file system's inode operations' create function, and the file
system can use these flags as a hint regarding whether the file is
likely to be accessed frequently or not.
In the future I plan to do further work on how ext4 would use these
flags, but I want to first get the ability to pass these flags plumbed
into the VFS layer and the code points for O_HOT and O_COLD reserved.
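For illustration only, here is a minimal userspace sketch of how an
application might request the hint at create time. The flag value
matches the code point reserved later in this series; the fallback
#define is only for building against headers that don't yet carry it,
and the file name is made up:

	#include <fcntl.h>
	#include <stdio.h>

	#ifndef O_HOT
	#define O_HOT	020000000	/* code point reserved by this series */
	#endif

	int main(void)
	{
		/* Hint that this index file will be accessed frequently. */
		int fd = open("index.db", O_CREAT | O_RDWR | O_HOT, 0644);

		if (fd < 0) {
			perror("open");	/* the fs may ignore or reject the hint */
			return 1;
		}
		return 0;
	}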
Theodore Ts'o (3):
fs: add new open flags O_HOT and O_COLD
fs: propagate the open_flags structure down to the low-level fs's
create()
ext4: use the O_HOT and O_COLD open flags to influence inode
allocation
fs/9p/vfs_inode.c | 2 +-
fs/affs/affs.h | 2 +-
fs/affs/namei.c | 3 ++-
fs/bfs/dir.c | 2 +-
fs/btrfs/inode.c | 3 ++-
fs/cachefiles/namei.c | 3 ++-
fs/ceph/dir.c | 2 +-
fs/cifs/dir.c | 2 +-
fs/coda/dir.c | 3 ++-
fs/ecryptfs/inode.c | 5 +++--
fs/exofs/namei.c | 2 +-
fs/ext2/namei.c | 4 +++-
fs/ext3/namei.c | 5 +++--
fs/ext4/ext4.h | 8 +++++++-
fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------
fs/ext4/migrate.c | 2 +-
fs/ext4/namei.c | 17 ++++++++++++-----
fs/fat/namei_msdos.c | 2 +-
fs/fat/namei_vfat.c | 2 +-
fs/fcntl.c | 5 +++--
fs/fuse/dir.c | 2 +-
fs/gfs2/inode.c | 3 ++-
fs/hfs/dir.c | 2 +-
fs/hfsplus/dir.c | 5 +++--
fs/hostfs/hostfs_kern.c | 2 +-
fs/hugetlbfs/inode.c | 4 +++-
fs/internal.h | 6 ------
fs/jffs2/dir.c | 5 +++--
fs/jfs/namei.c | 2 +-
fs/logfs/dir.c | 2 +-
fs/minix/namei.c | 2 +-
fs/namei.c | 9 +++++----
fs/ncpfs/dir.c | 5 +++--
fs/nfs/dir.c | 6 ++++--
fs/nfsd/vfs.c | 4 ++--
fs/nilfs2/namei.c | 2 +-
fs/ocfs2/namei.c | 3 ++-
fs/omfs/dir.c | 2 +-
fs/ramfs/inode.c | 3 ++-
fs/reiserfs/namei.c | 5 +++--
fs/sysv/namei.c | 4 +++-
fs/ubifs/dir.c | 2 +-
fs/udf/namei.c | 2 +-
fs/ufs/namei.c | 2 +-
fs/xfs/xfs_iops.c | 3 ++-
include/asm-generic/fcntl.h | 7 +++++++
include/linux/fs.h | 14 ++++++++++++--
ipc/mqueue.c | 2 +-
48 files changed, 143 insertions(+), 74 deletions(-)
--
1.7.10.rc3
This allows the file systems access to the open flags, so they can
take these into account when making allocation decisions for newly
created inodes.
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/9p/vfs_inode.c | 2 +-
fs/affs/affs.h | 2 +-
fs/affs/namei.c | 3 ++-
fs/bfs/dir.c | 2 +-
fs/btrfs/inode.c | 3 ++-
fs/cachefiles/namei.c | 3 ++-
fs/ceph/dir.c | 2 +-
fs/cifs/dir.c | 2 +-
fs/coda/dir.c | 3 ++-
fs/ecryptfs/inode.c | 5 +++--
fs/exofs/namei.c | 2 +-
fs/ext2/namei.c | 4 +++-
fs/ext3/namei.c | 5 +++--
fs/ext4/namei.c | 2 +-
fs/fat/namei_msdos.c | 2 +-
fs/fat/namei_vfat.c | 2 +-
fs/fuse/dir.c | 2 +-
fs/gfs2/inode.c | 3 ++-
fs/hfs/dir.c | 2 +-
fs/hfsplus/dir.c | 5 +++--
fs/hostfs/hostfs_kern.c | 2 +-
fs/hugetlbfs/inode.c | 4 +++-
fs/internal.h | 6 ------
fs/jffs2/dir.c | 5 +++--
fs/jfs/namei.c | 2 +-
fs/logfs/dir.c | 2 +-
fs/minix/namei.c | 2 +-
fs/namei.c | 9 +++++----
fs/ncpfs/dir.c | 5 +++--
fs/nfs/dir.c | 6 ++++--
fs/nfsd/vfs.c | 4 ++--
fs/nilfs2/namei.c | 2 +-
fs/ocfs2/namei.c | 3 ++-
fs/omfs/dir.c | 2 +-
fs/ramfs/inode.c | 3 ++-
fs/reiserfs/namei.c | 5 +++--
fs/sysv/namei.c | 4 +++-
fs/ubifs/dir.c | 2 +-
fs/udf/namei.c | 2 +-
fs/ufs/namei.c | 2 +-
fs/xfs/xfs_iops.c | 3 ++-
include/linux/fs.h | 14 ++++++++++++--
ipc/mqueue.c | 2 +-
43 files changed, 87 insertions(+), 60 deletions(-)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 014c8dd..53c448b 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -722,7 +722,7 @@ error:
static int
v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
int err;
u32 perm;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 45a0ce4..2710c36 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -156,7 +156,7 @@ extern void affs_free_bitmap(struct super_block *sb);
extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len);
extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *);
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
-extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *);
+extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *, const struct open_flags *);
extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 4780694..44c74a0 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -255,7 +255,8 @@ affs_unlink(struct inode *dir, struct dentry *dentry)
}
int
-affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
+affs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ struct nameidata *nd, const struct open_flags *op)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d12c796..163e16f 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -85,7 +85,7 @@ const struct file_operations bfs_dir_operations = {
extern void dump_imap(const char *, struct super_block *);
static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
int err;
struct inode *inode;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 115bc05..6c3e9a9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4854,7 +4854,8 @@ out_unlock:
}
static int btrfs_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7f0771d..bc1aba9 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -567,7 +567,8 @@ lookup_again:
if (ret < 0)
goto create_error;
start = jiffies;
- ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
+ ret = vfs_create(dir->d_inode, next, S_IFREG, NULL,
+ NULL);
cachefiles_hist(cachefiles_create_histogram, start);
if (ret < 0)
goto create_error;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3e8094b..40318505 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -700,7 +700,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
}
static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
dout("create in dir %p dentry %p name '%.*s'\n",
dir, dentry, dentry->d_name.len, dentry->d_name.name);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d172c8e..05647d7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -137,7 +137,7 @@ cifs_bp_rename_retry:
int
cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
int rc = -ENOENT;
int xid;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 1775158..51e5413 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -30,7 +30,8 @@
#include "coda_int.h"
/* dir inode-ops */
-static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, struct nameidata *nd);
+static int coda_create(struct inode *dir, struct dentry *new, umode_t mode,
+ struct nameidata *nd, const struct open_flags *op);
static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd);
static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
struct dentry *entry);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ab35b11..5daa0aa 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -173,7 +173,8 @@ ecryptfs_do_create(struct inode *directory_inode,
inode = ERR_CAST(lower_dir_dentry);
goto out;
}
- rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, NULL);
+ rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode,
+ NULL, NULL);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
@@ -248,7 +249,7 @@ out:
*/
static int
ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd, const struct open_flags *op)
{
struct inode *ecryptfs_inode;
int rc;
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index fc7161d..67e83a7 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -60,7 +60,7 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
}
static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode = exofs_new_inode(dir, mode);
int err = PTR_ERR(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dffb865..1d40748 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -94,7 +94,9 @@ struct dentry *ext2_get_parent(struct dentry *child)
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd)
+static int ext2_create (struct inode * dir, struct dentry * dentry,
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
struct inode *inode;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index d7940b2..2607c74 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1689,8 +1689,9 @@ static int ext3_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
- struct nameidata *nd)
+static int ext3_create (struct inode * dir, struct dentry * dentry,
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
handle_t *handle;
struct inode * inode;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 349d7b3..6f48ff8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1737,7 +1737,7 @@ static int ext4_add_nondir(handle_t *handle,
* with d_instantiate().
*/
static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
handle_t *handle;
struct inode *inode;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index c5938c9..cef8ddc 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -265,7 +265,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
/***** Create a file */
static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 98ae804..8dd2f1b 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -772,7 +772,7 @@ error:
}
static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2066328..56bd899 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -574,7 +574,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
}
static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
if (nd) {
int err = fuse_create_open(dir, entry, mode, nd);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c98a60e..36701c8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -755,7 +755,8 @@ fail:
*/
static int gfs2_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
int excl = 0;
if (nd && (nd->flags & LOOKUP_EXCL))
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 62fc14e..67a2885 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -187,7 +187,7 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
* the directory and the name (and its length) of the new file.
*/
static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode;
int res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 88e155f..2da1def 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -453,8 +453,9 @@ out:
return res;
}
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+static int hfsplus_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
return hfsplus_mknod(dir, dentry, mode, 0);
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 07c516b..8420a7f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -553,7 +553,7 @@ static int read_name(struct inode *ino, char *name)
}
int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode;
char *name;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28cf06e..df64699 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -564,7 +564,9 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mod
return retval;
}
-static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
+static int hugetlbfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
diff --git a/fs/internal.h b/fs/internal.h
index 9962c59..63ddd95 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,12 +87,6 @@ extern struct super_block *user_get_super(dev_t);
struct nameidata;
extern struct file *nameidata_to_filp(struct nameidata *);
extern void release_open_intent(struct nameidata *);
-struct open_flags {
- int open_flag;
- umode_t mode;
- int acc_mode;
- int intent;
-};
extern struct file *do_filp_open(int dfd, const char *pathname,
const struct open_flags *op, int lookup_flags);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index b560188..4a92b38 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -25,7 +25,7 @@
static int jffs2_readdir (struct file *, void *, filldir_t);
static int jffs2_create (struct inode *,struct dentry *,umode_t,
- struct nameidata *);
+ struct nameidata *, const struct open_flags *);
static struct dentry *jffs2_lookup (struct inode *,struct dentry *,
struct nameidata *);
static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
@@ -175,7 +175,8 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
static int jffs2_create(struct inode *dir_i, struct dentry *dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
struct jffs2_raw_inode *ri;
struct jffs2_inode_info *f, *dir_f;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 07c91ca..8537059 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -73,7 +73,7 @@ static inline void free_ea_wmap(struct inode *inode)
*
*/
static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
int rc = 0;
tid_t tid; /* transaction id */
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index bea5d1b..2e219e8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -502,7 +502,7 @@ static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 2d0ee17..c0a4510 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -55,7 +55,7 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
}
static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
return minix_mknod(dir, dentry, mode, 0);
}
diff --git a/fs/namei.c b/fs/namei.c
index 0062dd1..898b2af 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2057,7 +2057,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
int error = may_create(dir, dentry);
@@ -2071,7 +2071,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- error = dir->i_op->create(dir, dentry, mode, nd);
+ error = dir->i_op->create(dir, dentry, mode, nd, op);
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -2278,7 +2278,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
error = security_path_mknod(&nd->path, dentry, mode, 0);
if (error)
goto exit_mutex_unlock;
- error = vfs_create(dir->d_inode, dentry, mode, nd);
+ error = vfs_create(dir->d_inode, dentry, mode, nd, op);
if (error)
goto exit_mutex_unlock;
mutex_unlock(&dir->d_inode->i_mutex);
@@ -2596,7 +2596,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
goto out_drop_write;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
+ error = vfs_create(path.dentry->d_inode,dentry,mode,
+ NULL, NULL);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index aeed93a..9aff9fd 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -30,7 +30,8 @@ static void ncp_do_readdir(struct file *, void *, filldir_t,
static int ncp_readdir(struct file *, void *, filldir_t);
-static int ncp_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
+static int ncp_create(struct inode *, struct dentry *, umode_t,
+ struct nameidata *, const struct open_flags *);
static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *);
static int ncp_unlink(struct inode *, struct dentry *);
static int ncp_mkdir(struct inode *, struct dentry *, umode_t);
@@ -980,7 +981,7 @@ out:
}
static int ncp_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
return ncp_create_new(dir, dentry, mode, 0, 0);
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4aaf031..986cde5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -47,7 +47,8 @@ static int nfs_opendir(struct inode *, struct file *);
static int nfs_closedir(struct inode *, struct file *);
static int nfs_readdir(struct file *, void *, filldir_t);
static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
-static int nfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
+static int nfs_create(struct inode *, struct dentry *, umode_t,
+ struct nameidata *, const struct open_flags *);
static int nfs_mkdir(struct inode *, struct dentry *, umode_t);
static int nfs_rmdir(struct inode *, struct dentry *);
static int nfs_unlink(struct inode *, struct dentry *);
@@ -1678,7 +1679,8 @@ out_error:
* reply path made it appear to have failed.
*/
static int nfs_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct nameidata *nd)
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
struct iattr attr;
int error;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 296d671..4ff4a61 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1329,7 +1329,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = 0;
switch (type) {
case S_IFREG:
- host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL, NULL);
if (!host_err)
nfsd_check_ignore_resizing(iap);
break;
@@ -1492,7 +1492,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
- host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL, NULL);
if (host_err < 0) {
fh_drop_write(fhp);
goto out_nfserr;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index fce2bbe..13e99aa 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -85,7 +85,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
* with d_instantiate().
*/
static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode;
struct nilfs_transaction_info ti;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a9856e3..f1d8a81 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -618,7 +618,8 @@ static int ocfs2_mkdir(struct inode *dir,
static int ocfs2_create(struct inode *dir,
struct dentry *dentry,
umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd,
+ const struct open_flags *op)
{
int ret;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index f00576e..db58089 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -285,7 +285,7 @@ static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
return omfs_add_node(dir, dentry, mode | S_IFREG);
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a1fdabe..b1ac7c6 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -114,7 +114,8 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
return retval;
}
-static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct nameidata *nd)
+static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ struct nameidata *nd, const struct open_flags *op)
{
return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 84e8a69..85ed28d 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,8 +572,9 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
return 0;
}
-static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+static int reiserfs_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
int retval;
struct inode *inode;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index d7466e2..177ee46 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -80,7 +80,9 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode,
return err;
}
-static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, struct nameidata *nd)
+static int sysv_create(struct inode * dir, struct dentry * dentry,
+ umode_t mode, struct nameidata *nd,
+ const struct open_flags *op)
{
return sysv_mknod(dir, dentry, mode, 0);
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ec9f187..450a0c8 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -254,7 +254,7 @@ out:
}
static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 38de8f2..74057d2 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -551,7 +551,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
}
static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct udf_fileident_bh fibh;
struct inode *inode;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index a2281ca..61e086f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -71,7 +71,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
* with d_instantiate().
*/
static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd, const struct open_flags *op)
{
struct inode *inode;
int err;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3011b87..c39e076 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -182,7 +182,8 @@ xfs_vn_create(
struct inode *dir,
struct dentry *dentry,
umode_t mode,
- struct nameidata *nd)
+ struct nameidata *nd,
+ const struct open_flags *op)
{
return xfs_vn_mknod(dir, dentry, mode, 0);
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8de6755..4ed5244 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1173,6 +1173,14 @@ struct file_lock {
#define OFFT_OFFSET_MAX INT_LIMIT(off_t)
#endif
+/* Passed down to the VFS's create function */
+struct open_flags {
+ int open_flag;
+ umode_t mode;
+ int acc_mode;
+ int intent;
+};
+
#include <linux/fcntl.h>
extern void send_sigio(struct fown_struct *fown, int fd, int band);
@@ -1542,7 +1550,8 @@ extern void unlock_super(struct super_block *);
/*
* VFS helper functions..
*/
-extern int vfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
+extern int vfs_create(struct inode *, struct dentry *, umode_t,
+ struct nameidata *, const struct open_flags *op);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
@@ -1645,7 +1654,8 @@ struct inode_operations {
int (*readlink) (struct dentry *, char __user *,int);
void (*put_link) (struct dentry *, struct nameidata *, void *);
- int (*create) (struct inode *,struct dentry *,umode_t,struct nameidata *);
+ int (*create) (struct inode *,struct dentry *,umode_t,
+ struct nameidata *, const struct open_flags *);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 28bd64d..dab7606 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -624,7 +624,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
ret = mnt_want_write(ipc_ns->mq_mnt);
if (ret)
goto out;
- ret = vfs_create(dir->d_inode, dentry, mode, NULL);
+ ret = vfs_create(dir->d_inode, dentry, mode, NULL, NULL);
dentry->d_fsdata = NULL;
if (ret)
goto out_drop_write;
--
1.7.10.rc3
The O_HOT and O_COLD flags are hints to the file system that the file
is going to be frequently accessed (a "hot" file) or very
infrequently accessed (a "cold" file). It is up to the file system to
decide how these flags should be interpreted; in some cases, such as
O_HOT, the file system may require appropriate privileges or implement
some kind of per-user quota check before deciding to honor the flag.
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/fcntl.c | 5 +++--
include/asm-generic/fcntl.h | 7 +++++++
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 75e7c1f..463352d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -835,14 +835,15 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
__O_SYNC | O_DSYNC | FASYNC |
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH
+ __FMODE_EXEC | O_PATH | O_HOT |
+ O_COLD
));
fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 9e5b035..1fdcbb1 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -84,6 +84,13 @@
#define O_PATH 010000000
#endif
+#ifndef O_HOT
+#define O_HOT 020000000
+#endif
+#ifndef O_COLD
+#define O_COLD 040000000
+#endif
+
#ifndef O_NDELAY
#define O_NDELAY O_NONBLOCK
#endif
--
1.7.10.rc3
Wire up the use of the O_HOT and O_COLD open flags so that when an
inode is being created, it can influence which part of the disk gets
used on rotational storage devices.
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 8 +++++++-
fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------
fs/ext4/migrate.c | 2 +-
fs/ext4/namei.c | 15 +++++++++++----
4 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0e01e90..6539c9a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1553,6 +1553,12 @@ struct ext4_dir_entry_2 {
#define EXT4_MAX_REC_LEN ((1<<16)-1)
/*
+ * Flags for ext4_new_inode()
+ */
+#define EXT4_NEWI_HOT 0x0001
+#define EXT4_NEWI_COLD 0x0002
+
+/*
* If we ever get support for fs block sizes > page_size, we'll need
* to remove the #if statements in the next two functions...
*/
@@ -1850,7 +1856,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct
/* ialloc.c */
extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
- uid_t *owner);
+ uid_t *owner, int flags);
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 409c2ee..3dcc8c8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -363,7 +363,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t *group, umode_t mode,
- const struct qstr *qstr)
+ const struct qstr *qstr, int flags)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -508,13 +508,20 @@ fallback_retry:
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, umode_t mode)
+ ext4_group_t *group, umode_t mode, int flags)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+ if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) &&
+ (parent_group > ngroups / 3))
+ parent_group = 0;
+ if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) &&
+ (parent_group < (2 * (ngroups / 3))))
+ parent_group = 2 * (ngroups / 3);
+
/*
* Try to place the inode is the same flex group as its
* parent. If we can't find space, use the Orlov algorithm to
@@ -550,7 +557,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group + flex_size;
if (*group > ngroups)
*group = 0;
- return find_group_orlov(sb, parent, group, mode, NULL);
+ return find_group_orlov(sb, parent, group, mode, NULL, flags);
}
/*
@@ -614,7 +621,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* group to find a free inode.
*/
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
- const struct qstr *qstr, __u32 goal, uid_t *owner)
+ const struct qstr *qstr, __u32 goal, uid_t *owner,
+ int flags)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -643,6 +651,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);
+ if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
+ flags &= ~(EXT4_NEWI_HOT | EXT4_NEWI_COLD);
+
+ /*
+ * We will only allow the HOT flag if the user passes the
+ * reserved uid/gid check, or if she has CAP_SYS_RESOURCE
+ */
+ if ((flags & EXT4_NEWI_HOT) &&
+ !(sbi->s_resuid == current_fsuid() ||
+ ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+ capable(CAP_SYS_RESOURCE)))
+ flags &= ~EXT4_NEWI_HOT;
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -654,9 +675,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
}
if (S_ISDIR(mode))
- ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+ ret2 = find_group_orlov(sb, dir, &group, mode, qstr, flags);
else
- ret2 = find_group_other(sb, dir, &group, mode);
+ ret2 = find_group_other(sb, dir, &group, mode, flags);
got_group:
EXT4_I(dir)->i_last_alloc_group = group;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f39f80f..2b3d65c 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -469,7 +469,7 @@ int ext4_ext_migrate(struct inode *inode)
owner[0] = inode->i_uid;
owner[1] = inode->i_gid;
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
- S_IFREG, NULL, goal, owner);
+ S_IFREG, NULL, goal, owner, 0);
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6f48ff8..222a419 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1742,6 +1742,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
handle_t *handle;
struct inode *inode;
int err, retries = 0;
+ int flags = 0;
dquot_initialize(dir);
@@ -1755,7 +1756,13 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+ if (op && op->open_flag & O_HOT)
+ flags |= EXT4_NEWI_HOT;
+ if (op && op->open_flag & O_COLD)
+ flags |= EXT4_NEWI_COLD;
+
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0,
+ NULL, flags);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -1791,7 +1798,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL, 0);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1831,7 +1838,7 @@ retry:
ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
- &dentry->d_name, 0, NULL);
+ &dentry->d_name, 0, NULL, 0);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2278,7 +2285,7 @@ retry:
ext4_handle_sync(handle);
inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
- &dentry->d_name, 0, NULL);
+ &dentry->d_name, 0, NULL, 0);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
--
1.7.10.rc3
On 4/19/12 2:20 PM, Theodore Ts'o wrote:
> Wire up the use of the O_HOT and O_COLD open flags so that when an
> inode is being created, it can influence which part of the disk gets
> used on rotational storage devices.
I'm curious to know how this will work for example on a linear device
made up of rotational devices (possibly a concat of raids, etc).
At least for dm, it will still be marked as rotational,
but the relative speed of regions of the linear device can't be inferred
from the offset within the device.
Do we really have enough information about the storage under us to
know what parts are "fast" and what parts are "slow?"
-Eric
> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
> fs/ext4/ext4.h | 8 +++++++-
> fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------
> fs/ext4/migrate.c | 2 +-
> fs/ext4/namei.c | 15 +++++++++++----
> 4 files changed, 46 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 0e01e90..6539c9a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1553,6 +1553,12 @@ struct ext4_dir_entry_2 {
> #define EXT4_MAX_REC_LEN ((1<<16)-1)
>
> /*
> + * Flags for ext4_new_inode()
> + */
> +#define EXT4_NEWI_HOT 0x0001
> +#define EXT4_NEWI_COLD 0x0002
> +
> +/*
> * If we ever get support for fs block sizes > page_size, we'll need
> * to remove the #if statements in the next two functions...
> */
> @@ -1850,7 +1856,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct
> /* ialloc.c */
> extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
> const struct qstr *qstr, __u32 goal,
> - uid_t *owner);
> + uid_t *owner, int flags);
> extern void ext4_free_inode(handle_t *, struct inode *);
> extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
> extern unsigned long ext4_count_free_inodes(struct super_block *);
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 409c2ee..3dcc8c8 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -363,7 +363,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
>
> static int find_group_orlov(struct super_block *sb, struct inode *parent,
> ext4_group_t *group, umode_t mode,
> - const struct qstr *qstr)
> + const struct qstr *qstr, int flags)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -508,13 +508,20 @@ fallback_retry:
> }
>
> static int find_group_other(struct super_block *sb, struct inode *parent,
> - ext4_group_t *group, umode_t mode)
> + ext4_group_t *group, umode_t mode, int flags)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
> struct ext4_group_desc *desc;
> int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
>
> + if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) &&
> + (parent_group > ngroups / 3))
> + parent_group = 0;
> + if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) &&
> + (parent_group < (2 * (ngroups / 3))))
> + parent_group = 2 * (ngroups / 3);
> +
> /*
> * Try to place the inode is the same flex group as its
> * parent. If we can't find space, use the Orlov algorithm to
> @@ -550,7 +557,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
> *group = parent_group + flex_size;
> if (*group > ngroups)
> *group = 0;
> - return find_group_orlov(sb, parent, group, mode, NULL);
> + return find_group_orlov(sb, parent, group, mode, NULL, flags);
> }
>
> /*
> @@ -614,7 +621,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
> * group to find a free inode.
> */
> struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
> - const struct qstr *qstr, __u32 goal, uid_t *owner)
> + const struct qstr *qstr, __u32 goal, uid_t *owner,
> + int flags)
> {
> struct super_block *sb;
> struct buffer_head *inode_bitmap_bh = NULL;
> @@ -643,6 +651,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
> ei = EXT4_I(inode);
> sbi = EXT4_SB(sb);
>
> + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev)))
> + flags &= ~(EXT4_NEWI_HOT | EXT4_NEWI_COLD);
> +
> + /*
> + * We will only allow the HOT flag if the user passes the
> + * reserved uid/gid check, or if she has CAP_SYS_RESOURCE
> + */
> + if ((flags & EXT4_NEWI_HOT) &&
> + !(sbi->s_resuid == current_fsuid() ||
> + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
> + capable(CAP_SYS_RESOURCE)))
> + flags &= ~EXT4_NEWI_HOT;
> +
> if (!goal)
> goal = sbi->s_inode_goal;
>
> @@ -654,9 +675,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
> }
>
> if (S_ISDIR(mode))
> - ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
> + ret2 = find_group_orlov(sb, dir, &group, mode, qstr, flags);
> else
> - ret2 = find_group_other(sb, dir, &group, mode);
> + ret2 = find_group_other(sb, dir, &group, mode, flags);
>
> got_group:
> EXT4_I(dir)->i_last_alloc_group = group;
> diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
> index f39f80f..2b3d65c 100644
> --- a/fs/ext4/migrate.c
> +++ b/fs/ext4/migrate.c
> @@ -469,7 +469,7 @@ int ext4_ext_migrate(struct inode *inode)
> owner[0] = inode->i_uid;
> owner[1] = inode->i_gid;
> tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
> - S_IFREG, NULL, goal, owner);
> + S_IFREG, NULL, goal, owner, 0);
> if (IS_ERR(tmp_inode)) {
> retval = PTR_ERR(tmp_inode);
> ext4_journal_stop(handle);
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 6f48ff8..222a419 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -1742,6 +1742,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
> handle_t *handle;
> struct inode *inode;
> int err, retries = 0;
> + int flags = 0;
>
> dquot_initialize(dir);
>
> @@ -1755,7 +1756,13 @@ retry:
> if (IS_DIRSYNC(dir))
> ext4_handle_sync(handle);
>
> - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
> + if (op && op->open_flag & O_HOT)
> + flags |= EXT4_NEWI_HOT;
> + if (op && op->open_flag & O_COLD)
> + flags |= EXT4_NEWI_COLD;
> +
> + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0,
> + NULL, flags);
> err = PTR_ERR(inode);
> if (!IS_ERR(inode)) {
> inode->i_op = &ext4_file_inode_operations;
> @@ -1791,7 +1798,7 @@ retry:
> if (IS_DIRSYNC(dir))
> ext4_handle_sync(handle);
>
> - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
> + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL, 0);
> err = PTR_ERR(inode);
> if (!IS_ERR(inode)) {
> init_special_inode(inode, inode->i_mode, rdev);
> @@ -1831,7 +1838,7 @@ retry:
> ext4_handle_sync(handle);
>
> inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
> - &dentry->d_name, 0, NULL);
> + &dentry->d_name, 0, NULL, 0);
> err = PTR_ERR(inode);
> if (IS_ERR(inode))
> goto out_stop;
> @@ -2278,7 +2285,7 @@ retry:
> ext4_handle_sync(handle);
>
> inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
> - &dentry->d_name, 0, NULL);
> + &dentry->d_name, 0, NULL, 0);
> err = PTR_ERR(inode);
> if (IS_ERR(inode))
> goto out_stop;
On Thu, Apr 19, 2012 at 02:45:28PM -0500, Eric Sandeen wrote:
>
> I'm curious to know how this will work for example on a linear device
> made up of rotational devices (possibly a concat of raids, etc).
>
> At least for dm, it will still be marked as rotational,
> but the relative speed of regions of the linear device can't be inferred
> from the offset within the device.
Hmm, good point. We need a way to determine whether this is some kind
of glued-together dm thing versus a plain-old HDD.
> Do we really have enough information about the storage under us to
> know what parts are "fast" and what parts are "slow?"
Well, plain and simple HDD's are still quite common; not everyone
drops in an intermediate dm layer. I view dm as being similar to
enterprise storage arrays, where we will need to pass an explicit
hint with block ranges down to the storage device. However, it's
going to be a long time before we get that part of the interface
plumbed in.
In the meantime, it would be nice if we had something that worked in
the common case of plain old stupid HDD's --- we just need a way of
determining that's what we are dealing with.
- Ted
On 2012-04-19, at 1:59 PM, Ted Ts'o wrote:
> On Thu, Apr 19, 2012 at 02:45:28PM -0500, Eric Sandeen wrote:
>>
>> I'm curious to know how this will work for example on a linear device
>> made up of rotational devices (possibly a concat of raids, etc).
>>
>> At least for dm, it will still be marked as rotational,
>> but the relative speed of regions of the linear device can't be inferred from the offset within the device.
>
> Hmm, good point. We need a way to determine whether this is some kind
> of glued-together dm thing versus a plain-old HDD.
I would posit that in a majority of cases low-address blocks
are much more likely to be "fast" than high-address blocks. This
is true for RAID-0/1/5/6 and for most LVs built atop those devices
(since they are allocated in low-to-high offset order).
It is true that some less common configurations (the above dm-concat)
may not follow this rule, but in that case the filesystem is not
worse off compared to not having this information at all.
>> Do we really have enough information about the storage under us to
>> know what parts are "fast" and what parts are "slow?"
>
> Well, plain and simple HDD's are still quite common; not everyone
> drops in an intermediate dm layer. I view dm as being similar to
> enterprise storage arrays where we will need to pass down an explicit
> hint with block ranges down to the storage device. However, it's
> going to be a long time before we get that part of the interface
> plumbed in.
>
> In the meantime, it would be nice if we had something that worked in
> the common case of plain old stupid HDD's --- we just need a way of
> determining that's what we are dealing with.
Also, if the admin knows (or can control) what these hints mean, then
they can configure the storage explicitly to match the usage. I've
long been a proponent of configuring LVs with hybrid SSD+HDD storage,
so that ext4 can allocate inodes + directories on the SSD part of each
flex_bg, and files on the RAID-6 part of the flex_bg. This kind of
API would allow files to be hinted similarly.
While having flexible kernel APIs that allowed the upper layers to
understand the underlying layout would be great, I also don't imagine
that this will arrive any time soon. It will also take userspace and
application support to be able to leverage that, and we have to start
somewhere.
Cheers, Andreas
--
Andreas Dilger Whamcloud, Inc.
Principal Lustre Engineer http://www.whamcloud.com/
On Thu, Apr 19, 2012 at 03:20:11PM -0400, Theodore Ts'o wrote:
> Wire up the use of the O_HOT and O_COLD open flags so that when an
> inode is being created, it can influence which part of the disk gets
> used on rotational storage devices.
.....
> @@ -508,13 +508,20 @@ fallback_retry:
> }
>
> static int find_group_other(struct super_block *sb, struct inode *parent,
> - ext4_group_t *group, umode_t mode)
> + ext4_group_t *group, umode_t mode, int flags)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
> struct ext4_group_desc *desc;
> int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
>
> + if ((flags & EXT4_NEWI_HOT) && (ngroups > 3) &&
> + (parent_group > ngroups / 3))
> + parent_group = 0;
> + if ((flags & EXT4_NEWI_COLD) && (ngroups > 3) &&
> + (parent_group < (2 * (ngroups / 3))))
> + parent_group = 2 * (ngroups / 3);
> +
So you're assuming that locating the inodes somewhere "hot" is going
to improve performance. So say an application has a "hot" file (say
an index file) but still has a lot of other files it creates and
reads, and they are all in the same directory.
If the index file is created "hot", then it is going to be placed a
long way away from all the other files that application is using,
and every time you access the hot file you now seek away to a
different location on disk. The net result: the application goes
slower because average seek times have increased.
Essentially, an application is going to have to claim that all files it
is working on at any point in time are either hot, normal or cold,
otherwise it is going to seek between hot, normal and cold regions
all the time. That's going to increase average seek times compared
to having all the files in the same general location, hot, cold or
otherwise.
Note: I'm not saying that O_HOT/O_COLD is a bad idea, just that it's
going to be hard to implement in a way that behaves consistently with
what users would expect - i.e. improves performance. IMO,
unless you have tiered storage and knowledge of the underlying block
device characteristics, then HOT/COLD are going to be very difficult
to implement sanely....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 04/19/2012 02:20 PM, Theodore Ts'o wrote:
> As I had brought up during one of the lightning talks at the Linux
> Storage and Filesystem workshop, I am interested in introducing two new
> open flags, O_HOT and O_COLD. These flags are passed down to the
> individual file system's inode operations' create function, and the file
> system can use these flags as a hint regarding whether the file is
> likely to be accessed frequently or not.
>
> In the future I plan to do further work on how ext4 would use these
> flags, but I want to first get the ability to pass these flags plumbed
> into the VFS layer and the code points for O_HOT and O_COLD reserved.
I don't like it.
I do think that the idea of being able to communicate information
like this to the filesystem is good, and we ought to be investigating
that.
But I have two initial concerns: setting this attribute at create time;
and ambiguity in interpreting what it represents.
These flags are stating that for the lifetime of the file being
created it is "hot" (or "cold"). I think very rarely will whichever
value is set be appropriate for a file's entire lifetime.
I would rather see "hotness" be a attribute of an open that did not
persist after final close. I realize that precludes making an initial
placement decision for a likely hot (or not) file for some filesystems,
but then again, that's another reason why I have a problem with it.
The scenario I'm thinking about is that users could easily request
hot files repeatedly, and could thereby quickly exhaust all available
speedy-quick media designated to serve this purpose--and that will
be especially bad for those filesystems which base initial allocation
decisions on this.
I would prefer to see something like this communicated via fcntl().
It already passes information down to the underlying filesystem in
some cases so you avoid touching all these create interfaces.
The second problem is that "hot/cold" is a lot like "performance."
What is meant by "hot" really depends on what you want. I think it
most closely aligns with frequent access, but someone might want
it to mean "very write-y" or "needing exceptionally low latency"
or "hammering on it from lots of concurrent threads" or "notably
good looking." In any case, there are lots of possible hints
that a filesystem could benefit from, but if we're going to start
down that path I suggest "hot/cold" is not the right kind of
naming scheme we ought to be using.
-Alex
>
> Theodore Ts'o (3):
> fs: add new open flags O_HOT and O_COLD
> fs: propagate the open_flags structure down to the low-level fs's
> create()
> ext4: use the O_HOT and O_COLD open flags to influence inode
> allocation
>
> fs/9p/vfs_inode.c | 2 +-
> fs/affs/affs.h | 2 +-
> fs/affs/namei.c | 3 ++-
> fs/bfs/dir.c | 2 +-
> fs/btrfs/inode.c | 3 ++-
> fs/cachefiles/namei.c | 3 ++-
> fs/ceph/dir.c | 2 +-
> fs/cifs/dir.c | 2 +-
> fs/coda/dir.c | 3 ++-
> fs/ecryptfs/inode.c | 5 +++--
> fs/exofs/namei.c | 2 +-
> fs/ext2/namei.c | 4 +++-
> fs/ext3/namei.c | 5 +++--
> fs/ext4/ext4.h | 8 +++++++-
> fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------
> fs/ext4/migrate.c | 2 +-
> fs/ext4/namei.c | 17 ++++++++++++-----
> fs/fat/namei_msdos.c | 2 +-
> fs/fat/namei_vfat.c | 2 +-
> fs/fcntl.c | 5 +++--
> fs/fuse/dir.c | 2 +-
> fs/gfs2/inode.c | 3 ++-
> fs/hfs/dir.c | 2 +-
> fs/hfsplus/dir.c | 5 +++--
> fs/hostfs/hostfs_kern.c | 2 +-
> fs/hugetlbfs/inode.c | 4 +++-
> fs/internal.h | 6 ------
> fs/jffs2/dir.c | 5 +++--
> fs/jfs/namei.c | 2 +-
> fs/logfs/dir.c | 2 +-
> fs/minix/namei.c | 2 +-
> fs/namei.c | 9 +++++----
> fs/ncpfs/dir.c | 5 +++--
> fs/nfs/dir.c | 6 ++++--
> fs/nfsd/vfs.c | 4 ++--
> fs/nilfs2/namei.c | 2 +-
> fs/ocfs2/namei.c | 3 ++-
> fs/omfs/dir.c | 2 +-
> fs/ramfs/inode.c | 3 ++-
> fs/reiserfs/namei.c | 5 +++--
> fs/sysv/namei.c | 4 +++-
> fs/ubifs/dir.c | 2 +-
> fs/udf/namei.c | 2 +-
> fs/ufs/namei.c | 2 +-
> fs/xfs/xfs_iops.c | 3 ++-
> include/asm-generic/fcntl.h | 7 +++++++
> include/linux/fs.h | 14 ++++++++++++--
> ipc/mqueue.c | 2 +-
> 48 files changed, 143 insertions(+), 74 deletions(-)
>
On Fri, Apr 20, 2012 at 09:27:57AM +1000, Dave Chinner wrote:
> So you're assuming that locating the inodes somewhere "hot" is going
> to improve performance. So say an application has a "hot" file (say
> an index file) but still has a lot of other files it creates and
> reads, and they are all in the same directory.
>
> If the index file is created "hot", then it is going to be placed a
> long way away from all the other files that applciation is using,
> and every time you access the hot file you now seek away to a
> different location on disk. The net result: the application goes
> slower because average seek times have increased.
Well, let's assume the application is using all or most of the disk,
so the objects it is fetching from the 2T disk are randomly
distributed throughout the disk. Short seeks are faster, yes, but the
seek time as a function of the seek distance is decidedly non-linear,
with a sharp "knee" in the curve at around 10-15% of a full-stroke
seek. (Ref:
http://static.usenix.org/event/fast05/tech/schlosser/schlosser.pdf)
So as you seek back and forth fetching data objects, most of the
time you will be incurring 75-85% of the cost of a worst-case seek
anyway. Seeking *is* going to be a fact of life that we can't run
away from.
Given that, the question then is whether we are better off (a) putting
the index files in the exact middle of the disk, trying to minimize
seeks, (b) scattering the index files all over the disk randomly, or
(c) concentrating the index files near the beginning of the disk?
Given the non-linear seek times, it seems to suggest that (c) would
probably be the best case for this use case.
Note that when we short-stroke, it's not just a matter of minimizing
seek distances; if it were, then it wouldn't matter if we used the
first third of the disk closest to the outer edge, or the last third
of the disk closer to the inner part of the disk.
Granted this may be a relatively small effect compared to the huge
wins of placing your data according to its usage frequency on tiered
storage. But the effect should still be there.
Cheers,
- Ted
On Thu, Apr 19, 2012 at 07:26:17PM -0500, Alex Elder wrote:
>
> The scenario I'm thinking about is that users could easily request
> hot files repeatedly, and could thereby quickly exhaust all available
> speedy-quick media designated to serve this purpose--and that will
> be especially bad for those filesystems which base initial allocation
> decisions on this.
Sure, there will need to be some controls on this. In the sample
implementation, it required CAP_SYS_RESOURCE, or the uid or gid had to
match the res_uid/res_gid stored in the ext2/3/4 superblock (this was
already there to allow certain users or groups access to the reserved
free space on the file system). I could imagine other implementations
using a full-fledged quota system.
> I would prefer to see something like this communicated via fcntl().
> It already passes information down to the underlying filesystem in
> some cases so you avoid touching all these create interfaces.
Well, programs could also set or clear these flags via fcntl's
SETFL/GETFL. The reason why I'm interested in having this flexibility
is so that it's possible for applications to pass in these flags at
open time or via fcntl.
> The second problem is that "hot/cold" is a lot like "performance."
> What is meant by "hot" really depends on what you want. I think it
> most closely aligns with frequent access, but someone might want
> it to mean "very write-y" or "needing exceptionally low latency"
> or "hammering on it from lots of concurrent threads" or "notably
> good looking." In any case, there are lots of possible hints
> that a filesystem could benefit from, but if we're going to start
> down that path I suggest "hot/cold" is not the right kind of
> naming scheme we ought to be using.
There are two ways we could go with this. One is to try to define,
very precisely, the semantics of the performance flags that the
application program might want to request. Going down that path
leads to something like what the T10 folks have done, with multiple
4-bit sliders specifying write frequency, read frequency, retention
levels, etc. in great, exhaustive detail.
The other approach is to leave things roughly undefined, and accept
the fact that applications which use this will probably be specialized
applications that are very much aware of what file system they are
using, and just need a general way to pass minimal hints down to the
file system; that's the approach I went with in this O_HOT/O_COLD proposal.
I suspect that HOT/COLD is enough to go quite far even for tiered
storage; maybe at some point we will want some other, more
fine-grained interface where an application program can very precisely
dial in their requirements in a T10-like fashion. Perhaps. But I
don't think having a simple O_HOT/O_COLD interface precludes the
other, or vice versa.  In fact, one advantage of sticking with
HOT/COLD is that there's much less chance of bike-shedding, with
people arguing over what a more fine-grained interface might look like.
So why not start with this, and if we need to use something more
complex later, we can cross that bridge if and when we get to it? In
the meantime, I think there are valid uses of this simple, minimal
interface in the case of a local disk file system supporting a cluster
file system such as Hadoopfs or TFS. One of the useful things that
came out of the ext4 workshop where we got to talk to developers from
Taobao was finding out how much their interests matched with some of
the things we've talked about doing at Google to support our internal
customers.
- Ted
On 04/19/2012 10:20 PM, Theodore Ts'o wrote:
> As I had brought up during one of the lightning talks at the Linux
> Storage and Filesystem workshop, I am interested in introducing two new
> open flags, O_HOT and O_COLD. These flags are passed down to the
> individual file system's inode operations' create function, and the file
> system can use these flags as a hint regarding whether the file is
> likely to be accessed frequently or not.
>
> In the future I plan to do further work on how ext4 would use these
> flags, but I want to first get the ability to pass these flags plumbed
> into the VFS layer and the code points for O_HOT and O_COLD reserved.
>
>
> Theodore Ts'o (3):
> fs: add new open flags O_HOT and O_COLD
> fs: propagate the open_flags structure down to the low-level fs's
> create()
> ext4: use the O_HOT and O_COLD open flags to influence inode
> allocation
>
I would expect that the first, and most important, patch in this
set would be the man page defining the new API.
What do you mean by cold/normal/hot? What is expected if it is
supported? How can we know whether it is supported? ....
I presume you mean 3 levels (not even 2 bits) of what T10 called
"read-frequency", or is that "write-frequency", or some other metric
you defined?
Well, in the patchset you supplied it means "closer to the outer edge".
Whatever that means in the case of ext4 on an SSD, or DM/MD, or
loop, or a thin-provisioned LUN. How do I stop it? The code is already
there in the kernel and the application is setting that flag at create
time; how do I make the FS not do that stupid (for me) thing?
I wish you'd be transparent: call it O_OUTER_DISK and be honest
about it. The "undefined API" has never worked in the past;
why would it work now?
And yes, an fcntl is a much better match, and with delayed allocation
that should not matter, right?
And one last thing: we would like to see numbers. Please show us where/how
it matters. Are there downsides? If it's so good, we'd like to implement
it too.
Thanks
Boaz
> fs/9p/vfs_inode.c | 2 +-
> fs/affs/affs.h | 2 +-
> fs/affs/namei.c | 3 ++-
> fs/bfs/dir.c | 2 +-
> fs/btrfs/inode.c | 3 ++-
> fs/cachefiles/namei.c | 3 ++-
> fs/ceph/dir.c | 2 +-
> fs/cifs/dir.c | 2 +-
> fs/coda/dir.c | 3 ++-
> fs/ecryptfs/inode.c | 5 +++--
> fs/exofs/namei.c | 2 +-
> fs/ext2/namei.c | 4 +++-
> fs/ext3/namei.c | 5 +++--
> fs/ext4/ext4.h | 8 +++++++-
> fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------
> fs/ext4/migrate.c | 2 +-
> fs/ext4/namei.c | 17 ++++++++++++-----
> fs/fat/namei_msdos.c | 2 +-
> fs/fat/namei_vfat.c | 2 +-
> fs/fcntl.c | 5 +++--
> fs/fuse/dir.c | 2 +-
> fs/gfs2/inode.c | 3 ++-
> fs/hfs/dir.c | 2 +-
> fs/hfsplus/dir.c | 5 +++--
> fs/hostfs/hostfs_kern.c | 2 +-
> fs/hugetlbfs/inode.c | 4 +++-
> fs/internal.h | 6 ------
> fs/jffs2/dir.c | 5 +++--
> fs/jfs/namei.c | 2 +-
> fs/logfs/dir.c | 2 +-
> fs/minix/namei.c | 2 +-
> fs/namei.c | 9 +++++----
> fs/ncpfs/dir.c | 5 +++--
> fs/nfs/dir.c | 6 ++++--
> fs/nfsd/vfs.c | 4 ++--
> fs/nilfs2/namei.c | 2 +-
> fs/ocfs2/namei.c | 3 ++-
> fs/omfs/dir.c | 2 +-
> fs/ramfs/inode.c | 3 ++-
> fs/reiserfs/namei.c | 5 +++--
> fs/sysv/namei.c | 4 +++-
> fs/ubifs/dir.c | 2 +-
> fs/udf/namei.c | 2 +-
> fs/ufs/namei.c | 2 +-
> fs/xfs/xfs_iops.c | 3 ++-
> include/asm-generic/fcntl.h | 7 +++++++
> include/linux/fs.h | 14 ++++++++++++--
> ipc/mqueue.c | 2 +-
> 48 files changed, 143 insertions(+), 74 deletions(-)
>
On 04/20/2012 05:45 AM, Ted Ts'o wrote:
> The other approach is to leave things roughly undefined, and accept
> the fact that applications which use this will probably be specialized
> applications that are very much aware of what file system they are
> using,
If that is the case, then I prefer an FS-specific ioctl, since the app
already has FS-specific code built in.
> and just need to pass minimal hints to the application in a
> general way, and that's the approach I went with in this O_HOT/O_COLD proposal.
>
You are contradicting yourself. Above you say a specific FS (read: ext4),
and here you say "general way".
Please show me how your proposal is not specific to ext4's outer rim on
devices that are single rotational disks.
What does "general way" mean?
> I suspect that HOT/COLD is enough to go quite far even for tiered
> storage; maybe at some point we will want some other, more
> fine-grained interface where an application program can very precisely
> dial in their requirements in a T10-like fashion. Perhaps. But I
> don't think having a simple O_HOT/O_COLD interface precludes the
> other, or vice versa. In fact, one advantage with sticking with
> HOT/COLD is that there's much less chance of bike-shedding, with
> people arguing over what a more fine-grained interface might look like.
>
But bike-shedding is exactly what you propose (well, not that you actually
stated what you propose).
Your patch says "beginning of the disk" but the flag is called O_HOT;
that's bike-shedding. You hope there will be a new meaning for it in the
future.
> So why not start with this, and if we need to use something more
> complex later, we can cross that bridge if and when we get to it? In
> the meantime, I think there are valid uses of this simple, minimal
> interface in the case of a local disk file system supporting a cluster
> file system such as Hadoopfs or TFS. One of the useful things that
> came out of the ext4 workshop where we got to talk to developers from
> Taobao was finding out how much their interests matched with some of
> the things we've talked about doing at Google to support our internal
> customers.
>
This all reads as ext4-specific / app-specific. Why a general API? And
why must it be at create time?
> - Ted
Thanks
Boaz
On Fri, 20 Apr 2012, Boaz Harrosh wrote:
> On 04/19/2012 10:20 PM, Theodore Ts'o wrote:
>
> > As I had brought up during one of the lightning talks at the Linux
> > Storage and Filesystem workshop, I am interested in introducing two new
> > open flags, O_HOT and O_COLD. These flags are passed down to the
> > individual file system's inode operations' create function, and the file
> > system can use these flags as a hint regarding whether the file is
> > likely to be accessed frequently or not.
> >
> > In the future I plan to do further work on how ext4 would use these
> > flags, but I want to first get the ability to pass these flags plumbed
> > into the VFS layer and the code points for O_HOT and O_COLD reserved.
> >
> >
> > Theodore Ts'o (3):
> > fs: add new open flags O_HOT and O_COLD
> > fs: propagate the open_flags structure down to the low-level fs's
> > create()
> > ext4: use the O_HOT and O_COLD open flags to influence inode
> > allocation
> >
>
>
> I would expect that the first, and most important patch to this
> set would be the man page which would define the new API.
> What do you mean by cold/normal/hot? what is expected if supported?
> how can we know if supported? ....
Well, this is exactly my concern as well. There is no way anyone would
know what it actually means and what users can expect from using it. The
result of this is very simple: everyone will just use O_HOT for
everything (if they use it at all).
Ted, as I've mentioned at LSF, I think that the HOT/COLD name is a really
bad choice for exactly this reason. It means nothing. If you want to use
this flag to place the inode on the faster part of the disk, then just
say so and name the flag accordingly; that way everyone can use it.
However, for this to actually work we need some fs<->storage interface to
query the storage layout, which actually should not be that hard to do. I
am afraid that in its current form it will suit only Google and Taobao. I
would really like to have an interface to pass tags between user->fs and
fs<->storage, but this one does not seem like a good start.
There was one flag you mentioned at LSF which makes sense to me, but
unfortunately I do not see it here. It is O_TEMP, which says exactly
how a user should use it, and hence it would be useful.
Also, we have to think about the interface for passing tags from users,
because clearly open flags do not scale. fcntl or fadvise might be a
better choice, but I understand that in some cases we need to have this
information at allocation time, and I am not sure whether we can rely on
delayed allocation for that (it seems really hacky). Or maybe it could be
an fadvise/fcntl flag for a directory, since files in one directory might
have similar access patterns; that also has the advantage of forcing
users to divide their files into directories according to their use,
which would be beneficial anyway.
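To make the delayed-allocation variant concrete, it would be something
like the sketch below; O_HOT here only stands in for whatever tag we end
up with, and the value is a placeholder, not the proposed code point:

    #include <fcntl.h>
    #include <unistd.h>

    #ifndef O_HOT
    #define O_HOT 0400000000   /* placeholder bit for the example only */
    #endif

    int write_hot(const char *path, const char *buf, size_t len)
    {
            int fd = open(path, O_CREAT | O_RDWR, 0644);

            if (fd < 0)
                    return -1;
            /* hint set after open but before any blocks exist */
            fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_HOT);
            /* with delayed allocation, blocks are only allocated at
             * writeback time, after the hint has been set */
            write(fd, buf, len);
            return close(fd);
    }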
I have to admit that I do not have any particularly strong feeling about
any of those approaches (open/fcntl/fadvise/directory), but someone else
might... But I definitely think that we need to define the interface
well, and also rather do it bottom-up. There is already a need for an
fs<->storage information exchange interface for a variety of reasons, so
why not start there first to see what can be provided?
Thanks!
-Lukas
>
> I presume you mean 3 levels (not even 2 bits) of what T10 called
> "read-frequency" or is that "write-frequency", or some other metrics
> you defined?
>
> Well in the patchset you supplied it means closer to outer-edge.
> What ever that means? so in the case of ext4 on SSD or DM/MD or
> loop or thin provisioned LUN. How do I stop it. The code is already
> there in Kernel and the application is setting that flag at create,
> how do I make the FS not do that stupid, for me, thing?
>
> I wish you'd be transparent, call it O_OUTER_DISK and be honest
> about it. The "undefined API" never ever worked in the past,
> why would it work now?
>
> And Yes an fctrl is a much better match, and with delayed allocation
> that should not matter, right?
>
> And one last thing. We would like to see numbers. Please show us where/how
> it matters. Are there down sides?. If it's so good we'd like to implement
> it too.
>
> Thanks
> Boaz
>
> > fs/9p/vfs_inode.c | 2 +-
> > fs/affs/affs.h | 2 +-
> > fs/affs/namei.c | 3 ++-
> > fs/bfs/dir.c | 2 +-
> > fs/btrfs/inode.c | 3 ++-
> > fs/cachefiles/namei.c | 3 ++-
> > fs/ceph/dir.c | 2 +-
> > fs/cifs/dir.c | 2 +-
> > fs/coda/dir.c | 3 ++-
> > fs/ecryptfs/inode.c | 5 +++--
> > fs/exofs/namei.c | 2 +-
> > fs/ext2/namei.c | 4 +++-
> > fs/ext3/namei.c | 5 +++--
> > fs/ext4/ext4.h | 8 +++++++-
> > fs/ext4/ialloc.c | 33 +++++++++++++++++++++++++++------
> > fs/ext4/migrate.c | 2 +-
> > fs/ext4/namei.c | 17 ++++++++++++-----
> > fs/fat/namei_msdos.c | 2 +-
> > fs/fat/namei_vfat.c | 2 +-
> > fs/fcntl.c | 5 +++--
> > fs/fuse/dir.c | 2 +-
> > fs/gfs2/inode.c | 3 ++-
> > fs/hfs/dir.c | 2 +-
> > fs/hfsplus/dir.c | 5 +++--
> > fs/hostfs/hostfs_kern.c | 2 +-
> > fs/hugetlbfs/inode.c | 4 +++-
> > fs/internal.h | 6 ------
> > fs/jffs2/dir.c | 5 +++--
> > fs/jfs/namei.c | 2 +-
> > fs/logfs/dir.c | 2 +-
> > fs/minix/namei.c | 2 +-
> > fs/namei.c | 9 +++++----
> > fs/ncpfs/dir.c | 5 +++--
> > fs/nfs/dir.c | 6 ++++--
> > fs/nfsd/vfs.c | 4 ++--
> > fs/nilfs2/namei.c | 2 +-
> > fs/ocfs2/namei.c | 3 ++-
> > fs/omfs/dir.c | 2 +-
> > fs/ramfs/inode.c | 3 ++-
> > fs/reiserfs/namei.c | 5 +++--
> > fs/sysv/namei.c | 4 +++-
> > fs/ubifs/dir.c | 2 +-
> > fs/udf/namei.c | 2 +-
> > fs/ufs/namei.c | 2 +-
> > fs/xfs/xfs_iops.c | 3 ++-
> > include/asm-generic/fcntl.h | 7 +++++++
> > include/linux/fs.h | 14 ++++++++++++--
> > ipc/mqueue.c | 2 +-
> > 48 files changed, 143 insertions(+), 74 deletions(-)
> >
>
>
On 04/19/2012 09:20 PM, Theodore Ts'o wrote:
> As I had brought up during one of the lightning talks at the Linux
> Storage and Filesystem workshop, I am interested in introducing two new
> open flags, O_HOT and O_COLD. These flags are passed down to the
> individual file system's inode operations' create function, and the file
> system can use these flags as a hint regarding whether the file is
> likely to be accessed frequently or not.
>
> In the future I plan to do further work on how ext4 would use these
> flags, but I want to first get the ability to pass these flags plumbed
> into the VFS layer and the code points for O_HOT and O_COLD reserved.
>
Ted, do you still remember the directory-block read-ahead patches I sent
last year for ext4, which you declined because they would add another
mount parameter for ext4?
http://www.digipedia.pl/usenet/thread/11916/24502/#post24502
If an application could use those flags to tell the file system (and then
not only ext4, but any file system) that a certain directory is
important and frequently accessed, it would be simple to update those
patches to work without another mount option.
And while I am updating our FhGFS metadata on-disk layout to work around
the general problem, we (and, for example, Lustre) are still affected on
the object-storage side.
Thanks,
Bernd
On Fri, 20 Apr 2012, Bernd Schubert wrote:
> On 04/19/2012 09:20 PM, Theodore Ts'o wrote:
> > As I had brought up during one of the lightning talks at the Linux
> > Storage and Filesystem workshop, I am interested in introducing two new
> > open flags, O_HOT and O_COLD. These flags are passed down to the
> > individual file system's inode operations' create function, and the file
> > system can use these flags as a hint regarding whether the file is
> > likely to be accessed frequently or not.
> >
> > In the future I plan to do further work on how ext4 would use these
> > flags, but I want to first get the ability to pass these flags plumbed
> > into the VFS layer and the code points for O_HOT and O_COLD reserved.
> >
>
> Ted, you still remember the directory-block read-ahead patches I sent last
> year for ext4 and which you declined, as it would add another mount parameter
> for ext4?
>
> http://www.digipedia.pl/usenet/thread/11916/24502/#post24502
>
>
> If an application could use those flags to the file system (and then not only
> ext4, but any file system) to indicate a certain directory is important and
> frequently accessed, it would be simple to update those patches to work
> without another mount option.
> And while I updating our FhGFS meta data on disk layout to workaround the
> general problem we, we (and for example Lustre) are still affected the
> object-storage-side.
>
>
> Thanks,
> Bernd
Nice, there are probably lots of flags we can think of, so we
definitely need a good interface for them, as we certainly do not want to
use the O_HOT open flag here :)
Thanks!
-Lukas
On Fri, 2012-04-20 at 11:45 +0200, Lukas Czerner wrote:
> On Fri, 20 Apr 2012, Boaz Harrosh wrote:
>
> > On 04/19/2012 10:20 PM, Theodore Ts'o wrote:
> >
> > > As I had brought up during one of the lightning talks at the Linux
> > > Storage and Filesystem workshop, I am interested in introducing two new
> > > open flags, O_HOT and O_COLD. These flags are passed down to the
> > > individual file system's inode operations' create function, and the file
> > > system can use these flags as a hint regarding whether the file is
> > > likely to be accessed frequently or not.
> > >
> > > In the future I plan to do further work on how ext4 would use these
> > > flags, but I want to first get the ability to pass these flags plumbed
> > > into the VFS layer and the code points for O_HOT and O_COLD reserved.
> > >
> > >
> > > Theodore Ts'o (3):
> > > fs: add new open flags O_HOT and O_COLD
> > > fs: propagate the open_flags structure down to the low-level fs's
> > > create()
> > > ext4: use the O_HOT and O_COLD open flags to influence inode
> > > allocation
> > >
> >
> >
> > I would expect that the first, and most important patch to this
> > set would be the man page which would define the new API.
> > What do you mean by cold/normal/hot? what is expected if supported?
> > how can we know if supported? ....
>
> Well, this is exactly my concern as well. There is no way anyone would
> know what it actually means a what users can expect form using it. The
> result of this is very simple, everyone will just use O_HOT for
> everything (if they will use it at all).
>
> Ted, as I've mentioned on LSF I think that the HOT/COLD name is really
> bad choice for exactly this reason. It means nothing. If you want to use
> this flag to place the inode on the faster part of the disk, then just
> say so and name the flag accordingly, this way everyone can use it.
> However for this to actually work we need some fs<->storage interface to
> query storage layout, which actually should not be that hard to do. I am
> afraid that in current form it will suit only Google and Taobao. I would
> really like to have interface to pass tags between user->fs and
> fs<->storage, but this one does not seem like a good start.
I think this is a little unfair.  We already have the notion of hot and
cold pages within the page cache.  The definition for storage is
similar: a hot block is one which will likely be read again shortly, and
a cold block is one that likely won't (ignoring the 30-odd gradations in
between that the draft standard currently mandates).
The concern I have is that the notion of hot and cold files *isn't*
propagated to the page cache; it's just shared between the fs and the
disk.  It looks like we could tie the notion of a file opened with O_HOT
or O_COLD into the page reclaimers and actually call
free_hot_cold_page() with the correct flag, meaning we might get an
immediate benefit even in the absence of hint-supporting disks.
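Very roughly, and purely as a sketch: the helper and the inode flag
below are made up, and only free_hot_cold_page() is the existing
interface, whose cold argument picks which end of the per-cpu free list
the page is returned to:

    /* hypothetical reclaim-side helper: when the last reference to a
     * page of an O_COLD file is dropped, hand it back as a cold page */
    static void release_hinted_page(struct inode *inode, struct page *page)
    {
            int cold = (inode->i_flags & S_COLD_HINT) != 0; /* S_COLD_HINT is invented */

            free_hot_cold_page(page, cold);
    }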
I cc'd linux-mm to see if there might be an interest in this ... or even
if it's worth it: I can also see we don't necessarily want userspace to
be able to tamper with our idea of what's hot and cold in the page
cache, since we get it primarily from the lru lists.
James
On Fri, 20 Apr 2012, James Bottomley wrote:
> On Fri, 2012-04-20 at 11:45 +0200, Lukas Czerner wrote:
> > On Fri, 20 Apr 2012, Boaz Harrosh wrote:
> >
> > > On 04/19/2012 10:20 PM, Theodore Ts'o wrote:
> > >
> > > > As I had brought up during one of the lightning talks at the Linux
> > > > Storage and Filesystem workshop, I am interested in introducing two new
> > > > open flags, O_HOT and O_COLD. These flags are passed down to the
> > > > individual file system's inode operations' create function, and the file
> > > > system can use these flags as a hint regarding whether the file is
> > > > likely to be accessed frequently or not.
> > > >
> > > > In the future I plan to do further work on how ext4 would use these
> > > > flags, but I want to first get the ability to pass these flags plumbed
> > > > into the VFS layer and the code points for O_HOT and O_COLD reserved.
> > > >
> > > >
> > > > Theodore Ts'o (3):
> > > > fs: add new open flags O_HOT and O_COLD
> > > > fs: propagate the open_flags structure down to the low-level fs's
> > > > create()
> > > > ext4: use the O_HOT and O_COLD open flags to influence inode
> > > > allocation
> > > >
> > >
> > >
> > > I would expect that the first, and most important patch to this
> > > set would be the man page which would define the new API.
> > > What do you mean by cold/normal/hot? what is expected if supported?
> > > how can we know if supported? ....
> >
> > Well, this is exactly my concern as well. There is no way anyone would
> > know what it actually means a what users can expect form using it. The
> > result of this is very simple, everyone will just use O_HOT for
> > everything (if they will use it at all).
> >
> > Ted, as I've mentioned on LSF I think that the HOT/COLD name is really
> > bad choice for exactly this reason. It means nothing. If you want to use
> > this flag to place the inode on the faster part of the disk, then just
> > say so and name the flag accordingly, this way everyone can use it.
> > However for this to actually work we need some fs<->storage interface to
> > query storage layout, which actually should not be that hard to do. I am
> > afraid that in current form it will suit only Google and Taobao. I would
> > really like to have interface to pass tags between user->fs and
> > fs<->storage, but this one does not seem like a good start.
>
> I think this is a little unfair. We already have the notion of hot and
> cold pages within the page cache. The definitions for storage is
> similar: a hot block is one which will likely be read again shortly and
> a cold block is one that likely won't (ignoring the 30 odd gradations of
> in-between that the draft standard currently mandates)
You're right, but there is a crucial difference: you cannot compare
a page with a file. A page will be read, or... well, not read so often,
but that's just one dimension. Files have a lot more dimensions: will it
be rewritten often? Will it be read often, or appended often? Do we need
really fast first access? Do we need fast metadata operations? Will
this file be there forever, or is it just temporary? Do we need fast
reads/writes? And many more...
>
> The concern I have is that the notion of hot and cold files *isn't*
> propagated to the page cache, it's just shared between the fs and the
> disk. It looks like we could tie the notion of file opened with O_HOT
> or O_COLD into the page reclaimers and actually call
> free_hot_cold_page() with the correct flag, meaning we might get an
> immediate benefit even in the absence of hint supporting disks.
And this is actually a very good idea, but the file flag should not be
O_HOT/O_COLD (and in this case whether it should be an open flag is
really disputable as well), but rather
hold-this-file-in-memory-longer-than-others, or
will-read-this-file-quite-often. Moreover, since with Ted's patches O_HOT
means put the file on the faster part of the disk (or rather whatever the
fs thinks is the fast part of the disk, since the interface to get such
information is missing), we already have one "meaning", and with this
we'll add yet another, completely different meaning to a single flag.
That seems messy.
Thanks!
-Lukas
>
> I cc'd linux-mm to see if there might be an interest in this ... or even
> if it's worth it: I can also see we don't necessarily want userspace to
> be able to tamper with our idea of what's hot and cold in the page
> cache, since we get it primarily from the lru lists.
>
> James
>
>
>
> > I cc'd linux-mm to see if there might be an interest in this ... or even
> > if it's worth it: I can also see we don't necessarily want userspace to
> > be able to tamper with our idea of what's hot and cold in the page
> > cache, since we get it primarily from the lru lists.
> >
> > James
The notion of hot and cold in the page allocator refers to processor
cache hotness and is used for pages on the per-cpu free lists.
For example, cold pages are used when I/O is soon expected to occur on
them, because we want to avoid having to evict cache lines; cold pages
were freed a long time ago.
Hot pages are those that have been recently freed (so we know that some
cache lines are still present), and thus it is likely that acquisition
by another process will allow that process to reuse the cache lines
already present, avoiding a trip to memory.
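For reference, this is roughly how the page cache already consumes that
notion: readahead allocates its pages with __GFP_COLD, since nothing
will touch them through the CPU before the I/O completes. Approximately
what include/linux/pagemap.h does:

    static inline struct page *page_cache_alloc_cold(struct address_space *x)
    {
            return __page_cache_alloc(mapping_gfp_mask(x) | __GFP_COLD);
    }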
On Fri, 2012-04-20 at 13:23 +0200, Lukas Czerner wrote:
> On Fri, 20 Apr 2012, James Bottomley wrote:
>
> > On Fri, 2012-04-20 at 11:45 +0200, Lukas Czerner wrote:
> > > On Fri, 20 Apr 2012, Boaz Harrosh wrote:
> > >
> > > > On 04/19/2012 10:20 PM, Theodore Ts'o wrote:
> > > >
> > > > > As I had brought up during one of the lightning talks at the Linux
> > > > > Storage and Filesystem workshop, I am interested in introducing two new
> > > > > open flags, O_HOT and O_COLD. These flags are passed down to the
> > > > > individual file system's inode operations' create function, and the file
> > > > > system can use these flags as a hint regarding whether the file is
> > > > > likely to be accessed frequently or not.
> > > > >
> > > > > In the future I plan to do further work on how ext4 would use these
> > > > > flags, but I want to first get the ability to pass these flags plumbed
> > > > > into the VFS layer and the code points for O_HOT and O_COLD reserved.
> > > > >
> > > > >
> > > > > Theodore Ts'o (3):
> > > > > fs: add new open flags O_HOT and O_COLD
> > > > > fs: propagate the open_flags structure down to the low-level fs's
> > > > > create()
> > > > > ext4: use the O_HOT and O_COLD open flags to influence inode
> > > > > allocation
> > > > >
> > > >
> > > >
> > > > I would expect that the first, and most important patch to this
> > > > set would be the man page which would define the new API.
> > > > What do you mean by cold/normal/hot? what is expected if supported?
> > > > how can we know if supported? ....
> > >
> > > Well, this is exactly my concern as well. There is no way anyone would
> > > know what it actually means a what users can expect form using it. The
> > > result of this is very simple, everyone will just use O_HOT for
> > > everything (if they will use it at all).
> > >
> > > Ted, as I've mentioned on LSF I think that the HOT/COLD name is really
> > > bad choice for exactly this reason. It means nothing. If you want to use
> > > this flag to place the inode on the faster part of the disk, then just
> > > say so and name the flag accordingly, this way everyone can use it.
> > > However for this to actually work we need some fs<->storage interface to
> > > query storage layout, which actually should not be that hard to do. I am
> > > afraid that in current form it will suit only Google and Taobao. I would
> > > really like to have interface to pass tags between user->fs and
> > > fs<->storage, but this one does not seem like a good start.
> >
> > I think this is a little unfair. We already have the notion of hot and
> > cold pages within the page cache. The definitions for storage is
> > similar: a hot block is one which will likely be read again shortly and
> > a cold block is one that likely won't (ignoring the 30 odd gradations of
> > in-between that the draft standard currently mandates)
>
> You're right, but there is a crucial difference, you can not compare
> a page with a file. Page will be read or .. well not read so often, but
> that's just one dimension. Files has a lot more dimensions, will it be
> rewritten often ? will it be read often, appended often, do we need
> really fast first access ? do we need fast metadata operation ? Will
> this file be there forever, or is it just temporary ? Do we need fast
> read/write ? and many more...
Yes and no.  I agree with your assessment.  The major point you could
ding me on, actually, is that just because a file is hot doesn't mean all
its pages are; it could have only a few hot pages in it.  You could also
argue that the time scale over which the page cache considers a page hot
and that over which a disk does the same might be so dissimilar as to
render the two usages orthogonal.
The points about read and write are valid, but we could extend the page
cache to them too.  For instance, our readahead decisions are made at a
bit of the wrong level (statically, in the block layer).  If the page
cache knew a file was streaming (a movie file, for instance), we could
adjust the readahead dynamically for that file.
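Applications can already ask for that explicitly, of course:

    #include <fcntl.h>

    /* existing interface: tell the kernel this fd will be read
     * sequentially, so it can ramp up readahead for the file */
    static int hint_streaming(int fd)
    {
            return posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
    }

But that only works when the application knows and bothers; the
interesting case is the page cache noticing it on its own.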
Where this might be leading is that the file/filesystem hints to the
page cache, and the page cache hints to the device. That way, we could
cope with the hot file with only a few hot pages case.
The drawback is that we really don't have much of this machinery in the
page cache at the moment, and it's questionable if we really want it.
Solving our readahead problem would be brilliant, especially if the
interface were hintable, but not necessarily if it involves huge
algorithmic expense in our current page cache.
> > The concern I have is that the notion of hot and cold files *isn't*
> > propagated to the page cache, it's just shared between the fs and the
> > disk. It looks like we could tie the notion of file opened with O_HOT
> > or O_COLD into the page reclaimers and actually call
> > free_hot_cold_page() with the correct flag, meaning we might get an
> > immediate benefit even in the absence of hint supporting disks.
>
> And this is actually very good idea, but the file flag should not be
> O_HOT/O_COLD (and in this case being it open flag is really disputable
> as well), but rather hold-this-file-in-memory-longer-than-others, or
> will-read-this-file-quite-often. Moreover since with Ted's patches O_HOT
> means put the file on faster part of the disk (or rather whatever fs
> thinks is fast part of the disk, since the interface to get such
> information is missing) we already have one "meaning" and with this
> we'll add yet another, completely different meaning to the single
> flag. That seems messy.
I'm not at all wedded to O_HOT and O_COLD; I think if we establish a
hint hierarchy file->page cache->device then we should, of course,
choose the best API and naming scheme for file->page cache. The only
real point I was making is that we should tie in the page cache, and
currently it only knows about "hot" and "cold" pages.
James
On Fri, Apr 20, 2012 at 06:42:08PM +0400, James Bottomley wrote:
>
> I'm not at all wedded to O_HOT and O_COLD; I think if we establish a
> hint hierarchy file->page cache->device then we should, of course,
> choose the best API and naming scheme for file->page cache. The only
> real point I was making is that we should tie in the page cache, and
> currently it only knows about "hot" and "cold" pages.
The problem is that "hot" and "cold" will have different meanings from
the perspective of the file system versus the page cache.  The file
system may consider a file "hot" if it is accessed frequently ---
compared to the other 2 TB of data on that HDD.  The memory subsystem
will consider a page "hot" compared to what has been recently accessed
in the 8GB of memory that you might have in your system.  Now consider
that you might have a dozen or so 2TB disks that each have their "hot"
areas, and it's not at all obvious that just because a file, or even
part of a file, is marked "hot", that it deserves to be in memory at
any particular point in time.
- Ted
On Thu, Apr 19, 2012 at 10:26:06PM -0400, Ted Ts'o wrote:
> On Fri, Apr 20, 2012 at 09:27:57AM +1000, Dave Chinner wrote:
> > So you're assuming that locating the inodes somewhere "hot" is going
> > to improve performance. So say an application has a "hot" file (say
> > an index file) but still has a lot of other files it creates and
> > reads, and they are all in the same directory.
> >
> > If the index file is created "hot", then it is going to be placed a
> > long way away from all the other files that applciation is using,
> > and every time you access the hot file you now seek away to a
> > different location on disk. The net result: the application goes
> > slower because average seek times have increased.
>
> Well, let's assume the application is using all or most of the disk,
> so the objects it is fetching from the 2T disk are randomly
> distributed throughout the disk.
Which is so far from most people's reality that it is not worth
considering.
> Short seeks are faster, yes, but the
> seek time as a function of the seek distance is decidedly non-linear,
> with a sharp "knee" in the curve at around 10-15% of a full-stroke
> seek. (Ref:
> http://static.usenix.org/event/fast05/tech/schlosser/schlosser.pdf)
>
> So most of the time, as you seek back and forth fetching data objects,
> most of the time you will be incurring 75-85% of the cost of a
> worst-case seek anyway. So seeking *is* going to be a fact of life
> that we can't run away from that.
>
> Given that, the question then is whether we are better off (a) putting
> the index files in the exact middle of the disk, trying to minimize
> seeks, (b) scattering the index files all over the disk randomly, or
> (c) concentrating the index files near the beginning of the disk?
> Given the non-linear seek times, it seems to suggest that (c) would
> probably be the best case for this use case.
I disagree - based on that paper, you're better off putting all the
related application data in the same place, and hoping it all fits
in that 10-15% minimal seek time region....
Besides, you missed my point - that it is trivial to come up with
examples of what application writers think are their hot/cold/normal
data whose optimal layout bears no resemblance to your proposed
hot/cold/normal inode layout. That's the fundamental problem here:
there is no obvious definition of HOT/COLD, and the best
implementation depends on how the application uses those flags
combined with the characteristics of the underlying storage. IOWs,
however you optimise it for a single spindle, a large percentage of
the time it is going to be detrimental to performance, not improve
it....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 04/19/2012 03:20 PM, Theodore Ts'o wrote:
> As I had brought up during one of the lightning talks at the Linux
> Storage and Filesystem workshop, I am interested in introducing two new
> open flags, O_HOT and O_COLD. These flags are passed down to the
> individual file system's inode operations' create function, and the file
> system can use these flags as a hint regarding whether the file is
> likely to be accessed frequently or not.
>
> In the future I plan to do further work on how ext4 would use these
> flags, but I want to first get the ability to pass these flags plumbed
> into the VFS layer and the code points for O_HOT and O_COLD reserved.
>
>
> Theodore Ts'o (3):
> fs: add new open flags O_HOT and O_COLD
> fs: propagate the open_flags structure down to the low-level fs's
> create()
> ext4: use the O_HOT and O_COLD open flags to influence inode
> allocation
Full-file seems awfully coarse-grained.  What about doing this at page
granularity, and hinting via the VM as well as the block layer?
Jeff
On 04/20/2012 07:01 AM, James Bottomley wrote:
> The concern I have is that the notion of hot and cold files *isn't*
> propagated to the page cache, it's just shared between the fs and the
> disk.
Bingo -- a full-file hint is too coarse-grained for some workloads.  Page
granularity would propagate to the VM as well as the block layer, and
give the required flexibility to all workloads, as well as covering the
full-file case.
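The VM side of that already exists in the form of madvise() on a mapped
range, for example:

    #include <sys/mman.h>

    /* page-granularity hints on a mapping, rather than a whole-file flag */
    static int hint_hot_range(void *addr, size_t len)
    {
            return madvise(addr, len, MADV_WILLNEED); /* expect access soon; start readback */
    }

    static int hint_cold_range(void *addr, size_t len)
    {
            return madvise(addr, len, MADV_DONTNEED); /* cached pages for the range can be dropped */
    }

What's missing is plumbing such per-range hints down through the
filesystem to the block layer.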
Jeff
On Fri, Apr 20, 2012 at 10:58 AM, Ted Ts'o <[email protected]> wrote:
> On Fri, Apr 20, 2012 at 06:42:08PM +0400, James Bottomley wrote:
>>
>> I'm not at all wedded to O_HOT and O_COLD; I think if we establish a
>> hint hierarchy file->page cache->device then we should, of course,
>> choose the best API and naming scheme for file->page cache. The only
>> real point I was making is that we should tie in the page cache, and
>> currently it only knows about "hot" and "cold" pages.
>
> The problem is that "hot" and "cold" will have different meanings from
> the perspective of the file system versus the page cache. The file
> system may consider a file "hot" if it is accessed frequently ---
> compared to the other 2 TB of data on that HDD. The memory subsystem
> will consider a page "hot" compared to what has been recently accessed
> in the 8GB of memory that you might have your system. Now consider
> that you might have a dozen or so 2TB disks that each have their "hot"
> areas, and it's not at all obvious that just because a file, or even
> part of a file is marked "hot", that it deserves to be in memory at
> any particular point in time.
So, these have intentionally different meanings; I don't see a reason why
the fs uses the hot/cold words. It seems to bring confusion.
But I don't know the full story of this feature and I might be overlooking
something.
On 22 April 2012 09:56, KOSAKI Motohiro <[email protected]> wrote:
> On Fri, Apr 20, 2012 at 10:58 AM, Ted Ts'o <[email protected]> wrote:
>> On Fri, Apr 20, 2012 at 06:42:08PM +0400, James Bottomley wrote:
>>>
>>> I'm not at all wedded to O_HOT and O_COLD; I think if we establish a
>>> hint hierarchy file->page cache->device then we should, of course,
>>> choose the best API and naming scheme for file->page cache. The only
>>> real point I was making is that we should tie in the page cache, and
>>> currently it only knows about "hot" and "cold" pages.
>>
>> The problem is that "hot" and "cold" will have different meanings from
>> the perspective of the file system versus the page cache. The file
>> system may consider a file "hot" if it is accessed frequently ---
>> compared to the other 2 TB of data on that HDD. The memory subsystem
>> will consider a page "hot" compared to what has been recently accessed
>> in the 8GB of memory that you might have your system. Now consider
>> that you might have a dozen or so 2TB disks that each have their "hot"
>> areas, and it's not at all obvious that just because a file, or even
>> part of a file is marked "hot", that it deserves to be in memory at
>> any particular point in time.
>
> So, this have intentionally different meanings I have no seen a reason why
> fs uses hot/cold words. It seems to bring a confusion.
Right. It has nothing to do with hot/cold usage in the page allocator,
which is about how many lines of that page are in the CPU cache.
However, it could be propagated up to the page reclaim level, at least.
Perhaps readahead/writeback too. But IMO it would be better to nail down
the semantics for the block and filesystem layers before getting worried
about that.
>
> But I don't know full story of this feature and I might be overlooking
> something.
Also, "hot" and "cold" (as others have noted) is a big hammer that perhaps
catches a tiny subset of useful work (probably more likely: benchmarks).
Is it read often? Written often? Both? Are reads and writes random or linear?
Is it latency bound, or throughput bound? (i.e., are queue depths high or
low?)
A filesystem and storage device might care about all of these things.
Particularly if you have something more advanced than a single disk.
Caches, tiers of storage, etc.
On Sun, 2012-04-22 at 16:30 +1000, Nick Piggin wrote:
> On 22 April 2012 09:56, KOSAKI Motohiro <[email protected]> wrote:
> > On Fri, Apr 20, 2012 at 10:58 AM, Ted Ts'o <[email protected]> wrote:
> >> On Fri, Apr 20, 2012 at 06:42:08PM +0400, James Bottomley wrote:
> >>>
> >>> I'm not at all wedded to O_HOT and O_COLD; I think if we establish a
> >>> hint hierarchy file->page cache->device then we should, of course,
> >>> choose the best API and naming scheme for file->page cache. The only
> >>> real point I was making is that we should tie in the page cache, and
> >>> currently it only knows about "hot" and "cold" pages.
> >>
> >> The problem is that "hot" and "cold" will have different meanings from
> >> the perspective of the file system versus the page cache. The file
> >> system may consider a file "hot" if it is accessed frequently ---
> >> compared to the other 2 TB of data on that HDD. The memory subsystem
> >> will consider a page "hot" compared to what has been recently accessed
> >> in the 8GB of memory that you might have your system. Now consider
> >> that you might have a dozen or so 2TB disks that each have their "hot"
> >> areas, and it's not at all obvious that just because a file, or even
> >> part of a file is marked "hot", that it deserves to be in memory at
> >> any particular point in time.
> >
> > So, this have intentionally different meanings I have no seen a reason why
> > fs uses hot/cold words. It seems to bring a confusion.
>
> Right. It has nothing to do with hot/cold usage in the page allocator,
> which is about how many lines of that page are in CPU cache.
Well, no, it's a similar concept: we have no idea whether the page is
cached or not.  What we do is estimate that by the elapsed time since we
last touched the page.  In some sense, this is similar to the fs
definition: a hot page hint would mean we expect to touch the page
frequently, and a cold page means we wouldn't.  I.e., for a hot page, the
elapsed time between touches would be short, and for a cold page it would
be long.  Now, I still think there's a mismatch in the time scales: a
long elapsed time for the mm making the page cold isn't necessarily the
same long elapsed time for the file, because the mm idea is conditioned
by local events (like memory pressure).
> However it could be propagated up to page reclaim level, at least.
> Perhaps readahead/writeback too. But IMO it would be better to nail down
> the semantics for block and filesystem before getting worried about that.
Sure ... I just forwarded the email in case mm people had an interest.
If you want FS and storage to develop the hints first and then figure
out if we can involve the page cache, that's more or less what was
happening anyway.
> > But I don't know full story of this feature and I might be overlooking
> > something.
>
> Also, "hot" and "cold" (as others have noted) is a big hammer that perhaps
> catches a tiny subset of useful work (probably more likely: benchmarks).
>
> Is it read often? Written often? Both? Are reads and writes random or linear?
> Is it latency bound, or throughput bound? (i.e., are queue depths high or
> low?)
>
> A filesystem and storage device might care about all of these things.
> Particularly if you have something more advanced than a single disk.
> Caches, tiers of storage, etc.
Experience has taught me to be wary of fine-grained hints: they tend to
be more trouble than they're worth (the definitions are either
inaccurate or so tediously precise that no one can be bothered to read
them).  A small set of broad hints is usually more usable than a huge
set of fine-grained ones, so from that point of view, I like the
O_HOT/O_COLD ones.
James
On 23 April 2012 18:23, James Bottomley
<[email protected]> wrote:
> On Sun, 2012-04-22 at 16:30 +1000, Nick Piggin wrote:
>> On 22 April 2012 09:56, KOSAKI Motohiro <[email protected]> wrote:
>> > On Fri, Apr 20, 2012 at 10:58 AM, Ted Ts'o <[email protected]> wrote:
>> >> On Fri, Apr 20, 2012 at 06:42:08PM +0400, James Bottomley wrote:
>> >>>
>> >>> I'm not at all wedded to O_HOT and O_COLD; I think if we establish a
>> >>> hint hierarchy file->page cache->device then we should, of course,
>> >>> choose the best API and naming scheme for file->page cache. The only
>> >>> real point I was making is that we should tie in the page cache, and
>> >>> currently it only knows about "hot" and "cold" pages.
>> >>
>> >> The problem is that "hot" and "cold" will have different meanings from
>> >> the perspective of the file system versus the page cache. The file
>> >> system may consider a file "hot" if it is accessed frequently ---
>> >> compared to the other 2 TB of data on that HDD. The memory subsystem
>> >> will consider a page "hot" compared to what has been recently accessed
>> >> in the 8GB of memory that you might have your system. Now consider
>> >> that you might have a dozen or so 2TB disks that each have their "hot"
>> >> areas, and it's not at all obvious that just because a file, or even
>> >> part of a file is marked "hot", that it deserves to be in memory at
>> >> any particular point in time.
>> >
>> > So, this have intentionally different meanings I have no seen a reason why
>> > fs uses hot/cold words. It seems to bring a confusion.
>>
>> Right. It has nothing to do with hot/cold usage in the page allocator,
>> which is about how many lines of that page are in CPU cache.
>
> Well, no it's a similar concept: we have no idea whether the page is
> cached or not.
>
> What we do is estimate that by elapsed time since we
> last touched the page. In some sense, this is similar to the fs
> definition: a hot page hint would mean we expect to touch the page
> frequently and a cold page means we wouldn't. i.e. for a hot page, the
> elapsed time between touches would be short and for a cold page it would
> be long. Now I still think there's a mismatch in the time scales: a
> long elapsed time for mm making the page cold isn't necessarily the same
> long elapsed time for the file, because the mm idea is conditioned by
> local events (like memory pressure).
I suspect the mismatch would make it have virtually no correlation.
Experiments could surely be made, though.
>> However it could be propagated up to page reclaim level, at least.
>> Perhaps readahead/writeback too. But IMO it would be better to nail down
>> the semantics for block and filesystem before getting worried about that.
>
> Sure ... I just forwarded the email in case mm people had an interest.
> If you want FS and storage to develop the hints first and then figure
> out if we can involve the page cache, that's more or less what was
> happening anyway.
OK, good. mm layers can always look up any such flags quite easily, so
I think there is no problem of mechanism, only policy.
>> > But I don't know full story of this feature and I might be overlooking
>> > something.
>>
>> Also, "hot" and "cold" (as others have noted) is a big hammer that perhaps
>> catches a tiny subset of useful work (probably more likely: benchmarks).
>>
>> Is it read often? Written often? Both? Are reads and writes random or linear?
>> Is it latency bound, or throughput bound? (i.e., are queue depths high or
>> low?)
>>
>> A filesystem and storage device might care about all of these things.
>> Particularly if you have something more advanced than a single disk.
>> Caches, tiers of storage, etc.
>
> Experience has taught me to be wary of fine grained hints: they tend to
> be more trouble than they're worth (the definitions are either
> inaccurate or so tediously precise that no-one can be bothered to read
> them). A small set of broad hints is usually more useable than a huge
> set of fine grained ones, so from that point of view, I like the
> O_HOT/O_COLD ones.
So long as the implementations can be sufficiently general that a large
majority of "reasonable" applications of the flags do not result in a
slowdown, perhaps.
But while defining the API, you have to think about these things and not
just dismiss them completely.
Read vs write can be very important for caches and tiers, same for
random/linear,
latency constraints, etc. These things aren't exactly a huge unwieldy matrix. We
already have similar concepts in fadvise and such.
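For example, posix_fadvise() already expresses several of those axes, per
file descriptor and even per byte range:

    #include <fcntl.h>

    /* the existing vocabulary (you would normally pick one, not all) */
    static void describe_access(int fd)
    {
            posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);   /* random access: skip readahead */
            posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED); /* read soon: start readahead now */
            posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);  /* accessed once, no need to cache */
            posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED); /* finished: drop the cached pages */
    }

Though today these mostly influence readahead and the page cache rather
than anything about on-disk placement.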
Thanks,
Nick
On 23 April 2012 21:47, Nick Piggin <[email protected]> wrote:
> On 23 April 2012 18:23, James Bottomley
>> Experience has taught me to be wary of fine grained hints: they tend to
>> be more trouble than they're worth (the definitions are either
>> inaccurate or so tediously precise that no-one can be bothered to read
>> them). A small set of broad hints is usually more useable than a huge
>> set of fine grained ones, so from that point of view, I like the
>> O_HOT/O_COLD ones.
>
> So long as the implementations can be sufficiently general that large majority
> of "reasonable" application of the flags does not result in a slowdown, perhaps.
>
> But while defining the API, you have to think about these things and not
> just dismiss them completely.
>
> Read vs write can be very important for caches and tiers, same for
> random/linear,
> latency constraints, etc. These things aren't exactly a huge unwieldy matrix. We
> already have similar concepts in fadvise and such.
I'm not saying it's necessarily a bad idea as such. But experience
has taught me that if you define an API before having much
experience of the implementation and its users, and without
being able to write meaningful documentation for it, then it's
going to be a bad API.
So rather than pushing through these flags first, I think it would
be better to actually do implementation work, and get some
benchmarks (if not real apps) and have something working
like that before turning anything into an API.
(4/24/12 2:18 AM), Nick Piggin wrote:
> On 23 April 2012 21:47, Nick Piggin<[email protected]> wrote:
>> On 23 April 2012 18:23, James Bottomley
>
>>> Experience has taught me to be wary of fine grained hints: they tend to
>>> be more trouble than they're worth (the definitions are either
>>> inaccurate or so tediously precise that no-one can be bothered to read
>>> them). A small set of broad hints is usually more useable than a huge
>>> set of fine grained ones, so from that point of view, I like the
>>> O_HOT/O_COLD ones.
>>
>> So long as the implementations can be sufficiently general that large majority
>> of "reasonable" application of the flags does not result in a slowdown, perhaps.
>>
>> But while defining the API, you have to think about these things and not
>> just dismiss them completely.
>>
>> Read vs write can be very important for caches and tiers, same for
>> random/linear,
>> latency constraints, etc. These things aren't exactly a huge unwieldy matrix. We
>> already have similar concepts in fadvise and such.
>
> I'm not saying it's necessarily a bad idea as such. But experience
> has taught me that if you define an API before having much
> experience of the implementation and its users, and without
> being able to write meaningful documentation for it, then it's
> going to be a bad API.
>
> So rather than pushing through these flags first, I think it would
> be better to actually do implementation work, and get some
> benchmarks (if not real apps) and have something working
> like that before turning anything into an API.
Fully agreed.
I _guess_ O_COLD has enough real-world usefulness, because a backup
operation creates a lot of "write once, read never" inodes. Moreover, it
doesn't have a system-wide side effect.
On the other hand, I can't imagine how O_HOT would work yet, because many
apps want to run faster than other apps, and it definitely won't work _if_
all applications turn on O_HOT for every open operation. So I'm not sure
what would prevent apps from such intentional abuse.
So, we might need some API design discussions.
On 04/19/2012 02:20 PM, Theodore Ts'o wrote:
> As I had brought up during one of the lightning talks at the Linux
> Storage and Filesystem workshop, I am interested in introducing two new
> open flags, O_HOT and O_COLD. These flags are passed down to the
> individual file system's inode operations' create function, and the file
> system can use these flags as a hint regarding whether the file is
> likely to be accessed frequently or not.
>
> In the future I plan to do further work on how ext4 would use these
> flags, but I want to first get the ability to pass these flags plumbed
> into the VFS layer and the code points for O_HOT and O_COLD reserved.
Consider this: Is this a testable feature?
You're proposing a hint, of course, so it could just have no effect.
But let's just assume for the sake of discussion that it has been
implemented.
How will I (filesystem implementer) know whether I have implemented
it correctly?
How can the user verify that the use of these flags is producing
the expected result?
-Alex
Theodore Ts'o wrote:
> As I had brought up during one of the lightning talks at the Linux
> Storage and Filesystem workshop, I am interested in introducing two new
> open flags, O_HOT and O_COLD. These flags are passed down to the
> individual file system's inode operations' create function, and the file
> system can use these flags as a hint regarding whether the file is
> likely to be accessed frequently or not.
>
> In the future I plan to do further work on how ext4 would use these
> flags, but I want to first get the ability to pass these flags plumbed
> into the VFS layer and the code points for O_HOT and O_COLD reserved.
As a developer of userspace libraries and applications, I can't tell
when it would be a good idea to use these flags.
I get the impression that the best time to use them is probably
dependent on system-specific details, including the type of
filesystem, underlying storage, and intermediate device-mapper layers,
geometry, file sizes, etc.
I.e. ugly, tweaky stuff where the right answer depends on lots of
system-specific benchmarks.
Things which I can't really test except on the few systems I have
access to myself, so I can only guess how to use the flags for general
purpose code on other peoples' systems.
Suppose I'm writing a database layer (e.g. a MySQL backend).
Is there any reason I should not indiscriminately use O_HOT for all
the database's files? If only to compete on the benchmarks that are
used to compare my database layer against others?
If I use O_HOT for frequently-accessed data, and O_COLD for
infrequently-accessed data (such as old logs), so that my application can
signal a differential and reap some benefit - what about the concern
that it will be worse than using no flags at all, due to the seek time
from using different areas of the underlying storage?
Or if signalling a differential works well, will we end up needing a
"hot-cold cgroup" so each application's hot/cold requests indicate a
differential within the app only, allowing the administrator to say
which _whole apps_ are prioritised in this way?
In a nutshell, I can't figure out, as a userspace programmer, when I
should use these flags, and would be inclined to set O_HOT for all
files that have anything to do with something that'll be benchmarked,
or anything to do with a "job" that I want to run at higher priority
than other jobs.
I have queries about the API too. I'd anticipate sometimes having to
use an LD_PRELOAD to set the flag for all opens done by a bunch of
programs run from a script. So why not the ionice/ioprio_{get/set}
interface? That was rhetorical: So that a program can set different
hot/coldness for different files, or the same files at different
times.
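For reference, the ionice-style per-process interface looks like this (no
glibc wrapper, so it goes through syscall(); the constants below mirror
the kernel's <linux/ioprio.h> definitions):

    #include <sys/syscall.h>
    #include <unistd.h>

    #define IOPRIO_WHO_PROCESS  1
    #define IOPRIO_CLASS_BE     2   /* best-effort class */
    #define IOPRIO_CLASS_SHIFT  13
    #define IOPRIO_PRIO_VALUE(cls, data) (((cls) << IOPRIO_CLASS_SHIFT) | (data))

    /* level: 0 (highest priority) .. 7 (lowest); applies to the whole process */
    static int set_my_ioprio(int level)
    {
            return syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
                           IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, level));
    }

Which is exactly why it doesn't quite fit here: it is per process, not
per file.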
But there's a case for sometimes wanting other types of I/O priority
to vary for different open files in the same process too. What's
special about O_HOT/O_COLD that makes it different from other kinds of
I/O priority settings? Wouldn't it be better to devise a way to set
all I/O priority-like things per open file, not just hot/cold?
Sometimes I'd probably want to set O_HOT as a filesystem attribute on
a set of files in the filesystem (such as a subset of files in the
http/ directory), so that all programs opening those files get O_HOT
behaviour. Mainly when it's scripts operating on the files, but also
to make sure any "outside the app" operations on the files (such as
stopping the app, copying its files elsewhere, and starting it at the
new location) don't lose the hot/coldness.
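I imagine that would look like the existing chattr-style inode flags,
something like the sketch below; FS_HOT_FL is invented here and no such
bit exists, but FS_IOC_GETFLAGS/FS_IOC_SETFLAGS and flags such as
FS_NOATIME_FL already work this way:

    #include <sys/ioctl.h>
    #include <linux/fs.h>

    #define FS_HOT_FL 0x40000000   /* hypothetical bit, chosen only for this example */

    static int mark_hot(int fd)
    {
            long flags;

            if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
                    return -1;
            flags |= FS_HOT_FL;
            return ioctl(fd, FS_IOC_SETFLAGS, &flags);
    }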
For database-like things, I'd want to set hot/cold on different
regions within a big file, rather than separate files. Perhaps the
same applies to ELF files: The big debugging sections would be better cold.
If I've written a file with O_COLD and later change my mind, do I have
to open the file with O_HOT and rewrite all of the file with the same
contents to get it moved on the storage? Or does O_HOT do that
automatically?
Is there any way I can query whether it's allocated hot/cold already,
or will I have to copy the data "just in case" from time to time? For
example, if a system was restored from backups (normal file backups),
presumably the hottest files will have been restored "normal", whereas
they would have been written initially with O_HOT by the application
producing them.
If the allocated hot/coldness isn't something the application can
query from the filesystem, it won't know whether to inform the user
that performance could be improved by running a tool which converts
the file to an O_HOT-file.
Also, for the backup itself, or when copying files around a system
with normal tools (cp, rsync), or to another system, if there's no way
to query allocated hot/coldness, they won't be able to preserve that.
If there's a real performance difference, and no way to query whether
the file was previously allocated hot/cold, maybe some applications
will recommend "users should run this special tool every month or so
which copies all the data with O_HOT, as it sometimes improves
performance". Which will be true. You know what optimisation
folklore is like.
All the best,
-- Jamie