2012-10-29 04:31:29

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 00/19] vfs: hot data tracking

From: Zhi Yong Wu <[email protected]>

NOTE:

The patchset can be obtained via my kernel dev git on github:
[email protected]:wuzhy/kernel.git hot_tracking
If you're interested, you can also review them via
https://github.com/wuzhy/kernel/commits/hot_tracking

For more info, please check hot_tracking.txt in Documentation

TODO List:

1.) Need to do scalability or performance tests.
2.) Need a simpler but still effective temperature calculation function.
3.) Need a way to save file temperatures across umount so that they are
preserved after a reboot.

Ben Chociej, Matt Lupfer and Conor Scott originally wrote this code to
be very btrfs-specific. I've taken their code and attempted to
make it more generic and integrate it at the VFS level.

Changelog from v3:
1.) Rewrote debugfs support on top of seq_file operations. [Dave Chinner]
2.) Refactored workqueue support. [Dave Chinner]
3.) Turned some macros into tunables [Zhiyong, Zheng Liu]:
TIME_TO_KICK and HEAT_UPDATE_DELAY
4.) Introduce hot func registering framework [Zhiyong]
5.) Remove global variable for hot tracking [Zhiyong]
6.) Add xfs hot tracking support [Dave Chinner]
7.) Add ext4 hot tracking support [Zheng Liu]
8.) Cleaned up a lot of other issues [Dave Chinner]

v3:
1.) Converted to Radix trees, not RB-tree [Zhiyong, Dave Chinner]
2.) Added memory shrinker [Dave Chinner]

v2:
1.) Converted to one workqueue to update map info periodically [Dave Chinner]
2.) Cleaned up a lot of other issues [Dave Chinner]

v1:
1.) Reduce new files and put all in fs/hot_tracking.[ch] [Dave Chinner]
2.) Add btrfs hot tracking support [Zhiyong]
3.) The first three patches can probably just be flattened into one.
[Marco Stornelli, Dave Chinner]

Dave Chinner (1):
xfs: add hot tracking support

Zheng Liu (1):
ext4: add hot tracking support

Zhi Yong Wu (17):
vfs: introduce private radix tree structures
vfs: initialize and free data structures
vfs: add I/O frequency update function
vfs: add two map arrays
vfs: add hooks to enable hot tracking
vfs: add temp calculation function
vfs: add map info update function
vfs: add aging function
vfs: add one work queue
vfs: introduce hot func register framework
vfs: register one shrinker
vfs: add one ioctl interface
debugfs: introduce one function
vfs: add debugfs support
sysfs: add two hot_track proc files
btrfs: add hot tracking support
vfs: add documentation

Documentation/filesystems/00-INDEX | 2 +
Documentation/filesystems/hot_tracking.txt | 262 ++++++
fs/Makefile | 2 +-
fs/btrfs/ctree.h | 1 +
fs/btrfs/super.c | 22 +-
fs/compat_ioctl.c | 5 +
fs/dcache.c | 2 +
fs/debugfs/inode.c | 26 +
fs/direct-io.c | 6 +
fs/ext4/ext4.h | 3 +
fs/ext4/super.c | 13 +-
fs/hot_tracking.c | 1367 ++++++++++++++++++++++++++++
fs/hot_tracking.h | 58 ++
fs/ioctl.c | 78 ++
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 16 +
include/linux/debugfs.h | 9 +
include/linux/fs.h | 4 +
include/linux/hot_tracking.h | 149 +++
kernel/sysctl.c | 14 +
mm/filemap.c | 6 +
mm/page-writeback.c | 12 +
mm/readahead.c | 6 +
23 files changed, 2061 insertions(+), 3 deletions(-)
create mode 100644 Documentation/filesystems/hot_tracking.txt
create mode 100644 fs/hot_tracking.c
create mode 100644 fs/hot_tracking.h
create mode 100644 include/linux/hot_tracking.h

--
1.7.6.5



2012-10-29 04:30:46

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 04/19] vfs: add two map arrays

From: Zhi Yong Wu <[email protected]>

Add two map arrays, one for inodes and one for ranges.
Each array holds one list per temperature value and is used
to efficiently look up the data temperature of a file or its
ranges.
Each array entry consists of a list head plus the
temperature value that its list represents.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 60 ++++++++++++++++++++++++++++++++++++++++++
include/linux/hot_tracking.h | 16 +++++++++++
2 files changed, 76 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 0a7d9a3..0a603a1 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -58,6 +58,7 @@ static void hot_range_item_init(struct hot_range_item *hr, u32 start,
hr->hot_inode = he;
kref_init(&hr->hot_range.refs);
spin_lock_init(&hr->hot_range.lock);
+ INIT_LIST_HEAD(&hr->hot_range.n_list);
hr->hot_range.hot_freq_data.avg_delta_reads = (u64) -1;
hr->hot_range.hot_freq_data.avg_delta_writes = (u64) -1;
hr->hot_range.hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
@@ -88,6 +89,16 @@ static void hot_range_item_free(struct kref *kref)
struct hot_comm_item, refs);
struct hot_range_item *hr = container_of(comm_item,
struct hot_range_item, hot_range);
+ struct hot_info *root = container_of(
+ hr->hot_inode->hot_inode_tree,
+ struct hot_info, hot_inode_tree);
+
+ spin_lock(&hr->hot_range.lock);
+ if (!list_empty(&hr->hot_range.n_list)) {
+ list_del_init(&hr->hot_range.n_list);
+ root->hot_map_nr--;
+ }
+ spin_unlock(&hr->hot_range.lock);

radix_tree_delete(&hr->hot_inode->hot_range_tree, hr->start);
kmem_cache_free(hot_range_item_cachep, hr);
@@ -132,6 +143,15 @@ static void hot_inode_item_free(struct kref *kref)
struct hot_comm_item, refs);
struct hot_inode_item *he = container_of(comm_item,
struct hot_inode_item, hot_inode);
+ struct hot_info *root = container_of(he->hot_inode_tree,
+ struct hot_info, hot_inode_tree);
+
+ spin_lock(&he->hot_inode.lock);
+ if (!list_empty(&he->hot_inode.n_list)) {
+ list_del_init(&he->hot_inode.n_list);
+ root->hot_map_nr--;
+ }
+ spin_unlock(&he->hot_inode.lock);

hot_range_tree_free(he);
radix_tree_delete(he->hot_inode_tree, he->i_ino);
@@ -304,6 +324,44 @@ static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
}

/*
+ * Initialize inode and range map arrays.
+ */
+static void hot_map_array_init(struct hot_info *root)
+{
+ int i;
+ for (i = 0; i < HEAT_MAP_SIZE; i++) {
+ INIT_LIST_HEAD(&root->heat_inode_map[i].node_list);
+ INIT_LIST_HEAD(&root->heat_range_map[i].node_list);
+ root->heat_inode_map[i].temp = i;
+ root->heat_range_map[i].temp = i;
+ }
+}
+
+static void hot_map_list_free(struct list_head *node_list,
+ struct hot_info *root)
+{
+ struct list_head *pos, *next;
+ struct hot_comm_item *node;
+
+ list_for_each_safe(pos, next, node_list) {
+ node = list_entry(pos, struct hot_comm_item, n_list);
+ list_del_init(&node->n_list);
+ root->hot_map_nr--;
+ }
+
+}
+
+/* Free inode and range map arrays */
+static void hot_map_array_exit(struct hot_info *root)
+{
+ int i;
+ for (i = 0; i < HEAT_MAP_SIZE; i++) {
+ hot_map_list_free(&root->heat_inode_map[i].node_list, root);
+ hot_map_list_free(&root->heat_range_map[i].node_list, root);
+ }
+}
+
+/*
* Initialize kmem cache for hot_inode_item and hot_range_item.
*/
void __init hot_cache_init(void)
@@ -394,6 +452,7 @@ int hot_track_init(struct super_block *sb)

sb->s_hot_root = root;
hot_inode_tree_init(root);
+ hot_map_array_init(root);

printk(KERN_INFO "VFS: Turning on hot data tracking\n");

@@ -405,6 +464,7 @@ void hot_track_exit(struct super_block *sb)
{
struct hot_info *root = sb->s_hot_root;

+ hot_map_array_exit(root);
hot_inode_tree_exit(root);
kfree(root);
}
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index e2d6028..4f92947 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -20,6 +20,9 @@
#include <linux/kref.h>
#include <linux/fs.h>

+#define HEAT_MAP_BITS 8
+#define HEAT_MAP_SIZE (1 << HEAT_MAP_BITS)
+
/*
* A frequency data struct holds values that are used to
* determine temperature of files and file ranges. These structs
@@ -36,11 +39,18 @@ struct hot_freq_data {
u32 last_temp;
};

+/* List heads in hot map array */
+struct hot_map_head {
+ struct list_head node_list;
+ u8 temp;
+};
+
/* The common info for both following structures */
struct hot_comm_item {
struct hot_freq_data hot_freq_data; /* frequency data */
spinlock_t lock; /* protects object data */
struct kref refs; /* prevents kfree */
+ struct list_head n_list; /* list node index */
};

/* An item representing an inode and its access frequency */
@@ -66,6 +76,12 @@ struct hot_range_item {
struct hot_info {
struct radix_tree_root hot_inode_tree;
spinlock_t lock; /*protect inode tree */
+
+ /* map of inode temperature */
+ struct hot_map_head heat_inode_map[HEAT_MAP_SIZE];
+ /* map of range temperature */
+ struct hot_map_head heat_range_map[HEAT_MAP_SIZE];
+ unsigned int hot_map_nr;
};

extern void __init hot_cache_init(void);
--
1.7.6.5

2012-10-29 04:30:45

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

From: Zhi Yong Wu <[email protected]>

Add some utility helpers to update the access frequencies
of a file and of the ranges within it.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 7 ++
include/linux/hot_tracking.h | 2 +
3 files changed, 188 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 68591f0..0a7d9a3 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
}
}

+struct hot_inode_item
+*hot_inode_item_find(struct hot_info *root, u64 ino)
+{
+ struct hot_inode_item *he;
+ int ret;
+
+again:
+ spin_lock(&root->lock);
+ he = radix_tree_lookup(&root->hot_inode_tree, ino);
+ if (he) {
+ kref_get(&he->hot_inode.refs);
+ spin_unlock(&root->lock);
+ return he;
+ }
+ spin_unlock(&root->lock);
+
+ he = kmem_cache_zalloc(hot_inode_item_cachep,
+ GFP_KERNEL | GFP_NOFS);
+ if (!he)
+ return ERR_PTR(-ENOMEM);
+
+ hot_inode_item_init(he, ino, &root->hot_inode_tree);
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret) {
+ kmem_cache_free(hot_inode_item_cachep, he);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->lock);
+ ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
+ if (ret == -EEXIST) {
+ kmem_cache_free(hot_inode_item_cachep, he);
+ spin_unlock(&root->lock);
+ radix_tree_preload_end();
+ goto again;
+ }
+ spin_unlock(&root->lock);
+ radix_tree_preload_end();
+
+ kref_get(&he->hot_inode.refs);
+ return he;
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_find);
+
+static struct hot_range_item
+*hot_range_item_find(struct hot_inode_item *he,
+ u32 start)
+{
+ struct hot_range_item *hr;
+ int ret;
+
+again:
+ spin_lock(&he->lock);
+ hr = radix_tree_lookup(&he->hot_range_tree, start);
+ if (hr) {
+ kref_get(&hr->hot_range.refs);
+ spin_unlock(&he->lock);
+ return hr;
+ }
+ spin_unlock(&he->lock);
+
+ hr = kmem_cache_zalloc(hot_range_item_cachep,
+ GFP_KERNEL | GFP_NOFS);
+ if (!hr)
+ return ERR_PTR(-ENOMEM);
+
+ hot_range_item_init(hr, start, he);
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret) {
+ kmem_cache_free(hot_range_item_cachep, hr);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&he->lock);
+ ret = radix_tree_insert(&he->hot_range_tree, start, hr);
+ if (ret == -EEXIST) {
+ kmem_cache_free(hot_range_item_cachep, hr);
+ spin_unlock(&he->lock);
+ radix_tree_preload_end();
+ goto again;
+ }
+ spin_unlock(&he->lock);
+ radix_tree_preload_end();
+
+ kref_get(&hr->hot_range.refs);
+ return hr;
+}
+
+/*
+ * Fold the time elapsed since the previous access (cur_time - old_atime)
+ * into the running average old_avg and return the updated average.
+ */
+static u64 hot_average_update(struct timespec old_atime,
+ struct timespec cur_time, u64 old_avg)
+{
+ struct timespec delta_ts;
+ u64 new_avg;
+ u64 new_delta;
+
+ delta_ts = timespec_sub(cur_time, old_atime);
+ new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
+
+ new_avg = (old_avg << FREQ_POWER) - old_avg + new_delta;
+ new_avg = new_avg >> FREQ_POWER;
+
+ return new_avg;
+}
+
+static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
+{
+ struct timespec cur_time = current_kernel_time();
+
+ if (write) {
+ freq_data->nr_writes += 1;
+ freq_data->avg_delta_writes = hot_average_update(
+ freq_data->last_write_time,
+ cur_time,
+ freq_data->avg_delta_writes);
+ freq_data->last_write_time = cur_time;
+ } else {
+ freq_data->nr_reads += 1;
+ freq_data->avg_delta_reads = hot_average_update(
+ freq_data->last_read_time,
+ cur_time,
+ freq_data->avg_delta_reads);
+ freq_data->last_read_time = cur_time;
+ }
+}
+
/*
* Initialize kmem cache for hot_inode_item and hot_range_item.
*/
@@ -199,6 +330,54 @@ err:
EXPORT_SYMBOL_GPL(hot_cache_init);

/*
+ * Main function to update access frequency from read/writepage(s) hooks
+ */
+void hot_update_freqs(struct inode *inode, u64 start,
+ u64 len, int rw)
+{
+ struct hot_info *root = inode->i_sb->s_hot_root;
+ struct hot_inode_item *he;
+ struct hot_range_item *hr;
+ u32 cur, end;
+
+ if (!root || (len == 0))
+ return;
+
+ he = hot_inode_item_find(root, inode->i_ino);
+ if (IS_ERR(he)) {
+ WARN_ON(1);
+ return;
+ }
+
+ spin_lock(&he->hot_inode.lock);
+ hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
+ spin_unlock(&he->hot_inode.lock);
+
+ /*
+ * Align ranges on RANGE_SIZE boundary
+ * to prevent proliferation of range structs
+ */
+ end = (start + len + RANGE_SIZE - 1) >> RANGE_BITS;
+ for (cur = (start >> RANGE_BITS); cur < end; cur++) {
+ hr = hot_range_item_find(he, cur);
+ if (IS_ERR(hr)) {
+ WARN_ON(1);
+ hot_inode_item_put(he);
+ return;
+ }
+
+ spin_lock(&hr->hot_range.lock);
+ hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
+ spin_unlock(&hr->hot_range.lock);
+
+ hot_range_item_put(hr);
+ }
+
+ hot_inode_item_put(he);
+}
+EXPORT_SYMBOL_GPL(hot_update_freqs);
+
+/*
* Initialize the data structures for hot data tracking.
*/
int hot_track_init(struct super_block *sb)
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index e7ba121..cc4666e 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -20,6 +20,13 @@
#define FREQ_DATA_TYPE_INODE (1 << 0)
#define FREQ_DATA_TYPE_RANGE (1 << 1)

+/* size of sub-file ranges */
+#define RANGE_BITS 20
+#define RANGE_SIZE (1 << RANGE_BITS)
+
+#define FREQ_POWER 4
+
void hot_inode_item_put(struct hot_inode_item *he);
+struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);

#endif /* __HOT_TRACKING__ */
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index 4233207..e2d6028 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -71,5 +71,7 @@ struct hot_info {
extern void __init hot_cache_init(void);
extern int hot_track_init(struct super_block *sb);
extern void hot_track_exit(struct super_block *sb);
+extern void hot_update_freqs(struct inode *inode, u64 start,
+ u64 len, int rw);

#endif /* _LINUX_HOTTRACK_H */
--
1.7.6.5

2012-10-29 04:30:44

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 02/19] vfs: initialize and free data structures

From: Zhi Yong Wu <[email protected]>

Add an initialization function that creates the
key data structures when hot tracking is enabled,
and clean them up when hot tracking is disabled.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 124 ++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 2 +
include/linux/fs.h | 4 ++
include/linux/hot_tracking.h | 2 +
4 files changed, 132 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index badf47d..68591f0 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -75,12 +75,103 @@ static void hot_inode_item_init(struct hot_inode_item *he, u64 ino,
he->hot_inode_tree = hot_inode_tree;
kref_init(&he->hot_inode.refs);
spin_lock_init(&he->hot_inode.lock);
+ INIT_LIST_HEAD(&he->hot_inode.n_list);
he->hot_inode.hot_freq_data.avg_delta_reads = (u64) -1;
he->hot_inode.hot_freq_data.avg_delta_writes = (u64) -1;
he->hot_inode.hot_freq_data.flags = FREQ_DATA_TYPE_INODE;
hot_range_tree_init(he);
}

+static void hot_range_item_free(struct kref *kref)
+{
+ struct hot_comm_item *comm_item = container_of(kref,
+ struct hot_comm_item, refs);
+ struct hot_range_item *hr = container_of(comm_item,
+ struct hot_range_item, hot_range);
+
+ radix_tree_delete(&hr->hot_inode->hot_range_tree, hr->start);
+ kmem_cache_free(hot_range_item_cachep, hr);
+}
+
+/*
+ * Drop one reference on the hot_range_item
+ * and free the structure if the reference count hits zero
+ */
+static void hot_range_item_put(struct hot_range_item *hr)
+{
+ kref_put(&hr->hot_range.refs, hot_range_item_free);
+}
+
+/* Frees the entire hot_range_tree. */
+static void hot_range_tree_free(struct hot_inode_item *he)
+{
+ struct hot_range_item *hr_nodes[8];
+ u32 start = 0;
+ int i, n;
+
+ while (1) {
+ spin_lock(&he->lock);
+ n = radix_tree_gang_lookup(&he->hot_range_tree,
+ (void **)hr_nodes, start,
+ ARRAY_SIZE(hr_nodes));
+ if (!n) {
+ spin_unlock(&he->lock);
+ break;
+ }
+
+ start = hr_nodes[n - 1]->start + 1;
+ for (i = 0; i < n; i++)
+ hot_range_item_put(hr_nodes[i]);
+ spin_unlock(&he->lock);
+ }
+}
+
+static void hot_inode_item_free(struct kref *kref)
+{
+ struct hot_comm_item *comm_item = container_of(kref,
+ struct hot_comm_item, refs);
+ struct hot_inode_item *he = container_of(comm_item,
+ struct hot_inode_item, hot_inode);
+
+ hot_range_tree_free(he);
+ radix_tree_delete(he->hot_inode_tree, he->i_ino);
+ kmem_cache_free(hot_inode_item_cachep, he);
+}
+
+/*
+ * Drop one reference on the hot_inode_item
+ * and free the structure if the reference count hits zero
+ */
+void hot_inode_item_put(struct hot_inode_item *he)
+{
+ kref_put(&he->hot_inode.refs, hot_inode_item_free);
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_put);
+
+/* Frees the entire hot_inode_tree. */
+static void hot_inode_tree_exit(struct hot_info *root)
+{
+ struct hot_inode_item *hi_nodes[8];
+ u64 ino = 0;
+ int i, n;
+
+ while (1) {
+ spin_lock(&root->lock);
+ n = radix_tree_gang_lookup(&root->hot_inode_tree,
+ (void **)hi_nodes, ino,
+ ARRAY_SIZE(hi_nodes));
+ if (!n) {
+ spin_unlock(&root->lock);
+ break;
+ }
+
+ ino = hi_nodes[n - 1]->i_ino + 1;
+ for (i = 0; i < n; i++)
+ hot_inode_item_put(hi_nodes[i]);
+ spin_unlock(&root->lock);
+ }
+}
+
/*
* Initialize kmem cache for hot_inode_item and hot_range_item.
*/
@@ -106,3 +197,36 @@ err:
kmem_cache_destroy(hot_inode_item_cachep);
}
EXPORT_SYMBOL_GPL(hot_cache_init);
+
+/*
+ * Initialize the data structures for hot data tracking.
+ */
+int hot_track_init(struct super_block *sb)
+{
+ struct hot_info *root;
+ int ret = -ENOMEM;
+
+ root = kzalloc(sizeof(struct hot_info), GFP_NOFS);
+ if (!root) {
+ printk(KERN_ERR "%s: Failed to malloc memory for "
+ "hot_info\n", __func__);
+ return ret;
+ }
+
+ sb->s_hot_root = root;
+ hot_inode_tree_init(root);
+
+ printk(KERN_INFO "VFS: Turning on hot data tracking\n");
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hot_track_init);
+
+void hot_track_exit(struct super_block *sb)
+{
+ struct hot_info *root = sb->s_hot_root;
+
+ hot_inode_tree_exit(root);
+ kfree(root);
+}
+EXPORT_SYMBOL_GPL(hot_track_exit);
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index febf699..e7ba121 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -20,4 +20,6 @@
#define FREQ_DATA_TYPE_INODE (1 << 0)
#define FREQ_DATA_TYPE_RANGE (1 << 1)

+void hot_inode_item_put(struct hot_inode_item *he);
+
#endif /* __HOT_TRACKING__ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b33cfc9..c541ae7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -27,6 +27,7 @@
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/blk_types.h>
+#include <linux/hot_tracking.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -1321,6 +1322,9 @@ struct super_block {

/* Being remounted read-only */
int s_readonly_remount;
+
+ /* Hot data tracking*/
+ struct hot_info *s_hot_root;
};

/* superblock cache pruning functions */
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index 1cec0aa..4233207 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -69,5 +69,7 @@ struct hot_info {
};

extern void __init hot_cache_init(void);
+extern int hot_track_init(struct super_block *sb);
+extern void hot_track_exit(struct super_block *sb);

#endif /* _LINUX_HOTTRACK_H */
--
1.7.6.5

2012-10-29 04:30:43

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 01/19] vfs: introduce private radix tree structures

From: Zhi Yong Wu <[email protected]>

One root structure, hot_info, is defined and hooked
up in super_block; it will be used to hold the radix tree
root, the hash list root and some other information.
Add a hot_inode_tree struct to keep track of
frequently accessed files, keyed by {inode, offset}.
The trees contain hot_inode_items representing those files
and ranges.
Having these trees means that vfs can quickly determine the
temperature of some data by doing some calculations on the
hot_freq_data struct that hangs off of the tree item.
Define two items, hot_inode_item and hot_range_item:
the former represents one tracked file and
keeps track of its access frequency and the tree of
ranges in this file, while the latter represents
a file range of one inode.
Each of the two structures contains a hot_freq_data
struct with its frequency of access metrics (number of
{reads, writes}, last {read,write} time, frequency of
{reads,writes}).
Also, each hot_inode_item contains one hot_range_tree
struct which is keyed by {inode, offset, length}
and used to keep track of all the ranges in this file.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/Makefile | 2 +-
fs/dcache.c | 2 +
fs/hot_tracking.c | 108 ++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 23 +++++++++
include/linux/hot_tracking.h | 73 ++++++++++++++++++++++++++++
5 files changed, 207 insertions(+), 1 deletions(-)
create mode 100644 fs/hot_tracking.c
create mode 100644 fs/hot_tracking.h
create mode 100644 include/linux/hot_tracking.h

diff --git a/fs/Makefile b/fs/Makefile
index 1d7af79..f966dea 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o statfs.o
+ stack.o fs_struct.o statfs.o hot_tracking.o

ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/dcache.c b/fs/dcache.c
index 3a463d0..7d5be16 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,7 @@
#include <linux/rculist_bl.h>
#include <linux/prefetch.h>
#include <linux/ratelimit.h>
+#include <linux/hot_tracking.h>
#include "internal.h"
#include "mount.h"

@@ -3172,4 +3173,5 @@ void __init vfs_caches_init(unsigned long mempages)
mnt_init();
bdev_cache_init();
chrdev_init();
+ hot_cache_init();
}
diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
new file mode 100644
index 0000000..badf47d
--- /dev/null
+++ b/fs/hot_tracking.c
@@ -0,0 +1,108 @@
+/*
+ * fs/hot_tracking.c
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/types.h>
+#include <linux/limits.h>
+#include "hot_tracking.h"
+
+/* kmem_cache pointers for slab caches */
+static struct kmem_cache *hot_inode_item_cachep __read_mostly;
+static struct kmem_cache *hot_range_item_cachep __read_mostly;
+
+/*
+ * Initialize the inode radix tree and the lock protecting it
+ * for a newly allocated hot_info.
+ */
+static void hot_inode_tree_init(struct hot_info *root)
+{
+ INIT_RADIX_TREE(&root->hot_inode_tree, GFP_ATOMIC);
+ spin_lock_init(&root->lock);
+}
+
+/*
+ * Initialize the hot range radix tree and the lock protecting it
+ * for a new hot_inode_item.
+ */
+void hot_range_tree_init(struct hot_inode_item *he)
+{
+ INIT_RADIX_TREE(&he->hot_range_tree, GFP_ATOMIC);
+ spin_lock_init(&he->lock);
+}
+
+/*
+ * Initialize a new hot_range_item structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * released using hot_range_item_put()
+ */
+static void hot_range_item_init(struct hot_range_item *hr, u32 start,
+ struct hot_inode_item *he)
+{
+ hr->start = start;
+ hr->len = RANGE_SIZE;
+ hr->hot_inode = he;
+ kref_init(&hr->hot_range.refs);
+ spin_lock_init(&hr->hot_range.lock);
+ hr->hot_range.hot_freq_data.avg_delta_reads = (u64) -1;
+ hr->hot_range.hot_freq_data.avg_delta_writes = (u64) -1;
+ hr->hot_range.hot_freq_data.flags = FREQ_DATA_TYPE_RANGE;
+}
+
+/*
+ * Initialize a new hot_inode_item structure. The new structure is
+ * returned with a reference count of one and needs to be
+ * released using hot_inode_item_put()
+ */
+static void hot_inode_item_init(struct hot_inode_item *he, u64 ino,
+ struct radix_tree_root *hot_inode_tree)
+{
+ he->i_ino = ino;
+ he->hot_inode_tree = hot_inode_tree;
+ kref_init(&he->hot_inode.refs);
+ spin_lock_init(&he->hot_inode.lock);
+ he->hot_inode.hot_freq_data.avg_delta_reads = (u64) -1;
+ he->hot_inode.hot_freq_data.avg_delta_writes = (u64) -1;
+ he->hot_inode.hot_freq_data.flags = FREQ_DATA_TYPE_INODE;
+ hot_range_tree_init(he);
+}
+
+/*
+ * Initialize kmem cache for hot_inode_item and hot_range_item.
+ */
+void __init hot_cache_init(void)
+{
+ hot_inode_item_cachep = kmem_cache_create("hot_inode_item",
+ sizeof(struct hot_inode_item), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!hot_inode_item_cachep)
+ return;
+
+ hot_range_item_cachep = kmem_cache_create("hot_range_item",
+ sizeof(struct hot_range_item), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!hot_range_item_cachep)
+ goto err;
+
+ return;
+
+err:
+ kmem_cache_destroy(hot_inode_item_cachep);
+}
+EXPORT_SYMBOL_GPL(hot_cache_init);
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
new file mode 100644
index 0000000..febf699
--- /dev/null
+++ b/fs/hot_tracking.h
@@ -0,0 +1,23 @@
+/*
+ * fs/hot_tracking.h
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef __HOT_TRACKING__
+#define __HOT_TRACKING__
+
+#include <linux/radix-tree.h>
+#include <linux/workqueue.h>
+#include <linux/hot_tracking.h>
+
+/* values for hot_freq_data flags */
+#define FREQ_DATA_TYPE_INODE (1 << 0)
+#define FREQ_DATA_TYPE_RANGE (1 << 1)
+
+#endif /* __HOT_TRACKING__ */
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
new file mode 100644
index 0000000..1cec0aa
--- /dev/null
+++ b/include/linux/hot_tracking.h
@@ -0,0 +1,73 @@
+/*
+ * include/linux/hot_tracking.h
+ *
+ * This file has definitions for VFS hot data tracking
+ * structures etc.
+ *
+ * Copyright (C) 2012 IBM Corp. All rights reserved.
+ * Written by Zhi Yong Wu <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ */
+
+#ifndef _LINUX_HOTTRACK_H
+#define _LINUX_HOTTRACK_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/kref.h>
+#include <linux/fs.h>
+
+/*
+ * A frequency data struct holds values that are used to
+ * determine temperature of files and file ranges. These structs
+ * are members of hot_inode_item and hot_range_item
+ */
+struct hot_freq_data {
+ struct timespec last_read_time;
+ struct timespec last_write_time;
+ u32 nr_reads;
+ u32 nr_writes;
+ u64 avg_delta_reads;
+ u64 avg_delta_writes;
+ u32 flags;
+ u32 last_temp;
+};
+
+/* The common info for both following structures */
+struct hot_comm_item {
+ struct hot_freq_data hot_freq_data; /* frequency data */
+ spinlock_t lock; /* protects object data */
+ struct kref refs; /* prevents kfree */
+};
+
+/* An item representing an inode and its access frequency */
+struct hot_inode_item {
+ struct hot_comm_item hot_inode; /* node in hot_inode_tree */
+ struct radix_tree_root hot_range_tree; /* tree of ranges */
+ spinlock_t lock; /* protect range tree */
+ struct radix_tree_root *hot_inode_tree;
+ u64 i_ino; /* inode number from inode */
+};
+
+/*
+ * An item representing a range inside of
+ * an inode whose frequency is being tracked
+ */
+struct hot_range_item {
+ struct hot_comm_item hot_range;
+ struct hot_inode_item *hot_inode; /* associated hot_inode_item */
+ u32 start; /* item index in hot_range_tree */
+ u32 len; /* length in bytes */
+};
+
+struct hot_info {
+ struct radix_tree_root hot_inode_tree;
+ spinlock_t lock; /*protect inode tree */
+};
+
+extern void __init hot_cache_init(void);
+
+#endif /* _LINUX_HOTTRACK_H */
--
1.7.6.5

2012-10-29 04:30:48

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 06/19] vfs: add temp calculation function

From: Zhi Yong Wu <[email protected]>

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 74 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 0a603a1..83e590c 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -323,6 +323,80 @@ static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
}
}

+static u64 hot_raw_shift(u64 counter, u32 bits, bool dir)
+{
+ if (dir)
+ return counter << bits;
+ else
+ return counter >> bits;
+}
+
+/*
+ * hot_temp_calc() is responsible for distilling the six heat
+ * criteria down into a single temperature value for the data,
+ * which is an integer between 0 and HEAT_MAX_VALUE.
+ */
+static u32 hot_temp_calc(struct hot_freq_data *freq_data)
+{
+ u32 result = 0;
+
+ struct timespec ckt = current_kernel_time();
+ u64 cur_time = timespec_to_ns(&ckt);
+
+ u32 nrr_heat = (u32)hot_raw_shift((u64)freq_data->nr_reads,
+ NRR_MULTIPLIER_POWER, true);
+ u32 nrw_heat = (u32)hot_raw_shift((u64)freq_data->nr_writes,
+ NRW_MULTIPLIER_POWER, true);
+
+ u64 ltr_heat =
+ hot_raw_shift((cur_time - timespec_to_ns(&freq_data->last_read_time)),
+ LTR_DIVIDER_POWER, false);
+ u64 ltw_heat =
+ hot_raw_shift((cur_time - timespec_to_ns(&freq_data->last_write_time)),
+ LTW_DIVIDER_POWER, false);
+
+ u64 avr_heat =
+ hot_raw_shift((((u64) -1) - freq_data->avg_delta_reads),
+ AVR_DIVIDER_POWER, false);
+ u64 avw_heat =
+ hot_raw_shift((((u64) -1) - freq_data->avg_delta_writes),
+ AVW_DIVIDER_POWER, false);
+
+ /* ltr_heat is now guaranteed to be u32 safe */
+ if (ltr_heat >= hot_raw_shift((u64) 1, 32, true))
+ ltr_heat = 0;
+ else
+ ltr_heat = hot_raw_shift((u64) 1, 32, true) - ltr_heat;
+
+ /* ltw_heat is now guaranteed to be u32 safe */
+ if (ltw_heat >= hot_raw_shift((u64) 1, 32, true))
+ ltw_heat = 0;
+ else
+ ltw_heat = hot_raw_shift((u64) 1, 32, true) - ltw_heat;
+
+ /* avr_heat is now guaranteed to be u32 safe */
+ if (avr_heat >= hot_raw_shift((u64) 1, 32, true))
+ avr_heat = (u32) -1;
+
+ /* avw_heat is now guaranteed to be u32 safe */
+ if (avw_heat >= hot_raw_shift((u64) 1, 32, true))
+ avw_heat = (u32) -1;
+
+ nrr_heat = (u32)hot_raw_shift((u64)nrr_heat,
+ (3 - NRR_COEFF_POWER), false);
+ nrw_heat = (u32)hot_raw_shift((u64)nrw_heat,
+ (3 - NRW_COEFF_POWER), false);
+ ltr_heat = hot_raw_shift(ltr_heat, (3 - LTR_COEFF_POWER), false);
+ ltw_heat = hot_raw_shift(ltw_heat, (3 - LTW_COEFF_POWER), false);
+ avr_heat = hot_raw_shift(avr_heat, (3 - AVR_COEFF_POWER), false);
+ avw_heat = hot_raw_shift(avw_heat, (3 - AVW_COEFF_POWER), false);
+
+ result = nrr_heat + nrw_heat + (u32) ltr_heat +
+ (u32) ltw_heat + (u32) avr_heat + (u32) avw_heat;
+
+ return result;
+}
+
/*
* Initialize inode and range map arrays.
*/
--
1.7.6.5

2012-10-29 04:30:49

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 07/19] vfs: add map info update function

From: Zhi Yong Wu <[email protected]>

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 21 +++++++++++++++++
2 files changed, 87 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 83e590c..9245dd3 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -398,6 +398,72 @@ static u32 hot_temp_calc(struct hot_freq_data *freq_data)
}

/*
+ * Calculate a new temperature and, if necessary,
+ * move the list_head corresponding to this inode or range
+ * to the proper list with the new temperature
+ */
+static void hot_map_array_update(struct hot_freq_data *freq_data,
+ struct hot_info *root)
+{
+ struct hot_map_head *buckets, *cur_bucket;
+ struct hot_comm_item *comm_item;
+ struct hot_inode_item *he;
+ struct hot_range_item *hr;
+ u32 temp = hot_temp_calc(freq_data);
+ u8 a_temp = temp >> (32 - HEAT_MAP_BITS);
+ u8 b_temp = freq_data->last_temp >> (32 - HEAT_MAP_BITS);
+
+ comm_item = container_of(freq_data,
+ struct hot_comm_item, hot_freq_data);
+
+ if (freq_data->flags & FREQ_DATA_TYPE_INODE) {
+ he = container_of(comm_item,
+ struct hot_inode_item, hot_inode);
+ buckets = root->heat_inode_map;
+
+ if (he == NULL)
+ return;
+
+ spin_lock(&he->hot_inode.lock);
+ if (list_empty(&he->hot_inode.n_list) || (a_temp != b_temp)) {
+ if (!list_empty(&he->hot_inode.n_list)) {
+ list_del_init(&he->hot_inode.n_list);
+ root->hot_map_nr--;
+ }
+
+ cur_bucket = buckets + a_temp;
+ list_add_tail(&he->hot_inode.n_list,
+ &cur_bucket->node_list);
+ root->hot_map_nr++;
+ freq_data->last_temp = temp;
+ }
+ spin_unlock(&he->hot_inode.lock);
+ } else if (freq_data->flags & FREQ_DATA_TYPE_RANGE) {
+ hr = container_of(comm_item,
+ struct hot_range_item, hot_range);
+ buckets = root->heat_range_map;
+
+ if (hr == NULL)
+ return;
+
+ spin_lock(&hr->hot_range.lock);
+ if (list_empty(&hr->hot_range.n_list) || (a_temp != b_temp)) {
+ if (!list_empty(&hr->hot_range.n_list)) {
+ list_del_init(&hr->hot_range.n_list);
+ root->hot_map_nr--;
+ }
+
+ cur_bucket = buckets + a_temp;
+ list_add_tail(&hr->hot_range.n_list,
+ &cur_bucket->node_list);
+ root->hot_map_nr++;
+ freq_data->last_temp = temp;
+ }
+ spin_unlock(&hr->hot_range.lock);
+ }
+}
+
+/*
* Initialize inode and range map arrays.
*/
static void hot_map_array_init(struct hot_info *root)
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index cc4666e..196b894 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -26,6 +26,27 @@

#define FREQ_POWER 4

+/* NRR/NRW heat unit = 2^X accesses */
+#define NRR_MULTIPLIER_POWER 20 /* NRR - number of reads since mount */
+#define NRR_COEFF_POWER 0
+#define NRW_MULTIPLIER_POWER 20 /* NRW - number of writes since mount */
+#define NRW_COEFF_POWER 0
+
+/* LTR/LTW heat unit = 2^X ns of age */
+#define LTR_DIVIDER_POWER 30 /* LTR - time elapsed since last read(ns) */
+#define LTR_COEFF_POWER 1
+#define LTW_DIVIDER_POWER 30 /* LTW - time elapsed since last write(ns) */
+#define LTW_COEFF_POWER 1
+
+/*
+ * AVR/AVW cold unit = 2^X ns of average delta
+ * AVR/AVW heat unit = HEAT_MAX_VALUE - cold unit
+ */
+#define AVR_DIVIDER_POWER 40 /* AVR - average delta between recent reads(ns) */
+#define AVR_COEFF_POWER 0
+#define AVW_DIVIDER_POWER 40 /* AVW - average delta between recent writes(ns) */
+#define AVW_COEFF_POWER 0
+
void hot_inode_item_put(struct hot_inode_item *he);
struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);

--
1.7.6.5

2012-10-29 04:30:54

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 12/19] vfs: add one ioctl interface

From: Zhi Yong Wu <[email protected]>

FS_IOC_GET_HEAT_INFO: return a struct containing the various
metrics collected in hot_freq_data structs, and also return a
calculated data temperature based on those metrics. Optionally, retrieve
the temperature from the hot data hash list instead of recalculating it.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/compat_ioctl.c | 5 +++
fs/ioctl.c | 78 ++++++++++++++++++++++++++++++++++++++++++
include/linux/hot_tracking.h | 19 ++++++++++
3 files changed, 102 insertions(+), 0 deletions(-)

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4c6285f..ad1d603 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -57,6 +57,7 @@
#include <linux/i2c-dev.h>
#include <linux/atalk.h>
#include <linux/gfp.h>
+#include <linux/hot_tracking.h>

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci.h>
@@ -1400,6 +1401,9 @@ COMPATIBLE_IOCTL(TIOCSTART)
COMPATIBLE_IOCTL(TIOCSTOP)
#endif

+/*Hot data tracking*/
+COMPATIBLE_IOCTL(FS_IOC_GET_HEAT_INFO)
+
/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
but we don't want warnings on other file systems. So declare
them as compatible here. */
@@ -1579,6 +1583,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
case FIBMAP:
case FIGETBSZ:
case FIONREAD:
+ case FS_IOC_GET_HEAT_INFO:
if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode))
break;
/*FALL THROUGH*/
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 3bdad6d..f0e225e 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
+#include "hot_tracking.h"

#include <asm/ioctls.h>

@@ -537,6 +538,80 @@ static int ioctl_fsthaw(struct file *filp)
}

/*
+ * Retrieve information about access frequency for the given file. Return it in
+ * a userspace-friendly struct for btrfsctl (or another tool) to parse.
+ *
+ * The temperature that is returned can be "live" -- that is, recalculated when
+ * the ioctl is called -- or it can be returned from the hashtable, reflecting
+ * the (possibly old) value that the system will use when considering files
+ * for migration. This behavior is determined by hot_heat_info->live.
+ */
+static int ioctl_heat_info(struct file *file, void __user *argp)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct hot_heat_info *heat_info;
+ struct hot_inode_item *he;
+ int ret = 0;
+
+ heat_info = kmalloc(sizeof(struct hot_heat_info),
+ GFP_KERNEL | GFP_NOFS);
+
+ if (copy_from_user((void *) heat_info,
+ argp,
+ sizeof(struct hot_heat_info)) != 0) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ he = hot_inode_item_find(inode->i_sb->s_hot_root, inode->i_ino);
+ if (!he) {
+ /* we don't have any info on this file yet */
+ ret = -ENODATA;
+ goto err;
+ }
+
+ spin_lock(&he->hot_inode.lock);
+ heat_info->avg_delta_reads =
+ (__u64) he->hot_inode.hot_freq_data.avg_delta_reads;
+ heat_info->avg_delta_writes =
+ (__u64) he->hot_inode.hot_freq_data.avg_delta_writes;
+ heat_info->last_read_time =
+ (__u64) timespec_to_ns(&he->hot_inode.hot_freq_data.last_read_time);
+ heat_info->last_write_time =
+ (__u64) timespec_to_ns(&he->hot_inode.hot_freq_data.last_write_time);
+ heat_info->num_reads =
+ (__u32) he->hot_inode.hot_freq_data.nr_reads;
+ heat_info->num_writes =
+ (__u32) he->hot_inode.hot_freq_data.nr_writes;
+
+ if (heat_info->live > 0) {
+ /*
+ * got a request for live temperature,
+ * call hot_hash_calc_temperature to recalculate
+ */
+ heat_info->temp =
+ inode->i_sb->s_hot_root->hot_func_type->ops.hot_temp_calc_fn(
+ &he->hot_inode.hot_freq_data);
+ } else {
+ /* not live temperature, get it from the hashlist */
+ heat_info->temp = he->hot_inode.hot_freq_data.last_temp;
+ }
+ spin_unlock(&he->hot_inode.lock);
+
+ hot_inode_item_put(he);
+
+ if (copy_to_user(argp, (void *) heat_info,
+ sizeof(struct hot_heat_info))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+err:
+ kfree(heat_info);
+ return ret;
+}
+
+/*
* When you add any new common ioctls to the switches above and below
* please update compat_sys_ioctl() too.
*
@@ -591,6 +666,9 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
case FIGETBSZ:
return put_user(inode->i_sb->s_blocksize, argp);

+ case FS_IOC_GET_HEAT_INFO:
+ return ioctl_heat_info(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index f82db2d..b3ed251 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -41,6 +41,17 @@ struct hot_freq_data {
u32 last_temp;
};

+struct hot_heat_info {
+ __u64 avg_delta_reads;
+ __u64 avg_delta_writes;
+ __u64 last_read_time;
+ __u64 last_write_time;
+ __u32 num_reads;
+ __u32 num_writes;
+ __u32 temp;
+ __u8 live;
+};
+
/* List heads in hot map array */
struct hot_map_head {
struct list_head node_list;
@@ -110,6 +121,14 @@ struct hot_info {
struct shrinker hot_shrink;
};

+/*
+ * Hot data tracking ioctls:
+ *
+ * HOT_INFO - retrieve info on frequency of access
+ */
+#define FS_IOC_GET_HEAT_INFO _IOR('f', 17, \
+ struct hot_heat_info)
+
extern void __init hot_cache_init(void);
extern int hot_track_init(struct super_block *sb);
extern void hot_track_exit(struct super_block *sb);
--
1.7.6.5

2012-10-29 04:30:52

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

From: Zhi Yong Wu <[email protected]>

Introduce a framework that allows a specific FS
to register its own hot tracking functions.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 78 ++++++++++++++++++++++++++++++++++++++----
include/linux/hot_tracking.h | 25 +++++++++++++
2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 0ef9cad..c6c6138 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -24,6 +24,9 @@
#include <linux/limits.h>
#include "hot_tracking.h"

+static DEFINE_SPINLOCK(hot_func_list_lock);
+static LIST_HEAD(hot_func_list);
+
/* kmem_cache pointers for slab caches */
static struct kmem_cache *hot_inode_item_cachep __read_mostly;
static struct kmem_cache *hot_range_item_cachep __read_mostly;
@@ -305,20 +308,23 @@ static u64 hot_average_update(struct timespec old_atime,
return new_avg;
}

-static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
+static void hot_freq_data_update(struct hot_info *root,
+ struct hot_freq_data *freq_data, bool write)
{
struct timespec cur_time = current_kernel_time();

if (write) {
freq_data->nr_writes += 1;
- freq_data->avg_delta_writes = hot_average_update(
+ freq_data->avg_delta_writes =
+ root->hot_func_type->ops.hot_rw_freq_calc_fn(
freq_data->last_write_time,
cur_time,
freq_data->avg_delta_writes);
freq_data->last_write_time = cur_time;
} else {
freq_data->nr_reads += 1;
- freq_data->avg_delta_reads = hot_average_update(
+ freq_data->avg_delta_reads =
+ root->hot_func_type->ops.hot_rw_freq_calc_fn(
freq_data->last_read_time,
cur_time,
freq_data->avg_delta_reads);
@@ -430,7 +436,7 @@ static void hot_map_array_update(struct hot_freq_data *freq_data,
struct hot_comm_item *comm_item;
struct hot_inode_item *he;
struct hot_range_item *hr;
- u32 temp = hot_temp_calc(freq_data);
+ u32 temp = root->hot_func_type->ops.hot_temp_calc_fn(freq_data);
u8 a_temp = temp >> (32 - HEAT_MAP_BITS);
u8 b_temp = freq_data->last_temp >> (32 - HEAT_MAP_BITS);

@@ -511,7 +517,7 @@ static void hot_range_update(struct hot_inode_item *he,
&hr_nodes[i]->hot_range.hot_freq_data, root);

spin_lock(&hr_nodes[i]->hot_range.lock);
- obsolete = hot_is_obsolete(
+ obsolete = root->hot_func_type->ops.hot_is_obsolete_fn(
&hr_nodes[i]->hot_range.hot_freq_data);
spin_unlock(&hr_nodes[i]->hot_range.lock);

@@ -668,7 +674,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
}

spin_lock(&he->hot_inode.lock);
- hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
+ hot_freq_data_update(root, &he->hot_inode.hot_freq_data, rw);
spin_unlock(&he->hot_inode.lock);

/*
@@ -685,7 +691,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
}

spin_lock(&hr->hot_range.lock);
- hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
+ hot_freq_data_update(root, &hr->hot_range.hot_freq_data, rw);
spin_unlock(&hr->hot_range.lock);

hot_range_item_put(hr);
@@ -695,6 +701,61 @@ void hot_update_freqs(struct inode *inode, u64 start,
}
EXPORT_SYMBOL_GPL(hot_update_freqs);

+static struct hot_func_type hot_func_def = {
+ .hot_func_name = "hot_type_def",
+ .ops = {
+ .hot_rw_freq_calc_fn = hot_average_update,
+ .hot_temp_calc_fn = hot_temp_calc,
+ .hot_is_obsolete_fn = hot_is_obsolete,
+ },
+};
+
+static struct hot_func_type *hot_func_get(const char *name)
+{
+ struct hot_func_type *f, *h = &hot_func_def;
+
+ spin_lock(&hot_func_list_lock);
+ list_for_each_entry(f, &hot_func_list, list) {
+ if (!strcmp(f->hot_func_name, name))
+ h = f;
+ }
+ spin_unlock(&hot_func_list_lock);
+
+ return h;
+}
+
+int hot_func_register(struct hot_func_type *h)
+{
+ struct hot_func_type *f, *t = NULL;
+
+ /* register, don't allow duplicate names */
+ spin_lock(&hot_func_list_lock);
+ list_for_each_entry(f, &hot_func_list, list) {
+ if (!strcmp(f->hot_func_name, h->hot_func_name))
+ t = f;
+ }
+
+ if (t) {
+ spin_unlock(&hot_func_list_lock);
+ return -EBUSY;
+ }
+
+ list_add_tail(&h->list, &hot_func_list);
+ spin_unlock(&hot_func_list_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hot_func_register);
+
+void hot_func_unregister(struct hot_func_type *h)
+{
+ /* unregister */
+ spin_lock(&hot_func_list_lock);
+ list_del_init(&h->list);
+ spin_unlock(&hot_func_list_lock);
+}
+EXPORT_SYMBOL_GPL(hot_func_unregister);
+
/*
* Initialize the data structures for hot data tracking.
*/
@@ -714,6 +775,9 @@ int hot_track_init(struct super_block *sb)
hot_inode_tree_init(root);
hot_map_array_init(root);

+ /* Get hot func type */
+ root->hot_func_type = hot_func_get(sb->s_type->name);
+
root->update_wq = alloc_workqueue(
"hot_update_wq", WQ_NON_REENTRANT, 0);
if (!root->update_wq) {
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index 2ee0d02..3941052 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -23,6 +23,8 @@
#define HEAT_MAP_BITS 8
#define HEAT_MAP_SIZE (1 << HEAT_MAP_BITS)

+#define HOT_NAME_MAX 16
+
/*
* A frequency data struct holds values that are used to
* determine temperature of files and file ranges. These structs
@@ -73,6 +75,25 @@ struct hot_range_item {
u32 len; /* length in bytes */
};

+typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
+ struct timespec cur_time, u64 old_avg);
+typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
+typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);
+
+struct hot_func_ops {
+ hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
+ hot_temp_calc_fn *hot_temp_calc_fn;
+ hot_is_obsolete_fn *hot_is_obsolete_fn;
+};
+
+/* identifies an hot func type */
+struct hot_func_type {
+ char hot_func_name[HOT_NAME_MAX];
+ /* fields provided by specific FS */
+ struct hot_func_ops ops;
+ struct list_head list;
+};
+
struct hot_info {
struct radix_tree_root hot_inode_tree;
spinlock_t lock; /*protect inode tree */
@@ -85,6 +106,7 @@ struct hot_info {

struct workqueue_struct *update_wq;
struct delayed_work update_work;
+ struct hot_func_type *hot_func_type;
};

extern void __init hot_cache_init(void);
@@ -93,4 +115,7 @@ extern void hot_track_exit(struct super_block *sb);
extern void hot_update_freqs(struct inode *inode, u64 start,
u64 len, int rw);

+extern int hot_func_register(struct hot_func_type *h);
+extern void hot_func_unregister(struct hot_func_type *h);
+
#endif /* _LINUX_HOTTRACK_H */
--
1.7.6.5

2012-10-29 04:32:49

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 11/19] vfs: register one shrinker

From: Zhi Yong Wu <[email protected]>

Register a shrinker to control the amount of
memory that is used in tracking hot regions - if we are throwing
inodes out of memory due to memory pressure, we most definitely are
going to need to reduce the amount of memory the tracking code is
using, even if it means losing useful information (i.e. the shrinker
accelerates the aging process).

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 61 ++++++++++++++++++++++++++++++++++++++++++
include/linux/hot_tracking.h | 1 +
2 files changed, 62 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index c6c6138..54a8208 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -653,6 +653,61 @@ err:
}
EXPORT_SYMBOL_GPL(hot_cache_init);

+static int hot_track_prune_map(struct hot_map_head *map_head,
+ bool type, int nr)
+{
+ struct hot_comm_item *node;
+ int i;
+
+ for (i = 0; i < HEAT_MAP_SIZE; i++) {
+ while (!list_empty(&(map_head + i)->node_list)) {
+ if (nr-- <= 0)
+ break;
+
+ node = list_first_entry(&(map_head + i)->node_list,
+ struct hot_comm_item, n_list);
+ if (type) {
+ struct hot_inode_item *hot_inode =
+ container_of(node,
+ struct hot_inode_item, hot_inode);
+ hot_inode_item_put(hot_inode);
+ } else {
+ struct hot_range_item *hot_range =
+ container_of(node,
+ struct hot_range_item, hot_range);
+ hot_range_item_put(hot_range);
+ }
+ }
+ }
+
+ return nr;
+}
+
+/* The shrinker callback function */
+static int hot_track_prune(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct hot_info *root =
+ container_of(shrink, struct hot_info, hot_shrink);
+ int ret;
+
+ if (sc->nr_to_scan == 0)
+ return root->hot_map_nr;
+
+ if (!(sc->gfp_mask & __GFP_FS))
+ return -1;
+
+ ret = hot_track_prune_map(root->heat_range_map,
+ false, sc->nr_to_scan);
+ if (ret > 0)
+ ret = hot_track_prune_map(root->heat_inode_map,
+ true, ret);
+ if (ret > 0)
+ root->hot_map_nr -= (sc->nr_to_scan - ret);
+
+ return root->hot_map_nr;
+}
+
/*
* Main function to update access frequency from read/writepage(s) hooks
*/
@@ -791,6 +846,11 @@ int hot_track_init(struct super_block *sb)
queue_delayed_work(root->update_wq, &root->update_work,
msecs_to_jiffies(HEAT_UPDATE_DELAY * MSEC_PER_SEC));

+ /* Register a shrinker callback */
+ root->hot_shrink.shrink = hot_track_prune;
+ root->hot_shrink.seeks = DEFAULT_SEEKS;
+ register_shrinker(&root->hot_shrink);
+
printk(KERN_INFO "VFS: Turning on hot data tracking\n");

return 0;
@@ -807,6 +867,7 @@ void hot_track_exit(struct super_block *sb)
{
struct hot_info *root = sb->s_hot_root;

+ unregister_shrinker(&root->hot_shrink);
cancel_delayed_work_sync(&root->update_work);
destroy_workqueue(root->update_wq);
hot_map_array_exit(root);
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index 3941052..f82db2d 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -107,6 +107,7 @@ struct hot_info {
struct workqueue_struct *update_wq;
struct delayed_work update_work;
struct hot_func_type *hot_func_type;
+ struct shrinker hot_shrink;
};

extern void __init hot_cache_init(void);
--
1.7.6.5


2012-10-29 04:30:57

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 15/19] sysfs: add two hot_track proc files

From: Zhi Yong Wu <[email protected]>

Add two proc files, hot-kick-time and hot-update-delay,
under the directory /proc/sys/fs/ in order to make
TIME_TO_KICK and HEAT_UPDATE_DELAY tunable.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 12 +++++++++---
fs/hot_tracking.h | 9 ---------
include/linux/hot_tracking.h | 7 +++++++
kernel/sysctl.c | 14 ++++++++++++++
4 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 376d7fb..02ac4a2 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -28,6 +28,12 @@
static DEFINE_SPINLOCK(hot_func_list_lock);
static LIST_HEAD(hot_func_list);

+int sysctl_hot_kick_time __read_mostly = 300;
+EXPORT_SYMBOL_GPL(sysctl_hot_kick_time);
+
+int sysctl_hot_update_delay __read_mostly = 300;
+EXPORT_SYMBOL_GPL(sysctl_hot_update_delay);
+
/* kmem_cache pointers for slab caches */
static struct kmem_cache *hot_inode_item_cachep __read_mostly;
static struct kmem_cache *hot_range_item_cachep __read_mostly;
@@ -417,7 +423,7 @@ static bool hot_is_obsolete(struct hot_freq_data *freq_data)
(cur_time - timespec_to_ns(&freq_data->last_read_time));
u64 last_write_ns =
(cur_time - timespec_to_ns(&freq_data->last_write_time));
- u64 kick_ns = TIME_TO_KICK * NSEC_PER_SEC;
+ u64 kick_ns = sysctl_hot_kick_time * NSEC_PER_SEC;

if ((last_read_ns > kick_ns) && (last_write_ns > kick_ns))
ret = 1;
@@ -625,7 +631,7 @@ static void hot_update_worker(struct work_struct *work)

/* Instert next delayed work */
queue_delayed_work(root->update_wq, &root->update_work,
- msecs_to_jiffies(HEAT_UPDATE_DELAY * MSEC_PER_SEC));
+ msecs_to_jiffies(sysctl_hot_update_delay * MSEC_PER_SEC));
}

/*
@@ -1316,7 +1322,7 @@ int hot_track_init(struct super_block *sb)
/* Initialize hot tracking wq and arm one delayed work */
INIT_DELAYED_WORK(&root->update_work, hot_update_worker);
queue_delayed_work(root->update_wq, &root->update_work,
- msecs_to_jiffies(HEAT_UPDATE_DELAY * MSEC_PER_SEC));
+ msecs_to_jiffies(sysctl_hot_update_delay * MSEC_PER_SEC));

/* Register a shrinker callback */
root->hot_shrink.shrink = hot_track_prune;
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index f5ba2d6..095eab0 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -26,15 +26,6 @@

#define FREQ_POWER 4

-/*
- * time to quit keeping track of
- * tracking data (seconds)
- */
-#define TIME_TO_KICK 300
-
-/* set how often to update temperatures (seconds) */
-#define HEAT_UPDATE_DELAY 300
-
/* NRR/NRW heat unit = 2^X accesses */
#define NRR_MULTIPLIER_POWER 20 /* NRR - number of reads since mount */
#define NRR_COEFF_POWER 0
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index a16217f..416c988 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -123,6 +123,13 @@ struct hot_info {
};

/*
+ * Two variables have meanings as below:
+ * 1. time to quit keeping track of tracking data (seconds)
+ * 2. set how often to update temperatures (seconds)
+ */
+extern int sysctl_hot_kick_time, sysctl_hot_update_delay;
+
+/*
* Hot data tracking ioctls:
*
* HOT_INFO - retrieve info on frequency of access
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65ea..37624fb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1545,6 +1545,20 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "hot-kick-time",
+ .data = &sysctl_hot_kick_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "hot-update-delay",
+ .data = &sysctl_hot_update_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};

--
1.7.6.5

2012-10-29 04:30:51

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 09/19] vfs: add one work queue

From: Zhi Yong Wu <[email protected]>

Add a per-superblock workqueue and a delayed_work
to run periodic work to update map info on each superblock.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 85 ++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 3 +
include/linux/hot_tracking.h | 3 +
3 files changed, 91 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index fff0038..0ef9cad 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -15,9 +15,12 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/types.h>
+#include <linux/list_sort.h>
#include <linux/limits.h>
#include "hot_tracking.h"

@@ -557,6 +560,67 @@ static void hot_map_array_exit(struct hot_info *root)
}
}

+/* Temperature compare function*/
+static int hot_temp_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct hot_comm_item *ap =
+ container_of(a, struct hot_comm_item, n_list);
+ struct hot_comm_item *bp =
+ container_of(b, struct hot_comm_item, n_list);
+
+ int diff = ap->hot_freq_data.last_temp
+ - bp->hot_freq_data.last_temp;
+ if (diff > 0)
+ return -1;
+ if (diff < 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Every sync period we update temperatures for
+ * each hot inode item and hot range item for aging
+ * purposes.
+ */
+static void hot_update_worker(struct work_struct *work)
+{
+ struct hot_info *root = container_of(to_delayed_work(work),
+ struct hot_info, update_work);
+ struct hot_inode_item *hi_nodes[8];
+ u64 ino = 0;
+ int i, n;
+
+ while (1) {
+ n = radix_tree_gang_lookup(&root->hot_inode_tree,
+ (void **)hi_nodes, ino,
+ ARRAY_SIZE(hi_nodes));
+ if (!n)
+ break;
+
+ ino = hi_nodes[n - 1]->i_ino + 1;
+ for (i = 0; i < n; i++) {
+ kref_get(&hi_nodes[i]->hot_inode.refs);
+ hot_map_array_update(
+ &hi_nodes[i]->hot_inode.hot_freq_data, root);
+ hot_range_update(hi_nodes[i], root);
+ hot_inode_item_put(hi_nodes[i]);
+ }
+ }
+
+ /* Sort temperature map info */
+ for (i = 0; i < HEAT_MAP_SIZE; i++) {
+ list_sort(NULL, &root->heat_inode_map[i].node_list,
+ hot_temp_cmp);
+ list_sort(NULL, &root->heat_range_map[i].node_list,
+ hot_temp_cmp);
+ }
+
+ /* Instert next delayed work */
+ queue_delayed_work(root->update_wq, &root->update_work,
+ msecs_to_jiffies(HEAT_UPDATE_DELAY * MSEC_PER_SEC));
+}
+
/*
* Initialize kmem cache for hot_inode_item and hot_range_item.
*/
@@ -650,9 +714,28 @@ int hot_track_init(struct super_block *sb)
hot_inode_tree_init(root);
hot_map_array_init(root);

+ root->update_wq = alloc_workqueue(
+ "hot_update_wq", WQ_NON_REENTRANT, 0);
+ if (!root->update_wq) {
+ printk(KERN_ERR "%s: Failed to create "
+ "hot update workqueue\n", __func__);
+ goto failed_wq;
+ }
+
+ /* Initialize hot tracking wq and arm one delayed work */
+ INIT_DELAYED_WORK(&root->update_work, hot_update_worker);
+ queue_delayed_work(root->update_wq, &root->update_work,
+ msecs_to_jiffies(HEAT_UPDATE_DELAY * MSEC_PER_SEC));
+
printk(KERN_INFO "VFS: Turning on hot data tracking\n");

return 0;
+
+failed_wq:
+ hot_map_array_exit(root);
+ hot_inode_tree_exit(root);
+ kfree(root);
+ return ret;
}
EXPORT_SYMBOL_GPL(hot_track_init);

@@ -660,6 +743,8 @@ void hot_track_exit(struct super_block *sb)
{
struct hot_info *root = sb->s_hot_root;

+ cancel_delayed_work_sync(&root->update_work);
+ destroy_workqueue(root->update_wq);
hot_map_array_exit(root);
hot_inode_tree_exit(root);
kfree(root);
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index f5ec05a..92e31fb 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -32,6 +32,9 @@
*/
#define TIME_TO_KICK 300

+/* set how often to update temperatures (seconds) */
+#define HEAT_UPDATE_DELAY 300
+
/* NRR/NRW heat unit = 2^X accesses */
#define NRR_MULTIPLIER_POWER 20 /* NRR - number of reads since mount */
#define NRR_COEFF_POWER 0
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index 4f92947..2ee0d02 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -82,6 +82,9 @@ struct hot_info {
/* map of range temperature */
struct hot_map_head heat_range_map[HEAT_MAP_SIZE];
unsigned int hot_map_nr;
+
+ struct workqueue_struct *update_wq;
+ struct delayed_work update_work;
};

extern void __init hot_cache_init(void);
--
1.7.6.5

2012-10-29 04:30:58

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 16/19] btrfs: add hot tracking support

From: Zhi Yong Wu <[email protected]>

Introduce one new mount option '-o hot_track',
and add its parsing support.
Its usage looks like:
mount -o hot_track
mount -o nouser,hot_track
mount -o nouser,hot_track,loop
mount -o hot_track,nouser

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/btrfs/ctree.h | 1 +
fs/btrfs/super.c | 22 +++++++++++++++++++++-
2 files changed, 22 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead8..4703178 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1756,6 +1756,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
+#define BTRFS_MOUNT_HOT_TRACK (1 << 23)

#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14..0bcc62b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
+#include <linux/hot_tracking.h>
#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -299,6 +300,10 @@ static void btrfs_put_super(struct super_block *sb)
* last process that kept it busy. Or segfault in the aforementioned
* process... Whom would you report that to?
*/
+
+ /* Hot data tracking */
+ if (btrfs_test_opt(btrfs_sb(sb)->tree_root, HOT_TRACK))
+ hot_track_exit(sb);
}

enum {
@@ -311,7 +316,7 @@ enum {
Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
Opt_check_integrity, Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask, Opt_fatal_errors,
+ Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_hot_track,
Opt_err,
};

@@ -352,6 +357,7 @@ static match_table_t tokens = {
{Opt_check_integrity_including_extent_data, "check_int_data"},
{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
{Opt_fatal_errors, "fatal_errors=%s"},
+ {Opt_hot_track, "hot_track"},
{Opt_err, NULL},
};

@@ -614,6 +620,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
goto out;
}
break;
+ case Opt_hot_track:
+ btrfs_set_opt(info->mount_opt, HOT_TRACK);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -841,11 +850,20 @@ static int btrfs_fill_super(struct super_block *sb,
goto fail_close;
}

+ if (btrfs_test_opt(fs_info->tree_root, HOT_TRACK)) {
+ err = hot_track_init(sb);
+ if (err)
+ goto fail_hot;
+ }
+
save_mount_options(sb, data);
cleancache_init_fs(sb);
sb->s_flags |= MS_ACTIVE;
return 0;

+fail_hot:
+ dput(sb->s_root);
+ sb->s_root = NULL;
fail_close:
close_ctree(fs_info->tree_root);
return err;
@@ -941,6 +959,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",skip_balance");
if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
+ if (btrfs_test_opt(root, HOT_TRACK))
+ seq_puts(seq, ",hot_track");
return 0;
}

--
1.7.6.5

2012-10-29 04:31:00

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 18/19] ext4: add hot tracking support

From: Zheng Liu <[email protected]>

Define a new mount option, hot_track, that enables
VFS hot tracking support in ext4.

CC: Zhi Yong Wu <[email protected]>
Signed-off-by: Zheng Liu <[email protected]>
---
fs/ext4/ext4.h | 3 +++
fs/ext4/super.c | 13 ++++++++++++-
2 files changed, 15 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c20de1..f6cff1e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1298,6 +1298,9 @@ struct ext4_sb_info {

/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
+
+ /* Enable hot tracking or not */
+ int s_hottrack_enable;
};

static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80928f7..ba9f376 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -864,6 +864,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);

+ if (sbi->s_hottrack_enable)
+ hot_track_exit(sb);
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -1222,7 +1224,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
- Opt_max_dir_size_kb,
+ Opt_max_dir_size_kb, Opt_hottrack,
};

static const match_table_t tokens = {
@@ -1297,6 +1299,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_hottrack, "hot_track"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1595,6 +1598,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
sbi->s_li_wait_mult = arg;
} else if (token == Opt_max_dir_size_kb) {
sbi->s_max_dir_size_kb = arg;
+ } else if (token == Opt_hottrack) {
+ if (hot_track_init(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "EXT4-fs: hot tracking initialization"
+ " failed");
+ return -1;
+ }
+ sbi->s_hottrack_enable = 1;
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (m->flags & MOPT_DATAJ) {
--
1.7.6.5


2012-10-29 04:33:42

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 13/19] debugfs: introduce one function

From: Zhi Yong Wu <[email protected]>

Add debugfs_get_dentry(), which looks up an existing debugfs dentry by name under a given parent.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/debugfs/inode.c | 26 ++++++++++++++++++++++++++
include/linux/debugfs.h | 9 +++++++++
2 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92..c6291bc 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -354,6 +354,32 @@ exit:
return dentry;
}

+struct dentry *debugfs_get_dentry(const char *name,
+ struct dentry *parent, int len)
+{
+ struct dentry *dentry = NULL;
+ int error = 0;
+
+ error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
+ &debugfs_mount_count);
+ if (error)
+ return NULL;
+
+ if (!parent)
+ parent = debugfs_mount->mnt_root;
+
+ mutex_lock(&parent->d_inode->i_mutex);
+ dentry = lookup_one_len(name, parent, strlen(name));
+ if (!IS_ERR(dentry)) {
+ mutex_unlock(&parent->d_inode->i_mutex);
+ return dentry;
+ }
+ mutex_unlock(&parent->d_inode->i_mutex);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(debugfs_get_dentry);
+
/**
* debugfs_create_file - create a file in the debugfs filesystem
* @name: a pointer to a string containing the name of the file to create.
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 66c434f..8913a4d 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -46,6 +46,9 @@ extern struct dentry *arch_debugfs_dir;
extern const struct file_operations debugfs_file_operations;
extern const struct inode_operations debugfs_link_operations;

+struct dentry *debugfs_get_dentry(const char *name,
+ struct dentry *parent, int len);
+
struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops);
@@ -103,6 +106,12 @@ bool debugfs_initialized(void);

#include <linux/err.h>

+static inline struct dentry *debugfs_get_dentry(const char *name,
+ struct dentry *parent, int len)
+{
+ return ERR_PTR(-ENODEV);
+}
+
/*
* We do not return NULL from these functions if CONFIG_DEBUG_FS is not enabled
* so users have a chance to detect if there was a real error or not. We don't
--
1.7.6.5


2012-10-29 04:31:01

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 19/19] vfs: add documentation

From: Zhi Yong Wu <[email protected]>

Add one doc for VFS hot tracking feature

Signed-off-by: Zhi Yong Wu <[email protected]>
---
Documentation/filesystems/00-INDEX | 2 +
Documentation/filesystems/hot_tracking.txt | 262 ++++++++++++++++++++++++++++
2 files changed, 264 insertions(+), 0 deletions(-)
create mode 100644 Documentation/filesystems/hot_tracking.txt

diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 8c624a1..b68bdff 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -118,3 +118,5 @@ xfs.txt
- info and mount options for the XFS filesystem.
xip.txt
- info on execute-in-place for file mappings.
+hot_tracking.txt
+ - info on hot data tracking in VFS layer
diff --git a/Documentation/filesystems/hot_tracking.txt b/Documentation/filesystems/hot_tracking.txt
new file mode 100644
index 0000000..a39a96d
--- /dev/null
+++ b/Documentation/filesystems/hot_tracking.txt
@@ -0,0 +1,262 @@
+Hot Data Tracking
+
+September, 2012 Zhi Yong Wu <[email protected]>
+
+CONTENTS
+
+1. Introduction
+2. Motivation
+3. The Design
+4. How to Calc Frequency of Reads/Writes & Temperature
+5. Git Development Tree
+6. Usage Example
+
+
+1. Introduction
+
+ The feature adds experimental support for tracking data temperature
+information in VFS layer. Essentially, this means maintaining some key
+stats(like number of reads/writes, last read/write time, frequency of
+reads/writes), then distilling those numbers down to a single
+"temperature" value that reflects what data is "hot," and using that
+temperature to move data to SSDs.
+
+ The long-term goal of the feature is to allow some FSs,
+e.g. Btrfs, to intelligently utilize SSDs in a heterogeneous volume.
+Incidentally, this project has been motivated by
+the Project Ideas page on the Btrfs wiki.
+
+ Of course, users are warned not to run this code outside of development
+environments. These patches are EXPERIMENTAL, and as such they might eat
+your data and/or memory. That said, the code should be relatively safe
+when the hot_track mount option is disabled.
+
+2. Motivation
+
+ The overall goal of enabling hot data relocation to SSD has been
+motivated by the Project Ideas page on the Btrfs wiki at
+<https://btrfs.wiki.kernel.org/index.php/Project_ideas>.
+The work is divided into two steps: the VFS provides the hot data tracking
+function, while each specific FS provides the hot data relocation function.
+So as the first step of this goal, it is hoped that the patchset
+for hot data tracking will eventually mature into VFS.
+
+ This is essentially the traditional cache argument: SSD is fast and
+expensive; HDD is cheap but slow. ZFS, for example, can already take
+advantage of SSD caching. Btrfs should also be able to take advantage of
+hybrid storage without many broad, sweeping changes to existing code.
+
+
+3. The Design
+
+These include the following parts:
+
+ * Hooks in existing vfs functions to track data access frequency
+
+ * New radix-trees for tracking access frequency of inodes and sub-file
+ranges
+ The relationship between super_block and radix-tree is as below:
+hot_info.hot_inode_tree
+ Each FS instance can find its hot tracking info via the super_block's
+s_hot_root field. This hot_info stores hot tracking data such as hot_inode_tree,
+the inode and range lists, etc.
+
+ * A list for indexing data by its temperature
+
+ * A debugfs interface for dumping data from the radix-trees
+
+ * A background workqueue for updating inode heat info
+
+ * Mount options for enabling temperature tracking (-o hot_track;
+disabled by default)
+ * An ioctl to retrieve the frequency information collected for a certain
+file
+ * Ioctls to enable/disable frequency tracking per inode.
+
+Let us see their relationship as below:
+
+ * hot_info.hot_inode_tree indexes hot_inode_items, one per inode
+
+ * hot_inode_item contains access frequency data for that inode
+
+ * hot_inode_item holds a heat list node to index the access
+frequency data for that inode
+
+ * hot_inode_item.hot_range_tree indexes hot_range_items for that inode
+
+ * hot_range_item contains access frequency data for that range
+
+ * hot_range_item holds a heat list node to index the access
+frequency data for that range
+
+ * hot_info.heat_inode_map indexes per-inode heat list nodes
+
+ * hot_info.heat_range_map indexes per-range heat list nodes
+
+ How about some ascii art? :) Just looking at the hot inode item case
+(the range item case is the same pattern, though), we have:
+
+heat_inode_map hot_inode_tree
+ | |
+ | V
+ | +-------hot_comm_item--------+
+ | | frequency data |
++---+ | list_head |
+| V ^ | V
+| ...<--hot_comm_item-->... | | ...<--hot_comm_item-->...
+| frequency data | | frequency data
++-------->list_head----------+ +--------->list_head--->.....
+ hot_range_tree hot_range_tree
+ |
+ heat_range_map V
+ | +-------hot_comm_item--------+
+ | | frequency data |
+ +---+ | list_head |
+ | V ^ | V
+ | ...<--hot_comm_item-->... | | ...<--hot_comm_item-->...
+ | frequency data | | frequency data
+ +-------->list_head----------+ +--------->list_head--->.....
+
+
+4. How to Calc Frequency of Reads/Writes & Temperature
+
+1.) hot_average_update()
+
+ This function does the actual work of updating the frequency numbers,
+whatever they turn out to be. FREQ_POWER determines how many atime
+deltas we keep track of (as a power of 2). So, setting it to anything above
+16ish is probably overkill. Also, the higher the power, the more bits get
+right shifted out of the timestamp, reducing precision, so take note of that
+as well.
+
+ The caller should have already locked freq_data's parent's spinlock.
+
+ FREQ_POWER, defined immediately below, determines how heavily to weight
+the current frequency numbers against the newest access. For example, a value
+of 4 means that the new access information will be weighted 1/16th (ie 2^-4)
+as heavily as the existing frequency info. In essence, this is a kludged-
+together version of a weighted average, since we can't afford to keep all of
+the information that it would take to get a _real_ weighted average.
+
+2.) Explanation of some macros
+
+ The following comments explain what exactly comprises a unit of heat.
+Each of six values of heat are calculated and combined in order to form an
+overall temperature for the data:
+
+ * NRR - number of reads since mount
+ * NRW - number of writes since mount
+ * LTR - time elapsed since last read (ns)
+ * LTW - time elapsed since last write (ns)
+ * AVR - average delta between recent reads (ns)
+ * AVW - average delta between recent writes (ns)
+
+ These values are divided (right-shifted) according to the *_DIVIDER_POWER
+values defined below to bring the numbers into a reasonable range. You can
+modify these values to fit your needs. However, each heat unit is a u32 and
+thus maxes out at 2^32 - 1. Therefore, you must choose your dividers quite
+carefully or else they could max out or be stuck at zero quite easily.
+(E.g., if you chose AVR_DIVIDER_POWER = 0, nothing less than 4s of atime
+delta would bring the temperature above zero, ever.)
+
+ Finally, each value is added to the overall temperature between 0 and 8
+times, depending on its *_COEFF_POWER value. Note that the coefficients are
+also actually implemented with shifts, so take care to treat these values
+as powers of 2. (I.e., 0 means we'll add it to the temp once; 1 = 2x, etc.)
+
+ * AVR/AVW cold unit = 2^X ns of average delta
+ * AVR/AVW heat unit = HEAT_MAX_VALUE - cold unit
+
+ E.g., data with an average delta between 0 and 2^X ns will have a cold
+value of 0, which means a heat value equal to HEAT_MAX_VALUE.
+
+3.) hot_temp_calc()
+
+ This function is responsible for distilling the six heat
+criteria (which are described in detail in hot_tracking.h) down into a single
+temperature value for the data, which is an integer between 0
+and HEAT_MAX_VALUE.
+
+ To accomplish this, the raw values from the hot_freq_data structure
+are shifted various ways in order to make the temperature calculation more
+or less sensitive to each value.
+
+ Once this calibration has happened, we do some additional normalization and
+make sure that everything fits nicely in a u32. From there, we take a very
+rudimentary kind of "average" of each of the values, where the *_COEFF_POWER
+values act as weights for the average.
+
+ Finally, we use the HEAT_HASH_BITS value, which determines the size of the
+heat list array, to normalize the temperature to the proper granularity.
+
+
+5. Git Development Tree
+
+ This feature is still under development and review, so if you're interested,
+you can pull from the git repository at the following location:
+
+ https://github.com/wuzhy/kernel.git hot_tracking
+ git://github.com/wuzhy/kernel.git hot_tracking
+
+
+6. Usage Example
+
+1.) To use hot tracking, you should mount like this:
+
+$ mount -o hot_track /dev/sdb /mnt
+[ 1505.894078] device label test devid 1 transid 29 /dev/sdb
+[ 1505.952977] btrfs: disk space caching is enabled
+[ 1506.069678] vfs: turning on hot data tracking
+
+2.) Mount debugfs first:
+
+$ mount -t debugfs none /sys/kernel/debug
+$ ls -l /sys/kernel/debug/hot_track/
+total 0
+drwxr-xr-x 2 root root 0 Aug 8 04:40 sdb
+$ ls -l /sys/kernel/debug/hot_track/sdb
+total 0
+-rw-r--r-- 1 root root 0 Aug 8 04:40 rt_stats_inode
+-rw-r--r-- 1 root root 0 Aug 8 04:40 rt_stats_range
+
+3.) View information about hot tracking from debugfs:
+
+$ echo "hot tracking test" > /mnt/file
+$ cat /sys/kernel/debug/hot_track/sdb/rt_stats_inode
+inode #279, reads 0, writes 1, avg read time 18446744073709551615,
+avg write time 5251566408153596, temp 109
+$ cat /sys/kernel/debug/hot_track/sdb/rt_stats_range
+inode #279, range start 0 (range len 1048576) reads 0, writes 1,
+avg read time 18446744073709551615, avg write time 1128690176623144209, temp 64
+
+$ echo "hot data tracking test" >> /mnt/file
+$ cat /sys/kernel/debug/hot_track/sdb/rt_stats_inode
+inode #279, reads 0, writes 2, avg read time 18446744073709551615,
+avg write time 4923343766042451, temp 109
+$ cat /sys/kernel/debug/hot_track/sdb/rt_stats_range
+inode #279, range start 0 (range len 1048576) reads 0, writes 2,
+avg read time 18446744073709551615, avg write time 1058147040842596150, temp 64
+
+4.) Check temp sorting result of some nodes:
+
+$ cat /sys/kernel/debug/hot_track/loop0/hot_spots_inode
+inode #5248773, reads 0, writes 244,
+avg read time 18446744073709, avg write time 822, temp 111
+inode #878523, reads 0, writes 1,
+avg read time 18446744073709, avg write time 5278036898, temp 109
+inode #878524, reads 0, writes 1,
+avg read time 18446744073709, avg write time 5278036898, temp 109
+
+5.) Tune some hot tracking parameters as below:
+
+$ cat /proc/sys/fs/hot-kick-time
+300
+$ echo 360 > /proc/sys/fs/hot-kick-time
+$ cat /proc/sys/fs/hot-kick-time
+360
+$ cat /proc/sys/fs/hot-update-delay
+300
+$ echo 360 > /proc/sys/fs/hot-update-delay
+$ cat /proc/sys/fs/hot-update-delay
+360
+
--
1.7.6.5


2012-10-29 04:30:59

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 17/19] xfs: add hot tracking support

From: Dave Chinner <[email protected]>

Connect up the VFS hot tracking support
so XFS filesystems can make use of it.

Signed-off-by: Dave Chinner <[email protected]>
---
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 16 ++++++++++++++++
2 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e..96d93c2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -217,6 +217,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
must be synchronous except
for space allocations */
+#define XFS_MOUNT_HOTTRACK (1ULL << 1) /* hot inode tracking */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd..48b3bed 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -61,6 +61,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/parser.h>
+#include <linux/hot_tracking.h>

static const struct super_operations xfs_super_operations;
static kmem_zone_t *xfs_ioend_zone;
@@ -114,6 +115,7 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
+#define MNTOPT_HOTTRACK "hot_track" /* hot inode tracking */

/*
* Table driven mount option parser.
@@ -371,6 +373,8 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ } else if (!strcmp(this_char, MNTOPT_HOTTRACK)) {
+ mp->m_flags |= XFS_MOUNT_HOTTRACK;
} else if (!strcmp(this_char, "ihashsize")) {
xfs_warn(mp,
"ihashsize no longer used, option is deprecated.");
@@ -1005,6 +1009,9 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);

+ if (mp->m_flags & XFS_MOUNT_HOTTRACK)
+ hot_track_exit(sb);
+
xfs_filestream_unmount(mp);
cancel_delayed_work_sync(&mp->m_sync_work);
xfs_unmountfs(mp);
@@ -1407,7 +1414,16 @@ xfs_fs_fill_super(
goto out_unmount;
}

+ if (mp->m_flags & XFS_MOUNT_HOTTRACK) {
+ error = hot_track_init(sb);
+ if (error)
+ goto out_free_root;
+ }
+
return 0;
+ out_free_root:
+ dput(sb->s_root);
+ sb->s_root = NULL;
out_syncd_stop:
xfs_syncd_stop(mp);
out_filestream_unmount:
--
1.7.6.5


2012-10-29 04:30:50

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 08/19] vfs: add aging function

From: Zhi Yong Wu <[email protected]>

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 6 +++++
2 files changed, 62 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 9245dd3..fff0038 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -397,6 +397,24 @@ static u32 hot_temp_calc(struct hot_freq_data *freq_data)
return result;
}

+static bool hot_is_obsolete(struct hot_freq_data *freq_data)
+{
+ int ret = 0;
+ struct timespec ckt = current_kernel_time();
+
+ u64 cur_time = timespec_to_ns(&ckt);
+ u64 last_read_ns =
+ (cur_time - timespec_to_ns(&freq_data->last_read_time));
+ u64 last_write_ns =
+ (cur_time - timespec_to_ns(&freq_data->last_write_time));
+ u64 kick_ns = TIME_TO_KICK * NSEC_PER_SEC;
+
+ if ((last_read_ns > kick_ns) && (last_write_ns > kick_ns))
+ ret = 1;
+
+ return ret;
+}
+
/*
* Calculate a new temperature and, if necessary,
* move the list_head corresponding to this inode or range
@@ -463,6 +481,44 @@ static void hot_map_array_update(struct hot_freq_data *freq_data,
}
}

+/* Update temperatures for each range item for aging purposes */
+static void hot_range_update(struct hot_inode_item *he,
+ struct hot_info *root)
+{
+ struct hot_range_item *hr_nodes[8];
+ u32 start = 0;
+ bool obsolete;
+ int i, n;
+
+ while (1) {
+ spin_lock(&he->lock);
+ n = radix_tree_gang_lookup(&he->hot_range_tree,
+ (void **)hr_nodes, start,
+ ARRAY_SIZE(hr_nodes));
+ if (!n) {
+ spin_unlock(&he->lock);
+ break;
+ }
+ spin_unlock(&he->lock);
+
+ start = hr_nodes[n - 1]->start + 1;
+ for (i = 0; i < n; i++) {
+ kref_get(&hr_nodes[i]->hot_range.refs);
+ hot_map_array_update(
+ &hr_nodes[i]->hot_range.hot_freq_data, root);
+
+ spin_lock(&hr_nodes[i]->hot_range.lock);
+ obsolete = hot_is_obsolete(
+ &hr_nodes[i]->hot_range.hot_freq_data);
+ spin_unlock(&hr_nodes[i]->hot_range.lock);
+
+ hot_range_item_put(hr_nodes[i]);
+ if (obsolete)
+ hot_range_item_put(hr_nodes[i]);
+ }
+ }
+}
+
/*
* Initialize inode and range map arrays.
*/
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index 196b894..f5ec05a 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -26,6 +26,12 @@

#define FREQ_POWER 4

+/*
+ * time to quit keeping track of
+ * tracking data (seconds)
+ */
+#define TIME_TO_KICK 300
+
/* NRR/NRW heat unit = 2^X accesses */
#define NRR_MULTIPLIER_POWER 20 /* NRR - number of reads since mount */
#define NRR_COEFF_POWER 0
--
1.7.6.5


2012-10-29 04:30:56

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 14/19] vfs: add debugfs support

From: Zhi Yong Wu <[email protected]>

Add a /sys/kernel/debug/hot_track/<device_name>/ directory for each
volume that contains four files. `rt_stats_inode' and `rt_stats_range'
contain the heat information for the inodes and subfile ranges that have
been brought into the hot data map structures. `hot_spots_inode' and
`hot_spots_range' list the same items, walked from the hottest heat map
bucket down to the coldest.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/hot_tracking.c | 484 ++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 5 +
include/linux/hot_tracking.h | 1 +
3 files changed, 490 insertions(+), 0 deletions(-)

diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 54a8208..376d7fb 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -21,6 +21,7 @@
#include <linux/blkdev.h>
#include <linux/types.h>
#include <linux/list_sort.h>
+#include <linux/debugfs.h>
#include <linux/limits.h>
#include "hot_tracking.h"

@@ -628,6 +629,477 @@ static void hot_update_worker(struct work_struct *work)
}

/*
+ * Walk the inode's hot_range_tree from index @start and return the
+ * selected range item with a reference taken, or NULL if none is found
+ */
+static struct hot_range_item
+*hot_range_tree_walk(struct hot_inode_item *he,
+ loff_t *pos, u32 start, bool flag)
+{
+ struct hot_range_item *hr_nodes[8];
+ loff_t l = *pos;
+ int i, n;
+
+ /* Walk the hot_range_tree for inode */
+ while (1) {
+ spin_lock(&he->lock);
+ n = radix_tree_gang_lookup(&he->hot_range_tree,
+ (void **)hr_nodes, start,
+ ARRAY_SIZE(hr_nodes));
+ if (!n) {
+ spin_unlock(&he->lock);
+ break;
+ }
+ spin_unlock(&he->lock);
+
+ start = hr_nodes[n - 1]->start + 1;
+ for (i = 0; i < n; i++) {
+ if ((!flag && !l--) || (flag)) {
+ if (flag)
+ (*pos)++;
+ kref_get(&hr_nodes[i]->hot_range.refs);
+ return hr_nodes[i];
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void
+*hot_inode_tree_walk(struct seq_file *seq, loff_t *pos,
+ u64 ino, bool type, bool flag)
+{
+ struct hot_info *root = seq->private;
+ struct hot_inode_item *hi_nodes[8];
+ struct hot_range_item *hr;
+ loff_t l = *pos;
+ int i, n;
+
+ while (1) {
+ spin_lock(&root->lock);
+ n = radix_tree_gang_lookup(&root->hot_inode_tree,
+ (void **)hi_nodes, ino,
+ ARRAY_SIZE(hi_nodes));
+ if (!n) {
+ spin_unlock(&root->lock);
+ break;
+ }
+ spin_unlock(&root->lock);
+
+ ino = hi_nodes[n - 1]->i_ino + 1;
+ for (i = 0; i < n; i++) {
+ if (!type) {
+ hr = hot_range_tree_walk(hi_nodes[i],
+ pos, 0, flag);
+ if (hr)
+ return hr;
+ } else {
+ if ((!flag && !l--) || (flag)) {
+ if (flag)
+ (*pos)++;
+ kref_get(&hi_nodes[i]->hot_inode.refs);
+ return hi_nodes[i];
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void *hot_range_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return hot_inode_tree_walk(seq, pos, 0, false, false);
+}
+
+static void *hot_range_seq_next(struct seq_file *seq,
+ void *v, loff_t *pos)
+{
+ struct hot_range_item *hr_next, *hr = v;
+ u32 start = hr->start + 1;
+
+ /* Walk the hot_range_tree for inode */
+ hr_next = hot_range_tree_walk(hr->hot_inode, pos, start, true);
+ if (hr_next)
+ return hr_next;
+
+ return hot_inode_tree_walk(seq, pos,
+ hr->hot_inode->i_ino + 1, false, true);
+}
+
+static void hot_range_seq_stop(struct seq_file *seq, void *v)
+{
+ struct hot_range_item *hr = v;
+
+ if (hr)
+ hot_range_item_put(hr);
+}
+
+static int hot_range_seq_show(struct seq_file *seq, void *v)
+{
+ struct hot_range_item *hr = v;
+ struct hot_inode_item *he = hr->hot_inode;
+ struct hot_freq_data *freq_data = &hr->hot_range.hot_freq_data;
+
+ /* Always lock hot_inode_item first */
+ spin_lock(&he->hot_inode.lock);
+ spin_lock(&hr->hot_range.lock);
+ seq_printf(seq, "inode #%llu, range start " \
+ "%llu (range len %u) reads %u, writes %u, "
+ "avg read time %llu, avg write time %llu, temp %u\n",
+ he->i_ino,
+ (u64)hr->start * RANGE_SIZE,
+ hr->len,
+ freq_data->nr_reads,
+ freq_data->nr_writes,
+ freq_data->avg_delta_reads / NSEC_PER_MSEC,
+ freq_data->avg_delta_writes / NSEC_PER_MSEC,
+ freq_data->last_temp >> (32 - HEAT_MAP_BITS));
+ spin_unlock(&hr->hot_range.lock);
+ spin_unlock(&he->hot_inode.lock);
+
+ return 0;
+}
+
+static void *hot_inode_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return hot_inode_tree_walk(seq, pos, 0, true, false);
+}
+
+static void *hot_inode_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct hot_inode_item *he = v;
+ u64 ino = he->i_ino + 1;
+
+ hot_inode_item_put(he);
+
+ return hot_inode_tree_walk(seq, pos, ino, true, true);
+}
+
+static void hot_inode_seq_stop(struct seq_file *seq, void *v)
+{
+ struct hot_inode_item *he = v;
+
+ if (he)
+ hot_inode_item_put(he);
+}
+
+static int hot_inode_seq_show(struct seq_file *seq, void *v)
+{
+ struct hot_inode_item *he = v;
+ struct hot_freq_data *freq_data = &he->hot_inode.hot_freq_data;
+
+ spin_lock(&he->hot_inode.lock);
+ seq_printf(seq, "inode #%llu, reads %u, writes %u, " \
+ "avg read time %llu, avg write time %llu, temp %u\n",
+ he->i_ino,
+ freq_data->nr_reads,
+ freq_data->nr_writes,
+ freq_data->avg_delta_reads / NSEC_PER_MSEC,
+ freq_data->avg_delta_writes / NSEC_PER_MSEC,
+ freq_data->last_temp >> (32 - HEAT_MAP_BITS));
+ spin_unlock(&he->hot_inode.lock);
+
+ return 0;
+}
+
+static void *hot_spot_range_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct hot_info *root = seq->private;
+ struct hot_range_item *hr;
+ struct hot_comm_item *comm_item;
+ struct list_head *n_list;
+ int i;
+
+ for (i = HEAT_MAP_SIZE - 1; i >= 0; i--) {
+ n_list = seq_list_start(
+ &root->heat_range_map[i].node_list, *pos);
+ if (n_list) {
+ comm_item = container_of(n_list,
+ struct hot_comm_item, n_list);
+ hr = container_of(comm_item,
+ struct hot_range_item, hot_range);
+ kref_get(&hr->hot_range.refs);
+ return hr;
+ }
+ }
+
+ return NULL;
+}
+
+static void *hot_spot_range_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct hot_info *root = seq->private;
+ struct hot_range_item *hr_next, *hr = v;
+ struct hot_comm_item *comm_item;
+ struct list_head *n_list;
+ int i =
+ hr->hot_range.hot_freq_data.last_temp >> (32 - HEAT_MAP_BITS);
+
+ n_list = seq_list_next(&hr->hot_range.n_list,
+ &root->heat_range_map[i].node_list, pos);
+ hot_range_item_put(hr);
+next:
+ if (n_list) {
+ comm_item = container_of(n_list,
+ struct hot_comm_item, n_list);
+ hr_next = container_of(comm_item,
+ struct hot_range_item, hot_range);
+ kref_get(&hr_next->hot_range.refs);
+ return hr_next;
+ } else if (--i >= 0) {
+ n_list = seq_list_next(&root->heat_range_map[i].node_list,
+ &root->heat_range_map[i].node_list, pos);
+ goto next;
+ }
+
+ return NULL;
+}
+
+static void *hot_spot_inode_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct hot_info *root = seq->private;
+ struct hot_inode_item *he;
+ struct hot_comm_item *comm_item;
+ struct list_head *n_list;
+ int i;
+
+ for (i = HEAT_MAP_SIZE - 1; i >= 0; i--) {
+ n_list = seq_list_start(
+ &root->heat_inode_map[i].node_list, *pos);
+ if (n_list) {
+ comm_item = container_of(n_list,
+ struct hot_comm_item, n_list);
+ he = container_of(comm_item,
+ struct hot_inode_item, hot_inode);
+ kref_get(&he->hot_inode.refs);
+ return he;
+ }
+ }
+
+ return NULL;
+}
+
+static void *hot_spot_inode_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct hot_info *root = seq->private;
+ struct hot_inode_item *he_next, *he = v;
+ struct hot_comm_item *comm_item;
+ struct list_head *n_list;
+ int i =
+ he->hot_inode.hot_freq_data.last_temp >> (32 - HEAT_MAP_BITS);
+
+ n_list = seq_list_next(&he->hot_inode.n_list,
+ &root->heat_inode_map[i].node_list, pos);
+ hot_inode_item_put(he);
+next:
+ if (n_list) {
+ comm_item = container_of(n_list,
+ struct hot_comm_item, n_list);
+ he_next = container_of(comm_item,
+ struct hot_inode_item, hot_inode);
+ kref_get(&he_next->hot_inode.refs);
+ return he_next;
+ } else if (--i >= 0) {
+ n_list = seq_list_next(&root->heat_inode_map[i].node_list,
+ &root->heat_inode_map[i].node_list, pos);
+ goto next;
+ }
+
+ return NULL;
+}
+
+static const struct seq_operations hot_range_seq_ops = {
+ .start = hot_range_seq_start,
+ .next = hot_range_seq_next,
+ .stop = hot_range_seq_stop,
+ .show = hot_range_seq_show
+};
+
+static const struct seq_operations hot_inode_seq_ops = {
+ .start = hot_inode_seq_start,
+ .next = hot_inode_seq_next,
+ .stop = hot_inode_seq_stop,
+ .show = hot_inode_seq_show
+};
+
+static const struct seq_operations hot_spot_range_seq_ops = {
+ .start = hot_spot_range_seq_start,
+ .next = hot_spot_range_seq_next,
+ .stop = hot_range_seq_stop,
+ .show = hot_range_seq_show
+};
+
+static const struct seq_operations hot_spot_inode_seq_ops = {
+ .start = hot_spot_inode_seq_start,
+ .next = hot_spot_inode_seq_next,
+ .stop = hot_inode_seq_stop,
+ .show = hot_inode_seq_show
+};
+
+static int hot_range_seq_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open_private(file, &hot_range_seq_ops, 0);
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ seq->private = inode->i_private;
+ }
+ return ret;
+}
+
+static int hot_inode_seq_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open_private(file, &hot_inode_seq_ops, 0);
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ seq->private = inode->i_private;
+ }
+ return ret;
+}
+
+static int hot_spot_range_seq_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open_private(file, &hot_spot_range_seq_ops, 0);
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ seq->private = inode->i_private;
+ }
+ return ret;
+}
+
+static int hot_spot_inode_seq_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open_private(file, &hot_spot_inode_seq_ops, 0);
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ seq->private = inode->i_private;
+ }
+ return ret;
+}
+
+/* fops to override for printing range data */
+static const struct file_operations hot_debugfs_range_fops = {
+ .open = hot_range_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* fops to override for printing inode data */
+static const struct file_operations hot_debugfs_inode_fops = {
+ .open = hot_inode_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/* fops to override for printing temperature data */
+static const struct file_operations hot_debugfs_spot_range_fops = {
+ .open = hot_spot_range_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations hot_debugfs_spot_inode_fops = {
+ .open = hot_spot_inode_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct hot_debugfs hot_debugfs[] = {
+ {
+ .name = "rt_stats_range",
+ .fops = &hot_debugfs_range_fops,
+ },
+ {
+ .name = "rt_stats_inode",
+ .fops = &hot_debugfs_inode_fops,
+ },
+ {
+ .name = "hot_spots_range",
+ .fops = &hot_debugfs_spot_range_fops,
+ },
+ {
+ .name = "hot_spots_inode",
+ .fops = &hot_debugfs_spot_inode_fops,
+ },
+};
+
+/* initialize debugfs */
+static int hot_debugfs_init(struct super_block *sb)
+{
+ static const char hot_name[] = "hot_track";
+ struct dentry *vol_dentry, *dentry;
+ int i, ret = 0;
+
+ /* Determine if hot debufs root has existed */
+ sb->s_hot_root->debugfs_root =
+ debugfs_get_dentry(hot_name, NULL, strlen(hot_name));
+ if (IS_ERR_OR_NULL(sb->s_hot_root->debugfs_root)
+ || !sb->s_hot_root->debugfs_root->d_inode) {
+ sb->s_hot_root->debugfs_root =
+ debugfs_create_dir(hot_name, NULL);
+ if (IS_ERR(sb->s_hot_root->debugfs_root)) {
+ ret = PTR_ERR(sb->s_hot_root->debugfs_root);
+ return ret;
+ }
+ }
+
+ if (!S_ISDIR(sb->s_hot_root->debugfs_root->d_inode->i_mode))
+ return -ENOTDIR;
+
+ /* create debugfs folder for this volume by mounted dev name */
+ vol_dentry = debugfs_create_dir(sb->s_id, sb->s_hot_root->debugfs_root);
+ if (IS_ERR(vol_dentry)) {
+ ret = PTR_ERR(vol_dentry);
+ goto err;
+ }
+
+ /* create debugfs hot data files */
+ for (i = 0; i < ARRAY_SIZE(hot_debugfs); i++) {
+ dentry = debugfs_create_file(hot_debugfs[i].name,
+ S_IFREG | S_IRUSR | S_IWUSR,
+ vol_dentry,
+ sb->s_hot_root,
+ hot_debugfs[i].fops);
+ if (IS_ERR(dentry)) {
+ ret = PTR_ERR(dentry);
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ debugfs_remove_recursive(vol_dentry);
+
+ if (list_empty(&sb->s_hot_root->debugfs_root->d_subdirs))
+ debugfs_remove(sb->s_hot_root->debugfs_root);
+
+ return ret;
+}
+
+/* remove dentries for debugfs */
+static void hot_debugfs_exit(struct super_block *sb)
+{
+ struct dentry *vol_dentry;
+
+ vol_dentry = debugfs_get_dentry(sb->s_id,
+ sb->s_hot_root->debugfs_root, strlen(sb->s_id));
+ /* remove all debugfs entries recursively from the volume root */
+ if (vol_dentry)
+ debugfs_remove_recursive(vol_dentry);
+ else
+ BUG_ON(1);
+
+ if (list_empty(&sb->s_hot_root->debugfs_root->d_subdirs))
+ debugfs_remove(sb->s_hot_root->debugfs_root);
+}
+
+/*
* Initialize kmem cache for hot_inode_item and hot_range_item.
*/
void __init hot_cache_init(void)
@@ -851,10 +1323,21 @@ int hot_track_init(struct super_block *sb)
root->hot_shrink.seeks = DEFAULT_SEEKS;
register_shrinker(&root->hot_shrink);

+ ret = hot_debugfs_init(sb);
+ if (ret) {
+ printk(KERN_ERR "%s: hot_debugfs_init error: %d\n",
+ __func__, ret);
+ goto failed_debugfs;
+ }
+
printk(KERN_INFO "VFS: Turning on hot data tracking\n");

return 0;

+failed_debugfs:
+ unregister_shrinker(&root->hot_shrink);
+ cancel_delayed_work_sync(&root->update_work);
+ destroy_workqueue(root->update_wq);
failed_wq:
hot_map_array_exit(root);
hot_inode_tree_exit(root);
@@ -872,6 +1355,7 @@ void hot_track_exit(struct super_block *sb)
destroy_workqueue(root->update_wq);
hot_map_array_exit(root);
hot_inode_tree_exit(root);
+ hot_debugfs_exit(sb);
kfree(root);
}
EXPORT_SYMBOL_GPL(hot_track_exit);
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index 92e31fb..f5ba2d6 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -56,6 +56,11 @@
#define AVW_DIVIDER_POWER 40 /* AVW - average delta between recent writes(ns) */
#define AVW_COEFF_POWER 0

+struct hot_debugfs { /* describes one debugfs file created per tracked volume */
+ const char *name; /* file name passed to debugfs_create_file() */
+ const struct file_operations *fops; /* file operations installed for that file */
+};
+
void hot_inode_item_put(struct hot_inode_item *he);
struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);

diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index b3ed251..a16217f 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -119,6 +119,7 @@ struct hot_info {
struct delayed_work update_work;
struct hot_func_type *hot_func_type;
struct shrinker hot_shrink;
+ struct dentry *debugfs_root;
};

/*
--
1.7.6.5

2012-10-29 04:32:08

by Zhi Yong Wu

[permalink] [raw]
Subject: [RFC v4+ hot_track 05/19] vfs: add hooks to enable hot tracking

From: Zhi Yong Wu <[email protected]>

Add hot_update_freqs() hooks to the direct I/O, page read, readahead
and writeback paths so that I/O through them is seen by hot data tracking.

Signed-off-by: Zhi Yong Wu <[email protected]>
---
fs/direct-io.c | 6 ++++++
mm/filemap.c | 6 ++++++
mm/page-writeback.c | 12 ++++++++++++
mm/readahead.c | 6 ++++++
4 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720..1d23631 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include "hot_tracking.h"

/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -1297,6 +1298,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);

+ /* Hot data tracking */
+ hot_update_freqs(inode, (u64)offset,
+ (u64)iov_length(iov, nr_segs),
+ rw & WRITE);
+
return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
nr_segs, get_block, end_io,
submit_io, flags);
diff --git a/mm/filemap.c b/mm/filemap.c
index 83efee7..51b2c48 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/hot_tracking.h>
#include "internal.h"

/*
@@ -1224,6 +1225,11 @@ readpage:
* PG_error will be set again if readpage fails.
*/
ClearPageError(page);
+
+ /* Hot data tracking */
+ hot_update_freqs(inode, (u64)page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b..5220040 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,7 @@
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
+#include <linux/hot_tracking.h>
#include <trace/events/writeback.h>

/*
@@ -1903,13 +1904,24 @@ EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
+ u64 start = 0; /* byte offset: must be 64-bit, a pgoff_t truncates on 32-bit */
+ u64 count = 0; /* page budget (nr_to_write) on entry */

if (wbc->nr_to_write <= 0)
return 0;
+ /* Snapshot writeback_index (as a byte offset) and the page budget up front */
+ start = (u64)mapping->writeback_index << PAGE_CACHE_SHIFT; /* widen before shifting */
+ count = (u64)wbc->nr_to_write;
+
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
+
+ /* Hot data tracking: length is the consumed part of the page budget, in bytes */
+ hot_update_freqs(mapping->host, start,
+ (count - (u64)wbc->nr_to_write) * PAGE_CACHE_SIZE, 1);
+
return ret;
}

diff --git a/mm/readahead.c b/mm/readahead.c
index 7963f23..8a24f1e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
+#include <linux/hot_tracking.h>

/*
* Initialise a struct file's readahead state. Assumes that the caller has
@@ -138,6 +139,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
out:
blk_finish_plug(&plug);

+ /* Hot data tracking */
+ hot_update_freqs(mapping->host, (u64)(list_entry(pages->prev,\
+ struct page, lru)->index) << PAGE_CACHE_SHIFT,
+ (u64)nr_pages * PAGE_CACHE_SIZE, 0);
+
return ret;
}

--
1.7.6.5


2012-10-29 10:30:19

by Andi Kleen

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 00/19] vfs: hot data tracking

[email protected] writes:
>
> TODO List:
>
> 1.) Need to do scalability or performance tests.

You're changing some of the most performance critical code in the
kernel. This step is absolutely not optional.

-Andi

--
[email protected] -- Speaking for myself only

2012-10-29 12:31:47

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 00/19] vfs: hot data tracking

On Mon, Oct 29, 2012 at 6:30 PM, Andi Kleen <[email protected]> wrote:
> [email protected] writes:
>>
>> TODO List:
>>
>> 1.) Need to do scalability or performance tests.
>
> You're changing some of the most performance critical code in the
> kernel. This step is absolutely not optional.
Ah, I know, but first I need to make sure all the code is correct;
then I will do these tests.

>
> -Andi
>
> --
> [email protected] -- Speaking for myself only



--
Regards,

Zhi Yong Wu

2012-10-29 18:10:40

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 15/19] sysfs: add two hot_track proc files

On Mon, Oct 29, 2012 at 12:30:57PM +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> Add two proc files hot-kick-time and hot-update-delay
> under the dir /proc/sys/fs/ in order to turn
> TIME_TO_KICK and HEAT_UPDATE_DELAY into be tunable.

As you say, these are proc files, not sysfs files, so please fix the
Subject: up here.

thanks,

greg k-h

2012-10-29 18:11:35

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 13/19] debugfs: introduce one function

On Mon, Oct 29, 2012 at 12:30:55PM +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> The debugfs function is used to get expected dentry.

Huh? Why do you need this? Why haven't you added documentation for the
function saying what it does?

confused,

greg k-h

2012-10-29 22:25:50

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 13/19] debugfs: introduce one function

On Tue, Oct 30, 2012 at 2:11 AM, Greg KH <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:55PM +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> The debugfs function is used to get expected dentry.
>
> Huh? Why do you need this? Why haven't you added documentation for the
It is used to determine if one sysfs directory has been created. OK, i
will add some doc, thanks for your suggestion.

> function saying what it does?
>
> confused,
>
> greg k-h



--
Regards,

Zhi Yong Wu

2012-10-29 22:26:40

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 15/19] sysfs: add two hot_track proc files

On Tue, Oct 30, 2012 at 2:10 AM, Greg KH <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:57PM +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> Add two proc files hot-kick-time and hot-update-delay
>> under the dir /proc/sys/fs/ in order to turn
>> TIME_TO_KICK and HEAT_UPDATE_DELAY into be tunable.
>
> As you say, these are proc files, not sysfs files, so please fix the
> Subject: up here.
Ah, OK, I will fix it; thanks for pointing it out.
>
> thanks,
>
> greg k-h



--
Regards,

Zhi Yong Wu

2012-10-29 22:34:04

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 13/19] debugfs: introduce one function

On Tue, Oct 30, 2012 at 06:25:50AM +0800, Zhi Yong Wu wrote:
> On Tue, Oct 30, 2012 at 2:11 AM, Greg KH <[email protected]> wrote:
> > On Mon, Oct 29, 2012 at 12:30:55PM +0800, [email protected] wrote:
> >> From: Zhi Yong Wu <[email protected]>
> >>
> >> The debugfs function is used to get expected dentry.
> >
> > Huh? Why do you need this? Why haven't you added documentation for the
> It is used to determine if one sysfs directory has been created. OK, i
> will add some doc, thanks for your suggestion.

You didn't answer the "why" part here. How come you think you need
this? Can't you just save off the dentry you created somewhere so you
don't need to look it up again?

greg k-h

2012-10-29 22:45:21

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 13/19] debugfs: introduce one function

On Tue, Oct 30, 2012 at 6:34 AM, Greg KH <[email protected]> wrote:
> On Tue, Oct 30, 2012 at 06:25:50AM +0800, Zhi Yong Wu wrote:
>> On Tue, Oct 30, 2012 at 2:11 AM, Greg KH <[email protected]> wrote:
>> > On Mon, Oct 29, 2012 at 12:30:55PM +0800, [email protected] wrote:
>> >> From: Zhi Yong Wu <[email protected]>
>> >>
>> >> The debugfs function is used to get expected dentry.
>> >
>> > Huh? Why do you need this? Why haven't you added documentation for the
>> It is used to determine if one sysfs directory has been created. OK, i
>> will add some doc, thanks for your suggestion.
>
> You didn't answer the "why" part here. How come you think you need
ah, Let me say its scenario at first. If we do two mount ops as below:
1.) mount -o loop,hot_track image1 /data1
2.) mount -o loop,hot_track image2 /data2

The mount -o hot_track operation will automatically create one sysfs
directory /sys/kernel/debug/hot_track. To prevent this dir being
created again when 2.) is done, we need to know if it has existed at
first. In my patch, i at first get its dentry by this new function,
then determine if its d_inode field is NULL, if no, it means that this
sysfs dir has existed.
This is the reason that i want to add one new function.

> this? Can't you just save off the dentry you created somewhere so you
> don't need to look it up again?
Because i can't find one appropriate place to save it.
>
> greg k-h



--
Regards,

Zhi Yong Wu

2012-10-29 22:54:21

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 13/19] debugfs: introduce one function

On Tue, Oct 30, 2012 at 06:45:19AM +0800, Zhi Yong Wu wrote:
> On Tue, Oct 30, 2012 at 6:34 AM, Greg KH <[email protected]> wrote:
> > On Tue, Oct 30, 2012 at 06:25:50AM +0800, Zhi Yong Wu wrote:
> >> On Tue, Oct 30, 2012 at 2:11 AM, Greg KH <[email protected]> wrote:
> >> > On Mon, Oct 29, 2012 at 12:30:55PM +0800, [email protected] wrote:
> >> >> From: Zhi Yong Wu <[email protected]>
> >> >>
> >> >> The debugfs function is used to get expected dentry.
> >> >
> >> > Huh? Why do you need this? Why haven't you added documentation for the
> >> It is used to determine if one sysfs directory has been created. OK, i
> >> will add some doc, thanks for your suggestion.
> >
> > You didn't answer the "why" part here. How come you think you need
> ah, Let me say its scenario at first. If we do two mount ops as below:
> 1.) mount -o loop,hot_track image1 /data1
> 2.) mount -o loop,hot_track image2 /data2
>
> The mount -o hot_track operation will automatically create one sysfs
> directory /sys/kernel/debug/hot_track. To prevent this dir being
> created again when 2.) is done, we need to know if it has existed at
> first. In my patch, i at first get its dentry by this new function,
> then determine if its d_inode field is NULL, if no, it means that this
> sysfs dir has existed.
> This is the reason that i want to add one new function.

Why not do like the rest of the kernel does and just have a:
static dentry *hot_track_root;
and use that as your root debugfs directory dentry:

if (!hot_track_root) {
/* Create root directory */
hot_track_root = debugfs_create(...);
}

No need to look anything up :)

thanks,

greg k-h

2012-10-29 22:58:13

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 13/19] debugfs: introduce one function

On Tue, Oct 30, 2012 at 6:54 AM, Greg KH <[email protected]> wrote:
> On Tue, Oct 30, 2012 at 06:45:19AM +0800, Zhi Yong Wu wrote:
>> On Tue, Oct 30, 2012 at 6:34 AM, Greg KH <[email protected]> wrote:
>> > On Tue, Oct 30, 2012 at 06:25:50AM +0800, Zhi Yong Wu wrote:
>> >> On Tue, Oct 30, 2012 at 2:11 AM, Greg KH <[email protected]> wrote:
>> >> > On Mon, Oct 29, 2012 at 12:30:55PM +0800, [email protected] wrote:
>> >> >> From: Zhi Yong Wu <[email protected]>
>> >> >>
>> >> >> The debugfs function is used to get expected dentry.
>> >> >
>> >> > Huh? Why do you need this? Why haven't you added documentation for the
>> >> It is used to determine if one sysfs directory has been created. OK, i
>> >> will add some doc, thanks for your suggestion.
>> >
>> > You didn't answer the "why" part here. How come you think you need
>> ah, Let me say its scenario at first. If we do two mount ops as below:
>> 1.) mount -o loop,hot_track image1 /data1
>> 2.) mount -o loop,hot_track image2 /data2
>>
>> The mount -o hot_track operation will automatically create one sysfs
>> directory /sys/kernel/debug/hot_track. To prevent this dir being
>> created again when 2.) is done, we need to know if it has existed at
>> first. In my patch, i at first get its dentry by this new function,
>> then determine if its d_inode field is NULL, if no, it means that this
>> sysfs dir has existed.
>> This is the reason that i want to add one new function.
>
> Why not do like the rest of the kernel does and just have a:
> static dentry *hot_track_root;
> and use that as your root debugfs directory dentry:
Ah, I'm a newbie and am not familiar with other parts of the kernel, but
this is a good point; I will apply it, thanks.
>
> if (!hot_track_root) {
> /* Create root directory */
> hot_track_root = debugfs_create(...);
> }
>
> No need to look anything up :)
>
> thanks,
>
> greg k-h



--
Regards,

Zhi Yong Wu

2012-11-05 11:07:53

by Steven Whitehouse

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

Hi,

On Mon, 2012-10-29 at 12:30 +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> Add some util helpers to update access frequencies
> for one file or its range.
>
> Signed-off-by: Zhi Yong Wu <[email protected]>
> ---
> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
> fs/hot_tracking.h | 7 ++
> include/linux/hot_tracking.h | 2 +
> 3 files changed, 188 insertions(+), 0 deletions(-)
>
> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> index 68591f0..0a7d9a3 100644
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
> }
> }
>
> +struct hot_inode_item
> +*hot_inode_item_find(struct hot_info *root, u64 ino)
> +{
> + struct hot_inode_item *he;
> + int ret;
> +
> +again:
> + spin_lock(&root->lock);
> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
> + if (he) {
> + kref_get(&he->hot_inode.refs);
> + spin_unlock(&root->lock);
> + return he;
> + }
> + spin_unlock(&root->lock);
> +
> + he = kmem_cache_zalloc(hot_inode_item_cachep,
> + GFP_KERNEL | GFP_NOFS);
This doesn't look quite right... which of these two did you mean? I
assume probably just GFP_NOFS

> + if (!he)
> + return ERR_PTR(-ENOMEM);
> +
> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
> +
> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> + if (ret) {
> + kmem_cache_free(hot_inode_item_cachep, he);
> + return ERR_PTR(ret);
> + }
> +
> + spin_lock(&root->lock);
> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
> + if (ret == -EEXIST) {
> + kmem_cache_free(hot_inode_item_cachep, he);
> + spin_unlock(&root->lock);
> + radix_tree_preload_end();
> + goto again;
> + }
> + spin_unlock(&root->lock);
> + radix_tree_preload_end();
> +
> + kref_get(&he->hot_inode.refs);
> + return he;
> +}
> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
> +
> +static struct hot_range_item
> +*hot_range_item_find(struct hot_inode_item *he,
> + u32 start)
> +{
> + struct hot_range_item *hr;
> + int ret;
> +
> +again:
> + spin_lock(&he->lock);
> + hr = radix_tree_lookup(&he->hot_range_tree, start);
> + if (hr) {
> + kref_get(&hr->hot_range.refs);
> + spin_unlock(&he->lock);
> + return hr;
> + }
> + spin_unlock(&he->lock);
> +
> + hr = kmem_cache_zalloc(hot_range_item_cachep,
> + GFP_KERNEL | GFP_NOFS);
Likewise, here too.

Steve.




2012-11-05 11:26:56

by Steven Whitehouse

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 09/19] vfs: add one work queue

Hi,

On Mon, 2012-10-29 at 12:30 +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> Add a per-superblock workqueue and a delayed_work
> to run periodic work to update map info on each superblock.
>
> Signed-off-by: Zhi Yong Wu <[email protected]>
> ---
> fs/hot_tracking.c | 85 ++++++++++++++++++++++++++++++++++++++++++
> fs/hot_tracking.h | 3 +
> include/linux/hot_tracking.h | 3 +
> 3 files changed, 91 insertions(+), 0 deletions(-)
>
> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> index fff0038..0ef9cad 100644
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -15,9 +15,12 @@
> #include <linux/module.h>
> #include <linux/spinlock.h>
> #include <linux/hardirq.h>
> +#include <linux/kthread.h>
> +#include <linux/freezer.h>
> #include <linux/fs.h>
> #include <linux/blkdev.h>
> #include <linux/types.h>
> +#include <linux/list_sort.h>
> #include <linux/limits.h>
> #include "hot_tracking.h"
>
> @@ -557,6 +560,67 @@ static void hot_map_array_exit(struct hot_info *root)
> }
> }
>
> +/* Temperature compare function*/
> +static int hot_temp_cmp(void *priv, struct list_head *a,
> + struct list_head *b)
> +{
> + struct hot_comm_item *ap =
> + container_of(a, struct hot_comm_item, n_list);
> + struct hot_comm_item *bp =
> + container_of(b, struct hot_comm_item, n_list);
> +
> + int diff = ap->hot_freq_data.last_temp
> + - bp->hot_freq_data.last_temp;
> + if (diff > 0)
> + return -1;
> + if (diff < 0)
> + return 1;
> + return 0;
> +}
> +
> +/*
> + * Every sync period we update temperatures for
> + * each hot inode item and hot range item for aging
> + * purposes.
> + */
> +static void hot_update_worker(struct work_struct *work)
> +{
> + struct hot_info *root = container_of(to_delayed_work(work),
> + struct hot_info, update_work);
> + struct hot_inode_item *hi_nodes[8];
> + u64 ino = 0;
> + int i, n;
> +
> + while (1) {
> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
> + (void **)hi_nodes, ino,
> + ARRAY_SIZE(hi_nodes));
> + if (!n)
> + break;
> +
> + ino = hi_nodes[n - 1]->i_ino + 1;
> + for (i = 0; i < n; i++) {
> + kref_get(&hi_nodes[i]->hot_inode.refs);
> + hot_map_array_update(
> + &hi_nodes[i]->hot_inode.hot_freq_data, root);
> + hot_range_update(hi_nodes[i], root);
> + hot_inode_item_put(hi_nodes[i]);
> + }
> + }
> +
> + /* Sort temperature map info */
> + for (i = 0; i < HEAT_MAP_SIZE; i++) {
> + list_sort(NULL, &root->heat_inode_map[i].node_list,
> + hot_temp_cmp);
> + list_sort(NULL, &root->heat_range_map[i].node_list,
> + hot_temp_cmp);
> + }
> +

If this list can potentially have one (or more) entries per inode, then
filesystems with a lot of inodes (millions) may potentially exceed the
max size of list which list_sort() can handle. If that happens it still
works, but you'll get a warning message and it won't be as efficient.

It is something that we've run into with list_sort() and GFS2, but it
only happens very rarely,

Steve.




2012-11-05 11:47:35

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Mon, Nov 5, 2012 at 7:07 PM, Steven Whitehouse <[email protected]> wrote:
> Hi,
>
> On Mon, 2012-10-29 at 12:30 +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> Add some util helpers to update access frequencies
>> for one file or its range.
>>
>> Signed-off-by: Zhi Yong Wu <[email protected]>
>> ---
>> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
>> fs/hot_tracking.h | 7 ++
>> include/linux/hot_tracking.h | 2 +
>> 3 files changed, 188 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> index 68591f0..0a7d9a3 100644
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
>> }
>> }
>>
>> +struct hot_inode_item
>> +*hot_inode_item_find(struct hot_info *root, u64 ino)
>> +{
>> + struct hot_inode_item *he;
>> + int ret;
>> +
>> +again:
>> + spin_lock(&root->lock);
>> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
>> + if (he) {
>> + kref_get(&he->hot_inode.refs);
>> + spin_unlock(&root->lock);
>> + return he;
>> + }
>> + spin_unlock(&root->lock);
>> +
>> + he = kmem_cache_zalloc(hot_inode_item_cachep,
>> + GFP_KERNEL | GFP_NOFS);
> This doesn't look quite right... which of these two did you mean? I
> assume probably just GFP_NOFS
Yes, good catch, thanks.
>
>> + if (!he)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
>> +
>> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> + if (ret) {
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> + return ERR_PTR(ret);
>> + }
>> +
>> + spin_lock(&root->lock);
>> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
>> + if (ret == -EEXIST) {
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> + spin_unlock(&root->lock);
>> + radix_tree_preload_end();
>> + goto again;
>> + }
>> + spin_unlock(&root->lock);
>> + radix_tree_preload_end();
>> +
>> + kref_get(&he->hot_inode.refs);
>> + return he;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
>> +
>> +static struct hot_range_item
>> +*hot_range_item_find(struct hot_inode_item *he,
>> + u32 start)
>> +{
>> + struct hot_range_item *hr;
>> + int ret;
>> +
>> +again:
>> + spin_lock(&he->lock);
>> + hr = radix_tree_lookup(&he->hot_range_tree, start);
>> + if (hr) {
>> + kref_get(&hr->hot_range.refs);
>> + spin_unlock(&he->lock);
>> + return hr;
>> + }
>> + spin_unlock(&he->lock);
>> +
>> + hr = kmem_cache_zalloc(hot_range_item_cachep,
>> + GFP_KERNEL | GFP_NOFS);
> Likewise, here too.
ditto
>
> Steve.
>
>
>



--
Regards,

Zhi Yong Wu

2012-11-05 11:55:40

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 09/19] vfs: add one work queue

On Mon, Nov 5, 2012 at 7:21 PM, Steven Whitehouse <[email protected]> wrote:
> Hi,
>
> On Mon, 2012-10-29 at 12:30 +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> Add a per-superblock workqueue and a delayed_work
>> to run periodic work to update map info on each superblock.
>>
>> Signed-off-by: Zhi Yong Wu <[email protected]>
>> ---
>> fs/hot_tracking.c | 85 ++++++++++++++++++++++++++++++++++++++++++
>> fs/hot_tracking.h | 3 +
>> include/linux/hot_tracking.h | 3 +
>> 3 files changed, 91 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> index fff0038..0ef9cad 100644
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -15,9 +15,12 @@
>> #include <linux/module.h>
>> #include <linux/spinlock.h>
>> #include <linux/hardirq.h>
>> +#include <linux/kthread.h>
>> +#include <linux/freezer.h>
>> #include <linux/fs.h>
>> #include <linux/blkdev.h>
>> #include <linux/types.h>
>> +#include <linux/list_sort.h>
>> #include <linux/limits.h>
>> #include "hot_tracking.h"
>>
>> @@ -557,6 +560,67 @@ static void hot_map_array_exit(struct hot_info *root)
>> }
>> }
>>
>> +/* Temperature compare function*/
>> +static int hot_temp_cmp(void *priv, struct list_head *a,
>> + struct list_head *b)
>> +{
>> + struct hot_comm_item *ap =
>> + container_of(a, struct hot_comm_item, n_list);
>> + struct hot_comm_item *bp =
>> + container_of(b, struct hot_comm_item, n_list);
>> +
>> + int diff = ap->hot_freq_data.last_temp
>> + - bp->hot_freq_data.last_temp;
>> + if (diff > 0)
>> + return -1;
>> + if (diff < 0)
>> + return 1;
>> + return 0;
>> +}
>> +
>> +/*
>> + * Every sync period we update temperatures for
>> + * each hot inode item and hot range item for aging
>> + * purposes.
>> + */
>> +static void hot_update_worker(struct work_struct *work)
>> +{
>> + struct hot_info *root = container_of(to_delayed_work(work),
>> + struct hot_info, update_work);
>> + struct hot_inode_item *hi_nodes[8];
>> + u64 ino = 0;
>> + int i, n;
>> +
>> + while (1) {
>> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
>> + (void **)hi_nodes, ino,
>> + ARRAY_SIZE(hi_nodes));
>> + if (!n)
>> + break;
>> +
>> + ino = hi_nodes[n - 1]->i_ino + 1;
>> + for (i = 0; i < n; i++) {
>> + kref_get(&hi_nodes[i]->hot_inode.refs);
>> + hot_map_array_update(
>> + &hi_nodes[i]->hot_inode.hot_freq_data, root);
>> + hot_range_update(hi_nodes[i], root);
>> + hot_inode_item_put(hi_nodes[i]);
>> + }
>> + }
>> +
>> + /* Sort temperature map info */
>> + for (i = 0; i < HEAT_MAP_SIZE; i++) {
>> + list_sort(NULL, &root->heat_inode_map[i].node_list,
>> + hot_temp_cmp);
>> + list_sort(NULL, &root->heat_range_map[i].node_list,
>> + hot_temp_cmp);
>> + }
>> +
>
> If this list can potentially have one (or more) entries per inode, then
Only one hot_inode_item per inode, while maybe multiple
hot_range_items per inode.
> filesystems with a lot of inodes (millions) may potentially exceed the
> max size of list which list_sort() can handle. If that happens it still
> works, but you'll get a warning message and it won't be as efficient.
I haven't do so large scale test. If we want to find that issue, we
need to do large scale performance test, before that, i want to make
sure the code change is correct at first.
To be honest, for that issue you pointed to, i also have such
concern.But list_sort() performance looks good from the test result of
the following URL:
https://lkml.org/lkml/2010/1/20/485

>
> It is something that we've run into with list_sort() and GFS2, but it
> only happens very rarely,
Beside list_sort(), do you have any other way to share? For this
concern, how does GFS2 resolve it?

>
> Steve.
>
>
>



--
Regards,

Zhi Yong Wu

2012-11-05 12:08:23

by Steven Whitehouse

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 09/19] vfs: add one work queue

Hi,

On Mon, 2012-11-05 at 19:55 +0800, Zhi Yong Wu wrote:
> On Mon, Nov 5, 2012 at 7:21 PM, Steven Whitehouse <[email protected]> wrote:
> > Hi,
> >
> > On Mon, 2012-10-29 at 12:30 +0800, [email protected] wrote:
> >> From: Zhi Yong Wu <[email protected]>
> >>
> >> Add a per-superblock workqueue and a delayed_work
> >> to run periodic work to update map info on each superblock.
> >>
> >> Signed-off-by: Zhi Yong Wu <[email protected]>
> >> ---
> >> fs/hot_tracking.c | 85 ++++++++++++++++++++++++++++++++++++++++++
> >> fs/hot_tracking.h | 3 +
> >> include/linux/hot_tracking.h | 3 +
> >> 3 files changed, 91 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> >> index fff0038..0ef9cad 100644
> >> --- a/fs/hot_tracking.c
> >> +++ b/fs/hot_tracking.c
> >> @@ -15,9 +15,12 @@
> >> #include <linux/module.h>
> >> #include <linux/spinlock.h>
> >> #include <linux/hardirq.h>
> >> +#include <linux/kthread.h>
> >> +#include <linux/freezer.h>
> >> #include <linux/fs.h>
> >> #include <linux/blkdev.h>
> >> #include <linux/types.h>
> >> +#include <linux/list_sort.h>
> >> #include <linux/limits.h>
> >> #include "hot_tracking.h"
> >>
> >> @@ -557,6 +560,67 @@ static void hot_map_array_exit(struct hot_info *root)
> >> }
> >> }
> >>
> >> +/* Temperature compare function*/
> >> +static int hot_temp_cmp(void *priv, struct list_head *a,
> >> + struct list_head *b)
> >> +{
> >> + struct hot_comm_item *ap =
> >> + container_of(a, struct hot_comm_item, n_list);
> >> + struct hot_comm_item *bp =
> >> + container_of(b, struct hot_comm_item, n_list);
> >> +
> >> + int diff = ap->hot_freq_data.last_temp
> >> + - bp->hot_freq_data.last_temp;
> >> + if (diff > 0)
> >> + return -1;
> >> + if (diff < 0)
> >> + return 1;
> >> + return 0;
> >> +}
> >> +
> >> +/*
> >> + * Every sync period we update temperatures for
> >> + * each hot inode item and hot range item for aging
> >> + * purposes.
> >> + */
> >> +static void hot_update_worker(struct work_struct *work)
> >> +{
> >> + struct hot_info *root = container_of(to_delayed_work(work),
> >> + struct hot_info, update_work);
> >> + struct hot_inode_item *hi_nodes[8];
> >> + u64 ino = 0;
> >> + int i, n;
> >> +
> >> + while (1) {
> >> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
> >> + (void **)hi_nodes, ino,
> >> + ARRAY_SIZE(hi_nodes));
> >> + if (!n)
> >> + break;
> >> +
> >> + ino = hi_nodes[n - 1]->i_ino + 1;
> >> + for (i = 0; i < n; i++) {
> >> + kref_get(&hi_nodes[i]->hot_inode.refs);
> >> + hot_map_array_update(
> >> + &hi_nodes[i]->hot_inode.hot_freq_data, root);
> >> + hot_range_update(hi_nodes[i], root);
> >> + hot_inode_item_put(hi_nodes[i]);
> >> + }
> >> + }
> >> +
> >> + /* Sort temperature map info */
> >> + for (i = 0; i < HEAT_MAP_SIZE; i++) {
> >> + list_sort(NULL, &root->heat_inode_map[i].node_list,
> >> + hot_temp_cmp);
> >> + list_sort(NULL, &root->heat_range_map[i].node_list,
> >> + hot_temp_cmp);
> >> + }
> >> +
> >
> > If this list can potentially have one (or more) entries per inode, then
> Only one hot_inode_item per inode, while maybe multiple
> hot_range_items per inode.
> > filesystems with a lot of inodes (millions) may potentially exceed the
> > max size of list which list_sort() can handle. If that happens it still
> > works, but you'll get a warning message and it won't be as efficient.
> I haven't do so large scale test. If we want to find that issue, we
> need to do large scale performance test, before that, i want to make
> sure the code change is correct at first.
> To be honest, for that issue you pointed to, i also have such
> concern.But list_sort() performance looks good from the test result of
> the following URL:
> https://lkml.org/lkml/2010/1/20/485
>
Yes, I think it is good. Also, even when it says that its performance
is poor (via the warning message) it is still much better than the
alternative (of not sorting) in the GFS2 case. So currently our
workaround is to ignore the warning. Due to what we are using it for
(sorting the data blocks for ordered writeback) we only see it very
occasionally when there has been lots of data write activity with little
journal activity on a node with lots of RAM.

> >
> > It is something that we've run into with list_sort() and GFS2, but it
> > only happens very rarely,
> Beside list_sort(), do you have any other way to share? For this
> concern, how does GFS2 resolve it?
>
That is an ongoing investigation :-)

I've pondered various options... increase temp variable space in
list_sort(), not using list_sort() and insertion sorting the blocks
instead, flushing the ordered write data early if the list gets too
long, figuring out how to remove blocks written back by the VM from the
list before the sort, and various other possible solutions. So far I'm
not sure which will be the best to choose, and since your situation is a
bit different it might not make sense to use the same solution.

I just thought it was worth mentioning though since it was something
that we'd run across,

Steve.



2012-11-05 12:20:29

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 09/19] vfs: add one work queue

On Mon, Nov 5, 2012 at 8:07 PM, Steven Whitehouse <[email protected]> wrote:
> Hi,
>
> On Mon, 2012-11-05 at 19:55 +0800, Zhi Yong Wu wrote:
>> On Mon, Nov 5, 2012 at 7:21 PM, Steven Whitehouse <[email protected]> wrote:
>> > Hi,
>> >
>> > On Mon, 2012-10-29 at 12:30 +0800, [email protected] wrote:
>> >> From: Zhi Yong Wu <[email protected]>
>> >>
>> >> Add a per-superblock workqueue and a delayed_work
>> >> to run periodic work to update map info on each superblock.
>> >>
>> >> Signed-off-by: Zhi Yong Wu <[email protected]>
>> >> ---
>> >> fs/hot_tracking.c | 85 ++++++++++++++++++++++++++++++++++++++++++
>> >> fs/hot_tracking.h | 3 +
>> >> include/linux/hot_tracking.h | 3 +
>> >> 3 files changed, 91 insertions(+), 0 deletions(-)
>> >>
>> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> >> index fff0038..0ef9cad 100644
>> >> --- a/fs/hot_tracking.c
>> >> +++ b/fs/hot_tracking.c
>> >> @@ -15,9 +15,12 @@
>> >> #include <linux/module.h>
>> >> #include <linux/spinlock.h>
>> >> #include <linux/hardirq.h>
>> >> +#include <linux/kthread.h>
>> >> +#include <linux/freezer.h>
>> >> #include <linux/fs.h>
>> >> #include <linux/blkdev.h>
>> >> #include <linux/types.h>
>> >> +#include <linux/list_sort.h>
>> >> #include <linux/limits.h>
>> >> #include "hot_tracking.h"
>> >>
>> >> @@ -557,6 +560,67 @@ static void hot_map_array_exit(struct hot_info *root)
>> >> }
>> >> }
>> >>
>> >> +/* Temperature compare function*/
>> >> +static int hot_temp_cmp(void *priv, struct list_head *a,
>> >> + struct list_head *b)
>> >> +{
>> >> + struct hot_comm_item *ap =
>> >> + container_of(a, struct hot_comm_item, n_list);
>> >> + struct hot_comm_item *bp =
>> >> + container_of(b, struct hot_comm_item, n_list);
>> >> +
>> >> + int diff = ap->hot_freq_data.last_temp
>> >> + - bp->hot_freq_data.last_temp;
>> >> + if (diff > 0)
>> >> + return -1;
>> >> + if (diff < 0)
>> >> + return 1;
>> >> + return 0;
>> >> +}
>> >> +
>> >> +/*
>> >> + * Every sync period we update temperatures for
>> >> + * each hot inode item and hot range item for aging
>> >> + * purposes.
>> >> + */
>> >> +static void hot_update_worker(struct work_struct *work)
>> >> +{
>> >> + struct hot_info *root = container_of(to_delayed_work(work),
>> >> + struct hot_info, update_work);
>> >> + struct hot_inode_item *hi_nodes[8];
>> >> + u64 ino = 0;
>> >> + int i, n;
>> >> +
>> >> + while (1) {
>> >> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
>> >> + (void **)hi_nodes, ino,
>> >> + ARRAY_SIZE(hi_nodes));
>> >> + if (!n)
>> >> + break;
>> >> +
>> >> + ino = hi_nodes[n - 1]->i_ino + 1;
>> >> + for (i = 0; i < n; i++) {
>> >> + kref_get(&hi_nodes[i]->hot_inode.refs);
>> >> + hot_map_array_update(
>> >> + &hi_nodes[i]->hot_inode.hot_freq_data, root);
>> >> + hot_range_update(hi_nodes[i], root);
>> >> + hot_inode_item_put(hi_nodes[i]);
>> >> + }
>> >> + }
>> >> +
>> >> + /* Sort temperature map info */
>> >> + for (i = 0; i < HEAT_MAP_SIZE; i++) {
>> >> + list_sort(NULL, &root->heat_inode_map[i].node_list,
>> >> + hot_temp_cmp);
>> >> + list_sort(NULL, &root->heat_range_map[i].node_list,
>> >> + hot_temp_cmp);
>> >> + }
>> >> +
>> >
>> > If this list can potentially have one (or more) entries per inode, then
>> Only one hot_inode_item per inode, while maybe multiple
>> hot_range_items per inode.
>> > filesystems with a lot of inodes (millions) may potentially exceed the
>> > max size of list which list_sort() can handle. If that happens it still
>> > works, but you'll get a warning message and it won't be as efficient.
>> I haven't do so large scale test. If we want to find that issue, we
>> need to do large scale performance test, before that, i want to make
>> sure the code change is correct at first.
>> To be honest, for that issue you pointed to, i also have such
>> concern.But list_sort() performance looks good from the test result of
>> the following URL:
>> https://lkml.org/lkml/2010/1/20/485
>>
> Yes, I think it is good. Also, even when it says that it's performance
> is poor (via the warning message) it is still much better than the
> alternative (of not sorting) in the GFS2 case. So currently our
> workaround is to ignore the warning. Due to what we using it for
> (sorting the data blocks for ordered writeback) we only see it very
> occasionally when there has been lots of data write activity with little
> journal activity on a node with lots of RAM.
OK.
>
>> >
>> > It is something that we've run into with list_sort() and GFS2, but it
>> > only happens very rarely,
>> Beside list_sort(), do you have any other way to share? For this
>> concern, how does GFS2 resolve it?
>>
> That is an ongoing investigation :-)
>
> I've pondered various options... increase temp variable space in
> list_sort(), not using list_sort() and insertion sorting the blocks
> instead, flushing the ordered write data early if the list gets too
> long, figuring out how to remove blocks written back by the VM from the
> list before the sort, and various other possible solutions. So far I'm
> not sure which will be the best to choose, and since your situation is a
> bit different it might not make sense to use the same solution.
>
> I just thought it was worth mentioning though since it was something
> that we'd run across,
Thanks for sharing your experience. Anyway, thanks.

By the way, it would be appreciated if you could comment on the other patches.
>
> Steve.
>
>



--
Regards,

Zhi Yong Wu

2012-11-06 22:25:18

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 02/19] vfs: initialize and free data structures

On Mon, Oct 29, 2012 at 12:30:44PM +0800, [email protected] wrote:
> +/* Frees the entire hot_range_tree. */
> +static void hot_inode_item_free(struct kref *kref)
> +{
> + struct hot_comm_item *comm_item = container_of(kref,
> + struct hot_comm_item, refs);
> + struct hot_inode_item *he = container_of(comm_item,
> + struct hot_inode_item, hot_inode);
> +
> + hot_range_tree_free(he);
> + radix_tree_delete(he->hot_inode_tree, he->i_ino);

void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)

and he::i_ino is u64, this will not work when
sizeof(unsigned long) != sizeof(u64) (iirc this is a known limitation of
radix tree implementation). This will work on 64bit only, not sure if
this is intentional.

> + kmem_cache_free(hot_inode_item_cachep, he);
> +}
> +
> +/* Frees the entire hot_inode_tree. */
> +static void hot_inode_tree_exit(struct hot_info *root)
> +{
> + struct hot_inode_item *hi_nodes[8];
> + u64 ino = 0;
> + int i, n;

nitpick, put the declarations on separate lines

> +
> + while (1) {
> + spin_lock(&root->lock);
> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
> + (void **)hi_nodes, ino,
> + ARRAY_SIZE(hi_nodes));
> + if (!n) {
> + spin_unlock(&root->lock);
> + break;
> + }
> +
> + ino = hi_nodes[n - 1]->i_ino + 1;
> + for (i = 0; i < n; i++)
> + hot_inode_item_put(hi_nodes[i]);
> + spin_unlock(&root->lock);
> + }
> +}
> +
> /*
> * Initialize kmem cache for hot_inode_item and hot_range_item.
> */
> @@ -106,3 +197,36 @@ err:
> kmem_cache_destroy(hot_inode_item_cachep);
> }
> EXPORT_SYMBOL_GPL(hot_cache_init);
> +
> +/*
> + * Initialize the data structures for hot data tracking.
> + */
> +int hot_track_init(struct super_block *sb)
> +{
> + struct hot_info *root;
> + int ret = -ENOMEM;
> +
> + root = kzalloc(sizeof(struct hot_info), GFP_NOFS);
> + if (!root) {
> + printk(KERN_ERR "%s: Failed to malloc memory for "
> + "hot_info\n", __func__);
> + return ret;

minor: you can drop the variable ret and just return -ENOMEM here

> + }
> +
> + sb->s_hot_root = root;
> + hot_inode_tree_init(root);
> +
> + printk(KERN_INFO "VFS: Turning on hot data tracking\n");
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(hot_track_init);

david

2012-11-06 22:37:04

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Mon, Oct 29, 2012 at 12:30:45PM +0800, [email protected] wrote:
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> +struct hot_inode_item
> +*hot_inode_item_find(struct hot_info *root, u64 ino)
> +{
> + struct hot_inode_item *he;
> + int ret;
> +
> +again:
> + spin_lock(&root->lock);
> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
> + if (he) {
> + kref_get(&he->hot_inode.refs);
> + spin_unlock(&root->lock);
> + return he;
> + }
> + spin_unlock(&root->lock);
> +
> + he = kmem_cache_zalloc(hot_inode_item_cachep,
> + GFP_KERNEL | GFP_NOFS);
> + if (!he)
> + return ERR_PTR(-ENOMEM);
> +
> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
> +
> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> + if (ret) {
> + kmem_cache_free(hot_inode_item_cachep, he);

radix_tree_preload_end()

> + return ERR_PTR(ret);
> + }
> +
> + spin_lock(&root->lock);
> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
> + if (ret == -EEXIST) {
> + kmem_cache_free(hot_inode_item_cachep, he);
> + spin_unlock(&root->lock);
> + radix_tree_preload_end();
> + goto again;
> + }
> + spin_unlock(&root->lock);
> + radix_tree_preload_end();
> +
> + kref_get(&he->hot_inode.refs);
> + return he;
> +}
> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
> +
> +static struct hot_range_item
> +*hot_range_item_find(struct hot_inode_item *he,
> + u32 start)
> +{
> + struct hot_range_item *hr;
> + int ret;
> +
> +again:
> + spin_lock(&he->lock);
> + hr = radix_tree_lookup(&he->hot_range_tree, start);
> + if (hr) {
> + kref_get(&hr->hot_range.refs);
> + spin_unlock(&he->lock);
> + return hr;
> + }
> + spin_unlock(&he->lock);
> +
> + hr = kmem_cache_zalloc(hot_range_item_cachep,
> + GFP_KERNEL | GFP_NOFS);
> + if (!hr)
> + return ERR_PTR(-ENOMEM);
> +
> + hot_range_item_init(hr, start, he);
> +
> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> + if (ret) {
> + kmem_cache_free(hot_range_item_cachep, hr);

radix_tree_preload_end()

> + return ERR_PTR(ret);
> + }
> +
> + spin_lock(&he->lock);
> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
> + if (ret == -EEXIST) {
> + kmem_cache_free(hot_range_item_cachep, hr);
> + spin_unlock(&he->lock);
> + radix_tree_preload_end();
> + goto again;
> + }
> + spin_unlock(&he->lock);
> + radix_tree_preload_end();
> +
> + kref_get(&hr->hot_range.refs);
> + return hr;
> +}

david

2012-11-06 22:45:39

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Mon, Oct 29, 2012 at 12:30:45PM +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> Add some util helpers to update access frequencies
> for one file or its range.
>
> Signed-off-by: Zhi Yong Wu <[email protected]>
> ---
> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
> fs/hot_tracking.h | 7 ++
> include/linux/hot_tracking.h | 2 +
> 3 files changed, 188 insertions(+), 0 deletions(-)
>
> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> index 68591f0..0a7d9a3 100644
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
> }
> }
>
> +struct hot_inode_item
> +*hot_inode_item_find(struct hot_info *root, u64 ino)
> +{
> + struct hot_inode_item *he;
> + int ret;
> +
> +again:
> + spin_lock(&root->lock);
> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
> + if (he) {
> + kref_get(&he->hot_inode.refs);
> + spin_unlock(&root->lock);
> + return he;
> + }
> + spin_unlock(&root->lock);
> +
> + he = kmem_cache_zalloc(hot_inode_item_cachep,
> + GFP_KERNEL | GFP_NOFS);
> + if (!he)
> + return ERR_PTR(-ENOMEM);
> +
> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
> +
> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> + if (ret) {
> + kmem_cache_free(hot_inode_item_cachep, he);
> + return ERR_PTR(ret);
> + }
> +
> + spin_lock(&root->lock);
> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
> + if (ret == -EEXIST) {
> + kmem_cache_free(hot_inode_item_cachep, he);
> + spin_unlock(&root->lock);
> + radix_tree_preload_end();
> + goto again;
> + }
> + spin_unlock(&root->lock);
> + radix_tree_preload_end();
> +
> + kref_get(&he->hot_inode.refs);
> + return he;
> +}
> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
> +
> +static struct hot_range_item
> +*hot_range_item_find(struct hot_inode_item *he,
> + u32 start)
> +{
> + struct hot_range_item *hr;
> + int ret;
> +
> +again:
> + spin_lock(&he->lock);
> + hr = radix_tree_lookup(&he->hot_range_tree, start);
> + if (hr) {
> + kref_get(&hr->hot_range.refs);
> + spin_unlock(&he->lock);
> + return hr;
> + }
> + spin_unlock(&he->lock);
> +
> + hr = kmem_cache_zalloc(hot_range_item_cachep,
> + GFP_KERNEL | GFP_NOFS);
> + if (!hr)
> + return ERR_PTR(-ENOMEM);
> +
> + hot_range_item_init(hr, start, he);
> +
> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> + if (ret) {
> + kmem_cache_free(hot_range_item_cachep, hr);
> + return ERR_PTR(ret);
> + }
> +
> + spin_lock(&he->lock);
> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
> + if (ret == -EEXIST) {
> + kmem_cache_free(hot_range_item_cachep, hr);
> + spin_unlock(&he->lock);
> + radix_tree_preload_end();
> + goto again;
> + }
> + spin_unlock(&he->lock);
> + radix_tree_preload_end();
> +
> + kref_get(&hr->hot_range.refs);
> + return hr;
> +}
> +
> +/*
> + * This function does the actual work of updating
> + * the frequency numbers, whatever they turn out to be.
> + */
> +static u64 hot_average_update(struct timespec old_atime,
> + struct timespec cur_time, u64 old_avg)
> +{
> + struct timespec delta_ts;
> + u64 new_avg;
> + u64 new_delta;
> +
> + delta_ts = timespec_sub(cur_time, old_atime);
> + new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
> +
> + new_avg = (old_avg << FREQ_POWER) - old_avg + new_delta;
> + new_avg = new_avg >> FREQ_POWER;
> +
> + return new_avg;
> +}
> +
> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
> +{
> + struct timespec cur_time = current_kernel_time();
> +
> + if (write) {
> + freq_data->nr_writes += 1;
> + freq_data->avg_delta_writes = hot_average_update(
> + freq_data->last_write_time,
> + cur_time,
> + freq_data->avg_delta_writes);
> + freq_data->last_write_time = cur_time;
> + } else {
> + freq_data->nr_reads += 1;
> + freq_data->avg_delta_reads = hot_average_update(
> + freq_data->last_read_time,
> + cur_time,
> + freq_data->avg_delta_reads);

I think you could just pass in a pointer to
freq_data->avg_delta_{writes,reads} here...

> + freq_data->last_read_time = cur_time;
> + }
> +}
> +
> /*
> * Initialize kmem cache for hot_inode_item and hot_range_item.
> */
> @@ -199,6 +330,54 @@ err:
> EXPORT_SYMBOL_GPL(hot_cache_init);
>
> /*
> + * Main function to update access frequency from read/writepage(s) hooks
> + */
> +void hot_update_freqs(struct inode *inode, u64 start,
> + u64 len, int rw)
> +{
> + struct hot_info *root = inode->i_sb->s_hot_root;
> + struct hot_inode_item *he;
> + struct hot_range_item *hr;
> + u32 cur, end;
> +
> + if (!root || (len == 0))
> + return;
> +
> + he = hot_inode_item_find(root, inode->i_ino);
> + if (IS_ERR(he)) {
> + WARN_ON(1);
> + return;
> + }
> +
> + spin_lock(&he->hot_inode.lock);
> + hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
> + spin_unlock(&he->hot_inode.lock);
> +
> + /*
> + * Align ranges on RANGE_SIZE boundary
> + * to prevent proliferation of range structs
> + */
> + end = (start + len + RANGE_SIZE - 1) >> RANGE_BITS;
> + for (cur = (start >> RANGE_BITS); cur < end; cur++) {

Hm... start is u64, cur is u32, RANGE_BITS is 20. Doesn't this overflow if,
say, I have a sparse file with blocks way out at 2^53 bytes?

Also, RANGE_SIZE means that the hot tracking range granularity is 1MiB? How
did you decide on that? Will we ever want to change that?

> + hr = hot_range_item_find(he, cur);
> + if (IS_ERR(hr)) {
> + WARN_ON(1);

WARN(1, "hot_range_item_find returns %ld\n", PTR_ERR(hr)); ?

--D

> + hot_inode_item_put(he);
> + return;
> + }
> +
> + spin_lock(&hr->hot_range.lock);
> + hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
> + spin_unlock(&hr->hot_range.lock);
> +
> + hot_range_item_put(hr);
> + }
> +
> + hot_inode_item_put(he);
> +}
> +EXPORT_SYMBOL_GPL(hot_update_freqs);
> +
> +/*
> * Initialize the data structures for hot data tracking.
> */
> int hot_track_init(struct super_block *sb)
> diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
> index e7ba121..cc4666e 100644
> --- a/fs/hot_tracking.h
> +++ b/fs/hot_tracking.h
> @@ -20,6 +20,13 @@
> #define FREQ_DATA_TYPE_INODE (1 << 0)
> #define FREQ_DATA_TYPE_RANGE (1 << 1)
>
> +/* size of sub-file ranges */
> +#define RANGE_BITS 20
> +#define RANGE_SIZE (1 << RANGE_BITS)
> +
> +#define FREQ_POWER 4
> +
> void hot_inode_item_put(struct hot_inode_item *he);
> +struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);
>
> #endif /* __HOT_TRACKING__ */
> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
> index 4233207..e2d6028 100644
> --- a/include/linux/hot_tracking.h
> +++ b/include/linux/hot_tracking.h
> @@ -71,5 +71,7 @@ struct hot_info {
> extern void __init hot_cache_init(void);
> extern int hot_track_init(struct super_block *sb);
> extern void hot_track_exit(struct super_block *sb);
> +extern void hot_update_freqs(struct inode *inode, u64 start,
> + u64 len, int rw);
>
> #endif /* _LINUX_HOTTRACK_H */
> --
> 1.7.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2012-11-06 22:51:53

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 05/19] vfs: add hooks to enable hot tracking

On Mon, Oct 29, 2012 at 12:30:47PM +0800, [email protected] wrote:
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -19,6 +19,7 @@
> #include <linux/pagemap.h>
> #include <linux/syscalls.h>
> #include <linux/file.h>
> +#include <linux/hot_tracking.h>
>
> /*
> * Initialise a struct file's readahead state. Assumes that the caller has
> @@ -138,6 +139,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
> out:
> blk_finish_plug(&plug);
>
> + /* Hot data tracking */
> + hot_update_freqs(mapping->host, (u64)(list_entry(pages->prev,\
> + struct page, lru)->index) << PAGE_CACHE_SHIFT,
> + (u64)nr_pages * PAGE_CACHE_SIZE, 0);

There's a stray \ at the end of the line, and I find this formatting
hard to read. Does the following look acceptable?

hot_update_freqs(mapping->host,
(u64)(list_entry(pages->prev, struct page, lru)->index)
<< PAGE_CACHE_SHIFT,
(u64)nr_pages * PAGE_CACHE_SIZE, 0);

> +
> return ret;
> }
>

2012-11-06 23:15:08

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

On Mon, Oct 29, 2012 at 12:30:52PM +0800, [email protected] wrote:
> +static struct hot_func_type *hot_func_get(const char *name)
> +{
> + struct hot_func_type *f, *h = &hot_func_def;
> +
> + spin_lock(&hot_func_list_lock);
> + list_for_each_entry(f, &hot_func_list, list) {
> + if (!strcmp(f->hot_func_name, name))
> + h = f;

You probably want to break here

> + }
> + spin_unlock(&hot_func_list_lock);
> +
> + return h;
> +}
> +
> +int hot_func_register(struct hot_func_type *h)
> +{
> + struct hot_func_type *f, *t = NULL;
> +
> + /* register, don't allow duplicate names */
> + spin_lock(&hot_func_list_lock);
> + list_for_each_entry(f, &hot_func_list, list) {
> + if (!strcmp(f->hot_func_name, h->hot_func_name))
> + t = f;

if duplicate names are not allowed, then a warning may make sense to
let us know that something is wrong

> + }
> +
> + if (t) {
> + spin_unlock(&hot_func_list_lock);
> + return -EBUSY;
> + }
> +
> + list_add_tail(&h->list, &hot_func_list);
> + spin_unlock(&hot_func_list_lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(hot_func_register);
> --- a/include/linux/hot_tracking.h
> +++ b/include/linux/hot_tracking.h
> @@ -73,6 +75,25 @@ struct hot_range_item {
> u32 len; /* length in bytes */
> };
>
> +typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
> + struct timespec cur_time, u64 old_avg);
> +typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
> +typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);

I'm wondering whether these typedefs are useful; similar ops structures
do not introduce them. It is also confusing when you pick struct member
names that are exactly the same as the typedefs:

> +struct hot_func_ops {
> + hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
> + hot_temp_calc_fn *hot_temp_calc_fn;
> + hot_is_obsolete_fn *hot_is_obsolete_fn;
> +};

My suggestion is to make the types explicit in the structure.

> +/* identifies an hot func type */
> +struct hot_func_type {
> + char hot_func_name[HOT_NAME_MAX];

'name' would be sufficient IMHO

> + /* fields provided by specific FS */
> + struct hot_func_ops ops;
> + struct list_head list;
> +};

david

2012-11-06 23:30:11

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

On Mon, Oct 29, 2012 at 12:30:52PM +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> Introduce one framwork to enable that specific FS
> can register its own hot tracking functions.
>
> Signed-off-by: Zhi Yong Wu <[email protected]>
> ---
> fs/hot_tracking.c | 78 ++++++++++++++++++++++++++++++++++++++----
> include/linux/hot_tracking.h | 25 +++++++++++++
> 2 files changed, 96 insertions(+), 7 deletions(-)
>
> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> index 0ef9cad..c6c6138 100644
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -24,6 +24,9 @@
> #include <linux/limits.h>
> #include "hot_tracking.h"
>
> +static DEFINE_SPINLOCK(hot_func_list_lock);
> +static LIST_HEAD(hot_func_list);
> +
> /* kmem_cache pointers for slab caches */
> static struct kmem_cache *hot_inode_item_cachep __read_mostly;
> static struct kmem_cache *hot_range_item_cachep __read_mostly;
> @@ -305,20 +308,23 @@ static u64 hot_average_update(struct timespec old_atime,
> return new_avg;
> }
>
> -static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
> +static void hot_freq_data_update(struct hot_info *root,
> + struct hot_freq_data *freq_data, bool write)
> {
> struct timespec cur_time = current_kernel_time();
>
> if (write) {
> freq_data->nr_writes += 1;
> - freq_data->avg_delta_writes = hot_average_update(
> + freq_data->avg_delta_writes =
> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
> freq_data->last_write_time,
> cur_time,
> freq_data->avg_delta_writes);
> freq_data->last_write_time = cur_time;
> } else {
> freq_data->nr_reads += 1;
> - freq_data->avg_delta_reads = hot_average_update(
> + freq_data->avg_delta_reads =
> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
> freq_data->last_read_time,
> cur_time,
> freq_data->avg_delta_reads);
> @@ -430,7 +436,7 @@ static void hot_map_array_update(struct hot_freq_data *freq_data,
> struct hot_comm_item *comm_item;
> struct hot_inode_item *he;
> struct hot_range_item *hr;
> - u32 temp = hot_temp_calc(freq_data);
> + u32 temp = root->hot_func_type->ops.hot_temp_calc_fn(freq_data);
> u8 a_temp = temp >> (32 - HEAT_MAP_BITS);
> u8 b_temp = freq_data->last_temp >> (32 - HEAT_MAP_BITS);
>
> @@ -511,7 +517,7 @@ static void hot_range_update(struct hot_inode_item *he,
> &hr_nodes[i]->hot_range.hot_freq_data, root);
>
> spin_lock(&hr_nodes[i]->hot_range.lock);
> - obsolete = hot_is_obsolete(
> + obsolete = root->hot_func_type->ops.hot_is_obsolete_fn(
> &hr_nodes[i]->hot_range.hot_freq_data);
> spin_unlock(&hr_nodes[i]->hot_range.lock);
>
> @@ -668,7 +674,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
> }
>
> spin_lock(&he->hot_inode.lock);
> - hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
> + hot_freq_data_update(root, &he->hot_inode.hot_freq_data, rw);
> spin_unlock(&he->hot_inode.lock);
>
> /*
> @@ -685,7 +691,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
> }
>
> spin_lock(&hr->hot_range.lock);
> - hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
> + hot_freq_data_update(root, &hr->hot_range.hot_freq_data, rw);
> spin_unlock(&hr->hot_range.lock);
>
> hot_range_item_put(hr);
> @@ -695,6 +701,61 @@ void hot_update_freqs(struct inode *inode, u64 start,
> }
> EXPORT_SYMBOL_GPL(hot_update_freqs);
>
> +static struct hot_func_type hot_func_def = {
> + .hot_func_name = "hot_type_def",
> + .ops = {
> + .hot_rw_freq_calc_fn = hot_average_update,
> + .hot_temp_calc_fn = hot_temp_calc,
> + .hot_is_obsolete_fn = hot_is_obsolete,
> + },
> +};

If these hot_ops are per-filesystem, why not just embed a struct hot_func_ops
inside of struct file_system_type? That eliminates this _get function,
collision avoidance, etc. You can fill in NULL function pointers in
hot_track_init (or just code around them).

--D

> +
> +static struct hot_func_type *hot_func_get(const char *name)
> +{
> + struct hot_func_type *f, *h = &hot_func_def;
> +
> + spin_lock(&hot_func_list_lock);
> + list_for_each_entry(f, &hot_func_list, list) {
> + if (!strcmp(f->hot_func_name, name))
> + h = f;
> + }
> + spin_unlock(&hot_func_list_lock);
> +
> + return h;
> +}
> +
> +int hot_func_register(struct hot_func_type *h)
> +{
> + struct hot_func_type *f, *t = NULL;
> +
> + /* register, don't allow duplicate names */
> + spin_lock(&hot_func_list_lock);
> + list_for_each_entry(f, &hot_func_list, list) {
> + if (!strcmp(f->hot_func_name, h->hot_func_name))
> + t = f;
> + }
> +
> + if (t) {
> + spin_unlock(&hot_func_list_lock);
> + return -EBUSY;
> + }
> +
> + list_add_tail(&h->list, &hot_func_list);
> + spin_unlock(&hot_func_list_lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(hot_func_register);
> +
> +void hot_func_unregister(struct hot_func_type *h)
> +{
> + /* unregister */
> + spin_lock(&hot_func_list_lock);
> + list_del_init(&h->list);
> + spin_unlock(&hot_func_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(hot_func_unregister);
> +
> /*
> * Initialize the data structures for hot data tracking.
> */
> @@ -714,6 +775,9 @@ int hot_track_init(struct super_block *sb)
> hot_inode_tree_init(root);
> hot_map_array_init(root);
>
> + /* Get hot func type */
> + root->hot_func_type = hot_func_get(sb->s_type->name);
> +
> root->update_wq = alloc_workqueue(
> "hot_update_wq", WQ_NON_REENTRANT, 0);
> if (!root->update_wq) {
> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
> index 2ee0d02..3941052 100644
> --- a/include/linux/hot_tracking.h
> +++ b/include/linux/hot_tracking.h
> @@ -23,6 +23,8 @@
> #define HEAT_MAP_BITS 8
> #define HEAT_MAP_SIZE (1 << HEAT_MAP_BITS)
>
> +#define HOT_NAME_MAX 16
> +
> /*
> * A frequency data struct holds values that are used to
> * determine temperature of files and file ranges. These structs
> @@ -73,6 +75,25 @@ struct hot_range_item {
> u32 len; /* length in bytes */
> };
>
> +typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
> + struct timespec cur_time, u64 old_avg);
> +typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
> +typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);
> +
> +struct hot_func_ops {
> + hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
> + hot_temp_calc_fn *hot_temp_calc_fn;
> + hot_is_obsolete_fn *hot_is_obsolete_fn;
> +};
> +
> +/* identifies an hot func type */
> +struct hot_func_type {
> + char hot_func_name[HOT_NAME_MAX];
> + /* fields provided by specific FS */
> + struct hot_func_ops ops;
> + struct list_head list;
> +};
> +
> struct hot_info {
> struct radix_tree_root hot_inode_tree;
> spinlock_t lock; /*protect inode tree */
> @@ -85,6 +106,7 @@ struct hot_info {
>
> struct workqueue_struct *update_wq;
> struct delayed_work update_work;
> + struct hot_func_type *hot_func_type;
> };
>
> extern void __init hot_cache_init(void);
> @@ -93,4 +115,7 @@ extern void hot_track_exit(struct super_block *sb);
> extern void hot_update_freqs(struct inode *inode, u64 start,
> u64 len, int rw);
>
> +extern int hot_func_register(struct hot_func_type *h);
> +extern void hot_func_unregister(struct hot_func_type *h);
> +
> #endif /* _LINUX_HOTTRACK_H */
> --
> 1.7.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2012-11-06 23:30:58

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 12/19] vfs: add one ioctl interface

On Mon, Oct 29, 2012 at 12:30:54PM +0800, [email protected] wrote:
> +static int ioctl_heat_info(struct file *file, void __user *argp)
> +{
> + struct inode *inode = file->f_dentry->d_inode;
> + struct hot_heat_info *heat_info;
> + struct hot_inode_item *he;
> + int ret = 0;
> +
> + heat_info = kmalloc(sizeof(struct hot_heat_info),
> + GFP_KERNEL | GFP_NOFS);

heat_info is small enough to fit onto the stack, so you can avoid the
kmalloc, I don't think there are deep callstacks to be expected.
Nevertheless, if you want to use kmalloc here, then please check the
return value and use GFP_KERNEL.

> +
> + if (copy_from_user((void *) heat_info,
> + argp,
> + sizeof(struct hot_heat_info)) != 0) {
> + ret = -EFAULT;
> + goto err;
> + }
> +
> + he = hot_inode_item_find(inode->i_sb->s_hot_root, inode->i_ino);
> + if (!he) {
> + /* we don't have any info on this file yet */
> + ret = -ENODATA;
> + goto err;
> + }
> +
> + spin_lock(&he->hot_inode.lock);
> + heat_info->avg_delta_reads =
> + (__u64) he->hot_inode.hot_freq_data.avg_delta_reads;
> + heat_info->avg_delta_writes =
> + (__u64) he->hot_inode.hot_freq_data.avg_delta_writes;
> + heat_info->last_read_time =
> + (__u64) timespec_to_ns(&he->hot_inode.hot_freq_data.last_read_time);
> + heat_info->last_write_time =
> + (__u64) timespec_to_ns(&he->hot_inode.hot_freq_data.last_write_time);
> + heat_info->num_reads =
> + (__u32) he->hot_inode.hot_freq_data.nr_reads;
> + heat_info->num_writes =
> + (__u32) he->hot_inode.hot_freq_data.nr_writes;
> +
> + if (heat_info->live > 0) {
> + /*
> + * got a request for live temperature,
> + * call hot_hash_calc_temperature to recalculate
> + */
> + heat_info->temp =
> + inode->i_sb->s_hot_root->hot_func_type->ops.hot_temp_calc_fn(
> + &he->hot_inode.hot_freq_data);
> + } else {
> + /* not live temperature, get it from the hashlist */
> + heat_info->temp = he->hot_inode.hot_freq_data.last_temp;
> + }
> + spin_unlock(&he->hot_inode.lock);
> +
> + hot_inode_item_put(he);
> +
> + if (copy_to_user(argp, (void *) heat_info,
> + sizeof(struct hot_heat_info))) {
> + ret = -EFAULT;
> + goto err;
> + }
> +
> +err:
> + kfree(heat_info);
> + return ret;
> +}

david

2012-11-06 23:46:13

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 14/19] vfs: add debugfs support

On Mon, Oct 29, 2012 at 12:30:56PM +0800, [email protected] wrote:
> +static int hot_range_seq_show(struct seq_file *seq, void *v)
> +{
> + struct hot_range_item *hr = v;
> + struct hot_inode_item *he = hr->hot_inode;
> + struct hot_freq_data *freq_data = &hr->hot_range.hot_freq_data;
> +
> + /* Always lock hot_inode_item first */
> + spin_lock(&he->hot_inode.lock);
> + spin_lock(&hr->hot_range.lock);
> + seq_printf(seq, "inode #%llu, range start " \

the # seems unnecessary to me

> + "%llu (range len %u) reads %u, writes %u, "
> + "avg read time %llu, avg write time %llu, temp %u\n",

compiler will complain if it sees a %llu format and not the expected
type of 'unsigned long long'

> + he->i_ino,

(unsigned long long)he->i_ino,

> + (u64)hr->start * RANGE_SIZE,
> + hr->len,
> + freq_data->nr_reads,
> + freq_data->nr_writes,
> + freq_data->avg_delta_reads / NSEC_PER_MSEC,
> + freq_data->avg_delta_writes / NSEC_PER_MSEC,
> + freq_data->last_temp >> (32 - HEAT_MAP_BITS));
> + spin_unlock(&hr->hot_range.lock);
> + spin_unlock(&he->hot_inode.lock);
> +
> + return 0;
> +}
> +
> +static int hot_inode_seq_show(struct seq_file *seq, void *v)
> +{
> + struct hot_inode_item *he = v;
> + struct hot_freq_data *freq_data = &he->hot_inode.hot_freq_data;
> +
> + spin_lock(&he->hot_inode.lock);
> + seq_printf(seq, "inode #%llu, reads %u, writes %u, " \
> + "avg read time %llu, avg write time %llu, temp %u\n",

(same here)

> + he->i_ino,
> + freq_data->nr_reads,
> + freq_data->nr_writes,
> + freq_data->avg_delta_reads / NSEC_PER_MSEC,
> + freq_data->avg_delta_writes / NSEC_PER_MSEC,
> + freq_data->last_temp >> (32 - HEAT_MAP_BITS));
> + spin_unlock(&he->hot_inode.lock);
> +
> + return 0;
> +}
>
> +static void *hot_spot_range_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct hot_info *root = seq->private;
> + struct hot_range_item *hr_next, *hr = v;
> + struct hot_comm_item *comm_item;
> + struct list_head *n_list;
> + int i =
> + hr->hot_range.hot_freq_data.last_temp >> (32 - HEAT_MAP_BITS);

now I have noticed that I've seen the ... (32 - HEAT_MAP_BITS)
expression so many times that I tend to think it deserves a helper
function

> +
> + n_list = seq_list_next(&hr->hot_range.n_list,
> + &root->heat_range_map[i].node_list, pos);
> + hot_range_item_put(hr);
> +next:
> + if (n_list) {
> + comm_item = container_of(n_list,
> + struct hot_comm_item, n_list);
> + hr_next = container_of(comm_item,
> + struct hot_range_item, hot_range);
> + kref_get(&hr_next->hot_range.refs);
> + return hr_next;
> + } else if (--i >= 0) {
> + n_list = seq_list_next(&root->heat_range_map[i].node_list,
> + &root->heat_range_map[i].node_list, pos);
> + goto next;
> + }
> +
> + return NULL;
> +}
> +
> +static void hot_debugfs_exit(struct super_block *sb)
> +{
> + struct dentry *vol_dentry;
> +
> + vol_dentry = debugfs_get_dentry(sb->s_id,
> + sb->s_hot_root->debugfs_root, strlen(sb->s_id));
> + /* remove all debugfs entries recursively from the volume root */
> + if (vol_dentry)
> + debugfs_remove_recursive(vol_dentry);
> + else
> + BUG_ON(1);

BUG()

> +
> + if (list_empty(&sb->s_hot_root->debugfs_root->d_subdirs))
> + debugfs_remove(sb->s_hot_root->debugfs_root);
> +}
> +
> +/*

david

2012-11-07 00:00:42

by David Sterba

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 16/19] btrfs: add hot tracking support

On Mon, Oct 29, 2012 at 12:30:58PM +0800, [email protected] wrote:
> From: Zhi Yong Wu <[email protected]>
>
> Introduce one new mount option '-o hot_track',
> and add its parsing support.
> Its usage looks like:
> mount -o hot_track
> mount -o nouser,hot_track
> mount -o nouser,hot_track,loop
> mount -o hot_track,nouser
>
> Signed-off-by: Zhi Yong Wu <[email protected]>
Reviewed-by: David Sterba <[email protected]>

2012-11-07 06:55:57

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 02/19] vfs: initialize and free data structures

On Wed, Nov 7, 2012 at 6:24 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:44PM +0800, [email protected] wrote:
>> +/* Frees the entire hot_range_tree. */
>> +static void hot_inode_item_free(struct kref *kref)
>> +{
>> + struct hot_comm_item *comm_item = container_of(kref,
>> + struct hot_comm_item, refs);
>> + struct hot_inode_item *he = container_of(comm_item,
>> + struct hot_inode_item, hot_inode);
>> +
>> + hot_range_tree_free(he);
>> + radix_tree_delete(he->hot_inode_tree, he->i_ino);
>
> void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
>
> and he::i_ino is u64, this will not work when
> sizeof(unsigned long) != sizeof(u64) (iirc this is a known limitation of
> radix tree implementation). This will work on 64bit only, not sure if
> this is intentional.
i actually also realized this. Do you have a better way to handle this?
>
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> +}
>> +
>> +/* Frees the entire hot_inode_tree. */
>> +static void hot_inode_tree_exit(struct hot_info *root)
>> +{
>> + struct hot_inode_item *hi_nodes[8];
>> + u64 ino = 0;
>> + int i, n;
>
> nitpick, put the declarations on separate lines
Does it cause any problem? It passes the checkpatch.pl check.

>
>> +
>> + while (1) {
>> + spin_lock(&root->lock);
>> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
>> + (void **)hi_nodes, ino,
>> + ARRAY_SIZE(hi_nodes));
>> + if (!n) {
>> + spin_unlock(&root->lock);
>> + break;
>> + }
>> +
>> + ino = hi_nodes[n - 1]->i_ino + 1;
>> + for (i = 0; i < n; i++)
>> + hot_inode_item_put(hi_nodes[i]);
>> + spin_unlock(&root->lock);
>> + }
>> +}
>> +
>> /*
>> * Initialize kmem cache for hot_inode_item and hot_range_item.
>> */
>> @@ -106,3 +197,36 @@ err:
>> kmem_cache_destroy(hot_inode_item_cachep);
>> }
>> EXPORT_SYMBOL_GPL(hot_cache_init);
>> +
>> +/*
>> + * Initialize the data structures for hot data tracking.
>> + */
>> +int hot_track_init(struct super_block *sb)
>> +{
>> + struct hot_info *root;
>> + int ret = -ENOMEM;
>> +
>> + root = kzalloc(sizeof(struct hot_info), GFP_NOFS);
>> + if (!root) {
>> + printk(KERN_ERR "%s: Failed to malloc memory for "
>> + "hot_info\n", __func__);
>> + return ret;
>
> minor: you can drop the variable ret and just return ENOMEM here
This variable will also be used in the following patches.

>
>> + }
>> +
>> + sb->s_hot_root = root;
>> + hot_inode_tree_init(root);
>> +
>> + printk(KERN_INFO "VFS: Turning on hot data tracking\n");
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_track_init);
>
> david



--
Regards,

Zhi Yong Wu

2012-11-07 07:03:54

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Wed, Nov 7, 2012 at 6:37 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:45PM +0800, [email protected] wrote:
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> +struct hot_inode_item
>> +*hot_inode_item_find(struct hot_info *root, u64 ino)
>> +{
>> + struct hot_inode_item *he;
>> + int ret;
>> +
>> +again:
>> + spin_lock(&root->lock);
>> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
>> + if (he) {
>> + kref_get(&he->hot_inode.refs);
>> + spin_unlock(&root->lock);
>> + return he;
>> + }
>> + spin_unlock(&root->lock);
>> +
>> + he = kmem_cache_zalloc(hot_inode_item_cachep,
>> + GFP_KERNEL | GFP_NOFS);
>> + if (!he)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
>> +
>> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> + if (ret) {
>> + kmem_cache_free(hot_inode_item_cachep, he);
>
> radix_tree_preload_end()
>
>> + return ERR_PTR(ret);
>> + }
>> +
>> + spin_lock(&root->lock);
>> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
>> + if (ret == -EEXIST) {
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> + spin_unlock(&root->lock);
>> + radix_tree_preload_end();
>> + goto again;
>> + }
>> + spin_unlock(&root->lock);
>> + radix_tree_preload_end();
>> +
>> + kref_get(&he->hot_inode.refs);
>> + return he;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
>> +
>> +static struct hot_range_item
>> +*hot_range_item_find(struct hot_inode_item *he,
>> + u32 start)
>> +{
>> + struct hot_range_item *hr;
>> + int ret;
>> +
>> +again:
>> + spin_lock(&he->lock);
>> + hr = radix_tree_lookup(&he->hot_range_tree, start);
>> + if (hr) {
>> + kref_get(&hr->hot_range.refs);
>> + spin_unlock(&he->lock);
>> + return hr;
>> + }
>> + spin_unlock(&he->lock);
>> +
>> + hr = kmem_cache_zalloc(hot_range_item_cachep,
>> + GFP_KERNEL | GFP_NOFS);
>> + if (!hr)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + hot_range_item_init(hr, start, he);
>> +
>> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> + if (ret) {
>> + kmem_cache_free(hot_range_item_cachep, hr);
>
> radix_tree_preload_end()
I checked some existing kernel users of radix_tree_preload(); it seems
that when radix_tree_preload() fails, the error handling does not need
to call radix_tree_preload_end().
>
>> + return ERR_PTR(ret);
>> + }
>> +
>> + spin_lock(&he->lock);
>> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
>> + if (ret == -EEXIST) {
>> + kmem_cache_free(hot_range_item_cachep, hr);
>> + spin_unlock(&he->lock);
>> + radix_tree_preload_end();
ditto.
>> + goto again;
>> + }
>> + spin_unlock(&he->lock);
>> + radix_tree_preload_end();
>> +
>> + kref_get(&hr->hot_range.refs);
>> + return hr;
>> +}
>
> david



--
Regards,

Zhi Yong Wu

2012-11-07 07:06:49

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 05/19] vfs: add hooks to enable hot tracking

On Wed, Nov 7, 2012 at 6:51 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:47PM +0800, [email protected] wrote:
>> --- a/mm/readahead.c
>> +++ b/mm/readahead.c
>> @@ -19,6 +19,7 @@
>> #include <linux/pagemap.h>
>> #include <linux/syscalls.h>
>> #include <linux/file.h>
>> +#include <linux/hot_tracking.h>
>>
>> /*
>> * Initialise a struct file's readahead state. Assumes that the caller has
>> @@ -138,6 +139,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
>> out:
>> blk_finish_plug(&plug);
>>
>> + /* Hot data tracking */
>> + hot_update_freqs(mapping->host, (u64)(list_entry(pages->prev,\
>> + struct page, lru)->index) << PAGE_CACHE_SHIFT,
>> + (u64)nr_pages * PAGE_CACHE_SIZE, 0);
>
> There's a stale \ at the end of the line, and I find this formatting
> hard to read. Does the following look acceptable?
yes, great, thanks.
>
> hot_update_freqs(mapping->host,
> (u64)(list_entry(pages->prev, struct page, lru)->index)
> << PAGE_CACHE_SHIFT,
> (u64)nr_pages * PAGE_CACHE_SIZE, 0);
>
>> +
>> return ret;
>> }
>>



--
Regards,

Zhi Yong Wu

2012-11-07 07:18:57

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

On Wed, Nov 7, 2012 at 7:14 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:52PM +0800, [email protected] wrote:
>> +static struct hot_func_type *hot_func_get(const char *name)
>> +{
>> + struct hot_func_type *f, *h = &hot_func_def;
>> +
>> + spin_lock(&hot_func_list_lock);
>> + list_for_each_entry(f, &hot_func_list, list) {
>> + if (!strcmp(f->hot_func_name, name))
>> + h = f;
>
> You probably want to break here
Good catch, done, thanks.
>
>> + }
>> + spin_unlock(&hot_func_list_lock);
>> +
>> + return h;
>> +}
>> +
>> +int hot_func_register(struct hot_func_type *h)
>> +{
>> + struct hot_func_type *f, *t = NULL;
>> +
>> + /* register, don't allow duplicate names */
>> + spin_lock(&hot_func_list_lock);
>> + list_for_each_entry(f, &hot_func_list, list) {
>> + if (!strcmp(f->hot_func_name, h->hot_func_name))
>> + t = f;
>
> if duplicate names are not allowed, then a warning may make sense to
> let us know that something is wrong
done, thanks.
>
>> + }
>> +
>> + if (t) {
>> + spin_unlock(&hot_func_list_lock);
>> + return -EBUSY;
>> + }
>> +
>> + list_add_tail(&h->list, &hot_func_list);
>> + spin_unlock(&hot_func_list_lock);
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_func_register);
>> --- a/include/linux/hot_tracking.h
>> +++ b/include/linux/hot_tracking.h
>> @@ -73,6 +75,25 @@ struct hot_range_item {
>> u32 len; /* length in bytes */
>> };
>>
>> +typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
>> + struct timespec cur_time, u64 old_avg);
>> +typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
>> +typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);
>
> I'm thinking, whether these typedefs are useful, similar ops structures
> do not introduce them, also when you pick a struct member names exactly
> same as the typedefs:
>
>> +struct hot_func_ops {
>> + hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
>> + hot_temp_calc_fn *hot_temp_calc_fn;
>> + hot_is_obsolete_fn *hot_is_obsolete_fn;
>> +};
>
> My suggestion is to make the types explicit in the structure.
Sorry, I don't get your point; can you elaborate on how to do this?
>
>> +/* identifies an hot func type */
>> +struct hot_func_type {
>> + char hot_func_name[HOT_NAME_MAX];
>
> 'name' would be sufficient IMHO
done, thanks.
>
>> + /* fields provided by specific FS */
>> + struct hot_func_ops ops;
>> + struct list_head list;
>> +};
>
> david



--
Regards,

Zhi Yong Wu

2012-11-07 07:36:58

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 12/19] vfs: add one ioctl interface

On Wed, Nov 7, 2012 at 7:30 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:54PM +0800, [email protected] wrote:
>> +static int ioctl_heat_info(struct file *file, void __user *argp)
>> +{
>> + struct inode *inode = file->f_dentry->d_inode;
>> + struct hot_heat_info *heat_info;
>> + struct hot_inode_item *he;
>> + int ret = 0;
>> +
>> + heat_info = kmalloc(sizeof(struct hot_heat_info),
>> + GFP_KERNEL | GFP_NOFS);
>
> heat_info is small enough to fit onto the stack, so you can avoid the
> kmalloc, I don't think there are deep callstacks to be expected.
ok, done.
> Nevertheless, if you want to use kmalloc here, then please check the
> return value and use GFP_KERNEL.
Thanks for pointing that out.
>
>> +
>> + if (copy_from_user((void *) heat_info,
>> + argp,
>> + sizeof(struct hot_heat_info)) != 0) {
>> + ret = -EFAULT;
>> + goto err;
>> + }
>> +
>> + he = hot_inode_item_find(inode->i_sb->s_hot_root, inode->i_ino);
>> + if (!he) {
>> + /* we don't have any info on this file yet */
>> + ret = -ENODATA;
>> + goto err;
>> + }
>> +
>> + spin_lock(&he->hot_inode.lock);
>> + heat_info->avg_delta_reads =
>> + (__u64) he->hot_inode.hot_freq_data.avg_delta_reads;
>> + heat_info->avg_delta_writes =
>> + (__u64) he->hot_inode.hot_freq_data.avg_delta_writes;
>> + heat_info->last_read_time =
>> + (__u64) timespec_to_ns(&he->hot_inode.hot_freq_data.last_read_time);
>> + heat_info->last_write_time =
>> + (__u64) timespec_to_ns(&he->hot_inode.hot_freq_data.last_write_time);
>> + heat_info->num_reads =
>> + (__u32) he->hot_inode.hot_freq_data.nr_reads;
>> + heat_info->num_writes =
>> + (__u32) he->hot_inode.hot_freq_data.nr_writes;
>> +
>> + if (heat_info->live > 0) {
>> + /*
>> + * got a request for live temperature,
>> + * call hot_hash_calc_temperature to recalculate
>> + */
>> + heat_info->temp =
>> + inode->i_sb->s_hot_root->hot_func_type->ops.hot_temp_calc_fn(
>> + &he->hot_inode.hot_freq_data);
>> + } else {
>> + /* not live temperature, get it from the hashlist */
>> + heat_info->temp = he->hot_inode.hot_freq_data.last_temp;
>> + }
>> + spin_unlock(&he->hot_inode.lock);
>> +
>> + hot_inode_item_put(he);
>> +
>> + if (copy_to_user(argp, (void *) heat_info,
>> + sizeof(struct hot_heat_info))) {
>> + ret = -EFAULT;
>> + goto err;
>> + }
>> +
>> +err:
>> + kfree(heat_info);
>> + return ret;
>> +}
>
> david



--
Regards,

Zhi Yong Wu

2012-11-07 07:49:27

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 14/19] vfs: add debugfs support

On Wed, Nov 7, 2012 at 7:45 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:56PM +0800, [email protected] wrote:
>> +static int hot_range_seq_show(struct seq_file *seq, void *v)
>> +{
>> + struct hot_range_item *hr = v;
>> + struct hot_inode_item *he = hr->hot_inode;
>> + struct hot_freq_data *freq_data = &hr->hot_range.hot_freq_data;
>> +
>> + /* Always lock hot_inode_item first */
>> + spin_lock(&he->hot_inode.lock);
>> + spin_lock(&hr->hot_range.lock);
>> + seq_printf(seq, "inode #%llu, range start " \
>
> the # seems unnecessary to me
OK, removed.
>
>> + "%llu (range len %u) reads %u, writes %u, "
>> + "avg read time %llu, avg write time %llu, temp %u\n",
>
> compiler will complain if it sees a %llu format and not the expected
> type of 'unsigned long long'
When I built it, I did not see any warning about this...
>
>> + he->i_ino,
>
> (unsigned long long)he->i_ino,
>
>> + (u64)hr->start * RANGE_SIZE,
>> + hr->len,
>> + freq_data->nr_reads,
>> + freq_data->nr_writes,
>> + freq_data->avg_delta_reads / NSEC_PER_MSEC,
>> + freq_data->avg_delta_writes / NSEC_PER_MSEC,
>> + freq_data->last_temp >> (32 - HEAT_MAP_BITS));
>> + spin_unlock(&hr->hot_range.lock);
>> + spin_unlock(&he->hot_inode.lock);
>> +
>> + return 0;
>> +}
>> +
>> +static int hot_inode_seq_show(struct seq_file *seq, void *v)
>> +{
>> + struct hot_inode_item *he = v;
>> + struct hot_freq_data *freq_data = &he->hot_inode.hot_freq_data;
>> +
>> + spin_lock(&he->hot_inode.lock);
>> + seq_printf(seq, "inode #%llu, reads %u, writes %u, " \
>> + "avg read time %llu, avg write time %llu, temp %u\n",
>
> (same here)
ditto.
>
>> + he->i_ino,
>> + freq_data->nr_reads,
>> + freq_data->nr_writes,
>> + freq_data->avg_delta_reads / NSEC_PER_MSEC,
>> + freq_data->avg_delta_writes / NSEC_PER_MSEC,
>> + freq_data->last_temp >> (32 - HEAT_MAP_BITS));
>> + spin_unlock(&he->hot_inode.lock);
>> +
>> + return 0;
>> +}
>>
>> +static void *hot_spot_range_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>> +{
>> + struct hot_info *root = seq->private;
>> + struct hot_range_item *hr_next, *hr = v;
>> + struct hot_comm_item *comm_item;
>> + struct list_head *n_list;
>> + int i =
>> + hr->hot_range.hot_freq_data.last_temp >> (32 - HEAT_MAP_BITS);
>
> now I have noticed that I've seen the ... (32 - HEAT_MAP_BITS)
> expression so many times that I tend to think it deserves a helper
> function
This helper function already exists: hot_raw_shift(). I will replace this expression with it.
>
>> +
>> + n_list = seq_list_next(&hr->hot_range.n_list,
>> + &root->heat_range_map[i].node_list, pos);
>> + hot_range_item_put(hr);
>> +next:
>> + if (n_list) {
>> + comm_item = container_of(n_list,
>> + struct hot_comm_item, n_list);
>> + hr_next = container_of(comm_item,
>> + struct hot_range_item, hot_range);
>> + kref_get(&hr_next->hot_range.refs);
>> + return hr_next;
>> + } else if (--i >= 0) {
>> + n_list = seq_list_next(&root->heat_range_map[i].node_list,
>> + &root->heat_range_map[i].node_list, pos);
>> + goto next;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static void hot_debugfs_exit(struct super_block *sb)
>> +{
>> + struct dentry *vol_dentry;
>> +
>> + vol_dentry = debugfs_get_dentry(sb->s_id,
>> + sb->s_hot_root->debugfs_root, strlen(sb->s_id));
>> + /* remove all debugfs entries recursively from the volume root */
>> + if (vol_dentry)
>> + debugfs_remove_recursive(vol_dentry);
>> + else
>> + BUG_ON(1);
>
> BUG()
done, thanks.
>
>> +
>> + if (list_empty(&sb->s_hot_root->debugfs_root->d_subdirs))
>> + debugfs_remove(sb->s_hot_root->debugfs_root);
>> +}
>> +
>> +/*
>
> david



--
Regards,

Zhi Yong Wu

2012-11-07 08:03:57

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 16/19] btrfs: add hot tracking support

On Wed, Nov 7, 2012 at 8:00 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:58PM +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> Introduce one new mount option '-o hot_track',
>> and add its parsing support.
>> Its usage looks like:
>> mount -o hot_track
>> mount -o nouser,hot_track
>> mount -o nouser,hot_track,loop
>> mount -o hot_track,nouser
>>
>> Signed-off-by: Zhi Yong Wu <[email protected]>
> Reviewed-by: David Sterba <[email protected]>
thanks for your review.



--
Regards,

Zhi Yong Wu

2012-11-07 08:27:08

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Wed, Nov 7, 2012 at 6:45 AM, Darrick J. Wong <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:45PM +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> Add some util helpers to update access frequencies
>> for one file or its range.
>>
>> Signed-off-by: Zhi Yong Wu <[email protected]>
>> ---
>> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
>> fs/hot_tracking.h | 7 ++
>> include/linux/hot_tracking.h | 2 +
>> 3 files changed, 188 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> index 68591f0..0a7d9a3 100644
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
>> }
>> }
>>
>> +struct hot_inode_item
>> +*hot_inode_item_find(struct hot_info *root, u64 ino)
>> +{
>> + struct hot_inode_item *he;
>> + int ret;
>> +
>> +again:
>> + spin_lock(&root->lock);
>> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
>> + if (he) {
>> + kref_get(&he->hot_inode.refs);
>> + spin_unlock(&root->lock);
>> + return he;
>> + }
>> + spin_unlock(&root->lock);
>> +
>> + he = kmem_cache_zalloc(hot_inode_item_cachep,
>> + GFP_KERNEL | GFP_NOFS);
>> + if (!he)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
>> +
>> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> + if (ret) {
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> + return ERR_PTR(ret);
>> + }
>> +
>> + spin_lock(&root->lock);
>> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
>> + if (ret == -EEXIST) {
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> + spin_unlock(&root->lock);
>> + radix_tree_preload_end();
>> + goto again;
>> + }
>> + spin_unlock(&root->lock);
>> + radix_tree_preload_end();
>> +
>> + kref_get(&he->hot_inode.refs);
>> + return he;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
>> +
>> +static struct hot_range_item
>> +*hot_range_item_find(struct hot_inode_item *he,
>> + u32 start)
>> +{
>> + struct hot_range_item *hr;
>> + int ret;
>> +
>> +again:
>> + spin_lock(&he->lock);
>> + hr = radix_tree_lookup(&he->hot_range_tree, start);
>> + if (hr) {
>> + kref_get(&hr->hot_range.refs);
>> + spin_unlock(&he->lock);
>> + return hr;
>> + }
>> + spin_unlock(&he->lock);
>> +
>> + hr = kmem_cache_zalloc(hot_range_item_cachep,
>> + GFP_KERNEL | GFP_NOFS);
>> + if (!hr)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + hot_range_item_init(hr, start, he);
>> +
>> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> + if (ret) {
>> + kmem_cache_free(hot_range_item_cachep, hr);
>> + return ERR_PTR(ret);
>> + }
>> +
>> + spin_lock(&he->lock);
>> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
>> + if (ret == -EEXIST) {
>> + kmem_cache_free(hot_range_item_cachep, hr);
>> + spin_unlock(&he->lock);
>> + radix_tree_preload_end();
>> + goto again;
>> + }
>> + spin_unlock(&he->lock);
>> + radix_tree_preload_end();
>> +
>> + kref_get(&hr->hot_range.refs);
>> + return hr;
>> +}
>> +
>> +/*
>> + * This function does the actual work of updating
>> + * the frequency numbers, whatever they turn out to be.
>> + */
>> +static u64 hot_average_update(struct timespec old_atime,
>> + struct timespec cur_time, u64 old_avg)
>> +{
>> + struct timespec delta_ts;
>> + u64 new_avg;
>> + u64 new_delta;
>> +
>> + delta_ts = timespec_sub(cur_time, old_atime);
>> + new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
>> +
>> + new_avg = (old_avg << FREQ_POWER) - old_avg + new_delta;
>> + new_avg = new_avg >> FREQ_POWER;
>> +
>> + return new_avg;
>> +}
>> +
>> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
>> +{
>> + struct timespec cur_time = current_kernel_time();
>> +
>> + if (write) {
>> + freq_data->nr_writes += 1;
>> + freq_data->avg_delta_writes = hot_average_update(
>> + freq_data->last_write_time,
>> + cur_time,
>> + freq_data->avg_delta_writes);
>> + freq_data->last_write_time = cur_time;
>> + } else {
>> + freq_data->nr_reads += 1;
>> + freq_data->avg_delta_reads = hot_average_update(
>> + freq_data->last_read_time,
>> + cur_time,
>> + freq_data->avg_delta_reads);
>
> I think you could just pass in a pointer to
> freq_data->avg_delta_{writes,reads} here...
why?
>
>> + freq_data->last_read_time = cur_time;
>> + }
>> +}
>> +
>> /*
>> * Initialize kmem cache for hot_inode_item and hot_range_item.
>> */
>> @@ -199,6 +330,54 @@ err:
>> EXPORT_SYMBOL_GPL(hot_cache_init);
>>
>> /*
>> + * Main function to update access frequency from read/writepage(s) hooks
>> + */
>> +void hot_update_freqs(struct inode *inode, u64 start,
>> + u64 len, int rw)
>> +{
>> + struct hot_info *root = inode->i_sb->s_hot_root;
>> + struct hot_inode_item *he;
>> + struct hot_range_item *hr;
>> + u32 cur, end;
>> +
>> + if (!root || (len == 0))
>> + return;
>> +
>> + he = hot_inode_item_find(root, inode->i_ino);
>> + if (IS_ERR(he)) {
>> + WARN_ON(1);
>> + return;
>> + }
>> +
>> + spin_lock(&he->hot_inode.lock);
>> + hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
>> + spin_unlock(&he->hot_inode.lock);
>> +
>> + /*
>> + * Align ranges on RANGE_SIZE boundary
>> + * to prevent proliferation of range structs
>> + */
>> + end = (start + len + RANGE_SIZE - 1) >> RANGE_BITS;
>> + for (cur = (start >> RANGE_BITS); cur < end; cur++) {
>
> Hm... start is u64, cur is u32, RANGE_BITS is 20. Doesn't this overflow if,
> say, I have a sparse file with blocks way out at 2^53 bytes?
ah, good catch, thanks.
>
> Also, RANGE_SIZE means that the hot tracking range granularity is 1MiB? How
yes.
> did you decide on that? Will we ever want to change that?
It is just an assumption; do you think 1 MB is not appropriate? Do you
mean that we should add a proc file interface for it?
>
>> + hr = hot_range_item_find(he, cur);
>> + if (IS_ERR(hr)) {
>> + WARN_ON(1);
>
> WARN(1, "hot_range_item_find returns %d\n", PTR_ERR(hr)); ?
OK, done.
>
> --D
>
>> + hot_inode_item_put(he);
>> + return;
>> + }
>> +
>> + spin_lock(&hr->hot_range.lock);
>> + hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
>> + spin_unlock(&hr->hot_range.lock);
>> +
>> + hot_range_item_put(hr);
>> + }
>> +
>> + hot_inode_item_put(he);
>> +}
>> +EXPORT_SYMBOL_GPL(hot_update_freqs);
>> +
>> +/*
>> * Initialize the data structures for hot data tracking.
>> */
>> int hot_track_init(struct super_block *sb)
>> diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
>> index e7ba121..cc4666e 100644
>> --- a/fs/hot_tracking.h
>> +++ b/fs/hot_tracking.h
>> @@ -20,6 +20,13 @@
>> #define FREQ_DATA_TYPE_INODE (1 << 0)
>> #define FREQ_DATA_TYPE_RANGE (1 << 1)
>>
>> +/* size of sub-file ranges */
>> +#define RANGE_BITS 20
>> +#define RANGE_SIZE (1 << RANGE_BITS)
>> +
>> +#define FREQ_POWER 4
>> +
>> void hot_inode_item_put(struct hot_inode_item *he);
>> +struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);
>>
>> #endif /* __HOT_TRACKING__ */
>> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
>> index 4233207..e2d6028 100644
>> --- a/include/linux/hot_tracking.h
>> +++ b/include/linux/hot_tracking.h
>> @@ -71,5 +71,7 @@ struct hot_info {
>> extern void __init hot_cache_init(void);
>> extern int hot_track_init(struct super_block *sb);
>> extern void hot_track_exit(struct super_block *sb);
>> +extern void hot_update_freqs(struct inode *inode, u64 start,
>> + u64 len, int rw);
>>
>> #endif /* _LINUX_HOTTRACK_H */
>> --
>> 1.7.6.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html



--
Regards,

Zhi Yong Wu

2012-11-07 08:34:37

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

On Wed, Nov 7, 2012 at 7:30 AM, Darrick J. Wong <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:52PM +0800, [email protected] wrote:
>> From: Zhi Yong Wu <[email protected]>
>>
>> Introduce a framework so that a specific FS
>> can register its own hot tracking functions.
>>
>> Signed-off-by: Zhi Yong Wu <[email protected]>
>> ---
>> fs/hot_tracking.c | 78 ++++++++++++++++++++++++++++++++++++++----
>> include/linux/hot_tracking.h | 25 +++++++++++++
>> 2 files changed, 96 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> index 0ef9cad..c6c6138 100644
>> --- a/fs/hot_tracking.c
>> +++ b/fs/hot_tracking.c
>> @@ -24,6 +24,9 @@
>> #include <linux/limits.h>
>> #include "hot_tracking.h"
>>
>> +static DEFINE_SPINLOCK(hot_func_list_lock);
>> +static LIST_HEAD(hot_func_list);
>> +
>> /* kmem_cache pointers for slab caches */
>> static struct kmem_cache *hot_inode_item_cachep __read_mostly;
>> static struct kmem_cache *hot_range_item_cachep __read_mostly;
>> @@ -305,20 +308,23 @@ static u64 hot_average_update(struct timespec old_atime,
>> return new_avg;
>> }
>>
>> -static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
>> +static void hot_freq_data_update(struct hot_info *root,
>> + struct hot_freq_data *freq_data, bool write)
>> {
>> struct timespec cur_time = current_kernel_time();
>>
>> if (write) {
>> freq_data->nr_writes += 1;
>> - freq_data->avg_delta_writes = hot_average_update(
>> + freq_data->avg_delta_writes =
>> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
>> freq_data->last_write_time,
>> cur_time,
>> freq_data->avg_delta_writes);
>> freq_data->last_write_time = cur_time;
>> } else {
>> freq_data->nr_reads += 1;
>> - freq_data->avg_delta_reads = hot_average_update(
>> + freq_data->avg_delta_reads =
>> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
>> freq_data->last_read_time,
>> cur_time,
>> freq_data->avg_delta_reads);
>> @@ -430,7 +436,7 @@ static void hot_map_array_update(struct hot_freq_data *freq_data,
>> struct hot_comm_item *comm_item;
>> struct hot_inode_item *he;
>> struct hot_range_item *hr;
>> - u32 temp = hot_temp_calc(freq_data);
>> + u32 temp = root->hot_func_type->ops.hot_temp_calc_fn(freq_data);
>> u8 a_temp = temp >> (32 - HEAT_MAP_BITS);
>> u8 b_temp = freq_data->last_temp >> (32 - HEAT_MAP_BITS);
>>
>> @@ -511,7 +517,7 @@ static void hot_range_update(struct hot_inode_item *he,
>> &hr_nodes[i]->hot_range.hot_freq_data, root);
>>
>> spin_lock(&hr_nodes[i]->hot_range.lock);
>> - obsolete = hot_is_obsolete(
>> + obsolete = root->hot_func_type->ops.hot_is_obsolete_fn(
>> &hr_nodes[i]->hot_range.hot_freq_data);
>> spin_unlock(&hr_nodes[i]->hot_range.lock);
>>
>> @@ -668,7 +674,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
>> }
>>
>> spin_lock(&he->hot_inode.lock);
>> - hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
>> + hot_freq_data_update(root, &he->hot_inode.hot_freq_data, rw);
>> spin_unlock(&he->hot_inode.lock);
>>
>> /*
>> @@ -685,7 +691,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
>> }
>>
>> spin_lock(&hr->hot_range.lock);
>> - hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
>> + hot_freq_data_update(root, &hr->hot_range.hot_freq_data, rw);
>> spin_unlock(&hr->hot_range.lock);
>>
>> hot_range_item_put(hr);
>> @@ -695,6 +701,61 @@ void hot_update_freqs(struct inode *inode, u64 start,
>> }
>> EXPORT_SYMBOL_GPL(hot_update_freqs);
>>
>> +static struct hot_func_type hot_func_def = {
>> + .hot_func_name = "hot_type_def",
>> + .ops = {
>> + .hot_rw_freq_calc_fn = hot_average_update,
>> + .hot_temp_calc_fn = hot_temp_calc,
>> + .hot_is_obsolete_fn = hot_is_obsolete,
>> + },
>> +};
>
> If these hot_ops are per-filesystem, why not just embed a struct hot_func_ops
> inside of struct file_system_type? That eliminates this _get function,
This _get function is very small, only a few lines of code. If hot_func_ops
is embedded in struct file_system_type, I am afraid it would introduce some
regressions...
> collision avoidance, etc. You can fill in NULL function pointers in
fill in NULL func pointer? why?
> hot_track_init (or just code around them).
>
> --D
>
>> +
>> +static struct hot_func_type *hot_func_get(const char *name)
>> +{
>> + struct hot_func_type *f, *h = &hot_func_def;
>> +
>> + spin_lock(&hot_func_list_lock);
>> + list_for_each_entry(f, &hot_func_list, list) {
>> + if (!strcmp(f->hot_func_name, name))
>> + h = f;
>> + }
>> + spin_unlock(&hot_func_list_lock);
>> +
>> + return h;
>> +}
>> +
>> +int hot_func_register(struct hot_func_type *h)
>> +{
>> + struct hot_func_type *f, *t = NULL;
>> +
>> + /* register, don't allow duplicate names */
>> + spin_lock(&hot_func_list_lock);
>> + list_for_each_entry(f, &hot_func_list, list) {
>> + if (!strcmp(f->hot_func_name, h->hot_func_name))
>> + t = f;
>> + }
>> +
>> + if (t) {
>> + spin_unlock(&hot_func_list_lock);
>> + return -EBUSY;
>> + }
>> +
>> + list_add_tail(&h->list, &hot_func_list);
>> + spin_unlock(&hot_func_list_lock);
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_func_register);
>> +
>> +void hot_func_unregister(struct hot_func_type *h)
>> +{
>> + /* unregister */
>> + spin_lock(&hot_func_list_lock);
>> + list_del_init(&h->list);
>> + spin_unlock(&hot_func_list_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(hot_func_unregister);
>> +
>> /*
>> * Initialize the data structures for hot data tracking.
>> */
>> @@ -714,6 +775,9 @@ int hot_track_init(struct super_block *sb)
>> hot_inode_tree_init(root);
>> hot_map_array_init(root);
>>
>> + /* Get hot func type */
>> + root->hot_func_type = hot_func_get(sb->s_type->name);
>> +
>> root->update_wq = alloc_workqueue(
>> "hot_update_wq", WQ_NON_REENTRANT, 0);
>> if (!root->update_wq) {
>> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
>> index 2ee0d02..3941052 100644
>> --- a/include/linux/hot_tracking.h
>> +++ b/include/linux/hot_tracking.h
>> @@ -23,6 +23,8 @@
>> #define HEAT_MAP_BITS 8
>> #define HEAT_MAP_SIZE (1 << HEAT_MAP_BITS)
>>
>> +#define HOT_NAME_MAX 16
>> +
>> /*
>> * A frequency data struct holds values that are used to
>> * determine temperature of files and file ranges. These structs
>> @@ -73,6 +75,25 @@ struct hot_range_item {
>> u32 len; /* length in bytes */
>> };
>>
>> +typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
>> + struct timespec cur_time, u64 old_avg);
>> +typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
>> +typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);
>> +
>> +struct hot_func_ops {
>> + hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
>> + hot_temp_calc_fn *hot_temp_calc_fn;
>> + hot_is_obsolete_fn *hot_is_obsolete_fn;
>> +};
>> +
>> +/* identifies an hot func type */
>> +struct hot_func_type {
>> + char hot_func_name[HOT_NAME_MAX];
>> + /* fields provided by specific FS */
>> + struct hot_func_ops ops;
>> + struct list_head list;
>> +};
>> +
>> struct hot_info {
>> struct radix_tree_root hot_inode_tree;
>> spinlock_t lock; /*protect inode tree */
>> @@ -85,6 +106,7 @@ struct hot_info {
>>
>> struct workqueue_struct *update_wq;
>> struct delayed_work update_work;
>> + struct hot_func_type *hot_func_type;
>> };
>>
>> extern void __init hot_cache_init(void);
>> @@ -93,4 +115,7 @@ extern void hot_track_exit(struct super_block *sb);
>> extern void hot_update_freqs(struct inode *inode, u64 start,
>> u64 len, int rw);
>>
>> +extern int hot_func_register(struct hot_func_type *h);
>> +extern void hot_func_unregister(struct hot_func_type *h);
>> +
>> #endif /* _LINUX_HOTTRACK_H */
>> --
>> 1.7.6.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html



--
Regards,

Zhi Yong Wu

2012-11-07 08:51:13

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 00/19] vfs: hot data tracking

On Mon, Oct 29, 2012 at 12:30 PM, <[email protected]> wrote:
> From: Zhi Yong Wu <[email protected]>
>
> NOTE:
>
> The patchset can be obtained via my kernel dev git on github:
> [email protected]:wuzhy/kernel.git hot_tracking
> If you're interested, you can also review them via
> https://github.com/wuzhy/kernel/commits/hot_tracking
hi, guys,

The latest code changes have been pushed to my dev git tree above. If
there are no further comments, I will post the next version soon.

>
> For more info, please check hot_tracking.txt in Documentation
>
> TODO List:
>
> 1.) Need to do scalability or performance tests.
> 2.) Need one simpler but effective temp calc'ing function
> 3.) How to save the file temperature across umount to be able to
> preserve the file temperature after reboot
>
> Ben Chociej, Matt Lupfer and Conor Scott originally wrote this code to
> be very btrfs-specific. I've taken their code and attempted to
> make it more generic and integrate it at the VFS level.
>
> Changelog from v3:
> 1.) Rewritten debugfs support based seq_file operation. [Dave Chinner]
> 2.) Refactored workqueue support. [Dave Chinner]
> 3.) Turn some macros into tunables [Zhiyong, Zheng Liu]
> TIME_TO_KICK, and HEAT_UPDATE_DELAY
> 4.) Introduce hot func registering framework [Zhiyong]
> 5.) Remove global variable for hot tracking [Zhiyong]
> 6.) Add xfs hot tracking support [Dave Chinner]
> 7.) Add ext4 hot tracking support [Zheng Liu]
> 8.) Cleaned up a lot of other issues [Dave Chinner]
>
> v3:
> 1.) Converted to Radix trees, not RB-tree [Zhiyong, Dave Chinner]
> 2.) Added memory shrinker [Dave Chinner]
>
> v2:
> 1.) Converted to one workqueue to update map info periodically [Dave Chinner]
> 2.) Cleaned up a lot of other issues [Dave Chinner]
>
> v1:
> 1.) Reduce new files and put all in fs/hot_tracking.[ch] [Dave Chinner]
> 2.) Add btrfs hot tracking support [Zhiyong]
> 3.) The first three patches can probably just be flattened into one.
> [Marco Stornelli , Dave Chinner]
>
> Dave Chinner (1):
> xfs: add hot tracking support
>
> Zheng Liu (1):
> ext4: add hot tracking support
>
> Zhi Yong Wu (17):
> vfs: introduce private radix tree structures
> vfs: initialize and free data structures
> vfs: add I/O frequency update function
> vfs: add two map arrays
> vfs: add hooks to enable hot tracking
> vfs: add temp calculation function
> vfs: add map info update function
> vfs: add aging function
> vfs: add one work queue
> vfs: introduce hot func register framework
> vfs: register one shrinker
> vfs: add one ioctl interface
> debugfs: introduce one function
> vfs: add debugfs support
> sysfs: add two hot_track proc files
> btrfs: add hot tracking support
> vfs: add documentation
>
> Documentation/filesystems/00-INDEX | 2 +
> Documentation/filesystems/hot_tracking.txt | 262 ++++++
> fs/Makefile | 2 +-
> fs/btrfs/ctree.h | 1 +
> fs/btrfs/super.c | 22 +-
> fs/compat_ioctl.c | 5 +
> fs/dcache.c | 2 +
> fs/debugfs/inode.c | 26 +
> fs/direct-io.c | 6 +
> fs/ext4/ext4.h | 3 +
> fs/ext4/super.c | 13 +-
> fs/hot_tracking.c | 1367 ++++++++++++++++++++++++++++
> fs/hot_tracking.h | 58 ++
> fs/ioctl.c | 78 ++
> fs/xfs/xfs_mount.h | 1 +
> fs/xfs/xfs_super.c | 16 +
> include/linux/debugfs.h | 9 +
> include/linux/fs.h | 4 +
> include/linux/hot_tracking.h | 149 +++
> kernel/sysctl.c | 14 +
> mm/filemap.c | 6 +
> mm/page-writeback.c | 12 +
> mm/readahead.c | 6 +
> 23 files changed, 2061 insertions(+), 3 deletions(-)
> create mode 100644 Documentation/filesystems/hot_tracking.txt
> create mode 100644 fs/hot_tracking.c
> create mode 100644 fs/hot_tracking.h
> create mode 100644 include/linux/hot_tracking.h
>
> --
> 1.7.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html



--
Regards,

Zhi Yong Wu

2012-11-07 18:49:52

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Wed, Nov 07, 2012 at 04:27:05PM +0800, Zhi Yong Wu wrote:
> On Wed, Nov 7, 2012 at 6:45 AM, Darrick J. Wong <[email protected]> wrote:
> > On Mon, Oct 29, 2012 at 12:30:45PM +0800, [email protected] wrote:
> >> From: Zhi Yong Wu <[email protected]>
> >>
> >> Add some util helpers to update access frequencies
> >> for one file or its range.
> >>
> >> Signed-off-by: Zhi Yong Wu <[email protected]>
> >> ---
> >> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
> >> fs/hot_tracking.h | 7 ++
> >> include/linux/hot_tracking.h | 2 +
> >> 3 files changed, 188 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> >> index 68591f0..0a7d9a3 100644
> >> --- a/fs/hot_tracking.c
> >> +++ b/fs/hot_tracking.c
> >> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
> >> }
> >> }
> >>
> >> +struct hot_inode_item
> >> +*hot_inode_item_find(struct hot_info *root, u64 ino)
> >> +{
> >> + struct hot_inode_item *he;
> >> + int ret;
> >> +
> >> +again:
> >> + spin_lock(&root->lock);
> >> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
> >> + if (he) {
> >> + kref_get(&he->hot_inode.refs);
> >> + spin_unlock(&root->lock);
> >> + return he;
> >> + }
> >> + spin_unlock(&root->lock);
> >> +
> >> + he = kmem_cache_zalloc(hot_inode_item_cachep,
> >> + GFP_KERNEL | GFP_NOFS);
> >> + if (!he)
> >> + return ERR_PTR(-ENOMEM);
> >> +
> >> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
> >> +
> >> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> >> + if (ret) {
> >> + kmem_cache_free(hot_inode_item_cachep, he);
> >> + return ERR_PTR(ret);
> >> + }
> >> +
> >> + spin_lock(&root->lock);
> >> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
> >> + if (ret == -EEXIST) {
> >> + kmem_cache_free(hot_inode_item_cachep, he);
> >> + spin_unlock(&root->lock);
> >> + radix_tree_preload_end();
> >> + goto again;
> >> + }
> >> + spin_unlock(&root->lock);
> >> + radix_tree_preload_end();
> >> +
> >> + kref_get(&he->hot_inode.refs);
> >> + return he;
> >> +}
> >> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
> >> +
> >> +static struct hot_range_item
> >> +*hot_range_item_find(struct hot_inode_item *he,
> >> + u32 start)
> >> +{
> >> + struct hot_range_item *hr;
> >> + int ret;
> >> +
> >> +again:
> >> + spin_lock(&he->lock);
> >> + hr = radix_tree_lookup(&he->hot_range_tree, start);
> >> + if (hr) {
> >> + kref_get(&hr->hot_range.refs);
> >> + spin_unlock(&he->lock);
> >> + return hr;
> >> + }
> >> + spin_unlock(&he->lock);
> >> +
> >> + hr = kmem_cache_zalloc(hot_range_item_cachep,
> >> + GFP_KERNEL | GFP_NOFS);
> >> + if (!hr)
> >> + return ERR_PTR(-ENOMEM);
> >> +
> >> + hot_range_item_init(hr, start, he);
> >> +
> >> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> >> + if (ret) {
> >> + kmem_cache_free(hot_range_item_cachep, hr);
> >> + return ERR_PTR(ret);
> >> + }
> >> +
> >> + spin_lock(&he->lock);
> >> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
> >> + if (ret == -EEXIST) {
> >> + kmem_cache_free(hot_range_item_cachep, hr);
> >> + spin_unlock(&he->lock);
> >> + radix_tree_preload_end();
> >> + goto again;
> >> + }
> >> + spin_unlock(&he->lock);
> >> + radix_tree_preload_end();
> >> +
> >> + kref_get(&hr->hot_range.refs);
> >> + return hr;
> >> +}
> >> +
> >> +/*
> >> + * This function does the actual work of updating
> >> + * the frequency numbers, whatever they turn out to be.
> >> + */
> >> +static u64 hot_average_update(struct timespec old_atime,
> >> + struct timespec cur_time, u64 old_avg)
> >> +{
> >> + struct timespec delta_ts;
> >> + u64 new_avg;
> >> + u64 new_delta;
> >> +
> >> + delta_ts = timespec_sub(cur_time, old_atime);
> >> + new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
> >> +
> >> + new_avg = (old_avg << FREQ_POWER) - old_avg + new_delta;
> >> + new_avg = new_avg >> FREQ_POWER;
> >> +
> >> + return new_avg;
> >> +}
> >> +
> >> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
> >> +{
> >> + struct timespec cur_time = current_kernel_time();
> >> +
> >> + if (write) {
> >> + freq_data->nr_writes += 1;
> >> + freq_data->avg_delta_writes = hot_average_update(
> >> + freq_data->last_write_time,
> >> + cur_time,
> >> + freq_data->avg_delta_writes);
> >> + freq_data->last_write_time = cur_time;
> >> + } else {
> >> + freq_data->nr_reads += 1;
> >> + freq_data->avg_delta_reads = hot_average_update(
> >> + freq_data->last_read_time,
> >> + cur_time,
> >> + freq_data->avg_delta_reads);
> >
> > I think you could just pass in a pointer to
> > freq_data->avg_delta_{writes,reads} here...
> why?

freq_data->avg_delta_{reads,writes} seems to be an in/out parameter, but by
specifying it once as an in parameter and again as an lvalue, you're increasing
the chances that someone will screw it up some time later -- you're not
preventing me from accidentally writing this:

freq_data->avg_delta_writes = hot_average_update(..., freq_data->avg_delta_reads);

...which (at least in my head) becomes an easier mistake to make once you start
mixing in the function pointers a few patches later, and (my) brain has to wrap
itself around all the punctuation.

> >> + freq_data->last_read_time = cur_time;
> >> + }
> >> +}
> >> +
> >> /*
> >> * Initialize kmem cache for hot_inode_item and hot_range_item.
> >> */
> >> @@ -199,6 +330,54 @@ err:
> >> EXPORT_SYMBOL_GPL(hot_cache_init);
> >>
> >> /*
> >> + * Main function to update access frequency from read/writepage(s) hooks
> >> + */
> >> +void hot_update_freqs(struct inode *inode, u64 start,
> >> + u64 len, int rw)
> >> +{
> >> + struct hot_info *root = inode->i_sb->s_hot_root;
> >> + struct hot_inode_item *he;
> >> + struct hot_range_item *hr;
> >> + u32 cur, end;
> >> +
> >> + if (!root || (len == 0))
> >> + return;
> >> +
> >> + he = hot_inode_item_find(root, inode->i_ino);
> >> + if (IS_ERR(he)) {
> >> + WARN_ON(1);
> >> + return;
> >> + }
> >> +
> >> + spin_lock(&he->hot_inode.lock);
> >> + hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
> >> + spin_unlock(&he->hot_inode.lock);
> >> +
> >> + /*
> >> + * Align ranges on RANGE_SIZE boundary
> >> + * to prevent proliferation of range structs
> >> + */
> >> + end = (start + len + RANGE_SIZE - 1) >> RANGE_BITS;
> >> + for (cur = (start >> RANGE_BITS); cur < end; cur++) {
> >
> > Hm... start is u64, cur is u32, RANGE_BITS is 20. Doesn't this overflow if,
> > say, I have a sparse file with blocks way out at 2^53 bytes?
> ah, good catch, thanks.

Actually, I should go further -- why not use loff_t? The rest of the fs/ code
does.

> > Also, RANGE_SIZE means that the hot tracking range granularity is 1MiB? How
> yes.
> > did you decide on that? Will we ever want to change that?
> It is one assumption, do you think 1 MB is not appropriate? Do you
> mean to add one proc file interface for it?

I don't know about a procfs interface -- debugfs, perhaps?

But actually, I was thinking that the fs might have a better idea of the range
granularity that it wants to handle. Possibly it might be useful to try to
align with raid stripes or other topology, too... though that's difficult.

Also, for the fses that use allocation units (clusters), it might be useful
to collect heat data per-cluster.

On the other hand, it might not make much of a difference since most files tend
to fit in ~4K anyway, and the extra granularity will increase memory
consumption for large files. I don't mind having a 1MB default, but having a
knob would certainly make it easier to tune, or in the future, to test if that
1MB default still makes sense.

--D

> >
> >> + hr = hot_range_item_find(he, cur);
> >> + if (IS_ERR(hr)) {
> >> + WARN_ON(1);
> >
> > WARN(1, "hot_range_item_find returns %d\n", PTR_ERR(hr)); ?
> OK, done.
> >
> > --D
> >
> >> + hot_inode_item_put(he);
> >> + return;
> >> + }
> >> +
> >> + spin_lock(&hr->hot_range.lock);
> >> + hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
> >> + spin_unlock(&hr->hot_range.lock);
> >> +
> >> + hot_range_item_put(hr);
> >> + }
> >> +
> >> + hot_inode_item_put(he);
> >> +}
> >> +EXPORT_SYMBOL_GPL(hot_update_freqs);
> >> +
> >> +/*
> >> * Initialize the data structures for hot data tracking.
> >> */
> >> int hot_track_init(struct super_block *sb)
> >> diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
> >> index e7ba121..cc4666e 100644
> >> --- a/fs/hot_tracking.h
> >> +++ b/fs/hot_tracking.h
> >> @@ -20,6 +20,13 @@
> >> #define FREQ_DATA_TYPE_INODE (1 << 0)
> >> #define FREQ_DATA_TYPE_RANGE (1 << 1)
> >>
> >> +/* size of sub-file ranges */
> >> +#define RANGE_BITS 20
> >> +#define RANGE_SIZE (1 << RANGE_BITS)
> >> +
> >> +#define FREQ_POWER 4
> >> +
> >> void hot_inode_item_put(struct hot_inode_item *he);
> >> +struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);
> >>
> >> #endif /* __HOT_TRACKING__ */
> >> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
> >> index 4233207..e2d6028 100644
> >> --- a/include/linux/hot_tracking.h
> >> +++ b/include/linux/hot_tracking.h
> >> @@ -71,5 +71,7 @@ struct hot_info {
> >> extern void __init hot_cache_init(void);
> >> extern int hot_track_init(struct super_block *sb);
> >> extern void hot_track_exit(struct super_block *sb);
> >> +extern void hot_update_freqs(struct inode *inode, u64 start,
> >> + u64 len, int rw);
> >>
> >> #endif /* _LINUX_HOTTRACK_H */
> >> --
> >> 1.7.6.5
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >> the body of a message to [email protected]
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Regards,
>
> Zhi Yong Wu
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2012-11-07 18:58:55

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

On Wed, Nov 07, 2012 at 04:34:35PM +0800, Zhi Yong Wu wrote:
> On Wed, Nov 7, 2012 at 7:30 AM, Darrick J. Wong <[email protected]> wrote:
> > On Mon, Oct 29, 2012 at 12:30:52PM +0800, [email protected] wrote:
> >> From: Zhi Yong Wu <[email protected]>
> >>
> >> Introduce a framework that enables a specific FS
> >> to register its own hot tracking functions.
> >>
> >> Signed-off-by: Zhi Yong Wu <[email protected]>
> >> ---
> >> fs/hot_tracking.c | 78 ++++++++++++++++++++++++++++++++++++++----
> >> include/linux/hot_tracking.h | 25 +++++++++++++
> >> 2 files changed, 96 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> >> index 0ef9cad..c6c6138 100644
> >> --- a/fs/hot_tracking.c
> >> +++ b/fs/hot_tracking.c
> >> @@ -24,6 +24,9 @@
> >> #include <linux/limits.h>
> >> #include "hot_tracking.h"
> >>
> >> +static DEFINE_SPINLOCK(hot_func_list_lock);
> >> +static LIST_HEAD(hot_func_list);
> >> +
> >> /* kmem_cache pointers for slab caches */
> >> static struct kmem_cache *hot_inode_item_cachep __read_mostly;
> >> static struct kmem_cache *hot_range_item_cachep __read_mostly;
> >> @@ -305,20 +308,23 @@ static u64 hot_average_update(struct timespec old_atime,
> >> return new_avg;
> >> }
> >>
> >> -static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
> >> +static void hot_freq_data_update(struct hot_info *root,
> >> + struct hot_freq_data *freq_data, bool write)
> >> {
> >> struct timespec cur_time = current_kernel_time();
> >>
> >> if (write) {
> >> freq_data->nr_writes += 1;
> >> - freq_data->avg_delta_writes = hot_average_update(
> >> + freq_data->avg_delta_writes =
> >> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
> >> freq_data->last_write_time,
> >> cur_time,
> >> freq_data->avg_delta_writes);
> >> freq_data->last_write_time = cur_time;
> >> } else {
> >> freq_data->nr_reads += 1;
> >> - freq_data->avg_delta_reads = hot_average_update(
> >> + freq_data->avg_delta_reads =
> >> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
> >> freq_data->last_read_time,
> >> cur_time,
> >> freq_data->avg_delta_reads);
> >> @@ -430,7 +436,7 @@ static void hot_map_array_update(struct hot_freq_data *freq_data,
> >> struct hot_comm_item *comm_item;
> >> struct hot_inode_item *he;
> >> struct hot_range_item *hr;
> >> - u32 temp = hot_temp_calc(freq_data);
> >> + u32 temp = root->hot_func_type->ops.hot_temp_calc_fn(freq_data);
> >> u8 a_temp = temp >> (32 - HEAT_MAP_BITS);
> >> u8 b_temp = freq_data->last_temp >> (32 - HEAT_MAP_BITS);
> >>
> >> @@ -511,7 +517,7 @@ static void hot_range_update(struct hot_inode_item *he,
> >> &hr_nodes[i]->hot_range.hot_freq_data, root);
> >>
> >> spin_lock(&hr_nodes[i]->hot_range.lock);
> >> - obsolete = hot_is_obsolete(
> >> + obsolete = root->hot_func_type->ops.hot_is_obsolete_fn(
> >> &hr_nodes[i]->hot_range.hot_freq_data);
> >> spin_unlock(&hr_nodes[i]->hot_range.lock);
> >>
> >> @@ -668,7 +674,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
> >> }
> >>
> >> spin_lock(&he->hot_inode.lock);
> >> - hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
> >> + hot_freq_data_update(root, &he->hot_inode.hot_freq_data, rw);
> >> spin_unlock(&he->hot_inode.lock);
> >>
> >> /*
> >> @@ -685,7 +691,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
> >> }
> >>
> >> spin_lock(&hr->hot_range.lock);
> >> - hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
> >> + hot_freq_data_update(root, &hr->hot_range.hot_freq_data, rw);
> >> spin_unlock(&hr->hot_range.lock);
> >>
> >> hot_range_item_put(hr);
> >> @@ -695,6 +701,61 @@ void hot_update_freqs(struct inode *inode, u64 start,
> >> }
> >> EXPORT_SYMBOL_GPL(hot_update_freqs);
> >>
> >> +static struct hot_func_type hot_func_def = {
> >> + .hot_func_name = "hot_type_def",
> >> + .ops = {
> >> + .hot_rw_freq_calc_fn = hot_average_update,
> >> + .hot_temp_calc_fn = hot_temp_calc,
> >> + .hot_is_obsolete_fn = hot_is_obsolete,
> >> + },
> >> +};
> >
> > If these hot_ops are per-filesystem, why not just embed a struct hot_func_ops
> > inside of struct file_system_type? That eliminates this _get function,
> this _get function is very small, only some loc, if hot_func_ops is
> embedded in struct file_system_type, i am afraid to introduce some
> regressions....

What kind of regressions are you afraid of, specifically? I don't think fstype
is performance-critical enough to worry about wreaking havoc in the caches due
to adding three function pointers.

> > collision avoidance, etc. You can fill in NULL function pointers in
> fill in NULL func pointer? why?
> > hot_track_init (or just code around them).

I guess you could just require that everyone fill out .hot_temp_calc_fn,
even if they just point it to generic_hot_temp_calc.

--D

> >
> > --D
> >
> >> +
> >> +static struct hot_func_type *hot_func_get(const char *name)
> >> +{
> >> + struct hot_func_type *f, *h = &hot_func_def;
> >> +
> >> + spin_lock(&hot_func_list_lock);
> >> + list_for_each_entry(f, &hot_func_list, list) {
> >> + if (!strcmp(f->hot_func_name, name))
> >> + h = f;
> >> + }
> >> + spin_unlock(&hot_func_list_lock);
> >> +
> >> + return h;
> >> +}
> >> +
> >> +int hot_func_register(struct hot_func_type *h)
> >> +{
> >> + struct hot_func_type *f, *t = NULL;
> >> +
> >> + /* register, don't allow duplicate names */
> >> + spin_lock(&hot_func_list_lock);
> >> + list_for_each_entry(f, &hot_func_list, list) {
> >> + if (!strcmp(f->hot_func_name, h->hot_func_name))
> >> + t = f;
> >> + }
> >> +
> >> + if (t) {
> >> + spin_unlock(&hot_func_list_lock);
> >> + return -EBUSY;
> >> + }
> >> +
> >> + list_add_tail(&h->list, &hot_func_list);
> >> + spin_unlock(&hot_func_list_lock);
> >> +
> >> + return 0;
> >> +}
> >> +EXPORT_SYMBOL_GPL(hot_func_register);
> >> +
> >> +void hot_func_unregister(struct hot_func_type *h)
> >> +{
> >> + /* unregister */
> >> + spin_lock(&hot_func_list_lock);
> >> + list_del_init(&h->list);
> >> + spin_unlock(&hot_func_list_lock);
> >> +}
> >> +EXPORT_SYMBOL_GPL(hot_func_unregister);
> >> +
> >> /*
> >> * Initialize the data structures for hot data tracking.
> >> */
> >> @@ -714,6 +775,9 @@ int hot_track_init(struct super_block *sb)
> >> hot_inode_tree_init(root);
> >> hot_map_array_init(root);
> >>
> >> + /* Get hot func type */
> >> + root->hot_func_type = hot_func_get(sb->s_type->name);
> >> +
> >> root->update_wq = alloc_workqueue(
> >> "hot_update_wq", WQ_NON_REENTRANT, 0);
> >> if (!root->update_wq) {
> >> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
> >> index 2ee0d02..3941052 100644
> >> --- a/include/linux/hot_tracking.h
> >> +++ b/include/linux/hot_tracking.h
> >> @@ -23,6 +23,8 @@
> >> #define HEAT_MAP_BITS 8
> >> #define HEAT_MAP_SIZE (1 << HEAT_MAP_BITS)
> >>
> >> +#define HOT_NAME_MAX 16
> >> +
> >> /*
> >> * A frequency data struct holds values that are used to
> >> * determine temperature of files and file ranges. These structs
> >> @@ -73,6 +75,25 @@ struct hot_range_item {
> >> u32 len; /* length in bytes */
> >> };
> >>
> >> +typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
> >> + struct timespec cur_time, u64 old_avg);
> >> +typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
> >> +typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);
> >> +
> >> +struct hot_func_ops {
> >> + hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
> >> + hot_temp_calc_fn *hot_temp_calc_fn;
> >> + hot_is_obsolete_fn *hot_is_obsolete_fn;
> >> +};
> >> +
> >> +/* identifies an hot func type */
> >> +struct hot_func_type {
> >> + char hot_func_name[HOT_NAME_MAX];
> >> + /* fields provided by specific FS */
> >> + struct hot_func_ops ops;
> >> + struct list_head list;
> >> +};
> >> +
> >> struct hot_info {
> >> struct radix_tree_root hot_inode_tree;
> >> spinlock_t lock; /*protect inode tree */
> >> @@ -85,6 +106,7 @@ struct hot_info {
> >>
> >> struct workqueue_struct *update_wq;
> >> struct delayed_work update_work;
> >> + struct hot_func_type *hot_func_type;
> >> };
> >>
> >> extern void __init hot_cache_init(void);
> >> @@ -93,4 +115,7 @@ extern void hot_track_exit(struct super_block *sb);
> >> extern void hot_update_freqs(struct inode *inode, u64 start,
> >> u64 len, int rw);
> >>
> >> +extern int hot_func_register(struct hot_func_type *h);
> >> +extern void hot_func_unregister(struct hot_func_type *h);
> >> +
> >> #endif /* _LINUX_HOTTRACK_H */
> >> --
> >> 1.7.6.5
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >> the body of a message to [email protected]
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Regards,
>
> Zhi Yong Wu
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2012-11-08 02:52:53

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function

On Thu, Nov 8, 2012 at 2:49 AM, Darrick J. Wong <[email protected]> wrote:
> On Wed, Nov 07, 2012 at 04:27:05PM +0800, Zhi Yong Wu wrote:
>> On Wed, Nov 7, 2012 at 6:45 AM, Darrick J. Wong <[email protected]> wrote:
>> > On Mon, Oct 29, 2012 at 12:30:45PM +0800, [email protected] wrote:
>> >> From: Zhi Yong Wu <[email protected]>
>> >>
>> >> Add some util helpers to update access frequencies
>> >> for one file or its range.
>> >>
>> >> Signed-off-by: Zhi Yong Wu <[email protected]>
>> >> ---
>> >> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
>> >> fs/hot_tracking.h | 7 ++
>> >> include/linux/hot_tracking.h | 2 +
>> >> 3 files changed, 188 insertions(+), 0 deletions(-)
>> >>
>> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> >> index 68591f0..0a7d9a3 100644
>> >> --- a/fs/hot_tracking.c
>> >> +++ b/fs/hot_tracking.c
>> >> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
>> >> }
>> >> }
>> >>
>> >> +struct hot_inode_item
>> >> +*hot_inode_item_find(struct hot_info *root, u64 ino)
>> >> +{
>> >> + struct hot_inode_item *he;
>> >> + int ret;
>> >> +
>> >> +again:
>> >> + spin_lock(&root->lock);
>> >> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
>> >> + if (he) {
>> >> + kref_get(&he->hot_inode.refs);
>> >> + spin_unlock(&root->lock);
>> >> + return he;
>> >> + }
>> >> + spin_unlock(&root->lock);
>> >> +
>> >> + he = kmem_cache_zalloc(hot_inode_item_cachep,
>> >> + GFP_KERNEL | GFP_NOFS);
>> >> + if (!he)
>> >> + return ERR_PTR(-ENOMEM);
>> >> +
>> >> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
>> >> +
>> >> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> >> + if (ret) {
>> >> + kmem_cache_free(hot_inode_item_cachep, he);
>> >> + return ERR_PTR(ret);
>> >> + }
>> >> +
>> >> + spin_lock(&root->lock);
>> >> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
>> >> + if (ret == -EEXIST) {
>> >> + kmem_cache_free(hot_inode_item_cachep, he);
>> >> + spin_unlock(&root->lock);
>> >> + radix_tree_preload_end();
>> >> + goto again;
>> >> + }
>> >> + spin_unlock(&root->lock);
>> >> + radix_tree_preload_end();
>> >> +
>> >> + kref_get(&he->hot_inode.refs);
>> >> + return he;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
>> >> +
>> >> +static struct hot_range_item
>> >> +*hot_range_item_find(struct hot_inode_item *he,
>> >> + u32 start)
>> >> +{
>> >> + struct hot_range_item *hr;
>> >> + int ret;
>> >> +
>> >> +again:
>> >> + spin_lock(&he->lock);
>> >> + hr = radix_tree_lookup(&he->hot_range_tree, start);
>> >> + if (hr) {
>> >> + kref_get(&hr->hot_range.refs);
>> >> + spin_unlock(&he->lock);
>> >> + return hr;
>> >> + }
>> >> + spin_unlock(&he->lock);
>> >> +
>> >> + hr = kmem_cache_zalloc(hot_range_item_cachep,
>> >> + GFP_KERNEL | GFP_NOFS);
>> >> + if (!hr)
>> >> + return ERR_PTR(-ENOMEM);
>> >> +
>> >> + hot_range_item_init(hr, start, he);
>> >> +
>> >> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
>> >> + if (ret) {
>> >> + kmem_cache_free(hot_range_item_cachep, hr);
>> >> + return ERR_PTR(ret);
>> >> + }
>> >> +
>> >> + spin_lock(&he->lock);
>> >> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
>> >> + if (ret == -EEXIST) {
>> >> + kmem_cache_free(hot_range_item_cachep, hr);
>> >> + spin_unlock(&he->lock);
>> >> + radix_tree_preload_end();
>> >> + goto again;
>> >> + }
>> >> + spin_unlock(&he->lock);
>> >> + radix_tree_preload_end();
>> >> +
>> >> + kref_get(&hr->hot_range.refs);
>> >> + return hr;
>> >> +}
>> >> +
>> >> +/*
>> >> + * This function does the actual work of updating
>> >> + * the frequency numbers, whatever they turn out to be.
>> >> + */
>> >> +static u64 hot_average_update(struct timespec old_atime,
>> >> + struct timespec cur_time, u64 old_avg)
>> >> +{
>> >> + struct timespec delta_ts;
>> >> + u64 new_avg;
>> >> + u64 new_delta;
>> >> +
>> >> + delta_ts = timespec_sub(cur_time, old_atime);
>> >> + new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
>> >> +
>> >> + new_avg = (old_avg << FREQ_POWER) - old_avg + new_delta;
>> >> + new_avg = new_avg >> FREQ_POWER;
>> >> +
>> >> + return new_avg;
>> >> +}
>> >> +
>> >> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
>> >> +{
>> >> + struct timespec cur_time = current_kernel_time();
>> >> +
>> >> + if (write) {
>> >> + freq_data->nr_writes += 1;
>> >> + freq_data->avg_delta_writes = hot_average_update(
>> >> + freq_data->last_write_time,
>> >> + cur_time,
>> >> + freq_data->avg_delta_writes);
>> >> + freq_data->last_write_time = cur_time;
>> >> + } else {
>> >> + freq_data->nr_reads += 1;
>> >> + freq_data->avg_delta_reads = hot_average_update(
>> >> + freq_data->last_read_time,
>> >> + cur_time,
>> >> + freq_data->avg_delta_reads);
>> >
>> > I think you could just pass in a pointer to
>> > freq_data->avg_delta_{writes,reads} here...
>> why?
>
> freq_data->avg_delta_{reads,writes} seems to be an in/out parameter, but by
> specifying it once as an in parameter and again as an lvalue, you're increasing
> the chances that someone will screw it up some time later -- you're not
> preventing me from accidentally writing this:
>
> freq_data->avg_delta_writes = hot_average_update(..., freq_data->avg_delta_reads);
>
> ...which (at least in my head) becomes an easier mistake to make once you start
> mixing in the function pointers a few patches later, and (my) brain has to wrap
> itself around all the punctuation.
>
>> >> + freq_data->last_read_time = cur_time;
>> >> + }
>> >> +}
>> >> +
>> >> /*
>> >> * Initialize kmem cache for hot_inode_item and hot_range_item.
>> >> */
>> >> @@ -199,6 +330,54 @@ err:
>> >> EXPORT_SYMBOL_GPL(hot_cache_init);
>> >>
>> >> /*
>> >> + * Main function to update access frequency from read/writepage(s) hooks
>> >> + */
>> >> +void hot_update_freqs(struct inode *inode, u64 start,
>> >> + u64 len, int rw)
>> >> +{
>> >> + struct hot_info *root = inode->i_sb->s_hot_root;
>> >> + struct hot_inode_item *he;
>> >> + struct hot_range_item *hr;
>> >> + u32 cur, end;
>> >> +
>> >> + if (!root || (len == 0))
>> >> + return;
>> >> +
>> >> + he = hot_inode_item_find(root, inode->i_ino);
>> >> + if (IS_ERR(he)) {
>> >> + WARN_ON(1);
>> >> + return;
>> >> + }
>> >> +
>> >> + spin_lock(&he->hot_inode.lock);
>> >> + hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
>> >> + spin_unlock(&he->hot_inode.lock);
>> >> +
>> >> + /*
>> >> + * Align ranges on RANGE_SIZE boundary
>> >> + * to prevent proliferation of range structs
>> >> + */
>> >> + end = (start + len + RANGE_SIZE - 1) >> RANGE_BITS;
>> >> + for (cur = (start >> RANGE_BITS); cur < end; cur++) {
>> >
>> > Hm... start is u64, cur is u32, RANGE_BITS is 20. Doesn't this overflow if,
>> > say, I have a sparse file with blocks way out at 2^53 bytes?
>> ah, good catch, thanks.
>
> Actually, I should go further -- why not use loff_t? The rest of the fs/ code
> does.
done, thanks.
>
>> > Also, RANGE_SIZE means that the hot tracking range granularity is 1MiB? How
>> yes.
>> > did you decide on that? Will we ever want to change that?
>> It is just an assumption. Do you think 1 MB is not appropriate? Do you
>> mean that we should add a proc file interface for it?
>
> I don't know about a procfs interface -- debugfs, perhaps?
>
> But actually, I was thinking that the fs might have a better idea of the range
> granularity that it wants to handle. Possibly it might be useful to try to
> align with raid stripes or other topology, too... though that's difficult.
>
> Also, for the fses that use allocation units (clusters), it might be useful
> to collect heat data per-cluster.
>
> On the other hand, it might not make much of a difference since most files tend
> to fit in ~4K anyway, and the extra granularity will increase memory
> consumption for large files. I don't mind having a 1MB default, but having a
> knob would certainly make it easier to tune, or in the future, to test if that
> 1MB default still makes sense.
thanks.
>
> --D
>
>> >
>> >> + hr = hot_range_item_find(he, cur);
>> >> + if (IS_ERR(hr)) {
>> >> + WARN_ON(1);
>> >
>> > WARN(1, "hot_range_item_find returns %ld\n", PTR_ERR(hr)); ?
>> OK, done.
>> >
>> > --D
>> >
>> >> + hot_inode_item_put(he);
>> >> + return;
>> >> + }
>> >> +
>> >> + spin_lock(&hr->hot_range.lock);
>> >> + hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
>> >> + spin_unlock(&hr->hot_range.lock);
>> >> +
>> >> + hot_range_item_put(hr);
>> >> + }
>> >> +
>> >> + hot_inode_item_put(he);
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(hot_update_freqs);
>> >> +
>> >> +/*
>> >> * Initialize the data structures for hot data tracking.
>> >> */
>> >> int hot_track_init(struct super_block *sb)
>> >> diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
>> >> index e7ba121..cc4666e 100644
>> >> --- a/fs/hot_tracking.h
>> >> +++ b/fs/hot_tracking.h
>> >> @@ -20,6 +20,13 @@
>> >> #define FREQ_DATA_TYPE_INODE (1 << 0)
>> >> #define FREQ_DATA_TYPE_RANGE (1 << 1)
>> >>
>> >> +/* size of sub-file ranges */
>> >> +#define RANGE_BITS 20
>> >> +#define RANGE_SIZE (1 << RANGE_BITS)
>> >> +
>> >> +#define FREQ_POWER 4
>> >> +
>> >> void hot_inode_item_put(struct hot_inode_item *he);
>> >> +struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);
>> >>
>> >> #endif /* __HOT_TRACKING__ */
>> >> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
>> >> index 4233207..e2d6028 100644
>> >> --- a/include/linux/hot_tracking.h
>> >> +++ b/include/linux/hot_tracking.h
>> >> @@ -71,5 +71,7 @@ struct hot_info {
>> >> extern void __init hot_cache_init(void);
>> >> extern int hot_track_init(struct super_block *sb);
>> >> extern void hot_track_exit(struct super_block *sb);
>> >> +extern void hot_update_freqs(struct inode *inode, u64 start,
>> >> + u64 len, int rw);
>> >>
>> >> #endif /* _LINUX_HOTTRACK_H */
>> >> --
>> >> 1.7.6.5
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> >> the body of a message to [email protected]
>> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>> --
>> Regards,
>>
>> Zhi Yong Wu
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html



--
Regards,

Zhi Yong Wu

2012-11-08 02:59:04

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework

On Thu, Nov 8, 2012 at 2:58 AM, Darrick J. Wong <[email protected]> wrote:
> On Wed, Nov 07, 2012 at 04:34:35PM +0800, Zhi Yong Wu wrote:
>> On Wed, Nov 7, 2012 at 7:30 AM, Darrick J. Wong <[email protected]> wrote:
>> > On Mon, Oct 29, 2012 at 12:30:52PM +0800, [email protected] wrote:
>> >> From: Zhi Yong Wu <[email protected]>
>> >>
>> >> Introduce a framework so that a specific FS
>> >> can register its own hot tracking functions.
>> >>
>> >> Signed-off-by: Zhi Yong Wu <[email protected]>
>> >> ---
>> >> fs/hot_tracking.c | 78 ++++++++++++++++++++++++++++++++++++++----
>> >> include/linux/hot_tracking.h | 25 +++++++++++++
>> >> 2 files changed, 96 insertions(+), 7 deletions(-)
>> >>
>> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
>> >> index 0ef9cad..c6c6138 100644
>> >> --- a/fs/hot_tracking.c
>> >> +++ b/fs/hot_tracking.c
>> >> @@ -24,6 +24,9 @@
>> >> #include <linux/limits.h>
>> >> #include "hot_tracking.h"
>> >>
>> >> +static DEFINE_SPINLOCK(hot_func_list_lock);
>> >> +static LIST_HEAD(hot_func_list);
>> >> +
>> >> /* kmem_cache pointers for slab caches */
>> >> static struct kmem_cache *hot_inode_item_cachep __read_mostly;
>> >> static struct kmem_cache *hot_range_item_cachep __read_mostly;
>> >> @@ -305,20 +308,23 @@ static u64 hot_average_update(struct timespec old_atime,
>> >> return new_avg;
>> >> }
>> >>
>> >> -static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
>> >> +static void hot_freq_data_update(struct hot_info *root,
>> >> + struct hot_freq_data *freq_data, bool write)
>> >> {
>> >> struct timespec cur_time = current_kernel_time();
>> >>
>> >> if (write) {
>> >> freq_data->nr_writes += 1;
>> >> - freq_data->avg_delta_writes = hot_average_update(
>> >> + freq_data->avg_delta_writes =
>> >> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
>> >> freq_data->last_write_time,
>> >> cur_time,
>> >> freq_data->avg_delta_writes);
>> >> freq_data->last_write_time = cur_time;
>> >> } else {
>> >> freq_data->nr_reads += 1;
>> >> - freq_data->avg_delta_reads = hot_average_update(
>> >> + freq_data->avg_delta_reads =
>> >> + root->hot_func_type->ops.hot_rw_freq_calc_fn(
>> >> freq_data->last_read_time,
>> >> cur_time,
>> >> freq_data->avg_delta_reads);
>> >> @@ -430,7 +436,7 @@ static void hot_map_array_update(struct hot_freq_data *freq_data,
>> >> struct hot_comm_item *comm_item;
>> >> struct hot_inode_item *he;
>> >> struct hot_range_item *hr;
>> >> - u32 temp = hot_temp_calc(freq_data);
>> >> + u32 temp = root->hot_func_type->ops.hot_temp_calc_fn(freq_data);
>> >> u8 a_temp = temp >> (32 - HEAT_MAP_BITS);
>> >> u8 b_temp = freq_data->last_temp >> (32 - HEAT_MAP_BITS);
>> >>
>> >> @@ -511,7 +517,7 @@ static void hot_range_update(struct hot_inode_item *he,
>> >> &hr_nodes[i]->hot_range.hot_freq_data, root);
>> >>
>> >> spin_lock(&hr_nodes[i]->hot_range.lock);
>> >> - obsolete = hot_is_obsolete(
>> >> + obsolete = root->hot_func_type->ops.hot_is_obsolete_fn(
>> >> &hr_nodes[i]->hot_range.hot_freq_data);
>> >> spin_unlock(&hr_nodes[i]->hot_range.lock);
>> >>
>> >> @@ -668,7 +674,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
>> >> }
>> >>
>> >> spin_lock(&he->hot_inode.lock);
>> >> - hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
>> >> + hot_freq_data_update(root, &he->hot_inode.hot_freq_data, rw);
>> >> spin_unlock(&he->hot_inode.lock);
>> >>
>> >> /*
>> >> @@ -685,7 +691,7 @@ void hot_update_freqs(struct inode *inode, u64 start,
>> >> }
>> >>
>> >> spin_lock(&hr->hot_range.lock);
>> >> - hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
>> >> + hot_freq_data_update(root, &hr->hot_range.hot_freq_data, rw);
>> >> spin_unlock(&hr->hot_range.lock);
>> >>
>> >> hot_range_item_put(hr);
>> >> @@ -695,6 +701,61 @@ void hot_update_freqs(struct inode *inode, u64 start,
>> >> }
>> >> EXPORT_SYMBOL_GPL(hot_update_freqs);
>> >>
>> >> +static struct hot_func_type hot_func_def = {
>> >> + .hot_func_name = "hot_type_def",
>> >> + .ops = {
>> >> + .hot_rw_freq_calc_fn = hot_average_update,
>> >> + .hot_temp_calc_fn = hot_temp_calc,
>> >> + .hot_is_obsolete_fn = hot_is_obsolete,
>> >> + },
>> >> +};
>> >
>> > If these hot_ops are per-filesystem, why not just embed a struct hot_func_ops
>> > inside of struct file_system_type? That eliminates this _get function,
>> This _get function is very small, only a few lines of code. If hot_func_ops
>> is embedded in struct file_system_type, I am afraid it will introduce some
>> regressions....
>
> What kind of regressions are you afraid of, specifically? I don't think fstype
> is performance-critical enough to worry about wreaking havoc in the caches due
> to adding three function pointers.
done, thanks.
>
>> > collision avoidance, etc. You can fill in NULL function pointers in
>> fill in NULL func pointer? why?
>> > hot_track_init (or just code around them).
>
> I guess you could just require that everyone fill out .hot_temp_calc_fn,
> even if they just point it to generic_hot_temp_calc.
>
> --D
>
>> >
>> > --D
>> >
>> >> +
>> >> +static struct hot_func_type *hot_func_get(const char *name)
>> >> +{
>> >> + struct hot_func_type *f, *h = &hot_func_def;
>> >> +
>> >> + spin_lock(&hot_func_list_lock);
>> >> + list_for_each_entry(f, &hot_func_list, list) {
>> >> + if (!strcmp(f->hot_func_name, name))
>> >> + h = f;
>> >> + }
>> >> + spin_unlock(&hot_func_list_lock);
>> >> +
>> >> + return h;
>> >> +}
>> >> +
>> >> +int hot_func_register(struct hot_func_type *h)
>> >> +{
>> >> + struct hot_func_type *f, *t = NULL;
>> >> +
>> >> + /* register, don't allow duplicate names */
>> >> + spin_lock(&hot_func_list_lock);
>> >> + list_for_each_entry(f, &hot_func_list, list) {
>> >> + if (!strcmp(f->hot_func_name, h->hot_func_name))
>> >> + t = f;
>> >> + }
>> >> +
>> >> + if (t) {
>> >> + spin_unlock(&hot_func_list_lock);
>> >> + return -EBUSY;
>> >> + }
>> >> +
>> >> + list_add_tail(&h->list, &hot_func_list);
>> >> + spin_unlock(&hot_func_list_lock);
>> >> +
>> >> + return 0;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(hot_func_register);
>> >> +
>> >> +void hot_func_unregister(struct hot_func_type *h)
>> >> +{
>> >> + /* unregister */
>> >> + spin_lock(&hot_func_list_lock);
>> >> + list_del_init(&h->list);
>> >> + spin_unlock(&hot_func_list_lock);
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(hot_func_unregister);
>> >> +
>> >> /*
>> >> * Initialize the data structures for hot data tracking.
>> >> */
>> >> @@ -714,6 +775,9 @@ int hot_track_init(struct super_block *sb)
>> >> hot_inode_tree_init(root);
>> >> hot_map_array_init(root);
>> >>
>> >> + /* Get hot func type */
>> >> + root->hot_func_type = hot_func_get(sb->s_type->name);
>> >> +
>> >> root->update_wq = alloc_workqueue(
>> >> "hot_update_wq", WQ_NON_REENTRANT, 0);
>> >> if (!root->update_wq) {
>> >> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
>> >> index 2ee0d02..3941052 100644
>> >> --- a/include/linux/hot_tracking.h
>> >> +++ b/include/linux/hot_tracking.h
>> >> @@ -23,6 +23,8 @@
>> >> #define HEAT_MAP_BITS 8
>> >> #define HEAT_MAP_SIZE (1 << HEAT_MAP_BITS)
>> >>
>> >> +#define HOT_NAME_MAX 16
>> >> +
>> >> /*
>> >> * A frequency data struct holds values that are used to
>> >> * determine temperature of files and file ranges. These structs
>> >> @@ -73,6 +75,25 @@ struct hot_range_item {
>> >> u32 len; /* length in bytes */
>> >> };
>> >>
>> >> +typedef u64 (hot_rw_freq_calc_fn) (struct timespec old_atime,
>> >> + struct timespec cur_time, u64 old_avg);
>> >> +typedef u32 (hot_temp_calc_fn) (struct hot_freq_data *freq_data);
>> >> +typedef bool (hot_is_obsolete_fn) (struct hot_freq_data *freq_data);
>> >> +
>> >> +struct hot_func_ops {
>> >> + hot_rw_freq_calc_fn *hot_rw_freq_calc_fn;
>> >> + hot_temp_calc_fn *hot_temp_calc_fn;
>> >> + hot_is_obsolete_fn *hot_is_obsolete_fn;
>> >> +};
>> >> +
>> >> +/* identifies a hot func type */
>> >> +struct hot_func_type {
>> >> + char hot_func_name[HOT_NAME_MAX];
>> >> + /* fields provided by specific FS */
>> >> + struct hot_func_ops ops;
>> >> + struct list_head list;
>> >> +};
>> >> +
>> >> struct hot_info {
>> >> struct radix_tree_root hot_inode_tree;
>> >> spinlock_t lock; /*protect inode tree */
>> >> @@ -85,6 +106,7 @@ struct hot_info {
>> >>
>> >> struct workqueue_struct *update_wq;
>> >> struct delayed_work update_work;
>> >> + struct hot_func_type *hot_func_type;
>> >> };
>> >>
>> >> extern void __init hot_cache_init(void);
>> >> @@ -93,4 +115,7 @@ extern void hot_track_exit(struct super_block *sb);
>> >> extern void hot_update_freqs(struct inode *inode, u64 start,
>> >> u64 len, int rw);
>> >>
>> >> +extern int hot_func_register(struct hot_func_type *h);
>> >> +extern void hot_func_unregister(struct hot_func_type *h);
>> >> +
>> >> #endif /* _LINUX_HOTTRACK_H */
>> >> --
>> >> 1.7.6.5
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
>> >> the body of a message to [email protected]
>> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>> --
>> Regards,
>>
>> Zhi Yong Wu
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html



--
Regards,

Zhi Yong Wu

2012-11-16 06:16:15

by Zhi Yong Wu

[permalink] [raw]
Subject: Re: [RFC v4+ hot_track 02/19] vfs: initialize and free data structures

On Wed, Nov 7, 2012 at 6:24 AM, David Sterba <[email protected]> wrote:
> On Mon, Oct 29, 2012 at 12:30:44PM +0800, [email protected] wrote:
>> +/* Frees the entire hot_range_tree. */
>> +static void hot_inode_item_free(struct kref *kref)
>> +{
>> + struct hot_comm_item *comm_item = container_of(kref,
>> + struct hot_comm_item, refs);
>> + struct hot_inode_item *he = container_of(comm_item,
>> + struct hot_inode_item, hot_inode);
>> +
>> + hot_range_tree_free(he);
>> + radix_tree_delete(he->hot_inode_tree, he->i_ino);
>
> void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
>
> and he::i_ino is u64, this will not work when
> sizeof(unsigned long) != sizeof(u64) (iirc this is a known limitation of
> radix tree implementation). This will work on 64bit only, not sure if
> this is intentional.
Fixed, thanks.
>
>> + kmem_cache_free(hot_inode_item_cachep, he);
>> +}
>> +
>> +/* Frees the entire hot_inode_tree. */
>> +static void hot_inode_tree_exit(struct hot_info *root)
>> +{
>> + struct hot_inode_item *hi_nodes[8];
>> + u64 ino = 0;
>> + int i, n;
>
> nitpick, put the declarations on separate lines
>
>> +
>> + while (1) {
>> + spin_lock(&root->lock);
>> + n = radix_tree_gang_lookup(&root->hot_inode_tree,
>> + (void **)hi_nodes, ino,
>> + ARRAY_SIZE(hi_nodes));
>> + if (!n) {
>> + spin_unlock(&root->lock);
>> + break;
>> + }
>> +
>> + ino = hi_nodes[n - 1]->i_ino + 1;
>> + for (i = 0; i < n; i++)
>> + hot_inode_item_put(hi_nodes[i]);
>> + spin_unlock(&root->lock);
>> + }
>> +}
>> +
>> /*
>> * Initialize kmem cache for hot_inode_item and hot_range_item.
>> */
>> @@ -106,3 +197,36 @@ err:
>> kmem_cache_destroy(hot_inode_item_cachep);
>> }
>> EXPORT_SYMBOL_GPL(hot_cache_init);
>> +
>> +/*
>> + * Initialize the data structures for hot data tracking.
>> + */
>> +int hot_track_init(struct super_block *sb)
>> +{
>> + struct hot_info *root;
>> + int ret = -ENOMEM;
>> +
>> + root = kzalloc(sizeof(struct hot_info), GFP_NOFS);
>> + if (!root) {
>> + printk(KERN_ERR "%s: Failed to malloc memory for "
>> + "hot_info\n", __func__);
>> + return ret;
>
> minor: you can drop the variable ret and just return -ENOMEM here
>
>> + }
>> +
>> + sb->s_hot_root = root;
>> + hot_inode_tree_init(root);
>> +
>> + printk(KERN_INFO "VFS: Turning on hot data tracking\n");
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(hot_track_init);
>
> david



--
Regards,

Zhi Yong Wu