2010-08-12 22:30:00

by Ben Chociej

[permalink] [raw]
Subject: [PATCH 0/2] Btrfs-progs: Add support for hot data migration

This patch set introduces functionality into btrfsctl and mkfs.btrfs to
support the kernel patches for hot data tracking and migration to SSD
with Btrfs. New functionality includes a -h option to mkfs.btrfs to
preallocate appropriate block group types for SSD data migration, and
also includes additional options for btrfsctl to interact with the new
ioctls introduced by the kernel patches.


DIFFSTAT:

btrfsctl.c | 111 +++++++++++++++++++++++++++++++++++++++-
ctree.h | 2 +
extent-tree.c | 2 +-
ioctl-test.c | 3 +
ioctl.h | 24 +++++++++
mkfs.c | 131 ++++++++++++++++++++++++++++++++++++++++-------
utils.c | 1 +
volumes.c | 73 +++++++++++++++++++++++++-
volumes.h | 3 +-
9 files changed, 326 insertions(+), 24 deletions(-)


Signed-off-by: Ben Chociej <[email protected]>
Signed-off-by: Matt Lupfer <[email protected]>
Tested-by: Conor Scott <[email protected]>


2010-08-12 22:30:09

by Ben Chociej

[permalink] [raw]
Subject: [PATCH 1/2] Btrfs-progs: Add support for hot data ioctls

From: Ben Chociej <[email protected]>

Add support for the new hot data functionality in-kernel. Three ioctls
were added to export hot data statistics and turn hot data tracking on
and off per inode. This patch enables btrfsctl to interact with those
ioctls.

Signed-off-by: Ben Chociej <[email protected]>
Signed-off-by: Matt Lupfer <[email protected]>
Tested-by: Conor Scott <[email protected]>
---
btrfsctl.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ioctl-test.c | 3 ++
ioctl.h | 24 +++++++++++++
3 files changed, 134 insertions(+), 0 deletions(-)

diff --git a/btrfsctl.c b/btrfsctl.c
index be6bf25..8617d06 100644
--- a/btrfsctl.c
+++ b/btrfsctl.c
@@ -48,6 +48,7 @@ static void print_usage(void)
{
printf("usage: btrfsctl [ -d file|dir] [ -s snap_name subvol|tree ]\n");
printf(" [-r size] [-A device] [-a] [-c] [-D dir .]\n");
+ printf(" [-t file] [-T file] [-h filename [0-2]]\n");
printf("\t-d filename: defragments one file\n");
printf("\t-d directory: defragments the entire Btree\n");
printf("\t-s snap_name dir: creates a new snapshot of dir\n");
@@ -57,6 +58,14 @@ static void print_usage(void)
printf("\t-a: scans all devices for Btrfs filesystems\n");
printf("\t-c: forces a single FS sync\n");
printf("\t-D: delete snapshot\n");
+ printf("\t-t filename: dump indexed heat information for a file\n");
+ printf("\t-T filename: dump live heat informaton for a file\n");
+ printf("\t-h filename: query heat tracking/migration status\n");
+ printf("\t-h filename level: set heat tracking level:\n");
+ printf("\t\tlevel =\n");
+ printf("\t\t0: no tracking or relocation\n");
+ printf("\t\t1: access tracking only\n");
+ printf("\t\t2: tracking and automatic migration to SSD\n");
printf("\t-m [tree id] directory: set the default mounted subvolume"
" to the [tree id] or the directory\n");
printf("%s\n", BTRFS_BUILD_VERSION);
@@ -99,12 +108,14 @@ int main(int ac, char **av)
int fd;
int ret;
struct btrfs_ioctl_vol_args args;
+ struct btrfs_ioctl_heat_info hotinfo;
char *name = NULL;
int i;
unsigned long command = 0;
int len;
char *fullpath;
u64 objectid = 0;
+ int heatarg;

if (ac == 2 && strcmp(av[1], "-a") == 0) {
fprintf(stderr, "Scanning for Btrfs filesystems\n");
@@ -205,6 +216,38 @@ int main(int ac, char **av)
exit(1);
}
}
+ } else if (strcmp(av[i], "-t") == 0) {
+ if (i >= ac - 1) {
+ fprintf(stderr,
+ "-t requires a file argument\n");
+ print_usage();
+ }
+ hotinfo.live = 0;
+ command = BTRFS_IOC_GET_HEAT_INFO;
+ } else if (strcmp(av[i], "-T") == 0) {
+ if (i >= ac - 1) {
+ fprintf(stderr,
+ "-T requires a file argument\n");
+ print_usage();
+ }
+ hotinfo.live = 1;
+ command = BTRFS_IOC_GET_HEAT_INFO;
+ } else if (strcmp(av[i], "-h") == 0) {
+ if (i == ac - 2) {
+ command = BTRFS_IOC_GET_HEAT_OPTS;
+ } else if (i == ac - 3) {
+ command = BTRFS_IOC_SET_HEAT_OPTS;
+ heatarg = atoi(av[i + 2]);
+ } else {
+ fprintf(stderr, "-h invalid number of "
+ "arguments\n");
+ print_usage();
+ exit(1);
+ }
+
+ fprintf(stderr, "Btrfs hot data tracking: `%s'\n\n",
+ av[i + 1]);
+ av[i + 2] = av[i + 1];
}
}
if (command == 0) {
@@ -236,6 +279,70 @@ int main(int ac, char **av)
} else if (command == BTRFS_IOC_DEFAULT_SUBVOL) {
printf("objectid is %llu\n", objectid);
ret = ioctl(fd, command, &objectid);
+ } else if (command == BTRFS_IOC_GET_HEAT_INFO) {
+ strcpy(hotinfo.filename, fname);
+ ret = ioctl(fd, command, &hotinfo);
+ if (ret == 0) {
+ printf("Btrfs file hotness information\n");
+ printf("%s\n\n", hotinfo.filename);
+ printf("Last write: %llu\n",
+ (u64) hotinfo.last_write_time);
+ printf("Last read: %llu\n",
+ (u64) hotinfo.last_read_time);
+ printf("Average write delta: %llu\n",
+ (u64) hotinfo.avg_delta_writes);
+ printf("Average read delta: %llu\n",
+ (u64) hotinfo.avg_delta_reads);
+ printf("Number of writes: %u\n",
+ (u32) hotinfo.num_writes);
+ printf("Number of reads: %u\n\n",
+ (u32) hotinfo.num_reads);
+ if (hotinfo.live > 0)
+ printf("Temperature (live): %u\n\n",
+ hotinfo.temperature);
+ else
+ printf("Temperature (indexed): %u\n\n",
+ hotinfo.temperature);
+ }
+ } else if (command == BTRFS_IOC_SET_HEAT_OPTS) {
+ ret = ioctl(fd, command, &heatarg);
+ switch (heatarg) {
+ case 0:
+ printf("Turning OFF heat tracking and migration inode "
+ "flags.\n");
+ break;
+ case 1:
+ printf("Turning ON the heat tracking inode flag.\n");
+ printf("Turning OFF the migration inode flag.\n");
+ break;
+ case 2:
+ printf("Turning ON heat tracking and migration inode "
+ "flags.\n");
+ break;
+ default:
+ printf("Invalid heat tracking argument.\n");
+ }
+ printf("(Inode flags can be overridden by mount options)\n\n");
+ } else if (command == BTRFS_IOC_GET_HEAT_OPTS) {
+ ret = ioctl(fd, command, &heatarg);
+ switch (heatarg) {
+ case 0:
+ printf("Heat tracking and migration inode flags are "
+ "OFF.\n");
+ break;
+ case 1:
+ printf("Heat tracking inode flag is ON, migration "
+ "inode flag is OFF.\n");
+ break;
+ case 2:
+ printf("Heat tracking and migration inode flags are "
+ "both ON.\n");
+ break;
+ default:
+ printf("Wrong filesystem type, or invalid status "
+ "returned.\n");
+ }
+ printf("(Inode flags can be overridden by mount options)\n\n");
} else
ret = ioctl(fd, command, &args);
if (ret < 0) {
diff --git a/ioctl-test.c b/ioctl-test.c
index 7cf3bc2..8d54301 100644
--- a/ioctl-test.c
+++ b/ioctl-test.c
@@ -22,6 +22,9 @@ unsigned long ioctls[] = {
BTRFS_IOC_INO_LOOKUP,
BTRFS_IOC_DEFAULT_SUBVOL,
BTRFS_IOC_SPACE_INFO,
+ BTRFS_IOC_GET_HEAT_INFO,
+ BTRFS_IOC_SET_HEAT_OPTS,
+ BTRFS_IOC_GET_HEAT_OPTS,
0 };

int main(int ac, char **av)
diff --git a/ioctl.h b/ioctl.h
index 776d7a9..5827338 100644
--- a/ioctl.h
+++ b/ioctl.h
@@ -132,6 +132,18 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_space_info spaces[0];
};

+struct btrfs_ioctl_heat_info {
+ __u64 avg_delta_reads;
+ __u64 avg_delta_writes;
+ __u64 last_read_time;
+ __u64 last_write_time;
+ __u32 num_reads;
+ __u32 num_writes;
+ char filename[BTRFS_PATH_NAME_MAX + 1];
+ int temperature;
+ __u8 live;
+};
+
#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -169,4 +181,16 @@ struct btrfs_ioctl_space_args {
#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
struct btrfs_ioctl_space_args)
+
+/*
+ * Hot data tracking ioctls:
+ *
+ * GET_HEAT_INFO - retrieve frequency of access info on a file
+ * SET_HEAT_OPTS - set whether a file is tracked/migratable
+ * GET_HEAT_OPTS - check whether a file is tracked/migratable
+ */
+#define BTRFS_IOC_GET_HEAT_INFO _IOWR(BTRFS_IOCTL_MAGIC, 21, \
+ struct btrfs_ioctl_heat_info)
+#define BTRFS_IOC_SET_HEAT_OPTS _IOW(BTRFS_IOCTL_MAGIC, 22, int)
+#define BTRFS_IOC_GET_HEAT_OPTS _IOR(BTRFS_IOCTL_MAGIC, 23, int)
#endif
--
1.7.1

2010-08-12 22:30:15

by Ben Chociej

[permalink] [raw]
Subject: [PATCH 2/2] Btrfs-progs: Add hot data support in mkfs

From: Ben Chociej <[email protected]>

Modified mkfs.btrfs to add hot data relocation option (-h) which
preallocates BTRFS_BLOCK_GROUP_DATA_SSD and
BTRFS_BLOCK_GROUP_METADATA_SSD at mkfs time for future use by hot data
relocation code. Also added a userspace function to detect whether a
block device is an SSD by reading the sysfs block queue rotational flag.

Signed-off-by: Ben Chociej <[email protected]>
Signed-off-by: Matt Lupfer <[email protected]>
Tested-by: Conor Scott <[email protected]>
---
ctree.h | 2 +
extent-tree.c | 2 +-
mkfs.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++--------
utils.c | 1 +
volumes.c | 73 +++++++++++++++++++++++++++++++-
volumes.h | 3 +-
6 files changed, 190 insertions(+), 22 deletions(-)

diff --git a/ctree.h b/ctree.h
index 64ecf12..8c29122 100644
--- a/ctree.h
+++ b/ctree.h
@@ -640,6 +640,8 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+#define BTRFS_BLOCK_GROUP_DATA_SSD (1 << 7)
+#define BTRFS_BLOCK_GROUP_METADATA_SSD (1 << 8)

struct btrfs_block_group_item {
__le64 used;
diff --git a/extent-tree.c b/extent-tree.c
index b2f9bb2..a6b2beb 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1812,7 +1812,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
thresh)
return 0;

- ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
+ ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags, 0);
if (ret == -ENOSPC) {
space_info->full = 1;
return 0;
diff --git a/mkfs.c b/mkfs.c
index 2e99b95..f45cfc3 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -69,7 +69,61 @@ static u64 parse_size(char *s)
return atol(s) * mult;
}

-static int make_root_dir(struct btrfs_root *root)
+static int make_root_dir2(struct btrfs_root *root, int hotdata)
+{
+ struct btrfs_trans_handle *trans;
+ u64 chunk_start = 0;
+ u64 chunk_size = 0;
+ int ret;
+
+ trans = btrfs_start_transaction(root, 1);
+
+ /*
+ * If hotdata option is set, preallocate a metadata SSD block group
+ * (not currently used)
+ */
+ if (hotdata) {
+ ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root,
+ &chunk_start, &chunk_size,
+ BTRFS_BLOCK_GROUP_METADATA_SSD, hotdata);
+ BUG_ON(ret);
+ ret = btrfs_make_block_group(trans, root, 0,
+ BTRFS_BLOCK_GROUP_METADATA_SSD,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_start, chunk_size);
+ BUG_ON(ret);
+ }
+
+ ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root,
+ &chunk_start, &chunk_size,
+ BTRFS_BLOCK_GROUP_DATA, hotdata);
+ BUG_ON(ret);
+ ret = btrfs_make_block_group(trans, root, 0,
+ BTRFS_BLOCK_GROUP_DATA,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_start, chunk_size);
+ BUG_ON(ret);
+
+ /*
+ * If hotdata option is set, preallocate a data SSD block group
+ */
+ if (hotdata) {
+ ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root,
+ &chunk_start, &chunk_size,
+ BTRFS_BLOCK_GROUP_DATA_SSD, hotdata);
+ BUG_ON(ret);
+ ret = btrfs_make_block_group(trans, root, 0,
+ BTRFS_BLOCK_GROUP_DATA_SSD,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ chunk_start, chunk_size);
+ BUG_ON(ret);
+ }
+
+ btrfs_commit_transaction(trans, root);
+ return ret;
+}
+
+static int make_root_dir(struct btrfs_root *root, int hotdata)
{
struct btrfs_trans_handle *trans;
struct btrfs_key location;
@@ -90,7 +144,7 @@ static int make_root_dir(struct btrfs_root *root)

ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root,
&chunk_start, &chunk_size,
- BTRFS_BLOCK_GROUP_METADATA);
+ BTRFS_BLOCK_GROUP_METADATA, hotdata);
BUG_ON(ret);
ret = btrfs_make_block_group(trans, root, 0,
BTRFS_BLOCK_GROUP_METADATA,
@@ -103,16 +157,6 @@ static int make_root_dir(struct btrfs_root *root)
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);

- ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root,
- &chunk_start, &chunk_size,
- BTRFS_BLOCK_GROUP_DATA);
- BUG_ON(ret);
- ret = btrfs_make_block_group(trans, root, 0,
- BTRFS_BLOCK_GROUP_DATA,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- chunk_start, chunk_size);
- BUG_ON(ret);
-
ret = btrfs_make_root_dir(trans, root->fs_info->tree_root,
BTRFS_ROOT_TREE_DIR_OBJECTID);
if (ret)
@@ -189,7 +233,7 @@ static int create_one_raid_group(struct btrfs_trans_handle *trans,
int ret;

ret = btrfs_alloc_chunk(trans, root->fs_info->extent_root,
- &chunk_start, &chunk_size, type);
+ &chunk_start, &chunk_size, type, 0);
BUG_ON(ret);
ret = btrfs_make_block_group(trans, root->fs_info->extent_root, 0,
type, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
@@ -198,14 +242,24 @@ static int create_one_raid_group(struct btrfs_trans_handle *trans,
return ret;
}

+/*
+ * counters for SSD and HDD devices to determine which block group types are
+ * allowed when hotdata is enabled
+ */
+static int ssd_devices = 0;
+static int hdd_devices = 0;
+
static int create_raid_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 data_profile,
- u64 metadata_profile)
+ u64 metadata_profile, int hotdata)
{
u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
u64 allowed;
int ret;

+ if (hotdata)
+ num_devices = hdd_devices;
+
if (num_devices == 1)
allowed = BTRFS_BLOCK_GROUP_DUP;
else if (num_devices >= 4) {
@@ -271,6 +325,7 @@ static void print_usage(void)
fprintf(stderr, "\t -A --alloc-start the offset to start the FS\n");
fprintf(stderr, "\t -b --byte-count total number of bytes in the FS\n");
fprintf(stderr, "\t -d --data data profile, raid0, raid1, raid10 or single\n");
+ fprintf(stderr, "\t -h --hotdata allocate hot data block groups to SSD\n");
fprintf(stderr, "\t -l --leafsize size of btree leaves\n");
fprintf(stderr, "\t -L --label set a label\n");
fprintf(stderr, "\t -m --metadata metadata profile, values like data profile\n");
@@ -325,6 +380,7 @@ static char *parse_label(char *input)
static struct option long_options[] = {
{ "alloc-start", 1, NULL, 'A'},
{ "byte-count", 1, NULL, 'b' },
+ { "hotdata", 0, NULL, 'h' },
{ "leafsize", 1, NULL, 'l' },
{ "label", 1, NULL, 'L'},
{ "metadata", 1, NULL, 'm' },
@@ -358,10 +414,11 @@ int main(int ac, char **av)
int first_fd;
int ret;
int i;
+ int hotdata = 0;

while(1) {
int c;
- c = getopt_long(ac, av, "A:b:l:n:s:m:d:L:V", long_options,
+ c = getopt_long(ac, av, "A:b:l:n:s:m:d:L:hV", long_options,
&option_index);
if (c < 0)
break;
@@ -398,6 +455,9 @@ int main(int ac, char **av)
}
zero_end = 0;
break;
+ case 'h':
+ hotdata = 1;
+ break;
case 'V':
print_version();
break;
@@ -405,6 +465,7 @@ int main(int ac, char **av)
print_usage();
}
}
+
sectorsize = max(sectorsize, (u32)getpagesize());
if (leafsize < sectorsize || (leafsize & (sectorsize - 1))) {
fprintf(stderr, "Illegal leafsize %u\n", leafsize);
@@ -414,7 +475,9 @@ int main(int ac, char **av)
fprintf(stderr, "Illegal nodesize %u\n", nodesize);
exit(1);
}
+
ac = ac - optind;
+
if (ac == 0)
print_usage();

@@ -422,6 +485,20 @@ int main(int ac, char **av)
printf("WARNING! - see http://btrfs.wiki.kernel.org before using\n\n");

file = av[optind++];
+
+ /*
+ * Setup for hot data relocation
+ */
+ if (hotdata) {
+ if (btrfs_is_dev_ssd(file)) {
+ fprintf(stderr, "Hot data relocation mode requires "
+ "the first listed device NOT be a SSD (%s)\n",
+ file);
+ exit(1);
+ }
+ hdd_devices++;
+ }
+
ret = check_mounted(file);
if (ret < 0) {
fprintf(stderr, "error checking %s mount status\n", file);
@@ -459,7 +536,7 @@ int main(int ac, char **av)
root = open_ctree(file, 0, O_RDWR);
root->fs_info->alloc_start = alloc_start;

- ret = make_root_dir(root);
+ ret = make_root_dir(root, hotdata);
if (ret) {
fprintf(stderr, "failed to setup the root directory\n");
exit(1);
@@ -479,6 +556,15 @@ int main(int ac, char **av)
zero_end = 1;
while(ac-- > 0) {
file = av[optind++];
+
+ if (hotdata) {
+ if (btrfs_is_dev_ssd(file)) {
+ ssd_devices++;
+ } else {
+ hdd_devices++;
+ }
+ }
+
ret = check_mounted(file);
if (ret < 0) {
fprintf(stderr, "error checking %s mount status\n",
@@ -504,7 +590,6 @@ int main(int ac, char **av)
}
ret = btrfs_prepare_device(fd, file, zero_end,
&dev_block_count);
-
BUG_ON(ret);

ret = btrfs_add_to_fsid(trans, root, fd, file, dev_block_count,
@@ -514,8 +599,18 @@ int main(int ac, char **av)
}

raid_groups:
+ btrfs_commit_transaction(trans, root);
+
+ ret = make_root_dir2(root, hotdata);
+ if (ret) {
+ fprintf(stderr, "failed to setup the root directory\n");
+ exit(1);
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+
ret = create_raid_groups(trans, root, data_profile,
- metadata_profile);
+ metadata_profile, hotdata);
BUG_ON(ret);

ret = create_data_reloc_tree(trans, root);
diff --git a/utils.c b/utils.c
index 2f4c6e1..852c5d6 100644
--- a/utils.c
+++ b/utils.c
@@ -473,6 +473,7 @@ int btrfs_add_to_fsid(struct btrfs_trans_handle *trans,
device->bytes_used = 0;
device->total_ios = 0;
device->dev_root = root->fs_info->dev_root;
+ device->name = path;

ret = btrfs_add_device(trans, root, device);
BUG_ON(ret);
diff --git a/volumes.c b/volumes.c
index 7671855..79d3871 100644
--- a/volumes.c
+++ b/volumes.c
@@ -19,6 +19,7 @@
#define __USE_XOPEN2K
#include <stdio.h>
#include <stdlib.h>
+#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <uuid/uuid.h>
@@ -630,7 +631,7 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,

int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 *start,
- u64 *num_bytes, u64 type)
+ u64 *num_bytes, u64 type, int hotdata)
{
u64 dev_offset;
struct btrfs_fs_info *info = extent_root->fs_info;
@@ -733,8 +734,24 @@ again:
/* build a private list of devices we will allocate from */
while(index < num_stripes) {
device = list_entry(cur, struct btrfs_device, dev_list);
- avail = device->total_bytes - device->bytes_used;
cur = cur->next;
+ int is_ssd = btrfs_is_dev_ssd(device->name);
+
+ if (hotdata) {
+ if (type & BTRFS_BLOCK_GROUP_DATA &&
+ is_ssd)
+ goto skip_device;
+ if (type & BTRFS_BLOCK_GROUP_METADATA &&
+ is_ssd)
+ goto skip_device;
+ if (type & BTRFS_BLOCK_GROUP_DATA_SSD &&
+ !is_ssd)
+ goto skip_device;
+ if (type & BTRFS_BLOCK_GROUP_METADATA_SSD &&
+ !is_ssd)
+ goto skip_device;
+ }
+ avail = device->total_bytes - device->bytes_used;
if (avail >= min_free) {
list_move_tail(&device->dev_list, &private_devs);
index++;
@@ -742,6 +759,7 @@ again:
index++;
} else if (avail > max_avail)
max_avail = avail;
+skip_device:
if (cur == dev_list)
break;
}
@@ -853,6 +871,7 @@ again:
BUG_ON(ret);
}

+
kfree(chunk);
return ret;
}
@@ -1448,3 +1467,53 @@ struct list_head *btrfs_scanned_uuids(void)
{
return &fs_uuids;
}
+
+/*
+ * A userspace function for determining whether a device is
+ * an SSD
+ */
+int btrfs_is_dev_ssd(char *device_path)
+{
+ int fd;
+ int ret = 0;
+ char *deva = "/sys/block/";
+ char *devb = "/queue/rotational";
+ char dev_string[256] = "";
+ char dev[256];
+ size_t dev_name_len;
+ char rot_flag[2];
+ int index;
+
+ memset(rot_flag, 0, 2);
+
+ dev_name_len = strlen(device_path);
+ memcpy(dev, device_path + 5, dev_name_len - 4);
+
+ /* remove partition numbers from device name */
+ index = strlen(dev) - 1;
+ while (isdigit(dev[index]))
+ dev[index--] = '\0';
+
+ strcat(dev_string, deva);
+ strcat(dev_string, dev);
+ strcat(dev_string, devb);
+
+ fd = open(dev_string, O_RDONLY);
+
+ if (fd < 0) {
+ fprintf(stderr, "unable to open %s\n", dev_string);
+ return 0;
+ }
+
+ ret = read(fd, rot_flag, 1);
+ if (ret < 1) {
+ fprintf(stderr, "unable to read rotational flag for %s\n",
+ device_path);
+ return 0;
+ }
+
+ close(fd);
+
+ return !atoi(rot_flag);
+}
+
diff --git a/volumes.h b/volumes.h
index bb78751..bb26580 100644
--- a/volumes.h
+++ b/volumes.h
@@ -106,7 +106,7 @@ int btrfs_read_sys_array(struct btrfs_root *root);
int btrfs_read_chunk_tree(struct btrfs_root *root);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 *start,
- u64 *num_bytes, u64 type);
+ u64 *num_bytes, u64 type, int hotdata);
int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -130,4 +130,5 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int btrfs_is_dev_ssd(char *device_path);
#endif
--
1.7.1

2010-08-13 13:14:35

by Andrey Panin

[permalink] [raw]
Subject: Re: [PATCH 2/2] Btrfs-progs: Add hot data support in mkfs

On 224, 08 12, 2010 at 05:29:37PM -0500, [email protected] wrote:
> From: Ben Chociej <[email protected]>
>
> Modified mkfs.btrfs to add hot data relocation option (-h) which
> preallocates BTRFS_BLOCK_GROUP_DATA_SSD and
> BTRFS_BLOCK_GROUP_METADATA_SSD at mkfs time for future use by hot data
> relocation code. Also added a userspace function to detect whether a
> block device is an SSD by reading the sysfs block queue rotational flag.

IMHO this policy is too inflexible. What if I have large array of slow SATA
disks and some fast SAS ones ?

2010-08-13 14:10:50

by Ben Chociej

[permalink] [raw]
Subject: Re: [PATCH 2/2] Btrfs-progs: Add hot data support in mkfs

It's a good point, of course. Ideally we would be able to prioritize
data and place them on 15k versus 7.2krpm disks, etc. However you get
to a point where there's only incremental benefit. For that reason,
the scope of this project was simply to take advantage of SSD and HDD
in hybrid. Of course, you could register the same complaint about the
ZFS SSD caching: why not take advantage of faster vs. slower spinning
disks? Unfortunately it just wasn't in the scope of our 12-week
project here.

That's not to say it *shouldn't* be done in the future, of course!
And, incidentally, you could hack it together at this point by setting
the /sys/block/<blockdev>/queue/rotational flag to 0 and using it like
an SSD. :)

BC

On Fri, Aug 13, 2010 at 8:14 AM, Andrey Panin <[email protected]> wrote:
> On 224, 08 12, 2010 at 05:29:37PM -0500, [email protected] wrote:
>> From: Ben Chociej <[email protected]>
>>
>> Modified mkfs.btrfs to add hot data relocation option (-h) which
>> preallocates BTRFS_BLOCK_GROUP_DATA_SSD and
>> BTRFS_BLOCK_GROUP_METADATA_SSD at mkfs time for future use by hot data
>> relocation code. Also added a userspace function to detect whether a
>> block device is an SSD by reading the sysfs block queue rotational flag.
>
> IMHO this policy is too inflexible. What if I have large array of slow SATA
> disks and some fast SAS ones ?
>

2010-08-13 14:13:41

by Ben Chociej

[permalink] [raw]
Subject: Re: [PATCH 2/2] Btrfs-progs: Add hot data support in mkfs

On Fri, Aug 13, 2010 at 9:08 AM, Tomasz Torcz <[email protected]> wrote:
> On Fri, Aug 13, 2010 at 05:14:22PM +0400, Andrey Panin wrote:
>> On 224, 08 12, 2010 at 05:29:37PM -0500, [email protected] wrote:
>> > From: Ben Chociej <[email protected]>
>> >
>> > Modified mkfs.btrfs to add hot data relocation option (-h) which
>> > preallocates BTRFS_BLOCK_GROUP_DATA_SSD and
>> > BTRFS_BLOCK_GROUP_METADATA_SSD at mkfs time for future use by hot data
>> > relocation code. Also added a userspace function to detect whether a
>> > block device is an SSD by reading the sysfs block queue rotational flag.
>>
>> IMHO this policy is too inflexible. What if I have large array of slow SATA
>> disks and some fast SAS ones ?
>
> I'm hoping that this is just first cut, and future versions will have options.
> For now, it is totally unusable without a way of using mirrored SSD for
> hot data.
> Ideally, hot storage devices should be online selectable and changeable.
>
> --
> Tomasz Torcz 72->| 80->|
> xmpp: [email protected] 72->| 80->|
>
>

This is very much just a first cut. Our goal here was to get a lot of
the legwork and experimental coding out of the way to enable Btrfs
to *fully* tackle heterogeneous storage in the future.

BC

2010-08-13 14:17:51

by Tomasz Torcz

[permalink] [raw]
Subject: Re: [PATCH 2/2] Btrfs-progs: Add hot data support in mkfs

On Fri, Aug 13, 2010 at 05:14:22PM +0400, Andrey Panin wrote:
> On 224, 08 12, 2010 at 05:29:37PM -0500, [email protected] wrote:
> > From: Ben Chociej <[email protected]>
> >
> > Modified mkfs.btrfs to add hot data relocation option (-h) which
> > preallocates BTRFS_BLOCK_GROUP_DATA_SSD and
> > BTRFS_BLOCK_GROUP_METADATA_SSD at mkfs time for future use by hot data
> > relocation code. Also added a userspace function to detect whether a
> > block device is an SSD by reading the sysfs block queue rotational flag.
>
> IMHO this policy is too inflexible. What if I have large array of slow SATA
> disks and some fast SAS ones ?

I'm hoping that this is just first cut, and future versions will have options.
For now, it is totally unusable without a way of using mirrored SSD for
hot data.
Ideally, hot storage devices should be online selectable and changeable.

--
Tomasz Torcz 72->| 80->|
xmpp: [email protected] 72->| 80->|

2010-08-13 16:42:04

by Goffredo Baroncelli

[permalink] [raw]
Subject: Re: [PATCH 1/2] Btrfs-progs: Add support for hot data ioctls

Mr Chociej,

some months ago I wrote a new command (called "btrfs"), with the aim of
replacing "btrfsctl" entirely, for a lot of reasons: bugs, largely unmaintained,
difficult to maintain. After a lot of review on the btrfs mailing list, btrfs
is now in the official source.
I suggest updating your patches to use the btrfs command. If you need help
with this change, please contact me.

The experimental commands may be prefixed with the "test" verb, like

$ btrfs test <whatYouWant ...>

Chris,
should btrfsctl be marked as deprecated?

Regards
G.Baroncelli



On Friday, 13 August, 2010, [email protected] wrote:
> From: Ben Chociej <[email protected]>
>
> Add support for the new hot data functionality in-kernel. Three ioctls
> were added to export hot data statistics and turn hot data tracking on
> and off per inode. This patch enables btrfsctl to interact with those
> ioctls.
>
> Signed-off-by: Ben Chociej <[email protected]>
> Signed-off-by: Matt Lupfer <[email protected]>
> Tested-by: Conor Scott <[email protected]>
> ---
> btrfsctl.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> ioctl-test.c | 3 ++
> ioctl.h | 24 +++++++++++++
> 3 files changed, 134 insertions(+), 0 deletions(-)
>
> diff --git a/btrfsctl.c b/btrfsctl.c
> index be6bf25..8617d06 100644
> --- a/btrfsctl.c
> +++ b/btrfsctl.c
> @@ -48,6 +48,7 @@ static void print_usage(void)
> {
> printf("usage: btrfsctl [ -d file|dir] [ -s snap_name subvol|tree ]\n");
> printf(" [-r size] [-A device] [-a] [-c] [-D dir .]\n");
> + printf(" [-t file] [-T file] [-h filename [0-2]]\n");
> printf("\t-d filename: defragments one file\n");
> printf("\t-d directory: defragments the entire Btree\n");
> printf("\t-s snap_name dir: creates a new snapshot of dir\n");
> @@ -57,6 +58,14 @@ static void print_usage(void)
> printf("\t-a: scans all devices for Btrfs filesystems\n");
> printf("\t-c: forces a single FS sync\n");
> printf("\t-D: delete snapshot\n");
> + printf("\t-t filename: dump indexed heat information for a file\n");
> + printf("\t-T filename: dump live heat informaton for a file\n");
> + printf("\t-h filename: query heat tracking/migration status\n");
> + printf("\t-h filename level: set heat tracking level:\n");
> + printf("\t\tlevel =\n");
> + printf("\t\t0: no tracking or relocation\n");
> + printf("\t\t1: access tracking only\n");
> + printf("\t\t2: tracking and automatic migration to SSD\n");
> printf("\t-m [tree id] directory: set the default mounted subvolume"
> " to the [tree id] or the directory\n");
> printf("%s\n", BTRFS_BUILD_VERSION);
> @@ -99,12 +108,14 @@ int main(int ac, char **av)
> int fd;
> int ret;
> struct btrfs_ioctl_vol_args args;
> + struct btrfs_ioctl_heat_info hotinfo;
> char *name = NULL;
> int i;
> unsigned long command = 0;
> int len;
> char *fullpath;
> u64 objectid = 0;
> + int heatarg;
>
> if (ac == 2 && strcmp(av[1], "-a") == 0) {
> fprintf(stderr, "Scanning for Btrfs filesystems\n");
> @@ -205,6 +216,38 @@ int main(int ac, char **av)
> exit(1);
> }
> }
> + } else if (strcmp(av[i], "-t") == 0) {
> + if (i >= ac - 1) {
> + fprintf(stderr,
> + "-t requires a file argument\n");
> + print_usage();
> + }
> + hotinfo.live = 0;
> + command = BTRFS_IOC_GET_HEAT_INFO;
> + } else if (strcmp(av[i], "-T") == 0) {
> + if (i >= ac - 1) {
> + fprintf(stderr,
> + "-T requires a file argument\n");
> + print_usage();
> + }
> + hotinfo.live = 1;
> + command = BTRFS_IOC_GET_HEAT_INFO;
> + } else if (strcmp(av[i], "-h") == 0) {
> + if (i == ac - 2) {
> + command = BTRFS_IOC_GET_HEAT_OPTS;
> + } else if (i == ac - 3) {
> + command = BTRFS_IOC_SET_HEAT_OPTS;
> + heatarg = atoi(av[i + 2]);
> + } else {
> + fprintf(stderr, "-h invalid number of "
> + "arguments\n");
> + print_usage();
> + exit(1);
> + }
> +
> + fprintf(stderr, "Btrfs hot data tracking: `%s'\n\n",
> + av[i + 1]);
> + av[i + 2] = av[i + 1];
> }
> }
> if (command == 0) {
> @@ -236,6 +279,70 @@ int main(int ac, char **av)
> } else if (command == BTRFS_IOC_DEFAULT_SUBVOL) {
> printf("objectid is %llu\n", objectid);
> ret = ioctl(fd, command, &objectid);
> + } else if (command == BTRFS_IOC_GET_HEAT_INFO) {
> + strcpy(hotinfo.filename, fname);
> + ret = ioctl(fd, command, &hotinfo);
> + if (ret == 0) {
> + printf("Btrfs file hotness information\n");
> + printf("%s\n\n", hotinfo.filename);
> + printf("Last write: %llu\n",
> + (u64) hotinfo.last_write_time);
> + printf("Last read: %llu\n",
> + (u64) hotinfo.last_read_time);
> + printf("Average write delta: %llu\n",
> + (u64) hotinfo.avg_delta_writes);
> + printf("Average read delta: %llu\n",
> + (u64) hotinfo.avg_delta_reads);
> + printf("Number of writes: %u\n",
> + (u32) hotinfo.num_writes);
> + printf("Number of reads: %u\n\n",
> + (u32) hotinfo.num_reads);
> + if (hotinfo.live > 0)
> + printf("Temperature (live): %u\n\n",
> + hotinfo.temperature);
> + else
> + printf("Temperature (indexed): %u\n\n",
> + hotinfo.temperature);
> + }
> + } else if (command == BTRFS_IOC_SET_HEAT_OPTS) {
> + ret = ioctl(fd, command, &heatarg);
> + switch (heatarg) {
> + case 0:
> + printf("Turning OFF heat tracking and migration inode "
> + "flags.\n");
> + break;
> + case 1:
> + printf("Turning ON the heat tracking inode flag.\n");
> + printf("Turning OFF the migration inode flag.\n");
> + break;
> + case 2:
> + printf("Turning ON heat tracking and migration inode "
> + "flags.\n");
> + break;
> + default:
> + printf("Invalid heat tracking argument.\n");
> + }
> + printf("(Inode flags can be overridden by mount options)\n\n");
> + } else if (command == BTRFS_IOC_GET_HEAT_OPTS) {
> + ret = ioctl(fd, command, &heatarg);
> + switch (heatarg) {
> + case 0:
> + printf("Heat tracking and migration inode flags are "
> + "OFF.\n");
> + break;
> + case 1:
> + printf("Heat tracking inode flag is ON, migration "
> + "inode flag is OFF.\n");
> + break;
> + case 2:
> + printf("Heat tracking and migration inode flags are "
> + "both ON.\n");
> + break;
> + default:
> + printf("Wrong filesystem type, or invalid status "
> + "returned.\n");
> + }
> + printf("(Inode flags can be overridden by mount options)\n\n");
> } else
> ret = ioctl(fd, command, &args);
> if (ret < 0) {
> diff --git a/ioctl-test.c b/ioctl-test.c
> index 7cf3bc2..8d54301 100644
> --- a/ioctl-test.c
> +++ b/ioctl-test.c
> @@ -22,6 +22,9 @@ unsigned long ioctls[] = {
> BTRFS_IOC_INO_LOOKUP,
> BTRFS_IOC_DEFAULT_SUBVOL,
> BTRFS_IOC_SPACE_INFO,
> + BTRFS_IOC_GET_HEAT_INFO,
> + BTRFS_IOC_SET_HEAT_OPTS,
> + BTRFS_IOC_GET_HEAT_OPTS,
> 0 };
>
> int main(int ac, char **av)
> diff --git a/ioctl.h b/ioctl.h
> index 776d7a9..5827338 100644
> --- a/ioctl.h
> +++ b/ioctl.h
> @@ -132,6 +132,18 @@ struct btrfs_ioctl_space_args {
> struct btrfs_ioctl_space_info spaces[0];
> };
>
> +struct btrfs_ioctl_heat_info {
> + __u64 avg_delta_reads;
> + __u64 avg_delta_writes;
> + __u64 last_read_time;
> + __u64 last_write_time;
> + __u32 num_reads;
> + __u32 num_writes;
> + char filename[BTRFS_PATH_NAME_MAX + 1];
> + int temperature;
> + __u8 live;
> +};
> +
> #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
> struct btrfs_ioctl_vol_args)
> #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
> @@ -169,4 +181,16 @@ struct btrfs_ioctl_space_args {
> #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
> #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
> struct btrfs_ioctl_space_args)
> +
> +/*
> + * Hot data tracking ioctls:
> + *
> + * GET_HEAT_INFO - retrieve frequency of access info on a file
> + * SET_HEAT_OPTS - set whether a file is tracked/migratable
> + * GET_HEAT_OPTS - check whether a file is tracked/migratable
> + */
> +#define BTRFS_IOC_GET_HEAT_INFO _IOWR(BTRFS_IOCTL_MAGIC, 21, \
> + struct btrfs_ioctl_heat_info)
> +#define BTRFS_IOC_SET_HEAT_OPTS _IOW(BTRFS_IOCTL_MAGIC, 22, int)
> +#define BTRFS_IOC_GET_HEAT_OPTS _IOR(BTRFS_IOCTL_MAGIC, 23, int)
> #endif
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>


--
gpg key@ keyserver.linux.it: Goffredo Baroncelli (ghigo) <kreijackATinwind.it>
Key fingerprint = 4769 7E51 5293 D36C 814E C054 BF04 F161 3DC5 0512