2006-12-05 13:43:42

by Amit K. Arora

[permalink] [raw]
Subject: [RFC][Patch 1/1] Persistent preallocation in ext4

This patch implements persistent preallocation using ioctl. Other
options under consideration are posix_fallocate and ftruncate. The code
can be ported to other approach, if ioctl is not the favored one. The
patch takes care of marking/checking uninitialized extents.


Signed-off-by: Amit Arora <[email protected]>

---
fs/ext4/extents.c | 120 ++++++++++++++++++++++++++++++++----------------
fs/ext4/ioctl.c | 41 ++++++++++++++++
include/linux/ext4_fs.h | 21 ++++++++
3 files changed, 144 insertions(+), 38 deletions(-)

Index: linux-2.6.19/fs/ext4/extents.c
===================================================================
--- linux-2.6.19.orig/fs/ext4/extents.c 2006-12-05 16:03:40.000000000 +0530
+++ linux-2.6.19/fs/ext4/extents.c 2006-12-05 16:03:53.000000000 +0530
@@ -282,7 +282,7 @@
} else if (path->p_ext) {
ext_debug(" %d:%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- le16_to_cpu(path->p_ext->ee_len),
+ ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
ext_debug(" []");
@@ -305,7 +305,7 @@

for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
}
@@ -425,7 +425,7 @@
ext_debug(" -> %d:%llu:%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
- le16_to_cpu(path->p_ext->ee_len));
+ ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
{
@@ -686,7 +686,7 @@
ext_debug("move %d:%llu:%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
- le16_to_cpu(path[depth].p_ext->ee_len),
+ ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
sizeof(struct ext4_extent));
@@ -1097,7 +1097,23 @@
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+ unsigned short ext1_ee_len, ext2_ee_len;
+
+
+ /*
+ * Make sure that either both extents are uninitialized, or
+ * both are _not_.
+ */
+ if ((ext4_ext_is_uninitialized(ex1)
+ && !ext4_ext_is_uninitialized(ex2))
+ || (!ext4_ext_is_uninitialized(ex1)
+ && ext4_ext_is_uninitialized(ex2)))
+ return 0;
+
+ ext1_ee_len = ext4_ext_get_actual_len(ex1);
+ ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+ if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
le32_to_cpu(ex2->ee_block))
return 0;

@@ -1106,14 +1122,14 @@
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+ if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
return 0;
#ifdef AGRESSIVE_TEST
if (le16_to_cpu(ex1->ee_len) >= 4)
return 0;
#endif

- if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+ if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
return 1;
return 0;
}
@@ -1132,9 +1148,9 @@
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
- int depth, len, err, next;
+ int depth, len, err, next, uninitialized = 0;

- BUG_ON(newext->ee_len == 0);
+ BUG_ON(ext4_ext_get_actual_len(newext) == 0);
depth = ext_depth(inode);
ex = path[depth].p_ext;
BUG_ON(path[depth].p_hdr == NULL);
@@ -1142,13 +1158,22 @@
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append %d block to %d:%d (from %llu)\n",
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
if ((err = ext4_ext_get_access(handle, inode, path + depth)))
return err;
- ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
- + le16_to_cpu(newext->ee_len));
+
+ /* ext4_can_extents_be_merged should have checked that either
+ * both extents are uninitialized, or both aren't. Thus we
+ * need to check any of them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1203,7 +1228,7 @@
ext_debug("first extent in the leaf: %d:%llu:%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len));
+ ext4_mark_uninitialized_ext(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
@@ -1216,7 +1241,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_mark_uninitialized_ext(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
}
@@ -1229,7 +1254,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
path[depth].p_ext = nearex;
@@ -1248,8 +1273,13 @@
if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
break;
/* merge with next extent! */
- nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
- + le16_to_cpu(nearex[1].ee_len));
+ if (ext4_ext_is_uninitialized(nearex))
+ uninitialized = 1;
+ nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
+ + ext4_ext_get_actual_len(nearex + 1));
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(nearex);
+
if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - nearex - 1)
* sizeof(struct ext4_extent);
@@ -1319,8 +1349,8 @@
end = le32_to_cpu(ex->ee_block);
if (block + num < end)
end = block + num;
- } else if (block >=
- le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
/* need to allocate space after found extent */
start = block;
end = block + num;
@@ -1332,7 +1362,8 @@
* by found extent
*/
start = block;
- end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
if (block + num < end)
end = block + num;
exists = 1;
@@ -1348,7 +1379,7 @@
cbex.ec_type = EXT4_EXT_CACHE_GAP;
} else {
cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = le16_to_cpu(ex->ee_len);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
cbex.ec_start = ext_pblock(ex);
cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
}
@@ -1556,12 +1587,12 @@
unsigned long from, unsigned long to)
{
struct buffer_head *bh;
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i;

#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
@@ -1575,12 +1606,12 @@
}
#endif
if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
unsigned long num;
ext4_fsblk_t start;
- num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
- start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %lu blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1588,12 +1619,12 @@
}
ext4_free_blocks(handle, inode, start, num);
} else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
} else {
printk("strange request: removal(2) %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
}
return 0;
}
@@ -1607,7 +1638,7 @@
struct ext4_extent_header *eh;
unsigned a, b, block, num;
unsigned long ex_ee_block;
- unsigned short ex_ee_len;
+ unsigned short ex_ee_len, uninitialized = 0;
struct ext4_extent *ex;

ext_debug("truncate since %lu in leaf\n", start);
@@ -1622,7 +1653,9 @@
ex = EXT_LAST_EXTENT(eh);

ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ if(ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex_ee_len = ext4_ext_get_actual_len(ex);

while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -1690,6 +1723,8 @@

ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(ex);

err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -1699,7 +1734,7 @@
ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
}

if (correct_index && eh->eh_entries)
@@ -1974,7 +2009,7 @@
if ((ex = path[depth].p_ext)) {
unsigned long ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext_pblock(ex);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
+ unsigned short ee_len;

/*
* Allow future support for preallocated extents to be added
@@ -1982,8 +2017,11 @@
* Uninitialized extents are treated as holes, except that
* we avoid (fail) allocating new blocks during a write.
*/
- if (ee_len > EXT_MAX_LEN)
+ if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN
+ && create != EXT4_CREATE_UNINITIALIZED_EXT)
goto out2;
+
+ ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
if (iblock >= ee_block && iblock < ee_block + ee_len) {
newblock = iblock - ee_block + ee_start;
@@ -1991,8 +2029,10 @@
allocated = ee_len - (iblock - ee_block);
ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
ee_block, ee_len, newblock);
- ext4_ext_put_in_cache(inode, ee_block, ee_len,
- ee_start, EXT4_EXT_CACHE_EXTENT);
+ /* Do not put uninitialized extent in the cache */
+ if(!ext4_ext_is_uninitialized(ex))
+ ext4_ext_put_in_cache(inode, ee_block, ee_len,
+ ee_start, EXT4_EXT_CACHE_EXTENT);
goto out;
}
}
@@ -2027,6 +2067,8 @@
newex.ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
+ if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
+ ext4_mark_uninitialized_ext(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err)
goto out2;
@@ -2038,8 +2080,10 @@
newblock = ext_pblock(&newex);
__set_bit(BH_New, &bh_result->b_state);

- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ /* Cache only when it is _not_ an uninitialized extent */
+ if(create!=EXT4_CREATE_UNINITIALIZED_EXT)
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
out:
if (allocated > max_blocks)
allocated = max_blocks;
Index: linux-2.6.19/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.19.orig/fs/ext4/ioctl.c 2006-12-05 16:03:40.000000000 +0530
+++ linux-2.6.19/fs/ext4/ioctl.c 2006-12-05 16:03:43.000000000 +0530
@@ -248,6 +248,47 @@
return err;
}

+ case EXT4_IOC_PREALLOCATE: {
+ struct ext4_falloc_input input;
+ handle_t *handle;
+ ext4_fsblk_t block, max_blocks;
+ int ret = 0;
+ struct buffer_head map_bh;
+ unsigned int blkbits = inode->i_blkbits;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (copy_from_user(&input,
+ (struct ext4_falloc_input __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ if (input.len == 0)
+ return -EINVAL;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return -ENOTTY;
+
+ block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits;
+ max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits;
+ handle=ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ while(ret>=0 && ret<max_blocks)
+ {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ ret = ext4_ext_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_CREATE_UNINITIALIZED_EXT, 1);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return ret>0?0:ret;
+ }
+
default:
return -ENOTTY;
}
Index: linux-2.6.19/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.19.orig/include/linux/ext4_fs.h 2006-12-05 16:03:40.000000000 +0530
+++ linux-2.6.19/include/linux/ext4_fs.h 2006-12-05 16:03:53.000000000 +0530
@@ -102,6 +102,8 @@
EXT4_GOOD_OLD_FIRST_INO : \
(s)->s_first_ino)
#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \
+ (~((1 << blkbits)-1)))

/*
* Macro-instructions used to manage fragments
@@ -118,6 +120,19 @@
#endif

/*
+ * Macro-instructions used to handle (mark/unmark/check/create) unitialized
+ * extents. Applications can issue an IOCTL for preallocation, which results
+ * in assigning unitialized extents to the file
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT 2
+#define ext4_mark_uninitialized_ext(ext) ((ext)->ee_len |= \
+ cpu_to_le16(0x8000))
+#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x8000)
+#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x7FFF)
+
+/*
* Structure of a blocks group descriptor
*/
struct ext4_group_desc
@@ -225,6 +240,11 @@
__u32 free_blocks_count;
};

+/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */
+struct ext4_falloc_input {
+ __u64 offset;
+ __u64 len;
+};

/*
* ioctl commands
@@ -242,6 +262,7 @@
#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input)

/*
* ioctl commands in 32 bit emulation


2006-12-06 05:58:47

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

This patch takes care of few suggestions given by Suzuki.
Suzuki, Thanks!

Signed-off-by: Amit Arora <[email protected]>
---
fs/ext4/extents.c | 117 ++++++++++++++++++++++++++++++++----------------
fs/ext4/ioctl.c | 41 ++++++++++++++++
include/linux/ext4_fs.h | 21 ++++++++
3 files changed, 141 insertions(+), 38 deletions(-)

Index: linux-2.6.19/fs/ext4/extents.c
===================================================================
--- linux-2.6.19.orig/fs/ext4/extents.c 2006-12-06 10:18:12.000000000 +0530
+++ linux-2.6.19/fs/ext4/extents.c 2006-12-06 10:23:02.000000000 +0530
@@ -282,7 +282,7 @@
} else if (path->p_ext) {
ext_debug(" %d:%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- le16_to_cpu(path->p_ext->ee_len),
+ ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
ext_debug(" []");
@@ -305,7 +305,7 @@

for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
}
@@ -425,7 +425,7 @@
ext_debug(" -> %d:%llu:%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
- le16_to_cpu(path->p_ext->ee_len));
+ ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
{
@@ -686,7 +686,7 @@
ext_debug("move %d:%llu:%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
- le16_to_cpu(path[depth].p_ext->ee_len),
+ ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
sizeof(struct ext4_extent));
@@ -1097,7 +1097,20 @@
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+ unsigned short ext1_ee_len, ext2_ee_len;
+
+
+ /*
+ * Make sure that either both extents are uninitialized, or
+ * both are _not_.
+ */
+ if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+ return 0;
+
+ ext1_ee_len = ext4_ext_get_actual_len(ex1);
+ ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+ if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
le32_to_cpu(ex2->ee_block))
return 0;

@@ -1106,14 +1119,14 @@
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+ if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
return 0;
#ifdef AGRESSIVE_TEST
if (le16_to_cpu(ex1->ee_len) >= 4)
return 0;
#endif

- if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+ if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
return 1;
return 0;
}
@@ -1132,9 +1145,9 @@
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
- int depth, len, err, next;
+ int depth, len, err, next, uninitialized = 0;

- BUG_ON(newext->ee_len == 0);
+ BUG_ON(ext4_ext_get_actual_len(newext) == 0);
depth = ext_depth(inode);
ex = path[depth].p_ext;
BUG_ON(path[depth].p_hdr == NULL);
@@ -1142,13 +1155,22 @@
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append %d block to %d:%d (from %llu)\n",
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
if ((err = ext4_ext_get_access(handle, inode, path + depth)))
return err;
- ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
- + le16_to_cpu(newext->ee_len));
+
+ /* ext4_can_extents_be_merged should have checked that either
+ * both extents are uninitialized, or both aren't. Thus we
+ * need to check any of them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1203,7 +1225,7 @@
ext_debug("first extent in the leaf: %d:%llu:%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len));
+ ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
@@ -1216,7 +1238,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
}
@@ -1229,7 +1251,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
path[depth].p_ext = nearex;
@@ -1248,8 +1270,13 @@
if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
break;
/* merge with next extent! */
- nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
- + le16_to_cpu(nearex[1].ee_len));
+ if (ext4_ext_is_uninitialized(nearex))
+ uninitialized = 1;
+ nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
+ + ext4_ext_get_actual_len(nearex + 1));
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(nearex);
+
if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - nearex - 1)
* sizeof(struct ext4_extent);
@@ -1319,8 +1346,8 @@
end = le32_to_cpu(ex->ee_block);
if (block + num < end)
end = block + num;
- } else if (block >=
- le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
/* need to allocate space after found extent */
start = block;
end = block + num;
@@ -1332,7 +1359,8 @@
* by found extent
*/
start = block;
- end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
if (block + num < end)
end = block + num;
exists = 1;
@@ -1348,7 +1376,7 @@
cbex.ec_type = EXT4_EXT_CACHE_GAP;
} else {
cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = le16_to_cpu(ex->ee_len);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
cbex.ec_start = ext_pblock(ex);
cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
}
@@ -1556,12 +1584,12 @@
unsigned long from, unsigned long to)
{
struct buffer_head *bh;
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i;

#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
@@ -1575,12 +1603,12 @@
}
#endif
if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
unsigned long num;
ext4_fsblk_t start;
- num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
- start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %lu blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1588,12 +1616,12 @@
}
ext4_free_blocks(handle, inode, start, num);
} else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
} else {
printk("strange request: removal(2) %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
}
return 0;
}
@@ -1607,7 +1635,7 @@
struct ext4_extent_header *eh;
unsigned a, b, block, num;
unsigned long ex_ee_block;
- unsigned short ex_ee_len;
+ unsigned short ex_ee_len, uninitialized = 0;
struct ext4_extent *ex;

ext_debug("truncate since %lu in leaf\n", start);
@@ -1622,7 +1650,9 @@
ex = EXT_LAST_EXTENT(eh);

ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ if(ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex_ee_len = ext4_ext_get_actual_len(ex);

while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -1690,6 +1720,8 @@

ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
+ if(uninitialized)
+ ext4_mark_uninitialized_ext(ex);

err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -1699,7 +1731,7 @@
ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
}

if (correct_index && eh->eh_entries)
@@ -1974,7 +2006,7 @@
if ((ex = path[depth].p_ext)) {
unsigned long ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext_pblock(ex);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
+ unsigned short ee_len;

/*
* Allow future support for preallocated extents to be added
@@ -1982,8 +2014,11 @@
* Uninitialized extents are treated as holes, except that
* we avoid (fail) allocating new blocks during a write.
*/
- if (ee_len > EXT_MAX_LEN)
+ if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN
+ && create != EXT4_CREATE_UNINITIALIZED_EXT)
goto out2;
+
+ ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
if (iblock >= ee_block && iblock < ee_block + ee_len) {
newblock = iblock - ee_block + ee_start;
@@ -1991,8 +2026,10 @@
allocated = ee_len - (iblock - ee_block);
ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
ee_block, ee_len, newblock);
- ext4_ext_put_in_cache(inode, ee_block, ee_len,
- ee_start, EXT4_EXT_CACHE_EXTENT);
+ /* Do not put uninitialized extent in the cache */
+ if(!ext4_ext_is_uninitialized(ex))
+ ext4_ext_put_in_cache(inode, ee_block, ee_len,
+ ee_start, EXT4_EXT_CACHE_EXTENT);
goto out;
}
}
@@ -2027,6 +2064,8 @@
newex.ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
+ if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
+ ext4_mark_uninitialized_ext(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err)
goto out2;
@@ -2038,8 +2077,10 @@
newblock = ext_pblock(&newex);
__set_bit(BH_New, &bh_result->b_state);

- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ /* Cache only when it is _not_ an uninitialized extent */
+ if(create!=EXT4_CREATE_UNINITIALIZED_EXT)
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
out:
if (allocated > max_blocks)
allocated = max_blocks;
Index: linux-2.6.19/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.19.orig/fs/ext4/ioctl.c 2006-12-06 10:18:12.000000000 +0530
+++ linux-2.6.19/fs/ext4/ioctl.c 2006-12-06 10:18:15.000000000 +0530
@@ -248,6 +248,47 @@
return err;
}

+ case EXT4_IOC_PREALLOCATE: {
+ struct ext4_falloc_input input;
+ handle_t *handle;
+ ext4_fsblk_t block, max_blocks;
+ int ret = 0;
+ struct buffer_head map_bh;
+ unsigned int blkbits = inode->i_blkbits;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (copy_from_user(&input,
+ (struct ext4_falloc_input __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ if (input.len == 0)
+ return -EINVAL;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return -ENOTTY;
+
+ block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits;
+ max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits;
+ handle=ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ while(ret>=0 && ret<max_blocks)
+ {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ ret = ext4_ext_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_CREATE_UNINITIALIZED_EXT, 1);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return ret>0?0:ret;
+ }
+
default:
return -ENOTTY;
}
Index: linux-2.6.19/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.19.orig/include/linux/ext4_fs.h 2006-12-06 10:18:12.000000000 +0530
+++ linux-2.6.19/include/linux/ext4_fs.h 2006-12-06 10:18:16.000000000 +0530
@@ -102,6 +102,8 @@
EXT4_GOOD_OLD_FIRST_INO : \
(s)->s_first_ino)
#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \
+ (~((1 << blkbits)-1)))

/*
* Macro-instructions used to manage fragments
@@ -118,6 +120,19 @@
#endif

/*
+ * Macro-instructions used to handle (mark/unmark/check/create) unitialized
+ * extents. Applications can issue an IOCTL for preallocation, which results
+ * in assigning unitialized extents to the file
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT 2
+#define ext4_mark_uninitialized_ext(ext) ((ext)->ee_len |= \
+ cpu_to_le16(0x8000))
+#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x8000)
+#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x7FFF)
+
+/*
* Structure of a blocks group descriptor
*/
struct ext4_group_desc
@@ -225,6 +240,11 @@
__u32 free_blocks_count;
};

+/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */
+struct ext4_falloc_input {
+ __u64 offset;
+ __u64 len;
+};

/*
* ioctl commands
@@ -242,6 +262,7 @@
#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input)

/*
* ioctl commands in 32 bit emulation

2006-12-12 01:28:21

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

On Wed, 2006-12-06 at 11:28 +0530, Amit K. Arora wrote:

> @@ -1142,13 +1155,22 @@
> /* try to insert block into found extent and return */
> if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
> ext_debug("append %d block to %d:%d (from %llu)\n",
> - le16_to_cpu(newext->ee_len),
> + ext4_ext_get_actual_len(newext),
> le32_to_cpu(ex->ee_block),
> - le16_to_cpu(ex->ee_len), ext_pblock(ex));
> + ext4_ext_get_actual_len(ex), ext_pblock(ex));
> if ((err = ext4_ext_get_access(handle, inode, path + depth)))
> return err;
> - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
> - + le16_to_cpu(newext->ee_len));
> +
> + /* ext4_can_extents_be_merged should have checked that either
> + * both extents are uninitialized, or both aren't. Thus we
> + * need to check any of them here.
> + */
> + if (ext4_ext_is_uninitialized(ex))
> + uninitialized = 1;
> + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
> + + ext4_ext_get_actual_len(newext));
> + if(uninitialized)
> + ext4_mark_uninitialized_ext(ex);
> eh = path[depth].p_hdr;
> nearex = ex;
> goto merge;

Hmm, I missed the point to re-mark an uninitialized extent here. If ex
is an uninitialized extent, the mark(the first bit the ee_len) shall
still there after the update, isn't? We already make sure that two
large uninitialized extent can't get merged if the resulting length will
take the first bit, which used as the mark of uninitialized extent.

2006-12-12 06:23:04

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

Hi Mingming,

On Mon, Dec 11, 2006 at 05:28:15PM -0800, Mingming Cao wrote:
> On Wed, 2006-12-06 at 11:28 +0530, Amit K. Arora wrote:
>
> > @@ -1142,13 +1155,22 @@
> > /* try to insert block into found extent and return */
> > if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
> > ext_debug("append %d block to %d:%d (from %llu)\n",
> > - le16_to_cpu(newext->ee_len),
> > + ext4_ext_get_actual_len(newext),
> > le32_to_cpu(ex->ee_block),
> > - le16_to_cpu(ex->ee_len), ext_pblock(ex));
> > + ext4_ext_get_actual_len(ex), ext_pblock(ex));
> > if ((err = ext4_ext_get_access(handle, inode, path + depth)))
> > return err;
> > - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
> > - + le16_to_cpu(newext->ee_len));
> > +
> > + /* ext4_can_extents_be_merged should have checked that either
> > + * both extents are uninitialized, or both aren't. Thus we
> > + * need to check any of them here.
> > + */
> > + if (ext4_ext_is_uninitialized(ex))
> > + uninitialized = 1;
> > + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
> > + + ext4_ext_get_actual_len(newext));

Above line will "remove" the uninitialized bit from "ex", if it was set.
We call ext4_ext_get_actual_len() to get the "actual" lengths of the two
extents, which removes this MSB in ee_len (MSB in ee_len is used to mark
an extent uninitialized). Now, we do this because if lengths of two
uninitialized extents will be added as it is (i.e. without masking out
the MSB in the length), it will result in removing the MSB in ee_len.
For example, 0x8002 + 0x8003 => 0x10005 => 0x5 (since ee_len is 16 bit).

That is why just before this line, we save the "state" of this extent,
whether it was uninitialized or not. And, we restore this "state" below.

> > + if(uninitialized)
> > + ext4_mark_uninitialized_ext(ex);
> > eh = path[depth].p_hdr;
> > nearex = ex;
> > goto merge;
>
> Hmm, I missed the point to re-mark an uninitialized extent here. If ex
> is an uninitialized extent, the mark(the first bit the ee_len) shall
> still there after the update, isn't? We already make sure that two
> large uninitialized extent can't get merged if the resulting length will
> take the first bit, which used as the mark of uninitialized extent.

Please get back if you do not agree with the explanation above and if I
am missing something here. Thanks!

Regards,
Amit Arora

2006-12-13 00:20:44

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote:
> Hi Mingming,
>
Hi Amit,

> On Mon, Dec 11, 2006 at 05:28:15PM -0800, Mingming Cao wrote:
> > On Wed, 2006-12-06 at 11:28 +0530, Amit K. Arora wrote:
> >
> > > @@ -1142,13 +1155,22 @@
> > > /* try to insert block into found extent and return */
> > > if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
> > > ext_debug("append %d block to %d:%d (from %llu)\n",
> > > - le16_to_cpu(newext->ee_len),
> > > + ext4_ext_get_actual_len(newext),
> > > le32_to_cpu(ex->ee_block),
> > > - le16_to_cpu(ex->ee_len), ext_pblock(ex));
> > > + ext4_ext_get_actual_len(ex), ext_pblock(ex));
> > > if ((err = ext4_ext_get_access(handle, inode, path + depth)))
> > > return err;
> > > - ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
> > > - + le16_to_cpu(newext->ee_len));
> > > +
> > > + /* ext4_can_extents_be_merged should have checked that either
> > > + * both extents are uninitialized, or both aren't. Thus we
> > > + * need to check any of them here.
> > > + */
> > > + if (ext4_ext_is_uninitialized(ex))
> > > + uninitialized = 1;
> > > + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
> > > + + ext4_ext_get_actual_len(newext));
>
> Above line will "remove" the uninitialized bit from "ex", if it was set.
> We call ext4_ext_get_actual_len() to get the "actual" lengths of the two
> extents, which removes this MSB in ee_len (MSB in ee_len is used to mark
> an extent uninitialized). Now, we do this because if lengths of two
> uninitialized extents will be added as it is (i.e. without masking out
> the MSB in the length), it will result in removing the MSB in ee_len.
> For example, 0x8002 + 0x8003 => 0x10005 => 0x5 (since ee_len is 16 bit).
>
> That is why just before this line, we save the "state" of this extent,
> whether it was uninitialized or not. And, we restore this "state" below.

Ah...you are right:)

More comments below...

> Index: linux-2.6.19/fs/ext4/ioctl.c
> ===================================================================
> --- linux-2.6.19.orig/fs/ext4/ioctl.c 2006-12-06 10:18:12.000000000 +0530
> +++ linux-2.6.19/fs/ext4/ioctl.c 2006-12-06 10:18:15.000000000 +0530
> @@ -248,6 +248,47 @@
> return err;
> }
>
> + case EXT4_IOC_PREALLOCATE: {
> + struct ext4_falloc_input input;
> + handle_t *handle;
> + ext4_fsblk_t block, max_blocks;
> + int ret = 0;
> + struct buffer_head map_bh;
> + unsigned int blkbits = inode->i_blkbits;
> +
> + if (IS_RDONLY(inode))
> + return -EROFS;
> +
> + if (copy_from_user(&input,
> + (struct ext4_falloc_input __user *) arg, sizeof(input)))
> + return -EFAULT;
> +
> + if (input.len == 0)
> + return -EINVAL;
> +
> + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
> + return -ENOTTY;
>

Supporting preallocation for extent based files seems fairly
straightforward. I agree we should look at this first. After get this
done, it probably worth re-consider whether to support preallocation for
non-extent based files on ext4. I could imagine user upgrade from ext3
to ext4, and expecting to use preallocation on those existing files....

> +
> + block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits;
> + max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits;
> + handle=ext4_journal_start(inode,
> + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
> + while(ret>=0 && ret<max_blocks)
> + {
> + block = block + ret;
> + max_blocks = max_blocks - ret;
> + ret = ext4_ext_get_blocks(handle, inode, block,
> + max_blocks, &map_bh,
> + EXT4_CREATE_UNINITIALIZED_EXT, 1);

Since the interface takes offset and number of blocks to allocate, I
assuming we are going to handle holes in preallocation, thus, we cannot
not mark the extend_size flag to 1 when calling ext4_ext_get_blocks.

I think we should update i_size and i_disksize after preallocation. Oh,
to protect parallel updating i_size, we have to take i_mutex down.

> + }
> + ext4_mark_inode_dirty(handle, inode);
> + ext4_journal_stop(handle);
> +

Error code returned by ext4_journal_stop() is being ignored here, is
this right?
Well, there are other places in ext34/ioctl.c which ignore the return
returned by ext4_journal_stop(), maybe should fix this in a separate
patch.

> + return ret>0?0:ret;
> + }
>
>
Oh, what if we failed to allocate the full amount of blocks? i.e, the
ext4_ext_get_blocks() returns -ENOSPC error and exit the loop early. Are
we going to return error, or try something like

if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry

I wonder it might be useful to return the actual number of blocks
preallocated back to the application.


Mingming

2006-12-13 10:02:08

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

On Tue, Dec 12, 2006 at 04:20:38PM -0800, Mingming Cao wrote:
> On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote:
> > +
> > + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
> > + return -ENOTTY;
> >
>
> Supporting preallocation for extent based files seems fairly
> straightforward. I agree we should look at this first. After get this
> done, it probably worth re-consider whether to support preallocation for
> non-extent based files on ext4. I could imagine user upgrade from ext3
> to ext4, and expecting to use preallocation on those existing files....

I gave a thought on this initially. But, I was not sure how we should
implement preallocation in a non-extent based file. Using extents we can
mark a set of blocks as unitialized, but how will we do this for
non-extent based files ? If we do not have a way to mark blocks
uninitialized, when someone will try to read from a preallocated block,
it will return junk/stale data instead of zeroes.

But, if we can think of a solution here then it will be as simple as
removing the check above and replacing "ext4_ext_get_blocks()" with
"ext4_get_blocks_wrap()" in the while() loop.

>
> > +
> > + block = EXT4_BLOCK_ALIGN(input.offset, blkbits) >> blkbits;
> > + max_blocks = EXT4_BLOCK_ALIGN(input.len, blkbits) >> blkbits;

I was wondering if I should change above lines to this :

+ block = input.offset >> blkbits;
+ max_blocks = (EXT4_BLOCK_ALIGN(input.len+input.offset,
blkbits) >> blkbits) - block;

Reason is that the block which contains the offset, should also be
preallocated. And the max_blocks should be calculated accordingly.

> > + while(ret>=0 && ret<max_blocks)
> > + {
> > + block = block + ret;
> > + max_blocks = max_blocks - ret;
> > + ret = ext4_ext_get_blocks(handle, inode, block,
> > + max_blocks, &map_bh,
> > + EXT4_CREATE_UNINITIALIZED_EXT, 1);
>
> Since the interface takes offset and number of blocks to allocate, I
> assuming we are going to handle holes in preallocation, thus, we cannot
> not mark the extend_size flag to 1 when calling ext4_ext_get_blocks.
>
> I think we should update i_size and i_disksize after preallocation. Oh,
> to protect parallel updating i_size, we have to take i_mutex down.

Okay. So, is this what you want to be done here :

+retry:
+ ret = 0;
+ while(ret>=0 && ret<max_blocks)
+ {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ ret = ext4_ext_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_CREATE_UNINITIALIZED_EXT,0);
+ if(ret > 0 && test_bit(BH_New, &map_bh.b_state))
+ nblocks = nblocks + ret;
+ }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
+ &retries))
+ goto retry;
+
+ if(nblocks) {
+ mutex_lock(&inode->i_mutex);
+ inode->i_size = inode->i_size + (nblocks >> blkbits);
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ mutex_unlock(&inode->i_mutex);
+ }

>
> > + }
> > + ext4_mark_inode_dirty(handle, inode);
> > + ext4_journal_stop(handle);
> > +
>
> Error code returned by ext4_journal_stop() is being ignored here, is
> this right?
> Well, there are other places in ext34/ioctl.c which ignore the return
> returned by ext4_journal_stop(), maybe should fix this in a separate
> patch.

Agreed. I think following should take care of it:

+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if(ret > 0)
+ ret = ret2;
+ return ret > 0 ? nblocks : ret;

> > + return ret>0?0:ret;
> > + }
> >
> >
> Oh, what if we failed to allocate the full amount of blocks? i.e, the
> ext4_ext_get_blocks() returns -ENOSPC error and exit the loop early. Are
> we going to return error, or try something like
>
> if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
> goto retry
>
> I wonder it might be useful to return the actual number of blocks
> preallocated back to the application.

Ok. Yes, makes sense. We can return the number of "new" blocks like
this:
+ return ret > 0 ? nblocks : ret;



Please let me know if you agree with the above set of changes, and any
further comments you have. I will then update and test the new patch and
post it again. Thanks!

Regards,
Amit Arora

2006-12-13 13:36:37

by Dave Kleikamp

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

On Wed, 2006-12-13 at 15:31 +0530, Amit K. Arora wrote:
> On Tue, Dec 12, 2006 at 04:20:38PM -0800, Mingming Cao wrote:
> > On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote:

> > Supporting preallocation for extent based files seems fairly
> > straightforward. I agree we should look at this first. After get this
> > done, it probably worth re-consider whether to support preallocation for
> > non-extent based files on ext4. I could imagine user upgrade from ext3
> > to ext4, and expecting to use preallocation on those existing files....

I disagree here. Why add the complexity for what is going to be a rare
case? In cases where a user is going to benefit from preallocation,
she'll probably also benefit from extents, and would be better off
making a copy of the file, thus converting it to extents.

> I gave a thought on this initially. But, I was not sure how we should
> implement preallocation in a non-extent based file. Using extents we can
> mark a set of blocks as unitialized, but how will we do this for
> non-extent based files ? If we do not have a way to mark blocks
> uninitialized, when someone will try to read from a preallocated block,
> it will return junk/stale data instead of zeroes.

If anything, the block-based preallocation could initialize all of the
data to zero. It would be slow, but it would still provide the correct
function and result in contiguous allocation.

Shaggy
--
David Kleikamp
IBM Linux Technology Center

2006-12-13 15:34:42

by Suparna Bhattacharya

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

On Wed, Dec 13, 2006 at 07:36:29AM -0600, Dave Kleikamp wrote:
> On Wed, 2006-12-13 at 15:31 +0530, Amit K. Arora wrote:
> > On Tue, Dec 12, 2006 at 04:20:38PM -0800, Mingming Cao wrote:
> > > On Tue, 2006-12-12 at 11:53 +0530, Amit K. Arora wrote:
>
> > > Supporting preallocation for extent based files seems fairly
> > > straightforward. I agree we should look at this first. After get this
> > > done, it probably worth re-consider whether to support preallocation for
> > > non-extent based files on ext4. I could imagine user upgrade from ext3
> > > to ext4, and expecting to use preallocation on those existing files....
>
> I disagree here. Why add the complexity for what is going to be a rare
> case? In cases where a user is going to benefit from preallocation,
> she'll probably also benefit from extents, and would be better off
> making a copy of the file, thus converting it to extents.
>
> > I gave a thought on this initially. But, I was not sure how we should
> > implement preallocation in a non-extent based file. Using extents we can
> > mark a set of blocks as unitialized, but how will we do this for
> > non-extent based files ? If we do not have a way to mark blocks
> > uninitialized, when someone will try to read from a preallocated block,
> > it will return junk/stale data instead of zeroes.
>
> If anything, the block-based preallocation could initialize all of the
> data to zero. It would be slow, but it would still provide the correct
> function and result in contiguous allocation.

And posix_fallocate does that already ...

Regards
Suparna

>
> Shaggy
> --
> David Kleikamp
> IBM Linux Technology Center
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
Suparna Bhattacharya ([email protected])
Linux Technology Center
IBM Software Lab, India

2006-12-13 15:54:41

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC][Patch 1/1] Persistent preallocation in ext4

Dave Kleikamp wrote:
>
> If anything, the block-based preallocation could initialize all of the
> data to zero. It would be slow, but it would still provide the correct
> function and result in contiguous allocation.
>
Agree. That seems goood enough.


Mingming

2006-12-15 12:35:39

by Amit K. Arora

[permalink] [raw]
Subject: [RFC][Patch 1/2] Persistent preallocation in ext4

This is the first patch in the set of two.

It implements the ioctl which will be used for persistent preallocation. It is a respun of the previous patch which was posted earlier, and includes following changes:
* Takes care of review comments by Mingming
* The declaration of extent related macros are now moved to ext4_fs_extent.h (from ext4_fs.h)
* Updated the logic to calculate block and max_blocks in ext4/ioctl.c, which is used to call get_blocks.

It does _not_ take care of implementing persistent preallocation for non-extent based files. It is because of the following reasons:
* It is being considered as a rare case
* Users can/should convert their file(s) to extent format to use this feature
* Moreover, posix_fallocate() can be used for this purpose, if the user does not want to convert the file(s) to the extent based format.


Signed-off-by: Amit Arora ([email protected])

---
fs/ext4/extents.c | 116 ++++++++++++++++++++++++++--------------
fs/ext4/ioctl.c | 59 ++++++++++++++++++++
include/linux/ext4_fs.h | 13 ++++
include/linux/ext4_fs_extents.h | 13 ++++
4 files changed, 163 insertions(+), 38 deletions(-)

Index: linux-2.6.19.prealloc/fs/ext4/extents.c
===================================================================
--- linux-2.6.19.prealloc.orig/fs/ext4/extents.c 2006-12-15 16:44:35.000000000 +0530
+++ linux-2.6.19.prealloc/fs/ext4/extents.c 2006-12-15 17:53:48.000000000 +0530
@@ -282,7 +282,7 @@
} else if (path->p_ext) {
ext_debug(" %d:%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- le16_to_cpu(path->p_ext->ee_len),
+ ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
ext_debug(" []");
@@ -305,7 +305,7 @@

for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
}
@@ -425,7 +425,7 @@
ext_debug(" -> %d:%llu:%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
- le16_to_cpu(path->p_ext->ee_len));
+ ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
{
@@ -686,7 +686,7 @@
ext_debug("move %d:%llu:%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
- le16_to_cpu(path[depth].p_ext->ee_len),
+ ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
sizeof(struct ext4_extent));
@@ -1097,7 +1097,19 @@
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+ unsigned short ext1_ee_len, ext2_ee_len;
+
+ /*
+ * Make sure that either both extents are uninitialized, or
+ * both are _not_.
+ */
+ if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+ return 0;
+
+ ext1_ee_len = ext4_ext_get_actual_len(ex1);
+ ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+ if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
le32_to_cpu(ex2->ee_block))
return 0;

@@ -1106,14 +1118,14 @@
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+ if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
return 0;
#ifdef AGRESSIVE_TEST
if (le16_to_cpu(ex1->ee_len) >= 4)
return 0;
#endif

- if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+ if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
return 1;
return 0;
}
@@ -1132,9 +1144,9 @@
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
- int depth, len, err, next;
+ int depth, len, err, next, uninitialized = 0;

- BUG_ON(newext->ee_len == 0);
+ BUG_ON(ext4_ext_get_actual_len(newext) == 0);
depth = ext_depth(inode);
ex = path[depth].p_ext;
BUG_ON(path[depth].p_hdr == NULL);
@@ -1142,13 +1154,22 @@
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append %d block to %d:%d (from %llu)\n",
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
if ((err = ext4_ext_get_access(handle, inode, path + depth)))
return err;
- ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
- + le16_to_cpu(newext->ee_len));
+
+ /* ext4_can_extents_be_merged should have checked that either
+ * both extents are uninitialized, or both aren't. Thus we
+ * need to check any of them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1203,7 +1224,7 @@
ext_debug("first extent in the leaf: %d:%llu:%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len));
+ ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
@@ -1216,7 +1237,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
}
@@ -1229,7 +1250,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
path[depth].p_ext = nearex;
@@ -1248,8 +1269,13 @@
if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
break;
/* merge with next extent! */
- nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
- + le16_to_cpu(nearex[1].ee_len));
+ if (ext4_ext_is_uninitialized(nearex))
+ uninitialized = 1;
+ nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
+ + ext4_ext_get_actual_len(nearex + 1));
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(nearex);
+
if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - nearex - 1)
* sizeof(struct ext4_extent);
@@ -1319,8 +1345,8 @@
end = le32_to_cpu(ex->ee_block);
if (block + num < end)
end = block + num;
- } else if (block >=
- le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
/* need to allocate space after found extent */
start = block;
end = block + num;
@@ -1332,7 +1358,8 @@
* by found extent
*/
start = block;
- end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
if (block + num < end)
end = block + num;
exists = 1;
@@ -1348,7 +1375,7 @@
cbex.ec_type = EXT4_EXT_CACHE_GAP;
} else {
cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = le16_to_cpu(ex->ee_len);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
cbex.ec_start = ext_pblock(ex);
cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
}
@@ -1556,12 +1583,12 @@
unsigned long from, unsigned long to)
{
struct buffer_head *bh;
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i;

#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
@@ -1575,12 +1602,12 @@
}
#endif
if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
unsigned long num;
ext4_fsblk_t start;
- num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
- start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %lu blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1588,12 +1615,12 @@
}
ext4_free_blocks(handle, inode, start, num);
} else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
} else {
printk("strange request: removal(2) %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
}
return 0;
}
@@ -1607,7 +1634,7 @@
struct ext4_extent_header *eh;
unsigned a, b, block, num;
unsigned long ex_ee_block;
- unsigned short ex_ee_len;
+ unsigned short ex_ee_len, uninitialized = 0;
struct ext4_extent *ex;

ext_debug("truncate since %lu in leaf\n", start);
@@ -1622,7 +1649,9 @@
ex = EXT_LAST_EXTENT(eh);

ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ if(ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex_ee_len = ext4_ext_get_actual_len(ex);

while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -1690,6 +1719,8 @@

ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(ex);

err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -1699,7 +1730,7 @@
ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
}

if (correct_index && eh->eh_entries)
@@ -1974,7 +2005,7 @@
if ((ex = path[depth].p_ext)) {
unsigned long ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext_pblock(ex);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
+ unsigned short ee_len;

/*
* Allow future support for preallocated extents to be added
@@ -1982,8 +2013,11 @@
* Uninitialized extents are treated as holes, except that
* we avoid (fail) allocating new blocks during a write.
*/
- if (ee_len > EXT_MAX_LEN)
+ if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN
+ && create != EXT4_CREATE_UNINITIALIZED_EXT)
goto out2;
+
+ ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
if (iblock >= ee_block && iblock < ee_block + ee_len) {
newblock = iblock - ee_block + ee_start;
@@ -1991,8 +2025,10 @@
allocated = ee_len - (iblock - ee_block);
ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
ee_block, ee_len, newblock);
- ext4_ext_put_in_cache(inode, ee_block, ee_len,
- ee_start, EXT4_EXT_CACHE_EXTENT);
+ /* Do not put uninitialized extent in the cache */
+ if(!ext4_ext_is_uninitialized(ex))
+ ext4_ext_put_in_cache(inode, ee_block, ee_len,
+ ee_start, EXT4_EXT_CACHE_EXTENT);
goto out;
}
}
@@ -2027,6 +2063,8 @@
newex.ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
+ if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
+ ext4_ext_mark_uninitialized(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err)
goto out2;
@@ -2038,8 +2076,10 @@
newblock = ext_pblock(&newex);
__set_bit(BH_New, &bh_result->b_state);

- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ /* Cache only when it is _not_ an uninitialized extent */
+ if(create!=EXT4_CREATE_UNINITIALIZED_EXT)
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
out:
if (allocated > max_blocks)
allocated = max_blocks;
Index: linux-2.6.19.prealloc/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-15 16:44:35.000000000 +0530
+++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-15 17:47:00.000000000 +0530
@@ -248,6 +248,65 @@
return err;
}

+ case EXT4_IOC_PREALLOCATE: {
+ struct ext4_falloc_input input;
+ handle_t *handle;
+ ext4_fsblk_t block, max_blocks;
+ int ret, ret2, nblocks = 0, retries = 0;
+ struct buffer_head map_bh;
+ unsigned int blkbits = inode->i_blkbits;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (copy_from_user(&input,
+ (struct ext4_falloc_input __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ if (input.len == 0)
+ return -EINVAL;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return -ENOTTY;
+
+ block = input.offset >> blkbits;
+ max_blocks = (EXT4_BLOCK_ALIGN(input.len + input.offset,
+ blkbits) >> blkbits) - block;
+ handle=ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+retry:
+ ret = 0;
+ while(ret>=0 && ret<max_blocks)
+ {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ ret = ext4_ext_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ if(ret > 0 && test_bit(BH_New, &map_bh.b_state))
+ nblocks = nblocks + ret;
+ }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
+ &retries))
+ goto retry;
+
+ if(nblocks) {
+ mutex_lock(&inode->i_mutex);
+ inode->i_size = inode->i_size + (nblocks >> blkbits);
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if(ret > 0)
+ ret = ret2;
+
+ return ret > 0 ? nblocks : ret;
+ }
+
default:
return -ENOTTY;
}
Index: linux-2.6.19.prealloc/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.19.prealloc.orig/include/linux/ext4_fs.h 2006-12-15 16:44:35.000000000 +0530
+++ linux-2.6.19.prealloc/include/linux/ext4_fs.h 2006-12-15 16:52:18.000000000 +0530
@@ -102,6 +102,8 @@
EXT4_GOOD_OLD_FIRST_INO : \
(s)->s_first_ino)
#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \
+ (~((1 << blkbits)-1)))

/*
* Macro-instructions used to manage fragments
@@ -225,6 +227,16 @@
__u32 free_blocks_count;
};

+/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */
+struct ext4_falloc_input {
+ __u64 offset;
+ __u64 len;
+};
+
+/* Following is used by preallocation logic to tell get_blocks() that we
+ * want uninitialzed extents.
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT 2

/*
* ioctl commands
@@ -242,6 +254,7 @@
#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input)

/*
* ioctl commands in 32 bit emulation
Index: linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h
===================================================================
--- linux-2.6.19.prealloc.orig/include/linux/ext4_fs_extents.h 2006-12-15 16:44:35.000000000 +0530
+++ linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h 2006-12-15 17:50:08.000000000 +0530
@@ -125,6 +125,19 @@
#define EXT4_EXT_CACHE_EXTENT 2

/*
+ * Macro-instructions to handle (mark/unmark/check/create) unitialized
+ * extents. Applications can issue an IOCTL for preallocation, which results
+ * in assigning unitialized extents to the file.
+ */
+#define ext4_ext_mark_uninitialized(ext) ((ext)->ee_len |= \
+ cpu_to_le16(0x8000))
+#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x8000)
+#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x7FFF)
+
+
+/*
* to be called by ext4_ext_walk_space()
* negative retcode - error
* positive retcode - signal for ext4_ext_walk_space(), see below

2006-12-15 12:39:28

by Amit K. Arora

[permalink] [raw]
Subject: [RFC][Patch 2/2] Persistent preallocation in ext4

This is the second patch in the set.

This patch makes writing to the unitialized extent possible. A write operation on an unitialized extent *may* (depending on the relative block location in the extent and number of blocks being written) result in spliting the extent. There are three possibilities:
1. The extent does not split : This will happen when the entire extent is being written to. In this case the extent will be marked "initialized" and merged (if possible) with the neighbouring extents in the tree.
2. The extent splits in two portions : This will happen when someone is writing to any one end of the extent (i.e. not in the middle, and not to the entire extent). This will result in breaking the extent in two portions, an initialized extent (the set of blocks being written to) and an uninitialized extent (rest of the blocks in the parent extent).
3. The extent is split in three parts: This occurs when someone writes in the middle of the extent. It will result into three extents, two uninitialized (at the both ends) and one initialized (in middle).

Since the extent merge logic was getting redundant, it has been put into a new function ext4_ext_try_to_merge(). This gets called from ext4_ext_insert_extent() and ext4_ext_get_blocks(), when required.


Signed-off-by: Amit Arora ([email protected])

---
fs/ext4/extents.c | 200 ++++++++++++++++++++++++++++++++++------
include/linux/ext4_fs_extents.h | 1
2 files changed, 171 insertions(+), 30 deletions(-)

Index: linux-2.6.19.prealloc/fs/ext4/extents.c
===================================================================
--- linux-2.6.19.prealloc.orig/fs/ext4/extents.c 2006-12-15 17:53:48.000000000 +0530
+++ linux-2.6.19.prealloc/fs/ext4/extents.c 2006-12-15 17:54:32.000000000 +0530
@@ -1131,6 +1131,50 @@
}

/*
+ * ext4_ext_try_to_merge:
+ * tries to merge the "ex" extent to the next extent in the tree.
+ * It always tries to merge towards right. If you want to merge towards
+ * left, pass "ex - 1" as argument instead of "ex".
+ * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
+ * 1 if they got merged.
+ */
+int ext4_ext_try_to_merge(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex)
+{
+ struct ext4_extent_header *eh;
+ unsigned int depth, len;
+ int merge_done=0, uninitialized = 0;
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ eh = path[depth].p_hdr;
+
+ while (ex < EXT_LAST_EXTENT(eh)) {
+ if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
+ break;
+ /* merge with next extent! */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(ex + 1));
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+
+ if (ex + 1 < EXT_LAST_EXTENT(eh)) {
+ len = (EXT_LAST_EXTENT(eh) - ex - 1)
+ * sizeof(struct ext4_extent);
+ memmove(ex + 1, ex + 2, len);
+ }
+ eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+ merge_done = 1;
+ BUG_ON(eh->eh_entries == 0);
+ }
+
+ return merge_done;
+}
+
+/*
* ext4_ext_insert_extent:
* tries to merge requsted extent into the existing extent or
* inserts requested extent as new one into the tree,
@@ -1265,25 +1309,7 @@

merge:
/* try to merge extents to the right */
- while (nearex < EXT_LAST_EXTENT(eh)) {
- if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
- break;
- /* merge with next extent! */
- if (ext4_ext_is_uninitialized(nearex))
- uninitialized = 1;
- nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
- + ext4_ext_get_actual_len(nearex + 1));
- if(uninitialized)
- ext4_ext_mark_uninitialized(nearex);
-
- if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
- len = (EXT_LAST_EXTENT(eh) - nearex - 1)
- * sizeof(struct ext4_extent);
- memmove(nearex + 1, nearex + 2, len);
- }
- eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
- BUG_ON(eh->eh_entries == 0);
- }
+ ext4_ext_try_to_merge(inode, path, nearex);

/* try to merge extents to the left */

@@ -1952,9 +1978,10 @@
int create, int extend_disksize)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent newex, *ex;
+ struct ext4_extent_header *eh;
+ struct ext4_extent newex, *ex, *ex1 = NULL, *ex2 = NULL, *ex3 = NULL;
ext4_fsblk_t goal, newblock;
- int err = 0, depth;
+ int err = 0, depth, ret;
unsigned long allocated = 0;

__clear_bit(BH_New, &bh_result->b_state);
@@ -2001,6 +2028,7 @@
* this is why assert can't be put in ext4_ext_find_extent()
*/
BUG_ON(path[depth].p_ext == NULL && depth != 0);
+ eh = path[depth].p_hdr;

if ((ex = path[depth].p_ext)) {
unsigned long ee_block = le32_to_cpu(ex->ee_block);
@@ -2008,15 +2036,9 @@
unsigned short ee_len;

/*
- * Allow future support for preallocated extents to be added
- * as an RO_COMPAT feature:
* Uninitialized extents are treated as holes, except that
- * we avoid (fail) allocating new blocks during a write.
+ * we split out initialized portions during a write.
*/
- if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN
- && create != EXT4_CREATE_UNINITIALIZED_EXT)
- goto out2;
-
ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
if (iblock >= ee_block && iblock < ee_block + ee_len) {
@@ -2026,10 +2048,126 @@
ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
- if(!ext4_ext_is_uninitialized(ex))
+ if(!ext4_ext_is_uninitialized(ex)) {
ext4_ext_put_in_cache(inode, ee_block, ee_len,
ee_start, EXT4_EXT_CACHE_EXTENT);
- goto out;
+ goto out;
+ }
+ if(create == EXT4_CREATE_UNINITIALIZED_EXT)
+ goto out;
+ if (!create)
+ goto out2;
+
+ /* Initializing write within an uninitialized extent */
+ /* Split in upto 3 extents .
+ * There are three possibilities:
+ * a> No split required: Entire extent should be
+ * initialized.
+ * b> Split into two extents: Only one end of the
+ * extent is being written to.
+ * c> Split into three extents: Somone is writing
+ * in middle of the extent.
+ */
+ ex2 = ex;
+
+ /* ex3: to ee_block + ee_len : uninitialised */
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ ex3 = &newex;
+ ex3->ee_block = cpu_to_le32(iblock
+ + max_blocks);
+ ext4_ext_store_pblock(ex3, newblock
+ + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated
+ - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode,
+ path, ex3);
+ if (err)
+ goto out2;
+
+ /* The depth, and hence eh & ex might change
+ * as part of the insert above.
+ */
+ newdepth = ext_depth(inode);
+ if(newdepth != depth)
+ {
+ depth=newdepth;
+ path = ext4_ext_find_extent(inode,
+ iblock, NULL);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out2;
+ }
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ex2 = ex;
+ }
+ allocated = max_blocks;
+ }
+
+ /* ex1: ee_block to iblock - 1 : uninitialized */
+ if (iblock > ee_block) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+
+ /* ex2: iblock to iblock + maxblocks-1 : initialised */
+ ex2->ee_block = cpu_to_le32(iblock);
+ ex2->ee_start = newblock;
+ ext4_ext_store_pblock(ex2, newblock);
+ ex2->ee_len = cpu_to_le16(allocated);
+ if (ex2 != ex)
+ goto insert;
+
+ if ((err = ext4_ext_get_access(handle, inode,
+ path + depth)))
+ goto out2;
+
+ /* New (initialized) extent starts from the first block
+ * in the current extent. i.e., ex2 == ex
+ * We have to see if it can be merged with the extent
+ * on the left.
+ */
+ if(ex2 > EXT_FIRST_EXTENT(eh)) {
+ /* To merge left, pass "ex2 - 1" to try_to_merge(),
+ * since it merges towards right _only_.
+ */
+ ret = ext4_ext_try_to_merge(inode,
+ path, ex2 - 1);
+ if(ret) {
+ err = ext4_ext_correct_indexes(handle,
+ inode, path);
+ if (err)
+ goto out2;
+ depth = ext_depth(inode);
+ ex2--;
+ }
+ }
+
+ /* Try to Merge towards right. This might be required
+ * only when the whole extent is being written to.
+ * i.e. ex2==ex and ex3==NULL.
+ */
+ if(!ex3) {
+ ret = ext4_ext_try_to_merge(inode, path, ex2);
+ if(ret) {
+ err = ext4_ext_correct_indexes(handle,
+ inode, path);
+ if (err)
+ goto out2;
+ }
+ }
+
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode,
+ path + depth);
+ if (err)
+ goto out2;
+ goto outnew;
}
}

@@ -2065,6 +2203,7 @@
newex.ee_len = cpu_to_le16(allocated);
if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
ext4_ext_mark_uninitialized(&newex);
+insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err)
goto out2;
@@ -2074,6 +2213,7 @@

/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
+outnew:
__set_bit(BH_New, &bh_result->b_state);

/* Cache only when it is _not_ an uninitialized extent */
Index: linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h
===================================================================
--- linux-2.6.19.prealloc.orig/include/linux/ext4_fs_extents.h 2006-12-15 17:50:08.000000000 +0530
+++ linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h 2006-12-15 17:54:17.000000000 +0530
@@ -203,6 +203,7 @@

extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_try_to_merge(struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);

2006-12-15 23:02:27

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

On Dec 15, 2006 18:09 +0530, Amit K. Arora wrote:
> This patch makes writing to the unitialized extent possible. A write operation on an unitialized extent *may* (depending on the relative block location in the extent and number of blocks being written) result in spliting the extent. There are three possibilities:
> 1. The extent does not split : This will happen when the entire extent is being written to. In this case the extent will be marked "initialized" and merged (if possible) with the neighbouring extents in the tree.

This should also be true if the write is at the beginning or the end of the
uninitialized extent and the disk allocation matches the previous or next
extent. The newly-written part is merged with the adjacent extent, and the
uninitialized extent is shrunk appropriately.

Doing this as a special case of #2 may result in extra tree rebalancing as
the extra extent is added and removed repeatedly (consider the case of a
large hole being overwritten in smaller chunks that is just at the limit
of the number of extents in the parent block).

> 2. The extent splits in two portions : This will happen when someone is writing to any one end of the extent (i.e. not in the middle, and not to the entire extent). This will result in breaking the extent in two portions, an initialized extent (the set of blocks being written to) and an uninitialized extent (rest of the blocks in the parent extent).
> 3. The extent is split in three parts: This occurs when someone writes in the middle of the extent. It will result into three extents, two uninitialized (at the both ends) and one initialized (in middle).
>
> Since the extent merge logic was getting redundant, it has been put into a new function ext4_ext_try_to_merge(). This gets called from ext4_ext_insert_extent() and ext4_ext_get_blocks(), when required.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-12-16 04:30:13

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

On Fri, Dec 15, 2006 at 04:02:25PM -0700, Andreas Dilger wrote:
> On Dec 15, 2006 18:09 +0530, Amit K. Arora wrote:
> > This patch makes writing to the unitialized extent possible. A write operation on an unitialized extent *may* (depending on the relative block location in the extent and number of blocks being written) result in spliting the extent. There are three possibilities:
> > 1. The extent does not split : This will happen when the entire extent is being written to. In this case the extent will be marked "initialized" and merged (if possible) with the neighbouring extents in the tree.
>
> This should also be true if the write is at the beginning or the end of the
> uninitialized extent and the disk allocation matches the previous or next
> extent. The newly-written part is merged with the adjacent extent, and the
> uninitialized extent is shrunk appropriately.

You are right. And the current patch takes care of that. If the write is
at the begining of the uninitialized extent, the first extent (from the
split) will be initialized ("ex2" in this case), and we do call
try_to_merge() to merge this with the previous extent, if possible. This
scenario can be seen as ex1 == NULL && ex2 == ex && ex3 != NULL

(Please note that "ex" is the uninitialized extent, and "ex2" is
_always_ the "initialized" extent being created, whether it is on left,
right or middle of the "parent" uninitialized extent)

If the initialized extent is the second one in the split (i.e. write is
happening on the later part of the uninitialized extent), it will result
in shirinking the existing uninitialized extent and "inserting" the new
initialized extent. insert_extent() will be called in this case, which
also tries to merge the extent with the neighbouring extents (both,
towards left and right side).
The following condition will hold true in this case:
ex1 != NULL && ex2 != ex && ex3 == NULL

>
> Doing this as a special case of #2 may result in extra tree rebalancing as
> the extra extent is added and removed repeatedly (consider the case of a
> large hole being overwritten in smaller chunks that is just at the limit
> of the number of extents in the parent block).

Yes, as I mentioned, the case #2 already handles this. I guess, I should
have been explicit about it in the description...

>
> > 2. The extent splits in two portions : This will happen when someone is writing to any one end of the extent (i.e. not in the middle, and not to the entire extent). This will result in breaking the extent in two portions, an initialized extent (the set of blocks being written to) and an uninitialized extent (rest of the blocks in the parent extent).
> > 3. The extent is split in three parts: This occurs when someone writes in the middle of the extent. It will result into three extents, two uninitialized (at the both ends) and one initialized (in middle).
> >
> > Since the extent merge logic was getting redundant, it has been put into a new function ext4_ext_try_to_merge(). This gets called from ext4_ext_insert_extent() and ext4_ext_get_blocks(), when required.
>
> Cheers, Andreas
> --
> Andreas Dilger
> Principal Software Engineer
> Cluster File Systems, Inc.


Regards,
----
Amit Arora ([email protected])
Linux Technology Center
IBM India

2006-12-19 11:05:38

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/2] Persistent preallocation in ext4

On Fri, Dec 15, 2006 at 06:05:28PM +0530, Amit K. Arora wrote:
> --- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-15 16:44:35.000000000 +0530
> +++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-15 17:47:00.000000000 +0530
:
:
> + handle=ext4_journal_start(inode,
> + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);

The current way how buffer credits are passed to ext4_journal_start()
above, is not correct. The max. number of blocks that we might modify
here should be calculated using ext4_ext_calc_credits_for_insert().
Thus the above line should be replaced with :

mutex_lock(&EXT4_I(inode)->truncate_mutex);
credits = ext4_ext_calc_credits_for_insert(inode, NULL);
mutex_unlock(&EXT4_I(inode)->truncate_mutex);
handle=ext4_journal_start(inode, credits +
EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 1);

Following is the revised patch with the above change.

Signed-off-by: Amit Arora ([email protected])
---
fs/ext4/extents.c | 116 ++++++++++++++++++++++++++--------------
fs/ext4/ioctl.c | 63 +++++++++++++++++++++
include/linux/ext4_fs.h | 13 ++++
include/linux/ext4_fs_extents.h | 13 ++++
4 files changed, 167 insertions(+), 38 deletions(-)

Index: linux-2.6.19.prealloc/fs/ext4/extents.c
===================================================================
--- linux-2.6.19.prealloc.orig/fs/ext4/extents.c 2006-12-19 16:09:00.000000000 +0530
+++ linux-2.6.19.prealloc/fs/ext4/extents.c 2006-12-19 16:23:37.000000000 +0530
@@ -282,7 +282,7 @@
} else if (path->p_ext) {
ext_debug(" %d:%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- le16_to_cpu(path->p_ext->ee_len),
+ ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
ext_debug(" []");
@@ -305,7 +305,7 @@

for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
}
@@ -425,7 +425,7 @@
ext_debug(" -> %d:%llu:%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
- le16_to_cpu(path->p_ext->ee_len));
+ ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
{
@@ -686,7 +686,7 @@
ext_debug("move %d:%llu:%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
- le16_to_cpu(path[depth].p_ext->ee_len),
+ ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
sizeof(struct ext4_extent));
@@ -1097,7 +1097,19 @@
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+ unsigned short ext1_ee_len, ext2_ee_len;
+
+ /*
+ * Make sure that either both extents are uninitialized, or
+ * both are _not_.
+ */
+ if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+ return 0;
+
+ ext1_ee_len = ext4_ext_get_actual_len(ex1);
+ ext2_ee_len = ext4_ext_get_actual_len(ex2);
+
+ if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
le32_to_cpu(ex2->ee_block))
return 0;

@@ -1106,14 +1118,14 @@
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+ if (ext1_ee_len + ext2_ee_len > EXT_MAX_LEN)
return 0;
#ifdef AGRESSIVE_TEST
if (le16_to_cpu(ex1->ee_len) >= 4)
return 0;
#endif

- if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+ if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
return 1;
return 0;
}
@@ -1132,9 +1144,9 @@
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
- int depth, len, err, next;
+ int depth, len, err, next, uninitialized = 0;

- BUG_ON(newext->ee_len == 0);
+ BUG_ON(ext4_ext_get_actual_len(newext) == 0);
depth = ext_depth(inode);
ex = path[depth].p_ext;
BUG_ON(path[depth].p_hdr == NULL);
@@ -1142,13 +1154,22 @@
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append %d block to %d:%d (from %llu)\n",
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ ext4_ext_get_actual_len(ex), ext_pblock(ex));
if ((err = ext4_ext_get_access(handle, inode, path + depth)))
return err;
- ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
- + le16_to_cpu(newext->ee_len));
+
+ /* ext4_can_extents_be_merged should have checked that either
+ * both extents are uninitialized, or both aren't. Thus we
+ * need to check any of them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1203,7 +1224,7 @@
ext_debug("first extent in the leaf: %d:%llu:%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len));
+ ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
@@ -1216,7 +1237,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
}
@@ -1229,7 +1250,7 @@
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
- le16_to_cpu(newext->ee_len),
+ ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
path[depth].p_ext = nearex;
@@ -1248,8 +1269,13 @@
if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
break;
/* merge with next extent! */
- nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
- + le16_to_cpu(nearex[1].ee_len));
+ if (ext4_ext_is_uninitialized(nearex))
+ uninitialized = 1;
+ nearex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(nearex)
+ + ext4_ext_get_actual_len(nearex + 1));
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(nearex);
+
if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - nearex - 1)
* sizeof(struct ext4_extent);
@@ -1319,8 +1345,8 @@
end = le32_to_cpu(ex->ee_block);
if (block + num < end)
end = block + num;
- } else if (block >=
- le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
/* need to allocate space after found extent */
start = block;
end = block + num;
@@ -1332,7 +1358,8 @@
* by found extent
*/
start = block;
- end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
if (block + num < end)
end = block + num;
exists = 1;
@@ -1348,7 +1375,7 @@
cbex.ec_type = EXT4_EXT_CACHE_GAP;
} else {
cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = le16_to_cpu(ex->ee_len);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
cbex.ec_start = ext_pblock(ex);
cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
}
@@ -1556,12 +1583,12 @@
unsigned long from, unsigned long to)
{
struct buffer_head *bh;
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
int i;

#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
spin_lock(&sbi->s_ext_stats_lock);
sbi->s_ext_blocks += ee_len;
sbi->s_ext_extents++;
@@ -1575,12 +1602,12 @@
}
#endif
if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
/* tail removal */
unsigned long num;
ext4_fsblk_t start;
- num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
- start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %lu blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
@@ -1588,12 +1615,12 @@
}
ext4_free_blocks(handle, inode, start, num);
} else if (from == le32_to_cpu(ex->ee_block)
- && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
} else {
printk("strange request: removal(2) %lu-%lu from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
}
return 0;
}
@@ -1607,7 +1634,7 @@
struct ext4_extent_header *eh;
unsigned a, b, block, num;
unsigned long ex_ee_block;
- unsigned short ex_ee_len;
+ unsigned short ex_ee_len, uninitialized = 0;
struct ext4_extent *ex;

ext_debug("truncate since %lu in leaf\n", start);
@@ -1622,7 +1649,9 @@
ex = EXT_LAST_EXTENT(eh);

ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ if(ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex_ee_len = ext4_ext_get_actual_len(ex);

while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -1690,6 +1719,8 @@

ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
+ if(uninitialized)
+ ext4_ext_mark_uninitialized(ex);

err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -1699,7 +1730,7 @@
ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
- ex_ee_len = le16_to_cpu(ex->ee_len);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
}

if (correct_index && eh->eh_entries)
@@ -1974,7 +2005,7 @@
if ((ex = path[depth].p_ext)) {
unsigned long ee_block = le32_to_cpu(ex->ee_block);
ext4_fsblk_t ee_start = ext_pblock(ex);
- unsigned short ee_len = le16_to_cpu(ex->ee_len);
+ unsigned short ee_len;

/*
* Allow future support for preallocated extents to be added
@@ -1982,8 +2013,11 @@
* Uninitialized extents are treated as holes, except that
* we avoid (fail) allocating new blocks during a write.
*/
- if (ee_len > EXT_MAX_LEN)
+ if (le16_to_cpu(ex->ee_len) > EXT_MAX_LEN
+ && create != EXT4_CREATE_UNINITIALIZED_EXT)
goto out2;
+
+ ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
if (iblock >= ee_block && iblock < ee_block + ee_len) {
newblock = iblock - ee_block + ee_start;
@@ -1991,8 +2025,10 @@
allocated = ee_len - (iblock - ee_block);
ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
ee_block, ee_len, newblock);
- ext4_ext_put_in_cache(inode, ee_block, ee_len,
- ee_start, EXT4_EXT_CACHE_EXTENT);
+ /* Do not put uninitialized extent in the cache */
+ if(!ext4_ext_is_uninitialized(ex))
+ ext4_ext_put_in_cache(inode, ee_block, ee_len,
+ ee_start, EXT4_EXT_CACHE_EXTENT);
goto out;
}
}
@@ -2027,6 +2063,8 @@
newex.ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(allocated);
+ if(create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
+ ext4_ext_mark_uninitialized(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err)
goto out2;
@@ -2038,8 +2076,10 @@
newblock = ext_pblock(&newex);
__set_bit(BH_New, &bh_result->b_state);

- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
- EXT4_EXT_CACHE_EXTENT);
+ /* Cache only when it is _not_ an uninitialized extent */
+ if(create!=EXT4_CREATE_UNINITIALIZED_EXT)
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
out:
if (allocated > max_blocks)
allocated = max_blocks;
Index: linux-2.6.19.prealloc/fs/ext4/ioctl.c
===================================================================
--- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-19 16:09:00.000000000 +0530
+++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-19 16:23:08.000000000 +0530
@@ -11,6 +11,7 @@
#include <linux/jbd2.h>
#include <linux/capability.h>
#include <linux/ext4_fs.h>
+#include <linux/ext4_fs_extents.h>
#include <linux/ext4_jbd2.h>
#include <linux/time.h>
#include <linux/compat.h>
@@ -248,6 +249,68 @@
return err;
}

+ case EXT4_IOC_PREALLOCATE: {
+ struct ext4_falloc_input input;
+ handle_t *handle;
+ ext4_fsblk_t block, max_blocks;
+ int ret, ret2, nblocks = 0, retries = 0;
+ struct buffer_head map_bh;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (copy_from_user(&input,
+ (struct ext4_falloc_input __user *) arg, sizeof(input)))
+ return -EFAULT;
+
+ if (input.len == 0)
+ return -EINVAL;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return -ENOTTY;
+
+ block = input.offset >> blkbits;
+ max_blocks = (EXT4_BLOCK_ALIGN(input.len + input.offset,
+ blkbits) >> blkbits) - block;
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ credits = ext4_ext_calc_credits_for_insert(inode, NULL);
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+ handle=ext4_journal_start(inode, credits +
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+retry:
+ ret = 0;
+ while(ret>=0 && ret<max_blocks)
+ {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ ret = ext4_ext_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ if(ret > 0 && test_bit(BH_New, &map_bh.b_state))
+ nblocks = nblocks + ret;
+ }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
+ &retries))
+ goto retry;
+
+ if(nblocks) {
+ mutex_lock(&inode->i_mutex);
+ inode->i_size = inode->i_size + (nblocks >> blkbits);
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if(ret > 0)
+ ret = ret2;
+
+ return ret > 0 ? nblocks : ret;
+ }
+
default:
return -ENOTTY;
}
Index: linux-2.6.19.prealloc/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.19.prealloc.orig/include/linux/ext4_fs.h 2006-12-19 16:09:00.000000000 +0530
+++ linux-2.6.19.prealloc/include/linux/ext4_fs.h 2006-12-19 16:09:12.000000000 +0530
@@ -102,6 +102,8 @@
EXT4_GOOD_OLD_FIRST_INO : \
(s)->s_first_ino)
#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) (((size)+(1 << blkbits)-1) & \
+ (~((1 << blkbits)-1)))

/*
* Macro-instructions used to manage fragments
@@ -225,6 +227,16 @@
__u32 free_blocks_count;
};

+/* The struct ext4_falloc_input in kernel, for EXT4_IOC_PREALLOCATE */
+struct ext4_falloc_input {
+ __u64 offset;
+ __u64 len;
+};
+
+/* Following is used by preallocation logic to tell get_blocks() that we
+ * want uninitialzed extents.
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT 2

/*
* ioctl commands
@@ -242,6 +254,7 @@
#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_PREALLOCATE _IOW('f', 9, struct ext4_falloc_input)

/*
* ioctl commands in 32 bit emulation
Index: linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h
===================================================================
--- linux-2.6.19.prealloc.orig/include/linux/ext4_fs_extents.h 2006-12-19 16:09:00.000000000 +0530
+++ linux-2.6.19.prealloc/include/linux/ext4_fs_extents.h 2006-12-19 16:23:37.000000000 +0530
@@ -125,6 +125,19 @@
#define EXT4_EXT_CACHE_EXTENT 2

/*
+ * Macro-instructions to handle (mark/unmark/check/create) unitialized
+ * extents. Applications can issue an IOCTL for preallocation, which results
+ * in assigning unitialized extents to the file.
+ */
+#define ext4_ext_mark_uninitialized(ext) ((ext)->ee_len |= \
+ cpu_to_le16(0x8000))
+#define ext4_ext_is_uninitialized(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x8000)
+#define ext4_ext_get_actual_len(ext) ((le16_to_cpu((ext)->ee_len))& \
+ 0x7FFF)
+
+
+/*
* to be called by ext4_ext_walk_space()
* negative retcode - error
* positive retcode - signal for ext4_ext_walk_space(), see below

2006-12-19 11:42:56

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

I wrote a simple tool to test these patches. The tool takes four
arguments:

* command: It may have either of the two values - "prealloc" or "write"
* filename: This is the filename with relative path
* offset: The offset within the file from where the preallocation, or
the write should start.
* length: Total number of bytes to be allocated/written from offset.

Following cases were tested :
1. * preallocation from 0 to 32MB
* write to various parts of the preallocated space in sets
* observed that the extents get split and also get merged

2. * preallocate with holes at various places in the file
* write to blocks starting from a hole and ending into preallocated
blocks and vice-versa
* try to write to entire set of blocks (i.e. from 0 to the last
preallocated block) which has holes in between.


I also tried some random preallocation and write operations. They seem
to work fine. There is a patch also ready for e2fsprogs utils to
recognize uninitialized extents, which I used to verify the results of
the above testcases. I will post that patch in the next mail.

Here is the code for the simple tool :


#include<stdio.h>
#include<stdlib.h>
#include<fcntl.h>
#include<errno.h>

#define EXT4_IOC_FALLOCATE 0x40106609

struct ext4_falloc_input {
unsigned long long offset;
unsigned long long len;
};

int do_prealloc(char* fname, struct ext4_falloc_input input)
{
int ret, fd = open(fname, O_CREAT|O_RDWR, 0666);

if(fd<0) {
printf("Error opening file %s\n", fname);
return 1;
}

printf("%s : Trying to preallocate blocks (offset=%llu, len=%llu)\n",
fname, input.offset, input.len);
ret = ioctl(fd, EXT4_IOC_FALLOCATE, &input);

if(ret <0) {
printf("IOCTL: received error %d, ret=%d\n", errno, ret);
close(fd);
exit(1);
}
printf("IOCTL succedded ! ret=%d\n", ret);
close(fd);
}

int do_write(char* fname, struct ext4_falloc_input input)
{
int ret, fd;
char *buf;

buf = (char *)malloc(input.len);

fd = open(fname, O_CREAT|O_RDWR, 0666);
if(fd<0) {
printf("Error opening file %s\n", fname);
return 1;
}

printf("%s : Trying to write to file (offset=%llu, len=%llu)\n",
fname, input.offset, input.len);

ret = lseek(fd, input.offset, SEEK_SET);
if(ret != input.offset) {
printf("lseek() failed error=%d, ret=%d\n", errno, ret);
close(fd);
return(1);
}

ret = write(fd, buf, input.len);
if(ret != input.len) {
printf("write() failed error=%d, ret=%d\n", errno, ret);
close(fd);
return(1);
}

printf("Write succedded ! Written %llu bytes ret=%d\n", input.len, ret);
close(fd);
}


int main(int argc, char **argv)
{
struct ext4_falloc_input input;
int ret = 1, fd;
char *fname;

if(argc<5) {
printf("%s <CMD: prealloc/write> <filename-with-path> <offset> <length>\n", argv[0]);
exit(1);
}

fname = argv[2];
input.offset=(unsigned long long)atol(argv[3]);;
input.len=(unsigned long long)atol(argv[4]);

if(input.offset<0 || input.len<= 0) {
printf("%s: Invalid arguments.\n", argv[0]);
exit(1);
}

if(!strcmp(argv[1], "prealloc"))
ret = do_prealloc(fname, input);
else if(!strcmp(argv[1], "write"))
ret = do_write(fname, input);
else
printf("%s: Invalid arguments.\n", argv[0]);

return ret;
}

2006-12-19 11:54:51

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

On Tue, Dec 19, 2006 at 05:12:51PM +0530, Amit K. Arora wrote:
> I also tried some random preallocation and write operations. They seem
> to work fine. There is a patch also ready for e2fsprogs utils to
> recognize uninitialized extents, which I used to verify the results of
> the above testcases. I will post that patch in the next mail.
>

This is a patch on top of e2fsprogs-1.39 + ext4 updates, which can be
used to see extent details (like - actual size, whether initialized or
uninitialized extent etc.) on an extent based file in ext4. debugfs
tool can be used for this, after compiling e2fsprogs with this patch.
This patch also enables EXT_DEBUG flag, so that the extent details get
displayed.


---
lib/ext2fs/bmap.c | 3 +-
lib/ext2fs/ext4_extents.h | 12 +++++++++-
lib/ext2fs/extents.c | 55 ++++++++++++++++++++++++++++------------------
3 files changed, 47 insertions(+), 23 deletions(-)

Index: e2fsprogs-1.39/lib/ext2fs/bmap.c
===================================================================
--- e2fsprogs-1.39.orig/lib/ext2fs/bmap.c 2006-12-19 11:53:48.000000000 +0530
+++ e2fsprogs-1.39/lib/ext2fs/bmap.c 2006-12-19 11:53:52.000000000 +0530
@@ -45,7 +45,8 @@
ex = EXT_FIRST_EXTENT(eh);
for (i = 0; i < eh->eh_entries; i++, ex++) {
if ((ex->ee_block <= block) &&
- (block < ex->ee_block + ex->ee_len)) {
+ (block < ex->ee_block +
+ ext4_ext_get_actual_len(ex))) {
*phys_blk = EXT4_EE_START(ex) +
(block - ex->ee_block);
return 0;
Index: e2fsprogs-1.39/lib/ext2fs/ext4_extents.h
===================================================================
--- e2fsprogs-1.39.orig/lib/ext2fs/ext4_extents.h 2006-12-19 11:53:48.000000000 +0530
+++ e2fsprogs-1.39/lib/ext2fs/ext4_extents.h 2006-12-19 15:55:32.000000000 +0530
@@ -37,7 +37,7 @@
* if EXT_DEBUG is defined you can use 'extdebug' mount option
* to get lots of info what's going on
*/
-//#define EXT_DEBUG
+#define EXT_DEBUG
#ifdef EXT_DEBUG
#define ext_debug(tree,fmt,a...) \
do { \
@@ -170,6 +170,16 @@

#define EXT_ASSERT(__x__) if (!(__x__)) BUG();

+/*
+ * Macro-instructions used to handle (mark/unmark/check/create) unitialized
+ * extents. Applications can issue an IOCTL for preallocation, which results
+ * in assigning unitialized extents to the file
+ */
+#define EXT4_CREATE_UNINITIALIZED_EXT 2
+#define ext4_ext_mark_uninitialized(ext) ((ext)->ee_len |= 0x8000)
+#define ext4_ext_is_uninitialized(ext) ((ext)->ee_len & 0x8000)
+#define ext4_ext_get_actual_len(ext) ((ext)->ee_len & 0x7FFF)
+

/*
* this structure is used to gather extents from the tree via ioctl
Index: e2fsprogs-1.39/lib/ext2fs/extents.c
===================================================================
--- e2fsprogs-1.39.orig/lib/ext2fs/extents.c 2006-12-19 11:53:48.000000000 +0530
+++ e2fsprogs-1.39/lib/ext2fs/extents.c 2006-12-19 11:55:03.000000000 +0530
@@ -36,9 +36,11 @@

void show_extent(struct ext4_extent *ex)
{
- printf("extent: block=%u-%u len=%u start=%u start_hi=%u\n",
- ex->ee_block, ex->ee_block + ex->ee_len - 1,
- ex->ee_len, ex->ee_start, ex->ee_start_hi);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
+ printf("extent[%c]: block=%u-%u len=%u start=%u start_hi=%u\n",
+ ext4_ext_is_uninitialized(ex) ? 'u' : 'i',
+ ex->ee_block, ex->ee_block + ee_len - 1, ee_len,
+ ex->ee_start, ex->ee_start_hi);
}
#else
#define show_header(eh) do { } while (0)
@@ -75,7 +77,7 @@
if (EXT4_EE_START(ex) > EXT2_BLOCKS_COUNT(fs->super))
return EXT2_ET_EXTENT_LEAF_BAD;

- if (ex->ee_len == 0)
+ if (ext4_ext_get_actual_len(ex) == 0)
return EXT2_ET_EXTENT_LEAF_BAD;

if (ex_prev) {
@@ -84,13 +86,14 @@
return EXT2_ET_EXTENT_LEAF_BAD;

/* extents must be in logical offset order */
- if (ex->ee_block < ex_prev->ee_block + ex_prev->ee_len)
+ if (ex->ee_block < ex_prev->ee_block +
+ ext4_ext_get_actual_len(ex_prev))
return EXT2_ET_EXTENT_LEAF_BAD;

/* extents must not overlap physical blocks */
- if ((EXT4_EE_START(ex) <
- EXT4_EE_START(ex_prev) + ex_prev->ee_len) &&
- (EXT4_EE_START(ex) + ex->ee_len > EXT4_EE_START(ex_prev)))
+ if ((EXT4_EE_START(ex) < EXT4_EE_START(ex_prev) +
+ ext4_ext_get_actual_len(ex_prev)) && (EXT4_EE_START(ex)
+ + ext4_ext_get_actual_len(ex) > EXT4_EE_START(ex_prev)))
return EXT2_ET_EXTENT_LEAF_BAD;
}

@@ -98,7 +101,8 @@
if (ex->ee_block < ix->ei_block)
return EXT2_ET_EXTENT_LEAF_BAD;

- if (ix_len && ex->ee_block + ex->ee_len > ix->ei_block + ix_len)
+ if (ix_len && ex->ee_block + ext4_ext_get_actual_len(ex) >
+ ix->ei_block + ix_len)
return EXT2_ET_EXTENT_LEAF_BAD;
}

@@ -144,6 +148,7 @@
{
int entry = ex - EXT_FIRST_EXTENT(eh);
struct ext4_extent *ex_new = ex + 1;
+ unsigned uninitialized=0;

if (entry < 0 || entry > eh->eh_entries)
return EXT2_ET_EXTENT_LEAF_BAD;
@@ -151,18 +156,25 @@
if (eh->eh_entries >= eh->eh_max)
return EXT2_ET_EXTENT_NO_SPACE;

- if (count > ex->ee_len)
+ if (count > ext4_ext_get_actual_len(ex))
return EXT2_ET_EXTENT_LEAF_BAD;

- if (count > ex->ee_len)
+ if (count > ext4_ext_get_actual_len(ex))
return EXT2_ET_EXTENT_LEAF_BAD;

+ if(ext4_ext_is_uninitialized(ex))
+ uninitialized=1;
+
memmove(ex_new, ex, (eh->eh_entries - entry) * sizeof(*ex));
++eh->eh_entries;

ex->ee_len = count;
ex_new->ee_len -= count;
ex_new->ee_block += count;
+ if(uninitialized) {
+ ext4_ext_mark_uninitialized(ex);
+ ext4_ext_mark_uninitialized(ex_new);
+ }
EXT4_EE_START_SET(ex_new, EXT4_EE_START(ex_new) + count);

return 0;
@@ -195,7 +207,7 @@
ex = EXT_FIRST_EXTENT(eh);
for (i = 0; i < eh->eh_entries; i++, ex++) {
show_extent(ex);
- for (j = 0; j < ex->ee_len; j++) {
+ for (j = 0; j < ext4_ext_get_actual_len(ex); j++) {
block_address = EXT4_EE_START(ex) + j;
flags = (*ctx->func)(ctx->fs, &block_address,
(ex->ee_block + j),
@@ -216,15 +228,15 @@
#endif

if (ex_prev &&
- block_address ==
- EXT4_EE_START(ex_prev) + ex_prev->ee_len &&
- ex->ee_block + j ==
- ex_prev->ee_block + ex_prev->ee_len) {
+ block_address == EXT4_EE_START(ex_prev) +
+ ext4_ext_get_actual_len(ex_prev) &&
+ ex->ee_block + j == ex_prev->ee_block +
+ ext4_ext_get_actual_len(ex_prev)) {
/* can merge block with prev extent */
ex_prev->ee_len++;

ex->ee_len--;
- if (ex->ee_len == 0) {
+ if (ext4_ext_get_actual_len(ex) == 0) {
/* no blocks left in this one */
ext2fs_extent_remove(eh, ex);
i--; ex--;
@@ -238,7 +250,7 @@
}
ret |= BLOCK_CHANGED;

- } else if (ex->ee_len == 1) {
+ } else if (ext4_ext_get_actual_len(ex) == 1) {
/* single-block extent is easy -
* change extent directly */
EXT4_EE_START_SET(ex, block_address);
@@ -250,7 +262,8 @@
ret |= BLOCK_ABORT | BLOCK_ERROR;
return ret;

- } else if (j > 0 && (ex + 1)->ee_len > 1 &&
+ } else if (j > 0 &&
+ ext4_ext_get_actual_len(ex + 1) > 1 &&
ext2fs_extent_split(eh, ex + 1, 1)) {
/* split after new block failed */
/* No multi-level split yet */
@@ -258,7 +271,7 @@
return ret;

} else if (j == 0) {
- if (ex->ee_len != 1) {
+ if (ext4_ext_get_actual_len(ex) != 1) {
/* this is an internal error */
ret |= BLOCK_ABORT |BLOCK_ERROR;
return ret;
@@ -269,7 +282,7 @@
} else {
ex++;
i++;
- if (ex->ee_len != 1) {
+ if (ext4_ext_get_actual_len(ex) != 1) {
/* this is an internal error */
ret |= BLOCK_ABORT |BLOCK_ERROR;
return ret;

2006-12-19 21:14:12

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote:
> I wrote a simple tool to test these patches. The tool takes four
> arguments:
>
> * command: It may have either of the two values - "prealloc" or "write"
> * filename: This is the filename with relative path
> * offset: The offset within the file from where the preallocation, or
> the write should start.
> * length: Total number of bytes to be allocated/written from offset.
>
> Following cases were tested :
> 1. * preallocation from 0 to 32MB
> * write to various parts of the preallocated space in sets
> * observed that the extents get split and also get merged
>
> 2. * preallocate with holes at various places in the file
> * write to blocks starting from a hole and ending into preallocated
> blocks and vice-versa
> * try to write to entire set of blocks (i.e. from 0 to the last
> preallocated block) which has holes in between.

An ideal test would be to modify fsx to (randomly) do preallocations
instead of truncates that increase the size.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-12-19 21:30:04

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

Andreas Dilger wrote:
> On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote:
>> I wrote a simple tool to test these patches. The tool takes four
>> arguments:
>>
>> * command: It may have either of the two values - "prealloc" or "write"
>> * filename: This is the filename with relative path
>> * offset: The offset within the file from where the preallocation, or
>> the write should start.
>> * length: Total number of bytes to be allocated/written from offset.
>>
>> Following cases were tested :
>> 1. * preallocation from 0 to 32MB
>> * write to various parts of the preallocated space in sets
>> * observed that the extents get split and also get merged
>>
>> 2. * preallocate with holes at various places in the file
>> * write to blocks starting from a hole and ending into preallocated
>> blocks and vice-versa
>> * try to write to entire set of blocks (i.e. from 0 to the last
>> preallocated block) which has holes in between.
>
> An ideal test would be to modify fsx to (randomly) do preallocations
> instead of truncates that increase the size.

the fsx in the xfs qa suite already does this (albeit with the special
xfs calls, of course)

But it might be worth looking at as a starting point, and also maybe to
keep the options and behavior similar to one* version of fsx that is out
there. :)

http://oss.sgi.com/cgi-bin/cvsweb.cgi/xfs-cmds/xfstests/ltp/fsx.c?rev=1.7

-Eric

*http://kernelslacker.livejournal.com/?skip=43

2006-12-20 06:28:45

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/2] Persistent preallocation in ext4

On Tue, Dec 19, 2006 at 02:12:06PM -0700, Andreas Dilger wrote:
> Minor edits (not worth a resubmit by itself):

Thanks, Andreas ! I will take care of these comments in the next
submission.

Regards,
Amit Arora
>
> On Dec 19, 2006 16:35 +0530, Amit K. Arora wrote:
> > + /* ext4_can_extents_be_merged should have checked that either
> > + * both extents are uninitialized, or both aren't. Thus we
> > + * need to check any of them here.
>
> s/any/only one/
>
> >
> > + case EXT4_IOC_PREALLOCATE: {
> > + if (IS_RDONLY(inode))
> > + return -EROFS;
> > +
> > + if (copy_from_user(&input,
> > + (struct ext4_falloc_input __user *) arg, sizeof(input)))
> > + return -EFAULT;
> > +
> > + if (input.len == 0)
> > + return -EINVAL;
> > +
> > + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
> > + return -ENOTTY;
>
> May as well put this check before copy_from_user(), since it doesn't need
> the user data and is much faster to check first.
>
> > +retry:
> > + ret = 0;
> > + while(ret>=0 && ret<max_blocks)
> > + {
>
> Opening brace always on same line, like "while() {"
>
> > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
> > + &retries))
>
> &retries should be aligned with start of (inode->i_sb, on previous line.
>
> > + if(nblocks) {
>
> Space between "if (" everywhere.
>
> Cheers, Andreas
> --
> Andreas Dilger
> Principal Software Engineer
> Cluster File Systems, Inc.

2006-12-20 08:24:27

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

On Tue, Dec 19, 2006 at 03:23:41PM -0600, Eric Sandeen wrote:
> Andreas Dilger wrote:
> > On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote:
> >> I wrote a simple tool to test these patches. The tool takes four
> >> arguments:
> >>
> >> * command: It may have either of the two values - "prealloc" or "write"
> >> * filename: This is the filename with relative path
> >> * offset: The offset within the file from where the preallocation, or
> >> the write should start.
> >> * length: Total number of bytes to be allocated/written from offset.
> >>
> >> Following cases were tested :
> >> 1. * preallocation from 0 to 32MB
> >> * write to various parts of the preallocated space in sets
> >> * observed that the extents get split and also get merged
> >>
> >> 2. * preallocate with holes at various places in the file
> >> * write to blocks starting from a hole and ending into preallocated
> >> blocks and vice-versa
> >> * try to write to entire set of blocks (i.e. from 0 to the last
> >> preallocated block) which has holes in between.
> >
> > An ideal test would be to modify fsx to (randomly) do preallocations
> > instead of truncates that increase the size.
>
> the fsx in the xfs qa suite already does this (albeit with the special
> xfs calls, of course)

Hmm... the fsx in xfs qa suite does preallocation, but _only_ at the
start of the testcase - i.e. only once. I guess what Andreas suggested is to
use random preallocations instead of random truncates. That will essentially
mean to call "doprealloc(size)" (which will preallocate size number of
bytes from start of the file), instead of dotruncate(size) in the
"test(void)" function; when fsx is passed with a relevant option (say
"-x"). Am I right ?

BTW, I plan to modify the fsx-linux.c in LTP for this. Will update on
how it goes...

>
> But it might be worth looking at as a starting point, and also maybe to
> keep the options and behavior similar to one* version of fsx that is out
> there. :)

Yes, it will be nice to keep the behavior and the option same here.

>
> http://oss.sgi.com/cgi-bin/cvsweb.cgi/xfs-cmds/xfstests/ltp/fsx.c?rev=1.7
>
> -Eric
>
> *http://kernelslacker.livejournal.com/?skip=43

2006-12-22 15:17:53

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

On Tue, Dec 19, 2006 at 02:14:09PM -0700, Andreas Dilger wrote:
> On Dec 19, 2006 17:12 +0530, Amit K. Arora wrote:
> > I wrote a simple tool to test these patches. The tool takes four
> > arguments:
> >
> > * command: It may have either of the two values - "prealloc" or "write"
> > * filename: This is the filename with relative path
> > * offset: The offset within the file from where the preallocation, or
> > the write should start.
> > * length: Total number of bytes to be allocated/written from offset.
> >
> > Following cases were tested :
> > 1. * preallocation from 0 to 32MB
> > * write to various parts of the preallocated space in sets
> > * observed that the extents get split and also get merged
> >
> > 2. * preallocate with holes at various places in the file
> > * write to blocks starting from a hole and ending into preallocated
> > blocks and vice-versa
> > * try to write to entire set of blocks (i.e. from 0 to the last
> > preallocated block) which has holes in between.

Hi Andreas,

> An ideal test would be to modify fsx to (randomly) do preallocations
> instead of truncates that increase the size.
Thanks for the suggestion. The modified fsx (used "fsx-linux" in LTP as
the base) did uncover couple of bugs. One has a straight forward fix of
using ext4_ext_get_actual_len() in ext4_ext_put_gap_in_cache(). Without
this change, it is resulting in a panic when a ftruncate() of size zero
is done on an extent enabled file with preallocated blocks.

The other problem is slightly more complicated and occurs in some
specific conditions (filesize >= 512KB and number of operations >=
100000). When it happens, it results in duplicate entries for a few
logical block numbers in the tree. It looks as if some kind of "leak" is
happening while splitting/merging extents. Following is a typical
example (this is an output from debugfs in e2fsprogs-1.39 utils, which
has the patch supporting preallocation applied to it):

debugfs 1.39 (29-May-2006)
debugfs: stat testfile
header: magic=f30a entries=1 max=4 depth=1 generation=0
index: block=0 leaf=1201 leaf_hi=0 unused=0
header: magic=f30a entries=16 max=84 depth=0 generation=0
extent[u]: block=0-25 len=26 start=645 start_hi=0
extent[i]: block=26-67 len=42 start=671 start_hi=0
extent[u]: block=68-502 len=435 start=713 start_hi=0
extent[i]: block=76-107 len=32 start=589 start_hi=0
extent[i]: block=116-131 len=16 start=629 start_hi=0
extent[i]: block=132-134 len=3 start=1257 start_hi=0
extent[i]: block=135-177 len=43 start=2154 start_hi=0
extent[i]: block=178-194 len=17 start=4206 start_hi=0
extent[i]: block=195-247 len=53 start=1148 start_hi=0
extent[i]: block=248-248 len=1 start=1202 start_hi=0
extent[i]: block=249-249 len=1 start=1249 start_hi=0
extent[i]: block=250-289 len=40 start=1203 start_hi=0
extent[i]: block=290-361 len=72 start=3161 start_hi=0
extent[i]: block=362-366 len=5 start=5222 start_hi=0
extent[i]: block=367-397 len=31 start=3238 start_hi=0
extent[i]: block=398-511 len=114 start=1351 start_hi=0
Inode: 11 Type: regular Mode: 0644 Flags: 0x80000 Generation:
3161384699
User: 0 Group: 0 Size: 524288
File ACL: 0 Directory ACL: 0
Links: 1 Blockcount: 1864
Fragment: Address: 0 Number: 0 Size: 0
ctime: 0x458bc897 -- Fri Dec 22 06:59:19 2006
atime: 0x458bc897 -- Fri Dec 22 06:59:19 2006
mtime: 0x458bc897 -- Fri Dec 22 06:59:19 2006
BLOCKS:
(IND):1201, (0-502):645-1147, (76-107):589-620, (116-131):629-644,
(132-134):1257-1259, (135-177):2154-2196, (178-194):4206-4222,
(195-247):1148-1200, (248):1202, (249):1249, (250-289):1203-1242,
(290-361):3161-3232, (362-366):5222-5226, (367-397):3238-3268,
(398-511):1351-1464
TOTAL: 932


Above we can see that block numbers from 68 to 502 are each covered by
more than one extent (besides couple of holes, which also might be part
of the same problem).

Note: A "u" in extent[u] donates that this extent is
uninitialized, and thus was created as part of preallocation and noone
has written to it. An "i" signifies that the extent is initialized.

I am trying to solve this issue currently. Any suggestions are more than
welcome.. :)

Regards,
Amit Arora

>
> Cheers, Andreas
> --
> Andreas Dilger
> Principal Software Engineer
> Cluster File Systems, Inc.

2006-12-22 16:10:06

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC][Patch 2/2] Persistent preallocation in ext4

>>>>> Amit K Arora (AKA) writes:

AKA> Above we can see that block numbers from 68 to 502 are each covered by
AKA> more than one extent (besides couple of holes, which also might be part
AKA> of the same problem).

AKA> Note: A "u" in extent[u] donates that this extent is
AKA> uninitialized, and thus was created as part of preallocation and noone
AKA> has written to it. An "i" signifies that the extent is initialized.

AKA> I am trying to solve this issue currently. Any suggestions are more than
AKA> welcome.. :)

I'd write a simple function that check given leaf for consistency
(in this case you need to check that every subsequent extent doesn't
overlap previous one) and add this check in few points (before and
after change). this worked very well, especially if all extents fit
single block.

thanks, Alex

2006-12-27 23:30:49

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC][Patch 1/2] Persistent preallocation in ext4

On Fri, 2006-12-15 at 18:05 +0530, Amit K. Arora wrote:
> This is the first patch in the set of two.
>
> It implements the ioctl which will be used for persistent preallocation. It is a respun of the previous patch which was posted earlier, and includes following changes:
> * Takes care of review comments by Mingming
> * The declaration of extent related macros are now moved to ext4_fs_extent.h (from ext4_fs.h)
> * Updated the logic to calculate block and max_blocks in ext4/ioctl.c, which is used to call get_blocks.
>
> It does _not_ take care of implementing persistent preallocation for non-extent based files. It is because of the following reasons:
> * It is being considered as a rare case
> * Users can/should convert their file(s) to extent format to use this feature
> * Moreover, posix_fallocate() can be used for this purpose, if the user does not want to convert the file(s) to the extent based format.
>
>
> Signed-off-by: Amit Arora ([email protected])
>
Hi Amit,

looks good to me, a few comments :)
.....
> Index: linux-2.6.19.prealloc/fs/ext4/ioctl.c
> ===================================================================
> --- linux-2.6.19.prealloc.orig/fs/ext4/ioctl.c 2006-12-15 16:44:35.000000000 +0530
> +++ linux-2.6.19.prealloc/fs/ext4/ioctl.c 2006-12-15 17:47:00.000000000 +0530
> @@ -248,6 +248,65 @@
> return err;
> }
>
> + case EXT4_IOC_PREALLOCATE: {
> + struct ext4_falloc_input input;
> + handle_t *handle;
> + ext4_fsblk_t block, max_blocks;
> + int ret, ret2, nblocks = 0, retries = 0;
> + struct buffer_head map_bh;
> + unsigned int blkbits = inode->i_blkbits;
> +
> + if (IS_RDONLY(inode))
> + return -EROFS;
> +
> + if (copy_from_user(&input,
> + (struct ext4_falloc_input __user *) arg, sizeof(input)))
> + return -EFAULT;
> +
> + if (input.len == 0)
> + return -EINVAL;
> +
> + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
> + return -ENOTTY;
> +
> + block = input.offset >> blkbits;
> + max_blocks = (EXT4_BLOCK_ALIGN(input.len + input.offset,
> + blkbits) >> blkbits) - block;
> + handle=ext4_journal_start(inode,
> + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)+max_blocks);
> + if (IS_ERR(handle))
> + return PTR_ERR(handle);
> +retry:
> + ret = 0;
> + while(ret>=0 && ret<max_blocks)
> + {
> + block = block + ret;
> + max_blocks = max_blocks - ret;
> + ret = ext4_ext_get_blocks(handle, inode, block,
> + max_blocks, &map_bh,
> + EXT4_CREATE_UNINITIALIZED_EXT, 0);
> + if(ret > 0 && test_bit(BH_New, &map_bh.b_state))
> + nblocks = nblocks + ret;
> + }


ext4_ext_get_blocks() returns 0 when it is mapping (non allocating) a
hole. In our case, we are doing allocating, so here it is not possible
to returns a 0 from ext4_ext_get_blocks(). I think we should quit the
loop and BUGON if ret == 0 here.

> + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
> + &retries))
> + goto retry;
> +
> + if(nblocks) {
> + mutex_lock(&inode->i_mutex);
> + inode->i_size = inode->i_size + (nblocks >> blkbits);
> + EXT4_I(inode)->i_disksize = inode->i_size;
> + mutex_unlock(&inode->i_mutex);
> + }

Hmm... We should not need to worry about the inode->i_size if we are
preallocating blocks for holes.

And, Looking at other places calling ext4_*_get_blocks() in the kernel,
it seems not all of them protected by i_mutex lock. I think it probably
okay to not holding i_mutex during calling ext4_ext4_get_blocks().

> +
> + ext4_mark_inode_dirty(handle, inode);
> + ret2 = ext4_journal_stop(handle);
> + if(ret > 0)
> + ret = ret2;
> +
> + return ret > 0 ? nblocks : ret;
> + }
> +

Since the API takes the number of bytes to preallocate, at return time,
shall we convert the blocks to bytes to the user?

Here it returns the number of allocated blocks to the user. Do we need
to worry about the case when dealing with a range with partial hole and
partial blocks already allocated? In that case nblocks(the new
preallocated blocks) will less than the maxblocks (the number of blocks
asked by application). I am wondering what does other filesystem like
xfs do? Maybe we should do the same thing.

Thanks,
Mingming

2007-01-02 11:04:17

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/2] Persistent preallocation in ext4

Hi Mingming,

On Wed, Dec 27, 2006 at 03:30:44PM -0800, Mingming Cao wrote:
> looks good to me, a few comments :)
Thanks for your comments!

> .....
> > + ret = ext4_ext_get_blocks(handle, inode, block,
> > + max_blocks, &map_bh,
> > + EXT4_CREATE_UNINITIALIZED_EXT, 0);
> > + if(ret > 0 && test_bit(BH_New, &map_bh.b_state))
> > + nblocks = nblocks + ret;
> > + }
>
>
> ext4_ext_get_blocks() returns 0 when it is mapping (non allocating) a
> hole. In our case, we are doing allocating, so here it is not possible
> to returns a 0 from ext4_ext_get_blocks(). I think we should quit the
> loop and BUGON if ret == 0 here.

Okay. I will add "BUG_ON(!ret);" just after get_blocks, above.

>
> > + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb,
> > + &retries))
> > + goto retry;
> > +
> > + if(nblocks) {
> > + mutex_lock(&inode->i_mutex);
> > + inode->i_size = inode->i_size + (nblocks >> blkbits);
> > + EXT4_I(inode)->i_disksize = inode->i_size;
> > + mutex_unlock(&inode->i_mutex);
> > + }
>
> Hmm... We should not need to worry about the inode->i_size if we are
> preallocating blocks for holes.

You are right. Will take care of this.

> And, Looking at other places calling ext4_*_get_blocks() in the kernel,
> it seems not all of them protected by i_mutex lock. I think it probably
> okay to not holding i_mutex during calling ext4_ext4_get_blocks().

We are not holding i_mutex lock during ext4_ext_get_blocks() call.
Above, this lock is being held inorder to avoid race while updating the
filesize in inode (reference your comment in a previous mail "I think we
should update i_size and i_disksize after preallocation. Oh,
to protect parallel updating i_size, we have to take i_mutex down.").
Perhaps, truncate_mutex lock should be used here, and not i_mutex.

>
> > +
> > + ext4_mark_inode_dirty(handle, inode);
> > + ret2 = ext4_journal_stop(handle);
> > + if(ret > 0)
> > + ret = ret2;
> > +
> > + return ret > 0 ? nblocks : ret;
> > + }
> > +
>
> Since the API takes the number of bytes to preallocate, at return time,
> shall we convert the blocks to bytes to the user?
>
> Here it returns the number of allocated blocks to the user. Do we need
> to worry about the case when dealing with a range with partial hole and
> partial blocks already allocated? In that case nblocks(the new
> preallocated blocks) will less than the maxblocks (the number of blocks
> asked by application). I am wondering what does other filesystem like
> xfs do? Maybe we should do the same thing.

I think xfs just returns 0 on success, and errno on an error. Do we
want to keep the same behavior here ? Or, should we return the number of
bytes preallocated ?

Thanks!

--
Regards,
Amit Arora

2007-01-02 22:47:42

by Mingming Cao

[permalink] [raw]
Subject: Re: [RFC][Patch 1/2] Persistent preallocation in ext4

On Tue, 2007-01-02 at 16:34 +0530, Amit K. Arora wrote:
> Hi Mingming,
> > And, Looking at other places calling ext4_*_get_blocks() in the kernel,
> > it seems not all of them protected by i_mutex lock. I think it probably
> > okay to not holding i_mutex during calling ext4_ext4_get_blocks().
>
> We are not holding i_mutex lock during ext4_ext_get_blocks() call.
> Above, this lock is being held inorder to avoid race while updating the
> filesize in inode (reference your comment in a previous mail "I think we
> should update i_size and i_disksize after preallocation. Oh,
> to protect parallel updating i_size, we have to take i_mutex down.").
> Perhaps, truncate_mutex lock should be used here, and not i_mutex.
>

truncate_mutex is hold at the entry of ext4_**_get_blocks() to protect
parallel block allocation. Here I was worried about concurrent modifying
i_size and i_disksize which were protected by the i_mutex lock. Sorry
for any confusion.

2007-01-09 09:05:44

by Amit K. Arora

[permalink] [raw]
Subject: Re: [RFC][Patch 1/2] Persistent preallocation in ext4

On Tue, Jan 02, 2007 at 04:34:09PM +0530, Amit K. Arora wrote:
> On Wed, Dec 27, 2006 at 03:30:44PM -0800, Mingming Cao wrote:
> > Since the API takes the number of bytes to preallocate, at return time,
> > shall we convert the blocks to bytes to the user?
> >
> > Here it returns the number of allocated blocks to the user. Do we need
> > to worry about the case when dealing with a range with partial hole and
> > partial blocks already allocated? In that case nblocks(the new
> > preallocated blocks) will less than the maxblocks (the number of blocks
> > asked by application). I am wondering what does other filesystem like
> > xfs do? Maybe we should do the same thing.
>
> I think xfs just returns 0 on success, and errno on an error. Do we
> want to keep the same behavior here ? Or, should we return the number of
> bytes preallocated ?

We still need to decide on what the ioctl should return. Should it
return zero on success and errno on error, like how posix_fallocate and
xfs behave ? If yes, then should we undo partial preallocation (if any)
in case of an error (say ENOSPC) ?

If no, then should we return the number of bytes preallocated ? In this
case we have to think about the situation Mingming mentioned above (i.e.
when the preallocation request partially spans through a hole and
partially through few already allocated blocks).

--
Regards,
Amit Arora