Hi all.
This patch series implements delayed extent tree for ext4.
Compared with previous patches, delayed extents are managed by
RB tree instead.
For more infomation, please refer to the 3rd patch, the introduction
is put in fs/ext4/delayed_extents.c.
Feedbacks are welcome.
Yongqiang.
[RFC PATCH V2 1/6] ext4: add two structures supporting delayed
[RFC PATCH V2 2/6] ext4: add a delayed extents tree in inode info
[RFC PATCH V2 3/6] ext4: add operations on delayed extent tree
[RFC PATCH V2 4/6] ext4: initialize delayed extent tree
[RFC PATCH V2 5/6] ext4: let ext4 maintian delayed extent trees
[RFC PATCH V2 6/6] ext4: reimplement fiemap on delayed extent tree
This patch adds two structures that supports delayed extent tree.
Signed-off-by: Yongqiang Yang <[email protected]>
---
fs/ext4/delayed_extents.h | 22 ++++++++++++++++++++++
1 files changed, 22 insertions(+), 0 deletions(-)
create mode 100644 fs/ext4/delayed_extents.h
diff --git a/fs/ext4/delayed_extents.h b/fs/ext4/delayed_extents.h
new file mode 100644
index 0000000..66d6390
--- /dev/null
+++ b/fs/ext4/delayed_extents.h
@@ -0,0 +1,22 @@
+/*
+ * fs/ext4/delayed_extents.h
+ *
+ * Written by Yongqiang Yang <[email protected]>
+ *
+ */
+
+#ifndef _EXT4_DELAYED_EXTENTS_H
+#define _EXT4_DELAYED_EXTENTS_H
+
+struct delayed_extent {
+ struct rb_node rb_node;
+ ext4_lblk_t start; /* first block extent covers */
+ ext4_lblk_t len; /* length of extent ib block */
+};
+
+struct ext4_de_tree {
+ struct rb_root root;
+ struct delayed_extent *cache_de; /* recently accessed extent */
+};
+
+#endif /* _EXT4_DELAYED_EXTENTS_H */
--
1.7.5.1
This patch adds a delayed extents tree in inode info, so that
delayed extents can be look up quickly. Without the tree
delayed extents are identified by looking up page cache.
With the tree, FIEMAP, SEEK_HOLE/DATA, bigalloc and writeout
path can be optimized a lot.
Signed-off-by: Yongqiang Yang <[email protected]>
---
fs/ext4/ext4.h | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ccfa81f..d3c6b97 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -760,6 +760,8 @@ struct ext4_ext_cache {
__u32 ec_len; /* must be 32bit to return holes */
};
+#include "delayed_extents.h"
+
/*
* fourth extended file system inode data in memory
*/
@@ -837,6 +839,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* delayed extents */
+ struct ext4_de_tree i_de_tree;
+
/* ialloc */
ext4_group_t i_last_alloc_group;
--
1.7.5.1
This patch adds operations on a delayed extent tree.
Signed-off-by; Yongqiang Yang <[email protected]>
---
fs/ext4/Makefile | 2 +-
fs/ext4/delayed_extents.c | 412 +++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/delayed_extents.h | 18 ++
3 files changed, 431 insertions(+), 1 deletions(-)
create mode 100644 fs/ext4/delayed_extents.c
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 56fd8f86..ee16ad3 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
- mmp.o indirect.o
+ mmp.o indirect.o delayed_extents.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
new file mode 100644
index 0000000..8da7b78
--- /dev/null
+++ b/fs/ext4/delayed_extents.c
@@ -0,0 +1,412 @@
+/*
+ * fs/ext4/delayed_extents.c
+ *
+ * Written by Yongqiang Yang <[email protected]>
+ *
+ * Ext4 delayed extents core functions.
+ */
+#include <linux/rbtree.h>
+#include "ext4.h"
+#include "delayed_extents.h"
+#include "ext4_extents.h"
+
+/*
+ * delayed extent implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * 1. Why delayed extent implementation ?
+ *
+ * Without delayed extent, ext4 identifies a delayed extent by looking up
+ * page cache, this has several deficiencies - complicated, buggy, and
+ * inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need to know if
+ * a block or a range of blocks are belonged to a delayed extent.
+ *
+ * Let us have a look at how they do without delayed extents implementation.
+ * -- FIEMAP
+ * FIEMAP looks up page cache to identify delayed allocations from holes.
+ *
+ * -- SEEK_HOLE/DATA
+ * SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ * -- bigalloc
+ * bigalloc looks up page cache to figure out if a block is already
+ * under delayed allocation or not to determine whether quota reserving
+ * is needed for the cluster.
+ *
+ * -- punch hole
+ * punch hole looks up page cache to identify a delayed extent.
+ *
+ * -- writeout
+ * Writeout looks up whole page cache to see if a buffer is mapped, If
+ * there are not very many delayed buffers, then it is time comsuming.
+ *
+ * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, bigalloc and
+ * writeout can figure out if a block or a range of blocks is under delayed
+ * allocation(belonged to a delayed extent) or not by searching the delayed
+ * extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. ext4 delayed extents impelmentation
+ *
+ * -- delayed extent
+ * A delayed extent is a range of blocks which are contiguous logically and
+ * under delayed allocation. Unlike extent in ext4, delayed extent in ext4
+ * is a in-memory struct, there is no corresponding on-disk data. There is
+ * no limit on length of delayed extent, so a delayed extent can contain as
+ * many blocks as they are contiguous logically.
+ *
+ * -- delayed extent tree
+ * Every inode has a delayed extent tree and all under delayed allocation
+ * blocks are added to the tree as dealyed extents. Delayed extents in
+ * the tree are ordered by logical block no.
+ *
+ * -- operations on a delayed extent tree
+ * There are three operations on a delayed extent tree: find next delayed
+ * extent, adding a space(a range of blocks) and removing a space.
+ *
+ * -- race on a delayed extent tree
+ * Delayed extent tree is protected inode->i_data_sem like extent tree.
+ *
+ *
+ * ==========================================================================
+ * 3. performance analysis
+ * -- overhead
+ * 1. Apart from operations on a delayed extent tree, we need to
+ * down_write(inode->i_data_sem) in delayed write path to maintain delayed
+ * extent tree, this can have impact on parallel read-write and write-write
+ *
+ * 2. There is a cache extent for write access, so if writes are not very
+ * random, adding space operaions are in O(1) time.
+ *
+ * -- gain
+ * 3. Code is much simpler, more readable, more maintainable and
+ * more efficient.
+ *
+ * 4. TODO
+ * -- cache_de in de_find_extent
+ * de_find_extent is used heavily by bgalloc, so de_find_extent should use
+ * cache_de.
+ *
+ */
+static struct kmem_cache *ext4_de_cachep;
+
+int __init ext4_init_de(void)
+{
+ ext4_de_cachep = KMEM_CACHE(delayed_extent, SLAB_RECLAIM_ACCOUNT);
+ if (ext4_de_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_de(void)
+{
+ if (ext4_de_cachep)
+ kmem_cache_destroy(ext4_de_cachep);
+}
+
+void ext4_de_init_tree(struct ext4_de_tree *tree)
+{
+ tree->root = RB_ROOT;
+ tree->cache_de = NULL;
+}
+
+#ifdef DE_DEBUG
+static void ext4_de_print_tree(struct inode *inode)
+{
+ struct ext4_de_tree *tree;
+ struct rb_node *node;
+
+ printk(KERN_DEBUG "delayed extents for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_de_tree;
+ node = rb_first(&tree->root);
+ while(node) {
+ struct delayed_extent *de;
+ de = rb_entry(node, struct delayed_extent, rb_node);
+ printk(KERN_DEBUG " [%u/%u)", de->start, de->len);
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_de_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t delayed_extent_end(struct delayed_extent *de)
+{
+ if (de->start + de->len < de->start)
+ return (ext4_lblk_t)-1;
+ return de->start + de->len;
+}
+
+/*
+ * search through the tree for an delayed_extent with a given offset. If
+ * it can't be found, try to find next extent.
+ */
+static struct delayed_extent * __de_tree_search(struct rb_root *root,
+ ext4_lblk_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct delayed_extent *de = NULL;
+
+ while (node) {
+ de = rb_entry(node, struct delayed_extent, rb_node);
+ if (offset < de->start)
+ node = node->rb_left;
+ else if (offset >= delayed_extent_end(de))
+ node = node->rb_right;
+ else
+ return de;
+ }
+
+ if (de && offset < de->start)
+ return de;
+
+ if (de && offset >= delayed_extent_end(de))
+ return rb_entry(rb_next(&de->rb_node),
+ struct delayed_extent, rb_node);
+
+ return NULL;
+}
+
+/*
+ * ext4_de_first_extent_since: find the 1st delayed extent covering @start
+ * if it exists, otherwise, the next extent after @start.
+ *
+ * @inode: the inode which owns delayed extents
+ * @offset: logic block
+ *
+ * Returns next block beyond the found extent.
+ * Delayed extent is returned via @de.
+ */
+ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de)
+{
+ struct ext4_de_tree *tree;
+ struct delayed_extent *de1;
+ struct rb_node *node;
+
+ de->len = 0;
+ tree = &EXT4_I(inode)->i_de_tree;
+ de1 = __de_tree_search(&tree->root, de->start);
+ if (de1) {
+ de->start = de1->start;
+ de->len = de1->len;
+ node = rb_next(&de1->rb_node);
+ if (node) {
+ de1 = rb_entry(node, struct delayed_extent, rb_node);
+ return de1->start;
+ }
+ }
+
+ return EXT_MAX_BLOCKS;
+}
+
+static struct delayed_extent *
+ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+{
+ struct delayed_extent *de;
+ de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
+ if (de == NULL)
+ return NULL;
+ de->start = start;
+ de->len = len;
+ return de;
+}
+
+static void ext4_de_free_extent(struct delayed_extent *de)
+{
+ kmem_cache_free(ext4_de_cachep, de);
+}
+
+static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree,
+ struct delayed_extent *de)
+{
+ struct delayed_extent *de1;
+ struct rb_node *node;
+
+ node = rb_prev(&de->rb_node);
+ if (!node)
+ return;
+
+ de1 = rb_entry(node, struct delayed_extent, rb_node);
+ if (delayed_extent_end(de1) == de->start) {
+ de1->len += de->len;
+ rb_erase(&de->rb_node, &tree->root);
+ if (de == tree->cache_de)
+ tree->cache_de = de1;
+ ext4_de_free_extent(de);
+ }
+}
+
+static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree,
+ struct delayed_extent *de)
+{
+ struct delayed_extent *de1;
+ struct rb_node *node;
+
+ node = rb_next(&de->rb_node);
+ if (!node)
+ return;
+
+ de1 = rb_entry(node, struct delayed_extent, rb_node);
+ if (de1->start == delayed_extent_end(de)) {
+ de->len += de1->len;
+ rb_erase(node, &tree->root);
+ if (de1 == tree->cache_de)
+ tree->cache_de = de;
+ ext4_de_free_extent(de1);
+ }
+}
+
+/*
+ * ext4_de_add_space adds a space to a delayed extent tree.
+ * Caller holds inode->i_data_sem.
+ *
+ * ext4_de_add_space is callyed by ext4_dealyed_write_begin and
+ * ext4_de_remove_space.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len)
+{
+ struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree;
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct delayed_extent *de;
+ ext4_lblk_t end = offset + len;
+
+ BUG_ON(end <= offset);
+
+ de = tree->cache_de;
+ de_debug("add [%u/%u) to delayed extent list of inode %lu\n",
+ offset, len, inode->i_ino);
+
+ if (de && delayed_extent_end(de) == offset) {
+ de->len += len;
+ ext4_de_try_to_merge_right(tree, de);
+ goto out;
+ } else if (de && de->start == end) {
+ de->start = offset;
+ de->len += len;
+ ext4_de_try_to_merge_left(tree, de);
+ goto out;
+ } else if (de && de->start <= offset &&
+ delayed_extent_end(de) >= end)
+ goto out;
+
+ while (*p) {
+ parent = *p;
+ de = rb_entry(parent, struct delayed_extent, rb_node);
+
+ if (offset < de->start) {
+ if (end == de->start) {
+ de->len += len;
+ de->start = offset;
+ goto out;
+ }
+ p = &(*p)->rb_left;
+ } else if (offset > delayed_extent_end(de)) {
+ if (delayed_extent_end(de) == offset) {
+ de->len += len;
+ goto out;
+ }
+ p = &(*p)->rb_right;
+ } else
+ goto out;
+ }
+
+ de = ext4_de_alloc_extent(offset, len);
+ if (!de)
+ return -ENOMEM;
+ rb_link_node(&de->rb_node, parent, p);
+ rb_insert_color(&de->rb_node, &tree->root);
+
+out:
+ tree->cache_de = de;
+ ext4_de_print_tree(inode);
+
+ return 0;
+}
+
+/*
+ * ext4_de_remove_space() removes a space from a delayed extent tree.
+ * Caller holds inode->i_data_sem.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_de_remove_space(struct inode *inode, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct rb_node *node;
+ struct ext4_de_tree *tree;
+ struct delayed_extent *de;
+ struct delayed_extent orig_de;
+ ext4_lblk_t len1, len2, end;
+
+ de_debug("remove [%u/%u) from delayed extent list of inode %lu\n",
+ offset, len, inode->i_ino);
+
+ end = offset + len;
+ BUG_ON(end <= offset);
+ tree = &EXT4_I(inode)->i_de_tree;
+ de = __de_tree_search(&tree->root, offset);
+ if (!de)
+ goto out;
+
+ orig_de.start = de->start;
+ orig_de.len = de->len;
+ len1 = offset > de->start ? offset - de->start : 0;
+ len2 = delayed_extent_end(de) > end ?
+ delayed_extent_end(de) - end : 0;
+ if (len1 > 0)
+ de->len = len1;
+ if (len2 > 0) {
+ if (len1 > 0) {
+ int err;
+ err = ext4_de_add_space(inode, end, len2);
+ if (err) {
+ de->start = orig_de.start;
+ de->len = orig_de.len;
+ return err;
+ }
+ } else {
+ de->start = end;
+ de->len = len2;
+ }
+ goto out;
+ }
+
+ if (len1 > 0) {
+ node = rb_next(&de->rb_node);
+ if (!node)
+ de = rb_entry(node, struct delayed_extent, rb_node);
+ else
+ de = NULL;
+ }
+
+ while (de && delayed_extent_end(de) <= end) {
+ node = rb_next(&de->rb_node);
+ rb_erase(&de->rb_node, &tree->root);
+ if (de == tree->cache_de)
+ tree->cache_de = NULL;
+ ext4_de_free_extent(de);
+ if (!node) {
+ de = NULL;
+ break;
+ }
+ de = rb_entry(node, struct delayed_extent, rb_node);
+ }
+
+ if (de && de->start < end) {
+ len1 = delayed_extent_end(de) - end;
+ de->start = end;
+ de->len = len1;
+ }
+
+out:
+ ext4_de_print_tree(inode);
+ return 0;
+}
diff --git a/fs/ext4/delayed_extents.h b/fs/ext4/delayed_extents.h
index 66d6390..3f649d9 100644
--- a/fs/ext4/delayed_extents.h
+++ b/fs/ext4/delayed_extents.h
@@ -8,6 +8,13 @@
#ifndef _EXT4_DELAYED_EXTENTS_H
#define _EXT4_DELAYED_EXTENTS_H
+#define DE_DEBUG
+#ifdef DE_DEBUG
+#define de_debug(a...) printk(a)
+#else
+#define de_debug(a...)
+#endif
+
struct delayed_extent {
struct rb_node rb_node;
ext4_lblk_t start; /* first block extent covers */
@@ -19,4 +26,15 @@ struct ext4_de_tree {
struct delayed_extent *cache_de; /* recently accessed extent */
};
+extern int __init ext4_init_de(void);
+extern void ext4_exit_de(void);
+extern void ext4_de_init_tree(struct ext4_de_tree *tree);
+
+extern int ext4_de_add_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
+extern int ext4_de_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
+extern ext4_lblk_t ext4_de_find_extent(struct inode *inode,
+ struct delayed_extent *de);
+
#endif /* _EXT4_DELAYED_EXTENTS_H */
--
1.7.5.1
Let ext4 initialize delayed extent tree of a inode.
Signed-off-by: Yongqiang Yang <[email protected]>
---
fs/ext4/super.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 44d0c8d..247fcdd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -882,6 +882,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ ext4_de_init_tree(&ei->i_de_tree);
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
--
1.7.5.1
This patch let ext4 maintain delayed extent trees.
Signed-off-by: Yongqiang Yang <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/extents.c | 2 ++
fs/ext4/indirect.c | 3 +++
fs/ext4/inode.c | 28 ++++++++++++++++++++++++++--
fs/ext4/super.c | 12 +++++++++++-
5 files changed, 43 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3c6b97..177ec0a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -519,6 +519,7 @@ struct ext4_new_group_data {
#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
+#define EXT4_GET_BLOCKS_DEALLOC 0x0080
/*
* Flags used by ext4_free_blocks
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9124cd2..bdbb984 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3688,6 +3688,8 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+ err = ext4_de_remove_space(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
err = ext4_ext_remove_space(inode, last_block);
/* In a multi-transaction truncate, we only make the final
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 0962642..25cdb5b 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
#include "truncate.h"
#include <trace/events/ext4.h>
@@ -1383,6 +1384,8 @@ void ext4_ind_truncate(struct inode *inode)
down_write(&ei->i_data_sem);
ext4_discard_preallocations(inode);
+ ext4_de_remove_space(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
/*
* The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f86b149..0f9f108 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -442,7 +442,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ int ret;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ /* delayed alloc may be allocated by fallocate,
+ * we need to handle delayed extent here.
+ */
+ down_write((&EXT4_I(inode)->i_data_sem));
+ goto delayed_mapped;
+ }
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -517,8 +525,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
ext4_da_update_reserve_space(inode, retval, 1);
}
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+delayed_mapped:
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret;
+ /* delayed allocation blocks has been allocated */
+ ret = ext4_de_remove_space(inode, map->m_lblk,
+ map->m_len);
+ if (ret < 0)
+ retval = ret;
+ }
+ }
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -1630,6 +1648,12 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
/* not enough space to reserve */
return ret;
+ down_write((&EXT4_I(inode)->i_data_sem));
+ ret = ext4_de_add_space(inode, map.m_lblk, map.m_len);
+ up_write((&EXT4_I(inode)->i_data_sem));
+ if (ret)
+ return ret;
+
map_bh(bh, inode->i_sb, invalid_block);
set_buffer_new(bh);
set_buffer_delay(bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 247fcdd..a248551 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -49,6 +49,7 @@
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
+#include "ext4_extents.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -967,6 +968,7 @@ void ext4_clear_inode(struct inode *inode)
end_writeback(inode);
dquot_drop(inode);
ext4_discard_preallocations(inode);
+ ext4_de_remove_space(inode, 0, EXT_MAX_BLOCKS);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -4976,9 +4978,14 @@ static int __init ext4_init_fs(void)
init_waitqueue_head(&ext4__ioend_wq[i]);
}
- err = ext4_init_pageio();
+ err = ext4_init_de();
if (err)
return err;
+
+ err = ext4_init_pageio();
+ if (err)
+ goto out8;
+
err = ext4_init_system_zone();
if (err)
goto out7;
@@ -5030,6 +5037,9 @@ out6:
ext4_exit_system_zone();
out7:
ext4_exit_pageio();
+out8:
+ ext4_exit_de();
+
return err;
}
--
1.7.5.1
This patch reimplements fiemap on delayed extent tree.
Signed-off-by: Yongqiang Yang <[email protected]>
---
fs/ext4/extents.c | 186 +++++++----------------------------------------------
1 files changed, 23 insertions(+), 163 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bdbb984..c2ae125 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3909,193 +3909,53 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
struct ext4_ext_cache *newex, struct ext4_extent *ex,
void *data)
{
+ struct delayed_extent de;
__u64 logical;
__u64 physical;
__u64 length;
__u32 flags = 0;
+ ext4_lblk_t next_del;
int ret = 0;
struct fiemap_extent_info *fieinfo = data;
unsigned char blksize_bits;
- blksize_bits = inode->i_sb->s_blocksize_bits;
- logical = (__u64)newex->ec_block << blksize_bits;
+ de.start = newex->ec_block;
+ down_read(&EXT4_I(inode)->i_data_sem);
+ next_del = ext4_de_find_extent(inode, &de);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ next = min(next_del, next);
if (newex->ec_start == 0) {
/*
* No extent in extent-tree contains block @newex->ec_start,
* then the block may stay in 1)a hole or 2)delayed-extent.
- *
- * Holes or delayed-extents are processed as follows.
- * 1. lookup dirty pages with specified range in pagecache.
- * If no page is got, then there is no delayed-extent and
- * return with EXT_CONTINUE.
- * 2. find the 1st mapped buffer,
- * 3. check if the mapped buffer is both in the request range
- * and a delayed buffer. If not, there is no delayed-extent,
- * then return.
- * 4. a delayed-extent is found, the extent will be collected.
*/
- ext4_lblk_t end = 0;
- pgoff_t last_offset;
- pgoff_t offset;
- pgoff_t index;
- pgoff_t start_index = 0;
- struct page **pages = NULL;
- struct buffer_head *bh = NULL;
- struct buffer_head *head = NULL;
- unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
-
- pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (pages == NULL)
- return -ENOMEM;
-
- offset = logical >> PAGE_SHIFT;
-repeat:
- last_offset = offset;
- head = NULL;
- ret = find_get_pages_tag(inode->i_mapping, &offset,
- PAGECACHE_TAG_DIRTY, nr_pages, pages);
-
- if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
- /* First time, try to find a mapped buffer. */
- if (ret == 0) {
-out:
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- /* just a hole. */
- kfree(pages);
- return EXT_CONTINUE;
- }
- index = 0;
-
-next_page:
- /* Try to find the 1st mapped buffer. */
- end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
- blksize_bits;
- if (!page_has_buffers(pages[index]))
- goto out;
- head = page_buffers(pages[index]);
- if (!head)
- goto out;
-
- index++;
- bh = head;
- do {
- if (end >= newex->ec_block +
- newex->ec_len)
- /* The buffer is out of
- * the request range.
- */
- goto out;
-
- if (buffer_mapped(bh) &&
- end >= newex->ec_block) {
- start_index = index - 1;
- /* get the 1st mapped buffer. */
- goto found_mapped_buffer;
- }
-
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
-
- /* No mapped buffer in the range found in this page,
- * We need to look up next page.
- */
- if (index >= ret) {
- /* There is no page left, but we need to limit
- * newex->ec_len.
- */
- newex->ec_len = end - newex->ec_block;
- goto out;
- }
- goto next_page;
- } else {
- /*Find contiguous delayed buffers. */
- if (ret > 0 && pages[0]->index == last_offset)
- head = page_buffers(pages[0]);
- bh = head;
- index = 1;
- start_index = 0;
- }
-
-found_mapped_buffer:
- if (bh != NULL && buffer_delay(bh)) {
- /* 1st or contiguous delayed buffer found. */
- if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
- /*
- * 1st delayed buffer found, record
- * the start of extent.
- */
- flags |= FIEMAP_EXTENT_DELALLOC;
- newex->ec_block = end;
- logical = (__u64)end << blksize_bits;
- }
- /* Find contiguous delayed buffers. */
- do {
- if (!buffer_delay(bh))
- goto found_delayed_extent;
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
-
- for (; index < ret; index++) {
- if (!page_has_buffers(pages[index])) {
- bh = NULL;
- break;
- }
- head = page_buffers(pages[index]);
- if (!head) {
- bh = NULL;
- break;
- }
-
- if (pages[index]->index !=
- pages[start_index]->index + index
- - start_index) {
- /* Blocks are not contiguous. */
- bh = NULL;
- break;
- }
- bh = head;
- do {
- if (!buffer_delay(bh))
- /* Delayed-extent ends. */
- goto found_delayed_extent;
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
- }
- } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
- /* a hole found. */
- goto out;
-
-found_delayed_extent:
- newex->ec_len = min(end - newex->ec_block,
- (ext4_lblk_t)EXT_INIT_MAX_LEN);
- if (ret == nr_pages && bh != NULL &&
- newex->ec_len < EXT_INIT_MAX_LEN &&
- buffer_delay(bh)) {
- /* Have not collected an extent and continue. */
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- goto repeat;
+ if (de.len == 0)
+ /* A hole found. */
+ return EXT_CONTINUE;
+
+ if (de.start > newex->ec_block) {
+ /* A hole found. */
+ newex->ec_len = min(de.start - newex->ec_block,
+ newex->ec_len);
+ return EXT_CONTINUE;
}
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- kfree(pages);
+ flags |= FIEMAP_EXTENT_DELALLOC;
+ newex->ec_len = de.start + de.len - newex->ec_block;
}
- physical = (__u64)newex->ec_start << blksize_bits;
- length = (__u64)newex->ec_len << blksize_bits;
Hi Yongqiang,
On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
> This patch adds a delayed extents tree in inode info, so that
> delayed extents can be look up quickly. Without the tree
> delayed extents are identified by looking up page cache.
>
> With the tree, FIEMAP, SEEK_HOLE/DATA, bigalloc and writeout
> path can be optimized a lot.
>
> Signed-off-by: Yongqiang Yang <[email protected]>
> ---
> fs/ext4/ext4.h | 5 +++++
> 1 files changed, 5 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index ccfa81f..d3c6b97 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -760,6 +760,8 @@ struct ext4_ext_cache {
> __u32 ec_len; /* must be 32bit to return holes */
> };
>
> +#include "delayed_extents.h"
> +
> /*
> * fourth extended file system inode data in memory
> */
> @@ -837,6 +839,9 @@ struct ext4_inode_info {
> struct list_head i_prealloc_list;
> spinlock_t i_prealloc_lock;
>
> + /* delayed extents */
> + struct ext4_de_tree i_de_tree;
> +
Can we integrate this patch with patch 1/6?
I think it would be beneficial for the reviewer to see both the
definition of the structure and the place you put it.
Thanks
Tao
> /* ialloc */
> ext4_group_t i_last_alloc_group;
>
On Thu, Sep 29, 2011 at 2:56 PM, Tao Ma <[email protected]> wrote:
> Hi Yongqiang,
> On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
>> This patch adds a delayed extents tree in inode info, so that
>> delayed extents can be look up quickly. ?Without the tree
>> delayed extents are identified by looking up page cache.
>>
>> With the tree, FIEMAP, SEEK_HOLE/DATA, bigalloc and writeout
>> path can be optimized a lot.
>>
>> Signed-off-by: Yongqiang Yang <[email protected]>
>> ---
>> ?fs/ext4/ext4.h | ? ?5 +++++
>> ?1 files changed, 5 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index ccfa81f..d3c6b97 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -760,6 +760,8 @@ struct ext4_ext_cache {
>> ? ? ? __u32 ? ? ? ? ? ec_len; /* must be 32bit to return holes */
>> ?};
>>
>> +#include "delayed_extents.h"
>> +
>> ?/*
>> ? * fourth extended file system inode data in memory
>> ? */
>> @@ -837,6 +839,9 @@ struct ext4_inode_info {
>> ? ? ? struct list_head i_prealloc_list;
>> ? ? ? spinlock_t i_prealloc_lock;
>>
>> + ? ? /* delayed extents */
>> + ? ? struct ext4_de_tree i_de_tree;
>> +
> Can we integrate this patch with patch 1/6?
> I think it would be beneficial for the reviewer to see both the
> definition of the structure and the place you put it.
Ok. Thank you for your advice.
I will do that in next version.
Yongqiang.
>
> Thanks
> Tao
>> ? ? ? /* ialloc */
>> ? ? ? ext4_group_t ? ?i_last_alloc_group;
>>
>
>
--
Best Wishes
Yongqiang Yang
Hi yongqiang,
On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
> This patch adds operations on a delayed extent tree.
>
> Signed-off-by; Yongqiang Yang <[email protected]>
> ---
> fs/ext4/Makefile | 2 +-
> fs/ext4/delayed_extents.c | 412 +++++++++++++++++++++++++++++++++++++++++++++
> fs/ext4/delayed_extents.h | 18 ++
> 3 files changed, 431 insertions(+), 1 deletions(-)
> create mode 100644 fs/ext4/delayed_extents.c
>
> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
> index 56fd8f86..ee16ad3 100644
> --- a/fs/ext4/Makefile
> +++ b/fs/ext4/Makefile
> @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
> ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
> ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
> ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
> - mmp.o indirect.o
> + mmp.o indirect.o delayed_extents.o
>
> ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
> ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
> diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
> new file mode 100644
> index 0000000..8da7b78
> --- /dev/null
> +++ b/fs/ext4/delayed_extents.c
<snip>
> +/*
> + * search through the tree for an delayed_extent with a given offset. If
> + * it can't be found, try to find next extent.
> + */
> +static struct delayed_extent * __de_tree_search(struct rb_root *root,
> + ext4_lblk_t offset)
> +{
> + struct rb_node *node = root->rb_node;
> + struct delayed_extent *de = NULL;
> +
> + while (node) {
> + de = rb_entry(node, struct delayed_extent, rb_node);
> + if (offset < de->start)
> + node = node->rb_left;
> + else if (offset >= delayed_extent_end(de))
> + node = node->rb_right;
> + else
> + return de;
> + }
> +
> + if (de && offset < de->start)
> + return de;
> +
> + if (de && offset >= delayed_extent_end(de))
> + return rb_entry(rb_next(&de->rb_node),
> + struct delayed_extent, rb_node);
what if the de is the most right one? rb_next should return NULL in this
case I guess? What is more, you will return a non-NULL value and use it
later in the caller. The kernel will panic. Or do I miss something here?
> +
> + return NULL;
> +}
> +
> +/*
> + * ext4_de_first_extent_since: find the 1st delayed extent covering @start
> + * if it exists, otherwise, the next extent after @start.
This comment seems to be unrelated to the below function.
> + *
> + * @inode: the inode which owns delayed extents
> + * @offset: logic block
> + *
> + * Returns next block beyond the found extent.
> + * Delayed extent is returned via @de.
> + */
> +ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de)
> +{
> + struct ext4_de_tree *tree;
> + struct delayed_extent *de1;
> + struct rb_node *node;
> +
> + de->len = 0;
> + tree = &EXT4_I(inode)->i_de_tree;
> + de1 = __de_tree_search(&tree->root, de->start);
> + if (de1) {
> + de->start = de1->start;
> + de->len = de1->len;
> + node = rb_next(&de1->rb_node);
Sorry, but I don't understand what you are going to do here. In
__de_tree_search, you have already use rb_next to get the next de if
start < del->start. why we still need a rb_next here?
> + if (node) {
> + de1 = rb_entry(node, struct delayed_extent, rb_node);
> + return de1->start;
> + }
> + }
> +
> + return EXT_MAX_BLOCKS;
> +}
> +
> +static struct delayed_extent *
> +ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
> +{
> + struct delayed_extent *de;
> + de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
> + if (de == NULL)
> + return NULL;
> + de->start = start;
> + de->len = len;
> + return de;
> +}
> +
> +static void ext4_de_free_extent(struct delayed_extent *de)
> +{
> + kmem_cache_free(ext4_de_cachep, de);
> +}
> +
> +static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree,
> + struct delayed_extent *de)
> +{
> + struct delayed_extent *de1;
> + struct rb_node *node;
> +
> + node = rb_prev(&de->rb_node);
> + if (!node)
> + return;
> +
> + de1 = rb_entry(node, struct delayed_extent, rb_node);
> + if (delayed_extent_end(de1) == de->start) {
> + de1->len += de->len;
> + rb_erase(&de->rb_node, &tree->root);
> + if (de == tree->cache_de)
> + tree->cache_de = de1;
> + ext4_de_free_extent(de);
> + }
> +}
> +
> +static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree,
> + struct delayed_extent *de)
> +{
> + struct delayed_extent *de1;
> + struct rb_node *node;
> +
> + node = rb_next(&de->rb_node);
> + if (!node)
> + return;
> +
> + de1 = rb_entry(node, struct delayed_extent, rb_node);
> + if (de1->start == delayed_extent_end(de)) {
> + de->len += de1->len;
> + rb_erase(node, &tree->root);
> + if (de1 == tree->cache_de)
> + tree->cache_de = de;
> + ext4_de_free_extent(de1);
> + }
> +}
> +
> +/*
> + * ext4_de_add_space adds a space to a delayed extent tree.
> + * Caller holds inode->i_data_sem.
> + *
> + * ext4_de_add_space is callyed by ext4_dealyed_write_begin and
> + * ext4_de_remove_space.
> + *
> + * Return 0 on success, error code on failure.
> + */
> +int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len)
> +{
> + struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree;
> + struct rb_node **p = &tree->root.rb_node;
> + struct rb_node *parent = NULL;
> + struct delayed_extent *de;
> + ext4_lblk_t end = offset + len;
> +
> + BUG_ON(end <= offset);
> +
> + de = tree->cache_de;
> + de_debug("add [%u/%u) to delayed extent list of inode %lu\n",
> + offset, len, inode->i_ino);
> +
> + if (de && delayed_extent_end(de) == offset) {
> + de->len += len;
> + ext4_de_try_to_merge_right(tree, de);
> + goto out;
> + } else if (de && de->start == end) {
> + de->start = offset;
> + de->len += len;
> + ext4_de_try_to_merge_left(tree, de);
> + goto out;
> + } else if (de && de->start <= offset &&
> + delayed_extent_end(de) >= end)
> + goto out;
> +
> + while (*p) {
> + parent = *p;
> + de = rb_entry(parent, struct delayed_extent, rb_node);
> +
> + if (offset < de->start) {
> + if (end == de->start) {
> + de->len += len;
> + de->start = offset;
> + goto out;
> + }
> + p = &(*p)->rb_left;
> + } else if (offset > delayed_extent_end(de)) {
> + if (delayed_extent_end(de) == offset) {
> + de->len += len;
> + goto out;
> + }
> + p = &(*p)->rb_right;
> + } else
> + goto out;
> + }
is the above what __de_tree_search try to do?
btw, we'd better have a BUG_ON when we meet with an intersection since
you only check the offset in your rb_tree search. So what if offset <
de->start while offset + len > de->start? It would cause your algorithm
not work I guess.
> +
> + de = ext4_de_alloc_extent(offset, len);
> + if (!de)
> + return -ENOMEM;
> + rb_link_node(&de->rb_node, parent, p);
> + rb_insert_color(&de->rb_node, &tree->root);
> +
> +out:
> + tree->cache_de = de;
> + ext4_de_print_tree(inode);
> +
> + return 0;
> +}
> +
> +/*
> + * ext4_de_remove_space() removes a space from a delayed extent tree.
> + * Caller holds inode->i_data_sem.
> + *
> + * Return 0 on success, error code on failure.
> + */
> +int ext4_de_remove_space(struct inode *inode, ext4_lblk_t offset,
> + ext4_lblk_t len)
> +{
> + struct rb_node *node;
> + struct ext4_de_tree *tree;
> + struct delayed_extent *de;
> + struct delayed_extent orig_de;
> + ext4_lblk_t len1, len2, end;
> +
> + de_debug("remove [%u/%u) from delayed extent list of inode %lu\n",
> + offset, len, inode->i_ino);
> +
> + end = offset + len;
> + BUG_ON(end <= offset);
> + tree = &EXT4_I(inode)->i_de_tree;
> + de = __de_tree_search(&tree->root, offset);
> + if (!de)
> + goto out;
> +
> + orig_de.start = de->start;
> + orig_de.len = de->len;
> + len1 = offset > de->start ? offset - de->start : 0;
> + len2 = delayed_extent_end(de) > end ?
> + delayed_extent_end(de) - end : 0;
> + if (len1 > 0)
> + de->len = len1;
> + if (len2 > 0) {
> + if (len1 > 0) {
> + int err;
> + err = ext4_de_add_space(inode, end, len2);
I don't find any error in your ext4_de_add_space.
> + if (err) {
> + de->start = orig_de.start;
> + de->len = orig_de.len;
> + return err;
> + }
> + } else {
> + de->start = end;
> + de->len = len2;
> + }
> + goto out;
> + }
Thanks
Tao
On Thu, Sep 29, 2011 at 4:05 PM, Tao Ma <[email protected]> wrote:
> Hi yongqiang,
> On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
>> This patch adds operations on a delayed extent tree.
>>
>> Signed-off-by; Yongqiang Yang <[email protected]>
>> ---
>> ?fs/ext4/Makefile ? ? ? ? ?| ? ?2 +-
>> ?fs/ext4/delayed_extents.c | ?412 +++++++++++++++++++++++++++++++++++++++++++++
>> ?fs/ext4/delayed_extents.h | ? 18 ++
>> ?3 files changed, 431 insertions(+), 1 deletions(-)
>> ?create mode 100644 fs/ext4/delayed_extents.c
>>
>> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
>> index 56fd8f86..ee16ad3 100644
>> --- a/fs/ext4/Makefile
>> +++ b/fs/ext4/Makefile
>> @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
>> ?ext4-y ? ? ? := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
>> ? ? ? ? ? ? ? ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
>> ? ? ? ? ? ? ? ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
>> - ? ? ? ? ? ? mmp.o indirect.o
>> + ? ? ? ? ? ? mmp.o indirect.o delayed_extents.o
>>
>> ?ext4-$(CONFIG_EXT4_FS_XATTR) ? ? ? ? += xattr.o xattr_user.o xattr_trusted.o
>> ?ext4-$(CONFIG_EXT4_FS_POSIX_ACL) ? ? += acl.o
>> diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
>> new file mode 100644
>> index 0000000..8da7b78
>> --- /dev/null
>> +++ b/fs/ext4/delayed_extents.c
> <snip>
>> +/*
>> + * search through the tree for an delayed_extent with a given offset. ?If
>> + * it can't be found, try to find next extent.
>> + */
>> +static struct delayed_extent * __de_tree_search(struct rb_root *root,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ext4_lblk_t offset)
>> +{
>> + ? ? struct rb_node *node = root->rb_node;
>> + ? ? struct delayed_extent *de = NULL;
>> +
>> + ? ? while (node) {
>> + ? ? ? ? ? ? de = rb_entry(node, struct delayed_extent, rb_node);
>> + ? ? ? ? ? ? if (offset < de->start)
>> + ? ? ? ? ? ? ? ? ? ? node = node->rb_left;
>> + ? ? ? ? ? ? else if (offset >= delayed_extent_end(de))
>> + ? ? ? ? ? ? ? ? ? ? node = node->rb_right;
>> + ? ? ? ? ? ? else
>> + ? ? ? ? ? ? ? ? ? ? return de;
>> + ? ? }
>> +
>> + ? ? if (de && offset < de->start)
>> + ? ? ? ? ? ? return de;
>> +
>> + ? ? if (de && offset >= delayed_extent_end(de))
>> + ? ? ? ? ? ? return rb_entry(rb_next(&de->rb_node),
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct delayed_extent, rb_node);
> what if the de is the most right one? rb_next should return NULL in this
> case I guess? What is more, you will return a non-NULL value and use it
> later in the caller. The kernel will panic. Or do I miss something here?
Yes, it returns NULL if de is the most right one. Callers should
check the return value.
>> +
>> + ? ? return NULL;
>> +}
>> +
>> +/*
>> + * ext4_de_first_extent_since: find the 1st delayed extent covering @start
>> + * if it exists, otherwise, the next extent after @start.
> This comment seems to be unrelated to the below function.
Sorry, the function name is changed in version2, however comment is not changed.
>> + *
>> + * @inode: the inode which owns delayed extents
>> + * @offset: logic block
>> + *
>> + * Returns next block beyond the found extent.
>> + * Delayed extent is returned via @de.
>> + */
>> +ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de)
>> +{
>> + ? ? struct ext4_de_tree *tree;
>> + ? ? struct delayed_extent *de1;
>> + ? ? struct rb_node *node;
>> +
>> + ? ? de->len = 0;
>> + ? ? tree = &EXT4_I(inode)->i_de_tree;
>> + ? ? de1 = __de_tree_search(&tree->root, de->start);
>> + ? ? if (de1) {
>> + ? ? ? ? ? ? de->start = de1->start;
>> + ? ? ? ? ? ? de->len = de1->len;
>> + ? ? ? ? ? ? node = rb_next(&de1->rb_node);
> Sorry, but I don't understand what you are going to do here. In
> __de_tree_search, you have already use rb_next to get the next de if
> start < del->start. why we still need a rb_next here?
This function is supposed to be used by fiemap and bigalloc. The
former one needs next block of the returned extent. You can have a
loot at the 6th patch which implements fiemap based on delayed extent
tree.
rb_next in __de_tree_search is used if no extent covering given offset exists.
>> + ? ? ? ? ? ? if (node) {
>> + ? ? ? ? ? ? ? ? ? ? de1 = rb_entry(node, struct delayed_extent, rb_node);
>> + ? ? ? ? ? ? ? ? ? ? return de1->start;
>> + ? ? ? ? ? ? }
>> + ? ? }
>> +
>> + ? ? return EXT_MAX_BLOCKS;
>> +}
>> +
>> +static struct delayed_extent *
>> +ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
>> +{
>> + ? ? struct delayed_extent *de;
>> + ? ? de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
>> + ? ? if (de == NULL)
>> + ? ? ? ? ? ? return NULL;
>> + ? ? de->start = start;
>> + ? ? de->len = len;
>> + ? ? return de;
>> +}
>> +
>> +static void ext4_de_free_extent(struct delayed_extent *de)
>> +{
>> + ? ? kmem_cache_free(ext4_de_cachep, de);
>> +}
>> +
>> +static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct delayed_extent *de)
>> +{
>> + ? ? struct delayed_extent *de1;
>> + ? ? struct rb_node *node;
>> +
>> + ? ? node = rb_prev(&de->rb_node);
>> + ? ? if (!node)
>> + ? ? ? ? ? ? return;
>> +
>> + ? ? de1 = rb_entry(node, struct delayed_extent, rb_node);
>> + ? ? if (delayed_extent_end(de1) == de->start) {
>> + ? ? ? ? ? ? de1->len += de->len;
>> + ? ? ? ? ? ? rb_erase(&de->rb_node, &tree->root);
>> + ? ? ? ? ? ? if (de == tree->cache_de)
>> + ? ? ? ? ? ? ? ? ? ? tree->cache_de = de1;
>> + ? ? ? ? ? ? ext4_de_free_extent(de);
>> + ? ? }
>> +}
>> +
>> +static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct delayed_extent *de)
>> +{
>> + ? ? struct delayed_extent *de1;
>> + ? ? struct rb_node *node;
>> +
>> + ? ? node = rb_next(&de->rb_node);
>> + ? ? if (!node)
>> + ? ? ? ? ? ? return;
>> +
>> + ? ? de1 = rb_entry(node, struct delayed_extent, rb_node);
>> + ? ? if (de1->start == delayed_extent_end(de)) {
>> + ? ? ? ? ? ? de->len += de1->len;
>> + ? ? ? ? ? ? rb_erase(node, &tree->root);
>> + ? ? ? ? ? ? if (de1 == tree->cache_de)
>> + ? ? ? ? ? ? ? ? ? ? tree->cache_de = de;
>> + ? ? ? ? ? ? ext4_de_free_extent(de1);
>> + ? ? }
>> +}
>> +
>> +/*
>> + * ext4_de_add_space adds a space to a delayed extent tree.
>> + * Caller holds inode->i_data_sem.
>> + *
>> + * ext4_de_add_space is callyed by ext4_dealyed_write_begin and
>> + * ext4_de_remove_space.
>> + *
>> + * Return 0 on success, error code on failure.
>> + */
>> +int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len)
>> +{
>> + ? ? struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree;
>> + ? ? struct rb_node **p = &tree->root.rb_node;
>> + ? ? struct rb_node *parent = NULL;
>> + ? ? struct delayed_extent *de;
>> + ? ? ext4_lblk_t end = offset + len;
>> +
>> + ? ? BUG_ON(end <= offset);
>> +
>> + ? ? de = tree->cache_de;
>> + ? ? de_debug("add [%u/%u) to delayed extent list of inode %lu\n",
>> + ? ? ? ? ? ? ?offset, len, inode->i_ino);
>> +
>> + ? ? if (de && delayed_extent_end(de) == offset) {
>> + ? ? ? ? ? ? de->len += len;
>> + ? ? ? ? ? ? ext4_de_try_to_merge_right(tree, de);
>> + ? ? ? ? ? ? goto out;
>> + ? ? } else if (de && de->start == end) {
>> + ? ? ? ? ? ? de->start = offset;
>> + ? ? ? ? ? ? de->len += len;
>> + ? ? ? ? ? ? ext4_de_try_to_merge_left(tree, de);
>> + ? ? ? ? ? ? goto out;
>> + ? ? } else if (de && de->start <= offset &&
>> + ? ? ? ? ? ? ? ?delayed_extent_end(de) >= end)
>> + ? ? ? ? ? ? goto out;
>> +
>> + ? ? while (*p) {
>> + ? ? ? ? ? ? parent = *p;
>> + ? ? ? ? ? ? de = rb_entry(parent, struct delayed_extent, rb_node);
>> +
>> + ? ? ? ? ? ? if (offset < de->start) {
>> + ? ? ? ? ? ? ? ? ? ? if (end == de->start) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->len += len;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->start = offset;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>> + ? ? ? ? ? ? ? ? ? ? }
>> + ? ? ? ? ? ? ? ? ? ? p = &(*p)->rb_left;
>> + ? ? ? ? ? ? } else if (offset > delayed_extent_end(de)) {
>> + ? ? ? ? ? ? ? ? ? ? if (delayed_extent_end(de) == offset) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->len += len;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>> + ? ? ? ? ? ? ? ? ? ? }
>> + ? ? ? ? ? ? ? ? ? ? p = &(*p)->rb_right;
>> + ? ? ? ? ? ? } else
>> + ? ? ? ? ? ? ? ? ? ? goto out;
>> + ? ? }
> is the above what ?__de_tree_search try to do?
> btw, we'd better have a BUG_ON when we meet with an intersection since
> you only check the offset in your rb_tree search. So what if offset <
> de->start while offset + len > de->start? It would cause your algorithm
> not work I guess.
Yes. ext4_de_add_space is used in 2 cases: 1. delayed allocation, in
this case len always equals one. 2. ext4_de_remove_space, in this
case, if ext4_de_remove_space does right thing, no intersection
happens.
In version 1, there was BUG_ON here, and I found we can not guarantee
no intersection exists in case1. If copy_from_user fails with 0 bytes
copied, then page is not dirtied and i_size is not changed as well,
rewrite on the page can introduce intersection.
>> +
>> + ? ? de = ext4_de_alloc_extent(offset, len);
>> + ? ? if (!de)
>> + ? ? ? ? ? ? return -ENOMEM;
>> + ? ? rb_link_node(&de->rb_node, parent, p);
>> + ? ? rb_insert_color(&de->rb_node, &tree->root);
>> +
>> +out:
>> + ? ? tree->cache_de = de;
>> + ? ? ext4_de_print_tree(inode);
>> +
>> + ? ? return 0;
>> +}
>> +
>> +/*
>> + * ext4_de_remove_space() removes a space from a delayed extent tree.
>> + * Caller holds inode->i_data_sem.
>> + *
>> + * Return 0 on success, error code on failure.
>> + */
>> +int ext4_de_remove_space(struct inode *inode, ext4_lblk_t offset,
>> + ? ? ? ? ? ? ? ? ? ? ?ext4_lblk_t len)
>> +{
>> + ? ? struct rb_node *node;
>> + ? ? struct ext4_de_tree *tree;
>> + ? ? struct delayed_extent *de;
>> + ? ? struct delayed_extent orig_de;
>> + ? ? ext4_lblk_t len1, len2, end;
>> +
>> + ? ? de_debug("remove [%u/%u) from delayed extent list of inode %lu\n",
>> + ? ? ? ? ? ? ?offset, len, inode->i_ino);
>> +
>> + ? ? end = offset + len;
>> + ? ? BUG_ON(end <= offset);
>> + ? ? tree = &EXT4_I(inode)->i_de_tree;
>> + ? ? de = __de_tree_search(&tree->root, offset);
>> + ? ? if (!de)
>> + ? ? ? ? ? ? goto out;
>> +
>> + ? ? orig_de.start = de->start;
>> + ? ? orig_de.len = de->len;
>> + ? ? len1 = offset > de->start ? offset - de->start : 0;
>> + ? ? len2 = delayed_extent_end(de) > end ?
>> + ? ? ? ? ? ?delayed_extent_end(de) - end : 0;
>> + ? ? if (len1 > 0)
>> + ? ? ? ? ? ? de->len = len1;
>> + ? ? if (len2 > 0) {
>> + ? ? ? ? ? ? if (len1 > 0) {
>> + ? ? ? ? ? ? ? ? ? ? int err;
>> + ? ? ? ? ? ? ? ? ? ? err = ext4_de_add_space(inode, end, len2);
> I don't find any error in your ext4_de_add_space.
ENOMEM could be returned.
Thank you for review.
Yongqiang.
>> + ? ? ? ? ? ? ? ? ? ? if (err) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->start = orig_de.start;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->len = orig_de.len;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? return err;
>> + ? ? ? ? ? ? ? ? ? ? }
>> + ? ? ? ? ? ? } else {
>> + ? ? ? ? ? ? ? ? ? ? de->start = end;
>> + ? ? ? ? ? ? ? ? ? ? de->len = len2;
>> + ? ? ? ? ? ? }
>> + ? ? ? ? ? ? goto out;
>> + ? ? }
>
> Thanks
> Tao
>
--
Best Wishes
Yongqiang Yang
On Thu, Sep 29, 2011 at 8:08 AM, Yongqiang Yang <[email protected]> wrote:
> This patch let ext4 maintain delayed extent trees.
>
> Signed-off-by: Yongqiang Yang <[email protected]>
> ---
> ?fs/ext4/ext4.h ? ? | ? ?1 +
> ?fs/ext4/extents.c ?| ? ?2 ++
> ?fs/ext4/indirect.c | ? ?3 +++
> ?fs/ext4/inode.c ? ?| ? 28 ++++++++++++++++++++++++++--
> ?fs/ext4/super.c ? ?| ? 12 +++++++++++-
> ?5 files changed, 43 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d3c6b97..177ec0a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -519,6 +519,7 @@ struct ext4_new_group_data {
> ?#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT ? ? ? ? ?0x0020
> ? ? ? ?/* Don't normalize allocation size (used for fallocate) */
> ?#define EXT4_GET_BLOCKS_NO_NORMALIZE ? ? ? ? ? 0x0040
> +#define EXT4_GET_BLOCKS_DEALLOC ? ? ? ? ? ? ? ? ? ? ? ?0x0080
typo + unused macro ?
Cheers,
Amir.
>
> ?/*
> ?* Flags used by ext4_free_blocks
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 9124cd2..bdbb984 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3688,6 +3688,8 @@ void ext4_ext_truncate(struct inode *inode)
>
> ? ? ? ?last_block = (inode->i_size + sb->s_blocksize - 1)
> ? ? ? ? ? ? ? ? ? ? ? ?>> EXT4_BLOCK_SIZE_BITS(sb);
> + ? ? ? err = ext4_de_remove_space(inode, last_block,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT_MAX_BLOCKS - last_block);
> ? ? ? ?err = ext4_ext_remove_space(inode, last_block);
>
> ? ? ? ?/* In a multi-transaction truncate, we only make the final
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 0962642..25cdb5b 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -22,6 +22,7 @@
>
> ?#include <linux/module.h>
> ?#include "ext4_jbd2.h"
> +#include "ext4_extents.h"
> ?#include "truncate.h"
>
> ?#include <trace/events/ext4.h>
> @@ -1383,6 +1384,8 @@ void ext4_ind_truncate(struct inode *inode)
> ? ? ? ?down_write(&ei->i_data_sem);
>
> ? ? ? ?ext4_discard_preallocations(inode);
> + ? ? ? ext4_de_remove_space(inode, last_block,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT_MAX_BLOCKS - last_block);
>
> ? ? ? ?/*
> ? ? ? ? * The orphan list entry will now protect us from any crash which
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f86b149..0f9f108 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -442,7 +442,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> ? ? ? ?up_read((&EXT4_I(inode)->i_data_sem));
>
> ? ? ? ?if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> - ? ? ? ? ? ? ? int ret = check_block_validity(inode, map);
> + ? ? ? ? ? ? ? int ret;
> + ? ? ? ? ? ? ? if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> + ? ? ? ? ? ? ? ? ? ? ? /* delayed alloc may be allocated by fallocate,
> + ? ? ? ? ? ? ? ? ? ? ? ?* we need to handle delayed extent here.
> + ? ? ? ? ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? ? ? ? ? down_write((&EXT4_I(inode)->i_data_sem));
> + ? ? ? ? ? ? ? ? ? ? ? goto delayed_mapped;
> + ? ? ? ? ? ? ? }
> + ? ? ? ? ? ? ? ret = check_block_validity(inode, map);
> ? ? ? ? ? ? ? ?if (ret != 0)
> ? ? ? ? ? ? ? ? ? ? ? ?return ret;
> ? ? ? ?}
> @@ -517,8 +525,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> ? ? ? ? ? ? ? ? ? ? ? ?(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
> ? ? ? ? ? ? ? ? ? ? ? ?ext4_da_update_reserve_space(inode, retval, 1);
> ? ? ? ?}
> - ? ? ? if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> + ? ? ? if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> ? ? ? ? ? ? ? ?ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
> +delayed_mapped:
> + ? ? ? ? ? ? ? if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> + ? ? ? ? ? ? ? ? ? ? ? int ret;
> + ? ? ? ? ? ? ? ? ? ? ? /* delayed allocation blocks has been allocated */
> + ? ? ? ? ? ? ? ? ? ? ? ret = ext4_de_remove_space(inode, map->m_lblk,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?map->m_len);
> + ? ? ? ? ? ? ? ? ? ? ? if (ret < 0)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? retval = ret;
> + ? ? ? ? ? ? ? }
> + ? ? ? }
>
> ? ? ? ?up_write((&EXT4_I(inode)->i_data_sem));
> ? ? ? ?if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> @@ -1630,6 +1648,12 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
> ? ? ? ? ? ? ? ? ? ? ? ?/* not enough space to reserve */
> ? ? ? ? ? ? ? ? ? ? ? ?return ret;
>
> + ? ? ? ? ? ? ? down_write((&EXT4_I(inode)->i_data_sem));
> + ? ? ? ? ? ? ? ret = ext4_de_add_space(inode, map.m_lblk, map.m_len);
> + ? ? ? ? ? ? ? up_write((&EXT4_I(inode)->i_data_sem));
> + ? ? ? ? ? ? ? if (ret)
> + ? ? ? ? ? ? ? ? ? ? ? return ret;
> +
> ? ? ? ? ? ? ? ?map_bh(bh, inode->i_sb, invalid_block);
> ? ? ? ? ? ? ? ?set_buffer_new(bh);
> ? ? ? ? ? ? ? ?set_buffer_delay(bh);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 247fcdd..a248551 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -49,6 +49,7 @@
> ?#include "xattr.h"
> ?#include "acl.h"
> ?#include "mballoc.h"
> +#include "ext4_extents.h"
>
> ?#define CREATE_TRACE_POINTS
> ?#include <trace/events/ext4.h>
> @@ -967,6 +968,7 @@ void ext4_clear_inode(struct inode *inode)
> ? ? ? ?end_writeback(inode);
> ? ? ? ?dquot_drop(inode);
> ? ? ? ?ext4_discard_preallocations(inode);
> + ? ? ? ext4_de_remove_space(inode, 0, EXT_MAX_BLOCKS);
> ? ? ? ?if (EXT4_I(inode)->jinode) {
> ? ? ? ? ? ? ? ?jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? EXT4_I(inode)->jinode);
> @@ -4976,9 +4978,14 @@ static int __init ext4_init_fs(void)
> ? ? ? ? ? ? ? ?init_waitqueue_head(&ext4__ioend_wq[i]);
> ? ? ? ?}
>
> - ? ? ? err = ext4_init_pageio();
> + ? ? ? err = ext4_init_de();
> ? ? ? ?if (err)
> ? ? ? ? ? ? ? ?return err;
> +
> + ? ? ? err = ext4_init_pageio();
> + ? ? ? if (err)
> + ? ? ? ? ? ? ? goto out8;
> +
> ? ? ? ?err = ext4_init_system_zone();
> ? ? ? ?if (err)
> ? ? ? ? ? ? ? ?goto out7;
> @@ -5030,6 +5037,9 @@ out6:
> ? ? ? ?ext4_exit_system_zone();
> ?out7:
> ? ? ? ?ext4_exit_pageio();
> +out8:
> + ? ? ? ext4_exit_de();
> +
> ? ? ? ?return err;
> ?}
>
> --
> 1.7.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>
On 09/29/2011 04:36 PM, Yongqiang Yang wrote:
> On Thu, Sep 29, 2011 at 4:05 PM, Tao Ma <[email protected]> wrote:
>> Hi yongqiang,
>> On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
>>> This patch adds operations on a delayed extent tree.
>>>
>>> Signed-off-by; Yongqiang Yang <[email protected]>
>>> ---
>>> fs/ext4/Makefile | 2 +-
>>> fs/ext4/delayed_extents.c | 412 +++++++++++++++++++++++++++++++++++++++++++++
>>> fs/ext4/delayed_extents.h | 18 ++
>>> 3 files changed, 431 insertions(+), 1 deletions(-)
>>> create mode 100644 fs/ext4/delayed_extents.c
>>>
>>> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
>>> index 56fd8f86..ee16ad3 100644
>>> --- a/fs/ext4/Makefile
>>> +++ b/fs/ext4/Makefile
>>> @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
>>> ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
>>> ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
>>> ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
>>> - mmp.o indirect.o
>>> + mmp.o indirect.o delayed_extents.o
>>>
>>> ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
>>> ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
>>> diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
>>> new file mode 100644
>>> index 0000000..8da7b78
>>> --- /dev/null
>>> +++ b/fs/ext4/delayed_extents.c
>> <snip>
>>> +/*
>>> + * search through the tree for an delayed_extent with a given offset. If
>>> + * it can't be found, try to find next extent.
>>> + */
>>> +static struct delayed_extent * __de_tree_search(struct rb_root *root,
>>> + ext4_lblk_t offset)
>>> +{
>>> + struct rb_node *node = root->rb_node;
>>> + struct delayed_extent *de = NULL;
>>> +
>>> + while (node) {
>>> + de = rb_entry(node, struct delayed_extent, rb_node);
>>> + if (offset < de->start)
>>> + node = node->rb_left;
>>> + else if (offset >= delayed_extent_end(de))
>>> + node = node->rb_right;
>>> + else
>>> + return de;
>>> + }
>>> +
>>> + if (de && offset < de->start)
>>> + return de;
>>> +
>>> + if (de && offset >= delayed_extent_end(de))
>>> + return rb_entry(rb_next(&de->rb_node),
>>> + struct delayed_extent, rb_node);
>> what if the de is the most right one? rb_next should return NULL in this
>> case I guess? What is more, you will return a non-NULL value and use it
>> later in the caller. The kernel will panic. Or do I miss something here?
> Yes, it returns NULL if de is the most right one. Callers should
> check the return value.
No rb_next is NULL doesn't guarantee rb_entry(...) is NULL. ;)
You returns rb_entry(NULL, struct delayed_extent, rb_node) which depends
on the offset of rb_node in delayed_extent. Currently it is NULL because
your delayed_extent has rb_node as the first parameter.
But it is bug-prone to says that rb_entry(NULL,...) is also NULL.
And it may cause a kernel panic later if some guys change the layout of
delayed_extent. We'd better be careful here. :)
>
>>> +
>>> + return NULL;
>>> +}
>>> +
>>> +/*
>>> + * ext4_de_first_extent_since: find the 1st delayed extent covering @start
>>> + * if it exists, otherwise, the next extent after @start.
>> This comment seems to be unrelated to the below function.
> Sorry, the function name is changed in version2, however comment is not changed.
>>> + *
>>> + * @inode: the inode which owns delayed extents
>>> + * @offset: logic block
>>> + *
>>> + * Returns next block beyond the found extent.
>>> + * Delayed extent is returned via @de.
>>> + */
>>> +ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de)
>>> +{
>>> + struct ext4_de_tree *tree;
>>> + struct delayed_extent *de1;
>>> + struct rb_node *node;
>>> +
>>> + de->len = 0;
>>> + tree = &EXT4_I(inode)->i_de_tree;
>>> + de1 = __de_tree_search(&tree->root, de->start);
>>> + if (de1) {
>>> + de->start = de1->start;
>>> + de->len = de1->len;
>>> + node = rb_next(&de1->rb_node);
>> Sorry, but I don't understand what you are going to do here. In
>> __de_tree_search, you have already use rb_next to get the next de if
>> start < del->start. why we still need a rb_next here?
> This function is supposed to be used by fiemap and bigalloc. The
> former one needs next block of the returned extent. You can have a
> loot at the 6th patch which implements fiemap based on delayed extent
> tree.
>
> rb_next in __de_tree_search is used if no extent covering given offset exists.
oh, so it is too tricky for the caller and the reviewer of course.
Some times you suppose the function can find one with a de which has
de->start <= offset && offset < de->start + de->end(situation 1).
Sometimes you suppose that we should find one de with offset <
de->start(situation 2). It is really a bit hard for us(the other reader)
to know from the very beginning.
In your case here, we may pass in a parameter to __de_tree_search to
describe explicitly whether we want a de suitable for 1 or 2, so that
__de_tree_search can find what we want or return NULL. Otherwise we(the
caller) have to check every time whether the returned de is OK for our need.
>
>>> + if (node) {
>>> + de1 = rb_entry(node, struct delayed_extent, rb_node);
>>> + return de1->start;
>>> + }
>>> + }
>>> +
>>> + return EXT_MAX_BLOCKS;
>>> +}
>>> +
>>> +static struct delayed_extent *
>>> +ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
>>> +{
>>> + struct delayed_extent *de;
>>> + de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
>>> + if (de == NULL)
>>> + return NULL;
>>> + de->start = start;
>>> + de->len = len;
>>> + return de;
>>> +}
>>> +
>>> +static void ext4_de_free_extent(struct delayed_extent *de)
>>> +{
>>> + kmem_cache_free(ext4_de_cachep, de);
>>> +}
>>> +
>>> +static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree,
>>> + struct delayed_extent *de)
>>> +{
>>> + struct delayed_extent *de1;
>>> + struct rb_node *node;
>>> +
>>> + node = rb_prev(&de->rb_node);
>>> + if (!node)
>>> + return;
>>> +
>>> + de1 = rb_entry(node, struct delayed_extent, rb_node);
>>> + if (delayed_extent_end(de1) == de->start) {
>>> + de1->len += de->len;
>>> + rb_erase(&de->rb_node, &tree->root);
>>> + if (de == tree->cache_de)
>>> + tree->cache_de = de1;
>>> + ext4_de_free_extent(de);
>>> + }
>>> +}
>>> +
>>> +static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree,
>>> + struct delayed_extent *de)
>>> +{
>>> + struct delayed_extent *de1;
>>> + struct rb_node *node;
>>> +
>>> + node = rb_next(&de->rb_node);
>>> + if (!node)
>>> + return;
>>> +
>>> + de1 = rb_entry(node, struct delayed_extent, rb_node);
>>> + if (de1->start == delayed_extent_end(de)) {
>>> + de->len += de1->len;
>>> + rb_erase(node, &tree->root);
>>> + if (de1 == tree->cache_de)
>>> + tree->cache_de = de;
>>> + ext4_de_free_extent(de1);
>>> + }
>>> +}
>>> +
>>> +/*
>>> + * ext4_de_add_space adds a space to a delayed extent tree.
>>> + * Caller holds inode->i_data_sem.
>>> + *
>>> + * ext4_de_add_space is callyed by ext4_dealyed_write_begin and
>>> + * ext4_de_remove_space.
>>> + *
>>> + * Return 0 on success, error code on failure.
>>> + */
>>> +int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len)
>>> +{
>>> + struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree;
>>> + struct rb_node **p = &tree->root.rb_node;
>>> + struct rb_node *parent = NULL;
>>> + struct delayed_extent *de;
>>> + ext4_lblk_t end = offset + len;
>>> +
>>> + BUG_ON(end <= offset);
>>> +
>>> + de = tree->cache_de;
>>> + de_debug("add [%u/%u) to delayed extent list of inode %lu\n",
>>> + offset, len, inode->i_ino);
>>> +
>>> + if (de && delayed_extent_end(de) == offset) {
>>> + de->len += len;
>>> + ext4_de_try_to_merge_right(tree, de);
>>> + goto out;
>>> + } else if (de && de->start == end) {
>>> + de->start = offset;
>>> + de->len += len;
>>> + ext4_de_try_to_merge_left(tree, de);
>>> + goto out;
>>> + } else if (de && de->start <= offset &&
>>> + delayed_extent_end(de) >= end)
>>> + goto out;
>>> +
>>> + while (*p) {
>>> + parent = *p;
>>> + de = rb_entry(parent, struct delayed_extent, rb_node);
>>> +
>>> + if (offset < de->start) {
>>> + if (end == de->start) {
>>> + de->len += len;
>>> + de->start = offset;
>>> + goto out;
>>> + }
>>> + p = &(*p)->rb_left;
>>> + } else if (offset > delayed_extent_end(de)) {
>>> + if (delayed_extent_end(de) == offset) {
>>> + de->len += len;
>>> + goto out;
>>> + }
>>> + p = &(*p)->rb_right;
>>> + } else
>>> + goto out;
>>> + }
>> is the above what __de_tree_search try to do?
>> btw, we'd better have a BUG_ON when we meet with an intersection since
>> you only check the offset in your rb_tree search. So what if offset <
>> de->start while offset + len > de->start? It would cause your algorithm
>> not work I guess.
> Yes. ext4_de_add_space is used in 2 cases: 1. delayed allocation, in
> this case len always equals one. 2. ext4_de_remove_space, in this
> case, if ext4_de_remove_space does right thing, no intersection
> happens.
>
> In version 1, there was BUG_ON here, and I found we can not guarantee
> no intersection exists in case1. If copy_from_user fails with 0 bytes
> copied, then page is not dirtied and i_size is not changed as well,
> rewrite on the page can introduce intersection.
Sorry I don't quite get what's your meaning. Should it be handled by the
above codes(the merge and go out stuff) already? Or if as you said, the
user can give us with intersections, then what type of intersection?
IMHO, we should handle this in a grace way. Your codes seems not handle it.
Thanks
Tao
On Thu, Sep 29, 2011 at 5:40 PM, Tao Ma <[email protected]> wrote:
> On 09/29/2011 04:36 PM, Yongqiang Yang wrote:
>> On Thu, Sep 29, 2011 at 4:05 PM, Tao Ma <[email protected]> wrote:
>>> Hi yongqiang,
>>> On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
>>>> This patch adds operations on a delayed extent tree.
>>>>
>>>> Signed-off-by; Yongqiang Yang <[email protected]>
>>>> ---
>>>> ?fs/ext4/Makefile ? ? ? ? ?| ? ?2 +-
>>>> ?fs/ext4/delayed_extents.c | ?412 +++++++++++++++++++++++++++++++++++++++++++++
>>>> ?fs/ext4/delayed_extents.h | ? 18 ++
>>>> ?3 files changed, 431 insertions(+), 1 deletions(-)
>>>> ?create mode 100644 fs/ext4/delayed_extents.c
>>>>
>>>> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
>>>> index 56fd8f86..ee16ad3 100644
>>>> --- a/fs/ext4/Makefile
>>>> +++ b/fs/ext4/Makefile
>>>> @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
>>>> ?ext4-y ? ? ? := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
>>>> ? ? ? ? ? ? ? ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
>>>> ? ? ? ? ? ? ? ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
>>>> - ? ? ? ? ? ? mmp.o indirect.o
>>>> + ? ? ? ? ? ? mmp.o indirect.o delayed_extents.o
>>>>
>>>> ?ext4-$(CONFIG_EXT4_FS_XATTR) ? ? ? ? += xattr.o xattr_user.o xattr_trusted.o
>>>> ?ext4-$(CONFIG_EXT4_FS_POSIX_ACL) ? ? += acl.o
>>>> diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
>>>> new file mode 100644
>>>> index 0000000..8da7b78
>>>> --- /dev/null
>>>> +++ b/fs/ext4/delayed_extents.c
>>> <snip>
>>>> +/*
>>>> + * search through the tree for an delayed_extent with a given offset. ?If
>>>> + * it can't be found, try to find next extent.
>>>> + */
>>>> +static struct delayed_extent * __de_tree_search(struct rb_root *root,
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ext4_lblk_t offset)
>>>> +{
>>>> + ? ? struct rb_node *node = root->rb_node;
>>>> + ? ? struct delayed_extent *de = NULL;
>>>> +
>>>> + ? ? while (node) {
>>>> + ? ? ? ? ? ? de = rb_entry(node, struct delayed_extent, rb_node);
>>>> + ? ? ? ? ? ? if (offset < de->start)
>>>> + ? ? ? ? ? ? ? ? ? ? node = node->rb_left;
>>>> + ? ? ? ? ? ? else if (offset >= delayed_extent_end(de))
>>>> + ? ? ? ? ? ? ? ? ? ? node = node->rb_right;
>>>> + ? ? ? ? ? ? else
>>>> + ? ? ? ? ? ? ? ? ? ? return de;
>>>> + ? ? }
>>>> +
>>>> + ? ? if (de && offset < de->start)
>>>> + ? ? ? ? ? ? return de;
>>>> +
>>>> + ? ? if (de && offset >= delayed_extent_end(de))
>>>> + ? ? ? ? ? ? return rb_entry(rb_next(&de->rb_node),
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct delayed_extent, rb_node);
>>> what if the de is the most right one? rb_next should return NULL in this
>>> case I guess? What is more, you will return a non-NULL value and use it
>>> later in the caller. The kernel will panic. Or do I miss something here?
>> Yes, it returns NULL if de is the most right one. ?Callers should
>> check the return value.
> No rb_next is NULL doesn't guarantee rb_entry(...) is NULL. ;)
> You returns rb_entry(NULL, struct delayed_extent, rb_node) which depends
> on the offset of rb_node in delayed_extent. Currently it is NULL because
> your delayed_extent has rb_node as the first parameter.
> But it is bug-prone to says that rb_entry(NULL,...) is also NULL.
> And it may cause a kernel panic later if some guys change the layout of
> delayed_extent. We'd better be careful here. :)
I see. It's a bug:-)
>>
>>>> +
>>>> + ? ? return NULL;
>>>> +}
>>>> +
>>>> +/*
>>>> + * ext4_de_first_extent_since: find the 1st delayed extent covering @start
>>>> + * if it exists, otherwise, the next extent after @start.
>>> This comment seems to be unrelated to the below function.
>> Sorry, the function name is changed in version2, however comment is not changed.
>>>> + *
>>>> + * @inode: the inode which owns delayed extents
>>>> + * @offset: logic block
>>>> + *
>>>> + * Returns next block beyond the found extent.
>>>> + * Delayed extent is returned via @de.
>>>> + */
>>>> +ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de)
>>>> +{
>>>> + ? ? struct ext4_de_tree *tree;
>>>> + ? ? struct delayed_extent *de1;
>>>> + ? ? struct rb_node *node;
>>>> +
>>>> + ? ? de->len = 0;
>>>> + ? ? tree = &EXT4_I(inode)->i_de_tree;
>>>> + ? ? de1 = __de_tree_search(&tree->root, de->start);
>>>> + ? ? if (de1) {
>>>> + ? ? ? ? ? ? de->start = de1->start;
>>>> + ? ? ? ? ? ? de->len = de1->len;
>>>> + ? ? ? ? ? ? node = rb_next(&de1->rb_node);
>>> Sorry, but I don't understand what you are going to do here. In
>>> __de_tree_search, you have already use rb_next to get the next de if
>>> start < del->start. why we still need a rb_next here?
>> This function is supposed to be used by fiemap and bigalloc. The
>> former one needs next block of the returned extent. ?You can have a
>> loot at the 6th patch which implements fiemap based on delayed extent
>> tree.
>>
>> rb_next in __de_tree_search is used if no extent covering given offset exists.
> oh, so it is too tricky for the caller and the reviewer of course.
>
> Some times you suppose the function can find one with a de which has
> de->start <= offset && offset < de->start + de->end(situation 1).
> Sometimes you suppose that we should find one de with offset <
> de->start(situation 2). It is really a bit hard for us(the other reader)
> to know from the very beginning.
>
> In your case here, we may pass in a parameter to __de_tree_search to
> describe explicitly whether we want a de suitable for 1 or 2, so that
> __de_tree_search can find what we want or return NULL. Otherwise we(the
> caller) have to check every time whether the returned de is OK for our need.
Accepted.
>
>>
>>>> + ? ? ? ? ? ? if (node) {
>>>> + ? ? ? ? ? ? ? ? ? ? de1 = rb_entry(node, struct delayed_extent, rb_node);
>>>> + ? ? ? ? ? ? ? ? ? ? return de1->start;
>>>> + ? ? ? ? ? ? }
>>>> + ? ? }
>>>> +
>>>> + ? ? return EXT_MAX_BLOCKS;
>>>> +}
>>>> +
>>>> +static struct delayed_extent *
>>>> +ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
>>>> +{
>>>> + ? ? struct delayed_extent *de;
>>>> + ? ? de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
>>>> + ? ? if (de == NULL)
>>>> + ? ? ? ? ? ? return NULL;
>>>> + ? ? de->start = start;
>>>> + ? ? de->len = len;
>>>> + ? ? return de;
>>>> +}
>>>> +
>>>> +static void ext4_de_free_extent(struct delayed_extent *de)
>>>> +{
>>>> + ? ? kmem_cache_free(ext4_de_cachep, de);
>>>> +}
>>>> +
>>>> +static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree,
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct delayed_extent *de)
>>>> +{
>>>> + ? ? struct delayed_extent *de1;
>>>> + ? ? struct rb_node *node;
>>>> +
>>>> + ? ? node = rb_prev(&de->rb_node);
>>>> + ? ? if (!node)
>>>> + ? ? ? ? ? ? return;
>>>> +
>>>> + ? ? de1 = rb_entry(node, struct delayed_extent, rb_node);
>>>> + ? ? if (delayed_extent_end(de1) == de->start) {
>>>> + ? ? ? ? ? ? de1->len += de->len;
>>>> + ? ? ? ? ? ? rb_erase(&de->rb_node, &tree->root);
>>>> + ? ? ? ? ? ? if (de == tree->cache_de)
>>>> + ? ? ? ? ? ? ? ? ? ? tree->cache_de = de1;
>>>> + ? ? ? ? ? ? ext4_de_free_extent(de);
>>>> + ? ? }
>>>> +}
>>>> +
>>>> +static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree,
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct delayed_extent *de)
>>>> +{
>>>> + ? ? struct delayed_extent *de1;
>>>> + ? ? struct rb_node *node;
>>>> +
>>>> + ? ? node = rb_next(&de->rb_node);
>>>> + ? ? if (!node)
>>>> + ? ? ? ? ? ? return;
>>>> +
>>>> + ? ? de1 = rb_entry(node, struct delayed_extent, rb_node);
>>>> + ? ? if (de1->start == delayed_extent_end(de)) {
>>>> + ? ? ? ? ? ? de->len += de1->len;
>>>> + ? ? ? ? ? ? rb_erase(node, &tree->root);
>>>> + ? ? ? ? ? ? if (de1 == tree->cache_de)
>>>> + ? ? ? ? ? ? ? ? ? ? tree->cache_de = de;
>>>> + ? ? ? ? ? ? ext4_de_free_extent(de1);
>>>> + ? ? }
>>>> +}
>>>> +
>>>> +/*
>>>> + * ext4_de_add_space adds a space to a delayed extent tree.
>>>> + * Caller holds inode->i_data_sem.
>>>> + *
>>>> + * ext4_de_add_space is callyed by ext4_dealyed_write_begin and
>>>> + * ext4_de_remove_space.
>>>> + *
>>>> + * Return 0 on success, error code on failure.
>>>> + */
>>>> +int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len)
>>>> +{
>>>> + ? ? struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree;
>>>> + ? ? struct rb_node **p = &tree->root.rb_node;
>>>> + ? ? struct rb_node *parent = NULL;
>>>> + ? ? struct delayed_extent *de;
>>>> + ? ? ext4_lblk_t end = offset + len;
>>>> +
>>>> + ? ? BUG_ON(end <= offset);
>>>> +
>>>> + ? ? de = tree->cache_de;
>>>> + ? ? de_debug("add [%u/%u) to delayed extent list of inode %lu\n",
>>>> + ? ? ? ? ? ? ?offset, len, inode->i_ino);
>>>> +
>>>> + ? ? if (de && delayed_extent_end(de) == offset) {
>>>> + ? ? ? ? ? ? de->len += len;
>>>> + ? ? ? ? ? ? ext4_de_try_to_merge_right(tree, de);
>>>> + ? ? ? ? ? ? goto out;
>>>> + ? ? } else if (de && de->start == end) {
>>>> + ? ? ? ? ? ? de->start = offset;
>>>> + ? ? ? ? ? ? de->len += len;
>>>> + ? ? ? ? ? ? ext4_de_try_to_merge_left(tree, de);
>>>> + ? ? ? ? ? ? goto out;
>>>> + ? ? } else if (de && de->start <= offset &&
>>>> + ? ? ? ? ? ? ? ?delayed_extent_end(de) >= end)
>>>> + ? ? ? ? ? ? goto out;
>>>> +
>>>> + ? ? while (*p) {
>>>> + ? ? ? ? ? ? parent = *p;
>>>> + ? ? ? ? ? ? de = rb_entry(parent, struct delayed_extent, rb_node);
>>>> +
>>>> + ? ? ? ? ? ? if (offset < de->start) {
>>>> + ? ? ? ? ? ? ? ? ? ? if (end == de->start) {
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->len += len;
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->start = offset;
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>>>> + ? ? ? ? ? ? ? ? ? ? }
>>>> + ? ? ? ? ? ? ? ? ? ? p = &(*p)->rb_left;
>>>> + ? ? ? ? ? ? } else if (offset > delayed_extent_end(de)) {
>>>> + ? ? ? ? ? ? ? ? ? ? if (delayed_extent_end(de) == offset) {
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? de->len += len;
>>>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>>>> + ? ? ? ? ? ? ? ? ? ? }
>>>> + ? ? ? ? ? ? ? ? ? ? p = &(*p)->rb_right;
>>>> + ? ? ? ? ? ? } else
>>>> + ? ? ? ? ? ? ? ? ? ? goto out;
>>>> + ? ? }
>>> is the above what ?__de_tree_search try to do?
>>> btw, we'd better have a BUG_ON when we meet with an intersection since
>>> you only check the offset in your rb_tree search. So what if offset <
>>> de->start while offset + len > de->start? It would cause your algorithm
>>> not work I guess.
>> Yes. ?ext4_de_add_space is used in 2 cases: 1. delayed allocation, in
>> this case len always equals one. ?2. ext4_de_remove_space, in this
>> case, if ext4_de_remove_space does right thing, no intersection
>> happens.
>>
>> In version 1, there was BUG_ON here, and I found we can not guarantee
>> no intersection exists in case1. ?If copy_from_user fails with 0 bytes
>> copied, then page is not dirtied and i_size is not changed as well,
>> rewrite on the page can introduce intersection.
> Sorry I don't quite get what's your meaning. Should it be handled by the
> above codes(the merge and go out stuff) already? Or if as you said, the
> user can give us with intersections, then what type of intersection?
Writes do as follows, allocating pages, mapping pages, copying bytes
from user space, dirtying page, changing i_size. Errors could happen
in copying and the page could be released after. so write on the same
block will introduce intersection. Am I right?
Thank you for your review.
Yongqiang.
> IMHO, we should handle this in a grace way. Your codes seems not handle it.
>
> Thanks
> Tao
>
>
--
Best Wishes
Yongqiang Yang
On 09/29/2011 09:12 PM, Yongqiang Yang wrote:
> On Thu, Sep 29, 2011 at 5:40 PM, Tao Ma <[email protected]> wrote:
>> On 09/29/2011 04:36 PM, Yongqiang Yang wrote:
>>> On Thu, Sep 29, 2011 at 4:05 PM, Tao Ma <[email protected]> wrote:
>>>> Hi yongqiang,
>>>> On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
>>>>> This patch adds operations on a delayed extent tree.
>>>>>
>>>>> Signed-off-by; Yongqiang Yang <[email protected]>
>>>>> ---
>>>>> fs/ext4/Makefile | 2 +-
>>>>> fs/ext4/delayed_extents.c | 412 +++++++++++++++++++++++++++++++++++++++++++++
>>>>> fs/ext4/delayed_extents.h | 18 ++
>>>>> 3 files changed, 431 insertions(+), 1 deletions(-)
>>>>> create mode 100644 fs/ext4/delayed_extents.c
>>>>>
>>>>> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
>>>>> index 56fd8f86..ee16ad3 100644
>>>>> --- a/fs/ext4/Makefile
>>>>> +++ b/fs/ext4/Makefile
>>>>> @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
>>>>> ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
>>>>> ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
>>>>> ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
>>>>> - mmp.o indirect.o
>>>>> + mmp.o indirect.o delayed_extents.o
>>>>>
>>>>> ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
>>>>> ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
>>>>> diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
>>>>> new file mode 100644
>>>>> index 0000000..8da7b78
>>>>> --- /dev/null
>>>>> +++ b/fs/ext4/delayed_extents.c
>>>> <snip>
>>>>> +/*
>>>>> + * search through the tree for an delayed_extent with a given offset. If
>>>>> + * it can't be found, try to find next extent.
>>>>> + */
>>>>> +static struct delayed_extent * __de_tree_search(struct rb_root *root,
>>>>> + ext4_lblk_t offset)
>>>>> +{
>>>>> + struct rb_node *node = root->rb_node;
>>>>> + struct delayed_extent *de = NULL;
>>>>> +
>>>>> + while (node) {
>>>>> + de = rb_entry(node, struct delayed_extent, rb_node);
>>>>> + if (offset < de->start)
>>>>> + node = node->rb_left;
>>>>> + else if (offset >= delayed_extent_end(de))
>>>>> + node = node->rb_right;
>>>>> + else
>>>>> + return de;
>>>>> + }
>>>>> +
>>>>> + if (de && offset < de->start)
>>>>> + return de;
>>>>> +
>>>>> + if (de && offset >= delayed_extent_end(de))
>>>>> + return rb_entry(rb_next(&de->rb_node),
>>>>> + struct delayed_extent, rb_node);
>>>> what if the de is the most right one? rb_next should return NULL in this
>>>> case I guess? What is more, you will return a non-NULL value and use it
>>>> later in the caller. The kernel will panic. Or do I miss something here?
>>> Yes, it returns NULL if de is the most right one. Callers should
>>> check the return value.
>> No rb_next is NULL doesn't guarantee rb_entry(...) is NULL. ;)
>> You returns rb_entry(NULL, struct delayed_extent, rb_node) which depends
>> on the offset of rb_node in delayed_extent. Currently it is NULL because
>> your delayed_extent has rb_node as the first parameter.
>> But it is bug-prone to says that rb_entry(NULL,...) is also NULL.
>> And it may cause a kernel panic later if some guys change the layout of
>> delayed_extent. We'd better be careful here. :)
> I see. It's a bug:-)
>>>
>>>>> +
>>>>> + return NULL;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * ext4_de_first_extent_since: find the 1st delayed extent covering @start
>>>>> + * if it exists, otherwise, the next extent after @start.
>>>> This comment seems to be unrelated to the below function.
>>> Sorry, the function name is changed in version2, however comment is not changed.
>>>>> + *
>>>>> + * @inode: the inode which owns delayed extents
>>>>> + * @offset: logic block
>>>>> + *
>>>>> + * Returns next block beyond the found extent.
>>>>> + * Delayed extent is returned via @de.
>>>>> + */
>>>>> +ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de)
>>>>> +{
>>>>> + struct ext4_de_tree *tree;
>>>>> + struct delayed_extent *de1;
>>>>> + struct rb_node *node;
>>>>> +
>>>>> + de->len = 0;
>>>>> + tree = &EXT4_I(inode)->i_de_tree;
>>>>> + de1 = __de_tree_search(&tree->root, de->start);
>>>>> + if (de1) {
>>>>> + de->start = de1->start;
>>>>> + de->len = de1->len;
>>>>> + node = rb_next(&de1->rb_node);
>>>> Sorry, but I don't understand what you are going to do here. In
>>>> __de_tree_search, you have already use rb_next to get the next de if
>>>> start < del->start. why we still need a rb_next here?
>>> This function is supposed to be used by fiemap and bigalloc. The
>>> former one needs next block of the returned extent. You can have a
>>> loot at the 6th patch which implements fiemap based on delayed extent
>>> tree.
>>>
>>> rb_next in __de_tree_search is used if no extent covering given offset exists.
>> oh, so it is too tricky for the caller and the reviewer of course.
>>
>> Some times you suppose the function can find one with a de which has
>> de->start <= offset && offset < de->start + de->end(situation 1).
>> Sometimes you suppose that we should find one de with offset <
>> de->start(situation 2). It is really a bit hard for us(the other reader)
>> to know from the very beginning.
>>
>> In your case here, we may pass in a parameter to __de_tree_search to
>> describe explicitly whether we want a de suitable for 1 or 2, so that
>> __de_tree_search can find what we want or return NULL. Otherwise we(the
>> caller) have to check every time whether the returned de is OK for our need.
> Accepted.
>>
>>>
>>>>> + if (node) {
>>>>> + de1 = rb_entry(node, struct delayed_extent, rb_node);
>>>>> + return de1->start;
>>>>> + }
>>>>> + }
>>>>> +
>>>>> + return EXT_MAX_BLOCKS;
>>>>> +}
>>>>> +
>>>>> +static struct delayed_extent *
>>>>> +ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
>>>>> +{
>>>>> + struct delayed_extent *de;
>>>>> + de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
>>>>> + if (de == NULL)
>>>>> + return NULL;
>>>>> + de->start = start;
>>>>> + de->len = len;
>>>>> + return de;
>>>>> +}
>>>>> +
>>>>> +static void ext4_de_free_extent(struct delayed_extent *de)
>>>>> +{
>>>>> + kmem_cache_free(ext4_de_cachep, de);
>>>>> +}
>>>>> +
>>>>> +static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree,
>>>>> + struct delayed_extent *de)
>>>>> +{
>>>>> + struct delayed_extent *de1;
>>>>> + struct rb_node *node;
>>>>> +
>>>>> + node = rb_prev(&de->rb_node);
>>>>> + if (!node)
>>>>> + return;
>>>>> +
>>>>> + de1 = rb_entry(node, struct delayed_extent, rb_node);
>>>>> + if (delayed_extent_end(de1) == de->start) {
>>>>> + de1->len += de->len;
>>>>> + rb_erase(&de->rb_node, &tree->root);
>>>>> + if (de == tree->cache_de)
>>>>> + tree->cache_de = de1;
>>>>> + ext4_de_free_extent(de);
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree,
>>>>> + struct delayed_extent *de)
>>>>> +{
>>>>> + struct delayed_extent *de1;
>>>>> + struct rb_node *node;
>>>>> +
>>>>> + node = rb_next(&de->rb_node);
>>>>> + if (!node)
>>>>> + return;
>>>>> +
>>>>> + de1 = rb_entry(node, struct delayed_extent, rb_node);
>>>>> + if (de1->start == delayed_extent_end(de)) {
>>>>> + de->len += de1->len;
>>>>> + rb_erase(node, &tree->root);
>>>>> + if (de1 == tree->cache_de)
>>>>> + tree->cache_de = de;
>>>>> + ext4_de_free_extent(de1);
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * ext4_de_add_space adds a space to a delayed extent tree.
>>>>> + * Caller holds inode->i_data_sem.
>>>>> + *
>>>>> + * ext4_de_add_space is callyed by ext4_dealyed_write_begin and
>>>>> + * ext4_de_remove_space.
>>>>> + *
>>>>> + * Return 0 on success, error code on failure.
>>>>> + */
>>>>> +int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len)
>>>>> +{
>>>>> + struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree;
>>>>> + struct rb_node **p = &tree->root.rb_node;
>>>>> + struct rb_node *parent = NULL;
>>>>> + struct delayed_extent *de;
>>>>> + ext4_lblk_t end = offset + len;
>>>>> +
>>>>> + BUG_ON(end <= offset);
>>>>> +
>>>>> + de = tree->cache_de;
>>>>> + de_debug("add [%u/%u) to delayed extent list of inode %lu\n",
>>>>> + offset, len, inode->i_ino);
>>>>> +
>>>>> + if (de && delayed_extent_end(de) == offset) {
>>>>> + de->len += len;
>>>>> + ext4_de_try_to_merge_right(tree, de);
>>>>> + goto out;
>>>>> + } else if (de && de->start == end) {
>>>>> + de->start = offset;
>>>>> + de->len += len;
>>>>> + ext4_de_try_to_merge_left(tree, de);
>>>>> + goto out;
>>>>> + } else if (de && de->start <= offset &&
>>>>> + delayed_extent_end(de) >= end)
>>>>> + goto out;
>>>>> +
>>>>> + while (*p) {
>>>>> + parent = *p;
>>>>> + de = rb_entry(parent, struct delayed_extent, rb_node);
>>>>> +
>>>>> + if (offset < de->start) {
>>>>> + if (end == de->start) {
>>>>> + de->len += len;
>>>>> + de->start = offset;
>>>>> + goto out;
>>>>> + }
>>>>> + p = &(*p)->rb_left;
>>>>> + } else if (offset > delayed_extent_end(de)) {
>>>>> + if (delayed_extent_end(de) == offset) {
>>>>> + de->len += len;
>>>>> + goto out;
>>>>> + }
>>>>> + p = &(*p)->rb_right;
>>>>> + } else
>>>>> + goto out;
>>>>> + }
>>>> is the above what __de_tree_search try to do?
>>>> btw, we'd better have a BUG_ON when we meet with an intersection since
>>>> you only check the offset in your rb_tree search. So what if offset <
>>>> de->start while offset + len > de->start? It would cause your algorithm
>>>> not work I guess.
>>> Yes. ext4_de_add_space is used in 2 cases: 1. delayed allocation, in
>>> this case len always equals one. 2. ext4_de_remove_space, in this
>>> case, if ext4_de_remove_space does right thing, no intersection
>>> happens.
>>>
>>> In version 1, there was BUG_ON here, and I found we can not guarantee
>>> no intersection exists in case1. If copy_from_user fails with 0 bytes
>>> copied, then page is not dirtied and i_size is not changed as well,
>>> rewrite on the page can introduce intersection.
>> Sorry I don't quite get what's your meaning. Should it be handled by the
>> above codes(the merge and go out stuff) already? Or if as you said, the
>> user can give us with intersections, then what type of intersection?
> Writes do as follows, allocating pages, mapping pages, copying bytes
> from user space, dirtying page, changing i_size. Errors could happen
> in copying and the page could be released after. so write on the same
> block will introduce intersection. Am I right?
so you mean we call ext4_de_add_space twice. We should handle this.
Say if the user call it first with (100, 1) and you add it to the rb
tree and then the process fails. and the 2nd time the user may call it
with again. I see.
btw, there is another bug above which I haven't noticed before.
> + } else if (offset > delayed_extent_end(de)) {
I guess here should be offset >= delayed_extent_end(de)? otherwise how
could the below check work? ;)
> + if (delayed_extent_end(de) == offset) {
> + de->len += len;
> + goto out;
Thanks
Tao
On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
> This patch let ext4 maintain delayed extent trees.
>
> Signed-off-by: Yongqiang Yang <[email protected]>
> ---
> fs/ext4/ext4.h | 1 +
> fs/ext4/extents.c | 2 ++
> fs/ext4/indirect.c | 3 +++
> fs/ext4/inode.c | 28 ++++++++++++++++++++++++++--
> fs/ext4/super.c | 12 +++++++++++-
> 5 files changed, 43 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index d3c6b97..177ec0a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -519,6 +519,7 @@ struct ext4_new_group_data {
> #define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
> /* Don't normalize allocation size (used for fallocate) */
> #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
> +#define EXT4_GET_BLOCKS_DEALLOC 0x0080
>
> /*
> * Flags used by ext4_free_blocks
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 9124cd2..bdbb984 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3688,6 +3688,8 @@ void ext4_ext_truncate(struct inode *inode)
>
> last_block = (inode->i_size + sb->s_blocksize - 1)
> >> EXT4_BLOCK_SIZE_BITS(sb);
> + err = ext4_de_remove_space(inode, last_block,
> + EXT_MAX_BLOCKS - last_block);
> err = ext4_ext_remove_space(inode, last_block);
>
> /* In a multi-transaction truncate, we only make the final
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 0962642..25cdb5b 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -22,6 +22,7 @@
>
> #include <linux/module.h>
> #include "ext4_jbd2.h"
> +#include "ext4_extents.h"
> #include "truncate.h"
>
> #include <trace/events/ext4.h>
> @@ -1383,6 +1384,8 @@ void ext4_ind_truncate(struct inode *inode)
> down_write(&ei->i_data_sem);
>
> ext4_discard_preallocations(inode);
> + ext4_de_remove_space(inode, last_block,
> + EXT_MAX_BLOCKS - last_block);
>
> /*
> * The orphan list entry will now protect us from any crash which
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index f86b149..0f9f108 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -442,7 +442,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> up_read((&EXT4_I(inode)->i_data_sem));
>
> if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> - int ret = check_block_validity(inode, map);
> + int ret;
> + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> + /* delayed alloc may be allocated by fallocate,
> + * we need to handle delayed extent here.
> + */
> + down_write((&EXT4_I(inode)->i_data_sem));
> + goto delayed_mapped;
> + }
> + ret = check_block_validity(inode, map);
I am not quite sure of this. So do you mean when we write_begin the
extent isn't allocated, while in the time of writepage, the extent is
fallocted, right? If this is the case, where do we update the reserve_space?
I mean in ext4_da_get_block_prep, we call ext4_da_reserve_space, and if
there is no fallocate, we will call ext4_da_update_reserve_space in
ext4_ext_handle_uninitialized_extents. So in your case, the 2nd
ext4_da_update_reserve_space wouldn't be called. I am not sure whether
there will be some problem or not.
Thanks
Tao
> if (ret != 0)
> return ret;
> }
> @@ -517,8 +525,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
> (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
> ext4_da_update_reserve_space(inode, retval, 1);
> }
> - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
> ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
> +delayed_mapped:
> + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> + int ret;
> + /* delayed allocation blocks has been allocated */
> + ret = ext4_de_remove_space(inode, map->m_lblk,
> + map->m_len);
> + if (ret < 0)
> + retval = ret;
> + }
> + }
>
> up_write((&EXT4_I(inode)->i_data_sem));
> if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
> @@ -1630,6 +1648,12 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
> /* not enough space to reserve */
> return ret;
>
> + down_write((&EXT4_I(inode)->i_data_sem));
> + ret = ext4_de_add_space(inode, map.m_lblk, map.m_len);
> + up_write((&EXT4_I(inode)->i_data_sem));
> + if (ret)
> + return ret;
> +
> map_bh(bh, inode->i_sb, invalid_block);
> set_buffer_new(bh);
> set_buffer_delay(bh);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 247fcdd..a248551 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -49,6 +49,7 @@
> #include "xattr.h"
> #include "acl.h"
> #include "mballoc.h"
> +#include "ext4_extents.h"
>
> #define CREATE_TRACE_POINTS
> #include <trace/events/ext4.h>
> @@ -967,6 +968,7 @@ void ext4_clear_inode(struct inode *inode)
> end_writeback(inode);
> dquot_drop(inode);
> ext4_discard_preallocations(inode);
> + ext4_de_remove_space(inode, 0, EXT_MAX_BLOCKS);
> if (EXT4_I(inode)->jinode) {
> jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
> EXT4_I(inode)->jinode);
> @@ -4976,9 +4978,14 @@ static int __init ext4_init_fs(void)
> init_waitqueue_head(&ext4__ioend_wq[i]);
> }
>
> - err = ext4_init_pageio();
> + err = ext4_init_de();
> if (err)
> return err;
> +
> + err = ext4_init_pageio();
> + if (err)
> + goto out8;
> +
> err = ext4_init_system_zone();
> if (err)
> goto out7;
> @@ -5030,6 +5037,9 @@ out6:
> ext4_exit_system_zone();
> out7:
> ext4_exit_pageio();
> +out8:
> + ext4_exit_de();
> +
> return err;
> }
>
Hi Yongqiang,
> This patch reimplements fiemap on delayed extent tree.
>
> Signed-off-by: Yongqiang Yang <[email protected]>
> ---
> fs/ext4/extents.c | 186 +++++++----------------------------------------------
> 1 files changed, 23 insertions(+), 163 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index bdbb984..c2ae125 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3909,193 +3909,53 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
> struct ext4_ext_cache *newex, struct ext4_extent *ex,
> void *data)
> {
> + struct delayed_extent de;
> __u64 logical;
> __u64 physical;
> __u64 length;
> __u32 flags = 0;
> + ext4_lblk_t next_del;
> int ret = 0;
> struct fiemap_extent_info *fieinfo = data;
> unsigned char blksize_bits;
>
> - blksize_bits = inode->i_sb->s_blocksize_bits;
> - logical = (__u64)newex->ec_block << blksize_bits;
> + de.start = newex->ec_block;
> + down_read(&EXT4_I(inode)->i_data_sem);
> + next_del = ext4_de_find_extent(inode, &de);
> + up_read(&EXT4_I(inode)->i_data_sem);
>
> + next = min(next_del, next);
> if (newex->ec_start == 0) {
> /*
> * No extent in extent-tree contains block @newex->ec_start,
> * then the block may stay in 1)a hole or 2)delayed-extent.
> - *
> - * Holes or delayed-extents are processed as follows.
> - * 1. lookup dirty pages with specified range in pagecache.
> - * If no page is got, then there is no delayed-extent and
> - * return with EXT_CONTINUE.
> - * 2. find the 1st mapped buffer,
> - * 3. check if the mapped buffer is both in the request range
> - * and a delayed buffer. If not, there is no delayed-extent,
> - * then return.
> - * 4. a delayed-extent is found, the extent will be collected.
> */
> - ext4_lblk_t end = 0;
> - pgoff_t last_offset;
> - pgoff_t offset;
> - pgoff_t index;
> - pgoff_t start_index = 0;
> - struct page **pages = NULL;
> - struct buffer_head *bh = NULL;
> - struct buffer_head *head = NULL;
> - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
> -
> - pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> - if (pages == NULL)
> - return -ENOMEM;
> -
> - offset = logical >> PAGE_SHIFT;
> -repeat:
> - last_offset = offset;
> - head = NULL;
> - ret = find_get_pages_tag(inode->i_mapping, &offset,
> - PAGECACHE_TAG_DIRTY, nr_pages, pages);
> -
> - if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
> - /* First time, try to find a mapped buffer. */
> - if (ret == 0) {
> -out:
> - for (index = 0; index < ret; index++)
> - page_cache_release(pages[index]);
> - /* just a hole. */
> - kfree(pages);
> - return EXT_CONTINUE;
> - }
> - index = 0;
> -
> -next_page:
> - /* Try to find the 1st mapped buffer. */
> - end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
> - blksize_bits;
> - if (!page_has_buffers(pages[index]))
> - goto out;
> - head = page_buffers(pages[index]);
> - if (!head)
> - goto out;
> -
> - index++;
> - bh = head;
> - do {
> - if (end >= newex->ec_block +
> - newex->ec_len)
> - /* The buffer is out of
> - * the request range.
> - */
> - goto out;
> -
> - if (buffer_mapped(bh) &&
> - end >= newex->ec_block) {
> - start_index = index - 1;
> - /* get the 1st mapped buffer. */
> - goto found_mapped_buffer;
> - }
> -
> - bh = bh->b_this_page;
> - end++;
> - } while (bh != head);
> -
> - /* No mapped buffer in the range found in this page,
> - * We need to look up next page.
> - */
> - if (index >= ret) {
> - /* There is no page left, but we need to limit
> - * newex->ec_len.
> - */
> - newex->ec_len = end - newex->ec_block;
> - goto out;
> - }
> - goto next_page;
> - } else {
> - /*Find contiguous delayed buffers. */
> - if (ret > 0 && pages[0]->index == last_offset)
> - head = page_buffers(pages[0]);
> - bh = head;
> - index = 1;
> - start_index = 0;
> - }
> -
> -found_mapped_buffer:
> - if (bh != NULL && buffer_delay(bh)) {
> - /* 1st or contiguous delayed buffer found. */
> - if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
> - /*
> - * 1st delayed buffer found, record
> - * the start of extent.
> - */
> - flags |= FIEMAP_EXTENT_DELALLOC;
> - newex->ec_block = end;
> - logical = (__u64)end << blksize_bits;
> - }
> - /* Find contiguous delayed buffers. */
> - do {
> - if (!buffer_delay(bh))
> - goto found_delayed_extent;
> - bh = bh->b_this_page;
> - end++;
> - } while (bh != head);
> -
> - for (; index < ret; index++) {
> - if (!page_has_buffers(pages[index])) {
> - bh = NULL;
> - break;
> - }
> - head = page_buffers(pages[index]);
> - if (!head) {
> - bh = NULL;
> - break;
> - }
> -
> - if (pages[index]->index !=
> - pages[start_index]->index + index
> - - start_index) {
> - /* Blocks are not contiguous. */
> - bh = NULL;
> - break;
> - }
> - bh = head;
> - do {
> - if (!buffer_delay(bh))
> - /* Delayed-extent ends. */
> - goto found_delayed_extent;
> - bh = bh->b_this_page;
> - end++;
> - } while (bh != head);
> - }
> - } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
> - /* a hole found. */
> - goto out;
> -
> -found_delayed_extent:
> - newex->ec_len = min(end - newex->ec_block,
> - (ext4_lblk_t)EXT_INIT_MAX_LEN);
> - if (ret == nr_pages && bh != NULL &&
> - newex->ec_len < EXT_INIT_MAX_LEN &&
> - buffer_delay(bh)) {
> - /* Have not collected an extent and continue. */
> - for (index = 0; index < ret; index++)
> - page_cache_release(pages[index]);
> - goto repeat;
> + if (de.len == 0)
> + /* A hole found. */
> + return EXT_CONTINUE;
> +
> + if (de.start > newex->ec_block) {
> + /* A hole found. */
> + newex->ec_len = min(de.start - newex->ec_block,
> + newex->ec_len);
> + return EXT_CONTINUE;
> }
>
> - for (index = 0; index < ret; index++)
> - page_cache_release(pages[index]);
> - kfree(pages);
> + flags |= FIEMAP_EXTENT_DELALLOC;
> + newex->ec_len = de.start + de.len - newex->ec_block;
> }
How about to isolate "get delayed extent info" as above to an inline function?
We can gather them via newex->xxx then, so other features need to iterate through file special range like SEEK_HOLE/DATA could be benefit
from it in their callback functions.
Thanks,
-Jeff
>
> - physical = (__u64)newex->ec_start << blksize_bits;
> - length = (__u64)newex->ec_len << blksize_bits;
> -
> if (ex && ext4_ext_is_uninitialized(ex))
> flags |= FIEMAP_EXTENT_UNWRITTEN;
>
> if (next == EXT_MAX_BLOCKS)
> flags |= FIEMAP_EXTENT_LAST;
>
> + blksize_bits = inode->i_sb->s_blocksize_bits;
> + logical = (__u64)newex->ec_block << blksize_bits;
> + physical = (__u64)newex->ec_start << blksize_bits;
> + length = (__u64)newex->ec_len << blksize_bits;
> +
> ret = fiemap_fill_next_extent(fieinfo, logical, physical,
> length, flags);
> if (ret < 0)
> --
> 1.7.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 29, 2011 at 11:28 PM, Jeff liu <[email protected]> wrote:
> Hi Yongqiang,
>
>> This patch reimplements fiemap on delayed extent tree.
>>
>> Signed-off-by: Yongqiang Yang <[email protected]>
>> ---
>> fs/ext4/extents.c | ?186 +++++++----------------------------------------------
>> 1 files changed, 23 insertions(+), 163 deletions(-)
>>
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index bdbb984..c2ae125 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -3909,193 +3909,53 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
>> ? ? ? ? ? ? ? ? ? ? ?struct ext4_ext_cache *newex, struct ext4_extent *ex,
>> ? ? ? ? ? ? ? ? ? ? ?void *data)
>> {
>> + ? ? struct delayed_extent de;
>> ? ? ? __u64 ? logical;
>> ? ? ? __u64 ? physical;
>> ? ? ? __u64 ? length;
>> ? ? ? __u32 ? flags = 0;
>> + ? ? ext4_lblk_t next_del;
>> ? ? ? int ? ? ? ? ? ? ret = 0;
>> ? ? ? struct fiemap_extent_info *fieinfo = data;
>> ? ? ? unsigned char blksize_bits;
>>
>> - ? ? blksize_bits = inode->i_sb->s_blocksize_bits;
>> - ? ? logical = (__u64)newex->ec_block << blksize_bits;
>> + ? ? de.start = newex->ec_block;
>> + ? ? down_read(&EXT4_I(inode)->i_data_sem);
>> + ? ? next_del = ext4_de_find_extent(inode, &de);
>> + ? ? up_read(&EXT4_I(inode)->i_data_sem);
>>
>> + ? ? next = min(next_del, next);
>> ? ? ? if (newex->ec_start == 0) {
>> ? ? ? ? ? ? ? /*
>> ? ? ? ? ? ? ? ?* No extent in extent-tree contains block @newex->ec_start,
>> ? ? ? ? ? ? ? ?* then the block may stay in 1)a hole or 2)delayed-extent.
>> - ? ? ? ? ? ? ?*
>> - ? ? ? ? ? ? ?* Holes or delayed-extents are processed as follows.
>> - ? ? ? ? ? ? ?* 1. lookup dirty pages with specified range in pagecache.
>> - ? ? ? ? ? ? ?* ? ?If no page is got, then there is no delayed-extent and
>> - ? ? ? ? ? ? ?* ? ?return with EXT_CONTINUE.
>> - ? ? ? ? ? ? ?* 2. find the 1st mapped buffer,
>> - ? ? ? ? ? ? ?* 3. check if the mapped buffer is both in the request range
>> - ? ? ? ? ? ? ?* ? ?and a delayed buffer. If not, there is no delayed-extent,
>> - ? ? ? ? ? ? ?* ? ?then return.
>> - ? ? ? ? ? ? ?* 4. a delayed-extent is found, the extent will be collected.
>> ? ? ? ? ? ? ? ?*/
>> - ? ? ? ? ? ? ext4_lblk_t ? ? end = 0;
>> - ? ? ? ? ? ? pgoff_t ? ? ? ? last_offset;
>> - ? ? ? ? ? ? pgoff_t ? ? ? ? offset;
>> - ? ? ? ? ? ? pgoff_t ? ? ? ? index;
>> - ? ? ? ? ? ? pgoff_t ? ? ? ? start_index = 0;
>> - ? ? ? ? ? ? struct page ? ? **pages = NULL;
>> - ? ? ? ? ? ? struct buffer_head *bh = NULL;
>> - ? ? ? ? ? ? struct buffer_head *head = NULL;
>> - ? ? ? ? ? ? unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
>> -
>> - ? ? ? ? ? ? pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
>> - ? ? ? ? ? ? if (pages == NULL)
>> - ? ? ? ? ? ? ? ? ? ? return -ENOMEM;
>> -
>> - ? ? ? ? ? ? offset = logical >> PAGE_SHIFT;
>> -repeat:
>> - ? ? ? ? ? ? last_offset = offset;
>> - ? ? ? ? ? ? head = NULL;
>> - ? ? ? ? ? ? ret = find_get_pages_tag(inode->i_mapping, &offset,
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? PAGECACHE_TAG_DIRTY, nr_pages, pages);
>> -
>> - ? ? ? ? ? ? if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
>> - ? ? ? ? ? ? ? ? ? ? /* First time, try to find a mapped buffer. */
>> - ? ? ? ? ? ? ? ? ? ? if (ret == 0) {
>> -out:
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? for (index = 0; index < ret; index++)
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? page_cache_release(pages[index]);
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* just a hole. */
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? kfree(pages);
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? return EXT_CONTINUE;
>> - ? ? ? ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? ? ? ? ? index = 0;
>> -
>> -next_page:
>> - ? ? ? ? ? ? ? ? ? ? /* Try to find the 1st mapped buffer. */
>> - ? ? ? ? ? ? ? ? ? ? end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? blksize_bits;
>> - ? ? ? ? ? ? ? ? ? ? if (!page_has_buffers(pages[index]))
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>> - ? ? ? ? ? ? ? ? ? ? head = page_buffers(pages[index]);
>> - ? ? ? ? ? ? ? ? ? ? if (!head)
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>> -
>> - ? ? ? ? ? ? ? ? ? ? index++;
>> - ? ? ? ? ? ? ? ? ? ? bh = head;
>> - ? ? ? ? ? ? ? ? ? ? do {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (end >= newex->ec_block +
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? newex->ec_len)
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* The buffer is out of
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?* the request range.
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?*/
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>> -
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (buffer_mapped(bh) &&
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? end >= newex->ec_block) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? start_index = index - 1;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* get the 1st mapped buffer. */
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto found_mapped_buffer;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? }
>> -
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = bh->b_this_page;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? end++;
>> - ? ? ? ? ? ? ? ? ? ? } while (bh != head);
>> -
>> - ? ? ? ? ? ? ? ? ? ? /* No mapped buffer in the range found in this page,
>> - ? ? ? ? ? ? ? ? ? ? ?* We need to look up next page.
>> - ? ? ? ? ? ? ? ? ? ? ?*/
>> - ? ? ? ? ? ? ? ? ? ? if (index >= ret) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* There is no page left, but we need to limit
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?* newex->ec_len.
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?*/
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? newex->ec_len = end - newex->ec_block;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
>> - ? ? ? ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? ? ? ? ? goto next_page;
>> - ? ? ? ? ? ? } else {
>> - ? ? ? ? ? ? ? ? ? ? /*Find contiguous delayed buffers. */
>> - ? ? ? ? ? ? ? ? ? ? if (ret > 0 && pages[0]->index == last_offset)
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? head = page_buffers(pages[0]);
>> - ? ? ? ? ? ? ? ? ? ? bh = head;
>> - ? ? ? ? ? ? ? ? ? ? index = 1;
>> - ? ? ? ? ? ? ? ? ? ? start_index = 0;
>> - ? ? ? ? ? ? }
>> -
>> -found_mapped_buffer:
>> - ? ? ? ? ? ? if (bh != NULL && buffer_delay(bh)) {
>> - ? ? ? ? ? ? ? ? ? ? /* 1st or contiguous delayed buffer found. */
>> - ? ? ? ? ? ? ? ? ? ? if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? /*
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?* 1st delayed buffer found, record
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?* the start of extent.
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?*/
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? flags |= FIEMAP_EXTENT_DELALLOC;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? newex->ec_block = end;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? logical = (__u64)end << blksize_bits;
>> - ? ? ? ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? ? ? ? ? /* Find contiguous delayed buffers. */
>> - ? ? ? ? ? ? ? ? ? ? do {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (!buffer_delay(bh))
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto found_delayed_extent;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = bh->b_this_page;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? end++;
>> - ? ? ? ? ? ? ? ? ? ? } while (bh != head);
>> -
>> - ? ? ? ? ? ? ? ? ? ? for (; index < ret; index++) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (!page_has_buffers(pages[index])) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = NULL;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? break;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? head = page_buffers(pages[index]);
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (!head) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = NULL;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? break;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? }
>> -
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (pages[index]->index !=
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? pages[start_index]->index + index
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? - start_index) {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* Blocks are not contiguous. */
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = NULL;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? break;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = head;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? do {
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (!buffer_delay(bh))
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* Delayed-extent ends. */
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto found_delayed_extent;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bh = bh->b_this_page;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? end++;
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? } while (bh != head);
>> - ? ? ? ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
>> - ? ? ? ? ? ? ? ? ? ? /* a hole found. */
>> - ? ? ? ? ? ? ? ? ? ? goto out;
>> -
>> -found_delayed_extent:
>> - ? ? ? ? ? ? newex->ec_len = min(end - newex->ec_block,
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? (ext4_lblk_t)EXT_INIT_MAX_LEN);
>> - ? ? ? ? ? ? if (ret == nr_pages && bh != NULL &&
>> - ? ? ? ? ? ? ? ? ? ? newex->ec_len < EXT_INIT_MAX_LEN &&
>> - ? ? ? ? ? ? ? ? ? ? buffer_delay(bh)) {
>> - ? ? ? ? ? ? ? ? ? ? /* Have not collected an extent and continue. */
>> - ? ? ? ? ? ? ? ? ? ? for (index = 0; index < ret; index++)
>> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? page_cache_release(pages[index]);
>> - ? ? ? ? ? ? ? ? ? ? goto repeat;
>> + ? ? ? ? ? ? if (de.len == 0)
>> + ? ? ? ? ? ? ? ? ? ? /* A hole found. */
>> + ? ? ? ? ? ? ? ? ? ? return EXT_CONTINUE;
>> +
>> + ? ? ? ? ? ? if (de.start > newex->ec_block) {
>> + ? ? ? ? ? ? ? ? ? ? /* A hole found. */
>> + ? ? ? ? ? ? ? ? ? ? newex->ec_len = min(de.start - newex->ec_block,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? newex->ec_len);
>> + ? ? ? ? ? ? ? ? ? ? return EXT_CONTINUE;
>> ? ? ? ? ? ? ? }
>>
>> - ? ? ? ? ? ? for (index = 0; index < ret; index++)
>> - ? ? ? ? ? ? ? ? ? ? page_cache_release(pages[index]);
>> - ? ? ? ? ? ? kfree(pages);
>> + ? ? ? ? ? ? flags |= FIEMAP_EXTENT_DELALLOC;
>> + ? ? ? ? ? ? newex->ec_len = de.start + de.len - newex->ec_block;
>> ? ? ? }
>
> How about to isolate "get delayed extent info" as above to an inline function?
> We can gather them via newex->xxx then, so other features need to iterate through file special range like SEEK_HOLE/DATA could be benefit
> from it in their callback functions.
Sure.
Thx!
Yongqiang.
>
>
> Thanks,
> -Jeff
>>
>> - ? ? physical = (__u64)newex->ec_start << blksize_bits;
>> - ? ? length = ? (__u64)newex->ec_len << blksize_bits;
>> -
>> ? ? ? if (ex && ext4_ext_is_uninitialized(ex))
>> ? ? ? ? ? ? ? flags |= FIEMAP_EXTENT_UNWRITTEN;
>>
>> ? ? ? if (next == EXT_MAX_BLOCKS)
>> ? ? ? ? ? ? ? flags |= FIEMAP_EXTENT_LAST;
>>
>> + ? ? blksize_bits = inode->i_sb->s_blocksize_bits;
>> + ? ? logical = (__u64)newex->ec_block << blksize_bits;
>> + ? ? physical = (__u64)newex->ec_start << blksize_bits;
>> + ? ? length = ? (__u64)newex->ec_len << blksize_bits;
>> +
>> ? ? ? ret = fiemap_fill_next_extent(fieinfo, logical, physical,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? length, flags);
>> ? ? ? if (ret < 0)
>> --
>> 1.7.5.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>
>
--
Best Wishes
Yongqiang Yang
On Thu, Sep 29, 2011 at 10:31 PM, Tao Ma <[email protected]> wrote:
> On 09/29/2011 01:08 PM, Yongqiang Yang wrote:
>> This patch let ext4 maintain delayed extent trees.
>>
>> Signed-off-by: Yongqiang Yang <[email protected]>
>> ---
>> ?fs/ext4/ext4.h ? ? | ? ?1 +
>> ?fs/ext4/extents.c ?| ? ?2 ++
>> ?fs/ext4/indirect.c | ? ?3 +++
>> ?fs/ext4/inode.c ? ?| ? 28 ++++++++++++++++++++++++++--
>> ?fs/ext4/super.c ? ?| ? 12 +++++++++++-
>> ?5 files changed, 43 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index d3c6b97..177ec0a 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -519,6 +519,7 @@ struct ext4_new_group_data {
>> ?#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT ? ? ? ? ? ? ? ?0x0020
>> ? ? ? /* Don't normalize allocation size (used for fallocate) */
>> ?#define EXT4_GET_BLOCKS_NO_NORMALIZE ? ? ? ? 0x0040
>> +#define EXT4_GET_BLOCKS_DEALLOC ? ? ? ? ? ? ? ? ? ? ?0x0080
>>
>> ?/*
>> ? * Flags used by ext4_free_blocks
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index 9124cd2..bdbb984 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -3688,6 +3688,8 @@ void ext4_ext_truncate(struct inode *inode)
>>
>> ? ? ? last_block = (inode->i_size + sb->s_blocksize - 1)
>> ? ? ? ? ? ? ? ? ? ? ? >> EXT4_BLOCK_SIZE_BITS(sb);
>> + ? ? err = ext4_de_remove_space(inode, last_block,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT_MAX_BLOCKS - last_block);
>> ? ? ? err = ext4_ext_remove_space(inode, last_block);
>>
>> ? ? ? /* In a multi-transaction truncate, we only make the final
>> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
>> index 0962642..25cdb5b 100644
>> --- a/fs/ext4/indirect.c
>> +++ b/fs/ext4/indirect.c
>> @@ -22,6 +22,7 @@
>>
>> ?#include <linux/module.h>
>> ?#include "ext4_jbd2.h"
>> +#include "ext4_extents.h"
>> ?#include "truncate.h"
>>
>> ?#include <trace/events/ext4.h>
>> @@ -1383,6 +1384,8 @@ void ext4_ind_truncate(struct inode *inode)
>> ? ? ? down_write(&ei->i_data_sem);
>>
>> ? ? ? ext4_discard_preallocations(inode);
>> + ? ? ext4_de_remove_space(inode, last_block,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ?EXT_MAX_BLOCKS - last_block);
>>
>> ? ? ? /*
>> ? ? ? ?* The orphan list entry will now protect us from any crash which
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index f86b149..0f9f108 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -442,7 +442,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>> ? ? ? up_read((&EXT4_I(inode)->i_data_sem));
>>
>> ? ? ? if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
>> - ? ? ? ? ? ? int ret = check_block_validity(inode, map);
>> + ? ? ? ? ? ? int ret;
>> + ? ? ? ? ? ? if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
>> + ? ? ? ? ? ? ? ? ? ? /* delayed alloc may be allocated by fallocate,
>> + ? ? ? ? ? ? ? ? ? ? ?* we need to handle delayed extent here.
>> + ? ? ? ? ? ? ? ? ? ? ?*/
>> + ? ? ? ? ? ? ? ? ? ? down_write((&EXT4_I(inode)->i_data_sem));
>> + ? ? ? ? ? ? ? ? ? ? goto delayed_mapped;
>> + ? ? ? ? ? ? }
>> + ? ? ? ? ? ? ret = check_block_validity(inode, map);
> I am not quite sure of this. So do you mean when we write_begin the
> extent isn't allocated, while in the time of writepage, the extent is
> fallocted, right? If this is the case, where do we update the reserve_space?
> I mean in ext4_da_get_block_prep, we call ext4_da_reserve_space, and if
> there is no fallocate, we will call ext4_da_update_reserve_space in
> ext4_ext_handle_uninitialized_extents. So in your case, the 2nd
> ext4_da_update_reserve_space wouldn't be called. I am not sure whether
> there will be some problem or not.
Hi Tao,
What if fallocated blocks have been allocated by direct I/O? I placed
a BUG() here, it was indeed triggered by xfstests 127.
The comment is not clear, sorry for that.
Yongqiang.
>
> Thanks
> Tao
>> ? ? ? ? ? ? ? if (ret != 0)
>> ? ? ? ? ? ? ? ? ? ? ? return ret;
>> ? ? ? }
>> @@ -517,8 +525,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
>> ? ? ? ? ? ? ? ? ? ? ? (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
>> ? ? ? ? ? ? ? ? ? ? ? ext4_da_update_reserve_space(inode, retval, 1);
>> ? ? ? }
>> - ? ? if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
>> + ? ? if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
>> ? ? ? ? ? ? ? ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
>> +delayed_mapped:
>> + ? ? ? ? ? ? if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
>> + ? ? ? ? ? ? ? ? ? ? int ret;
>> + ? ? ? ? ? ? ? ? ? ? /* delayed allocation blocks has been allocated */
>> + ? ? ? ? ? ? ? ? ? ? ret = ext4_de_remove_space(inode, map->m_lblk,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?map->m_len);
>> + ? ? ? ? ? ? ? ? ? ? if (ret < 0)
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? retval = ret;
>> + ? ? ? ? ? ? }
>> + ? ? }
>>
>> ? ? ? up_write((&EXT4_I(inode)->i_data_sem));
>> ? ? ? if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
>> @@ -1630,6 +1648,12 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
>> ? ? ? ? ? ? ? ? ? ? ? /* not enough space to reserve */
>> ? ? ? ? ? ? ? ? ? ? ? return ret;
>>
>> + ? ? ? ? ? ? down_write((&EXT4_I(inode)->i_data_sem));
>> + ? ? ? ? ? ? ret = ext4_de_add_space(inode, map.m_lblk, map.m_len);
>> + ? ? ? ? ? ? up_write((&EXT4_I(inode)->i_data_sem));
>> + ? ? ? ? ? ? if (ret)
>> + ? ? ? ? ? ? ? ? ? ? return ret;
>> +
>> ? ? ? ? ? ? ? map_bh(bh, inode->i_sb, invalid_block);
>> ? ? ? ? ? ? ? set_buffer_new(bh);
>> ? ? ? ? ? ? ? set_buffer_delay(bh);
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 247fcdd..a248551 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -49,6 +49,7 @@
>> ?#include "xattr.h"
>> ?#include "acl.h"
>> ?#include "mballoc.h"
>> +#include "ext4_extents.h"
>>
>> ?#define CREATE_TRACE_POINTS
>> ?#include <trace/events/ext4.h>
>> @@ -967,6 +968,7 @@ void ext4_clear_inode(struct inode *inode)
>> ? ? ? end_writeback(inode);
>> ? ? ? dquot_drop(inode);
>> ? ? ? ext4_discard_preallocations(inode);
>> + ? ? ext4_de_remove_space(inode, 0, EXT_MAX_BLOCKS);
>> ? ? ? if (EXT4_I(inode)->jinode) {
>> ? ? ? ? ? ? ? jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?EXT4_I(inode)->jinode);
>> @@ -4976,9 +4978,14 @@ static int __init ext4_init_fs(void)
>> ? ? ? ? ? ? ? init_waitqueue_head(&ext4__ioend_wq[i]);
>> ? ? ? }
>>
>> - ? ? err = ext4_init_pageio();
>> + ? ? err = ext4_init_de();
>> ? ? ? if (err)
>> ? ? ? ? ? ? ? return err;
>> +
>> + ? ? err = ext4_init_pageio();
>> + ? ? if (err)
>> + ? ? ? ? ? ? goto out8;
>> +
>> ? ? ? err = ext4_init_system_zone();
>> ? ? ? if (err)
>> ? ? ? ? ? ? ? goto out7;
>> @@ -5030,6 +5037,9 @@ out6:
>> ? ? ? ext4_exit_system_zone();
>> ?out7:
>> ? ? ? ext4_exit_pageio();
>> +out8:
>> + ? ? ext4_exit_de();
>> +
>> ? ? ? return err;
>> ?}
>>
>
>
--
Best Wishes
Yongqiang Yang
On Thu, 29 Sep 2011, Yongqiang Yang wrote:
> This patch reimplements fiemap on delayed extent tree.
>
> Signed-off-by: Yongqiang Yang <[email protected]>
> ---
> fs/ext4/extents.c | 186 +++++++----------------------------------------------
> 1 files changed, 23 insertions(+), 163 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index bdbb984..c2ae125 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3909,193 +3909,53 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
> struct ext4_ext_cache *newex, struct ext4_extent *ex,
> void *data)
> {
> + struct delayed_extent de;
> __u64 logical;
> __u64 physical;
> __u64 length;
> __u32 flags = 0;
> + ext4_lblk_t next_del;
> int ret = 0;
> struct fiemap_extent_info *fieinfo = data;
> unsigned char blksize_bits;
>
> - blksize_bits = inode->i_sb->s_blocksize_bits;
> - logical = (__u64)newex->ec_block << blksize_bits;
> + de.start = newex->ec_block;
> + down_read(&EXT4_I(inode)->i_data_sem);
> + next_del = ext4_de_find_extent(inode, &de);
> + up_read(&EXT4_I(inode)->i_data_sem);
>
> + next = min(next_del, next);
> if (newex->ec_start == 0) {
> /*
> * No extent in extent-tree contains block @newex->ec_start,
> * then the block may stay in 1)a hole or 2)delayed-extent.
> - *
> - * Holes or delayed-extents are processed as follows.
> - * 1. lookup dirty pages with specified range in pagecache.
> - * If no page is got, then there is no delayed-extent and
> - * return with EXT_CONTINUE.
> - * 2. find the 1st mapped buffer,
> - * 3. check if the mapped buffer is both in the request range
> - * and a delayed buffer. If not, there is no delayed-extent,
> - * then return.
> - * 4. a delayed-extent is found, the extent will be collected.
> */
> - ext4_lblk_t end = 0;
> - pgoff_t last_offset;
> - pgoff_t offset;
> - pgoff_t index;
> - pgoff_t start_index = 0;
> - struct page **pages = NULL;
> - struct buffer_head *bh = NULL;
> - struct buffer_head *head = NULL;
> - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
> -
> - pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
> - if (pages == NULL)
> - return -ENOMEM;
> -
> - offset = logical >> PAGE_SHIFT;
> -repeat:
> - last_offset = offset;
> - head = NULL;
> - ret = find_get_pages_tag(inode->i_mapping, &offset,
> - PAGECACHE_TAG_DIRTY, nr_pages, pages);
> -
> - if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
> - /* First time, try to find a mapped buffer. */
> - if (ret == 0) {
> -out:
> - for (index = 0; index < ret; index++)
> - page_cache_release(pages[index]);
> - /* just a hole. */
> - kfree(pages);
> - return EXT_CONTINUE;
> - }
> - index = 0;
> -
> -next_page:
> - /* Try to find the 1st mapped buffer. */
> - end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
> - blksize_bits;
> - if (!page_has_buffers(pages[index]))
> - goto out;
> - head = page_buffers(pages[index]);
> - if (!head)
> - goto out;
> -
> - index++;
> - bh = head;
> - do {
> - if (end >= newex->ec_block +
> - newex->ec_len)
> - /* The buffer is out of
> - * the request range.
> - */
> - goto out;
> -
> - if (buffer_mapped(bh) &&
> - end >= newex->ec_block) {
> - start_index = index - 1;
> - /* get the 1st mapped buffer. */
> - goto found_mapped_buffer;
> - }
> -
> - bh = bh->b_this_page;
> - end++;
> - } while (bh != head);
> -
> - /* No mapped buffer in the range found in this page,
> - * We need to look up next page.
> - */
> - if (index >= ret) {
> - /* There is no page left, but we need to limit
> - * newex->ec_len.
> - */
> - newex->ec_len = end - newex->ec_block;
> - goto out;
> - }
> - goto next_page;
> - } else {
> - /*Find contiguous delayed buffers. */
> - if (ret > 0 && pages[0]->index == last_offset)
> - head = page_buffers(pages[0]);
> - bh = head;
> - index = 1;
> - start_index = 0;
> - }
> -
> -found_mapped_buffer:
> - if (bh != NULL && buffer_delay(bh)) {
> - /* 1st or contiguous delayed buffer found. */
> - if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
> - /*
> - * 1st delayed buffer found, record
> - * the start of extent.
> - */
> - flags |= FIEMAP_EXTENT_DELALLOC;
> - newex->ec_block = end;
> - logical = (__u64)end << blksize_bits;
> - }
> - /* Find contiguous delayed buffers. */
> - do {
> - if (!buffer_delay(bh))
> - goto found_delayed_extent;
> - bh = bh->b_this_page;
> - end++;
> - } while (bh != head);
> -
> - for (; index < ret; index++) {
> - if (!page_has_buffers(pages[index])) {
> - bh = NULL;
> - break;
> - }
> - head = page_buffers(pages[index]);
> - if (!head) {
> - bh = NULL;
> - break;
> - }
> -
> - if (pages[index]->index !=
> - pages[start_index]->index + index
> - - start_index) {
> - /* Blocks are not contiguous. */
> - bh = NULL;
> - break;
> - }
> - bh = head;
> - do {
> - if (!buffer_delay(bh))
> - /* Delayed-extent ends. */
> - goto found_delayed_extent;
> - bh = bh->b_this_page;
> - end++;
> - } while (bh != head);
> - }
> - } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
> - /* a hole found. */
> - goto out;
> -
> -found_delayed_extent:
> - newex->ec_len = min(end - newex->ec_block,
> - (ext4_lblk_t)EXT_INIT_MAX_LEN);
> - if (ret == nr_pages && bh != NULL &&
> - newex->ec_len < EXT_INIT_MAX_LEN &&
> - buffer_delay(bh)) {
> - /* Have not collected an extent and continue. */
> - for (index = 0; index < ret; index++)
> - page_cache_release(pages[index]);
> - goto repeat;
> + if (de.len == 0)
> + /* A hole found. */
> + return EXT_CONTINUE;
> +
> + if (de.start > newex->ec_block) {
> + /* A hole found. */
> + newex->ec_len = min(de.start - newex->ec_block,
> + newex->ec_len);
> + return EXT_CONTINUE;
> }
>
> - for (index = 0; index < ret; index++)
> - page_cache_release(pages[index]);
> - kfree(pages);
> + flags |= FIEMAP_EXTENT_DELALLOC;
> + newex->ec_len = de.start + de.len - newex->ec_block;
> }
>
> - physical = (__u64)newex->ec_start << blksize_bits;
> - length = (__u64)newex->ec_len << blksize_bits;
> -
> if (ex && ext4_ext_is_uninitialized(ex))
> flags |= FIEMAP_EXTENT_UNWRITTEN;
>
> if (next == EXT_MAX_BLOCKS)
> flags |= FIEMAP_EXTENT_LAST;
As we know now, it is not enough to check for next allocated extent,
we also have to check that there is not any delayed extents after
this one.
Just now I have noticed this lines at the beginning of the function:
next_del = ext4_de_find_extent(inode, &de);
...
next = min(next_del, next);
which takes care of that, so it seems that it will fix both problems,
setting FIEMAP_EXTENT_LAST for the last extent in the file and taking
care of delayed extents as well.
The patch seems fine.
Thanks!
-Lukas
>
> + blksize_bits = inode->i_sb->s_blocksize_bits;
> + logical = (__u64)newex->ec_block << blksize_bits;
> + physical = (__u64)newex->ec_start << blksize_bits;
> + length = (__u64)newex->ec_len << blksize_bits;
> +
> ret = fiemap_fill_next_extent(fieinfo, logical, physical,
> length, flags);
> if (ret < 0)
>
--