Keep the per-sb reservations list in an rbtree, instead of a linear
list. All new reservation requests (eg. for new files) can use an
rbtree search to find the existing reservation nearest the allocation
"goal". Searching for the next free space is still by linear search
within the block group.
Signed-off-by: Stephen Tweedie <[email protected]>
--- linux-2.6.9-rc1-mm4/fs/ext3/balloc.c.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/fs/ext3/balloc.c
@@ -110,16 +110,55 @@ error_out:
* we could easily switch to that without changing too much
* code.
*/
-static inline void rsv_window_dump(struct reserve_window *head, char *fn)
-{
- struct reserve_window *rsv;
-
+#if 0
+static void __rsv_window_dump(struct rb_root *root, int verbose,
+ const char *fn)
+{
+ struct rb_node *n;
+ struct reserve_window *rsv, *prev;
+ int bad;
+
+restart:
+ n = rb_first(root);
+ bad = 0;
+ prev = NULL;
+
printk("Block Allocation Reservation Windows Map (%s):\n", fn);
- list_for_each_entry(rsv, &head->rsv_list, rsv_list) {
- printk("reservation window 0x%p start: %d, end: %d\n",
- rsv, rsv->rsv_start, rsv->rsv_end);
+ while (n) {
+ rsv = list_entry(n, struct reserve_window, rsv_node);
+ if (verbose)
+ printk("reservation window 0x%p "
+ "start: %d, end: %d\n",
+ rsv, rsv->rsv_start, rsv->rsv_end);
+ if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
+ printk("Bad reservation %p (start >= end)\n",
+ rsv);
+ bad = 1;
+ }
+ if (prev && prev->rsv_end >= rsv->rsv_start) {
+ printk("Bad reservation %p (prev->end >= start)\n",
+ rsv);
+ bad = 1;
+ }
+ if (bad) {
+ if (!verbose) {
+ printk("Restarting reservation walk in verbose mode\n");
+ verbose = 1;
+ goto restart;
+ }
+ }
+ n = rb_next(n);
+ prev = rsv;
}
+ printk("Window map complete.\n");
+ if (bad)
+ BUG();
}
+#define rsv_window_dump(root, verbose) \
+ __rsv_window_dump((root), (verbose), __FUNCTION__)
+#else
+#define rsv_window_dump(root, verbose) do {} while (0)
+#endif
static int
goal_in_my_reservation(struct reserve_window *rsv, int goal,
@@ -140,20 +179,79 @@ goal_in_my_reservation(struct reserve_wi
return 1;
}
-static inline void rsv_window_add(struct reserve_window *rsv,
- struct reserve_window *prev)
+/*
+ * Find the reserved window which includes the goal, or the previous one
+ * if the goal is not in any window.
+ * Returns NULL if there are no windows or if all windows start after the goal.
+ */
+static struct reserve_window *search_reserve_window(struct rb_root *root,
+ unsigned long goal)
{
- /* insert the new reservation window after the head */
- list_add(&rsv->rsv_list, &prev->rsv_list);
+ struct rb_node *n = root->rb_node;
+ struct reserve_window *rsv;
+
+ if (!n)
+ return NULL;
+
+ while (n)
+ {
+ rsv = rb_entry(n, struct reserve_window, rsv_node);
+
+ if (goal < rsv->rsv_start)
+ n = n->rb_left;
+ else if (goal > rsv->rsv_end)
+ n = n->rb_right;
+ else
+ return rsv;
+ }
+ /*
+ * We've fallen off the end of the tree: the goal wasn't inside
+ * any particular node. OK, the previous node must be to one
+ * side of the interval containing the goal. If it's the RHS,
+ * we need to back up one.
+ */
+ if (rsv->rsv_start > goal) {
+ n = rb_prev(&rsv->rsv_node);
+ rsv = rb_entry(n, struct reserve_window, rsv_node);
+ }
+ return rsv;
}
-static inline void rsv_window_remove(struct reserve_window *rsv)
+void rsv_window_add(struct super_block *sb,
+ struct reserve_window *rsv)
+{
+ struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
+ struct rb_node *node = &rsv->rsv_node;
+ unsigned int start = rsv->rsv_start;
+
+ struct rb_node ** p = &root->rb_node;
+ struct rb_node * parent = NULL;
+ struct reserve_window *this;
+
+ while (*p)
+ {
+ parent = *p;
+ this = rb_entry(parent, struct reserve_window, rsv_node);
+
+ if (start < this->rsv_start)
+ p = &(*p)->rb_left;
+ else if (start > this->rsv_end)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+}
+
+static void rsv_window_remove(struct super_block *sb,
+ struct reserve_window *rsv)
{
rsv->rsv_start = 0;
rsv->rsv_end = 0;
- rsv->rsv_alloc_hit = 0;
- list_del(&rsv->rsv_list);
- INIT_LIST_HEAD(&rsv->rsv_list);
+ rsv->rsv_alloc_hit = 0;
+ rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}
static inline int rsv_is_empty(struct reserve_window *rsv)
@@ -170,7 +268,7 @@ void ext3_discard_reservation(struct ino
if (!rsv_is_empty(rsv)) {
spin_lock(rsv_lock);
- rsv_window_remove(rsv);
+ rsv_window_remove(inode->i_sb, rsv);
spin_unlock(rsv_lock);
}
}
@@ -583,18 +681,18 @@ fail_access:
* but we will shift to the place where start_block is,
* then start from there, when looking for a reservable space.
*
- * @fs_rsv_head: per-filesystem reservation list head.
- *
* @size: the target new reservation window size
+ *
* @group_first_block: the first block we consider to start
* the real search from
*
* @last_block:
- * the maxium block number that our goal reservable space
+ * the maximum block number that our goal reservable space
* could start from. This is normally the last block in this
* group. The search will end when we found the start of next
- * possiblereservable space is out of this boundary.
- * This could handle the cross bounday reservation window request.
+ * possible reservable space is out of this boundary.
+ * This could handle the cross boundary reservation window
+ * request.
*
* basically we search from the given range, rather than the whole
* reservation double linked list, (start_block, last_block)
@@ -604,29 +702,23 @@ fail_access:
* on succeed, it returns the reservation window to be appended to.
* failed, return NULL.
*/
-static inline
-struct reserve_window *find_next_reservable_window(
+static struct reserve_window *find_next_reservable_window(
struct reserve_window *search_head,
- struct reserve_window *fs_rsv_head,
unsigned long size, int *start_block,
int last_block)
{
- struct reserve_window *rsv;
+ struct rb_node *next;
+ struct reserve_window *rsv, *prev;
int cur;
- /* TODO:make the start of the reservation window byte-aligned */
+ /* TODO: make the start of the reservation window byte-aligned */
/* cur = *start_block & ~7;*/
cur = *start_block;
- rsv = list_entry(search_head->rsv_list.next,
- struct reserve_window, rsv_list);
- while (rsv != fs_rsv_head) {
- if (cur + size <= rsv->rsv_start) {
- /*
- * Found a reserveable space big enough. We could
- * have a reservation across the group boundary here
- */
- break;
- }
+ rsv = search_head;
+ if (!rsv)
+ return NULL;
+
+ while (1) {
if (cur <= rsv->rsv_end)
cur = rsv->rsv_end + 1;
@@ -639,14 +731,31 @@ struct reserve_window *find_next_reserva
* For now it will fail if we could not find the reservable
* space with expected-size (or more)...
*/
- rsv = list_entry(rsv->rsv_list.next,
- struct reserve_window, rsv_list);
if (cur > last_block)
return NULL; /* fail */
+
+ prev = rsv;
+ next = rb_next(&rsv->rsv_node);
+ rsv = list_entry(next, struct reserve_window, rsv_node);
+
+ /*
+ * Reached the last reservation, we can just append to the
+ * previous one.
+ */
+ if (!next)
+ break;
+
+ if (cur + size <= rsv->rsv_start) {
+ /*
+ * Found a reserveable space big enough. We could
+ * have a reservation across the group boundary here
+ */
+ break;
+ }
}
/*
* we come here either :
- * when we rearch to the end of the whole list,
+ * when we reach the end of the whole list,
* and there is empty reservable space after last entry in the list.
* append it to the end of the list.
*
@@ -655,7 +764,7 @@ struct reserve_window *find_next_reserva
* succeed.
*/
*start_block = cur;
- return list_entry(rsv->rsv_list.prev, struct reserve_window, rsv_list);
+ return prev;
}
/**
@@ -713,7 +822,7 @@ static int alloc_new_reservation(struct
int first_free_block;
int reservable_space_start;
struct reserve_window *prev_rsv;
- struct reserve_window *fs_rsv_head = &EXT3_SB(sb)->s_rsv_window_head;
+ struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
unsigned long size;
group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
@@ -767,7 +876,7 @@ static int alloc_new_reservation(struct
* we set our goal(start_block) and
* the list head for the search
*/
- search_head = fs_rsv_head;
+ search_head = search_reserve_window(fs_rsv_root, start_block);
}
/*
@@ -778,7 +887,7 @@ static int alloc_new_reservation(struct
* need to check the bitmap after we found a reservable window.
*/
retry:
- prev_rsv = find_next_reservable_window(search_head, fs_rsv_head, size,
+ prev_rsv = find_next_reservable_window(search_head, size,
&start_block, group_end_block);
if (prev_rsv == NULL)
goto failed;
@@ -833,11 +942,13 @@ found_rsv_window:
*/
if (my_rsv != prev_rsv) {
if (!rsv_is_empty(my_rsv))
- rsv_window_remove(my_rsv);
- rsv_window_add(my_rsv, prev_rsv);
+ rsv_window_remove(sb, my_rsv);
}
my_rsv->rsv_start = reservable_space_start;
my_rsv->rsv_end = my_rsv->rsv_start + size - 1;
+ if (my_rsv != prev_rsv) {
+ rsv_window_add(sb, my_rsv);
+ }
return 0; /* succeed */
failed:
return -1; /* failed */
--- linux-2.6.9-rc1-mm4/fs/ext3/ialloc.c.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/fs/ext3/ialloc.c
@@ -586,7 +586,6 @@ got:
ei->i_rsv_window.rsv_end = 0;
atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
ei->i_rsv_window.rsv_alloc_hit = 0;
- INIT_LIST_HEAD(&ei->i_rsv_window.rsv_list);
ei->i_block_group = group;
ext3_set_inode_flags(inode);
--- linux-2.6.9-rc1-mm4/fs/ext3/inode.c.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/fs/ext3/inode.c
@@ -2463,7 +2463,6 @@ void ext3_read_inode(struct inode * inod
ei->i_rsv_window.rsv_start = 0;
ei->i_rsv_window.rsv_end= 0;
atomic_set(&ei->i_rsv_window.rsv_goal_size, EXT3_DEFAULT_RESERVE_BLOCKS);
- INIT_LIST_HEAD(&ei->i_rsv_window.rsv_list);
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
--- linux-2.6.9-rc1-mm4/fs/ext3/super.c.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/fs/ext3/super.c
@@ -1488,12 +1488,17 @@ static int ext3_fill_super (struct super
spin_lock_init(&sbi->s_next_gen_lock);
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
- INIT_LIST_HEAD(&sbi->s_rsv_window_head.rsv_list);
+ sbi->s_rsv_window_root = RB_ROOT;
+ /* Add a single, static dummy reservation to the start of the
+ * reservation window list --- it gives us a placeholder for
+ * append-at-start-of-list which makes the allocation logic
+ * _much_ simpler. */
sbi->s_rsv_window_head.rsv_start = 0;
sbi->s_rsv_window_head.rsv_end = 0;
sbi->s_rsv_window_head.rsv_alloc_hit = 0;
atomic_set(&sbi->s_rsv_window_head.rsv_goal_size, 0);
-
+ rsv_window_add(sb, &sbi->s_rsv_window_head);
+
/*
* set up enough so that it can read an inode
*/
--- linux-2.6.9-rc1-mm4/include/linux/ext3_fs.h.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/include/linux/ext3_fs.h
@@ -720,6 +720,7 @@ extern struct ext3_group_desc * ext3_get
unsigned int block_group,
struct buffer_head ** bh);
extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
+extern void rsv_window_add(struct super_block *sb, struct reserve_window *rsv);
/* dir.c */
extern int ext3_check_dir_entry(const char *, struct inode *,
--- linux-2.6.9-rc1-mm4/include/linux/ext3_fs_i.h.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/include/linux/ext3_fs_i.h
@@ -17,11 +17,12 @@
#define _LINUX_EXT3_FS_I
#include <linux/rwsem.h>
+#include <linux/rbtree.h>
struct reserve_window {
- struct list_head rsv_list;
- __u32 rsv_start;
- __u32 rsv_end;
+ struct rb_node rsv_node;
+ __u32 rsv_start; /* First byte reserved */
+ __u32 rsv_end; /* Last byte reserved or 0 */
atomic_t rsv_goal_size;
__u32 rsv_alloc_hit;
};
--- linux-2.6.9-rc1-mm4/include/linux/ext3_fs_sb.h.=K0003=.orig
+++ linux-2.6.9-rc1-mm4/include/linux/ext3_fs_sb.h
@@ -22,6 +22,7 @@
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#endif
+#include <linux/rbtree.h>
/*
* third extended-fs super-block data in memory
@@ -59,10 +60,11 @@ struct ext3_sb_info {
struct percpu_counter s_dirs_counter;
struct blockgroup_lock s_blockgroup_lock;
- /* head of the per fs reservation window tree */
+ /* root of the per fs reservation window tree */
spinlock_t s_rsv_window_lock;
+ struct rb_root s_rsv_window_root;
struct reserve_window s_rsv_window_head;
-
+
/* Journaling */
struct inode * s_journal_inode;
struct journal_s * s_journal;
Stephen Tweedie <[email protected]> wrote:
>
> struct reserve_window {
> - struct list_head rsv_list;
> - __u32 rsv_start;
> - __u32 rsv_end;
> + struct rb_node rsv_node;
> + __u32 rsv_start; /* First byte reserved */
> + __u32 rsv_end; /* Last byte reserved or 0 */
> atomic_t rsv_goal_size;
> __u32 rsv_alloc_hit;
> };
Takes this structure up to 32 bytes on x86. That's a moderate amount of
inode bloat for something which is only used when an application currently
has the inode open for writing.
Have you given any thought to dynamic allocation of the above?
And if we were to do that, there are a few things which we could move out
of the ext3 in-core inode and into the above structure, such as
i_next_alloc_block and i_next_alloc_goal.
Does the reservation code work for directory growth, btw?
Hi,
On Tue, 2004-09-07 at 23:48, Andrew Morton wrote:
> Takes this structure up to 32 bytes on x86. That's a moderate amount of
> inode bloat for something which is only used when an application currently
> has the inode open for writing.
>
> Have you given any thought to dynamic allocation of the above?
No, none at all. <thinks>
We already have an ext3 function that's called on all opens ---
currently it checks nothing except O_LARGEFILE, but it's an easy and
obvious place where we could set up the may-write bits.
We'd have to take special care with directories, and potentially with
xattrs, but I don't see any great problems with doing it.
> And if we were to do that, there are a few things which we could move out
> of the ext3 in-core inode and into the above structure, such as
> i_next_alloc_block and i_next_alloc_goal.
Yep.
> Does the reservation code work for directory growth, btw?
Yes, it should be hidden inside the allocation code, and should work
straight out of the box for directories. But thinking about that,
there's at least one obvious difference that we need to worry about:
there's no "close" on directories to indicate when we've stopped
extending the dir, and can discard the reservation.
That shouldn't lead to spurious ENOSPC, but might lead to suboptimal
layout on full filesystems if we have reserved all the free space and
have to fall back to the old allocation mechanism.
--Stephen
On Wed, 2004-09-08 at 11:05, Stephen C. Tweedie wrote:
> Hi,
>
> On Tue, 2004-09-07 at 23:48, Andrew Morton wrote:
>
> > Takes this structure up to 32 bytes on x86. That's a moderate amount of
> > inode bloat for something which is only used when an application currently
> > has the inode open for writing.
> >
> > Have you given any thought to dynamic allocation of the above?
>
> No, none at all. <thinks>
>
> We already have an ext3 function that's called on all opens ---
> currently it checks nothing except O_LARGEFILE, but it's an easy and
> obvious place where we could set up the may-write bits.
>
Yes, we could add a check anout FMODE_WRITE and a check about
inode->i_writecount in ext3_open_file(), to just do reservation
structure allocation for the first writer of the inode.
> We'd have to take special care with directories, and potentially with
> xattrs, but I don't see any great problems with doing it.
>
> > And if we were to do that, there are a few things which we could move out
> > of the ext3 in-core inode and into the above structure, such as
> > i_next_alloc_block and i_next_alloc_goal.
>
> Yep.
>
> > Does the reservation code work for directory growth, btw?
>
> Yes, it should be hidden inside the allocation code, and should work
> straight out of the box for directories.
Yes, as you said, current reservation code could work for directories.
But we did not do reservation for directory right now because we don't
really sure whether it is worth doing it.
One thing is, if later we decide to make reservation per-fd instead of
per-inode, we probably need fd->private_data to store the per-fd
reservation structure. But this field is already been used by directory
for htree structures.
Found that in the current reservation code, if there is a need to
allocate a new window to replace the old window to satisfy the new
allocation goal, no matter where is the allocation goal, the new window
will be allocated _after_ the old window.
So, if the allocation goal block is smaller than the start block of the
old reservation window, the goal block will _not_ be satisfied. This
happens due to we adjusted the goal block to the block next to the end
of the old reservation window, and then start from the old reservation,
we search for a space to make a new reservation window. This is not our
intention, as we always want to grant the goal block.
Though the bug present with/without recent red-black tree changes, here
is the fix based on red-black tree change: we will do a red-black tree
search from the old reservation window , to locate a window nearest the
allocation goal.
---
linux-2.6.9-rc1-mm5-ming/fs/ext3/balloc.c | 26 ++++++--------------------
1 files changed, 6 insertions(+), 20 deletions(-)
diff -puN fs/ext3/balloc.c~ext3_reservation_window_allocation_fix fs/ext3/balloc.c
--- linux-2.6.9-rc1-mm5/fs/ext3/balloc.c~ext3_reservation_window_allocation_fix 2004-09-14 00:35:50.662385224 -0700
+++ linux-2.6.9-rc1-mm5-ming/fs/ext3/balloc.c 2004-09-14 00:35:50.670384008 -0700
@@ -184,10 +184,9 @@ goal_in_my_reservation(struct reserve_wi
* if the goal is not in any window.
* Returns NULL if there are no windows or if all windows start after the goal.
*/
-static struct reserve_window_node *search_reserve_window(struct rb_root *root,
+static struct reserve_window_node *search_reserve_window(struct rb_node *n,
unsigned long goal)
{
- struct rb_node *n = root->rb_node;
struct reserve_window_node *rsv;
if (!n)
@@ -767,19 +766,11 @@ static struct reserve_window_node *find_
/**
* alloc_new_reservation()--allocate a new reservation window
- * if there is an existing reservation, discard it first
- * then allocate the new one from there
- * otherwise allocate the new reservation from the given
- * start block, or the beginning of the group, if a goal
- * is not given.
*
* To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group).
- *
- * If we have a old reservation, the search goal is the end of
- * last reservation. If we do not have a old reservation, then we
- * start from a given goal, or the first block of the group, if
- * the goal is not given.
+ * reservation list (the list that inside the group). We try to
+ * allocate a new reservation window near the allocation goal,
+ * or the beginning of the group, if there is no goal.
*
* We first find a reservable space after the goal, then from
* there, we check the bitmap for the first free block after
@@ -801,8 +792,6 @@ static struct reserve_window_node *find_
*
* @goal: The goal (group-relative). It is where the search for a
* free reservable space should start from.
- * if we have a old reservation, start_block is the end of
- * old reservation. Otherwise,
* if we have a goal(goal >0 ), then start from there,
* no goal(goal = -1), we start from the first block
* of the group.
@@ -852,10 +841,7 @@ static int alloc_new_reservation(struct
(my_rsv->rsv_end > group_end_block))
return -1;
- /* remember where we are before we discard the old one */
- if (my_rsv->rsv_end + 1 > start_block)
- start_block = my_rsv->rsv_end + 1;
- search_head = my_rsv;
+ search_head = search_reserve_window(&my_rsv->rsv_node, start_block);
if ((atomic_read(&my_rsv->rsv_alloc_hit) >
(my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
/*
@@ -875,7 +861,7 @@ static int alloc_new_reservation(struct
* we set our goal(start_block) and
* the list head for the search
*/
- search_head = search_reserve_window(fs_rsv_root, start_block);
+ search_head = search_reserve_window(fs_rsv_root->rb_node, start_block);
}
/*
_
Two ext3 reservation bug fixes.
In current reservation code (with or without the recent red-black tree
changes), when an existing old reservation is re-used as the new window
without adjusting it's position in the current per-filesystem tree, we
just need to update window's start and end block value, without any
remove/insert business. But we missed the the allocation hit ratio bit,
it is not reset.
Since this is the magic number used to determine whether the window size
should be doubled next time, this will cause the window size increase
incorrectly or too quickly.
Patch applies to 2.6.9-rc1-mm5, tested.
---
linux-2.6.9-rc1-mm5-ming/fs/ext3/balloc.c | 1 +
1 files changed, 1 insertion(+)
diff -puN fs/ext3/balloc.c~ext3_reservation_window_hit_ratio_fix fs/ext3/balloc.c
--- linux-2.6.9-rc1-mm5/fs/ext3/balloc.c~ext3_reservation_window_hit_ratio_fix 2004-09-14 00:30:41.667359608 -0700
+++ linux-2.6.9-rc1-mm5-ming/fs/ext3/balloc.c 2004-09-14 00:34:19.170294136 -0700
@@ -945,6 +945,7 @@ found_rsv_window:
}
my_rsv->rsv_start = reservable_space_start;
my_rsv->rsv_end = my_rsv->rsv_start + size - 1;
+ atomic_set(&my_rsv->rsv_alloc_hit, 0);
if (my_rsv != prev_rsv) {
rsv_window_add(sb, my_rsv);
}
_