2007-07-05 18:04:20

by Aneesh Kumar K.V

Subject: [PATCH] mballoc rebased on top of ext4-patch-queue.

The patch set is a forward port of the changes found at

ftp://ftp.clusterfs.com/pub/people/alex/2.6.19-rc6/

on top of 2.6.22-rc6 (ext4-patch-queue)

The only visible change I made is to enable the MBALLOC option by default so
that we get wider testing.

Test-booted and did minimal testing on an ext4 file system via QEMU.


2007-07-05 18:04:20

by Aneesh Kumar K.V

Subject: [PATCH 1/4] Add a new operation to struct super_operations - sync_inodes

From: Hans Reiser <[email protected]>

Reiser4 flushes dirty pages on the basis of atoms, not of inodes. sync_sb_inodes
used to call the address-space flushing method (writepages) for every dirty
inode, which for reiser4 meant committing atoms unnecessarily often and turned
into a substantial slowdown. Adding this method helped to fix that problem.

Also, make generic_sync_sb_inodes take inode_lock itself; this helps reiser4
get rid of some oddities.

sync_sb_inodes is always called like:
spin_lock(&inode_lock);
sync_sb_inodes(sb, wbc);
spin_unlock(&inode_lock);
This patch moves spin_lock/spin_unlock down to sync_sb_inodes.
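
With the lock taken internally, a filesystem that wants to drive its own
writeback ordering can supply the new hook and still fall back to the generic
per-inode walk. A minimal sketch of such a user (the myfs_* names, including
the myfs_flush_atoms helper, are hypothetical and not part of this patch):

static void myfs_sync_inodes(struct super_block *sb,
			     struct writeback_control *wbc)
{
	/* flush in filesystem-specific order first (hypothetical helper) */
	myfs_flush_atoms(sb, wbc);

	/* ... then let the generic per-inode walk finish up; it now
	 * takes inode_lock internally, so no locking is needed here */
	generic_sync_sb_inodes(sb, wbc);
}

static const struct super_operations myfs_sops = {
	/* ... other operations ... */
	.sync_inodes	= myfs_sync_inodes,
};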

Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/fs-writeback.c | 26 ++++++++++++++++----------
include/linux/fs.h | 3 +++
2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a4b142a..cdcff8c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -296,8 +296,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
* that it can be located for waiting on in __writeback_single_inode().
*
- * Called under inode_lock.
- *
* If `bdi' is non-zero then we're being asked to writeback a specific queue.
* This function assumes that the blockdev superblock's inodes are backed by
* a variety of queues, so all inodes are searched. For other superblocks,
@@ -313,11 +311,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* on the writer throttling path, and we get decent balancing between many
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{
const unsigned long start = jiffies; /* livelock avoidance */

+ spin_lock(&inode_lock);
+
if (!wbc->for_kupdate || list_empty(&sb->s_io))
list_splice_init(&sb->s_dirty, &sb->s_io);

@@ -397,8 +397,19 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0)
break;
}
+ spin_unlock(&inode_lock);
return; /* Leave any unwritten inodes on s_io */
}
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+ if (sb->s_op->sync_inodes)
+ sb->s_op->sync_inodes(sb, wbc);
+ else
+ generic_sync_sb_inodes(sb, wbc);
+}

/*
* Start writeback of dirty pagecache data against all unlocked inodes.
@@ -439,11 +450,8 @@ restart:
* be unmounted by the time it is released.
*/
if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root) {
- spin_lock(&inode_lock);
+ if (sb->s_root)
sync_sb_inodes(sb, wbc);
- spin_unlock(&inode_lock);
- }
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
@@ -481,9 +489,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
nr_dirty + nr_unstable;
wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
- spin_lock(&inode_lock);
sync_sb_inodes(sb, &wbc);
- spin_unlock(&inode_lock);
}

/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c90a212..12546d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1193,6 +1193,8 @@ struct super_operations {
void (*clear_inode) (struct inode *);
void (*umount_begin) (struct vfsmount *, int);

+ void (*sync_inodes) (struct super_block *sb,
+ struct writeback_control *wbc);
int (*show_options)(struct seq_file *, struct vfsmount *);
int (*show_stats)(struct seq_file *, struct vfsmount *);
#ifdef CONFIG_QUOTA
@@ -1644,6 +1646,7 @@ extern int invalidate_inode_pages2(struct address_space *mapping);
extern int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
extern int write_inode_now(struct inode *, int);
+extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
--
1.5.3.rc0.30.g114fd-dirty

2007-07-05 18:05:15

by Aneesh Kumar K.V

Subject: [PATCH 2/4] Add support for locality groups.

From: Alex Tomas <[email protected]>

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/Makefile | 2 +-
fs/ext4/lg.c | 576 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/super.c | 5 +
fs/fs-writeback.c | 8 +-
include/linux/ext4_fs.h | 37 +++
include/linux/ext4_fs_i.h | 2 +
include/linux/ext4_fs_sb.h | 6 +
7 files changed, 630 insertions(+), 6 deletions(-)
create mode 100644 fs/ext4/lg.c

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 7b24c73..f3d8ba7 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o

ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o writeback.o
+ ext4_jbd2.o writeback.o lg.o

ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/lg.c b/fs/ext4/lg.c
new file mode 100644
index 0000000..7fcdfe1
--- /dev/null
+++ b/fs/ext4/lg.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2006, Cluster File Systems, Inc, [email protected]
+ * Written by Alex Tomas <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * locality groups
+ *
+ */
+
+/*
+ * TODO:
+ * - too many tricks
+ * - mmap'ed files support (we need to link them to some group)
+ * - too silly grouping policy
+ * - free non-used groups after some timeout
+ * - anonymous group for non-regular inodes
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_fs_i.h>
+#include <linux/ext4_fs_sb.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/writeback.h>
+
+#ifndef TestClearPageChecked
+#define TestClearPageChecked(page) test_and_clear_bit(PG_checked, &(page)->flags)
+#endif
+#ifndef TestSetPageChecked
+#define TestSetPageChecked(page) test_and_set_bit(PG_checked, &(page)->flags)
+#endif
+
+
+extern struct super_block *blockdev_superblock;
+static inline int sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
+
+extern int __writeback_single_inode(struct inode *, struct writeback_control *);
+
+struct ext4_locality_group *ext4_lg_find_group(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg = NULL;
+ struct list_head *cur;
+
+ rcu_read_lock();
+ list_for_each_rcu(cur, &sbi->s_locality_groups) {
+ lg = list_entry(cur, struct ext4_locality_group, lg_hash);
+ if (lg->lg_pgid == current->signal->pgrp) {
+ spin_lock(&lg->lg_lock);
+ if (lg->lg_deleted == 0) {
+ atomic_inc(&lg->lg_count);
+ spin_unlock(&lg->lg_lock);
+ break;
+ }
+ spin_unlock(&lg->lg_lock);
+ }
+ lg = NULL;
+ }
+ rcu_read_unlock();
+ return lg;
+}
+
+void ext4_lg_put_group(struct ext4_locality_group *lg)
+{
+ atomic_dec(&lg->lg_count);
+}
+
+struct ext4_locality_group *ext4_lg_new_group(struct super_block *sb)
+{
+ struct ext4_locality_group *lg;
+
+ lg = kmalloc(sizeof(struct ext4_locality_group), GFP_NOFS);
+ if (lg == NULL)
+ return NULL;
+
+ lg->lg_pgid = current->signal->pgrp;
+ lg->lg_sid = current->signal->session;
+ spin_lock_init(&lg->lg_lock);
+ lg->lg_deleted = 0;
+ lg->lg_flags = 0;
+ atomic_set(&lg->lg_count, 1);
+ atomic_set(&lg->lg_inodes_nr, 0);
+ INIT_LIST_HEAD(&lg->lg_list);
+ INIT_LIST_HEAD(&lg->lg_inodes);
+ INIT_LIST_HEAD(&lg->lg_dirty);
+ INIT_LIST_HEAD(&lg->lg_io);
+ atomic_set(&lg->lg_dirty_pages, 0);
+ atomic_set(&lg->lg_nonallocated, 0);
+
+ return lg;
+}
+
+struct ext4_locality_group *
+ext4_lg_assign_to_group_nolock(struct inode *inode, struct ext4_locality_group *lg)
+{
+ /*
+ * XXX locking here?
+ */
+ if (EXT4_I(inode)->i_locality_group == NULL) {
+ EXT4_I(inode)->i_locality_group = lg;
+ list_add(&EXT4_I(inode)->i_lg_list, &lg->lg_inodes);
+ atomic_inc(&lg->lg_inodes_nr);
+ } else {
+ printk("somebody has already set lg %p (our %p) to inode %lu(%p)\n",
+ EXT4_I(inode)->i_locality_group, lg, inode->i_ino, inode);
+ ext4_lg_put_group(lg);
+ lg = EXT4_I(inode)->i_locality_group;
+ }
+ return lg;
+}
+
+struct ext4_locality_group *
+ext4_lg_assign_to_group(struct inode *inode, struct ext4_locality_group *lg)
+{
+ spin_lock(&inode_lock);
+ ext4_lg_assign_to_group_nolock(inode, lg);
+ spin_unlock(&inode_lock);
+ return lg;
+
+}
+
+struct ext4_locality_group *ext4_lg_find_or_allocate_group(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_locality_group *lg, *olg;
+
+ lg = ext4_lg_find_group(inode->i_sb);
+ if (lg == NULL) {
+ lg = ext4_lg_new_group(inode->i_sb);
+ if (lg == NULL)
+ return NULL;
+
+ spin_lock(&sbi->s_locality_lock);
+ olg = ext4_lg_find_group(inode->i_sb);
+ if (olg == NULL) {
+ list_add_rcu(&lg->lg_hash, &sbi->s_locality_groups);
+ } else {
+ kfree(lg);
+ lg = olg;
+ }
+ spin_unlock(&sbi->s_locality_lock);
+ }
+
+ lg = ext4_lg_assign_to_group(inode, lg);
+ return lg;
+}
+
+/*
+ * every dirty page should be counted
+ */
+void ext4_lg_page_enter_inode(struct inode *inode,
+ struct page *page, int allocated)
+{
+ struct ext4_locality_group *lg;
+
+ lg = EXT4_I(inode)->i_locality_group;
+ if (lg == NULL) {
+ lg = ext4_lg_find_or_allocate_group(inode);
+ if (lg == NULL)
+ return;
+ }
+
+ if (!TestSetPageChecked(page)) {
+ atomic_inc(&lg->lg_dirty_pages);
+ if (!allocated)
+ atomic_inc(&lg->lg_nonallocated);
+ }
+}
+
+
+/*
+ * every page going clean should be uncounted
+ */
+void ext4_lg_page_leave_inode(struct inode *inode,
+ struct page *page, int allocated)
+{
+ struct ext4_locality_group *lg;
+
+ lg = EXT4_I(inode)->i_locality_group;
+ if (lg == NULL) {
+ if (S_ISREG(inode->i_mode))
+ printk("regular file %lu/%u with no locality group?!\n",
+ inode->i_ino, inode->i_generation);
+ return;
+ }
+
+ if (!TestClearPageChecked(page))
+ return;
+
+ atomic_dec(&lg->lg_dirty_pages);
+ if (!allocated)
+ atomic_dec(&lg->lg_nonallocated);
+}
+
+/*
+ * Inode leave group
+ */
+void ext4_lg_inode_leave_group(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_locality_group *lg;
+
+ if (inode->i_nlink != 0 && S_ISREG(inode->i_mode)) {
+ BUG_ON(mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY));
+ }
+
+ spin_lock(&inode_lock);
+ lg = ei->i_locality_group;
+ ei->i_locality_group = NULL;
+ spin_unlock(&inode_lock);
+
+ if (lg != NULL) {
+ spin_lock(&lg->lg_lock);
+ list_del(&ei->i_lg_list);
+ spin_unlock(&lg->lg_lock);
+ atomic_dec(&lg->lg_inodes_nr);
+ ext4_lg_put_group(lg);
+ }
+}
+
+#define EXT4_LG_DIRTY 0
+
+#define EXT4_CONTINUE_WRITEBACK 1
+#define EXT4_STOP_WRITEBACK 2
+
+static char *__sync_modes[] = { "NONE", "ALL", "HOLD" };
+
+/*
+ * The function syncs a single group like generic_sync_sb_inodes() does.
+ * Returns:
+ * EXT4_CONTINUE_WRITEBACK - continue syncing with the next group
+ * EXT4_STOP_WRITEBACK - stop writeback altogether
+ */
+int ext4_lg_sync_single_group(struct super_block *sb,
+ struct ext4_locality_group *lg,
+ struct writeback_control *wbc,
+ unsigned long start)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int nr_to_write = wbc->nr_to_write;
+ int dirty_pages, nonallocated;
+ int rc, code = 0;
+
+ dirty_pages = atomic_read(&lg->lg_dirty_pages);
+ nonallocated = atomic_read(&lg->lg_nonallocated);
+
+ rc = EXT4_CONTINUE_WRITEBACK;
+
+ spin_lock(&inode_lock);
+
+ if (!wbc->for_kupdate || list_empty(&lg->lg_io))
+ list_splice_init(&lg->lg_dirty, &lg->lg_io);
+
+ while (!list_empty(&lg->lg_io)) {
+ struct inode *inode = list_entry(lg->lg_io.prev,
+ struct inode, i_list);
+ struct address_space *mapping = inode->i_mapping;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ long pages_skipped;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ /* underlying device is congested
+ * break all writeback immediately */
+ wbc->encountered_congestion = 1;
+
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* same for the locality group */
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move(&lg->lg_list, &sbi->s_locality_io);
+
+ /* signal to the caller */
+ rc = EXT4_STOP_WRITEBACK;
+ code = 1;
+ break;
+ }
+
+ if (wbc->bdi && bdi != wbc->bdi) {
+ printk("wbc->bdi (%p) != bdi (%p)\n", wbc->bdi, bdi);
+ list_move(&inode->i_list, &inode_in_use);
+ rc = EXT4_CONTINUE_WRITEBACK;
+ code = 2;
+ break;
+ }
+
+ /* Was this inode dirtied after sync_sb_inodes was called? */
+ if (time_after(inode->dirtied_when, start)) {
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* continue with next locality group
+ * move this one to the dirty tail */
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move_tail(&lg->lg_list, &sbi->s_locality_dirty);
+
+ rc = EXT4_CONTINUE_WRITEBACK;
+ code = 3;
+ break;
+ }
+
+ /* Was this inode dirtied too recently? */
+ if (wbc->older_than_this && time_after(inode->dirtied_when,
+ *wbc->older_than_this)) {
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* continue with next locality group
+ * move this one to the dirty tail */
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move_tail(&lg->lg_list, &sbi->s_locality_dirty);
+
+ rc = EXT4_CONTINUE_WRITEBACK;
+ code = 4;
+ break;
+ }
+
+ /* Is another pdflush already flushing this queue? */
+ if (current_is_pdflush() && !writeback_acquire(bdi)) {
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* same for the locality group */
+ list_move(&lg->lg_list, &sbi->s_locality_io);
+
+ rc = EXT4_STOP_WRITEBACK;
+ code = 5;
+ break;
+ }
+
+ BUG_ON(inode->i_state & I_FREEING);
+ __iget(inode);
+ pages_skipped = wbc->pages_skipped;
+ __writeback_single_inode(inode, wbc);
+ if (wbc->sync_mode == WB_SYNC_HOLD) {
+ inode->dirtied_when = jiffies;
+ list_move(&inode->i_list, &lg->lg_dirty);
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move(&lg->lg_list, &sbi->s_locality_dirty);
+ }
+ if (current_is_pdflush())
+ writeback_release(bdi);
+ if (wbc->pages_skipped != pages_skipped) {
+ /*
+ * writeback is not making progress due to locked
+ * buffers. Skip this inode for now.
+ */
+ list_move(&inode->i_list, &lg->lg_dirty);
+
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move(&lg->lg_list, &sbi->s_locality_dirty);
+ }
+ spin_unlock(&inode_lock);
+ iput(inode);
+ cond_resched();
+ spin_lock(&inode_lock);
+ if (wbc->nr_to_write <= 0) {
+ rc = EXT4_STOP_WRITEBACK;
+ code = 6;
+ break;
+ }
+ }
+
+ spin_unlock(&inode_lock);
+
+ if (0 && nr_to_write - wbc->nr_to_write) {
+ printk("#%u: %s/%lu/%s%s%s%s%s%s M: %lu/%lu/%lu "
+ "LG:%p/%u/%u[%u/%u] wrote %lu/%d\n",
+ current->pid, __sync_modes[wbc->sync_mode],
+ wbc->nr_to_write,
+ wbc->nonblocking ? "N" : "",
+ wbc->encountered_congestion ? "C" : "",
+ wbc->for_kupdate ? "U" : "",
+ wbc->for_reclaim ? "R" : "",
+ wbc->for_writepages ? "W" : "",
+ wbc->range_cyclic ? "I" : "",
+ global_page_state(NR_FILE_DIRTY),
+ global_page_state(NR_UNSTABLE_NFS),
+ global_page_state(NR_WRITEBACK),
+ lg, atomic_read(&lg->lg_count), lg->lg_pgid,
+ dirty_pages, nonallocated,
+ nr_to_write - wbc->nr_to_write, code);
+ }
+
+ return rc;
+}
+
+/*
+ * the core of inode syncer:
+ * - loop over locality groups
+ * - maintain them in order to avoid starvation
+ */
+void ext4_lg_sync_groups(struct super_block *sb, struct writeback_control *wbc)
+{
+ const unsigned long start = jiffies; /* livelock avoidance */
+ struct ext4_locality_group *lg = NULL, *prev = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int rc;
+
+ spin_lock(&inode_lock);
+
+ /*printk("#%u: mode %s, nr2wr %lu, %s%s%s%s%s%s M: %lu/%lu/%lu "
+ "LGs: %sdirty %sio\n", current->pid,
+ __sync_modes[wbc->sync_mode], wbc->nr_to_write,
+ wbc->nonblocking ? "nonblock " : "",
+ wbc->encountered_congestion ? "congested " : "",
+ wbc->for_kupdate ? "kupdate " : "",
+ wbc->for_reclaim ? "reclaim " : "",
+ wbc->for_writepages ? "writepages " : "",
+ wbc->range_cyclic ? "cyclic " : "",
+ global_page_state(NR_FILE_DIRTY),
+ global_page_state(NR_UNSTABLE_NFS),
+ global_page_state(NR_WRITEBACK),
+ list_empty(&sbi->s_locality_dirty) ? "-" : "+",
+ list_empty(&sbi->s_locality_io) ? "-" : "+");*/
+
+ if (!wbc->for_kupdate || list_empty(&sbi->s_locality_io))
+ list_splice_init(&sbi->s_locality_dirty, &sbi->s_locality_io);
+
+ while (!list_empty(&sbi->s_locality_io)) {
+
+ /* we should never handle the same group twice in a row */
+ WARN_ON(prev && prev == lg);
+ prev = lg;
+
+ lg = list_entry(sbi->s_locality_io.prev,
+ struct ext4_locality_group, lg_list);
+
+ /* protect locality group */
+ atomic_inc(&lg->lg_count);
+
+ /* to avoid two concurrent threads flushing same group */
+ list_del_init(&lg->lg_list);
+
+ spin_unlock(&inode_lock);
+
+ clear_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ rc = ext4_lg_sync_single_group(sb, lg, wbc, start);
+
+ spin_lock(&inode_lock);
+ ext4_lg_put_group(lg);
+
+ if (rc == EXT4_STOP_WRITEBACK)
+ break;
+ }
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * entry function for inode syncing
+ * its responsibility is to sort all inodes into their locality groups
+ */
+void ext4_lg_sync_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg;
+
+ /* refill pending groups from s_dirty */
+ spin_lock(&inode_lock);
+ while (!list_empty(&sb->s_dirty)) {
+ struct inode *inode = list_entry(sb->s_dirty.prev,
+ struct inode, i_list);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ lg = ei->i_locality_group;
+ if (lg == NULL) {
+ if (S_ISDIR(inode->i_mode) || i_size_read(inode) == 0) {
+ if (atomic_read(&inode->i_count)) {
+ /*
+ * The inode is clean, inuse
+ */
+ list_move(&inode->i_list, &inode_in_use);
+ } else {
+ /*
+ * The inode is clean, unused
+ */
+ list_move(&inode->i_list, &inode_unused);
+ }
+ continue;
+ }
+ /* XXX: atime changed ? or mmap?
+ * anyway, assign the inode to anonymous group */
+ lg = sbi->s_locality_anon;
+ atomic_inc(&lg->lg_count);
+ lg = ext4_lg_assign_to_group_nolock(inode, lg);
+ }
+
+ /* move inode in proper locality group's dirty list */
+ spin_lock(&lg->lg_lock);
+ list_move_tail(&inode->i_list, &lg->lg_dirty);
+ spin_unlock(&lg->lg_lock);
+
+ if (!test_and_set_bit(EXT4_LG_DIRTY, &lg->lg_flags))
+ list_move(&lg->lg_list, &sbi->s_locality_dirty);
+ }
+ spin_unlock(&inode_lock);
+
+ ext4_lg_sync_groups(sb, wbc);
+}
+
+void ext4_lg_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg;
+
+ sb->s_flags |= 2048; /* XXX: i'll fix this, i promise */
+ spin_lock_init(&sbi->s_locality_lock);
+ INIT_LIST_HEAD(&sbi->s_locality_groups);
+ INIT_LIST_HEAD(&sbi->s_locality_dirty);
+ INIT_LIST_HEAD(&sbi->s_locality_io);
+
+ lg = ext4_lg_new_group(sb);
+ if (lg != NULL)
+ sbi->s_locality_anon = lg;
+}
+
+void ext4_lg_release(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg;
+ struct list_head *cur, *tmp;
+
+ list_for_each_safe_rcu(cur, tmp, &sbi->s_locality_groups) {
+ lg = list_entry(cur, struct ext4_locality_group, lg_hash);
+ if (atomic_read(&lg->lg_count))
+ printk("LG %p/%d (pgid %u), %u inodes, dirty %d, non-allocated %d\n",
+ lg, atomic_read(&lg->lg_count),
+ lg->lg_pgid, atomic_read(&lg->lg_inodes_nr),
+ atomic_read(&lg->lg_dirty_pages),
+ atomic_read(&lg->lg_nonallocated));
+ list_del(&lg->lg_hash);
+ kfree(lg);
+ }
+ lg = sbi->s_locality_anon;
+ if (lg) {
+ if (atomic_read(&lg->lg_count) > 1)
+ printk("LG anon/%d, %u inodes, dirty %d, non-allocated %d\n",
+ atomic_read(&lg->lg_count),
+ atomic_read(&lg->lg_inodes_nr),
+ atomic_read(&lg->lg_dirty_pages),
+ atomic_read(&lg->lg_nonallocated));
+ kfree(lg);
+ }
+}
+
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5bd2762..efc9270 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -452,6 +452,7 @@ static void ext4_put_super (struct super_block * sb)
mark_buffer_dirty(sbi->s_sbh);
ext4_commit_super(sb, es, 1);
}
+ ext4_lg_release(sb);

for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -501,6 +502,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
+ ei->i_locality_group = NULL;
#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
ei->i_acl = EXT4_ACL_NOT_CACHED;
ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -571,6 +573,7 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ ext4_lg_inode_leave_group(inode);
}

static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -713,6 +716,7 @@ static const struct super_operations ext4_sops = {
.remount_fs = ext4_remount,
.clear_inode = ext4_clear_inode,
.show_options = ext4_show_options,
+ .sync_inodes = ext4_lg_sync_inodes,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
@@ -1960,6 +1964,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");

+ ext4_lg_init(sb);
ext4_ext_init(sb);
ext4_reserve_init(sb);
ext4_wb_init(sb);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cdcff8c..7806778 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -149,8 +149,7 @@ static int write_inode(struct inode *inode, int sync)
*
* Called under inode_lock.
*/
-static int
-__sync_single_inode(struct inode *inode, struct writeback_control *wbc)
+int __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
{
unsigned dirty;
struct address_space *mapping = inode->i_mapping;
@@ -240,8 +239,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
* caller has ref on the inode (either via __iget or via syscall against an fd)
* or the inode has I_WILL_FREE set (via generic_forget_inode)
*/
-static int
-__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
wait_queue_head_t *wqh;

@@ -440,7 +438,7 @@ writeback_inodes(struct writeback_control *wbc)
restart:
sb = sb_entry(super_blocks.prev);
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
- if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
+ if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io) || (sb->s_flags & 2048)) {
/* we're making our own get_super here */
sb->s_count++;
spin_unlock(&sb_lock);
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index 138fcbc..cd477e2 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -824,6 +824,34 @@ struct dx_hash_info


/*
+ * Locality group:
+ * we try to group all related changes together
+ * so that writeback can flush/allocate them together as well
+ */
+struct ext4_locality_group {
+ int lg_parent;
+ int lg_pgid;
+ int lg_sid;
+ struct list_head lg_hash;
+ spinlock_t lg_lock;
+ int lg_deleted;
+ atomic_t lg_count;
+ atomic_t lg_inodes_nr;
+
+ /* writeback scheduling state */
+ unsigned long lg_flags;
+ struct list_head lg_list;
+
+ /* inode lists for the group */
+ struct list_head lg_inodes; /* inodes in the group */
+ struct list_head lg_dirty; /* dirty inodes from s_dirty */
+ struct list_head lg_io; /* inodes scheduled for flush */
+
+ atomic_t lg_dirty_pages; /* pages to write */
+ atomic_t lg_nonallocated;/* non-allocated pages */
+};
+
+/*
* Describe an inode's exact location on disk and in memory
*/
struct ext4_iloc
@@ -881,6 +909,15 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
# define ATTRIB_NORET __attribute__((noreturn))
# define NORET_AND noreturn,

+/* lg.c */
+extern void ext4_lg_init(struct super_block *sb);
+extern void ext4_lg_release(struct super_block *sb);
+extern void ext4_lg_inode_leave_group(struct inode *inode);
+extern void ext4_lg_page_enter_inode(struct inode *inode, struct page *page, int allocated);
+extern void ext4_lg_page_leave_inode(struct inode *inode, struct page *page, int allocated);
+extern void ext4_lg_sync_inodes(struct super_block *, struct writeback_control *);
+
+
/* balloc.c */
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h
index 9dea1f7..6d9f9db 100644
--- a/include/linux/ext4_fs_i.h
+++ b/include/linux/ext4_fs_i.h
@@ -150,6 +150,8 @@ struct ext4_inode_info {
*/
struct mutex truncate_mutex;
struct inode vfs_inode;
+ struct list_head i_lg_list;
+ struct ext4_locality_group *i_locality_group;

unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 9768b32..08b0645 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -86,6 +86,12 @@ struct ext4_sb_info {
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */

+ struct ext4_locality_group *s_locality_anon;
+ struct list_head s_locality_dirty;
+ struct list_head s_locality_io;
+ struct list_head s_locality_groups;
+ spinlock_t s_locality_lock;
+
#ifdef EXTENTS_STATS
/* ext4 extents stats */
unsigned long s_ext_min;
--
1.5.3.rc0.30.g114fd-dirty

2007-07-05 18:05:36

by Aneesh Kumar K.V

Subject: [PATCH 3/4] Add new functions for searching the extent tree.

From: Alex Tomas <[email protected]>

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/extents.c | 142 +++++++++++++++++++++++++++++++++++++++
include/linux/ext4_fs_extents.h | 2 +
2 files changed, 144 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 16df6e0..4e31439 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1011,6 +1011,148 @@ out:
}

/*
+ * search the closest allocated block to the left for *logical
+ * and returns it at @logical + its physical address at @phys
+ * if *logical is the smallest allocated block, the function
+ * returns 0 at @phys
+ * return value contains 0 (success) or error code
+ */
+int
+ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t *logical, ext4_fsblk_t *phys)
+{
+ struct ext4_extent_idx *ix;
+ struct ext4_extent *ex;
+ int depth;
+
+ BUG_ON(path == NULL);
+ depth = path->p_depth;
+ *phys = 0;
+
+ if (depth == 0 && path->p_ext == NULL)
+ return 0;
+
+ /* usually extent in the path covers blocks smaller
+ * than *logical, but it can be that extent is the
+ * first one in the file */
+
+ ex = path[depth].p_ext;
+ if (*logical < le32_to_cpu(ex->ee_block)) {
+ BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+ while (--depth >= 0) {
+ ix = path[depth].p_idx;
+ BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+ }
+ return 0;
+ }
+
+ BUG_ON(*logical < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len));
+
+ *logical = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1;
+ *phys = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - 1;
+ return 0;
+}
+
+/*
+ * search the closest allocated block to the right for *logical
+ * and returns it at @logical + its physical address at @phys
+ * if there is no allocated block to the right of *logical,
+ * the function returns 0 at @phys
+ * return value contains 0 (success) or error code
+ */
+int
+ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t *logical, ext4_fsblk_t *phys)
+{
+ struct buffer_head *bh = NULL;
+ struct ext4_extent_header *eh;
+ struct ext4_extent_idx *ix;
+ struct ext4_extent *ex;
+ ext4_fsblk_t block;
+ int depth;
+
+ BUG_ON(path == NULL);
+ depth = path->p_depth;
+ *phys = 0;
+
+ if (depth == 0 && path->p_ext == NULL)
+ return 0;
+
+ /* usually extent in the path covers blocks smaller
+ * than *logical, but it can be that extent is the
+ * first one in the file */
+
+ ex = path[depth].p_ext;
+ if (*logical < le32_to_cpu(ex->ee_block)) {
+ BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+ while (--depth >= 0) {
+ ix = path[depth].p_idx;
+ BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+ }
+ *logical = le32_to_cpu(ex->ee_block);
+ *phys = ext_pblock(ex);
+ return 0;
+ }
+
+ BUG_ON(*logical < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len));
+
+ if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
+ /* next allocated block in this leaf */
+ ex++;
+ *logical = le32_to_cpu(ex->ee_block);
+ *phys = ext_pblock(ex);
+ return 0;
+ }
+
+ /* go up and search for index to the right */
+ while (--depth >= 0) {
+ ix = path[depth].p_idx;
+ if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
+ break;
+ }
+
+ if (depth < 0) {
+ /* we've gone up to the root and
+ * found no index to the right */
+ return 0;
+ }
+
+ /* we've found index to the right, let's
+ * follow it and find the closest allocated
+ * block to the right */
+ ix++;
+ block = idx_pblock(ix);
+ while (++depth < path->p_depth) {
+ bh = sb_bread(inode->i_sb, block);
+ if (bh == NULL)
+ return -EIO;
+ eh = ext_block_hdr(bh);
+ if (ext4_ext_check_header(inode, eh, depth)) {
+ brelse(bh);
+ return -EIO;
+ }
+ ix = EXT_FIRST_INDEX(eh);
+ block = idx_pblock(ix);
+ brelse(bh);
+ }
+
+ bh = sb_bread(inode->i_sb, block);
+ if (bh == NULL)
+ return -EIO;
+ eh = ext_block_hdr(bh);
+ if (ext4_ext_check_header(inode, eh, depth)) {
+ brelse(bh);
+ return -EIO;
+ }
+ ex = EXT_FIRST_EXTENT(eh);
+ *logical = le32_to_cpu(ex->ee_block);
+ *phys = ext_pblock(ex);
+ brelse(bh);
+ return 0;
+
+}
+
+/*
* ext4_ext_next_allocated_block:
* returns allocated block in subsequent extent or EXT_MAX_BLOCK.
* NOTE: it considers block number from index entry as
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h
index f67c755..1909a78 100644
--- a/include/linux/ext4_fs_extents.h
+++ b/include/linux/ext4_fs_extents.h
@@ -213,6 +213,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_pa
extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, ext4_fsblk_t *, ext4_fsblk_t *);
+extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_fsblk_t *, ext4_fsblk_t *);

#endif /* _LINUX_EXT4_EXTENTS */

--
1.5.3.rc0.30.g114fd-dirty