2006-12-01 00:18:17

by Alex Tomas

[permalink] [raw]
Subject: [RFC] ext4-locality-groups patch


this patch implements locality groups idea in a very
simplified form. the policy is silly and ->sync_inodes()
not very well tested on different workloads.

thanks, Alex


Index: linux-2.6.19-rc6/include/linux/ext4_fs_i.h
===================================================================
--- linux-2.6.19-rc6.orig/include/linux/ext4_fs_i.h 2006-11-16 07:03:40.000000000 +0300
+++ linux-2.6.19-rc6/include/linux/ext4_fs_i.h 2006-12-01 02:51:13.000000000 +0300
@@ -150,6 +150,8 @@ struct ext4_inode_info {
*/
struct mutex truncate_mutex;
struct inode vfs_inode;
+ struct list_head i_lg_list;
+ struct ext4_locality_group *i_locality_group;

unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
Index: linux-2.6.19-rc6/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.19-rc6.orig/include/linux/ext4_fs.h 2006-11-16 07:03:40.000000000 +0300
+++ linux-2.6.19-rc6/include/linux/ext4_fs.h 2006-12-01 02:51:13.000000000 +0300
@@ -727,6 +727,34 @@ struct dx_hash_info


/*
+ * Locality group:
+ * we try to group all related changes together
+ * so that writeback can flush/allocate them together as well
+ */
+struct ext4_locality_group {
+ int lg_parent;
+ int lg_pgid;
+ int lg_sid;
+ struct list_head lg_hash;
+ spinlock_t lg_lock;
+ int lg_deleted;
+ atomic_t lg_count;
+ atomic_t lg_inodes_nr;
+
+ /* */
+ unsigned long lg_flags;
+ struct list_head lg_list;
+
+ /* inode lists for the group */
+ struct list_head lg_inodes; /* inodes in the group */
+ struct list_head lg_dirty; /* dirty inodes from s_dirty */
+ struct list_head lg_io; /* inodes scheduled for flush */
+
+ atomic_t lg_dirty_pages; /* pages to write */
+ atomic_t lg_nonallocated;/* non-allocated pages */
+};
+
+/*
* Describe an inode's exact location on disk and in memory
*/
struct ext4_iloc
@@ -784,6 +812,15 @@ void ext4_get_group_no_and_offset(struct
# define ATTRIB_NORET __attribute__((noreturn))
# define NORET_AND noreturn,

+/* lg.c */
+extern void ext4_lg_init(struct super_block *sb);
+extern void ext4_lg_release(struct super_block *sb);
+extern void ext4_lg_inode_leave_group(struct inode *inode);
+extern void ext4_lg_page_enter_inode(struct inode *inode, struct page *page, int allocated);
+extern void ext4_lg_page_leave_inode(struct inode *inode, struct page *page, int allocated);
+extern void ext4_lg_sync_inodes(struct super_block *, struct writeback_control *);
+
+
/* balloc.c */
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
Index: linux-2.6.19-rc6/include/linux/ext4_fs_sb.h
===================================================================
--- linux-2.6.19-rc6.orig/include/linux/ext4_fs_sb.h 2006-11-16 07:03:40.000000000 +0300
+++ linux-2.6.19-rc6/include/linux/ext4_fs_sb.h 2006-12-01 02:51:13.000000000 +0300
@@ -80,6 +80,11 @@ struct ext4_sb_info {
int s_jquota_fmt; /* Format of quota to use */
#endif

+ struct list_head s_locality_dirty;
+ struct list_head s_locality_io;
+ struct list_head s_locality_groups;
+ spinlock_t s_locality_lock;
+
#ifdef EXTENTS_STATS
/* ext4 extents stats */
unsigned long s_ext_min;
Index: linux-2.6.19-rc6/fs/ext4/lg.c
===================================================================
--- linux-2.6.19-rc6.orig/fs/ext4/lg.c 2006-11-30 15:32:10.563465031 +0300
+++ linux-2.6.19-rc6/fs/ext4/lg.c 2006-12-01 02:54:29.000000000 +0300
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2006, Cluster File Systems, Inc, [email protected]
+ * Written by Alex Tomas <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+/*
+ * locality groups
+ *
+ */
+
+/*
+ * TODO:
+ * - too many of tricks
+ * - mmap'ed files support (we need to link them to some group)
+ * - too silly grouping policy
+ * - free non-used groups after some timeout
+ * - anonymous group for non-regular inodes
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_fs_i.h>
+#include <linux/ext4_fs_sb.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/writeback.h>
+
+#ifndef TestClearPageChecked
+#define TestClearPageChecked(page) test_and_clear_bit(PG_checked, &(page)->flags)
+#endif
+#ifndef TestSetPageChecked
+#define TestSetPageChecked(page) test_and_set_bit(PG_checked, &(page)->flags)
+#endif
+
+
+extern struct super_block *blockdev_superblock;
+static inline int sb_is_blkdev_sb(struct super_block *sb)
+{
+ return sb == blockdev_superblock;
+}
+
+extern int __writeback_single_inode(struct inode *, struct writeback_control *);
+
+struct ext4_locality_group *ext4_lg_find_group(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg;
+ struct list_head *cur;
+
+ rcu_read_lock();
+ list_for_each_rcu(cur, &sbi->s_locality_groups) {
+ lg = list_entry(cur, struct ext4_locality_group, lg_hash);
+ if (lg->lg_pgid == current->signal->pgrp) {
+ spin_lock(&lg->lg_lock);
+ if (lg->lg_deleted == 0) {
+ atomic_inc(&lg->lg_count);
+ spin_unlock(&lg->lg_lock);
+ break;
+ }
+ spin_unlock(&lg->lg_lock);
+ }
+ lg = NULL;
+ }
+ rcu_read_unlock();
+ return lg;
+}
+
+void ext4_lg_put_group(struct ext4_locality_group *lg)
+{
+ atomic_dec(&lg->lg_count);
+}
+
+struct ext4_locality_group *ext4_lg_find_or_allocate_group(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_locality_group *lg, *olg;
+
+ lg = ext4_lg_find_group(inode->i_sb);
+ if (lg == NULL) {
+ lg = kmalloc(sizeof(struct ext4_locality_group), GFP_NOFS);
+ if (lg == NULL)
+ return NULL;
+ lg->lg_pgid = current->signal->pgrp;
+ lg->lg_sid = current->signal->session;
+ spin_lock_init(&lg->lg_lock);
+ lg->lg_deleted = 0;
+ lg->lg_flags = 0;
+ atomic_set(&lg->lg_count, 1);
+ atomic_set(&lg->lg_inodes_nr, 0);
+ INIT_LIST_HEAD(&lg->lg_list);
+ INIT_LIST_HEAD(&lg->lg_inodes);
+ INIT_LIST_HEAD(&lg->lg_dirty);
+ INIT_LIST_HEAD(&lg->lg_io);
+ atomic_set(&lg->lg_dirty_pages, 0);
+ atomic_set(&lg->lg_nonallocated, 0);
+
+ spin_lock(&sbi->s_locality_lock);
+ olg = ext4_lg_find_group(inode->i_sb);
+ if (olg == NULL) {
+ list_add(&lg->lg_hash, &sbi->s_locality_groups);
+ } else {
+ kfree(lg);
+ lg = olg;
+ }
+ spin_unlock(&sbi->s_locality_lock);
+ }
+
+ /*
+ * XXX locking here?
+ */
+ spin_lock(&inode_lock);
+ if (EXT4_I(inode)->i_locality_group == NULL) {
+ EXT4_I(inode)->i_locality_group = lg;
+ list_add(&EXT4_I(inode)->i_lg_list, &lg->lg_inodes);
+ atomic_inc(&lg->lg_inodes_nr);
+ } else {
+ printk("somebody has already set lg %p (our %p) to inode %lu(%p)\n",
+ EXT4_I(inode)->i_locality_group, lg, inode->i_ino, inode);
+ ext4_lg_put_group(lg);
+ lg = EXT4_I(inode)->i_locality_group;
+ }
+ spin_unlock(&inode_lock);
+
+ return lg;
+}
+
+/*
+ * every dirty page should be counted
+ */
+void ext4_lg_page_enter_inode(struct inode *inode,
+ struct page *page, int allocated)
+{
+ struct ext4_locality_group *lg;
+
+ lg = EXT4_I(inode)->i_locality_group;
+ if (lg == NULL) {
+ lg = ext4_lg_find_or_allocate_group(inode);
+ if (lg == NULL)
+ return;
+ }
+
+ if (!TestSetPageChecked(page)) {
+ atomic_inc(&lg->lg_dirty_pages);
+ if (!allocated)
+ atomic_inc(&lg->lg_nonallocated);
+ }
+}
+
+
+/*
+ *
+ */
+void ext4_lg_page_leave_inode(struct inode *inode,
+ struct page *page, int allocated)
+{
+ struct ext4_locality_group *lg;
+
+ lg = EXT4_I(inode)->i_locality_group;
+ if (lg == NULL) {
+ if (S_ISREG(inode->i_mode))
+ printk("regular file %lu/%u with no locality group?!\n",
+ inode->i_ino, inode->i_generation);
+ return;
+ }
+
+ if (!TestClearPageChecked(page))
+ return;
+
+ atomic_dec(&lg->lg_dirty_pages);
+ if (!allocated)
+ atomic_dec(&lg->lg_nonallocated);
+}
+
+/*
+ * Inode leave group
+ */
+void ext4_lg_inode_leave_group(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_locality_group *lg;
+
+ if (inode->i_nlink != 0 && S_ISREG(inode->i_mode)) {
+ BUG_ON(mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY));
+ }
+
+ spin_lock(&inode_lock);
+ lg = ei->i_locality_group;
+ ei->i_locality_group = NULL;
+ spin_unlock(&inode_lock);
+
+ if (lg != NULL) {
+ spin_lock(&lg->lg_lock);
+ list_del(&ei->i_lg_list);
+ spin_unlock(&lg->lg_lock);
+ atomic_dec(&lg->lg_inodes_nr);
+ ext4_lg_put_group(lg);
+ }
+}
+
+#define EXT4_LG_DIRTY 0
+
+#define EXT4_CONTINUE_WRITEBACK 1
+#define EXT4_STOP_WRITEBACK 2
+
+static char *__sync_modes[] = { "NONE", "ALL", "HOLD" };
+
+/*
+ * The function syncs a single group like generic_sync_sb_inodes() does
+ * returns:
+ * 0 - continue syncing with a next group
+ * 1 - break syncing
+ */
+int ext4_lg_sync_single_group(struct super_block *sb,
+ struct ext4_locality_group *lg,
+ struct writeback_control *wbc,
+ unsigned long start)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int nr_to_write = wbc->nr_to_write;
+ int dirty_pages, nonallocated;
+ int rc, code = 0;
+
+ dirty_pages = atomic_read(&lg->lg_dirty_pages);
+ nonallocated = atomic_read(&lg->lg_nonallocated);
+
+ rc = EXT4_CONTINUE_WRITEBACK;
+
+ spin_lock(&inode_lock);
+
+ if (!wbc->for_kupdate || list_empty(&lg->lg_io))
+ list_splice_init(&lg->lg_dirty, &lg->lg_io);
+
+ while (!list_empty(&lg->lg_io)) {
+ struct inode *inode = list_entry(lg->lg_io.prev,
+ struct inode, i_list);
+ struct address_space *mapping = inode->i_mapping;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ long pages_skipped;
+
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ /* underlying device is congested
+ * break all writeback immediately */
+ wbc->encountered_congestion = 1;
+
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* same for the locality group */
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move(&lg->lg_list, &sbi->s_locality_io);
+
+ /* signal to the caller */
+ rc = EXT4_STOP_WRITEBACK;
+ code = 1;
+ break;
+ }
+
+ if (wbc->bdi && bdi != wbc->bdi) {
+ printk("wbc->bdi (%p) != bdi (%p)\n", wbc->bdi, bdi);
+ list_move(&inode->i_list, &inode_in_use);
+ rc = EXT4_CONTINUE_WRITEBACK;
+ code = 2;
+ break;
+ }
+
+ /* Was this inode dirtied after sync_sb_inodes was called? */
+ if (time_after(inode->dirtied_when, start)) {
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* continue with next locality group
+ * move this one to the dirty tail */
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move_tail(&lg->lg_list, &sbi->s_locality_dirty);
+
+ rc = EXT4_CONTINUE_WRITEBACK;
+ code = 3;
+ break;
+ }
+
+ /* Was this inode dirtied too recently? */
+ if (wbc->older_than_this && time_after(inode->dirtied_when,
+ *wbc->older_than_this)) {
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* continue with next locality group
+ * move this one to the dirty tail */
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move_tail(&lg->lg_list, &sbi->s_locality_dirty);
+
+ rc = EXT4_CONTINUE_WRITEBACK;
+ code = 4;
+ break;
+ }
+
+ /* Is another pdflush already flushing this queue? */
+ if (current_is_pdflush() && !writeback_acquire(bdi)) {
+ /* keep this inode on the head so that
+ * we'll continue writeback with it
+ * when we return to this locality group */
+
+ /* same for the locality group */
+ list_move(&lg->lg_list, &sbi->s_locality_io);
+
+ rc = EXT4_STOP_WRITEBACK;
+ code = 5;
+ break;
+ }
+
+ BUG_ON(inode->i_state & I_FREEING);
+ __iget(inode);
+ pages_skipped = wbc->pages_skipped;
+ __writeback_single_inode(inode, wbc);
+ if (wbc->sync_mode == WB_SYNC_HOLD) {
+ inode->dirtied_when = jiffies;
+ list_move(&inode->i_list, &lg->lg_dirty);
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move(&lg->lg_list, &sbi->s_locality_dirty);
+ }
+ if (current_is_pdflush())
+ writeback_release(bdi);
+ if (wbc->pages_skipped != pages_skipped) {
+ /*
+ * writeback is not making progress due to locked
+ * buffers. Skip this inode for now.
+ */
+ list_move(&inode->i_list, &lg->lg_dirty);
+
+ set_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ list_move(&lg->lg_list, &sbi->s_locality_dirty);
+ }
+ spin_unlock(&inode_lock);
+ iput(inode);
+ cond_resched();
+ spin_lock(&inode_lock);
+ if (wbc->nr_to_write <= 0) {
+ rc = EXT4_STOP_WRITEBACK;
+ code = 6;
+ break;
+ }
+ }
+
+ spin_unlock(&inode_lock);
+
+ if (0 && nr_to_write - wbc->nr_to_write) {
+ printk("#%u: %s/%lu/%s%s%s%s%s%s M: %lu/%lu/%lu "
+ "LG:%p/%u/%u[%u/%u] wrote %lu/%d\n",
+ current->pid, __sync_modes[wbc->sync_mode],
+ wbc->nr_to_write,
+ wbc->nonblocking ? "N" : "",
+ wbc->encountered_congestion ? "C" : "",
+ wbc->for_kupdate ? "U" : "",
+ wbc->for_reclaim ? "R" : "",
+ wbc->for_writepages ? "W" : "",
+ wbc->range_cyclic ? "I" : "",
+ global_page_state(NR_FILE_DIRTY),
+ global_page_state(NR_UNSTABLE_NFS),
+ global_page_state(NR_WRITEBACK),
+ lg, atomic_read(&lg->lg_count), lg->lg_pgid,
+ dirty_pages, nonallocated,
+ nr_to_write - wbc->nr_to_write, code);
+ }
+
+ return rc;
+}
+
+/*
+ * the core of inode syncer:
+ * - loop over locality groups
+ * - maintain them in order to avoid starvation
+ */
+void ext4_lg_sync_groups(struct super_block *sb, struct writeback_control *wbc)
+{
+ const unsigned long start = jiffies; /* livelock avoidance */
+ struct ext4_locality_group *lg = NULL, *prev = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int rc;
+
+ spin_lock(&inode_lock);
+
+ /*printk("#%u: mode %s, nr2wr %lu, %s%s%s%s%s%s M: %lu/%lu/%lu "
+ "LGs: %sdirty %sio\n", current->pid,
+ __sync_modes[wbc->sync_mode], wbc->nr_to_write,
+ wbc->nonblocking ? "nonblock " : "",
+ wbc->encountered_congestion ? "congested " : "",
+ wbc->for_kupdate ? "kupdate " : "",
+ wbc->for_reclaim ? "reclaim " : "",
+ wbc->for_writepages ? "writepages " : "",
+ wbc->range_cyclic ? "cyclic " : "",
+ global_page_state(NR_FILE_DIRTY),
+ global_page_state(NR_UNSTABLE_NFS),
+ global_page_state(NR_WRITEBACK),
+ list_empty(&sbi->s_locality_dirty) ? "-" : "+",
+ list_empty(&sbi->s_locality_io) ? "-" : "+");*/
+
+ if (!wbc->for_kupdate || list_empty(&sbi->s_locality_io))
+ list_splice_init(&sbi->s_locality_dirty, &sbi->s_locality_io);
+
+ while (!list_empty(&sbi->s_locality_io)) {
+
+ /* we should handle same group twice in a row */
+ WARN_ON(prev && prev == lg);
+ prev = lg;
+
+ lg = list_entry(sbi->s_locality_io.prev,
+ struct ext4_locality_group, lg_list);
+
+ /* protect locality group */
+ atomic_inc(&lg->lg_count);
+
+ /* to avoid two concurrent threads flushing same group */
+ list_del_init(&lg->lg_list);
+
+ spin_unlock(&inode_lock);
+
+ clear_bit(EXT4_LG_DIRTY, &lg->lg_flags);
+ rc = ext4_lg_sync_single_group(sb, lg, wbc, start);
+
+ spin_lock(&inode_lock);
+ ext4_lg_put_group(lg);
+
+ if (rc == EXT4_STOP_WRITEBACK)
+ break;
+ }
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * entry function for inode syncing
+ * it's responsbility is to sort all inode out in their locality groups
+ */
+void ext4_lg_sync_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg;
+
+ /* refill pending groups from s_dirty */
+ spin_lock(&inode_lock);
+ while (!list_empty(&sb->s_dirty)) {
+ struct inode *inode = list_entry(sb->s_dirty.prev,
+ struct inode, i_list);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ lg = ei->i_locality_group;
+ if (lg == NULL) {
+ if (S_ISDIR(inode->i_mode) || i_size_read(inode) == 0) {
+ if (atomic_read(&inode->i_count)) {
+ /*
+ * The inode is clean, inuse
+ */
+ list_move(&inode->i_list, &inode_in_use);
+ } else {
+ /*
+ * The inode is clean, unused
+ */
+ list_move(&inode->i_list, &inode_unused);
+ }
+ continue;
+ }
+ /* XXX: atime changed ? */
+ list_move(&inode->i_list, &inode_in_use);
+ continue;
+ }
+
+ /* move inode in proper locality group's dirty list */
+ spin_lock(&lg->lg_lock);
+ list_move_tail(&inode->i_list, &lg->lg_dirty);
+ spin_unlock(&lg->lg_lock);
+
+ if (!test_and_set_bit(EXT4_LG_DIRTY, &lg->lg_flags))
+ list_move(&lg->lg_list, &sbi->s_locality_dirty);
+ }
+ spin_unlock(&inode_lock);
+
+ ext4_lg_sync_groups(sb, wbc);
+}
+
+void ext4_lg_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ sb->s_flags |= 2048; /* XXX: i'll fix this, i promise */
+ spin_lock_init(&sbi->s_locality_lock);
+ INIT_LIST_HEAD(&sbi->s_locality_groups);
+ INIT_LIST_HEAD(&sbi->s_locality_dirty);
+ INIT_LIST_HEAD(&sbi->s_locality_io);
+}
+
+void ext4_lg_release(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_locality_group *lg;
+ struct list_head *cur, *tmp;
+
+ list_for_each_safe_rcu(cur, tmp, &sbi->s_locality_groups) {
+ lg = list_entry(cur, struct ext4_locality_group, lg_hash);
+ if (atomic_read(&lg->lg_count))
+ printk("LG %p/%d (pgid %u), %u inodes, dirty %d, non-allocated %d\n",
+ lg, atomic_read(&lg->lg_count),
+ atomic_read(&lg->lg_inodes_nr), lg->lg_pgid,
+ atomic_read(&lg->lg_dirty_pages),
+ atomic_read(&lg->lg_nonallocated));
+ list_del(&lg->lg_hash);
+ kfree(lg);
+ }
+}
+
Index: linux-2.6.19-rc6/fs/ext4/super.c
===================================================================
--- linux-2.6.19-rc6.orig/fs/ext4/super.c 2006-11-16 07:03:40.000000000 +0300
+++ linux-2.6.19-rc6/fs/ext4/super.c 2006-12-01 02:51:13.000000000 +0300
@@ -449,6 +449,7 @@ static void ext4_put_super (struct super
mark_buffer_dirty(sbi->s_sbh);
ext4_commit_super(sb, es, 1);
}
+ ext4_lg_release(sb);

for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -498,6 +499,7 @@ static struct inode *ext4_alloc_inode(st
ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
if (!ei)
return NULL;
+ ei->i_locality_group = NULL;
#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
ei->i_acl = EXT4_ACL_NOT_CACHED;
ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -564,6 +566,7 @@ static void ext4_clear_inode(struct inod
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ ext4_lg_inode_leave_group(inode);
}

static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -706,6 +709,7 @@ static struct super_operations ext4_sops
.remount_fs = ext4_remount,
.clear_inode = ext4_clear_inode,
.show_options = ext4_show_options,
+ .sync_inodes = ext4_lg_sync_inodes,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
@@ -1860,6 +1864,7 @@ static int ext4_fill_super (struct super
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");

+ ext4_lg_init(sb);
ext4_ext_init(sb);

lock_kernel();
Index: linux-2.6.19-rc6/fs/ext4/Makefile
===================================================================
--- linux-2.6.19-rc6.orig/fs/ext4/Makefile 2006-11-16 07:03:40.000000000 +0300
+++ linux-2.6.19-rc6/fs/ext4/Makefile 2006-12-01 02:51:13.000000000 +0300
@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o

ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o lg.o

ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
Index: linux-2.6.19-rc6/fs/fs-writeback.c
===================================================================
--- linux-2.6.19-rc6.orig/fs/fs-writeback.c 2006-11-29 02:32:06.000000000 +0300
+++ linux-2.6.19-rc6/fs/fs-writeback.c 2006-12-01 02:51:13.000000000 +0300
@@ -149,8 +149,7 @@ static int write_inode(struct inode *ino
*
* Called under inode_lock.
*/
-static int
-__sync_single_inode(struct inode *inode, struct writeback_control *wbc)
+int __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
{
unsigned dirty;
struct address_space *mapping = inode->i_mapping;
@@ -240,8 +239,7 @@ __sync_single_inode(struct inode *inode,
* caller has ref on the inode (either via __iget or via syscall against an fd)
* or the inode has I_WILL_FREE set (via generic_forget_inode)
*/
-static int
-__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
wait_queue_head_t *wqh;

@@ -429,7 +427,7 @@ writeback_inodes(struct writeback_contro
restart:
sb = sb_entry(super_blocks.prev);
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
- if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
+ if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io) || (sb->s_flags & 2048)) {
/* we're making our own get_super here */
sb->s_count++;
spin_unlock(&sb_lock);


2006-12-12 16:50:55

by Valerie Clement

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

Index: linux-2.6.19-rc6/fs/ext4/lg.c
===================================================================
--- linux-2.6.19-rc6.orig/fs/ext4/lg.c 2006-12-12 17:23:46.000000000 +0100
+++ linux-2.6.19-rc6/fs/ext4/lg.c 2006-12-12 17:24:53.000000000 +0100
@@ -66,7 +66,7 @@ extern int __writeback_single_inode(stru
struct ext4_locality_group *ext4_lg_find_group(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_locality_group *lg;
+ struct ext4_locality_group *lg = NULL;
struct list_head *cur;

rcu_read_lock();


Attachments:
ext4_fix_oops_in_lg_sync_inodes.patch (547.00 B)

2006-12-12 16:59:08

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

>>>>> Valerie Clement (VC) writes:


VC> The crash occurs because ei->i_locality_group is not well initialized.
VC> The patch in attachment fixes the problem on my system (x86_64).

thanks!

strange that I didn't hit this on i386 ... and gcc doesn't warn.

thanks, Alex

2006-12-13 14:29:06

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

>>>>> Valerie Clement (VC) writes:

VC> Hi Alex,
VC> I did the following test:
VC> # mkfs /dev/sdc1
VC> # mount -t ext4dev -o extents,mballoc,delalloc /dev/sdc1 /test
VC> # dumpe2fs -h /dev/sdc1
VC> # cp linux.tar /test
VC> # tar xf /test/linux.tar
VC> # dumpe2fs -h /dev/sdc1

VC> The "Free blocks" and "Free inodes" counters in the dumpe2fs output
VC> did not change after running the cp and tar commands.

sync should help, I think.

VC> I think it misses a call to ext4_commit_super() at the end of the
VC> ext4_lg_sync_inodes function.
VC> Am I right ?

what for? allocation was delayed, no blocks were allocated.

thanks, Alex

2006-12-13 14:39:19

by Valerie Clement

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

Alex Tomas wrote:
>>>>>> Valerie Clement (VC) writes:
>
> VC> Hi Alex,
> VC> I did the following test:
> VC> # mkfs /dev/sdc1
> VC> # mount -t ext4dev -o extents,mballoc,delalloc /dev/sdc1 /test
> VC> # dumpe2fs -h /dev/sdc1
> VC> # cp linux.tar /test
> VC> # tar xf /test/linux.tar
> VC> # dumpe2fs -h /dev/sdc1
>
> VC> The "Free blocks" and "Free inodes" counters in the dumpe2fs output
> VC> did not change after running the cp and tar commands.
>
> sync should help, I think.

No, I've just tried, it does not change anything.
Val?rie

>
> VC> I think it misses a call to ext4_commit_super() at the end of the
> VC> ext4_lg_sync_inodes function.
> VC> Am I right ?
>
> what for? allocation was delayed, no blocks were allocated.
>
> thanks, Alex
>

2006-12-13 14:39:52

by Valerie Clement

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

Alex Tomas wrote:

> +/*
> + * entry function for inode syncing
> + * it's responsbility is to sort all inode out in their locality groups
> + */
> +void ext4_lg_sync_inodes(struct super_block *sb, struct writeback_control *wbc)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_locality_group *lg;
> +
> + /* refill pending groups from s_dirty */
> + spin_lock(&inode_lock);
> + while (!list_empty(&sb->s_dirty)) {
> + struct inode *inode = list_entry(sb->s_dirty.prev,
> + struct inode, i_list);
> + struct ext4_inode_info *ei = EXT4_I(inode);
> +
> + lg = ei->i_locality_group;
> + if (lg == NULL) {
> + if (S_ISDIR(inode->i_mode) || i_size_read(inode) == 0) {
> + if (atomic_read(&inode->i_count)) {
> + /*
> + * The inode is clean, inuse
> + */
> + list_move(&inode->i_list, &inode_in_use);
> + } else {
> + /*
> + * The inode is clean, unused
> + */
> + list_move(&inode->i_list, &inode_unused);
> + }
> + continue;
> + }
> + /* XXX: atime changed ? */
> + list_move(&inode->i_list, &inode_in_use);
> + continue;
> + }
> +
> + /* move inode in proper locality group's dirty list */
> + spin_lock(&lg->lg_lock);
> + list_move_tail(&inode->i_list, &lg->lg_dirty);
> + spin_unlock(&lg->lg_lock);
> +
> + if (!test_and_set_bit(EXT4_LG_DIRTY, &lg->lg_flags))
> + list_move(&lg->lg_list, &sbi->s_locality_dirty);
> + }
> + spin_unlock(&inode_lock);
> +
> + ext4_lg_sync_groups(sb, wbc);
> +}
Hi Alex,
I did the following test:
# mkfs /dev/sdc1
# mount -t ext4dev -o extents,mballoc,delalloc /dev/sdc1 /test
# dumpe2fs -h /dev/sdc1
# cp linux.tar /test
# tar xf /test/linux.tar
# dumpe2fs -h /dev/sdc1

The "Free blocks" and "Free inodes" counters in the dumpe2fs output did
not change after running the cp and tar commands.

I think it misses a call to ext4_commit_super() at the end of the
ext4_lg_sync_inodes function.
Am I right ?

Val?rie

2006-12-13 14:48:10

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

Alex Tomas wrote:
>>>>>> Valerie Clement (VC) writes:
>
> VC> Hi Alex,
> VC> I did the following test:
> VC> # mkfs /dev/sdc1
> VC> # mount -t ext4dev -o extents,mballoc,delalloc /dev/sdc1 /test
> VC> # dumpe2fs -h /dev/sdc1
> VC> # cp linux.tar /test
> VC> # tar xf /test/linux.tar
> VC> # dumpe2fs -h /dev/sdc1
>
> VC> The "Free blocks" and "Free inodes" counters in the dumpe2fs output
> VC> did not change after running the cp and tar commands.
>
> sync should help, I think.

This is probably a discrepancy that comes from reading the block device
directly, right? Which is not necessarily in sync with the address
space for the filesystem.

-Eric

2006-12-13 14:51:48

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

>>>>> Valerie Clement (VC) writes:

VC> Alex Tomas wrote:
>>>>>>> Valerie Clement (VC) writes:
>>
VC> Hi Alex,
VC> I did the following test:
VC> # mkfs /dev/sdc1
VC> # mount -t ext4dev -o extents,mballoc,delalloc /dev/sdc1 /test
VC> # dumpe2fs -h /dev/sdc1
VC> # cp linux.tar /test
VC> # tar xf /test/linux.tar
VC> # dumpe2fs -h /dev/sdc1
>>
VC> The "Free blocks" and "Free inodes" counters in the dumpe2fs output
VC> did not change after running the cp and tar commands.
>>
>> sync should help, I think.

VC> No, I've just tried, it does not change anything.
VC> Val?rie

not sure ....

[root@tom ~]# mke2fs -m0 -j /dev/sda1 >&/dev/null
[root@tom ~]# mount -t ext4dev -o extents,mballoc,delalloc /dev/sda1 /test
[root@tom ~]# dumpe2fs /dev/sda1 >/tmp/before
dumpe2fs 1.39.cfs1 (29-May-2006)
[root@tom ~]# dd if=/dev/zero of=/test/f bs=1M count=20
20+0 records in
20+0 records out
[root@tom ~]# dumpe2fs /dev/sda1 >/tmp/after1
dumpe2fs 1.39.cfs1 (29-May-2006)
[root@tom ~]# sync
[root@tom ~]# dumpe2fs /dev/sda1 >/tmp/after2
dumpe2fs 1.39.cfs1 (29-May-2006)
[root@tom ~]# diff -pu /tmp/before /tmp/after1
--- /tmp/before 2006-12-13 14:39:42.541828787 +0300
+++ /tmp/after1 2006-12-13 14:39:59.602086138 +0300
@@ -45,9 +45,9 @@ Group 0: (Blocks 0-32767)
Reserved GDT blocks at 2-245
Block bitmap at 246 (+246), Inode bitmap at 247 (+247)
Inode table at 248-752 (+248)
- 15607 free blocks, 16149 free inodes, 2 directories
+ 15607 free blocks, 16148 free inodes, 2 directories
Free blocks: 17161-32767
- Free inodes: 12-16160
+ Free inodes: 13-16160
Group 1: (Blocks 32768-65535)
Backup superblock at 32768, Group descriptors at 32769-32769
Reserved GDT blocks at 32770-33013
[root@tom ~]# diff -pu /tmp/after1 /tmp/after2
--- /tmp/after1 2006-12-13 14:39:59.602086138 +0300
+++ /tmp/after2 2006-12-13 14:40:05.592176495 +0300
@@ -45,8 +45,8 @@ Group 0: (Blocks 0-32767)
Reserved GDT blocks at 2-245
Block bitmap at 246 (+246), Inode bitmap at 247 (+247)
Inode table at 248-752 (+248)
- 15607 free blocks, 16148 free inodes, 2 directories
- Free blocks: 17161-32767
+ 10487 free blocks, 16148 free inodes, 2 directories
+ Free blocks: 17161-22527, 27648-32767
Free inodes: 13-16160
Group 1: (Blocks 32768-65535)
Backup superblock at 32768, Group descriptors at 32769-32769
[root@tom ~]#


try to repeat with a single file?

thanks, Alex

2006-12-13 14:54:24

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

>>>>> Valerie Clement (VC) writes:

VC> Alex Tomas wrote:
>>>>>>> Valerie Clement (VC) writes:
>>
VC> Hi Alex,
VC> I did the following test:
VC> # mkfs /dev/sdc1
VC> # mount -t ext4dev -o extents,mballoc,delalloc /dev/sdc1 /test
VC> # dumpe2fs -h /dev/sdc1
VC> # cp linux.tar /test
VC> # tar xf /test/linux.tar
VC> # dumpe2fs -h /dev/sdc1
>>
VC> The "Free blocks" and "Free inodes" counters in the dumpe2fs output
VC> did not change after running the cp and tar commands.
>>
>> sync should help, I think.

VC> No, I've just tried, it does not change anything.

ah, it's simple. you're looking at superblock counters.
we don't update them every time. this was changed years
ago. try the same with regular ext3

thanks, Alex

2006-12-13 15:04:42

by Valerie Clement

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

Alex Tomas wrote:
> ah, it's simple. you're looking at superblock counters.
> we don't update them every time. this was changed years
> ago. try the same with regular ext3
>
Yes, sorry, I didn't know that and I didn't notice that before.
And the per-group counters are right.

Thanks a lot for your clarification.
Val?rie

2006-12-13 22:03:40

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

On Dec 13, 2006 08:40 -0600, Eric Sandeen wrote:
> This is probably a discrepancy that comes from reading the block device
> directly, right? Which is not necessarily in sync with the address
> space for the filesystem.

In fact, the block device IS in sync with the filesystem address space,
but only for metadata. The file data each has its own address space and
is not coherent.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-12-13 22:03:40

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

On Dec 13, 2006 16:02 +0100, Valerie Clement wrote:
> Alex Tomas wrote:
> >ah, it's simple. you're looking at superblock counters.
> >we don't update them every time. this was changed years
> >ago. try the same with regular ext3
>
> Yes, sorry, I didn't know that and I didn't notice that before.
> And the per-group counters are right.

It would be good, IMHO, to update the superblock counters periodically
anyways (e.g. statfs already has done the calculation). Otherwise,
e2fsck on a mounted-but-idle system always complains about them being
incorrect.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2006-12-13 22:26:35

by Eric Sandeen

[permalink] [raw]
Subject: Re: [RFC] ext4-locality-groups patch

Andreas Dilger wrote:
> On Dec 13, 2006 08:40 -0600, Eric Sandeen wrote:
>> This is probably a discrepancy that comes from reading the block device
>> directly, right? Which is not necessarily in sync with the address
>> space for the filesystem.
>
> In fact, the block device IS in sync with the filesystem address space,
> but only for metadata. The file data each has its own address space and
> is not coherent.

Ah, thanks for the clarification.

-Eric