2006-12-22 21:09:45

by Alex Tomas

[permalink] [raw]
Subject: [RFC] delayed allocation for ext4


Good day,

probably the previous set of patches (including mballoc/lg)
is too large. so, I reworked delayed allocation a bit so
that it can be used on top of regular balloc, though it
still can be used with extents-enabled files only.

this time series contains just 3 patches:

- booked-page-flag.patch
adds PG_booked bit to page->flags. it's used in delayed
allocation to mark space is already reserved for page
(including possible metadata)

- ext4-block-reservation.patch
this is scalable free space management. every time we
delay allocation of some page, a space (including metadata)
should be reserved

- ext4-delayed-allocation.patch
delayed allocation itself, enabled by "delalloc" mount option.
extents support is also required. currently it works only
with blocksize=pagesize.


all the patches can be used in ftp://ftp.clusterfs.com/pub/people/alex/2.6.20-rc1/

the series passed basic tests like dd/dbench/fsx.

any comments/questions are very welcome.

thanks, Alex


2006-12-22 20:23:55

by Alex Tomas

[permalink] [raw]
Subject: [RFC] booked-page-flag.patch



Index: linux-2.6.20-rc1/include/linux/page-flags.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/page-flags.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/page-flags.h 2006-12-22 20:05:31.000000000 +0300
@@ -90,6 +90,7 @@
#define PG_reclaim 17 /* To be reclaimed asap */
#define PG_nosave_free 18 /* Used for system suspend/resume */
#define PG_buddy 19 /* Page is free, on buddy lists */
+#define PG_booked 20 /* Has blocks reserved on-disk */


#if (BITS_PER_LONG > 32)
@@ -230,6 +231,10 @@ static inline void SetPageUptodate(struc
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)

+#define PageBooked(page) test_bit(PG_booked, &(page)->flags)
+#define SetPageBooked(page) set_bit(PG_booked, &(page)->flags)
+#define ClearPageBooked(page) clear_bit(PG_booked, &(page)->flags)
+
#define PageReclaim(page) test_bit(PG_reclaim, &(page)->flags)
#define SetPageReclaim(page) set_bit(PG_reclaim, &(page)->flags)
#define ClearPageReclaim(page) clear_bit(PG_reclaim, &(page)->flags)

2006-12-22 21:09:55

by Alex Tomas

[permalink] [raw]
Subject: [RFC] ext4-delayed-allocation.patch



Index: linux-2.6.20-rc1/include/linux/ext4_fs_i.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_i.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_i.h 2006-12-22 22:56:04.000000000 +0300
@@ -153,6 +153,11 @@ struct ext4_inode_info {

unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
+
+ __u32 i_blocks_reserved;
+ __u32 i_md_reserved;
+ spinlock_t i_wb_reserved_lock; /* to protect i_md_reserved */
+ atomic_t i_wb_writers;
};

#endif /* _LINUX_EXT4_FS_I */
Index: linux-2.6.20-rc1/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs.h 2006-12-22 22:56:03.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs.h 2006-12-22 22:56:04.000000000 +0300
@@ -401,6 +401,7 @@ struct ext4_inode {
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
+#define EXT4_MOUNT_DELAYED_ALLOC 0x1000000/* Delayed allocation support */

/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
@@ -994,6 +995,18 @@ ext4_get_blocks_wrap(handle_t *handle, s
}


+/* writeback.c */
+extern int ext4_wb_writepages(struct address_space *, struct writeback_control *);
+extern int ext4_wb_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to);
+extern int ext4_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int ext4_wb_writepage(struct page *, struct writeback_control *);
+extern void ext4_wb_invalidatepage(struct page *, unsigned long);
+extern int ext4_wb_releasepage(struct page *, gfp_t);
+extern int ext4_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+extern void ext4_wb_init(struct super_block *);
+extern void ext4_wb_release(struct super_block *);
+
#endif /* __KERNEL__ */

#endif /* _LINUX_EXT4_FS_H */
Index: linux-2.6.20-rc1/include/linux/ext4_fs_sb.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_sb.h 2006-12-22 22:56:03.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_sb.h 2006-12-22 22:56:04.000000000 +0300
@@ -94,6 +94,17 @@ struct ext4_sb_info {
unsigned long s_ext_blocks;
unsigned long s_ext_extents;
#endif
+
+ atomic_t s_wb_congested;
+ atomic_t s_wb_single_pages;
+ atomic_t s_wb_collisions_sp;
+ atomic_t s_wb_allocated;
+ atomic_t s_wb_reqs;
+ atomic_t s_wb_nr_to_write;
+ atomic_t s_wb_collisions;
+ atomic_t s_wb_blocks;
+ atomic_t s_wb_extents;
+ atomic_t s_wb_dropped;
};

#endif /* _LINUX_EXT4_FS_SB */
Index: linux-2.6.20-rc1/include/linux/ext4_fs_extents.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_extents.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_extents.h 2006-12-22 22:56:04.000000000 +0300
@@ -193,6 +193,7 @@ extern int ext4_ext_calc_credits_for_ins
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);

#endif /* _LINUX_EXT4_EXTENTS */

Index: linux-2.6.20-rc1/fs/ext4/super.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/super.c 2006-12-22 22:56:03.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/super.c 2006-12-22 22:56:04.000000000 +0300
@@ -439,6 +439,7 @@ static void ext4_put_super (struct super
struct ext4_super_block *es = sbi->s_es;
int i;

+ ext4_wb_release(sb);
ext4_reserve_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
@@ -506,6 +507,13 @@ static struct inode *ext4_alloc_inode(st
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+
+ /* FIXME: these wb-related fields could be initialized once */
+ ei->i_blocks_reserved = 0;
+ ei->i_md_reserved = 0;
+ atomic_set(&ei->i_wb_writers, 0);
+ spin_lock_init(&ei->i_wb_reserved_lock);
+
return &ei->vfs_inode;
}

@@ -729,7 +737,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_extents,
+ Opt_grpquota, Opt_extents, Opt_delayed_alloc,
};

static match_table_t tokens = {
@@ -780,6 +788,7 @@ static match_table_t tokens = {
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
{Opt_extents, "extents"},
+ {Opt_delayed_alloc, "delalloc"},
{Opt_err, NULL},
{Opt_resize, "resize"},
};
@@ -1094,6 +1103,9 @@ clear_qf_name:
else
clear_opt(sbi->s_mount_opt, BARRIER);
break;
+ case Opt_delayed_alloc:
+ set_opt(sbi->s_mount_opt, DELAYED_ALLOC);
+ break;
case Opt_ignore:
break;
case Opt_resize:
@@ -1869,6 +1881,7 @@ static int ext4_fill_super (struct super

ext4_ext_init(sb);
ext4_reserve_init(sb);
+ ext4_wb_init(sb);

lock_kernel();
return 0;
Index: linux-2.6.20-rc1/fs/ext4/extents.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/extents.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/extents.c 2006-12-22 22:56:04.000000000 +0300
@@ -2159,6 +2159,36 @@ int ext4_ext_writepage_trans_blocks(stru
return needed;
}

+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+
+ rcap = ext4_ext_space_root(inode);
+ if (blocks <= rcap) {
+ /* all extents fit to the root */
+ return 0;
+ }
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ num = leafs = (blocks + lcap - 1) / lcap;
+ if (leafs <= rcap) {
+ /* all pointers to leafs fit to the root */
+ return leafs;
+ }
+
+ /* ok. we need separate index block(s) to link all leaf blocks */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
EXPORT_SYMBOL(ext4_mark_inode_dirty);
EXPORT_SYMBOL(ext4_ext_invalidate_cache);
EXPORT_SYMBOL(ext4_ext_insert_extent);
Index: linux-2.6.20-rc1/fs/ext4/Makefile
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/Makefile 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/Makefile 2006-12-22 22:56:04.000000000 +0300
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o

ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o
+ ext4_jbd2.o writeback.o

ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
Index: linux-2.6.20-rc1/fs/ext4/writeback.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/writeback.c 2006-11-30 15:32:10.563465031 +0300
+++ linux-2.6.20-rc1/fs/ext4/writeback.c 2006-12-22 22:59:33.000000000 +0300
@@ -0,0 +1,1167 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, [email protected]
+ * Written by Alex Tomas <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+/*
+ * TODO:
+ * MUST:
+ * - flush dirty pages in -ENOSPC case in order to free reserved blocks
+ * - direct I/O support
+ * - blocksize != PAGE_CACHE_SIZE support
+ * - store last unwritten page in ext4_wb_writepages() and
+ * continue from it in a next run
+ * WISH:
+ * - should ext4_wb_writepage() try to flush neighbours?
+ * - ext4_wb_block_truncate_page() must flush partial truncated pages
+ * - reservation can be done per write-request in ext4_file_write()
+ * rather than per-page in ext4_wb_commit_write() -- it's quite
+ * expensive to recalculate amount of required metadata for evey page
+ * - re-allocation to improve layout
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd.h>
+#include <linux/ext4_fs_extents.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
+#include <linux/spinlock.h>
+
+/*
+ * If EXT4_WB_STATS is defined, then some stats are collected.
+ * It will be showed upont umount time.
+ */
+#define EXT4_WB_STATS
+
+
+/*
+ * With EXT4_WB_SKIP_SMALL defined the patch will try to avoid
+ * small I/Os ignoring ->writepages() if mapping hasn't enough
+ * contig. dirty pages
+ */
+#define EXT4_WB_SKIP_SMALL__
+
+#define WB_ASSERT(__x__) if (!(__x__)) BUG();
+
+#define WB_DEBUG__
+#ifdef WB_DEBUG
+#define wb_debug(fmt,a...) printk(fmt, ##a);
+#else
+#define wb_debug(fmt,a...)
+#endif
+
+#define WB_MAX_PAGES_PER_EXTENT 32768
+
+#define WB_PAGES_PER_ARRAY 60
+
+struct ext4_wb_pages {
+ struct list_head list;
+ struct page *pages[WB_PAGES_PER_ARRAY];
+ unsigned short num, start;
+};
+
+struct ext4_wb_control {
+ pgoff_t start;
+ int len, extents;
+ int blocks_to_release;
+ struct ext4_wb_pages *pages;
+ struct list_head list;
+ struct address_space *mapping;
+};
+
+
+void ext4_wb_invalidatepage(struct page *, unsigned long);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+
+
+static struct page * ext4_wb_pull_page(struct ext4_wb_control *wc)
+{
+ struct ext4_wb_pages *wp = wc->pages;
+
+ BUG_ON(wp == NULL);
+ BUG_ON(list_empty(&wc->list));
+ BUG_ON(list_empty(&wp->list));
+ if (wp->start == wp->num) {
+ list_del(&wp->list);
+ kfree(wp);
+ if (list_empty(&wc->list))
+ return NULL;
+ wp = list_entry(wc->list.next, struct ext4_wb_pages, list);
+ wc->pages = wp;
+ }
+ BUG_ON(list_empty(&wp->list));
+ return wp->pages[wp->start++];
+}
+
+static struct bio * ext4_wb_bio_alloc(struct inode *inode,
+ sector_t first_block, int nr_vecs)
+{
+ int gfp_flags = GFP_NOFS | __GFP_HIGH;
+ struct bio *bio;
+ int maxreq;
+
+ maxreq = bio_get_nr_vecs(inode->i_sb->s_bdev);
+ if (maxreq < nr_vecs)
+ nr_vecs = maxreq;
+
+ bio = bio_alloc(gfp_flags, nr_vecs);
+
+ if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+ while (!bio && (nr_vecs /= 2))
+ bio = bio_alloc(gfp_flags, nr_vecs);
+ }
+
+ if (bio) {
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_sector = first_block << (inode->i_blkbits - 9);
+ }
+ return bio;
+}
+
+static int ext4_wb_end_io(struct bio *bio, unsigned int bytes, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+ if (bio->bi_size)
+ return 1;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (!uptodate)
+ SetPageError(page);
+ end_page_writeback(page);
+ } while (bvec >= bio->bi_io_vec);
+ bio_put(bio);
+ return 0;
+}
+
+static struct bio *ext4_wb_bio_submit(struct bio *bio, handle_t *handle)
+{
+ bio->bi_end_io = ext4_wb_end_io;
+ submit_bio(WRITE, bio);
+ return NULL;
+}
+
+int inline ext4_wb_reserve_space_page(struct page *page, int blocks)
+{
+ struct inode *inode = page->mapping->host;
+ int total, mdb, err;
+
+ wb_debug("reserve %d blocks for page %lu from inode %lu\n",
+ blocks, page->index, inode->i_ino);
+
+ /* user wants us to reserve blocks for his file. reserving space
+ * for his (data) blocks isn't enough because adding block may
+ * involve allocation index/leaf blocks for tree/blockmap.
+ * so, we need to calculate numbers of needed metadata for worst
+ * case: block per extent */
+
+ spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
+ total = EXT4_I(inode)->i_blocks_reserved + blocks;
+ mdb = ext4_ext_calc_metadata_amount(inode, total);
+
+ /* if blockmap needs more metadata, we have to reserve difference */
+ BUG_ON(mdb < EXT4_I(inode)->i_md_reserved);
+ mdb = mdb - EXT4_I(inode)->i_md_reserved;
+
+ err = ext4_reserve_blocks(inode->i_sb, mdb + blocks);
+ if (err) {
+ /* blocks are exhausted? */
+ spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
+ return err;
+ }
+
+ /* blocks have been reserved, account this. I believe
+ * inode's fields are protected by inode->i_sem */
+ EXT4_I(inode)->i_blocks_reserved += blocks;
+ EXT4_I(inode)->i_md_reserved += mdb;
+ spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
+
+ /* we have reserved space on a disk for the page */
+ SetPageBooked(page);
+ return 0;
+}
+
+/*
+ * release space reserved for @blocks of data
+ * @used signals that @blocks got really allocated and we just
+ * need to release corresponded over-reserved metadata
+ */
+int inline ext4_wb_release_space(struct inode *inode, int blocks, int used)
+{
+ int total, mdb, release;
+
+ spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
+
+ total = EXT4_I(inode)->i_blocks_reserved - blocks;
+ mdb = ext4_ext_calc_metadata_amount(inode, total);
+
+ /* if blockmap needs lesser metadata, we may release difference */
+ BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
+ mdb = EXT4_I(inode)->i_md_reserved - mdb;
+
+ release = mdb;
+ /* drop reservation only for non-used blocks */
+ if (!used)
+ release += blocks;
+ wb_debug("%u %s: release %d/%d blocks from %u/%u reserved for inode %lu\n",
+ blocks, used ? "allocated" : "dropped", used ? 0 : blocks,
+ mdb, EXT4_I(inode)->i_blocks_reserved,
+ EXT4_I(inode)->i_md_reserved, inode->i_ino);
+ if (release)
+ ext4_release_blocks(inode->i_sb, release);
+
+ /* update per-inode reservations */
+ BUG_ON(blocks > EXT4_I(inode)->i_blocks_reserved);
+ EXT4_I(inode)->i_blocks_reserved -= blocks;
+ BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
+ EXT4_I(inode)->i_md_reserved -= mdb;
+
+ spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
+
+ return 0;
+}
+
+static inline int ext4_wb_drop_page_reservation(struct page *page)
+{
+ /* we just allocated blocks for this page. those blocks (and
+ * probably metadata for them) were reserved before. now we
+ * should drop reservation mark from the page. if we didn't
+ * do that then ->invalidatepage() may think page still holds
+ * reserved blocks. we could release reserved blocks right
+ * now, but I'd prefer to make this once per several blocks */
+ wb_debug("drop reservation from page %lu from inode %lu\n",
+ page->index, page->mapping->host->i_ino);
+ BUG_ON(!PageBooked(page));
+ ClearPageBooked(page);
+ return 0;
+}
+
+static int ext4_wb_submit_extent(struct ext4_wb_control *wc, handle_t *handle,
+ struct ext4_extent *ex, int new)
+{
+ struct inode *inode = wc->mapping->host;
+ int blkbits = inode->i_blkbits;
+ struct page *page;
+ unsigned long blk, off, len, remain;
+ unsigned long pstart, plen, prev;
+ struct bio *bio = NULL;
+ int nr_pages;
+
+ /*
+ * we have list of pages in wc and block numbers in ex
+ * let's cook bios from them and start real I/O
+ */
+
+ BUG_ON(PAGE_CACHE_SHIFT < blkbits);
+ BUG_ON(list_empty(&wc->list));
+
+ wb_debug("cook and submit bios for %u/%u/%u for %lu/%u\n",
+ ex->ee_block, ex->ee_len, ex->ee_start, wc->start, wc->len);
+
+ blk = ex->ee_block;
+ remain = ex->ee_len;
+ wc->extents++;
+
+ while (remain) {
+ page = ext4_wb_pull_page(wc);
+ if (page == NULL)
+ break;
+
+ pstart = page->index << (PAGE_CACHE_SHIFT - blkbits);
+ plen = PAGE_SIZE >> blkbits;
+ if (pstart > blk) {
+ /* probably extent covers long space and page
+ * to be written in the middle of it */
+ BUG_ON(pstart - blk >= remain);
+ remain -= pstart - blk;
+ blk = pstart;
+ }
+ BUG_ON(blk < pstart || blk >= pstart + plen);
+
+ BUG_ON(!PageUptodate(page));
+ /* page can get here via mmap(2)
+ * BUG_ON(!PagePrivate(page));*/
+ BUG_ON(new && PageMappedToDisk(page));
+ BUG_ON(!new && !PageMappedToDisk(page));
+ SetPageMappedToDisk(page);
+ if (new && PagePrivate(page)) {
+ /* space is just allocated and it was reserved in
+ * ->commit_write(). time to release reservation.
+ * space may not be reserved if page gets dirty
+ * via mmap. should we reserve it in ->mmap() ? */
+ prev = min(plen, remain);
+ ext4_wb_drop_page_reservation(page);
+ wc->blocks_to_release += prev;
+ }
+
+alloc_new_bio:
+ if (bio == NULL) {
+ /* +2 because head/tail may belong to different pages */
+ nr_pages = (ex->ee_len - (blk - ex->ee_block));
+ nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
+ off = ex->ee_start + (blk - ex->ee_block);
+ bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
+ if (bio == NULL)
+ return -ENOMEM;
+ }
+
+ off = (blk - pstart) << blkbits;
+ prev = min(plen, remain);
+ len = prev << blkbits;
+ if (bio_add_page(bio, page, len, off) < len) {
+ bio = ext4_wb_bio_submit(bio, handle);
+ goto alloc_new_bio;
+ }
+ remain -= prev;
+ blk += prev;
+ if (blk < pstart + plen) {
+ /* extent covers part of the page only.
+ * it's possible that next extent covers
+ * the tail. so, we leave page */
+ printk("blk %lu pstart %lu plen %lu remain %lu prev %lu\n",
+ blk, pstart, plen, remain, prev);
+ wc->pages->start--;
+ BUG_ON(remain != 0);
+ }
+ }
+ if (bio)
+ ext4_wb_bio_submit(bio, handle);
+ BUG_ON(new && remain != 0);
+ return 0;
+}
+
+static ext4_fsblk_t
+ext4_wb_find_goal(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t block)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_fsblk_t bg_start;
+ unsigned long colour;
+ int depth;
+
+ if (path) {
+ struct ext4_extent *ex;
+ depth = path->p_depth;
+
+ /* try to predict block placement */
+ if ((ex = path[depth].p_ext))
+ return ex->ee_start + (block - ex->ee_block);
+
+ /* it looks index is empty
+ * try to find starting from index itself */
+ if (path[depth].p_bh)
+ return path[depth].p_bh->b_blocknr;
+ }
+
+ /* OK. use inode's group */
+ bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ return bg_start + colour + block;
+}
+
+static int ext4_wb_handle_extent(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_ext_cache *ec,
+ void *cbdata)
+{
+ struct ext4_wb_control *wc = cbdata;
+ struct super_block *sb = inode->i_sb;
+ ext4_fsblk_t goal, pblock;
+ unsigned long tgen, count;
+ struct ext4_extent nex;
+ loff_t new_i_size;
+ handle_t *handle;
+ int i, err;
+
+ if (ec->ec_type == EXT4_EXT_CACHE_EXTENT) {
+ /*
+ * The extent is already allocated. The only thing
+ * we have to do is to flush correspondend pages.
+ */
+ wb_debug("extent %u/%u/%u exist\n",
+ (unsigned) ec->ec_block,
+ (unsigned) ec->ec_len,
+ (unsigned) ec->ec_start);
+ nex.ee_start = ec->ec_start;
+ nex.ee_block = ec->ec_block;
+ nex.ee_len = ec->ec_len;
+ err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
+
+ /* correct on-disk size, if we grow within
+ * already allocated block */
+ new_i_size = (loff_t) nex.ee_block + nex.ee_len;
+ new_i_size = new_i_size << inode->i_blkbits;
+ if (new_i_size > i_size_read(inode))
+ new_i_size = i_size_read(inode);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ EXT4_I(inode)->i_disksize = new_i_size;
+ ext4_dirty_inode(inode);
+ }
+ return err;
+ }
+
+ wb_debug("extent %u/%u DOES NOT exist\n", ec->ec_block, ec->ec_len);
+
+ /* space for some pages we want to flush hasn't allocated
+ * yet. so, it's time to allocate space */
+ tgen = EXT4_I(inode)->i_ext_generation;
+ count = ext4_ext_calc_credits_for_insert(inode, path);
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+ handle = ext4_journal_start(inode, count + EXT4_DATA_TRANS_BLOCKS(sb) + 1);
+ if (IS_ERR(handle)) {
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ return PTR_ERR(handle);
+ }
+
+ /* FIXME: we could analyze current path and advice allocator
+ * to find additional blocks if goal can't be allocated
+ * this is for better interaction between extents and mballoc
+ * plus this should improve overall performance */
+
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ if (tgen != EXT4_I(inode)->i_ext_generation) {
+ /* the tree has changed. so path can be invalid at moment */
+ ext4_journal_stop(handle);
+ return EXT_REPEAT;
+ }
+
+ goal = ext4_wb_find_goal(inode, path, ec->ec_block);
+ count = ec->ec_len;
+
+ /* if this is a tail of closed file, ask allocator don't preallocate */
+ new_i_size = i_size_read(inode) + sb->s_blocksize - 1;
+ new_i_size = new_i_size >> inode->i_blkbits;
+ if (ec->ec_block + count == new_i_size &&
+ !atomic_read(&inode->i_writecount)) {
+ /* XXX: disable preallocation for tail */
+ }
+
+ /* this is a hack to tell the allocator that blocks
+ * we are going to allocated are already reserved */
+ EXT4_I(inode)->i_state |= EXT4_STATE_BLOCKS_RESERVED;
+ pblock = ext4_new_blocks(handle, inode, goal, &count, &err);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_BLOCKS_RESERVED;
+
+ if (!pblock)
+ goto out;
+
+ BUG_ON(count > ec->ec_len);
+ BUG_ON(count == 0);
+ wb_debug("allocated %llu/%lu for %lu (asked %u)\n",
+ pblock, count, inode->i_ino, ec->ec_len);
+
+ /* insert new extent */
+ nex.ee_start = pblock;
+ nex.ee_start_hi = 0;
+ nex.ee_len = count;
+ nex.ee_block = ec->ec_block;
+ err = ext4_ext_insert_extent(handle, inode, path, &nex);
+ if (err)
+ goto out;
+
+ /*
+ * Putting len of the actual extent we just inserted,
+ * we are asking ext4_ext_walk_space() to continue
+ * scaning after that block
+ */
+ ec->ec_len = nex.ee_len;
+ BUG_ON(nex.ee_len == 0);
+
+#ifdef EXT4_WB_STATS
+ atomic_add(nex.ee_len, &EXT4_SB(inode->i_sb)->s_wb_allocated);
+#endif
+
+ wb_debug("inserted %lu/%lu/%lu for %lu (asked %u)\n",
+ (unsigned long) nex.ee_block, (unsigned long) nex.ee_len,
+ (unsigned long) nex.ee_start, inode->i_ino, ec->ec_len);
+
+ /*
+ * Important! The nex can change after insert. So do not
+ * use ec for following
+ */
+
+ /* block have been allocated for data, so time to drop dirty
+ * in correspondend buffer_heads to prevent corruptions */
+ for (i = 0; i < nex.ee_len; i++)
+ unmap_underlying_metadata(sb->s_bdev, nex.ee_start + i);
+
+ /* correct on-disk inode size */
+ if (nex.ee_len > 0) {
+ new_i_size = (loff_t) nex.ee_block + nex.ee_len;
+ new_i_size = new_i_size << inode->i_blkbits;
+ if (new_i_size > i_size_read(inode))
+ new_i_size = i_size_read(inode);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ EXT4_I(inode)->i_disksize = new_i_size;
+ err = ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+
+ if (ext4_should_order_data(inode))
+ err = ext4_wb_submit_extent(wc, handle, &nex, 1);
+ else
+ err = ext4_wb_submit_extent(wc, NULL, &nex, 1);
+
+ /* we don't want to recalculate needed reservation for
+ * each page. we may do this for each new extent */
+ ext4_wb_release_space(inode, wc->blocks_to_release, 1);
+ wc->blocks_to_release = 0;
+
+out:
+ ext4_journal_stop(handle);
+ if (err)
+ printk("EXT4-fs: writeback error = %d\n", err);
+ return err;
+}
+
+static int ext4_wb_flush(struct ext4_wb_control *wc)
+{
+ struct list_head *cur, *tmp;
+ struct inode *inode;
+ int err, num = 0;
+
+ if (wc->len == 0)
+ return 0;
+
+ inode = wc->mapping->host;
+ wb_debug("start flushing %lu/%u from inode %lu\n",
+ wc->start, wc->len, inode->i_ino);
+
+ wc->pages = list_entry(wc->list.next, struct ext4_wb_pages, list);
+ wc->extents = 0;
+
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ /* FIXME: last page may be partial */
+ err = ext4_ext_walk_space(inode, wc->start, wc->len,
+ ext4_wb_handle_extent, wc);
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+ list_for_each_safe(cur, tmp, &wc->list) {
+ struct ext4_wb_pages *wp;
+ wp = list_entry(cur, struct ext4_wb_pages, list);
+ if (err) {
+ while (wp->start < wp->num) {
+ struct page *page = wp->pages[wp->start];
+ BUG_ON(!PageWriteback(page));
+ end_page_writeback(page);
+ __set_page_dirty_nobuffers(page);
+ wp->start++;
+ }
+ } else {
+ BUG_ON(num != 0);
+ BUG_ON(wp->start != wp->num - 1 &&
+ wp->start != wp->num);
+ }
+ list_del(&wp->list);
+ kfree(wp);
+ num++;
+ }
+ wc->pages = NULL;
+ wc->len = 0;
+ wc->extents = 0;
+
+ return err;
+}
+
+static int ext4_wb_add_page(struct ext4_wb_control *wc, struct page *page)
+{
+ struct ext4_wb_pages * wp = wc->pages;
+
+ if (wp == NULL || wp->num == WB_PAGES_PER_ARRAY) {
+ wp = kmalloc(sizeof(struct ext4_wb_pages), GFP_NOFS);
+ if (wp == NULL) {
+ printk("no mem for ext4_wb_pages!\n");
+ return -ENOMEM;
+ }
+ wp->num = 0;
+ wp->start = 0;
+ list_add_tail(&wp->list, &wc->list);
+ wc->pages = wp;
+ }
+
+ wp->pages[wp->num] = page;
+ wp->num++;
+
+ return 0;
+}
+
+static inline void
+ext4_wb_init_control(struct ext4_wb_control *wc, struct address_space *mapping)
+{
+ wc->mapping = mapping;
+ wc->len = 0;
+ wc->blocks_to_release = 0;
+ INIT_LIST_HEAD(&wc->list);
+ wc->pages = NULL;
+}
+
+static inline int
+ext4_wb_can_merge(struct ext4_wb_control *wc, unsigned long next)
+{
+ if (wc->start + wc->len == next &&
+ wc->len <= WB_MAX_PAGES_PER_EXTENT)
+ return 1;
+ return 0;
+}
+
+int ext4_wb_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct inode *inode = mapping->host;
+ int nr_pages, i, err = 0, done = 0;
+ struct ext4_wb_control wc;
+ struct pagevec pvec;
+ pgoff_t index = 0;
+ int written = 0;
+ int extents = 0;
+ pgoff_t pindex = 0;
+
+ wb_debug("->writepages on inode %lu (%u reserved)\n",
+ inode->i_ino, EXT4_I(inode)->i_blocks_reserved);
+#ifdef EXT4_WB_SKIP_SMALL
+ if (wbc->nr_to_write <= 64 && wbc->sync_mode == WB_SYNC_NONE)
+ return 0;
+#endif
+ atomic_inc(&EXT4_I(inode)->i_wb_writers);
+#ifdef EXT4_WB_STATS
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_reqs);
+ atomic_add(wbc->nr_to_write, &EXT4_SB(inode->i_sb)->s_wb_nr_to_write);
+ if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions);
+#endif
+
+ /* skip opened-for-write small files
+ * XXX: what do we do if most of files hit the condition? */
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ atomic_read(&inode->i_writecount) &&
+ i_size_read(inode) <= 64*1024) {
+ return 0;
+ }
+
+ ext4_wb_init_control(&wc, mapping);
+
+ pagevec_init(&pvec, 0);
+ while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ lock_page(page);
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+ if (PageWriteback(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (wc.len && ext4_wb_can_merge(&wc, page->index) &&
+ wbc->nr_to_write <= 0) {
+ /*
+ * If we already exhausted blocks we got
+ * to write and new extent starts, stop
+ * writeback
+ */
+ unlock_page(page);
+ done = 1;
+ break;
+
+ }
+
+ if (!clear_page_dirty_for_io(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ set_page_writeback(page);
+ unlock_page(page);
+
+ if (wc.len == 0) {
+ wc.start = page->index;
+ wc.len = 1;
+ extents++;
+ } else if (ext4_wb_can_merge(&wc, page->index)) {
+ wc.len++;
+ } else {
+ /* end of current extent: flush it ... */
+#if 0
+ if (wc.len < 64 && wc.len > 0) {
+ printk("#%u: wow! short extent %d for flush on #%lu\n",
+ (unsigned) current->pid, wc.len, inode->i_ino);
+ printk("#%u: done = %d, nr_to_write %ld, sync = %d\n",
+ (unsigned) current->pid, done, wbc->nr_to_write,
+ wbc->sync_mode);
+ printk("#%u: written %d, extents %d\n",
+ (unsigned) current->pid, written, extents);
+ printk("#%u: cur %lu, prev %lu\n",
+ (unsigned) current->pid,
+ (unsigned long) page->index,
+ (unsigned long) pindex);
+ }
+#endif
+ err = ext4_wb_flush(&wc);
+ if (err) {
+ done = 1;
+ end_page_writeback(page);
+ break;
+ }
+
+ /* ... and start new one */
+ BUG_ON(!PageWriteback(page));
+ wc.start = page->index;
+ wc.len = 1;
+ extents++;
+ }
+
+ pindex = page->index;
+ err = ext4_wb_add_page(&wc, page);
+ if (err) {
+ done = 1;
+ end_page_writeback(page);
+ break;
+ }
+ written++;
+
+ wbc->nr_to_write--;
+#if 0
+ if ((--(wbc->nr_to_write) <= 0))
+ done = 1;
+#endif
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+#ifdef EXT4_WB_STATS
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_congested);
+#endif
+ wbc->encountered_congestion = 1;
+ done = 1;
+ }
+ }
+ pagevec_release(&pvec);
+ }
+ if (!err) {
+#ifdef EXT4_WB_SKIP_SMALL
+ if (wc.len > 0 && wc.len < 64 && wbc->sync_mode == WB_SYNC_NONE) {
+ struct list_head *cur, *tmp;
+ list_for_each_safe(cur, tmp, &wc.list) {
+ struct ext4_wb_pages *wp;
+ wp = list_entry(cur, struct ext4_wb_pages, list);
+ for (i = wp->start; i < wp->num; i++) {
+ struct page *page = wp->pages[i];
+ BUG_ON(!PageWriteback(page));
+ end_page_writeback(page);
+ __set_page_dirty_nobuffers(page);
+ }
+ wbc->nr_to_write += i;
+ list_del(&wp->list);
+ kfree(wp);
+ }
+ } else
+#endif
+ ext4_wb_flush(&wc);
+ }
+
+ atomic_dec(&EXT4_I(inode)->i_wb_writers);
+
+#ifdef EXT4_WB_STATS
+ atomic_add(written, &EXT4_SB(inode->i_sb)->s_wb_blocks);
+ atomic_add(extents, &EXT4_SB(inode->i_sb)->s_wb_extents);
+#endif
+ return 0;
+}
+
+static void ext4_wb_clear_page(struct page *page, int from, int to)
+{
+ void *kaddr;
+
+ if (to < PAGE_CACHE_SIZE || from > 0) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (PAGE_CACHE_SIZE > to)
+ memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+ if (0 < from)
+ memset(kaddr, 0, from);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+int ext4_wb_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head bh, *bhw = &bh;
+ int err = 0;
+
+ wb_debug("prepare page %lu (%u-%u) for inode %lu\n",
+ page->index, from, to, page->mapping->host->i_ino);
+
+ /* if page is uptodate this means that ->prepare_write() has
+ * been called on page before and page is mapped to disk or
+ * we did reservation. page is protected and nobody can
+ * access it. hence, it safe to use page->private to pass
+ * flag that ->commit_write() has to reserve blocks. because
+ * an error may occur after ->prepare_write() we should not
+ * reserve block here. it's better to do in ->commit_write()
+ * when we're sure page is to be written */
+ page->private = 0;
+ if (!PageUptodate(page)) {
+ /* first write to this page */
+ bh.b_state = 0;
+ err = ext4_get_block(inode, page->index, bhw, 0);
+ if (err)
+ return err;
+ if (!buffer_mapped(bhw)) {
+ /* this block isn't allocated yet, reserve space */
+ wb_debug("reserve space for new block\n");
+ page->private = 1;
+ ext4_wb_clear_page(page, from, to);
+ ClearPageMappedToDisk(page);
+ } else {
+ /* block is already mapped, so no need to reserve */
+ BUG_ON(PagePrivate(page));
+ if (to - from < PAGE_CACHE_SIZE) {
+ wb_debug("read block %u\n",
+ (unsigned) bhw->b_blocknr);
+ set_bh_page(bhw, page, 0);
+ bhw->b_this_page = 0;
+ bhw->b_size = 1 << inode->i_blkbits;
+ atomic_set(&bhw->b_count, 1);
+ ll_rw_block(READ, 1, &bhw);
+ wait_on_buffer(bhw);
+ if (!buffer_uptodate(bhw))
+ return -EIO;
+ }
+ SetPageMappedToDisk(page);
+ }
+ } else if (!PageMappedToDisk(page) && !PagePrivate(page)) {
+ /* this page was a hole at time of mmap() calling
+ * now someone wants to modify it by sys_write() */
+ wb_debug("reserve block for hole\n");
+ page->private = 1;
+ }
+
+ return 0;
+}
+
+int ext4_wb_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ struct inode *inode = page->mapping->host;
+ int err = 0;
+
+ wb_debug("commit page %lu (%u-%u) for inode %lu\n",
+ page->index, from, to, inode->i_ino);
+
+ /* mark page private so that we get
+ * called to invalidate/release page */
+ SetPagePrivate(page);
+
+ if (!PageBooked(page) && !PageMappedToDisk(page)) {
+ /* ->prepare_write() observed that block for this
+ * page hasn't been allocated yet. there fore it
+ * asked to reserve block for later allocation */
+ BUG_ON(page->private == 0);
+ page->private = 0;
+ err = ext4_wb_reserve_space_page(page, 1);
+ if (err)
+ return err;
+ }
+
+ /* ok. block for this page is allocated already or it has
+ * been reserved succesfully. so, user may use it */
+ __set_page_dirty_nobuffers(page);
+
+ SetPageUptodate(page);
+
+ /* correct in-core size, on-disk size will
+ * be corrected upon allocation */
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+
+ return err;
+}
+
+int ext4_wb_write_single_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct ext4_wb_control wc;
+ int err;
+
+ atomic_inc(&EXT4_I(inode)->i_wb_writers);
+
+#ifdef EXT4_WB_STATS
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_single_pages);
+ if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions_sp);
+#endif
+
+ ext4_wb_init_control(&wc, page->mapping);
+
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+
+ wc.start = page->index;
+ wc.len = 1;
+
+ err = ext4_wb_add_page(&wc, page);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: cant add page at %s:%d - %d\n",
+ __FILE__, __LINE__, err);
+ end_page_writeback(page);
+ return err;
+ }
+ err = ext4_wb_flush(&wc);
+ atomic_dec(&EXT4_I(inode)->i_wb_writers);
+
+ return err;
+}
+
+int ext4_wb_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+ void *kaddr;
+
+ wb_debug("writepage %lu from inode %lu\n", page->index, inode->i_ino);
+
+ /*
+ * FIXME: just to play ...
+ * If another thread is writing inode's data and the page
+ * hasn't space on a disk yet, leave it for that thread
+ */
+#if 1
+ if (atomic_read(&EXT4_I(page->mapping->host)->i_wb_writers)
+ && !PageMappedToDisk(page)) {
+ __set_page_dirty_nobuffers(page);
+ unlock_page(page);
+ return 0;
+ }
+#endif
+
+ /* we give up here if we're reentered, because
+ * it might be for a different filesystem */
+ if (ext4_journal_current_handle()) {
+ __set_page_dirty_nobuffers(page);
+ unlock_page(page);
+ return 0;
+ }
+
+ /* Is the page fully inside i_size? */
+ if (page->index < end_index)
+ return ext4_wb_write_single_page(page, wbc);
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index >= end_index + 1 || !offset) {
+ /*
+ * The page may have dirty, unmapped buffers. For example,
+ * they may have been added in ext4_writepage(). Make them
+ * freeable here, so the page does not leak.
+ */
+ ext4_wb_invalidatepage(page, 0);
+ unlock_page(page);
+ return 0; /* don't care */
+ }
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ return ext4_wb_write_single_page(page, wbc);
+}
+
+int ext4_wb_releasepage(struct page *page, gfp_t wait)
+{
+ wb_debug("release %sM%sR page %lu from inode %lu (wait %d)\n",
+ PageMappedToDisk(page) ? "" : "!",
+ PageBooked(page) ? "" : "!",
+ page->index, page->mapping->host->i_ino, wait);
+
+ if (PageWriteback(page))
+ return 0;
+
+ if (PagePrivate(page))
+ ClearPagePrivate(page);
+ return 0;
+}
+
+void ext4_wb_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ int ret = 0;
+
+ /* ->invalidatepage() is called when page is marked Private.
+ * for our page being Private mean that space has been
+ * reserved for this page and it is being truncated. so,
+ * it's time to drop reservation */
+ wb_debug("invalidate %sM%sR page %lu from inode %lu (offset %lu)\n",
+ PageMappedToDisk(page) ? "" : "!",
+ PageBooked(page) ? "" : "!",
+ page->index, inode->i_ino, offset);
+
+ if (offset == 0) {
+ if (PageBooked(page)) {
+ atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_dropped);
+ ext4_wb_release_space(inode, 1, 0);
+ ext4_wb_drop_page_reservation(page);
+ }
+ ret = try_to_release_page(page, 0);
+ }
+ return;
+}
+
+int ext4_wb_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ struct inode *inode = mapping->host;
+ struct buffer_head bh, *bhw = &bh;
+ unsigned blocksize, length;
+ void *kaddr;
+ int err = 0;
+
+ wb_debug("partial truncate from %lu on page %lu from inode %lu\n",
+ (unsigned long) from, page->index, inode->i_ino);
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ /* if page isn't uptodate we have to check has it assigned block
+ * if it has then that block is to be read before memset() */
+ if (!PageUptodate(page)) {
+ BUG_ON(PageMappedToDisk(page));
+ bh.b_state = 0;
+ err = ext4_get_block(inode, page->index, bhw, 0);
+ if (err)
+ goto err_out;
+ BUG_ON(buffer_new(bhw));
+ if (buffer_mapped(bhw)) {
+ /* time to retrieve data from a disk */
+ wb_debug("read block %u for part.trunc on %lu\n",
+ (unsigned) bhw->b_blocknr, page->index);
+ set_bh_page(bhw, page, 0);
+ bhw->b_this_page = 0;
+ bhw->b_size = 1 << inode->i_blkbits;
+ atomic_set(&bhw->b_count, 1);
+ ll_rw_block(READ, 1, &bhw);
+ wait_on_buffer(bhw);
+ err = -EIO;
+ if (!buffer_uptodate(bhw))
+ goto err_out;
+ SetPageMappedToDisk(page);
+ } else {
+ wb_debug("zero page %lu (part.trunc)\n", page->index);
+ offset = 0;
+ length = blocksize;
+ }
+ }
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ SetPageUptodate(page);
+ __set_page_dirty_nobuffers(page);
+
+err_out:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+void ext4_wb_init(struct super_block *sb)
+{
+ if (!test_opt(sb, DELAYED_ALLOC))
+ return;
+
+ if (PAGE_CACHE_SHIFT != sb->s_blocksize_bits) {
+ printk(KERN_ERR "EXT4-fs: delayed allocation isn't"
+ "supported for PAGE_CACHE_SIZE != blocksize yet\n");
+ clear_opt (EXT4_SB(sb)->s_mount_opt, DELAYED_ALLOC);
+ return;
+ }
+ printk("EXT4-fs: delayed allocation enabled\n");
+}
+
+void ext4_wb_release(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt(sb, DELAYED_ALLOC))
+ return;
+
+#ifdef EXT4_WB_STATS
+ if (atomic_read(&sbi->s_wb_reqs) == 0)
+ return;
+
+ printk("EXT4-fs: writeback: %d blocks %d extents in %d reqs (%d ave)\n",
+ atomic_read(&sbi->s_wb_blocks),
+ atomic_read(&sbi->s_wb_extents),
+ atomic_read(&sbi->s_wb_reqs),
+ atomic_read(&sbi->s_wb_blocks) / atomic_read(&sbi->s_wb_reqs));
+ printk("EXT4-fs: writeback: %d nr_to_write, %d congestions, %d singles\n",
+ atomic_read(&sbi->s_wb_nr_to_write),
+ atomic_read(&sbi->s_wb_congested),
+ atomic_read(&sbi->s_wb_single_pages));
+ printk("EXT4-fs: writeback: %d collisions, %d single-page collisions\n",
+ atomic_read(&sbi->s_wb_collisions),
+ atomic_read(&sbi->s_wb_collisions_sp));
+ printk("EXT4-fs: writeback: %d allocated, %d dropped\n",
+ atomic_read(&sbi->s_wb_allocated),
+ atomic_read(&sbi->s_wb_dropped));
+#endif
+}
+
Index: linux-2.6.20-rc1/fs/ext4/file.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/file.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/file.c 2006-12-22 22:56:04.000000000 +0300
@@ -35,8 +35,8 @@ static int ext4_release_file (struct ino
{
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
- {
+ (atomic_read(&inode->i_writecount) == 1) &&
+ EXT4_I(inode)->i_blocks_reserved == 0) {
mutex_lock(&EXT4_I(inode)->truncate_mutex);
ext4_discard_reservation(inode);
mutex_unlock(&EXT4_I(inode)->truncate_mutex);
Index: linux-2.6.20-rc1/fs/ext4/inode.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/inode.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/inode.c 2006-12-22 22:56:04.000000000 +0300
@@ -943,7 +943,7 @@ out:

#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)

-static int ext4_get_block(struct inode *inode, sector_t iblock,
+int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
handle_t *handle = journal_current_handle();
@@ -1807,9 +1807,34 @@ static const struct address_space_operat
.releasepage = ext4_releasepage,
};

+static int ext4_wb_set_page_dirty(struct page *page)
+{
+ return __set_page_dirty_nobuffers(page);
+}
+
+static struct address_space_operations ext4_writeback_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_wb_writepage,
+ .writepages = ext4_wb_writepages,
+ .sync_page = block_sync_page,
+ .prepare_write = ext4_wb_prepare_write,
+ .commit_write = ext4_wb_commit_write,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_wb_invalidatepage,
+ .releasepage = ext4_wb_releasepage,
+ .set_page_dirty = ext4_wb_set_page_dirty,
+ .direct_IO = ext4_direct_IO,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (S_ISREG(inode->i_mode) &&
+ (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ test_opt(inode->i_sb, EXTENTS) &&
+ test_opt(inode->i_sb, DELAYED_ALLOC))
+ inode->i_mapping->a_ops = &ext4_writeback_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
@@ -1834,6 +1859,11 @@ int ext4_block_truncate_page(handle_t *h
int err = 0;
void *kaddr;

+ if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ test_opt(inode->i_sb, EXTENTS) &&
+ test_opt(inode->i_sb, DELAYED_ALLOC))
+ return ext4_wb_block_truncate_page(handle, page, mapping, from);
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);

2006-12-22 21:09:50

by Alex Tomas

[permalink] [raw]
Subject: [RFC] ext4-block-reservation.patch



Index: linux-2.6.20-rc1/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs.h 2006-12-22 20:21:12.000000000 +0300
@@ -201,6 +201,7 @@ struct ext4_group_desc
#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
+#define EXT4_STATE_BLOCKS_RESERVED 0x00000008 /* blocks reserved */

/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -808,6 +809,10 @@ extern struct ext4_group_desc * ext4_get
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern void ext4_init_block_alloc_info(struct inode *);
extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
+int ext4_reserve_init(struct super_block *sb);
+void ext4_reserve_release(struct super_block *sb);
+void ext4_release_blocks(struct super_block *sb, int blocks);
+int ext4_reserve_blocks(struct super_block *sb, int blocks);

/* dir.c */
extern int ext4_check_dir_entry(const char *, struct inode *,
Index: linux-2.6.20-rc1/include/linux/ext4_fs_sb.h
===================================================================
--- linux-2.6.20-rc1.orig/include/linux/ext4_fs_sb.h 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/include/linux/ext4_fs_sb.h 2006-12-22 20:20:10.000000000 +0300
@@ -24,6 +24,8 @@
#endif
#include <linux/rbtree.h>

+struct ext4_reservation_slot;
+
/*
* third extended-fs super-block data in memory
*/
@@ -65,6 +67,9 @@ struct ext4_sb_info {
struct rb_root s_rsv_window_root;
struct ext4_reserve_window_node s_rsv_window_head;

+ /* global reservation structures */
+ struct ext4_reservation_slot *s_reservation_slots;
+
/* Journaling */
struct inode * s_journal_inode;
struct journal_s * s_journal;
Index: linux-2.6.20-rc1/fs/ext4/super.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/super.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/super.c 2006-12-22 20:20:10.000000000 +0300
@@ -439,6 +439,7 @@ static void ext4_put_super (struct super
struct ext4_super_block *es = sbi->s_es;
int i;

+ ext4_reserve_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
@@ -1867,6 +1868,7 @@ static int ext4_fill_super (struct super
"writeback");

ext4_ext_init(sb);
+ ext4_reserve_init(sb);

lock_kernel();
return 0;
Index: linux-2.6.20-rc1/fs/ext4/balloc.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/balloc.c 2006-12-14 04:14:23.000000000 +0300
+++ linux-2.6.20-rc1/fs/ext4/balloc.c 2006-12-22 20:32:11.000000000 +0300
@@ -630,8 +630,10 @@ void ext4_free_blocks(handle_t *handle,
return;
}
ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
- if (dquot_freed_blocks)
+ if (dquot_freed_blocks) {
+ ext4_release_blocks(sb, dquot_freed_blocks);
DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+ }
return;
}

@@ -1440,7 +1442,7 @@ ext4_fsblk_t ext4_new_blocks(handle_t *h
struct ext4_sb_info *sbi;
struct ext4_reserve_window_node *my_rsv = NULL;
struct ext4_block_alloc_info *block_i;
- unsigned short windowsz = 0;
+ unsigned short windowsz = 0, reserved = 0;
#ifdef EXT4FS_DEBUG
static int goal_hits, goal_attempts;
#endif
@@ -1462,6 +1464,13 @@ ext4_fsblk_t ext4_new_blocks(handle_t *h
return 0;
}

+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_BLOCKS_RESERVED)) {
+ *errp = ext4_reserve_blocks(sb, num);
+ if (*errp)
+ return 0;
+ reserved = num;
+ }
+
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
ext4_debug("goal=%lu.\n", goal);
@@ -1674,8 +1683,11 @@ out:
/*
* Undo the block allocation
*/
- if (!performed_allocation)
+ if (!performed_allocation) {
DQUOT_FREE_BLOCK(inode, *count);
+ if (reserved)
+ ext4_release_blocks(sb, reserved);
+ }
brelse(bitmap_bh);
return 0;
}
@@ -1834,3 +1846,161 @@ unsigned long ext4_bg_num_gdb(struct sup
return ext4_bg_num_gdb_meta(sb,group);

}
+
+/*
+ * reservation.c contains routines to reserve blocks.
+ * we need this for delayed allocation, otherwise we
+ * could meet -ENOSPC at flush time
+ */
+
+/*
+ * as ->commit_write() where we're going to reserve
+ * non-allocated-yet blocks is well known hotpath,
+ * we have to make it scalable and avoid global
+ * data as much as possible
+ *
+ * there is per-sb array
+ */
+
+struct ext4_reservation_slot {
+ __u64 rs_reserved;
+ spinlock_t rs_lock;
+} ____cacheline_aligned;
+
+
+int ext4_reserve_local(struct super_block *sb, int blocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_reservation_slot *rs;
+ int rc = -ENOSPC;
+
+ preempt_disable();
+ rs = sbi->s_reservation_slots + smp_processor_id();
+
+ spin_lock(&rs->rs_lock);
+ if (likely(rs->rs_reserved >= blocks)) {
+ rs->rs_reserved -= blocks;
+ rc = 0;
+ }
+ spin_unlock(&rs->rs_lock);
+
+ preempt_enable();
+ return rc;
+}
+
+
+void ext4_rebalance_reservation(struct ext4_reservation_slot *rs, __u64 free)
+{
+ int i, used_slots = 0;
+ __u64 chunk;
+
+ /* let's know what slots have been used */
+ for (i = 0; i < NR_CPUS; i++)
+ if (rs[i].rs_reserved || i == smp_processor_id())
+ used_slots++;
+
+ /* chunk is a number of block every used
+ * slot will get. make sure it isn't 0 */
+ chunk = free + used_slots - 1;
+ do_div(chunk, used_slots);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (free < chunk)
+ chunk = free;
+ if (rs[i].rs_reserved || i == smp_processor_id()) {
+ rs[i].rs_reserved = chunk;
+ free -= chunk;
+ BUG_ON(free < 0);
+ }
+ }
+ BUG_ON(free);
+}
+
+int ext4_reserve_global(struct super_block *sb, int blocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_reservation_slot *rs;
+ int i, rc = -ENOENT;
+ __u64 free = 0;
+
+ rs = sbi->s_reservation_slots;
+
+ /* lock all slots */
+ for (i = 0; i < NR_CPUS; i++) {
+ spin_lock(&rs[i].rs_lock);
+ free += rs[i].rs_reserved;
+ }
+
+ if (free >= blocks) {
+ free -= blocks;
+ ext4_rebalance_reservation(rs, free);
+ rc = 0;
+ }
+
+ for (i = 0; i < NR_CPUS; i++)
+ spin_unlock(&rs[i].rs_lock);
+
+ return rc;
+}
+
+int ext4_reserve_blocks(struct super_block *sb, int blocks)
+{
+ int ret;
+
+ BUG_ON(blocks <= 0);
+
+ ret = ext4_reserve_local(sb, blocks);
+ if (likely(ret == 0))
+ return 0;
+
+ return ext4_reserve_global(sb, blocks);
+}
+
+void ext4_release_blocks(struct super_block *sb, int blocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_reservation_slot *rs;
+
+ BUG_ON(blocks <= 0);
+
+ preempt_disable();
+ rs = sbi->s_reservation_slots + smp_processor_id();
+
+ spin_lock(&rs->rs_lock);
+ rs->rs_reserved += blocks;
+ spin_unlock(&rs->rs_lock);
+
+ preempt_enable();
+}
+
+int ext4_reserve_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_reservation_slot *rs;
+ int i;
+
+ rs = kmalloc(sizeof(struct ext4_reservation_slot) * NR_CPUS, GFP_KERNEL);
+ if (rs == NULL)
+ return -ENOMEM;
+ sbi->s_reservation_slots = rs;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ spin_lock_init(&rs[i].rs_lock);
+ rs[i].rs_reserved = 0;
+ }
+ rs[0].rs_reserved = percpu_counter_sum(&sbi->s_freeblocks_counter);
+
+ return 0;
+}
+
+void ext4_reserve_release(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_reservation_slot *rs;
+
+ rs = sbi->s_reservation_slots;
+ BUG_ON(sbi->s_reservation_slots == NULL);
+ kfree(sbi->s_reservation_slots);
+ sbi->s_reservation_slots = NULL;
+}
+

2006-12-23 03:31:34

by David Chinner

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4

On Fri, Dec 22, 2006 at 11:20:08PM +0300, Alex Tomas wrote:
>
> Good day,
>
> probably the previous set of patches (including mballoc/lg)
> is too large. so, I reworked delayed allocation a bit so
> that it can be used on top of regular balloc, though it
> still can be used with extents-enabled files only.
>
> this time series contains just 3 patches:
>
> - booked-page-flag.patch
> adds PG_booked bit to page->flags. it's used in delayed
> allocation to mark space is already reserved for page
> (including possible metadata)

So that mean's we'll have 2 separate mechanisms for marking
pages as delalloc. XFS uses the BH_delay flag to indicate
that a buffer (block) attached to the page is using delalloc.

FWIW, how does this mechanism deal with block size < page size?
Don't you have to track delalloc on a block basis rather than
a page basis?

> - ext4-block-reservation.patch
> this is scalable free space management. every time we
> delay allocation of some page, a space (including metadata)
> should be reserved
>
> - ext4-delayed-allocation.patch
> delayed allocation itself, enabled by "delalloc" mount option.
> extents support is also required. currently it works only
> with blocksize=pagesize.

Ah, that's why you can get away with a page flag - you've ignored
the partial page delay state problem. Any plans to use the
existing method in the future so we will be able to use ext4 delalloc
on machines with a page size larger than 4k?

Cheers,

Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group

2006-12-23 09:27:23

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4

On Sat, Dec 23, 2006 at 02:31:23PM +1100, David Chinner wrote:
> > - ext4-delayed-allocation.patch
> > delayed allocation itself, enabled by "delalloc" mount option.
> > extents support is also required. currently it works only
> > with blocksize=pagesize.
>
> Ah, that's why you can get away with a page flag - you've ignored
> the partial page delay state problem. Any plans to use the
> existing method in the future so we will be able to use ext4 delalloc
> on machines with a page size larger than 4k?

I think fixing this up for blocksize < pagesize is an absolute requirement
to get things merged. We don't need more filesystems that are crippled
on half of our platforms.

Note that recording delayed alloc state at a page granularity in addition
to just the buffer heads has a lot of advantages aswell and would help
xfs, too. But I think it makes a lot more sense to record it as a radix
tree tag to speed up the gang lookups for delalloc conversion.

2006-12-23 19:10:09

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4


Good day,

>>>>> David Chinner (DC) writes:

DC> So that mean's we'll have 2 separate mechanisms for marking
DC> pages as delalloc. XFS uses the BH_delay flag to indicate
DC> that a buffer (block) attached to the page is using delalloc.

well, for blocksize=pagesize we can save 56 bytes on every page.

DC> FWIW, how does this mechanism deal with block size < page size?
DC> Don't you have to track delalloc on a block basis rather than
DC> a page basis?

I'm still thinking how better to deal with that w/o much code duplication.

DC> Ah, that's why you can get away with a page flag - you've ignored
DC> the partial page delay state problem. Any plans to use the
DC> existing method in the future so we will be able to use ext4 delalloc
DC> on machines with a page size larger than 4k?

what do you mean by "exsiting"? BH_delay?


thanks, Alex

2006-12-23 19:15:57

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4

>>>>> Christoph Hellwig (CH) writes:

CH> Note that recording delayed alloc state at a page granularity in addition
CH> to just the buffer heads has a lot of advantages aswell and would help
CH> xfs, too. But I think it makes a lot more sense to record it as a radix
CH> tree tag to speed up the gang lookups for delalloc conversion.

please, exaplein about radix tree tag. in ext4-delalloc I use this
bit the only way - to avoid multiple reservation space for same
page. I guess you need to find non-allocated pages. probably to
flush them and update number of reserved blocks in case of -ENOSPC?

thanks, Alex

2006-12-23 22:41:04

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC] ext4-block-reservation.patch

On Fri, 22 Dec 2006 23:25:16 +0300
Alex Tomas <[email protected]> wrote:

Once this code is settled in we should consider removal of the existing
reservations code from ext4.

> +
> +struct ext4_reservation_slot {
> + __u64 rs_reserved;
> + spinlock_t rs_lock;
> +} ____cacheline_aligned;

Should be ____cacheline_aligned_in_smp.

That's assuming it needs to be cacheline aligned at all. It can consume a
lot of space.

<looks>

oh, this should be allocated with alloc_percpu(), in which case the
open-coded alignment can perhaps go away.

> +
> +int ext4_reserve_local(struct super_block *sb, int blocks)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_reservation_slot *rs;
> + int rc = -ENOSPC;
> +
> + preempt_disable();
> + rs = sbi->s_reservation_slots + smp_processor_id();

use get_cpu() here.

> +void ext4_rebalance_reservation(struct ext4_reservation_slot *rs, __u64 free)
> +{
> + int i, used_slots = 0;
> + __u64 chunk;
> +
> + /* let's know what slots have been used */
> + for (i = 0; i < NR_CPUS; i++)
> + if (rs[i].rs_reserved || i == smp_processor_id())
> + used_slots++;
> +
> + /* chunk is a number of block every used
> + * slot will get. make sure it isn't 0 */
> + chunk = free + used_slots - 1;
> + do_div(chunk, used_slots);
> +
> + for (i = 0; i < NR_CPUS; i++) {

all these NR_CPUS loops need to go away. Use either
for_each_possible_cpu() or, preferably, for_each_online_cpu() and a hotplug
notifier.

Why is this code using per-cpu data at all, btw? These optimisations tend
to be marginal in filesystems. What is the perfomance impact of making
this data be single-superblock-wide-instance?

> +int ext4_reserve_init(struct super_block *sb)
> +{
> + struct ext4_sb_info *sbi = EXT4_SB(sb);
> + struct ext4_reservation_slot *rs;
> + int i;
> +
> + rs = kmalloc(sizeof(struct ext4_reservation_slot) * NR_CPUS, GFP_KERNEL);

alloc_percpu()

2006-12-23 22:59:10

by Andrew Morton

[permalink] [raw]
Subject: Re: [RFC] ext4-delayed-allocation.patch

On Fri, 22 Dec 2006 23:28:32 +0300
Alex Tomas <[email protected]> wrote:

> +/*
> + * With EXT4_WB_SKIP_SMALL defined the patch will try to avoid
> + * small I/Os ignoring ->writepages() if mapping hasn't enough
> + * contig. dirty pages
> + */
> +#define EXT4_WB_SKIP_SMALL__
> +
> +#define WB_ASSERT(__x__) if (!(__x__)) BUG();

This is unused. Please kill.

> +#define WB_DEBUG__
> +#ifdef WB_DEBUG
> +#define wb_debug(fmt,a...) printk(fmt, ##a);
> +#else
> +#define wb_debug(fmt,a...)
> +#endif

It's tiresome for each piece of kernel code to implement private debug
macros. Why not use pr_debug()?

In general, this patch adds a mountain of code which I suspect just
shouldn't be in a filesystem. It should be library code in mm/ or fs/.
It'll take some thought and refactoring and definition of new
address_space_operations entries, etc. But burying all this inside a
single filesystem is just The Wrong Thing To Do.

I am not inclined to review the code in detail because the lack of suitable
code comments would make that task much larger and significantly less
useful than it should be. Please spend a day or so commenting the code in
a similar manner to other parts of ext4 and jbd2. When doing so, put
yourself in the position of an experienced kernel developer who seeks to
understand what the code is doing and why it is doing it. Skilful
commenting is essential if the code is to be maintainable and it has an
immediate impact upon the code's quality. Every non-trivial function
should have an introductory comment which tells the reader why this
function exists, what it does and, if not obvious, how it does it. Don't
bother with kernel-doc comments - for some reason they tend to be useless
for code conprehension.

I shouldn't need to sit here scratching my head and wondering why the heck
a writepages function is running __set_page_dirty_nobuffers().

Thanks.

2006-12-23 23:13:18

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] ext4-block-reservation.patch


Hi,

>>>>> Andrew Morton (AM) writes:

AM> Should be ____cacheline_aligned_in_smp.

AM> That's assuming it needs to be cacheline aligned at all. It can consume a
AM> lot of space.

the idea is to make block reservation cheap because it's called
for every page.

AM> <looks>

AM> oh, this should be allocated with alloc_percpu(), in which case the
AM> open-coded alignment can perhaps go away.

got it.

>> +
>> +int ext4_reserve_local(struct super_block *sb, int blocks)
>> +{
>> + struct ext4_sb_info *sbi = EXT4_SB(sb);
>> + struct ext4_reservation_slot *rs;
>> + int rc = -ENOSPC;
>> +
>> + preempt_disable();
>> + rs = sbi->s_reservation_slots + smp_processor_id();

AM> use get_cpu() here.

ok.

>> +void ext4_rebalance_reservation(struct ext4_reservation_slot *rs, __u64 free)
>> +{
>> + int i, used_slots = 0;
>> + __u64 chunk;
>> +
>> + /* let's know what slots have been used */
>> + for (i = 0; i < NR_CPUS; i++)
>> + if (rs[i].rs_reserved || i == smp_processor_id())
>> + used_slots++;
>> +
>> + /* chunk is a number of block every used
>> + * slot will get. make sure it isn't 0 */
>> + chunk = free + used_slots - 1;
>> + do_div(chunk, used_slots);
>> +
>> + for (i = 0; i < NR_CPUS; i++) {

AM> all these NR_CPUS loops need to go away. Use either
AM> for_each_possible_cpu() or, preferably, for_each_online_cpu() and a hotplug
AM> notifier.

hmm, i see.

AM> Why is this code using per-cpu data at all, btw? These optimisations tend
AM> to be marginal in filesystems. What is the perfomance impact of making
AM> this data be single-superblock-wide-instance?

well, even on 2way box a single-lock reservation was in top10.

thanks, Alex

2006-12-29 02:50:42

by David Chinner

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4

On Sat, Dec 23, 2006 at 09:27:18AM +0000, Christoph Hellwig wrote:
> On Sat, Dec 23, 2006 at 02:31:23PM +1100, David Chinner wrote:
> > > - ext4-delayed-allocation.patch
> > > delayed allocation itself, enabled by "delalloc" mount option.
> > > extents support is also required. currently it works only
> > > with blocksize=pagesize.
> >
> > Ah, that's why you can get away with a page flag - you've ignored
> > the partial page delay state problem. Any plans to use the
> > existing method in the future so we will be able to use ext4 delalloc
> > on machines with a page size larger than 4k?
>
> I think fixing this up for blocksize < pagesize is an absolute requirement
> to get things merged. We don't need more filesystems that are crippled
> on half of our platforms.
>
> Note that recording delayed alloc state at a page granularity in addition
> to just the buffer heads has a lot of advantages aswell and would help
> xfs, too. But I think it makes a lot more sense to record it as a radix
> tree tag to speed up the gang lookups for delalloc conversion.

I'm not sure it will make that much difference, really. Looking up
by delalloc tag is only going to save a few tail pages in pagevec we
use for the look up and could be more expensive if delalloc pages
are sparsely distributed through the file.

We'd still have to keep the bufferheads around for partial page
state, and that becomes an interesting exercise in keeping things
coherent between the radix tree and the buffer heads.

Of course, then there's the unwritten state that XFS also carries
around per block (bufferhead) which has all the same issues as the
delalloc state. I'd hate to have a generic method for handling
delalloc state which is different from the handling of the unwritten
state and needing two different sets of code to handle what is
essentially the same thing....

Cheers,

Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group

2006-12-29 02:52:53

by David Chinner

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4

On Sat, Dec 23, 2006 at 10:09:57PM +0300, Alex Tomas wrote:
>
> Good day,
>
> >>>>> David Chinner (DC) writes:
>
> DC> So that mean's we'll have 2 separate mechanisms for marking
> DC> pages as delalloc. XFS uses the BH_delay flag to indicate
> DC> that a buffer (block) attached to the page is using delalloc.
>
> well, for blocksize=pagesize we can save 56 bytes on every page.

Sure, but it means that ext4 w/ delalloc won't work on lots of
machines....

> DC> FWIW, how does this mechanism deal with block size < page size?
> DC> Don't you have to track delalloc on a block basis rather than
> DC> a page basis?
>
> I'm still thinking how better to deal with that w/o much code duplication.

Code duplication in ext4, or across all filesystems?

> DC> Ah, that's why you can get away with a page flag - you've ignored
> DC> the partial page delay state problem. Any plans to use the
> DC> existing method in the future so we will be able to use ext4 delalloc
> DC> on machines with a page size larger than 4k?
>
> what do you mean by "exsiting"? BH_delay?

Yes.

Cheers,

Dave.
--
Dave Chinner
Principal Engineer
SGI Australian Software Group

2006-12-29 04:56:37

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] delayed allocation for ext4

>>>>> David Chinner (DC) writes:

DC> So that mean's we'll have 2 separate mechanisms for marking
DC> pages as delalloc. XFS uses the BH_delay flag to indicate
DC> that a buffer (block) attached to the page is using delalloc.
>>
>> well, for blocksize=pagesize we can save 56 bytes on every page.

DC> Sure, but it means that ext4 w/ delalloc won't work on lots of
DC> machines....

it does not currenly. but I'm going to implement that. not sure
whether it's worth to have two different codepaths for
block size=page size and block size < page size.

DC> FWIW, how does this mechanism deal with block size < page size?
DC> Don't you have to track delalloc on a block basis rather than
DC> a page basis?
>>
>> I'm still thinking how better to deal with that w/o much code duplication.

DC> Code duplication in ext4, or across all filesystems?

given what Andrew said about moving the code into VFS, it's more
about all filesystems.

thanks, Alex

2007-01-12 14:45:05

by Valerie Clement

[permalink] [raw]
Subject: Re: [RFC] ext4-delayed-allocation.patch

Index: linux-2.6.20-rc1/fs/ext4/writeback.c
===================================================================
--- linux-2.6.20-rc1.orig/fs/ext4/writeback.c 2007-01-09 18:41:27.000000000 +0100
+++ linux-2.6.20-rc1/fs/ext4/writeback.c 2007-01-11 13:25:38.000000000 +0100
@@ -274,7 +274,8 @@ static int ext4_wb_submit_extent(struct
struct inode *inode = wc->mapping->host;
int blkbits = inode->i_blkbits;
struct page *page;
- unsigned long blk, off, len, remain;
+ ext4_fsblk_t off;
+ unsigned long blk, len, remain;
unsigned long pstart, plen, prev;
struct bio *bio = NULL;
int nr_pages;
@@ -332,6 +333,7 @@ alloc_new_bio:
nr_pages = (ex->ee_len - (blk - ex->ee_block));
nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
off = ex->ee_start + (blk - ex->ee_block);
+ off |= (ext4_fsblk_t) ex->ee_start_hi << 32;
bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
if (bio == NULL)
return -ENOMEM;
@@ -377,7 +379,9 @@ ext4_wb_find_goal(struct inode *inode, s

/* try to predict block placement */
if ((ex = path[depth].p_ext))
- return ex->ee_start + (block - ex->ee_block);
+ return ((ex->ee_start
+ | ((ext4_fsblk_t) ex->ee_start_hi << 32))
+ + (block - ex->ee_block);

/* it looks index is empty
* try to find starting from index itself */
@@ -416,7 +420,8 @@ static int ext4_wb_handle_extent(struct
(unsigned) ec->ec_block,
(unsigned) ec->ec_len,
(unsigned) ec->ec_start);
- nex.ee_start = ec->ec_start;
+ nex.ee_start = ec->ec_start & 0xffffffff;
+ nex.ee_start_hi = (ec->ec_start >> 32) & 0xffff;
nex.ee_block = ec->ec_block;
nex.ee_len = ec->ec_len;
err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
@@ -488,8 +493,8 @@ static int ext4_wb_handle_extent(struct
pblock, count, inode->i_ino, ec->ec_len);

/* insert new extent */
- nex.ee_start = pblock;
- nex.ee_start_hi = 0;
+ nex.ee_start = pblock & 0xffffffff;
+ nex.ee_start_hi = (pblock >> 32) & 0xffff;
nex.ee_len = count;
nex.ee_block = ec->ec_block;
err = ext4_ext_insert_extent(handle, inode, path, &nex);
@@ -520,7 +525,9 @@ static int ext4_wb_handle_extent(struct
/* block have been allocated for data, so time to drop dirty
* in correspondend buffer_heads to prevent corruptions */
for (i = 0; i < nex.ee_len; i++)
- unmap_underlying_metadata(sb->s_bdev, nex.ee_start + i);
+ unmap_underlying_metadata(sb->s_bdev,
+ ((ext4_fsblk_t) nex.ee_start_hi << 32)
+ + nex.ee_start + i);

/* correct on-disk inode size */
if (nex.ee_len > 0) {


Attachments:
ext4-delalloc-extents-48bit.patch (2.47 kB)

2007-01-12 14:52:34

by Alex Tomas

[permalink] [raw]
Subject: Re: [RFC] ext4-delayed-allocation.patch


ah, you're right. thanks!

thanks, Alex

>>>>> Valerie Clement (VC) writes:

VC> Hi Alex,
VC> I tested your patch on my system with a 20TB device, but some tests
VC> failed. Looking at the code, I saw that the support of 48-bit block
VC> number in extents is lacking.

VC> I made some changes in the code (see the patch in attachment) and now
VC> all my tests are OK.
VC> The patch is not complete, I didn't update calls to wb_debug() which
VC> dump "ee_start".

VC> Hope this helps.

VC> Val?rie

VC> Index: linux-2.6.20-rc1/fs/ext4/writeback.c
VC> ===================================================================
VC> --- linux-2.6.20-rc1.orig/fs/ext4/writeback.c 2007-01-09 18:41:27.000000000 +0100
VC> +++ linux-2.6.20-rc1/fs/ext4/writeback.c 2007-01-11 13:25:38.000000000 +0100
VC> @@ -274,7 +274,8 @@ static int ext4_wb_submit_extent(struct
VC> struct inode *inode = wc->mapping->host;
VC> int blkbits = inode->i_blkbits;
VC> struct page *page;
VC> - unsigned long blk, off, len, remain;
VC> + ext4_fsblk_t off;
VC> + unsigned long blk, len, remain;
VC> unsigned long pstart, plen, prev;
VC> struct bio *bio = NULL;
VC> int nr_pages;
VC> @@ -332,6 +333,7 @@ alloc_new_bio:
VC> nr_pages = (ex->ee_len - (blk - ex->ee_block));
VC> nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
VC> off = ex->ee_start + (blk - ex->ee_block);
VC> + off |= (ext4_fsblk_t) ex->ee_start_hi << 32;
VC> bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
VC> if (bio == NULL)
VC> return -ENOMEM;
VC> @@ -377,7 +379,9 @@ ext4_wb_find_goal(struct inode *inode, s

VC> /* try to predict block placement */
VC> if ((ex = path[depth].p_ext))
VC> - return ex->ee_start + (block - ex->ee_block);
VC> + return ((ex->ee_start
VC> + | ((ext4_fsblk_t) ex->ee_start_hi << 32))
VC> + + (block - ex->ee_block);

VC> /* it looks index is empty
VC> * try to find starting from index itself */
VC> @@ -416,7 +420,8 @@ static int ext4_wb_handle_extent(struct
VC> (unsigned) ec->ec_block,
VC> (unsigned) ec->ec_len,
VC> (unsigned) ec->ec_start);
VC> - nex.ee_start = ec->ec_start;
VC> + nex.ee_start = ec->ec_start & 0xffffffff;
VC> + nex.ee_start_hi = (ec->ec_start >> 32) & 0xffff;
VC> nex.ee_block = ec->ec_block;
VC> nex.ee_len = ec->ec_len;
VC> err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
VC> @@ -488,8 +493,8 @@ static int ext4_wb_handle_extent(struct
VC> pblock, count, inode->i_ino, ec->ec_len);

VC> /* insert new extent */
VC> - nex.ee_start = pblock;
VC> - nex.ee_start_hi = 0;
VC> + nex.ee_start = pblock & 0xffffffff;
VC> + nex.ee_start_hi = (pblock >> 32) & 0xffff;
VC> nex.ee_len = count;
VC> nex.ee_block = ec->ec_block;
VC> err = ext4_ext_insert_extent(handle, inode, path, &nex);
VC> @@ -520,7 +525,9 @@ static int ext4_wb_handle_extent(struct
VC> /* block have been allocated for data, so time to drop dirty
VC> * in correspondend buffer_heads to prevent corruptions */
VC> for (i = 0; i < nex.ee_len; i++)
VC> - unmap_underlying_metadata(sb->s_bdev, nex.ee_start + i);
VC> + unmap_underlying_metadata(sb->s_bdev,
VC> + ((ext4_fsblk_t) nex.ee_start_hi << 32)
VC> + + nex.ee_start + i);

VC> /* correct on-disk inode size */
VC> if (nex.ee_len > 0) {