From: Jens Axboe <jens.axboe@oracle.com>
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Cc: chris.mason@oracle.com, david@fromorbit.com, hch@infradead.org,
	akpm@linux-foundation.org, jack@suse.cz, yanmin_zhang@linux.intel.com,
	richard@rsk.demon.co.uk, damien.wyart@free.fr, fweisbec@gmail.com,
	Alan.Brunelle@hp.com, Jens Axboe <jens.axboe@oracle.com>
Subject: [PATCH 04/10] writeback: separate the flushing state/task from the bdi
Date: Thu, 25 Jun 2009 12:41:57 +0200
Message-Id: <1245926523-21959-5-git-send-email-jens.axboe@oracle.com>
In-Reply-To: <1245926523-21959-1-git-send-email-jens.axboe@oracle.com>
References: <1245926523-21959-1-git-send-email-jens.axboe@oracle.com>

Add a struct bdi_writeback for tracking and handling dirty IO. This
is in preparation for adding > 1 flusher task per bdi.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c           |  147 ++++++++++++++++++++++++++----------------
 include/linux/backing-dev.h |   38 +++++++-----
 mm/backing-dev.c            |  120 ++++++++++++++++++++++++---------
 3 files changed, 204 insertions(+), 101 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7b49d90..86fb2a9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -29,6 +29,10 @@
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+				   struct super_block *sb,
+				   struct writeback_control *wbc);
+
 /*
  * We don't actually have pdflush, but this one is exported though /proc...
  */
@@ -46,9 +50,11 @@ int nr_pdflush_threads;
  * unless they implement their own. Which is somewhat inefficient, as this
  * may prevent concurrent writeback against multiple devices.
  */
-static int writeback_acquire(struct backing_dev_info *bdi)
+static int writeback_acquire(struct bdi_writeback *wb)
 {
-	return !test_and_set_bit(BDI_pdflush, &bdi->state);
+	struct backing_dev_info *bdi = wb->bdi;
+
+	return !test_and_set_bit(wb->nr, &bdi->wb_active);
 }
 
 /**
@@ -59,19 +65,37 @@ static int writeback_acquire(struct backing_dev_info *bdi)
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_pdflush, &bdi->state);
+	return bdi->wb_active != 0;
 }
 
 /**
  * writeback_release - relinquish exclusive writeback access against a device.
  * @bdi: the device's backing_dev_info structure
  */
-static void writeback_release(struct backing_dev_info *bdi)
+static void writeback_release(struct bdi_writeback *wb)
+{
+	struct backing_dev_info *bdi = wb->bdi;
+
+	wb->nr_pages = 0;
+	wb->sb = NULL;
+	clear_bit(wb->nr, &bdi->wb_active);
+}
+
+static void wb_start_writeback(struct bdi_writeback *wb, struct super_block *sb,
+			       long nr_pages,
+			       enum writeback_sync_modes sync_mode)
 {
-	WARN_ON_ONCE(!writeback_in_progress(bdi));
-	bdi->wb_arg.nr_pages = 0;
-	bdi->wb_arg.sb = NULL;
-	clear_bit(BDI_pdflush, &bdi->state);
+	if (!wb_has_dirty_io(wb))
+		return;
+
+	if (writeback_acquire(wb)) {
+		wb->nr_pages = nr_pages;
+		wb->sb = sb;
+		wb->sync_mode = sync_mode;
+
+		if (wb->task)
+			wake_up_process(wb->task);
+	}
 }
 
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
@@ -81,17 +105,10 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 	 * This only happens the first time someone kicks this bdi, so put
 	 * it out-of-line.
 	 */
-	if (unlikely(!bdi->task))
-		wake_up_process(default_backing_dev_info.task);
+	if (unlikely(!bdi->wb.task))
+		wake_up_process(default_backing_dev_info.wb.task);
 
-	if (writeback_acquire(bdi)) {
-		bdi->wb_arg.nr_pages = nr_pages;
-		bdi->wb_arg.sb = sb;
-		bdi->wb_arg.sync_mode = sync_mode;
-
-		if (bdi->task)
-			wake_up_process(bdi->task);
-	}
+	wb_start_writeback(&bdi->wb, sb, nr_pages, sync_mode);
 }
 
 /*
@@ -128,17 +145,17 @@ static inline bool over_bground_thresh(void)
 * older_than_this takes precedence over nr_to_write. So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
-static void bdi_flush(struct backing_dev_info *bdi, int for_kupdate)
+static void wb_writeback(struct bdi_writeback *wb, int for_kupdate)
 {
	struct writeback_control wbc = {
-		.bdi			= bdi,
-		.sync_mode		= bdi->wb_arg.sync_mode,
+		.bdi			= wb->bdi,
+		.sync_mode		= wb->sync_mode,
		.older_than_this	= NULL,
		.for_kupdate		= for_kupdate,
		.range_cyclic		= 1,
	};
	unsigned long oldest_jif;
-	long nr_pages = bdi->wb_arg.nr_pages;
+	long nr_pages = wb->nr_pages;
 
	if (wbc.for_kupdate) {
		wbc.older_than_this = &oldest_jif;
@@ -155,7 +172,7 @@ static void bdi_flush(struct backing_dev_info *bdi, int for_kupdate)
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
-		generic_sync_bdi_inodes(bdi->wb_arg.sb, &wbc);
+		generic_sync_wb_inodes(wb, wb->sb, &wbc);
		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		/*
		 * If we ran out of stuff to write, bail unless more_io got set
@@ -172,17 +189,12 @@ static void bdi_flush(struct backing_dev_info *bdi, int for_kupdate)
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
-int bdi_writeback_task(struct backing_dev_info *bdi)
+int bdi_writeback_task(struct bdi_writeback *wb)
 {
	while (!kthread_should_stop()) {
		unsigned long wait_jiffies;
		int for_kupdate;
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(wait_jiffies);
-		try_to_freeze();
-
		/*
		 * We get here in two cases:
		 *
@@ -197,7 +209,7 @@ int bdi_writeback_task(struct backing_dev_info *bdi)
		 *  pdflush style writeout.
		 *
		 */
-		for_kupdate = writeback_acquire(bdi);
+		for_kupdate = writeback_acquire(wb);
		if (for_kupdate) {
			long nr;
 
@@ -205,13 +217,18 @@ int bdi_writeback_task(struct backing_dev_info *bdi)
				global_page_state(NR_UNSTABLE_NFS) +
				(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-			bdi->wb_arg.nr_pages = nr;
-			bdi->wb_arg.sb = NULL;
-			bdi->wb_arg.sync_mode = WB_SYNC_NONE;
+			wb->nr_pages = nr;
+			wb->sb = NULL;
+			wb->sync_mode = WB_SYNC_NONE;
		}
 
-		bdi_flush(bdi, for_kupdate);
-		writeback_release(bdi);
+		wb_writeback(wb, for_kupdate);
+		writeback_release(wb);
+
+		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(wait_jiffies);
+		try_to_freeze();
	}
 
	return 0;
@@ -254,6 +271,14 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
	}
 }
 
+/*
+ * We have only a single wb per bdi, so just return that.
+ */
+static inline struct bdi_writeback *inode_get_wb(struct inode *inode)
+{
+	return &inode_to_bdi(inode)->wb;
+}
+
 /**
 * __mark_inode_dirty - internal function
 *	@inode: inode to mark
@@ -337,9 +362,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
+			struct bdi_writeback *wb = inode_get_wb(inode);
+
			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list,
-					&inode_to_bdi(inode)->b_dirty);
+			list_move(&inode->i_list, &wb->b_dirty);
		}
	}
 out:
@@ -366,16 +392,16 @@ static int write_inode(struct inode *inode, int sync)
 */
 static void redirty_tail(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = inode_get_wb(inode);
 
-	if (!list_empty(&bdi->b_dirty)) {
+	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;
 
-		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
+		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
-	list_move(&inode->i_list, &bdi->b_dirty);
+	list_move(&inode->i_list, &wb->b_dirty);
 }
 
 /*
@@ -383,7 +409,9 @@ static void redirty_tail(struct inode *inode)
 */
 static void requeue_io(struct inode *inode)
 {
-	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
+	struct bdi_writeback *wb = inode_get_wb(inode);
+
+	list_move(&inode->i_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -430,11 +458,10 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
 * Queue all expired dirty inodes for io, eldest first.
 */
-static void queue_io(struct backing_dev_info *bdi,
-		     unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
-	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
+	list_splice_init(&wb->b_more_io, wb->b_io.prev);
+	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
 /*
@@ -593,20 +620,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
	return ret;
 }
 
-void generic_sync_bdi_inodes(struct super_block *sb,
-			     struct writeback_control *wbc)
+static void generic_sync_wb_inodes(struct bdi_writeback *wb,
+				   struct super_block *sb,
+				   struct writeback_control *wbc)
 {
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
-	struct backing_dev_info *bdi = wbc->bdi;
	const unsigned long start = jiffies;	/* livelock avoidance */
 
	spin_lock(&inode_lock);
 
-	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
-		queue_io(bdi, wbc->older_than_this);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
 
-	while (!list_empty(&bdi->b_io)) {
-		struct inode *inode = list_entry(bdi->b_io.prev,
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
						struct inode, i_list);
		long pages_skipped;
 
@@ -618,7 +645,7 @@ void generic_sync_bdi_inodes(struct super_block *sb,
			continue;
		}
 
-		if (!bdi_cap_writeback_dirty(bdi)) {
+		if (!bdi_cap_writeback_dirty(wb->bdi)) {
			redirty_tail(inode);
			if (is_blkdev_sb) {
				/*
@@ -640,7 +667,7 @@ void generic_sync_bdi_inodes(struct super_block *sb,
			continue;
		}
 
-		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
			wbc->encountered_congestion = 1;
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
@@ -674,7 +701,7 @@ void generic_sync_bdi_inodes(struct super_block *sb,
			wbc->more_io = 1;
			break;
		}
-		if (!list_empty(&bdi->b_more_io))
+		if (!list_empty(&wb->b_more_io))
			wbc->more_io = 1;
	}
 
@@ -682,6 +709,14 @@ void generic_sync_bdi_inodes(struct super_block *sb,
	/* Leave any unwritten inodes on b_io */
 }
 
+void generic_sync_bdi_inodes(struct super_block *sb,
+			     struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = wbc->bdi;
+
+	generic_sync_wb_inodes(&bdi->wb, sb, wbc);
+}
+
 /*
 * Write out a superblock's list of dirty inodes. A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fb3b870..456154b 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -24,8 +24,8 @@ struct dentry;
 * Bits in backing_dev_info.state
 */
 enum bdi_state {
-	BDI_pdflush,		/* A pdflush thread is working this device */
	BDI_pending,		/* On its way to being activated */
+	BDI_wb_alloc,		/* Default embedded wb allocated */
	BDI_async_congested,	/* The async (write) queue is getting full */
	BDI_sync_congested,	/* The sync queue is getting full */
	BDI_unused,		/* Available bits start here */
@@ -41,15 +41,22 @@ enum bdi_stat_item {
 
 #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
-struct bdi_writeback_arg {
-	unsigned long nr_pages;
-	struct super_block *sb;
+struct bdi_writeback {
+	struct backing_dev_info *bdi;	/* our parent bdi */
+	unsigned int nr;
+
+	struct task_struct *task;	/* writeback task */
+	struct list_head b_dirty;	/* dirty inodes */
+	struct list_head b_io;		/* parked for writeback */
+	struct list_head b_more_io;	/* parked for more writeback */
+
+	unsigned long nr_pages;
+	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
 };
 
 struct backing_dev_info {
	struct list_head bdi_list;
-
	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
	unsigned long state;	/* Always use atomic bitops on this */
	unsigned int capabilities; /* Device capabilities */
@@ -66,13 +73,11 @@ struct backing_dev_info {
	unsigned int min_ratio;
	unsigned int max_ratio, max_prop_frac;
 
-	struct device *dev;
+	struct bdi_writeback wb;	/* default writeback info for this bdi */
+	unsigned long wb_active;	/* bitmap of active tasks */
+	unsigned long wb_mask;		/* number of registered tasks */
 
-	struct task_struct	*task;		/* writeback task */
-	struct bdi_writeback_arg wb_arg;	/* protected by BDI_pdflush */
-	struct list_head	b_dirty;	/* dirty inodes */
-	struct list_head	b_io;		/* parked for writeback */
-	struct list_head	b_more_io;	/* parked for more writeback */
+	struct device *dev;
 
 #ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
@@ -89,17 +94,18 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
			 long nr_pages, enum writeback_sync_modes sync_mode);
-int bdi_writeback_task(struct backing_dev_info *bdi);
+int bdi_writeback_task(struct bdi_writeback *wb);
 void bdi_writeback_all(struct super_block *sb, struct writeback_control *wbc);
+int bdi_has_dirty_io(struct backing_dev_info *bdi);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
-static inline int bdi_has_dirty_io(struct backing_dev_info *bdi)
+static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&bdi->b_dirty) ||
-	       !list_empty(&bdi->b_io) ||
-	       !list_empty(&bdi->b_more_io);
+	return !list_empty(&wb->b_dirty) ||
+	       !list_empty(&wb->b_io) ||
+	       !list_empty(&wb->b_more_io);
 }
 
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9e17bae..5bd4984 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -215,10 +215,45 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+	memset(wb, 0, sizeof(*wb));
+
+	wb->bdi = bdi;
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static int wb_assign_nr(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+	set_bit(0, &bdi->wb_mask);
+	wb->nr = 0;
+	return 0;
+}
+
+static void bdi_put_wb(struct backing_dev_info *bdi, struct bdi_writeback *wb)
+{
+	clear_bit(wb->nr, &bdi->wb_mask);
+	clear_bit(BDI_wb_alloc, &bdi->state);
+}
+
+static struct bdi_writeback *bdi_new_wb(struct backing_dev_info *bdi)
+{
+	struct bdi_writeback *wb;
+
+	set_bit(BDI_wb_alloc, &bdi->state);
+	wb = &bdi->wb;
+	wb_assign_nr(bdi, wb);
+	return wb;
+}
+
 static int bdi_start_fn(void *ptr)
 {
-	struct backing_dev_info *bdi = ptr;
+	struct bdi_writeback *wb = ptr;
+	struct backing_dev_info *bdi = wb->bdi;
	struct task_struct *tsk = current;
+	int ret;
 
	/*
	 * Add us to the active bdi_list
@@ -242,7 +277,16 @@ static int bdi_start_fn(void *ptr)
	smp_mb__after_clear_bit();
	wake_up_bit(&bdi->state, BDI_pending);
 
-	return bdi_writeback_task(bdi);
+	ret = bdi_writeback_task(wb);
+
+	wb->task = NULL;
+	bdi_put_wb(bdi, wb);
+	return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	return wb_has_dirty_io(&bdi->wb);
 }
 
 static void bdi_flush_io(struct backing_dev_info *bdi)
@@ -297,17 +341,18 @@ static void sync_supers_timer_fn(unsigned long unused)
 
 static int bdi_forker_task(void *ptr)
 {
-	struct backing_dev_info *me = ptr;
+	struct bdi_writeback *me = ptr;
 
	for (;;) {
		struct backing_dev_info *bdi, *tmp;
+		struct bdi_writeback *wb;
 
		/*
		 * Temporary measure, we want to make sure we don't see
		 * dirty data on the default backing_dev_info
		 */
-		if (bdi_has_dirty_io(me))
-			bdi_flush_io(me);
+		if (wb_has_dirty_io(me))
+			bdi_flush_io(me->bdi);
 
		spin_lock(&bdi_lock);
 
@@ -316,7 +361,7 @@ static int bdi_forker_task(void *ptr)
		 * a thread registered. If so, set that up.
		 */
		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
-			if (bdi->task || !bdi_has_dirty_io(bdi))
+			if (bdi->wb.task || !bdi_has_dirty_io(bdi))
				continue;
 
			bdi_add_default_flusher_task(bdi);
@@ -345,9 +390,11 @@ static int bdi_forker_task(void *ptr)
		list_del_init(&bdi->bdi_list);
		spin_unlock(&bdi_lock);
 
-		BUG_ON(bdi->task);
+		wb = bdi_new_wb(bdi);
+		if (!wb)
+			goto readd_flush;
 
-		bdi->task = kthread_run(bdi_start_fn, bdi, "flush-%s",
+		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
					dev_name(bdi->dev));
		/*
		 * If task creation fails, then readd the bdi to
@@ -355,9 +402,10 @@ static int bdi_forker_task(void *ptr)
		 * from this forker thread. That will free some memory
		 * and we can try again.
		 */
-		if (IS_ERR(bdi->task)) {
-			bdi->task = NULL;
-
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+			bdi_put_wb(bdi, wb);
+readd_flush:
			/*
			 * Add this 'bdi' to the back, so we get
			 * a chance to flush other bdi's to free
@@ -380,6 +428,9 @@ static int bdi_forker_task(void *ptr)
 */
 static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
 {
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
	/*
	 * Someone already marked this pending for task creation
	 */
@@ -390,7 +441,7 @@ static void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
	list_move_tail(&bdi->bdi_list, &bdi_pending_list);
	spin_unlock(&bdi_lock);
 
-	wake_up_process(default_backing_dev_info.task);
+	wake_up_process(default_backing_dev_info.wb.task);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -423,14 +474,24 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
	 * on-demand when they need it.
	 */
	if (bdi_cap_flush_forker(bdi)) {
-		bdi->task = kthread_run(bdi_forker_task, bdi, "bdi-%s",
+		struct bdi_writeback *wb;
+
+		wb = bdi_new_wb(bdi);
+		if (!wb) {
+			ret = -ENOMEM;
+			goto remove_err;
+		}
+
+		wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
					dev_name(dev));
-		if (IS_ERR(bdi->task)) {
-			bdi->task = NULL;
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+			bdi_put_wb(bdi, wb);
+			ret = -ENOMEM;
+remove_err:
			spin_lock(&bdi_lock);
			list_del(&bdi->bdi_list);
			spin_unlock(&bdi_lock);
-			ret = -ENOMEM;
			goto exit;
		}
	}
@@ -454,10 +515,13 @@ static int sched_wait(void *word)
 }
 
 /*
- * Remove bdi from the global list
+ * Remove bdi from the global list and shutdown any threads we have running
 */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
	/*
	 * If setup is pending, wait for that to complete first
	 */
@@ -469,18 +533,18 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
	spin_lock(&bdi_lock);
	list_del(&bdi->bdi_list);
	spin_unlock(&bdi_lock);
+
+	/*
+	 * Finally, kill the kernel thread
+	 */
+	kthread_stop(bdi->wb.task);
 }
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
	if (bdi->dev) {
-		if (!bdi_cap_flush_forker(bdi)) {
+		if (!bdi_cap_flush_forker(bdi))
			bdi_wb_shutdown(bdi);
-			if (bdi->task) {
-				kthread_stop(bdi->task);
-				bdi->task = NULL;
-			}
-		}
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
@@ -498,9 +562,9 @@ int bdi_init(struct backing_dev_info *bdi)
	bdi->max_ratio = 100;
	bdi->max_prop_frac = PROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
-	INIT_LIST_HEAD(&bdi->b_io);
-	INIT_LIST_HEAD(&bdi->b_dirty);
-	INIT_LIST_HEAD(&bdi->b_more_io);
+	bdi->wb_mask = bdi->wb_active = 0;
+
+	bdi_wb_init(&bdi->wb, bdi);
 
	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -525,9 +589,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
	int i;
 
-	WARN_ON(!list_empty(&bdi->b_dirty));
-	WARN_ON(!list_empty(&bdi->b_io));
-	WARN_ON(!list_empty(&bdi->b_more_io));
+	WARN_ON(bdi_has_dirty_io(bdi));
 
	bdi_unregister(bdi);
 
-- 
1.6.3.rc0.1.gf800
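
A note for reviewers on the locking change above: the single BDI_pdflush bit
is replaced by a per-writeback bit (wb->nr) in bdi->wb_active, which is what
allows several flusher tasks to coexist on one bdi later in the series, and
writeback_in_progress() simply becomes "bdi->wb_active != 0". Below is a
minimal standalone sketch of just that acquire/release protocol. It is
illustrative only, not part of the patch: the struct and function names are
hypothetical stand-ins, and C11 atomics stand in for the kernel's
test_and_set_bit()/clear_bit().

#include <stdatomic.h>
#include <stdio.h>

struct wb_sketch {
	unsigned int nr;		/* bit this writeback instance owns */
	atomic_ulong *wb_active;	/* parent bdi's bitmap of active tasks */
};

/* Like !test_and_set_bit(wb->nr, &bdi->wb_active): 1 if we got it, 0 if busy */
static int writeback_acquire_sketch(struct wb_sketch *wb)
{
	unsigned long bit = 1UL << wb->nr;

	return !(atomic_fetch_or(wb->wb_active, bit) & bit);
}

/* Like clear_bit(wb->nr, &bdi->wb_active) */
static void writeback_release_sketch(struct wb_sketch *wb)
{
	atomic_fetch_and(wb->wb_active, ~(1UL << wb->nr));
}

int main(void)
{
	atomic_ulong wb_active = 0;
	struct wb_sketch wb = { .nr = 0, .wb_active = &wb_active };

	printf("first acquire:  %d\n", writeback_acquire_sketch(&wb)); /* 1 */
	printf("second acquire: %d\n", writeback_acquire_sketch(&wb)); /* 0, busy */
	writeback_release_sketch(&wb);
	printf("after release:  %d\n", writeback_acquire_sketch(&wb)); /* 1 again */
	return 0;
}

Because each task only ever sets and clears its own bit, acquire/release needs
no lock, and "any bit set" is a cheap whole-bdi busy test.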