Date: Tue, 24 Mar 2009 17:17:24 +0100
From: Jan Kara
To: Jens Axboe
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
    chris.mason@oracle.com, david@fromorbit.com, npiggin@suse.de
Subject: Re: [PATCH 1/7] writeback: move dirty inodes from super_block to backing_dev_info
Message-ID: <20090324161724.GA15524@atrey.karlin.mff.cuni.cz>
References: <1236868428-20408-1-git-send-email-jens.axboe@oracle.com>
    <1236868428-20408-2-git-send-email-jens.axboe@oracle.com>
In-Reply-To: <1236868428-20408-2-git-send-email-jens.axboe@oracle.com>
User-Agent: Mutt/1.5.13 (2006-08-11)

> This is a first step at introducing per-bdi flusher threads. We should
> have no change in behaviour, although sb_has_dirty_inodes() is now
> ridiculously expensive, as there's no easy way to answer that question.
> Not a huge problem, since it'll be deleted in subsequent patches.

Could you maybe expand the changelog a bit? If I read the patch right,
the only thing it does is move the dirty inode lists from per-superblock
to per-bdi, right? Also, sync_sb_inodes() now writes all the inodes in
the system, not just the ones belonging to that superblock, doesn't it?

								Honza

> 
> Signed-off-by: Jens Axboe
> ---
>  fs/fs-writeback.c           |  186 +++++++++++++++++++++++++++----------------
>  fs/super.c                  |    3 -
>  include/linux/backing-dev.h |    9 ++
>  include/linux/fs.h          |    5 +-
>  mm/backing-dev.c            |   31 +++++++-
>  mm/page-writeback.c         |    1 -
>  6 files changed, 156 insertions(+), 79 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index e5eaa62..c107cff 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -25,6 +25,7 @@
>  #include
>  #include "internal.h"
>  
> +#define inode_to_bdi(inode)	(inode)->i_mapping->backing_dev_info
>  
>  /**
>   * writeback_acquire - attempt to get exclusive writeback access to a device
> @@ -158,12 +159,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>  			goto out;
>  
>  		/*
> -		 * If the inode was already on s_dirty/s_io/s_more_io, don't
> -		 * reposition it (that would break s_dirty time-ordering).
> +		 * If the inode was already on b_dirty/b_io/b_more_io, don't
> +		 * reposition it (that would break b_dirty time-ordering).
>  		 */
>  		if (!was_dirty) {
>  			inode->dirtied_when = jiffies;
> -			list_move(&inode->i_list, &sb->s_dirty);
> +			list_move(&inode->i_list,
> +					&inode_to_bdi(inode)->b_dirty);
>  		}
>  	}
>  out:
> @@ -184,31 +186,30 @@ static int write_inode(struct inode *inode, int sync)
>   * furthest end of its superblock's dirty-inode list.
>   *
>   * Before stamping the inode's ->dirtied_when, we check to see whether it is
> - * already the most-recently-dirtied inode on the s_dirty list.  If that is
> + * already the most-recently-dirtied inode on the b_dirty list.  If that is
>   * the case then the inode must have been redirtied while it was being written
>   * out and we don't reset its dirtied_when.
>   */
>  static void redirty_tail(struct inode *inode)
>  {
> -	struct super_block *sb = inode->i_sb;
> +	struct backing_dev_info *bdi = inode_to_bdi(inode);
>  
> -	if (!list_empty(&sb->s_dirty)) {
> -		struct inode *tail_inode;
> +	if (!list_empty(&bdi->b_dirty)) {
> +		struct inode *tail;
>  
> -		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
> -		if (!time_after_eq(inode->dirtied_when,
> -				   tail_inode->dirtied_when))
> +		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
> +		if (!time_after_eq(inode->dirtied_when, tail->dirtied_when))
>  			inode->dirtied_when = jiffies;
>  	}
> -	list_move(&inode->i_list, &sb->s_dirty);
> +	list_move(&inode->i_list, &bdi->b_dirty);
>  }
>  
>  /*
> - * requeue inode for re-scanning after sb->s_io list is exhausted.
> + * requeue inode for re-scanning after bdi->b_io list is exhausted.
>   */
>  static void requeue_io(struct inode *inode)
>  {
> -	list_move(&inode->i_list, &inode->i_sb->s_more_io);
> +	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
>  }
>  
>  static void inode_sync_complete(struct inode *inode)
> @@ -240,18 +241,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
>  /*
>   * Queue all expired dirty inodes for io, eldest first.
>   */
> -static void queue_io(struct super_block *sb,
> -		     unsigned long *older_than_this)
> +static void queue_io(struct backing_dev_info *bdi,
> +		     unsigned long *older_than_this)
>  {
> -	list_splice_init(&sb->s_more_io, sb->s_io.prev);
> -	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
> +	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
> +	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
> +}
> +
> +static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
> +{
> +	struct inode *inode;
> +	int ret = 0;
> +
> +	spin_lock(&inode_lock);
> +	list_for_each_entry(inode, list, i_list) {
> +		if (inode->i_sb == sb) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	spin_unlock(&inode_lock);
> +	return ret;
>  }
>  
>  int sb_has_dirty_inodes(struct super_block *sb)
>  {
> -	return !list_empty(&sb->s_dirty) ||
> -	       !list_empty(&sb->s_io) ||
> -	       !list_empty(&sb->s_more_io);
> +	struct backing_dev_info *bdi;
> +	int ret = 0;
> +
> +	/*
> +	 * This is REALLY expensive right now, but it'll go away
> +	 * when the bdi writeback is introduced
> +	 */
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
> +		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
> +		    sb_on_inode_list(sb, &bdi->b_io) ||
> +		    sb_on_inode_list(sb, &bdi->b_more_io)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL(sb_has_dirty_inodes);
>  
> @@ -305,11 +338,11 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
>  		/*
>  		 * We didn't write back all the pages.  nfs_writepages()
>  		 * sometimes bales out without doing anything.  Redirty
> -		 * the inode; Move it from s_io onto s_more_io/s_dirty.
> +		 * the inode; Move it from b_io onto b_more_io/b_dirty.
>  		 */
>  		/*
>  		 * akpm: if the caller was the kupdate function we put
> -		 * this inode at the head of s_dirty so it gets first
> +		 * this inode at the head of b_dirty so it gets first
>  		 * consideration.  Otherwise, move it to the tail, for
>  		 * the reasons described there.  I'm not really sure
>  		 * how much sense this makes.  Presumably I had a good
> @@ -319,7 +352,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
>  		if (wbc->for_kupdate) {
>  			/*
>  			 * For the kupdate function we move the inode
> -			 * to s_more_io so it will get more writeout as
> +			 * to b_more_io so it will get more writeout as
>  			 * soon as the queue becomes uncongested.
>  			 */
>  			inode->i_state |= I_DIRTY_PAGES;
> @@ -385,10 +418,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>  	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
>  		/*
>  		 * We're skipping this inode because it's locked, and we're not
> -		 * doing writeback-for-data-integrity.  Move it to s_more_io so
> -		 * that writeback can proceed with the other inodes on s_io.
> +		 * doing writeback-for-data-integrity.  Move it to b_more_io so
> +		 * that writeback can proceed with the other inodes on b_io.
>  		 * We'll have another go at writing back this inode when we
> -		 * completed a full scan of s_io.
> +		 * completed a full scan of b_io.
>  		 */
>  		requeue_io(inode);
>  		return 0;
> @@ -411,51 +444,24 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
>  	return __sync_single_inode(inode, wbc);
>  }
>  
> -/*
> - * Write out a superblock's list of dirty inodes.  A wait will be performed
> - * upon no inodes, all inodes or the final one, depending upon sync_mode.
> - *
> - * If older_than_this is non-NULL, then only write out inodes which
> - * had their first dirtying at a time earlier than *older_than_this.
> - *
> - * If we're a pdflush thread, then implement pdflush collision avoidance
> - * against the entire list.
> - *
> - * If `bdi' is non-zero then we're being asked to writeback a specific queue.
> - * This function assumes that the blockdev superblock's inodes are backed by
> - * a variety of queues, so all inodes are searched.  For other superblocks,
> - * assume that all inodes are backed by the same queue.
> - *
> - * FIXME: this linear search could get expensive with many filesystems.  But
> - * how to fix?  We need to go from an address_space to all inodes which share
> - * a queue with that address_space.  (Easy: have a global "dirty superblocks"
> - * list).
> - *
> - * The inodes to be written are parked on sb->s_io.  They are moved back onto
> - * sb->s_dirty as they are selected for writing.  This way, none can be missed
> - * on the writer throttling path, and we get decent balancing between many
> - * throttled threads: we don't want them all piling up on inode_sync_wait.
> - */
> -void generic_sync_sb_inodes(struct super_block *sb,
> -				struct writeback_control *wbc)
> +static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
> +				    struct writeback_control *wbc,
> +				    int is_blkdev)
>  {
>  	const unsigned long start = jiffies;	/* livelock avoidance */
> -	int sync = wbc->sync_mode == WB_SYNC_ALL;
>  
>  	spin_lock(&inode_lock);
> -	if (!wbc->for_kupdate || list_empty(&sb->s_io))
> -		queue_io(sb, wbc->older_than_this);
> +	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
> +		queue_io(bdi, wbc->older_than_this);
>  
> -	while (!list_empty(&sb->s_io)) {
> -		struct inode *inode = list_entry(sb->s_io.prev,
> +	while (!list_empty(&bdi->b_io)) {
> +		struct inode *inode = list_entry(bdi->b_io.prev,
>  						struct inode, i_list);
> -		struct address_space *mapping = inode->i_mapping;
> -		struct backing_dev_info *bdi = mapping->backing_dev_info;
>  		long pages_skipped;
>  
>  		if (!bdi_cap_writeback_dirty(bdi)) {
>  			redirty_tail(inode);
> -			if (sb_is_blkdev_sb(sb)) {
> +			if (is_blkdev) {
>  				/*
>  				 * Dirty memory-backed blockdev: the ramdisk
>  				 * driver does this.  Skip just this inode
> @@ -472,14 +478,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
>  
>  		if (wbc->nonblocking && bdi_write_congested(bdi)) {
>  			wbc->encountered_congestion = 1;
> -			if (!sb_is_blkdev_sb(sb))
> +			if (!is_blkdev)
>  				break;		/* Skip a congested fs */
>  			requeue_io(inode);
>  			continue;	/* Skip a congested blockdev */
>  		}
>  
>  		if (wbc->bdi && bdi != wbc->bdi) {
> -			if (!sb_is_blkdev_sb(sb))
> +			if (!is_blkdev)
>  				break;		/* fs has the wrong queue */
>  			requeue_io(inode);
>  			continue;	/* blockdev has wrong queue */
> @@ -514,13 +520,55 @@ void generic_sync_sb_inodes(struct super_block *sb,
>  			wbc->more_io = 1;
>  			break;
>  		}
> -		if (!list_empty(&sb->s_more_io))
> +		if (!list_empty(&bdi->b_more_io))
>  			wbc->more_io = 1;
>  	}
>  
> -	if (sync) {
> +	spin_unlock(&inode_lock);
> +	/* Leave any unwritten inodes on b_io */
> +}
> +
> +/*
> + * Write out a superblock's list of dirty inodes.  A wait will be performed
> + * upon no inodes, all inodes or the final one, depending upon sync_mode.
> + *
> + * If older_than_this is non-NULL, then only write out inodes which
> + * had their first dirtying at a time earlier than *older_than_this.
> + *
> + * If we're a pdflush thread, then implement pdflush collision avoidance
> + * against the entire list.
> + *
> + * If `bdi' is non-zero then we're being asked to writeback a specific queue.
> + * This function assumes that the blockdev superblock's inodes are backed by
> + * a variety of queues, so all inodes are searched.  For other superblocks,
> + * assume that all inodes are backed by the same queue.
> + *
> + * FIXME: this linear search could get expensive with many filesystems.  But
> + * how to fix?  We need to go from an address_space to all inodes which share
> + * a queue with that address_space.  (Easy: have a global "dirty superblocks"
> + * list).
> + *
> + * The inodes to be written are parked on bdi->b_io.  They are moved back onto
> + * bdi->b_dirty as they are selected for writing.  This way, none can be missed
> + * on the writer throttling path, and we get decent balancing between many
> + * throttled threads: we don't want them all piling up on inode_sync_wait.
> + */
> +void generic_sync_sb_inodes(struct super_block *sb,
> +				struct writeback_control *wbc)
> +{
> +	const int is_blkdev = sb_is_blkdev_sb(sb);
> +	struct backing_dev_info *bdi;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
> +		generic_sync_bdi_inodes(bdi, wbc, is_blkdev);
> +	rcu_read_unlock();
> +
> +	if (wbc->sync_mode == WB_SYNC_ALL) {
>  		struct inode *inode, *old_inode = NULL;
>  
> +		spin_lock(&inode_lock);
> +
>  		/*
>  		 * Data integrity sync. Must wait for all pages under writeback,
>  		 * because there may have been pages dirtied before our sync
> @@ -557,10 +605,8 @@ void generic_sync_sb_inodes(struct super_block *sb,
>  		}
>  		spin_unlock(&inode_lock);
>  		iput(old_inode);
> -	} else
> -		spin_unlock(&inode_lock);
> +	}
>  
> -	return;		/* Leave any unwritten inodes on s_io */
>  }
>  EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
>  
> @@ -575,8 +621,8 @@ static void sync_sb_inodes(struct super_block *sb,
>   *
>   * Note:
>   * We don't need to grab a reference to superblock here. If it has non-empty
> - * ->s_dirty it hadn't been killed yet and kill_super() won't proceed
> - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
> + * ->b_dirty it hadn't been killed yet and kill_super() won't proceed
> + * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
>   * empty. Since __sync_single_inode() regains inode_lock before it finally moves
>   * inode from superblock lists we are OK.
>  
> diff --git a/fs/super.c b/fs/super.c
> index 8349ed6..e3c5b6f 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -64,9 +64,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
>  			s = NULL;
>  			goto out;
>  		}
> -		INIT_LIST_HEAD(&s->s_dirty);
> -		INIT_LIST_HEAD(&s->s_io);
> -		INIT_LIST_HEAD(&s->s_more_io);
>  		INIT_LIST_HEAD(&s->s_files);
>  		INIT_LIST_HEAD(&s->s_instances);
>  		INIT_HLIST_HEAD(&s->s_anon);
> diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
> index bee52ab..bb58c95 100644
> --- a/include/linux/backing-dev.h
> +++ b/include/linux/backing-dev.h
> @@ -40,6 +40,8 @@ enum bdi_stat_item {
>  #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
>  
>  struct backing_dev_info {
> +	struct list_head bdi_list;
> +
>  	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
>  	unsigned long state;	/* Always use atomic bitops on this */
>  	unsigned int capabilities; /* Device capabilities */
> @@ -58,6 +60,10 @@ struct backing_dev_info {
>  
>  	struct device *dev;
>  
> +	struct list_head b_dirty;	/* dirty inodes */
> +	struct list_head b_io;		/* parked for writeback */
> +	struct list_head b_more_io;	/* parked for more writeback */
> +
>  #ifdef CONFIG_DEBUG_FS
>  	struct dentry *debug_dir;
>  	struct dentry *debug_stats;
> @@ -72,6 +78,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
>  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
>  void bdi_unregister(struct backing_dev_info *bdi);
>  
> +extern spinlock_t bdi_lock;
> +extern struct list_head bdi_list;
> +
>  static inline void __add_bdi_stat(struct backing_dev_info *bdi,
>  	enum bdi_stat_item item, s64 amount)
>  {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 92734c0..3c90eb4 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -648,7 +648,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
>  
>  struct inode {
>  	struct hlist_node	i_hash;
> -	struct list_head	i_list;
> +	struct list_head	i_list;		/* backing dev IO list */
>  	struct list_head	i_sb_list;
>  	struct list_head	i_dentry;
>  	unsigned long		i_ino;
> @@ -1155,9 +1155,6 @@ struct super_block {
>  	struct xattr_handler	**s_xattr;
>  
>  	struct list_head	s_inodes;	/* all inodes */
> -	struct list_head	s_dirty;	/* dirty inodes */
> -	struct list_head	s_io;		/* parked for writeback */
> -	struct list_head	s_more_io;	/* parked for more writeback */
>  	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
>  	struct list_head	s_files;
>  	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 8e85874..cf1528b 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -7,8 +7,9 @@
>  #include
>  #include
>  
> -
>  static struct class *bdi_class;
> +DEFINE_SPINLOCK(bdi_lock);
> +LIST_HEAD(bdi_list);
>  
>  #ifdef CONFIG_DEBUG_FS
>  #include
> @@ -187,6 +188,10 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
>  		goto exit;
>  	}
>  
> +	spin_lock(&bdi_lock);
> +	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
> +	spin_unlock(&bdi_lock);
> +
>  	bdi->dev = dev;
>  	bdi_debug_register(bdi, dev_name(dev));
>  
> @@ -201,9 +206,23 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
>  }
>  EXPORT_SYMBOL(bdi_register_dev);
>  
> +static void bdi_remove_from_list(struct backing_dev_info *bdi)
> +{
> +	spin_lock(&bdi_lock);
> +	list_del_rcu(&bdi->bdi_list);
> +	spin_unlock(&bdi_lock);
> +
> +	/*
> +	 * In case the bdi is freed right after unregister, we need to
> +	 * make sure any RCU sections have exited
> +	 */
> +	synchronize_rcu();
> +}
> +
>  void bdi_unregister(struct backing_dev_info *bdi)
>  {
>  	if (bdi->dev) {
> +		bdi_remove_from_list(bdi);
>  		bdi_debug_unregister(bdi);
>  		device_unregister(bdi->dev);
>  		bdi->dev = NULL;
> @@ -221,6 +240,10 @@ int bdi_init(struct backing_dev_info *bdi)
>  	bdi->min_ratio = 0;
>  	bdi->max_ratio = 100;
>  	bdi->max_prop_frac = PROP_FRAC_BASE;
> +	INIT_LIST_HEAD(&bdi->bdi_list);
> +	INIT_LIST_HEAD(&bdi->b_io);
> +	INIT_LIST_HEAD(&bdi->b_dirty);
> +	INIT_LIST_HEAD(&bdi->b_more_io);
>  
>  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
>  		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
> @@ -235,6 +258,8 @@ int bdi_init(struct backing_dev_info *bdi)
>  err:
>  		while (i--)
>  			percpu_counter_destroy(&bdi->bdi_stat[i]);
> +
> +		bdi_remove_from_list(bdi);
>  	}
>  
>  	return err;
> @@ -245,6 +270,10 @@ void bdi_destroy(struct backing_dev_info *bdi)
>  {
>  	int i;
>  
> +	WARN_ON(!list_empty(&bdi->b_dirty));
> +	WARN_ON(!list_empty(&bdi->b_io));
> +	WARN_ON(!list_empty(&bdi->b_more_io));
> +
>  	bdi_unregister(bdi);
>  
>  	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 74dc57c..3ec11d8 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -319,7 +319,6 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
>  /*
>   *
>   */
> -static DEFINE_SPINLOCK(bdi_lock);
>  static unsigned int bdi_min_ratio;
>  
>  int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
> -- 
> 1.6.2

-- 
	Jan Kara
	SuSE CR Labs
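
P.S. To make the structural change concrete: after this patch the dirty
inode lists hang off each backing_dev_info instead of each super_block,
which is exactly why sb_has_dirty_inodes() now has to scan every bdi and
why generic_sync_sb_inodes() visits inodes belonging to every superblock.
Below is a minimal userspace sketch of that shape. The names (b_dirty,
bdi_list, sb_has_dirty_inodes) mirror the patch, but the singly-linked
lists, the absent locking and the trimmed-down types are simplified
stand-ins for illustration, not the kernel implementation:

/*
 * Userspace model of the patch's data-structure move: dirty inodes
 * live on per-bdi lists, so any per-superblock question must scan
 * every bdi's list.  Build with: cc -std=c99 -o bdi-sketch bdi-sketch.c
 */
#include <stdio.h>

struct inode;

struct bdi {			/* stands in for backing_dev_info */
	const char *name;
	struct inode *b_dirty;	/* head of this bdi's dirty list */
	struct bdi *next;	/* stands in for the global bdi_list */
};

struct sb {			/* stands in for super_block */
	const char *name;	/* note: no dirty lists here any more */
};

struct inode {
	struct sb *i_sb;
	struct bdi *i_bdi;	/* i_mapping->backing_dev_info in the kernel */
	struct inode *next;
};

/* __mark_inode_dirty(): queue the inode on its bdi, not its sb */
static void mark_inode_dirty(struct inode *inode)
{
	inode->next = inode->i_bdi->b_dirty;
	inode->i_bdi->b_dirty = inode;
}

/* sb_has_dirty_inodes(): now O(bdis * dirty inodes) instead of O(1) */
static int sb_has_dirty_inodes(struct bdi *bdi_list, struct sb *sb)
{
	for (struct bdi *bdi = bdi_list; bdi; bdi = bdi->next)
		for (struct inode *i = bdi->b_dirty; i; i = i->next)
			if (i->i_sb == sb)
				return 1;
	return 0;
}

/*
 * generic_sync_sb_inodes() analogue: "syncing one sb" walks every
 * bdi, so inodes of other superblocks get visited too.
 */
static void sync_sb_inodes(struct bdi *bdi_list, struct sb *sb)
{
	for (struct bdi *bdi = bdi_list; bdi; bdi = bdi->next)
		for (struct inode *i = bdi->b_dirty; i; i = i->next)
			printf("writing inode of sb %s (asked for %s)\n",
			       i->i_sb->name, sb->name);
}

int main(void)
{
	struct bdi sda = { "sda", NULL, NULL };
	struct bdi sdb = { "sdb", NULL, &sda };	/* global list: sdb -> sda */
	struct sb ext3 = { "ext3" }, xfs = { "xfs" };
	struct inode a = { &ext3, &sda, NULL };
	struct inode b = { &xfs, &sdb, NULL };

	mark_inode_dirty(&a);
	mark_inode_dirty(&b);

	printf("ext3 dirty? %d\n", sb_has_dirty_inodes(&sdb, &ext3)); /* 1 */
	/* A sync "for ext3" now touches the xfs inode as well: */
	sync_sb_inodes(&sdb, &ext3);
	return 0;
}

Running it prints the xfs inode alongside the ext3 one, which is the
behaviour the question above is getting at: with per-bdi lists there is
no per-superblock view left to iterate.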