From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
To: Nick Piggin <npiggin@kernel.dk>
Subject: Re: [patch] mm: vmscan implement per-zone shrinkers
Cc: kosaki.motohiro@jp.fujitsu.com, Andrew Morton <akpm@linux-foundation.org>,
        linux-mm@kvack.org, linux-kernel@vger.kernel.org,
        Linus Torvalds <torvalds@linux-foundation.org>
In-Reply-To: <20101109123246.GA11477@amd>
References: <20101109123246.GA11477@amd>
Message-Id: <20101114182614.BEE5.A69D9226@jp.fujitsu.com>
MIME-Version: 1.0
Content-Type: text/plain; charset="US-ASCII"
Content-Transfer-Encoding: 7bit
Date: Sun, 14 Nov 2010 19:07:17 +0900 (JST)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 27552
Lines: 783

Hi

> Hi,
> 
> I'm doing some works that require per-zone shrinkers, I'd like to get
> the vmscan part signed off and merged by interested mm people, please.
> 
> [And before anybody else kindly suggests per-node shrinkers, please go
> back and read all the discussion about this first.]

vmscan part looks good to me. however I hope fs folks review too even though
I'm not sure who is best.

btw, I have some nitpick comments. see below.


> ---
>  fs/drop_caches.c    |    6 
>  include/linux/mm.h  |   47 ++++++-
>  mm/memory-failure.c |   10 -
>  mm/vmscan.c         |  341 +++++++++++++++++++++++++++++++++++++---------------
>  4 files changed, 297 insertions(+), 107 deletions(-)
> 
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h	2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/include/linux/mm.h	2010-11-09 22:11:10.000000000 +1100
> @@ -1008,6 +1008,10 @@ static inline void sync_mm_rss(struct ta
>  /*
>   * A callback you can register to apply pressure to ageable caches.
>   *
> + * 'shrink_zone' is the new shrinker API. It is to be used in preference
> + * to 'shrink'. One must point to a shrinker function, the other must
> + * be NULL. See 'shrink_slab' for details about the shrink_zone API.
> + *
>   * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
>   * look through the least-recently-used 'nr_to_scan' entries and
>   * attempt to free them up.  It should return the number of objects
> @@ -1024,13 +1028,53 @@ struct shrinker {
>  	int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask);
>  	int seeks;	/* seeks to recreate an obj */
>  
> +	/*
> +	 * shrink_zone - slab shrinker callback for reclaimable objects
> +	 * @shrink: this struct shrinker
> +	 * @zone: zone to scan
> +	 * @scanned: pagecache lru pages scanned in zone
> +	 * @total: total pagecache lru pages in zone
> +	 * @global: global pagecache lru pages (for zone-unaware shrinkers)
> +	 * @flags: shrinker flags
> +	 * @gfp_mask: gfp context we are operating within
> +	 *
> +	 * The shrinkers are responsible for calculating the appropriate
> +	 * pressure to apply, batching up scanning (and cond_resched,
> +	 * cond_resched_lock etc), and updating events counters including
> +	 * count_vm_event(SLABS_SCANNED, nr).
> +	 *
> +	 * This approach gives flexibility to the shrinkers. They know best how
> +	 * to do batching, how much time between cond_resched is appropriate,
> +	 * what statistics to increment, etc.
> +	 */
> +	void (*shrink_zone)(struct shrinker *shrink,
> +		struct zone *zone, unsigned long scanned,
> +		unsigned long total, unsigned long global,
> +		unsigned long flags, gfp_t gfp_mask);
> +

shrink_zone is slightly grep unfriendly. Can you consider shrink_slab_zone() 
or something else?


>  	/* These are for internal use */
>  	struct list_head list;
>  	long nr;	/* objs pending delete */
>  };
> +
> +/* Constants for use by old shrinker API */
>  #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
> +
> +/* Constants for use by new shrinker API */
> +/*
> + * SHRINK_DEFAULT_SEEKS is shifted by 4 to match an arbitrary constant
> + * in the old shrinker code.
> + */
> +#define SHRINK_FACTOR	(128UL) /* Fixed point shift */
> +#define SHRINK_DEFAULT_SEEKS	(SHRINK_FACTOR*DEFAULT_SEEKS/4)
> +#define SHRINK_BATCH	128	/* A good number if you don't know better */
> +
>  extern void register_shrinker(struct shrinker *);
>  extern void unregister_shrinker(struct shrinker *);
> +extern void shrinker_add_scan(unsigned long *dst,
> +				unsigned long scanned, unsigned long total,
> +				unsigned long objects, unsigned int ratio);
> +extern unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch);
>  
>  int vma_wants_writenotify(struct vm_area_struct *vma);
>  
> @@ -1464,8 +1508,7 @@ int in_gate_area_no_task(unsigned long a
>  
>  int drop_caches_sysctl_handler(struct ctl_table *, int,
>  					void __user *, size_t *, loff_t *);
> -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> -			unsigned long lru_pages);
> +void shrink_all_slab(struct zone *zone);
>  
>  #ifndef CONFIG_MMU
>  #define randomize_va_space 0
> Index: linux-2.6/mm/vmscan.c
> ===================================================================
> --- linux-2.6.orig/mm/vmscan.c	2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/mm/vmscan.c	2010-11-09 22:11:10.000000000 +1100
> @@ -80,6 +80,9 @@ struct scan_control {
>  	/* Can pages be swapped as part of reclaim? */
>  	int may_swap;
>  
> +	/* Can slab pages be reclaimed? */
> +	int may_reclaim_slab;
> +
>  	int swappiness;
>  
>  	int order;
> @@ -169,6 +172,8 @@ static unsigned long zone_nr_lru_pages(s
>   */
>  void register_shrinker(struct shrinker *shrinker)
>  {
> +	BUG_ON(shrinker->shrink && shrinker->shrink_zone);
> +	BUG_ON(!shrinker->shrink && !shrinker->shrink_zone);
>  	shrinker->nr = 0;
>  	down_write(&shrinker_rwsem);
>  	list_add_tail(&shrinker->list, &shrinker_list);
> @@ -187,43 +192,101 @@ void unregister_shrinker(struct shrinker
>  }
>  EXPORT_SYMBOL(unregister_shrinker);
>  
> -#define SHRINK_BATCH 128
>  /*
> - * Call the shrink functions to age shrinkable caches
> + * shrinker_add_scan - accumulate shrinker scan
> + * @dst: scan counter variable
> + * @scanned: pagecache pages scanned
> + * @total: total pagecache objects
> + * @tot: total objects in this cache
> + * @ratio: ratio of pagecache value to object value
>   *
> - * Here we assume it costs one seek to replace a lru page and that it also
> - * takes a seek to recreate a cache object.  With this in mind we age equal
> - * percentages of the lru and ageable caches.  This should balance the seeks
> - * generated by these structures.
> + * shrinker_add_scan accumulates a number of objects to scan into @dst,
> + * based on the following ratio:
>   *
> - * If the vm encountered mapped pages on the LRU it increase the pressure on
> - * slab to avoid swapping.
> + * proportion = scanned / total        // proportion of pagecache scanned
> + * obj_prop   = objects * proportion   // same proportion of objects
> + * to_scan    = obj_prop / ratio       // modify by ratio
> + * *dst += (total / scanned)           // accumulate to dst
>   *
> - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
> + * The ratio is a fixed point integer with a factor SHRINK_FACTOR.
> + * Higher ratios give objects higher value.
>   *
> - * `lru_pages' represents the number of on-LRU pages in all the zones which
> - * are eligible for the caller's allocation attempt.  It is used for balancing
> - * slab reclaim versus page reclaim.
> + * @dst is also fixed point, so cannot be used as a simple count.
> + * shrinker_do_scan will take care of that for us.
>   *
> - * Returns the number of slab objects which we shrunk.
> + * There is no synchronisation here, which is fine really. A rare lost
> + * update is no huge deal in reclaim code.
>   */
> -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> -			unsigned long lru_pages)
> +void shrinker_add_scan(unsigned long *dst,
> +			unsigned long scanned, unsigned long total,
> +			unsigned long objects, unsigned int ratio)
>  {
> -	struct shrinker *shrinker;
> -	unsigned long ret = 0;
> +	unsigned long long delta;
>  
> -	if (scanned == 0)
> -		scanned = SWAP_CLUSTER_MAX;
> +	delta = (unsigned long long)scanned * objects;
> +	delta *= SHRINK_FACTOR;
> +	do_div(delta, total + 1);

> +	delta *= SHRINK_FACTOR; /* ratio is also in SHRINK_FACTOR units */
> +	do_div(delta, ratio + 1);

introdusing tiny macro is better than the comment.

>  
> -	if (!down_read_trylock(&shrinker_rwsem))
> -		return 1;	/* Assume we'll be able to shrink next time */
> +	/*
> +	 * Avoid risking looping forever due to too large nr value:
> +	 * never try to free more than twice the estimate number of
> +	 * freeable entries.
> +	 */
> +	*dst += delta;
> +
> +	if (*dst / SHRINK_FACTOR > objects)
> +		*dst = objects * SHRINK_FACTOR;

objects * SHRINK_FACTOR appear twice in this function.
calculate "objects = obj * SHRINK_FACTOR" at first improve
code readability slightly.


> +}
> +EXPORT_SYMBOL(shrinker_add_scan);
> +
> +/*
> + * shrinker_do_scan - scan a batch of objects
> + * @dst: scan counter
> + * @batch: number of objects to scan in this batch
> + * @Returns: number of objects to scan
> + *
> + * shrinker_do_scan takes the scan counter accumulated by shrinker_add_scan,
> + * and decrements it by @batch if it is greater than batch and returns batch.
> + * Otherwise returns 0. The caller should use the return value as the number
> + * of objects to scan next.
> + *
> + * Between shrinker_do_scan calls, the caller should drop locks if possible
> + * and call cond_resched.
> + *
> + * Note, @dst is a fixed point scaled integer. See shrinker_add_scan.
> + *
> + * Like shrinker_add_scan, shrinker_do_scan is not SMP safe, but it doesn't
> + * really need to be.
> + */
> +unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch)

Seems misleading name a bit. shrinker_do_scan() does NOT scan. 
It only does batch adjustment.


> +{
> +	unsigned long nr = ACCESS_ONCE(*dst);

Dumb question: why is this ACCESS_ONCE() necessary?


> +	if (nr < batch * SHRINK_FACTOR)
> +		return 0;
> +	*dst = nr - batch * SHRINK_FACTOR;
> +	return batch;


{
	unsigned long nr = ACCESS_ONCE(*dst);
	batch *= SHRINK_FACTOR;

	if (nr < batch)
		return 0;
	*dst = nr - batch;
	return batch;
}

is slighly cleaner. however It's unclear why dst and batch argument
need to have different unit (i.e why caller can't do batch * FACTOR?).

> +}
> +EXPORT_SYMBOL(shrinker_do_scan);
> +
> +#define SHRINK_BATCH 128
> +/*
> + * Scan the deprecated shrinkers. This will go away soon in favour of
> + * converting everybody to new shrinker API.
> + */
> +static void shrink_slab_old(unsigned long scanned, gfp_t gfp_mask,
> +			unsigned long lru_pages)
> +{
> +	struct shrinker *shrinker;
>  
>  	list_for_each_entry(shrinker, &shrinker_list, list) {
>  		unsigned long long delta;
>  		unsigned long total_scan;
>  		unsigned long max_pass;
>  
> +		if (!shrinker->shrink)
> +			continue;
>  		max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
>  		delta = (4 * scanned) / shrinker->seeks;
>  		delta *= max_pass;
> @@ -250,15 +313,11 @@ unsigned long shrink_slab(unsigned long
>  		while (total_scan >= SHRINK_BATCH) {
>  			long this_scan = SHRINK_BATCH;
>  			int shrink_ret;
> -			int nr_before;
>  
> -			nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
>  			shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
>  								gfp_mask);
>  			if (shrink_ret == -1)
>  				break;
> -			if (shrink_ret < nr_before)
> -				ret += nr_before - shrink_ret;
>  			count_vm_events(SLABS_SCANNED, this_scan);
>  			total_scan -= this_scan;
>  
> @@ -267,8 +326,86 @@ unsigned long shrink_slab(unsigned long
>  
>  		shrinker->nr += total_scan;
>  	}
> +}
> +/*
> + * shrink_slab - Call the shrink functions to age shrinkable caches
> + * @zone: the zone we are currently reclaiming from
> + * @scanned: how many pagecache pages were scanned in this zone
> + * @total: total number of reclaimable pagecache pages in this zone
> + * @global: total number of reclaimable pagecache pages in the system
> + * @gfp_mask: gfp context that we are in
> + *
> + * Slab shrinkers should scan their objects in a proportion to the ratio of
> + * scanned to total pagecache pages in this zone, modified by a "cost"
> + * constant.
> + *
> + * For example, we have a slab cache with 100 reclaimable objects in a
> + * particular zone, and the cost of reclaiming an object is determined to be
> + * twice as expensive as reclaiming a pagecache page (due to likelihood and
> + * cost of reconstruction). If we have 200 reclaimable pagecache pages in that
> + * zone particular zone, and scan 20 of them (10%), we should scan 5% (5) of
> + * the objects in our slab cache.
> + *
> + * If we have a single global list of objects and no per-zone lists, the
> + * global count of objects can be used to find the correct ratio to scan.
> + *
> + * See shrinker_add_scan and shrinker_do_scan for helper functions and
> + * details on how to calculate these numbers.
> + */
> +static void shrink_slab(struct zone *zone, unsigned long scanned,
> +			unsigned long total, unsigned long global,
> +			gfp_t gfp_mask)
> +{
> +	struct shrinker *shrinker;
> +
> +	if (scanned == 0)
> +		scanned = SWAP_CLUSTER_MAX;
> +
> +	if (!down_read_trylock(&shrinker_rwsem))
> +		return;
> +
> +	/* do a global shrink with the old shrinker API */
> +	shrink_slab_old(scanned, gfp_mask, global);
> +
> +	list_for_each_entry(shrinker, &shrinker_list, list) {
> +		if (!shrinker->shrink_zone)
> +			continue;
> +		(*shrinker->shrink_zone)(shrinker, zone, scanned,
> +					total, global, 0, gfp_mask);

flags argument is unused?


> +	}
>  	up_read(&shrinker_rwsem);
> -	return ret;
> +}
> +
> +/**
> + * shrink_all_slab - shrinks slabs in a given zone or system wide
> + * @zone: NULL to shrink slab from all zones, or non-NULL to shrink from a particular zone
> + *
> + * shrink_all_slab is a bit of a big hammer, and it's not really well defined what it should
> + * do (how much, how hard to shrink, etc), and it will throw out the reclaim balance. So it
> + * must only be used very carefully (drop_caches and hardware memory error handler are good
> + * examples).
> + */
> +void shrink_all_slab(struct zone *zone)
> +{
> +	struct reclaim_state reclaim_state;
> +
> +	current->reclaim_state = &reclaim_state;
> +	do {
> +		reclaim_state.reclaimed_slab = 0;
> +		/*
> +		 * Use "100" for "scanned", "total", and "global", so
> +		 * that shrinkers scan a large proportion of their
> +		 * objects. 100 rather than 1 in order to reduce rounding
> +		 * errors.
> +		 */
> +		if (!zone) {
> +			for_each_populated_zone(zone)
> +				shrink_slab(zone, 100, 100, 100, GFP_KERNEL);
> +		} else
> +			shrink_slab(zone, 100, 100, 100, GFP_KERNEL);
> +	} while (reclaim_state.reclaimed_slab);
> +
> +	current->reclaim_state = NULL;
>  }
>  
>  static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
> @@ -1801,16 +1938,22 @@ static void get_scan_count(struct zone *
>   * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
>   */
>  static void shrink_zone(int priority, struct zone *zone,
> -				struct scan_control *sc)
> +			struct scan_control *sc, unsigned long global_lru_pages)
>  {
>  	unsigned long nr[NR_LRU_LISTS];
>  	unsigned long nr_to_scan;
>  	enum lru_list l;
>  	unsigned long nr_reclaimed = sc->nr_reclaimed;
>  	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
> +	unsigned long nr_scanned = sc->nr_scanned;
> +	unsigned long lru_pages = 0;
>  
>  	get_scan_count(zone, sc, nr, priority);
>  
> +	/* Used by slab shrinking, below */
> +	if (sc->may_reclaim_slab)
> +		lru_pages = zone_reclaimable_pages(zone);
> +
>  	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
>  					nr[LRU_INACTIVE_FILE]) {
>  		for_each_evictable_lru(l) {
> @@ -1835,8 +1978,6 @@ static void shrink_zone(int priority, st
>  			break;
>  	}
>  
> -	sc->nr_reclaimed = nr_reclaimed;
> -
>  	/*
>  	 * Even if we did not try to evict anon pages at all, we want to
>  	 * rebalance the anon lru active/inactive ratio.
> @@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
>  	if (inactive_anon_is_low(zone, sc))
>  		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
>  
> +	/*
> +	 * Don't shrink slabs when reclaiming memory from
> +	 * over limit cgroups
> +	 */
> +	if (sc->may_reclaim_slab) {
> +		struct reclaim_state *reclaim_state = current->reclaim_state;
> +
> +		shrink_slab(zone, sc->nr_scanned - nr_scanned,

Doubtful calculation. What mean "sc->nr_scanned - nr_scanned"?
I think nr_scanned simply keep old slab balancing behavior.


> +			lru_pages, global_lru_pages, sc->gfp_mask);
> +		if (reclaim_state) {
> +			nr_reclaimed += reclaim_state->reclaimed_slab;
> +			reclaim_state->reclaimed_slab = 0;
> +		}
> +	}
> +
> +	sc->nr_reclaimed = nr_reclaimed;
> +
>  	throttle_vm_writeout(sc->gfp_mask);
>  }
>  
> @@ -1864,7 +2022,7 @@ static void shrink_zone(int priority, st
>   * scan then give up on it.
>   */
>  static void shrink_zones(int priority, struct zonelist *zonelist,
> -					struct scan_control *sc)
> +		struct scan_control *sc, unsigned long global_lru_pages)
>  {
>  	struct zoneref *z;
>  	struct zone *zone;
> @@ -1884,7 +2042,7 @@ static void shrink_zones(int priority, s
>  				continue;	/* Let kswapd poll it */
>  		}
>  
> -		shrink_zone(priority, zone, sc);
> +		shrink_zone(priority, zone, sc, global_lru_pages);
>  	}
>  }
>  
> @@ -1941,7 +2099,6 @@ static unsigned long do_try_to_free_page
>  {
>  	int priority;
>  	unsigned long total_scanned = 0;
> -	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	struct zoneref *z;
>  	struct zone *zone;
>  	unsigned long writeback_threshold;
> @@ -1953,30 +2110,20 @@ static unsigned long do_try_to_free_page
>  		count_vm_event(ALLOCSTALL);
>  
>  	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
> -		sc->nr_scanned = 0;
> -		if (!priority)
> -			disable_swap_token();
> -		shrink_zones(priority, zonelist, sc);
> -		/*
> -		 * Don't shrink slabs when reclaiming memory from
> -		 * over limit cgroups
> -		 */
> -		if (scanning_global_lru(sc)) {
> -			unsigned long lru_pages = 0;
> -			for_each_zone_zonelist(zone, z, zonelist,
> -					gfp_zone(sc->gfp_mask)) {
> -				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> -					continue;
> +		unsigned long lru_pages = 0;
>  
> -				lru_pages += zone_reclaimable_pages(zone);
> -			}
> +		for_each_zone_zonelist(zone, z, zonelist,
> +				gfp_zone(sc->gfp_mask)) {
> +			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> +				continue;
>  
> -			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
> -			if (reclaim_state) {
> -				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> -				reclaim_state->reclaimed_slab = 0;
> -			}
> +			lru_pages += zone_reclaimable_pages(zone);

Do we really need this doubtful cpuset hardwall filtering? Why do we
need to change slab reclaim pressure if cpuset is used. In old days,
we didn't have per-zone slab shrinker, then we need artificial slab
pressure boost for preventing false positive oom-killer. but now we have.

However, If you strongly keep old behavior at this time, I don't oppose.
We can change it later.


>  		}
> +
> +		sc->nr_scanned = 0;
> +		if (!priority)
> +			disable_swap_token();
> +		shrink_zones(priority, zonelist, sc, lru_pages);
>  		total_scanned += sc->nr_scanned;
>  		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
>  			goto out;
> @@ -2029,6 +2176,7 @@ unsigned long try_to_free_pages(struct z
>  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_reclaim_slab = 1,
>  		.swappiness = vm_swappiness,
>  		.order = order,
>  		.mem_cgroup = NULL,
> @@ -2058,6 +2206,7 @@ unsigned long mem_cgroup_shrink_node_zon
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
>  		.may_swap = !noswap,
> +		.may_reclaim_slab = 0,
>  		.swappiness = swappiness,
>  		.order = 0,
>  		.mem_cgroup = mem,
> @@ -2076,7 +2225,7 @@ unsigned long mem_cgroup_shrink_node_zon
>  	 * will pick up pages from other mem cgroup's as well. We hack
>  	 * the priority and make it zero.
>  	 */
> -	shrink_zone(0, zone, &sc);
> +	shrink_zone(0, zone, &sc, zone_reclaimable_pages(zone));
>  
>  	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
>  
> @@ -2094,6 +2243,7 @@ unsigned long try_to_free_mem_cgroup_pag
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
>  		.may_swap = !noswap,
> +		.may_reclaim_slab = 0,
>  		.nr_to_reclaim = SWAP_CLUSTER_MAX,
>  		.swappiness = swappiness,
>  		.order = 0,
> @@ -2171,11 +2321,11 @@ static unsigned long balance_pgdat(pg_da
>  	int priority;
>  	int i;
>  	unsigned long total_scanned;
> -	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	struct scan_control sc = {
>  		.gfp_mask = GFP_KERNEL,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_reclaim_slab = 1,
>  		/*
>  		 * kswapd doesn't want to be bailed out while reclaim. because
>  		 * we want to put equal scanning pressure on each zone.
> @@ -2249,7 +2399,6 @@ static unsigned long balance_pgdat(pg_da
>  		 */
>  		for (i = 0; i <= end_zone; i++) {
>  			struct zone *zone = pgdat->node_zones + i;
> -			int nr_slab;
>  
>  			if (!populated_zone(zone))
>  				continue;
> @@ -2271,15 +2420,11 @@ static unsigned long balance_pgdat(pg_da
>  			 */
>  			if (!zone_watermark_ok(zone, order,
>  					8*high_wmark_pages(zone), end_zone, 0))
> -				shrink_zone(priority, zone, &sc);
> -			reclaim_state->reclaimed_slab = 0;
> -			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> -						lru_pages);
> -			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
> +				shrink_zone(priority, zone, &sc, lru_pages);
>  			total_scanned += sc.nr_scanned;
>  			if (zone->all_unreclaimable)
>  				continue;
> -			if (nr_slab == 0 && !zone_reclaimable(zone))
> +			if (!zone_reclaimable(zone))
>  				zone->all_unreclaimable = 1;
>  			/*
>  			 * If we've done a decent amount of scanning and
> @@ -2545,6 +2690,7 @@ unsigned long shrink_all_memory(unsigned
>  		.may_swap = 1,
>  		.may_unmap = 1,
>  		.may_writepage = 1,
> +		.may_reclaim_slab = 1,
>  		.nr_to_reclaim = nr_to_reclaim,
>  		.hibernation_mode = 1,
>  		.swappiness = vm_swappiness,
> @@ -2728,13 +2874,14 @@ static int __zone_reclaim(struct zone *z
>  		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
>  		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
>  		.may_swap = 1,
> +		.may_reclaim_slab = 0,
>  		.nr_to_reclaim = max_t(unsigned long, nr_pages,
>  				       SWAP_CLUSTER_MAX),
>  		.gfp_mask = gfp_mask,
>  		.swappiness = vm_swappiness,
>  		.order = order,
>  	};
> -	unsigned long nr_slab_pages0, nr_slab_pages1;
> +	unsigned long lru_pages, slab_pages;
>  
>  	cond_resched();
>  	/*
> @@ -2747,51 +2894,61 @@ static int __zone_reclaim(struct zone *z
>  	reclaim_state.reclaimed_slab = 0;
>  	p->reclaim_state = &reclaim_state;
>  
> +	lru_pages = zone_reclaimable_pages(zone);
> +	slab_pages = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> +
>  	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
> +		if (slab_pages > zone->min_slab_pages)
> +			sc.may_reclaim_slab = 1;
>  		/*
>  		 * Free memory by calling shrink zone with increasing
>  		 * priorities until we have enough memory freed.
>  		 */
>  		priority = ZONE_RECLAIM_PRIORITY;
>  		do {
> -			shrink_zone(priority, zone, &sc);
> +			shrink_zone(priority, zone, &sc, lru_pages);
>  			priority--;
>  		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
> -	}
>  
> -	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> -	if (nr_slab_pages0 > zone->min_slab_pages) {
> +	} else if (slab_pages > zone->min_slab_pages) {
>  		/*
> -		 * shrink_slab() does not currently allow us to determine how
> -		 * many pages were freed in this zone. So we take the current
> -		 * number of slab pages and shake the slab until it is reduced
> -		 * by the same nr_pages that we used for reclaiming unmapped
> -		 * pages.
> -		 *
> -		 * Note that shrink_slab will free memory on all zones and may
> -		 * take a long time.
> +		 * Scanning slab without pagecache, have to open code
> +		 * call to shrink_slab (shirnk_zone drives slab reclaim via
> +		 * pagecache scanning, so it isn't set up to shrink slab
> +		 * without scanning pagecache.
>  		 */
> -		for (;;) {
> -			unsigned long lru_pages = zone_reclaimable_pages(zone);
> -
> -			/* No reclaimable slab or very low memory pressure */
> -			if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
> -				break;
>  
> -			/* Freed enough memory */
> -			nr_slab_pages1 = zone_page_state(zone,
> -							NR_SLAB_RECLAIMABLE);
> -			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
> -				break;
> -		}
> +		/*
> +		 * lru_pages / 10  -- put a 10% pressure on the slab
> +		 * which roughly corresponds to ZONE_RECLAIM_PRIORITY
> +		 * scanning 1/16th of pagecache.
> +		 *
> +		 * Global slabs will be shrink at a relatively more
> +		 * aggressive rate because we don't calculate the
> +		 * global lru size for speed. But they really should
> +		 * be converted to per zone slabs if they are important
> +		 */
> +		shrink_slab(zone, lru_pages / 10, lru_pages, lru_pages,
> +				gfp_mask);

Why don't you use sc.nr_scanned? It seems straight forward.


>  
>  		/*
> -		 * Update nr_reclaimed by the number of slab pages we
> -		 * reclaimed from this zone.
> +		 * Although we have a zone based slab shrinker API, some slabs
> +		 * are still scanned globally. This means we can't quite
> +		 * determine how many pages were freed in this zone by
> +		 * checking reclaimed_slab. However the regular shrink_zone
> +		 * paths have exactly the same problem that they largely
> +		 * ignore. So don't be different.
> +		 *
> +		 * The situation will improve dramatically as important slabs
> +		 * are switched over to using reclaimed_slab after the
> +		 * important slabs are converted to using per zone shrinkers.
> +		 *
> +		 * Note that shrink_slab may free memory on all zones and may
> +		 * take a long time, but again switching important slabs to
> +		 * zone based shrinkers will solve this problem.
>  		 */
> -		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> -		if (nr_slab_pages1 < nr_slab_pages0)
> -			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
> +		sc.nr_reclaimed += reclaim_state.reclaimed_slab;
> +		reclaim_state.reclaimed_slab = 0;
>  	}
>  
>  	p->reclaim_state = NULL;
> Index: linux-2.6/fs/drop_caches.c
> ===================================================================
> --- linux-2.6.orig/fs/drop_caches.c	2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/fs/drop_caches.c	2010-11-09 22:11:10.000000000 +1100
> @@ -35,11 +35,7 @@ static void drop_pagecache_sb(struct sup
>  
>  static void drop_slab(void)
>  {
> -	int nr_objects;
> -
> -	do {
> -		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
> -	} while (nr_objects > 10);
> +	shrink_all_slab(NULL); /* NULL - all zones */
>  }
>  
>  int drop_caches_sysctl_handler(ctl_table *table, int write,
> Index: linux-2.6/mm/memory-failure.c
> ===================================================================
> --- linux-2.6.orig/mm/memory-failure.c	2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/mm/memory-failure.c	2010-11-09 22:11:10.000000000 +1100
> @@ -235,14 +235,8 @@ void shake_page(struct page *p, int acce
>  	 * Only all shrink_slab here (which would also
>  	 * shrink other caches) if access is not potentially fatal.
>  	 */
> -	if (access) {
> -		int nr;
> -		do {
> -			nr = shrink_slab(1000, GFP_KERNEL, 1000);
> -			if (page_count(p) == 1)
> -				break;
> -		} while (nr > 10);
> -	}
> +	if (access)
> +		shrink_all_slab(page_zone(p));
>  }
>  EXPORT_SYMBOL_GPL(shake_page);
>  
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/