Hi,
I'm doing some work that requires per-zone shrinkers, and I'd like to get
the vmscan part signed off and merged by interested mm people, please.
[And before anybody else kindly suggests per-node shrinkers, please go
back and read all the discussion about this first.]
Patches against Linus's current tree.
--
Allow the shrinker to do per-zone shrinking. This requires adding a zone
argument to the shrinker callback and calling shrinkers for each zone
scanned. The logic in the vmscan code gets somewhat simpler: the shrinkers are
invoked for each zone, around the same time as the pagecache scanner.
Zone reclaim needed a bit of surgery to cope with the change, but the
idea is the same.
But all shrinkers are currently global, so they need a way to convert
per-zone ratios into global scan ratios. Seeing as we are changing the
shrinker API anyway, let's reorganise it to make it saner.
So the shrinker callback is passed:
- the number of pagecache pages scanned in this zone
- the number of pagecache pages in this zone
- the total number of pagecache pages in all zones to be scanned
The shrinker is now completely responsible for calculating its own scan
count and batching it up (given helpers), which provides better flexibility.
vmscan helper functions are provided to accumulate these ratios and to help
with batching.
Finally, add some fixed-point scaling to the ratio, which helps rounding.
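For illustration, a converted shrinker with one LRU per zone might look
something like the sketch below. This is not part of the patch; the
mycache structures and the mycache_zone_lookup/mycache_scan_lru functions
are made up, only the shrinker_* helpers and constants come from this series.

struct mycache_zone {
	spinlock_t		lock;
	struct list_head	lru;
	unsigned long		nr_objects;
	unsigned long		nr_to_scan;	/* fixed point, SHRINK_FACTOR units */
};

static void mycache_shrink_zone(struct shrinker *shrink, struct zone *zone,
		unsigned long scanned, unsigned long total,
		unsigned long global, unsigned long flags, gfp_t gfp_mask)
{
	/* mycache_zone_lookup() is hypothetical: find this cache's per-zone LRU */
	struct mycache_zone *z = mycache_zone_lookup(zone);
	unsigned long nr;

	/* accumulate pressure in proportion to pagecache scanned in this zone */
	shrinker_add_scan(&z->nr_to_scan, scanned, total,
				z->nr_objects, SHRINK_DEFAULT_SEEKS);

	/* work through it in SHRINK_BATCH sized chunks */
	while ((nr = shrinker_do_scan(&z->nr_to_scan, SHRINK_BATCH))) {
		mycache_scan_lru(z, nr, gfp_mask);	/* hypothetical reclaim */
		count_vm_events(SLABS_SCANNED, nr);
		cond_resched();
	}
}

static struct shrinker mycache_shrinker = {
	.shrink_zone = mycache_shrink_zone,
};

Registration is unchanged: register_shrinker(&mycache_shrinker), with
.shrink left NULL for converted code.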
The old shrinker API remains for unconverted code. There is no urgency
to convert them all at once.
Signed-off-by: Nick Piggin <[email protected]>
---
fs/drop_caches.c | 6
include/linux/mm.h | 47 ++++++-
mm/memory-failure.c | 10 -
mm/vmscan.c | 341 +++++++++++++++++++++++++++++++++++++---------------
4 files changed, 297 insertions(+), 107 deletions(-)
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2010-11-09 22:11:03.000000000 +1100
+++ linux-2.6/include/linux/mm.h 2010-11-09 22:11:10.000000000 +1100
@@ -1008,6 +1008,10 @@ static inline void sync_mm_rss(struct ta
/*
* A callback you can register to apply pressure to ageable caches.
*
+ * 'shrink_zone' is the new shrinker API. It is to be used in preference
+ * to 'shrink'. One must point to a shrinker function, the other must
+ * be NULL. See 'shrink_slab' for details about the shrink_zone API.
+ *
* 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should
* look through the least-recently-used 'nr_to_scan' entries and
* attempt to free them up. It should return the number of objects
@@ -1024,13 +1028,53 @@ struct shrinker {
int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask);
int seeks; /* seeks to recreate an obj */
+ /*
+ * shrink_zone - slab shrinker callback for reclaimable objects
+ * @shrink: this struct shrinker
+ * @zone: zone to scan
+ * @scanned: pagecache lru pages scanned in zone
+ * @total: total pagecache lru pages in zone
+ * @global: global pagecache lru pages (for zone-unaware shrinkers)
+ * @flags: shrinker flags
+ * @gfp_mask: gfp context we are operating within
+ *
+ * The shrinkers are responsible for calculating the appropriate
+ * pressure to apply, batching up scanning (and cond_resched,
+ * cond_resched_lock etc), and updating event counters, including
+ * count_vm_events(SLABS_SCANNED, nr).
+ *
+ * This approach gives flexibility to the shrinkers. They know best how
+ * to do batching, how much time between cond_resched is appropriate,
+ * what statistics to increment, etc.
+ */
+ void (*shrink_zone)(struct shrinker *shrink,
+ struct zone *zone, unsigned long scanned,
+ unsigned long total, unsigned long global,
+ unsigned long flags, gfp_t gfp_mask);
+
/* These are for internal use */
struct list_head list;
long nr; /* objs pending delete */
};
+
+/* Constants for use by old shrinker API */
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
+
+/* Constants for use by new shrinker API */
+/*
+ * SHRINK_DEFAULT_SEEKS is scaled by SHRINK_FACTOR and divided by 4 to
+ * match the arbitrary factor of 4 used by the old shrinker code.
+ */
+#define SHRINK_FACTOR (128UL) /* Fixed point shift */
+#define SHRINK_DEFAULT_SEEKS (SHRINK_FACTOR*DEFAULT_SEEKS/4)
+#define SHRINK_BATCH 128 /* A good number if you don't know better */
+
extern void register_shrinker(struct shrinker *);
extern void unregister_shrinker(struct shrinker *);
+extern void shrinker_add_scan(unsigned long *dst,
+ unsigned long scanned, unsigned long total,
+ unsigned long objects, unsigned int ratio);
+extern unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch);
int vma_wants_writenotify(struct vm_area_struct *vma);
@@ -1464,8 +1508,7 @@ int in_gate_area_no_task(unsigned long a
int drop_caches_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages);
+void shrink_all_slab(struct zone *zone);
#ifndef CONFIG_MMU
#define randomize_va_space 0
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2010-11-09 22:11:03.000000000 +1100
+++ linux-2.6/mm/vmscan.c 2010-11-09 22:11:10.000000000 +1100
@@ -80,6 +80,9 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
int may_swap;
+ /* Can slab pages be reclaimed? */
+ int may_reclaim_slab;
+
int swappiness;
int order;
@@ -169,6 +172,8 @@ static unsigned long zone_nr_lru_pages(s
*/
void register_shrinker(struct shrinker *shrinker)
{
+ BUG_ON(shrinker->shrink && shrinker->shrink_zone);
+ BUG_ON(!shrinker->shrink && !shrinker->shrink_zone);
shrinker->nr = 0;
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
@@ -187,43 +192,101 @@ void unregister_shrinker(struct shrinker
}
EXPORT_SYMBOL(unregister_shrinker);
-#define SHRINK_BATCH 128
/*
- * Call the shrink functions to age shrinkable caches
+ * shrinker_add_scan - accumulate shrinker scan
+ * @dst: scan counter variable
+ * @scanned: pagecache pages scanned
+ * @total: total pagecache pages
+ * @objects: total objects in this cache
+ * @ratio: ratio of pagecache value to object value
*
- * Here we assume it costs one seek to replace a lru page and that it also
- * takes a seek to recreate a cache object. With this in mind we age equal
- * percentages of the lru and ageable caches. This should balance the seeks
- * generated by these structures.
+ * shrinker_add_scan accumulates a number of objects to scan into @dst,
+ * based on the following ratio:
*
- * If the vm encountered mapped pages on the LRU it increase the pressure on
- * slab to avoid swapping.
+ * proportion = scanned / total // proportion of pagecache scanned
+ * obj_prop = objects * proportion // same proportion of objects
+ * to_scan = obj_prop / ratio // modify by ratio
+ * *dst += to_scan // accumulate into dst
*
- * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ * The ratio is a fixed point integer with a factor SHRINK_FACTOR.
+ * Higher ratios give objects higher value.
*
- * `lru_pages' represents the number of on-LRU pages in all the zones which
- * are eligible for the caller's allocation attempt. It is used for balancing
- * slab reclaim versus page reclaim.
+ * @dst is also fixed point, so cannot be used as a simple count.
+ * shrinker_do_scan will take care of that for us.
*
- * Returns the number of slab objects which we shrunk.
+ * There is no synchronisation here, which is fine really. A rare lost
+ * update is no huge deal in reclaim code.
*/
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+void shrinker_add_scan(unsigned long *dst,
+ unsigned long scanned, unsigned long total,
+ unsigned long objects, unsigned int ratio)
{
- struct shrinker *shrinker;
- unsigned long ret = 0;
+ unsigned long long delta;
- if (scanned == 0)
- scanned = SWAP_CLUSTER_MAX;
+ delta = (unsigned long long)scanned * objects;
+ delta *= SHRINK_FACTOR;
+ do_div(delta, total + 1);
+ delta *= SHRINK_FACTOR; /* ratio is also in SHRINK_FACTOR units */
+ do_div(delta, ratio + 1);
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ *dst += delta;
+
+ if (*dst / SHRINK_FACTOR > objects)
+ *dst = objects * SHRINK_FACTOR;
+}
+EXPORT_SYMBOL(shrinker_add_scan);
+
+/*
+ * shrinker_do_scan - scan a batch of objects
+ * @dst: scan counter
+ * @batch: number of objects to scan in this batch
+ * Returns: number of objects to scan
+ *
+ * shrinker_do_scan takes the scan counter accumulated by shrinker_add_scan,
+ * and decrements it by @batch if it is greater than batch and returns batch.
+ * Otherwise returns 0. The caller should use the return value as the number
+ * of objects to scan next.
+ *
+ * Between shrinker_do_scan calls, the caller should drop locks if possible
+ * and call cond_resched.
+ *
+ * Note, @dst is a fixed point scaled integer. See shrinker_add_scan.
+ *
+ * Like shrinker_add_scan, shrinker_do_scan is not SMP safe, but it doesn't
+ * really need to be.
+ */
+unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch)
+{
+ unsigned long nr = ACCESS_ONCE(*dst);
+ if (nr < batch * SHRINK_FACTOR)
+ return 0;
+ *dst = nr - batch * SHRINK_FACTOR;
+ return batch;
+}
+EXPORT_SYMBOL(shrinker_do_scan);
+
+#define SHRINK_BATCH 128
+/*
+ * Scan the deprecated shrinkers. This will go away soon in favour of
+ * converting everybody to the new shrinker API.
+ */
+static void shrink_slab_old(unsigned long scanned, gfp_t gfp_mask,
+ unsigned long lru_pages)
+{
+ struct shrinker *shrinker;
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
unsigned long max_pass;
+ if (!shrinker->shrink)
+ continue;
max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
delta *= max_pass;
@@ -250,15 +313,11 @@ unsigned long shrink_slab(unsigned long
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
- int nr_before;
- nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
gfp_mask);
if (shrink_ret == -1)
break;
- if (shrink_ret < nr_before)
- ret += nr_before - shrink_ret;
count_vm_events(SLABS_SCANNED, this_scan);
total_scan -= this_scan;
@@ -267,8 +326,86 @@ unsigned long shrink_slab(unsigned long
shrinker->nr += total_scan;
}
+}
+/*
+ * shrink_slab - Call the shrink functions to age shrinkable caches
+ * @zone: the zone we are currently reclaiming from
+ * @scanned: how many pagecache pages were scanned in this zone
+ * @total: total number of reclaimable pagecache pages in this zone
+ * @global: total number of reclaimable pagecache pages in the system
+ * @gfp_mask: gfp context that we are in
+ *
+ * Slab shrinkers should scan their objects in a proportion to the ratio of
+ * scanned to total pagecache pages in this zone, modified by a "cost"
+ * constant.
+ *
+ * For example, we have a slab cache with 100 reclaimable objects in a
+ * particular zone, and the cost of reclaiming an object is determined to be
+ * twice as expensive as reclaiming a pagecache page (due to likelihood and
+ * cost of reconstruction). If we have 200 reclaimable pagecache pages in that
+ * particular zone, and scan 20 of them (10%), we should scan 5% (5) of
+ * the objects in our slab cache.
+ *
+ * If we have a single global list of objects and no per-zone lists, the
+ * global count of objects can be used to find the correct ratio to scan.
+ *
+ * See shrinker_add_scan and shrinker_do_scan for helper functions and
+ * details on how to calculate these numbers.
+ */
+static void shrink_slab(struct zone *zone, unsigned long scanned,
+ unsigned long total, unsigned long global,
+ gfp_t gfp_mask)
+{
+ struct shrinker *shrinker;
+
+ if (scanned == 0)
+ scanned = SWAP_CLUSTER_MAX;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ return;
+
+ /* do a global shrink with the old shrinker API */
+ shrink_slab_old(scanned, gfp_mask, global);
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ if (!shrinker->shrink_zone)
+ continue;
+ (*shrinker->shrink_zone)(shrinker, zone, scanned,
+ total, global, 0, gfp_mask);
+ }
up_read(&shrinker_rwsem);
- return ret;
+}
+
+/**
+ * shrink_all_slab - shrinks slabs in a given zone or system wide
+ * @zone: zone to shrink from, or NULL to shrink slab from all zones
+ *
+ * shrink_all_slab is a bit of a big hammer, and it's not really well
+ * defined what it should do (how much, how hard to shrink, etc), and it
+ * will throw off the reclaim balance. So it must only be used very
+ * carefully (drop_caches and the hardware memory error handler are good
+ * examples).
+ */
+void shrink_all_slab(struct zone *zone)
+{
+ struct reclaim_state reclaim_state;
+
+ current->reclaim_state = &reclaim_state;
+ do {
+ reclaim_state.reclaimed_slab = 0;
+ /*
+ * Use "100" for "scanned", "total", and "global", so
+ * that shrinkers scan a large proportion of their
+ * objects. 100 rather than 1 in order to reduce rounding
+ * errors.
+ */
+ if (!zone) {
+ for_each_populated_zone(zone)
+ shrink_slab(zone, 100, 100, 100, GFP_KERNEL);
+ } else
+ shrink_slab(zone, 100, 100, 100, GFP_KERNEL);
+ } while (reclaim_state.reclaimed_slab);
+
+ current->reclaim_state = NULL;
}
static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
@@ -1801,16 +1938,22 @@ static void get_scan_count(struct zone *
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+ struct scan_control *sc, unsigned long global_lru_pages)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ unsigned long nr_scanned = sc->nr_scanned;
+ unsigned long lru_pages = 0;
get_scan_count(zone, sc, nr, priority);
+ /* Used by slab shrinking, below */
+ if (sc->may_reclaim_slab)
+ lru_pages = zone_reclaimable_pages(zone);
+
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
@@ -1835,8 +1978,6 @@ static void shrink_zone(int priority, st
break;
}
- sc->nr_reclaimed = nr_reclaimed;
-
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
@@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ /*
+ * Don't shrink slabs when reclaiming memory from
+ * over limit cgroups
+ */
+ if (sc->may_reclaim_slab) {
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+
+ shrink_slab(zone, sc->nr_scanned - nr_scanned,
+ lru_pages, global_lru_pages, sc->gfp_mask);
+ if (reclaim_state) {
+ nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+
+ sc->nr_reclaimed = nr_reclaimed;
+
throttle_vm_writeout(sc->gfp_mask);
}
@@ -1864,7 +2022,7 @@ static void shrink_zone(int priority, st
* scan then give up on it.
*/
static void shrink_zones(int priority, struct zonelist *zonelist,
- struct scan_control *sc)
+ struct scan_control *sc, unsigned long global_lru_pages)
{
struct zoneref *z;
struct zone *zone;
@@ -1884,7 +2042,7 @@ static void shrink_zones(int priority, s
continue; /* Let kswapd poll it */
}
- shrink_zone(priority, zone, sc);
+ shrink_zone(priority, zone, sc, global_lru_pages);
}
}
@@ -1941,7 +2099,6 @@ static unsigned long do_try_to_free_page
{
int priority;
unsigned long total_scanned = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;
@@ -1953,30 +2110,20 @@ static unsigned long do_try_to_free_page
count_vm_event(ALLOCSTALL);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- sc->nr_scanned = 0;
- if (!priority)
- disable_swap_token();
- shrink_zones(priority, zonelist, sc);
- /*
- * Don't shrink slabs when reclaiming memory from
- * over limit cgroups
- */
- if (scanning_global_lru(sc)) {
- unsigned long lru_pages = 0;
- for_each_zone_zonelist(zone, z, zonelist,
- gfp_zone(sc->gfp_mask)) {
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
+ unsigned long lru_pages = 0;
- lru_pages += zone_reclaimable_pages(zone);
- }
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
- shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ lru_pages += zone_reclaimable_pages(zone);
}
+
+ sc->nr_scanned = 0;
+ if (!priority)
+ disable_swap_token();
+ shrink_zones(priority, zonelist, sc, lru_pages);
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
@@ -2029,6 +2176,7 @@ unsigned long try_to_free_pages(struct z
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
.may_swap = 1,
+ .may_reclaim_slab = 1,
.swappiness = vm_swappiness,
.order = order,
.mem_cgroup = NULL,
@@ -2058,6 +2206,7 @@ unsigned long mem_cgroup_shrink_node_zon
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
+ .may_reclaim_slab = 0,
.swappiness = swappiness,
.order = 0,
.mem_cgroup = mem,
@@ -2076,7 +2225,7 @@ unsigned long mem_cgroup_shrink_node_zon
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_zone(0, zone, &sc);
+ shrink_zone(0, zone, &sc, zone_reclaimable_pages(zone));
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2094,6 +2243,7 @@ unsigned long try_to_free_mem_cgroup_pag
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
+ .may_reclaim_slab = 0,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.swappiness = swappiness,
.order = 0,
@@ -2171,11 +2321,11 @@ static unsigned long balance_pgdat(pg_da
int priority;
int i;
unsigned long total_scanned;
- struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
.may_swap = 1,
+ .may_reclaim_slab = 1,
/*
* kswapd doesn't want to be bailed out while reclaim. because
* we want to put equal scanning pressure on each zone.
@@ -2249,7 +2399,6 @@ static unsigned long balance_pgdat(pg_da
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- int nr_slab;
if (!populated_zone(zone))
continue;
@@ -2271,15 +2420,11 @@ static unsigned long balance_pgdat(pg_da
*/
if (!zone_watermark_ok(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
- shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ shrink_zone(priority, zone, &sc, lru_pages);
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
+ if (!zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -2545,6 +2690,7 @@ unsigned long shrink_all_memory(unsigned
.may_swap = 1,
.may_unmap = 1,
.may_writepage = 1,
+ .may_reclaim_slab = 1,
.nr_to_reclaim = nr_to_reclaim,
.hibernation_mode = 1,
.swappiness = vm_swappiness,
@@ -2728,13 +2874,14 @@ static int __zone_reclaim(struct zone *z
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.may_swap = 1,
+ .may_reclaim_slab = 0,
.nr_to_reclaim = max_t(unsigned long, nr_pages,
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
.order = order,
};
- unsigned long nr_slab_pages0, nr_slab_pages1;
+ unsigned long lru_pages, slab_pages;
cond_resched();
/*
@@ -2747,51 +2894,61 @@ static int __zone_reclaim(struct zone *z
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
+ lru_pages = zone_reclaimable_pages(zone);
+ slab_pages = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+
if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ if (slab_pages > zone->min_slab_pages)
+ sc.may_reclaim_slab = 1;
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
- shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc, lru_pages);
priority--;
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
- }
- nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages0 > zone->min_slab_pages) {
+ } else if (slab_pages > zone->min_slab_pages) {
/*
- * shrink_slab() does not currently allow us to determine how
- * many pages were freed in this zone. So we take the current
- * number of slab pages and shake the slab until it is reduced
- * by the same nr_pages that we used for reclaiming unmapped
- * pages.
- *
- * Note that shrink_slab will free memory on all zones and may
- * take a long time.
+ * Scanning slab without pagecache, so we have to open code the
+ * call to shrink_slab (shrink_zone drives slab reclaim via
+ * pagecache scanning, so it isn't set up to shrink slab
+ * without scanning pagecache).
*/
- for (;;) {
- unsigned long lru_pages = zone_reclaimable_pages(zone);
-
- /* No reclaimable slab or very low memory pressure */
- if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
- break;
- /* Freed enough memory */
- nr_slab_pages1 = zone_page_state(zone,
- NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
- break;
- }
+ /*
+ * lru_pages / 10 -- put a 10% pressure on the slab
+ * which roughly corresponds to ZONE_RECLAIM_PRIORITY
+ * scanning 1/16th of pagecache.
+ *
+ * Global slabs will be shrunk at a relatively more
+ * aggressive rate because, for speed, we don't calculate
+ * the global lru size. But they really should
+ * be converted to per zone slabs if they are important.
+ */
+ shrink_slab(zone, lru_pages / 10, lru_pages, lru_pages,
+ gfp_mask);
/*
- * Update nr_reclaimed by the number of slab pages we
- * reclaimed from this zone.
+ * Although we have a zone based slab shrinker API, some slabs
+ * are still scanned globally. This means we can't quite
+ * determine how many pages were freed in this zone by
+ * checking reclaimed_slab. However the regular shrink_zone
+ * paths have exactly the same problem that they largely
+ * ignore. So don't be different.
+ *
+ * The situation will improve dramatically once the important
+ * slabs are converted to per zone shrinkers and switched over
+ * to using reclaimed_slab.
+ *
+ * Note that shrink_slab may free memory on all zones and may
+ * take a long time, but again switching important slabs to
+ * zone based shrinkers will solve this problem.
*/
- nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (nr_slab_pages1 < nr_slab_pages0)
- sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
+ sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+ reclaim_state.reclaimed_slab = 0;
}
p->reclaim_state = NULL;
Index: linux-2.6/fs/drop_caches.c
===================================================================
--- linux-2.6.orig/fs/drop_caches.c 2010-11-09 22:11:03.000000000 +1100
+++ linux-2.6/fs/drop_caches.c 2010-11-09 22:11:10.000000000 +1100
@@ -35,11 +35,7 @@ static void drop_pagecache_sb(struct sup
static void drop_slab(void)
{
- int nr_objects;
-
- do {
- nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
- } while (nr_objects > 10);
+ shrink_all_slab(NULL); /* NULL - all zones */
}
int drop_caches_sysctl_handler(ctl_table *table, int write,
Index: linux-2.6/mm/memory-failure.c
===================================================================
--- linux-2.6.orig/mm/memory-failure.c 2010-11-09 22:11:03.000000000 +1100
+++ linux-2.6/mm/memory-failure.c 2010-11-09 22:11:10.000000000 +1100
@@ -235,14 +235,8 @@ void shake_page(struct page *p, int acce
* Only call shrink_slab here (which would also
* shrink other caches) if access is not potentially fatal.
*/
- if (access) {
- int nr;
- do {
- nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 1)
- break;
- } while (nr > 10);
- }
+ if (access)
+ shrink_all_slab(page_zone(p));
}
EXPORT_SYMBOL_GPL(shake_page);
On Tue, Nov 09, 2010 at 11:32:46PM +1100, Nick Piggin wrote:
> Hi,
>
> I'm doing some works that require per-zone shrinkers, I'd like to get
> the vmscan part signed off and merged by interested mm people, please.
>
There are still plenty of unresolved issues with this general
approach to scaling object caches that I'd like to see sorted out
before we merge any significant shrinker API changes. Some things off
the top of my head:
- how to solve impedance mismatches between VM scalability
techniques and subsystem scalability techniques that result
in shrinker cross-multiplication explosions. e.g. XFS
tracks reclaimable inodes in per-allocation group trees,
so we'd get AG x per-zone LRU trees using this shrinker
method. Think of the overhead on a 1000AG filesystem on a
1000 node machine with 3-5 zones per node....
- changes from global LRU behaviour to something that is not
at all global - effect on workloads that depend on large
scale caches that span multiple nodes is largely unknown.
It will change IO patterns and affect system balance and
performance of the system. How do we
test/categorise/understand these problems and address such
balance issues?
- your use of this shrinker architecture for VFS
inode/dentry cache scalability requires adding lists and
locks to the MM struct zone for each object cache type
(inode, dentry, etc). As such, it is not a generic
solution because it cannot be used for per-instance caches
like the per-mount inode caches XFS uses.
i.e. nothing can actually use this infrastructure change
without tying itself directly into the VM implementation,
and even then not every existing shrinker can use this
method of scaling. i.e. some level of abstraction from the
VM implementation is needed in the shrinker API.
- it has been pointed out that slab caches are generally
allocated out of a single zone per node, so per-zone
shrinker granularity seems unnecessary.
- doesn't solve the unbound direct reclaim shrinker
parallelism that is already causing excessive LRU lock
contention on 8p single node systems. While
per-LRU/per-node solves the larger scalability issue, it
doesn't address scalability within the node. This is soon
going to be 24p per node and that's more than enough to
cause severe problems with a single lock and list...
> [And before anybody else kindly suggests per-node shrinkers, please go
> back and read all the discussion about this first.]
I don't care for any particular solution, but I want these issues
resolved before we make any move forward. per-node abstractions is
just one possible way that has been suggested to address some of
these issues, so it shouldn't be dismissed out of hand like this.
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Nov 10, 2010 at 04:18:13PM +1100, Dave Chinner wrote:
> On Tue, Nov 09, 2010 at 11:32:46PM +1100, Nick Piggin wrote:
> > Hi,
> >
> > I'm doing some works that require per-zone shrinkers, I'd like to get
> > the vmscan part signed off and merged by interested mm people, please.
> >
>
> There are still plenty of unresolved issues with this general
> approach to scaling object caches that I'd like to see sorted out
> before we merge any significant shrinker API changes. Some things of
No changes, it just adds new APIs.
> the top of my head:
>
> - how to solve impedance mismatches between VM scalability
> techniques and subsystem scalability techniques that result
> in shrinker cross-multiplication explosions. e.g. XFS
> tracks reclaimable inodes in per-allocation group trees,
> so we'd get AG x per-zone LRU trees using this shrinker
> method. Think of the overhead on a 1000AG filesystem on a
> 1000 node machine with 3-5 zones per node....
That's an interesting question, but no reason to hold anything up.
It's up to the subsystem to handle it, or if it can't handle it
sanely, it can elect to get no different behaviour than today.
I would say that per-zone is important for the above case, because it
shrinks properly, based on memory pressure and pressure settings (eg.
zone reclaim), and also obviously so that the reclaim threads are
scanning node-local-memory so it doesn't have to pull objects from
all over the interconnect.
With 1000 nodes and 1000AG filesystem, probably NxM is overkill, but
there is no reason you couldn't have one LRU per X AGs.
All of this is obviously implementation detail -- pagecache reclaim does
not want to know about this, so it doesn't affect the API.
> - changes from global LRU behaviour to something that is not
> at all global - effect on workloads that depend on large
> scale caches that span multiple nodes is largely unknown.
> It will change IO patterns and affect system balance and
> performance of the system. How do we
> test/categorise/understand these problems and address such
> balance issues?
You've brought this up before several times and I've answered it.
The default configuration basically doesn't change much. Caches are
allowed to fill up all zones and nodes, exactly the same as today.
On the reclaim side, when you have a global shortage, it will evenly
shrink objects from all zones, so it still approximates LRU behaviour
because number of objects far exceeds number of zones. This is exactly
how per-zone pagecache scanning works too.
> - your use of this shrinker architecture for VFS
> inode/dentry cache scalability requires adding lists and
> locks to the MM struct zone for each object cache type
> (inode, dentry, etc). As such, it is not a generic
> solution because it cannot be used for per-instance caches
> like the per-mount inode caches XFS uses.
Of course it doesn't. You can use kmalloc.
> i.e. nothing can actually use this infrastructure change
> without tying itself directly into the VM implementation,
> and even then not every existing shrinker can use this
> method of scaling. i.e. some level of abstraction from the
> VM implementation is needed in the shrinker API.
"zones" is the abstraction. The thing which all allocation and
management of memory is based on, everywhere you do any allocations
in the kernel. A *shrinker* implementation, a thing which is called
in response to memory shortage in a given zone, has to know about
zones. End of story. "Tied to the memory management implementation"
is true to the extent that it is part of the memory management
implementation.
> - it has been pointed out that slab caches are generally
> allocated out of a single zone per node, so per-zone
> shrinker granularity seems unnecessary.
No they are not, that's just total FUD. Where was that "pointed out"?
Slabs are generally allocated from every zone except for highmem and
movable, and moreover there is nothing to prevent a shrinker
implementation from needing to shrink highmem and movable pages as
well.
> - doesn't solve the unbound direct reclaim shrinker
> parallelism that is already causing excessive LRU lock
> contention on 8p single node systems. While
> per-LRU/per-node solves the larger scalability issue, it
> doesn't address scalability within the node. This is soon
> going to be 24p per node and that's more than enough to
> cause severe problems with a single lock and list...
It doesn't aim to solve that. It doesn't prevent that from being
solved either (and improving parallelism in a subsystem generally,
you know, helps with lock contention due to lots of threads in there,
right?).
> > [And before anybody else kindly suggests per-node shrinkers, please go
> > back and read all the discussion about this first.]
>
> I don't care for any particular solution, but I want these issues
> resolved before we make any move forward. per-node abstractions is
> just one possible way that has been suggested to address some of
> these issues, so it shouldn't be dismissed out of hand like this.
Well, if you listen to my follow-ups to the FUD that keeps appearing,
maybe we can "move forward".
On Wed, Nov 10, 2010 at 05:32:29PM +1100, Nick Piggin wrote:
> On Wed, Nov 10, 2010 at 04:18:13PM +1100, Dave Chinner wrote:
> > On Tue, Nov 09, 2010 at 11:32:46PM +1100, Nick Piggin wrote:
> > > Hi,
> > >
> > > I'm doing some works that require per-zone shrinkers, I'd like to get
> > > the vmscan part signed off and merged by interested mm people, please.
> > >
> >
> > There are still plenty of unresolved issues with this general
> > approach to scaling object caches that I'd like to see sorted out
> > before we merge any significant shrinker API changes. Some things of
>
> No changes, it just adds new APIs.
I might add that you've previously brought up every single issue below,
I answered each one, and you've not followed up on my answers. So just
going back to the start and bringing them all up again is ridiculous.
Please stop it.
> > - it has been pointed out that slab caches are generally
> > allocated out of a single zone per node, so per-zone
> > shrinker granularity seems unnecessary.
>
> No they are not, that's just total FUD. Where was that "pointed out"?
>
> Slabs are generally allocated from every zone except for highmem and
> movable, and moreover there is nothing to prevent a shrinker
> implementation from needing to shrink highmem and movable pages as
> well.
Perhaps you meant here that many nodes have only one populated zone.
If adding a couple of list heads per zone per node is _really_ a huge
problem (which it isn't), then the subsystem can do per-node shrinking
and just call zone_to_nid(zone) in the shrinker callback (but that
would be silly, so it shouldn't).
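To spell that out with a throwaway sketch (everything mycache_* is made
up; only the shrinker helpers come from the patch):

/* purely illustrative: a shrinker that only keeps per-node LRUs */
static struct mycache_node {
	struct list_head	lru;
	unsigned long		nr_objects;
	unsigned long		nr_to_scan;
} mycache_nodes[MAX_NUMNODES];

static void mycache_shrink_zone(struct shrinker *shrink, struct zone *zone,
		unsigned long scanned, unsigned long total,
		unsigned long global, unsigned long flags, gfp_t gfp_mask)
{
	struct mycache_node *n = &mycache_nodes[zone_to_nid(zone)];

	/* all zones of a node feed pressure into the same per-node list */
	shrinker_add_scan(&n->nr_to_scan, scanned, total,
				n->nr_objects, SHRINK_DEFAULT_SEEKS);
	/* mycache_scan_node() is made up: walk the LRU in batches */
	mycache_scan_node(n, gfp_mask);
}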
I answered Christoph's thread here, and I showed how a node based
callback can result in significant imbalances between zones in a node,
so it won't fly.
If you think that nodes are sufficient, then you can get patches through
mm to make pagecache reclaim per-node based, and changes to the
shrinker API will naturally propagate through.
On Wed, Nov 10, 2010 at 05:32:29PM +1100, Nick Piggin wrote:
> On Wed, Nov 10, 2010 at 04:18:13PM +1100, Dave Chinner wrote:
> > On Tue, Nov 09, 2010 at 11:32:46PM +1100, Nick Piggin wrote:
> > > Hi,
> > >
> > > I'm doing some works that require per-zone shrinkers, I'd like to get
> > > the vmscan part signed off and merged by interested mm people, please.
> > >
> >
> > There are still plenty of unresolved issues with this general
> > approach to scaling object caches that I'd like to see sorted out
> > before we merge any significant shrinker API changes. Some things of
>
> No changes, it just adds new APIs.
>
>
> > the top of my head:
> >
> > - how to solve impedance mismatches between VM scalability
> > techniques and subsystem scalability techniques that result
> > in shrinker cross-multiplication explosions. e.g. XFS
> > tracks reclaimable inodes in per-allocation group trees,
> > so we'd get AG x per-zone LRU trees using this shrinker
> > method. Think of the overhead on a 1000AG filesystem on a
> > 1000 node machine with 3-5 zones per node....
>
> That's an interesting question, but no reason to hold anything up.
> It's up to the subsystem to handle it, or if they can't handle it
> sanely, you can elect to get no different behaviour than today.
See, this is what I think is the problem - you answer any concerns
by saying that the problem is entirely the responsibility of the
subsystem to handle. You've decided how you want shrinkers to work,
and everything else is SEP (somebody else's problem).
> I would say that per-zone is important for the above case, because it
> shrinks properly, based on memory pressure and pressure settings (eg.
> zone reclaim), and also obviously so that the reclaim threads are
> scanning node-local-memory so it doesn't have to pull objects from
> all over the interconnect.
>
> With 1000 nodes and 1000AG filesystem, probably NxM is overkill, but
> there is no reason you couldn't have one LRU per X AGs.
And when it's only 1 node or 50 nodes or 50 AGs or 5000 AGs? I don't
like encoding _guesses_ at what might be a good configuration for an
arbitrary NxM configuration because they are rarely right. That's
exactly what you are suggesting here - that I solve this problem by
making guesses at how it might scale and coming up with some
arbitrary mapping scheme to handle it.
> All this is obviously implementation details -- pagecache reclaim does
> not want to know about this, so it doesn't affect the API.
i.e. it's another SEP solution.
> > - changes from global LRU behaviour to something that is not
> > at all global - effect on workloads that depend on large
> > scale caches that span multiple nodes is largely unknown.
> > It will change IO patterns and affect system balance and
> > performance of the system. How do we
> > test/categorise/understand these problems and address such
> > balance issues?
>
> You've brought this up before several times and I've answered it.
>
> The default configuration basically doesn't change much. Caches are
> allowed to fill up all zones and nodes, exactly the same as today.
>
> On the reclaim side, when you have a global shortage, it will evenly
> shrink objects from all zones, so it still approximates LRU behaviour
> because number of objects far exceeds number of zones. This is exactly
> how per-zone pagecache scanning works too.
The whole point of the zone-based reclaim is that shrinkers run
when there are _local_ shortages on the _local_ LRUs and this will
happen much more often than global shortages, especially on large
machines. It will result in a change of behaviour, no question about
it.
However, this is not a reason for not moving to this model - what I'm
asking is: what is the plan for categorising problems that arise as a
result of such a change? How do we go about translating random
reports like "I do this and it goes X% slower on 2.6.xx" to "that's
a result of the new LRU reclaim model, and you should do <this> to
try to resolve it". How do we triage such reports? What are our
options to resolve such problems? Are there any knobs we should add
at the beginning to give users ways of changing the behaviour to be
more like the current code? We've got lots of different knobs to
control page cache reclaim behaviour - won't some of them be
relevant to per-zone slab cache reclaim?
> > - your use of this shrinker architecture for VFS
> > inode/dentry cache scalability requires adding lists and
> > locks to the MM struct zone for each object cache type
> > (inode, dentry, etc). As such, it is not a generic
> > solution because it cannot be used for per-instance caches
> > like the per-mount inode caches XFS uses.
>
> Of course it doesn't. You can use kmalloc.
Right - another SEP solution.
The reason people are using shrinkers is that it is simple to
implement a basic one. But implementing a scalable shrinker right
now is fucking hard - look at all the problems we've had with XFS
mostly because of the current API and the unbound parallelism of
direct reclaim.
This new API is a whole new adventure, and not many people have
machines capable of behavioural testing or stressing it. If everyone
is forced to implement their own "scalable
shrinker" we'll end up with a rat's nest of different
implementations with similar but subtly different sets of bugs...
> > i.e. nothing can actually use this infrastructure change
> > without tying itself directly into the VM implementation,
> > and even then not every existing shrinker can use this
> > method of scaling. i.e. some level of abstraction from the
> > VM implementation is needed in the shrinker API.
>
> "zones" is the abstraction. The thing which all allocation and
> management of memory is based on, everywhere you do any allocations
> in the kernel. A *shrinker* implementation, a thing which is called
> in response to memory shortage in a given zone, has to know about
> zones. End of story. "Tied to the memory management implementation"
> is true to the extent that it is part of the memory management
> implementation.
>
>
> > - it has been pointed out that slab caches are generally
> > allocated out of a single zone per node, so per-zone
> > shrinker granularity seems unnecessary.
>
> No they are not, that's just total FUD. Where was that "pointed out"?
Christoph Hellwig mentioned it, I think Christoph Lameter asked
exactly this question, I've mentioned it, and you even mentioned at
one point that zones were effectively a per-node construct....
> Slabs are generally allocated from every zone except for highmem and
> movable, and moreover there is nothing to prevent a shrinker
> implementation from needing to shrink highmem and movable pages as
> well.
>
> > - doesn't solve the unbound direct reclaim shrinker
> > parallelism that is already causing excessive LRU lock
> > contention on 8p single node systems. While
> > per-LRU/per-node solves the larger scalability issue, it
> > doesn't address scalability within the node. This is soon
> > going to be 24p per node and that's more than enough to
> > cause severe problems with a single lock and list...
>
> It doesn't aim to solve that. It doesn't prevent that from being
> solved either (and improving parallelism in a subsystem generally,
> you know, helps with lock contention due to lots of threads in there,
> right?).
Sure, but you keep missing the point that it's a problem that is
more important to solve for the typical 2S or 4S server than scaling
to 1000 nodes is. An architecture that allows unbounded parallelism
is, IMO, fundamentally broken from a scalability point of view.
Such behaviour indicates that the effects of such levels of
parallelism weren't really considered when the subsystem was
designed.
I make this point because of the observation I made that shrinkers
are by far the most efficient when only a single thread is running
reclaim on a particular LRU. Reclaim parallelism on a single LRU
only made reclaim go slower and consume more CPU, and direct reclaim
is definitely causing such parallelism and slowdowns (I can point
you to the XFS mailing list posting again if you want).
I've previously stated that reducing/controlling the level of
parallelism can be just as effective at providing serious
scalability improvements as fine-grained locking. So that you don't
simply scoff and mock me for suggesting it like you did last time,
here's a real live example:
249a8c11 "[XFS] Move AIL pushing into it's own thread"
This commit protected tail pushing in XFS from unbounded
parallelism. The problematic workload was an MPI job that was doing
synchronised closing of 6 files per thread on a 2048 CPU machine.
It was taking an *hour and a half* to complete this because of lock
contention. The above patch moved the tail pushing into its own
thread, which shifted all the multiple-thread queuing back onto the
pre-existing wait queues where parallelism is supposed to be
controlled. The result was that the same 12,000 file close operation
took less than 9s to run. i.e. 3 orders of magnitude improvement in
run time, simply by controlling the amount of parallelism on a
single critical structure.
From that experience, what I suggest is that we provide the same
sort of _controlled parallelism_ for the generic shrinker
infrastructure. We abstract out the lists and locks completely, and
simply provide a "can you free this object" callback to the
subsystem. I suggested we move shrinker callbacks out of direct
reclaim to the (per-node) kswapd. having thought about it a bit
more, a simpler (and probably better) alternative would be to allow
only a single direct reclaimer to act on a specific LRU at a time,
as this would allow different LRUs to be reclaimed in parallel.
Either way, this bounds the amount of parallelism the shrinker will
place on the lists and locks, thereby solving both the internal node
scalability issues and the scaling to many nodes problems.
Such an abstraction also keeps all the details of how the objects
are stored in LRUs and selected for reclaim completely hidden from
the subsystems. They don't need to care about the actual
implementation at all, just use a few simple functions (register,
unregister, list add, list remove, free this object callback). And
it doesn't prevent the internal shrinker implementation from using a
list per zone, either, as you desire.
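As a rough sketch of the shape such an API could take (none of these
names exist, it's just to illustrate the idea):

struct shrinker_lru;			/* opaque to the subsystem */

enum lru_status { LRU_FREED, LRU_SKIP, LRU_ABORT };

struct shrinker_lru_ops {
	/* "can you free this object?" - invoked by the VM during reclaim */
	enum lru_status (*free_object)(void *obj, gfp_t gfp_mask);
};

struct shrinker_lru *shrinker_lru_register(const struct shrinker_lru_ops *ops,
					   int seeks);
void shrinker_lru_unregister(struct shrinker_lru *lru);
void shrinker_lru_add(struct shrinker_lru *lru, void *obj);
void shrinker_lru_del(struct shrinker_lru *lru, void *obj);

The subsystem never sees the lists, locks or zones; the VM is free to
implement them per-zone, per-node or however it likes, and to bound the
parallelism of the callers.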
IOWs, I think the LRUs should not be handled separately from the
shrinker as the functionality of the two is intimately tied
together. We should be able to use a single scalable LRU/shrinker
implementation for all caches that require such functionality.
[ And once we've got such an abstraction, the LRUs could even be
moved into the slab cache itself and we could finally begin to
think about solving some of the other slab reclaim wish list
items like defragmentation again. ]
Now, this doesn't solve the mismatches between VM and subsystem
scalability models. However, if the LRU+shrinker API is so simple
and "just scales" then all my concerns about forcing knowledge of
the MM zone architecture on all subsystems evaporate because it's all
abstracted away from the subsystems. i.e. the subsystems do not have
to try to force a square peg into a round hole anymore. And with such
a simple abstraction, I'd even (ab)use it for XFS....
Yes, it may require some changes to the way the VM handles slab
cache reclaim, but I think changing the shrinker architecture as
I've proposed results in subsystem LRUs and shrinkers that are
simpler to implement, test and maintain whilst providing better
scalability in more conditions than your proposed approach. And it
also provides a way forward to implement other desired slab reclaim
functionality, too...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Wed, Nov 10, 2010 at 10:05:49PM +1100, Dave Chinner wrote:
> On Wed, Nov 10, 2010 at 05:32:29PM +1100, Nick Piggin wrote:
> > On Wed, Nov 10, 2010 at 04:18:13PM +1100, Dave Chinner wrote:
> > > On Tue, Nov 09, 2010 at 11:32:46PM +1100, Nick Piggin wrote:
> > > > Hi,
> > > >
> > > > I'm doing some works that require per-zone shrinkers, I'd like to get
> > > > the vmscan part signed off and merged by interested mm people, please.
> > > >
> > >
> > > There are still plenty of unresolved issues with this general
> > > approach to scaling object caches that I'd like to see sorted out
> > > before we merge any significant shrinker API changes. Some things of
> >
> > No changes, it just adds new APIs.
> >
> >
> > > the top of my head:
> > >
> > > - how to solve impedance mismatches between VM scalability
> > > techniques and subsystem scalability techniques that result
> > > in shrinker cross-multiplication explosions. e.g. XFS
> > > tracks reclaimable inodes in per-allocation group trees,
> > > so we'd get AG x per-zone LRU trees using this shrinker
> > > method. Think of the overhead on a 1000AG filesystem on a
> > > 1000 node machine with 3-5 zones per node....
> >
> > That's an interesting question, but no reason to hold anything up.
> > It's up to the subsystem to handle it, or if they can't handle it
> > sanely, you can elect to get no different behaviour than today.
>
> See, this is what I think is the problem - you answer any concerns
> by saying that the problem is entirely the responsibility of the
> subsystem to handle. You've decided how you want shrinkers to work,
> and everything else is SEP (somebody else's problem).
Um, wrong. I provide an API to drive shrinkers, giving sufficient
information (and nothing more). You are the one deciding that it is
the problem of the API to somehow fix up all the implementations.
If you can actually suggest what such a magical API would look like,
please do. Otherwise, please try to get a better understanding of
my API and understand it does nothing more than provide required
information for proper zone based scanning.
> > I would say that per-zone is important for the above case, because it
> > shrinks properly, based on memory pressure and pressure settings (eg.
> > zone reclaim), and also obviously so that the reclaim threads are
> > scanning node-local-memory so it doesn't have to pull objects from
> > all over the interconnect.
> >
> > With 1000 nodes and 1000AG filesystem, probably NxM is overkill, but
> > there is no reason you couldn't have one LRU per X AGs.
>
> And when it's only 1 node or 50 nodes or 50 AGs or 5000 AGs? I don't
> like encoding _guesses_ at what might be a good configuration for an
> arbitrary NxM configuration because they are rarely right. That's
> exactly what you are suggesting here - that I solve this problem by
> making guesses at how it might scale and coming up with some
> arbitrary mapping scheme to handle it.
Still, it's off topic for shrinker API discussion, I'd be happy to
help with ideas or questions about your particular implementation.
Note you can either (a) ignore the new information the API gives you, or
(b) understand that your current case of 1000 LRUs spread globally over
1000 nodes sucks much, much worse than even a poor per-zone distribution,
and make some actual use of the new information.
> > All this is obviously implementation details -- pagecache reclaim does
> > not want to know about this, so it doesn't affect the API.
>
> i.e. it's another SEP solution.
Uh, yes in fact it is the problem of the shrinker implementation to
be a good implementation.
> > > - changes from global LRU behaviour to something that is not
> > > at all global - effect on workloads that depend on large
> > > scale caches that span multiple nodes is largely unknown.
> > > It will change IO patterns and affect system balance and
> > > performance of the system. How do we
> > > test/categorise/understand these problems and address such
> > > balance issues?
> >
> > You've brought this up before several times and I've answered it.
> >
> > The default configuration basically doesn't change much. Caches are
> > allowed to fill up all zones and nodes, exactly the same as today.
> >
> > On the reclaim side, when you have a global shortage, it will evenly
> > shrink objects from all zones, so it still approximates LRU behaviour
> > because number of objects far exceeds number of zones. This is exactly
> > how per-zone pagecache scanning works too.
>
> The whole point of the zone-based reclaim is that shrinkers run
> when there are _local_ shortages on the _local_ LRUs and this will
> happen much more often than global shortages, especially on large
> machines. It will result in a change of behaviour, no question about
> it.
But if you have a global workload, then you tend to get relatively
even pressure and it tends to approximate global LRU (especially if
you do memory spreading of your slabs).
Yes there will be some differences, but it's not some giant scary
change, it just can be tested with some benchmarks and people can
complain if something hurts. Bringing reclaim closer to how pagecache
is reclaimed I think should make things more understandable though.
> However, this is not reason for not moving to this model - what I'm
> asking is what the plan for categorising problems that arise as a
> result of such a change? How do we go about translating random
> reports like "I do this and it goes X% slower on 2.6.xx" to "that's
> a result of the new LRU reclaim model, and you should do <this> to
> try to resolve it". How do we triage such reports? What are our
> options to resolve such problems? Are there any knobs we should add
> at the beginning to give users ways of changing the behaviour to be
> more like the curent code? We've got lots of different knobs to
> control page cache reclaim behaviour - won't some of them be
> relevant to per-zone slab cache reclaim?
Really? Same as any other heuristic or performance change. We merge
thousands of them every release cycle, and at least a couple of major
ones.
> > > - your use of this shrinker architecture for VFS
> > > inode/dentry cache scalability requires adding lists and
> > > locks to the MM struct zone for each object cache type
> > > (inode, dentry, etc). As such, it is not a generic
> > > solution because it cannot be used for per-instance caches
> > > like the per-mount inode caches XFS uses.
> >
> > Of course it doesn't. You can use kmalloc.
>
> Right - another SEP solution.
What??? Stop saying this. In the above paragraph you were going crazy
about your wrong and obviously unfounded claim that other subsystems
can't use it. I told you that they can, i.e. with kmalloc.
I could spell it out if it is not obvious. You would allocate an array
of LRU structures to fit all zones in the system. And then you would
index into that array with zone_to_nid and zone_idx. OK?
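Roughly, as an off-the-cuff sketch (mycache_* is made up):

struct mycache_lru {
	spinlock_t		lock;
	struct list_head	list;
	unsigned long		nr_objects;
	unsigned long		nr_to_scan;
};

/* one LRU per possible zone, allocated and owned by the subsystem */
static struct mycache_lru *mycache_alloc_lrus(void)
{
	/* callers still need to init each lock and list head */
	return kcalloc(nr_node_ids * MAX_NR_ZONES,
			sizeof(struct mycache_lru), GFP_KERNEL);
}

static struct mycache_lru *mycache_lru(struct mycache_lru *lrus,
					struct zone *zone)
{
	return &lrus[zone_to_nid(zone) * MAX_NR_ZONES + zone_idx(zone)];
}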
It is not "SEP solution". If a shrinker implementation wants to use per
zone LRUs, then it is that shrinker implementation's problem to allocate
space for them. OK so far?
And if a shrinker implementation doesn't want to do that, and instead
wants a global LRU, then it's the implementation's problem to allocate
space for that. And if it wants per AG LRUs, then it's the
implementation's problem to allocate space for that. See any patterns
emerging?
> The reason people are using shrinkers is that it is simple to
> implement a basic one. But implementing a scalable shrinker right
> now is fucking hard - look at all the problems we've had with XFS
> mostly because of the current API and the unbound parallelism of
> direct reclaim.
>
> This new API is a whole new adventure that not many people are going
> to have machines capable of executing behavioural testing or
> stressing. If everyone is forced to implement their own "scalable
> shrinker" we'll end up with a rat's nest of different
> implementations with similar but subtly different sets of bugs...
No, they are not forced to do anything. Stop with this strawman already.
Did you read the patch? Zero changes to any existing implementation.
> > > i.e. nothing can actually use this infrastructure change
> > > without tying itself directly into the VM implementation,
> > > and even then not every existing shrinker can use this
> > > method of scaling. i.e. some level of abstraction from the
> > > VM implementation is needed in the shrinker API.
> >
> > "zones" is the abstraction. The thing which all allocation and
> > management of memory is based on, everywhere you do any allocations
> > in the kernel. A *shrinker* implementation, a thing which is called
> > in response to memory shortage in a given zone, has to know about
> > zones. End of story. "Tied to the memory management implementation"
> > is true to the extent that it is part of the memory management
> > implementation.
> >
> >
> > > - it has been pointed out that slab caches are generally
> > > allocated out of a single zone per node, so per-zone
> > > shrinker granularity seems unnecessary.
> >
> > No they are not, that's just total FUD. Where was that "pointed out"?
>
> Christoph Hellwig mentioned it, I think Christoph Lameter asked
> exactly this question, I've mentioned it, and you even mentioned at
> one point that zones were effectively a per-node construct....
Christoph Hellwig mentioned lots of things that were wrong.
Lameter and I had a reasonable discussion, and I showed how a per-node shrinker
can cause imbalance, and how subsystems that only care about nodes can
use the zone shrinker to get exactly the same information (because it
provides a superset). And that was the end of that. If you had followed
the discussion you wouldn't keep bringing this up again and again.
> > Slabs are generally allocated from every zone except for highmem and
> > movable, and moreover there is nothing to prevent a shrinker
> > implementation from needing to shrink highmem and movable pages as
> > well.
> >
> > > - doesn't solve the unbound direct reclaim shrinker
> > > parallelism that is already causing excessive LRU lock
> > > contention on 8p single node systems. While
> > > per-LRU/per-node solves the larger scalability issue, it
> > > doesn't address scalability within the node. This is soon
> > > going to be 24p per node and that's more than enough to
> > > cause severe problems with a single lock and list...
> >
> > It doesn't aim to solve that. It doesn't prevent that from being
> > solved either (and improving parallelism in a subsystem generally,
> > you know, helps with lock contention due to lots of threads in there,
> > right?).
>
> Sure, but you keep missing the point that it's a problem that is
> more important to solve for the typical 2S or 4S server than scaling
I am not missing that point. I am solving a different problem. Other
people are looking at the reclaim parallelism problem. Why don't you
go and bother them?
> to 1000 nodes is. An architecture that allows unbounded parallelism
> is, IMO, fundamentally broken from a scalability point of view.
> Such behaviour indicates that the effects of such levels of
> parallelism weren't really considered when the subsystem was
> designed.
>
> I make this point because of the observation I made that shrinkers
> are by far the most efficient when only a single thread is running
> reclaim on a particular LRU. Reclaim parallelism on a single LRU
> only made reclaim go slower and consume more CPU, and direct reclaim
> is definitely causing such parallelism and slowdowns (I can point
> you to the XFS mailing list posting again if you want).
And having more LRUs, more locks, fewer CPUs contending on each lock
and LRU, and node locality (yes, it is important even on 2S and 4S
systems) is going to do nothing but help that.
> I've previously stated that reducing/controlling the level of
> parallelism can be just as effective at providing serious
> scalability improvements as fine grained locking. So you don't
> simply scoff and mock me for suggesting it like you did last time,
I didn't mock you. On the contrary I agreed that there are 2 problems
here, and that lots of threads in reclaim is one of them. I know this
myself first hand because the pagecache LRUs have the same problems.
And I pointed out that people were looking at the other problem, and
that it was mostly independent of per-zone locking in implementation,
and that in functionality the per-zone locking would help the unbounded
parallelism anyway.
Why you thought I was mocking you when I provided an answer like that,
I don't know. Maybe I got frustrated sometime after the 5th time you
brought up exactly the same thing.
> From that experience, what I suggest is that we provide the same
> sort of _controlled parallelism_ for the generic shrinker
> infrastructure. We abstract out the lists and locks completely, and
> simply provide a "can you free this object" callback to the
> subsystem. I suggested we move shrinker callbacks out of direct
> reclaim to the (per-node) kswapd. having thought about it a bit
> more, a simpler (and probably better) alternative would be to allow
> only a single direct reclaimer to act on a specific LRU at a time,
> as this would allow different LRUs to be reclaimed in parallel.
> Either way, this bounds the amount of parallelism the shrinker will
> place on the lists and locks, thereby solving both the internal node
> scalability issues and the scaling to many nodes problems.
No to abstracting lists and locks completely and adding callbacks.
That just over-complicates things and doesn't allow shrinkers any
flexibility. There is nothing wrong with allowing shrinkers to do
whatever they need to do to shrink memory.
The problem is that reclaim allows too many threads into the shrinkers.
And it is not a shrinker-only problem; it also affects the pagecache
LRUs. So the right way to do it is to solve it the same way for both
shrinkers and pagecache, and put some limiting in the reclaim/direct
reclaim paths. When you do that, shrinkers will be limited as well.
People have looked at this at various points (I don't know what the
progress on it is now, though).
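Just to illustrate the sort of limiting I mean (a purely hypothetical
sketch, not part of this patch set, with made-up names), direct reclaim
could be gated by something as simple as a counter:

#include <asm/atomic.h>
#include <linux/types.h>

/* hypothetical: bound the number of threads doing direct reclaim at once */
static atomic_t nr_direct_reclaimers = ATOMIC_INIT(0);
#define MAX_DIRECT_RECLAIMERS	4	/* made-up bound */

static bool try_enter_direct_reclaim(void)
{
	if (atomic_inc_return(&nr_direct_reclaimers) > MAX_DIRECT_RECLAIMERS) {
		atomic_dec(&nr_direct_reclaimers);
		return false;	/* caller backs off, or leaves it to kswapd */
	}
	return true;
}

static void leave_direct_reclaim(void)
{
	atomic_dec(&nr_direct_reclaimers);
}

Once something like that is in the reclaim path, every shrinker (old or
new API) gets the bounded parallelism for free.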
> Such an abstraction also keeps all the details of how the objects
> are stored in LRUs and selected for reclaim completely hidden from
> the subsystems.
Well, then that would be wrong, because the locking and the access
patterns are fundamental properties of the subsystem, not of the MM
reclaim subsystem.
On Thu, Nov 11, 2010 at 11:23:39AM +1100, Nick Piggin wrote:
> On Wed, Nov 10, 2010 at 10:05:49PM +1100, Dave Chinner wrote:
> > I've previously stated that reducing/controlling the level of
> > parallelism can be just as effective at providing serious
> > scalability improvements as fine grained locking. So you don't
> > simply scoff and mock me for suggesting it like you did last time,
>
> I didn't mock you. On the contrary I agreed that there are 2 problems
> here, and that lots of threads in reclaim is one of them. I know this
Here, this is the second time I have written _exactly_ the same reply to your
_exact_ same question (the first time I wrote _exactly_ the same reply
to your exact same question was IIRC a few months before, but I couldn't
find the archive).
http://archives.free.net.ph/message/20101015.033017.65c46426.ca.html
So at this point, shut up. You had ample time to put up, and didn't.
Now you're just obstructing and being unreasonable. To quote Christoph,
you don't have carte blanche power to delay progress. So stop being an
arsehole, and don't fucking accuse me of scoffing and mocking you.
Because I've been bending over backwards to answer _exactly_ the same
questions from you multiple times, trying to show either how your
assumptions are wrong or reasonable ways we can mitigate potential
regressions, and trying to be polite and civil the whole time (which, I
might add, is much more courtesy than you showed me when you tried to
railroad your inode changes through without bothering to answer or even
read half my comments about them). And what you have done is just totally
ignore my replies to your concerns, stay quiet about them for a month,
and then come back with exactly the same thing again. Like 3 or 4 times.
Really. At this point, if you have already posted a particular comment
more than 50 times, do everybody a favour and redirect the next verbatim
copy to /dev/null, OK? I'm sick of it. *Constructive* criticism only, from
now on. Thanks.
I need a zone input to the shrinker API; I have demonstrated what for
and why, with good justification, and so I am going to get that API
change merged. It is a simple superset of the current API's
functionality and not some big scary monster (except to somebody who
fundamentally doesn't understand it).
Hi
> Hi,
>
> I'm doing some works that require per-zone shrinkers, I'd like to get
> the vmscan part signed off and merged by interested mm people, please.
>
> [And before anybody else kindly suggests per-node shrinkers, please go
> back and read all the discussion about this first.]
The vmscan part looks good to me. However, I hope fs folks review it too,
even though I'm not sure who is best placed to do so.
BTW, I have some nitpick comments; see below.
> ---
> fs/drop_caches.c | 6
> include/linux/mm.h | 47 ++++++-
> mm/memory-failure.c | 10 -
> mm/vmscan.c | 341 +++++++++++++++++++++++++++++++++++++---------------
> 4 files changed, 297 insertions(+), 107 deletions(-)
>
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h 2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/include/linux/mm.h 2010-11-09 22:11:10.000000000 +1100
> @@ -1008,6 +1008,10 @@ static inline void sync_mm_rss(struct ta
> /*
> * A callback you can register to apply pressure to ageable caches.
> *
> + * 'shrink_zone' is the new shrinker API. It is to be used in preference
> + * to 'shrink'. One must point to a shrinker function, the other must
> + * be NULL. See 'shrink_slab' for details about the shrink_zone API.
> + *
> * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should
> * look through the least-recently-used 'nr_to_scan' entries and
> * attempt to free them up. It should return the number of objects
> @@ -1024,13 +1028,53 @@ struct shrinker {
> int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask);
> int seeks; /* seeks to recreate an obj */
>
> + /*
> + * shrink_zone - slab shrinker callback for reclaimable objects
> + * @shrink: this struct shrinker
> + * @zone: zone to scan
> + * @scanned: pagecache lru pages scanned in zone
> + * @total: total pagecache lru pages in zone
> + * @global: global pagecache lru pages (for zone-unaware shrinkers)
> + * @flags: shrinker flags
> + * @gfp_mask: gfp context we are operating within
> + *
> + * The shrinkers are responsible for calculating the appropriate
> + * pressure to apply, batching up scanning (and cond_resched,
> + * cond_resched_lock etc), and updating events counters including
> + * count_vm_event(SLABS_SCANNED, nr).
> + *
> + * This approach gives flexibility to the shrinkers. They know best how
> + * to do batching, how much time between cond_resched is appropriate,
> + * what statistics to increment, etc.
> + */
> + void (*shrink_zone)(struct shrinker *shrink,
> + struct zone *zone, unsigned long scanned,
> + unsigned long total, unsigned long global,
> + unsigned long flags, gfp_t gfp_mask);
> +
shrink_zone is slightly grep-unfriendly. Could you consider shrink_slab_zone()
or something else?
> /* These are for internal use */
> struct list_head list;
> long nr; /* objs pending delete */
> };
> +
> +/* Constants for use by old shrinker API */
> #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
> +
> +/* Constants for use by new shrinker API */
> +/*
> + * SHRINK_DEFAULT_SEEKS is shifted by 4 to match an arbitrary constant
> + * in the old shrinker code.
> + */
> +#define SHRINK_FACTOR (128UL) /* Fixed point shift */
> +#define SHRINK_DEFAULT_SEEKS (SHRINK_FACTOR*DEFAULT_SEEKS/4)
> +#define SHRINK_BATCH 128 /* A good number if you don't know better */
> +
> extern void register_shrinker(struct shrinker *);
> extern void unregister_shrinker(struct shrinker *);
> +extern void shrinker_add_scan(unsigned long *dst,
> + unsigned long scanned, unsigned long total,
> + unsigned long objects, unsigned int ratio);
> +extern unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch);
>
> int vma_wants_writenotify(struct vm_area_struct *vma);
>
> @@ -1464,8 +1508,7 @@ int in_gate_area_no_task(unsigned long a
>
> int drop_caches_sysctl_handler(struct ctl_table *, int,
> void __user *, size_t *, loff_t *);
> -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> - unsigned long lru_pages);
> +void shrink_all_slab(struct zone *zone);
>
> #ifndef CONFIG_MMU
> #define randomize_va_space 0
> Index: linux-2.6/mm/vmscan.c
> ===================================================================
> --- linux-2.6.orig/mm/vmscan.c 2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/mm/vmscan.c 2010-11-09 22:11:10.000000000 +1100
> @@ -80,6 +80,9 @@ struct scan_control {
> /* Can pages be swapped as part of reclaim? */
> int may_swap;
>
> + /* Can slab pages be reclaimed? */
> + int may_reclaim_slab;
> +
> int swappiness;
>
> int order;
> @@ -169,6 +172,8 @@ static unsigned long zone_nr_lru_pages(s
> */
> void register_shrinker(struct shrinker *shrinker)
> {
> + BUG_ON(shrinker->shrink && shrinker->shrink_zone);
> + BUG_ON(!shrinker->shrink && !shrinker->shrink_zone);
> shrinker->nr = 0;
> down_write(&shrinker_rwsem);
> list_add_tail(&shrinker->list, &shrinker_list);
> @@ -187,43 +192,101 @@ void unregister_shrinker(struct shrinker
> }
> EXPORT_SYMBOL(unregister_shrinker);
>
> -#define SHRINK_BATCH 128
> /*
> - * Call the shrink functions to age shrinkable caches
> + * shrinker_add_scan - accumulate shrinker scan
> + * @dst: scan counter variable
> + * @scanned: pagecache pages scanned
> + * @total: total pagecache objects
> + * @objects: total objects in this cache
> + * @ratio: ratio of pagecache value to object value
> *
> - * Here we assume it costs one seek to replace a lru page and that it also
> - * takes a seek to recreate a cache object. With this in mind we age equal
> - * percentages of the lru and ageable caches. This should balance the seeks
> - * generated by these structures.
> + * shrinker_add_scan accumulates a number of objects to scan into @dst,
> + * based on the following ratio:
> *
> - * If the vm encountered mapped pages on the LRU it increase the pressure on
> - * slab to avoid swapping.
> + * proportion = scanned / total // proportion of pagecache scanned
> + * obj_prop = objects * proportion // same proportion of objects
> + * to_scan = obj_prop / ratio // modify by ratio
> + * *dst += to_scan // accumulate to dst
> *
> - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
> + * The ratio is a fixed point integer with a factor SHRINK_FACTOR.
> + * Higher ratios give objects higher value.
> *
> - * `lru_pages' represents the number of on-LRU pages in all the zones which
> - * are eligible for the caller's allocation attempt. It is used for balancing
> - * slab reclaim versus page reclaim.
> + * @dst is also fixed point, so cannot be used as a simple count.
> + * shrinker_do_scan will take care of that for us.
> *
> - * Returns the number of slab objects which we shrunk.
> + * There is no synchronisation here, which is fine really. A rare lost
> + * update is no huge deal in reclaim code.
> */
> -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> - unsigned long lru_pages)
> +void shrinker_add_scan(unsigned long *dst,
> + unsigned long scanned, unsigned long total,
> + unsigned long objects, unsigned int ratio)
> {
> - struct shrinker *shrinker;
> - unsigned long ret = 0;
> + unsigned long long delta;
>
> - if (scanned == 0)
> - scanned = SWAP_CLUSTER_MAX;
> + delta = (unsigned long long)scanned * objects;
> + delta *= SHRINK_FACTOR;
> + do_div(delta, total + 1);
> + delta *= SHRINK_FACTOR; /* ratio is also in SHRINK_FACTOR units */
> + do_div(delta, ratio + 1);
Introducing a tiny macro would be better than the comment.
>
> - if (!down_read_trylock(&shrinker_rwsem))
> - return 1; /* Assume we'll be able to shrink next time */
> + /*
> + * Avoid risking looping forever due to too large nr value:
> + * never try to free more than twice the estimated number of
> + * freeable entries.
> + */
> + *dst += delta;
> +
> + if (*dst / SHRINK_FACTOR > objects)
> + *dst = objects * SHRINK_FACTOR;
objects * SHRINK_FACTOR appears twice in this function.
Calculating "objects = obj * SHRINK_FACTOR" once at the start would
improve code readability slightly.
> +}
> +EXPORT_SYMBOL(shrinker_add_scan);
> +
> +/*
> + * shrinker_do_scan - scan a batch of objects
> + * @dst: scan counter
> + * @batch: number of objects to scan in this batch
> + * @Returns: number of objects to scan
> + *
> + * shrinker_do_scan takes the scan counter accumulated by shrinker_add_scan,
> + * and decrements it by @batch if it is greater than batch and returns batch.
> + * Otherwise returns 0. The caller should use the return value as the number
> + * of objects to scan next.
> + *
> + * Between shrinker_do_scan calls, the caller should drop locks if possible
> + * and call cond_resched.
> + *
> + * Note, @dst is a fixed point scaled integer. See shrinker_add_scan.
> + *
> + * Like shrinker_add_scan, shrinker_do_scan is not SMP safe, but it doesn't
> + * really need to be.
> + */
> +unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch)
The name seems a bit misleading. shrinker_do_scan() does NOT scan;
it only does the batch adjustment.
> +{
> + unsigned long nr = ACCESS_ONCE(*dst);
Dumb question: why is this ACCESS_ONCE() necessary?
> + if (nr < batch * SHRINK_FACTOR)
> + return 0;
> + *dst = nr - batch * SHRINK_FACTOR;
> + return batch;
{
unsigned long nr = ACCESS_ONCE(*dst);
batch *= SHRINK_FACTOR;
if (nr < batch)
return 0;
*dst = nr - batch;
return batch;
}
is slightly cleaner. However, it's unclear why the dst and batch arguments
need to have different units (i.e. why can't the caller do batch * FACTOR?).
> +}
> +EXPORT_SYMBOL(shrinker_do_scan);
> +
> +#define SHRINK_BATCH 128
> +/*
> + * Scan the deprecated shrinkers. This will go away soon in favour of
> + * converting everybody to new shrinker API.
> + */
> +static void shrink_slab_old(unsigned long scanned, gfp_t gfp_mask,
> + unsigned long lru_pages)
> +{
> + struct shrinker *shrinker;
>
> list_for_each_entry(shrinker, &shrinker_list, list) {
> unsigned long long delta;
> unsigned long total_scan;
> unsigned long max_pass;
>
> + if (!shrinker->shrink)
> + continue;
> max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
> delta = (4 * scanned) / shrinker->seeks;
> delta *= max_pass;
> @@ -250,15 +313,11 @@ unsigned long shrink_slab(unsigned long
> while (total_scan >= SHRINK_BATCH) {
> long this_scan = SHRINK_BATCH;
> int shrink_ret;
> - int nr_before;
>
> - nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
> shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
> gfp_mask);
> if (shrink_ret == -1)
> break;
> - if (shrink_ret < nr_before)
> - ret += nr_before - shrink_ret;
> count_vm_events(SLABS_SCANNED, this_scan);
> total_scan -= this_scan;
>
> @@ -267,8 +326,86 @@ unsigned long shrink_slab(unsigned long
>
> shrinker->nr += total_scan;
> }
> +}
> +/*
> + * shrink_slab - Call the shrink functions to age shrinkable caches
> + * @zone: the zone we are currently reclaiming from
> + * @scanned: how many pagecache pages were scanned in this zone
> + * @total: total number of reclaimable pagecache pages in this zone
> + * @global: total number of reclaimable pagecache pages in the system
> + * @gfp_mask: gfp context that we are in
> + *
> + * Slab shrinkers should scan their objects in a proportion to the ratio of
> + * scanned to total pagecache pages in this zone, modified by a "cost"
> + * constant.
> + *
> + * For example, we have a slab cache with 100 reclaimable objects in a
> + * particular zone, and the cost of reclaiming an object is determined to be
> + * twice as expensive as reclaiming a pagecache page (due to likelihood and
> + * cost of reconstruction). If we have 200 reclaimable pagecache pages in that
> + * particular zone, and scan 20 of them (10%), we should scan 5% (5) of
> + * the objects in our slab cache.
> + *
> + * If we have a single global list of objects and no per-zone lists, the
> + * global count of objects can be used to find the correct ratio to scan.
> + *
> + * See shrinker_add_scan and shrinker_do_scan for helper functions and
> + * details on how to calculate these numbers.
> + */
> +static void shrink_slab(struct zone *zone, unsigned long scanned,
> + unsigned long total, unsigned long global,
> + gfp_t gfp_mask)
> +{
> + struct shrinker *shrinker;
> +
> + if (scanned == 0)
> + scanned = SWAP_CLUSTER_MAX;
> +
> + if (!down_read_trylock(&shrinker_rwsem))
> + return;
> +
> + /* do a global shrink with the old shrinker API */
> + shrink_slab_old(scanned, gfp_mask, global);
> +
> + list_for_each_entry(shrinker, &shrinker_list, list) {
> + if (!shrinker->shrink_zone)
> + continue;
> + (*shrinker->shrink_zone)(shrinker, zone, scanned,
> + total, global, 0, gfp_mask);
flags argument is unused?
> + }
> up_read(&shrinker_rwsem);
> - return ret;
> +}
> +
> +/**
> + * shrink_all_slab - shrinks slabs in a given zone or system wide
> + * @zone: NULL to shrink slab from all zones, or non-NULL to shrink from a particular zone
> + *
> + * shrink_all_slab is a bit of a big hammer, and it's not really well defined what it should
> + * do (how much, how hard to shrink, etc), and it will throw out the reclaim balance. So it
> + * must only be used very carefully (drop_caches and hardware memory error handler are good
> + * examples).
> + */
> +void shrink_all_slab(struct zone *zone)
> +{
> + struct reclaim_state reclaim_state;
> +
> + current->reclaim_state = &reclaim_state;
> + do {
> + reclaim_state.reclaimed_slab = 0;
> + /*
> + * Use "100" for "scanned", "total", and "global", so
> + * that shrinkers scan a large proportion of their
> + * objects. 100 rather than 1 in order to reduce rounding
> + * errors.
> + */
> + if (!zone) {
> + for_each_populated_zone(zone)
> + shrink_slab(zone, 100, 100, 100, GFP_KERNEL);
> + } else
> + shrink_slab(zone, 100, 100, 100, GFP_KERNEL);
> + } while (reclaim_state.reclaimed_slab);
> +
> + current->reclaim_state = NULL;
> }
>
> static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
> @@ -1801,16 +1938,22 @@ static void get_scan_count(struct zone *
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> static void shrink_zone(int priority, struct zone *zone,
> - struct scan_control *sc)
> + struct scan_control *sc, unsigned long global_lru_pages)
> {
> unsigned long nr[NR_LRU_LISTS];
> unsigned long nr_to_scan;
> enum lru_list l;
> unsigned long nr_reclaimed = sc->nr_reclaimed;
> unsigned long nr_to_reclaim = sc->nr_to_reclaim;
> + unsigned long nr_scanned = sc->nr_scanned;
> + unsigned long lru_pages = 0;
>
> get_scan_count(zone, sc, nr, priority);
>
> + /* Used by slab shrinking, below */
> + if (sc->may_reclaim_slab)
> + lru_pages = zone_reclaimable_pages(zone);
> +
> while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> nr[LRU_INACTIVE_FILE]) {
> for_each_evictable_lru(l) {
> @@ -1835,8 +1978,6 @@ static void shrink_zone(int priority, st
> break;
> }
>
> - sc->nr_reclaimed = nr_reclaimed;
> -
> /*
> * Even if we did not try to evict anon pages at all, we want to
> * rebalance the anon lru active/inactive ratio.
> @@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
> if (inactive_anon_is_low(zone, sc))
> shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
>
> + /*
> + * Don't shrink slabs when reclaiming memory from
> + * over limit cgroups
> + */
> + if (sc->may_reclaim_slab) {
> + struct reclaim_state *reclaim_state = current->reclaim_state;
> +
> + shrink_slab(zone, sc->nr_scanned - nr_scanned,
Doubtful calculation. What does "sc->nr_scanned - nr_scanned" mean?
I think nr_scanned simply keeps the old slab balancing behaviour.
> + lru_pages, global_lru_pages, sc->gfp_mask);
> + if (reclaim_state) {
> + nr_reclaimed += reclaim_state->reclaimed_slab;
> + reclaim_state->reclaimed_slab = 0;
> + }
> + }
> +
> + sc->nr_reclaimed = nr_reclaimed;
> +
> throttle_vm_writeout(sc->gfp_mask);
> }
>
> @@ -1864,7 +2022,7 @@ static void shrink_zone(int priority, st
> * scan then give up on it.
> */
> static void shrink_zones(int priority, struct zonelist *zonelist,
> - struct scan_control *sc)
> + struct scan_control *sc, unsigned long global_lru_pages)
> {
> struct zoneref *z;
> struct zone *zone;
> @@ -1884,7 +2042,7 @@ static void shrink_zones(int priority, s
> continue; /* Let kswapd poll it */
> }
>
> - shrink_zone(priority, zone, sc);
> + shrink_zone(priority, zone, sc, global_lru_pages);
> }
> }
>
> @@ -1941,7 +2099,6 @@ static unsigned long do_try_to_free_page
> {
> int priority;
> unsigned long total_scanned = 0;
> - struct reclaim_state *reclaim_state = current->reclaim_state;
> struct zoneref *z;
> struct zone *zone;
> unsigned long writeback_threshold;
> @@ -1953,30 +2110,20 @@ static unsigned long do_try_to_free_page
> count_vm_event(ALLOCSTALL);
>
> for (priority = DEF_PRIORITY; priority >= 0; priority--) {
> - sc->nr_scanned = 0;
> - if (!priority)
> - disable_swap_token();
> - shrink_zones(priority, zonelist, sc);
> - /*
> - * Don't shrink slabs when reclaiming memory from
> - * over limit cgroups
> - */
> - if (scanning_global_lru(sc)) {
> - unsigned long lru_pages = 0;
> - for_each_zone_zonelist(zone, z, zonelist,
> - gfp_zone(sc->gfp_mask)) {
> - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> - continue;
> + unsigned long lru_pages = 0;
>
> - lru_pages += zone_reclaimable_pages(zone);
> - }
> + for_each_zone_zonelist(zone, z, zonelist,
> + gfp_zone(sc->gfp_mask)) {
> + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> + continue;
>
> - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
> - if (reclaim_state) {
> - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> - reclaim_state->reclaimed_slab = 0;
> - }
> + lru_pages += zone_reclaimable_pages(zone);
Do we really need this doubtful cpuset hardwall filtering? Why do we
need to change slab reclaim pressure when cpusets are used? In the old
days we didn't have a per-zone slab shrinker, so we needed an artificial
slab pressure boost to prevent false-positive oom-killer invocations,
but now we have one.
However, if you strongly want to keep the old behaviour for now, I don't
oppose it. We can change it later.
> }
> +
> + sc->nr_scanned = 0;
> + if (!priority)
> + disable_swap_token();
> + shrink_zones(priority, zonelist, sc, lru_pages);
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> goto out;
> @@ -2029,6 +2176,7 @@ unsigned long try_to_free_pages(struct z
> .nr_to_reclaim = SWAP_CLUSTER_MAX,
> .may_unmap = 1,
> .may_swap = 1,
> + .may_reclaim_slab = 1,
> .swappiness = vm_swappiness,
> .order = order,
> .mem_cgroup = NULL,
> @@ -2058,6 +2206,7 @@ unsigned long mem_cgroup_shrink_node_zon
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> .may_swap = !noswap,
> + .may_reclaim_slab = 0,
> .swappiness = swappiness,
> .order = 0,
> .mem_cgroup = mem,
> @@ -2076,7 +2225,7 @@ unsigned long mem_cgroup_shrink_node_zon
> * will pick up pages from other mem cgroup's as well. We hack
> * the priority and make it zero.
> */
> - shrink_zone(0, zone, &sc);
> + shrink_zone(0, zone, &sc, zone_reclaimable_pages(zone));
>
> trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
>
> @@ -2094,6 +2243,7 @@ unsigned long try_to_free_mem_cgroup_pag
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> .may_swap = !noswap,
> + .may_reclaim_slab = 0,
> .nr_to_reclaim = SWAP_CLUSTER_MAX,
> .swappiness = swappiness,
> .order = 0,
> @@ -2171,11 +2321,11 @@ static unsigned long balance_pgdat(pg_da
> int priority;
> int i;
> unsigned long total_scanned;
> - struct reclaim_state *reclaim_state = current->reclaim_state;
> struct scan_control sc = {
> .gfp_mask = GFP_KERNEL,
> .may_unmap = 1,
> .may_swap = 1,
> + .may_reclaim_slab = 1,
> /*
> * kswapd doesn't want to be bailed out while reclaim. because
> * we want to put equal scanning pressure on each zone.
> @@ -2249,7 +2399,6 @@ static unsigned long balance_pgdat(pg_da
> */
> for (i = 0; i <= end_zone; i++) {
> struct zone *zone = pgdat->node_zones + i;
> - int nr_slab;
>
> if (!populated_zone(zone))
> continue;
> @@ -2271,15 +2420,11 @@ static unsigned long balance_pgdat(pg_da
> */
> if (!zone_watermark_ok(zone, order,
> 8*high_wmark_pages(zone), end_zone, 0))
> - shrink_zone(priority, zone, &sc);
> - reclaim_state->reclaimed_slab = 0;
> - nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> - lru_pages);
> - sc.nr_reclaimed += reclaim_state->reclaimed_slab;
> + shrink_zone(priority, zone, &sc, lru_pages);
> total_scanned += sc.nr_scanned;
> if (zone->all_unreclaimable)
> continue;
> - if (nr_slab == 0 && !zone_reclaimable(zone))
> + if (!zone_reclaimable(zone))
> zone->all_unreclaimable = 1;
> /*
> * If we've done a decent amount of scanning and
> @@ -2545,6 +2690,7 @@ unsigned long shrink_all_memory(unsigned
> .may_swap = 1,
> .may_unmap = 1,
> .may_writepage = 1,
> + .may_reclaim_slab = 1,
> .nr_to_reclaim = nr_to_reclaim,
> .hibernation_mode = 1,
> .swappiness = vm_swappiness,
> @@ -2728,13 +2874,14 @@ static int __zone_reclaim(struct zone *z
> .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
> .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
> .may_swap = 1,
> + .may_reclaim_slab = 0,
> .nr_to_reclaim = max_t(unsigned long, nr_pages,
> SWAP_CLUSTER_MAX),
> .gfp_mask = gfp_mask,
> .swappiness = vm_swappiness,
> .order = order,
> };
> - unsigned long nr_slab_pages0, nr_slab_pages1;
> + unsigned long lru_pages, slab_pages;
>
> cond_resched();
> /*
> @@ -2747,51 +2894,61 @@ static int __zone_reclaim(struct zone *z
> reclaim_state.reclaimed_slab = 0;
> p->reclaim_state = &reclaim_state;
>
> + lru_pages = zone_reclaimable_pages(zone);
> + slab_pages = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> +
> if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
> + if (slab_pages > zone->min_slab_pages)
> + sc.may_reclaim_slab = 1;
> /*
> * Free memory by calling shrink zone with increasing
> * priorities until we have enough memory freed.
> */
> priority = ZONE_RECLAIM_PRIORITY;
> do {
> - shrink_zone(priority, zone, &sc);
> + shrink_zone(priority, zone, &sc, lru_pages);
> priority--;
> } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
> - }
>
> - nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> - if (nr_slab_pages0 > zone->min_slab_pages) {
> + } else if (slab_pages > zone->min_slab_pages) {
> /*
> - * shrink_slab() does not currently allow us to determine how
> - * many pages were freed in this zone. So we take the current
> - * number of slab pages and shake the slab until it is reduced
> - * by the same nr_pages that we used for reclaiming unmapped
> - * pages.
> - *
> - * Note that shrink_slab will free memory on all zones and may
> - * take a long time.
> + * Scanning slab without pagecache, so we have to open code the
> + * call to shrink_slab (shrink_zone drives slab reclaim via
> + * pagecache scanning, so it isn't set up to shrink slab
> + * without scanning pagecache).
> */
> - for (;;) {
> - unsigned long lru_pages = zone_reclaimable_pages(zone);
> -
> - /* No reclaimable slab or very low memory pressure */
> - if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
> - break;
>
> - /* Freed enough memory */
> - nr_slab_pages1 = zone_page_state(zone,
> - NR_SLAB_RECLAIMABLE);
> - if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
> - break;
> - }
> + /*
> + * lru_pages / 10 -- put a 10% pressure on the slab
> + * which roughly corresponds to ZONE_RECLAIM_PRIORITY
> + * scanning 1/16th of pagecache.
> + *
> + * Global slabs will be shrunk at a relatively more
> + * aggressive rate because we don't calculate the
> + * global lru size, for speed. But they really should
> + * be converted to per-zone slabs if they are important.
> + */
> + shrink_slab(zone, lru_pages / 10, lru_pages, lru_pages,
> + gfp_mask);
Why don't you use sc.nr_scanned? It seems straightforward.
>
> /*
> - * Update nr_reclaimed by the number of slab pages we
> - * reclaimed from this zone.
> + * Although we have a zone based slab shrinker API, some slabs
> + * are still scanned globally. This means we can't quite
> + * determine how many pages were freed in this zone by
> + * checking reclaimed_slab. However the regular shrink_zone
> + * paths have exactly the same problem that they largely
> + * ignore. So don't be different.
> + *
> + * The situation will improve dramatically as important slabs
> + * are switched over to using reclaimed_slab after the
> + * important slabs are converted to using per zone shrinkers.
> + *
> + * Note that shrink_slab may free memory on all zones and may
> + * take a long time, but again switching important slabs to
> + * zone based shrinkers will solve this problem.
> */
> - nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> - if (nr_slab_pages1 < nr_slab_pages0)
> - sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
> + sc.nr_reclaimed += reclaim_state.reclaimed_slab;
> + reclaim_state.reclaimed_slab = 0;
> }
>
> p->reclaim_state = NULL;
> Index: linux-2.6/fs/drop_caches.c
> ===================================================================
> --- linux-2.6.orig/fs/drop_caches.c 2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/fs/drop_caches.c 2010-11-09 22:11:10.000000000 +1100
> @@ -35,11 +35,7 @@ static void drop_pagecache_sb(struct sup
>
> static void drop_slab(void)
> {
> - int nr_objects;
> -
> - do {
> - nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
> - } while (nr_objects > 10);
> + shrink_all_slab(NULL); /* NULL - all zones */
> }
>
> int drop_caches_sysctl_handler(ctl_table *table, int write,
> Index: linux-2.6/mm/memory-failure.c
> ===================================================================
> --- linux-2.6.orig/mm/memory-failure.c 2010-11-09 22:11:03.000000000 +1100
> +++ linux-2.6/mm/memory-failure.c 2010-11-09 22:11:10.000000000 +1100
> @@ -235,14 +235,8 @@ void shake_page(struct page *p, int acce
> * Only call shrink_slab here (which would also
> * shrink other caches) if access is not potentially fatal.
> */
> - if (access) {
> - int nr;
> - do {
> - nr = shrink_slab(1000, GFP_KERNEL, 1000);
> - if (page_count(p) == 1)
> - break;
> - } while (nr > 10);
> - }
> + if (access)
> + shrink_all_slab(page_zone(p));
> }
> EXPORT_SYMBOL_GPL(shake_page);
>
>
> > @@ -1835,8 +1978,6 @@ static void shrink_zone(int priority, st
> > break;
> > }
> >
> > - sc->nr_reclaimed = nr_reclaimed;
> > -
> > /*
> > * Even if we did not try to evict anon pages at all, we want to
> > * rebalance the anon lru active/inactive ratio.
> > @@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
> > if (inactive_anon_is_low(zone, sc))
> > shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
> >
> > + /*
> > + * Don't shrink slabs when reclaiming memory from
> > + * over limit cgroups
> > + */
> > + if (sc->may_reclaim_slab) {
> > + struct reclaim_state *reclaim_state = current->reclaim_state;
> > +
> > + shrink_slab(zone, sc->nr_scanned - nr_scanned,
>
> Doubtful calculation. What mean "sc->nr_scanned - nr_scanned"?
> I think nr_scanned simply keep old slab balancing behavior.
And per-zone reclaim can lead to a new issue. On a 32-bit highmem system,
theoretically the system can have the following memory usage:
ZONE_HIGHMEM: 100% used for page cache
ZONE_NORMAL: 100% used for slab
So traditional page-cache/slab balancing may not work. I think the following
new calculation, or something else, is necessary.
if (zone_reclaimable_pages() > NR_SLAB_RECLAIMABLE) {
using current calculation
} else {
shrink number of "objects >> reclaim-priority" objects
(as page cache scanning calculation)
}
However, that can perhaps be separated from this patch.
>
>
> > + lru_pages, global_lru_pages, sc->gfp_mask);
> > + if (reclaim_state) {
> > + nr_reclaimed += reclaim_state->reclaimed_slab;
> > + reclaim_state->reclaimed_slab = 0;
> > + }
> > + }
> > +
> > + sc->nr_reclaimed = nr_reclaimed;
> > +
> > throttle_vm_writeout(sc->gfp_mask);
> > }
On Sun, Nov 14, 2010 at 07:07:17PM +0900, KOSAKI Motohiro wrote:
> Hi
>
> > Hi,
> >
> > I'm doing some works that require per-zone shrinkers, I'd like to get
> > the vmscan part signed off and merged by interested mm people, please.
> >
> > [And before anybody else kindly suggests per-node shrinkers, please go
> > back and read all the discussion about this first.]
>
> vmscan part looks good to me. however I hope fs folks review too even though
> I'm not sure who is best.
>
> btw, I have some nitpick comments. see below.
Thanks for the review, it's very helpful.
> > + void (*shrink_zone)(struct shrinker *shrink,
> > + struct zone *zone, unsigned long scanned,
> > + unsigned long total, unsigned long global,
> > + unsigned long flags, gfp_t gfp_mask);
> > +
>
> shrink_zone is slightly grep unfriendly. Can you consider shrink_slab_zone()
> or something else?
Yes, that's true. I want to move away from the term "slab" shrinker,
however. It seems to confuse people (of course, the shrinker can shrink
memory from any allocator, not just slab).
shrink_cache_zone()?
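I.e. the member would become something like (same signature as in the
patch, just renamed as suggested):

	void (*shrink_cache_zone)(struct shrinker *shrink,
			struct zone *zone, unsigned long scanned,
			unsigned long total, unsigned long global,
			unsigned long flags, gfp_t gfp_mask);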
> > +void shrinker_add_scan(unsigned long *dst,
> > + unsigned long scanned, unsigned long total,
> > + unsigned long objects, unsigned int ratio)
> > {
> > - struct shrinker *shrinker;
> > - unsigned long ret = 0;
> > + unsigned long long delta;
> >
> > - if (scanned == 0)
> > - scanned = SWAP_CLUSTER_MAX;
> > + delta = (unsigned long long)scanned * objects;
> > + delta *= SHRINK_FACTOR;
> > + do_div(delta, total + 1);
>
> > + delta *= SHRINK_FACTOR; /* ratio is also in SHRINK_FACTOR units */
> > + do_div(delta, ratio + 1);
>
> introdusing tiny macro is better than the comment.
>
> >
> > - if (!down_read_trylock(&shrinker_rwsem))
> > - return 1; /* Assume we'll be able to shrink next time */
> > + /*
> > + * Avoid risking looping forever due to too large nr value:
> > + * never try to free more than twice the estimate number of
> > + * freeable entries.
> > + */
> > + *dst += delta;
> > +
> > + if (*dst / SHRINK_FACTOR > objects)
> > + *dst = objects * SHRINK_FACTOR;
>
> objects * SHRINK_FACTOR appear twice in this function.
> calculate "objects = obj * SHRINK_FACTOR" at first improve
> code readability slightly.
I wasn't quite sure what you meant by this comment and the one above.
Could you illustrate what your preferred code would look like?
> > +unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch)
>
> Seems misleading name a bit. shrinker_do_scan() does NOT scan.
> It only does batch adjustment.
True. shrinker_get_batch_nr() or similar?
> > +{
> > + unsigned long nr = ACCESS_ONCE(*dst);
>
> Dumb question: why is this ACCESS_ONCE() necessary?
>
>
> > + if (nr < batch * SHRINK_FACTOR)
> > + return 0;
> > + *dst = nr - batch * SHRINK_FACTOR;
> > + return batch;
It should have a comment: *dst can be accessed without a lock.
However, if nr were reloaded from memory between the two expressions
and *dst changed during that time, we could end up with a negative
(i.e. underflowed) value in *dst.
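To illustrate (this is only what the compiler would be allowed to
generate if *dst were read directly; the _racy name is made up, it is
not real code):

unsigned long shrinker_do_scan_racy(unsigned long *dst, unsigned long batch)
{
	if (*dst < batch * SHRINK_FACTOR)	/* load #1 of *dst */
		return 0;
	/*
	 * Load #2: if another thread decreased *dst between the two
	 * loads, this subtraction can wrap around to a huge value.
	 */
	*dst = *dst - batch * SHRINK_FACTOR;
	return batch;
}

ACCESS_ONCE(*dst) forces a single load, so the worst case is just a
rare lost update.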
> {
> unsigned long nr = ACCESS_ONCE(*dst);
> batch *= SHRINK_FACTOR;
>
> if (nr < batch)
> return 0;
> *dst = nr - batch;
> return batch;
> }
>
> is slighly cleaner. however It's unclear why dst and batch argument
> need to have different unit (i.e why caller can't do batch * FACTOR?).
OK I'll take it into consideration. I guess I didn't want the caller
to care too much about the fixed point.
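For illustration, a converted caller would look something along these
lines (a sketch only; the example_* names are made-up stand-ins for the
subsystem's own counter and scanning code):

static unsigned long example_nr_to_scan;	/* fixed-point counter used by the helpers */

static void example_shrink_zone(struct shrinker *shrink,
		struct zone *zone, unsigned long scanned,
		unsigned long total, unsigned long global,
		unsigned long flags, gfp_t gfp_mask)
{
	unsigned long nr;

	/*
	 * A zone-unaware cache with one global list: scale against the
	 * global pagecache size, as the shrink_slab comment describes.
	 */
	shrinker_add_scan(&example_nr_to_scan, scanned, global,
			  example_count_objects(), SHRINK_DEFAULT_SEEKS);

	while ((nr = shrinker_do_scan(&example_nr_to_scan, SHRINK_BATCH))) {
		example_scan_and_free(nr, gfp_mask);
		count_vm_events(SLABS_SCANNED, nr);
		cond_resched();
	}
}

The shrinker registers with .shrink = NULL and .shrink_zone =
example_shrink_zone, and never has to touch SHRINK_FACTOR itself.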
> > + list_for_each_entry(shrinker, &shrinker_list, list) {
> > + if (!shrinker->shrink_zone)
> > + continue;
> > + (*shrinker->shrink_zone)(shrinker, zone, scanned,
> > + total, global, 0, gfp_mask);
>
> flags argument is unused?
Yes, it is at the moment. I actually have a flag that I would like
to use (a close-to-OOM flag), so I've just added the placeholder for
now.
It may well be useful for other things in the future too.
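Something like this, with an entirely made-up name, is the sort of
thing I have in mind:

	/* hypothetical, not in this patch: hint that reclaim is close to OOM */
	#define SHRINK_FLAG_NEAR_OOM	0x01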
> > @@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
> > if (inactive_anon_is_low(zone, sc))
> > shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
> >
> > + /*
> > + * Don't shrink slabs when reclaiming memory from
> > + * over limit cgroups
> > + */
> > + if (sc->may_reclaim_slab) {
> > + struct reclaim_state *reclaim_state = current->reclaim_state;
> > +
> > + shrink_slab(zone, sc->nr_scanned - nr_scanned,
>
> Doubtful calculation. What mean "sc->nr_scanned - nr_scanned"?
> I think nr_scanned simply keep old slab balancing behavior.
OK, good catch.
> > + for_each_zone_zonelist(zone, z, zonelist,
> > + gfp_zone(sc->gfp_mask)) {
> > + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> > + continue;
> >
> > - shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
> > - if (reclaim_state) {
> > - sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> > - reclaim_state->reclaimed_slab = 0;
> > - }
> > + lru_pages += zone_reclaimable_pages(zone);
>
> Do we really need this doubtful cpuset hardwall filtering? Why do we
> need to change slab reclaim pressure if cpuset is used. In old days,
> we didn't have per-zone slab shrinker, then we need artificial slab
> pressure boost for preventing false positive oom-killer. but now we have.
Yeah I'm not completely sure. But we should be mindful that until the
major caches are converted to LRU, we still have to care for the global
shrinker case too.
> However, If you strongly keep old behavior at this time, I don't oppose.
> We can change it later.
Yes I would prefer that, but I would welcome patches to improve things.
> > + /*
> > + * lru_pages / 10 -- put a 10% pressure on the slab
> > + * which roughly corresponds to ZONE_RECLAIM_PRIORITY
> > + * scanning 1/16th of pagecache.
> > + *
> > + * Global slabs will be shrink at a relatively more
> > + * aggressive rate because we don't calculate the
> > + * global lru size for speed. But they really should
> > + * be converted to per zone slabs if they are important
> > + */
> > + shrink_slab(zone, lru_pages / 10, lru_pages, lru_pages,
> > + gfp_mask);
>
> Why don't you use sc.nr_scanned? It seems straight forward.
Well it may not be over the pagecache limit.
I agree the situation is pretty ugly here with all these magic
constants, but I didn't want to change too much in this patch.
Thanks,
Nick
On Mon, Nov 15, 2010 at 09:50:36AM +0900, KOSAKI Motohiro wrote:
> > > @@ -1835,8 +1978,6 @@ static void shrink_zone(int priority, st
> > > break;
> > > }
> > >
> > > - sc->nr_reclaimed = nr_reclaimed;
> > > -
> > > /*
> > > * Even if we did not try to evict anon pages at all, we want to
> > > * rebalance the anon lru active/inactive ratio.
> > > @@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
> > > if (inactive_anon_is_low(zone, sc))
> > > shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
> > >
> > > + /*
> > > + * Don't shrink slabs when reclaiming memory from
> > > + * over limit cgroups
> > > + */
> > > + if (sc->may_reclaim_slab) {
> > > + struct reclaim_state *reclaim_state = current->reclaim_state;
> > > +
> > > + shrink_slab(zone, sc->nr_scanned - nr_scanned,
> >
> > Doubtful calculation. What mean "sc->nr_scanned - nr_scanned"?
> > I think nr_scanned simply keep old slab balancing behavior.
>
> And per-zone reclaim can lead to new issue. On 32bit highmem system,
> theorically the system has following memory usage.
>
> ZONE_HIGHMEM: 100% used for page cache
> ZONE_NORMAL: 100% used for slab
>
> So, traditional page-cache/slab balancing may not work. I think following
Yes, in theory you are right. I guess in theory the same hole exists
if we have 0% page cache reclaimable globally, but this may be slightly
more likely to hit.
> new calculation or somethinhg else is necessary.
>
> if (zone_reclaimable_pages() > NR_SLAB_RECLAIMABLE) {
> using current calculation
> } else {
> shrink number of "objects >> reclaim-priority" objects
> (as page cache scanning calculation)
> }
>
> However, it can be separate this patch, perhaps.
I agree. In fact, perhaps the new calculation would work well in all
cases anyway, so maybe we should move away from making slab reclaim a
slave to pagecache reclaim.
Can we approach that in subsequent patches?
Thanks,
Nick
Nick, I want to test your tree.
This is taking too long.
Make something available now. And test it in real configs.
On 11/16/2010 03:53 PM, Anca Emanuel wrote:
> Nick, I want to test your tree.
> This is taking too long.
> Make something available now. And test it in real configs.
>
hi Anca,
would you like to give your test method?
On Tue, Nov 16, 2010 at 10:05 AM, Figo.zhang
<[email protected]> wrote:
> On 11/16/2010 03:53 PM, Anca Emanuel wrote:
>>
>> Nick, I want to test your tree.
>> This is taking too long.
>> Make something available now. And test it in real configs.
>>
>
> hi Anca,
>
> would you like to give your test method?
>
Nothing special; for now I will test on my PC.
On 11/16/2010 04:20 PM, Anca Emanuel wrote:
> On Tue, Nov 16, 2010 at 10:05 AM, Figo.zhang
> <[email protected]> wrote:
>> On 11/16/2010 03:53 PM, Anca Emanuel wrote:
>>>
>>> Nick, I want to test your tree.
>>> This is taking too long.
>>> Make something available now. And test it in real configs.
>>>
>>
>> hi Anca,
>>
>> would you like to give your test method?
>>
>
> Nothig special, for now I will test on my PC.
>
Hi KOSAKI Motohiro,
is there any test suite or test script for testing page-reclaim performance?
Best,
Figo.zhang
On Tue, Nov 16, 2010 at 09:53:19AM +0200, Anca Emanuel wrote:
> Nick, I want to test your tree.
> This is taking too long.
> Make something available now. And test it in real configs.
I'm working on it, I'll have another batch of patches out to look at
in a couple of hours, if stress testing holds up.
On Tue, Nov 16, 2010 at 10:22 AM, Figo.zhang
<[email protected]> wrote:
>> On 11/16/2010 04:20 PM, Anca Emanuel wrote:
>>
>> On Tue, Nov 16, 2010 at 10:05 AM, Figo.zhang
>> <[email protected]> wrote:
>>>
>>> On 11/16/2010 03:53 PM, Anca Emanuel wrote:
>>>>
>>>> Nick, I want to test your tree.
>>>> This is taking too long.
>>>> Make something available now. And test it in real configs.
>>>>
>>>
>>> hi Anca,
>>>
>>> would you like to give your test method?
>>>
>>
>> Nothig special, for now I will test on my PC.
>>
>
> hi KOSAKI Motohiro,
>
> is it any test suite or test scripts for test page-reclaim performance?
>
> Best,
> Figo.zhang
>
There is http://www.phoronix.com
On 11/16/2010 04:26 PM, Anca Emanuel wrote:
> On Tue, Nov 16, 2010 at 10:22 AM, Figo.zhang
> <[email protected]> wrote:
>> On 11/16/2010 04:20 PM, Anca Emanuel wrote:
>>>
>>> On Tue, Nov 16, 2010 at 10:05 AM, Figo.zhang
>>> <[email protected]> wrote:
>>>>
>>>> On 11/16/2010 03:53 PM, Anca Emanuel wrote:
>>>>>
>>>>> Nick, I want to test your tree.
>>>>> This is taking too long.
>>>>> Make something available now. And test it in real configs.
>>>>>
>>>>
>>>> hi Anca,
>>>>
>>>> would you like to give your test method?
>>>>
>>>
>>> Nothig special, for now I will test on my PC.
>>>
>>
>> hi KOSAKI Motohiro,
>>
>> is it any test suite or test scripts for test page-reclaim performance?
>>
>> Best,
>> Figo.zhang
>>
>
> There is http://www.phoronix.com
It does not focus on page-reclaim testing, or on MM specifically.
>
2010/11/17 Figo.zhang <[email protected]>:
> On 11/16/2010 04:26 PM, Anca Emanuel wrote:
>> On Tue, Nov 16, 2010 at 10:22 AM, Figo.zhang
>> <[email protected]> wrote:
>>> On 11/16/2010 04:20 PM, Anca Emanuel wrote:
>>>>
>>>> On Tue, Nov 16, 2010 at 10:05 AM, Figo.zhang
>>>> <[email protected]> wrote:
>>>>>
>>>>> On 11/16/2010 03:53 PM, Anca Emanuel wrote:
>>>>>>
>>>>>> Nick, I want to test your tree.
>>>>>> This is taking too long.
>>>>>> Make something available now. And test it in real configs.
>>>>>>
>>>>>
>>>>> hi Anca,
>>>>>
>>>>> would you like to give your test method?
>>>>>
>>>>
>>>> Nothig special, for now I will test on my PC.
>>>>
>>>
>>> hi KOSAKI Motohiro,
>>>
>>> is it any test suite or test scripts for test page-reclaim performance?
>>>
>>> Best,
>>> Figo.zhang
>>>
>>
>> There is http://www.phoronix.com
>
> it is not focus on page-reclaim test, or specially for MM.
>>
>
>
If you want some special test, you have to ask Michael Larabel for that.
http://www.phoronix-test-suite.com/
>>>>
>>>> hi KOSAKI Motohiro,
>>>>
>>>> is it any test suite or test scripts for test page-reclaim performance?
>>>>
>>>> Best,
>>>> Figo.zhang
>>>>
>>>
>>> There is http://www.phoronix.com
>>
>> it is not focus on page-reclaim test, or specially for MM.
>>>
>>
>>
>
> If you want some special test, you have to ask Michael Larabel for that.
> http://www.phoronix-test-suite.com/
Yes, I see; the phoronix-test-suite tests things such as ffmpeg and games,
not focusing on MM.
>
>
> >>>>
> >>>> hi KOSAKI Motohiro,
> >>>>
> >>>> is it any test suite or test scripts for test page-reclaim performance?
> >>>>
> >>>> Best,
> >>>> Figo.zhang
> >>>>
> >>>
> >>> There is http://www.phoronix.com
> >>
> >> it is not focus on page-reclaim test, or specially for MM.
> >>>
> >>
> >>
> >
> > If you want some special test, you have to ask Michael Larabel for that.
> > http://www.phoronix-test-suite.com/
>
> yes, i see, the phoronix-test-suite is test such as ffmpeg, games. not
> focus on MM.
Hi Figo,
I agree with Anca. Nick's patch will change almost all VM behaviour; a
generic performance test suite is therefore better than a specific one.
Thanks.
Sorry for the delay. I have had no time at all recently ;)
> On Mon, Nov 15, 2010 at 09:50:36AM +0900, KOSAKI Motohiro wrote:
> > > > @@ -1835,8 +1978,6 @@ static void shrink_zone(int priority, st
> > > > break;
> > > > }
> > > >
> > > > - sc->nr_reclaimed = nr_reclaimed;
> > > > -
> > > > /*
> > > > * Even if we did not try to evict anon pages at all, we want to
> > > > * rebalance the anon lru active/inactive ratio.
> > > > @@ -1844,6 +1985,23 @@ static void shrink_zone(int priority, st
> > > > if (inactive_anon_is_low(zone, sc))
> > > > shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
> > > >
> > > > + /*
> > > > + * Don't shrink slabs when reclaiming memory from
> > > > + * over limit cgroups
> > > > + */
> > > > + if (sc->may_reclaim_slab) {
> > > > + struct reclaim_state *reclaim_state = current->reclaim_state;
> > > > +
> > > > + shrink_slab(zone, sc->nr_scanned - nr_scanned,
> > >
> > > Doubtful calculation. What mean "sc->nr_scanned - nr_scanned"?
> > > I think nr_scanned simply keep old slab balancing behavior.
> >
> > And per-zone reclaim can lead to new issue. On 32bit highmem system,
> > theorically the system has following memory usage.
> >
> > ZONE_HIGHMEM: 100% used for page cache
> > ZONE_NORMAL: 100% used for slab
> >
> > So, traditional page-cache/slab balancing may not work. I think following
>
> Yes, in theory you are right. I guess in theory the same hole exists
> if we have 0% page cache reclaimable globally, but this may be slightly
> more likely to hit.
I'm not so worried about the "0% page cache reclaimable globally" case,
because I doubt it can happen in practice.
> > new calculation or somethinhg else is necessary.
> >
> > if (zone_reclaimable_pages() > NR_SLAB_RECLAIMABLE) {
> > using current calculation
> > } else {
> > shrink number of "objects >> reclaim-priority" objects
> > (as page cache scanning calculation)
> > }
> >
> > However, it can be separate this patch, perhaps.
>
> I agree. In fact, perhaps the new calculation would work well in all
> cases anyway, so maybe we should move away from making slab reclaim a
> slave to pagecache reclaim.
>
> Can we approach that in subsequent patches?
OK!