2014-06-20 16:34:01

by Johannes Weiner

Subject: [patch 1/4] mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable

shrink_zones() has a special branch to skip the all_unreclaimable()
check during hibernation, because a frozen kswapd can't mark a zone
unreclaimable.

But ever since 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages()
livelock"), determining a zone to be unreclaimable is done by directly
looking at its scan history and no longer relies on kswapd setting the
per-zone flag.
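
For reference, the reclaimability test introduced there is roughly the
following (a condensed sketch for context, not part of this patch):

	/*
	 * Post-6e543d5780e3: a zone counts as unreclaimable once six times
	 * its reclaimable pages have been scanned without forward progress.
	 */
	static bool zone_reclaimable(struct zone *zone)
	{
		return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
	}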

Remove this branch and let shrink_zones() check the reclaimability of
the target zones regardless of hibernation state.

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 8 --------
1 file changed, 8 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0f16ffe8eb67..19b5b8016209 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2534,14 +2534,6 @@ out:
if (sc->nr_reclaimed)
return sc->nr_reclaimed;

- /*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
- * check.
- */
- if (oom_killer_disabled)
- return 0;
-
/* Aborted reclaim to try compaction? don't OOM, then */
if (aborted_reclaim)
return 1;
--
2.0.0


2014-06-20 16:34:06

by Johannes Weiner

Subject: [patch 3/4] mm: vmscan: remove all_unreclaimable()

Direct reclaim currently calls shrink_zones() to reclaim all members
of a zonelist, and if that wasn't successful it does another pass
through the same zonelist to check overall reclaimability.

Just check reclaimability in shrink_zones() directly and propagate the
result through the return value. Then remove all_unreclaimable().
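
Condensed, the shape of shrink_zones() after this patch is the
following (soft limit reclaim, compaction handling and the
buffer_heads/highmem special case are omitted here; see the full diff
below):

	static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
	{
		bool all_unreclaimable = true;
		struct zoneref *z;
		struct zone *zone;

		for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
			unsigned long zone_reclaimed;

			if (!populated_zone(zone))
				continue;

			zone_reclaimed = shrink_zone(zone, sc);

			if (zone_reclaimed ||
			    (global_reclaim(sc) && zone_reclaimable(zone)))
				all_unreclaimable = false;
		}

		/* do_try_to_free_pages() uses this to decide against OOM */
		return !all_unreclaimable;
	}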

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 48 +++++++++++++++++++++++-------------------------
1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ed1efb84c542..d0bc1a209746 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2244,9 +2244,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}

-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_zone(struct zone *zone, struct scan_control *sc)
{
unsigned long nr_reclaimed, nr_scanned;
+ unsigned long zone_reclaimed = 0;

do {
struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2290,8 +2291,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);

+ zone_reclaimed += sc->nr_reclaimed - nr_reclaimed;
+
} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+
+ return zone_reclaimed;
}

/* Returns true if compaction should go ahead for a high-order request */
@@ -2340,8 +2345,10 @@ static inline bool compaction_ready(struct zone *zone, int order)
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * Returns whether the zones overall are reclaimable or not.
*/
-static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
@@ -2354,6 +2361,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
.gfp_mask = sc->gfp_mask,
};
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+ bool all_unreclaimable = true;

/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2368,6 +2376,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)

for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
+ unsigned long zone_reclaimed = 0;
+
if (!populated_zone(zone))
continue;
/*
@@ -2414,10 +2424,15 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
sc->nr_scanned += nr_soft_scanned;
+ zone_reclaimed += nr_soft_reclaimed;
/* need some check for avoid more shrink_zone() */
}

- shrink_zone(zone, sc);
+ zone_reclaimed += shrink_zone(zone, sc);
+
+ if (zone_reclaimed ||
+ (global_reclaim(sc) && zone_reclaimable(zone)))
+ all_unreclaimable = false;
}

/*
@@ -2439,26 +2454,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
-}

-/* All zones in zonelist are unreclaimable? */
-static bool all_unreclaimable(struct zonelist *zonelist,
- struct scan_control *sc)
-{
- struct zoneref *z;
- struct zone *zone;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_zone(sc->gfp_mask), sc->nodemask) {
- if (!populated_zone(zone))
- continue;
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
- if (zone_reclaimable(zone))
- return false;
- }
-
- return true;
+ return !all_unreclaimable;
}

/*
@@ -2482,6 +2479,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
{
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
+ bool zones_reclaimable;

delayacct_freepages_start();

@@ -2492,7 +2490,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- shrink_zones(zonelist, sc);
+ zones_reclaimable = shrink_zones(zonelist, sc);

total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2533,8 +2531,8 @@ out:
if (sc->compaction_ready)
return 1;

- /* top priority shrink_zones still had more to do? don't OOM, then */
- if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
+ /* Any of the zones still reclaimable? Don't OOM. */
+ if (zones_reclaimable)
return 1;

return 0;
--
2.0.0

2014-06-20 16:34:11

by Johannes Weiner

Subject: [patch 4/4] mm: vmscan: move swappiness out of scan_control

Swappiness is determined for each scanned memcg individually in
shrink_zone() and is not a parameter that applies throughout the
reclaim scan. Move it out of struct scan_control to prevent
accidental use of a stale value.
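
Condensed from the shrink_zone() hunk below, the lookup now lives right
next to its only consumer (the early-break logic for limit reclaim is
omitted here):

	memcg = mem_cgroup_iter(root, NULL, &reclaim);
	do {
		struct lruvec *lruvec;
		int swappiness;

		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
		swappiness = mem_cgroup_swappiness(memcg);

		shrink_lruvec(lruvec, swappiness, sc);

		memcg = mem_cgroup_iter(root, memcg, &reclaim);
	} while (memcg);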

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 27 +++++++++++++--------------
1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d0bc1a209746..757e2a8dbf58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -89,9 +89,6 @@ struct scan_control {
/* Scan (total_size >> priority) pages at once */
int priority;

- /* anon vs. file LRUs scanning "ratio" */
- int swappiness;
-
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
@@ -1868,8 +1865,8 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
- unsigned long *nr)
+static void get_scan_count(struct lruvec *lruvec, int swappiness,
+ struct scan_control *sc, unsigned long *nr)
{
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
@@ -1912,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* using the memory controller's swap limit feature would be
* too expensive.
*/
- if (!global_reclaim(sc) && !sc->swappiness) {
+ if (!global_reclaim(sc) && !swappiness) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -1922,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* system is close to OOM, scan both anon and file equally
* (unless the swappiness setting disagrees with swapping).
*/
- if (!sc->priority && sc->swappiness) {
+ if (!sc->priority && swappiness) {
scan_balance = SCAN_EQUAL;
goto out;
}
@@ -1965,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = sc->swappiness;
+ anon_prio = swappiness;
file_prio = 200 - anon_prio;

/*
@@ -2055,7 +2052,8 @@ out:
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
+ struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
@@ -2066,7 +2064,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
struct blk_plug plug;
bool scan_adjusted;

- get_scan_count(lruvec, sc, nr);
+ get_scan_count(lruvec, swappiness, sc, nr);

/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2263,11 +2261,12 @@ static unsigned long shrink_zone(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
struct lruvec *lruvec;
+ int swappiness;

lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ swappiness = mem_cgroup_swappiness(memcg);

- sc->swappiness = mem_cgroup_swappiness(memcg);
- shrink_lruvec(lruvec, sc);
+ shrink_lruvec(lruvec, swappiness, sc);

/*
* Direct reclaim and kswapd have to scan all memory
@@ -2714,10 +2713,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_swap = !noswap,
.order = 0,
.priority = 0,
- .swappiness = mem_cgroup_swappiness(memcg),
.target_mem_cgroup = memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ int swappiness = mem_cgroup_swappiness(memcg);

sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2733,7 +2732,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, &sc);
+ shrink_lruvec(lruvec, swappiness, &sc);

trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

--
2.0.0

2014-06-20 16:34:25

by Johannes Weiner

Subject: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

Page reclaim for a higher-order page runs until compaction is ready,
then aborts and signals this situation through the return value of
shrink_zones(). This is an oddly specific signal to encode in the
return value of shrink_zones(), though, and can be quite confusing.

Introduce sc->compaction_ready and signal the compactability of the
zones out-of-band to free up the return value of shrink_zones() for
actual zone reclaimability.
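
In short, the zonelist walk records the situation out-of-band and the
OOM decision reads it back (both snippets condensed from the diff
below):

	/* in shrink_zones(): plenty free for compaction, stop reclaiming
	 * this zone and note it for the caller */
	if (IS_ENABLED(CONFIG_COMPACTION) &&
	    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
	    zonelist_zone_idx(z) <= requested_highidx &&
	    compaction_ready(zone, sc->order)) {
		sc->compaction_ready = true;
		continue;
	}

	/* in do_try_to_free_pages(): aborted reclaim to try compaction,
	 * so don't declare OOM */
	if (sc->compaction_ready)
		return 1;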

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19b5b8016209..ed1efb84c542 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,6 +65,9 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;

+ /* One of the zones is ready for compaction */
+ int compaction_ready;
+
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;

@@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
}

/* Returns true if compaction should go ahead for a high-order request */
-static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+static inline bool compaction_ready(struct zone *zone, int order)
{
unsigned long balance_gap, watermark;
bool watermark_ok;

- /* Do not consider compaction for orders reclaim is meant to satisfy */
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
- return false;
-
/*
* Compaction takes time to run and there are potentially other
* callers using the pages just freed. Continue reclaiming until
@@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
*/
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
- watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);

/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
- if (compaction_deferred(zone, sc->order))
+ if (compaction_deferred(zone, order))
return watermark_ok;

/* If compaction is not ready to start, keep reclaiming */
- if (!compaction_suitable(zone, sc->order))
+ if (!compaction_suitable(zone, order))
return false;

return watermark_ok;
@@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
- *
- * This function returns true if a zone is being reclaimed for a costly
- * high-order allocation and compaction is ready to begin. This indicates to
- * the caller that it should consider retrying the allocation instead of
- * further reclaim.
*/
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long lru_pages = 0;
- bool aborted_reclaim = false;
struct reclaim_state *reclaim_state = current->reclaim_state;
gfp_t orig_mask;
struct shrink_control shrink = {
@@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
- if (IS_ENABLED(CONFIG_COMPACTION)) {
- /*
- * If we already have plenty of memory free for
- * compaction in this zone, don't free any more.
- * Even though compaction is invoked for any
- * non-zero order, only frequent costly order
- * reclamation is disruptive enough to become a
- * noticeable problem, like transparent huge
- * page allocations.
- */
- if ((zonelist_zone_idx(z) <= requested_highidx)
- && compaction_ready(zone, sc)) {
- aborted_reclaim = true;
- continue;
- }
+
+ /*
+ * If we already have plenty of memory free
+ * for compaction in this zone, don't free any
+ * more. Even though compaction is invoked
+ * for any non-zero order, only frequent
+ * costly order reclamation is disruptive
+ * enough to become a noticeable problem, like
+ * transparent huge page allocations.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+ zonelist_zone_idx(z) <= requested_highidx &&
+ compaction_ready(zone, sc->order)) {
+ sc->compaction_ready = true;
+ continue;
}
+
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
-
- return aborted_reclaim;
}

/* All zones in zonelist are unreclaimable? */
@@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
{
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
- bool aborted_reclaim;

delayacct_freepages_start();

@@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- aborted_reclaim = shrink_zones(zonelist, sc);
+ shrink_zones(zonelist, sc);

total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;

+ if (sc->compaction_ready)
+ goto out;
+
/*
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
@@ -2526,7 +2521,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
- } while (--sc->priority >= 0 && !aborted_reclaim);
+ } while (--sc->priority >= 0);

out:
delayacct_freepages_end();
@@ -2535,7 +2530,7 @@ out:
return sc->nr_reclaimed;

/* Aborted reclaim to try compaction? don't OOM, then */
- if (aborted_reclaim)
+ if (sc->compaction_ready)
return 1;

/* top priority shrink_zones still had more to do? don't OOM, then */
--
2.0.0

2014-06-20 16:56:08

by Vlastimil Babka

Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On 06/20/2014 06:33 PM, Johannes Weiner wrote:
> Page reclaim for a higher-order page runs until compaction is ready,
> then aborts and signals this situation through the return value of
> shrink_zones(). This is an oddly specific signal to encode in the
> return value of shrink_zones(), though, and can be quite confusing.
>
> Introduce sc->compaction_ready and signal the compactability of the
> zones out-of-band to free up the return value of shrink_zones() for
> actual zone reclaimability.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: Vlastimil Babka <[email protected]>

(with a nitpick below)

> ---
> mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
> 1 file changed, 31 insertions(+), 36 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 19b5b8016209..ed1efb84c542 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -65,6 +65,9 @@ struct scan_control {
> /* Number of pages freed so far during a call to shrink_zones() */
> unsigned long nr_reclaimed;
>
> + /* One of the zones is ready for compaction */
> + int compaction_ready;
> +
> /* How many pages shrink_list() should reclaim */
> unsigned long nr_to_reclaim;
>
> @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> }
>
> /* Returns true if compaction should go ahead for a high-order request */
> -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> +static inline bool compaction_ready(struct zone *zone, int order)
> {
> unsigned long balance_gap, watermark;
> bool watermark_ok;
>
> - /* Do not consider compaction for orders reclaim is meant to satisfy */
> - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
> - return false;
> -
> /*
> * Compaction takes time to run and there are potentially other
> * callers using the pages just freed. Continue reclaiming until
> @@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> */
> balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
> zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
> - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
> + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
> watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
>
> /*
> * If compaction is deferred, reclaim up to a point where
> * compaction will have a chance of success when re-enabled
> */
> - if (compaction_deferred(zone, sc->order))
> + if (compaction_deferred(zone, order))
> return watermark_ok;
>
> /* If compaction is not ready to start, keep reclaiming */
> - if (!compaction_suitable(zone, sc->order))
> + if (!compaction_suitable(zone, order))
> return false;
>
> return watermark_ok;
> @@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> - *
> - * This function returns true if a zone is being reclaimed for a costly
> - * high-order allocation and compaction is ready to begin. This indicates to
> - * the caller that it should consider retrying the allocation instead of
> - * further reclaim.
> */
> -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> {
> struct zoneref *z;
> struct zone *zone;
> unsigned long nr_soft_reclaimed;
> unsigned long nr_soft_scanned;
> unsigned long lru_pages = 0;
> - bool aborted_reclaim = false;
> struct reclaim_state *reclaim_state = current->reclaim_state;
> gfp_t orig_mask;
> struct shrink_control shrink = {
> @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> if (sc->priority != DEF_PRIORITY &&
> !zone_reclaimable(zone))
> continue; /* Let kswapd poll it */
> - if (IS_ENABLED(CONFIG_COMPACTION)) {
> - /*
> - * If we already have plenty of memory free for
> - * compaction in this zone, don't free any more.
> - * Even though compaction is invoked for any
> - * non-zero order, only frequent costly order
> - * reclamation is disruptive enough to become a
> - * noticeable problem, like transparent huge
> - * page allocations.
> - */
> - if ((zonelist_zone_idx(z) <= requested_highidx)
> - && compaction_ready(zone, sc)) {
> - aborted_reclaim = true;
> - continue;
> - }
> +
> + /*
> + * If we already have plenty of memory free
> + * for compaction in this zone, don't free any
> + * more. Even though compaction is invoked
> + * for any non-zero order, only frequent
> + * costly order reclamation is disruptive
> + * enough to become a noticeable problem, like
> + * transparent huge page allocations.
> + */

You moved this comment block left, yet you further shortened the individual lines, even
though there is now more space to extend them.

> + if (IS_ENABLED(CONFIG_COMPACTION) &&
> + sc->order > PAGE_ALLOC_COSTLY_ORDER &&
> + zonelist_zone_idx(z) <= requested_highidx &&
> + compaction_ready(zone, sc->order)) {
> + sc->compaction_ready = true;
> + continue;
> }
> +
> /*
> * This steals pages from memory cgroups over softlimit
> * and returns the number of reclaimed pages and
> @@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> * promoted it to __GFP_HIGHMEM.
> */
> sc->gfp_mask = orig_mask;
> -
> - return aborted_reclaim;
> }
>
> /* All zones in zonelist are unreclaimable? */
> @@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> {
> unsigned long total_scanned = 0;
> unsigned long writeback_threshold;
> - bool aborted_reclaim;
>
> delayacct_freepages_start();
>
> @@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - aborted_reclaim = shrink_zones(zonelist, sc);
> + shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> goto out;
>
> + if (sc->compaction_ready)
> + goto out;
> +
> /*
> * If we're getting trouble reclaiming, start doing
> * writepage even in laptop mode.
> @@ -2526,7 +2521,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> WB_REASON_TRY_TO_FREE_PAGES);
> sc->may_writepage = 1;
> }
> - } while (--sc->priority >= 0 && !aborted_reclaim);
> + } while (--sc->priority >= 0);
>
> out:
> delayacct_freepages_end();
> @@ -2535,7 +2530,7 @@ out:
> return sc->nr_reclaimed;
>
> /* Aborted reclaim to try compaction? don't OOM, then */
> - if (aborted_reclaim)
> + if (sc->compaction_ready)
> return 1;
>
> /* top priority shrink_zones still had more to do? don't OOM, then */
>

2014-06-20 20:25:00

by Johannes Weiner

Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On Fri, Jun 20, 2014 at 06:56:03PM +0200, Vlastimil Babka wrote:
> On 06/20/2014 06:33 PM, Johannes Weiner wrote:
> > Page reclaim for a higher-order page runs until compaction is ready,
> > then aborts and signals this situation through the return value of
> > shrink_zones(). This is an oddly specific signal to encode in the
> > return value of shrink_zones(), though, and can be quite confusing.
> >
> > Introduce sc->compaction_ready and signal the compactability of the
> > zones out-of-band to free up the return value of shrink_zones() for
> > actual zone reclaimability.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
>
> Acked-by: Vlastimil Babka <[email protected]>

Thanks, Vlastimil!

> > @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> > if (sc->priority != DEF_PRIORITY &&
> > !zone_reclaimable(zone))
> > continue; /* Let kswapd poll it */
> > - if (IS_ENABLED(CONFIG_COMPACTION)) {
> > - /*
> > - * If we already have plenty of memory free for
> > - * compaction in this zone, don't free any more.
> > - * Even though compaction is invoked for any
> > - * non-zero order, only frequent costly order
> > - * reclamation is disruptive enough to become a
> > - * noticeable problem, like transparent huge
> > - * page allocations.
> > - */
> > - if ((zonelist_zone_idx(z) <= requested_highidx)
> > - && compaction_ready(zone, sc)) {
> > - aborted_reclaim = true;
> > - continue;
> > - }
> > +
> > + /*
> > + * If we already have plenty of memory free
> > + * for compaction in this zone, don't free any
> > + * more. Even though compaction is invoked
> > + * for any non-zero order, only frequent
> > + * costly order reclamation is disruptive
> > + * enough to become a noticeable problem, like
> > + * transparent huge page allocations.
> > + */
>
> You moved this comment block left, yet you further shortened the individual lines, even
> though there is now more space to extend them.

This is a result of using emacs' auto-fill all the time when writing
comments; I have to watch my reflexes while moving stuff around :-)

Updated patch:

---
From cd48b73fdca9e23aa21f65e9af1f850dbac5ab8e Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[email protected]>
Date: Wed, 11 Jun 2014 12:53:59 -0400
Subject: [patch] mm: vmscan: rework compaction-ready signaling in direct
reclaim

Page reclaim for a higher-order page runs until compaction is ready,
then aborts and signals this situation through the return value of
shrink_zones(). This is an oddly specific signal to encode in the
return value of shrink_zones(), though, and can be quite confusing.

Introduce sc->compaction_ready and signal the compactability of the
zones out-of-band to free up the return value of shrink_zones() for
actual zone reclaimability.

Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 19b5b8016209..35747a75bf08 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -65,6 +65,9 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;

+ /* One of the zones is ready for compaction */
+ int compaction_ready;
+
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;

@@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
}

/* Returns true if compaction should go ahead for a high-order request */
-static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+static inline bool compaction_ready(struct zone *zone, int order)
{
unsigned long balance_gap, watermark;
bool watermark_ok;

- /* Do not consider compaction for orders reclaim is meant to satisfy */
- if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
- return false;
-
/*
* Compaction takes time to run and there are potentially other
* callers using the pages just freed. Continue reclaiming until
@@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
*/
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
- watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);

/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
- if (compaction_deferred(zone, sc->order))
+ if (compaction_deferred(zone, order))
return watermark_ok;

/* If compaction is not ready to start, keep reclaiming */
- if (!compaction_suitable(zone, sc->order))
+ if (!compaction_suitable(zone, order))
return false;

return watermark_ok;
@@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
- *
- * This function returns true if a zone is being reclaimed for a costly
- * high-order allocation and compaction is ready to begin. This indicates to
- * the caller that it should consider retrying the allocation instead of
- * further reclaim.
*/
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long lru_pages = 0;
- bool aborted_reclaim = false;
struct reclaim_state *reclaim_state = current->reclaim_state;
gfp_t orig_mask;
struct shrink_control shrink = {
@@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (sc->priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue; /* Let kswapd poll it */
- if (IS_ENABLED(CONFIG_COMPACTION)) {
- /*
- * If we already have plenty of memory free for
- * compaction in this zone, don't free any more.
- * Even though compaction is invoked for any
- * non-zero order, only frequent costly order
- * reclamation is disruptive enough to become a
- * noticeable problem, like transparent huge
- * page allocations.
- */
- if ((zonelist_zone_idx(z) <= requested_highidx)
- && compaction_ready(zone, sc)) {
- aborted_reclaim = true;
- continue;
- }
+
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticeable problem, like transparent huge
+ * page allocations.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) &&
+ sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+ zonelist_zone_idx(z) <= requested_highidx &&
+ compaction_ready(zone, sc->order)) {
+ sc->compaction_ready = true;
+ continue;
}
+
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* promoted it to __GFP_HIGHMEM.
*/
sc->gfp_mask = orig_mask;
-
- return aborted_reclaim;
}

/* All zones in zonelist are unreclaimable? */
@@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
{
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
- bool aborted_reclaim;

delayacct_freepages_start();

@@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);
sc->nr_scanned = 0;
- aborted_reclaim = shrink_zones(zonelist, sc);
+ shrink_zones(zonelist, sc);

total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;

+ if (sc->compaction_ready)
+ goto out;
+
/*
* If we're getting trouble reclaiming, start doing
* writepage even in laptop mode.
@@ -2526,7 +2521,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
- } while (--sc->priority >= 0 && !aborted_reclaim);
+ } while (--sc->priority >= 0);

out:
delayacct_freepages_end();
@@ -2535,7 +2530,7 @@ out:
return sc->nr_reclaimed;

/* Aborted reclaim to try compaction? don't OOM, then */
- if (aborted_reclaim)
+ if (sc->compaction_ready)
return 1;

/* top priority shrink_zones still had more to do? don't OOM, then */
--
2.0.0

2014-06-23 06:15:18

by Minchan Kim

Subject: Re: [patch 1/4] mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable

On Fri, Jun 20, 2014 at 12:33:47PM -0400, Johannes Weiner wrote:
> shrink_zones() has a special branch to skip the all_unreclaimable()
> check during hibernation, because a frozen kswapd can't mark a zone
> unreclaimable.
>
> But ever since 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages()
> livelock"), determining a zone to be unreclaimable is done by directly
> looking at its scan history and no longer relies on kswapd setting the
> per-zone flag.
>
> Remove this branch and let shrink_zones() check the reclaimability of
> the target zones regardless of hibernation state.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Minchan Kim <[email protected]>

It would not be bad to Cc KOSAKI, who was involved in the
all_unreclaimable series several times with me.

> ---
> mm/vmscan.c | 8 --------
> 1 file changed, 8 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 0f16ffe8eb67..19b5b8016209 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2534,14 +2534,6 @@ out:
> if (sc->nr_reclaimed)
> return sc->nr_reclaimed;
>
> - /*
> - * As hibernation is going on, kswapd is freezed so that it can't mark
> - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
> - * check.
> - */
> - if (oom_killer_disabled)
> - return 0;
> -
> /* Aborted reclaim to try compaction? don't OOM, then */
> if (aborted_reclaim)
> return 1;
> --
> 2.0.0
>

--
Kind regards,
Minchan Kim

2014-06-23 06:35:50

by Minchan Kim

Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On Fri, Jun 20, 2014 at 12:33:48PM -0400, Johannes Weiner wrote:
> Page reclaim for a higher-order page runs until compaction is ready,
> then aborts and signals this situation through the return value of
> shrink_zones(). This is an oddly specific signal to encode in the
> return value of shrink_zones(), though, and can be quite confusing.
>
> Introduce sc->compaction_ready and signal the compactability of the
> zones out-of-band to free up the return value of shrink_zones() for
> actual zone reclaimability.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Minchan Kim <[email protected]>

Just one nitpick below.

> ---
> mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
> 1 file changed, 31 insertions(+), 36 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 19b5b8016209..ed1efb84c542 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -65,6 +65,9 @@ struct scan_control {
> /* Number of pages freed so far during a call to shrink_zones() */
> unsigned long nr_reclaimed;
>
> + /* One of the zones is ready for compaction */
> + int compaction_ready;
> +
> /* How many pages shrink_list() should reclaim */
> unsigned long nr_to_reclaim;
>
> @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> }
>
> /* Returns true if compaction should go ahead for a high-order request */
> -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> +static inline bool compaction_ready(struct zone *zone, int order)
> {
> unsigned long balance_gap, watermark;
> bool watermark_ok;
>
> - /* Do not consider compaction for orders reclaim is meant to satisfy */
> - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
> - return false;
> -
> /*
> * Compaction takes time to run and there are potentially other
> * callers using the pages just freed. Continue reclaiming until
> @@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> */
> balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
> zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
> - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
> + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
> watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
>
> /*
> * If compaction is deferred, reclaim up to a point where
> * compaction will have a chance of success when re-enabled
> */
> - if (compaction_deferred(zone, sc->order))
> + if (compaction_deferred(zone, order))
> return watermark_ok;
>
> /* If compaction is not ready to start, keep reclaiming */
> - if (!compaction_suitable(zone, sc->order))
> + if (!compaction_suitable(zone, order))
> return false;
>
> return watermark_ok;
> @@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> - *
> - * This function returns true if a zone is being reclaimed for a costly
> - * high-order allocation and compaction is ready to begin. This indicates to
> - * the caller that it should consider retrying the allocation instead of
> - * further reclaim.
> */
> -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> {
> struct zoneref *z;
> struct zone *zone;
> unsigned long nr_soft_reclaimed;
> unsigned long nr_soft_scanned;
> unsigned long lru_pages = 0;
> - bool aborted_reclaim = false;

> struct reclaim_state *reclaim_state = current->reclaim_state;
> gfp_t orig_mask;
> struct shrink_control shrink = {
> @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> if (sc->priority != DEF_PRIORITY &&
> !zone_reclaimable(zone))
> continue; /* Let kswapd poll it */
> - if (IS_ENABLED(CONFIG_COMPACTION)) {
> - /*
> - * If we already have plenty of memory free for
> - * compaction in this zone, don't free any more.
> - * Even though compaction is invoked for any
> - * non-zero order, only frequent costly order
> - * reclamation is disruptive enough to become a
> - * noticeable problem, like transparent huge
> - * page allocations.
> - */
> - if ((zonelist_zone_idx(z) <= requested_highidx)
> - && compaction_ready(zone, sc)) {
> - aborted_reclaim = true;
> - continue;
> - }
> +
> + /*
> + * If we already have plenty of memory free
> + * for compaction in this zone, don't free any
> + * more. Even though compaction is invoked
> + * for any non-zero order, only frequent
> + * costly order reclamation is disruptive
> + * enough to become a noticeable problem, like
> + * transparent huge page allocations.
> + */
> + if (IS_ENABLED(CONFIG_COMPACTION) &&
> + sc->order > PAGE_ALLOC_COSTLY_ORDER &&

You are deleting the sc->order <= PAGE_ALLOC_COSTLY_ORDER comment that was
in compaction_ready(). At least, that comment was useful for me to guess
the intention. So unless you have a strong reason to remove it, I'd like
to keep it.


> + zonelist_zone_idx(z) <= requested_highidx &&
> + compaction_ready(zone, sc->order)) {
> + sc->compaction_ready = true;
> + continue;
> }
> +
> /*
> * This steals pages from memory cgroups over softlimit
> * and returns the number of reclaimed pages and
> @@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> * promoted it to __GFP_HIGHMEM.
> */
> sc->gfp_mask = orig_mask;
> -
> - return aborted_reclaim;
> }
>
> /* All zones in zonelist are unreclaimable? */
> @@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> {
> unsigned long total_scanned = 0;
> unsigned long writeback_threshold;
> - bool aborted_reclaim;
>
> delayacct_freepages_start();
>
> @@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - aborted_reclaim = shrink_zones(zonelist, sc);
> + shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> goto out;
>
> + if (sc->compaction_ready)
> + goto out;
> +
> /*
> * If we're getting trouble reclaiming, start doing
> * writepage even in laptop mode.
> @@ -2526,7 +2521,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> WB_REASON_TRY_TO_FREE_PAGES);
> sc->may_writepage = 1;
> }
> - } while (--sc->priority >= 0 && !aborted_reclaim);
> + } while (--sc->priority >= 0);
>
> out:
> delayacct_freepages_end();
> @@ -2535,7 +2530,7 @@ out:
> return sc->nr_reclaimed;
>
> /* Aborted reclaim to try compaction? don't OOM, then */
> - if (aborted_reclaim)
> + if (sc->compaction_ready)
> return 1;
>
> /* top priority shrink_zones still had more to do? don't OOM, then */
> --
> 2.0.0
>

--
Kind regards,
Minchan Kim

2014-06-23 06:47:53

by Minchan Kim

Subject: Re: [patch 3/4] mm: vmscan: remove all_unreclaimable()

On Fri, Jun 20, 2014 at 12:33:49PM -0400, Johannes Weiner wrote:
> Direct reclaim currently calls shrink_zones() to reclaim all members
> of a zonelist, and if that wasn't successful it does another pass
> through the same zonelist to check overall reclaimability.
>
> Just check reclaimability in shrink_zones() directly and propagate the
> result through the return value. Then remove all_unreclaimable().
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Minchan Kim <[email protected]>

> ---
> mm/vmscan.c | 48 +++++++++++++++++++++++-------------------------
> 1 file changed, 23 insertions(+), 25 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ed1efb84c542..d0bc1a209746 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2244,9 +2244,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
> }
> }
>
> -static void shrink_zone(struct zone *zone, struct scan_control *sc)
> +static unsigned long shrink_zone(struct zone *zone, struct scan_control *sc)
> {
> unsigned long nr_reclaimed, nr_scanned;
> + unsigned long zone_reclaimed = 0;
>
> do {
> struct mem_cgroup *root = sc->target_mem_cgroup;
> @@ -2290,8 +2291,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> sc->nr_scanned - nr_scanned,
> sc->nr_reclaimed - nr_reclaimed);
>
> + zone_reclaimed += sc->nr_reclaimed - nr_reclaimed;
> +
> } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
> sc->nr_scanned - nr_scanned, sc));
> +
> + return zone_reclaimed;
> }
>
> /* Returns true if compaction should go ahead for a high-order request */
> @@ -2340,8 +2345,10 @@ static inline bool compaction_ready(struct zone *zone, int order)
> *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> + *
> + * Returns whether the zones overall are reclaimable or not.
> */
> -static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> {
> struct zoneref *z;
> struct zone *zone;
> @@ -2354,6 +2361,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> .gfp_mask = sc->gfp_mask,
> };
> enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
> + bool all_unreclaimable = true;
>
> /*
> * If the number of buffer_heads in the machine exceeds the maximum
> @@ -2368,6 +2376,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> gfp_zone(sc->gfp_mask), sc->nodemask) {
> + unsigned long zone_reclaimed = 0;
> +
> if (!populated_zone(zone))
> continue;
> /*
> @@ -2414,10 +2424,15 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> &nr_soft_scanned);
> sc->nr_reclaimed += nr_soft_reclaimed;
> sc->nr_scanned += nr_soft_scanned;
> + zone_reclaimed += nr_soft_reclaimed;
> /* need some check for avoid more shrink_zone() */
> }
>
> - shrink_zone(zone, sc);
> + zone_reclaimed += shrink_zone(zone, sc);
> +
> + if (zone_reclaimed ||
> + (global_reclaim(sc) && zone_reclaimable(zone)))
> + all_unreclaimable = false;
> }
>
> /*
> @@ -2439,26 +2454,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> * promoted it to __GFP_HIGHMEM.
> */
> sc->gfp_mask = orig_mask;
> -}
>
> -/* All zones in zonelist are unreclaimable? */
> -static bool all_unreclaimable(struct zonelist *zonelist,
> - struct scan_control *sc)
> -{
> - struct zoneref *z;
> - struct zone *zone;
> -
> - for_each_zone_zonelist_nodemask(zone, z, zonelist,
> - gfp_zone(sc->gfp_mask), sc->nodemask) {
> - if (!populated_zone(zone))
> - continue;
> - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> - continue;
> - if (zone_reclaimable(zone))
> - return false;
> - }
> -
> - return true;
> + return !all_unreclaimable;
> }
>
> /*
> @@ -2482,6 +2479,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> {
> unsigned long total_scanned = 0;
> unsigned long writeback_threshold;
> + bool zones_reclaimable;
>
> delayacct_freepages_start();
>
> @@ -2492,7 +2490,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - shrink_zones(zonelist, sc);
> + zones_reclaimable = shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> @@ -2533,8 +2531,8 @@ out:
> if (sc->compaction_ready)
> return 1;
>
> - /* top priority shrink_zones still had more to do? don't OOM, then */
> - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
> + /* Any of the zones still reclaimable? Don't OOM. */
> + if (zones_reclaimable)
> return 1;
>
> return 0;
> --
> 2.0.0
>

--
Kind regards,
Minchan Kim

2014-06-23 06:50:59

by Minchan Kim

Subject: Re: [patch 4/4] mm: vmscan: move swappiness out of scan_control

On Fri, Jun 20, 2014 at 12:33:50PM -0400, Johannes Weiner wrote:
> Swappiness is determined for each scanned memcg individually in
> shrink_zone() and is not a parameter that applies throughout the
> reclaim scan. Move it out of struct scan_control to prevent
> accidental use of a stale value.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Minchan Kim <[email protected]>

--
Kind regards,
Minchan Kim

2014-06-23 07:28:56

by Michal Hocko

Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On Fri 20-06-14 16:24:49, Johannes Weiner wrote:
[...]
> From cd48b73fdca9e23aa21f65e9af1f850dbac5ab8e Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <[email protected]>
> Date: Wed, 11 Jun 2014 12:53:59 -0400
> Subject: [patch] mm: vmscan: rework compaction-ready signaling in direct
> reclaim
>
> Page reclaim for a higher-order page runs until compaction is ready,
> then aborts and signals this situation through the return value of
> shrink_zones(). This is an oddly specific signal to encode in the
> return value of shrink_zones(), though, and can be quite confusing.
>
> Introduce sc->compaction_ready and signal the compactability of the
> zones out-of-band to free up the return value of shrink_zones() for
> actual zone reclaimability.
>
> Signed-off-by: Johannes Weiner <[email protected]>
> Acked-by: Vlastimil Babka <[email protected]>

Very nice. It will help me get rid of additional hacks for the
min_limit for memcg. Thanks!

One question below

[...]
> @@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - aborted_reclaim = shrink_zones(zonelist, sc);
> + shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> goto out;
>
> + if (sc->compaction_ready)
> + goto out;
> +
> /*
> * If we're getting trouble reclaiming, start doing
> * writepage even in laptop mode.
> @@ -2526,7 +2521,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> WB_REASON_TRY_TO_FREE_PAGES);
> sc->may_writepage = 1;
> }
> - } while (--sc->priority >= 0 && !aborted_reclaim);
> + } while (--sc->priority >= 0);
>
> out:
> delayacct_freepages_end();

It is not entirely clear to me why we do not need to check and wake up
the flusher threads anymore.

--
Michal Hocko
SUSE Labs

2014-06-23 07:49:05

by Michal Hocko

Subject: Re: [patch 1/4] mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable

On Fri 20-06-14 12:33:47, Johannes Weiner wrote:
> shrink_zones() has a special branch to skip the all_unreclaimable()
> check during hibernation, because a frozen kswapd can't mark a zone
> unreclaimable.
>
> But ever since 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages()
> livelock"), determining a zone to be unreclaimable is done by directly
> looking at its scan history and no longer relies on kswapd setting the
> per-zone flag.
>
> Remove this branch and let shrink_zones() check the reclaimability of
> the target zones regardless of hibernation state.
>
> Signed-off-by: Johannes Weiner <[email protected]>

This code is really tricky :/

But the patch looks good to me.
Acked-by: Michal Hocko <[email protected]>

> ---
> mm/vmscan.c | 8 --------
> 1 file changed, 8 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 0f16ffe8eb67..19b5b8016209 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2534,14 +2534,6 @@ out:
> if (sc->nr_reclaimed)
> return sc->nr_reclaimed;
>
> - /*
> - * As hibernation is going on, kswapd is freezed so that it can't mark
> - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
> - * check.
> - */
> - if (oom_killer_disabled)
> - return 0;
> -
> /* Aborted reclaim to try compaction? don't OOM, then */
> if (aborted_reclaim)
> return 1;
> --
> 2.0.0
>

--
Michal Hocko
SUSE Labs

2014-06-23 08:35:50

by Michal Hocko

Subject: Re: [patch 3/4] mm: vmscan: remove all_unreclaimable()

On Fri 20-06-14 12:33:49, Johannes Weiner wrote:
> Direct reclaim currently calls shrink_zones() to reclaim all members
> of a zonelist, and if that wasn't successful it does another pass
> through the same zonelist to check overall reclaimability.
>
> Just check reclaimability in shrink_zones() directly and propagate the
> result through the return value. Then remove all_unreclaimable().

Heh, I was really looking at that return value, hoping to abuse it for the
memcg low/min reclaim purposes. I will find a way...

> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: Michal Hocko <[email protected]>

> ---
> mm/vmscan.c | 48 +++++++++++++++++++++++-------------------------
> 1 file changed, 23 insertions(+), 25 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ed1efb84c542..d0bc1a209746 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2244,9 +2244,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
> }
> }
>
> -static void shrink_zone(struct zone *zone, struct scan_control *sc)
> +static unsigned long shrink_zone(struct zone *zone, struct scan_control *sc)
> {
> unsigned long nr_reclaimed, nr_scanned;
> + unsigned long zone_reclaimed = 0;
>
> do {
> struct mem_cgroup *root = sc->target_mem_cgroup;
> @@ -2290,8 +2291,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> sc->nr_scanned - nr_scanned,
> sc->nr_reclaimed - nr_reclaimed);
>
> + zone_reclaimed += sc->nr_reclaimed - nr_reclaimed;
> +
> } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
> sc->nr_scanned - nr_scanned, sc));
> +
> + return zone_reclaimed;
> }
>
> /* Returns true if compaction should go ahead for a high-order request */
> @@ -2340,8 +2345,10 @@ static inline bool compaction_ready(struct zone *zone, int order)
> *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> + *
> + * Returns whether the zones overall are reclaimable or not.
> */
> -static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> {
> struct zoneref *z;
> struct zone *zone;
> @@ -2354,6 +2361,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> .gfp_mask = sc->gfp_mask,
> };
> enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
> + bool all_unreclaimable = true;
>
> /*
> * If the number of buffer_heads in the machine exceeds the maximum
> @@ -2368,6 +2376,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> gfp_zone(sc->gfp_mask), sc->nodemask) {
> + unsigned long zone_reclaimed = 0;
> +
> if (!populated_zone(zone))
> continue;
> /*
> @@ -2414,10 +2424,15 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> &nr_soft_scanned);
> sc->nr_reclaimed += nr_soft_reclaimed;
> sc->nr_scanned += nr_soft_scanned;
> + zone_reclaimed += nr_soft_reclaimed;
> /* need some check for avoid more shrink_zone() */
> }
>
> - shrink_zone(zone, sc);
> + zone_reclaimed += shrink_zone(zone, sc);
> +
> + if (zone_reclaimed ||
> + (global_reclaim(sc) && zone_reclaimable(zone)))
> + all_unreclaimable = false;
> }
>
> /*
> @@ -2439,26 +2454,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> * promoted it to __GFP_HIGHMEM.
> */
> sc->gfp_mask = orig_mask;
> -}
>
> -/* All zones in zonelist are unreclaimable? */
> -static bool all_unreclaimable(struct zonelist *zonelist,
> - struct scan_control *sc)
> -{
> - struct zoneref *z;
> - struct zone *zone;
> -
> - for_each_zone_zonelist_nodemask(zone, z, zonelist,
> - gfp_zone(sc->gfp_mask), sc->nodemask) {
> - if (!populated_zone(zone))
> - continue;
> - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> - continue;
> - if (zone_reclaimable(zone))
> - return false;
> - }
> -
> - return true;
> + return !all_unreclaimable;
> }
>
> /*
> @@ -2482,6 +2479,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> {
> unsigned long total_scanned = 0;
> unsigned long writeback_threshold;
> + bool zones_reclaimable;
>
> delayacct_freepages_start();
>
> @@ -2492,7 +2490,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - shrink_zones(zonelist, sc);
> + zones_reclaimable = shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> @@ -2533,8 +2531,8 @@ out:
> if (sc->compaction_ready)
> return 1;
>
> - /* top priority shrink_zones still had more to do? don't OOM, then */
> - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
> + /* Any of the zones still reclaimable? Don't OOM. */
> + if (zones_reclaimable)
> return 1;
>
> return 0;
> --
> 2.0.0
>

--
Michal Hocko
SUSE Labs

2014-06-23 09:15:46

by Michal Hocko

Subject: Re: [patch 4/4] mm: vmscan: move swappiness out of scan_control

On Fri 20-06-14 12:33:50, Johannes Weiner wrote:
> Swappiness is determined for each scanned memcg individually in
> shrink_zone() and is not a parameter that applies throughout the
> reclaim scan. Move it out of struct scan_control to prevent
> accidental use of a stale value.

Yes, putting it into scan_control was a quick&dirty temporary
solution. I was thinking about something like lruvec_swappiness()
(lruvec->mem_cgroup_per_zone->mem_cgroup) and sticking it into
get_scan_count(), but what you have here is better because swappiness
has memcg scope rather than lruvec scope.
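
Roughly this, for the record (just a sketch of that alternative: the
lruvec -> mem_cgroup_per_zone step via container_of() is an assumption
about the memcontrol.c internals, and no such helper actually exists):

	static int lruvec_swappiness(struct lruvec *lruvec)
	{
		struct mem_cgroup_per_zone *mz =
			container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

		return mem_cgroup_swappiness(mz->memcg);
	}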

> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: Michal Hocko <[email protected]>

> ---
> mm/vmscan.c | 27 +++++++++++++--------------
> 1 file changed, 13 insertions(+), 14 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d0bc1a209746..757e2a8dbf58 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -89,9 +89,6 @@ struct scan_control {
> /* Scan (total_size >> priority) pages at once */
> int priority;
>
> - /* anon vs. file LRUs scanning "ratio" */
> - int swappiness;
> -
> /*
> * The memory cgroup that hit its limit and as a result is the
> * primary target of this reclaim invocation.
> @@ -1868,8 +1865,8 @@ enum scan_balance {
> * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
> * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
> */
> -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> - unsigned long *nr)
> +static void get_scan_count(struct lruvec *lruvec, int swappiness,
> + struct scan_control *sc, unsigned long *nr)
> {
> struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
> u64 fraction[2];
> @@ -1912,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> * using the memory controller's swap limit feature would be
> * too expensive.
> */
> - if (!global_reclaim(sc) && !sc->swappiness) {
> + if (!global_reclaim(sc) && !swappiness) {
> scan_balance = SCAN_FILE;
> goto out;
> }
> @@ -1922,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> * system is close to OOM, scan both anon and file equally
> * (unless the swappiness setting disagrees with swapping).
> */
> - if (!sc->priority && sc->swappiness) {
> + if (!sc->priority && swappiness) {
> scan_balance = SCAN_EQUAL;
> goto out;
> }
> @@ -1965,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
> * With swappiness at 100, anonymous and file have the same priority.
> * This scanning priority is essentially the inverse of IO cost.
> */
> - anon_prio = sc->swappiness;
> + anon_prio = swappiness;
> file_prio = 200 - anon_prio;
>
> /*
> @@ -2055,7 +2052,8 @@ out:
> /*
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> +static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
> + struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> unsigned long targets[NR_LRU_LISTS];
> @@ -2066,7 +2064,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
> struct blk_plug plug;
> bool scan_adjusted;
>
> - get_scan_count(lruvec, sc, nr);
> + get_scan_count(lruvec, swappiness, sc, nr);
>
> /* Record the original scan target for proportional adjustments later */
> memcpy(targets, nr, sizeof(nr));
> @@ -2263,11 +2261,12 @@ static unsigned long shrink_zone(struct zone *zone, struct scan_control *sc)
> memcg = mem_cgroup_iter(root, NULL, &reclaim);
> do {
> struct lruvec *lruvec;
> + int swappiness;
>
> lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + swappiness = mem_cgroup_swappiness(memcg);
>
> - sc->swappiness = mem_cgroup_swappiness(memcg);
> - shrink_lruvec(lruvec, sc);
> + shrink_lruvec(lruvec, swappiness, sc);
>
> /*
> * Direct reclaim and kswapd have to scan all memory
> @@ -2714,10 +2713,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> .may_swap = !noswap,
> .order = 0,
> .priority = 0,
> - .swappiness = mem_cgroup_swappiness(memcg),
> .target_mem_cgroup = memcg,
> };
> struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> + int swappiness = mem_cgroup_swappiness(memcg);
>
> sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
> (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
> @@ -2733,7 +2732,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> * will pick up pages from other mem cgroup's as well. We hack
> * the priority and make it zero.
> */
> - shrink_lruvec(lruvec, &sc);
> + shrink_lruvec(lruvec, swappiness, &sc);
>
> trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
>
> --
> 2.0.0
>

--
Michal Hocko
SUSE Labs

2014-06-23 12:56:09

by Mel Gorman

[permalink] [raw]
Subject: Re: [patch 1/4] mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable

On Fri, Jun 20, 2014 at 12:33:47PM -0400, Johannes Weiner wrote:
> shrink_zones() has a special branch to skip the all_unreclaimable()
> check during hibernation, because a frozen kswapd can't mark a zone
> unreclaimable.
>
> But ever since 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages()
> livelock"), determining a zone to be unreclaimable is done by directly
> looking at its scan history and no longer relies on kswapd setting the
> per-zone flag.
>
> Remove this branch and let shrink_zones() check the reclaimability of
> the target zones regardless of hibernation state.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: Mel Gorman <[email protected]>

--
Mel Gorman
SUSE Labs

2014-06-23 13:07:15

by Mel Gorman

[permalink] [raw]
Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On Fri, Jun 20, 2014 at 12:33:48PM -0400, Johannes Weiner wrote:
> Page reclaim for a higher-order page runs until compaction is ready,
> then aborts and signals this situation through the return value of
> shrink_zones(). This is an oddly specific signal to encode in the
> return value of shrink_zones(), though, and can be quite confusing.
>
> Introduce sc->compaction_ready and signal the compactability of the
> zones out-of-band to free up the return value of shrink_zones() for
> actual zone reclaimability.
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
> 1 file changed, 31 insertions(+), 36 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 19b5b8016209..ed1efb84c542 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -65,6 +65,9 @@ struct scan_control {
> /* Number of pages freed so far during a call to shrink_zones() */
> unsigned long nr_reclaimed;
>
> + /* One of the zones is ready for compaction */
> + int compaction_ready;
> +
> /* How many pages shrink_list() should reclaim */
> unsigned long nr_to_reclaim;
>

You are not the criminal here, but scan_control is larger than it
needs to be and the stack usage of reclaim has reared its head again.

Add a preparation patch that converts sc->may* and sc->hibernation_mode
to bool and moves them towards the end of the struct. Then add
compaction_ready as a bool.
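
Something along these lines, just to illustrate (sketch only; the field
set is taken from the current struct, the exact ordering is whatever
packs best):

	struct scan_control {
		unsigned long nr_to_reclaim;
		gfp_t gfp_mask;
		int order;
		nodemask_t *nodemask;
		struct mem_cgroup *target_mem_cgroup;
		int priority;

		/* flags grouped towards the end of the struct */
		bool may_writepage;
		bool may_unmap;
		bool may_swap;
		bool hibernation_mode;
		bool compaction_ready;

		/* incremented during the scan */
		unsigned long nr_scanned;
		unsigned long nr_reclaimed;
	};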

> @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> }
>
> /* Returns true if compaction should go ahead for a high-order request */
> -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> +static inline bool compaction_ready(struct zone *zone, int order)
> {

Why did you remove the use of sc->order? In this patch there is only one
caller of compaction_ready, and it looks like

if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
zonelist_zone_idx(z) <= requested_highidx &&
compaction_ready(zone, sc->order)) {

So it's unclear why you changed the signature.


> unsigned long balance_gap, watermark;
> bool watermark_ok;
>
> - /* Do not consider compaction for orders reclaim is meant to satisfy */
> - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
> - return false;
> -
> /*
> * Compaction takes time to run and there are potentially other
> * callers using the pages just freed. Continue reclaiming until
> @@ -2309,18 +2308,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> */
> balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
> zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
> - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
> + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
> watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
>
> /*
> * If compaction is deferred, reclaim up to a point where
> * compaction will have a chance of success when re-enabled
> */
> - if (compaction_deferred(zone, sc->order))
> + if (compaction_deferred(zone, order))
> return watermark_ok;
>
> /* If compaction is not ready to start, keep reclaiming */
> - if (!compaction_suitable(zone, sc->order))
> + if (!compaction_suitable(zone, order))
> return false;
>
> return watermark_ok;
> @@ -2341,20 +2340,14 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> - *
> - * This function returns true if a zone is being reclaimed for a costly
> - * high-order allocation and compaction is ready to begin. This indicates to
> - * the caller that it should consider retrying the allocation instead of
> - * further reclaim.
> */
> -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> {
> struct zoneref *z;
> struct zone *zone;
> unsigned long nr_soft_reclaimed;
> unsigned long nr_soft_scanned;
> unsigned long lru_pages = 0;
> - bool aborted_reclaim = false;
> struct reclaim_state *reclaim_state = current->reclaim_state;
> gfp_t orig_mask;
> struct shrink_control shrink = {
> @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> if (sc->priority != DEF_PRIORITY &&
> !zone_reclaimable(zone))
> continue; /* Let kswapd poll it */
> - if (IS_ENABLED(CONFIG_COMPACTION)) {
> - /*
> - * If we already have plenty of memory free for
> - * compaction in this zone, don't free any more.
> - * Even though compaction is invoked for any
> - * non-zero order, only frequent costly order
> - * reclamation is disruptive enough to become a
> - * noticeable problem, like transparent huge
> - * page allocations.
> - */
> - if ((zonelist_zone_idx(z) <= requested_highidx)
> - && compaction_ready(zone, sc)) {
> - aborted_reclaim = true;
> - continue;
> - }
> +
> + /*
> + * If we already have plenty of memory free
> + * for compaction in this zone, don't free any
> + * more. Even though compaction is invoked
> + * for any non-zero order, only frequent
> + * costly order reclamation is disruptive
> + * enough to become a noticeable problem, like
> + * transparent huge page allocations.
> + */
> + if (IS_ENABLED(CONFIG_COMPACTION) &&
> + sc->order > PAGE_ALLOC_COSTLY_ORDER &&
> + zonelist_zone_idx(z) <= requested_highidx &&
> + compaction_ready(zone, sc->order)) {
> + sc->compaction_ready = true;
> + continue;
> }
> +
> /*
> * This steals pages from memory cgroups over softlimit
> * and returns the number of reclaimed pages and
> @@ -2444,8 +2439,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> * promoted it to __GFP_HIGHMEM.
> */
> sc->gfp_mask = orig_mask;
> -
> - return aborted_reclaim;
> }
>
> /* All zones in zonelist are unreclaimable? */
> @@ -2489,7 +2482,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> {
> unsigned long total_scanned = 0;
> unsigned long writeback_threshold;
> - bool aborted_reclaim;
>
> delayacct_freepages_start();
>
> @@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - aborted_reclaim = shrink_zones(zonelist, sc);
> + shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> goto out;
>
> + if (sc->compaction_ready)
> + goto out;
> +

break?

Convert the other one to break as well; the out label seems unnecessary
in this context.
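
i.e. the main loop would end up shaped roughly like this (sketch of the
loop structure only, not the full function):

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		shrink_zones(zonelist, sc);

		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			break;

		if (sc->compaction_ready)
			break;

		/* laptop-mode writeback throttling stays as it is */
	} while (--sc->priority >= 0);

	delayacct_freepages_end();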

> /*
> * If we're getting trouble reclaiming, start doing
> * writepage even in laptop mode.
> @@ -2526,7 +2521,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> WB_REASON_TRY_TO_FREE_PAGES);
> sc->may_writepage = 1;
> }
> - } while (--sc->priority >= 0 && !aborted_reclaim);
> + } while (--sc->priority >= 0);
>
> out:
> delayacct_freepages_end();
> @@ -2535,7 +2530,7 @@ out:
> return sc->nr_reclaimed;
>
> /* Aborted reclaim to try compaction? don't OOM, then */
> - if (aborted_reclaim)
> + if (sc->compaction_ready)
> return 1;
>
> /* top priority shrink_zones still had more to do? don't OOM, then */
> --
> 2.0.0
>

--
Mel Gorman
SUSE Labs

2014-06-23 13:32:28

by Mel Gorman

[permalink] [raw]
Subject: Re: [patch 3/4] mm: vmscan: remove all_unreclaimable()

On Fri, Jun 20, 2014 at 12:33:49PM -0400, Johannes Weiner wrote:
> Direct reclaim currently calls shrink_zones() to reclaim all members
> of a zonelist, and if that wasn't successful it does another pass
> through the same zonelist to check overall reclaimability.
>
> Just check reclaimability in shrink_zones() directly and propagate the
> result through the return value. Then remove all_unreclaimable().
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> mm/vmscan.c | 48 +++++++++++++++++++++++-------------------------
> 1 file changed, 23 insertions(+), 25 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ed1efb84c542..d0bc1a209746 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2244,9 +2244,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
> }
> }
>
> -static void shrink_zone(struct zone *zone, struct scan_control *sc)
> +static unsigned long shrink_zone(struct zone *zone, struct scan_control *sc)
> {
> unsigned long nr_reclaimed, nr_scanned;
> + unsigned long zone_reclaimed = 0;
>
> do {
> struct mem_cgroup *root = sc->target_mem_cgroup;
> @@ -2290,8 +2291,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> sc->nr_scanned - nr_scanned,
> sc->nr_reclaimed - nr_reclaimed);
>
> + zone_reclaimed += sc->nr_reclaimed - nr_reclaimed;
> +
> } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
> sc->nr_scanned - nr_scanned, sc));
> +
> + return zone_reclaimed;
> }

You do not actually need a counter here because all that matters is that
a page got reclaimed. It could just as easily have been

bool zone_reclaimable = false;

...

if (sc->nr_reclaimed - nr_reclaimed)
zone_reclaimable = true;

...

return zone_reclaimable

so that zone[s]_reclaimable is always a boolean and not sometimes a boolean
and sometimes a counter.


>
> /* Returns true if compaction should go ahead for a high-order request */
> @@ -2340,8 +2345,10 @@ static inline bool compaction_ready(struct zone *zone, int order)
> *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> + *
> + * Returns whether the zones overall are reclaimable or not.
> */

Returns true if a zone was reclaimable

> -static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> {
> struct zoneref *z;
> struct zone *zone;
> @@ -2354,6 +2361,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> .gfp_mask = sc->gfp_mask,
> };
> enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
> + bool all_unreclaimable = true;
>
> /*
> * If the number of buffer_heads in the machine exceeds the maximum
> @@ -2368,6 +2376,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> gfp_zone(sc->gfp_mask), sc->nodemask) {
> + unsigned long zone_reclaimed = 0;
> +
> if (!populated_zone(zone))
> continue;
> /*
> @@ -2414,10 +2424,15 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> &nr_soft_scanned);
> sc->nr_reclaimed += nr_soft_reclaimed;
> sc->nr_scanned += nr_soft_scanned;
> + zone_reclaimed += nr_soft_reclaimed;
> /* need some check for avoid more shrink_zone() */
> }
>
> - shrink_zone(zone, sc);
> + zone_reclaimed += shrink_zone(zone, sc);
> +
> + if (zone_reclaimed ||
> + (global_reclaim(sc) && zone_reclaimable(zone)))
> + all_unreclaimable = false;
> }
>

This is where you don't need the counter as such. It could just as
easily have been

bool reclaimable = false;
....
if (shrink_zone(zone, sc))
reclaimable = true;

if (!reclaimable && global_reclaim(sc) && zone_reclaimable(zone))
reclaimable = true;

return reclaimable;

It doesn't matter as such, it's just that zone_reclaimed is implemented
as a counter but not used as one.
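
Putting the two fragments together, the reclaimability tracking in
shrink_zones() would look roughly like this (sketch only; the soft-limit
reclaim contribution is folded into the same flag):

	bool reclaimable = false;
	...
	if (nr_soft_reclaimed)
		reclaimable = true;

	if (shrink_zone(zone, sc))
		reclaimable = true;

	if (!reclaimable && global_reclaim(sc) && zone_reclaimable(zone))
		reclaimable = true;
	...
	return reclaimable;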

> /*
> @@ -2439,26 +2454,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> * promoted it to __GFP_HIGHMEM.
> */
> sc->gfp_mask = orig_mask;
> -}
>
> -/* All zones in zonelist are unreclaimable? */
> -static bool all_unreclaimable(struct zonelist *zonelist,
> - struct scan_control *sc)
> -{
> - struct zoneref *z;
> - struct zone *zone;
> -
> - for_each_zone_zonelist_nodemask(zone, z, zonelist,
> - gfp_zone(sc->gfp_mask), sc->nodemask) {
> - if (!populated_zone(zone))
> - continue;
> - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> - continue;
> - if (zone_reclaimable(zone))
> - return false;
> - }
> -
> - return true;
> + return !all_unreclaimable;
> }
>
> /*
> @@ -2482,6 +2479,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> {
> unsigned long total_scanned = 0;
> unsigned long writeback_threshold;
> + bool zones_reclaimable;
>
> delayacct_freepages_start();
>
> @@ -2492,7 +2490,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> sc->priority);
> sc->nr_scanned = 0;
> - shrink_zones(zonelist, sc);
> + zones_reclaimable = shrink_zones(zonelist, sc);
>
> total_scanned += sc->nr_scanned;
> if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> @@ -2533,8 +2531,8 @@ out:
> if (sc->compaction_ready)
> return 1;
>
> - /* top priority shrink_zones still had more to do? don't OOM, then */
> - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
> + /* Any of the zones still reclaimable? Don't OOM. */
> + if (zones_reclaimable)
> return 1;
>
> return 0;
> --
> 2.0.0
>

--
Mel Gorman
SUSE Labs

2014-06-23 16:03:08

by Motohiro Kosaki

[permalink] [raw]
Subject: RE: [patch 1/4] mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable



> -----Original Message-----
> From: Minchan Kim [mailto:[email protected]]
> Sent: Monday, June 23, 2014 2:16 AM
> To: Johannes Weiner
> Cc: Andrew Morton; Mel Gorman; Rik van Riel; Michal Hocko; [email protected]; [email protected]; Motohiro Kosaki JP
> Subject: Re: [patch 1/4] mm: vmscan: remove remains of kswapd-managed zone->all_unreclaimable
>
> On Fri, Jun 20, 2014 at 12:33:47PM -0400, Johannes Weiner wrote:
> > shrink_zones() has a special branch to skip the all_unreclaimable()
> > check during hibernation, because a frozen kswapd can't mark a zone
> > unreclaimable.
> >
> > But ever since 6e543d5780e3 ("mm: vmscan: fix do_try_to_free_pages()
> > livelock"), determining a zone to be unreclaimable is done by directly
> > looking at its scan history and no longer relies on kswapd setting the
> > per-zone flag.
> >
> > Remove this branch and let shrink_zones() check the reclaimability of
> > the target zones regardless of hibernation state.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
> Acked-by: Minchan Kim <[email protected]>
>
> It would not be bad to Cc KOSAKI, who was involved in the all_unreclaimable series several times with me.

Looks good to me.

KOSAKI Motohiro <[email protected]>





2014-06-23 17:21:08

by Johannes Weiner

[permalink] [raw]
Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

Hi Mel,

On Mon, Jun 23, 2014 at 02:07:05PM +0100, Mel Gorman wrote:
> On Fri, Jun 20, 2014 at 12:33:48PM -0400, Johannes Weiner wrote:
> > Page reclaim for a higher-order page runs until compaction is ready,
> > then aborts and signals this situation through the return value of
> > shrink_zones(). This is an oddly specific signal to encode in the
> > return value of shrink_zones(), though, and can be quite confusing.
> >
> > Introduce sc->compaction_ready and signal the compactability of the
> > zones out-of-band to free up the return value of shrink_zones() for
> > actual zone reclaimability.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
> > ---
> > mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
> > 1 file changed, 31 insertions(+), 36 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 19b5b8016209..ed1efb84c542 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -65,6 +65,9 @@ struct scan_control {
> > /* Number of pages freed so far during a call to shrink_zones() */
> > unsigned long nr_reclaimed;
> >
> > + /* One of the zones is ready for compaction */
> > + int compaction_ready;
> > +
> > /* How many pages shrink_list() should reclaim */
> > unsigned long nr_to_reclaim;
> >
>
> You are not the criminal here but scan_control is larger than it needs
> to be and the stack usage of reclaim has reared its head again.
>
> Add a preparation patch that convert sc->may* and sc->hibernation_mode
> to bool and moves them towards the end of the struct. Then add
> compaction_ready as a bool.

Good idea, I'll do that.

> > @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> > }
> >
> > /* Returns true if compaction should go ahead for a high-order request */
> > -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> > +static inline bool compaction_ready(struct zone *zone, int order)
> >
> > {
>
> Why did you remove the use of sc->order? In this patch there is only one
> called of compaction_ready and it looks like
>
> if (IS_ENABLED(CONFIG_COMPACTION) &&
> sc->order > PAGE_ALLOC_COSTLY_ORDER &&
> zonelist_zone_idx(z) <= requested_highidx &&
> compaction_ready(zone, sc->order)) {
>
> So it's unclear why you changed the signature.

Everything else in compaction_ready() is about internal compaction
requirements, like checking for free pages and deferred compaction,
whereas this order check is more of a reclaim policy rule according to
the comment in the caller:

...
* Even though compaction is invoked for any
* non-zero order, only frequent costly order
* reclamation is disruptive enough to become a
* noticeable problem, like transparent huge
* page allocations.
*/

But it's an unrelated in-the-area-anyway change, I can split it out -
or drop it entirely - if you prefer.

> > @@ -2500,12 +2492,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> > vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
> > sc->priority);
> > sc->nr_scanned = 0;
> > - aborted_reclaim = shrink_zones(zonelist, sc);
> > + shrink_zones(zonelist, sc);
> >
> > total_scanned += sc->nr_scanned;
> > if (sc->nr_reclaimed >= sc->nr_to_reclaim)
> > goto out;
> >
> > + if (sc->compaction_ready)
> > + goto out;
> > +
>
> break?
>
> Convert the other one to break as well. out label seems unnecessary in
> this context.

Makes sense, I'll include this in v2.

Thanks!

2014-06-23 18:20:33

by Johannes Weiner

[permalink] [raw]
Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On Mon, Jun 23, 2014 at 03:36:37PM +0900, Minchan Kim wrote:
> On Fri, Jun 20, 2014 at 12:33:48PM -0400, Johannes Weiner wrote:
> > Page reclaim for a higher-order page runs until compaction is ready,
> > then aborts and signals this situation through the return value of
> > shrink_zones(). This is an oddly specific signal to encode in the
> > return value of shrink_zones(), though, and can be quite confusing.
> >
> > Introduce sc->compaction_ready and signal the compactability of the
> > zones out-of-band to free up the return value of shrink_zones() for
> > actual zone reclaimability.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
> Acked-by: Minchan Kim <[email protected]>

Thanks!

> > @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> > }
> >
> > /* Returns true if compaction should go ahead for a high-order request */
> > -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> > +static inline bool compaction_ready(struct zone *zone, int order)
> > {
> > unsigned long balance_gap, watermark;
> > bool watermark_ok;
> >
> > - /* Do not consider compaction for orders reclaim is meant to satisfy */
> > - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
> > - return false;
> > -
> > /*
> > * Compaction takes time to run and there are potentially other
> > * callers using the pages just freed. Continue reclaiming until

> > @@ -2391,22 +2384,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
> > if (sc->priority != DEF_PRIORITY &&
> > !zone_reclaimable(zone))
> > continue; /* Let kswapd poll it */
> > - if (IS_ENABLED(CONFIG_COMPACTION)) {
> > - /*
> > - * If we already have plenty of memory free for
> > - * compaction in this zone, don't free any more.
> > - * Even though compaction is invoked for any
> > - * non-zero order, only frequent costly order
> > - * reclamation is disruptive enough to become a
> > - * noticeable problem, like transparent huge
> > - * page allocations.
> > - */
> > - if ((zonelist_zone_idx(z) <= requested_highidx)
> > - && compaction_ready(zone, sc)) {
> > - aborted_reclaim = true;
> > - continue;
> > - }
> > +
> > + /*
> > + * If we already have plenty of memory free
> > + * for compaction in this zone, don't free any
> > + * more. Even though compaction is invoked
> > + * for any non-zero order, only frequent
> > + * costly order reclamation is disruptive
> > + * enough to become a noticeable problem, like
> > + * transparent huge page allocations.
> > + */
> > + if (IS_ENABLED(CONFIG_COMPACTION) &&
> > + sc->order > PAGE_ALLOC_COSTLY_ORDER &&
>
> You are deleting the sc->order <= PAGE_ALLOC_COSTLY_ORDER comment which
> was in compaction_ready. At least, that comment was useful for me to
> guess the intention. So unless you have a strong reason to remove it,
> I'd like to retain it.

There are two separate explanations for aborting reclaim early for
costly orders:

1. /* Do not consider compaction for orders reclaim is meant to satisfy */

2. /*
* Even though compaction is invoked
* for any non-zero order, only frequent
* costly order reclamation is disruptive
* enough to become a noticeable problem, like
* transparent huge page allocations.
*/

I thought it made sense to pick one and go with it, so I went with
2. and moved the order check out there as well.
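
(For concreteness: with PAGE_ALLOC_COSTLY_ORDER at 3, the early abort
only applies to order-4 and larger requests, i.e. 64k and up with 4k
pages, most prominently order-9 THP allocations.)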

2014-06-25 09:55:33

by Mel Gorman

[permalink] [raw]
Subject: Re: [patch 2/4] mm: vmscan: rework compaction-ready signaling in direct reclaim

On Mon, Jun 23, 2014 at 01:20:56PM -0400, Johannes Weiner wrote:
> Hi Mel,
>
> On Mon, Jun 23, 2014 at 02:07:05PM +0100, Mel Gorman wrote:
> > On Fri, Jun 20, 2014 at 12:33:48PM -0400, Johannes Weiner wrote:
> > > Page reclaim for a higher-order page runs until compaction is ready,
> > > then aborts and signals this situation through the return value of
> > > shrink_zones(). This is an oddly specific signal to encode in the
> > > return value of shrink_zones(), though, and can be quite confusing.
> > >
> > > Introduce sc->compaction_ready and signal the compactability of the
> > > zones out-of-band to free up the return value of shrink_zones() for
> > > actual zone reclaimability.
> > >
> > > Signed-off-by: Johannes Weiner <[email protected]>
> > > ---
> > > mm/vmscan.c | 67 ++++++++++++++++++++++++++++---------------------------------
> > > 1 file changed, 31 insertions(+), 36 deletions(-)
> > >
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 19b5b8016209..ed1efb84c542 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -65,6 +65,9 @@ struct scan_control {
> > > /* Number of pages freed so far during a call to shrink_zones() */
> > > unsigned long nr_reclaimed;
> > >
> > > + /* One of the zones is ready for compaction */
> > > + int compaction_ready;
> > > +
> > > /* How many pages shrink_list() should reclaim */
> > > unsigned long nr_to_reclaim;
> > >
> >
> > You are not the criminal here but scan_control is larger than it needs
> > to be and the stack usage of reclaim has reared its head again.
> >
> > Add a preparation patch that convert sc->may* and sc->hibernation_mode
> > to bool and moves them towards the end of the struct. Then add
> > compaction_ready as a bool.
>
> Good idea, I'll do that.
>

Thanks.

> > > @@ -2292,15 +2295,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
> > > }
> > >
> > > /* Returns true if compaction should go ahead for a high-order request */
> > > -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
> > > +static inline bool compaction_ready(struct zone *zone, int order)
> > >
> > > {
> >
> > Why did you remove the use of sc->order? In this patch there is only one
> > called of compaction_ready and it looks like
> >
> > if (IS_ENABLED(CONFIG_COMPACTION) &&
> > sc->order > PAGE_ALLOC_COSTLY_ORDER &&
> > zonelist_zone_idx(z) <= requested_highidx &&
> > compaction_ready(zone, sc->order)) {
> >
> > So it's unclear why you changed the signature.
>
> Everything else in compaction_ready() is about internal compaction
> requirements, like checking for free pages and deferred compaction,
> whereas this order check is more of a reclaim policy rule according to
> the comment in the caller:
>
> ...
> * Even though compaction is invoked for any
> * non-zero order, only frequent costly order
> * reclamation is disruptive enough to become a
> * noticeable problem, like transparent huge
> * page allocations.
> */
>
> But it's an unrelated in-the-area-anyway change, I can split it out -
> or drop it entirely - if you prefer.
>

It's ok as-is. It just seemed unrelated and seemed to do nothing. I was
wondering whether it was a rebasing artifact and some other change that
required it got lost along the way by accident.

--
Mel Gorman
SUSE Labs