Memcg reclaim throttles on congestion if no reclaim progress is made.
This makes little sense, it might be due to writeback or a host of
other factors.
For !memcg reclaim, it's messy. Direct reclaim primarily is throttled
in the page allocator if it is failing to make progress. Kswapd
throttles if too many pages are under writeback and marked for
immediate reclaim.
This patch explicitly throttles if reclaim is failing to make progress.
Signed-off-by: Mel Gorman <[email protected]>
---
include/linux/mmzone.h | 1 +
include/trace/events/vmscan.h | 4 +++-
mm/memcontrol.c | 10 +--------
mm/vmscan.c | 38 +++++++++++++++++++++++++++++++++++
4 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ca65d6a64bdd..7c08cc91d526 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -276,6 +276,7 @@ enum lru_list {
enum vmscan_throttle_state {
VMSCAN_THROTTLE_WRITEBACK,
VMSCAN_THROTTLE_ISOLATED,
+ VMSCAN_THROTTLE_NOPROGRESS,
NR_VMSCAN_THROTTLE,
};
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index d4905bd9e9c4..f25a6149d3ba 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -29,11 +29,13 @@
#define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK)
#define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED)
+#define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS)
#define show_throttle_flags(flags) \
(flags) ? __print_flags(flags, "|", \
{_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \
- {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"} \
+ {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \
+ {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"} \
) : "VMSCAN_THROTTLE_NONE"
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b762215d73eb..8479919a633c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3470,19 +3470,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
/* try to free all pages in this cgroup */
while (nr_retries && page_counter_read(&memcg->memory)) {
- int progress;
-
if (signal_pending(current))
return -EINTR;
- progress = try_to_free_mem_cgroup_pages(memcg, 1,
- GFP_KERNEL, true);
- if (!progress) {
+ if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
nr_retries--;
- /* maybe some writeback is necessary */
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
}
return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb81dcac15b2..18b9826953a0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3307,6 +3307,33 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
+static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+ /* If reclaim is making progress, wake any throttled tasks. */
+ if (sc->nr_reclaimed) {
+ wait_queue_head_t *wqh;
+
+ wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
+ if (waitqueue_active(wqh))
+ wake_up_interruptible_all(wqh);
+
+ return;
+ }
+
+ /*
+ * Do not throttle kswapd on NOPROGRESS as it will throttle on
+ * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
+ * writeback and marked for immediate reclaim at the tail of
+ * the LRU.
+ */
+ if (current_is_kswapd())
+ return;
+
+ /* Throttle if making no progress at high prioities. */
+ if (sc->priority < DEF_PRIORITY - 2)
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@@ -3391,6 +3418,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
last_pgdat = zone->zone_pgdat;
shrink_node(zone->zone_pgdat, sc);
+ consider_reclaim_throttle(zone->zone_pgdat, sc);
}
/*
@@ -3765,6 +3793,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
+ if (!nr_reclaimed) {
+ struct zoneref *z;
+ pg_data_t *pgdat;
+
+ z = first_zones_zonelist(zonelist, sc.reclaim_idx, sc.nodemask);
+ pgdat = zonelist_zone(z)->zone_pgdat;
+
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
+ }
+
return nr_reclaimed;
}
#endif
--
2.31.1
On Mon, 20 Sep 2021, Mel Gorman wrote:
> +
> + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
We always seem to pass "HZ/10" to reclaim_throttle(). Should we just
hard-code that in the one place inside reclaim_throttle() itself?
NeilBrown
On Tue, Sep 21, 2021 at 09:31:30AM +1000, NeilBrown wrote:
> On Mon, 20 Sep 2021, Mel Gorman wrote:
> > +
> > + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
>
> We always seem to pass "HZ/10" to reclaim_throttle(). Should we just
> hard-code that in the one place inside reclaim_throttle() itself?
>
do_writepages passes in HZ/50. I'm not sure if these values even have
any special meaning, I think it's more likely they were pulled out of
the air based on the speed of some disk in the past and then copied.
It's another reason why I want the wakeups to be based on events within
the mm as much as possible.
--
Mel Gorman
SUSE Labs
On Tue, 21 Sep 2021, Mel Gorman wrote:
> On Tue, Sep 21, 2021 at 09:31:30AM +1000, NeilBrown wrote:
> > On Mon, 20 Sep 2021, Mel Gorman wrote:
> > > +
> > > + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
> >
> > We always seem to pass "HZ/10" to reclaim_throttle(). Should we just
> > hard-code that in the one place inside reclaim_throttle() itself?
> >
>
> do_writepages passes in HZ/50. I'm not sure if these values even have
> any special meaning, I think it's more likely they were pulled out of
> the air based on the speed of some disk in the past and then copied.
> It's another reason why I want the wakeups to be based on events within
> the mm as much as possible.
Yes, I saw the HZ/50 shortly after writing that email :-)
I agree with your guess for the source of these numbers. I still think
we should pull them all from the same piece of air.
Hopefully, once these changes are properly understood and the events
reliably come as expected, we can make it quite large (HZ?) with minimal
cost.
Thanks,
NeilBrown
>
> --
> Mel Gorman
> SUSE Labs
>
>
On Wed, Sep 22, 2021 at 07:46:58AM +1000, NeilBrown wrote:
> On Tue, 21 Sep 2021, Mel Gorman wrote:
> > On Tue, Sep 21, 2021 at 09:31:30AM +1000, NeilBrown wrote:
> > > On Mon, 20 Sep 2021, Mel Gorman wrote:
> > > > +
> > > > + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
> > >
> > > We always seem to pass "HZ/10" to reclaim_throttle(). Should we just
> > > hard-code that in the one place inside reclaim_throttle() itself?
> > >
> >
> > do_writepages passes in HZ/50. I'm not sure if these values even have
> > any special meaning, I think it's more likely they were pulled out of
> > the air based on the speed of some disk in the past and then copied.
> > It's another reason why I want the wakeups to be based on events within
> > the mm as much as possible.
>
> Yes, I saw the HZ/50 shortly after writing that email :-)
> I agree with your guess for the source of these numbers. I still think
> we should pull them all from the same piece of air.
> Hopefully, once these changes are properly understood and the events
> reliably come as expected, we can make it quite large (HZ?) with minimal
> cost.
>
I'd prefer to do it as a separate patch. At some point congestion_wait
worked and the original timeouts may have been selected based on testing
(I severely doubt it but I'm trying to be optimistic). However, we can
at least centralise the decision based on "reason" with this
---8<---
From 11e5197c0c569e89145475afd511efe3ce61711c Mon Sep 17 00:00:00 2001
From: Mel Gorman <[email protected]>
Date: Wed, 22 Sep 2021 10:16:33 +0100
Subject: [PATCH] mm/vmscan: Centralise timeout values for reclaim_throttle
Neil Brown raised concerns about callers of reclaim_throttle specifying
a timeout value. The original timeout values to congestion_wait() were
probably pulled out of thin air or copy&pasted from somewhere else.
This patch centralises the timeout values and selects a timeout based
on the reason for reclaim throttling. These figures are also pulled
out of the same thin air but better values may be derived
Running a workload that is throttling for inappropriate periods
and tracing mm_vmscan_throttled can be used to pick a more appropriate
value. Excessive throttling would pick a lower timeout where as
excessive CPU usage in reclaim context would select a larger timeout.
Ideally a large value would always be used and the wakeups would
occur before a timeout but that requires careful testing.
Signed-off-by: Mel Gorman <[email protected]>
---
mm/compaction.c | 2 +-
mm/internal.h | 3 +--
mm/page-writeback.c | 2 +-
mm/vmscan.c | 39 +++++++++++++++++++++++++++++++--------
4 files changed, 34 insertions(+), 12 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 7359093d8ac0..151b04c4dab3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -828,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (cc->mode == MIGRATE_ASYNC)
return -EAGAIN;
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
if (fatal_signal_pending(current))
return -EINTR;
diff --git a/mm/internal.h b/mm/internal.h
index 06d0c376efcd..f8d203cfd4e1 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -129,8 +129,7 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
- long timeout);
+extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
/*
* in mm/rmap.c:
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f34f54fcd5b4..7d08706c541a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2374,7 +2374,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
* guess as any.
*/
reclaim_throttle(NODE_DATA(numa_node_id()),
- VMSCAN_THROTTLE_WRITEBACK, HZ/50);
+ VMSCAN_THROTTLE_WRITEBACK);
}
/*
* Usually few pages are written by now from those we've just submitted
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0012f9536e1..36b21549a3a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1006,14 +1006,37 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
-void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason,
- long timeout)
+void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
{
wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
unsigned long start = jiffies;
- long ret;
+ long timeout, ret;
DEFINE_WAIT(wait);
+ /*
+ * These figures are pulled out of thin air.
+ * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
+ * parallel reclaimers which is a short-lived event so the timeout is
+ * short. Failing to make progress or waiting on writeback are
+ * potentially long-lived events so use a longer timeout. This is shaky
+ * logic as a failure to make progress could be due to anything from
+ * writeback to a slow device to excessive references pages at the tail
+ * of the inactive LRU.
+ */
+ switch(reason) {
+ case VMSCAN_THROTTLE_NOPROGRESS:
+ case VMSCAN_THROTTLE_WRITEBACK:
+ timeout = HZ/10;
+ break;
+ case VMSCAN_THROTTLE_ISOLATED:
+ timeout = HZ/50;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ timeout = HZ;
+ break;
+ }
+
atomic_inc(&pgdat->nr_reclaim_throttled);
WRITE_ONCE(pgdat->nr_reclaim_start,
node_page_state(pgdat, NR_THROTTLED_WRITTEN));
@@ -2298,7 +2321,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
/* wait a bit for the reclaimer. */
stalled = true;
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -3230,7 +3253,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
* until some pages complete writeback.
*/
if (sc->nr.immediate)
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/*
@@ -3254,7 +3277,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
@@ -3326,7 +3349,7 @@ static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
/* Throttle if making no progress at high prioities. */
if (sc->priority < DEF_PRIORITY - 2)
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
/*
@@ -3795,7 +3818,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
z = first_zones_zonelist(zonelist, sc.reclaim_idx, sc.nodemask);
pgdat = zonelist_zone(z)->zone_pgdat;
- reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
return nr_reclaimed;