Here are 8 patches that clean up the reclaim code's interaction with
cgroups a bit. They're not supposed to change any behavior, just make
the implementation easier to understand and work with.
My apologies in advance for 5/8, which changes the indentation of
shrink_node() and so results in a terrible diff. The rest of the
series should be straightforward.
include/linux/memcontrol.h | 32 ++--
include/linux/mmzone.h | 26 +--
mm/memcontrol.c | 12 +-
mm/page_alloc.c | 2 +-
mm/slab.h | 4 +-
mm/vmscan.c | 386 +++++++++++++++++++------------------------
mm/workingset.c | 8 +-
7 files changed, 216 insertions(+), 254 deletions(-)
This function is getting long and unwieldy, split out the memcg bits.
The updated shrink_node() handles the generic (node) reclaim aspects:
- global vmpressure notifications
- writeback and congestion throttling
- reclaim/compaction management
- kswapd giving up on unreclaimable nodes
It then calls a new shrink_node_memcgs() which handles cgroup specifics:
- the cgroup tree traversal
- memory.low considerations
- per-cgroup slab shrinking callbacks
- per-cgroup vmpressure notifications
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 28 ++++++++++++++++++----------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index db073b40c432..65baa89740dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2722,18 +2722,10 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
(memcg && memcg_congested(pgdat, memcg));
}
-static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
- struct reclaim_state *reclaim_state = current->reclaim_state;
struct mem_cgroup *root = sc->target_mem_cgroup;
- unsigned long nr_reclaimed, nr_scanned;
- bool reclaimable = false;
struct mem_cgroup *memcg;
-again:
- memset(&sc->nr, 0, sizeof(sc->nr));
-
- nr_reclaimed = sc->nr_reclaimed;
- nr_scanned = sc->nr_scanned;
memcg = mem_cgroup_iter(root, NULL, NULL);
do {
@@ -2786,6 +2778,22 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_reclaimed - reclaimed);
} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+}
+
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+{
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ unsigned long nr_reclaimed, nr_scanned;
+ bool reclaimable = false;
+
+again:
+ memset(&sc->nr, 0, sizeof(sc->nr));
+
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
+
+ shrink_node_memcgs(pgdat, sc);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2793,7 +2801,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
/* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ vmpressure(sc->gfp_mask, root, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
--
2.23.0
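A toy user-space model of the split described above, for readers without the
kernel tree at hand: everything here (toy_scan_control, toy_shrink_node(),
toy_shrink_node_memcgs(), the fixed loop over NR_TOY_MEMCGS) is invented for
illustration and only mirrors the shape of the refactor, not the actual
kernel code.

#include <stdio.h>

/* toy stand-in for struct scan_control */
struct toy_scan_control {
	unsigned long nr_scanned;
	unsigned long nr_reclaimed;
};

#define NR_TOY_MEMCGS 3

/* cgroup part: walk the (toy) cgroup tree and reclaim from each memcg */
static void toy_shrink_node_memcgs(struct toy_scan_control *sc)
{
	int memcg;

	for (memcg = 0; memcg < NR_TOY_MEMCGS; memcg++) {
		/* memory.low checks, LRU and slab shrinking and
		 * per-cgroup vmpressure would happen here */
		sc->nr_scanned += 10;
		sc->nr_reclaimed += 4;
	}
}

/* node part: bookkeeping around the cgroup walk */
static int toy_shrink_node(struct toy_scan_control *sc)
{
	unsigned long nr_scanned = sc->nr_scanned;
	unsigned long nr_reclaimed = sc->nr_reclaimed;

	toy_shrink_node_memcgs(sc);

	/* node-wide vmpressure, writeback/congestion throttling and
	 * the decision whether to retry are based on these deltas */
	printf("scanned %lu, reclaimed %lu\n",
	       sc->nr_scanned - nr_scanned,
	       sc->nr_reclaimed - nr_reclaimed);

	return sc->nr_reclaimed > nr_reclaimed;
}

int main(void)
{
	struct toy_scan_control sc = { 0, 0 };

	return toy_shrink_node(&sc) ? 0 : 1;
}

The point of the split is the same in both places: shrink_node() keeps the
node-wide bookkeeping and the retry decision, while the cgroup walk and the
per-cgroup work live in their own function.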
This function currently takes the node or lruvec size and subtracts
the zones that are excluded by the classzone index of the
allocation. It uses four different types of counters to do this.
Just add up the eligible zones.
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 21 +++++----------------
1 file changed, 5 insertions(+), 16 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1154b3a2b637..57f533b808f2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -351,32 +351,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
*/
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
- unsigned long lru_size = 0;
+ unsigned long size = 0;
int zid;
- if (!mem_cgroup_disabled()) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
- } else
- lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
-
- for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid <= zone_idx; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
- unsigned long size;
if (!managed_zone(zone))
continue;
if (!mem_cgroup_disabled())
- size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
- size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
- NR_ZONE_LRU_BASE + lru);
- lru_size -= min(size, lru_size);
+ size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
}
-
- return lru_size;
-
+ return size;
}
/*
--
2.23.0
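As a stand-alone illustration of the before/after arithmetic in
lruvec_lru_size(), here is a minimal user-space C program. The zone counts
are made up and the memcg/node counter split is omitted, so this is only a
model of the calculation, not the kernel function.

#include <stdio.h>

#define MAX_NR_ZONES 4

/* made-up per-zone LRU page counts for one imaginary node */
static const unsigned long zone_lru_pages[MAX_NR_ZONES] = { 100, 200, 300, 400 };

/* old shape: take the full size, then subtract the ineligible zones */
static unsigned long lru_size_old(int zone_idx)
{
	unsigned long lru_size = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		lru_size += zone_lru_pages[zid];

	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		unsigned long size = zone_lru_pages[zid];

		/* clamp like the kernel's min(size, lru_size) did */
		lru_size -= size < lru_size ? size : lru_size;
	}
	return lru_size;
}

/* new shape: just add up the eligible zones */
static unsigned long lru_size_new(int zone_idx)
{
	unsigned long size = 0;
	int zid;

	for (zid = 0; zid <= zone_idx; zid++)
		size += zone_lru_pages[zid];
	return size;
}

int main(void)
{
	int zone_idx;

	for (zone_idx = 0; zone_idx < MAX_NR_ZONES; zone_idx++)
		printf("classzone %d: old=%lu new=%lu\n", zone_idx,
		       lru_size_old(zone_idx), lru_size_new(zone_idx));
	return 0;
}

Both shapes return the same value for every classzone index when the counters
are consistent; the min() clamp in the old kernel code only guarded against
the node and per-zone counters being transiently out of sync.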
On Tue, Oct 22, 2019 at 10:47:56AM -0400, Johannes Weiner wrote:
> This function currently takes the node or lruvec size and subtracts
> the zones that are excluded by the classzone index of the
> allocation. It uses four different types of counters to do this.
>
> Just add up the eligible zones.
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> mm/vmscan.c | 21 +++++----------------
> 1 file changed, 5 insertions(+), 16 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1154b3a2b637..57f533b808f2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -351,32 +351,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
> */
> unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
> {
> - unsigned long lru_size = 0;
> + unsigned long size = 0;
> int zid;
>
> - if (!mem_cgroup_disabled()) {
> - for (zid = 0; zid < MAX_NR_ZONES; zid++)
> - lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
> - } else
> - lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
> -
> - for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
> + for (zid = 0; zid <= zone_idx; zid++) {
> struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
> - unsigned long size;
>
> if (!managed_zone(zone))
> continue;
>
> if (!mem_cgroup_disabled())
> - size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
> + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
> else
> - size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
> - NR_ZONE_LRU_BASE + lru);
> - lru_size -= min(size, lru_size);
> + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
> }
> -
> - return lru_size;
> -
> + return size;
Neat!
Reviewed-by: Roman Gushchin <[email protected]>
Thanks!
On Tue, Oct 22, 2019 at 10:48:02AM -0400, Johannes Weiner wrote:
> This function is getting long and unwieldy, split out the memcg bits.
>
> The updated shrink_node() handles the generic (node) reclaim aspects:
> - global vmpressure notifications
> - writeback and congestion throttling
> - reclaim/compaction management
> - kswapd giving up on unreclaimable nodes
>
> It then calls a new shrink_node_memcgs() which handles cgroup specifics:
> - the cgroup tree traversal
> - memory.low considerations
> - per-cgroup slab shrinking callbacks
> - per-cgroup vmpressure notifications
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> mm/vmscan.c | 28 ++++++++++++++++++----------
> 1 file changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index db073b40c432..65baa89740dd 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2722,18 +2722,10 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
> (memcg && memcg_congested(pgdat, memcg));
> }
>
> -static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> {
> - struct reclaim_state *reclaim_state = current->reclaim_state;
> struct mem_cgroup *root = sc->target_mem_cgroup;
> - unsigned long nr_reclaimed, nr_scanned;
> - bool reclaimable = false;
> struct mem_cgroup *memcg;
> -again:
> - memset(&sc->nr, 0, sizeof(sc->nr));
> -
> - nr_reclaimed = sc->nr_reclaimed;
> - nr_scanned = sc->nr_scanned;
>
> memcg = mem_cgroup_iter(root, NULL, NULL);
> do {
> @@ -2786,6 +2778,22 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> sc->nr_reclaimed - reclaimed);
>
> } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +}
> +
> +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> +{
> + struct reclaim_state *reclaim_state = current->reclaim_state;
> + struct mem_cgroup *root = sc->target_mem_cgroup;
> + unsigned long nr_reclaimed, nr_scanned;
> + bool reclaimable = false;
> +
> +again:
> + memset(&sc->nr, 0, sizeof(sc->nr));
> +
> + nr_reclaimed = sc->nr_reclaimed;
> + nr_scanned = sc->nr_scanned;
> +
> + shrink_node_memcgs(pgdat, sc);
>
> if (reclaim_state) {
> sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> @@ -2793,7 +2801,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> }
>
> /* Record the subtree's reclaim efficiency */
> - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> + vmpressure(sc->gfp_mask, root, true,
Maybe target? Or target_memcg? The word root is associated with the root cgroup.
Other than the "root" naming, the patch looks good to me:
Reviewed-by: Roman Gushchin <[email protected]>
On Tue 22-10-19 10:47:56, Johannes Weiner wrote:
> This function currently takes the node or lruvec size and subtracts
> the zones that are excluded by the classzone index of the
> allocation. It uses four different types of counters to do this.
>
> Just add up the eligible zones.
The original intention was to optimize this for GFP_KERNEL-like
allocations by reducing the number of zones to iterate over. But considering
this is not called from hot paths, I agree that simpler code is
preferable.
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/vmscan.c | 21 +++++----------------
> 1 file changed, 5 insertions(+), 16 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1154b3a2b637..57f533b808f2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -351,32 +351,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
> */
> unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
> {
> - unsigned long lru_size = 0;
> + unsigned long size = 0;
> int zid;
>
> - if (!mem_cgroup_disabled()) {
> - for (zid = 0; zid < MAX_NR_ZONES; zid++)
> - lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
> - } else
> - lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
> -
> - for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
> + for (zid = 0; zid <= zone_idx; zid++) {
> struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
> - unsigned long size;
>
> if (!managed_zone(zone))
> continue;
>
> if (!mem_cgroup_disabled())
> - size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
> + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
> else
> - size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
> - NR_ZONE_LRU_BASE + lru);
> - lru_size -= min(size, lru_size);
> + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
> }
> -
> - return lru_size;
> -
> + return size;
> }
>
> /*
> --
> 2.23.0
--
Michal Hocko
SUSE Labs
On Tue 22-10-19 10:48:02, Johannes Weiner wrote:
> This function is getting long and unwieldy, split out the memcg bits.
>
> The updated shrink_node() handles the generic (node) reclaim aspects:
> - global vmpressure notifications
> - writeback and congestion throttling
> - reclaim/compaction management
> - kswapd giving up on unreclaimable nodes
>
> It then calls a new shrink_node_memcgs() which handles cgroup specifics:
> - the cgroup tree traversal
> - memory.low considerations
> - per-cgroup slab shrinking callbacks
> - per-cgroup vmpressure notifications
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/vmscan.c | 28 ++++++++++++++++++----------
> 1 file changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index db073b40c432..65baa89740dd 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2722,18 +2722,10 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
> (memcg && memcg_congested(pgdat, memcg));
> }
>
> -static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> {
> - struct reclaim_state *reclaim_state = current->reclaim_state;
> struct mem_cgroup *root = sc->target_mem_cgroup;
> - unsigned long nr_reclaimed, nr_scanned;
> - bool reclaimable = false;
> struct mem_cgroup *memcg;
> -again:
> - memset(&sc->nr, 0, sizeof(sc->nr));
> -
> - nr_reclaimed = sc->nr_reclaimed;
> - nr_scanned = sc->nr_scanned;
>
> memcg = mem_cgroup_iter(root, NULL, NULL);
> do {
> @@ -2786,6 +2778,22 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> sc->nr_reclaimed - reclaimed);
>
> } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +}
> +
> +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> +{
> + struct reclaim_state *reclaim_state = current->reclaim_state;
> + struct mem_cgroup *root = sc->target_mem_cgroup;
> + unsigned long nr_reclaimed, nr_scanned;
> + bool reclaimable = false;
> +
> +again:
> + memset(&sc->nr, 0, sizeof(sc->nr));
> +
> + nr_reclaimed = sc->nr_reclaimed;
> + nr_scanned = sc->nr_scanned;
> +
> + shrink_node_memcgs(pgdat, sc);
>
> if (reclaim_state) {
> sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> @@ -2793,7 +2801,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> }
>
> /* Record the subtree's reclaim efficiency */
> - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> + vmpressure(sc->gfp_mask, root, true,
> sc->nr_scanned - nr_scanned,
> sc->nr_reclaimed - nr_reclaimed);
>
> --
> 2.23.0
>
--
Michal Hocko
SUSE Labs
On Tue, Oct 22, 2019 at 08:08:23PM +0000, Roman Gushchin wrote:
> On Tue, Oct 22, 2019 at 10:48:02AM -0400, Johannes Weiner wrote:
> > This function is getting long and unwieldy, split out the memcg bits.
> >
> > The updated shrink_node() handles the generic (node) reclaim aspects:
> > - global vmpressure notifications
> > - writeback and congestion throttling
> > - reclaim/compaction management
> > - kswapd giving up on unreclaimable nodes
> >
> > It then calls a new shrink_node_memcgs() which handles cgroup specifics:
> > - the cgroup tree traversal
> > - memory.low considerations
> > - per-cgroup slab shrinking callbacks
> > - per-cgroup vmpressure notifications
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
> > ---
> > mm/vmscan.c | 28 ++++++++++++++++++----------
> > 1 file changed, 18 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index db073b40c432..65baa89740dd 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2722,18 +2722,10 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
> > (memcg && memcg_congested(pgdat, memcg));
> > }
> >
> > -static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> > +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> > {
> > - struct reclaim_state *reclaim_state = current->reclaim_state;
> > struct mem_cgroup *root = sc->target_mem_cgroup;
> > - unsigned long nr_reclaimed, nr_scanned;
> > - bool reclaimable = false;
> > struct mem_cgroup *memcg;
> > -again:
> > - memset(&sc->nr, 0, sizeof(sc->nr));
> > -
> > - nr_reclaimed = sc->nr_reclaimed;
> > - nr_scanned = sc->nr_scanned;
> >
> > memcg = mem_cgroup_iter(root, NULL, NULL);
> > do {
> > @@ -2786,6 +2778,22 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> > sc->nr_reclaimed - reclaimed);
> >
> > } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> > +}
> > +
> > +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> > +{
> > + struct reclaim_state *reclaim_state = current->reclaim_state;
> > + struct mem_cgroup *root = sc->target_mem_cgroup;
> > + unsigned long nr_reclaimed, nr_scanned;
> > + bool reclaimable = false;
> > +
> > +again:
> > + memset(&sc->nr, 0, sizeof(sc->nr));
> > +
> > + nr_reclaimed = sc->nr_reclaimed;
> > + nr_scanned = sc->nr_scanned;
> > +
> > + shrink_node_memcgs(pgdat, sc);
> >
> > if (reclaim_state) {
> > sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> > @@ -2793,7 +2801,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> > }
> >
> > /* Record the subtree's reclaim efficiency */
> > - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> > + vmpressure(sc->gfp_mask, root, true,
>
> Maybe target? Or target_memcg? The word root is associated with the root cgroup.
>
> Other than the "root" naming, the patch looks good to me:
>
> Reviewed-by: Roman Gushchin <[email protected]>
Thanks!
I agree, target_memcg is better than root. The next patch also
replaces some of these with target_lruvec.
This on top?
From f981c99d3a9da05513c5137873315974782e97ec Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[email protected]>
Date: Fri, 25 Oct 2019 10:28:42 -0400
Subject: [PATCH] mm: vmscan: split shrink_node() into node part and
memcgs part fix
As per Roman's suggestion, rename "root" to "target_memcg" to avoid
confusion with the global cgroup root, root_mem_cgroup.
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 65baa89740dd..6199692af434 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2724,16 +2724,16 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
- struct mem_cgroup *root = sc->target_mem_cgroup;
+ struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
struct mem_cgroup *memcg;
- memcg = mem_cgroup_iter(root, NULL, NULL);
+ memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
unsigned long reclaimed;
unsigned long scanned;
- switch (mem_cgroup_protected(root, memcg)) {
+ switch (mem_cgroup_protected(target_memcg, memcg)) {
case MEMCG_PROT_MIN:
/*
* Hard protection.
@@ -2777,13 +2777,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+ } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct mem_cgroup *root = sc->target_mem_cgroup;
+ struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2801,7 +2801,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
}
/* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, root, true,
+ vmpressure(sc->gfp_mask, target_memcg, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
@@ -2857,7 +2857,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
*/
if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
- set_memcg_congestion(pgdat, root, true);
+ set_memcg_congestion(pgdat, target_memcg, true);
/*
* Stall direct reclaim for IO completions if underlying BDIs
@@ -2866,7 +2866,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
* the LRU too quickly.
*/
if (!sc->hibernation_mode && !current_is_kswapd() &&
- current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+ current_may_throttle() &&
+ pgdat_memcg_congested(pgdat, target_memcg))
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
--
2.23.0