We noticed a 2% webserver throughput regression after upgrading from
5.6. This could be tracked down to a shift in the anon/file reclaim
balance (confirmed with swappiness) that resulted in worse reclaim
efficiency and thus more kswapd activity for the same outcome.
The change that exposed the problem is aae466b0052e ("mm/swap:
implement workingset detection for anonymous LRU"). By qualifying
swapins based on their refault distance, it lowered the cost of anon
reclaim in this workload, in turn causing (much) more anon scanning
than before. Scanning the anon list is more expensive due to the
higher ratio of mmapped pages that may rotate during reclaim, and so
the result was an increase in %sys time.
Right now, rotations aren't considered a cost when balancing scan
pressure between LRUs. We can end up with very few file refaults
putting all the scan pressure on hot anon pages that are rotated en
masse, don't get reclaimed, and never push back on the file LRU
again. We still only reclaim file cache in that case, but we burn a
lot CPU rotating anon pages. It's "fair" from an LRU age POV, but
doesn't reflect the real cost it imposes on the system.
Consider rotations as a secondary factor in balancing the LRUs. This
doesn't attempt to make a precise comparison between IO cost and CPU
cost, it just says: if reloads are about comparable between the lists,
or rotations are overwhelmingly different, adjust for CPU work.
This fixed the regression on our webservers. It has since been
deployed to the entire Meta fleet and hasn't caused any problems.
Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/swap.h | 5 +++--
mm/swap.c | 22 +++++++++++++++++-----
mm/vmscan.c | 4 +++-
mm/workingset.c | 2 +-
4 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a18cf4b7c724..369d7799205d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
-void lru_note_cost_folio(struct folio *);
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void lru_cache_add(struct page *);
diff --git a/mm/swap.c b/mm/swap.c
index 955930f41d20..2f12a2ee1d3a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -295,8 +295,20 @@ void folio_rotate_reclaimable(struct folio *folio)
}
}
-void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
+void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_io, unsigned int nr_rotated)
{
+ unsigned long cost;
+
+ /*
+ * Reflect the relative cost of incurring IO and spending CPU
+ * time on rotations. This doesn't attempt to make a precise
+ * comparison, it just says: if reloads are about comparable
+ * between the LRU lists, or rotations are overwhelmingly
+ * different between them, adjust scan balance for CPU work.
+ */
+ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+
do {
unsigned long lrusize;
@@ -310,9 +322,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
- lruvec->file_cost += nr_pages;
+ lruvec->file_cost += cost;
else
- lruvec->anon_cost += nr_pages;
+ lruvec->anon_cost += cost;
/*
* Decay previous events
@@ -335,10 +347,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
-void lru_note_cost_folio(struct folio *folio)
+void lru_note_cost_refault(struct folio *folio)
{
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
- folio_nr_pages(folio));
+ folio_nr_pages(folio), 0);
}
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..ffe402e095d3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
mem_cgroup_uncharge_list(&folio_list);
free_unref_page_list(&folio_list);
@@ -2639,6 +2639,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
diff --git a/mm/workingset.c b/mm/workingset.c
index ae7e984b23c6..d2d02978588c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow)
if (workingset) {
folio_set_workingset(folio);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
- lru_note_cost_folio(folio);
+ lru_note_cost_refault(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
--
2.37.3