LinuxLists.cc - [PATCH v20 18/20] mm/lru: replace pgdat lru

2020-10-29 10:48:08

Subject: [PATCH v20 18/20] mm/lru: replace pgdat lru_lock with lruvec lock

This patch moves the per-node lru_lock into the lruvec, thus giving each
memcg its own lru_lock on every node. So on a large machine, each memcg
no longer has to suffer from contention on the per-node pgdat->lru_lock;
each can proceed quickly under its own lru_lock.

Now that the memcg charge has been moved before LRU insertion, page
isolation can serialize a page's memcg, so the per-memcg lruvec lock is
stable and can replace the per-node lru lock.

In isolate_migratepages_block(), compact_unlock_should_abort() and
lock_page_lruvec_irqsave() are open-coded to work with compact_control.
Also add a debug function to the locking paths, which may give some clues
if something gets out of hand.

Daniel Jordan's testing shows a 62% improvement on a modified readtwice
case on his 2P * 10 core * 2 HT Broadwell box.
https://lore.kernel.org/lkml/[email protected]/

On a large machine with memcg enabled but not used, looking up a page's
lruvec has to chase a few pointers, which may increase the lru_lock hold
time and cause a slight regression.

Hugh Dickins helped polish the patch, thanks!

Signed-off-by: Alex Shi <[email protected]>
Acked-by: Hugh Dickins <[email protected]>
Cc: Rong Chen <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Konstantin Khlebnikov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
include/linux/memcontrol.h | 58 +++++++++++++++++++++++++
include/linux/mmzone.h | 3 +-
mm/compaction.c | 56 +++++++++++++++---------
mm/huge_memory.c | 11 ++---
mm/memcontrol.c | 62 ++++++++++++++++++++++++--
mm/mlock.c | 22 +++++++---
mm/mmzone.c | 1 +
mm/page_alloc.c | 1 -
mm/swap.c | 105 +++++++++++++++++++++------------------------
mm/vmscan.c | 55 +++++++++++-------------
10 files changed, 249 insertions(+), 125 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e391e3c56de5..f447a1bfa654 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,6 +497,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,

struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);

+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -1038,6 +1051,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irq(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flagsp)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+ return &pgdat->__lruvec;
+}
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1285,6 +1323,10 @@ static inline void count_memcg_page_event(struct page *page,
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
#endif /* CONFIG_MEMCG */

/* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1414,6 +1456,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}

+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+ spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+ unsigned long flags)
+{
+ spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fb3bf696c05e..0afba4ea2a21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -276,6 +276,8 @@ enum lruvec_flags {

struct lruvec {
struct list_head lists[NR_LRU_LISTS];
+ /* per lruvec lru_lock for memcg */
+ spinlock_t lru_lock;
/*
* These track the cost of reclaiming one LRU - file or anon -
* over the other. As the observed cost of reclaiming one LRU
@@ -796,7 +798,6 @@ struct deferred_split {

/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
- spinlock_t lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
diff --git a/mm/compaction.c b/mm/compaction.c
index 75f7973605f4..a69784820324 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -804,7 +804,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -864,11 +864,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+
+ low_pfn = 0;
+ goto fatal_pending;
+ }
+
+ cond_resched();
}

if (!pfn_valid_within(low_pfn))
@@ -940,9 +949,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}

if (!isolate_movable_page(page, isolate_mode))
@@ -983,10 +991,19 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!TestClearPageLRU(page))
goto isolate_fail_put;

+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);

/* Try get exclusive access under lock */
if (!skip_updated) {
@@ -1005,9 +1022,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
SetPageLRU(page);
goto isolate_fail_put;
}
- }
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ } else
+ rcu_read_unlock();

/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1041,8 +1057,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
put_page(page);

@@ -1057,8 +1073,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1086,7 +1102,7 @@ static bool too_many_isolated(pg_data_t *pgdat)

isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
if (page) {
SetPageLRU(page);
put_page(page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5fa890e26975..9b3e6479c0c4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2352,7 +2352,7 @@ static void lru_add_page_tail(struct page *head, struct page *page_tail,
VM_BUG_ON_PAGE(!PageHead(head), head);
VM_BUG_ON_PAGE(PageCompound(page_tail), head);
VM_BUG_ON_PAGE(PageLRU(page_tail), head);
- lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+ lockdep_assert_held(&lruvec->lru_lock);

if (list) {
/* page reclaim is reclaiming a huge page */
@@ -2436,7 +2436,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end)
{
struct page *head = compound_head(page);
- pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
@@ -2454,10 +2453,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}

- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock(&pgdat->lru_lock);
-
- lruvec = mem_cgroup_page_lruvec(head, pgdat);
+ /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+ lruvec = lock_page_lruvec(head);

for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
@@ -2478,7 +2475,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}

ClearPageCompound(head);
- spin_unlock(&pgdat->lru_lock);
+ unlock_page_lruvec(lruvec);
/* Caller disabled irqs, so they are still disabled here */

split_page_owner(head, nr);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 599aa8863111..0c97292834fa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1330,6 +1330,19 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return ret;
}

+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!page->mem_cgroup)
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+ else
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page);
+}
+#endif
+
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
@@ -1367,6 +1380,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
return lruvec;
}

+struct lruvec *lock_page_lruvec(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irqsave(&lruvec->lru_lock, *flags);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
@@ -3270,10 +3328,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
/*
- * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * Because page->mem_cgroup is not set on compound tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b0e6334be6f..ab164a675c25 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -262,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
int nr = pagevec_count(pvec);
int delta_munlocked = -nr;
struct pagevec pvec_putback;
+ struct lruvec *lruvec = NULL;
int pgrescued = 0;

pagevec_init(&pvec_putback);

/* Phase 1: page isolation */
- spin_lock_irq(&zone->zone_pgdat->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];

@@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* so we can spare the get_page() here.
*/
if (TestClearPageLRU(page)) {
- struct lruvec *lruvec;
+ struct lruvec *new_lruvec;
+
+ new_lruvec = mem_cgroup_page_lruvec(page,
+ page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ lruvec = lock_page_lruvec_irq(page);
+ }

- lruvec = mem_cgroup_page_lruvec(page,
- page_pgdat(page));
del_page_from_lru_list(page, lruvec,
page_lru(page));
continue;
@@ -299,8 +305,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+ if (lruvec) {
+ __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ unlock_page_lruvec_irq(lruvec);
+ } else if (delta_munlocked) {
+ mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ }

/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..3750a90ed4a0 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec)
enum lru_list lru;

memset(lruvec, 0, sizeof(struct lruvec));
+ spin_lock_init(&lruvec->lru_lock);

for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23f5066bd4a5..713e6554becd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6791,7 +6791,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->pfmemalloc_wait);

pgdat_page_ext_init(pgdat);
- spin_lock_init(&pgdat->lru_lock);
lruvec_init(&pgdat->__lruvec);
}

diff --git a/mm/swap.c b/mm/swap.c
index 9e30f096309b..580ea18a9596 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
unsigned long flags;

- spin_lock_irqsave(&pgdat->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
}
@@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec))
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
-
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- }
+ struct lruvec *new_lruvec;

/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
+ }
+
(*move_fn)(page, lruvec);

SetPageLRU(page);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
@@ -274,9 +270,8 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
do {
unsigned long lrusize;
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += nr_pages;
@@ -300,7 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
} while ((lruvec = parent_lruvec(lruvec)));
}

@@ -364,13 +359,13 @@ static inline void activate_page_drain(int cpu)

static void activate_page(struct page *page)
{
- pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;

page = compound_head(page);
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
if (PageLRU(page))
- __activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
- spin_unlock_irq(&pgdat->lru_lock);
+ __activate_page(page, lruvec);
+ unlock_page_lruvec_irq(lruvec);
}
#endif

@@ -860,8 +855,7 @@ void release_pages(struct page **pages, int nr)
{
int i;
LIST_HEAD(pages_to_free);
- struct pglist_data *locked_pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags;
unsigned int lock_batch;

@@ -871,11 +865,11 @@ void release_pages(struct page **pages, int nr)
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
- * same pgdat. The lock is held only if pgdat != NULL.
+ * same lruvec. The lock is held only if lruvec != NULL.
*/
- if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}

page = compound_head(page);
@@ -883,10 +877,9 @@ void release_pages(struct page **pages, int nr)
continue;

if (is_zone_device_page(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
/*
* ZONE_DEVICE pages that return 'false' from
@@ -904,27 +897,27 @@ void release_pages(struct page **pages, int nr)
continue;

if (PageCompound(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
__put_compound_page(page);
continue;
}

if (PageLRU(page)) {
- struct pglist_data *pgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

- if (pgdat != locked_pgdat) {
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ new_lruvec = mem_cgroup_page_lruvec(page,
+ page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec,
flags);
lock_batch = 0;
- locked_pgdat = pgdat;
- spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
}

- lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -934,8 +927,8 @@ void release_pages(struct page **pages, int nr)

list_add(&page->lru, &pages_to_free);
}
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);

mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
@@ -1023,26 +1016,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
void __pagevec_lru_add(struct pagevec *pvec)
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
}

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
__pagevec_lru_add_fn(page, lruvec);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e28df9cb5be3..9e726b587d74 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1765,14 +1765,12 @@ int isolate_lru_page(struct page *page)
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

if (TestClearPageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;

get_page(page);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
ret = 0;
}

@@ -1839,7 +1837,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
@@ -1850,9 +1847,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
putback_lru_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
continue;
}

@@ -1874,9 +1871,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
__ClearPageActive(page);

if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
destroy_compound_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
} else
list_add(&page->lru, &pages_to_free);

@@ -1949,7 +1946,7 @@ static int current_may_throttle(void)

lru_add_drain();

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
@@ -1961,7 +1958,7 @@ static int current_may_throttle(void)
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);

- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

if (nr_taken == 0)
return 0;
@@ -1969,7 +1966,7 @@ static int current_may_throttle(void)
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
move_pages_to_lru(lruvec, &page_list);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1978,7 +1975,7 @@ static int current_may_throttle(void)
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

lru_note_cost(lruvec, file, stat.nr_pageout);
mem_cgroup_uncharge_list(&page_list);
@@ -2031,7 +2028,7 @@ static void shrink_active_list(unsigned long nr_to_scan,

lru_add_drain();

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
@@ -2042,7 +2039,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

while (!list_empty(&l_hold)) {
cond_resched();
@@ -2088,7 +2085,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2099,7 +2096,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
@@ -2689,10 +2686,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
/*
* Determine the scan balance between anon and file LRUs.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&target_lruvec->lru_lock);

/*
* Target desirable inactive:active list ratios for the anon
@@ -4268,16 +4265,15 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
*/
void check_move_unevictable_pages(struct pagevec *pvec)
{
- struct lruvec *lruvec;
- struct pglist_data *pgdat = NULL;
+ struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;

for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
int nr_pages;
+ struct lruvec *new_lruvec;

if (PageTransTail(page))
continue;
@@ -4289,13 +4285,12 @@ void check_move_unevictable_pages(struct pagevec *pvec)
if (!TestClearPageLRU(page))
continue;

- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irq(&pgdat->lru_lock);
- pgdat = pagepgdat;
- spin_lock_irq(&pgdat->lru_lock);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ lruvec = lock_page_lruvec_irq(page);
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);

if (page_evictable(page) && PageUnevictable(page)) {
enum lru_list lru = page_lru_base_type(page);
@@ -4309,10 +4304,10 @@ void check_move_unevictable_pages(struct pagevec *pvec)
SetPageLRU(page);
}

- if (pgdat) {
+ if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
} else if (pgscanned) {
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
--
1.8.3.1

2020-10-30 02:55:47

by Alex Shi

[permalink] [raw]

Subject: Re: [PATCH v20 18/20] mm/lru: replace pgdat lru_lock with lruvec lock

The patch changed because of the variable renaming in the 4th patch:

From e892e74a35c27e69bebb73d2e4cff54e438f8d7d Mon Sep 17 00:00:00 2001
From: Alex Shi <[email protected]>
Date: Tue, 18 Aug 2020 16:44:21 +0800
Subject: [PATCH v21 18/20] mm/lru: replace pgdat lru_lock with lruvec lock

This patch moves the per-node lru_lock into the lruvec, thus giving each
memcg its own lru_lock on every node. So on a large machine, each memcg
no longer has to suffer from contention on the per-node pgdat->lru_lock;
each can proceed quickly under its own lru_lock.

Now that the memcg charge has been moved before LRU insertion, page
isolation can serialize a page's memcg, so the per-memcg lruvec lock is
stable and can replace the per-node lru lock.

In isolate_migratepages_block(), compact_unlock_should_abort() and
lock_page_lruvec_irqsave() are open-coded to work with compact_control.
Also add a debug function to the locking paths, which may give some clues
if something gets out of hand.

Daniel Jordan's testing shows a 62% improvement on a modified readtwice
case on his 2P * 10 core * 2 HT Broadwell box.
https://lore.kernel.org/lkml/[email protected]/

On a large machine with memcg enabled but not used, looking up a page's
lruvec has to chase a few pointers, which may increase the lru_lock hold
time and cause a slight regression.

Hugh Dickins helped polish the patch, thanks!

Signed-off-by: Alex Shi <[email protected]>
Acked-by: Hugh Dickins <[email protected]>
Cc: Rong Chen <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Konstantin Khlebnikov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
include/linux/memcontrol.h | 58 +++++++++++++++++++++++++
include/linux/mmzone.h | 3 +-
mm/compaction.c | 56 +++++++++++++++---------
mm/huge_memory.c | 11 ++---
mm/memcontrol.c | 62 ++++++++++++++++++++++++--
mm/mlock.c | 22 +++++++---
mm/mmzone.c | 1 +
mm/page_alloc.c | 1 -
mm/swap.c | 105 +++++++++++++++++++++------------------------
mm/vmscan.c | 55 +++++++++++-------------
10 files changed, 249 insertions(+), 125 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e391e3c56de5..f447a1bfa654 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,6 +497,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,

struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);

+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -1038,6 +1051,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irq(&pgdat->__lruvec.lru_lock);
+ return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+ unsigned long *flagsp)
+{
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+ return &pgdat->__lruvec;
+}
+
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1285,6 +1323,10 @@ static inline void count_memcg_page_event(struct page *page,
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
#endif /* CONFIG_MEMCG */

/* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1414,6 +1456,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
}

+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+ spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+ unsigned long flags)
+{
+ spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fb3bf696c05e..0afba4ea2a21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -276,6 +276,8 @@ enum lruvec_flags {

struct lruvec {
struct list_head lists[NR_LRU_LISTS];
+ /* per lruvec lru_lock for memcg */
+ spinlock_t lru_lock;
/*
* These track the cost of reclaiming one LRU - file or anon -
* over the other. As the observed cost of reclaiming one LRU
@@ -796,7 +798,6 @@ struct deferred_split {

/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
- spinlock_t lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
diff --git a/mm/compaction.c b/mm/compaction.c
index 75f7973605f4..a69784820324 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -804,7 +804,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -864,11 +864,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+
+ low_pfn = 0;
+ goto fatal_pending;
+ }
+
+ cond_resched();
}

if (!pfn_valid_within(low_pfn))
@@ -940,9 +949,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}

if (!isolate_movable_page(page, isolate_mode))
@@ -983,10 +991,19 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!TestClearPageLRU(page))
goto isolate_fail_put;

+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);

/* Try get exclusive access under lock */
if (!skip_updated) {
@@ -1005,9 +1022,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
SetPageLRU(page);
goto isolate_fail_put;
}
- }
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ } else
+ rcu_read_unlock();

/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1041,8 +1057,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
put_page(page);

@@ -1057,8 +1073,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1086,7 +1102,7 @@ static bool too_many_isolated(pg_data_t *pgdat)

isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
if (page) {
SetPageLRU(page);
put_page(page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7811a30080fb..7b5da37895bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2352,7 +2352,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
VM_BUG_ON_PAGE(!PageHead(head), head);
VM_BUG_ON_PAGE(PageCompound(tail), head);
VM_BUG_ON_PAGE(PageLRU(tail), head);
- lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+ lockdep_assert_held(&lruvec->lru_lock);

if (list) {
/* page reclaim is reclaiming a huge page */
@@ -2436,7 +2436,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end)
{
struct page *head = compound_head(page);
- pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
@@ -2454,10 +2453,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}

- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock(&pgdat->lru_lock);
-
- lruvec = mem_cgroup_page_lruvec(head, pgdat);
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ lruvec = lock_page_lruvec(head);

for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
@@ -2478,7 +2475,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}

ClearPageCompound(head);
- spin_unlock(&pgdat->lru_lock);
+ unlock_page_lruvec(lruvec);
/* Caller disabled irqs, so they are still disabled here */

split_page_owner(head, nr);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 599aa8863111..0c97292834fa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1330,6 +1330,19 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return ret;
}

+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!page->mem_cgroup)
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+ else
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != page->mem_cgroup, page);
+}
+#endif
+
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
@@ -1367,6 +1380,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
return lruvec;
}

+struct lruvec *lock_page_lruvec(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irqsave(&lruvec->lru_lock, *flags);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
/**
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
@@ -3270,10 +3328,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
/*
- * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * Because page->mem_cgroup is not set on compound tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b0e6334be6f..ab164a675c25 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -262,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
int nr = pagevec_count(pvec);
int delta_munlocked = -nr;
struct pagevec pvec_putback;
+ struct lruvec *lruvec = NULL;
int pgrescued = 0;

pagevec_init(&pvec_putback);

/* Phase 1: page isolation */
- spin_lock_irq(&zone->zone_pgdat->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];

@@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* so we can spare the get_page() here.
*/
if (TestClearPageLRU(page)) {
- struct lruvec *lruvec;
+ struct lruvec *new_lruvec;
+
+ new_lruvec = mem_cgroup_page_lruvec(page,
+ page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ lruvec = lock_page_lruvec_irq(page);
+ }

- lruvec = mem_cgroup_page_lruvec(page,
- page_pgdat(page));
del_page_from_lru_list(page, lruvec,
page_lru(page));
continue;
@@ -299,8 +305,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+ if (lruvec) {
+ __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ unlock_page_lruvec_irq(lruvec);
+ } else if (delta_munlocked) {
+ mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ }

/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..3750a90ed4a0 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -91,6 +91,7 @@ void lruvec_init(struct lruvec *lruvec)
enum lru_list lru;

memset(lruvec, 0, sizeof(struct lruvec));
+ spin_lock_init(&lruvec->lru_lock);

for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23f5066bd4a5..713e6554becd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6791,7 +6791,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->pfmemalloc_wait);

pgdat_page_ext_init(pgdat);
- spin_lock_init(&pgdat->lru_lock);
lruvec_init(&pgdat->__lruvec);
}

diff --git a/mm/swap.c b/mm/swap.c
index 9e30f096309b..580ea18a9596 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
unsigned long flags;

- spin_lock_irqsave(&pgdat->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
}
@@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec))
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
-
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- }
+ struct lruvec *new_lruvec;

/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
+ }
+
(*move_fn)(page, lruvec);

SetPageLRU(page);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
@@ -274,9 +270,8 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
do {
unsigned long lrusize;
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += nr_pages;
@@ -300,7 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
} while ((lruvec = parent_lruvec(lruvec)));
}

@@ -364,13 +359,13 @@ static inline void activate_page_drain(int cpu)

static void activate_page(struct page *page)
{
- pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;

page = compound_head(page);
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
if (PageLRU(page))
- __activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
- spin_unlock_irq(&pgdat->lru_lock);
+ __activate_page(page, lruvec);
+ unlock_page_lruvec_irq(lruvec);
}
#endif

@@ -860,8 +855,7 @@ void release_pages(struct page **pages, int nr)
{
int i;
LIST_HEAD(pages_to_free);
- struct pglist_data *locked_pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags;
unsigned int lock_batch;

@@ -871,11 +865,11 @@ void release_pages(struct page **pages, int nr)
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
- * same pgdat. The lock is held only if pgdat != NULL.
+ * same lruvec. The lock is held only if lruvec != NULL.
*/
- if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}

page = compound_head(page);
@@ -883,10 +877,9 @@ void release_pages(struct page **pages, int nr)
continue;

if (is_zone_device_page(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
/*
* ZONE_DEVICE pages that return 'false' from
@@ -904,27 +897,27 @@ void release_pages(struct page **pages, int nr)
continue;

if (PageCompound(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
__put_compound_page(page);
continue;
}

if (PageLRU(page)) {
- struct pglist_data *pgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

- if (pgdat != locked_pgdat) {
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ new_lruvec = mem_cgroup_page_lruvec(page,
+ page_pgdat(page));
+ if (new_lruvec != lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec,
flags);
lock_batch = 0;
- locked_pgdat = pgdat;
- spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
}

- lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -934,8 +927,8 @@ void release_pages(struct page **pages, int nr)

list_add(&page->lru, &pages_to_free);
}
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);

mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
@@ -1023,26 +1016,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
void __pagevec_lru_add(struct pagevec *pvec)
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;

for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
+ struct lruvec *new_lruvec;

- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
}

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
__pagevec_lru_add_fn(page, lruvec);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e28df9cb5be3..9e726b587d74 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1765,14 +1765,12 @@ int isolate_lru_page(struct page *page)
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

if (TestClearPageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;

get_page(page);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- spin_lock_irq(&pgdat->lru_lock);
+ lruvec = lock_page_lruvec_irq(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
ret = 0;
}

@@ -1839,7 +1837,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
@@ -1850,9 +1847,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
putback_lru_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
continue;
}

@@ -1874,9 +1871,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
__ClearPageActive(page);

if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
destroy_compound_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
} else
list_add(&page->lru, &pages_to_free);

@@ -1949,7 +1946,7 @@ static int current_may_throttle(void)

lru_add_drain();

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
@@ -1961,7 +1958,7 @@ static int current_may_throttle(void)
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);

- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

if (nr_taken == 0)
return 0;
@@ -1969,7 +1966,7 @@ static int current_may_throttle(void)
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
move_pages_to_lru(lruvec, &page_list);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1978,7 +1975,7 @@ static int current_may_throttle(void)
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

lru_note_cost(lruvec, file, stat.nr_pageout);
mem_cgroup_uncharge_list(&page_list);
@@ -2031,7 +2028,7 @@ static void shrink_active_list(unsigned long nr_to_scan,

lru_add_drain();

- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
@@ -2042,7 +2039,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

while (!list_empty(&l_hold)) {
cond_resched();
@@ -2088,7 +2085,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);

nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2099,7 +2096,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);

mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
@@ -2689,10 +2686,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
/*
* Determine the scan balance between anon and file LRUs.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&target_lruvec->lru_lock);

/*
* Target desirable inactive:active list ratios for the anon
@@ -4268,16 +4265,15 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
*/
void check_move_unevictable_pages(struct pagevec *pvec)
{
- struct lruvec *lruvec;
- struct pglist_data *pgdat = NULL;
+ struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;

for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
int nr_pages;
+ struct lruvec *new_lruvec;

if (PageTransTail(page))
continue;
@@ -4289,13 +4285,12 @@ void check_move_unevictable_pages(struct pagevec *pvec)
if (!TestClearPageLRU(page))
continue;

- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irq(&pgdat->lru_lock);
- pgdat = pagepgdat;
- spin_lock_irq(&pgdat->lru_lock);
+ new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ if (lruvec != new_lruvec) {
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ lruvec = lock_page_lruvec_irq(page);
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);

if (page_evictable(page) && PageUnevictable(page)) {
enum lru_list lru = page_lru_base_type(page);
@@ -4309,10 +4304,10 @@ void check_move_unevictable_pages(struct pagevec *pvec)
SetPageLRU(page);
}

- if (pgdat) {
+ if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
} else if (pgscanned) {
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
--
1.8.3.1

2020-11-02 20:55:48

by Johannes Weiner

[permalink] [raw]

Subject: Re: [PATCH v20 18/20] mm/lru: replace pgdat lru_lock with lruvec lock

On Fri, Oct 30, 2020 at 10:49:41AM +0800, Alex Shi wrote:
>
>
> patch changed since variable renaming in 04th patch:
>
> From e892e74a35c27e69bebb73d2e4cff54e438f8d7d Mon Sep 17 00:00:00 2001
> From: Alex Shi <[email protected]>
> Date: Tue, 18 Aug 2020 16:44:21 +0800
> Subject: [PATCH v21 18/20] mm/lru: replace pgdat lru_lock with lruvec lock
>
> This patch moves per node lru_lock into lruvec, thus bring a lru_lock for
> each of memcg per node. So on a large machine, each of memcg don't
> have to suffer from per node pgdat->lru_lock competition. They could go
> fast with their self lru_lock.
>
> After move memcg charge before lru inserting, page isolation could
> serialize page's memcg, then per memcg lruvec lock is stable and could
> replace per node lru lock.
>
> In func isolate_migratepages_block, compact_unlock_should_abort and
> lock_page_lruvec_irqsave are open coded to work with compact_control.
> Also add a debug func in locking which may give some clues if there are
> sth out of hands.
>
> Daniel Jordan's testing show 62% improvement on modified readtwice case
> on his 2P * 10 core * 2 HT broadwell box.
> https://lore.kernel.org/lkml/[email protected]/
>
> On a large machine with memcg enabled but not used, the page's lruvec
> seeking pass a few pointers, that may lead to lru_lock holding time
> increase and a bit regression.
>
> Hugh Dickins helped on the patch polish, thanks!
>
> Signed-off-by: Alex Shi <[email protected]>
> Acked-by: Hugh Dickins <[email protected]>
> Cc: Rong Chen <[email protected]>
> Cc: Hugh Dickins <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Cc: Yang Shi <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Konstantin Khlebnikov <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> ---
> include/linux/memcontrol.h | 58 +++++++++++++++++++++++++
> include/linux/mmzone.h | 3 +-
> mm/compaction.c | 56 +++++++++++++++---------
> mm/huge_memory.c | 11 ++---
> mm/memcontrol.c | 62 ++++++++++++++++++++++++--
> mm/mlock.c | 22 +++++++---
> mm/mmzone.c | 1 +
> mm/page_alloc.c | 1 -
> mm/swap.c | 105 +++++++++++++++++++++------------------------
> mm/vmscan.c | 55 +++++++++++-------------
> 10 files changed, 249 insertions(+), 125 deletions(-)

This came out really well. Thanks for persisting!

A few inline comments:

> @@ -1367,6 +1380,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
> return lruvec;
> }
>
> +struct lruvec *lock_page_lruvec(struct page *page)
> +{
> + struct lruvec *lruvec;
> + struct pglist_data *pgdat = page_pgdat(page);
> +
> + rcu_read_lock();
> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + spin_lock(&lruvec->lru_lock);
> + rcu_read_unlock();
> +
> + lruvec_memcg_debug(lruvec, page);
> +
> + return lruvec;
> +}
> +
> +struct lruvec *lock_page_lruvec_irq(struct page *page)
> +{
> + struct lruvec *lruvec;
> + struct pglist_data *pgdat = page_pgdat(page);
> +
> + rcu_read_lock();
> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + spin_lock_irq(&lruvec->lru_lock);
> + rcu_read_unlock();
> +
> + lruvec_memcg_debug(lruvec, page);
> +
> + return lruvec;
> +}
> +
> +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
> +{
> + struct lruvec *lruvec;
> + struct pglist_data *pgdat = page_pgdat(page);
> +
> + rcu_read_lock();
> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + spin_lock_irqsave(&lruvec->lru_lock, *flags);
> + rcu_read_unlock();
> +
> + lruvec_memcg_debug(lruvec, page);
> +
> + return lruvec;
> +}

As these are used quite widely throughout the VM code now, it would be
good to give them kerneldoc comments that explain the interface.

In particular, I think it's necessary to explain the contexts from
which this is safe to use (especially wrt pages moving between
memcgs - see the comment in commit_charge()).

I'm going to go through the callsites that follow and try to identify
what makes them safe. It's mostly an exercise to double check our
thinking here.

Most of them are straight-forward, and I don't think they warrant
individual comments. But some do, IMO. And it appears at least one
actually isn't safe yet:

> @@ -277,10 +277,16 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
> * so we can spare the get_page() here.
> */
> if (TestClearPageLRU(page)) {
> - struct lruvec *lruvec;
> + struct lruvec *new_lruvec;
> +
> + new_lruvec = mem_cgroup_page_lruvec(page,
> + page_pgdat(page));
> + if (new_lruvec != lruvec) {
> + if (lruvec)
> + unlock_page_lruvec_irq(lruvec);
> + lruvec = lock_page_lruvec_irq(page);

This is safe because PageLRU has been cleared.

> @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
> static void __page_cache_release(struct page *page)
> {
> if (PageLRU(page)) {
> - pg_data_t *pgdat = page_pgdat(page);
> struct lruvec *lruvec;
> unsigned long flags;
>
> - spin_lock_irqsave(&pgdat->lru_lock, flags);
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = lock_page_lruvec_irqsave(page, &flags);
> VM_BUG_ON_PAGE(!PageLRU(page), page);
> __ClearPageLRU(page);
> del_page_from_lru_list(page, lruvec, page_off_lru(page));
> - spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> + unlock_page_lruvec_irqrestore(lruvec, flags);

This is safe because the page refcount is 0.

> @@ -207,32 +205,30 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
> void (*move_fn)(struct page *page, struct lruvec *lruvec))
> {
> int i;
> - struct pglist_data *pgdat = NULL;
> - struct lruvec *lruvec;
> + struct lruvec *lruvec = NULL;
> unsigned long flags = 0;
>
> for (i = 0; i < pagevec_count(pvec); i++) {
> struct page *page = pvec->pages[i];
> - struct pglist_data *pagepgdat = page_pgdat(page);
> -
> - if (pagepgdat != pgdat) {
> - if (pgdat)
> - spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> - pgdat = pagepgdat;
> - spin_lock_irqsave(&pgdat->lru_lock, flags);
> - }
> + struct lruvec *new_lruvec;
>
> /* block memcg migration during page moving between lru */
> if (!TestClearPageLRU(page))
> continue;
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + if (lruvec != new_lruvec) {
> + if (lruvec)
> + unlock_page_lruvec_irqrestore(lruvec, flags);
> + lruvec = lock_page_lruvec_irqsave(page, &flags);

This is safe because PageLRU has been cleared.

> @@ -274,9 +270,8 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
> {
> do {
> unsigned long lrusize;
> - struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>
> - spin_lock_irq(&pgdat->lru_lock);
> + spin_lock_irq(&lruvec->lru_lock);
> /* Record cost event */
> if (file)
> lruvec->file_cost += nr_pages;
> @@ -300,7 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
> lruvec->file_cost /= 2;
> lruvec->anon_cost /= 2;
> }
> - spin_unlock_irq(&pgdat->lru_lock);
> + spin_unlock_irq(&lruvec->lru_lock);
> } while ((lruvec = parent_lruvec(lruvec)));
> }

This is safe because it either comes from

1) the pinned lruvec in reclaim, or

2) from a pre-LRU page during refault (which also holds the
rcu lock, so would be safe even if the page was on the LRU
and could move simultaneously to a new lruvec).

The second one seems a bit tricky. It could be good to add a comment
to lru_note_cost_page() that explains why its mem_cgroup_page_lruvec()
is safe.

> @@ -364,13 +359,13 @@ static inline void activate_page_drain(int cpu)
>
> static void activate_page(struct page *page)
> {
> - pg_data_t *pgdat = page_pgdat(page);
> + struct lruvec *lruvec;
>
> page = compound_head(page);
> - spin_lock_irq(&pgdat->lru_lock);
> + lruvec = lock_page_lruvec_irq(page);

I don't see what makes this safe. There is nothing that appears to
lock out a concurrent page move between memcgs/lruvecs, which means
the following could manipulate an unlocked lru list:

> if (PageLRU(page))
> - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
> - spin_unlock_irq(&pgdat->lru_lock);
> + __activate_page(page, lruvec);
> + unlock_page_lruvec_irq(lruvec);
> }
> #endif

Shouldn't this be something like this?

if (TestClearPageLRU(page)) {
lruvec = lock_page_lruvec_irq(page);
__activate_page(page, lruvec);
unlock_page_lruvec_irq(lruvec);
SetPageLRU(page);
}

> @@ -904,27 +897,27 @@ void release_pages(struct page **pages, int nr)
> continue;
>
> if (PageCompound(page)) {
> - if (locked_pgdat) {
> - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
> - locked_pgdat = NULL;
> + if (lruvec) {
> + unlock_page_lruvec_irqrestore(lruvec, flags);
> + lruvec = NULL;
> }
> __put_compound_page(page);
> continue;
> }
>
> if (PageLRU(page)) {
> - struct pglist_data *pgdat = page_pgdat(page);
> + struct lruvec *new_lruvec;
>
> - if (pgdat != locked_pgdat) {
> - if (locked_pgdat)
> - spin_unlock_irqrestore(&locked_pgdat->lru_lock,
> + new_lruvec = mem_cgroup_page_lruvec(page,
> + page_pgdat(page));
> + if (new_lruvec != lruvec) {
> + if (lruvec)
> + unlock_page_lruvec_irqrestore(lruvec,
> flags);
> lock_batch = 0;
> - locked_pgdat = pgdat;
> - spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
> + lruvec = lock_page_lruvec_irqsave(page, &flags);

Safe because page refcount=0.

> @@ -1023,26 +1016,24 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
> void __pagevec_lru_add(struct pagevec *pvec)
> {
> int i;
> - struct pglist_data *pgdat = NULL;
> - struct lruvec *lruvec;
> + struct lruvec *lruvec = NULL;
> unsigned long flags = 0;
>
> for (i = 0; i < pagevec_count(pvec); i++) {
> struct page *page = pvec->pages[i];
> - struct pglist_data *pagepgdat = page_pgdat(page);
> + struct lruvec *new_lruvec;
>
> - if (pagepgdat != pgdat) {
> - if (pgdat)
> - spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> - pgdat = pagepgdat;
> - spin_lock_irqsave(&pgdat->lru_lock, flags);
> + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + if (lruvec != new_lruvec) {
> + if (lruvec)
> + unlock_page_lruvec_irqrestore(lruvec, flags);
> + lruvec = lock_page_lruvec_irqsave(page, &flags);

Safe because PageLRU hasn't been set yet.

> @@ -1765,14 +1765,12 @@ int isolate_lru_page(struct page *page)
> WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
>
> if (TestClearPageLRU(page)) {
> - pg_data_t *pgdat = page_pgdat(page);
> struct lruvec *lruvec;
>
> get_page(page);
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> - spin_lock_irq(&pgdat->lru_lock);
> + lruvec = lock_page_lruvec_irq(page);

Safe because PageLRU is cleared.

> @@ -1839,7 +1837,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
> static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
> struct list_head *list)
> {
> - struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> int nr_pages, nr_moved = 0;
> LIST_HEAD(pages_to_free);
> struct page *page;
> @@ -1850,9 +1847,9 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
> VM_BUG_ON_PAGE(PageLRU(page), page);
> list_del(&page->lru);
> if (unlikely(!page_evictable(page))) {
> - spin_unlock_irq(&pgdat->lru_lock);
> + spin_unlock_irq(&lruvec->lru_lock);

[snipped all the reclaim lock sites as they start with lruvec]

> @@ -4289,13 +4285,12 @@ void check_move_unevictable_pages(struct pagevec *pvec)
> if (!TestClearPageLRU(page))
> continue;
>
> - if (pagepgdat != pgdat) {
> - if (pgdat)
> - spin_unlock_irq(&pgdat->lru_lock);
> - pgdat = pagepgdat;
> - spin_lock_irq(&pgdat->lru_lock);
> + new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + if (lruvec != new_lruvec) {
> + if (lruvec)
> + unlock_page_lruvec_irq(lruvec);
> + lruvec = lock_page_lruvec_irq(page);

Safe because PageLRU is clear.

2020-11-03 05:05:16

by Alex Shi

[permalink] [raw]

Subject: Re: [PATCH v20 18/20] mm/lru: replace pgdat lru_lock with lruvec lock

在 2020/11/3 上午4:41, Johannes Weiner 写道:
> On Fri, Oct 30, 2020 at 10:49:41AM +0800, Alex Shi wrote:
>>
>>
>> @@ -1367,6 +1380,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
>> return lruvec;
>> }
>>
>> +struct lruvec *lock_page_lruvec(struct page *page)
>> +{
>> + struct lruvec *lruvec;
>> + struct pglist_data *pgdat = page_pgdat(page);
>> +
>> + rcu_read_lock();
>> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> + spin_lock(&lruvec->lru_lock);
>> + rcu_read_unlock();
>> +
>> + lruvec_memcg_debug(lruvec, page);
>> +
>> + return lruvec;
>> +}
>> +
>> +struct lruvec *lock_page_lruvec_irq(struct page *page)
>> +{
>> + struct lruvec *lruvec;
>> + struct pglist_data *pgdat = page_pgdat(page);
>> +
>> + rcu_read_lock();
>> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> + spin_lock_irq(&lruvec->lru_lock);
>> + rcu_read_unlock();
>> +
>> + lruvec_memcg_debug(lruvec, page);
>> +
>> + return lruvec;
>> +}
>> +
>> +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
>> +{
>> + struct lruvec *lruvec;
>> + struct pglist_data *pgdat = page_pgdat(page);
>> +
>> + rcu_read_lock();
>> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> + spin_lock_irqsave(&lruvec->lru_lock, *flags);
>> + rcu_read_unlock();
>> +
>> + lruvec_memcg_debug(lruvec, page);
>> +
>> + return lruvec;
>> +}
>
> As these are used quite widely throughout the VM code now, it would be
> good to give them kerneldoc comments that explain the interface.
>
> In particular, I think it's necessary to explain the contexts from
> which this is safe to use (in particular wrt pages moving between
> memcgs - see the comment in commit_charge()).
>
> I'm going to go through the callsites that follow and try to identify
> what makes them safe. It's mostly an exercise to double check our
> thinking here.
>
> Most of them are straight-forward, and I don't think they warrant
> individual comments. But some do, IMO. And it appears at least one
> actually isn't safe yet:

Thanks a lot for the reminder. Is the following comment fine?

/**
* lock_page_lruvec - lock and return the lruvec for a page.
* @page: the page
*
* This series of functions should be used under either of these conditions:
* PageLRU is cleared or unset
* or, page->_refcount is zero
*/
struct lruvec *lock_page_lruvec(struct page *page)
{

....

>> @@ -274,9 +270,8 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
>> {
>> do {
>> unsigned long lrusize;
>> - struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>
>> - spin_lock_irq(&pgdat->lru_lock);
>> + spin_lock_irq(&lruvec->lru_lock);
>> /* Record cost event */
>> if (file)
>> lruvec->file_cost += nr_pages;
>> @@ -300,7 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
>> lruvec->file_cost /= 2;
>> lruvec->anon_cost /= 2;
>> }
>> - spin_unlock_irq(&pgdat->lru_lock);
>> + spin_unlock_irq(&lruvec->lru_lock);
>> } while ((lruvec = parent_lruvec(lruvec)));
>> }
>
> This is safe because it either comes from
>
> 1) the pinned lruvec in reclaim, or
>
> 2) from a pre-LRU page during refault (which also holds the
> rcu lock, so would be safe even if the page was on the LRU
> and could move simultaneously to a new lruvec).
>
> The second one seems a bit tricky. It could be good to add a comment
> to lru_note_cost_page() that explains why its mem_cgroup_page_lruvec()
> is safe.

Thanks for pointing this out. Is the following comment fine?
diff --git a/mm/swap.c b/mm/swap.c
index 9fe5ff9a8111..55ccc93ffb49 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -264,6 +264,13 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
do {
unsigned long lrusize;

+ /*
+ * Holding lruvec->lru_lock is safe here, since the lruvec is either
+ * 1) the pinned lruvec in reclaim, or
+ * 2) from a pre-LRU page during refault (which also holds the
+ * rcu lock, so would be safe even if the page was on the LRU
+ * and could move simultaneously to a new lruvec).
+ */
spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
>
>> @@ -364,13 +359,13 @@ static inline void activate_page_drain(int cpu)
>>
>> static void activate_page(struct page *page)
>> {
>> - pg_data_t *pgdat = page_pgdat(page);
>> + struct lruvec *lruvec;
>>
>> page = compound_head(page);
>> - spin_lock_irq(&pgdat->lru_lock);
>> + lruvec = lock_page_lruvec_irq(page);
>
> I don't see what makes this safe. There is nothing that appears to
> lock out a concurrent page move between memcgs/lruvecs, which means
> the following could manipulate an unlocked lru list:
>

This function is for !CONFIG_SMP; could the CPU be preempted with an RT kernel?

>> if (PageLRU(page))
>> - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat));
>> - spin_unlock_irq(&pgdat->lru_lock);
>> + __activate_page(page, lruvec);
>> + unlock_page_lruvec_irq(lruvec);
>> }
>> #endif
>
> Shouldn't this be something like this?
>
> if (TestClearPageLRU()) {
> lruvec = lock_page_lruvec_irq(page);
> __activate_page(page, lruvec);
> unlock_page_lruvec_irq(lruvec);
> SetPageLRU(page);
> }

But anyway, your proposed change is cleaner and more logical. I will change
to this.

Thanks a lot!
Alex