Received-SPF: pass (google.com: domain of linux-kernel+bounces-119752-linux.lists.archive=gmail.com@vger.kernel.org designates 147.75.199.223 as permitted sender) client-ip=147.75.199.223;
From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: "Huang, Ying" <ying.huang@intel.com>,
	Chris Li <chrisl@kernel.org>,
	Minchan Kim <minchan@kernel.org>,
	Barry Song <v-songbaohua@oppo.com>,
	Ryan Roberts <ryan.roberts@arm.com>,
	Yu Zhao <yuzhao@google.com>,
	SeongJae Park <sj@kernel.org>,
	David Hildenbrand <david@redhat.com>,
	Yosry Ahmed <yosryahmed@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Matthew Wilcox <willy@infradead.org>,
	Nhat Pham <nphamcs@gmail.com>,
	Chengming Zhou <zhouchengming@bytedance.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org,
	Kairui Song <kasong@tencent.com>
Subject: [RFC PATCH 09/10] mm/swap: delay the swap cache lookup for swapin
Date: Wed, 27 Mar 2024 02:50:31 +0800
Message-ID: <20240326185032.72159-10-ryncsn@gmail.com>
In-Reply-To: <20240326185032.72159-1-ryncsn@gmail.com>
References: <20240326185032.72159-1-ryncsn@gmail.com>
Reply-To: Kairui Song <kasong@tencent.com>
Precedence: bulk
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

From: Kairui Song <kasong@tencent.com>

From: Kairui Song <ryncsn@gmail.com>

Currently we do a swap cache lookup first, then call into the ordinary
swapin path. But every swapin path calls swap_cache_alloc_or_get,
which does a swap cache lookup again on race, because the first
lookup is racy and could miss the swap cache.

If the race happens (which could be frequent on a busy device), the
caller has no way of knowing that, so it is not able to distinguish
minor / major faults, and the first lookup is redundant.

So do the swap cache lookup and readahead update late: defer them to
swap_cache_alloc_or_get, and make the lookup faster by skipping it if
the HAS_CACHE flag is not set. This early check is less accurate, but
the later lookup always ensures we never miss an existing swap cache.
This provides 100% accurate swap cache usage info for callers,
improves minor / major page fault info, and also improves performance.

Test result of sequential swapin/out of 30G zero page on ZRAM:

               Before (us)        After (us)
Swapout:       33827215           33853883
Swapin:        39466754           38336519 (+2.9%)
Swapout (THP): 6917709            6814619
Swapin (THP) : 39566916           38383367 (+3.0%)

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/memory.c     |  45 ++++++++----------
 mm/shmem.c      |  39 +++++++---------
 mm/swap.h       |  16 +++++--
 mm/swap_state.c | 122 +++++++++++++++++++++++++++++-------------------
 mm/swapfile.c   |  32 +++++++------
 5 files changed, 141 insertions(+), 113 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 357d239ee2f6..774a912eb46d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3932,6 +3932,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	struct page *page;
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
+	bool folio_allocated = false;
 	bool exclusive = false;
 	swp_entry_t entry;
 	pte_t pte;
@@ -3991,35 +3992,29 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (unlikely(!si))
 		goto out;
 
-	folio = swap_cache_get_folio(entry, vma, vmf->address);
-	if (folio)
-		page = folio_file_page(folio, swp_offset(entry));
-	swapcache = folio;
+	if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) {
+		folio = swapin_direct(entry, GFP_HIGHUSER_MOVABLE, vmf, &folio_allocated);
+	} else {
+		folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf, &folio_allocated);
+	}
 
 	if (!folio) {
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
-		    __swap_count(entry) == 1) {
-			folio = swapin_direct(entry, GFP_HIGHUSER_MOVABLE, vmf);
-		} else {
-			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
-		}
-
-		if (!folio) {
-			/*
-			 * Back out if somebody else faulted in this pte
-			 * while we released the pte lock.
-			 */
-			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-					vmf->address, &vmf->ptl);
-			if (likely(vmf->pte &&
-				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
-				ret = VM_FAULT_OOM;
-			goto unlock;
-		}
+		/*
+		 * Back out if somebody else faulted in this pte
+		 * while we released the pte lock.
+		 */
+		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				vmf->address, &vmf->ptl);
+		if (likely(vmf->pte &&
+			   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+			ret = VM_FAULT_OOM;
+		goto unlock;
+	}
 
-		swapcache = folio;
-		page = folio_file_page(folio, swp_offset(entry));
+	swapcache = folio;
+	page = folio_file_page(folio, swp_offset(entry));
 
+	if (folio_allocated) {
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
diff --git a/mm/shmem.c b/mm/shmem.c
index 51e4593f9e2e..7884bbe28731 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1570,20 +1570,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	struct mempolicy *mpol;
-	pgoff_t ilx;
-	struct folio *folio;
-
-	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
-	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
-	mpol_cond_put(mpol);
-
-	return folio;
-}
-
 /*
  * Make sure huge_gfp is always more limited than limit_gfp.
  * Some of the flags set permissions, while others set limitations.
@@ -1857,9 +1843,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	bool folio_allocated = false;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
+	struct mempolicy *mpol;
 	swp_entry_t swap;
+	pgoff_t ilx;
 	int error;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
@@ -1878,22 +1867,28 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	}
 
 	/* Look it up and read it in.. */
-	folio = swap_cache_get_folio(swap, NULL, 0);
+	folio = swap_cache_try_get(swap);
 	if (!folio) {
-		/* Or update major stats only when swapin succeeds?? */
-		if (fault_type) {
-			*fault_type |= VM_FAULT_MAJOR;
-			count_vm_event(PGMAJFAULT);
-			count_memcg_event_mm(fault_mm, PGMAJFAULT);
-		}
 		/* Here we actually start the io */
-		folio = shmem_swapin_cluster(swap, gfp, info, index);
+		mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+		folio = swap_cluster_readahead(swap, gfp, mpol, ilx, &folio_allocated);
+		mpol_cond_put(mpol);
 		if (!folio) {
 			error = -ENOMEM;
 			goto failed;
 		}
+
+		/* Update major stats only when swapin succeeds */
+		if (folio_allocated && fault_type) {
+			*fault_type |= VM_FAULT_MAJOR;
+			count_vm_event(PGMAJFAULT);
+			count_memcg_event_mm(fault_mm, PGMAJFAULT);
+		}
 	}
 
+	if (!folio_allocated)
+		swap_cache_update_ra(folio, NULL, 0);
+
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
 	if (!folio_test_swapcache(folio) ||
diff --git a/mm/swap.h b/mm/swap.h
index be2d1642b5d9..bd872b157950 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -39,7 +39,8 @@ void __delete_from_swap_cache(struct folio *folio,
 void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(swp_entry_t entry);
 int swap_cache_add_wait(struct folio *folio, swp_entry_t entry, gfp_t gfp);
-struct folio *swap_cache_get_folio(swp_entry_t entry,
+struct folio *swap_cache_try_get(swp_entry_t entry);
+void swap_cache_update_ra(struct folio *folio,
 		struct vm_area_struct *vma, unsigned long addr);
 struct folio *filemap_get_incore_folio(struct address_space *mapping,
 		pgoff_t index);
@@ -49,16 +50,18 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_flags,
 		struct mempolicy *mpol, pgoff_t ilx, bool *folio_allocated);
 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
-		struct mempolicy *mpol, pgoff_t ilx);
+		struct mempolicy *mpol, pgoff_t ilx, bool *folio_allocated);
 struct folio *swapin_direct(swp_entry_t entry, gfp_t flag,
-			    struct vm_fault *vmf);
+			    struct vm_fault *vmf, bool *folio_allocated);
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
-			      struct vm_fault *vmf);
+			      struct vm_fault *vmf, bool *folio_allocated);
 
 static inline unsigned int folio_swap_flags(struct folio *folio)
 {
 	return swp_swap_info(folio->swap)->flags;
 }
+
+bool __swap_has_cache(swp_entry_t entry);
 #else /* CONFIG_SWAP */
 struct swap_iocb;
 static inline void swap_read_folio(struct folio *folio, bool do_poll,
@@ -151,5 +154,10 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
 {
 	return 0;
 }
+
+static inline bool __swap_has_cache(swp_entry_t entry)
+{
+	return false;	/* !CONFIG_SWAP stub */
+}
 #endif /* CONFIG_SWAP */
 #endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b5ea13295e17..cf178dd1131a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,54 +300,54 @@ static inline bool swap_use_vma_readahead(void)
 }
 
 /*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the folio
- * lock before returning.
- *
- * Caller must lock the swap device or hold a reference to keep it valid.
+ * Try get the swap cache, bail out quickly if swapcache bit is not set.
  */
-struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr)
+struct folio *swap_cache_try_get(swp_entry_t entry)
 {
 	struct folio *folio;
 
-	folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
-	if (!IS_ERR(folio)) {
-		bool vma_ra = swap_use_vma_readahead();
-		bool readahead;
-
-		/*
-		 * At the moment, we don't support PG_readahead for anon THP
-		 * so let's bail out rather than confusing the readahead stat.
-		 */
-		if (unlikely(folio_test_large(folio)))
+	if (__swap_has_cache(entry)) {
+		folio = filemap_get_folio(swap_address_space(entry),
+				swp_offset(entry));
+		if (!IS_ERR(folio))
 			return folio;
+	}
 
-		readahead = folio_test_clear_readahead(folio);
-		if (vma && vma_ra) {
-			unsigned long ra_val;
-			int win, hits;
-
-			ra_val = GET_SWAP_RA_VAL(vma);
-			win = SWAP_RA_WIN(ra_val);
-			hits = SWAP_RA_HITS(ra_val);
-			if (readahead)
-				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
-			atomic_long_set(&vma->swap_readahead_info,
-					SWAP_RA_VAL(addr, win, hits));
-		}
+	return NULL;
+}
 
-		if (readahead) {
-			count_vm_event(SWAP_RA_HIT);
-			if (!vma || !vma_ra)
-				atomic_inc(&swapin_readahead_hits);
-		}
-	} else {
-		folio = NULL;
+void swap_cache_update_ra(struct folio *folio, struct vm_area_struct *vma,
+			  unsigned long addr)
+{
+	bool vma_ra = swap_use_vma_readahead();
+	bool readahead;
+
+	/*
+	 * At the moment, we don't support PG_readahead for anon THP
+	 * so let's bail out rather than confusing the readahead stat.
+	 */
+	if (unlikely(folio_test_large(folio)))
+		return;
+
+	readahead = folio_test_clear_readahead(folio);
+	if (vma && vma_ra) {
+		unsigned long ra_val;
+		int win, hits;
+
+		ra_val = GET_SWAP_RA_VAL(vma);
+		win = SWAP_RA_WIN(ra_val);
+		hits = SWAP_RA_HITS(ra_val);
+		if (readahead)
+			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+		atomic_long_set(&vma->swap_readahead_info,
+				SWAP_RA_VAL(addr, win, hits));
 	}
 
-	return folio;
+	if (readahead) {
+		count_vm_event(SWAP_RA_HIT);
+		if (!vma || !vma_ra)
+			atomic_inc(&swapin_readahead_hits);
+	}
 }
 
 /**
@@ -485,6 +485,11 @@ struct folio *swap_cache_alloc_or_get(swp_entry_t entry, gfp_t gfp_mask,
 	if (!si)
 		goto out_no_device;
 
+	/* First do a racy check if cache is already loaded. */
+	swapcache = swap_cache_try_get(entry);
+	if (swapcache)
+		goto out_no_alloc;
+
 	/* We are very likely the first user, alloc and try add to the swapcache. */
 	folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, mpol, ilx,
 						 numa_node_id());
@@ -614,7 +619,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * are fairly likely to have been swapped out from the same node.
  */
 struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				    struct mempolicy *mpol, pgoff_t ilx)
+				     struct mempolicy *mpol, pgoff_t ilx,
+				     bool *folio_allocated)
 {
 	struct folio *folio;
 	unsigned long entry_offset = swp_offset(entry);
@@ -644,6 +650,10 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		folio = swap_cache_alloc_or_get(
 				swp_entry(swp_type(entry), offset),
 				gfp_mask, mpol, ilx, &page_allocated);
+		if (offset == entry_offset) {
+			*folio_allocated = page_allocated;
+			folio_allocated = NULL;
+		}
 		if (!folio)
 			continue;
 		if (page_allocated) {
@@ -666,6 +676,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		zswap_folio_swapin(folio);
 		swap_read_folio(folio, false, NULL);
 	}
+	if (folio_allocated)
+		*folio_allocated = page_allocated;
 	return folio;
 }
 
@@ -779,7 +791,8 @@ static void swap_ra_info(struct vm_fault *vmf,
  *
  */
 static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
-		struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
+					struct mempolicy *mpol, pgoff_t targ_ilx,
+					struct vm_fault *vmf, bool *folio_allocated)
 {
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
@@ -818,6 +831,10 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		pte = NULL;
 		folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
 						&page_allocated);
+		if (i == ra_info.offset) {
+			*folio_allocated = page_allocated;
+			folio_allocated = NULL;
+		}
 		if (!folio)
 			continue;
 		if (page_allocated) {
@@ -842,6 +859,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		zswap_folio_swapin(folio);
 		swap_read_folio(folio, false, NULL);
 	}
+	if (folio_allocated)
+		*folio_allocated = page_allocated;
 	return folio;
 }
 
@@ -854,20 +873,21 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
  * Returns the folio for entry after it is read in.
  */
 struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
-			    struct vm_fault *vmf)
+			    struct vm_fault *vmf, bool *folio_allocated)
 {
 	struct mempolicy *mpol;
 	struct folio *folio;
-	bool page_allocated;
 	pgoff_t ilx;
 
 	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
 	folio = swap_cache_alloc_or_get(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					folio_allocated);
 	mpol_cond_put(mpol);
 
-	if (page_allocated)
+	if (*folio_allocated)
 		swap_read_folio(folio, true, NULL);
+	else if (folio)
+		swap_cache_update_ra(folio, vmf->vma, vmf->address);
 
 	return folio;
 }
@@ -885,18 +905,22 @@ struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
  * or vma-based(ie, virtual address based on faulty address) readahead.
  */
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				struct vm_fault *vmf)
+			       struct vm_fault *vmf, bool *folio_allocated)
 {
 	struct mempolicy *mpol;
-	pgoff_t ilx;
 	struct folio *folio;
+	bool allocated;
+	pgoff_t ilx;
 
 	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
 	folio = swap_use_vma_readahead() ?
-		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
-		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf, &allocated) :
+		swap_cluster_readahead(entry, gfp_mask, mpol, ilx, &allocated);
 	mpol_cond_put(mpol);
 
+	if (!*folio_allocated && folio)
+		swap_cache_update_ra(folio, vmf->vma, vmf->address);
+
 	return folio;
 }
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8225091d42b6..ddcf2ff91c39 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1455,6 +1455,15 @@ int __swap_count(swp_entry_t entry)
 	return swap_count(si->swap_map[offset]);
 }
 
+bool __swap_has_cache(swp_entry_t entry)
+{
+	pgoff_t offset = swp_offset(entry);
+	struct swap_info_struct *si = swp_swap_info(entry);
+	unsigned char count = READ_ONCE(si->swap_map[offset]);
+
+	return swap_count(count) && (count & SWAP_HAS_CACHE);
+}
+
 /*
  * How many references to @entry are currently swapped out?
  * This does not give an exact answer when swap count is continued,
@@ -1862,10 +1871,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		struct folio *folio;
 		unsigned long offset;
 		unsigned char swp_count;
+		bool folio_allocated;
 		swp_entry_t entry;
 		int ret;
 		pte_t ptent;
 
+		struct vm_fault vmf = {
+			.vma = vma,
+			.address = addr,
+			.real_address = addr,
+			.pmd = pmd,
+		};
+
 		if (!pte++) {
 			pte = pte_offset_map(pmd, addr);
 			if (!pte)
@@ -1884,19 +1901,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		offset = swp_offset(entry);
 		pte_unmap(pte);
 		pte = NULL;
-
-		folio = swap_cache_get_folio(entry, vma, addr);
-		if (!folio) {
-			struct vm_fault vmf = {
-				.vma = vma,
-				.address = addr,
-				.real_address = addr,
-				.pmd = pmd,
-			};
-
-			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-						&vmf);
-		}
+		folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+				&vmf, &folio_allocated);
 		if (!folio) {
 			swp_count = READ_ONCE(si->swap_map[offset]);
 			if (swap_count(swp_count) == 0 || swp_count == SWAP_MAP_BAD)
-- 
2.43.0