2023-10-16 20:07:33

by Vishal Moola

Subject: [PATCH 0/5] Some khugepaged folio conversions

This patchset converts a number of functions to use folios. This cleans
up some khugepaged code and removes a large number of hidden
compound_head() calls.
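
For anyone not following the folio work closely: the "hidden" calls come
from the page-based helpers themselves, which must resolve a possible
tail page before testing any flag or state. A rough sketch of what the
macros in include/linux/page-flags.h boil down to (simplified, not the
actual definitions):

	/* page-based test: every call pays for a head-page lookup */
	static inline bool PageLocked(struct page *page)
	{
		return test_bit(PG_locked, &compound_head(page)->flags);
	}

	/* folio-based test: a folio is never a tail page, so no lookup */
	static inline bool folio_test_locked(struct folio *folio)
	{
		return test_bit(PG_locked, &folio->flags);
	}

Converting a function to work on folios removes one such lookup per
helper call.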

---

The first 2 patches break the khugepaged max_ptes_shared selftests
because those functions now use folio_estimated_sharers() instead of
page_mapcount(). This is expected, although I'm uncertain whether that's
actually a bad thing. Some performance testing/feedback would be
appreciated on that front.

Vishal Moola (Oracle) (5):
mm/khugepaged: Convert __collapse_huge_page_isolate() to use folios
mm/khugepaged: Convert hpage_collapse_scan_pmd() to use folios
mm/khugepaged: Convert is_refcount_suitable() to use folios
mm/khugepaged: Convert alloc_charge_hpage() to use folios
mm/khugepaged: Convert collapse_pte_mapped_thp() to use folios

mm/khugepaged.c | 145 +++++++++++++++++++++++-------------------------
1 file changed, 69 insertions(+), 76 deletions(-)

--
2.40.1


2023-10-16 20:07:49

by Vishal Moola

Subject: [PATCH 5/5] mm/khugepaged: Convert collapse_pte_mapped_thp() to use folios

This removes 2 calls to compound_head() and helps convert khugepaged to
use folios throughout.

Previously, if the address passed to collapse_pte_mapped_thp()
corresponded to a tail page, the scan would fail immediately. Using
filemap_lock_folio(), we can get the corresponding folio back and try
to operate on the folio instead.
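
The lookup change, side by side (illustrative only; the real hunk is
below):

	/* Before: returns the exact page at the index, which may be a
	 * tail page, so the subsequent PageHead() check bails out. */
	hpage = find_lock_page(mapping, index);

	/* After: returns the folio containing the index, so a tail-page
	 * index still resolves to the whole large folio. */
	folio = filemap_lock_folio(mapping, index);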

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
---
mm/khugepaged.c | 45 ++++++++++++++++++++-------------------------
1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 70bba8ddea13..99c437979848 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1471,7 +1471,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
- struct page *hpage;
+ struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
@@ -1504,19 +1504,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;

- hpage = find_lock_page(vma->vm_file->f_mapping,
+ folio = filemap_lock_folio(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
- if (!hpage)
+ if (!folio)
return SCAN_PAGE_NULL;

- if (!PageHead(hpage)) {
- result = SCAN_FAIL;
- goto drop_hpage;
- }
-
- if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ if (folio_order(folio) != HPAGE_PMD_ORDER) {
result = SCAN_PAGE_COMPOUND;
- goto drop_hpage;
+ goto drop_folio;
}

result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
@@ -1530,13 +1525,13 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
*/
goto maybe_install_pmd;
default:
- goto drop_hpage;
+ goto drop_folio;
}

result = SCAN_FAIL;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
- goto drop_hpage;
+ goto drop_folio;

/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1561,7 +1556,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
*/
- if (hpage + i != page)
+ if (folio_page(folio, i) != page)
goto abort;
}

@@ -1576,7 +1571,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* page_table_lock) ptl nests inside pml. The less time we hold pml,
* the better; but userfaultfd's mfill_atomic_pte() on a private VMA
* inserts a valid as-if-COWed PTE without even looking up page cache.
- * So page lock of hpage does not protect from it, so we must not drop
+ * So page lock of folio does not protect from it, so we must not drop
* ptl before pgt_pmd is removed, so uffd private needs pml taken now.
*/
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
@@ -1600,7 +1595,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
continue;
/*
* We dropped ptl after the first scan, to do the mmu_notifier:
- * page lock stops more PTEs of the hpage being faulted in, but
+ * page lock stops more PTEs of the folio being faulted in, but
* does not stop write faults COWing anon copies from existing
* PTEs; and does not stop those being swapped out or migrated.
*/
@@ -1609,7 +1604,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
- if (hpage + i != page)
+ if (folio_page(folio, i) != page)
goto abort;

/*
@@ -1628,8 +1623,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,

/* step 3: set proper refcount and mm_counters. */
if (nr_ptes) {
- page_ref_sub(hpage, nr_ptes);
- add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ folio_ref_sub(folio, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
}

/* step 4: remove empty page table */
@@ -1653,14 +1648,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, hpage)
+ ? set_huge_pmd(vma, haddr, pmd, &folio->page)
: SCAN_SUCCEED;
- goto drop_hpage;
+ goto drop_folio;
abort:
if (nr_ptes) {
flush_tlb_mm(mm);
- page_ref_sub(hpage, nr_ptes);
- add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ folio_ref_sub(folio, nr_ptes);
+ add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
}
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
@@ -1668,9 +1663,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
spin_unlock(pml);
if (notified)
mmu_notifier_invalidate_range_end(&range);
-drop_hpage:
- unlock_page(hpage);
- put_page(hpage);
+drop_folio:
+ folio_unlock(folio);
+ folio_put(folio);
return result;
}

--
2.40.1

2023-10-16 20:07:52

by Vishal Moola

Subject: [PATCH 1/5] mm/khugepaged: Convert __collapse_huge_page_isolate() to use folios

Replaces 11 calls to compound_head() with 1, and removes 1375 bytes of
kernel text.

Previously, to determine if any pte was shared, the page mapcount
corresponding exactly to the pte was checked. This gave us a precise
number of shared ptes. Using folio_estimated_sharers() instead uses
the mapcount of the head page, giving us an estimate for tail page ptes.

This means if a tail page's mapcount is greater than its head page's
mapcount, folio_estimated_sharers() would be underestimating the number of
shared ptes, and vice versa.
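
For reference, folio_estimated_sharers() is (roughly, at the time of
this series) nothing more than the head page's mapcount, which is where
the estimate comes from:

	/* Simplified; see include/linux/mm.h for the real helper. */
	static inline int folio_estimated_sharers(struct folio *folio)
	{
		return page_mapcount(folio_page(folio, 0));
	}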

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
---
mm/khugepaged.c | 51 ++++++++++++++++++++++++-------------------------
1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..7a552fe16c92 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -541,7 +541,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
struct collapse_control *cc,
struct list_head *compound_pagelist)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
@@ -570,15 +570,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_UFFD_WP;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+ folio = vm_normal_folio(vma, address, pteval);
+ if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
result = SCAN_PAGE_NULL;
goto out;
}

- VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

- if (page_mapcount(page) > 1) {
+ if (folio_estimated_sharers(folio) > 1) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -588,16 +588,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
}

- if (PageCompound(page)) {
- struct page *p;
- page = compound_head(page);
+ if (folio_test_large(folio)) {
+ struct folio *f;

/*
* Check if we have dealt with the compound page
* already
*/
- list_for_each_entry(p, compound_pagelist, lru) {
- if (page == p)
+ list_for_each_entry(f, compound_pagelist, lru) {
+ if (folio == f)
goto next;
}
}
@@ -608,7 +607,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page)) {
+ if (!folio_trylock(folio)) {
result = SCAN_PAGE_LOCK;
goto out;
}
@@ -624,8 +623,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(page)) {
- unlock_page(page);
+ if (!is_refcount_suitable(&folio->page)) {
+ folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
}
@@ -634,27 +633,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
- if (!isolate_lru_page(page)) {
- unlock_page(page);
+ if (!folio_isolate_lru(folio)) {
+ folio_unlock(folio);
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- compound_nr(page));
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

- if (PageCompound(page))
- list_add_tail(&page->lru, compound_pagelist);
+ if (folio_test_large(folio))
+ list_add_tail(&folio->lru, compound_pagelist);
next:
/*
* If collapse was initiated by khugepaged, check that there is
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
- (pte_young(pteval) || page_is_young(page) ||
- PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ (pte_young(pteval) || folio_test_young(folio) ||
+ folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;

@@ -668,13 +667,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
referenced, writable, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
referenced, writable, result);
return result;
}
--
2.40.1

2023-10-16 20:07:56

by Vishal Moola

Subject: [PATCH 2/5] mm/khugepaged: Convert hpage_collapse_scan_pmd() to use folios

Replaces 5 calls to compound_head(), and removes 1466 bytes of kernel
text.

Previously, to determine if any pte was shared, the page mapcount
corresponding exactly to the pte was checked. This gave us a precise
number of shared ptes. Using folio_estimated_sharers() instead uses
the mapcount of the head page, giving us an estimate for tail page ptes.

This means if a tail page's mapcount is greater than its head page's
mapcount, folio_estimated_sharers() would be underestimating the number of
shared ptes, and vice versa.

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
---
mm/khugepaged.c | 26 ++++++++++++--------------
1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7a552fe16c92..67aac53b31c8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1245,7 +1245,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
pte_t *pte, *_pte;
int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
- struct page *page = NULL;
+ struct folio *folio = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
@@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
if (pte_write(pteval))
writable = true;

- page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+ folio = vm_normal_folio(vma, _address, pteval);
+ if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}

- if (page_mapcount(page) > 1) {
+ if (folio_estimated_sharers(folio) > 1) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -1332,29 +1332,27 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
}
}

- page = compound_head(page);
-
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
* Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
- node = page_to_nid(page);
+ node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
cc->node_load[node]++;
- if (!PageLRU(page)) {
+ if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
}
- if (PageLocked(page)) {
+ if (folio_test_locked(folio)) {
result = SCAN_PAGE_LOCK;
goto out_unmap;
}
- if (!PageAnon(page)) {
+ if (!folio_test_anon(folio)) {
result = SCAN_PAGE_ANON;
goto out_unmap;
}
@@ -1369,7 +1367,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(page)) {
+ if (!is_refcount_suitable(&folio->page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1379,8 +1377,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* enough young pte to justify collapsing the page
*/
if (cc->is_khugepaged &&
- (pte_young(pteval) || page_is_young(page) ||
- PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ (pte_young(pteval) || folio_test_young(folio) ||
+ folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
address)))
referenced++;
}
@@ -1402,7 +1400,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
none_or_zero, result, unmapped);
return result;
}
--
2.40.1

2023-10-16 20:07:56

by Vishal Moola

Subject: [PATCH 4/5] mm/khugepaged: Convert alloc_charge_hpage() to use folios

Also convert hpage_collapse_alloc_page() to
hpage_collapse_alloc_folio().

This removes 1 call to compound_head() and helps convert khugepaged to
use folios throughout.

Signed-off-by: Vishal Moola (Oracle) <[email protected]>
---
mm/khugepaged.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fa21a53ce0c0..70bba8ddea13 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -886,16 +886,16 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
}
#endif

-static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
nodemask_t *nmask)
{
- *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
- if (unlikely(!*hpage)) {
+ *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
+
+ if (unlikely(!*folio)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
return false;
}

- folio_prep_large_rmappable((struct folio *)*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return true;
}
@@ -1062,15 +1062,16 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
int node = hpage_collapse_find_target_node(cc);
struct folio *folio;

- if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+ if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask))
return SCAN_ALLOC_HUGE_PAGE_FAIL;

- folio = page_folio(*hpage);
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*hpage = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
}
+
+ *hpage = folio_page(folio, 0);
count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);

return SCAN_SUCCEED;
--
2.40.1

2023-10-17 20:42:12

by Yang Shi

Subject: Re: [PATCH 2/5] mm/khugepaged: Convert hpage_collapse_scan_pmd() to use folios

On Mon, Oct 16, 2023 at 1:06 PM Vishal Moola (Oracle)
<[email protected]> wrote:
>
> Replaces 5 calls to compound_head(), and removes 1466 bytes of kernel
> text.
>
> Previously, to determine if any pte was shared, the page mapcount
> corresponding exactly to the pte was checked. This gave us a precise
> number of shared ptes. Using folio_estimated_sharers() instead uses
> the mapcount of the head page, giving us an estimate for tail page ptes.
>
> This means if a tail page's mapcount is greater than its head page's
> mapcount, folio_estimated_sharers() would be underestimating the number of
> shared ptes, and vice versa.
>
> Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> ---
> mm/khugepaged.c | 26 ++++++++++++--------------
> 1 file changed, 12 insertions(+), 14 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 7a552fe16c92..67aac53b31c8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1245,7 +1245,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> pte_t *pte, *_pte;
> int result = SCAN_FAIL, referenced = 0;
> int none_or_zero = 0, shared = 0;
> - struct page *page = NULL;
> + struct folio *folio = NULL;
> unsigned long _address;
> spinlock_t *ptl;
> int node = NUMA_NO_NODE, unmapped = 0;
> @@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> if (pte_write(pteval))
> writable = true;
>
> - page = vm_normal_page(vma, _address, pteval);
> - if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> + folio = vm_normal_folio(vma, _address, pteval);
> + if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
> result = SCAN_PAGE_NULL;
> goto out_unmap;
> }
>
> - if (page_mapcount(page) > 1) {
> + if (folio_estimated_sharers(folio) > 1) {

This doesn't look correct. max_ptes_shared is used to cap the number
of shared PTEs. IIRC, folio_estimated_sharers() just reads the
mapcount of the head page. If we set max_ptes_shared to 256 and just
the head page is shared, "shared" will end up at 512 and prevent the
area from being collapsed even though just one PTE is actually shared.
This breaks the semantics of max_ptes_shared.
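
To put rough numbers on that, a toy userspace simulation of the
scenario (mapcount values picked purely for illustration, not kernel
code):

	#include <stdio.h>

	#define NR_PTES 512

	int main(void)
	{
		int head_mapcount = 2;	/* only the head page is shared */
		int tail_mapcount = 1;	/* tail pages mapped once */
		int max_ptes_shared = 256;
		int shared_old = 0, shared_new = 0;

		for (int i = 0; i < NR_PTES; i++) {
			int mapcount = i == 0 ? head_mapcount : tail_mapcount;

			if (mapcount > 1)		/* old per-pte check */
				shared_old++;
			if (head_mapcount > 1)		/* folio_estimated_sharers() */
				shared_new++;
		}

		printf("old: shared=%d, new: shared=%d, cap=%d\n",
		       shared_old, shared_new, max_ptes_shared);
		/* prints: old: shared=1, new: shared=512, cap=256 */
		return 0;
	}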

> ++shared;
> if (cc->is_khugepaged &&
> shared > khugepaged_max_ptes_shared) {
> @@ -1332,29 +1332,27 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> }
> }
>
> - page = compound_head(page);
> -
> /*
> * Record which node the original page is from and save this
> * information to cc->node_load[].
> * Khugepaged will allocate hugepage from the node has the max
> * hit record.
> */
> - node = page_to_nid(page);
> + node = folio_nid(folio);
> if (hpage_collapse_scan_abort(node, cc)) {
> result = SCAN_SCAN_ABORT;
> goto out_unmap;
> }
> cc->node_load[node]++;
> - if (!PageLRU(page)) {
> + if (!folio_test_lru(folio)) {
> result = SCAN_PAGE_LRU;
> goto out_unmap;
> }
> - if (PageLocked(page)) {
> + if (folio_test_locked(folio)) {
> result = SCAN_PAGE_LOCK;
> goto out_unmap;
> }
> - if (!PageAnon(page)) {
> + if (!folio_test_anon(folio)) {
> result = SCAN_PAGE_ANON;
> goto out_unmap;
> }
> @@ -1369,7 +1367,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> * has excessive GUP pins (i.e. 512). Anyway the same check
> * will be done again later the risk seems low.
> */
> - if (!is_refcount_suitable(page)) {
> + if (!is_refcount_suitable(&folio->page)) {
> result = SCAN_PAGE_COUNT;
> goto out_unmap;
> }
> @@ -1379,8 +1377,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> * enough young pte to justify collapsing the page
> */
> if (cc->is_khugepaged &&
> - (pte_young(pteval) || page_is_young(page) ||
> - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
> + (pte_young(pteval) || folio_test_young(folio) ||
> + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
> address)))
> referenced++;
> }
> @@ -1402,7 +1400,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> *mmap_locked = false;
> }
> out:
> - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> + trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
> none_or_zero, result, unmapped);
> return result;
> }
> --
> 2.40.1
>

2023-10-18 17:31:39

by Vishal Moola

Subject: Re: [PATCH 4/5] mm/khugepaged: Convert alloc_charge_hpage() to use folios

On Mon, Oct 16, 2023 at 8:48 PM Kefeng Wang <[email protected]> wrote:
>
>
>
> On 2023/10/17 4:05, Vishal Moola (Oracle) wrote:
> > Also convert hpage_collapse_alloc_page() to
> > hpage_collapse_alloc_folio().
> >
> > This removes 1 call to compound_head() and helps convert khugepaged to
> > use folios throughout.
> >
> > Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> > ---
> > mm/khugepaged.c | 13 +++++++------
> > 1 file changed, 7 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index fa21a53ce0c0..70bba8ddea13 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -886,16 +886,16 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
> > }
> > #endif
> >
> > -static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
> > +static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
> > nodemask_t *nmask)
> > {
> > - *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
> > - if (unlikely(!*hpage)) {
> > + *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
> > +
> > + if (unlikely(!*folio)) {
> > count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> > return false;
> > }
> >
> > - folio_prep_large_rmappable((struct folio *)*hpage);
> > count_vm_event(THP_COLLAPSE_ALLOC);
> > return true;
> > }
> > @@ -1062,15 +1062,16 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
> > int node = hpage_collapse_find_target_node(cc);
> > struct folio *folio;
> >
> > - if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
> > + if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask))
> > return SCAN_ALLOC_HUGE_PAGE_FAIL;
> >
> > - folio = page_folio(*hpage);
> > if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> > folio_put(folio);
> > *hpage = NULL;
> > return SCAN_CGROUP_CHARGE_FAIL;
> > }
> > +
> > + *hpage = folio_page(folio, 0);
> > count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
>
> count_memcg_folio_events() and kill count_memcg_page_event?
> >
> > return SCAN_SUCCEED;

Thanks, I didn't notice that was the last caller.
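
Presumably that ends up looking something like this in
alloc_charge_hpage() (untested, and assuming count_memcg_folio_events()
takes a (folio, event, count) triple like count_memcg_events() does):

	*hpage = folio_page(folio, 0);
	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);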

2023-10-18 17:37:37

by Vishal Moola

Subject: Re: [PATCH 2/5] mm/khugepaged: Convert hpage_collapse_scan_pmd() to use folios

On Tue, Oct 17, 2023 at 1:41 PM Yang Shi <[email protected]> wrote:
>
> On Mon, Oct 16, 2023 at 1:06 PM Vishal Moola (Oracle)
> <[email protected]> wrote:
> >
> > Replaces 5 calls to compound_head(), and removes 1466 bytes of kernel
> > text.
> >
> > Previously, to determine if any pte was shared, the page mapcount
> > corresponding exactly to the pte was checked. This gave us a precise
> > number of shared ptes. Using folio_estimated_sharers() instead uses
> > the mapcount of the head page, giving us an estimate for tail page ptes.
> >
> > This means if a tail page's mapcount is greater than its head page's
> > mapcount, folio_estimated_sharers() would be underestimating the number of
> > shared ptes, and vice versa.
> >
> > Signed-off-by: Vishal Moola (Oracle) <[email protected]>
> > ---
> > mm/khugepaged.c | 26 ++++++++++++--------------
> > 1 file changed, 12 insertions(+), 14 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 7a552fe16c92..67aac53b31c8 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1245,7 +1245,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > pte_t *pte, *_pte;
> > int result = SCAN_FAIL, referenced = 0;
> > int none_or_zero = 0, shared = 0;
> > - struct page *page = NULL;
> > + struct folio *folio = NULL;
> > unsigned long _address;
> > spinlock_t *ptl;
> > int node = NUMA_NO_NODE, unmapped = 0;
> > @@ -1316,13 +1316,13 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > if (pte_write(pteval))
> > writable = true;
> >
> > - page = vm_normal_page(vma, _address, pteval);
> > - if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> > + folio = vm_normal_folio(vma, _address, pteval);
> > + if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
> > result = SCAN_PAGE_NULL;
> > goto out_unmap;
> > }
> >
> > - if (page_mapcount(page) > 1) {
> > + if (folio_estimated_sharers(folio) > 1) {
>
> This doesn't look correct. max_ptes_shared is used to cap the number
> of shared PTEs. IIRC, folio_estimated_sharers() just reads the
> mapcount of the head page. If we set max_ptes_shared to 256 and just
> the head page is shared, "shared" will end up at 512 and prevent the
> area from being collapsed even though just one PTE is actually shared.
> This breaks the semantics of max_ptes_shared.

In my testing, this replacement appears to do the opposite
(underestimating rather than overestimating), which admittedly still
breaks the semantics of max_ptes_shared. This seems to happen quite
frequently in regular use cases, so I'll go back to checking each
individual subpage's mapcount in v2.

> > ++shared;
> > if (cc->is_khugepaged &&
> > shared > khugepaged_max_ptes_shared) {
> > @@ -1332,29 +1332,27 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > }
> > }
> >
> > - page = compound_head(page);
> > -
> > /*
> > * Record which node the original page is from and save this
> > * information to cc->node_load[].
> > * Khugepaged will allocate hugepage from the node has the max
> > * hit record.
> > */
> > - node = page_to_nid(page);
> > + node = folio_nid(folio);
> > if (hpage_collapse_scan_abort(node, cc)) {
> > result = SCAN_SCAN_ABORT;
> > goto out_unmap;
> > }
> > cc->node_load[node]++;
> > - if (!PageLRU(page)) {
> > + if (!folio_test_lru(folio)) {
> > result = SCAN_PAGE_LRU;
> > goto out_unmap;
> > }
> > - if (PageLocked(page)) {
> > + if (folio_test_locked(folio)) {
> > result = SCAN_PAGE_LOCK;
> > goto out_unmap;
> > }
> > - if (!PageAnon(page)) {
> > + if (!folio_test_anon(folio)) {
> > result = SCAN_PAGE_ANON;
> > goto out_unmap;
> > }
> > @@ -1369,7 +1367,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > * has excessive GUP pins (i.e. 512). Anyway the same check
> > * will be done again later the risk seems low.
> > */
> > - if (!is_refcount_suitable(page)) {
> > + if (!is_refcount_suitable(&folio->page)) {
> > result = SCAN_PAGE_COUNT;
> > goto out_unmap;
> > }
> > @@ -1379,8 +1377,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > * enough young pte to justify collapsing the page
> > */
> > if (cc->is_khugepaged &&
> > - (pte_young(pteval) || page_is_young(page) ||
> > - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
> > + (pte_young(pteval) || folio_test_young(folio) ||
> > + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
> > address)))
> > referenced++;
> > }
> > @@ -1402,7 +1400,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > *mmap_locked = false;
> > }
> > out:
> > - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> > + trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
> > none_or_zero, result, unmapped);
> > return result;
> > }
> > --
> > 2.40.1
> >