This patch series performs swapin readahead for up to a
certain number of swapped-out pages to improve THP performance,
and adds tracepoints for khugepaged_scan_pmd, collapse_huge_page
and __collapse_huge_page_isolate.
This patch series was written to deal with programs
that access most, but not all, of their memory after
they get swapped out. Currently these programs do not
get their memory collapsed into THPs after the system
swapped their memory out, while they would get THPs
before swapping happened.
This patch series was tested with a test program that
allocates 800MB of memory, writes to it, and then
sleeps. I force the system to swap out all of it.
Afterwards, the test program touches the area by
writing, leaving a piece of it untouched.
This shows how much swapin readahead is done by the
patch.
Test results:
After swapped out
-------------------------------------------------------------------
| Anonymous | AnonHugePages | Swap | Fraction |
-------------------------------------------------------------------
With patch | 267128 kB | 266240 kB | 532876 kB | %99 |
-------------------------------------------------------------------
Without patch | 238160 kB | 235520 kB | 561844 kB | %98 |
-------------------------------------------------------------------
After swapped in
-------------------------------------------------------------------
| Anonymous | AnonHugePages | Swap | Fraction |
-------------------------------------------------------------------
With patch | 533876 kB | 530432 kB | 266128 kB | %99 |
-------------------------------------------------------------------
Without patch | 499956 kB | 235520 kB | 300048 kB | %47 |
-------------------------------------------------------------------
Ebru Akagunduz (3):
mm: add tracepoint for scanning pages
mm: make optimistic check for swapin readahead
mm: make swapin readahead to improve thp collapse rate
include/linux/mm.h | 23 +++++
include/trace/events/huge_memory.h | 127 ++++++++++++++++++++++++++++
mm/huge_memory.c | 168 ++++++++++++++++++++++++++++++-------
mm/memory.c | 2 +-
4 files changed, 288 insertions(+), 32 deletions(-)
create mode 100644 include/trace/events/huge_memory.h
--
1.9.1
Static tracepoints record data about the functions they
instrument. They are useful for automating debugging without
making a lot of changes to the source code.
This patch adds tracepoints for khugepaged_scan_pmd,
collapse_huge_page and __collapse_huge_page_isolate.
Signed-off-by: Ebru Akagunduz <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
Acked-by: Rik van Riel <[email protected]>
---
Changes in v2:
- Nothing changed
Changes in v3:
- Print page address instead of vm_start (Vlastimil Babka)
- Define constants to specify exact tracepoint result (Vlastimil Babka)
include/linux/mm.h | 18 ++++++
include/trace/events/huge_memory.h | 100 ++++++++++++++++++++++++++++++++
mm/huge_memory.c | 114 +++++++++++++++++++++++++++----------
3 files changed, 203 insertions(+), 29 deletions(-)
create mode 100644 include/trace/events/huge_memory.h
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7f47178..bf341c0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -21,6 +21,24 @@
#include <linux/resource.h>
#include <linux/page_ext.h>
+#define MM_PMD_NULL 0
+#define MM_EXCEED_NONE_PTE 3
+#define MM_PTE_NON_PRESENT 4
+#define MM_PAGE_NULL 5
+#define MM_SCAN_ABORT 6
+#define MM_PAGE_COUNT 7
+#define MM_PAGE_LRU 8
+#define MM_ANY_PROCESS 0
+#define MM_VMA_NULL 2
+#define MM_VMA_CHECK 3
+#define MM_ADDRESS_RANGE 4
+#define MM_PAGE_LOCK 2
+#define MM_SWAP_CACHE_PAGE 6
+#define MM_ISOLATE_LRU_PAGE 7
+#define MM_ALLOC_HUGE_PAGE_FAIL 6
+#define MM_CGROUP_CHARGE_FAIL 7
+#define MM_COLLAPSE_ISOLATE_FAIL 5
+
struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
new file mode 100644
index 0000000..cbc56fc
--- /dev/null
+++ b/include/trace/events/huge_memory.h
@@ -0,0 +1,100 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM huge_memory
+
+#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HUGE_MEMORY_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mm_khugepaged_scan_pmd,
+
+ TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
+ bool referenced, int none_or_zero, int ret),
+
+ TP_ARGS(mm, page, writable, referenced, none_or_zero, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(struct page *, page)
+ __field(bool, writable)
+ __field(bool, referenced)
+ __field(int, none_or_zero)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->page = page;
+ __entry->writable = writable;
+ __entry->referenced = referenced;
+ __entry->none_or_zero = none_or_zero;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, page=%p, writable=%d, referenced=%d, none_or_zero=%d, ret=%d",
+ __entry->mm,
+ __entry->page,
+ __entry->writable,
+ __entry->referenced,
+ __entry->none_or_zero,
+ __entry->ret)
+);
+
+TRACE_EVENT(mm_collapse_huge_page,
+
+ TP_PROTO(struct mm_struct *mm, int isolated, int ret),
+
+ TP_ARGS(mm, isolated, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, isolated)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->isolated = isolated;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, isolated=%d, ret=%d",
+ __entry->mm,
+ __entry->isolated,
+ __entry->ret)
+);
+
+TRACE_EVENT(mm_collapse_huge_page_isolate,
+
+ TP_PROTO(struct page *page, int none_or_zero,
+ bool referenced, bool writable, int ret),
+
+ TP_ARGS(page, none_or_zero, referenced, writable, ret),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page)
+ __field(int, none_or_zero)
+ __field(bool, referenced)
+ __field(bool, writable)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->page = page;
+ __entry->none_or_zero = none_or_zero;
+ __entry->referenced = referenced;
+ __entry->writable = writable;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("page=%p, none_or_zero=%d, referenced=%d, writable=%d, ret=%d",
+ __entry->page,
+ __entry->none_or_zero,
+ __entry->referenced,
+ __entry->writable,
+ __entry->ret)
+);
+
+#endif /* __HUGE_MEMORY_H */
+#include <trace/define_trace.h>
+
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9671f51..595edd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -29,6 +29,9 @@
#include <asm/pgalloc.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -2190,25 +2193,32 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
- struct page *page;
+ struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0;
+ int none_or_zero = 0, ret = 0;
bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ ret = MM_EXCEED_NONE_PTE;
goto out;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ ret = MM_PTE_NON_PRESENT;
goto out;
+ }
+
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ ret = MM_PAGE_NULL;
goto out;
+ }
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2220,8 +2230,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ ret = MM_PAGE_LOCK;
goto out;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
@@ -2230,6 +2242,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
+ ret = MM_PAGE_COUNT;
goto out;
}
if (pte_write(pteval)) {
@@ -2237,6 +2250,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
+ ret = MM_SWAP_CACHE_PAGE;
goto out;
}
/*
@@ -2251,6 +2265,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (isolate_lru_page(page)) {
unlock_page(page);
+ ret = MM_ISOLATE_LRU_PAGE;
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -2263,11 +2278,16 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (likely(referenced && writable))
+ if (likely(referenced && writable)) {
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, ret);
return 1;
+ }
out:
release_pte_pages(pte, _pte);
- return 0;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, ret);
+ return ret;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -2501,7 +2521,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated;
+ int isolated = 0, ret = 1;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2516,12 +2536,18 @@ static void collapse_huge_page(struct mm_struct *mm,
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
- if (!new_page)
+ if (!new_page) {
+ ret = MM_ALLOC_HUGE_PAGE_FAIL;
+ trace_mm_collapse_huge_page(mm, isolated, ret);
return;
+ }
if (unlikely(mem_cgroup_try_charge(new_page, mm,
- gfp, &memcg)))
+ gfp, &memcg))) {
+ ret = MM_CGROUP_CHARGE_FAIL;
+ trace_mm_collapse_huge_page(mm, isolated, ret);
return;
+ }
/*
* Prevent all access to pagetables with the exception of
@@ -2529,21 +2555,31 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(khugepaged_test_exit(mm))) {
+ ret = MM_ANY_PROCESS;
goto out;
+ }
vma = find_vma(mm, address);
- if (!vma)
+ if (!vma) {
+ ret = MM_VMA_NULL;
goto out;
+ }
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+ ret = MM_ADDRESS_RANGE;
goto out;
- if (!hugepage_vma_check(vma))
+ }
+ if (!hugepage_vma_check(vma)) {
+ ret = MM_VMA_CHECK;
goto out;
+ }
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ ret = MM_PMD_NULL;
goto out;
+ }
anon_vma_lock_write(vma->anon_vma);
@@ -2568,7 +2604,7 @@ static void collapse_huge_page(struct mm_struct *mm,
isolated = __collapse_huge_page_isolate(vma, address, pte);
spin_unlock(pte_ptl);
- if (unlikely(!isolated)) {
+ if (unlikely(isolated != 1)) {
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
@@ -2580,6 +2616,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ ret = MM_COLLAPSE_ISOLATE_FAIL;
goto out;
}
@@ -2619,6 +2656,7 @@ static void collapse_huge_page(struct mm_struct *mm,
khugepaged_pages_collapsed++;
out_up_write:
up_write(&mm->mmap_sem);
+ trace_mm_collapse_huge_page(mm, isolated, ret);
return;
out:
@@ -2634,7 +2672,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pmd_t *pmd;
pte_t *pte, *_pte;
int ret = 0, none_or_zero = 0;
- struct page *page;
+ struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE;
@@ -2643,8 +2681,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ ret = MM_PMD_NULL;
goto out;
+ }
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2653,19 +2693,26 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ ret = MM_EXCEED_NONE_PTE;
goto out_unmap;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ ret = MM_PTE_NON_PRESENT;
goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ ret = MM_PAGE_NULL;
goto out_unmap;
+ }
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -2673,33 +2720,42 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node))
+ if (khugepaged_scan_abort(node)) {
+ ret = MM_SCAN_ABORT;
goto out_unmap;
+ }
khugepaged_node_load[node]++;
VM_BUG_ON_PAGE(PageCompound(page), page);
- if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) {
+ ret = MM_PAGE_LRU;
goto out_unmap;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page))
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ ret = MM_PAGE_COUNT;
goto out_unmap;
+ }
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
+ /* only 1 for scan succeed case */
if (referenced && writable)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
+ if (ret == 1) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
+ trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ none_or_zero, ret);
return ret;
}
@@ -2795,7 +2851,7 @@ skip:
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
- if (ret)
+ if (ret == 1)
/* we released mmap_sem so break loop */
goto breakouterloop_mmap_sem;
if (progress >= pages)
--
1.9.1
This patch makes an optimistic check for swapin readahead
to increase the THP collapse rate. Before swapped-out pages
are brought back into memory, khugepaged counts the swap ptes
and allows the collapse only up to a certain number of them.
It also reports the number of unmapped ptes via the tracepoint.
Signed-off-by: Ebru Akagunduz <[email protected]>
---
Changes in v2:
- Nothing changed
Changes in v3:
- Define constant to specify exact tracepoint result (Vlastimil Babka)
include/linux/mm.h | 1 +
include/trace/events/huge_memory.h | 11 +++++++----
mm/huge_memory.c | 15 ++++++++++++---
3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf341c0..eacf348 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -38,6 +38,7 @@
#define MM_ALLOC_HUGE_PAGE_FAIL 6
#define MM_CGROUP_CHARGE_FAIL 7
#define MM_COLLAPSE_ISOLATE_FAIL 5
+#define MM_EXCEED_SWAP_PTE 2
struct mempolicy;
struct anon_vma;
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index cbc56fc..b6bdcc4 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -9,9 +9,9 @@
TRACE_EVENT(mm_khugepaged_scan_pmd,
TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
- bool referenced, int none_or_zero, int ret),
+ bool referenced, int none_or_zero, int ret, int unmapped),
- TP_ARGS(mm, page, writable, referenced, none_or_zero, ret),
+ TP_ARGS(mm, page, writable, referenced, none_or_zero, ret, unmapped),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
@@ -20,6 +20,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
__field(bool, referenced)
__field(int, none_or_zero)
__field(int, ret)
+ __field(int, unmapped)
),
TP_fast_assign(
@@ -29,15 +30,17 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
__entry->referenced = referenced;
__entry->none_or_zero = none_or_zero;
__entry->ret = ret;
+ __entry->unmapped = unmapped;
),
- TP_printk("mm=%p, page=%p, writable=%d, referenced=%d, none_or_zero=%d, ret=%d",
+ TP_printk("mm=%p, page=%p, writable=%d, referenced=%d, none_or_zero=%d, ret=%d, unmapped=%d",
__entry->mm,
__entry->page,
__entry->writable,
__entry->referenced,
__entry->none_or_zero,
- __entry->ret)
+ __entry->ret,
+ __entry->unmapped)
);
TRACE_EVENT(mm_collapse_huge_page,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 595edd9..b4cef9d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -24,6 +24,7 @@
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
+#include <linux/swapops.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -2671,11 +2672,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0;
+ int ret = 0, none_or_zero = 0, unmapped = 0;
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = NUMA_NO_NODE, max_ptes_swap = HPAGE_PMD_NR/8;
bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2691,6 +2692,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
+ if (is_swap_pte(pteval)) {
+ if (++unmapped <= max_ptes_swap) {
+ continue;
+ } else {
+ ret = MM_EXCEED_SWAP_PTE;
+ goto out_unmap;
+ }
+ }
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
++none_or_zero <= khugepaged_max_ptes_none) {
@@ -2755,7 +2764,7 @@ out_unmap:
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
- none_or_zero, ret);
+ none_or_zero, ret, unmapped);
return ret;
}
--
1.9.1
This patch performs swapin readahead to improve the THP collapse rate.
When khugepaged scans a range, a few of its pages may be
in the swap area.
With the patch, khugepaged can collapse 4kB pages into a THP when
there are up to max_ptes_swap swap ptes in a 2MB range.
The patch was tested with a test program that allocates
800MB of memory, writes to it, and then sleeps. I force
the system to swap out all of it. Afterwards, the test program
touches the area by writing, skipping one page in every
20 pages of the area.
Without the patch, the system did not do swapin readahead.
The THP rate was %47 of the program's memory, and it
did not change over time.
With this patch, after 10 minutes of waiting khugepaged had
collapsed %99 of the program's memory.
Signed-off-by: Ebru Akagunduz <[email protected]>
Acked-by: Rik van Riel <[email protected]>
---
Changes in v2:
- Use FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT flag
instead of 0x0 when called do_swap_page from
__collapse_huge_page_swapin (Rik van Riel)
Changes in v3:
- Catch VM_FAULT_HWPOISON and VM_FAULT_OOM return cases
in __collapse_huge_page_swapin (Kirill A. Shutemov)
Test results:
After swapped out
-------------------------------------------------------------------
| Anonymous | AnonHugePages | Swap | Fraction |
-------------------------------------------------------------------
With patch | 267128 kB | 266240 kB | 532876 kB | %99 |
-------------------------------------------------------------------
Without patch | 238160 kB | 235520 kB | 561844 kB | %98 |
-------------------------------------------------------------------
After swapped in
-------------------------------------------------------------------
| Anonymous | AnonHugePages | Swap | Fraction |
-------------------------------------------------------------------
With patch | 533876 kB | 530432 kB | 266128 kB | %99 |
-------------------------------------------------------------------
Without patch | 499956 kB | 235520 kB | 300048 kB | %47 |
-------------------------------------------------------------------
include/linux/mm.h | 4 ++++
include/trace/events/huge_memory.h | 24 ++++++++++++++++++++++
mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++++++++++
mm/memory.c | 2 +-
4 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index eacf348..603f3ba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -40,6 +40,10 @@
#define MM_COLLAPSE_ISOLATE_FAIL 5
#define MM_EXCEED_SWAP_PTE 2
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte);
+
struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index b6bdcc4..8d34086 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -98,6 +98,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__entry->ret)
);
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+ TP_PROTO(struct mm_struct *mm, int swap_pte, int ret),
+
+ TP_ARGS(mm, swap_pte, ret),
+
+ TP_STRUCT__entry(
+ __field(struct mm_struct *, mm)
+ __field(int, swap_pte)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->mm = mm;
+ __entry->swap_pte = swap_pte;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("mm=%p, swap_pte=%d, ret=%d",
+ __entry->mm,
+ __entry->swap_pte,
+ __entry->ret)
+);
+
#endif /* __HUGE_MEMORY_H */
#include <trace/define_trace.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b4cef9d..b372b40 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2511,6 +2511,45 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return true;
}
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pte_t *pte)
+{
+ unsigned long _address;
+ pte_t pteval = *pte;
+ int swap_pte = 0, ret = 0;
+
+ pte = pte_offset_map(pmd, address);
+ for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ pte++, _address += PAGE_SIZE) {
+ pteval = *pte;
+ if (is_swap_pte(pteval)) {
+ swap_pte++;
+ ret = do_swap_page(mm, vma, _address, pte, pmd,
+ FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+ pteval);
+ if (ret == VM_FAULT_HWPOISON || ret == VM_FAULT_OOM) {
+ trace_mm_collapse_huge_page_swapin(mm, vma->vm_start, swap_pte, 0);
+ return;
+ }
+ /* pte is unmapped now, we need to map it */
+ pte = pte_offset_map(pmd, _address);
+ }
+ }
+ pte--;
+ pte_unmap(pte);
+ trace_mm_collapse_huge_page_swapin(mm, swap_pte, 1);
+}
+
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
@@ -2584,6 +2623,8 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);
+ __collapse_huge_page_swapin(mm, vma, address, pmd, pte);
+
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
diff --git a/mm/memory.c b/mm/memory.c
index 67afe75..eec23a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2443,7 +2443,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
--
1.9.1
On Mon, Jul 13, 2015 at 11:28:02PM +0300, Ebru Akagunduz wrote:
> Using static tracepoints, data of functions is recorded.
> It is good to automatize debugging without doing a lot
> of changes in the source code.
>
> This patch adds tracepoint for khugepaged_scan_pmd,
> collapse_huge_page and __collapse_huge_page_isolate.
>
> Signed-off-by: Ebru Akagunduz <[email protected]>
> Acked-by: Kirill A. Shutemov <[email protected]>
> Acked-by: Rik van Riel <[email protected]>
> ---
> Changes in v2:
> - Nothing changed
>
> Changes in v3:
> - Print page address instead of vm_start (Vlastimil Babka)
> - Define constants to specify exact tracepoint result (Vlastimil Babka)
>
>
> include/linux/mm.h | 18 ++++++
> include/trace/events/huge_memory.h | 100 ++++++++++++++++++++++++++++++++
> mm/huge_memory.c | 114 +++++++++++++++++++++++++++----------
> 3 files changed, 203 insertions(+), 29 deletions(-)
> create mode 100644 include/trace/events/huge_memory.h
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7f47178..bf341c0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -21,6 +21,24 @@
> #include <linux/resource.h>
> #include <linux/page_ext.h>
>
> +#define MM_PMD_NULL 0
> +#define MM_EXCEED_NONE_PTE 3
> +#define MM_PTE_NON_PRESENT 4
> +#define MM_PAGE_NULL 5
> +#define MM_SCAN_ABORT 6
> +#define MM_PAGE_COUNT 7
> +#define MM_PAGE_LRU 8
> +#define MM_ANY_PROCESS 0
> +#define MM_VMA_NULL 2
> +#define MM_VMA_CHECK 3
> +#define MM_ADDRESS_RANGE 4
> +#define MM_PAGE_LOCK 2
> +#define MM_SWAP_CACHE_PAGE 6
> +#define MM_ISOLATE_LRU_PAGE 7
> +#define MM_ALLOC_HUGE_PAGE_FAIL 6
> +#define MM_CGROUP_CHARGE_FAIL 7
> +#define MM_COLLAPSE_ISOLATE_FAIL 5
> +
These magic numbers look very random. What's the logic behind them?
--
Kirill A. Shutemov
On Mon, Jul 13, 2015 at 11:28:03PM +0300, Ebru Akagunduz wrote:
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 595edd9..b4cef9d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -24,6 +24,7 @@
> #include <linux/migrate.h>
> #include <linux/hashtable.h>
> #include <linux/userfaultfd_k.h>
> +#include <linux/swapops.h>
>
> #include <asm/tlb.h>
> #include <asm/pgalloc.h>
> @@ -2671,11 +2672,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> {
> pmd_t *pmd;
> pte_t *pte, *_pte;
> - int ret = 0, none_or_zero = 0;
> + int ret = 0, none_or_zero = 0, unmapped = 0;
> struct page *page = NULL;
> unsigned long _address;
> spinlock_t *ptl;
> - int node = NUMA_NO_NODE;
> + int node = NUMA_NO_NODE, max_ptes_swap = HPAGE_PMD_NR/8;
So, you've decided to ignore the knob request for max_ptes_swap.
Why?
--
Kirill A. Shutemov
On Mon, Jul 13, 2015 at 11:28:04PM +0300, Ebru Akagunduz wrote:
> This patch makes swapin readahead to improve thp collapse rate.
> When khugepaged scanned pages, there can be a few of the pages
> in swap area.
>
> With the patch THP can collapse 4kB pages into a THP when
> there are up to max_ptes_swap swap ptes in a 2MB range.
>
> The patch was tested with a test program that allocates
> 800MB of memory, writes to it, and then sleeps. I force
> the system to swap out all. Afterwards, the test program
> touches the area by writing, it skips a page in each
> 20 pages of the area.
>
> Without the patch, system did not swap in readahead.
> THP rate was %47 of the program of the memory, it
> did not change over time.
>
> With this patch, after 10 minutes of waiting khugepaged had
> collapsed %99 of the program's memory.
>
> Signed-off-by: Ebru Akagunduz <[email protected]>
> Acked-by: Rik van Riel <[email protected]>
> ---
> Changes in v2:
> - Use FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT flag
> instead of 0x0 when called do_swap_page from
> __collapse_huge_page_swapin (Rik van Riel)
>
> Changes in v3:
> - Catch VM_FAULT_HWPOISON and VM_FAULT_OOM return cases
> in __collapse_huge_page_swapin (Kirill A. Shutemov)
>
> Test results:
>
> After swapped out
> -------------------------------------------------------------------
> | Anonymous | AnonHugePages | Swap | Fraction |
> -------------------------------------------------------------------
> With patch | 267128 kB | 266240 kB | 532876 kB | %99 |
> -------------------------------------------------------------------
> Without patch | 238160 kB | 235520 kB | 561844 kB | %98 |
> -------------------------------------------------------------------
>
> After swapped in
> -------------------------------------------------------------------
> | Anonymous | AnonHugePages | Swap | Fraction |
> -------------------------------------------------------------------
> With patch | 533876 kB | 530432 kB | 266128 kB | %99 |
> -------------------------------------------------------------------
> Without patch | 499956 kB | 235520 kB | 300048 kB | %47 |
> -------------------------------------------------------------------
>
> include/linux/mm.h | 4 ++++
> include/trace/events/huge_memory.h | 24 ++++++++++++++++++++++
> mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++++++++++
> mm/memory.c | 2 +-
> 4 files changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index eacf348..603f3ba 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -40,6 +40,10 @@
> #define MM_COLLAPSE_ISOLATE_FAIL 5
> #define MM_EXCEED_SWAP_PTE 2
>
> +extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address, pte_t *page_table, pmd_t *pmd,
> + unsigned int flags, pte_t orig_pte);
> +
> struct mempolicy;
> struct anon_vma;
> struct anon_vma_chain;
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index b6bdcc4..8d34086 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -98,6 +98,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
> __entry->ret)
> );
>
> +TRACE_EVENT(mm_collapse_huge_page_swapin,
> +
> + TP_PROTO(struct mm_struct *mm, int swap_pte, int ret),
> +
> + TP_ARGS(mm, swap_pte, ret),
> +
> + TP_STRUCT__entry(
> + __field(struct mm_struct *, mm)
> + __field(int, swap_pte)
> + __field(int, ret)
> + ),
> +
> + TP_fast_assign(
> + __entry->mm = mm;
> + __entry->swap_pte = swap_pte;
> + __entry->ret = ret;
> + ),
> +
> + TP_printk("mm=%p, swap_pte=%d, ret=%d",
> + __entry->mm,
> + __entry->swap_pte,
> + __entry->ret)
> +);
> +
> #endif /* __HUGE_MEMORY_H */
> #include <trace/define_trace.h>
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index b4cef9d..b372b40 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2511,6 +2511,45 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
> return true;
> }
>
> +/*
> + * Bring missing pages in from swap, to complete THP collapse.
> + * Only done if khugepaged_scan_pmd believes it is worthwhile.
> + *
> + * Called and returns without pte mapped or spinlocks held,
> + * but with mmap_sem held to protect against vma changes.
> + */
> +
> +static void __collapse_huge_page_swapin(struct mm_struct *mm,
> + struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmd,
> + pte_t *pte)
> +{
> + unsigned long _address;
> + pte_t pteval = *pte;
> + int swap_pte = 0, ret = 0;
> +
> + pte = pte_offset_map(pmd, address);
> + for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
> + pte++, _address += PAGE_SIZE) {
> + pteval = *pte;
> + if (is_swap_pte(pteval)) {
> + swap_pte++;
> + ret = do_swap_page(mm, vma, _address, pte, pmd,
> + FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> + pteval);
Indentation looks broken.
You can reclaim some horizontal space if you invert the check above:
if (!is_swap_pte(pteval))
continue;
> + if (ret == VM_FAULT_HWPOISON || ret == VM_FAULT_OOM) {
No, this is wrong. ret is a bitmask and more than one bit can be set.
The right way would be:
if (ret & VM_FAULT_ERROR) {
> + trace_mm_collapse_huge_page_swapin(mm, vma->vm_start, swap_pte, 0);
> + return;
> + }
> + /* pte is unmapped now, we need to map it */
> + pte = pte_offset_map(pmd, _address);
> + }
> + }
> + pte--;
> + pte_unmap(pte);
> + trace_mm_collapse_huge_page_swapin(mm, swap_pte, 1);
> +}
> +
> static void collapse_huge_page(struct mm_struct *mm,
> unsigned long address,
> struct page **hpage,
> @@ -2584,6 +2623,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>
> anon_vma_lock_write(vma->anon_vma);
>
> + __collapse_huge_page_swapin(mm, vma, address, pmd, pte);
> +
> pte = pte_offset_map(pmd, address);
> pte_ptl = pte_lockptr(mm, pmd);
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 67afe75..eec23a2 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2443,7 +2443,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
> * We return with the mmap_sem locked or unlocked in the same cases
> * as does filemap_fault().
> */
> -static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> +int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, pte_t *page_table, pmd_t *pmd,
> unsigned int flags, pte_t orig_pte)
> {
> --
> 1.9.1
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
--
Kirill A. Shutemov
On Tue, Jul 14, 2015 at 12:07:27AM +0300, Kirill A. Shutemov wrote:
> On Mon, Jul 13, 2015 at 11:28:03PM +0300, Ebru Akagunduz wrote:
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 595edd9..b4cef9d 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -24,6 +24,7 @@
> > #include <linux/migrate.h>
> > #include <linux/hashtable.h>
> > #include <linux/userfaultfd_k.h>
> > +#include <linux/swapops.h>
> >
> > #include <asm/tlb.h>
> > #include <asm/pgalloc.h>
> > @@ -2671,11 +2672,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> > {
> > pmd_t *pmd;
> > pte_t *pte, *_pte;
> > - int ret = 0, none_or_zero = 0;
> > + int ret = 0, none_or_zero = 0, unmapped = 0;
> > struct page *page = NULL;
> > unsigned long _address;
> > spinlock_t *ptl;
> > - int node = NUMA_NO_NODE;
> > + int node = NUMA_NO_NODE, max_ptes_swap = HPAGE_PMD_NR/8;
>
> So, you've decide to ignore knob request for max_ptes_swap.
> Why?
I did not know about the sysfs knob at the time of your first comment on v2.
I thought you meant something else, so I did not ask for a sysfs knob.
I will add it to the commit message in v4.
kind regards,
Ebru
On Tue, Jul 14, 2015 at 12:06:46AM +0300, Kirill A. Shutemov wrote:
> On Mon, Jul 13, 2015 at 11:28:02PM +0300, Ebru Akagunduz wrote:
> > Using static tracepoints, data of functions is recorded.
> > It is good to automatize debugging without doing a lot
> > of changes in the source code.
> >
> > This patch adds tracepoint for khugepaged_scan_pmd,
> > collapse_huge_page and __collapse_huge_page_isolate.
> >
> > Signed-off-by: Ebru Akagunduz <[email protected]>
> > Acked-by: Kirill A. Shutemov <[email protected]>
> > Acked-by: Rik van Riel <[email protected]>
> > ---
> > Changes in v2:
> > - Nothing changed
> >
> > Changes in v3:
> > - Print page address instead of vm_start (Vlastimil Babka)
> > - Define constants to specify exact tracepoint result (Vlastimil Babka)
> >
> >
> > include/linux/mm.h | 18 ++++++
> > include/trace/events/huge_memory.h | 100 ++++++++++++++++++++++++++++++++
> > mm/huge_memory.c | 114 +++++++++++++++++++++++++++----------
> > 3 files changed, 203 insertions(+), 29 deletions(-)
> > create mode 100644 include/trace/events/huge_memory.h
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 7f47178..bf341c0 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -21,6 +21,24 @@
> > #include <linux/resource.h>
> > #include <linux/page_ext.h>
> >
> > +#define MM_PMD_NULL 0
> > +#define MM_EXCEED_NONE_PTE 3
> > +#define MM_PTE_NON_PRESENT 4
> > +#define MM_PAGE_NULL 5
> > +#define MM_SCAN_ABORT 6
> > +#define MM_PAGE_COUNT 7
> > +#define MM_PAGE_LRU 8
> > +#define MM_ANY_PROCESS 0
> > +#define MM_VMA_NULL 2
> > +#define MM_VMA_CHECK 3
> > +#define MM_ADDRESS_RANGE 4
> > +#define MM_PAGE_LOCK 2
> > +#define MM_SWAP_CACHE_PAGE 6
> > +#define MM_ISOLATE_LRU_PAGE 7
> > +#define MM_ALLOC_HUGE_PAGE_FAIL 6
> > +#define MM_CGROUP_CHARGE_FAIL 7
> > +#define MM_COLLAPSE_ISOLATE_FAIL 5
> > +
>
> These magic numbers looks very random. What's logic behind?
>
I defined them to specify the reason for every success and failure case
of the functions that have a tracepoint. Only 1 means success.
All other values mean failure. I gave consecutive numbers as
far as possible, and tried to avoid conflicts between different functions
that can fail for the same reason.
kind regards,
Ebru
On 07/13/2015 10:28 PM, Ebru Akagunduz wrote:
> Using static tracepoints, data of functions is recorded.
> It is good to automatize debugging without doing a lot
> of changes in the source code.
>
> This patch adds tracepoint for khugepaged_scan_pmd,
> collapse_huge_page and __collapse_huge_page_isolate.
>
> Signed-off-by: Ebru Akagunduz <[email protected]>
> Acked-by: Kirill A. Shutemov <[email protected]>
> Acked-by: Rik van Riel <[email protected]>
> ---
> Changes in v2:
> - Nothing changed
>
> Changes in v3:
> - Print page address instead of vm_start (Vlastimil Babka)
> - Define constants to specify exact tracepoint result (Vlastimil Babka)
Hi, and thanks for improving the tracepoints!
>
> include/linux/mm.h | 18 ++++++
> include/trace/events/huge_memory.h | 100 ++++++++++++++++++++++++++++++++
> mm/huge_memory.c | 114 +++++++++++++++++++++++++++----------
> 3 files changed, 203 insertions(+), 29 deletions(-)
> create mode 100644 include/trace/events/huge_memory.h
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7f47178..bf341c0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -21,6 +21,24 @@
> #include <linux/resource.h>
> #include <linux/page_ext.h>
>
> +#define MM_PMD_NULL 0
> +#define MM_EXCEED_NONE_PTE 3
> +#define MM_PTE_NON_PRESENT 4
> +#define MM_PAGE_NULL 5
> +#define MM_SCAN_ABORT 6
> +#define MM_PAGE_COUNT 7
> +#define MM_PAGE_LRU 8
> +#define MM_ANY_PROCESS 0
> +#define MM_VMA_NULL 2
> +#define MM_VMA_CHECK 3
> +#define MM_ADDRESS_RANGE 4
> +#define MM_PAGE_LOCK 2
> +#define MM_SWAP_CACHE_PAGE 6
> +#define MM_ISOLATE_LRU_PAGE 7
> +#define MM_ALLOC_HUGE_PAGE_FAIL 6
> +#define MM_CGROUP_CHARGE_FAIL 7
> +#define MM_COLLAPSE_ISOLATE_FAIL 5
this would better go to mm/huge_memory.c since it's used nowhere else,
so we shouldn't pollute a global header. Also I'd suggest changing the
MM_ prefix to e.g. SCAN_ ?
Reusing the numbers depending on whether they can appear in a single
function is unnecessarily complicated, we don't have to fit in some
small limit here. You could also use an enum to avoid defining each
constant's value manually.
> struct mempolicy;
> struct anon_vma;
> struct anon_vma_chain;
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> new file mode 100644
> index 0000000..cbc56fc
> --- /dev/null
> +++ b/include/trace/events/huge_memory.h
> @@ -0,0 +1,100 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM huge_memory
> +
> +#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define __HUGE_MEMORY_H
> +
> +#include <linux/tracepoint.h>
> +
> +TRACE_EVENT(mm_khugepaged_scan_pmd,
> +
> + TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
> + bool referenced, int none_or_zero, int ret),
> +
> + TP_ARGS(mm, page, writable, referenced, none_or_zero, ret),
> +
> + TP_STRUCT__entry(
> + __field(struct mm_struct *, mm)
> + __field(struct page *, page)
> + __field(bool, writable)
> + __field(bool, referenced)
> + __field(int, none_or_zero)
> + __field(int, ret)
> + ),
> +
> + TP_fast_assign(
> + __entry->mm = mm;
> + __entry->page = page;
> + __entry->writable = writable;
> + __entry->referenced = referenced;
> + __entry->none_or_zero = none_or_zero;
> + __entry->ret = ret;
> + ),
> +
> + TP_printk("mm=%p, page=%p, writable=%d, referenced=%d, none_or_zero=%d, ret=%d",
> + __entry->mm,
> + __entry->page,
Sorry, when I suggested "the address of the page itself" instead of
vm_start, I was thinking physical address (pfn).
Compaction tracepoints recently standardized on this print format so I'd
recommend it here too:
scan_pfn=0x%lx
> + __entry->writable,
> + __entry->referenced,
> + __entry->none_or_zero,
> + __entry->ret)
Instead of printing a number that has to be translated manually, I'd
recommend converting to string. Look at how compaction_status_string is
defined in mm/compaction.c and used from the tracepoints.
[ ... ]
>
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/huge_memory.h>
> +
> /*
> * By default transparent hugepage support is disabled in order that avoid
> * to risk increase the memory footprint of applications without a guaranteed
> @@ -2190,25 +2193,32 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long address,
> pte_t *pte)
> {
> - struct page *page;
> + struct page *page = NULL;
> pte_t *_pte;
> - int none_or_zero = 0;
> + int none_or_zero = 0, ret = 0;
> bool referenced = false, writable = false;
> for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
> _pte++, address += PAGE_SIZE) {
> pte_t pteval = *_pte;
> if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> if (!userfaultfd_armed(vma) &&
> - ++none_or_zero <= khugepaged_max_ptes_none)
> + ++none_or_zero <= khugepaged_max_ptes_none) {
> continue;
> - else
> + } else {
> + ret = MM_EXCEED_NONE_PTE;
> goto out;
> + }
> }
> - if (!pte_present(pteval))
> + if (!pte_present(pteval)) {
> + ret = MM_PTE_NON_PRESENT;
> goto out;
> + }
> +
> page = vm_normal_page(vma, address, pteval);
> - if (unlikely(!page))
> + if (unlikely(!page)) {
> + ret = MM_PAGE_NULL;
> goto out;
> + }
>
> VM_BUG_ON_PAGE(PageCompound(page), page);
> VM_BUG_ON_PAGE(!PageAnon(page), page);
> @@ -2220,8 +2230,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> * is needed to serialize against split_huge_page
> * when invoked from the VM.
> */
> - if (!trylock_page(page))
> + if (!trylock_page(page)) {
> + ret = MM_PAGE_LOCK;
> goto out;
> + }
>
> /*
> * cannot use mapcount: can't collapse if there's a gup pin.
> @@ -2230,6 +2242,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> */
> if (page_count(page) != 1 + !!PageSwapCache(page)) {
> unlock_page(page);
> + ret = MM_PAGE_COUNT;
> goto out;
> }
> if (pte_write(pteval)) {
> @@ -2237,6 +2250,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> } else {
> if (PageSwapCache(page) && !reuse_swap_page(page)) {
> unlock_page(page);
> + ret = MM_SWAP_CACHE_PAGE;
> goto out;
> }
> /*
> @@ -2251,6 +2265,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> */
> if (isolate_lru_page(page)) {
> unlock_page(page);
> + ret = MM_ISOLATE_LRU_PAGE;
> goto out;
> }
> /* 0 stands for page_is_file_cache(page) == false */
> @@ -2263,11 +2278,16 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> mmu_notifier_test_young(vma->vm_mm, address))
> referenced = true;
> }
> - if (likely(referenced && writable))
> + if (likely(referenced && writable)) {
> + trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> + referenced, writable, ret);
> return 1;
> + }
> out:
> release_pte_pages(pte, _pte);
> - return 0;
> + trace_mm_collapse_huge_page_isolate(page, none_or_zero,
> + referenced, writable, ret);
> + return ret;
> }
Having success returned as "1" and failures of either 0 or other
positive values is uncommon and may lead to mistakes. Per
Documentation/CodingStyle Chapter 16, it should be either 0 = success
and any other = error, or 0 = failure, non-zero = success. Here the
first variant would be applicable. Following the chapter strictly, the
function should have been using this variant even before your patch,
since here the "name of a function is an action or an imperative
command" :) but yeah...
Anyway I don't think you need to return the exact error, since the
caller doesn't use it. It's there only for the tracepoint, so you can
simply keep returning just 1 or 0. Then with tracepoints disabled in
.config, the compiler should be also able to eliminate all the
assignments that would be unused in the end.
Same suggestions apply to khugepaged_scan_pmd()
>
> static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> @@ -2501,7 +2521,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> pgtable_t pgtable;
> struct page *new_page;
> spinlock_t *pmd_ptl, *pte_ptl;
> - int isolated;
> + int isolated = 0, ret = 1;
> unsigned long hstart, hend;
> struct mem_cgroup *memcg;
> unsigned long mmun_start; /* For mmu_notifiers */
> @@ -2516,12 +2536,18 @@ static void collapse_huge_page(struct mm_struct *mm,
>
> /* release the mmap_sem read lock. */
> new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
> - if (!new_page)
> + if (!new_page) {
> + ret = MM_ALLOC_HUGE_PAGE_FAIL;
> + trace_mm_collapse_huge_page(mm, isolated, ret);
> return;
> + }
>
> if (unlikely(mem_cgroup_try_charge(new_page, mm,
> - gfp, &memcg)))
> + gfp, &memcg))) {
> + ret = MM_CGROUP_CHARGE_FAIL;
> + trace_mm_collapse_huge_page(mm, isolated, ret);
> return;
> + }
You could add a label called e.g. "out_nolock" right after
"up_write(&mm->mmap_sem);" below, and goto there to avoid the multiple
tracepoints.
>
> /*
> * Prevent all access to pagetables with the exception of
> @@ -2529,21 +2555,31 @@ static void collapse_huge_page(struct mm_struct *mm,
> * handled by the anon_vma lock + PG_lock.
> */
> down_write(&mm->mmap_sem);
> - if (unlikely(khugepaged_test_exit(mm)))
> + if (unlikely(khugepaged_test_exit(mm))) {
> + ret = MM_ANY_PROCESS;
> goto out;
> + }
>
> vma = find_vma(mm, address);
> - if (!vma)
> + if (!vma) {
> + ret = MM_VMA_NULL;
> goto out;
> + }
> hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> hend = vma->vm_end & HPAGE_PMD_MASK;
> - if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> + if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
> + ret = MM_ADDRESS_RANGE;
> goto out;
> - if (!hugepage_vma_check(vma))
> + }
> + if (!hugepage_vma_check(vma)) {
> + ret = MM_VMA_CHECK;
> goto out;
> + }
> pmd = mm_find_pmd(mm, address);
> - if (!pmd)
> + if (!pmd) {
> + ret = MM_PMD_NULL;
> goto out;
> + }
>
> anon_vma_lock_write(vma->anon_vma);
>
> @@ -2568,7 +2604,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> isolated = __collapse_huge_page_isolate(vma, address, pte);
> spin_unlock(pte_ptl);
>
> - if (unlikely(!isolated)) {
> + if (unlikely(isolated != 1)) {
> pte_unmap(pte);
> spin_lock(pmd_ptl);
> BUG_ON(!pmd_none(*pmd));
> @@ -2580,6 +2616,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> spin_unlock(pmd_ptl);
> anon_vma_unlock_write(vma->anon_vma);
> + ret = MM_COLLAPSE_ISOLATE_FAIL;
> goto out;
> }
>
> @@ -2619,6 +2656,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> khugepaged_pages_collapsed++;
> out_up_write:
> up_write(&mm->mmap_sem);
out_nolock:
> + trace_mm_collapse_huge_page(mm, isolated, ret);
> return;
>
> out:
On 07/13/2015 10:28 PM, Ebru Akagunduz wrote:
> This patch makes swapin readahead to improve thp collapse rate.
> When khugepaged scanned pages, there can be a few of the pages
> in swap area.
>
> With the patch THP can collapse 4kB pages into a THP when
> there are up to max_ptes_swap swap ptes in a 2MB range.
>
> The patch was tested with a test program that allocates
> 800MB of memory, writes to it, and then sleeps. I force
> the system to swap out all. Afterwards, the test program
> touches the area by writing, it skips a page in each
> 20 pages of the area.
>
> Without the patch, system did not swap in readahead.
> THP rate was %47 of the program of the memory, it
> did not change over time.
>
> With this patch, after 10 minutes of waiting khugepaged had
> collapsed %99 of the program's memory.
>
> Signed-off-by: Ebru Akagunduz <[email protected]>
> Acked-by: Rik van Riel <[email protected]>
> ---
> Changes in v2:
> - Use FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT flag
> instead of 0x0 when called do_swap_page from
> __collapse_huge_page_swapin (Rik van Riel)
>
> Changes in v3:
> - Catch VM_FAULT_HWPOISON and VM_FAULT_OOM return cases
> in __collapse_huge_page_swapin (Kirill A. Shutemov)
>
> Test results:
>
> After swapped out
> -------------------------------------------------------------------
> | Anonymous | AnonHugePages | Swap | Fraction |
> -------------------------------------------------------------------
> With patch | 267128 kB | 266240 kB | 532876 kB | %99 |
> -------------------------------------------------------------------
> Without patch | 238160 kB | 235520 kB | 561844 kB | %98 |
> -------------------------------------------------------------------
>
> After swapped in
> -------------------------------------------------------------------
> | Anonymous | AnonHugePages | Swap | Fraction |
> -------------------------------------------------------------------
> With patch | 533876 kB | 530432 kB | 266128 kB | %99 |
> -------------------------------------------------------------------
> Without patch | 499956 kB | 235520 kB | 300048 kB | %47 |
> -------------------------------------------------------------------
>
> include/linux/mm.h | 4 ++++
> include/trace/events/huge_memory.h | 24 ++++++++++++++++++++++
> mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++++++++++
> mm/memory.c | 2 +-
> 4 files changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index eacf348..603f3ba 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -40,6 +40,10 @@
> #define MM_COLLAPSE_ISOLATE_FAIL 5
> #define MM_EXCEED_SWAP_PTE 2
>
> +extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address, pte_t *page_table, pmd_t *pmd,
> + unsigned int flags, pte_t orig_pte);
> +
> struct mempolicy;
> struct anon_vma;
> struct anon_vma_chain;
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index b6bdcc4..8d34086 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -98,6 +98,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
> __entry->ret)
> );
>
> +TRACE_EVENT(mm_collapse_huge_page_swapin,
> +
> + TP_PROTO(struct mm_struct *mm, int swap_pte, int ret),
> +
> + TP_ARGS(mm, swap_pte, ret),
> +
> + TP_STRUCT__entry(
> + __field(struct mm_struct *, mm)
> + __field(int, swap_pte)
> + __field(int, ret)
> + ),
> +
> + TP_fast_assign(
> + __entry->mm = mm;
> + __entry->swap_pte = swap_pte;
> + __entry->ret = ret;
> + ),
> +
> + TP_printk("mm=%p, swap_pte=%d, ret=%d",
> + __entry->mm,
> + __entry->swap_pte,
> + __entry->ret)
> +);
swap_pte is a weird name for the number of swapped-in pages, how about
"swapped_in" ?
> +
> #endif /* __HUGE_MEMORY_H */
> #include <trace/define_trace.h>
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index b4cef9d..b372b40 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2511,6 +2511,45 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
> return true;
> }
>
> +/*
> + * Bring missing pages in from swap, to complete THP collapse.
> + * Only done if khugepaged_scan_pmd believes it is worthwhile.
> + *
> + * Called and returns without pte mapped or spinlocks held,
> + * but with mmap_sem held to protect against vma changes.
> + */
> +
> +static void __collapse_huge_page_swapin(struct mm_struct *mm,
> + struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmd,
> + pte_t *pte)
> +{
> + unsigned long _address;
> + pte_t pteval = *pte;
> + int swap_pte = 0, ret = 0;
Same concern about swap_pte name.
> +
> + pte = pte_offset_map(pmd, address);
> + for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
> + pte++, _address += PAGE_SIZE) {
> + pteval = *pte;
> + if (is_swap_pte(pteval)) {
> + swap_pte++;
Move the increment towards the end of the "if" and then it counts what
was successfully swapped in :)
> + ret = do_swap_page(mm, vma, _address, pte, pmd,
> + FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> + pteval);
> + if (ret == VM_FAULT_HWPOISON || ret == VM_FAULT_OOM) {
> + trace_mm_collapse_huge_page_swapin(mm, vma->vm_start, swap_pte, 0);
The vma->vm_start should be removed otherwise this won't compile.
> + return;
> + }
> + /* pte is unmapped now, we need to map it */
> + pte = pte_offset_map(pmd, _address);
> + }
> + }
> + pte--;
> + pte_unmap(pte);
> + trace_mm_collapse_huge_page_swapin(mm, swap_pte, 1);
> +}
> +
> static void collapse_huge_page(struct mm_struct *mm,
> unsigned long address,
> struct page **hpage,
> @@ -2584,6 +2623,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>
> anon_vma_lock_write(vma->anon_vma);
>
> + __collapse_huge_page_swapin(mm, vma, address, pmd, pte);
> +
> pte = pte_offset_map(pmd, address);
> pte_ptl = pte_lockptr(mm, pmd);
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 67afe75..eec23a2 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2443,7 +2443,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
> * We return with the mmap_sem locked or unlocked in the same cases
> * as does filemap_fault().
> */
> -static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> +int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, pte_t *page_table, pmd_t *pmd,
> unsigned int flags, pte_t orig_pte)
> {
>