Convert all the callers of cgroup_throttle_swaprate() to use folios and
call folio_throttle_swaprate() instead, which allows us to remove
cgroup_throttle_swaprate() completely.
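As an illustration, every caller follows the same before/after pattern
(a sketch distilled from the patches below, not a literal hunk; the goto
labels are placeholders):

	/* before: allocate a page, charge and throttle through the page */
	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
		goto oom_free_page;
	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/* after: allocate, charge and throttle the folio directly */
	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
		goto oom_free_folio;
	folio_throttle_swaprate(folio, GFP_KERNEL);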
Kefeng Wang (7):
mm: huge_memory: make __do_huge_pmd_anonymous_page() to take a folio
mm: memory: convert do_anonymous_page() to use a folio
mm: memory: convert do_cow_fault to use folios
mm: memory: convert page_copy_prealloc() to use a folio
mm: memory: convert wp_page_copy() to use folios
mm: memory: use folio_throttle_swaprate() in do_swap_page()
mm: swap: remove unneeded cgroup_throttle_swaprate()
include/linux/swap.h | 12 ++---
mm/huge_memory.c | 23 +++++-----
mm/memory.c | 104 +++++++++++++++++++++++--------------------
mm/swapfile.c | 4 +-
4 files changed, 74 insertions(+), 69 deletions(-)
--
2.35.3
Convert do_cow_fault() to use folios, replacing the page functions with
their folio counterparts.
Signed-off-by: Kefeng Wang <[email protected]>
---
mm/memory.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 1cfdb0fd8d79..f29bca499e0d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4507,22 +4507,24 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ struct folio *cow_folio, *folio;
vm_fault_t ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
- if (!vmf->cow_page)
+ cow_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
+ false);
+ if (!cow_folio)
return VM_FAULT_OOM;
- if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
- GFP_KERNEL)) {
- put_page(vmf->cow_page);
+ if (mem_cgroup_charge(cow_folio, vma->vm_mm, GFP_KERNEL)) {
+ folio_put(cow_folio);
return VM_FAULT_OOM;
}
- cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+ folio_throttle_swaprate(cow_folio, GFP_KERNEL);
+ vmf->cow_page = &cow_folio->page;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
@@ -4530,16 +4532,17 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
return ret;
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
- __SetPageUptodate(vmf->cow_page);
+ __folio_mark_uptodate(cow_folio);
ret |= finish_fault(vmf);
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio = page_folio(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- put_page(vmf->cow_page);
+ folio_put(cow_folio);
return ret;
}
--
2.35.3
Convert do_anonymous_page() to use a folio and replace the related
page functions with their folio counterparts.
Signed-off-by: Kefeng Wang <[email protected]>
---
mm/memory.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 56b571c83a0e..1cfdb0fd8d79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4002,6 +4002,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
+ struct folio *folio;
vm_fault_t ret = 0;
pte_t entry;
@@ -4055,16 +4056,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
+ folio = page_folio(page);
+ if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
- cgroup_throttle_swaprate(page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
entry = mk_pte(page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
@@ -4085,13 +4087,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- put_page(page);
+ folio_put(folio);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_lru_vma(folio, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -4101,10 +4103,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
- put_page(page);
+ folio_put(folio);
goto unlock;
oom_free_page:
- put_page(page);
+ folio_put(folio);
oom:
return VM_FAULT_OOM;
}
--
2.35.3
Convert page_copy_prealloc() to use a folio, replacing the page
functions with their folio counterparts.
Signed-off-by: Kefeng Wang <[email protected]>
---
mm/memory.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index f29bca499e0d..b66c425b4d7c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -958,19 +958,19 @@ static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
unsigned long addr)
{
- struct page *new_page;
+ struct folio *new_folio;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
- if (!new_page)
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+ if (!new_folio)
return NULL;
- if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
- put_page(new_page);
+ if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) {
+ folio_put(new_folio);
return NULL;
}
- cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+ folio_throttle_swaprate(new_folio, GFP_KERNEL);
- return new_page;
+ return &new_folio->page;
}
static int
--
2.35.3
The old_page/new_page are converted to old_folio/new_folio in
wp_page_copy(), and the related page functions are replaced with their
folio counterparts.
Signed-off-by: Kefeng Wang <[email protected]>
---
mm/memory.c | 47 +++++++++++++++++++++++++----------------------
1 file changed, 25 insertions(+), 22 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index b66c425b4d7c..746f485366e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3044,7 +3044,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *old_page = vmf->page;
+ struct folio *old_folio = page_folio(old_page);
struct page *new_page = NULL;
+ struct folio *new_folio = NULL;
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
@@ -3060,12 +3062,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
vmf->address);
if (!new_page)
goto oom;
+ new_folio = page_folio(new_page);
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
- if (!new_page)
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ vmf->address, false);
+ if (!new_folio)
goto oom;
-
+ new_page = &new_folio->page;
ret = __wp_page_copy_user(new_page, old_page, vmf);
if (ret) {
/*
@@ -3075,9 +3078,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* from the second attempt.
* The -EHWPOISON case will not be retried.
*/
- put_page(new_page);
- if (old_page)
- put_page(old_page);
+ folio_put(new_folio);
+ if (old_folio)
+ folio_put(old_folio);
delayacct_wpcopy_end();
return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
@@ -3085,11 +3088,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
kmsan_copy_page_meta(new_page, old_page);
}
- if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
+ if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
goto oom_free_new;
- cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+ folio_throttle_swaprate(new_folio, GFP_KERNEL);
- __SetPageUptodate(new_page);
+ __folio_mark_uptodate(new_folio);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
vmf->address & PAGE_MASK,
@@ -3101,8 +3104,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
- if (old_page) {
- if (!PageAnon(old_page)) {
+ if (old_folio) {
+ if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
@@ -3130,7 +3133,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ folio_add_lru_vma(new_folio, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3139,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
- if (old_page) {
+ if (old_folio) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
@@ -3166,14 +3169,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
/* Free the old page.. */
- new_page = old_page;
+ new_folio = old_folio;
page_copied = 1;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
}
- if (new_page)
- put_page(new_page);
+ if (new_folio)
+ folio_put(new_folio);
pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
@@ -3181,19 +3184,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
- if (old_page) {
+ if (old_folio) {
if (page_copied)
free_swap_cache(old_page);
- put_page(old_page);
+ folio_put(old_folio);
}
delayacct_wpcopy_end();
return 0;
oom_free_new:
- put_page(new_page);
+ folio_put(new_folio);
oom:
- if (old_page)
- put_page(old_page);
+ if (old_folio)
+ folio_put(old_folio);
delayacct_wpcopy_end();
return VM_FAULT_OOM;
--
2.35.3
Directly use folio_throttle_swaprate() instead of cgroup_throttle_swaprate().
Signed-off-by: Kefeng Wang <[email protected]>
---
mm/memory.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memory.c b/mm/memory.c
index 746f485366e8..abc4e606d8bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3839,7 +3839,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
lru_add_drain();
}
- cgroup_throttle_swaprate(page, GFP_KERNEL);
+ folio_throttle_swaprate(folio, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
--
2.35.3
Make __do_huge_pmd_anonymous_page() take a folio and convert the
related functions to use folios.
Signed-off-by: Kefeng Wang <[email protected]>
---
mm/huge_memory.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c13b1f67d14e..cb23b24e2eb8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,22 +650,23 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
- struct page *page, gfp_t gfp)
+ struct folio *folio, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
+ struct page *page = &folio->page;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
- put_page(page);
+ if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
- cgroup_throttle_swaprate(page, gfp);
+ folio_throttle_swaprate(folio, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -675,11 +676,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
* write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
@@ -694,7 +695,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
spin_unlock(vmf->ptl);
- put_page(page);
+ folio_put(folio);
pte_free(vma->vm_mm, pgtable);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
@@ -704,7 +705,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
@@ -721,7 +722,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- put_page(page);
+ folio_put(folio);
return ret;
}
@@ -834,7 +835,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, folio, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
--
2.35.3
All the callers of cgroup_throttle_swaprate() are converted to
folio_throttle_swaprate(), so make __cgroup_throttle_swaprate()
take a folio and drop the now-unused cgroup_throttle_swaprate().
Signed-off-by: Kefeng Wang <[email protected]>
---
include/linux/swap.h | 12 ++++--------
mm/swapfile.c | 4 ++--
2 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 209a425739a9..2674408e6d63 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -641,22 +641,18 @@ extern atomic_t zswap_stored_pages;
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+extern void __cgroup_throttle_swaprate(struct folio *folio, gfp_t gfp_mask);
+static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp_mask)
{
if (mem_cgroup_disabled())
return;
- __cgroup_throttle_swaprate(page, gfp_mask);
+ __cgroup_throttle_swaprate(folio, gfp_mask);
}
#else
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp_mask)
{
}
#endif
-static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
-{
- cgroup_throttle_swaprate(&folio->page, gfp);
-}
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a5729273480e..3abf514c3f28 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3636,10 +3636,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __cgroup_throttle_swaprate(struct folio *folio, gfp_t gfp_mask)
{
struct swap_info_struct *si, *next;
- int nid = page_to_nid(page);
+ int nid = folio_nid(folio);
if (!(gfp_mask & __GFP_IO))
return;
--
2.35.3
Hi
On 12.01.2023 09:30, Kefeng Wang wrote:
> The old_page/new_page are converted to old_folio/new_folio in
> wp_page_copy(), and the related page functions are replaced with
> their folio counterparts.
>
> Signed-off-by: Kefeng Wang <[email protected]>
This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
memory: convert wp_page_copy() to use folios"), causes serious stability
issues on my ARM-based test boards. Here is an example of such a crash:
VFS: Mounted root (ext4 filesystem) readonly on device 179:6.
devtmpfs: mounted
Freeing unused kernel image (initmem) memory: 1024K
Run /sbin/init as init process
8<--- cut here ---
Unable to handle kernel NULL pointer dereference at virtual address
00000004 when read
[00000004] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 PID: 1 Comm: init Not tainted 6.2.0-rc3-00294-g9ebae00c8e30 #13254
Hardware name: Samsung Exynos (Flattened Device Tree)
PC is at do_wp_page+0x21c/0xd48
LR is at do_wp_page+0x1f8/0xd48
pc : [<c02aa518>] lr : [<c02aa4f4>] psr: 60000113
sp : f082de58 ip : 0006fff8 fp : 0000065f
r10: 00000000 r9 : 00000c73 r8 : c2b87318
r7 : c1d78000 r6 : b6ed9000 r5 : 00000000 r4 : f082dedc
r3 : c25d0000 r2 : 00000001 r1 : c0ee9568 r0 : 00000000
Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
Control: 10c5387d Table: 4363804a DAC: 00000051
Register r0 information: NULL pointer
Register r1 information: non-slab/vmalloc memory
Register r2 information: non-paged memory
Register r3 information: slab mm_struct start c25d0000 pointer offset 0
size 908
Register r4 information: 2-page vmalloc region starting at 0xf082c000
allocated at kernel_clone+0x54/0x3e4
Register r5 information: NULL pointer
Register r6 information: non-paged memory
Register r7 information: slab task_struct start c1d78000 pointer offset
0 size 4032
Register r8 information: slab vm_area_struct start c2b87318 pointer
offset 0 size 68
Register r9 information: non-paged memory
Register r10 information: NULL pointer
Register r11 information: non-paged memory
Register r12 information: non-paged memory
Process init (pid: 1, stack limit = 0x(ptrval))
Stack: (0xf082de58 to 0xf082e000)
...
do_wp_page from handle_mm_fault+0x938/0xda8
handle_mm_fault from do_page_fault+0x154/0x408
do_page_fault from do_DataAbort+0x3c/0xb0
do_DataAbort from __dabt_usr+0x58/0x60
Exception stack(0xf082dfb0 to 0xf082dff8)
dfa0: 00000000 00000001 b6ed9060
00000000
dfc0: 00000000 b6fea968 b6ed9060 00000000 b6cd4888 00000000 00000000
00000000
dfe0: b6eda28c be8a9dd0 b6e6ff08 ffff0fcc 60000010 ffffffff
Code: e594a028 e58d301c e5983008 e58d3014 (e59a3004)
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
CPU1: stopping
CPU: 1 PID: 0 Comm: swapper/1 Tainted: G D
6.2.0-rc3-00294-g9ebae00c8e30 #13254
Hardware name: Samsung Exynos (Flattened Device Tree)
unwind_backtrace from show_stack+0x10/0x14
show_stack from dump_stack_lvl+0x58/0x70
dump_stack_lvl from do_handle_IPI+0x348/0x374
do_handle_IPI from ipi_handler+0x18/0x20
ipi_handler from handle_percpu_devid_irq+0x9c/0x170
handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34
generic_handle_domain_irq from gic_handle_irq+0x88/0xa8
gic_handle_irq from generic_handle_arch_irq+0x58/0x78
generic_handle_arch_irq from call_with_stack+0x18/0x20
call_with_stack from __irq_svc+0x9c/0xd0
Exception stack(0xf08a9ee0 to 0xf08a9f28)
...
__irq_svc from cpuidle_enter_state+0x318/0x3d0
cpuidle_enter_state from cpuidle_enter_state_coupled+0x160/0x400
cpuidle_enter_state_coupled from cpuidle_enter+0x3c/0x54
cpuidle_enter from do_idle+0x1f0/0x2b0
do_idle from cpu_startup_entry+0x18/0x1c
cpu_startup_entry from secondary_start_kernel+0x1a0/0x230
secondary_start_kernel from 0x40101a00
---[ end Kernel panic - not syncing: Attempted to kill init!
exitcode=0x0000000b ]---
Reverting it on top of linux-next 20230113 together with aaf3f7afbf10
("mm: swap: remove unneeded cgroup_throttle_swaprate()") fixes the
stability issues.
> ---
> mm/memory.c | 47 +++++++++++++++++++++++++----------------------
> 1 file changed, 25 insertions(+), 22 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index b66c425b4d7c..746f485366e8 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3044,7 +3044,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> struct vm_area_struct *vma = vmf->vma;
> struct mm_struct *mm = vma->vm_mm;
> struct page *old_page = vmf->page;
> + struct folio *old_folio = page_folio(old_page);
> struct page *new_page = NULL;
> + struct folio *new_folio = NULL;
> pte_t entry;
> int page_copied = 0;
> struct mmu_notifier_range range;
> @@ -3060,12 +3062,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> vmf->address);
> if (!new_page)
> goto oom;
> + new_folio = page_folio(new_page);
> } else {
> - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
> - vmf->address);
> - if (!new_page)
> + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
> + vmf->address, false);
> + if (!new_folio)
> goto oom;
> -
> + new_page = &new_folio->page;
> ret = __wp_page_copy_user(new_page, old_page, vmf);
> if (ret) {
> /*
> @@ -3075,9 +3078,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> * from the second attempt.
> * The -EHWPOISON case will not be retried.
> */
> - put_page(new_page);
> - if (old_page)
> - put_page(old_page);
> + folio_put(new_folio);
> + if (old_folio)
> + folio_put(old_folio);
>
> delayacct_wpcopy_end();
> return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
> @@ -3085,11 +3088,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> kmsan_copy_page_meta(new_page, old_page);
> }
>
> - if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
> + if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
> goto oom_free_new;
> - cgroup_throttle_swaprate(new_page, GFP_KERNEL);
> + folio_throttle_swaprate(new_folio, GFP_KERNEL);
>
> - __SetPageUptodate(new_page);
> + __folio_mark_uptodate(new_folio);
>
> mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> vmf->address & PAGE_MASK,
> @@ -3101,8 +3104,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> */
> vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
> if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> - if (old_page) {
> - if (!PageAnon(old_page)) {
> + if (old_folio) {
> + if (!folio_test_anon(old_folio)) {
> dec_mm_counter(mm, mm_counter_file(old_page));
> inc_mm_counter(mm, MM_ANONPAGES);
> }
> @@ -3130,7 +3133,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> */
> ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
> page_add_new_anon_rmap(new_page, vma, vmf->address);
> - lru_cache_add_inactive_or_unevictable(new_page, vma);
> + folio_add_lru_vma(new_folio, vma);
> /*
> * We call the notify macro here because, when using secondary
> * mmu page tables (such as kvm shadow page tables), we want the
> @@ -3139,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> BUG_ON(unshare && pte_write(entry));
> set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
> update_mmu_cache(vma, vmf->address, vmf->pte);
> - if (old_page) {
> + if (old_folio) {
> /*
> * Only after switching the pte to the new page may
> * we remove the mapcount here. Otherwise another
> @@ -3166,14 +3169,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> }
>
> /* Free the old page.. */
> - new_page = old_page;
> + new_folio = old_folio;
> page_copied = 1;
> } else {
> update_mmu_tlb(vma, vmf->address, vmf->pte);
> }
>
> - if (new_page)
> - put_page(new_page);
> + if (new_folio)
> + folio_put(new_folio);
>
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> /*
> @@ -3181,19 +3184,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> * the above ptep_clear_flush_notify() did already call it.
> */
> mmu_notifier_invalidate_range_only_end(&range);
> - if (old_page) {
> + if (old_folio) {
> if (page_copied)
> free_swap_cache(old_page);
> - put_page(old_page);
> + folio_put(old_folio);
> }
>
> delayacct_wpcopy_end();
> return 0;
> oom_free_new:
> - put_page(new_page);
> + folio_put(new_folio);
> oom:
> - if (old_page)
> - put_page(old_page);
> + if (old_folio)
> + folio_put(old_folio);
>
> delayacct_wpcopy_end();
> return VM_FAULT_OOM;
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
On 13.01.23 14:01, Marek Szyprowski wrote:
> Hi
>
> On 12.01.2023 09:30, Kefeng Wang wrote:
>> The old_page/new_page are converted to old_folio/new_folio in
>> wp_page_copy(), and the related page functions are replaced with
>> their folio counterparts.
>>
>> Signed-off-by: Kefeng Wang <[email protected]>
>
> This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> memory: convert wp_page_copy() to use folios"), causes serious stability
> issues on my ARM-based test boards. Here is an example of such a crash:
syzbot is also not happy:
https://lkml.kernel.org/r/[email protected]
--
Thanks,
David / dhildenb
On Thu, Jan 12, 2023 at 04:30:00PM +0800, Kefeng Wang wrote:
> Make __do_huge_pmd_anonymous_page() take a folio and convert the
> related functions to use folios.
No, this is actively wrong! Andrew, please drop this patch.
If we want to support folio sizes larger than PMD size (and I think we
do), we need to be able to specify precisely which page in the folio
is to be stored at this PTE. The *interface* must remain struct page.
We can convert from page to folio within the function, but we *MUST NOT*
go the other way.
> static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> - struct page *page, gfp_t gfp)
> + struct folio *folio, gfp_t gfp)
> {
> struct vm_area_struct *vma = vmf->vma;
> + struct page *page = &folio->page;
... ie this is bad and wrong.
> @@ -834,7 +835,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> count_vm_event(THP_FAULT_FALLBACK);
> return VM_FAULT_FALLBACK;
> }
> - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> + return __do_huge_pmd_anonymous_page(vmf, folio, gfp);
> }
>
> static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
A reasonable person might ask "But Matthew, you allocated a folio here,
then you're converting back to a struct page to call
__do_huge_pmd_anonymous_page() so isn't this a sensible patch?"
And I would say "still no". This is a question of interfaces, and
even though __do_huge_pmd_anonymous_page() is static and has precisely
one caller today that always allocates a folio of precisely PMD size,
I suspect it will need to be more visible in the future and the
conversion of the interface from page to folio misleads people.
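For reference, the shape being asked for here is roughly this (a sketch
only, reusing the names from the patch):

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
		struct page *page, gfp_t gfp)
{
	/* page -> folio is always safe to derive internally */
	struct folio *folio = page_folio(page);

	/* ... body as in the patch, operating on 'folio' ... */
	return 0;
}

i.e. keep struct page in the signature and look up the folio inside,
rather than passing a folio and assuming &folio->page is the right page.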
On Thu, Jan 12, 2023 at 04:30:01PM +0800, Kefeng Wang wrote:
> Convert do_anonymous_page() to use a folio and replace the related
> page functions with their folio counterparts.
I think this patch has a prerequisite of sorting out
alloc_zeroed_user_highpage_movable(). That way we can get rid of
the 'page' variable inside this function altogether.
> inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> page_add_new_anon_rmap(page, vma, vmf->address);
folio_add_new_anon_rmap().
> - lru_cache_add_inactive_or_unevictable(page, vma);
> + folio_add_lru_vma(folio, vma);
On Thu, Jan 12, 2023 at 04:30:06PM +0800, Kefeng Wang wrote:
> All the callers of cgroup_throttle_swaprate() are converted to
> folio_throttle_swaprate(), so make __cgroup_throttle_swaprate()
> take a folio and drop the now-unused cgroup_throttle_swaprate().
Shouldn't __cgroup_throttle_swaprate() then be called
__folio_throttle_swaprate()?
> +extern void __cgroup_throttle_swaprate(struct folio *folio, gfp_t gfp_mask);
Also you can drop the 'extern'.
> +static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp_mask)
And it's not a gfp mask. It's gfp_flags (we have this mistake all
through the mm). Or you can just call it 'gfp'.
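Taken together, the three suggestions would leave the header looking
something like this (a sketch, not the final respin):

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
	if (mem_cgroup_disabled())
		return;
	__folio_throttle_swaprate(folio, gfp);
}
#else
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
}
#endif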
On Thu, Jan 12, 2023 at 04:30:02PM +0800, Kefeng Wang wrote:
> - vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
> - if (!vmf->cow_page)
> + cow_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
> + false);
> + if (!cow_folio)
I have a patch I've been sitting on that converts vmf->cow_page to be
a folio. I think this series is well and truly wrecked at this point,
so let me go back and dig it out; see if it still makes sense.
I'm a bit unsure about it because maybe we want to allocate
high(ish)-order folios on COW fault, and if we do, then maybe we want
to align them in some way with the virtual addresses, or the other
folios in the VMA. And then we might want to indicate the precise
page for this page fault rather than have this page fault be the
start of a multi-order folio.
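(Purely hypothetical sketch of that last point: if a COW fault were ever
served from a higher-order folio aligned in virtual address space, the
precise page for the faulting address could be picked out with
folio_page(); 'haddr' as the folio-aligned base address is an
assumption, not existing code:)

static struct page *cow_fault_page(struct folio *folio,
				   unsigned long haddr, unsigned long addr)
{
	/* hypothetical: index of the faulting page within the folio */
	return folio_page(folio, (addr - haddr) >> PAGE_SHIFT);
}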
On Fri, Jan 13, 2023 at 02:01:36PM +0100, Marek Szyprowski wrote:
> Hi
>
> On 12.01.2023 09:30, Kefeng Wang wrote:
> > The old_page/new_page are converted to old_folio/new_folio in
> > wp_page_copy(), and the related page functions are replaced with
> > their folio counterparts.
> >
> > Signed-off-by: Kefeng Wang <[email protected]>
>
> This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> memory: convert wp_page_copy() to use folios"), causes serious stability
> issues on my ARM-based test boards.
I'm seeing something very similar[1] and it bisected down to this patch.
FWIW it reproduces for me using an ARCH=arm64 defconfig kernel and
with the following QEMU invocation:
qemu-system-aarch64 \
-M virt -cpu cortex-a57 -nographic \
-kernel arch/arm64/boot/Image -initrd rootfs.cpio.gz
The rootfs I used for reproduction is here:
https://fileserver.linaro.org/s/AKcrKWB2Jtyzo6g
Daniel.
[1]:
[ 1.740416] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
[ 1.740898] Mem abort info:
[ 1.741067] ESR = 0x0000000096000006
[ 1.741291] EC = 0x25: DABT (current EL), IL = 32 bits
[ 1.741557] SET = 0, FnV = 0
[ 1.741734] EA = 0, S1PTW = 0
[ 1.741910] FSC = 0x06: level 2 translation fault
[ 1.742161] Data abort info:
[ 1.742328] ISV = 0, ISS = 0x00000006
[ 1.742533] CM = 0, WnR = 0
[ 1.742729] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000447ff000
[ 1.743041] [0000000000000008] pgd=0800000044819003, p4d=0800000044819003, pud=080000004481a003, pmd=0000000000000000
[ 1.743819] Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP
[ 1.744118] Modules linked in:
[ 1.744353] CPU: 0 PID: 44 Comm: modprobe Not tainted 6.2.0-rc3-00294-g9ebae00c8e30 #244
[ 1.744617] Hardware name: linux,dummy-virt (DT)
[ 1.744848] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 1.745045] pc : do_wp_page+0x2c4/0xd40
[ 1.745218] lr : do_wp_page+0x2bc/0xd40
[ 1.745318] sp : ffff80000822bc10
[ 1.745415] x29: ffff80000822bc10 x28: ffff60710459b100 x27: 0000000000000002
[ 1.745633] x26: ffff80000822bd28 x25: ffff607103b18000 x24: 0000000200100073
[ 1.745815] x23: ffff607104811ff0 x22: ffffc6250d418000 x21: 0000000000000000
[ 1.745986] x20: ffff607104810360 x19: ffff607104810360 x18: 0000000000000001
[ 1.746171] x17: ffff9a4bfa8fa000 x16: ffff800008004000 x15: 0000000000000000
[ 1.746363] x14: 000000000000a8a9 x13: 0000000000000004 x12: 0000000000000026
[ 1.746548] x11: 0000000000000001 x10: 0000000000000000 x9 : ffffc6250e9e7000
[ 1.746756] x8 : ffff60710459b100 x7 : 00000000412a6223 x6 : 0000000000000000
[ 1.746928] x5 : 0000000000042cc5 x4 : 0000ffff9acb8000 x3 : ffff80000822bbe4
[ 1.747095] x2 : ffff9a4bfa8fa000 x1 : ffff60710459b100 x0 : 0000000100000000
[ 1.747341] Call trace:
[ 1.747431] do_wp_page+0x2c4/0xd40
[ 1.747547] __handle_mm_fault+0x704/0x1124
[ 1.747649] handle_mm_fault+0xe4/0x25c
[ 1.747736] do_page_fault+0x1e8/0x4c0
[ 1.747834] do_mem_abort+0x44/0x9c
[ 1.747934] el0_da+0x60/0xd4
[ 1.748020] el0t_64_sync_handler+0xac/0x120
[ 1.748134] el0t_64_sync+0x190/0x194
[ 1.748388] Code: f9403340 943cc31e f9402b55 f9400354 (f94006b6)
[ 1.748767] ---[ end trace 0000000000000000 ]---
>
> VFS: Mounted root (ext4 filesystem) readonly on device 179:6.
> devtmpfs: mounted
> Freeing unused kernel image (initmem) memory: 1024K
> Run /sbin/init as init process
> 8<--- cut here ---
> Unable to handle kernel NULL pointer dereference at virtual address
> 00000004 when read
> [00000004] *pgd=00000000
> Internal error: Oops: 5 [#1] PREEMPT SMP ARM
> Modules linked in:
> CPU: 0 PID: 1 Comm: init Not tainted 6.2.0-rc3-00294-g9ebae00c8e30 #13254
> Hardware name: Samsung Exynos (Flattened Device Tree)
> PC is at do_wp_page+0x21c/0xd48
> LR is at do_wp_page+0x1f8/0xd48
> pc : [<c02aa518>] lr : [<c02aa4f4>] psr: 60000113
> sp : f082de58 ip : 0006fff8 fp : 0000065f
> r10: 00000000 r9 : 00000c73 r8 : c2b87318
> r7 : c1d78000 r6 : b6ed9000 r5 : 00000000 r4 : f082dedc
> r3 : c25d0000 r2 : 00000001 r1 : c0ee9568 r0 : 00000000
> Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
> Control: 10c5387d Table: 4363804a DAC: 00000051
> Register r0 information: NULL pointer
> Register r1 information: non-slab/vmalloc memory
> Register r2 information: non-paged memory
> Register r3 information: slab mm_struct start c25d0000 pointer offset 0
> size 908
> Register r4 information: 2-page vmalloc region starting at 0xf082c000
> allocated at kernel_clone+0x54/0x3e4
> Register r5 information: NULL pointer
> Register r6 information: non-paged memory
> Register r7 information: slab task_struct start c1d78000 pointer offset
> 0 size 4032
> Register r8 information: slab vm_area_struct start c2b87318 pointer
> offset 0 size 68
> Register r9 information: non-paged memory
> Register r10 information: NULL pointer
> Register r11 information: non-paged memory
> Register r12 information: non-paged memory
> Process init (pid: 1, stack limit = 0x(ptrval))
> Stack: (0xf082de58 to 0xf082e000)
> ...
> do_wp_page from handle_mm_fault+0x938/0xda8
> handle_mm_fault from do_page_fault+0x154/0x408
> do_page_fault from do_DataAbort+0x3c/0xb0
> do_DataAbort from __dabt_usr+0x58/0x60
> Exception stack(0xf082dfb0 to 0xf082dff8)
> dfa0: 00000000 00000001 b6ed9060
> 00000000
> dfc0: 00000000 b6fea968 b6ed9060 00000000 b6cd4888 00000000 00000000
> 00000000
> dfe0: b6eda28c be8a9dd0 b6e6ff08 ffff0fcc 60000010 ffffffff
> Code: e594a028 e58d301c e5983008 e58d3014 (e59a3004)
> ---[ end trace 0000000000000000 ]---
> Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
> CPU1: stopping
> CPU: 1 PID: 0 Comm: swapper/1 Tainted: G D
> 6.2.0-rc3-00294-g9ebae00c8e30 #13254
> Hardware name: Samsung Exynos (Flattened Device Tree)
> unwind_backtrace from show_stack+0x10/0x14
> show_stack from dump_stack_lvl+0x58/0x70
> dump_stack_lvl from do_handle_IPI+0x348/0x374
> do_handle_IPI from ipi_handler+0x18/0x20
> ipi_handler from handle_percpu_devid_irq+0x9c/0x170
> handle_percpu_devid_irq from generic_handle_domain_irq+0x24/0x34
> generic_handle_domain_irq from gic_handle_irq+0x88/0xa8
> gic_handle_irq from generic_handle_arch_irq+0x58/0x78
> generic_handle_arch_irq from call_with_stack+0x18/0x20
> call_with_stack from __irq_svc+0x9c/0xd0
> Exception stack(0xf08a9ee0 to 0xf08a9f28)
> ...
> __irq_svc from cpuidle_enter_state+0x318/0x3d0
> cpuidle_enter_state from cpuidle_enter_state_coupled+0x160/0x400
> cpuidle_enter_state_coupled from cpuidle_enter+0x3c/0x54
> cpuidle_enter from do_idle+0x1f0/0x2b0
> do_idle from cpu_startup_entry+0x18/0x1c
> cpu_startup_entry from secondary_start_kernel+0x1a0/0x230
> secondary_start_kernel from 0x40101a00
> ---[ end Kernel panic - not syncing: Attempted to kill init!
> exitcode=0x0000000b ]---
>
> Reverting it on top of linux-next 20230113 together with aaf3f7afbf10
> ("mm: swap: remove unneeded cgroup_throttle_swaprate()") fixes the
> stability issues.
>
>
> > ---
> > mm/memory.c | 47 +++++++++++++++++++++++++----------------------
> > 1 file changed, 25 insertions(+), 22 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index b66c425b4d7c..746f485366e8 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3044,7 +3044,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > struct vm_area_struct *vma = vmf->vma;
> > struct mm_struct *mm = vma->vm_mm;
> > struct page *old_page = vmf->page;
> > + struct folio *old_folio = page_folio(old_page);
> > struct page *new_page = NULL;
> > + struct folio *new_folio = NULL;
> > pte_t entry;
> > int page_copied = 0;
> > struct mmu_notifier_range range;
> > @@ -3060,12 +3062,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > vmf->address);
> > if (!new_page)
> > goto oom;
> > + new_folio = page_folio(new_page);
> > } else {
> > - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
> > - vmf->address);
> > - if (!new_page)
> > + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
> > + vmf->address, false);
> > + if (!new_folio)
> > goto oom;
> > -
> > + new_page = &new_folio->page;
> > ret = __wp_page_copy_user(new_page, old_page, vmf);
> > if (ret) {
> > /*
> > @@ -3075,9 +3078,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > * from the second attempt.
> > * The -EHWPOISON case will not be retried.
> > */
> > - put_page(new_page);
> > - if (old_page)
> > - put_page(old_page);
> > + folio_put(new_folio);
> > + if (old_folio)
> > + folio_put(old_folio);
> >
> > delayacct_wpcopy_end();
> > return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
> > @@ -3085,11 +3088,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > kmsan_copy_page_meta(new_page, old_page);
> > }
> >
> > - if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
> > + if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL))
> > goto oom_free_new;
> > - cgroup_throttle_swaprate(new_page, GFP_KERNEL);
> > + folio_throttle_swaprate(new_folio, GFP_KERNEL);
> >
> > - __SetPageUptodate(new_page);
> > + __folio_mark_uptodate(new_folio);
> >
> > mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> > vmf->address & PAGE_MASK,
> > @@ -3101,8 +3104,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > */
> > vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
> > if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> > - if (old_page) {
> > - if (!PageAnon(old_page)) {
> > + if (old_folio) {
> > + if (!folio_test_anon(old_folio)) {
> > dec_mm_counter(mm, mm_counter_file(old_page));
> > inc_mm_counter(mm, MM_ANONPAGES);
> > }
> > @@ -3130,7 +3133,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > */
> > ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
> > page_add_new_anon_rmap(new_page, vma, vmf->address);
> > - lru_cache_add_inactive_or_unevictable(new_page, vma);
> > + folio_add_lru_vma(new_folio, vma);
> > /*
> > * We call the notify macro here because, when using secondary
> > * mmu page tables (such as kvm shadow page tables), we want the
> > @@ -3139,7 +3142,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > BUG_ON(unshare && pte_write(entry));
> > set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
> > update_mmu_cache(vma, vmf->address, vmf->pte);
> > - if (old_page) {
> > + if (old_folio) {
> > /*
> > * Only after switching the pte to the new page may
> > * we remove the mapcount here. Otherwise another
> > @@ -3166,14 +3169,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > }
> >
> > /* Free the old page.. */
> > - new_page = old_page;
> > + new_folio = old_folio;
> > page_copied = 1;
> > } else {
> > update_mmu_tlb(vma, vmf->address, vmf->pte);
> > }
> >
> > - if (new_page)
> > - put_page(new_page);
> > + if (new_folio)
> > + folio_put(new_folio);
> >
> > pte_unmap_unlock(vmf->pte, vmf->ptl);
> > /*
> > @@ -3181,19 +3184,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> > * the above ptep_clear_flush_notify() did already call it.
> > */
> > mmu_notifier_invalidate_range_only_end(&range);
> > - if (old_page) {
> > + if (old_folio) {
> > if (page_copied)
> > free_swap_cache(old_page);
> > - put_page(old_page);
> > + folio_put(old_folio);
> > }
> >
> > delayacct_wpcopy_end();
> > return 0;
> > oom_free_new:
> > - put_page(new_page);
> > + folio_put(new_folio);
> > oom:
> > - if (old_page)
> > - put_page(old_page);
> > + if (old_folio)
> > + folio_put(old_folio);
> >
> > delayacct_wpcopy_end();
> > return VM_FAULT_OOM;
>
> Best regards
> --
> Marek Szyprowski, PhD
> Samsung R&D Institute Poland
>
On Fri, Jan 13, 2023 at 02:08:36PM +0100, David Hildenbrand wrote:
> On 13.01.23 14:01, Marek Szyprowski wrote:
> > Hi
> >
> > On 12.01.2023 09:30, Kefeng Wang wrote:
> > > The old_page/new_page are converted to old_folio/new_folio in
> > > wp_page_copy(), and the related page functions are replaced with
> > > their folio counterparts.
> > >
> > > Signed-off-by: Kefeng Wang <[email protected]>
> >
> > This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> > memory: convert wp_page_copy() to use folios"), causes serious stability
> > issues on my ARM-based test boards. Here is an example of such a crash:
>
> syzbot is also not happy:
>
> https://lkml.kernel.org/r/[email protected]
>
> --
> Thanks,
>
> David / dhildenb
>
This also completely broke my qemu environment.
In that thread Willy points out that the issue stems from blindly assigning
page_folio(old_page) to old_folio without checking whether it is NULL first,
therefore triggering a NULL pointer deref.
A quick fix would be to put in a check (as shown below) which fixes the issue,
but as Willy said, I think we should drop this until it can be fixed in a
respin.
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3044,7 +3044,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *old_page = vmf->page;
- struct folio *old_folio = page_folio(old_page);
+ struct folio *old_folio = old_page ? page_folio(old_page) : NULL;
Hello,
On Fri, 13 Jan 2023 19:04:14 +0000 Lorenzo Stoakes <[email protected]> wrote:
> On Fri, Jan 13, 2023 at 02:08:36PM +0100, David Hildenbrand wrote:
> > On 13.01.23 14:01, Marek Szyprowski wrote:
> > > Hi
> > >
> > > On 12.01.2023 09:30, Kefeng Wang wrote:
> > > > The old_page/new_page are converted to old_folio/new_folio in
> > > > wp_page_copy(), and the related page functions are replaced with
> > > > their folio counterparts.
> > > >
> > > > Signed-off-by: Kefeng Wang <[email protected]>
> > >
> > > This patch, merged into today's linux-next as commit 9ebae00c8e30 ("mm:
> > > memory: convert wp_page_copy() to use folios"), causes serious stability
> > > issues on my ARM-based test boards. Here is an example of such a crash:
> >
> > syzbot is also not happy:
> >
> > https://lkml.kernel.org/r/[email protected]
> >
> > --
> > Thanks,
> >
> > David / dhildenb
> >
>
> This also completely broke my qemu environment.
Same here.
>
> In that thread Willy points out that the issue stems from blindly assigning
> page_folio(old_page) to old_folio without checking whether it is NULL first,
> therefore triggering a NULL pointer deref.
>
> A quick fix would be to put in a check (as shown below) which fixes the issue,
> but as Willy said, I think we should drop this until it can be fixed in a
> respin.
>
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3044,7 +3044,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
> struct vm_area_struct *vma = vmf->vma;
> struct mm_struct *mm = vma->vm_mm;
> struct page *old_page = vmf->page;
> - struct folio *old_folio = page_folio(old_page);
> + struct folio *old_folio = old_page ? page_folio(old_page) : NULL;
Tested-by: SeongJae Park <[email protected]>
Thanks,
SJ
Greeting,
FYI, we noticed BUG:kernel_NULL_pointer_dereference,address due to commit (built with gcc-11):
commit: 94dd2d69bf084166a5537f957dac6a3b79fa238f ("[PATCH -next 5/7] mm: memory: convert wp_page_copy() to use folios")
url: https://github.com/intel-lab-lkp/linux/commits/Kefeng-Wang/mm-huge_memory-make-__do_huge_pmd_anonymous_page-to-take-a-folio/20230112-161810
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH -next 5/7] mm: memory: convert wp_page_copy() to use folios
in testcase: boot
on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
[ 6.211602][ T64] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 6.213035][ T64] #PF: supervisor read access in kernel mode
[ 6.214169][ T64] #PF: error_code(0x0000) - not-present page
[ 6.215275][ T64] PGD 80000001202fc067 P4D 80000001202fc067 PUD 1202f9067 PMD 0
[ 6.216694][ T64] Oops: 0000 [#1] SMP PTI
[ 6.217525][ T64] CPU: 1 PID: 64 Comm: modprobe Not tainted 6.2.0-rc3-00317-g94dd2d69bf08 #1
[ 6.219042][ T64] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[ 6.220947][ T64] RIP: 0010:_compound_head (include/linux/page-flags.h:251)
[ 6.221957][ T64] Code: 66 2e 0f 1f 84 00 00 00 00 00 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 <48> 8b 47 08 a8 01 75 24 66 90 48 89 f8 c3 cc cc cc cc f7 c7 ff 0f
All code
========
0: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
7: 00 00 00
a: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
11: 00 00 00 00
15: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
1a: 90 nop
1b: 90 nop
1c: 90 nop
1d: 90 nop
1e: 90 nop
1f: 90 nop
20: 90 nop
21: 90 nop
22: 90 nop
23: 90 nop
24: 90 nop
25: 90 nop
26: 90 nop
27: 90 nop
28: 90 nop
29: 90 nop
2a:* 48 8b 47 08 mov 0x8(%rdi),%rax <-- trapping instruction
2e: a8 01 test $0x1,%al
30: 75 24 jne 0x56
32: 66 90 xchg %ax,%ax
34: 48 89 f8 mov %rdi,%rax
37: c3 retq
38: cc int3
39: cc int3
3a: cc int3
3b: cc int3
3c: f7 .byte 0xf7
3d: c7 (bad)
3e: ff 0f decl (%rdi)
Code starting with the faulting instruction
===========================================
0: 48 8b 47 08 mov 0x8(%rdi),%rax
4: a8 01 test $0x1,%al
6: 75 24 jne 0x2c
8: 66 90 xchg %ax,%ax
a: 48 89 f8 mov %rdi,%rax
d: c3 retq
e: cc int3
f: cc int3
10: cc int3
11: cc int3
12: f7 .byte 0xf7
13: c7 (bad)
14: ff 0f decl (%rdi)
[ 6.225382][ T64] RSP: 0000:ffffc900004c3d38 EFLAGS: 00010282
[ 6.226529][ T64] RAX: 0000000000000a55 RBX: ffffc900004c3df8 RCX: 0000000000003663
[ 6.230756][ T64] RDX: 0000000000000000 RSI: 00000000f7f80000 RDI: 0000000000000000
[ 6.232224][ T64] RBP: ffff8881202f6240 R08: 8000000003663225 R09: ffff888120201000
[ 6.233742][ T64] R10: 0000000000000000 R11: 00000000f7f80684 R12: 00000000f7f80684
[ 6.235231][ T64] R13: 0000000000000df8 R14: 0000000000000000 R15: ffff88812024f440
[ 6.236759][ T64] FS: 0000000000000000(0000) GS:ffff88842fd00000(0063) knlGS:00000000f7ddb900
[ 6.238454][ T64] CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
[ 6.239697][ T64] CR2: 0000000000000008 CR3: 00000001009e2000 CR4: 00000000000406e0
[ 6.241215][ T64] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6.242709][ T64] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6.244212][ T64] Call Trace:
[ 6.244933][ T64] <TASK>
[ 6.245575][ T64] wp_page_copy (mm/memory.c:3047)
[ 6.246461][ T64] ? do_anonymous_page (arch/x86/include/asm/preempt.h:85 include/linux/spinlock_api_smp.h:143 include/linux/spinlock.h:390 mm/memory.c:4106)
[ 6.247440][ T64] __handle_mm_fault (mm/memory.c:5061)
[ 6.248338][ T64] handle_mm_fault (mm/memory.c:5207)
[ 6.249227][ T64] do_user_addr_fault (arch/x86/mm/fault.c:1407)
[ 6.250166][ T64] ? do_set_thread_area (arch/x86/kernel/tls.c:152)
[ 6.251133][ T64] exc_page_fault (arch/x86/include/asm/irqflags.h:40 arch/x86/include/asm/irqflags.h:75 arch/x86/mm/fault.c:1506 arch/x86/mm/fault.c:1554)
[ 6.252009][ T64] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570)
[ 6.252931][ T64] RIP: 0023:0xf7df587c
[ 6.253753][ T64] Code: 1c 31 c0 89 5c 24 0c e8 45 d4 10 00 81 c3 96 77 18 00 89 74 24 10 87 de 0f a2 87 de 81 f9 6e 74 65 6c 89 7c 24 14 89 6c 24 18 <89> 83 90 36 00 00 75 14 81 fe 47 65 6e 75 75 0c 81 fa 69 6e 65 49
All code
========
0: 1c 31 sbb $0x31,%al
2: c0 89 5c 24 0c e8 45 rorb $0x45,-0x17f3dba4(%rcx)
9: d4 (bad)
a: 10 00 adc %al,(%rax)
c: 81 c3 96 77 18 00 add $0x187796,%ebx
12: 89 74 24 10 mov %esi,0x10(%rsp)
16: 87 de xchg %ebx,%esi
18: 0f a2 cpuid
1a: 87 de xchg %ebx,%esi
1c: 81 f9 6e 74 65 6c cmp $0x6c65746e,%ecx
22: 89 7c 24 14 mov %edi,0x14(%rsp)
26: 89 6c 24 18 mov %ebp,0x18(%rsp)
2a:* 89 83 90 36 00 00 mov %eax,0x3690(%rbx) <-- trapping instruction
30: 75 14 jne 0x46
32: 81 fe 47 65 6e 75 cmp $0x756e6547,%esi
38: 75 0c jne 0x46
3a: 81 fa 69 6e 65 49 cmp $0x49656e69,%edx
Code starting with the faulting instruction
===========================================
0: 89 83 90 36 00 00 mov %eax,0x3690(%rbx)
6: 75 14 jne 0x1c
8: 81 fe 47 65 6e 75 cmp $0x756e6547,%esi
e: 75 0c jne 0x1c
10: 81 fa 69 6e 65 49 cmp $0x49656e69,%edx
If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-lkp/[email protected]
To reproduce:
# build kernel
cd linux
cp config-6.2.0-rc3-00317-g94dd2d69bf08 .config
make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
make HOSTCC=gcc-11 CC=gcc-11 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
cd <mod-install-dir>
find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
On 2023/1/13 22:25, Matthew Wilcox wrote:
> On Thu, Jan 12, 2023 at 04:30:00PM +0800, Kefeng Wang wrote:
>> Make __do_huge_pmd_anonymous_page() take a folio and convert the
>> related functions to use folios.
>
> No, this is actively wrong! Andrew, please drop this patch.
>
> If we want to support folio sizes larger than PMD size (and I think we
> do), we need to be able to specify precisely which page in the folio
> is to be stored at this PTE. The *interface* must remain struct page.
> We can convert from page to folio within the function, but we *MUST NOT*
> go the other way.
Got it,
>
>> static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>> - struct page *page, gfp_t gfp)
>> + struct folio *folio, gfp_t gfp)
>> {
>> struct vm_area_struct *vma = vmf->vma;
>> + struct page *page = &folio->page;
>
> ... ie this is bad and wrong.
>
>> @@ -834,7 +835,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> count_vm_event(THP_FAULT_FALLBACK);
>> return VM_FAULT_FALLBACK;
>> }
>> - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>> + return __do_huge_pmd_anonymous_page(vmf, folio, gfp);
>> }
>>
>> static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
>
> A reasonable person might ask "But Matthew, you allocated a folio here,
> then you're converting back to a struct page to call
> __do_huge_pmd_anonymous_page() so isn't this a sensible patch?"
and this is why I changed the parameter from page to folio (no need to
go back and forth between page and folio),
>
> And I would say "still no". This is a question of interfaces, and
> even though __do_huge_pmd_anonymous_page() is static and has precisely
> one caller today that always allocates a folio of precisely PMD size,
> I suspect it will need to be more visible in the future and the
> conversion of the interface from page to folio misleads people.
ok, will keep page for __do_huge_pmd_anonymous_page().
>
Hi Matthew,
On 2023/1/13 23:50, Matthew Wilcox wrote:
> On Thu, Jan 12, 2023 at 04:30:06PM +0800, Kefeng Wang wrote:
>> All the callers of cgroup_throttle_swaprate() are converted to
>> folio_throttle_swaprate(), so make __cgroup_throttle_swaprate()
>> take a folio and drop the now-unused cgroup_throttle_swaprate().
>
> Shouldn't __cgroup_throttle_swaprate() then be called
> __folio_throttle_swaprate()?
Sure.
>
>> +extern void __cgroup_throttle_swaprate(struct folio *folio, gfp_t gfp_mask);
>
> Also you can drop the 'extern'.
Ok.
>
>> +static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp_mask)
>
> And it's not a gfp mask. It's gfp_flags (we have this mistake all
> through the mm). Or you can just call it 'gfp'.
>
>
Thanks for your kindly review and advise, will update.
On 2023/1/13 23:33, Matthew Wilcox wrote:
> On Thu, Jan 12, 2023 at 04:30:01PM +0800, Kefeng Wang wrote:
>> Convert do_anonymous_page() to use a folio and replace the related
>> page functions with their folio counterparts.
>
> I think this patch has a prerequisite of sorting out
> alloc_zeroed_user_highpage_movable(). That way we can get rid of
> the 'page' variable inside this function altogether.
How about providing a wrapper like
folio_alloc_zeroed_user_highmem_movable()? The name is a little bit
long, though:
static inline struct folio *
folio_alloc_zeroed_user_highmem_movable(struct vm_area_struct *vma,
unsigned long vaddr)
{
struct folio *folio = NULL;
struct page *page;
page = alloc_zeroed_user_highpage_movable(vma, vaddr);
if (page)
folio = page_folio(page);
return folio;
}
>
>> inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>> page_add_new_anon_rmap(page, vma, vmf->address);
>
> folio_add_new_anon-rmap().
ok, will update.
>
>> - lru_cache_add_inactive_or_unevictable(page, vma);
>> + folio_add_lru_vma(folio, vma);
On 2023/1/13 23:37, Matthew Wilcox wrote:
> On Thu, Jan 12, 2023 at 04:30:02PM +0800, Kefeng Wang wrote:
>> - vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
>> - if (!vmf->cow_page)
>> + cow_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
>> + false);
>> + if (!cow_folio)
>
> I have a patch I've been sitting on that converts vmf->cow_page to be
> a folio. I think this series is well and truly wrecked at this point,
> so let me go back and dig it out; see if it still makes sense.
>
For now, as vmf->page and vmf->cow_page are used in do_cow_fault(), it
only supports COW on a single page, and after the conversion the folio
is still order-0.
> I'm a bit unsure about it because maybe we want to allocate
> high(ish)-order folios on COW fault, and if we do, then maybe we want
> to align them in some way with the virtual addresses, or the other
> folios in the VMA. And then we might want to indicate the precise
> page for this page fault rather than have this page fault be the
> start of a multi-order folio.
>
This means that with high(ish)-order/multi-order folios on COW,
additional work is needed to indicate the precise page for the page
fault; for order-0 the conversion is correct, but could it break or
mislead future logic? I'm not very clear about this part, so let's
wait for your patches.