2023-07-03 06:04:09

by Sidhartha Kumar

[permalink] [raw]
Subject: [PATCH 1/4] mm/memory: convert do_page_mkwrite() to use folios

Saves one implicit call to compound_head();

Signed-off-by: Sidhartha Kumar <[email protected]>
---
mm/memory.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 21fab27272092..098fac2f5efc0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2932,7 +2932,7 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
vm_fault_t ret;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
unsigned int old_flags = vmf->flags;

vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
@@ -2947,14 +2947,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio_mapping(folio)) {
+ folio_unlock(folio);
return 0; /* retry */
}
ret |= VM_FAULT_LOCKED;
} else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
return ret;
}

--
2.41.0



2023-07-03 06:07:47

by Sidhartha Kumar

[permalink] [raw]
Subject: [PATCH 3/4] mm/memory: convert do_shared_fault() to folios

Saves three implicit calls to compound_head().

Signed-off-by: Sidhartha Kumar <[email protected]>
---
mm/memory.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 93480e846ace6..33bf13431974c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4594,6 +4594,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret, tmp;
+ struct folio *folio = page_folio(vmf->page);

ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4604,11 +4605,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(vmf->page);
+ folio_unlock(folio);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
}
@@ -4616,8 +4617,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return ret;
}

--
2.41.0


2023-07-03 06:16:41

by Sidhartha Kumar

[permalink] [raw]
Subject: [PATCH 4/4] mm/memory: convert do_read_fault() to use folios

Saves one implicit call to compound_head()

Signed-off-by: Sidhartha Kumar <[email protected]>
---
mm/memory.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 33bf13431974c..b97c66df4adac 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4528,6 +4528,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
vm_fault_t ret = 0;
+ struct folio *folio = page_folio(vmf->page);

/*
* Let's call ->map_pages() first and use ->fault() as fallback
@@ -4545,9 +4546,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
return ret;

ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ folio_unlock(folio);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(vmf->page);
+ folio_put(folio);
return ret;
}

--
2.41.0


2023-07-03 06:19:03

by Sidhartha Kumar

[permalink] [raw]
Subject: [PATCH 2/4] mm/memory: convert wp_page_shared() to use folios

Saves five implicit calls to compound_head().

Signed-off-by: Sidhartha Kumar <[email protected]>
---
mm/memory.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 098fac2f5efc0..93480e846ace6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3286,8 +3286,9 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
+ struct folio *folio = page_folio(vmf->page);

- get_page(vmf->page);
+ folio_get(folio);

if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
vm_fault_t tmp;
@@ -3296,21 +3297,21 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(vmf->page);
+ folio_put(folio);
return tmp;
}
tmp = finish_mkwrite_fault(vmf);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- unlock_page(vmf->page);
- put_page(vmf->page);
+ folio_unlock(folio);
+ folio_put(folio);
return tmp;
}
} else {
wp_page_reuse(vmf);
- lock_page(vmf->page);
+ folio_lock(folio);
}
ret |= fault_dirty_shared_page(vmf);
- put_page(vmf->page);
+ folio_put(folio);

return ret;
}
--
2.41.0


2023-07-03 09:33:14

by zhangpeng (AS)

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm/memory: convert wp_page_shared() to use folios

On 2023/7/3 13:58, Sidhartha Kumar wrote:

> Saves five implicit calls to compound_head().
>
> Signed-off-by: Sidhartha Kumar <[email protected]>

Reviewed-by: ZhangPeng <[email protected]>

> ---
> mm/memory.c | 13 +++++++------
> 1 file changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 098fac2f5efc0..93480e846ace6 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3286,8 +3286,9 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> vm_fault_t ret = 0;
> + struct folio *folio = page_folio(vmf->page);
>
> - get_page(vmf->page);
> + folio_get(folio);
>
> if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
> vm_fault_t tmp;
> @@ -3296,21 +3297,21 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp || (tmp &
> (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> - put_page(vmf->page);
> + folio_put(folio);
> return tmp;
> }
> tmp = finish_mkwrite_fault(vmf);
> if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
> - unlock_page(vmf->page);
> - put_page(vmf->page);
> + folio_unlock(folio);
> + folio_put(folio);
> return tmp;
> }
> } else {
> wp_page_reuse(vmf);
> - lock_page(vmf->page);
> + folio_lock(folio);
> }
> ret |= fault_dirty_shared_page(vmf);
> - put_page(vmf->page);
> + folio_put(folio);
>
> return ret;
> }

2023-07-03 09:46:21

by zhangpeng (AS)

[permalink] [raw]
Subject: Re: [PATCH 4/4] mm/memory: convert do_read_fault() to use folios

On 2023/7/3 13:58, Sidhartha Kumar wrote:

> Saves one implicit call to compound_head()
>
> Signed-off-by: Sidhartha Kumar <[email protected]>

Reviewed-by: ZhangPeng <[email protected]>

> ---
> mm/memory.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 33bf13431974c..b97c66df4adac 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4528,6 +4528,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
> static vm_fault_t do_read_fault(struct vm_fault *vmf)
> {
> vm_fault_t ret = 0;
> + struct folio *folio = page_folio(vmf->page);
>
> /*
> * Let's call ->map_pages() first and use ->fault() as fallback
> @@ -4545,9 +4546,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
> return ret;
>
> ret |= finish_fault(vmf);
> - unlock_page(vmf->page);
> + folio_unlock(folio);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> - put_page(vmf->page);
> + folio_put(folio);
> return ret;
> }
>

2023-07-03 09:48:10

by zhangpeng (AS)

[permalink] [raw]
Subject: Re: [PATCH 3/4] mm/memory: convert do_shared_fault() to folios

On 2023/7/3 13:58, Sidhartha Kumar wrote:

> Saves three implicit calls to compound_head().
>
> Signed-off-by: Sidhartha Kumar <[email protected]>

Reviewed-by: ZhangPeng <[email protected]>

> ---
> mm/memory.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 93480e846ace6..33bf13431974c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4594,6 +4594,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> vm_fault_t ret, tmp;
> + struct folio *folio = page_folio(vmf->page);
>
> ret = __do_fault(vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> @@ -4604,11 +4605,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> * about to become writable
> */
> if (vma->vm_ops->page_mkwrite) {
> - unlock_page(vmf->page);
> + folio_unlock(folio);
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp ||
> (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> - put_page(vmf->page);
> + folio_put(folio);
> return tmp;
> }
> }
> @@ -4616,8 +4617,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> ret |= finish_fault(vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
> VM_FAULT_RETRY))) {
> - unlock_page(vmf->page);
> - put_page(vmf->page);
> + folio_unlock(folio);
> + folio_put(folio);
> return ret;
> }
>

2023-07-03 09:48:52

by zhangpeng (AS)

[permalink] [raw]
Subject: Re: [PATCH 1/4] mm/memory: convert do_page_mkwrite() to use folios

On 2023/7/3 13:58, Sidhartha Kumar wrote:

> Saves one implicit call to compound_head();
>
> Signed-off-by: Sidhartha Kumar <[email protected]>
> ---
> mm/memory.c | 10 +++++-----
> 1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 21fab27272092..098fac2f5efc0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2932,7 +2932,7 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
> static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
> {
> vm_fault_t ret;
> - struct page *page = vmf->page;
> + struct folio *folio = page_folio(vmf->page);
> unsigned int old_flags = vmf->flags;
>
> vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
> @@ -2947,14 +2947,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
> if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> - lock_page(page);
> - if (!page->mapping) {
> - unlock_page(page);
> + folio_lock(folio);
> + if (!folio_mapping(folio)) {

Could page->mapping be directly converted to folio->mapping?

Thanks,
Peng

> + folio_unlock(folio);
> return 0; /* retry */
> }
> ret |= VM_FAULT_LOCKED;
> } else
> - VM_BUG_ON_PAGE(!PageLocked(page), page);
> + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
> return ret;
> }
>

2023-07-03 22:48:56

by SeongJae Park

[permalink] [raw]
Subject: Re: [PATCH 3/4] mm/memory: convert do_shared_fault() to folios

Hi Sidharta,

On Sun, 2 Jul 2023 22:58:49 -0700 Sidhartha Kumar <[email protected]> wrote:

> Saves three implicit calls to compound_head().
>
> Signed-off-by: Sidhartha Kumar <[email protected]>
> ---
> mm/memory.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 93480e846ace6..33bf13431974c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4594,6 +4594,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> vm_fault_t ret, tmp;
> + struct folio *folio = page_folio(vmf->page);
>
> ret = __do_fault(vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> @@ -4604,11 +4605,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> * about to become writable
> */
> if (vma->vm_ops->page_mkwrite) {
> - unlock_page(vmf->page);
> + folio_unlock(folio);
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp ||
> (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> - put_page(vmf->page);
> + folio_put(folio);
> return tmp;
> }
> }
> @@ -4616,8 +4617,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> ret |= finish_fault(vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
> VM_FAULT_RETRY))) {
> - unlock_page(vmf->page);
> - put_page(vmf->page);
> + folio_unlock(folio);
> + folio_put(folio);
> return ret;
> }

I found the latest mm-unstable tree fails booting with stacktraces like below,
and bisecting points this patch (commit a43f078c7a3b of mm-unstable). Do you
have any idea?

[ 7.388551] BUG: kernel NULL pointer dereference, address: 0000000000000008
[ 7.389149] systemd[1]: Starting Load Kernel Module pstore_zone...
[ 7.390101] #PF: supervisor read access in kernel mode
[ 7.392370] #PF: error_code(0x0000) - not-present page
[ 7.392372] PGD 0 P4D 0
[ 7.392376] Oops: 0000 [#1] PREEMPT SMP PTI
[ 7.392379] CPU: 9 PID: 594 Comm: systemd-journal Not tainted 6.4.0+ #8
[ S7t.a3r9t2i3n82] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-4
[ 7.392384] RIP: 0010:_compound_head (include/linux/page-flags.h:245)
[ 7.400935] Code: e8 35 b5 cd ff 5d c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90f

Code starting with the faulting instruction
===========================================
0: e8 35 b5 cd ff callq 0xffffffffffcdb53a
5: 5d pop %rbp
6: c3 retq
7: cc int3
8: cc int3
9: cc int3
a: cc int3
b: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
12: 00 00 00 00
16: 0f 1f 40 00 nopl 0x0(%rax)
1a: 90 nop
1b: 0f .byte 0xf
[ 7.405283] RSP: 0000:ffffb86140cd3d58 EFLAGS: 00010202
[ 7.406551] RAX: ffff96d3c19c4d38 RBX: ffffffffa103f080 RCX: 00000001019c4067
[ 7.408233] RDX: 0000000000000000 RSI: ffff96d2c0000d38 RDI: 0000000000000000
[ 7.409893] RBP: ffffb86140cd3d90 R08: ffff96d3c19c4d38 R09: 0000000000000067
[ 7.411457] R10: 0000000000000000 R11: 00007f2ae19d5fff R12: ffffb86140cd3dd0
[ 7.412792] R13: 0000000000000001 R14: ffff96d3cb7aa450 R15: 0000000000000860
[ 7.414139] FS: 00007f2ae0f40980(0000) GS:ffff96f1fd640000(0000) knlGS:0000000000000000
[ 7.415699] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7.416780] CR2: 0000000000000008 CR3: 0000000104830000 CR4: 00000000000006e0
[ 7.418115] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 7.419492] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 7.420830] Call Trace:
[ 7.421308] <TASK>
[ 7.421722] ? show_regs (arch/x86/kernel/dumpstack.c:479)
[ 7.422411] ? __die_body (arch/x86/kernel/dumpstack.c:421)
[ 7.423113] ? __die (arch/x86/kernel/dumpstack.c:435)
[ 7.423716] ? page_fault_oops (arch/x86/mm/fault.c:707)
[ 7.424504] ? search_bpf_extables (kernel/bpf/core.c:751)
[ 7.425329] ? __pfx__compound_head (include/linux/page-flags.h:245)
[ 7.426171] ? search_exception_tables (kernel/extable.c:64)
[ 7.427084] ? kernelmode_fixup_or_oops (arch/x86/mm/fault.c:762)
[ 7.427995] ? __bad_area_nosemaphore (arch/x86/mm/fault.c:860)
[ 7.428891] ? up_read (arch/x86/include/asm/preempt.h:104 kernel/locking/rwsem.c:1354 kernel/locking/rwsem.c:1616)
[ 7.429514] ? bad_area_nosemaphore (arch/x86/mm/fault.c:867)
[ 7.430367] ? do_user_addr_fault (arch/x86/mm/fault.c:1458)
[ 7.431238] ? exc_page_fault (arch/x86/include/asm/paravirt.h:695 arch/x86/mm/fault.c:1495 arch/x86/mm/fault.c:1543)
[ 7.431998] ? asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570)
[ 7.432802] ? __pfx__compound_head (include/linux/page-flags.h:245)
[ 7.433640] ? do_pte_missing (mm/memory.c:4610 mm/memory.c:4682 mm/memory.c:3670)
[ 7.434425] __handle_mm_fault (mm/memory.c:4947 mm/memory.c:5087)
[ 7.435234] handle_mm_fault (mm/memory.c:5252)
[ 7.435976] do_user_addr_fault (arch/x86/mm/fault.c:1393)
[ 7.436786] exc_page_fault (arch/x86/include/asm/paravirt.h:695 arch/x86/mm/fault.c:1495 arch/x86/mm/fault.c:1543)
[ 7.437517] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570)
[ 7.438294] RIP: 0033:0x7f2ae1480ace
[ 7.439035] Code: 8d a0 48 00 00 00 49 8b 44 24 08 48 0b 85 48 00 00 00 74 28 48 8d 3d f1 63 1d 00 e8f

Code starting with the faulting instruction
===========================================
0: 8d a0 48 00 00 00 lea 0x48(%rax),%esp
6: 49 8b 44 24 08 mov 0x8(%r12),%rax
b: 48 0b 85 48 00 00 00 or 0x48(%rbp),%rax
12: 74 28 je 0x3c
14: 48 8d 3d f1 63 1d 00 lea 0x1d63f1(%rip),%rdi # 0x1d640c
1b: 8f .byte 0x8f
[ 7.442519] RSP: 002b:00007ffdfa53bc70 EFLAGS: 00010246
[ 7.443524] RAX: 0000000000000000 RBX: 00007f2ae19a7028 RCX: fffffffffffff000
[ 7.444857] RDX: 00007f2ae0f412e0 RSI: a3d70a3d70a3d70b RDI: 00007f2ae1656eb0
[ 7.446195] RBP: 00007f2ae0f40828 R08: 0000000000000001 R09: 00007f2ae19a7000
[ 7.447572] R10: 000055f8d50a1010 R11: 0000000000000246 R12: 00007f2ae0f40870
[ 7.448939] R13: 000055f8d50a7110 R14: 000055f8d50a2fd0 R15: 0000000000000001
[ 7.450277] </TASK>
[ 7.450725] Modules linked in: ip_tables x_tables autofs4 raid10 raid456 libcrc32c async_raid6_recov 4
[ 7.454653] Dumping ftrace buffer:
[ 7.455321] (ftrace buffer empty)
[ 7.456014] CR2: 0000000000000008
[ 7.456686] ---[ end trace 0000000000000000 ]---
[ 7.457576] RIP: 0010:_compound_head (include/linux/page-flags.h:245)
[ 7.458440] Code: e8 35 b5 cd ff 5d c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90f

Code starting with the faulting instruction
===========================================
0: e8 35 b5 cd ff callq 0xffffffffffcdb53a
5: 5d pop %rbp
6: c3 retq
7: cc int3
8: cc int3
9: cc int3
a: cc int3
b: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
12: 00 00 00 00
16: 0f 1f 40 00 nopl 0x0(%rax)
1a: 90 nop
1b: 0f .byte 0xf
[ 7.461986] RSP: 0000:ffffb86140cd3d58 EFLAGS: 00010202
[ 7.463014] RAX: ffff96d3c19c4d38 RBX: ffffffffa103f080 RCX: 00000001019c4067
[ 7.464374] RDX: 0000000000000000 RSI: ffff96d2c0000d38 RDI: 0000000000000000
[ 7.465731] RBP: ffffb86140cd3d90 R08: ffff96d3c19c4d38 R09: 0000000000000067
[ 7.467114] R10: 0000000000000000 R11: 00007f2ae19d5fff R12: ffffb86140cd3dd0
[ 7.468469] R13: 0000000000000001 R14: ffff96d3cb7aa450 R15: 0000000000000860
[ 7.469824] FS: 00007f2ae0f40980(0000) GS:ffff96f1fd640000(0000) knlGS:0000000000000000
[ 7.471383] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7.472482] CR2: 0000000000000008 CR3: 0000000104830000 CR4: 00000000000006e0
[ 7.473835] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 7.475216] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


Thanks,
SJ

2023-07-04 00:09:10

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 3/4] mm/memory: convert do_shared_fault() to folios

On Sun, Jul 02, 2023 at 10:58:49PM -0700, Sidhartha Kumar wrote:
> Saves three implicit calls to compound_head().
>
> Signed-off-by: Sidhartha Kumar <[email protected]>
> ---
> mm/memory.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 93480e846ace6..33bf13431974c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4594,6 +4594,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> vm_fault_t ret, tmp;
> + struct folio *folio = page_folio(vmf->page);

You can't call page_folio() until after __do_fault(). Did you test this
series?

> ret = __do_fault(vmf);
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))

2023-07-04 00:12:18

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/4] mm/memory: convert do_page_mkwrite() to use folios

On Sun, Jul 02, 2023 at 10:58:47PM -0700, Sidhartha Kumar wrote:
> @@ -2947,14 +2947,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
> if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> - lock_page(page);
> - if (!page->mapping) {
> - unlock_page(page);
> + folio_lock(folio);
> + if (!folio_mapping(folio)) {

You don't need to call folio_mapping() here. folio->mapping works
absolutely fine in this circumstance. In fact, you may have broken a
driver with this change. I can elaborate more, but I'm not quite in the
mood to do that right now.


2023-07-04 00:12:32

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 4/4] mm/memory: convert do_read_fault() to use folios

On Sun, Jul 02, 2023 at 10:58:50PM -0700, Sidhartha Kumar wrote:
> +++ b/mm/memory.c
> @@ -4528,6 +4528,7 @@ static inline bool should_fault_around(struct vm_fault *vmf)
> static vm_fault_t do_read_fault(struct vm_fault *vmf)
> {
> vm_fault_t ret = 0;
> + struct folio *folio = page_folio(vmf->page);

Similarly, vmf->page is not initialised until after __do_fault().


2023-07-04 00:16:24

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 2/4] mm/memory: convert wp_page_shared() to use folios

On Sun, Jul 02, 2023 at 10:58:48PM -0700, Sidhartha Kumar wrote:
> @@ -3296,21 +3297,21 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> tmp = do_page_mkwrite(vmf);

A nice improvement to make after the series might be to pass (vmf,
folio) to save even more calls to compound_head().


2023-07-04 02:22:50

by Sidhartha Kumar

[permalink] [raw]
Subject: Re: [PATCH 3/4] mm/memory: convert do_shared_fault() to folios

On 7/3/23 4:31 PM, Matthew Wilcox wrote:
> On Sun, Jul 02, 2023 at 10:58:49PM -0700, Sidhartha Kumar wrote:
>> Saves three implicit calls to compound_head().
>>
>> Signed-off-by: Sidhartha Kumar <[email protected]>
>> ---
>> mm/memory.c | 9 +++++----
>> 1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 93480e846ace6..33bf13431974c 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4594,6 +4594,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
>> {
>> struct vm_area_struct *vma = vmf->vma;
>> vm_fault_t ret, tmp;
>> + struct folio *folio = page_folio(vmf->page);
>
> You can't call page_folio() until after __do_fault(). Did you test this
> series?

I thought it would be straightforward enough without testing but I
didn't realize the initialization in __do_fault(). I will test and
incorporate the other suggestions for a v2.

Thanks
>
>> ret = __do_fault(vmf);
>> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))


2023-07-04 03:33:00

by Sidhartha Kumar

[permalink] [raw]
Subject: Re: [PATCH 3/4] mm/memory: convert do_shared_fault() to folios

On 7/3/23 3:05 PM, SeongJae Park wrote:
> Hi Sidharta,
>
> On Sun, 2 Jul 2023 22:58:49 -0700 Sidhartha Kumar <[email protected]> wrote:
>
>> Saves three implicit calls to compound_head().
>>
>> Signed-off-by: Sidhartha Kumar <[email protected]>
>> ---
>> mm/memory.c | 9 +++++----
>> 1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 93480e846ace6..33bf13431974c 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4594,6 +4594,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
>> {
>> struct vm_area_struct *vma = vmf->vma;
>> vm_fault_t ret, tmp;
>> + struct folio *folio = page_folio(vmf->page);
>>
>> ret = __do_fault(vmf);
>> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
>> @@ -4604,11 +4605,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
>> * about to become writable
>> */
>> if (vma->vm_ops->page_mkwrite) {
>> - unlock_page(vmf->page);
>> + folio_unlock(folio);
>> tmp = do_page_mkwrite(vmf);
>> if (unlikely(!tmp ||
>> (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
>> - put_page(vmf->page);
>> + folio_put(folio);
>> return tmp;
>> }
>> }
>> @@ -4616,8 +4617,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
>> ret |= finish_fault(vmf);
>> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
>> VM_FAULT_RETRY))) {
>> - unlock_page(vmf->page);
>> - put_page(vmf->page);
>> + folio_unlock(folio);
>> + folio_put(folio);
>> return ret;
>> }
>
> I found the latest mm-unstable tree fails booting with stacktraces like below,
> and bisecting points this patch (commit a43f078c7a3b of mm-unstable). Do you
> have any idea?

This looks like the issue that Matthew pointed out in the thread.
Andrew, can you please remove this patch series.

Thanks
Sidhartha Kumar
>
> [ 7.388551] BUG: kernel NULL pointer dereference, address: 0000000000000008
> [ 7.389149] systemd[1]: Starting Load Kernel Module pstore_zone...
> [ 7.390101] #PF: supervisor read access in kernel mode
> [ 7.392370] #PF: error_code(0x0000) - not-present page
> [ 7.392372] PGD 0 P4D 0
> [ 7.392376] Oops: 0000 [#1] PREEMPT SMP PTI
> [ 7.392379] CPU: 9 PID: 594 Comm: systemd-journal Not tainted 6.4.0+ #8
> [ S7t.a3r9t2i3n82] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-4
> [ 7.392384] RIP: 0010:_compound_head (include/linux/page-flags.h:245)
> [ 7.400935] Code: e8 35 b5 cd ff 5d c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90f
>
> Code starting with the faulting instruction
> ===========================================
> 0: e8 35 b5 cd ff callq 0xffffffffffcdb53a
> 5: 5d pop %rbp
> 6: c3 retq
> 7: cc int3
> 8: cc int3
> 9: cc int3
> a: cc int3
> b: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
> 12: 00 00 00 00
> 16: 0f 1f 40 00 nopl 0x0(%rax)
> 1a: 90 nop
> 1b: 0f .byte 0xf
> [ 7.405283] RSP: 0000:ffffb86140cd3d58 EFLAGS: 00010202
> [ 7.406551] RAX: ffff96d3c19c4d38 RBX: ffffffffa103f080 RCX: 00000001019c4067
> [ 7.408233] RDX: 0000000000000000 RSI: ffff96d2c0000d38 RDI: 0000000000000000
> [ 7.409893] RBP: ffffb86140cd3d90 R08: ffff96d3c19c4d38 R09: 0000000000000067
> [ 7.411457] R10: 0000000000000000 R11: 00007f2ae19d5fff R12: ffffb86140cd3dd0
> [ 7.412792] R13: 0000000000000001 R14: ffff96d3cb7aa450 R15: 0000000000000860
> [ 7.414139] FS: 00007f2ae0f40980(0000) GS:ffff96f1fd640000(0000) knlGS:0000000000000000
> [ 7.415699] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 7.416780] CR2: 0000000000000008 CR3: 0000000104830000 CR4: 00000000000006e0
> [ 7.418115] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 7.419492] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 7.420830] Call Trace:
> [ 7.421308] <TASK>
> [ 7.421722] ? show_regs (arch/x86/kernel/dumpstack.c:479)
> [ 7.422411] ? __die_body (arch/x86/kernel/dumpstack.c:421)
> [ 7.423113] ? __die (arch/x86/kernel/dumpstack.c:435)
> [ 7.423716] ? page_fault_oops (arch/x86/mm/fault.c:707)
> [ 7.424504] ? search_bpf_extables (kernel/bpf/core.c:751)
> [ 7.425329] ? __pfx__compound_head (include/linux/page-flags.h:245)
> [ 7.426171] ? search_exception_tables (kernel/extable.c:64)
> [ 7.427084] ? kernelmode_fixup_or_oops (arch/x86/mm/fault.c:762)
> [ 7.427995] ? __bad_area_nosemaphore (arch/x86/mm/fault.c:860)
> [ 7.428891] ? up_read (arch/x86/include/asm/preempt.h:104 kernel/locking/rwsem.c:1354 kernel/locking/rwsem.c:1616)
> [ 7.429514] ? bad_area_nosemaphore (arch/x86/mm/fault.c:867)
> [ 7.430367] ? do_user_addr_fault (arch/x86/mm/fault.c:1458)
> [ 7.431238] ? exc_page_fault (arch/x86/include/asm/paravirt.h:695 arch/x86/mm/fault.c:1495 arch/x86/mm/fault.c:1543)
> [ 7.431998] ? asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570)
> [ 7.432802] ? __pfx__compound_head (include/linux/page-flags.h:245)
> [ 7.433640] ? do_pte_missing (mm/memory.c:4610 mm/memory.c:4682 mm/memory.c:3670)
> [ 7.434425] __handle_mm_fault (mm/memory.c:4947 mm/memory.c:5087)
> [ 7.435234] handle_mm_fault (mm/memory.c:5252)
> [ 7.435976] do_user_addr_fault (arch/x86/mm/fault.c:1393)
> [ 7.436786] exc_page_fault (arch/x86/include/asm/paravirt.h:695 arch/x86/mm/fault.c:1495 arch/x86/mm/fault.c:1543)
> [ 7.437517] asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570)
> [ 7.438294] RIP: 0033:0x7f2ae1480ace
> [ 7.439035] Code: 8d a0 48 00 00 00 49 8b 44 24 08 48 0b 85 48 00 00 00 74 28 48 8d 3d f1 63 1d 00 e8f
>
> Code starting with the faulting instruction
> ===========================================
> 0: 8d a0 48 00 00 00 lea 0x48(%rax),%esp
> 6: 49 8b 44 24 08 mov 0x8(%r12),%rax
> b: 48 0b 85 48 00 00 00 or 0x48(%rbp),%rax
> 12: 74 28 je 0x3c
> 14: 48 8d 3d f1 63 1d 00 lea 0x1d63f1(%rip),%rdi # 0x1d640c
> 1b: 8f .byte 0x8f
> [ 7.442519] RSP: 002b:00007ffdfa53bc70 EFLAGS: 00010246
> [ 7.443524] RAX: 0000000000000000 RBX: 00007f2ae19a7028 RCX: fffffffffffff000
> [ 7.444857] RDX: 00007f2ae0f412e0 RSI: a3d70a3d70a3d70b RDI: 00007f2ae1656eb0
> [ 7.446195] RBP: 00007f2ae0f40828 R08: 0000000000000001 R09: 00007f2ae19a7000
> [ 7.447572] R10: 000055f8d50a1010 R11: 0000000000000246 R12: 00007f2ae0f40870
> [ 7.448939] R13: 000055f8d50a7110 R14: 000055f8d50a2fd0 R15: 0000000000000001
> [ 7.450277] </TASK>
> [ 7.450725] Modules linked in: ip_tables x_tables autofs4 raid10 raid456 libcrc32c async_raid6_recov 4
> [ 7.454653] Dumping ftrace buffer:
> [ 7.455321] (ftrace buffer empty)
> [ 7.456014] CR2: 0000000000000008
> [ 7.456686] ---[ end trace 0000000000000000 ]---
> [ 7.457576] RIP: 0010:_compound_head (include/linux/page-flags.h:245)
> [ 7.458440] Code: e8 35 b5 cd ff 5d c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90f
>
> Code starting with the faulting instruction
> ===========================================
> 0: e8 35 b5 cd ff callq 0xffffffffffcdb53a
> 5: 5d pop %rbp
> 6: c3 retq
> 7: cc int3
> 8: cc int3
> 9: cc int3
> a: cc int3
> b: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1)
> 12: 00 00 00 00
> 16: 0f 1f 40 00 nopl 0x0(%rax)
> 1a: 90 nop
> 1b: 0f .byte 0xf
> [ 7.461986] RSP: 0000:ffffb86140cd3d58 EFLAGS: 00010202
> [ 7.463014] RAX: ffff96d3c19c4d38 RBX: ffffffffa103f080 RCX: 00000001019c4067
> [ 7.464374] RDX: 0000000000000000 RSI: ffff96d2c0000d38 RDI: 0000000000000000
> [ 7.465731] RBP: ffffb86140cd3d90 R08: ffff96d3c19c4d38 R09: 0000000000000067
> [ 7.467114] R10: 0000000000000000 R11: 00007f2ae19d5fff R12: ffffb86140cd3dd0
> [ 7.468469] R13: 0000000000000001 R14: ffff96d3cb7aa450 R15: 0000000000000860
> [ 7.469824] FS: 00007f2ae0f40980(0000) GS:ffff96f1fd640000(0000) knlGS:0000000000000000
> [ 7.471383] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 7.472482] CR2: 0000000000000008 CR3: 0000000104830000 CR4: 00000000000006e0
> [ 7.473835] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 7.475216] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>
>
> Thanks,
> SJ
>


2023-07-04 19:52:13

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/4] mm/memory: convert do_page_mkwrite() to use folios

On Sun, Jul 02, 2023 at 10:58:47PM -0700, Sidhartha Kumar wrote:
> @@ -2947,14 +2947,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
> if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> - lock_page(page);
> - if (!page->mapping) {
> - unlock_page(page);
> + folio_lock(folio);
> + if (!folio_mapping(folio)) {
> + folio_unlock(folio);

I promised to explain this better once I had time, and I have time now.

folio->mapping is used for a multitude of purposes, unfortunately.
Maybe some future work will reduce that, but for now, These Are The Rules.

If the folio is marked as being Slab, it's used for something else.
The folio does not belong to an address space (nor can it be mapped,
so we're not going to see it here, but sometimes we see it in other
contexts where we call folio_mapping()).

The bottom two bits are used as PAGE_MAPPING_FLAGS. If they're both
0, this folio belongs to a file, and the rest of folio->mapping is a
pointer to a struct address_space. Or they're both 0 because the whole
thing is NULL. More on that below. If the bottom two bits are 01b,
this is an anonymous folio, and folio->mapping is actually a pointer
to an anon_vma (which is not the same thing as an anon vma). If the
bottom two bits are 10b, this is a Movable page (anon & file memory is
also movable, but this is different). The folio->mapping points to a
struct movable_operations. If the bottom two bits are 11b, this is a
KSM allocation, and folio->mapping points to a struct ksm_stable_node.

When we remove a folio from the page cache, we reset folio->mapping
to NULL. We often remove folios from the page cache before their
refcount drops to zero (the common case is to look up the folio in the
page cache, which grabs a reference, remove the folio from the page
cache which decrements the refcount, then put the folio which might be
the last refcount). So it's entirely possible to see a folio in this
function with a NULL mapping; that means it's been removed from the
file through a truncate or siimlar, and we need to fail the mkwrite.
Userspace is about to get a segfault.

If you find all of that confusing, well, I agree, and I'm trying to
simplify it.

So, with all that background, what's going on here? Part of the
"modern" protocol for handling page faults is to lock the folio
in vm_ops->page_mkwrite. But we still support (... why?) drivers
that haven't been updated. They return 0 on success instead of
VM_FAULT_LOCKED. So we take the lock for them, then check that the
folio wasn't truncated, and bail out if it looks like it was.

If we have a really old-school driver that has allocated a page,
mapped it to userspace, and set page->mapping to be, eg, Movable, by
calling folio_mapping() instead of folio->mapping, we'll end up seeing
NULL instead of a non-NULL value, mistakenly believe it to have been
truncated and enter an endless loop.

Am I being paranoid here? Maybe! Drivers should have been updated by
now. The "modern" way was introduced in 2007 (commit d0217ac04ca6), so
it'd be nice to turn this into a WARN_ON_ONCE so drivers fix their code.
There are only ~30 implementations of page_mkwrite in the kernel, so it
might not take too long to check everything's OK.