Rebased on latest mm-unstable. As we have a conflict now with a cleanup
from Christoph, temporarily revert that one, so we can apply the fix,
and reapply the adjusted cleanup on top. I squashed the fixups
sitting in Andrew's tree for that patch.
The fix should likely go in first via the hotfix route, that's why I'm
moving it to the front.
Tested with my reproducer.
v1 -> v2:
* Rebased to latest mm-unstable
* "x86/mm/pat: fix VM_PAT handling in COW mappings"
-> Fix function parameter indentation
-> Add Ingo's Ack
Cc: Andrew Morton <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Fei Li <[email protected]>
Cc: Nathan Chancellor <[email protected]>
Christoph Hellwig (1):
mm: move follow_phys to arch/x86/mm/pat/memtype.c
David Hildenbrand (2):
[mm-unstable] Revert "mm: move follow_phys to
arch/x86/mm/pat/memtype.c"
x86/mm/pat: fix VM_PAT handling in COW mappings
arch/x86/mm/pat/memtype.c | 56 +++++++++++++++++++++++++++++----------
1 file changed, 42 insertions(+), 14 deletions(-)
--
2.44.0
Revert mm-unstable patches:
* mm-move-follow_phys-to-arch-x86-mm-pat-memtypec-fix-2
* mm-move-follow_phys-to-arch-x86-mm-pat-memtypec-fix
* mm: move follow_phys to arch/x86/mm/pat/memtype.c
Signed-off-by: David Hildenbrand <[email protected]>
---
arch/x86/mm/pat/memtype.c | 24 ++----------------------
include/linux/mm.h | 2 ++
mm/memory.c | 28 ++++++++++++++++++++++++++++
3 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 143d1e3d3fd2..0d72183b5dd0 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,7 +39,6 @@
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
@@ -948,25 +947,6 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
-static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
- resource_size_t *phys)
-{
- pte_t *ptep, pte;
- spinlock_t *ptl;
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- return -EINVAL;
-
- if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl))
- return -EINVAL;
-
- pte = ptep_get(ptep);
- *prot = pgprot_val(pte_pgprot(pte));
- *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
- pte_unmap_unlock(ptep, ptl);
- return 0;
-}
-
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
@@ -986,7 +966,7 @@ int track_pfn_copy(struct vm_area_struct *vma)
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
*/
- if (follow_phys(vma, &prot, &paddr)) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -1073,7 +1053,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
- if (follow_phys(vma, &prot, &paddr)) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc0cd34a8042..97e779993c74 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2424,6 +2424,8 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp);
+int follow_phys(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
diff --git a/mm/memory.c b/mm/memory.c
index 912cd738ec03..1211e2090c1a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5987,6 +5987,34 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
EXPORT_SYMBOL_GPL(follow_pte);
#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned long *prot, resource_size_t *phys)
+{
+ int ret = -EINVAL;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out;
+
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ goto out;
+ pte = ptep_get(ptep);
+
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+
+ *prot = pgprot_val(pte_pgprot(pte));
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+ ret = 0;
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return ret;
+}
+
/**
* generic_access_phys - generic implementation for iomem mmap access
* @vma: the vma to access
--
2.44.0
PAT handling won't do the right thing in COW mappings: the first PTE
(or, in fact, all PTEs) can be replaced during write faults to point at
anon folios. Reliably recovering the correct PFN and cachemode using
follow_phys() from PTEs will not work in COW mappings.
Using follow_phys(), we might just get the address+protection of the
anon folio (which is very wrong), or we might hit swap/nonswap entries,
in which case follow_phys() fails and triggers a WARN_ON_ONCE() in
untrack_pfn() and track_pfn_copy() without properly calling
free_pfn_range().
In free_pfn_range(), we either wouldn't call memtype_free() or would
call it with the wrong range, possibly leaking memory.
To fix that, let's update follow_phys() to refuse returning anon folios,
and fall back to using the stored PFN inside vma->vm_pgoff for COW
mappings if we run into that.
We will now properly handle untrack_pfn() with COW mappings, where we
don't need the cachemode. We'll have to fail fork()->track_pfn_copy() if
the first page was replaced by an anon folio, though: we'd have to store
the cachemode in the VMA to make this work, likely growing the VMA size.
For now, let's keep it simple and let track_pfn_copy() just fail in that
case: it would have failed in the past with swap/nonswap entries already,
and it would have done the wrong thing with anon folios.
Simple reproducer to trigger the WARN_ON_ONCE() in untrack_pfn():
<--- C reproducer --->
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <liburing.h>
int main(void)
{
struct io_uring_params p = {};
int ring_fd;
size_t size;
char *map;
ring_fd = io_uring_setup(1, &p);
if (ring_fd < 0) {
perror("io_uring_setup");
return 1;
}
size = p.sq_off.array + p.sq_entries * sizeof(unsigned);
/* Map the submission queue ring MAP_PRIVATE */
map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE,
ring_fd, IORING_OFF_SQ_RING);
if (map == MAP_FAILED) {
perror("mmap");
return 1;
}
/* We have at least one page. Let's COW it. */
*map = 0;
pause();
return 0;
}
<--- C reproducer --->
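The reproducer needs liburing; assuming the development headers are
installed, it should build with something like
"gcc -o iouring iouring.c -luring" (the file name is arbitrary, chosen
here to match the usage below).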
On a system with 16 GiB RAM and swap configured:
# ./iouring &
# memhog 16G
# killall iouring
[ 301.552930] ------------[ cut here ]------------
[ 301.553285] WARNING: CPU: 7 PID: 1402 at arch/x86/mm/pat/memtype.c:1060 untrack_pfn+0xf4/0x100
[ 301.553989] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_g
[ 301.558232] CPU: 7 PID: 1402 Comm: iouring Not tainted 6.7.5-100.fc38.x86_64 #1
[ 301.558772] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebu4
[ 301.559569] RIP: 0010:untrack_pfn+0xf4/0x100
[ 301.559893] Code: 75 c4 eb cf 48 8b 43 10 8b a8 e8 00 00 00 3b 6b 28 74 b8 48 8b 7b 30 e8 ea 1a f7 000
[ 301.561189] RSP: 0018:ffffba2c0377fab8 EFLAGS: 00010282
[ 301.561590] RAX: 00000000ffffffea RBX: ffff9208c8ce9cc0 RCX: 000000010455e047
[ 301.562105] RDX: 07fffffff0eb1e0a RSI: 0000000000000000 RDI: ffff9208c391d200
[ 301.562628] RBP: 0000000000000000 R08: ffffba2c0377fab8 R09: 0000000000000000
[ 301.563145] R10: ffff9208d2292d50 R11: 0000000000000002 R12: 00007fea890e0000
[ 301.563669] R13: 0000000000000000 R14: ffffba2c0377fc08 R15: 0000000000000000
[ 301.564186] FS: 0000000000000000(0000) GS:ffff920c2fbc0000(0000) knlGS:0000000000000000
[ 301.564773] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 301.565197] CR2: 00007fea88ee8a20 CR3: 00000001033a8000 CR4: 0000000000750ef0
[ 301.565725] PKRU: 55555554
[ 301.565944] Call Trace:
[ 301.566148] <TASK>
[ 301.566325] ? untrack_pfn+0xf4/0x100
[ 301.566618] ? __warn+0x81/0x130
[ 301.566876] ? untrack_pfn+0xf4/0x100
[ 301.567163] ? report_bug+0x171/0x1a0
[ 301.567466] ? handle_bug+0x3c/0x80
[ 301.567743] ? exc_invalid_op+0x17/0x70
[ 301.568038] ? asm_exc_invalid_op+0x1a/0x20
[ 301.568363] ? untrack_pfn+0xf4/0x100
[ 301.568660] ? untrack_pfn+0x65/0x100
[ 301.568947] unmap_single_vma+0xa6/0xe0
[ 301.569247] unmap_vmas+0xb5/0x190
[ 301.569532] exit_mmap+0xec/0x340
[ 301.569801] __mmput+0x3e/0x130
[ 301.570051] do_exit+0x305/0xaf0
...
Reported-by: Wupeng Ma <[email protected]>
Closes: https://lkml.kernel.org/r/[email protected]
Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines")
Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3")
Acked-by: Ingo Molnar <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
---
arch/x86/mm/pat/memtype.c | 49 ++++++++++++++++++++++++++++-----------
mm/memory.c | 4 ++++
2 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 0d72183b5dd0..36b603d0cdde 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -947,6 +947,38 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
+ pgprot_t *pgprot)
+{
+ unsigned long prot;
+
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
+
+ /*
+ * We need the starting PFN and cachemode used for track_pfn_remap()
+ * that covered the whole VMA. For most mappings, we can obtain that
+ * information from the page tables. For COW mappings, we might now
+ * suddenly have anon folios mapped and follow_phys() will fail.
+ *
+ * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
+ * detect the PFN. If we need the cachemode as well, we're out of luck
+ * for now and have to fail fork().
+ */
+ if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (pgprot)
+ *pgprot = __pgprot(prot);
+ return 0;
+ }
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (pgprot)
+ return -EINVAL;
+ *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
@@ -957,20 +989,13 @@ static void free_pfn_range(u64 paddr, unsigned long size)
int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
- unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (vma->vm_flags & VM_PAT) {
- /*
- * reserve the whole chunk covered by vma. We need the
- * starting address and protection from pte.
- */
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, &pgprot))
return -EINVAL;
- }
- pgprot = __pgprot(prot);
+ /* reserve the whole chunk covered by vma. */
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
@@ -1045,7 +1070,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size, bool mm_wr_locked)
{
resource_size_t paddr;
- unsigned long prot;
if (vma && !(vma->vm_flags & VM_PAT))
return;
@@ -1053,11 +1077,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, NULL))
return;
- }
-
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
diff --git a/mm/memory.c b/mm/memory.c
index 1211e2090c1a..1e9a0288fdaf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6002,6 +6002,10 @@ int follow_phys(struct vm_area_struct *vma,
goto out;
pte = ptep_get(ptep);
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (vm_normal_folio(vma, address, pte))
+ goto unlock;
+
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
--
2.44.0
From: Christoph Hellwig <[email protected]>
follow_phys is only used by two callers in arch/x86/mm/pat/memtype.c.
Move it there and hardcode the two arguments that get the same values
passed by both callers (address == vma->vm_start and flags == 0).
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Fei Li <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Nathan Chancellor <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]> # rebased, applied fixups
---
arch/x86/mm/pat/memtype.c | 29 ++++++++++++++++++++++++++++-
include/linux/mm.h | 2 --
mm/memory.c | 32 --------------------------------
3 files changed, 28 insertions(+), 35 deletions(-)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 36b603d0cdde..d01c3b0bd6eb 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,6 +39,7 @@
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
@@ -947,6 +948,32 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
+ resource_size_t *phys)
+{
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+ if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl))
+ return -EINVAL;
+
+ pte = ptep_get(ptep);
+
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (vm_normal_folio(vma, vma->vm_start, pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ return -EINVAL;
+ }
+
+ *prot = pgprot_val(pte_pgprot(pte));
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+
static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
pgprot_t *pgprot)
{
@@ -964,7 +991,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
* detect the PFN. If we need the cachemode as well, we're out of luck
* for now and have to fail fork().
*/
- if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (!follow_phys(vma, &prot, paddr)) {
if (pgprot)
*pgprot = __pgprot(prot);
return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 97e779993c74..bc0cd34a8042 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2424,8 +2424,6 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp);
-int follow_phys(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags, unsigned long *prot, resource_size_t *phys);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
diff --git a/mm/memory.c b/mm/memory.c
index 1e9a0288fdaf..912cd738ec03 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5987,38 +5987,6 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
EXPORT_SYMBOL_GPL(follow_pte);
#ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned long *prot, resource_size_t *phys)
-{
- int ret = -EINVAL;
- pte_t *ptep, pte;
- spinlock_t *ptl;
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- goto out;
-
- if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
- goto out;
- pte = ptep_get(ptep);
-
- /* Never return PFNs of anon folios in COW mappings. */
- if (vm_normal_folio(vma, address, pte))
- goto unlock;
-
- if ((flags & FOLL_WRITE) && !pte_write(pte))
- goto unlock;
-
- *prot = pgprot_val(pte_pgprot(pte));
- *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
-
- ret = 0;
-unlock:
- pte_unmap_unlock(ptep, ptl);
-out:
- return ret;
-}
-
/**
* generic_access_phys - generic implementation for iomem mmap access
* @vma: the vma to access
--
2.44.0
On Wed, 3 Apr 2024 23:21:28 +0200 David Hildenbrand <[email protected]> wrote:
> Rebased on latest mm-unstable. As we have a conflict now with a cleanup
> from Christoph, temporarily revert that one, so we can apply the fix,
> and reapply the adjusted cleanup on top. I squashed the fixups
> sitting in Andrew's tree for that patch.
>
> The fix should likely go in first via the hotfix route, that's why I'm
> moving it to the front.
Well that was easy, thanks ;) Perhaps hch can double-check [3/3] here.
On Wed, 3 Apr 2024 23:21:30 +0200 David Hildenbrand <[email protected]> wrote:
> PAT handling won't do the right thing in COW mappings: the first PTE
> (or, in fact, all PTEs) can be replaced during write faults to point at
> anon folios. Reliably recovering the correct PFN and cachemode using
> follow_phys() from PTEs will not work in COW mappings.
>
> ...
>
> Reported-by: Wupeng Ma <[email protected]>
> Closes: https://lkml.kernel.org/r/[email protected]
> Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines")
> Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3")
These are really old. Should we backport this?
On 04.04.24 00:12, Andrew Morton wrote:
> On Wed, 3 Apr 2024 23:21:30 +0200 David Hildenbrand <[email protected]> wrote:
>
>> PAT handling won't do the right thing in COW mappings: the first PTE
>> (or, in fact, all PTEs) can be replaced during write faults to point at
>> anon folios. Reliably recovering the correct PFN and cachemode using
>> follow_phys() from PTEs will not work in COW mappings.
>>
>> ...
>>
>> Reported-by: Wupeng Ma <[email protected]>
>> Closes: https://lkml.kernel.org/r/[email protected]
>> Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines")
>> Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3")
>
> These are really old. Should we backport this?
I was asking that question myself.
With the reproducer, the worst thing that happens on most systems is the
warning. On !RAM and with PAT, there could be memory leaks and other
surprises.
Likely, we should just backport it to stable. Should not be too hard to
backport to stable kernels I guess/hope.
--
Cheers,
David / dhildenb
On Thu, 4 Apr 2024 21:20:06 +0200 David Hildenbrand <[email protected]> wrote:
> >> Reported-by: Wupeng Ma <[email protected]>
> >> Closes: https://lkml.kernel.org/r/[email protected]
> >> Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines")
> >> Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3")
> >
> > These are really old. Should we backport this?
>
> I was asking that question myself.
>
> With the reproducer, the worst thing that happens on most systems is the
> warning. On !RAM and with PAT, there could be memory leaks and other
> surprises.
>
> Likely, we should just backport it to stable. Should not be too hard to
> backport to stable kernels I guess/hope.
OK, thanks, I added the cc:stable tag.
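(That is, a "Cc: <stable@vger.kernel.org>" line among the patch tags,
so the stable maintainers pick the fix up once it lands upstream.)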
On Wed, Apr 03, 2024 at 03:09:58PM -0700, Andrew Morton wrote:
> On Wed, 3 Apr 2024 23:21:28 +0200 David Hildenbrand <[email protected]> wrote:
>
> > Rebased on latest mm-unstable. As we have a conflict now with a cleanup
> > from Christoph, temporarily revert that one, so we can apply the fix,
> > and reapply the adjusted cleanup on top. I squashed the fixups
> > sitting in Andrew's tree for that patch.
> >
> > The fix should likely go in first via the hotfix route, that's why I'm
> > moving it to the front.
>
> Well that was easy, thanks ;) Perhaps hch can double-check [3/3] here.
Still looks good.
On 2024/4/4 5:21, David Hildenbrand wrote:
> PAT handling won't do the right thing in COW mappings: the first PTE
> (or, in fact, all PTEs) can be replaced during write faults to point at
> anon folios. Reliably recovering the correct PFN and cachemode using
> follow_phys() from PTEs will not work in COW mappings.
>
> Using follow_phys(), we might just get the address+protection of the
> anon folio (which is very wrong), or we might hit swap/nonswap entries,
> in which case follow_phys() fails and triggers a WARN_ON_ONCE() in
> untrack_pfn() and track_pfn_copy() without properly calling
> free_pfn_range().
>
> In free_pfn_range(), we either wouldn't call memtype_free() or would
> call it with the wrong range, possibly leaking memory.
>
> To fix that, let's update follow_phys() to refuse returning anon folios,
> and fall back to using the stored PFN inside vma->vm_pgoff for COW
> mappings if we run into that.
>
> We will now properly handle untrack_pfn() with COW mappings, where we
> don't need the cachemode. We'll have to fail fork()->track_pfn_copy() if
> the first page was replaced by an anon folio, though: we'd have to store
> the cachemode in the VMA to make this work, likely growing the VMA size.
>
> For now, let's keep it simple and let track_pfn_copy() just fail in that
> case: it would have failed in the past with swap/nonswap entries already,
> and it would have done the wrong thing with anon folios.
>
> Simple reproducer to trigger the WARN_ON_ONCE() in untrack_pfn():
>
> <--- C reproducer --->
> #include <stdio.h>
> #include <sys/mman.h>
> #include <unistd.h>
> #include <liburing.h>
>
> int main(void)
> {
> struct io_uring_params p = {};
> int ring_fd;
> size_t size;
> char *map;
>
> ring_fd = io_uring_setup(1, &p);
> if (ring_fd < 0) {
> perror("io_uring_setup");
> return 1;
> }
> size = p.sq_off.array + p.sq_entries * sizeof(unsigned);
>
> /* Map the submission queue ring MAP_PRIVATE */
> map = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE,
> ring_fd, IORING_OFF_SQ_RING);
> if (map == MAP_FAILED) {
> perror("mmap");
> return 1;
> }
>
> /* We have at least one page. Let's COW it. */
> *map = 0;
> pause();
> return 0;
> }
> <--- C reproducer --->
>
> On a system with 16 GiB RAM and swap configured:
> # ./iouring &
> # memhog 16G
> # killall iouring
> [ 301.552930] ------------[ cut here ]------------
> [ 301.553285] WARNING: CPU: 7 PID: 1402 at arch/x86/mm/pat/memtype.c:1060 untrack_pfn+0xf4/0x100
> [ 301.553989] Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_g
> [ 301.558232] CPU: 7 PID: 1402 Comm: iouring Not tainted 6.7.5-100.fc38.x86_64 #1
> [ 301.558772] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebu4
> [ 301.559569] RIP: 0010:untrack_pfn+0xf4/0x100
> [ 301.559893] Code: 75 c4 eb cf 48 8b 43 10 8b a8 e8 00 00 00 3b 6b 28 74 b8 48 8b 7b 30 e8 ea 1a f7 000
> [ 301.561189] RSP: 0018:ffffba2c0377fab8 EFLAGS: 00010282
> [ 301.561590] RAX: 00000000ffffffea RBX: ffff9208c8ce9cc0 RCX: 000000010455e047
> [ 301.562105] RDX: 07fffffff0eb1e0a RSI: 0000000000000000 RDI: ffff9208c391d200
> [ 301.562628] RBP: 0000000000000000 R08: ffffba2c0377fab8 R09: 0000000000000000
> [ 301.563145] R10: ffff9208d2292d50 R11: 0000000000000002 R12: 00007fea890e0000
> [ 301.563669] R13: 0000000000000000 R14: ffffba2c0377fc08 R15: 0000000000000000
> [ 301.564186] FS: 0000000000000000(0000) GS:ffff920c2fbc0000(0000) knlGS:0000000000000000
> [ 301.564773] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 301.565197] CR2: 00007fea88ee8a20 CR3: 00000001033a8000 CR4: 0000000000750ef0
> [ 301.565725] PKRU: 55555554
> [ 301.565944] Call Trace:
> [ 301.566148] <TASK>
> [ 301.566325] ? untrack_pfn+0xf4/0x100
> [ 301.566618] ? __warn+0x81/0x130
> [ 301.566876] ? untrack_pfn+0xf4/0x100
> [ 301.567163] ? report_bug+0x171/0x1a0
> [ 301.567466] ? handle_bug+0x3c/0x80
> [ 301.567743] ? exc_invalid_op+0x17/0x70
> [ 301.568038] ? asm_exc_invalid_op+0x1a/0x20
> [ 301.568363] ? untrack_pfn+0xf4/0x100
> [ 301.568660] ? untrack_pfn+0x65/0x100
> [ 301.568947] unmap_single_vma+0xa6/0xe0
> [ 301.569247] unmap_vmas+0xb5/0x190
> [ 301.569532] exit_mmap+0xec/0x340
> [ 301.569801] __mmput+0x3e/0x130
> [ 301.570051] do_exit+0x305/0xaf0
> ...
>
> Reported-by: Wupeng Ma <[email protected]>
> Closes: https://lkml.kernel.org/r/[email protected]
> Fixes: b1a86e15dc03 ("x86, pat: remove the dependency on 'vm_pgoff' in track/untrack pfn vma routines")
> Fixes: 5899329b1910 ("x86: PAT: implement track/untrack of pfnmap regions for x86 - v3")
> Acked-by: Ingo Molnar <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: "H. Peter Anvin" <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Signed-off-by: David Hildenbrand <[email protected]>
> ---
> arch/x86/mm/pat/memtype.c | 49 ++++++++++++++++++++++++++++-----------
> mm/memory.c | 4 ++++
> 2 files changed, 39 insertions(+), 14 deletions(-)
Tested-by: Wupeng Ma <[email protected]>
>
> diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> index 0d72183b5dd0..36b603d0cdde 100644
> --- a/arch/x86/mm/pat/memtype.c
> +++ b/arch/x86/mm/pat/memtype.c
> @@ -947,6 +947,38 @@ static void free_pfn_range(u64 paddr, unsigned long size)
> memtype_free(paddr, paddr + size);
> }
>
> +static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
> + pgprot_t *pgprot)
> +{
> + unsigned long prot;
> +
> + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
> +
> + /*
> + * We need the starting PFN and cachemode used for track_pfn_remap()
> + * that covered the whole VMA. For most mappings, we can obtain that
> + * information from the page tables. For COW mappings, we might now
> + * suddenly have anon folios mapped and follow_phys() will fail.
> + *
> + * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
> + * detect the PFN. If we need the cachemode as well, we're out of luck
> + * for now and have to fail fork().
> + */
> + if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
> + if (pgprot)
> + *pgprot = __pgprot(prot);
> + return 0;
> + }
> + if (is_cow_mapping(vma->vm_flags)) {
> + if (pgprot)
> + return -EINVAL;
> + *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
> + return 0;
> + }
> + WARN_ON_ONCE(1);
> + return -EINVAL;
> +}
> +
> /*
> * track_pfn_copy is called when vma that is covering the pfnmap gets
> * copied through copy_page_range().
> @@ -957,20 +989,13 @@ static void free_pfn_range(u64 paddr, unsigned long size)
> int track_pfn_copy(struct vm_area_struct *vma)
> {
> resource_size_t paddr;
> - unsigned long prot;
> unsigned long vma_size = vma->vm_end - vma->vm_start;
> pgprot_t pgprot;
>
> if (vma->vm_flags & VM_PAT) {
> - /*
> - * reserve the whole chunk covered by vma. We need the
> - * starting address and protection from pte.
> - */
> - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
> - WARN_ON_ONCE(1);
> + if (get_pat_info(vma, &paddr, &pgprot))
> return -EINVAL;
> - }
> - pgprot = __pgprot(prot);
> + /* reserve the whole chunk covered by vma. */
> return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
> }
>
> @@ -1045,7 +1070,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
> unsigned long size, bool mm_wr_locked)
> {
> resource_size_t paddr;
> - unsigned long prot;
>
> if (vma && !(vma->vm_flags & VM_PAT))
> return;
> @@ -1053,11 +1077,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
> /* free the chunk starting from pfn or the whole chunk */
> paddr = (resource_size_t)pfn << PAGE_SHIFT;
> if (!paddr && !size) {
> - if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
> - WARN_ON_ONCE(1);
> + if (get_pat_info(vma, &paddr, NULL))
> return;
> - }
> -
> size = vma->vm_end - vma->vm_start;
> }
> free_pfn_range(paddr, size);
> diff --git a/mm/memory.c b/mm/memory.c
> index 1211e2090c1a..1e9a0288fdaf 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -6002,6 +6002,10 @@ int follow_phys(struct vm_area_struct *vma,
> goto out;
> pte = ptep_get(ptep);
>
> + /* Never return PFNs of anon folios in COW mappings. */
> + if (vm_normal_folio(vma, address, pte))
> + goto unlock;
> +
> if ((flags & FOLL_WRITE) && !pte_write(pte))
> goto unlock;
>