2019-12-05 14:05:13

by Daniel Axtens

Subject: [PATCH 1/3] mm: add apply_to_existing_pages helper

apply_to_page_range takes an address range, and if any parts of it
are not covered by the existing page table hierarchy, it allocates
memory to fill them in.

In some use cases, this is not what we want - we want to be able to
operate exclusively on PTEs that are already in the tables.

Add apply_to_existing_pages for this. Adjust the walker functions
for apply_to_page_range to take 'create', which switches them between
the old and new modes.

This will be used in KASAN vmalloc.
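
For illustration, a caller that should only touch page table entries
which are already present might look something like the sketch below
(example_clear_pte() and example() are made-up names for this
illustration, not part of the patch):

static int example_clear_pte(pte_t *pte, unsigned long addr, void *data)
{
        /* Clear whatever is in the PTE slot we are handed. */
        pte_clear(&init_mm, addr, pte);
        return 0;
}

static void example(unsigned long addr, unsigned long size)
{
        /*
         * Existing interface: visits every PTE slot in the range,
         * allocating any missing page tables on the way down (and so
         * may sleep).
         */
        if (apply_to_page_range(&init_mm, addr, size, example_clear_pte, NULL))
                pr_warn("apply_to_page_range failed\n");

        /*
         * New helper: only visits PTEs that already exist; regions with
         * no page tables are skipped and no memory is allocated.
         */
        if (apply_to_existing_pages(&init_mm, addr, size, example_clear_pte,
                                    NULL))
                pr_warn("apply_to_existing_pages failed\n");
}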

Signed-off-by: Daniel Axtens <[email protected]>
---
include/linux/mm.h | 3 ++
mm/memory.c | 131 +++++++++++++++++++++++++++++++++------------
2 files changed, 99 insertions(+), 35 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c97ea3b694e6..f4dba827d76e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2621,6 +2621,9 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
+extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
+ unsigned long size, pte_fn_t fn,
+ void *data);

#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index 606da187d1de..e508ba7e0a19 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2021,26 +2021,34 @@ EXPORT_SYMBOL(vm_iomap_memory);

static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+ pte_fn_t fn, void *data, bool create)
{
pte_t *pte;
- int err;
+ int err = 0;
spinlock_t *uninitialized_var(ptl);

- pte = (mm == &init_mm) ?
- pte_alloc_kernel(pmd, addr) :
- pte_alloc_map_lock(mm, pmd, addr, &ptl);
- if (!pte)
- return -ENOMEM;
+ if (create) {
+ pte = (mm == &init_mm) ?
+ pte_alloc_kernel(pmd, addr) :
+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+ } else {
+ pte = (mm == &init_mm) ?
+ pte_offset_kernel(pmd, addr) :
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ }

BUG_ON(pmd_huge(*pmd));

arch_enter_lazy_mmu_mode();

do {
- err = fn(pte++, addr, data);
- if (err)
- break;
+ if (create || !pte_none(*pte)) {
+ err = fn(pte++, addr, data);
+ if (err)
+ break;
+ }
} while (addr += PAGE_SIZE, addr != end);

arch_leave_lazy_mmu_mode();
@@ -2052,62 +2060,83 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+ pte_fn_t fn, void *data, bool create)
{
pmd_t *pmd;
unsigned long next;
- int err;
+ int err = 0;

BUG_ON(pud_huge(*pud));

- pmd = pmd_alloc(mm, pud, addr);
- if (!pmd)
- return -ENOMEM;
+ if (create) {
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ } else {
+ pmd = pmd_offset(pud, addr);
+ }
do {
next = pmd_addr_end(addr, end);
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
- if (err)
- break;
+ if (create || !pmd_none_or_clear_bad(pmd)) {
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
+ create);
+ if (err)
+ break;
+ }
} while (pmd++, addr = next, addr != end);
return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+ pte_fn_t fn, void *data, bool create)
{
pud_t *pud;
unsigned long next;
- int err;
+ int err = 0;

- pud = pud_alloc(mm, p4d, addr);
- if (!pud)
- return -ENOMEM;
+ if (create) {
+ pud = pud_alloc(mm, p4d, addr);
+ if (!pud)
+ return -ENOMEM;
+ } else {
+ pud = pud_offset(p4d, addr);
+ }
do {
next = pud_addr_end(addr, end);
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
- if (err)
- break;
+ if (create || !pud_none_or_clear_bad(pud)) {
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
+ create);
+ if (err)
+ break;
+ }
} while (pud++, addr = next, addr != end);
return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+ pte_fn_t fn, void *data, bool create)
{
p4d_t *p4d;
unsigned long next;
- int err;
+ int err = 0;

- p4d = p4d_alloc(mm, pgd, addr);
- if (!p4d)
- return -ENOMEM;
+ if (create) {
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return -ENOMEM;
+ } else {
+ p4d = p4d_offset(pgd, addr);
+ }
do {
next = p4d_addr_end(addr, end);
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
- if (err)
- break;
+ if (create || !p4d_none_or_clear_bad(p4d)) {
+ err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
+ create);
+ if (err)
+ break;
+ }
} while (p4d++, addr = next, addr != end);
return err;
}
@@ -2130,7 +2159,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, true);
if (err)
break;
} while (pgd++, addr = next, addr != end);
@@ -2139,6 +2168,38 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

+/*
+ * Scan a region of virtual memory, calling a provided function on
+ * each leaf page table where it exists.
+ *
+ * Unlike apply_to_page_range, this does _not_ fill in page tables
+ * where they are absent.
+ */
+int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long end = addr + size;
+ int err = 0;
+
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_existing_pages);
+
/*
* handle_pte_fault chooses page fault handler according to an entry which was
* read non-atomically. Before making any commitment, on those architectures
--
2.20.1


2019-12-05 14:06:52

by Daniel Axtens

Subject: [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow

kasan_release_vmalloc uses apply_to_page_range to release vmalloc
shadow. Unfortunately, apply_to_page_range can allocate memory to
fill in page table entries, which is not what we want.

Also, kasan_release_vmalloc is called under free_vmap_area_lock,
so if apply_to_page_range does allocate memory, we get a
sleep-in-atomic bug:

BUG: sleeping function called from invalid context at mm/page_alloc.c:4681
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15087, name:

Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x199/0x216 lib/dump_stack.c:118
___might_sleep.cold.97+0x1f5/0x238 kernel/sched/core.c:6800
__might_sleep+0x95/0x190 kernel/sched/core.c:6753
prepare_alloc_pages mm/page_alloc.c:4681 [inline]
__alloc_pages_nodemask+0x3cd/0x890 mm/page_alloc.c:4730
alloc_pages_current+0x10c/0x210 mm/mempolicy.c:2211
alloc_pages include/linux/gfp.h:532 [inline]
__get_free_pages+0xc/0x40 mm/page_alloc.c:4786
__pte_alloc_one_kernel include/asm-generic/pgalloc.h:21 [inline]
pte_alloc_one_kernel include/asm-generic/pgalloc.h:33 [inline]
__pte_alloc_kernel+0x1d/0x200 mm/memory.c:459
apply_to_pte_range mm/memory.c:2031 [inline]
apply_to_pmd_range mm/memory.c:2068 [inline]
apply_to_pud_range mm/memory.c:2088 [inline]
apply_to_p4d_range mm/memory.c:2108 [inline]
apply_to_page_range+0x77d/0xa00 mm/memory.c:2133
kasan_release_vmalloc+0xa7/0xc0 mm/kasan/common.c:970
__purge_vmap_area_lazy+0xcbb/0x1f30 mm/vmalloc.c:1313
try_purge_vmap_area_lazy mm/vmalloc.c:1332 [inline]
free_vmap_area_noflush+0x2ca/0x390 mm/vmalloc.c:1368
free_unmap_vmap_area mm/vmalloc.c:1381 [inline]
remove_vm_area+0x1cc/0x230 mm/vmalloc.c:2209
vm_remove_mappings mm/vmalloc.c:2236 [inline]
__vunmap+0x223/0xa20 mm/vmalloc.c:2299
__vfree+0x3f/0xd0 mm/vmalloc.c:2356
__vmalloc_area_node mm/vmalloc.c:2507 [inline]
__vmalloc_node_range+0x5d5/0x810 mm/vmalloc.c:2547
__vmalloc_node mm/vmalloc.c:2607 [inline]
__vmalloc_node_flags mm/vmalloc.c:2621 [inline]
vzalloc+0x6f/0x80 mm/vmalloc.c:2666
alloc_one_pg_vec_page net/packet/af_packet.c:4233 [inline]
alloc_pg_vec net/packet/af_packet.c:4258 [inline]
packet_set_ring+0xbc0/0x1b50 net/packet/af_packet.c:4342
packet_setsockopt+0xed7/0x2d90 net/packet/af_packet.c:3695
__sys_setsockopt+0x29b/0x4d0 net/socket.c:2117
__do_sys_setsockopt net/socket.c:2133 [inline]
__se_sys_setsockopt net/socket.c:2130 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:2130
do_syscall_64+0xfa/0x780 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe

Switch to using the apply_to_existing_pages helper instead, which
won't allocate memory.

Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Reported-by: Dmitry Vyukov <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Signed-off-by: Daniel Axtens <[email protected]>

---

Andrew, if you want to take this, it replaces
"kasan: Don't allocate page tables in kasan_release_vmalloc()"
---
mm/kasan/common.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index e04e73603dfc..26fd0c13dd28 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -957,6 +957,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
{
void *shadow_start, *shadow_end;
unsigned long region_start, region_end;
+ unsigned long size;

region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
@@ -979,9 +980,10 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
shadow_end = kasan_mem_to_shadow((void *)region_end);

if (shadow_end > shadow_start) {
- apply_to_page_range(&init_mm, (unsigned long)shadow_start,
- (unsigned long)(shadow_end - shadow_start),
- kasan_depopulate_vmalloc_pte, NULL);
+ size = shadow_end - shadow_start;
+ apply_to_existing_pages(&init_mm, (unsigned long)shadow_start,
+ size, kasan_depopulate_vmalloc_pte,
+ NULL);
flush_tlb_kernel_range((unsigned long)shadow_start,
(unsigned long)shadow_end);
}
--
2.20.1

2019-12-05 14:07:21

by Daniel Axtens

Subject: [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed

syzkaller and the fault injector showed that I was wrong to assume
that we could ignore percpu shadow allocation failures.

Handle failures properly. Merge all the allocated areas back into the free
list and release the shadow, then clean up and return NULL. The shadow
is released unconditionally, which relies upon the fact that the release
function is able to tolerate pages not being present.

Also clean up shadows in the recovery path - currently they are not
released, which leaks a bit of memory.

Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
Reported-by: [email protected]
Reported-by: [email protected]
Cc: Dmitry Vyukov <[email protected]>
Cc: Andrey Ryabinin <[email protected]>
Signed-off-by: Daniel Axtens <[email protected]>
---
mm/vmalloc.c | 48 ++++++++++++++++++++++++++++++++++++++----------
1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 37af94b6cf30..fa5688093a88 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3291,7 +3291,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
- unsigned long base, start, size, end, last_end;
+ unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
enum fit_type type;

@@ -3421,6 +3421,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,

spin_unlock(&free_vmap_area_lock);

+ /* populate the kasan shadow space */
+ for (area = 0; area < nr_vms; area++) {
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ goto err_free_shadow;
+
+ kasan_unpoison_vmalloc((void *)vas[area]->va_start,
+ sizes[area]);
+ }
+
/* insert all vm's */
spin_lock(&vmap_area_lock);
for (area = 0; area < nr_vms; area++) {
@@ -3431,13 +3440,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
}
spin_unlock(&vmap_area_lock);

- /* populate the shadow space outside of the lock */
- for (area = 0; area < nr_vms; area++) {
- /* assume success here */
- kasan_populate_vmalloc(vas[area]->va_start, sizes[area]);
- kasan_unpoison_vmalloc((void *)vms[area]->addr, sizes[area]);
- }
-
kfree(vas);
return vms;

@@ -3449,8 +3451,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
* and when pcpu_get_vm_areas() is success.
*/
while (area--) {
- merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
}

@@ -3485,6 +3491,28 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
kfree(vas);
kfree(vms);
return NULL;
+
+err_free_shadow:
+ spin_lock(&free_vmap_area_lock);
+ /*
+ * We release all the vmalloc shadows, even the ones for regions that
+ * hadn't been successfully added. This relies on kasan_release_vmalloc
+ * being able to tolerate this case.
+ */
+ for (area = 0; area < nr_vms; area++) {
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
+ vas[area] = NULL;
+ kfree(vms[area]);
+ }
+ spin_unlock(&free_vmap_area_lock);
+ kfree(vas);
+ kfree(vms);
+ return NULL;
}

/**
--
2.20.1

2019-12-06 16:25:51

by Andrey Ryabinin

Subject: Re: [PATCH 3/3] kasan: don't assume percpu shadow allocations will succeed



On 12/5/19 5:04 PM, Daniel Axtens wrote:
> syzkaller and the fault injector showed that I was wrong to assume
> that we could ignore percpu shadow allocation failures.
>
> Handle failures properly. Merge all the allocated areas back into the free
> list and release the shadow, then clean up and return NULL. The shadow
> is released unconditionally, which relies upon the fact that the release
> function is able to tolerate pages not being present.
>
> Also clean up shadows in the recovery path - currently they are not
> released, which leaks a bit of memory.
>
> Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
> Reported-by: [email protected]
> Reported-by: [email protected]
> Cc: Dmitry Vyukov <[email protected]>
> Cc: Andrey Ryabinin <[email protected]>
> Signed-off-by: Daniel Axtens <[email protected]>
> ---

Reviewed-by: Andrey Ryabinin <[email protected]>

2019-12-06 16:26:46

by Andrey Ryabinin

Subject: Re: [PATCH 2/3] kasan: use apply_to_existing_pages for releasing vmalloc shadow



On 12/5/19 5:04 PM, Daniel Axtens wrote:
> kasan_release_vmalloc uses apply_to_page_range to release vmalloc
> shadow. Unfortunately, apply_to_page_range can allocate memory to
> fill in page table entries, which is not what we want.
>
> Also, kasan_release_vmalloc is called under free_vmap_area_lock,
> so if apply_to_page_range does allocate memory, we get a
> sleep-in-atomic bug:
>
> BUG: sleeping function called from invalid context at mm/page_alloc.c:4681
> in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15087, name:
>
> Call Trace:
> __dump_stack lib/dump_stack.c:77 [inline]
> dump_stack+0x199/0x216 lib/dump_stack.c:118
> ___might_sleep.cold.97+0x1f5/0x238 kernel/sched/core.c:6800
> __might_sleep+0x95/0x190 kernel/sched/core.c:6753
> prepare_alloc_pages mm/page_alloc.c:4681 [inline]
> __alloc_pages_nodemask+0x3cd/0x890 mm/page_alloc.c:4730
> alloc_pages_current+0x10c/0x210 mm/mempolicy.c:2211
> alloc_pages include/linux/gfp.h:532 [inline]
> __get_free_pages+0xc/0x40 mm/page_alloc.c:4786
> __pte_alloc_one_kernel include/asm-generic/pgalloc.h:21 [inline]
> pte_alloc_one_kernel include/asm-generic/pgalloc.h:33 [inline]
> __pte_alloc_kernel+0x1d/0x200 mm/memory.c:459
> apply_to_pte_range mm/memory.c:2031 [inline]
> apply_to_pmd_range mm/memory.c:2068 [inline]
> apply_to_pud_range mm/memory.c:2088 [inline]
> apply_to_p4d_range mm/memory.c:2108 [inline]
> apply_to_page_range+0x77d/0xa00 mm/memory.c:2133
> kasan_release_vmalloc+0xa7/0xc0 mm/kasan/common.c:970
> __purge_vmap_area_lazy+0xcbb/0x1f30 mm/vmalloc.c:1313
> try_purge_vmap_area_lazy mm/vmalloc.c:1332 [inline]
> free_vmap_area_noflush+0x2ca/0x390 mm/vmalloc.c:1368
> free_unmap_vmap_area mm/vmalloc.c:1381 [inline]
> remove_vm_area+0x1cc/0x230 mm/vmalloc.c:2209
> vm_remove_mappings mm/vmalloc.c:2236 [inline]
> __vunmap+0x223/0xa20 mm/vmalloc.c:2299
> __vfree+0x3f/0xd0 mm/vmalloc.c:2356
> __vmalloc_area_node mm/vmalloc.c:2507 [inline]
> __vmalloc_node_range+0x5d5/0x810 mm/vmalloc.c:2547
> __vmalloc_node mm/vmalloc.c:2607 [inline]
> __vmalloc_node_flags mm/vmalloc.c:2621 [inline]
> vzalloc+0x6f/0x80 mm/vmalloc.c:2666
> alloc_one_pg_vec_page net/packet/af_packet.c:4233 [inline]
> alloc_pg_vec net/packet/af_packet.c:4258 [inline]
> packet_set_ring+0xbc0/0x1b50 net/packet/af_packet.c:4342
> packet_setsockopt+0xed7/0x2d90 net/packet/af_packet.c:3695
> __sys_setsockopt+0x29b/0x4d0 net/socket.c:2117
> __do_sys_setsockopt net/socket.c:2133 [inline]
> __se_sys_setsockopt net/socket.c:2130 [inline]
> __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2130
> do_syscall_64+0xfa/0x780 arch/x86/entry/common.c:294
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Switch to using the apply_to_existing_pages helper instead, which
> won't allocate memory.
>
> Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory")
> Reported-by: Dmitry Vyukov <[email protected]>
> Cc: Andrey Ryabinin <[email protected]>
> Signed-off-by: Daniel Axtens <[email protected]>
>
> ---
>
> Andrew, if you want to take this, it replaces
> "kasan: Don't allocate page tables in kasan_release_vmalloc()"
> ---
> mm/kasan/common.c | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>


Reviewed-by: Andrey Ryabinin <[email protected]>

2019-12-06 16:26:52

by Andrey Ryabinin

Subject: Re: [PATCH 1/3] mm: add apply_to_existing_pages helper



On 12/5/19 5:04 PM, Daniel Axtens wrote:
> apply_to_page_range takes an address range, and if any parts of it
> are not covered by the existing page table hierarchy, it allocates
> memory to fill them in.
>
> In some use cases, this is not what we want - we want to be able to
> operate exclusively on PTEs that are already in the tables.
>
> Add apply_to_existing_pages for this. Adjust the walker functions
> for apply_to_page_range to take 'create', which switches them between
> the old and new modes.
>
> This will be used in KASAN vmalloc.
>
> Signed-off-by: Daniel Axtens <[email protected]>

Reviewed-by: Andrey Ryabinin <[email protected]>

2019-12-07 00:37:52

by Andrew Morton

Subject: Re: [PATCH 1/3] mm: add apply_to_existing_pages helper

On Fri, 6 Dec 2019 01:04:05 +1100 Daniel Axtens <[email protected]> wrote:

> +/*
> + * Scan a region of virtual memory, calling a provided function on
> + * each leaf page table where it exists.
> + *
> + * Unlike apply_to_page_range, this does _not_ fill in page tables
> + * where they are absent.
> + */
> +int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
> + unsigned long size, pte_fn_t fn, void *data)
> +{
> + pgd_t *pgd;
> + unsigned long next;
> + unsigned long end = addr + size;
> + int err = 0;
> +
> + if (WARN_ON(addr >= end))
> + return -EINVAL;
> +
> + pgd = pgd_offset(mm, addr);
> + do {
> + next = pgd_addr_end(addr, end);
> + if (pgd_none_or_clear_bad(pgd))
> + continue;
> + err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
> + if (err)
> + break;
> + } while (pgd++, addr = next, addr != end);
> +
> + return err;
> +}
> +EXPORT_SYMBOL_GPL(apply_to_existing_pages);

This is almost identical to apply_to_page_range() and cries out for
some deduplication. This?

--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix
+++ a/mm/memory.c
@@ -2141,12 +2141,9 @@ static int apply_to_p4d_range(struct mm_
return err;
}

-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
- */
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
- unsigned long size, pte_fn_t fn, void *data)
+static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn,
+ void *data, bool create)
{
pgd_t *pgd;
unsigned long next;
@@ -2159,13 +2156,25 @@ int apply_to_page_range(struct mm_struct
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, true);
+ if (!create && pgd_none_or_clear_bad(pgd))
+ continue;
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
if (err)
break;
} while (pgd++, addr = next, addr != end);

return err;
}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ return __apply_to_page_range(mm, addr, size, fn, data, true);
+}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
@@ -2178,25 +2187,7 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data)
{
- pgd_t *pgd;
- unsigned long next;
- unsigned long end = addr + size;
- int err = 0;
-
- if (WARN_ON(addr >= end))
- return -EINVAL;
-
- pgd = pgd_offset(mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, false);
- if (err)
- break;
- } while (pgd++, addr = next, addr != end);
-
- return err;
+ return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_pages);

_


2019-12-07 00:39:35

by Andrew Morton

Subject: Re: [PATCH 1/3] mm: add apply_to_existing_pages helper

On Fri, 6 Dec 2019 01:04:05 +1100 Daniel Axtens <[email protected]> wrote:

> apply_to_page_range takes an address range, and if any parts of it
> are not covered by the existing page table hierarchy, it allocates
> memory to fill them in.
>
> In some use cases, this is not what we want - we want to be able to
> operate exclusively on PTEs that are already in the tables.
>
> Add apply_to_existing_pages for this. Adjust the walker functions
> for apply_to_page_range to take 'create', which switches them between
> the old and new modes.

Wouldn't apply_to_existing_page_range() be a better name?

--- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper-fix-fix
+++ a/include/linux/mm.h
@@ -2621,9 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
-extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
- unsigned long size, pte_fn_t fn,
- void *data);
+extern int apply_to_existing_page_range(struct mm_struct *mm,
+ unsigned long address, unsigned long size,
+ pte_fn_t fn, void *data);

#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
--- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix-fix
+++ a/mm/memory.c
@@ -2184,12 +2184,12 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
* Unlike apply_to_page_range, this does _not_ fill in page tables
* where they are absent.
*/
-int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
- unsigned long size, pte_fn_t fn, void *data)
+int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
-EXPORT_SYMBOL_GPL(apply_to_existing_pages);
+EXPORT_SYMBOL_GPL(apply_to_existing_page_range);

/*
* handle_pte_fault chooses page fault handler according to an entry which was
_

2019-12-07 02:20:58

by Daniel Axtens

Subject: Re: [PATCH 1/3] mm: add apply_to_existing_pages helper

>
> Wouldn't apply_to_existing_page_range() be a better name?

I agree with both of those fixups, thanks!

Regards,
Daniel

>
> --- a/include/linux/mm.h~mm-add-apply_to_existing_pages-helper-fix-fix
> +++ a/include/linux/mm.h
> @@ -2621,9 +2621,9 @@ static inline int vm_fault_to_errno(vm_f
> typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
> extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
> unsigned long size, pte_fn_t fn, void *data);
> -extern int apply_to_existing_pages(struct mm_struct *mm, unsigned long address,
> - unsigned long size, pte_fn_t fn,
> - void *data);
> +extern int apply_to_existing_page_range(struct mm_struct *mm,
> + unsigned long address, unsigned long size,
> + pte_fn_t fn, void *data);
>
> #ifdef CONFIG_PAGE_POISONING
> extern bool page_poisoning_enabled(void);
> --- a/mm/memory.c~mm-add-apply_to_existing_pages-helper-fix-fix
> +++ a/mm/memory.c
> @@ -2184,12 +2184,12 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
> * Unlike apply_to_page_range, this does _not_ fill in page tables
> * where they are absent.
> */
> -int apply_to_existing_pages(struct mm_struct *mm, unsigned long addr,
> - unsigned long size, pte_fn_t fn, void *data)
> +int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
> + unsigned long size, pte_fn_t fn, void *data)
> {
> return __apply_to_page_range(mm, addr, size, fn, data, false);
> }
> -EXPORT_SYMBOL_GPL(apply_to_existing_pages);
> +EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
>
> /*
> * handle_pte_fault chooses page fault handler according to an entry which was
> _

2019-12-09 07:35:44

by Christoph Hellwig

Subject: Re: [PATCH 1/3] mm: add apply_to_existing_pages helper

Or a flags argument with descriptive flags to the existing function?
These magic bool arguments don't scale..
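
To illustrate (these names are invented for the example and are not
existing or proposed kernel interfaces), such a flag-based variant
might look roughly like:

#define APPLY_PGTBL_CREATE      0x1     /* allocate missing page tables */

int apply_to_page_range_flags(struct mm_struct *mm, unsigned long addr,
                              unsigned long size, pte_fn_t fn, void *data,
                              unsigned int flags);

static int example_caller(unsigned long addr, unsigned long size, pte_fn_t fn)
{
        /* The intent is readable at the call site, unlike a bare true/false: */
        return apply_to_page_range_flags(&init_mm, addr, size, fn, NULL,
                                         APPLY_PGTBL_CREATE);
}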

2019-12-11 01:32:08

by Andrew Morton

Subject: Re: [PATCH 1/3] mm: add apply_to_existing_pages helper

On Sun, 8 Dec 2019 23:34:58 -0800 Christoph Hellwig <[email protected]> wrote:

> Or a flags argument with descriptive flags to the existing function?
> These magic bool arguments don't scale..

True. But it's easy enough to do s/bool create/enum mode/ in the
future should the need arise. For now, the code is clearer.
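
For what it's worth, a rough sketch of that possible future conversion
(the enum and its values are invented here for illustration only):

enum pgtbl_apply_mode {
        PGTBL_APPLY_EXISTING,   /* skip regions with no page tables */
        PGTBL_APPLY_CREATE,     /* allocate missing page tables as needed */
};

/*
 * The internal walker would take the enum instead of the bool, and the
 * two exported wrappers would then read naturally at a glance:
 *
 *      apply_to_page_range()          -> __apply_to_page_range(..., PGTBL_APPLY_CREATE)
 *      apply_to_existing_page_range() -> __apply_to_page_range(..., PGTBL_APPLY_EXISTING)
 */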