2019-03-20 15:24:50

by Thomas Hellstrom

Subject: [RFC PATCH 0/3] mm modifications / helpers for emulated GPU coherent memory

Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: [email protected]
Cc: [email protected]

Hi,
This is an early RFC to make sure I don't go too far in the wrong direction.

Non-coherent GPUs that can't directly see the contents of CPU-visible memory,
like VMware's SVGA device, run into trouble when trying to implement the
coherent memory requirements of modern graphics APIs. Examples are
Vulkan and OpenGL 4.4's ARB_buffer_storage.

To remedy this, we need to emulate coherent memory. Typically, when it's
detected that a buffer object is about to be accessed by the GPU, we need to
gather the ranges that have been dirtied by the CPU since the last operation,
apply an operation that makes the content visible to the GPU, and clear the
dirty tracking.

Depending on the size of the buffer object and the access pattern there are
two major possibilities:

1) Use page_mkwrite() and pfn_mkwrite(). (GPU buffer objects are backed
either by PCI device memory or by driver-allocated pages.)
The dirty tracking needs to be reset by write-protecting the affected ptes
and flushing the TLB. This has a complexity of O(num_dirty_pages), but the
write page fault is of course costly.

2) Use the hardware dirty flags in the ptes. The dirty tracking needs to be
reset by clearing the dirty bits and flushing the TLB. This has a complexity of
O(num_buffer_object_pages), and the dirty bits need to be scanned in full
before each GPU access.

So in practice the two methods need to be interleaved for best performance.
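
As a rough illustration of such interleaving, a driver could pick the tracking
method for the next cycle based on how much of the buffer turned out dirty in
the previous one. This is only a sketch: the structures, names and the 50%
threshold below are hypothetical placeholders, not part of this series.

	/* Hypothetical driver-side dirty-tracking state. */
	enum my_bo_tracking {
		MY_BO_TRACK_WRPROTECT,	/* method 1: write faults */
		MY_BO_TRACK_HW_DIRTY,	/* method 2: hw dirty bits */
	};

	struct my_bo_dirty {
		enum my_bo_tracking tracking;
		unsigned long num_pages;
		unsigned long num_dirty_last_cycle;
	};

	static void my_bo_pick_tracking(struct my_bo_dirty *d)
	{
		/*
		 * If most of the buffer was dirty last cycle, the
		 * O(num_buffer_object_pages) dirty-bit scan is cheaper than
		 * paying a write fault per page; otherwise re-arm write
		 * faults and track only the dirtied ranges.
		 */
		if (d->num_dirty_last_cycle > d->num_pages / 2)
			d->tracking = MY_BO_TRACK_HW_DIRTY;
		else
			d->tracking = MY_BO_TRACK_WRPROTECT;
	}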

So to facilitate this, I propose two new helpers, apply_as_wrprotect() and
apply_as_clean() ("as" stands for address space), both inspired by
unmap_mapping_range(). Users of these helpers are in the making but need
some cleaning up.
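
To give an idea of the intended flow before a GPU access, a driver-side caller
might look roughly like the sketch below. Only the helper names come from this
series; the argument lists are assumptions modelled on unmap_mapping_range()
(the real prototypes come with the helper patch, not included here), and
my_bo_begin_gpu_access() is hypothetical.

	/* Hypothetical prototypes, assumed for illustration only. */
	void apply_as_wrprotect(struct address_space *mapping,
				pgoff_t first_index, pgoff_t nr);
	void apply_as_clean(struct address_space *mapping,
			    pgoff_t first_index, pgoff_t nr);

	static void my_bo_begin_gpu_access(struct address_space *mapping,
					   pgoff_t first_index, pgoff_t nr,
					   bool use_hw_dirty_bits)
	{
		if (use_hw_dirty_bits)
			/* Method 2: harvest and clear the hw dirty bits. */
			apply_as_clean(mapping, first_index, nr);
		else
			/* Method 1: re-write-protect so the next CPU write
			 * faults and re-enters the dirty tracking. */
			apply_as_wrprotect(mapping, first_index, nr);

		/*
		 * At this point the driver knows which ranges were dirtied
		 * since the last GPU access and can make them visible to
		 * the device (e.g. copy them to device memory).
		 */
	}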

There's also a change to x_mkwrite() (i.e. the page_mkwrite() and
pfn_mkwrite() callbacks) to allow dropping the mmap_sem while waiting.

Any comments or suggestions appreciated.

Thanks,
Thomas





2019-03-20 15:25:27

by Thomas Hellstrom

Subject: [RFC PATCH 2/3] mm: Add an apply_to_pfn_range interface

This is basically apply_to_page_range() with added functionality:
allocating missing parts of the page table becomes optional, which
means that the function can be guaranteed not to error if allocation
is disabled. The closure struct and callback function are also passed
differently, more in line with how things are done elsewhere.

Finally, apply_to_page_range() is kept as a wrapper around
apply_to_pfn_range().

Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Thomas Hellstrom <[email protected]>
---
include/linux/mm.h | 10 ++++
mm/memory.c | 121 +++++++++++++++++++++++++++++++++------------
2 files changed, 99 insertions(+), 32 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..b7dd4ddd6efb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);

+struct pfn_range_apply;
+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+ struct pfn_range_apply *closure);
+struct pfn_range_apply {
+ struct mm_struct *mm;
+ pter_fn_t ptefn;
+ unsigned int alloc;
+};
+extern int apply_to_pfn_range(struct pfn_range_apply *closure,
+ unsigned long address, unsigned long size);

#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index dcd80313cf10..0feb7191c2d2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
}
EXPORT_SYMBOL(vm_iomap_memory);

-static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
+ unsigned long addr, unsigned long end)
{
pte_t *pte;
int err;
pgtable_t token;
spinlock_t *uninitialized_var(ptl);

- pte = (mm == &init_mm) ?
+ pte = (closure->mm == &init_mm) ?
pte_alloc_kernel(pmd, addr) :
- pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;

@@ -1960,86 +1959,103 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
token = pmd_pgtable(*pmd);

do {
- err = fn(pte++, token, addr, data);
+ err = closure->ptefn(pte++, token, addr, closure);
if (err)
break;
} while (addr += PAGE_SIZE, addr != end);

arch_leave_lazy_mmu_mode();

- if (mm != &init_mm)
+ if (closure->mm != &init_mm)
pte_unmap_unlock(pte-1, ptl);
return err;
}

-static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
+ unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
- int err;
+ int err = 0;

BUG_ON(pud_huge(*pud));

- pmd = pmd_alloc(mm, pud, addr);
+ pmd = pmd_alloc(closure->mm, pud, addr);
if (!pmd)
return -ENOMEM;
+
do {
next = pmd_addr_end(addr, end);
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+ if (!closure->alloc && pmd_none_or_clear_bad(pmd))
+ continue;
+ err = apply_to_pte_range(closure, pmd, addr, next);
if (err)
break;
} while (pmd++, addr = next, addr != end);
return err;
}

-static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
+ unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
- int err;
+ int err = 0;

- pud = pud_alloc(mm, p4d, addr);
+ pud = pud_alloc(closure->mm, p4d, addr);
if (!pud)
return -ENOMEM;
+
do {
next = pud_addr_end(addr, end);
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+ if (!closure->alloc && pud_none_or_clear_bad(pud))
+ continue;
+ err = apply_to_pmd_range(closure, pud, addr, next);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}

-static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
+ unsigned long addr, unsigned long end)
{
p4d_t *p4d;
unsigned long next;
- int err;
+ int err = 0;

- p4d = p4d_alloc(mm, pgd, addr);
+ p4d = p4d_alloc(closure->mm, pgd, addr);
if (!p4d)
return -ENOMEM;
+
do {
next = p4d_addr_end(addr, end);
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
+ if (!closure->alloc && p4d_none_or_clear_bad(p4d))
+ continue;
+ err = apply_to_pud_range(closure, p4d, addr, next);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}

-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
+/**
+ * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
+ * function on each leaf page table entry
+ * @closure: Details about how to scan and what function to apply
+ * @addr: Start virtual address
+ * @size: Size of the region
+ *
+ * If @closure->alloc is set to 1, the function will fill in the page table
+ * as necessary. Otherwise it will skip non-present parts.
+ *
+ * Returns: Zero on success. If the provided function returns a non-zero status,
+ * the page table walk will terminate and that status will be returned.
+ * If @closure->alloc is set to 1, then this function may also return memory
+ * allocation errors arising from allocating page table memory.
*/
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
- unsigned long size, pte_fn_t fn, void *data)
+int apply_to_pfn_range(struct pfn_range_apply *closure,
+ unsigned long addr, unsigned long size)
{
pgd_t *pgd;
unsigned long next;
@@ -2049,16 +2065,57 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (WARN_ON(addr >= end))
return -EINVAL;

- pgd = pgd_offset(mm, addr);
+ pgd = pgd_offset(closure->mm, addr);
do {
next = pgd_addr_end(addr, end);
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+ if (!closure->alloc && pgd_none_or_clear_bad(pgd))
+ continue;
+ err = apply_to_p4d_range(closure, pgd, addr, next);
if (err)
break;
} while (pgd++, addr = next, addr != end);

return err;
}
+EXPORT_SYMBOL_GPL(apply_to_pfn_range);
+
+struct page_range_apply {
+ struct pfn_range_apply pter;
+ pte_fn_t fn;
+ void *data;
+};
+
+/*
+ * Callback wrapper to enable use of apply_to_pfn_range for
+ * the apply_to_page_range interface
+ */
+static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
+ unsigned long addr,
+ struct pfn_range_apply *pter)
+{
+ struct page_range_apply *pra =
+ container_of(pter, typeof(*pra), pter);
+
+ return pra->fn(pte, token, addr, pra->data);
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ struct page_range_apply pra = {
+ .pter = {.mm = mm,
+ .alloc = 1,
+ .ptefn = apply_to_page_range_wrapper },
+ .fn = fn,
+ .data = data
+ };
+
+ return apply_to_pfn_range(&pra.pter, addr, size);
+}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
--
2.19.0.rc1
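
For reference, a minimal user of the new interface could look like the sketch
below: a read-only walk that counts dirty ptes in a VMA without allocating
missing page-table levels. Only apply_to_pfn_range(), struct pfn_range_apply
and the pter_fn_t signature are taken from the patch above; the my_ names are
made up for the example, and the caller is assumed to hold mmap_sem.

	struct my_dirty_count {
		struct pfn_range_apply base;
		unsigned long ndirty;
	};

	static int my_count_dirty_pte(pte_t *pte, pgtable_t token,
				      unsigned long addr,
				      struct pfn_range_apply *closure)
	{
		struct my_dirty_count *dc =
			container_of(closure, struct my_dirty_count, base);

		if (pte_dirty(*pte))
			dc->ndirty++;
		return 0;
	}

	static unsigned long my_count_dirty(struct vm_area_struct *vma)
	{
		struct my_dirty_count dc = {
			.base = {
				.mm = vma->vm_mm,
				.ptefn = my_count_dirty_pte,
				.alloc = 0,	/* skip non-present ranges */
			},
		};

		/* With alloc == 0, errors can only come from the callback,
		 * which always returns 0 here. */
		apply_to_pfn_range(&dc.base, vma->vm_start,
				   vma->vm_end - vma->vm_start);
		return dc.ndirty;
	}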


2019-03-20 15:26:23

by Thomas Hellstrom

Subject: [RFC PATCH 1/3] mm: Allow the [page|pfn]_mkwrite callbacks to drop the mmap_sem

Driver fault callbacks are allowed to drop the mmap_sem when expecting
long hardware waits to avoid blocking other mm users. Allow the mkwrite
callbacks to do the same by returning early on VM_FAULT_RETRY.

In particular, we want to be able to drop the mmap_sem when waiting for
a reservation object lock on a GPU buffer object. These locks may be
held while waiting for the GPU.

Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: [email protected]
Cc: [email protected]

Signed-off-by: Thomas Hellstrom <[email protected]>
---
mm/memory.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..dcd80313cf10 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
lock_page(page);
@@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
- if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
@@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ (VM_FAULT_ERROR | VM_FAULT_RETRY |
+ VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
@@ -3472,7 +3473,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
- (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ (tmp & (VM_FAULT_ERROR | VM_FAULT_RETRY |
+ VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
--
2.19.0.rc1
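
To illustrate what this enables, a driver pfn_mkwrite() callback could back
off as sketched below when the buffer object's lock is contended. struct my_bo
and its plain mutex are hypothetical stand-ins for a real buffer object and
its reservation lock; the retry convention shown (drop mmap_sem before
returning VM_FAULT_RETRY, unless FAULT_FLAG_RETRY_NOWAIT is set) mirrors what
fault handlers already do, and returning 0 on success lets the core finish the
write enable via finish_mkwrite_fault() as in wp_pfn_shared() above.

	/* Hypothetical driver object; a real driver would use its buffer
	 * object and reservation lock here. */
	struct my_bo {
		struct mutex resv_lock;
		/* ... dirty-tracking state ... */
	};

	static vm_fault_t my_bo_pfn_mkwrite(struct vm_fault *vmf)
	{
		struct my_bo *bo = vmf->vma->vm_private_data;

		if (!mutex_trylock(&bo->resv_lock)) {
			/*
			 * The lock may be held while waiting for the GPU.
			 * If a retry is allowed, drop mmap_sem and, unless
			 * asked not to wait, take the lock without mmap_sem
			 * held so the retried fault is likely to succeed.
			 */
			if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
				if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
					up_read(&vmf->vma->vm_mm->mmap_sem);
					mutex_lock(&bo->resv_lock);
					mutex_unlock(&bo->resv_lock);
				}
				return VM_FAULT_RETRY;
			}
			mutex_lock(&bo->resv_lock);
		}

		/* ... record the dirtied page in the driver's tracking ... */

		mutex_unlock(&bo->resv_lock);
		return 0;	/* let the core make the pte writable */
	}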


2019-03-20 17:46:57

by Linus Torvalds

Subject: Re: [RFC PATCH 1/3] mm: Allow the [page|pfn]_mkwrite callbacks to drop the mmap_sem

On Wed, Mar 20, 2019 at 8:23 AM Thomas Hellstrom <[email protected]> wrote:
>
> Driver fault callbacks are allowed to drop the mmap_sem when expecting
> long hardware waits [...]

No comment on the patch itself, but please fix your email setup.

All the patches were marked as spam, because you sent them from your
vmware.com address, but without going through the proper vmware smtp
gateway.

So they lack the proper vmware DKIM hashes and proper mail handling
should (and does) consider them spam.

Linus