Graphics APIs like OpenGL 4.4 and Vulkan require the graphics driver
to provide coherent graphics memory, meaning that the GPU sees any
content written to the coherent memory on the next GPU operation that
touches that memory, and the CPU sees any content written by the GPU
to that memory immediately after any fence object trailing the GPU
operation has signaled.
Paravirtual drivers that otherwise require explicit synchronization
need to do this by hooking up dirty tracking to pagefault handlers
and buffer object validation. This is a first attempt to do that for
the vmwgfx driver.
The mm patches have been out for RFC. I think I have addressed all the
feedback I got, except a possible softdirty breakage. Although
dirty-tracking and softdirty may write-protect the same PTEs,
that shouldn't really cause any operational interference, in particular
since we use the hardware dirty PTE bits and softdirty uses other PTE bits.
The TTM changes are hopefully in line with the long-term
strategy of making helpers out of what's left of TTM.
The code has been tested and exercised by a tailored version of mesa
where we disable all explicit synchronization and assume graphics memory
is coherent. The performance loss varies of course; a typical number is
around 5%.
Any feedback greatly appreciated.
Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: "Christian König" <[email protected]>
Cc: [email protected]
This is basically apply_to_page_range with added functionality:
Allocating missing parts of the page table becomes optional, which
means that the function can be guaranteed not to error if allocation
is disabled. Also passing of the closure struct and callback function
becomes different and more in line with how things are done elsewhere.
Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range.
The reason for not using the page-walk code is that we want to perform
the page-walk on vmas pointing to an address space without requiring the
mmap_sem to be held rather than on vmas belonging to a process with the
mmap_sem held.
Notable changes since RFC:
Don't export apply_to_pfn_range.
Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Thomas Hellstrom <[email protected]>
---
include/linux/mm.h | 10 ++++
mm/memory.c | 130 ++++++++++++++++++++++++++++++++++-----------
2 files changed, 108 insertions(+), 32 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..b7dd4ddd6efb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
+struct pfn_range_apply;
+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+ struct pfn_range_apply *closure);
+struct pfn_range_apply {
+ struct mm_struct *mm;
+ pter_fn_t ptefn;
+ unsigned int alloc;
+};
+extern int apply_to_pfn_range(struct pfn_range_apply *closure,
+ unsigned long address, unsigned long size);
#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index a95b4a3b1ae2..60d67158964f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
}
EXPORT_SYMBOL(vm_iomap_memory);
-static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
+ unsigned long addr, unsigned long end)
{
pte_t *pte;
int err;
pgtable_t token;
spinlock_t *uninitialized_var(ptl);
- pte = (mm == &init_mm) ?
+ pte = (closure->mm == &init_mm) ?
pte_alloc_kernel(pmd, addr) :
- pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
@@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
token = pmd_pgtable(*pmd);
do {
- err = fn(pte++, token, addr, data);
+ err = closure->ptefn(pte++, token, addr, closure);
if (err)
break;
} while (addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- if (mm != &init_mm)
+ if (closure->mm != &init_mm)
pte_unmap_unlock(pte-1, ptl);
return err;
}
-static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
+ unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
- int err;
+ int err = 0;
BUG_ON(pud_huge(*pud));
- pmd = pmd_alloc(mm, pud, addr);
+ pmd = pmd_alloc(closure->mm, pud, addr);
if (!pmd)
return -ENOMEM;
+
do {
next = pmd_addr_end(addr, end);
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+ if (!closure->alloc && pmd_none_or_clear_bad(pmd))
+ continue;
+ err = apply_to_pte_range(closure, pmd, addr, next);
if (err)
break;
} while (pmd++, addr = next, addr != end);
return err;
}
-static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
+ unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
- int err;
+ int err = 0;
- pud = pud_alloc(mm, p4d, addr);
+ pud = pud_alloc(closure->mm, p4d, addr);
if (!pud)
return -ENOMEM;
+
do {
next = pud_addr_end(addr, end);
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+ if (!closure->alloc && pud_none_or_clear_bad(pud))
+ continue;
+ err = apply_to_pmd_range(closure, pud, addr, next);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
-static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data)
+static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
+ unsigned long addr, unsigned long end)
{
p4d_t *p4d;
unsigned long next;
- int err;
+ int err = 0;
- p4d = p4d_alloc(mm, pgd, addr);
+ p4d = p4d_alloc(closure->mm, pgd, addr);
if (!p4d)
return -ENOMEM;
+
do {
next = p4d_addr_end(addr, end);
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
+ if (!closure->alloc && p4d_none_or_clear_bad(p4d))
+ continue;
+ err = apply_to_pud_range(closure, p4d, addr, next);
if (err)
break;
} while (p4d++, addr = next, addr != end);
return err;
}
-/*
- * Scan a region of virtual memory, filling in page tables as necessary
- * and calling a provided function on each leaf page table.
+/**
+ * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
+ * function on each leaf page table entry
+ * @closure: Details about how to scan and what function to apply
+ * @addr: Start virtual address
+ * @size: Size of the region
+ *
+ * If @closure->alloc is set to 1, the function will fill in the page table
+ * as necessary. Otherwise it will skip non-present parts.
+ * Note: The caller must ensure that the range does not contain huge pages.
+ * The caller must also ensure that the proper mmu_notifier functions are
+ * called, either in the pte leaf function or before and after the call to
+ * apply_to_pfn_range.
+ *
+ * Returns: Zero on success. If the provided function returns a non-zero status,
+ * the page table walk will terminate and that status will be returned.
+ * If @closure->alloc is set to 1, then this function may also return memory
+ * allocation errors arising from allocating page table memory.
*/
-int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
- unsigned long size, pte_fn_t fn, void *data)
+int apply_to_pfn_range(struct pfn_range_apply *closure,
+ unsigned long addr, unsigned long size)
{
pgd_t *pgd;
unsigned long next;
@@ -2049,16 +2069,62 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (WARN_ON(addr >= end))
return -EINVAL;
- pgd = pgd_offset(mm, addr);
+ pgd = pgd_offset(closure->mm, addr);
do {
next = pgd_addr_end(addr, end);
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
+ if (!closure->alloc && pgd_none_or_clear_bad(pgd))
+ continue;
+ err = apply_to_p4d_range(closure, pgd, addr, next);
if (err)
break;
} while (pgd++, addr = next, addr != end);
return err;
}
+
+/**
+ * struct page_range_apply - Closure structure for apply_to_page_range()
+ * @pter: The base closure structure we derive from
+ * @fn: The leaf pte function to call
+ * @data: The leaf pte function closure
+ */
+struct page_range_apply {
+ struct pfn_range_apply pter;
+ pte_fn_t fn;
+ void *data;
+};
+
+/*
+ * Callback wrapper to enable use of apply_to_pfn_range for
+ * the apply_to_page_range interface
+ */
+static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
+ unsigned long addr,
+ struct pfn_range_apply *pter)
+{
+ struct page_range_apply *pra =
+ container_of(pter, typeof(*pra), pter);
+
+ return pra->fn(pte, token, addr, pra->data);
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ struct page_range_apply pra = {
+ .pter = {.mm = mm,
+ .alloc = 1,
+ .ptefn = apply_to_page_range_wrapper },
+ .fn = fn,
+ .data = data
+ };
+
+ return apply_to_pfn_range(&pra.pter, addr, size);
+}
EXPORT_SYMBOL_GPL(apply_to_page_range);
/*
--
2.20.1
Add a pointer to the struct vm_operations_struct in the bo_device, and
assign that pointer to the default value currently used.
The driver can then optionally modify that pointer and the new value
can be used for each new vma created.
Cc: "Christian König" <[email protected]>
Signed-off-by: Thomas Hellstrom <[email protected]>
---
drivers/gpu/drm/ttm/ttm_bo.c | 1 +
drivers/gpu/drm/ttm/ttm_bo_vm.c | 6 +++---
include/drm/ttm/ttm_bo_driver.h | 6 ++++++
3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 3f56647cdb35..1c85bec00472 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1656,6 +1656,7 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
mutex_lock(&ttm_global_mutex);
list_add_tail(&bdev->device_list, &glob->device_list);
mutex_unlock(&ttm_global_mutex);
+ bdev->vm_ops = &ttm_bo_vm_ops;
return 0;
out_no_sys:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index e86a29a1e51f..bfb25b81fed7 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -395,7 +395,7 @@ static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-static const struct vm_operations_struct ttm_bo_vm_ops = {
+const struct vm_operations_struct ttm_bo_vm_ops = {
.fault = ttm_bo_vm_fault,
.open = ttm_bo_vm_open,
.close = ttm_bo_vm_close,
@@ -445,7 +445,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
if (unlikely(ret != 0))
goto out_unref;
- vma->vm_ops = &ttm_bo_vm_ops;
+ vma->vm_ops = bdev->vm_ops;
/*
* Note: We're transferring the bo reference to
@@ -477,7 +477,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
ttm_bo_get(bo);
- vma->vm_ops = &ttm_bo_vm_ops;
+ vma->vm_ops = bo->bdev->vm_ops;
vma->vm_private_data = bo;
vma->vm_flags |= VM_MIXEDMAP;
vma->vm_flags |= VM_IO | VM_DONTEXPAND;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index cbf3180cb612..cfeaff5d9706 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -443,6 +443,9 @@ extern struct ttm_bo_global {
* @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
* @man: An array of mem_type_managers.
* @vma_manager: Address space manager
+ * @vm_ops: Pointer to the struct vm_operations_struct used for this
+ * device's VM operations. The driver may override this before the first
+ * mmap() call.
* lru_lock: Spinlock that protects the buffer+device lru lists and
* ddestroy lists.
* @dev_mapping: A pointer to the struct address_space representing the
@@ -461,6 +464,7 @@ struct ttm_bo_device {
struct ttm_bo_global *glob;
struct ttm_bo_driver *driver;
struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
+ const struct vm_operations_struct *vm_ops;
/*
* Protected by internal locks.
@@ -489,6 +493,8 @@ struct ttm_bo_device {
bool no_retry;
};
+extern const struct vm_operations_struct ttm_bo_vm_ops;
+
/**
* struct ttm_lru_bulk_move_pos
*
--
2.20.1
Add two utilities to a) write-protect and b) clean all ptes pointing into
a range of an address space.
The utilities are intended to aid in tracking dirty pages (either
driver-allocated system memory or pci device memory).
The write-protect utility should be used in conjunction with
page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
accesses. Typically one would want to use this on sparse accesses into
large memory regions. The clean utility should be used to utilize
hardware dirtying functionality and avoid the overhead of page-faults,
typically on large accesses into small memory regions.
The added file "apply_as_range.c" is initially listed as maintained by
VMware under our DRM driver. If somebody would like it elsewhere,
that's of course no problem.
Notable changes since RFC:
- Added comments to help avoid the usage of these functions for VMAs
they are not intended for. We also do advisory checks on the vm_flags and
warn on illegal usage.
- Perform the pte modifications the same way softdirty does.
- Add mmu_notifier range invalidation calls.
- Add a config option so that this code is not unconditionally included.
- Tell the mmu_gather code about pending tlb flushes.
Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Thomas Hellstrom <[email protected]>
---
MAINTAINERS | 1 +
include/linux/mm.h | 9 +-
mm/Kconfig | 3 +
mm/Makefile | 3 +-
mm/apply_as_range.c | 295 ++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 309 insertions(+), 2 deletions(-)
create mode 100644 mm/apply_as_range.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 35e6357f9d30..bc243ffcb840 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4971,6 +4971,7 @@ T: git git://people.freedesktop.org/~thomash/linux
S: Supported
F: drivers/gpu/drm/vmwgfx/
F: include/uapi/drm/vmwgfx_drm.h
+F: mm/apply_as_range.c
DRM DRIVERS
M: David Airlie <[email protected]>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7dd4ddd6efb..62f24dd0bfa0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2642,7 +2642,14 @@ struct pfn_range_apply {
};
extern int apply_to_pfn_range(struct pfn_range_apply *closure,
unsigned long address, unsigned long size);
-
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr);
+unsigned long apply_as_clean(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ pgoff_t bitmap_pgoff,
+ unsigned long *bitmap,
+ pgoff_t *start,
+ pgoff_t *end);
#ifdef CONFIG_PAGE_POISONING
extern bool page_poisoning_enabled(void);
extern void kernel_poison_pages(struct page *page, int numpages, int enable);
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..80e41cdbb4ae 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -758,4 +758,7 @@ config GUP_BENCHMARK
config ARCH_HAS_PTE_SPECIAL
bool
+config AS_DIRTY_HELPERS
+ bool
+
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..b295717be856 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o $(mmu-y)
+ debug.o apply_as_range.o $(mmu-y)
obj-y += init-mm.o
obj-y += memblock.o
@@ -99,3 +99,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AS_DIRTY_HELPERS) += apply_as_range.o
diff --git a/mm/apply_as_range.c b/mm/apply_as_range.c
new file mode 100644
index 000000000000..32d28619aec5
--- /dev/null
+++ b/mm/apply_as_range.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/hugetlb.h>
+#include <linux/bitops.h>
+#include <linux/mmu_notifier.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct apply_as - Closure structure for apply_as_range
+ * @base: struct pfn_range_apply we derive from
+ * @start: Address of first modified pte
+ * @end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ * @vma: Pointer to the struct vm_area_struct we're currently operating on
+ */
+struct apply_as {
+ struct pfn_range_apply base;
+ unsigned long start, end;
+ unsigned long total;
+ const struct vm_area_struct *vma;
+};
+
+/**
+ * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
+ * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
+ * @addr: The virtual page address
+ * @closure: Pointer to a struct pfn_range_apply embedded in a
+ * struct apply_as
+ *
+ * The function write-protects a pte and records the range in
+ * virtual address space of touched ptes for efficient range TLB flushes.
+ *
+ * Return: Always zero.
+ */
+static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
+ unsigned long addr,
+ struct pfn_range_apply *closure)
+{
+ struct apply_as *aas = container_of(closure, typeof(*aas), base);
+ pte_t ptent = *pte;
+
+ if (pte_write(ptent)) {
+ ptent = ptep_modify_prot_start(closure->mm, addr, pte);
+ ptent = pte_wrprotect(ptent);
+ ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
+ aas->total++;
+ aas->start = min(aas->start, addr);
+ aas->end = max(aas->end, addr + PAGE_SIZE);
+ }
+
+ return 0;
+}
+
+/**
+ * struct apply_as_clean - Closure structure for apply_as_clean
+ * @base: struct apply_as we derive from
+ * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
+ * @bitmap: Bitmap with one bit for each page offset in the address_space range
+ * covered.
+ * @start: Address_space page offset of first modified pte relative
+ * to @bitmap_pgoff
+ * @end: Address_space page offset of last modified pte + 1 relative
+ * to @bitmap_pgoff
+ */
+struct apply_as_clean {
+ struct apply_as base;
+ pgoff_t bitmap_pgoff;
+ unsigned long *bitmap;
+ pgoff_t start, end;
+};
+
+/**
+ * apply_pt_clean - Leaf pte callback to clean a pte
+ * @pte: Pointer to the pte
+ * @token: Page table token, see apply_to_pfn_range()
+ * @addr: The virtual page address
+ * @closure: Pointer to a struct pfn_range_apply embedded in a
+ * struct apply_as_clean
+ *
+ * The function cleans a pte and records the range in
+ * virtual address space of touched ptes for efficient TLB flushes.
+ * It also records dirty ptes in a bitmap representing page offsets
+ * in the address_space, as well as the first and last of the bits
+ * touched.
+ *
+ * Return: Always zero.
+ */
+static int apply_pt_clean(pte_t *pte, pgtable_t token,
+ unsigned long addr,
+ struct pfn_range_apply *closure)
+{
+ struct apply_as *aas = container_of(closure, typeof(*aas), base);
+ struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
+ pte_t ptent = *pte;
+
+ if (pte_dirty(ptent)) {
+ pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
+ aas->vma->vm_pgoff - clean->bitmap_pgoff;
+
+ ptent = ptep_modify_prot_start(closure->mm, addr, pte);
+ ptent = pte_mkclean(ptent);
+ ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
+
+ aas->total++;
+ aas->start = min(aas->start, addr);
+ aas->end = max(aas->end, addr + PAGE_SIZE);
+
+ __set_bit(pgoff, clean->bitmap);
+ clean->start = min(clean->start, pgoff);
+ clean->end = max(clean->end, pgoff + 1);
+ }
+
+ return 0;
+}
+
+/**
+ * apply_as_range - Apply a pte callback to all PTEs pointing into a range
+ * of an address_space.
+ * @mapping: Pointer to the struct address_space
+ * @aas: Closure structure
+ * @first_index: First page offset in the address_space
+ * @nr: Number of incremental page offsets to cover
+ *
+ * Return: Number of ptes touched. Note that this number might be larger
+ * than @nr if there are overlapping vmas
+ */
+static unsigned long apply_as_range(struct address_space *mapping,
+ struct apply_as *aas,
+ pgoff_t first_index, pgoff_t nr)
+{
+ struct vm_area_struct *vma;
+ pgoff_t vba, vea, cba, cea;
+ unsigned long start_addr, end_addr;
+ struct mmu_notifier_range range;
+
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
+ first_index + nr - 1) {
+ unsigned long vm_flags = READ_ONCE(vma->vm_flags);
+
+ /*
+ * We can only do advisory flag tests below, since we can't
+ * require the vm's mmap_sem to be held to protect the flags.
+ * Therefore, callers that strictly depend on specific mmap
+ * flags to remain constant throughout the operation must
+ * either ensure those flags are immutable for all relevant
+ * vmas or can't use this function. Fixing this properly would
+ * require the vma::vm_flags to be protected by a separate
+ * lock taken after the i_mmap_lock
+ */
+
+ /* Skip non-applicable VMAs */
+ if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
+ (VM_SHARED | VM_WRITE))
+ continue;
+
+ /* Warn on and skip VMAs whose flags indicate illegal usage */
+ if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
+ continue;
+
+ /* Clip to the vma */
+ vba = vma->vm_pgoff;
+ vea = vba + vma_pages(vma);
+ cba = first_index;
+ cba = max(cba, vba);
+ cea = first_index + nr;
+ cea = min(cea, vea);
+
+ /* Translate to virtual address */
+ start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
+ end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
+ if (start_addr >= end_addr)
+ continue;
+
+ aas->base.mm = vma->vm_mm;
+ aas->vma = vma;
+ aas->start = end_addr;
+ aas->end = start_addr;
+
+ mmu_notifier_range_init(&range, vma->vm_mm,
+ start_addr, end_addr);
+ mmu_notifier_invalidate_range_start(&range);
+
+ /* Needed when we only change protection? */
+ flush_cache_range(vma, start_addr, end_addr);
+
+ /*
+ * We're not using tlb_gather_mmu() since typically
+ * only a small subrange of PTEs are affected.
+ */
+ inc_tlb_flush_pending(vma->vm_mm);
+
+ /* Should not error since aas->base.alloc == 0 */
+ WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
+ end_addr - start_addr));
+ if (aas->end > aas->start)
+ flush_tlb_range(vma, aas->start, aas->end);
+
+ mmu_notifier_invalidate_range_end(&range);
+ dec_tlb_flush_pending(vma->vm_mm);
+ }
+ i_mmap_unlock_read(mapping);
+
+ return aas->total;
+}
+
+/**
+ * apply_as_wrprotect - Write-protect all ptes in an address_space range
+ * @mapping: The address_space we want to write protect
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ *
+ * WARNING: This function should only be used for address spaces that
+ * completely own the pages / memory the page table points to. Typically a
+ * device file.
+ *
+ * Return: The number of ptes actually write-protected. Note that
+ * already write-protected ptes are not counted.
+ */
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr)
+{
+ struct apply_as aas = {
+ .base = {
+ .alloc = 0,
+ .ptefn = apply_pt_wrprotect,
+ },
+ .total = 0,
+ };
+
+ return apply_as_range(mapping, &aas, first_index, nr);
+}
+EXPORT_SYMBOL(apply_as_wrprotect);
+
+/**
+ * apply_as_clean - Clean all ptes in an address_space range
+ * @mapping: The address_space we want to clean
+ * @first_index: The first page offset in the range
+ * @nr: Number of incremental page offsets to cover
+ * @bitmap_pgoff: The page offset of the first bit in @bitmap
+ * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
+ * cover the whole range @first_index..@first_index + @nr.
+ * @start: Pointer to the number of the first set bit in @bitmap. The value
+ * is modified as new bits are set by the function.
+ * @end: Pointer to the number of the last set bit in @bitmap plus one. A value
+ * not greater than *@start means none set. Updated as new bits are set.
+ *
+ * Note: When this function returns there is no guarantee that a CPU has
+ * not already dirtied new ptes. However it will not clean any ptes not
+ * reported in the bitmap.
+ *
+ * If a caller needs to make sure all dirty ptes are picked up and none
+ * additional are added, it first needs to write-protect the address-space
+ * range and make sure new writers are blocked in page_mkwrite() or
+ * pfn_mkwrite(). And then after a TLB flush following the write-protection
+ * pick up all dirty bits.
+ *
+ * WARNING: This function should only be used for address spaces that
+ * completely own the pages / memory the page table points to. Typically a
+ * device file.
+ *
+ * Return: The number of dirty ptes actually cleaned.
+ */
+unsigned long apply_as_clean(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t nr,
+ pgoff_t bitmap_pgoff,
+ unsigned long *bitmap,
+ pgoff_t *start,
+ pgoff_t *end)
+{
+ bool none_set = (*start >= *end);
+ struct apply_as_clean clean = {
+ .base = {
+ .base = {
+ .alloc = 0,
+ .ptefn = apply_pt_clean,
+ },
+ .total = 0,
+ },
+ .bitmap_pgoff = bitmap_pgoff,
+ .bitmap = bitmap,
+ .start = none_set ? nr : *start,
+ .end = none_set ? 0 : *end,
+ };
+ unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
+ nr);
+
+ *start = clean.start;
+ *end = clean.end;
+ return ret;
+}
+EXPORT_SYMBOL(apply_as_clean);
--
2.20.1
With emulated coherent memory we need to be able to quickly look up
a resource from the MOB offset. Instead of traversing a linked list with
O(n) worst case, use an RBtree with O(log n) worst case complexity.
Signed-off-by: Thomas Hellstrom <[email protected]>
---
drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 5 ++--
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 10 +++----
drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 33 +++++++++++++++++-------
3 files changed, 32 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index 90ca866640fe..e8bc7a7ac031 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -464,6 +464,7 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
WARN_ON(vmw_bo->dirty);
+ WARN_ON(!RB_EMPTY_ROOT(&vmw_bo->res_tree));
vmw_bo_unmap(vmw_bo);
kfree(vmw_bo);
}
@@ -480,6 +481,7 @@ static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
WARN_ON(vbo->dirty);
+ WARN_ON(!RB_EMPTY_ROOT(&vbo->res_tree));
vmw_bo_unmap(vbo);
ttm_prime_object_kfree(vmw_user_bo, prime);
}
@@ -515,8 +517,7 @@ int vmw_bo_init(struct vmw_private *dev_priv,
memset(vmw_bo, 0, sizeof(*vmw_bo));
BUILD_BUG_ON(TTM_MAX_BO_PRIORITY <= 3);
vmw_bo->base.priority = 3;
-
- INIT_LIST_HEAD(&vmw_bo->res_list);
+ vmw_bo->res_tree = RB_ROOT;
ret = ttm_bo_init(bdev, &vmw_bo->base, size,
ttm_bo_type_device, placement,
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index f05fce52fbb4..81ebcd668038 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -90,7 +90,7 @@ struct vmw_fpriv {
/**
* struct vmw_buffer_object - TTM buffer object with vmwgfx additions
* @base: The TTM buffer object
- * @res_list: List of resources using this buffer object as a backing MOB
+ * @res_tree: RB tree of resources using this buffer object as a backing MOB
* @pin_count: pin depth
* @dx_query_ctx: DX context if this buffer object is used as a DX query MOB
* @map: Kmap object for semi-persistent mappings
@@ -99,7 +99,7 @@ struct vmw_fpriv {
*/
struct vmw_buffer_object {
struct ttm_buffer_object base;
- struct list_head res_list;
+ struct rb_root res_tree;
s32 pin_count;
/* Not ref-counted. Protected by binding_mutex */
struct vmw_resource *dx_query_ctx;
@@ -147,8 +147,8 @@ struct vmw_res_func;
* pin-count greater than zero. It is not on the resource LRU lists and its
* backup buffer is pinned. Hence it can't be evicted.
* @func: Method vtable for this resource. Immutable.
+ * @mob_node: Node for the MOB backup rbtree. Protected by @backup reserved.
* @lru_head: List head for the LRU list. Protected by @dev_priv::resource_lock.
- * @mob_head: List head for the MOB backup list. Protected by @backup reserved.
* @binding_head: List head for the context binding list. Protected by
* the @dev_priv::binding_mutex
* @res_free: The resource destructor.
@@ -169,8 +169,8 @@ struct vmw_resource {
unsigned long backup_offset;
unsigned long pin_count;
const struct vmw_res_func *func;
+ struct rb_node mob_node;
struct list_head lru_head;
- struct list_head mob_head;
struct list_head binding_head;
struct vmw_resource_dirty *dirty;
void (*res_free) (struct vmw_resource *res);
@@ -743,7 +743,7 @@ void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
*/
static inline bool vmw_resource_mob_attached(const struct vmw_resource *res)
{
- return !list_empty(&res->mob_head);
+ return !RB_EMPTY_NODE(&res->mob_node);
}
/**
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index d35f4bd32cd9..ff9fe5650468 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -41,11 +41,24 @@
void vmw_resource_mob_attach(struct vmw_resource *res)
{
struct vmw_buffer_object *backup = res->backup;
+ struct rb_node **new = &backup->res_tree.rb_node, *parent = NULL;
lockdep_assert_held(&backup->base.resv->lock.base);
res->used_prio = (res->res_dirty) ? res->func->dirty_prio :
res->func->prio;
- list_add_tail(&res->mob_head, &backup->res_list);
+
+ while (*new) {
+ struct vmw_resource *this =
+ container_of(*new, struct vmw_resource, mob_node);
+
+ parent = *new;
+ new = (res->backup_offset < this->backup_offset) ?
+ &((*new)->rb_left) : &((*new)->rb_right);
+ }
+
+ rb_link_node(&res->mob_node, parent, new);
+ rb_insert_color(&res->mob_node, &backup->res_tree);
+
vmw_bo_prio_add(backup, res->used_prio);
}
@@ -59,7 +72,8 @@ void vmw_resource_mob_detach(struct vmw_resource *res)
lockdep_assert_held(&backup->base.resv->lock.base);
if (vmw_resource_mob_attached(res)) {
- list_del_init(&res->mob_head);
+ rb_erase(&res->mob_node, &backup->res_tree);
+ RB_CLEAR_NODE(&res->mob_node);
vmw_bo_prio_del(backup, res->used_prio);
}
}
@@ -206,8 +220,8 @@ int vmw_resource_init(struct vmw_private *dev_priv, struct vmw_resource *res,
res->res_free = res_free;
res->dev_priv = dev_priv;
res->func = func;
+ RB_CLEAR_NODE(&res->mob_node);
INIT_LIST_HEAD(&res->lru_head);
- INIT_LIST_HEAD(&res->mob_head);
INIT_LIST_HEAD(&res->binding_head);
res->id = -1;
res->backup = NULL;
@@ -755,19 +769,20 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr)
*/
void vmw_resource_unbind_list(struct vmw_buffer_object *vbo)
{
-
- struct vmw_resource *res, *next;
struct ttm_validate_buffer val_buf = {
.bo = &vbo->base,
.num_shared = 0
};
lockdep_assert_held(&vbo->base.resv->lock.base);
- list_for_each_entry_safe(res, next, &vbo->res_list, mob_head) {
- if (!res->func->unbind)
- continue;
+ while (!RB_EMPTY_ROOT(&vbo->res_tree)) {
+ struct rb_node *node = vbo->res_tree.rb_node;
+ struct vmw_resource *res =
+ container_of(node, struct vmw_resource, mob_node);
+
+ if (!WARN_ON_ONCE(!res->func->unbind))
+ (void) res->func->unbind(res, res->res_dirty, &val_buf);
- (void) res->func->unbind(res, res->res_dirty, &val_buf);
res->backup_dirty = true;
res->res_dirty = false;
vmw_resource_mob_detach(res);
--
2.20.1
Similar to write-coherent resources, make sure that from the user-space
point of view, GPU-rendered content is automatically available for
reading by the CPU.
Signed-off-by: Thomas Hellstrom <[email protected]>
---
drivers/gpu/drm/ttm/ttm_bo_vm.c | 1 +
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 8 +-
drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 69 +++++++++++-
drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 102 +++++++++++++++++-
drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 2 +
drivers/gpu/drm/vmwgfx/vmwgfx_validation.c | 3 +-
6 files changed, 176 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 3bd28fb97124..0065b138f450 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -42,6 +42,7 @@
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
+
static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
struct vm_fault *vmf)
{
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index 81ebcd668038..00794415335e 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -96,6 +96,7 @@ struct vmw_fpriv {
* @map: Kmap object for semi-persistent mappings
* @res_prios: Eviction priority counts for attached resources
* @dirty: structure for user-space dirty-tracking
+ * @cleaning: Current validation sequence is cleaning.
*/
struct vmw_buffer_object {
struct ttm_buffer_object base;
@@ -690,7 +691,8 @@ extern void vmw_resource_unreference(struct vmw_resource **p_res);
extern struct vmw_resource *vmw_resource_reference(struct vmw_resource *res);
extern struct vmw_resource *
vmw_resource_reference_unless_doomed(struct vmw_resource *res);
-extern int vmw_resource_validate(struct vmw_resource *res, bool intr);
+extern int vmw_resource_validate(struct vmw_resource *res, bool intr,
+ bool dirtying);
extern int vmw_resource_reserve(struct vmw_resource *res, bool interruptible,
bool no_backup);
extern bool vmw_resource_needs_backup(const struct vmw_resource *res);
@@ -734,6 +736,8 @@ void vmw_resource_mob_attach(struct vmw_resource *res);
void vmw_resource_mob_detach(struct vmw_resource *res);
void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
pgoff_t end);
+int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
+ pgoff_t end, pgoff_t *num_prefault);
/**
* vmw_resource_mob_attached - Whether a resource currently has a mob attached
@@ -1428,6 +1432,8 @@ int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
void vmw_bo_dirty_clear_res(struct vmw_resource *res);
void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
+void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
+ pgoff_t start, pgoff_t end);
vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
index 87e4a73b1175..773ff30a4b60 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
@@ -153,7 +153,6 @@ static void vmw_bo_dirty_scan_mkwrite(struct vmw_buffer_object *vbo)
}
}
-
/**
* vmw_bo_dirty_scan - Scan for dirty pages and add them to the dirty
* tracking structure
@@ -171,6 +170,51 @@ void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo)
vmw_bo_dirty_scan_mkwrite(vbo);
}
+/**
+ * vmw_bo_dirty_pre_unmap - write-protect and pick up dirty pages before
+ * an unmap_mapping_range operation.
+ * @vbo: The buffer object.
+ * @start: First page of the range within the buffer object.
+ * @end: Last page of the range within the buffer object + 1.
+ *
+ * If we're using the _PAGETABLE scan method, we may leak dirty pages
+ * when calling unmap_mapping_range(). This function makes sure we pick
+ * up all dirty pages.
+ */
+static void vmw_bo_dirty_pre_unmap(struct vmw_buffer_object *vbo,
+ pgoff_t start, pgoff_t end)
+{
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+ unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
+ struct address_space *mapping = vbo->base.bdev->dev_mapping;
+
+ if (dirty->method != VMW_BO_DIRTY_PAGETABLE || start >= end)
+ return;
+
+ apply_as_wrprotect(mapping, start + offset, end - start);
+ apply_as_clean(mapping, start + offset, end - start, offset,
+ &dirty->bitmap[0], &dirty->start, &dirty->end);
+}
+
+/**
+ * vmw_bo_dirty_unmap - Clear all ptes pointing to a range within a bo
+ * @vbo: The buffer object.
+ * @start: First page of the range within the buffer object.
+ * @end: Last page of the range within the buffer object + 1.
+ *
+ * This is similar to ttm_bo_unmap_virtual_locked() except it takes a subrange.
+ */
+void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
+ pgoff_t start, pgoff_t end)
+{
+ unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
+ struct address_space *mapping = vbo->base.bdev->dev_mapping;
+
+ vmw_bo_dirty_pre_unmap(vbo, start, end);
+ unmap_shared_mapping_range(mapping, (offset + start) << PAGE_SHIFT,
+ (loff_t) (end - start) << PAGE_SHIFT);
+}
+
/**
* vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
* @vbo: The buffer object
@@ -392,6 +436,26 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
if (ret)
return ret;
+ num_prefault = (vma->vm_flags & VM_RAND_READ) ? 1 :
+ TTM_BO_VM_NUM_PREFAULT;
+
+ if (vbo->dirty) {
+ pgoff_t allowed_prefault;
+ unsigned long page_offset;
+
+ page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
+ if (page_offset >= bo->num_pages ||
+ vmw_resources_clean(vbo, page_offset,
+ page_offset + PAGE_SIZE,
+ &allowed_prefault)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ num_prefault = min(num_prefault, allowed_prefault);
+ }
+
+
/*
* This will cause mkwrite() to be called for each pte on
* write-enable vmas.
@@ -399,12 +463,11 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
cvma.vm_flags &= ~VM_WRITE;
- num_prefault = (vma->vm_flags & VM_RAND_READ) ? 0 :
- TTM_BO_VM_NUM_PREFAULT;
ret = ttm_bo_vm_fault_reserved(vmf, &cvma, num_prefault);
if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
return ret;
+out_unlock:
reservation_object_unlock(bo->resv);
return ret;
}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index ff9fe5650468..30367cb06143 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -395,7 +395,8 @@ static int vmw_resource_buf_alloc(struct vmw_resource *res,
* should be retried once resources have been freed up.
*/
static int vmw_resource_do_validate(struct vmw_resource *res,
- struct ttm_validate_buffer *val_buf)
+ struct ttm_validate_buffer *val_buf,
+ bool dirtying)
{
int ret = 0;
const struct vmw_res_func *func = res->func;
@@ -437,6 +438,15 @@ static int vmw_resource_do_validate(struct vmw_resource *res,
* the resource.
*/
if (res->dirty) {
+ if (dirtying && !res->res_dirty) {
+ pgoff_t start = res->backup_offset >> PAGE_SHIFT;
+ pgoff_t end = __KERNEL_DIV_ROUND_UP
+ (res->backup_offset + res->backup_size,
+ PAGE_SIZE);
+
+ vmw_bo_dirty_unmap(res->backup, start, end);
+ }
+
vmw_bo_dirty_transfer_to_res(res);
return func->dirty_sync(res);
}
@@ -680,6 +690,7 @@ static int vmw_resource_do_evict(struct ww_acquire_ctx *ticket,
* to the device.
* @res: The resource to make visible to the device.
* @intr: Perform waits interruptible if possible.
+ * @dirtying: Pending GPU operation will dirty the resource
*
* On succesful return, any backup DMA buffer pointed to by @res->backup will
* be reserved and validated.
@@ -689,7 +700,8 @@ static int vmw_resource_do_evict(struct ww_acquire_ctx *ticket,
* Return: Zero on success, -ERESTARTSYS if interrupted, negative error code
* on failure.
*/
-int vmw_resource_validate(struct vmw_resource *res, bool intr)
+int vmw_resource_validate(struct vmw_resource *res, bool intr,
+ bool dirtying)
{
int ret;
struct vmw_resource *evict_res;
@@ -706,7 +718,7 @@ int vmw_resource_validate(struct vmw_resource *res, bool intr)
if (res->backup)
val_buf.bo = &res->backup->base;
do {
- ret = vmw_resource_do_validate(res, &val_buf);
+ ret = vmw_resource_do_validate(res, &val_buf, dirtying);
if (likely(ret != -EBUSY))
break;
@@ -1006,7 +1018,7 @@ int vmw_resource_pin(struct vmw_resource *res, bool interruptible)
/* Do we really need to pin the MOB as well? */
vmw_bo_pin_reserved(vbo, true);
}
- ret = vmw_resource_validate(res, interruptible);
+ ret = vmw_resource_validate(res, interruptible, true);
if (vbo)
ttm_bo_unreserve(&vbo->base);
if (ret)
@@ -1081,3 +1093,85 @@ void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
res->func->dirty_range_add(res, start << PAGE_SHIFT,
end << PAGE_SHIFT);
}
+
+/**
+ * vmw_resources_clean - Clean resources intersecting a mob range
+ * @vbo: The mob buffer object whose tree of attached resources is searched
+ * @start: The mob page offset starting the range
+ * @end: The mob page offset ending the range
+ * @num_prefault: Returns how many pages including the first have been
+ * cleaned and are ok to prefault
+ */
+int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t start,
+ pgoff_t end, pgoff_t *num_prefault)
+{
+ struct rb_node *cur = vbo->res_tree.rb_node;
+ struct vmw_resource *found = NULL;
+ unsigned long res_start = start << PAGE_SHIFT;
+ unsigned long res_end = end << PAGE_SHIFT;
+ unsigned long last_cleaned = 0;
+
+ /*
+ * Find the resource with lowest backup_offset that intersects the
+ * range.
+ */
+ while (cur) {
+ struct vmw_resource *cur_res =
+ container_of(cur, struct vmw_resource, mob_node);
+
+ if (cur_res->backup_offset >= res_end) {
+ cur = cur->rb_left;
+ } else if (cur_res->backup_offset + cur_res->backup_size <=
+ res_start) {
+ cur = cur->rb_right;
+ } else {
+ found = cur_res;
+ cur = cur->rb_left;
+ }
+ }
+
+ /*
+ * In order of increasing backup_offset, clean dirty resources
+ * intersecting the range.
+ */
+ while (found) {
+ if (found->res_dirty) {
+ int ret;
+
+ if (!found->func->clean)
+ return -EINVAL;
+
+ ret = found->func->clean(found);
+ if (ret)
+ return ret;
+
+ found->res_dirty = false;
+ }
+ last_cleaned = found->backup_offset + found->backup_size;
+ cur = rb_next(&found->mob_node);
+ if (!cur)
+ break;
+
+ found = container_of(cur, struct vmw_resource, mob_node);
+ if (found->backup_offset >= res_end)
+ break;
+ }
+
+ /*
+ * Set number of pages allowed prefaulting and fence the buffer object
+ */
+ *num_prefault = 1;
+ if (last_cleaned > res_start) {
+ struct ttm_buffer_object *bo = &vbo->base;
+
+ *num_prefault = __KERNEL_DIV_ROUND_UP(last_cleaned - res_start,
+ PAGE_SIZE);
+ vmw_bo_fence_single(bo, NULL);
+ if (bo->moving)
+ dma_fence_put(bo->moving);
+ bo->moving = dma_fence_get
+ (reservation_object_get_excl(bo->resv));
+ }
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
index c85144286cfe..3b7438b2d289 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
@@ -77,6 +77,7 @@ struct vmw_user_resource_conv {
* @dirty_sync: Upload the dirty mob contents to the resource.
* @dirty_add_range: Add a sequential dirty range to the resource
* dirty tracker.
+ * @clean: Clean the resource.
*/
struct vmw_res_func {
enum vmw_res_type res_type;
@@ -101,6 +102,7 @@ struct vmw_res_func {
int (*dirty_sync)(struct vmw_resource *res);
void (*dirty_range_add)(struct vmw_resource *res, size_t start,
size_t end);
+ int (*clean)(struct vmw_resource *res);
};
/**
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
index 5b0c928bb5ba..81d9d7adc055 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
@@ -644,7 +644,8 @@ int vmw_validation_res_validate(struct vmw_validation_context *ctx, bool intr)
struct vmw_resource *res = val->res;
struct vmw_buffer_object *backup = res->backup;
- ret = vmw_resource_validate(res, intr);
+ ret = vmw_resource_validate(res, intr, val->dirty_set &&
+ val->dirty);
if (ret) {
if (ret != -ERESTARTSYS)
DRM_ERROR("Failed to validate resource.\n");
--
2.20.1
Driver fault callbacks are allowed to drop the mmap_sem when expecting
long hardware waits to avoid blocking other mm users. Allow the mkwrite
callbacks to do the same by returning early on VM_FAULT_RETRY.
In particular we want to be able to drop the mmap_sem when waiting for
a reservation object lock on a GPU buffer object. These locks may be
held while waiting for the GPU.
Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Souptick Joarder <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Thomas Hellstrom <[email protected]>
---
mm/memory.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..a95b4a3b1ae2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
lock_page(page);
@@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
- if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
@@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ (VM_FAULT_ERROR | VM_FAULT_RETRY |
+ VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
@@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
- (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ (tmp & (VM_FAULT_ERROR | VM_FAULT_RETRY |
+ VM_FAULT_NOPAGE)))) {
put_page(vmf->page);
return tmp;
}
--
2.20.1
Add the callbacks necessary to implement emulated coherent memory for
surfaces. Add a flag to the gb_surface_create ioctl to indicate that
surface memory should be coherent.
Also bump the drm minor version to signal the availability of coherent
surfaces.
Signed-off-by: Thomas Hellstrom <[email protected]>
---
.../device_include/svga3d_surfacedefs.h | 209 +++++++++-
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 4 +-
drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 390 +++++++++++++++++-
include/uapi/drm/vmwgfx_drm.h | 4 +-
4 files changed, 600 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
index f2bfd3d80598..d901206c04e3 100644
--- a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
+++ b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
@@ -1280,7 +1280,6 @@ svga3dsurface_get_pixel_offset(SVGA3dSurfaceFormat format,
return offset;
}
-
static inline u32
svga3dsurface_get_image_offset(SVGA3dSurfaceFormat format,
surf_size_struct baseLevelSize,
@@ -1375,4 +1374,212 @@ svga3dsurface_is_screen_target_format(SVGA3dSurfaceFormat format)
return svga3dsurface_is_dx_screen_target_format(format);
}
+/**
+ * struct svga3dsurface_mip - Mipmap level information
+ * @bytes: Bytes required in the backing store of this mipmap level.
+ * @img_stride: Byte stride per image.
+ * @row_stride: Byte stride per block row.
+ * @size: The size of the mipmap.
+ */
+struct svga3dsurface_mip {
+ size_t bytes;
+ size_t img_stride;
+ size_t row_stride;
+ struct drm_vmw_size size;
+
+};
+
+/**
+ * struct svga3dsurface_cache - Cached surface information
+ * @desc: Pointer to the surface descriptor
+ * @mip: Array of mipmap level information. Valid size is @num_mip_levels.
+ * @mip_chain_bytes: Bytes required in the backing store for the whole chain
+ * of mip levels.
+ * @num_mip_levels: Valid size of the @mip array. Number of mipmap levels in
+ * a chain.
+ * @num_layers: Number of slices in an array texture or number of faces in
+ * a cubemap texture.
+ */
+struct svga3dsurface_cache {
+ const struct svga3d_surface_desc *desc;
+ struct svga3dsurface_mip mip[DRM_VMW_MAX_MIP_LEVELS];
+ size_t mip_chain_bytes;
+ u32 num_mip_levels;
+ u32 num_layers;
+};
+
+/**
+ * struct svga3dsurface_loc - Surface location
+ * @sub_resource: Surface subresource. Defined as layer * num_mip_levels +
+ * mip_level.
+ * @x: X coordinate.
+ * @y: Y coordinate.
+ * @z: Z coordinate.
+ */
+struct svga3dsurface_loc {
+ u32 sub_resource;
+ u32 x, y, z;
+};
+
+/**
+ * svga3dsurface_subres - Compute the subresource from layer and mipmap.
+ * @cache: Surface layout data.
+ * @mip_level: The mipmap level.
+ * @layer: The surface layer (face or array slice).
+ *
+ * Return: The subresource.
+ */
+static inline u32 svga3dsurface_subres(const struct svga3dsurface_cache *cache,
+ u32 mip_level, u32 layer)
+{
+ return cache->num_mip_levels * layer + mip_level;
+}
+
+/**
+ * svga3dsurface_setup_cache - Build a surface cache entry
+ * @size: The surface base level dimensions.
+ * @format: The surface format.
+ * @num_mip_levels: Number of mipmap levels.
+ * @num_layers: Number of layers.
+ * @cache: Pointer to a struct svga3dsurface_cache object to be filled in.
+ */
+static inline void svga3dsurface_setup_cache(const struct drm_vmw_size *size,
+ SVGA3dSurfaceFormat format,
+ u32 num_mip_levels,
+ u32 num_layers,
+ u32 num_samples,
+ struct svga3dsurface_cache *cache)
+{
+ const struct svga3d_surface_desc *desc;
+ u32 i;
+
+ memset(cache, 0, sizeof(*cache));
+ cache->desc = desc = svga3dsurface_get_desc(format);
+ cache->num_mip_levels = num_mip_levels;
+ cache->num_layers = num_layers;
+ for (i = 0; i < cache->num_mip_levels; i++) {
+ struct svga3dsurface_mip *mip = &cache->mip[i];
+
+ mip->size = svga3dsurface_get_mip_size(*size, i);
+ mip->bytes = svga3dsurface_get_image_buffer_size
+ (desc, &mip->size, 0) * num_samples;
+ mip->row_stride =
+ __KERNEL_DIV_ROUND_UP(mip->size.width,
+ desc->block_size.width) *
+ desc->bytes_per_block * num_samples;
+ mip->img_stride =
+ __KERNEL_DIV_ROUND_UP(mip->size.height,
+ desc->block_size.height) *
+ mip->row_stride;
+ cache->mip_chain_bytes += mip->bytes;
+ }
+}
+
+/**
+ * svga3dsurface_get_loc - Get a surface location from an offset into the
+ * backing store
+ * @cache: Surface layout data.
+ * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
+ * @offset: Offset into the surface backing store.
+ */
+static inline void
+svga3dsurface_get_loc(const struct svga3dsurface_cache *cache,
+ struct svga3dsurface_loc *loc,
+ size_t offset)
+{
+ u32 layer = offset / cache->mip_chain_bytes;
+ const struct svga3dsurface_mip *mip = &cache->mip[0];
+ const struct svga3d_surface_desc *desc = cache->desc;
+ int i;
+
+ offset -= layer * cache->mip_chain_bytes;
+ for (i = 0; i < cache->num_mip_levels; ++i, ++mip) {
+ if (mip->bytes > offset)
+ break;
+ offset -= mip->bytes;
+ }
+
+ loc->sub_resource = svga3dsurface_subres(cache, i, layer);
+ loc->z = offset / mip->img_stride;
+ offset -= loc->z * mip->img_stride;
+ loc->z *= desc->block_size.depth;
+ loc->y = offset / mip->row_stride;
+ offset -= loc->y * mip->row_stride;
+ loc->y *= desc->block_size.height;
+ loc->x = offset / desc->bytes_per_block;
+ loc->x *= desc->block_size.width;
+}
+
+/**
+ * svga3dsurface_inc_loc - Clamp increment a surface location with one block
+ * size in each dimension.
+ * @cache: Surface layout data.
+ * @loc: Pointer to a struct svga3dsurface_loc to be incremented.
+ *
+ * When computing the size of a range as size = end - start, the range does not
+ * include the end element. However a location representing the last byte
+ * of a touched region in the backing store *is* included in the range.
+ * This function modifies such a location to match the end definition
+ * given as start + size which is the one used in a SVGA3dBox.
+ */
+static inline void
+svga3dsurface_inc_loc(const struct svga3dsurface_cache *cache,
+ struct svga3dsurface_loc *loc)
+{
+ const struct svga3d_surface_desc *desc = cache->desc;
+ u32 mip = loc->sub_resource % cache->num_mip_levels;
+ const struct drm_vmw_size *size = &cache->mip[mip].size;
+
+ loc->sub_resource++;
+ loc->x += desc->block_size.width;
+ if (loc->x > size->width)
+ loc->x = size->width;
+ loc->y += desc->block_size.height;
+ if (loc->y > size->height)
+ loc->y = size->height;
+ loc->z += desc->block_size.depth;
+ if (loc->z > size->depth)
+ loc->z = size->depth;
+}
+
+/**
+ * svga3dsurface_min_loc - The start location in a subresource
+ * @cache: Surface layout data.
+ * @sub_resource: The subresource.
+ * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
+ */
+static inline void
+svga3dsurface_min_loc(const struct svga3dsurface_cache *cache,
+ u32 sub_resource,
+ struct svga3dsurface_loc *loc)
+{
+ loc->sub_resource = sub_resource;
+ loc->x = loc->y = loc->z = 0;
+}
+
+/**
+ * svga3dsurface_max_loc - The end location in a subresource
+ * @cache: Surface layout data.
+ * @sub_resource: The subresource.
+ * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
+ *
+ * Following the end definition given in svga3dsurface_inc_loc(),
+ * compute the end location of a surface subresource.
+ */
+static inline void
+svga3dsurface_max_loc(const struct svga3dsurface_cache *cache,
+ u32 sub_resource,
+ struct svga3dsurface_loc *loc)
+{
+ const struct drm_vmw_size *size;
+ u32 mip;
+
+ loc->sub_resource = sub_resource + 1;
+ mip = sub_resource % cache->num_mip_levels;
+ size = &cache->mip[mip].size;
+ loc->x = size->width;
+ loc->y = size->height;
+ loc->z = size->depth;
+}
+
#endif /* _SVGA3D_SURFACEDEFS_H_ */
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index 00794415335e..630a01d75a41 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -44,9 +44,9 @@
#include <linux/sync_file.h>
#define VMWGFX_DRIVER_NAME "vmwgfx"
-#define VMWGFX_DRIVER_DATE "20180704"
+#define VMWGFX_DRIVER_DATE "20190328"
#define VMWGFX_DRIVER_MAJOR 2
-#define VMWGFX_DRIVER_MINOR 15
+#define VMWGFX_DRIVER_MINOR 16
#define VMWGFX_DRIVER_PATCHLEVEL 0
#define VMWGFX_FILE_PAGE_OFFSET 0x00100000
#define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
index c40d44f4d9af..f56141529da5 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
@@ -68,6 +68,20 @@ struct vmw_surface_offset {
uint32_t bo_offset;
};
+/**
+ * struct vmw_surface_dirty - Surface dirty-tracker
+ * @cache: Cached layout information of the surface.
+ * @size: Accounting size for the struct vmw_surface_dirty.
+ * @num_subres: Number of subresources.
+ * @boxes: Array of SVGA3dBoxes indicating dirty regions. One per subresource.
+ */
+struct vmw_surface_dirty {
+ struct svga3dsurface_cache cache;
+ size_t size;
+ u32 num_subres;
+ SVGA3dBox boxes[0];
+};
+
static void vmw_user_surface_free(struct vmw_resource *res);
static struct vmw_resource *
vmw_user_surface_base_to_res(struct ttm_base_object *base);
@@ -96,6 +110,13 @@ vmw_gb_surface_reference_internal(struct drm_device *dev,
struct drm_vmw_gb_surface_ref_ext_rep *rep,
struct drm_file *file_priv);
+static void vmw_surface_dirty_free(struct vmw_resource *res);
+static int vmw_surface_dirty_alloc(struct vmw_resource *res);
+static int vmw_surface_dirty_sync(struct vmw_resource *res);
+static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
+ size_t end);
+static int vmw_surface_clean(struct vmw_resource *res);
+
static const struct vmw_user_resource_conv user_surface_conv = {
.object_type = VMW_RES_SURFACE,
.base_obj_to_res = vmw_user_surface_base_to_res,
@@ -133,7 +154,12 @@ static const struct vmw_res_func vmw_gb_surface_func = {
.create = vmw_gb_surface_create,
.destroy = vmw_gb_surface_destroy,
.bind = vmw_gb_surface_bind,
- .unbind = vmw_gb_surface_unbind
+ .unbind = vmw_gb_surface_unbind,
+ .dirty_alloc = vmw_surface_dirty_alloc,
+ .dirty_free = vmw_surface_dirty_free,
+ .dirty_sync = vmw_surface_dirty_sync,
+ .dirty_range_add = vmw_surface_dirty_range_add,
+ .clean = vmw_surface_clean,
};
/**
@@ -641,6 +667,7 @@ static void vmw_user_surface_free(struct vmw_resource *res)
struct vmw_private *dev_priv = srf->res.dev_priv;
uint32_t size = user_srf->size;
+ WARN_ON_ONCE(res->dirty);
if (user_srf->master)
drm_master_put(&user_srf->master);
kfree(srf->offsets);
@@ -1174,10 +1201,16 @@ static int vmw_gb_surface_bind(struct vmw_resource *res,
cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_SURFACE;
cmd2->header.size = sizeof(cmd2->body);
cmd2->body.sid = res->id;
- res->backup_dirty = false;
}
vmw_fifo_commit(dev_priv, submit_size);
+ if (res->backup->dirty && res->backup_dirty) {
+ /* We've just made a full upload. Clear dirty regions. */
+ vmw_bo_dirty_clear_res(res);
+ }
+
+ res->backup_dirty = false;
+
return 0;
}
@@ -1642,7 +1675,8 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
}
}
} else if (req->base.drm_surface_flags &
- drm_vmw_surface_flag_create_buffer)
+ (drm_vmw_surface_flag_create_buffer |
+ drm_vmw_surface_flag_coherent))
ret = vmw_user_bo_alloc(dev_priv, tfile,
res->backup_size,
req->base.drm_surface_flags &
@@ -1656,6 +1690,26 @@ vmw_gb_surface_define_internal(struct drm_device *dev,
goto out_unlock;
}
+ if (req->base.drm_surface_flags & drm_vmw_surface_flag_coherent) {
+ struct vmw_buffer_object *backup = res->backup;
+
+ ttm_bo_reserve(&backup->base, false, false, NULL);
+ if (!res->func->dirty_alloc)
+ ret = -EINVAL;
+ if (!ret)
+ ret = vmw_bo_dirty_add(backup);
+ if (!ret) {
+ res->coherent = true;
+ ret = res->func->dirty_alloc(res);
+ }
+ ttm_bo_unreserve(&backup->base);
+ if (ret) {
+ vmw_resource_unreference(&res);
+ goto out_unlock;
+ }
+
+ }
+
tmp = vmw_resource_reference(res);
ret = ttm_prime_object_init(tfile, res->backup_size, &user_srf->prime,
req->base.drm_surface_flags &
@@ -1764,3 +1818,333 @@ vmw_gb_surface_reference_internal(struct drm_device *dev,
return ret;
}
+
+/**
+ * vmw_subres_dirty_add - Add a dirty region to a subresource
+ * @dirty: The surface's dirty tracker.
+ * @loc_start: The location corresponding to the start of the region.
+ * @loc_end: The location corresponding to the end of the region.
+ *
+ * As we are assuming that @loc_start and @loc_end represent a sequential
+ * range of backing store memory, if the region spans multiple lines then
+ * regardless of the x coordinate, the full lines are dirtied.
+ * Correspondingly if the region spans multiple z slices, then full rather
+ * than partial z slices are dirtied.
+ *
+ * The subresource index is taken from @loc_start. It is validated against
+ * the tracker's num_subres before the corresponding box is accessed; an
+ * out-of-range index triggers a warning and leaves the tracker untouched.
+ */
+static void vmw_subres_dirty_add(struct vmw_surface_dirty *dirty,
+				 const struct svga3dsurface_loc *loc_start,
+				 const struct svga3dsurface_loc *loc_end)
+{
+	const struct svga3dsurface_cache *cache = &dirty->cache;
+	const struct drm_vmw_size *size;
+	SVGA3dBox *box;
+	u32 mip;
+	u32 box_c2;
+
+	/* Check the index before it is used to read from boxes[]. */
+	if (WARN_ON(loc_start->sub_resource >= dirty->num_subres))
+		return;
+
+	box = &dirty->boxes[loc_start->sub_resource];
+	mip = loc_start->sub_resource % cache->num_mip_levels;
+	size = &cache->mip[mip].size;
+	box_c2 = box->z + box->d;
+
+	/* Grow the z extent; an empty box (d == 0) adopts the new start. */
+	if (box->d == 0 || box->z > loc_start->z)
+		box->z = loc_start->z;
+	if (box_c2 < loc_end->z)
+		box->d = loc_end->z - box->z;
+
+	if (loc_start->z + 1 == loc_end->z) {
+		/* Single z slice: track the y extent precisely. */
+		box_c2 = box->y + box->h;
+		if (box->h == 0 || box->y > loc_start->y)
+			box->y = loc_start->y;
+		if (box_c2 < loc_end->y)
+			box->h = loc_end->y - box->y;
+
+		if (loc_start->y + 1 == loc_end->y) {
+			/* Single line: track the x extent precisely. */
+			box_c2 = box->x + box->w;
+			if (box->w == 0 || box->x > loc_start->x)
+				box->x = loc_start->x;
+			if (box_c2 < loc_end->x)
+				box->w = loc_end->x - box->x;
+		} else {
+			/* Multiple lines: dirty the full width. */
+			box->x = 0;
+			box->w = size->width;
+		}
+	} else {
+		/* Multiple z slices: dirty full slices. */
+		box->y = 0;
+		box->h = size->height;
+		box->x = 0;
+		box->w = size->width;
+	}
+}
+
+/**
+ * vmw_subres_dirty_full - Dirty the whole extent of a subresource
+ * @dirty: The surface's dirty tracker.
+ * @subres: Index of the subresource whose dirty box is set to cover the
+ * full size of its mip level.
+ */
+static void vmw_subres_dirty_full(struct vmw_surface_dirty *dirty, u32 subres)
+{
+	const struct drm_vmw_size *extent =
+		&dirty->cache.mip[subres % dirty->cache.num_mip_levels].size;
+	SVGA3dBox *target = &dirty->boxes[subres];
+
+	/* Origin at zero, dimensions taken from the mip level size. */
+	target->x = target->y = target->z = 0;
+	target->w = extent->width;
+	target->h = extent->height;
+	target->d = extent->depth;
+}
+
+/*
+ * vmw_surface_tex_dirty_range_add - The dirty_range_add callback for texture
+ * surfaces.
+ * @res: The surface resource.
+ * @start: Start of the dirty byte range. Clamped to the resource's backup
+ * range and rebased to the resource's backup offset.
+ * @end: End of the dirty byte range, clamped and rebased the same way. The
+ * byte at @end - 1 is the last one translated to a surface location.
+ *
+ * A range covering a single sub-resource is merged into that sub-resource's
+ * dirty box. For a range covering multiple sub-resources, the first and the
+ * last one get partial regions added and all sub-resources in between are
+ * marked fully dirty.
+ */
+static void vmw_surface_tex_dirty_range_add(struct vmw_resource *res,
+ size_t start, size_t end)
+{
+ struct vmw_surface_dirty *dirty =
+ (struct vmw_surface_dirty *) res->dirty;
+ size_t backup_end = res->backup_offset + res->backup_size;
+ struct svga3dsurface_loc loc1, loc2;
+ const struct svga3dsurface_cache *cache;
+
+ start = max(start, res->backup_offset) - res->backup_offset;
+ end = min(end, backup_end) - res->backup_offset;
+ cache = &dirty->cache;
+ svga3dsurface_get_loc(cache, &loc1, start);
+ svga3dsurface_get_loc(cache, &loc2, end - 1);
+ svga3dsurface_inc_loc(cache, &loc2);
+
+ if (loc1.sub_resource + 1 == loc2.sub_resource) {
+ /* Dirty range covers a single sub-resource */
+ vmw_subres_dirty_add(dirty, &loc1, &loc2);
+ } else {
+ /* Dirty range covers multiple sub-resources */
+ struct svga3dsurface_loc loc_min, loc_max;
+ u32 sub_res = loc1.sub_resource;
+
+ svga3dsurface_max_loc(cache, loc1.sub_resource, &loc_max);
+ vmw_subres_dirty_add(dirty, &loc1, &loc_max);
+ svga3dsurface_min_loc(cache, loc2.sub_resource - 1, &loc_min);
+ vmw_subres_dirty_add(dirty, &loc_min, &loc2);
+ for (sub_res = loc1.sub_resource + 1;
+ sub_res < loc2.sub_resource - 1; ++sub_res)
+ vmw_subres_dirty_full(dirty, sub_res);
+ }
+}
+
+/*
+ * vmw_surface_buf_dirty_range_add - The dirty_range_add callback for buffer
+ * surfaces.
+ * @res: The surface resource.
+ * @start: Start of the dirty byte range. Clamped to the resource's backup
+ * range and rebased to the resource's backup offset.
+ * @end: End of the dirty byte range, clamped and rebased the same way.
+ *
+ * Only the first dirty box is used. Its height and depth are set to 1 and
+ * its x extent is grown to include the range.
+ */
+static void vmw_surface_buf_dirty_range_add(struct vmw_resource *res,
+ size_t start, size_t end)
+{
+ struct vmw_surface_dirty *dirty =
+ (struct vmw_surface_dirty *) res->dirty;
+ const struct svga3dsurface_cache *cache = &dirty->cache;
+ size_t backup_end = res->backup_offset + cache->mip_chain_bytes;
+ SVGA3dBox *box = &dirty->boxes[0];
+ u32 box_c2;
+
+ box->h = box->d = 1;
+ start = max(start, res->backup_offset) - res->backup_offset;
+ end = min(end, backup_end) - res->backup_offset;
+ box_c2 = box->x + box->w;
+ if (box->w == 0 || box->x > start)
+ box->x = start;
+ if (box_c2 < end)
+ box->w = end - box->x;
+}
+
+/*
+ * vmw_surface_dirty_range_add - The dirty_range_add callback for surfaces
+ * @res: The surface resource.
+ * @start: Start of the dirty range in the backup buffer.
+ * @end: End of the dirty range in the backup buffer.
+ *
+ * Warns and returns without doing anything if the range does not overlap
+ * the resource's part of the backup buffer. Otherwise dispatches to the
+ * buffer variant for SVGA3D_BUFFER surfaces and to the texture variant for
+ * all other formats.
+ */
+static void vmw_surface_dirty_range_add(struct vmw_resource *res, size_t start,
+ size_t end)
+{
+ struct vmw_surface *srf = vmw_res_to_srf(res);
+
+ if (WARN_ON(end <= res->backup_offset ||
+ start >= res->backup_offset + res->backup_size))
+ return;
+
+ if (srf->format == SVGA3D_BUFFER)
+ vmw_surface_buf_dirty_range_add(res, start, end);
+ else
+ vmw_surface_tex_dirty_range_add(res, start, end);
+}
+
+/*
+ * vmw_surface_dirty_sync - The surface's dirty_sync callback.
+ * @res: The surface resource.
+ *
+ * Counts the subresources whose dirty box has a non-zero depth and emits
+ * one update command for each of them within a single fifo reservation.
+ * has_dx is hard-coded to 0 in this function, so only the
+ * SVGA_3D_CMD_UPDATE_GB_IMAGE branch is taken. All dirty boxes are zeroed
+ * before a successful return, also when nothing was dirty.
+ *
+ * Return: 0 on success, -ENOMEM if the fifo reservation fails, in which
+ * case the dirty boxes are left as they were.
+ */
+static int vmw_surface_dirty_sync(struct vmw_resource *res)
+{
+ struct vmw_private *dev_priv = res->dev_priv;
+ bool has_dx = 0;
+ u32 i, num_dirty;
+ struct vmw_surface_dirty *dirty =
+ (struct vmw_surface_dirty *) res->dirty;
+ size_t alloc_size;
+ const struct svga3dsurface_cache *cache = &dirty->cache;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdDXUpdateSubResource body;
+ } *cmd1;
+ struct {
+ SVGA3dCmdHeader header;
+ SVGA3dCmdUpdateGBImage body;
+ } *cmd2;
+ void *cmd;
+
+ num_dirty = 0;
+ for (i = 0; i < dirty->num_subres; ++i) {
+ const SVGA3dBox *box = &dirty->boxes[i];
+
+ if (box->d)
+ num_dirty++;
+ }
+
+ if (!num_dirty)
+ goto out;
+
+ alloc_size = num_dirty * ((has_dx) ? sizeof(*cmd1) : sizeof(*cmd2));
+ cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
+ if (!cmd)
+ return -ENOMEM;
+
+ cmd1 = cmd;
+ cmd2 = cmd;
+
+ for (i = 0; i < dirty->num_subres; ++i) {
+ const SVGA3dBox *box = &dirty->boxes[i];
+
+ if (!box->d)
+ continue;
+
+ /*
+ * DX_UPDATE_SUBRESOURCE is aware of array surfaces.
+ * UPDATE_GB_IMAGE is not.
+ */
+ if (has_dx) {
+ cmd1->header.id = SVGA_3D_CMD_DX_UPDATE_SUBRESOURCE;
+ cmd1->header.size = sizeof(cmd1->body);
+ cmd1->body.sid = res->id;
+ cmd1->body.subResource = i;
+ cmd1->body.box = *box;
+ cmd1++;
+ } else {
+ cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE;
+ cmd2->header.size = sizeof(cmd2->body);
+ cmd2->body.image.sid = res->id;
+ cmd2->body.image.face = i / cache->num_mip_levels;
+ cmd2->body.image.mipmap = i -
+ (cache->num_mip_levels * cmd2->body.image.face);
+ cmd2->body.box = *box;
+ cmd2++;
+ }
+
+ }
+ vmw_fifo_commit(dev_priv, alloc_size);
+ out:
+ memset(&dirty->boxes[0], 0, sizeof(dirty->boxes[0]) *
+ dirty->num_subres);
+
+ return 0;
+}
+
+/*
+ * vmw_surface_dirty_alloc - The surface's dirty_alloc callback.
+ * @res: The surface resource.
+ *
+ * Allocates a dirty tracker with one box per subresource, where the number
+ * of subresources is the number of layers times the number of mip levels,
+ * and stores it in the resource's dirty pointer. The size, rounded with
+ * ttm_round_pot(), is accounted with ttm_mem_global_alloc() and the
+ * accounting is undone if the tracker allocation fails.
+ *
+ * Return: 0 on success, the ttm_mem_global_alloc() error code if the
+ * accounting fails, or -ENOMEM if the tracker cannot be allocated.
+ */
+static int vmw_surface_dirty_alloc(struct vmw_resource *res)
+{
+ struct vmw_surface *srf = vmw_res_to_srf(res);
+ struct vmw_surface_dirty *dirty;
+ u32 num_layers = 1;
+ u32 num_mip;
+ u32 num_subres;
+ u32 num_samples;
+ size_t dirty_size, acc_size;
+ static struct ttm_operation_ctx ctx = {
+ .interruptible = false,
+ .no_wait_gpu = false
+ };
+ int ret;
+
+ if (srf->array_size)
+ num_layers = srf->array_size;
+ else if (srf->flags & SVGA3D_SURFACE_CUBEMAP)
+ num_layers *= SVGA3D_MAX_SURFACE_FACES;
+
+ num_mip = srf->mip_levels[0];
+ if (!num_mip)
+ num_mip = 1;
+
+ num_subres = num_layers * num_mip;
+ dirty_size = sizeof(*dirty) + num_subres * sizeof(dirty->boxes[0]);
+ acc_size = ttm_round_pot(dirty_size);
+ ret = ttm_mem_global_alloc(vmw_mem_glob(res->dev_priv),
+ acc_size, &ctx);
+ if (ret) {
+ VMW_DEBUG_USER("Out of graphics memory for surface "
+ "dirty tracker.\n");
+ return ret;
+ }
+
+ dirty = kvzalloc(dirty_size, GFP_KERNEL);
+ if (!dirty) {
+ ret = -ENOMEM;
+ goto out_no_dirty;
+ }
+
+ num_samples = max_t(u32, 1, srf->multisample_count);
+ svga3dsurface_setup_cache(&srf->base_size, srf->format, num_mip,
+ num_layers, num_samples, &dirty->cache);
+ dirty->num_subres = num_subres;
+ dirty->size = acc_size;
+ res->dirty = (struct vmw_resource_dirty *) dirty;
+
+ return 0;
+
+out_no_dirty:
+ ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
+ return ret;
+}
+
+/*
+ * vmw_surface_dirty_free - The surface's dirty_free callback
+ * @res: The surface resource whose dirty tracker is freed.
+ *
+ * Frees the tracker, returns its accounted size to the memory accounting
+ * and clears the resource's dirty pointer.
+ */
+static void vmw_surface_dirty_free(struct vmw_resource *res)
+{
+	struct vmw_surface_dirty *tracker =
+		(struct vmw_surface_dirty *) res->dirty;
+	/* Read the accounted size before the tracker goes away. */
+	const size_t accounted = tracker->size;
+
+	res->dirty = NULL;
+	kvfree(tracker);
+	ttm_mem_global_free(vmw_mem_glob(res->dev_priv), accounted);
+}
+
+/*
+ * vmw_surface_clean - The surface's clean callback
+ * @res: The surface resource.
+ *
+ * Queues a single SVGA_3D_CMD_READBACK_GB_SURFACE command for the surface.
+ *
+ * Return: 0 on success, -ENOMEM if no fifo space could be reserved.
+ */
+static int vmw_surface_clean(struct vmw_resource *res)
+{
+	struct {
+		SVGA3dCmdHeader header;
+		SVGA3dCmdReadbackGBSurface body;
+	} *readback;
+	struct vmw_private *dev_priv = res->dev_priv;
+	const size_t cmd_bytes = sizeof(*readback);
+
+	readback = VMW_FIFO_RESERVE(dev_priv, cmd_bytes);
+	if (!readback)
+		return -ENOMEM;
+
+	readback->header.id = SVGA_3D_CMD_READBACK_GB_SURFACE;
+	readback->header.size = sizeof(readback->body);
+	readback->body.sid = res->id;
+	vmw_fifo_commit(dev_priv, cmd_bytes);
+
+	return 0;
+}
diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h
index 399f58317cff..02cab33f2f25 100644
--- a/include/uapi/drm/vmwgfx_drm.h
+++ b/include/uapi/drm/vmwgfx_drm.h
@@ -891,11 +891,13 @@ struct drm_vmw_shader_arg {
* surface.
* @drm_vmw_surface_flag_create_buffer: Create a backup buffer if none is
* given.
+ * @drm_vmw_surface_flag_coherent: Back surface with coherent memory.
*/
enum drm_vmw_surface_flags {
drm_vmw_surface_flag_shareable = (1 << 0),
drm_vmw_surface_flag_scanout = (1 << 1),
- drm_vmw_surface_flag_create_buffer = (1 << 2)
+ drm_vmw_surface_flag_create_buffer = (1 << 2),
+ drm_vmw_surface_flag_coherent = (1 << 3),
};
/**
--
2.20.1
This infrastructure will, for coherent resources, make sure that
from the user-space point of view, data written by the CPU is immediately
and automatically available to the GPU at resource validation time.
Signed-off-by: Thomas Hellstrom <[email protected]>
---
drivers/gpu/drm/vmwgfx/Kconfig | 1 +
drivers/gpu/drm/vmwgfx/Makefile | 2 +-
drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 5 +-
drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 5 +
drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 26 +-
drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 1 -
drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 410 ++++++++++++++++++
drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 57 +++
drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 11 +
drivers/gpu/drm/vmwgfx/vmwgfx_validation.c | 74 ++++
drivers/gpu/drm/vmwgfx/vmwgfx_validation.h | 16 +-
11 files changed, 588 insertions(+), 20 deletions(-)
create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig
index 6b28a326f8bb..d5fd81a521f6 100644
--- a/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@ -8,6 +8,7 @@ config DRM_VMWGFX
select FB_CFB_IMAGEBLIT
select DRM_TTM
select FB
+ select AS_DIRTY_HELPERS
# Only needed for the transitional use of drm_crtc_init - can be removed
# again once vmwgfx sets up the primary plane itself.
select DRM_KMS_HELPER
diff --git a/drivers/gpu/drm/vmwgfx/Makefile b/drivers/gpu/drm/vmwgfx/Makefile
index 8841bd30e1e5..c877a21a0739 100644
--- a/drivers/gpu/drm/vmwgfx/Makefile
+++ b/drivers/gpu/drm/vmwgfx/Makefile
@@ -8,7 +8,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o vmwgfx_kms.o vmwgfx_drv.o \
vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \
vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o \
vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
- vmwgfx_validation.o \
+ vmwgfx_validation.o vmwgfx_page_dirty.o \
ttm_object.o ttm_lock.o
obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
index c0829d50eecc..90ca866640fe 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
@@ -463,6 +463,7 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
{
struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
+ WARN_ON(vmw_bo->dirty);
vmw_bo_unmap(vmw_bo);
kfree(vmw_bo);
}
@@ -476,8 +477,10 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
{
struct vmw_user_buffer_object *vmw_user_bo = vmw_user_buffer_object(bo);
+ struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
- vmw_bo_unmap(&vmw_user_bo->vbo);
+ WARN_ON(vbo->dirty);
+ vmw_bo_unmap(vbo);
ttm_prime_object_kfree(vmw_user_bo, prime);
}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 6165fe2c4504..74e94138877e 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -857,6 +857,11 @@ static int vmw_driver_load(struct drm_device *dev, unsigned long chipset)
DRM_ERROR("Failed initializing TTM buffer object driver.\n");
goto out_no_bdev;
}
+ dev_priv->vm_ops = *dev_priv->bdev.vm_ops;
+ dev_priv->vm_ops.fault = vmw_bo_vm_fault;
+ dev_priv->vm_ops.pfn_mkwrite = vmw_bo_vm_mkwrite;
+ dev_priv->vm_ops.page_mkwrite = vmw_bo_vm_mkwrite;
+ dev_priv->bdev.vm_ops = &dev_priv->vm_ops;
/*
* Enable VRAM, but initially don't use it until SVGA is enabled and
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
index bd6919b90519..f05fce52fbb4 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
@@ -95,6 +95,7 @@ struct vmw_fpriv {
* @dx_query_ctx: DX context if this buffer object is used as a DX query MOB
* @map: Kmap object for semi-persistent mappings
* @res_prios: Eviction priority counts for attached resources
+ * @dirty: structure for user-space dirty-tracking
*/
struct vmw_buffer_object {
struct ttm_buffer_object base;
@@ -105,6 +106,7 @@ struct vmw_buffer_object {
/* Protected by reservation */
struct ttm_bo_kmap_obj map;
u32 res_prios[TTM_MAX_BO_PRIORITY];
+ struct vmw_bo_dirty *dirty;
};
/**
@@ -135,7 +137,8 @@ struct vmw_res_func;
* @res_dirty: Resource contains data not yet in the backup buffer. Protected
* by resource reserved.
* @backup_dirty: Backup buffer contains data not yet in the HW resource.
- * Protecte by resource reserved.
+ * Protected by resource reserved.
+ * @coherent: Emulate coherency by tracking vm accesses.
* @backup: The backup buffer if any. Protected by resource reserved.
* @backup_offset: Offset into the backup buffer if any. Protected by resource
* reserved. Note that only a few resource types can have a @backup_offset
@@ -152,14 +155,16 @@ struct vmw_res_func;
* @hw_destroy: Callback to destroy the resource on the device, as part of
* resource destruction.
*/
+struct vmw_resource_dirty;
struct vmw_resource {
struct kref kref;
struct vmw_private *dev_priv;
int id;
u32 used_prio;
unsigned long backup_size;
- bool res_dirty;
- bool backup_dirty;
+ u32 res_dirty : 1;
+ u32 backup_dirty : 1;
+ u32 coherent : 1;
struct vmw_buffer_object *backup;
unsigned long backup_offset;
unsigned long pin_count;
@@ -167,6 +172,7 @@ struct vmw_resource {
struct list_head lru_head;
struct list_head mob_head;
struct list_head binding_head;
+ struct vmw_resource_dirty *dirty;
void (*res_free) (struct vmw_resource *res);
void (*hw_destroy) (struct vmw_resource *res);
};
@@ -607,6 +613,9 @@ struct vmw_private {
/* Validation memory reservation */
struct vmw_validation_mem vvm;
+
+ /* VM operations */
+ struct vm_operations_struct vm_ops;
};
static inline struct vmw_surface *vmw_res_to_srf(struct vmw_resource *res)
@@ -723,6 +732,8 @@ extern void vmw_resource_evict_all(struct vmw_private *dev_priv);
extern void vmw_resource_unbind_list(struct vmw_buffer_object *vbo);
void vmw_resource_mob_attach(struct vmw_resource *res);
void vmw_resource_mob_detach(struct vmw_resource *res);
+void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
+ pgoff_t end);
/**
* vmw_resource_mob_attached - Whether a resource currently has a mob attached
@@ -1411,6 +1422,15 @@ int vmw_host_log(const char *log);
#define VMW_DEBUG_USER(fmt, ...) \
DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
+/* Resource dirtying - vmwgfx_page_dirty.c */
+void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo);
+int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
+void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
+void vmw_bo_dirty_clear_res(struct vmw_resource *res);
+void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
+vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
+vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
+
/**
* Inline helper functions
*/
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index da3ac0bc2e14..7cb22119f516 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -2483,7 +2483,6 @@ static int vmw_cmd_dx_check_subresource(struct vmw_private *dev_priv,
offsetof(typeof(*cmd), sid));
cmd = container_of(header, typeof(*cmd), header);
-
return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
VMW_RES_DIRTY_NONE, user_surface_converter,
&cmd->sid, NULL);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
new file mode 100644
index 000000000000..87e4a73b1175
--- /dev/null
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/**************************************************************************
+ *
+ * Copyright 2019 VMware, Inc., Palo Alto, CA., USA
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#include "vmwgfx_drv.h"
+
+/*
+ * Different methods for tracking dirty pages of a buffer object:
+ * VMW_BO_DIRTY_PAGETABLE - Scan the pagetable for hardware dirty bits
+ * VMW_BO_DIRTY_MKWRITE - Write-protect page table entries and record write-
+ * accesses in the VM mkwrite() callback
+ */
+enum vmw_bo_dirty_method {
+ VMW_BO_DIRTY_PAGETABLE,
+ VMW_BO_DIRTY_MKWRITE,
+};
+
+/*
+ * No dirtied pages at scan trigger a transition to the _MKWRITE method,
+ * similarly a certain percentage of dirty pages trigger a transition to
+ * the _PAGETABLE method. How many triggers should we wait for before
+ * changing method?
+ */
+#define VMW_DIRTY_NUM_CHANGE_TRIGGERS 2
+
+/* Percentage to trigger a transition to the _PAGETABLE method */
+#define VMW_DIRTY_PERCENTAGE 10
+
+/**
+ * struct vmw_bo_dirty - Dirty information for buffer objects
+ * @start: First currently dirty bit
+ * @end: Last currently dirty bit + 1
+ * @method: The currently used dirty method
+ * @change_count: Number of consecutive method change triggers
+ * @ref_count: Reference count for this structure
+ * @bitmap_size: The size of the bitmap in bits. Typically equal to the
+ * number of pages in the bo.
+ * @size: The accounting size for this struct.
+ * @bitmap: A bitmap where each bit represents a page. A set bit means a
+ * dirty page. Declared as a flexible array member; the storage for it is
+ * allocated together with the struct.
+ */
+struct vmw_bo_dirty {
+	unsigned long start;
+	unsigned long end;
+	enum vmw_bo_dirty_method method;
+	unsigned int change_count;
+	unsigned int ref_count;
+	unsigned long bitmap_size;
+	size_t size;
+	unsigned long bitmap[];
+};
+
+/**
+ * vmw_bo_dirty_scan_pagetable - Perform a pagetable scan for dirty bits
+ * @vbo: The buffer object to scan
+ *
+ * Scans the pagetable for dirty bits. Clear those bits and modify the
+ * dirty structure with the results. This function may change the
+ * dirty-tracking method.
+ *
+ * A scan that marks no pages increments the change counter and any other
+ * scan resets it. Once the counter exceeds VMW_DIRTY_NUM_CHANGE_TRIGGERS,
+ * the method is switched to VMW_BO_DIRTY_MKWRITE: the bo's range is
+ * write-protected and then scanned once more.
+ */
+static void vmw_bo_dirty_scan_pagetable(struct vmw_buffer_object *vbo)
+{
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+ pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
+ struct address_space *mapping = vbo->base.bdev->dev_mapping;
+ pgoff_t num_marked;
+
+ num_marked = apply_as_clean(mapping,
+ offset, dirty->bitmap_size,
+ offset, &dirty->bitmap[0],
+ &dirty->start, &dirty->end);
+ if (num_marked == 0)
+ dirty->change_count++;
+ else
+ dirty->change_count = 0;
+
+ if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
+ dirty->change_count = 0;
+ dirty->method = VMW_BO_DIRTY_MKWRITE;
+ apply_as_wrprotect(mapping,
+ offset, dirty->bitmap_size);
+ apply_as_clean(mapping,
+ offset, dirty->bitmap_size,
+ offset, &dirty->bitmap[0],
+ &dirty->start, &dirty->end);
+ }
+}
+
+/**
+ * vmw_bo_dirty_scan_mkwrite - Reset the mkwrite dirty-tracking method
+ * @vbo: The buffer object to scan
+ *
+ * Write-protect pages written to so that consecutive write accesses will
+ * trigger a call to mkwrite.
+ *
+ * This function may change the dirty-tracking method.
+ *
+ * Nothing is done if the dirty span is empty. Otherwise the change counter
+ * is incremented when the count returned by apply_as_wrprotect(), taken as
+ * a percentage of the bitmap size, exceeds VMW_DIRTY_PERCENTAGE, and is
+ * reset when it does not. Once the counter exceeds
+ * VMW_DIRTY_NUM_CHANGE_TRIGGERS, the method is switched to
+ * VMW_BO_DIRTY_PAGETABLE and the whole dirty span is set in the bitmap.
+ */
+static void vmw_bo_dirty_scan_mkwrite(struct vmw_buffer_object *vbo)
+{
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+ unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
+ struct address_space *mapping = vbo->base.bdev->dev_mapping;
+ pgoff_t num_marked;
+
+ if (dirty->end <= dirty->start)
+ return;
+
+ num_marked = apply_as_wrprotect(vbo->base.bdev->dev_mapping,
+ dirty->start + offset,
+ dirty->end - dirty->start);
+
+ if (100UL * num_marked / dirty->bitmap_size >
+ VMW_DIRTY_PERCENTAGE) {
+ dirty->change_count++;
+ } else {
+ dirty->change_count = 0;
+ }
+
+ if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
+ pgoff_t start = 0;
+ pgoff_t end = dirty->bitmap_size;
+
+ dirty->method = VMW_BO_DIRTY_PAGETABLE;
+ apply_as_clean(mapping, offset, end, offset, &dirty->bitmap[0],
+ &start, &end);
+ bitmap_clear(&dirty->bitmap[0], 0, dirty->bitmap_size);
+ if (dirty->start < dirty->end)
+ bitmap_set(&dirty->bitmap[0], dirty->start,
+ dirty->end - dirty->start);
+ dirty->change_count = 0;
+ }
+}
+
+
+/**
+ * vmw_bo_dirty_scan - Scan for dirty pages and add them to the dirty
+ * tracking structure
+ * @vbo: The buffer object to scan
+ *
+ * Dispatches to the scan routine matching the tracker's current method.
+ * The routine called may change the dirty tracking method.
+ */
+void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo)
+{
+	switch (vbo->dirty->method) {
+	case VMW_BO_DIRTY_PAGETABLE:
+		vmw_bo_dirty_scan_pagetable(vbo);
+		break;
+	default:
+		/* Every other method is handled as mkwrite. */
+		vmw_bo_dirty_scan_mkwrite(vbo);
+		break;
+	}
+}
+
+/**
+ * vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
+ * @vbo: The buffer object
+ *
+ * This function registers a dirty-tracking user to a buffer object.
+ * A user can be for example a resource or a vma in a special user-space
+ * mapping.
+ *
+ * If the buffer object already has a dirty tracker, only its reference
+ * count is incremented. A new tracker starts with the
+ * VMW_BO_DIRTY_PAGETABLE method when the bo has fewer than
+ * PAGE_SIZE / sizeof(pte_t) pages, and with VMW_BO_DIRTY_MKWRITE otherwise.
+ *
+ * Return: Zero on success, the ttm_mem_global_alloc() error code if memory
+ * accounting fails, or -ENOMEM if the tracker allocation fails.
+ */
+int vmw_bo_dirty_add(struct vmw_buffer_object *vbo)
+{
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+ pgoff_t num_pages = vbo->base.num_pages;
+ size_t size, acc_size;
+ int ret;
+ static struct ttm_operation_ctx ctx = {
+ .interruptible = false,
+ .no_wait_gpu = false
+ };
+
+ if (dirty) {
+ dirty->ref_count++;
+ return 0;
+ }
+
+ size = sizeof(*dirty) + BITS_TO_LONGS(num_pages) * sizeof(long);
+ acc_size = ttm_round_pot(size);
+ ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
+ if (ret) {
+ VMW_DEBUG_USER("Out of graphics memory for buffer object "
+ "dirty tracker.\n");
+ return ret;
+ }
+ dirty = kvzalloc(size, GFP_KERNEL);
+ if (!dirty) {
+ ret = -ENOMEM;
+ goto out_no_dirty;
+ }
+
+ dirty->size = acc_size;
+ dirty->bitmap_size = num_pages;
+ dirty->start = dirty->bitmap_size;
+ dirty->end = 0;
+ dirty->ref_count = 1;
+ if (num_pages < PAGE_SIZE / sizeof(pte_t)) {
+ dirty->method = VMW_BO_DIRTY_PAGETABLE;
+ } else {
+ struct address_space *mapping = vbo->base.bdev->dev_mapping;
+ pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
+
+ dirty->method = VMW_BO_DIRTY_MKWRITE;
+
+ /* Write-protect and then pick up already dirty bits */
+ apply_as_wrprotect(mapping, offset, num_pages);
+ apply_as_clean(mapping, offset, num_pages, offset,
+ &dirty->bitmap[0], &dirty->start, &dirty->end);
+ }
+
+ vbo->dirty = dirty;
+
+ return 0;
+
+out_no_dirty:
+ ttm_mem_global_free(&ttm_mem_glob, acc_size);
+ return ret;
+}
+
+/**
+ * vmw_bo_dirty_release - Release a dirty-tracking user from a buffer object
+ * @vbo: The buffer object
+ *
+ * This function releases a dirty-tracking user from a buffer object.
+ * If the reference count reaches zero, then the dirty-tracking object is
+ * freed, its accounted size is released and the pointer to it cleared.
+ *
+ * A buffer object without a dirty tracker is left untouched.
+ */
+void vmw_bo_dirty_release(struct vmw_buffer_object *vbo)
+{
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+
+ if (dirty && --dirty->ref_count == 0) {
+ size_t acc_size = dirty->size;
+
+ kvfree(dirty);
+ ttm_mem_global_free(&ttm_mem_glob, acc_size);
+ vbo->dirty = NULL;
+ }
+}
+
+/**
+ * vmw_bo_dirty_transfer_to_res - Pick up a resource's dirty region from
+ * its backing mob.
+ * @res: The resource
+ *
+ * This function will pick up all dirty ranges affecting the resource from
+ * its backup mob, and call vmw_resource_dirty_update() once for each
+ * range. The transferred ranges will be cleared from the backing mob's
+ * dirty tracking.
+ *
+ * Only pages inside both the resource's page range and the mob's current
+ * dirty span are scanned, so dirty bits outside the resource stay set.
+ */
+void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res)
+{
+	struct vmw_buffer_object *vbo = res->backup;
+	struct vmw_bo_dirty *dirty = vbo->dirty;
+	pgoff_t start, cur, end;
+	unsigned long res_start = res->backup_offset;
+	unsigned long res_end = res->backup_offset + res->backup_size;
+
+	WARN_ON_ONCE(res_start & ~PAGE_MASK);
+	res_start >>= PAGE_SHIFT;
+	res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
+
+	/* Nothing to do if the resource and the dirty span don't overlap. */
+	if (res_start >= dirty->end || res_end <= dirty->start)
+		return;
+
+	/*
+	 * Clamp the scan to the intersection of the resource and the dirty
+	 * span, as vmw_bo_dirty_clear_res() does. Using max() for the end
+	 * would scan and clear pages past the end of the resource.
+	 */
+	cur = max(res_start, dirty->start);
+	res_end = min(res_end, dirty->end);
+	while (cur < res_end) {
+		unsigned long num;
+
+		start = find_next_bit(&dirty->bitmap[0], res_end, cur);
+		if (start >= res_end)
+			break;
+
+		end = find_next_zero_bit(&dirty->bitmap[0], res_end, start + 1);
+		/* The bit at @end is known to be clear, so skip past it. */
+		cur = end + 1;
+		num = end - start;
+		bitmap_clear(&dirty->bitmap[0], start, num);
+		vmw_resource_dirty_update(res, start, end);
+	}
+
+	/* Shrink the dirty span where the cleared range touched its edges. */
+	if (res_start <= dirty->start && res_end > dirty->start)
+		dirty->start = res_end;
+	if (res_start < dirty->end && res_end >= dirty->end)
+		dirty->end = res_start;
+}
+
+/**
+ * vmw_bo_dirty_clear_res - Clear a resource's dirty region from
+ * its backing mob.
+ * @res: The resource
+ *
+ * This function will clear all dirty ranges affecting the resource from
+ * its backup mob's dirty tracking.
+ *
+ * The cleared range is the intersection of the resource's page range and
+ * the mob's current dirty span. The span is shrunk where the cleared range
+ * touches its start or end.
+ */
+void vmw_bo_dirty_clear_res(struct vmw_resource *res)
+{
+ unsigned long res_start = res->backup_offset;
+ unsigned long res_end = res->backup_offset + res->backup_size;
+ struct vmw_buffer_object *vbo = res->backup;
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+
+ res_start >>= PAGE_SHIFT;
+ res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
+
+ if (res_start >= dirty->end || res_end <= dirty->start)
+ return;
+
+ res_start = max(res_start, dirty->start);
+ res_end = min(res_end, dirty->end);
+ bitmap_clear(&dirty->bitmap[0], res_start, res_end - res_start);
+
+ if (res_start <= dirty->start && res_end > dirty->start)
+ dirty->start = res_end;
+ if (res_start < dirty->end && res_end >= dirty->end)
+ dirty->end = res_start;
+}
+
+/*
+ * vmw_bo_vm_mkwrite - The vmwgfx page_mkwrite() or pfn_mkwrite() callback
+ * @vmf: The fault descriptor.
+ *
+ * With the bo reserved, sets the faulting page's bit in the bo's dirty
+ * bitmap and widens the dirty span to include it. This is only done when
+ * the bo has a dirty tracker using the VMW_BO_DIRTY_MKWRITE method and the
+ * bit is not already set.
+ *
+ * Return: The ttm_bo_vm_reserve() result if reserving fails,
+ * VM_FAULT_SIGBUS if the faulting page is outside the bo, otherwise 0.
+ */
+vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+ vma->vm_private_data;
+ vm_fault_t ret;
+ unsigned long page_offset;
+ struct vmw_buffer_object *vbo =
+ container_of(bo, typeof(*vbo), base);
+
+ ret = ttm_bo_vm_reserve(bo, vmf);
+ if (ret)
+ return ret;
+
+ page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
+ if (unlikely(page_offset >= bo->num_pages)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_unlock;
+ }
+
+ if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE &&
+ !test_bit(page_offset, &vbo->dirty->bitmap[0])) {
+ struct vmw_bo_dirty *dirty = vbo->dirty;
+
+ __set_bit(page_offset, &dirty->bitmap[0]);
+ dirty->start = min(dirty->start, page_offset);
+ dirty->end = max(dirty->end, page_offset + 1);
+ }
+
+out_unlock:
+ reservation_object_unlock(bo->resv);
+ return ret;
+}
+
+
+/*
+ * vmw_bo_vm_fault - The vmwgfx fault() callback
+ * @vmf: The fault descriptor.
+ *
+ * Handles the fault through ttm_bo_vm_fault_reserved() using a local copy
+ * of the vma. When the bo has a dirty tracker using the
+ * VMW_BO_DIRTY_MKWRITE method, VM_WRITE is cleared in that copy. No
+ * prefaulting is requested for vmas flagged VM_RAND_READ.
+ *
+ * Return: The ttm_bo_vm_reserve() result if reserving fails, otherwise the
+ * ttm_bo_vm_fault_reserved() result. The reservation is not unlocked here
+ * when that result is VM_FAULT_RETRY and FAULT_FLAG_RETRY_NOWAIT is unset.
+ */
+vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vm_area_struct cvma = *vma;
+ struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+ vma->vm_private_data;
+ struct vmw_buffer_object *vbo =
+ container_of(bo, struct vmw_buffer_object, base);
+ pgoff_t num_prefault;
+ vm_fault_t ret;
+
+ ret = ttm_bo_vm_reserve(bo, vmf);
+ if (ret)
+ return ret;
+
+ /*
+ * This will cause mkwrite() to be called for each pte on
+ * write-enable vmas.
+ */
+ if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
+ cvma.vm_flags &= ~VM_WRITE;
+
+ num_prefault = (vma->vm_flags & VM_RAND_READ) ? 0 :
+ TTM_BO_VM_NUM_PREFAULT;
+ ret = ttm_bo_vm_fault_reserved(vmf, &cvma, num_prefault);
+ if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
+ return ret;
+
+ reservation_object_unlock(bo->resv);
+ return ret;
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index 88981c4dbae3..d35f4bd32cd9 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -121,6 +121,10 @@ static void vmw_resource_release(struct kref *kref)
}
res->backup_dirty = false;
vmw_resource_mob_detach(res);
+ if (res->dirty)
+ res->func->dirty_free(res);
+ if (res->coherent)
+ vmw_bo_dirty_release(res->backup);
ttm_bo_unreserve(bo);
vmw_bo_unreference(&res->backup);
}
@@ -210,7 +214,9 @@ int vmw_resource_init(struct vmw_private *dev_priv, struct vmw_resource *res,
res->backup_offset = 0;
res->backup_dirty = false;
res->res_dirty = false;
+ res->coherent = false;
res->used_prio = 3;
+ res->dirty = NULL;
if (delay_id)
return 0;
else
@@ -397,6 +403,30 @@ static int vmw_resource_do_validate(struct vmw_resource *res,
vmw_resource_mob_attach(res);
}
+ /*
+ * Handle the case where the backup mob is marked coherent but
+ * the resource isn't.
+ */
+ if (func->dirty_alloc && vmw_resource_mob_attached(res) &&
+ !res->coherent) {
+ if (res->backup->dirty && !res->dirty) {
+ ret = func->dirty_alloc(res);
+ if (ret)
+ return ret;
+ } else if (!res->backup->dirty && res->dirty) {
+ func->dirty_free(res);
+ }
+ }
+
+ /*
+ * Transfer the dirty regions to the resource and update
+ * the resource.
+ */
+ if (res->dirty) {
+ vmw_bo_dirty_transfer_to_res(res);
+ return func->dirty_sync(res);
+ }
+
return 0;
out_bind_failed:
@@ -435,16 +465,28 @@ void vmw_resource_unreserve(struct vmw_resource *res,
if (switch_backup && new_backup != res->backup) {
if (res->backup) {
vmw_resource_mob_detach(res);
+ if (res->coherent)
+ vmw_bo_dirty_release(res->backup);
vmw_bo_unreference(&res->backup);
}
if (new_backup) {
res->backup = vmw_bo_reference(new_backup);
+
+ /*
+ * The validation code should already have added a
+ * dirty tracker here.
+ */
+ WARN_ON(res->coherent && !new_backup->dirty);
+
vmw_resource_mob_attach(res);
} else {
res->backup = NULL;
}
+ } else if (switch_backup && res->coherent) {
+ vmw_bo_dirty_release(res->backup);
}
+
if (switch_backup)
res->backup_offset = new_backup_offset;
@@ -1009,3 +1051,18 @@ enum vmw_res_type vmw_res_type(const struct vmw_resource *res)
{
return res->func->res_type;
}
+
+/**
+ * vmw_resource_dirty_update - Update a resource's dirty tracker with a
+ * sequential range of touched backing store memory.
+ * @res: The resource.
+ * @start: The first page touched.
+ * @end: The last page touched + 1.
+ */
+void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t start,
+ pgoff_t end)
+{
+ if (res->dirty)
+ res->func->dirty_range_add(res, start << PAGE_SHIFT,
+ end << PAGE_SHIFT);
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
index 984e588c62ca..c85144286cfe 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
@@ -71,6 +71,12 @@ struct vmw_user_resource_conv {
* @commit_notify: If the resource is a command buffer managed resource,
* callback to notify that a define or remove command
* has been committed to the device.
+ * @dirty_alloc: Allocate a dirty tracker. NULL if dirty-tracking is not
+ * supported.
+ * @dirty_free: Free the dirty tracker.
+ * @dirty_sync: Upload the dirty mob contents to the resource.
+ * @dirty_range_add: Add a sequential dirty range to the resource
+ * dirty tracker.
*/
struct vmw_res_func {
enum vmw_res_type res_type;
@@ -90,6 +96,11 @@ struct vmw_res_func {
struct ttm_validate_buffer *val_buf);
void (*commit_notify)(struct vmw_resource *res,
enum vmw_cmdbuf_res_state state);
+ int (*dirty_alloc)(struct vmw_resource *res);
+ void (*dirty_free)(struct vmw_resource *res);
+ int (*dirty_sync)(struct vmw_resource *res);
+ void (*dirty_range_add)(struct vmw_resource *res, size_t start,
+ size_t end);
};
/**
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
index f611b2290a1b..5b0c928bb5ba 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
@@ -33,8 +33,13 @@
* struct vmw_validation_bo_node - Buffer object validation metadata.
* @base: Metadata used for TTM reservation- and validation.
* @hash: A hash entry used for the duplicate detection hash table.
+ * @coherent_count: If switching backup buffers, number of new coherent
+ * resources that will have this buffer as a backup buffer.
* @as_mob: Validate as mob.
* @cpu_blit: Validate for cpu blit access.
+ * @coherent_alloced: In switching backup buffers for coherent resources:
+ * The bo dirty tracker has been allocated and needs to be freed if
+ * reverting.
*
* Bit fields are used since these structures are allocated and freed in
* large numbers and space conservation is desired.
@@ -42,6 +47,7 @@
struct vmw_validation_bo_node {
struct ttm_validate_buffer base;
struct drm_hash_item hash;
+ unsigned int coherent_count;
u32 as_mob : 1;
u32 cpu_blit : 1;
};
@@ -459,6 +465,19 @@ int vmw_validation_res_reserve(struct vmw_validation_context *ctx,
if (ret)
goto out_unreserve;
}
+
+ if (val->switching_backup && val->new_backup &&
+ res->coherent) {
+ struct vmw_validation_bo_node *bo_node =
+ vmw_validation_find_bo_dup(ctx,
+ val->new_backup);
+
+ if (WARN_ON(!bo_node)) {
+ ret = -EINVAL;
+ goto out_unreserve;
+ }
+ bo_node->coherent_count++;
+ }
}
return 0;
@@ -562,6 +581,9 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
int ret;
list_for_each_entry(entry, &ctx->bo_list, base.head) {
+ struct vmw_buffer_object *vbo =
+ container_of(entry->base.bo, typeof(*vbo), base);
+
if (entry->cpu_blit) {
struct ttm_operation_ctx ctx = {
.interruptible = intr,
@@ -576,6 +598,27 @@ int vmw_validation_bo_validate(struct vmw_validation_context *ctx, bool intr)
}
if (ret)
return ret;
+
+ /*
+ * Rather than having the resource code allocate the bo
+ * dirty tracker in resource_unreserve() where we can't fail,
+ * do it here when validating the buffer object.
+ */
+ if (entry->coherent_count) {
+ unsigned int coherent_count = entry->coherent_count;
+
+ while (coherent_count) {
+ ret = vmw_bo_dirty_add(vbo);
+ if (ret)
+ return ret;
+
+ coherent_count--;
+ }
+ entry->coherent_count -= coherent_count;
+ }
+
+ if (vbo->dirty)
+ vmw_bo_dirty_scan(vbo);
}
return 0;
}
@@ -828,3 +871,34 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
ctx->mem_size_left += size;
return 0;
}
+
+/**
+ * vmw_validation_bo_backoff - Unreserve buffer objects registered with a
+ * validation context
+ * @ctx: The validation context
+ *
+ * This function unreserves the buffer objects previously reserved using
+ * vmw_validation_bo_reserve. It's typically used as part of an error path
+ */
+void vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
+{
+ struct vmw_validation_bo_node *entry;
+
+ /*
+ * Switching coherent resource backup buffers failed.
+ * Release corresponding buffer object dirty trackers.
+ */
+ list_for_each_entry(entry, &ctx->bo_list, base.head) {
+ if (entry->coherent_count) {
+ unsigned int coherent_count = entry->coherent_count;
+ struct vmw_buffer_object *vbo =
+ container_of(entry->base.bo, typeof(*vbo),
+ base);
+
+ while (coherent_count--)
+ vmw_bo_dirty_release(vbo);
+ }
+ }
+
+ ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
+}
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
index 523f6ac5c335..058c7f2fbf83 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
@@ -172,20 +172,6 @@ vmw_validation_bo_reserve(struct vmw_validation_context *ctx,
NULL);
}
-/**
- * vmw_validation_bo_backoff - Unreserve buffer objects registered with a
- * validation context
- * @ctx: The validation context
- *
- * This function unreserves the buffer objects previously reserved using
- * vmw_validation_bo_reserve. It's typically used as part of an error path
- */
-static inline void
-vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
-{
- ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
-}
-
/**
* vmw_validation_bo_fence - Unreserve and fence buffer objects registered
* with a validation context
@@ -268,4 +254,6 @@ int vmw_validation_preload_res(struct vmw_validation_context *ctx,
unsigned int size);
void vmw_validation_res_set_dirty(struct vmw_validation_context *ctx,
void *val_private, u32 dirty);
+void vmw_validation_bo_backoff(struct vmw_validation_context *ctx);
+
#endif
--
2.20.1
With the vmwgfx dirty tracking, the default TTM fault handler is not
completely sufficient (vmwgfx needs to modify the vma->vm_flags member,
and also needs to restrict the number of prefaults).
We also want to replicate the new ttm_bo_vm_reserve() functionality.
So start turning the TTM vm code into helpers: ttm_bo_vm_fault_reserved()
and ttm_bo_vm_reserve(), and provide a default TTM fault handler for other
drivers to use.
Cc: "Christian König" <[email protected]>
Signed-off-by: Thomas Hellstrom <[email protected]>
---
drivers/gpu/drm/ttm/ttm_bo_vm.c | 170 ++++++++++++++++++++------------
include/drm/ttm/ttm_bo_api.h | 10 ++
2 files changed, 116 insertions(+), 64 deletions(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index bfb25b81fed7..3bd28fb97124 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -42,8 +42,6 @@
#include <linux/uaccess.h>
#include <linux/mem_encrypt.h>
-#define TTM_BO_VM_NUM_PREFAULT 16
-
static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
struct vm_fault *vmf)
{
@@ -106,31 +104,30 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
+ page_offset;
}
-static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
+/**
+ * ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm callback
+ * @bo: The buffer object
+ * @vmf: The fault structure handed to the callback
+ *
+ * vm callbacks like fault() and *_mkwrite() allow for the mmap_sem to be dropped
+ * during long waits, and after the wait the callback will be restarted. This
+ * is to allow other threads using the same virtual memory space concurrent
+ * access to map(), unmap() completely unrelated buffer objects. TTM buffer
+ * object reservations sometimes wait for GPU and should therefore be
+ * considered long waits. This function reserves the buffer object interruptibly
+ * taking this into account. Starvation is avoided by the vm system not
+ * allowing too many repeated restarts.
+ * This function is intended to be used in customized fault() and _mkwrite()
+ * handlers.
+ *
+ * Return:
+ * 0 on success and the bo was reserved.
+ * VM_FAULT_RETRY if blocking wait.
+ * VM_FAULT_NOPAGE if blocking wait and retrying was not allowed.
+ */
+vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
+ struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
- struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
- vma->vm_private_data;
- struct ttm_bo_device *bdev = bo->bdev;
- unsigned long page_offset;
- unsigned long page_last;
- unsigned long pfn;
- struct ttm_tt *ttm = NULL;
- struct page *page;
- int err;
- int i;
- vm_fault_t ret = VM_FAULT_NOPAGE;
- unsigned long address = vmf->address;
- struct ttm_mem_type_manager *man =
- &bdev->man[bo->mem.mem_type];
- struct vm_area_struct cvma;
-
- /*
- * Work around locking order reversal in fault / nopfn
- * between mmap_sem and bo_reserve: Perform a trylock operation
- * for reserve, and if it fails, retry the fault after waiting
- * for the buffer to become unreserved.
- */
if (unlikely(!reservation_object_trylock(bo->resv))) {
if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
@@ -151,14 +148,56 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
return VM_FAULT_NOPAGE;
}
+ return 0;
+}
+EXPORT_SYMBOL(ttm_bo_vm_reserve);
+
+/**
+ * ttm_bo_vm_fault_reserved - TTM fault helper
+ * @vmf: The struct vm_fault given as argument to the fault callback
+ * @cvma: The struct vm_area_struct affected. Note that this may be a
+ * copy of the real vma object if the caller needs, for example, VM
+ * flags to be temporarily altered while determining the page protection.
+ * @num_prefault: Maximum number of prefault pages. The caller may want to
+ * specify this based on madvise settings and the size of the GPU object
+ * backed by the memory.
+ *
+ * This function inserts one or more page table entries pointing to the
+ * memory backing the buffer object, and then returns a return code
+ * instructing the caller to retry the page access.
+ *
+ * Return:
+ * VM_FAULT_NOPAGE on success or pending signal
+ * VM_FAULT_SIGBUS on unspecified error
+ * VM_FAULT_OOM on out-of-memory
+ * VM_FAULT_RETRY if retryable wait
+ */
+vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
+ struct vm_area_struct *cvma,
+ pgoff_t num_prefault)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+ vma->vm_private_data;
+ struct ttm_bo_device *bdev = bo->bdev;
+ unsigned long page_offset;
+ unsigned long page_last;
+ unsigned long pfn;
+ struct ttm_tt *ttm = NULL;
+ struct page *page;
+ int err;
+ pgoff_t i;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
+ unsigned long address = vmf->address;
+ struct ttm_mem_type_manager *man =
+ &bdev->man[bo->mem.mem_type];
+
/*
* Refuse to fault imported pages. This should be handled
* (if at all) by redirecting mmap to the exporter.
*/
- if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
- }
+ if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
+ return VM_FAULT_SIGBUS;
if (bdev->driver->fault_reserve_notify) {
struct dma_fence *moving = dma_fence_get(bo->moving);
@@ -169,11 +208,9 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
break;
case -EBUSY:
case -ERESTARTSYS:
- ret = VM_FAULT_NOPAGE;
- goto out_unlock;
+ return VM_FAULT_NOPAGE;
default:
- ret = VM_FAULT_SIGBUS;
- goto out_unlock;
+ return VM_FAULT_SIGBUS;
}
if (bo->moving != moving) {
@@ -189,24 +226,15 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
* move.
*/
ret = ttm_bo_vm_fault_idle(bo, vmf);
- if (unlikely(ret != 0)) {
- if (ret == VM_FAULT_RETRY &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* The BO has already been unreserved. */
- return ret;
- }
-
- goto out_unlock;
- }
+ if (unlikely(ret != 0))
+ return ret;
err = ttm_mem_io_lock(man, true);
- if (unlikely(err != 0)) {
- ret = VM_FAULT_NOPAGE;
- goto out_unlock;
- }
+ if (unlikely(err != 0))
+ return VM_FAULT_NOPAGE;
err = ttm_mem_io_reserve_vm(bo);
if (unlikely(err != 0)) {
ret = VM_FAULT_SIGBUS;
goto out_io_unlock;
}
@@ -220,17 +248,11 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
goto out_io_unlock;
}
- /*
- * Make a local vma copy to modify the page_prot member
- * and vm_flags if necessary. The vma parameter is protected
- * by mmap_sem in write mode.
- */
- cvma = *vma;
- cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
+ cvma->vm_page_prot = vm_get_page_prot(cvma->vm_flags);
if (bo->mem.bus.is_iomem) {
- cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
- cvma.vm_page_prot);
+ cvma->vm_page_prot = ttm_io_prot(bo->mem.placement,
+ cvma->vm_page_prot);
} else {
struct ttm_operation_ctx ctx = {
.interruptible = false,
@@ -240,8 +262,8 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
};
ttm = bo->ttm;
- cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
- cvma.vm_page_prot);
+ cvma->vm_page_prot = ttm_io_prot(bo->mem.placement,
+ cvma->vm_page_prot);
/* Allocate all page at once, most common usage */
if (ttm_tt_populate(ttm, &ctx)) {
@@ -254,10 +276,11 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
* Speculatively prefault a number of pages. Only error on
* first page.
*/
- for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
+ for (i = 0; i < num_prefault; ++i) {
if (bo->mem.bus.is_iomem) {
/* Iomem should not be marked encrypted */
- cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
+ cvma->vm_page_prot =
+ pgprot_decrypted(cvma->vm_page_prot);
pfn = ttm_bo_io_mem_pfn(bo, page_offset);
} else {
page = ttm->pages[page_offset];
@@ -273,10 +296,10 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
}
if (vma->vm_flags & VM_MIXEDMAP)
- ret = vmf_insert_mixed(&cvma, address,
+ ret = vmf_insert_mixed(cvma, address,
__pfn_to_pfn_t(pfn, PFN_DEV));
else
- ret = vmf_insert_pfn(&cvma, address, pfn);
+ ret = vmf_insert_pfn(cvma, address, pfn);
/*
* Somebody beat us to this PTE or prefaulting to
@@ -295,7 +318,26 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
out_io_unlock:
ttm_mem_io_unlock(man);
-out_unlock:
+ return ret;
+}
+EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
+
+static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct vm_area_struct cvma = *vma;
+ struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
+ vma->vm_private_data;
+ vm_fault_t ret;
+
+ ret = ttm_bo_vm_reserve(bo, vmf);
+ if (ret)
+ return ret;
+
+ ret = ttm_bo_vm_fault_reserved(vmf, &cvma, TTM_BO_VM_NUM_PREFAULT);
+ if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
+ return ret;
+
reservation_object_unlock(bo->resv);
return ret;
}
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 49d9cdfc58f2..bebfa16426ca 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -768,4 +768,14 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
struct ttm_operation_ctx *ctx);
void ttm_bo_swapout_all(struct ttm_bo_device *bdev);
int ttm_bo_wait_unreserved(struct ttm_buffer_object *bo);
+
+/* Default number of pre-faulted pages in the TTM fault handler */
+#define TTM_BO_VM_NUM_PREFAULT 16
+
+vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
+ struct vm_fault *vmf);
+
+vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
+ struct vm_area_struct *cvma,
+ pgoff_t num_prefault);
#endif
--
2.20.1
On 4/12/19 9:04 AM, Thomas Hellstrom wrote:
> Driver fault callbacks are allowed to drop the mmap_sem when expecting
> long hardware waits to avoid blocking other mm users. Allow the mkwrite
> callbacks to do the same by returning early on VM_FAULT_RETRY.
>
> In particular we want to be able to drop the mmap_sem when waiting for
> a reservation object lock on a GPU buffer object. These locks may be
> held while waiting for the GPU.
>
> Cc: Andrew Morton <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Huang Ying <[email protected]>
> Cc: Souptick Joarder <[email protected]>
> Cc: "Jérôme Glisse" <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
>
> Signed-off-by: Thomas Hellstrom <[email protected]>
Reviewed-by: Ralph Campbell <[email protected]>
> ---
> mm/memory.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index e11ca9dd823f..a95b4a3b1ae2 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
> ret = vmf->vma->vm_ops->page_mkwrite(vmf);
> /* Restore original flags so that caller is not surprised */
> vmf->flags = old_flags;
> - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE)))
A very minor nit, for consistency elsewhere in mm/memory.c,
could you make this be:
(VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)
> return ret;
> if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> lock_page(page);
> @@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> vmf->flags |= FAULT_FLAG_MKWRITE;
> ret = vma->vm_ops->pfn_mkwrite(vmf);
> - if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
> + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE))
> return ret;
> return finish_mkwrite_fault(vmf);
> }
> @@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp || (tmp &
> - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> + (VM_FAULT_ERROR | VM_FAULT_RETRY |
> + VM_FAULT_NOPAGE)))) {
> put_page(vmf->page);
> return tmp;
> }
> @@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> unlock_page(vmf->page);
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp ||
> - (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> + (tmp & (VM_FAULT_ERROR | VM_FAULT_RETRY |
> + VM_FAULT_NOPAGE)))) {
> put_page(vmf->page);
> return tmp;
> }
>
On 4/12/19 9:04 AM, Thomas Hellstrom wrote:
> This is basically apply_to_page_range with added functionality:
> Allocating missing parts of the page table becomes optional, which
> means that the function can be guaranteed not to error if allocation
> is disabled. Also passing of the closure struct and callback function
> becomes different and more in line with how things are done elsewhere.
>
> Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range
>
> The reason for not using the page-walk code is that we want to perform
> the page-walk on vmas pointing to an address space without requiring the
> mmap_sem to be held rather thand on vmas belonging to a process with the
s/thand/than/
> mmap_sem held.
>
> Notable changes since RFC:
> Don't export apply_to_pfn range.
>
> Cc: Andrew Morton <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Huang Ying <[email protected]>
> Cc: Souptick Joarder <[email protected]>
> Cc: "Jérôme Glisse" <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Thomas Hellstrom <[email protected]>
Reviewed-by: Ralph Campbell <[email protected]>
> ---
> include/linux/mm.h | 10 ++++
> mm/memory.c | 130 ++++++++++++++++++++++++++++++++++-----------
> 2 files changed, 108 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 80bb6408fe73..b7dd4ddd6efb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
> extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
> unsigned long size, pte_fn_t fn, void *data);
>
> +struct pfn_range_apply;
> +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
> + struct pfn_range_apply *closure);
> +struct pfn_range_apply {
> + struct mm_struct *mm;
> + pter_fn_t ptefn;
> + unsigned int alloc;
> +};
> +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> + unsigned long address, unsigned long size);
>
> #ifdef CONFIG_PAGE_POISONING
> extern bool page_poisoning_enabled(void);
> diff --git a/mm/memory.c b/mm/memory.c
> index a95b4a3b1ae2..60d67158964f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
> }
> EXPORT_SYMBOL(vm_iomap_memory);
>
> -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
> + unsigned long addr, unsigned long end)
> {
> pte_t *pte;
> int err;
> pgtable_t token;
> spinlock_t *uninitialized_var(ptl);
>
> - pte = (mm == &init_mm) ?
> + pte = (closure->mm == &init_mm) ?
> pte_alloc_kernel(pmd, addr) :
> - pte_alloc_map_lock(mm, pmd, addr, &ptl);
> + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
> if (!pte)
> return -ENOMEM;
>
> @@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> token = pmd_pgtable(*pmd);
>
> do {
> - err = fn(pte++, token, addr, data);
> + err = closure->ptefn(pte++, token, addr, closure);
> if (err)
> break;
> } while (addr += PAGE_SIZE, addr != end);
>
> arch_leave_lazy_mmu_mode();
>
> - if (mm != &init_mm)
> + if (closure->mm != &init_mm)
> pte_unmap_unlock(pte-1, ptl);
> return err;
> }
>
> -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
> + unsigned long addr, unsigned long end)
> {
> pmd_t *pmd;
> unsigned long next;
> - int err;
> + int err = 0;
>
> BUG_ON(pud_huge(*pud));
>
> - pmd = pmd_alloc(mm, pud, addr);
> + pmd = pmd_alloc(closure->mm, pud, addr);
> if (!pmd)
> return -ENOMEM;
> +
> do {
> next = pmd_addr_end(addr, end);
> - err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
> + if (!closure->alloc && pmd_none_or_clear_bad(pmd))
> + continue;
> + err = apply_to_pte_range(closure, pmd, addr, next);
> if (err)
> break;
> } while (pmd++, addr = next, addr != end);
> return err;
> }
>
> -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
> + unsigned long addr, unsigned long end)
> {
> pud_t *pud;
> unsigned long next;
> - int err;
> + int err = 0;
>
> - pud = pud_alloc(mm, p4d, addr);
> + pud = pud_alloc(closure->mm, p4d, addr);
> if (!pud)
> return -ENOMEM;
> +
> do {
> next = pud_addr_end(addr, end);
> - err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
> + if (!closure->alloc && pud_none_or_clear_bad(pud))
> + continue;
> + err = apply_to_pmd_range(closure, pud, addr, next);
> if (err)
> break;
> } while (pud++, addr = next, addr != end);
> return err;
> }
>
> -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
> + unsigned long addr, unsigned long end)
> {
> p4d_t *p4d;
> unsigned long next;
> - int err;
> + int err = 0;
>
> - p4d = p4d_alloc(mm, pgd, addr);
> + p4d = p4d_alloc(closure->mm, pgd, addr);
> if (!p4d)
> return -ENOMEM;
> +
> do {
> next = p4d_addr_end(addr, end);
> - err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
> + if (!closure->alloc && p4d_none_or_clear_bad(p4d))
> + continue;
> + err = apply_to_pud_range(closure, p4d, addr, next);
> if (err)
> break;
> } while (p4d++, addr = next, addr != end);
> return err;
> }
>
> -/*
> - * Scan a region of virtual memory, filling in page tables as necessary
> - * and calling a provided function on each leaf page table.
> +/**
> + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
> + * function on each leaf page table entry
> + * @closure: Details about how to scan and what function to apply
> + * @addr: Start virtual address
> + * @size: Size of the region
> + *
> + * If @closure->alloc is set to 1, the function will fill in the page table
> + * as necessary. Otherwise it will skip non-present parts.
> + * Note: The caller must ensure that the range does not contain huge pages.
> + * The caller must also assure that the proper mmu_notifier functions are
> + * called. Either in the pte leaf function or before and after the call to
> + * apply_to_pfn_range.
> + *
> + * Returns: Zero on success. If the provided function returns a non-zero status,
s/Returns/Return/
See Documentation/doc-guide/kernel-doc.rst
> + * the page table walk will terminate and that status will be returned.
> + * If @closure->alloc is set to 1, then this function may also return memory
> + * allocation errors arising from allocating page table memory.
> */
> -int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> - unsigned long size, pte_fn_t fn, void *data)
> +int apply_to_pfn_range(struct pfn_range_apply *closure,
> + unsigned long addr, unsigned long size)
> {
> pgd_t *pgd;
> unsigned long next;
> @@ -2049,16 +2069,62 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> if (WARN_ON(addr >= end))
> return -EINVAL;
>
> - pgd = pgd_offset(mm, addr);
> + pgd = pgd_offset(closure->mm, addr);
> do {
> next = pgd_addr_end(addr, end);
> - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
> + if (!closure->alloc && pgd_none_or_clear_bad(pgd))
> + continue;
> + err = apply_to_p4d_range(closure, pgd, addr, next);
> if (err)
> break;
> } while (pgd++, addr = next, addr != end);
>
> return err;
> }
> +
> +/**
> + * struct page_range_apply - Closure structure for apply_to_page_range()
> + * @pter: The base closure structure we derive from
> + * @fn: The leaf pte function to call
> + * @data: The leaf pte function closure
> + */
> +struct page_range_apply {
> + struct pfn_range_apply pter;
> + pte_fn_t fn;
> + void *data;
> +};
> +
> +/*
> + * Callback wrapper to enable use of apply_to_pfn_range for
> + * the apply_to_page_range interface
> + */
> +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *pter)
> +{
> + struct page_range_apply *pra =
> + container_of(pter, typeof(*pra), pter);
> +
> + return pra->fn(pte, token, addr, pra->data);
> +}
> +
> +/*
> + * Scan a region of virtual memory, filling in page tables as necessary
> + * and calling a provided function on each leaf page table.
> + */
> +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> + unsigned long size, pte_fn_t fn, void *data)
> +{
> + struct page_range_apply pra = {
> + .pter = {.mm = mm,
> + .alloc = 1,
> + .ptefn = apply_to_page_range_wrapper },
> + .fn = fn,
> + .data = data
> + };
> +
> + return apply_to_pfn_range(&pra.pter, addr, size);
> +}
> EXPORT_SYMBOL_GPL(apply_to_page_range);
>
> /*
>
On 4/12/19 9:04 AM, Thomas Hellstrom wrote:
> Add two utilities to a) write-protect and b) clean all ptes pointing into
> a range of an address space
A period at the end, please.
> The utilities are intended to aid in tracking dirty pages (either
> driver-allocated system memory or pci device memory).
> The write-protect utility should be used in conjunction with
> page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
> accesses. Typically one would want to use this on sparse accesses into
> large memory regions. The clean utility should be used to utilize
> hardware dirtying functionality and avoid the overhead of page-faults,
> typically on large accesses into small memory regions.
>
> The added file "apply_as_range.c" is initially listed as maintained by
> VMware under our DRM driver. If somebody would like it elsewhere,
> that's of course no problem.
>
> Notable changes since RFC:
> - Added comments to help avoid the usage of these function for VMAs
> it's not intended for. We also do advisory checks on the vm_flags and
> warn on illegal usage.
> - Perform the pte modifications the same way softdirty does.
> - Add mmu_notifier range invalidation calls.
> - Add a config option so that this code is not unconditionally included.
> - Tell the mmu_gather code about pending tlb flushes.
>
> Cc: Andrew Morton <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Huang Ying <[email protected]>
> Cc: Souptick Joarder <[email protected]>
> Cc: "Jérôme Glisse" <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Thomas Hellstrom <[email protected]>
Reviewed-by: Ralph Campbell <[email protected]>
> ---
> MAINTAINERS | 1 +
> include/linux/mm.h | 9 +-
> mm/Kconfig | 3 +
> mm/Makefile | 3 +-
> mm/apply_as_range.c | 295 ++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 309 insertions(+), 2 deletions(-)
> create mode 100644 mm/apply_as_range.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 35e6357f9d30..bc243ffcb840 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -4971,6 +4971,7 @@ T: git git://people.freedesktop.org/~thomash/linux
> S: Supported
> F: drivers/gpu/drm/vmwgfx/
> F: include/uapi/drm/vmwgfx_drm.h
> +F: mm/apply_as_range.c
>
> DRM DRIVERS
> M: David Airlie <[email protected]>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b7dd4ddd6efb..62f24dd0bfa0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2642,7 +2642,14 @@ struct pfn_range_apply {
> };
> extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> unsigned long address, unsigned long size);
> -
> +unsigned long apply_as_wrprotect(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr);
> +unsigned long apply_as_clean(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr,
> + pgoff_t bitmap_pgoff,
> + unsigned long *bitmap,
> + pgoff_t *start,
> + pgoff_t *end);
> #ifdef CONFIG_PAGE_POISONING
> extern bool page_poisoning_enabled(void);
> extern void kernel_poison_pages(struct page *page, int numpages, int enable);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 25c71eb8a7db..80e41cdbb4ae 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -758,4 +758,7 @@ config GUP_BENCHMARK
> config ARCH_HAS_PTE_SPECIAL
> bool
>
> +config AS_DIRTY_HELPERS
> + bool
> +
> endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index d210cc9d6f80..b295717be856 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
> mm_init.o mmu_context.o percpu.o slab_common.o \
> compaction.o vmacache.o \
> interval_tree.o list_lru.o workingset.o \
> - debug.o $(mmu-y)
> + debug.o apply_as_range.o $(mmu-y)
>
> obj-y += init-mm.o
> obj-y += memblock.o
> @@ -99,3 +99,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
> obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
> obj-$(CONFIG_HMM) += hmm.o
> obj-$(CONFIG_MEMFD_CREATE) += memfd.o
> +obj-$(CONFIG_AS_DIRTY_HELPERS) += apply_as_range.o
> diff --git a/mm/apply_as_range.c b/mm/apply_as_range.c
> new file mode 100644
> index 000000000000..32d28619aec5
> --- /dev/null
> +++ b/mm/apply_as_range.c
> @@ -0,0 +1,295 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/mm.h>
> +#include <linux/mm_types.h>
> +#include <linux/hugetlb.h>
> +#include <linux/bitops.h>
> +#include <linux/mmu_notifier.h>
> +#include <asm/cacheflush.h>
> +#include <asm/tlbflush.h>
> +
> +/**
> + * struct apply_as - Closure structure for apply_as_range
> + * @base: struct pfn_range_apply we derive from
> + * @start: Address of first modified pte
> + * @end: Address of last modified pte + 1
> + * @total: Total number of modified ptes
> + * @vma: Pointer to the struct vm_area_struct we're currently operating on
> + */
> +struct apply_as {
> + struct pfn_range_apply base;
> + unsigned long start, end;
One variable defined per line, please.
> + unsigned long total;
> + const struct vm_area_struct *vma;
> +};
> +
> +/**
> + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte
> + * @pte: Pointer to the pte
> + * @token: Page table token, see apply_to_pfn_range()
> + * @addr: The virtual page address
> + * @closure: Pointer to a struct pfn_range_apply embedded in a
> + * struct apply_as
> + *
> + * The function write-protects a pte and records the range in
> + * virtual address space of touched ptes for efficient range TLB flushes.
> + *
> + * Return: Always zero.
> + */
> +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *closure)
> +{
> + struct apply_as *aas = container_of(closure, typeof(*aas), base);
> + pte_t ptent = *pte;
> +
> + if (pte_write(ptent)) {
> + ptent = ptep_modify_prot_start(closure->mm, addr, pte);
> + ptent = pte_wrprotect(ptent);
> + ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
> + aas->total++;
> + aas->start = min(aas->start, addr);
> + aas->end = max(aas->end, addr + PAGE_SIZE);
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * struct apply_as_clean - Closure structure for apply_as_clean
> + * @base: struct apply_as we derive from
> + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap
> + * @bitmap: Bitmap with one bit for each page offset in the address_space range
> + * covered.
> + * @start: Address_space page offset of first modified pte relative
> + * to @bitmap_pgoff
> + * @end: Address_space page offset of last modified pte relative
> + * to @bitmap_pgoff
> + */
> +struct apply_as_clean {
> + struct apply_as base;
> + pgoff_t bitmap_pgoff;
> + unsigned long *bitmap;
> + pgoff_t start, end;
One variable defined per line, please.
> +};
> +
> +/**
> + * apply_pt_clean - Leaf pte callback to clean a pte
> + * @pte: Pointer to the pte
> + * @token: Page table token, see apply_to_pfn_range()
> + * @addr: The virtual page address
> + * @closure: Pointer to a struct pfn_range_apply embedded in a
> + * struct apply_as_clean
> + *
> + * The function cleans a pte and records the range in
> + * virtual address space of touched ptes for efficient TLB flushes.
> + * It also records dirty ptes in a bitmap representing page offsets
> + * in the address_space, as well as the first and last of the bits
> + * touched.
> + *
> + * Return: Always zero.
> + */
> +static int apply_pt_clean(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *closure)
> +{
> + struct apply_as *aas = container_of(closure, typeof(*aas), base);
> + struct apply_as_clean *clean = container_of(aas, typeof(*clean), base);
> + pte_t ptent = *pte;
> +
> + if (pte_dirty(ptent)) {
> + pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) +
> + aas->vma->vm_pgoff - clean->bitmap_pgoff;
> +
> + ptent = ptep_modify_prot_start(closure->mm, addr, pte);
> + ptent = pte_mkclean(ptent);
> + ptep_modify_prot_commit(closure->mm, addr, pte, ptent);
> +
> + aas->total++;
> + aas->start = min(aas->start, addr);
> + aas->end = max(aas->end, addr + PAGE_SIZE);
> +
> + __set_bit(pgoff, clean->bitmap);
> + clean->start = min(clean->start, pgoff);
> + clean->end = max(clean->end, pgoff + 1);
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * apply_as_range - Apply a pte callback to all PTEs pointing into a range
> + * of an address_space.
> + * @mapping: Pointer to the struct address_space
> + * @aas: Closure structure
> + * @first_index: First page offset in the address_space
> + * @nr: Number of incremental page offsets to cover
> + *
> + * Return: Number of ptes touched. Note that this number might be larger
> + * than @nr if there are overlapping vmas
> + */
> +static unsigned long apply_as_range(struct address_space *mapping,
> + struct apply_as *aas,
> + pgoff_t first_index, pgoff_t nr)
> +{
> + struct vm_area_struct *vma;
> + pgoff_t vba, vea, cba, cea;
> + unsigned long start_addr, end_addr;
> + struct mmu_notifier_range range;
> +
> + i_mmap_lock_read(mapping);
> + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
> + first_index + nr - 1) {
> + unsigned long vm_flags = READ_ONCE(vma->vm_flags);
> +
> + /*
> + * We can only do advisory flag tests below, since we can't
> + * require the vm's mmap_sem to be held to protect the flags.
> + * Therefore, callers that strictly depend on specific mmap
> + * flags to remain constant throughout the operation must
> + * either ensure those flags are immutable for all relevant
> + * vmas or can't use this function. Fixing this properly would
> + * require the vma::vm_flags to be protected by a separate
> + * lock taken after the i_mmap_lock
> + */
> +
> + /* Skip non-applicable VMAs */
> + if ((vm_flags & (VM_SHARED | VM_WRITE)) !=
> + (VM_SHARED | VM_WRITE))
> + continue;
> +
> + /* Warn on and skip VMAs whose flags indicate illegal usage */
> + if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO))
> + continue;
> +
> + /* Clip to the vma */
> + vba = vma->vm_pgoff;
> + vea = vba + vma_pages(vma);
> + cba = first_index;
> + cba = max(cba, vba);
> + cea = first_index + nr;
> + cea = min(cea, vea);
> +
> + /* Translate to virtual address */
> + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
> + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
> + if (start_addr >= end_addr)
> + continue;
> +
> + aas->base.mm = vma->vm_mm;
> + aas->vma = vma;
> + aas->start = end_addr;
> + aas->end = start_addr;
> +
> + mmu_notifier_range_init(&range, vma->vm_mm,
> + start_addr, end_addr);
> + mmu_notifier_invalidate_range_start(&range);
> +
> + /* Needed when we only change protection? */
> + flush_cache_range(vma, start_addr, end_addr);
> +
> + /*
> + * We're not using tlb_gather_mmu() since typically
> + * only a small subrange of PTEs are affected.
> + */
> + inc_tlb_flush_pending(vma->vm_mm);
> +
> + /* Should not error since aas->base.alloc == 0 */
> + WARN_ON(apply_to_pfn_range(&aas->base, start_addr,
> + end_addr - start_addr));
> + if (aas->end > aas->start)
> + flush_tlb_range(vma, aas->start, aas->end);
> +
> + mmu_notifier_invalidate_range_end(&range);
> + dec_tlb_flush_pending(vma->vm_mm);
> + }
> + i_mmap_unlock_read(mapping);
> +
> + return aas->total;
> +}
> +
> +/**
> + * apply_as_wrprotect - Write-protect all ptes in an address_space range
> + * @mapping: The address_space we want to write protect
> + * @first_index: The first page offset in the range
> + * @nr: Number of incremental page offsets to cover
> + *
> + * WARNING: This function should only be used for address spaces that
> + * completely own the pages / memory the page table points to. Typically a
> + * device file.
> + *
> + * Return: The number of ptes actually write-protected. Note that
> + * already write-protected ptes are not counted.
> + */
> +unsigned long apply_as_wrprotect(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr)
> +{
> + struct apply_as aas = {
> + .base = {
> + .alloc = 0,
> + .ptefn = apply_pt_wrprotect,
> + },
> + .total = 0,
> + };
> +
> + return apply_as_range(mapping, &aas, first_index, nr);
> +}
> +EXPORT_SYMBOL(apply_as_wrprotect);
> +
> +/**
> + * apply_as_clean - Clean all ptes in an address_space range
> + * @mapping: The address_space we want to clean
> + * @first_index: The first page offset in the range
> + * @nr: Number of incremental page offsets to cover
> + * @bitmap_pgoff: The page offset of the first bit in @bitmap
> + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
> + * cover the whole range @first_index..@first_index + @nr.
> + * @start: Pointer to number of the first set bit in @bitmap.
> + * is modified as new bits are set by the function.
> + * @end: Pointer to the number of the last set bit in @bitmap.
> + * none set. The value is modified as new bets are set by the function.
s/bets/bits/
> + *
> + * Note: When this function returns there is no guarantee that a CPU has
> + * not already dirtied new ptes. However it will not clean any ptes not
> + * reported in the bitmap.
> + *
> + * If a caller needs to make sure all dirty ptes are picked up and none
> + * additional are added, it first needs to write-protect the address-space
> + * range and make sure new writers are blocked in page_mkwrite() or
> + * pfn_mkwrite(). And then after a TLB flush following the write-protection
> + * pick upp all dirty bits.
s/upp/up/
> + *
> + * WARNING: This function should only be used for address spaces that
> + * completely own the pages / memory the page table points to. Typically a
> + * device file.
> + *
> + * Return: The number of dirty ptes actually cleaned.
> + */
> +unsigned long apply_as_clean(struct address_space *mapping,
> + pgoff_t first_index, pgoff_t nr,
> + pgoff_t bitmap_pgoff,
> + unsigned long *bitmap,
> + pgoff_t *start,
> + pgoff_t *end)
> +{
> + bool none_set = (*start >= *end);
> + struct apply_as_clean clean = {
> + .base = {
> + .base = {
> + .alloc = 0,
> + .ptefn = apply_pt_clean,
> + },
> + .total = 0,
> + },
> + .bitmap_pgoff = bitmap_pgoff,
> + .bitmap = bitmap,
> + .start = none_set ? nr : *start,
> + .end = none_set ? 0 : *end,
> + };
> + unsigned long ret = apply_as_range(mapping, &clean.base, first_index,
> + nr);
> +
> + *start = clean.start;
> + *end = clean.end;
> + return ret;
> +}
> +EXPORT_SYMBOL(apply_as_clean);
>
On Fri, Apr 12, 2019 at 04:04:18PM +0000, Thomas Hellstrom wrote:
> This is basically apply_to_page_range with added functionality:
> Allocating missing parts of the page table becomes optional, which
> means that the function can be guaranteed not to error if allocation
> is disabled. Also passing of the closure struct and callback function
> becomes different and more in line with how things are done elsewhere.
>
> Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range
>
> The reason for not using the page-walk code is that we want to perform
> the page-walk on vmas pointing to an address space without requiring the
> mmap_sem to be held rather than on vmas belonging to a process with the
> mmap_sem held.
>
> Notable changes since RFC:
> Don't export apply_to_pfn range.
>
> Cc: Andrew Morton <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Huang Ying <[email protected]>
> Cc: Souptick Joarder <[email protected]>
> Cc: "Jérôme Glisse" <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Thomas Hellstrom <[email protected]>
> ---
> include/linux/mm.h | 10 ++++
> mm/memory.c | 130 ++++++++++++++++++++++++++++++++++-----------
> 2 files changed, 108 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 80bb6408fe73..b7dd4ddd6efb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
> extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
> unsigned long size, pte_fn_t fn, void *data);
>
> +struct pfn_range_apply;
> +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
> + struct pfn_range_apply *closure);
> +struct pfn_range_apply {
> + struct mm_struct *mm;
> + pter_fn_t ptefn;
> + unsigned int alloc;
> +};
> +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> + unsigned long address, unsigned long size);
>
> #ifdef CONFIG_PAGE_POISONING
> extern bool page_poisoning_enabled(void);
> diff --git a/mm/memory.c b/mm/memory.c
> index a95b4a3b1ae2..60d67158964f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long
> }
> EXPORT_SYMBOL(vm_iomap_memory);
>
> -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
> + unsigned long addr, unsigned long end)
> {
> pte_t *pte;
> int err;
> pgtable_t token;
> spinlock_t *uninitialized_var(ptl);
>
> - pte = (mm == &init_mm) ?
> + pte = (closure->mm == &init_mm) ?
> pte_alloc_kernel(pmd, addr) :
> - pte_alloc_map_lock(mm, pmd, addr, &ptl);
> + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
> if (!pte)
> return -ENOMEM;
>
> @@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> token = pmd_pgtable(*pmd);
>
> do {
> - err = fn(pte++, token, addr, data);
> + err = closure->ptefn(pte++, token, addr, closure);
> if (err)
> break;
> } while (addr += PAGE_SIZE, addr != end);
>
> arch_leave_lazy_mmu_mode();
>
> - if (mm != &init_mm)
> + if (closure->mm != &init_mm)
> pte_unmap_unlock(pte-1, ptl);
> return err;
> }
>
> -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
> + unsigned long addr, unsigned long end)
> {
> pmd_t *pmd;
> unsigned long next;
> - int err;
> + int err = 0;
>
> BUG_ON(pud_huge(*pud));
>
> - pmd = pmd_alloc(mm, pud, addr);
> + pmd = pmd_alloc(closure->mm, pud, addr);
> if (!pmd)
> return -ENOMEM;
> +
> do {
> next = pmd_addr_end(addr, end);
> - err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
> + if (!closure->alloc && pmd_none_or_clear_bad(pmd))
> + continue;
> + err = apply_to_pte_range(closure, pmd, addr, next);
> if (err)
> break;
> } while (pmd++, addr = next, addr != end);
> return err;
> }
>
> -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d,
> + unsigned long addr, unsigned long end)
> {
> pud_t *pud;
> unsigned long next;
> - int err;
> + int err = 0;
>
> - pud = pud_alloc(mm, p4d, addr);
> + pud = pud_alloc(closure->mm, p4d, addr);
> if (!pud)
> return -ENOMEM;
> +
> do {
> next = pud_addr_end(addr, end);
> - err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
> + if (!closure->alloc && pud_none_or_clear_bad(pud))
> + continue;
> + err = apply_to_pmd_range(closure, pud, addr, next);
> if (err)
> break;
> } while (pud++, addr = next, addr != end);
> return err;
> }
>
> -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
> - unsigned long addr, unsigned long end,
> - pte_fn_t fn, void *data)
> +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd,
> + unsigned long addr, unsigned long end)
> {
> p4d_t *p4d;
> unsigned long next;
> - int err;
> + int err = 0;
>
> - p4d = p4d_alloc(mm, pgd, addr);
> + p4d = p4d_alloc(closure->mm, pgd, addr);
> if (!p4d)
> return -ENOMEM;
> +
> do {
> next = p4d_addr_end(addr, end);
> - err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
> + if (!closure->alloc && p4d_none_or_clear_bad(p4d))
> + continue;
> + err = apply_to_pud_range(closure, p4d, addr, next);
> if (err)
> break;
> } while (p4d++, addr = next, addr != end);
> return err;
> }
>
> -/*
> - * Scan a region of virtual memory, filling in page tables as necessary
> - * and calling a provided function on each leaf page table.
> +/**
> + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided
> + * function on each leaf page table entry
> + * @closure: Details about how to scan and what function to apply
> + * @addr: Start virtual address
> + * @size: Size of the region
> + *
> + * If @closure->alloc is set to 1, the function will fill in the page table
> + * as necessary. Otherwise it will skip non-present parts.
> + * Note: The caller must ensure that the range does not contain huge pages.
> + * The caller must also assure that the proper mmu_notifier functions are
> + * called. Either in the pte leaf function or before and after the call to
> + * apply_to_pfn_range.
This is wrong: there should be a big FAT warning that this can only be used
against an mmap of a device file. The page table walking above is broken for
various things you might find in any other vma, like THP, device pte, hugetlbfs,
...
Also, the mmu notifier cannot be called from the pfn callback, as that callback
happens under the page table lock (the change_pte notifier callback is useless
and not enough). So it _must_ happen around the call to apply_to_pfn_range.
apply_to_page_range was really not meant to be used in that way ... it was not
for regular vmas.
Using this function for anything else is dangerous, and having its uses spread
further increases that risk. So there must be a big FAT warning saying that you
should not use this lightly and that it should only be used on an mmap of a device
file.
> + *
> + * Returns: Zero on success. If the provided function returns a non-zero status,
> + * the page table walk will terminate and that status will be returned.
> + * If @closure->alloc is set to 1, then this function may also return memory
> + * allocation errors arising from allocating page table memory.
> */
> -int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> - unsigned long size, pte_fn_t fn, void *data)
> +int apply_to_pfn_range(struct pfn_range_apply *closure,
> + unsigned long addr, unsigned long size)
> {
> pgd_t *pgd;
> unsigned long next;
> @@ -2049,16 +2069,62 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> if (WARN_ON(addr >= end))
> return -EINVAL;
>
> - pgd = pgd_offset(mm, addr);
> + pgd = pgd_offset(closure->mm, addr);
> do {
> next = pgd_addr_end(addr, end);
> - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
> + if (!closure->alloc && pgd_none_or_clear_bad(pgd))
> + continue;
> + err = apply_to_p4d_range(closure, pgd, addr, next);
> if (err)
> break;
> } while (pgd++, addr = next, addr != end);
>
> return err;
> }
> +
> +/**
> + * struct page_range_apply - Closure structure for apply_to_page_range()
> + * @pter: The base closure structure we derive from
> + * @fn: The leaf pte function to call
> + * @data: The leaf pte function closure
> + */
> +struct page_range_apply {
> + struct pfn_range_apply pter;
> + pte_fn_t fn;
> + void *data;
> +};
> +
> +/*
> + * Callback wrapper to enable use of apply_to_pfn_range for
> + * the apply_to_page_range interface
> + */
> +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token,
> + unsigned long addr,
> + struct pfn_range_apply *pter)
> +{
> + struct page_range_apply *pra =
> + container_of(pter, typeof(*pra), pter);
> +
> + return pra->fn(pte, token, addr, pra->data);
> +}
> +
> +/*
> + * Scan a region of virtual memory, filling in page tables as necessary
> + * and calling a provided function on each leaf page table.
> + */
It would be good to improve that comment too and make it a warning along the lines of
DO NOT USE! THIS IS NOT SAFE ON REGULAR VMAs!
> +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> + unsigned long size, pte_fn_t fn, void *data)
> +{
> + struct page_range_apply pra = {
> + .pter = {.mm = mm,
> + .alloc = 1,
> + .ptefn = apply_to_page_range_wrapper },
> + .fn = fn,
> + .data = data
> + };
> +
> + return apply_to_pfn_range(&pra.pter, addr, size);
> +}
> EXPORT_SYMBOL_GPL(apply_to_page_range);
>
> /*
> --
> 2.20.1
>
Hi, Jérôme
On Fri, 2019-04-12 at 17:07 -0400, Jerome Glisse wrote:
> On Fri, Apr 12, 2019 at 04:04:18PM +0000, Thomas Hellstrom wrote:
> > This is basically apply_to_page_range with added functionality:
> > Allocating missing parts of the page table becomes optional, which
> > means that the function can be guaranteed not to error if
> > allocation
> > is disabled. Also passing of the closure struct and callback
> > function
> > becomes different and more in line with how things are done
> > elsewhere.
> >
> > Finally we keep apply_to_page_range as a wrapper around
> > apply_to_pfn_range
> >
> > The reason for not using the page-walk code is that we want to
> > perform
> > the page-walk on vmas pointing to an address space without
> > requiring the
> > mmap_sem to be held rather than on vmas belonging to a process
> > with the
> > mmap_sem held.
> >
> > Notable changes since RFC:
> > Don't export apply_to_pfn range.
> >
> > Cc: Andrew Morton <[email protected]>
> > Cc: Matthew Wilcox <[email protected]>
> > Cc: Will Deacon <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Rik van Riel <[email protected]>
> > Cc: Minchan Kim <[email protected]>
> > Cc: Michal Hocko <[email protected]>
> > Cc: Huang Ying <[email protected]>
> > Cc: Souptick Joarder <[email protected]>
> > Cc: "Jérôme Glisse" <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Signed-off-by: Thomas Hellstrom <[email protected]>
> > ---
> > include/linux/mm.h | 10 ++++
> > mm/memory.c | 130 ++++++++++++++++++++++++++++++++++-------
> > ----
> > 2 files changed, 108 insertions(+), 32 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 80bb6408fe73..b7dd4ddd6efb 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte,
> > pgtable_t token, unsigned long addr,
> > extern int apply_to_page_range(struct mm_struct *mm, unsigned long
> > address,
> > unsigned long size, pte_fn_t fn, void
> > *data);
> >
> > +struct pfn_range_apply;
> > +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned
> > long addr,
> > + struct pfn_range_apply *closure);
> > +struct pfn_range_apply {
> > + struct mm_struct *mm;
> > + pter_fn_t ptefn;
> > + unsigned int alloc;
> > +};
> > +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> > + unsigned long address, unsigned long
> > size);
> >
> > #ifdef CONFIG_PAGE_POISONING
> > extern bool page_poisoning_enabled(void);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index a95b4a3b1ae2..60d67158964f 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct
> > *vma, phys_addr_t start, unsigned long
> > }
> > EXPORT_SYMBOL(vm_iomap_memory);
> >
> > -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> > - unsigned long addr, unsigned long
> > end,
> > - pte_fn_t fn, void *data)
> > +static int apply_to_pte_range(struct pfn_range_apply *closure,
> > pmd_t *pmd,
> > + unsigned long addr, unsigned long end)
> > {
> > pte_t *pte;
> > int err;
> > pgtable_t token;
> > spinlock_t *uninitialized_var(ptl);
> >
> > - pte = (mm == &init_mm) ?
> > + pte = (closure->mm == &init_mm) ?
> > pte_alloc_kernel(pmd, addr) :
> > - pte_alloc_map_lock(mm, pmd, addr, &ptl);
> > + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
> > if (!pte)
> > return -ENOMEM;
> >
> > @@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct
> > mm_struct *mm, pmd_t *pmd,
> > token = pmd_pgtable(*pmd);
> >
> > do {
> > - err = fn(pte++, token, addr, data);
> > + err = closure->ptefn(pte++, token, addr, closure);
> > if (err)
> > break;
> > } while (addr += PAGE_SIZE, addr != end);
> >
> > arch_leave_lazy_mmu_mode();
> >
> > - if (mm != &init_mm)
> > + if (closure->mm != &init_mm)
> > pte_unmap_unlock(pte-1, ptl);
> > return err;
> > }
> >
> > -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
> > - unsigned long addr, unsigned long
> > end,
> > - pte_fn_t fn, void *data)
> > +static int apply_to_pmd_range(struct pfn_range_apply *closure,
> > pud_t *pud,
> > + unsigned long addr, unsigned long end)
> > {
> > pmd_t *pmd;
> > unsigned long next;
> > - int err;
> > + int err = 0;
> >
> > BUG_ON(pud_huge(*pud));
> >
> > - pmd = pmd_alloc(mm, pud, addr);
> > + pmd = pmd_alloc(closure->mm, pud, addr);
> > if (!pmd)
> > return -ENOMEM;
> > +
> > do {
> > next = pmd_addr_end(addr, end);
> > - err = apply_to_pte_range(mm, pmd, addr, next, fn,
> > data);
> > + if (!closure->alloc && pmd_none_or_clear_bad(pmd))
> > + continue;
> > + err = apply_to_pte_range(closure, pmd, addr, next);
> > if (err)
> > break;
> > } while (pmd++, addr = next, addr != end);
> > return err;
> > }
> >
> > -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
> > - unsigned long addr, unsigned long
> > end,
> > - pte_fn_t fn, void *data)
> > +static int apply_to_pud_range(struct pfn_range_apply *closure,
> > p4d_t *p4d,
> > + unsigned long addr, unsigned long end)
> > {
> > pud_t *pud;
> > unsigned long next;
> > - int err;
> > + int err = 0;
> >
> > - pud = pud_alloc(mm, p4d, addr);
> > + pud = pud_alloc(closure->mm, p4d, addr);
> > if (!pud)
> > return -ENOMEM;
> > +
> > do {
> > next = pud_addr_end(addr, end);
> > - err = apply_to_pmd_range(mm, pud, addr, next, fn,
> > data);
> > + if (!closure->alloc && pud_none_or_clear_bad(pud))
> > + continue;
> > + err = apply_to_pmd_range(closure, pud, addr, next);
> > if (err)
> > break;
> > } while (pud++, addr = next, addr != end);
> > return err;
> > }
> >
> > -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
> > - unsigned long addr, unsigned long
> > end,
> > - pte_fn_t fn, void *data)
> > +static int apply_to_p4d_range(struct pfn_range_apply *closure,
> > pgd_t *pgd,
> > + unsigned long addr, unsigned long end)
> > {
> > p4d_t *p4d;
> > unsigned long next;
> > - int err;
> > + int err = 0;
> >
> > - p4d = p4d_alloc(mm, pgd, addr);
> > + p4d = p4d_alloc(closure->mm, pgd, addr);
> > if (!p4d)
> > return -ENOMEM;
> > +
> > do {
> > next = p4d_addr_end(addr, end);
> > - err = apply_to_pud_range(mm, p4d, addr, next, fn,
> > data);
> > + if (!closure->alloc && p4d_none_or_clear_bad(p4d))
> > + continue;
> > + err = apply_to_pud_range(closure, p4d, addr, next);
> > if (err)
> > break;
> > } while (p4d++, addr = next, addr != end);
> > return err;
> > }
> >
> > -/*
> > - * Scan a region of virtual memory, filling in page tables as
> > necessary
> > - * and calling a provided function on each leaf page table.
> > +/**
> > + * apply_to_pfn_range - Scan a region of virtual memory, calling a
> > provided
> > + * function on each leaf page table entry
> > + * @closure: Details about how to scan and what function to apply
> > + * @addr: Start virtual address
> > + * @size: Size of the region
> > + *
> > + * If @closure->alloc is set to 1, the function will fill in the
> > page table
> > + * as necessary. Otherwise it will skip non-present parts.
> > + * Note: The caller must ensure that the range does not contain
> > huge pages.
> > + * The caller must also assure that the proper mmu_notifier
> > functions are
> > + * called. Either in the pte leaf function or before and after the
> > call to
> > + * apply_to_pfn_range.
>
> This is wrong there should be a big FAT warning that this can only be
> use
> against mmap of device file. The page table walking above is broken
> for
> various thing you might find in any other vma like THP, device pte,
> hugetlbfs,
I figured that since we no longer export the function, the warning
and checks could be left to its users, assuming that any other future
usage of this function would require an audit by mm people anyway. But I can
of course add that warning to this function as well, if you still want
that?
> ...
>
> Also the mmu notifier can not be call from the pfn callback as that
> callback
> happens under page table lock (the change_pte notifier callback is
> useless
> and not enough). So it _must_ happen around the call to
> apply_to_pfn_range
In the comments I had in mind the use of, for example,
ptep_clear_flush_notify(). But you're the mmu_notifier expert here. Are
you saying that function by itself would not be sufficient?
In that case, should I just scratch the text mentioning the pte leaf
function?
>
> apply_to_page_range was really not meant to be use in that way ... it
> was not
> for regular vma.
>
> Using this function for anything else is dangerous and having its
> uses spread
> more increase that risk. So there must be a big FAT warning saying
> that you
> should not use this lightly and that it should only be only on mmap
> of device
> file.
Understood.
/Thomas
>
>
> > + *
> > + * Returns: Zero on success. If the provided function returns a
> > non-zero status,
> > + * the page table walk will terminate and that status will be
> > returned.
> > + * If @closure->alloc is set to 1, then this function may also
> > return memory
> > + * allocation errors arising from allocating page table memory.
> > */
> > -int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> > - unsigned long size, pte_fn_t fn, void *data)
> > +int apply_to_pfn_range(struct pfn_range_apply *closure,
> > + unsigned long addr, unsigned long size)
> > {
> > pgd_t *pgd;
> > unsigned long next;
> > @@ -2049,16 +2069,62 @@ int apply_to_page_range(struct mm_struct
> > *mm, unsigned long addr,
> > if (WARN_ON(addr >= end))
> > return -EINVAL;
> >
> > - pgd = pgd_offset(mm, addr);
> > + pgd = pgd_offset(closure->mm, addr);
> > do {
> > next = pgd_addr_end(addr, end);
> > - err = apply_to_p4d_range(mm, pgd, addr, next, fn,
> > data);
> > + if (!closure->alloc && pgd_none_or_clear_bad(pgd))
> > + continue;
> > + err = apply_to_p4d_range(closure, pgd, addr, next);
> > if (err)
> > break;
> > } while (pgd++, addr = next, addr != end);
> >
> > return err;
> > }
> > +
> > +/**
> > + * struct page_range_apply - Closure structure for
> > apply_to_page_range()
> > + * @pter: The base closure structure we derive from
> > + * @fn: The leaf pte function to call
> > + * @data: The leaf pte function closure
> > + */
> > +struct page_range_apply {
> > + struct pfn_range_apply pter;
> > + pte_fn_t fn;
> > + void *data;
> > +};
> > +
> > +/*
> > + * Callback wrapper to enable use of apply_to_pfn_range for
> > + * the apply_to_page_range interface
> > + */
> > +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t
> > token,
> > + unsigned long addr,
> > + struct pfn_range_apply *pter)
> > +{
> > + struct page_range_apply *pra =
> > + container_of(pter, typeof(*pra), pter);
> > +
> > + return pra->fn(pte, token, addr, pra->data);
> > +}
> > +
> > +/*
> > + * Scan a region of virtual memory, filling in page tables as
> > necessary
> > + * and calling a provided function on each leaf page table.
> > + */
>
> It would be good to improve that comment too and make it a warning of
> DO NOT USE ! THIS IS NOT SAFE ON REGULAR VMA !
>
> > +int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
> > + unsigned long size, pte_fn_t fn, void *data)
> > +{
> > + struct page_range_apply pra = {
> > + .pter = {.mm = mm,
> > + .alloc = 1,
> > + .ptefn = apply_to_page_range_wrapper },
> > + .fn = fn,
> > + .data = data
> > + };
> > +
> > + return apply_to_pfn_range(&pra.pter, addr, size);
> > +}
> > EXPORT_SYMBOL_GPL(apply_to_page_range);
> >
> > /*
> > --
> > 2.20.1
> >
Hi, Ralph,
On Fri, 2019-04-12 at 11:52 -0700, Ralph Campbell wrote:
> On 4/12/19 9:04 AM, Thomas Hellstrom wrote:
> > Add two utilities to a) write-protect and b) clean all ptes
> > pointing into
> > a range of an address space
>
> A period at the end, please.
>
> > The utilities are intended to aid in tracking dirty pages (either
> > driver-allocated system memory or pci device memory).
> > The write-protect utility should be used in conjunction with
> > page_mkwrite() and pfn_mkwrite() to trigger write page-faults on
> > page
> > accesses. Typically one would want to use this on sparse accesses
> > into
> > large memory regions. The clean utility should be used to utilize
> > hardware dirtying functionality and avoid the overhead of page-
> > faults,
> > typically on large accesses into small memory regions.
> >
> > The added file "apply_as_range.c" is initially listed as maintained
> > by
> > VMware under our DRM driver. If somebody would like it elsewhere,
> > that's of course no problem.
> >
> > Notable changes since RFC:
> > > - Added comments to help avoid the usage of these functions for VMAs
> > it's not intended for. We also do advisory checks on the
> > vm_flags and
> > warn on illegal usage.
> > - Perform the pte modifications the same way softdirty does.
> > - Add mmu_notifier range invalidation calls.
> > - Add a config option so that this code is not unconditionally
> > included.
> > - Tell the mmu_gather code about pending tlb flushes.
> >
> > Cc: Andrew Morton <[email protected]>
> > Cc: Matthew Wilcox <[email protected]>
> > Cc: Will Deacon <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Rik van Riel <[email protected]>
> > Cc: Minchan Kim <[email protected]>
> > Cc: Michal Hocko <[email protected]>
> > Cc: Huang Ying <[email protected]>
> > Cc: Souptick Joarder <[email protected]>
> > Cc: "Jérôme Glisse" <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Signed-off-by: Thomas Hellstrom <[email protected]>
>
> Reviewed-by: Ralph Campbell <[email protected]>
Thanks for reviewing the patches. I'll incorporate your suggestions in
v2.
On Fri, Apr 12, 2019 at 9:34 PM Thomas Hellstrom <[email protected]> wrote:
>
> Driver fault callbacks are allowed to drop the mmap_sem when expecting
> long hardware waits to avoid blocking other mm users. Allow the mkwrite
> callbacks to do the same by returning early on VM_FAULT_RETRY.
>
> In particular we want to be able to drop the mmap_sem when waiting for
> a reservation object lock on a GPU buffer object. These locks may be
> held while waiting for the GPU.
>
> Cc: Andrew Morton <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Huang Ying <[email protected]>
> Cc: Souptick Joarder <[email protected]>
> Cc: "Jérôme Glisse" <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
>
> Signed-off-by: Thomas Hellstrom <[email protected]>
> ---
> mm/memory.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index e11ca9dd823f..a95b4a3b1ae2 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
> ret = vmf->vma->vm_ops->page_mkwrite(vmf);
> /* Restore original flags so that caller is not surprised */
> vmf->flags = old_flags;
> - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE)))
With this patch there will be multiple instances of (VM_FAULT_ERROR |
VM_FAULT_RETRY | VM_FAULT_NOPAGE)
in mm/memory.c. Does it make sense to wrap it in a macro and use it?
> return ret;
> if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> lock_page(page);
> @@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> vmf->flags |= FAULT_FLAG_MKWRITE;
> ret = vma->vm_ops->pfn_mkwrite(vmf);
> - if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
> + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE))
> return ret;
> return finish_mkwrite_fault(vmf);
> }
> @@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp || (tmp &
> - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> + (VM_FAULT_ERROR | VM_FAULT_RETRY |
> + VM_FAULT_NOPAGE)))) {
> put_page(vmf->page);
> return tmp;
> }
> @@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> unlock_page(vmf->page);
> tmp = do_page_mkwrite(vmf);
> if (unlikely(!tmp ||
> - (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
> + (tmp & (VM_FAULT_ERROR | VM_FAULT_RETRY |
> + VM_FAULT_NOPAGE)))) {
> put_page(vmf->page);
> return tmp;
> }
> --
> 2.20.1
>
Am 12.04.19 um 18:04 schrieb Thomas Hellstrom:
> Add a pointer to the struct vm_operations_struct in the bo_device, and
> assign that pointer to the default value currently used.
>
> The driver can then optionally modify that pointer and the new value
> can be used for each new vma created.
>
> Cc: "Christian König" <[email protected]>
> Signed-off-by: Thomas Hellstrom <[email protected]>
Yes, please. This way we can also finally clean up the VM operations hack
we use in radeon and maybe even still in amdgpu.
Reviewed-by: Christian König <[email protected]>
> ---
> drivers/gpu/drm/ttm/ttm_bo.c | 1 +
> drivers/gpu/drm/ttm/ttm_bo_vm.c | 6 +++---
> include/drm/ttm/ttm_bo_driver.h | 6 ++++++
> 3 files changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 3f56647cdb35..1c85bec00472 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -1656,6 +1656,7 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev,
> mutex_lock(&ttm_global_mutex);
> list_add_tail(&bdev->device_list, &glob->device_list);
> mutex_unlock(&ttm_global_mutex);
> + bdev->vm_ops = &ttm_bo_vm_ops;
>
> return 0;
> out_no_sys:
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index e86a29a1e51f..bfb25b81fed7 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -395,7 +395,7 @@ static int ttm_bo_vm_access(struct vm_area_struct *vma, unsigned long addr,
> return ret;
> }
>
> -static const struct vm_operations_struct ttm_bo_vm_ops = {
> +const struct vm_operations_struct ttm_bo_vm_ops = {
> .fault = ttm_bo_vm_fault,
> .open = ttm_bo_vm_open,
> .close = ttm_bo_vm_close,
> @@ -445,7 +445,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
> if (unlikely(ret != 0))
> goto out_unref;
>
> - vma->vm_ops = &ttm_bo_vm_ops;
> + vma->vm_ops = bdev->vm_ops;
>
> /*
> * Note: We're transferring the bo reference to
> @@ -477,7 +477,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
>
> ttm_bo_get(bo);
>
> - vma->vm_ops = &ttm_bo_vm_ops;
> + vma->vm_ops = bo->bdev->vm_ops;
> vma->vm_private_data = bo;
> vma->vm_flags |= VM_MIXEDMAP;
> vma->vm_flags |= VM_IO | VM_DONTEXPAND;
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index cbf3180cb612..cfeaff5d9706 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -443,6 +443,9 @@ extern struct ttm_bo_global {
> * @driver: Pointer to a struct ttm_bo_driver struct setup by the driver.
> * @man: An array of mem_type_managers.
> * @vma_manager: Address space manager
> + * @vm_ops: Pointer to the struct vm_operations_struct used for this
> + * device's VM operations. The driver may override this before the first
> + * mmap() call.
> * lru_lock: Spinlock that protects the buffer+device lru lists and
> * ddestroy lists.
> * @dev_mapping: A pointer to the struct address_space representing the
> @@ -461,6 +464,7 @@ struct ttm_bo_device {
> struct ttm_bo_global *glob;
> struct ttm_bo_driver *driver;
> struct ttm_mem_type_manager man[TTM_NUM_MEM_TYPES];
> + const struct vm_operations_struct *vm_ops;
>
> /*
> * Protected by internal locks.
> @@ -489,6 +493,8 @@ struct ttm_bo_device {
> bool no_retry;
> };
>
> +extern const struct vm_operations_struct ttm_bo_vm_ops;
> +
> /**
> * struct ttm_lru_bulk_move_pos
> *
Am 12.04.19 um 18:04 schrieb Thomas Hellstrom:
> With the vmwgfx dirty tracking, the default TTM fault handler is not
> completely sufficient (vmwgfx need to modify the vma->vm_flags member,
> and also needs to restrict the number of prefaults).
>
> We also want to replicate the new ttm_bo_vm_reserve() functionality
>
> So start turning the TTM vm code into helpers: ttm_bo_vm_fault_reserved()
> and ttm_bo_vm_reserve(), and provide a default TTM fault handler for other
> drivers to use.
>
> Cc: "Christian König" <[email protected]>
> Signed-off-by: Thomas Hellstrom <[email protected]>
Two nitpicks below; apart from that, this looks good to me as well.
> ---
> drivers/gpu/drm/ttm/ttm_bo_vm.c | 170 ++++++++++++++++++++------------
> include/drm/ttm/ttm_bo_api.h | 10 ++
> 2 files changed, 116 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index bfb25b81fed7..3bd28fb97124 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -42,8 +42,6 @@
> #include <linux/uaccess.h>
> #include <linux/mem_encrypt.h>
>
> -#define TTM_BO_VM_NUM_PREFAULT 16
> -
> static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
> struct vm_fault *vmf)
> {
> @@ -106,31 +104,30 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
> + page_offset;
> }
>
> -static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> +/**
> + * ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm callback
> + * @bo: The buffer object
> + * @vmf: The fault structure handed to the callback
> + *
> + * vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped
> + * during long waits, and after the wait the callback will be restarted. This
> + * is to allow other threads using the same virtual memory space concurrent
> + * access to map(), unmap() completely unrelated buffer objects. TTM buffer
> + * object reservations sometimes wait for GPU and should therefore be
> + * considered long waits. This function reserves the buffer object interruptibly
> + * taking this into account. Starvation is avoided by the vm system not
> + * allowing too many repeated restarts.
> + * This function is intended to be used in customized fault() and _mkwrite()
> + * handlers.
> + *
> + * Return:
> + * 0 on success and the bo was reserved.
> + * VM_FAULT_RETRY if blocking wait.
> + * VM_FAULT_NOPAGE if blocking wait and retrying was not allowed.
> + */
> +vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
> + struct vm_fault *vmf)
> {
> - struct vm_area_struct *vma = vmf->vma;
> - struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> - vma->vm_private_data;
> - struct ttm_bo_device *bdev = bo->bdev;
> - unsigned long page_offset;
> - unsigned long page_last;
> - unsigned long pfn;
> - struct ttm_tt *ttm = NULL;
> - struct page *page;
> - int err;
> - int i;
> - vm_fault_t ret = VM_FAULT_NOPAGE;
> - unsigned long address = vmf->address;
> - struct ttm_mem_type_manager *man =
> - &bdev->man[bo->mem.mem_type];
> - struct vm_area_struct cvma;
> -
> - /*
> - * Work around locking order reversal in fault / nopfn
> - * between mmap_sem and bo_reserve: Perform a trylock operation
> - * for reserve, and if it fails, retry the fault after waiting
> - * for the buffer to become unreserved.
> - */
> if (unlikely(!reservation_object_trylock(bo->resv))) {
> if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
> if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
> @@ -151,14 +148,56 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> return VM_FAULT_NOPAGE;
> }
>
> + return 0;
> +}
> +EXPORT_SYMBOL(ttm_bo_vm_reserve);
> +
> +/**
> + * ttm_bo_vm_fault_reserved - TTM fault helper
> + * @vmf: The struct vm_fault given as argument to the fault callback
> > + * @cvma: The struct vm_area_struct affected. Note that this may be a
> + * copy of the real vma object if the caller needs, for example, VM
> + * flags to be temporarily altered while determining the page protection.
> + * @num_prefault: Maximum number of prefault pages. The caller may want to
> > + * specify this based on madvise settings and the size of the GPU object
> + * backed by the memory.
> + *
> + * This function inserts one or more page table entries pointing to the
> + * memory backing the buffer object, and then returns a return code
> + * instructing the caller to retry the page access.
> + *
> + * Return:
> + * VM_FAULT_NOPAGE on success or pending signal
> + * VM_FAULT_SIGBUS on unspecified error
> + * VM_FAULT_OOM on out-of-memory
> + * VM_FAULT_RETRY if retryable wait
> + */
> +vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> + struct vm_area_struct *cvma,
> + pgoff_t num_prefault)
> +{
> + struct vm_area_struct *vma = vmf->vma;
> + struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> + vma->vm_private_data;
> + struct ttm_bo_device *bdev = bo->bdev;
> + unsigned long page_offset;
> + unsigned long page_last;
> + unsigned long pfn;
> + struct ttm_tt *ttm = NULL;
> + struct page *page;
> + int err;
> + pgoff_t i;
> + vm_fault_t ret = VM_FAULT_NOPAGE;
> + unsigned long address = vmf->address;
> + struct ttm_mem_type_manager *man =
> + &bdev->man[bo->mem.mem_type];
> +
> /*
> * Refuse to fault imported pages. This should be handled
> * (if at all) by redirecting mmap to the exporter.
> */
> - if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
> - ret = VM_FAULT_SIGBUS;
> - goto out_unlock;
> - }
> + if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
> + return VM_FAULT_SIGBUS;
>
> if (bdev->driver->fault_reserve_notify) {
> struct dma_fence *moving = dma_fence_get(bo->moving);
> @@ -169,11 +208,9 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> break;
> case -EBUSY:
> case -ERESTARTSYS:
> - ret = VM_FAULT_NOPAGE;
> - goto out_unlock;
> + return VM_FAULT_NOPAGE;
> default:
> - ret = VM_FAULT_SIGBUS;
> - goto out_unlock;
> + return VM_FAULT_SIGBUS;
> }
>
> if (bo->moving != moving) {
> @@ -189,24 +226,15 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> * move.
> */
> ret = ttm_bo_vm_fault_idle(bo, vmf);
> - if (unlikely(ret != 0)) {
> - if (ret == VM_FAULT_RETRY &&
> - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
> - /* The BO has already been unreserved. */
> - return ret;
> - }
> -
> - goto out_unlock;
> - }
> + if (unlikely(ret != 0))
> + return ret;
>
> err = ttm_mem_io_lock(man, true);
> - if (unlikely(err != 0)) {
> - ret = VM_FAULT_NOPAGE;
> - goto out_unlock;
> - }
> + if (unlikely(err != 0))
> + return VM_FAULT_NOPAGE;
> err = ttm_mem_io_reserve_vm(bo);
> if (unlikely(err != 0)) {
> - ret = VM_FAULT_SIGBUS;
> + return VM_FAULT_SIGBUS;
> goto out_io_unlock;
This goto is now superfluous.
> }
>
> @@ -220,17 +248,11 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> goto out_io_unlock;
> }
>
> - /*
> - * Make a local vma copy to modify the page_prot member
> - * and vm_flags if necessary. The vma parameter is protected
> - * by mmap_sem in write mode.
> - */
> - cvma = *vma;
> - cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
> + cvma->vm_page_prot = vm_get_page_prot(cvma->vm_flags);
>
> if (bo->mem.bus.is_iomem) {
> - cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
> - cvma.vm_page_prot);
> + cvma->vm_page_prot = ttm_io_prot(bo->mem.placement,
> + cvma->vm_page_prot);
> } else {
> struct ttm_operation_ctx ctx = {
> .interruptible = false,
> @@ -240,8 +262,8 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> };
>
> ttm = bo->ttm;
> - cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
> - cvma.vm_page_prot);
> + cvma->vm_page_prot = ttm_io_prot(bo->mem.placement,
> + cvma->vm_page_prot);
>
> /* Allocate all page at once, most common usage */
> if (ttm_tt_populate(ttm, &ctx)) {
> @@ -254,10 +276,11 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> * Speculatively prefault a number of pages. Only error on
> * first page.
> */
> - for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
> + for (i = 0; i < num_prefault; ++i) {
> if (bo->mem.bus.is_iomem) {
> /* Iomem should not be marked encrypted */
> - cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
> + cvma->vm_page_prot =
> + pgprot_decrypted(cvma->vm_page_prot);
> pfn = ttm_bo_io_mem_pfn(bo, page_offset);
> } else {
> page = ttm->pages[page_offset];
> @@ -273,10 +296,10 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> }
>
> if (vma->vm_flags & VM_MIXEDMAP)
> - ret = vmf_insert_mixed(&cvma, address,
> + ret = vmf_insert_mixed(cvma, address,
> __pfn_to_pfn_t(pfn, PFN_DEV));
> else
> - ret = vmf_insert_pfn(&cvma, address, pfn);
> + ret = vmf_insert_pfn(cvma, address, pfn);
>
> /*
> * Somebody beat us to this PTE or prefaulting to
> @@ -295,7 +318,26 @@ static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> ret = VM_FAULT_NOPAGE;
> out_io_unlock:
> ttm_mem_io_unlock(man);
> -out_unlock:
> + return ret;
> +}
> +EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
> +
> +static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> +{
> + struct vm_area_struct *vma = vmf->vma;
> + struct vm_area_struct cvma = *vma;
> + struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> + vma->vm_private_data;
That extra cast can be dropped, the vm_private_data member is a void*
anyway.
Regards,
Christian.
> + vm_fault_t ret;
> +
> + ret = ttm_bo_vm_reserve(bo, vmf);
> + if (ret)
> + return ret;
> +
> + ret = ttm_bo_vm_fault_reserved(vmf, &cvma, TTM_BO_VM_NUM_PREFAULT);
> + if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> + return ret;
> +
> reservation_object_unlock(bo->resv);
> return ret;
> }
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
> index 49d9cdfc58f2..bebfa16426ca 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -768,4 +768,14 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
> struct ttm_operation_ctx *ctx);
> void ttm_bo_swapout_all(struct ttm_bo_device *bdev);
> int ttm_bo_wait_unreserved(struct ttm_buffer_object *bo);
> +
> +/* Default number of pre-faulted pages in the TTM fault handler */
> +#define TTM_BO_VM_NUM_PREFAULT 16
> +
> +vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
> + struct vm_fault *vmf);
> +
> +vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> + struct vm_area_struct *cvma,
> + pgoff_t num_prefault);
> #endif
On Sat, Apr 13, 2019 at 08:34:02AM +0000, Thomas Hellstrom wrote:
> Hi, Jérôme
>
> On Fri, 2019-04-12 at 17:07 -0400, Jerome Glisse wrote:
> > On Fri, Apr 12, 2019 at 04:04:18PM +0000, Thomas Hellstrom wrote:
> > > This is basically apply_to_page_range with added functionality:
> > > Allocating missing parts of the page table becomes optional, which
> > > means that the function can be guaranteed not to error if
> > > allocation
> > > is disabled. Also passing of the closure struct and callback
> > > function
> > > becomes different and more in line with how things are done
> > > elsewhere.
> > >
> > > Finally we keep apply_to_page_range as a wrapper around
> > > apply_to_pfn_range
> > >
> > > The reason for not using the page-walk code is that we want to
> > > perform
> > > the page-walk on vmas pointing to an address space without
> > > requiring the
> > > > mmap_sem to be held rather than on vmas belonging to a process
> > > with the
> > > mmap_sem held.
> > >
> > > Notable changes since RFC:
> > > Don't export apply_to_pfn range.
> > >
> > > Cc: Andrew Morton <[email protected]>
> > > Cc: Matthew Wilcox <[email protected]>
> > > Cc: Will Deacon <[email protected]>
> > > Cc: Peter Zijlstra <[email protected]>
> > > Cc: Rik van Riel <[email protected]>
> > > Cc: Minchan Kim <[email protected]>
> > > Cc: Michal Hocko <[email protected]>
> > > Cc: Huang Ying <[email protected]>
> > > Cc: Souptick Joarder <[email protected]>
> > > > Cc: "Jérôme Glisse" <[email protected]>
> > > Cc: [email protected]
> > > Cc: [email protected]
> > > Signed-off-by: Thomas Hellstrom <[email protected]>
> > > ---
> > > include/linux/mm.h | 10 ++++
> > > mm/memory.c | 130 ++++++++++++++++++++++++++++++++++-------
> > > ----
> > > 2 files changed, 108 insertions(+), 32 deletions(-)
> > >
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 80bb6408fe73..b7dd4ddd6efb 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte,
> > > pgtable_t token, unsigned long addr,
> > > extern int apply_to_page_range(struct mm_struct *mm, unsigned long
> > > address,
> > > unsigned long size, pte_fn_t fn, void
> > > *data);
> > >
> > > +struct pfn_range_apply;
> > > +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned
> > > long addr,
> > > + struct pfn_range_apply *closure);
> > > +struct pfn_range_apply {
> > > + struct mm_struct *mm;
> > > + pter_fn_t ptefn;
> > > + unsigned int alloc;
> > > +};
> > > +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> > > + unsigned long address, unsigned long
> > > size);
> > >
> > > #ifdef CONFIG_PAGE_POISONING
> > > extern bool page_poisoning_enabled(void);
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index a95b4a3b1ae2..60d67158964f 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct
> > > *vma, phys_addr_t start, unsigned long
> > > }
> > > EXPORT_SYMBOL(vm_iomap_memory);
> > >
> > > -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
> > > - unsigned long addr, unsigned long
> > > end,
> > > - pte_fn_t fn, void *data)
> > > +static int apply_to_pte_range(struct pfn_range_apply *closure,
> > > pmd_t *pmd,
> > > + unsigned long addr, unsigned long end)
> > > {
> > > pte_t *pte;
> > > int err;
> > > pgtable_t token;
> > > spinlock_t *uninitialized_var(ptl);
> > >
> > > - pte = (mm == &init_mm) ?
> > > + pte = (closure->mm == &init_mm) ?
> > > pte_alloc_kernel(pmd, addr) :
> > > - pte_alloc_map_lock(mm, pmd, addr, &ptl);
> > > + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl);
> > > if (!pte)
> > > return -ENOMEM;
> > >
> > > @@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct
> > > mm_struct *mm, pmd_t *pmd,
> > > token = pmd_pgtable(*pmd);
> > >
> > > do {
> > > - err = fn(pte++, token, addr, data);
> > > + err = closure->ptefn(pte++, token, addr, closure);
> > > if (err)
> > > break;
> > > } while (addr += PAGE_SIZE, addr != end);
> > >
> > > arch_leave_lazy_mmu_mode();
> > >
> > > - if (mm != &init_mm)
> > > + if (closure->mm != &init_mm)
> > > pte_unmap_unlock(pte-1, ptl);
> > > return err;
> > > }
> > >
> > > -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
> > > - unsigned long addr, unsigned long
> > > end,
> > > - pte_fn_t fn, void *data)
> > > +static int apply_to_pmd_range(struct pfn_range_apply *closure,
> > > pud_t *pud,
> > > + unsigned long addr, unsigned long end)
> > > {
> > > pmd_t *pmd;
> > > unsigned long next;
> > > - int err;
> > > + int err = 0;
> > >
> > > BUG_ON(pud_huge(*pud));
> > >
> > > - pmd = pmd_alloc(mm, pud, addr);
> > > + pmd = pmd_alloc(closure->mm, pud, addr);
> > > if (!pmd)
> > > return -ENOMEM;
> > > +
> > > do {
> > > next = pmd_addr_end(addr, end);
> > > - err = apply_to_pte_range(mm, pmd, addr, next, fn,
> > > data);
> > > + if (!closure->alloc && pmd_none_or_clear_bad(pmd))
> > > + continue;
> > > + err = apply_to_pte_range(closure, pmd, addr, next);
> > > if (err)
> > > break;
> > > } while (pmd++, addr = next, addr != end);
> > > return err;
> > > }
> > >
> > > -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
> > > - unsigned long addr, unsigned long
> > > end,
> > > - pte_fn_t fn, void *data)
> > > +static int apply_to_pud_range(struct pfn_range_apply *closure,
> > > p4d_t *p4d,
> > > + unsigned long addr, unsigned long end)
> > > {
> > > pud_t *pud;
> > > unsigned long next;
> > > - int err;
> > > + int err = 0;
> > >
> > > - pud = pud_alloc(mm, p4d, addr);
> > > + pud = pud_alloc(closure->mm, p4d, addr);
> > > if (!pud)
> > > return -ENOMEM;
> > > +
> > > do {
> > > next = pud_addr_end(addr, end);
> > > - err = apply_to_pmd_range(mm, pud, addr, next, fn,
> > > data);
> > > + if (!closure->alloc && pud_none_or_clear_bad(pud))
> > > + continue;
> > > + err = apply_to_pmd_range(closure, pud, addr, next);
> > > if (err)
> > > break;
> > > } while (pud++, addr = next, addr != end);
> > > return err;
> > > }
> > >
> > > -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
> > > - unsigned long addr, unsigned long
> > > end,
> > > - pte_fn_t fn, void *data)
> > > +static int apply_to_p4d_range(struct pfn_range_apply *closure,
> > > pgd_t *pgd,
> > > + unsigned long addr, unsigned long end)
> > > {
> > > p4d_t *p4d;
> > > unsigned long next;
> > > - int err;
> > > + int err = 0;
> > >
> > > - p4d = p4d_alloc(mm, pgd, addr);
> > > + p4d = p4d_alloc(closure->mm, pgd, addr);
> > > if (!p4d)
> > > return -ENOMEM;
> > > +
> > > do {
> > > next = p4d_addr_end(addr, end);
> > > - err = apply_to_pud_range(mm, p4d, addr, next, fn,
> > > data);
> > > + if (!closure->alloc && p4d_none_or_clear_bad(p4d))
> > > + continue;
> > > + err = apply_to_pud_range(closure, p4d, addr, next);
> > > if (err)
> > > break;
> > > } while (p4d++, addr = next, addr != end);
> > > return err;
> > > }
> > >
> > > -/*
> > > - * Scan a region of virtual memory, filling in page tables as
> > > necessary
> > > - * and calling a provided function on each leaf page table.
> > > +/**
> > > + * apply_to_pfn_range - Scan a region of virtual memory, calling a
> > > provided
> > > + * function on each leaf page table entry
> > > + * @closure: Details about how to scan and what function to apply
> > > + * @addr: Start virtual address
> > > + * @size: Size of the region
> > > + *
> > > + * If @closure->alloc is set to 1, the function will fill in the
> > > page table
> > > + * as necessary. Otherwise it will skip non-present parts.
> > > + * Note: The caller must ensure that the range does not contain
> > > huge pages.
> > > + * The caller must also assure that the proper mmu_notifier
> > > functions are
> > > + * called. Either in the pte leaf function or before and after the
> > > call to
> > > + * apply_to_pfn_range.
> >
> > This is wrong there should be a big FAT warning that this can only be
> > use
> > against mmap of device file. The page table walking above is broken
> > for
> > various thing you might find in any other vma like THP, device pte,
> > hugetlbfs,
>
> I was figuring since we didn't export the function anymore, the warning
> and checks could be left to its users, assuming that any other future
> usage of this function would require mm people audit anyway. But I can
> of course add that warning also to this function if you still want
> that?
Yeah, more warnings are better. People might start using this; I know
some people use unexported symbols and then report bugs when they
were just doing something illegal.
>
> > ...
> >
> > Also the mmu notifier can not be call from the pfn callback as that
> > callback
> > happens under page table lock (the change_pte notifier callback is
> > useless
> > and not enough). So it _must_ happen around the call to
> > apply_to_pfn_range
>
>
> In the comments I was having in mind usage of, for example
> ptep_clear_flush_notify(). But you're the mmu_notifier expert here. Are
> you saying that function by itself would not be sufficient?
> In that case, should I just scratch the text mentioning the pte leaf
> function?
ptep_clear_flush_notify() is useless ... I have posted patches to either
restore it or remove it. In any case you must call the mmu notifier range
functions, and those calls cannot happen under a lock. Your usage looked fine
(in the next patch), but I would rather have a bit of a comment here to make
sure people are also aware of that.
While we can hope that people would Cc mm when using mm functions, that is
not always the case. So I would rather be cautious and warn in comments as
much as possible.
Cheers,
Jérôme
On Tue, 2019-04-16 at 10:46 -0400, Jerome Glisse wrote:
> On Sat, Apr 13, 2019 at 08:34:02AM +0000, Thomas Hellstrom wrote:
> > Hi, Jérôme
> >
> > On Fri, 2019-04-12 at 17:07 -0400, Jerome Glisse wrote:
> > > On Fri, Apr 12, 2019 at 04:04:18PM +0000, Thomas Hellstrom wrote:
> > > > This is basically apply_to_page_range with added functionality:
> > > > Allocating missing parts of the page table becomes optional,
> > > > which
> > > > means that the function can be guaranteed not to error if
> > > > allocation
> > > > is disabled. Also passing of the closure struct and callback
> > > > function
> > > > becomes different and more in line with how things are done
> > > > elsewhere.
> > > >
> > > > Finally we keep apply_to_page_range as a wrapper around
> > > > apply_to_pfn_range
> > > >
> > > > The reason for not using the page-walk code is that we want to
> > > > perform
> > > > the page-walk on vmas pointing to an address space without
> > > > requiring the
> > > > > mmap_sem to be held rather than on vmas belonging to a process
> > > > with the
> > > > mmap_sem held.
> > > >
> > > > Notable changes since RFC:
> > > > Don't export apply_to_pfn range.
> > > >
> > > > Cc: Andrew Morton <[email protected]>
> > > > Cc: Matthew Wilcox <[email protected]>
> > > > Cc: Will Deacon <[email protected]>
> > > > Cc: Peter Zijlstra <[email protected]>
> > > > Cc: Rik van Riel <[email protected]>
> > > > Cc: Minchan Kim <[email protected]>
> > > > Cc: Michal Hocko <[email protected]>
> > > > Cc: Huang Ying <[email protected]>
> > > > Cc: Souptick Joarder <[email protected]>
> > > > Cc: "Jérôme Glisse" <[email protected]>
> > > > Cc: [email protected]
> > > > Cc: [email protected]
> > > > Signed-off-by: Thomas Hellstrom <[email protected]>
> > > > ---
> > > > include/linux/mm.h | 10 ++++
> > > > mm/memory.c | 130 ++++++++++++++++++++++++++++++++++---
> > > > ----
> > > > ----
> > > > 2 files changed, 108 insertions(+), 32 deletions(-)
> > > >
> > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > index 80bb6408fe73..b7dd4ddd6efb 100644
> > > > --- a/include/linux/mm.h
> > > > +++ b/include/linux/mm.h
> > > > @@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte,
> > > > pgtable_t token, unsigned long addr,
> > > > extern int apply_to_page_range(struct mm_struct *mm, unsigned
> > > > long
> > > > address,
> > > > unsigned long size, pte_fn_t fn,
> > > > void
> > > > *data);
> > > >
> > > > +struct pfn_range_apply;
> > > > +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned
> > > > long addr,
> > > > + struct pfn_range_apply *closure);
> > > > +struct pfn_range_apply {
> > > > + struct mm_struct *mm;
> > > > + pter_fn_t ptefn;
> > > > + unsigned int alloc;
> > > > +};
> > > > +extern int apply_to_pfn_range(struct pfn_range_apply *closure,
> > > > + unsigned long address, unsigned
> > > > long
> > > > size);
> > > >
> > > > #ifdef CONFIG_PAGE_POISONING
> > > > extern bool page_poisoning_enabled(void);
> > > > diff --git a/mm/memory.c b/mm/memory.c
> > > > index a95b4a3b1ae2..60d67158964f 100644
> > > > --- a/mm/memory.c
> > > > +++ b/mm/memory.c
> > > > @@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct
> > > > vm_area_struct
> > > > *vma, phys_addr_t start, unsigned long
> > > > }
> > > > EXPORT_SYMBOL(vm_iomap_memory);
> > > >
> > > > -static int apply_to_pte_range(struct mm_struct *mm, pmd_t
> > > > *pmd,
> > > > - unsigned long addr,
> > > > unsigned long
> > > > end,
> > > > - pte_fn_t fn, void *data)
> > > > +static int apply_to_pte_range(struct pfn_range_apply *closure,
> > > > pmd_t *pmd,
> > > > + unsigned long addr, unsigned long
> > > > end)
> > > > {
> > > > pte_t *pte;
> > > > int err;
> > > > pgtable_t token;
> > > > spinlock_t *uninitialized_var(ptl);
> > > >
> > > > - pte = (mm == &init_mm) ?
> > > > + pte = (closure->mm == &init_mm) ?
> > > > pte_alloc_kernel(pmd, addr) :
> > > > - pte_alloc_map_lock(mm, pmd, addr, &ptl);
> > > > + pte_alloc_map_lock(closure->mm, pmd, addr,
> > > > &ptl);
> > > > if (!pte)
> > > > return -ENOMEM;
> > > >
> > > > @@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct
> > > > mm_struct *mm, pmd_t *pmd,
> > > > token = pmd_pgtable(*pmd);
> > > >
> > > > do {
> > > > - err = fn(pte++, token, addr, data);
> > > > + err = closure->ptefn(pte++, token, addr,
> > > > closure);
> > > > if (err)
> > > > break;
> > > > } while (addr += PAGE_SIZE, addr != end);
> > > >
> > > > arch_leave_lazy_mmu_mode();
> > > >
> > > > - if (mm != &init_mm)
> > > > + if (closure->mm != &init_mm)
> > > > pte_unmap_unlock(pte-1, ptl);
> > > > return err;
> > > > }
> > > >
> > > > -static int apply_to_pmd_range(struct mm_struct *mm, pud_t
> > > > *pud,
> > > > - unsigned long addr,
> > > > unsigned long
> > > > end,
> > > > - pte_fn_t fn, void *data)
> > > > +static int apply_to_pmd_range(struct pfn_range_apply *closure,
> > > > pud_t *pud,
> > > > + unsigned long addr, unsigned long
> > > > end)
> > > > {
> > > > pmd_t *pmd;
> > > > unsigned long next;
> > > > - int err;
> > > > + int err = 0;
> > > >
> > > > BUG_ON(pud_huge(*pud));
> > > >
> > > > - pmd = pmd_alloc(mm, pud, addr);
> > > > + pmd = pmd_alloc(closure->mm, pud, addr);
> > > > if (!pmd)
> > > > return -ENOMEM;
> > > > +
> > > > do {
> > > > next = pmd_addr_end(addr, end);
> > > > - err = apply_to_pte_range(mm, pmd, addr, next,
> > > > fn,
> > > > data);
> > > > + if (!closure->alloc &&
> > > > pmd_none_or_clear_bad(pmd))
> > > > + continue;
> > > > + err = apply_to_pte_range(closure, pmd, addr,
> > > > next);
> > > > if (err)
> > > > break;
> > > > } while (pmd++, addr = next, addr != end);
> > > > return err;
> > > > }
> > > >
> > > > -static int apply_to_pud_range(struct mm_struct *mm, p4d_t
> > > > *p4d,
> > > > - unsigned long addr,
> > > > unsigned long
> > > > end,
> > > > - pte_fn_t fn, void *data)
> > > > +static int apply_to_pud_range(struct pfn_range_apply *closure,
> > > > p4d_t *p4d,
> > > > + unsigned long addr, unsigned long
> > > > end)
> > > > {
> > > > pud_t *pud;
> > > > unsigned long next;
> > > > - int err;
> > > > + int err = 0;
> > > >
> > > > - pud = pud_alloc(mm, p4d, addr);
> > > > + pud = pud_alloc(closure->mm, p4d, addr);
> > > > if (!pud)
> > > > return -ENOMEM;
> > > > +
> > > > do {
> > > > next = pud_addr_end(addr, end);
> > > > - err = apply_to_pmd_range(mm, pud, addr, next,
> > > > fn,
> > > > data);
> > > > + if (!closure->alloc &&
> > > > pud_none_or_clear_bad(pud))
> > > > + continue;
> > > > + err = apply_to_pmd_range(closure, pud, addr,
> > > > next);
> > > > if (err)
> > > > break;
> > > > } while (pud++, addr = next, addr != end);
> > > > return err;
> > > > }
> > > >
> > > > -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t
> > > > *pgd,
> > > > - unsigned long addr,
> > > > unsigned long
> > > > end,
> > > > - pte_fn_t fn, void *data)
> > > > +static int apply_to_p4d_range(struct pfn_range_apply *closure,
> > > > pgd_t *pgd,
> > > > + unsigned long addr, unsigned long
> > > > end)
> > > > {
> > > > p4d_t *p4d;
> > > > unsigned long next;
> > > > - int err;
> > > > + int err = 0;
> > > >
> > > > - p4d = p4d_alloc(mm, pgd, addr);
> > > > + p4d = p4d_alloc(closure->mm, pgd, addr);
> > > > if (!p4d)
> > > > return -ENOMEM;
> > > > +
> > > > do {
> > > > next = p4d_addr_end(addr, end);
> > > > - err = apply_to_pud_range(mm, p4d, addr, next,
> > > > fn,
> > > > data);
> > > > + if (!closure->alloc &&
> > > > p4d_none_or_clear_bad(p4d))
> > > > + continue;
> > > > + err = apply_to_pud_range(closure, p4d, addr,
> > > > next);
> > > > if (err)
> > > > break;
> > > > } while (p4d++, addr = next, addr != end);
> > > > return err;
> > > > }
> > > >
> > > > -/*
> > > > - * Scan a region of virtual memory, filling in page tables as
> > > > necessary
> > > > - * and calling a provided function on each leaf page table.
> > > > +/**
> > > > + * apply_to_pfn_range - Scan a region of virtual memory,
> > > > calling a
> > > > provided
> > > > + * function on each leaf page table entry
> > > > + * @closure: Details about how to scan and what function to
> > > > apply
> > > > + * @addr: Start virtual address
> > > > + * @size: Size of the region
> > > > + *
> > > > + * If @closure->alloc is set to 1, the function will fill in
> > > > the
> > > > page table
> > > > + * as necessary. Otherwise it will skip non-present parts.
> > > > + * Note: The caller must ensure that the range does not
> > > > contain
> > > > huge pages.
> > > > + * The caller must also assure that the proper mmu_notifier
> > > > functions are
> > > > + * called. Either in the pte leaf function or before and after
> > > > the
> > > > call to
> > > > + * apply_to_pfn_range.
> > >
> > > This is wrong there should be a big FAT warning that this can
> > > only be
> > > use
> > > against mmap of device file. The page table walking above is
> > > broken
> > > for
> > > various thing you might find in any other vma like THP, device
> > > pte,
> > > hugetlbfs,
> >
> > I was figuring since we didn't export the function anymore, the
> > warning
> > and checks could be left to its users, assuming that any other
> > future
> > usage of this function would require mm people audit anyway. But I
> > can
> > of course add that warning also to this function if you still want
> > that?
>
> Yeah more warning are better, people might start using this, i know
> some people use unexported symbol and then report bugs while they
> just were doing something illegal.
>
> > > ...
> > >
> > > Also the mmu notifier can not be call from the pfn callback as
> > > that
> > > callback
> > > happens under page table lock (the change_pte notifier callback
> > > is
> > > useless
> > > and not enough). So it _must_ happen around the call to
> > > apply_to_pfn_range
> >
> > In the comments I was having in mind usage of, for example
> > ptep_clear_flush_notify(). But you're the mmu_notifier expert here.
> > Are
> > you saying that function by itself would not be sufficient?
> > In that case, should I just scratch the text mentioning the pte
> > leaf
> > function?
>
> ptep_clear_flush_notify() is useless ... i have posted patches to
> either
> restore it or remove it. In any case you must call mmu notifier range
> and
> they can not happen under lock. You usage looked fine (in the next
> patch)
> but i would rather have a bit of comment here to make sure people are
> also
> aware of that.
>
> While we can hope that people would cc mm when using mm function, it
> is
> not always the case. So i rather be cautious and warn in comment as
> much
> as possible.
>
OK. Understood. All this actually makes me tend to want to try a bit
harder using a slight modification to the pagewalk code instead. Don't
really want to encourage two parallel code paths doing essentially the
same thing; one good and one bad.
One thing that confuses me a bit with the pagewalk code is that callers
(for example softdirty) typically call
mmu_notifier_invalidate_range_start() around the pagewalk, but then if
it ends up splitting a pmd, mmu_notifier_invalidate_range is called
again, within the first range. Docs aren't really clear whether that's
permitted or not. Is it?
Thanks,
Thomas
> Cheers,
> Jérôme
On Mon, 2019-04-15 at 08:34 +0200, Christian König wrote:
> Am 12.04.19 um 18:04 schrieb Thomas Hellstrom:
> > With the vmwgfx dirty tracking, the default TTM fault handler is
> > not
> > completely sufficient (vmwgfx need to modify the vma->vm_flags
> > member,
> > and also needs to restrict the number of prefaults).
> >
> > We also want to replicate the new ttm_bo_vm_reserve() functionality
> >
> > So start turning the TTM vm code into helpers:
> > ttm_bo_vm_fault_reserved()
> > and ttm_bo_vm_reserve(), and provide a default TTM fault handler
> > for other
> > drivers to use.
> >
> > Cc: "Christian König" <[email protected]>
> > Signed-off-by: Thomas Hellstrom <[email protected]>
>
> Two nit picks below, apart from that looks good to me as well.
Thanks Christian,
I'll incorporate those.
/Thomas
>
> > ---
> > drivers/gpu/drm/ttm/ttm_bo_vm.c | 170 ++++++++++++++++++++-------
> > -----
> > include/drm/ttm/ttm_bo_api.h | 10 ++
> > 2 files changed, 116 insertions(+), 64 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > index bfb25b81fed7..3bd28fb97124 100644
> > --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > @@ -42,8 +42,6 @@
> > #include <linux/uaccess.h>
> > #include <linux/mem_encrypt.h>
> >
> > -#define TTM_BO_VM_NUM_PREFAULT 16
> > -
> > static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object
> > *bo,
> > struct vm_fault *vmf)
> > {
> > @@ -106,31 +104,30 @@ static unsigned long ttm_bo_io_mem_pfn(struct
> > ttm_buffer_object *bo,
> > + page_offset;
> > }
> >
> > -static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> > +/**
> > + * ttm_bo_vm_reserve - Reserve a buffer object in a retryable vm
> > callback
> > + * @bo: The buffer object
> > + * @vmf: The fault structure handed to the callback
> > + *
> > + * vm callbacks like fault() and *_mkwrite() allow for the mm_sem
> > to be dropped
> > + * during long waits, and after the wait the callback will be
> > restarted. This
> > + * is to allow other threads using the same virtual memory space
> > concurrent
> > + * access to map(), unmap() completely unrelated buffer objects.
> > TTM buffer
> > + * object reservations sometimes wait for GPU and should therefore
> > be
> > + * considered long waits. This function reserves the buffer object
> > interruptibly
> > + * taking this into account. Starvation is avoided by the vm
> > system not
> > + * allowing too many repeated restarts.
> > + * This function is intended to be used in customized fault() and
> > _mkwrite()
> > + * handlers.
> > + *
> > + * Return:
> > + * 0 on success and the bo was reserved.
> > + * VM_FAULT_RETRY if blocking wait.
> > + * VM_FAULT_NOPAGE if blocking wait and retrying was not
> > allowed.
> > + */
> > +vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
> > + struct vm_fault *vmf)
> > {
> > - struct vm_area_struct *vma = vmf->vma;
> > - struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> > - vma->vm_private_data;
> > - struct ttm_bo_device *bdev = bo->bdev;
> > - unsigned long page_offset;
> > - unsigned long page_last;
> > - unsigned long pfn;
> > - struct ttm_tt *ttm = NULL;
> > - struct page *page;
> > - int err;
> > - int i;
> > - vm_fault_t ret = VM_FAULT_NOPAGE;
> > - unsigned long address = vmf->address;
> > - struct ttm_mem_type_manager *man =
> > - &bdev->man[bo->mem.mem_type];
> > - struct vm_area_struct cvma;
> > -
> > - /*
> > - * Work around locking order reversal in fault / nopfn
> > - * between mmap_sem and bo_reserve: Perform a trylock operation
> > - * for reserve, and if it fails, retry the fault after waiting
> > - * for the buffer to become unreserved.
> > - */
> > if (unlikely(!reservation_object_trylock(bo->resv))) {
> > if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
> > if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
> > @@ -151,14 +148,56 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > return VM_FAULT_NOPAGE;
> > }
> >
> > + return 0;
> > +}
> > +EXPORT_SYMBOL(ttm_bo_vm_reserve);
> > +
> > +/**
> > + * ttm_bo_vm_fault_reserved - TTM fault helper
> > + * @vmf: The struct vm_fault given as argument to the fault
> > callback
> > + * @cvma: The struct vmw_area_struct affected. Note that this may
> > be a
> > + * copy of the real vma object if the caller needs, for example,
> > VM
> > + * flags to be temporarily altered while determining the page
> > protection.
> > + * @num_prefault: Maximum number of prefault pages. The caller may
> > want to
> > + * specify this based on madvice settings and the size of the GPU
> > object
> > + * backed by the memory.
> > + *
> > + * This function inserts one or more page table entries pointing
> > to the
> > + * memory backing the buffer object, and then returns a return
> > code
> > + * instructing the caller to retry the page access.
> > + *
> > + * Return:
> > + * VM_FAULT_NOPAGE on success or pending signal
> > + * VM_FAULT_SIGBUS on unspecified error
> > + * VM_FAULT_OOM on out-of-memory
> > + * VM_FAULT_RETRY if retryable wait
> > + */
> > +vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> > + struct vm_area_struct *cvma,
> > + pgoff_t num_prefault)
> > +{
> > + struct vm_area_struct *vma = vmf->vma;
> > + struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> > + vma->vm_private_data;
> > + struct ttm_bo_device *bdev = bo->bdev;
> > + unsigned long page_offset;
> > + unsigned long page_last;
> > + unsigned long pfn;
> > + struct ttm_tt *ttm = NULL;
> > + struct page *page;
> > + int err;
> > + pgoff_t i;
> > + vm_fault_t ret = VM_FAULT_NOPAGE;
> > + unsigned long address = vmf->address;
> > + struct ttm_mem_type_manager *man =
> > + &bdev->man[bo->mem.mem_type];
> > +
> > /*
> > * Refuse to fault imported pages. This should be handled
> > * (if at all) by redirecting mmap to the exporter.
> > */
> > - if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG)) {
> > - ret = VM_FAULT_SIGBUS;
> > - goto out_unlock;
> > - }
> > + if (bo->ttm && (bo->ttm->page_flags & TTM_PAGE_FLAG_SG))
> > + return VM_FAULT_SIGBUS;
> >
> > if (bdev->driver->fault_reserve_notify) {
> > struct dma_fence *moving = dma_fence_get(bo->moving);
> > @@ -169,11 +208,9 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > break;
> > case -EBUSY:
> > case -ERESTARTSYS:
> > - ret = VM_FAULT_NOPAGE;
> > - goto out_unlock;
> > + return VM_FAULT_NOPAGE;
> > default:
> > - ret = VM_FAULT_SIGBUS;
> > - goto out_unlock;
> > + return VM_FAULT_SIGBUS;
> > }
> >
> > if (bo->moving != moving) {
> > @@ -189,24 +226,15 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > * move.
> > */
> > ret = ttm_bo_vm_fault_idle(bo, vmf);
> > - if (unlikely(ret != 0)) {
> > - if (ret == VM_FAULT_RETRY &&
> > - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
> > - /* The BO has already been unreserved. */
> > - return ret;
> > - }
> > -
> > - goto out_unlock;
> > - }
> > + if (unlikely(ret != 0))
> > + return ret;
> >
> > err = ttm_mem_io_lock(man, true);
> > - if (unlikely(err != 0)) {
> > - ret = VM_FAULT_NOPAGE;
> > - goto out_unlock;
> > - }
> > + if (unlikely(err != 0))
> > + return VM_FAULT_NOPAGE;
> > err = ttm_mem_io_reserve_vm(bo);
> > if (unlikely(err != 0)) {
> > - ret = VM_FAULT_SIGBUS;
> > + return VM_FAULT_SIGBUS;
> > goto out_io_unlock;
>
> This goto is now superfluous.
>
> > }
> >
> > @@ -220,17 +248,11 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > goto out_io_unlock;
> > }
> >
> > - /*
> > - * Make a local vma copy to modify the page_prot member
> > - * and vm_flags if necessary. The vma parameter is protected
> > - * by mmap_sem in write mode.
> > - */
> > - cvma = *vma;
> > - cvma.vm_page_prot = vm_get_page_prot(cvma.vm_flags);
> > + cvma->vm_page_prot = vm_get_page_prot(cvma->vm_flags);
> >
> > if (bo->mem.bus.is_iomem) {
> > - cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
> > - cvma.vm_page_prot);
> > + cvma->vm_page_prot = ttm_io_prot(bo->mem.placement,
> > + cvma->vm_page_prot);
> > } else {
> > struct ttm_operation_ctx ctx = {
> > .interruptible = false,
> > @@ -240,8 +262,8 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > };
> >
> > ttm = bo->ttm;
> > - cvma.vm_page_prot = ttm_io_prot(bo->mem.placement,
> > - cvma.vm_page_prot);
> > + cvma->vm_page_prot = ttm_io_prot(bo->mem.placement,
> > + cvma->vm_page_prot);
> >
> > /* Allocate all page at once, most common usage */
> > if (ttm_tt_populate(ttm, &ctx)) {
> > @@ -254,10 +276,11 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > * Speculatively prefault a number of pages. Only error on
> > * first page.
> > */
> > - for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
> > + for (i = 0; i < num_prefault; ++i) {
> > if (bo->mem.bus.is_iomem) {
> > /* Iomem should not be marked encrypted */
> > - cvma.vm_page_prot =
> > pgprot_decrypted(cvma.vm_page_prot);
> > + cvma->vm_page_prot =
> > + pgprot_decrypted(cvma->vm_page_prot);
> > pfn = ttm_bo_io_mem_pfn(bo, page_offset);
> > } else {
> > page = ttm->pages[page_offset];
> > @@ -273,10 +296,10 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > }
> >
> > if (vma->vm_flags & VM_MIXEDMAP)
> > - ret = vmf_insert_mixed(&cvma, address,
> > + ret = vmf_insert_mixed(cvma, address,
> > __pfn_to_pfn_t(pfn, PFN_DEV));
> > else
> > - ret = vmf_insert_pfn(&cvma, address, pfn);
> > + ret = vmf_insert_pfn(cvma, address, pfn);
> >
> > /*
> > * Somebody beat us to this PTE or prefaulting to
> > @@ -295,7 +318,26 @@ static vm_fault_t ttm_bo_vm_fault(struct
> > vm_fault *vmf)
> > ret = VM_FAULT_NOPAGE;
> > out_io_unlock:
> > ttm_mem_io_unlock(man);
> > -out_unlock:
> > + return ret;
> > +}
> > +EXPORT_SYMBOL(ttm_bo_vm_fault_reserved);
> > +
> > +static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
> > +{
> > + struct vm_area_struct *vma = vmf->vma;
> > + struct vm_area_struct cvma = *vma;
> > + struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> > + vma->vm_private_data;
>
> That extra cast can be dropped, the vm_private_data member is a
> void*
> anyway.
>
> Regards,
> Christian.
>
> > + vm_fault_t ret;
> > +
> > + ret = ttm_bo_vm_reserve(bo, vmf);
> > + if (ret)
> > + return ret;
> > +
> > + ret = ttm_bo_vm_fault_reserved(vmf, &cvma,
> > TTM_BO_VM_NUM_PREFAULT);
> > + if (ret == VM_FAULT_RETRY && !(vmf->flags &
> > FAULT_FLAG_RETRY_NOWAIT))
> > + return ret;
> > +
> > reservation_object_unlock(bo->resv);
> > return ret;
> > }
> > diff --git a/include/drm/ttm/ttm_bo_api.h
> > b/include/drm/ttm/ttm_bo_api.h
> > index 49d9cdfc58f2..bebfa16426ca 100644
> > --- a/include/drm/ttm/ttm_bo_api.h
> > +++ b/include/drm/ttm/ttm_bo_api.h
> > @@ -768,4 +768,14 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,
> > struct ttm_operation_ctx *ctx);
> > void ttm_bo_swapout_all(struct ttm_bo_device *bdev);
> > int ttm_bo_wait_unreserved(struct ttm_buffer_object *bo);
> > +
> > +/* Default number of pre-faulted pages in the TTM fault handler */
> > +#define TTM_BO_VM_NUM_PREFAULT 16
> > +
> > +vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
> > + struct vm_fault *vmf);
> > +
> > +vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> > + struct vm_area_struct *cvma,
> > + pgoff_t num_prefault);
> > #endif
Hi, Souptick,
On Sat, 2019-04-13 at 20:41 +0530, Souptick Joarder wrote:
> On Fri, Apr 12, 2019 at 9:34 PM Thomas Hellstrom <
> [email protected]> wrote:
> > Driver fault callbacks are allowed to drop the mmap_sem when
> > expecting
> > long hardware waits to avoid blocking other mm users. Allow the
> > mkwrite
> > callbacks to do the same by returning early on VM_FAULT_RETRY.
> >
> > In particular we want to be able to drop the mmap_sem when waiting
> > for
> > a reservation object lock on a GPU buffer object. These locks may
> > be
> > held while waiting for the GPU.
> >
> > Cc: Andrew Morton <[email protected]>
> > Cc: Matthew Wilcox <[email protected]>
> > Cc: Will Deacon <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Rik van Riel <[email protected]>
> > Cc: Minchan Kim <[email protected]>
> > Cc: Michal Hocko <[email protected]>
> > Cc: Huang Ying <[email protected]>
> > Cc: Souptick Joarder <[email protected]>
> > Cc: "Jérôme Glisse" <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> >
> > Signed-off-by: Thomas Hellstrom <[email protected]>
> > ---
> > mm/memory.c | 10 ++++++----
> > 1 file changed, 6 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index e11ca9dd823f..a95b4a3b1ae2 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct
> > vm_fault *vmf)
> > ret = vmf->vma->vm_ops->page_mkwrite(vmf);
> > /* Restore original flags so that caller is not surprised
> > */
> > vmf->flags = old_flags;
> > - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> > + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY |
> > VM_FAULT_NOPAGE)))
>
> With this patch there will be multiple instances of (VM_FAULT_ERROR |
> VM_FAULT_RETRY | VM_FAULT_NOPAGE)
> in mm/memory.c. Does it make sense to wrap it in a macro and use it ?
Even though the code will look neater, it might be trickier to follow a
particular error path. Could we perhaps postpone to a follow-up patch?
Thomas
>
> > return ret;
> > if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> > lock_page(page);
> > @@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct
> > vm_fault *vmf)
> > pte_unmap_unlock(vmf->pte, vmf->ptl);
> > vmf->flags |= FAULT_FLAG_MKWRITE;
> > ret = vma->vm_ops->pfn_mkwrite(vmf);
> > - if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
> > + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY |
> > VM_FAULT_NOPAGE))
> > return ret;
> > return finish_mkwrite_fault(vmf);
> > }
> > @@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct
> > vm_fault *vmf)
> > pte_unmap_unlock(vmf->pte, vmf->ptl);
> > tmp = do_page_mkwrite(vmf);
> > if (unlikely(!tmp || (tmp &
> > - (VM_FAULT_ERROR |
> > VM_FAULT_NOPAGE)))) {
> > + (VM_FAULT_ERROR |
> > VM_FAULT_RETRY |
> > + VM_FAULT_NOPAGE)))) {
> > put_page(vmf->page);
> > return tmp;
> > }
> > @@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct
> > vm_fault *vmf)
> > unlock_page(vmf->page);
> > tmp = do_page_mkwrite(vmf);
> > if (unlikely(!tmp ||
> > - (tmp & (VM_FAULT_ERROR |
> > VM_FAULT_NOPAGE)))) {
> > + (tmp & (VM_FAULT_ERROR |
> > VM_FAULT_RETRY |
> > + VM_FAULT_NOPAGE)))) {
> > put_page(vmf->page);
> > return tmp;
> > }
> > --
> > 2.20.1
> >
On Wed, Apr 17, 2019 at 4:28 PM Thomas Hellstrom <[email protected]> wrote:
>
> Hi, Souptick,
>
> On Sat, 2019-04-13 at 20:41 +0530, Souptick Joarder wrote:
> > On Fri, Apr 12, 2019 at 9:34 PM Thomas Hellstrom <
> > [email protected]> wrote:
> > > Driver fault callbacks are allowed to drop the mmap_sem when
> > > expecting
> > > long hardware waits to avoid blocking other mm users. Allow the
> > > mkwrite
> > > callbacks to do the same by returning early on VM_FAULT_RETRY.
> > >
> > > In particular we want to be able to drop the mmap_sem when waiting
> > > for
> > > a reservation object lock on a GPU buffer object. These locks may
> > > be
> > > held while waiting for the GPU.
> > >
> > > Cc: Andrew Morton <[email protected]>
> > > Cc: Matthew Wilcox <[email protected]>
> > > Cc: Will Deacon <[email protected]>
> > > Cc: Peter Zijlstra <[email protected]>
> > > Cc: Rik van Riel <[email protected]>
> > > Cc: Minchan Kim <[email protected]>
> > > Cc: Michal Hocko <[email protected]>
> > > Cc: Huang Ying <[email protected]>
> > > Cc: Souptick Joarder <[email protected]>
> > > Cc: "Jérôme Glisse" <[email protected]>
> > > Cc: [email protected]
> > > Cc: [email protected]
> > >
> > > Signed-off-by: Thomas Hellstrom <[email protected]>
> > > ---
> > > mm/memory.c | 10 ++++++----
> > > 1 file changed, 6 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index e11ca9dd823f..a95b4a3b1ae2 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct
> > > vm_fault *vmf)
> > > ret = vmf->vma->vm_ops->page_mkwrite(vmf);
> > > /* Restore original flags so that caller is not surprised
> > > */
> > > vmf->flags = old_flags;
> > > - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> > > + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY |
> > > VM_FAULT_NOPAGE)))
> >
> > With this patch there will be multiple instances of (VM_FAULT_ERROR |
> > VM_FAULT_RETRY | VM_FAULT_NOPAGE)
> > in mm/memory.c. Does it make sense to wrap it in a macro and use it ?
>
> Even though the code will look neater, it might be trickier to follow a
> particular error path. Could we perhaps postpone to a follow-up patch?
Sure. follow-up-patch is fine.
>
> Thomas
>
>
>
> >
> > > return ret;
> > > if (unlikely(!(ret & VM_FAULT_LOCKED))) {
> > > lock_page(page);
> > > @@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct
> > > vm_fault *vmf)
> > > pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > vmf->flags |= FAULT_FLAG_MKWRITE;
> > > ret = vma->vm_ops->pfn_mkwrite(vmf);
> > > - if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
> > > + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY |
> > > VM_FAULT_NOPAGE))
> > > return ret;
> > > return finish_mkwrite_fault(vmf);
> > > }
> > > @@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct
> > > vm_fault *vmf)
> > > pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > tmp = do_page_mkwrite(vmf);
> > > if (unlikely(!tmp || (tmp &
> > > - (VM_FAULT_ERROR |
> > > VM_FAULT_NOPAGE)))) {
> > > + (VM_FAULT_ERROR |
> > > VM_FAULT_RETRY |
> > > + VM_FAULT_NOPAGE)))) {
> > > put_page(vmf->page);
> > > return tmp;
> > > }
> > > @@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct
> > > vm_fault *vmf)
> > > unlock_page(vmf->page);
> > > tmp = do_page_mkwrite(vmf);
> > > if (unlikely(!tmp ||
> > > - (tmp & (VM_FAULT_ERROR |
> > > VM_FAULT_NOPAGE)))) {
> > > + (tmp & (VM_FAULT_ERROR |
> > > VM_FAULT_RETRY |
> > > + VM_FAULT_NOPAGE)))) {
> > > put_page(vmf->page);
> > > return tmp;
> > > }
> > > --
> > > 2.20.1
> > >
On Wed, Apr 17, 2019 at 09:15:52AM +0000, Thomas Hellstrom wrote:
> On Tue, 2019-04-16 at 10:46 -0400, Jerome Glisse wrote:
> > On Sat, Apr 13, 2019 at 08:34:02AM +0000, Thomas Hellstrom wrote:
> > > Hi, Jérôme
> > >
> > > On Fri, 2019-04-12 at 17:07 -0400, Jerome Glisse wrote:
> > > > On Fri, Apr 12, 2019 at 04:04:18PM +0000, Thomas Hellstrom wrote:
[...]
> > > > > -/*
> > > > > - * Scan a region of virtual memory, filling in page tables as
> > > > > necessary
> > > > > - * and calling a provided function on each leaf page table.
> > > > > +/**
> > > > > + * apply_to_pfn_range - Scan a region of virtual memory,
> > > > > calling a
> > > > > provided
> > > > > + * function on each leaf page table entry
> > > > > + * @closure: Details about how to scan and what function to
> > > > > apply
> > > > > + * @addr: Start virtual address
> > > > > + * @size: Size of the region
> > > > > + *
> > > > > + * If @closure->alloc is set to 1, the function will fill in
> > > > > the
> > > > > page table
> > > > > + * as necessary. Otherwise it will skip non-present parts.
> > > > > + * Note: The caller must ensure that the range does not
> > > > > contain
> > > > > huge pages.
> > > > > + * The caller must also assure that the proper mmu_notifier
> > > > > functions are
> > > > > + * called. Either in the pte leaf function or before and after
> > > > > the
> > > > > call to
> > > > > + * apply_to_pfn_range.
> > > >
> > > > This is wrong there should be a big FAT warning that this can
> > > > only be
> > > > use
> > > > against mmap of device file. The page table walking above is
> > > > broken
> > > > for
> > > > various thing you might find in any other vma like THP, device
> > > > pte,
> > > > hugetlbfs,
> > >
> > > I was figuring since we didn't export the function anymore, the
> > > warning
> > > and checks could be left to its users, assuming that any other
> > > future
> > > usage of this function would require mm people audit anyway. But I
> > > can
> > > of course add that warning also to this function if you still want
> > > that?
> >
> > Yeah more warning are better, people might start using this, i know
> > some people use unexported symbol and then report bugs while they
> > just were doing something illegal.
> >
> > > > ...
> > > >
> > > > Also the mmu notifier can not be call from the pfn callback as
> > > > that
> > > > callback
> > > > happens under page table lock (the change_pte notifier callback
> > > > is
> > > > useless
> > > > and not enough). So it _must_ happen around the call to
> > > > apply_to_pfn_range
> > >
> > > In the comments I was having in mind usage of, for example
> > > ptep_clear_flush_notify(). But you're the mmu_notifier expert here.
> > > Are
> > > you saying that function by itself would not be sufficient?
> > > In that case, should I just scratch the text mentioning the pte
> > > leaf
> > > function?
> >
> > ptep_clear_flush_notify() is useless ... i have posted patches to
> > either
> > restore it or remove it. In any case you must call mmu notifier range
> > and
> > they can not happen under lock. You usage looked fine (in the next
> > patch)
> > but i would rather have a bit of comment here to make sure people are
> > also
> > aware of that.
> >
> > While we can hope that people would cc mm when using mm function, it
> > is
> > not always the case. So i rather be cautious and warn in comment as
> > much
> > as possible.
> >
>
> OK. Understood. All this actually makes me tend to want to try a bit
> harder using a slight modification to the pagewalk code instead. Don't
> really want to encourage two parallel code paths doing essentially the
> same thing; one good and one bad.
>
> One thing that confuses me a bit with the pagewalk code is that callers
> (for example softdirty) typically call
> mmu_notifier_invalidate_range_start() around the pagewalk, but then if
> it ends up splitting a pmd, mmu_notifier_invalidate_range is called
> again, within the first range. Docs aren't really clear whether that's
> permitted or not. Is it?
It is mandatory, i.e. you have to call mmu_notifier_invalidate_range()
in some cases. This is all documented in mmu_notifier.h (see the struct
mmu_notifier_ops comments) and also in Documentation/vm/mmu_notifier.rst.
Roughly, any time you go from one valid pte (pmd/pud/p4d) to another
valid pte (pmd/pud/p4d) with a different page, you have to call it
after clearing the pte (pmd/pud/p4d) and before replacing it with its
new value. Changing permissions on the same page, i.e. going from read
and write to read only, or read only to read and write, does not require
any extra call to mmu_notifier_invalidate_range().
The mmu_notifier_invalidate_range() call is important for IOMMUs with
ATS/PASID, as that is when they flush the TLB and the remote device TLB.
So you must flush those secondary TLBs after clearing the entry, so that
nothing can race to repopulate the TLB, and before setting the new entry,
so that at no point in time can any hardware wrongly access the old page
while a new page is already active.
Hope that clarifies it. By the way, if you see any possible improvement
to the mmu-notifier doc, it would be more than welcome. I try to put
comments in enough places that people should see at least one of them,
but maybe I missed a place where I should have put a comment pointing
to the doc :)
Cheers,
Jérôme
Hi Thomas,
With minor comments below
Reviewed-by: Deepak Rawat <[email protected]>
On Fri, 2019-04-12 at 09:04 -0700, Thomas Hellstrom wrote:
> This infrastructure will, for coherent resources, make sure that
> from the user-space point of view, data written by the CPU is
> immediately
> automatically available to the GPU at resource validation time.
>
> Signed-off-by: Thomas Hellstrom <[email protected]>
> ---
> drivers/gpu/drm/vmwgfx/Kconfig | 1 +
> drivers/gpu/drm/vmwgfx/Makefile | 2 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 5 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 5 +
> drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 26 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 1 -
> drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 410
> ++++++++++++++++++
> drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 57 +++
> drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 11 +
> drivers/gpu/drm/vmwgfx/vmwgfx_validation.c | 74 ++++
> drivers/gpu/drm/vmwgfx/vmwgfx_validation.h | 16 +-
> 11 files changed, 588 insertions(+), 20 deletions(-)
> create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
>
> diff --git a/drivers/gpu/drm/vmwgfx/Kconfig
> b/drivers/gpu/drm/vmwgfx/Kconfig
> index 6b28a326f8bb..d5fd81a521f6 100644
> --- a/drivers/gpu/drm/vmwgfx/Kconfig
> +++ b/drivers/gpu/drm/vmwgfx/Kconfig
> @@ -8,6 +8,7 @@ config DRM_VMWGFX
> select FB_CFB_IMAGEBLIT
> select DRM_TTM
> select FB
> + select AS_DIRTY_HELPERS
> # Only needed for the transitional use of drm_crtc_init - can
> be removed
> # again once vmwgfx sets up the primary plane itself.
> select DRM_KMS_HELPER
> diff --git a/drivers/gpu/drm/vmwgfx/Makefile
> b/drivers/gpu/drm/vmwgfx/Makefile
> index 8841bd30e1e5..c877a21a0739 100644
> --- a/drivers/gpu/drm/vmwgfx/Makefile
> +++ b/drivers/gpu/drm/vmwgfx/Makefile
> @@ -8,7 +8,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o
> vmwgfx_kms.o vmwgfx_drv.o \
> vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \
> vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o
> \
> vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
> - vmwgfx_validation.o \
> + vmwgfx_validation.o vmwgfx_page_dirty.o \
> ttm_object.o ttm_lock.o
>
> obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> index c0829d50eecc..90ca866640fe 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> @@ -463,6 +463,7 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
> {
> struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
>
> + WARN_ON(vmw_bo->dirty);
> vmw_bo_unmap(vmw_bo);
> kfree(vmw_bo);
> }
> @@ -476,8 +477,10 @@ void vmw_bo_bo_free(struct ttm_buffer_object
> *bo)
> static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
> {
> struct vmw_user_buffer_object *vmw_user_bo =
> vmw_user_buffer_object(bo);
> + struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
>
> - vmw_bo_unmap(&vmw_user_bo->vbo);
> + WARN_ON(vbo->dirty);
Is it possible for user-space to exploit this WARN? If yes then you
might want to change the logic?
> + vmw_bo_unmap(vbo);
> ttm_prime_object_kfree(vmw_user_bo, prime);
> }
>
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> index 6165fe2c4504..74e94138877e 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> @@ -857,6 +857,11 @@ static int vmw_driver_load(struct drm_device
> *dev, unsigned long chipset)
> DRM_ERROR("Failed initializing TTM buffer object
> driver.\n");
> goto out_no_bdev;
> }
> + dev_priv->vm_ops = *dev_priv->bdev.vm_ops;
> + dev_priv->vm_ops.fault = vmw_bo_vm_fault;
> + dev_priv->vm_ops.pfn_mkwrite = vmw_bo_vm_mkwrite;
> + dev_priv->vm_ops.page_mkwrite = vmw_bo_vm_mkwrite;
> + dev_priv->bdev.vm_ops = &dev_priv->vm_ops;
>
> /*
> * Enable VRAM, but initially don't use it until SVGA is
> enabled and
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> index bd6919b90519..f05fce52fbb4 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> @@ -95,6 +95,7 @@ struct vmw_fpriv {
> * @dx_query_ctx: DX context if this buffer object is used as a DX
> query MOB
> * @map: Kmap object for semi-persistent mappings
> * @res_prios: Eviction priority counts for attached resources
> + * @dirty: structure for user-space dirty-tracking
> */
> struct vmw_buffer_object {
> struct ttm_buffer_object base;
> @@ -105,6 +106,7 @@ struct vmw_buffer_object {
> /* Protected by reservation */
> struct ttm_bo_kmap_obj map;
> u32 res_prios[TTM_MAX_BO_PRIORITY];
> + struct vmw_bo_dirty *dirty;
> };
>
> /**
> @@ -135,7 +137,8 @@ struct vmw_res_func;
> * @res_dirty: Resource contains data not yet in the backup buffer.
> Protected
> * by resource reserved.
> * @backup_dirty: Backup buffer contains data not yet in the HW
> resource.
> - * Protecte by resource reserved.
> + * Protected by resource reserved.
> + * @coherent: Emulate coherency by tracking vm accesses.
> * @backup: The backup buffer if any. Protected by resource
> reserved.
> * @backup_offset: Offset into the backup buffer if any. Protected
> by resource
> * reserved. Note that only a few resource types can have a
> @backup_offset
> @@ -152,14 +155,16 @@ struct vmw_res_func;
> * @hw_destroy: Callback to destroy the resource on the device, as
> part of
> * resource destruction.
> */
> +struct vmw_resource_dirty;
> struct vmw_resource {
> struct kref kref;
> struct vmw_private *dev_priv;
> int id;
> u32 used_prio;
> unsigned long backup_size;
> - bool res_dirty;
> - bool backup_dirty;
> + u32 res_dirty : 1;
> + u32 backup_dirty : 1;
Is there a reason you changed res_dirty and backup_dirty from bool to
u32? They are still used as bool, right?
> + u32 coherent : 1;
> struct vmw_buffer_object *backup;
> unsigned long backup_offset;
> unsigned long pin_count;
> @@ -167,6 +172,7 @@ struct vmw_resource {
> struct list_head lru_head;
> struct list_head mob_head;
> struct list_head binding_head;
> + struct vmw_resource_dirty *dirty;
> void (*res_free) (struct vmw_resource *res);
> void (*hw_destroy) (struct vmw_resource *res);
> };
> @@ -607,6 +613,9 @@ struct vmw_private {
>
> /* Validation memory reservation */
> struct vmw_validation_mem vvm;
> +
> + /* VM operations */
> + struct vm_operations_struct vm_ops;
> };
>
> static inline struct vmw_surface *vmw_res_to_srf(struct vmw_resource
> *res)
> @@ -723,6 +732,8 @@ extern void vmw_resource_evict_all(struct
> vmw_private *dev_priv);
> extern void vmw_resource_unbind_list(struct vmw_buffer_object *vbo);
> void vmw_resource_mob_attach(struct vmw_resource *res);
> void vmw_resource_mob_detach(struct vmw_resource *res);
> +void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t
> start,
> + pgoff_t end);
>
> /**
> * vmw_resource_mob_attached - Whether a resource currently has a
> mob attached
> @@ -1411,6 +1422,15 @@ int vmw_host_log(const char *log);
> #define VMW_DEBUG_USER(fmt,
> ...) \
> DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
>
> +/* Resource dirtying - vmwgfx_page_dirty.c */
> +void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo);
> +int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
> +void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
> +void vmw_bo_dirty_clear_res(struct vmw_resource *res);
> +void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
> +vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
> +vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
> +
> /**
> * Inline helper functions
> */
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> index da3ac0bc2e14..7cb22119f516 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> @@ -2483,7 +2483,6 @@ static int vmw_cmd_dx_check_subresource(struct
> vmw_private *dev_priv,
> offsetof(typeof(*cmd), sid));
>
> cmd = container_of(header, typeof(*cmd), header);
> -
> return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
> VMW_RES_DIRTY_NONE,
> user_surface_converter,
> &cmd->sid, NULL);
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> new file mode 100644
> index 000000000000..87e4a73b1175
> --- /dev/null
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> @@ -0,0 +1,410 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +/*******************************************************************
> *******
> + *
> + * Copyright 2019 VMware, Inc., Palo Alto, CA., USA
> + *
> + * Permission is hereby granted, free of charge, to any person
> obtaining a
> + * copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction,
> including
> + * without limitation the rights to use, copy, modify, merge,
> publish,
> + * distribute, sub license, and/or sell copies of the Software, and
> to
> + * permit persons to whom the Software is furnished to do so,
> subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice (including
> the
> + * next paragraph) shall be included in all copies or substantial
> portions
> + * of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO
> EVENT SHALL
> + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
> ANY CLAIM,
> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> TORT OR
> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> SOFTWARE OR THE
> + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> +
> *********************************************************************
> *****/
> +#include "vmwgfx_drv.h"
> +
> +/*
> + * Different methods for tracking dirty:
> + * VMW_BO_DIRTY_PAGETABLE - Scan the pagetable for hardware dirty
> bits
> + * VMW_BO_DIRTY_MKWRITE - Write-protect page table entries and
> record write-
> + * accesses in the VM mkwrite() callback
> + */
> +enum vmw_bo_dirty_method {
> + VMW_BO_DIRTY_PAGETABLE,
> + VMW_BO_DIRTY_MKWRITE,
> +};
> +
> +/*
> + * No dirtied pages at scan trigger a transition to the _MKWRITE
> method,
> + * similarly a certain percentage of dirty pages trigger a
> transition to
> + * the _PAGETABLE method. How many triggers should we wait for
> before
> + * changing method?
> + */
> +#define VMW_DIRTY_NUM_CHANGE_TRIGGERS 2
> +
> +/* Percentage to trigger a transition to the _PAGETABLE method */
> +#define VMW_DIRTY_PERCENTAGE 10
> +
> +/**
> + * struct vmw_bo_dirty - Dirty information for buffer objects
> + * @start: First currently dirty bit
> + * @end: Last currently dirty bit + 1
> + * @method: The currently used dirty method
> + * @change_count: Number of consecutive method change triggers
> + * @ref_count: Reference count for this structure
> + * @bitmap_size: The size of the bitmap in bits. Typically equal to
> the
> + * nuber of pages in the bo.
> + * @size: The accounting size for this struct.
> + * @bitmap: A bitmap where each bit represents a page. A set bit
> means a
> + * dirty page.
> + */
> +struct vmw_bo_dirty {
> + unsigned long start;
> + unsigned long end;
> + enum vmw_bo_dirty_method method;
> + unsigned int change_count;
> + unsigned int ref_count;
> + unsigned long bitmap_size;
> + size_t size;
> + unsigned long bitmap[0];
> +};
> +
> +/**
> + * vmw_bo_dirty_scan_pagetable - Perform a pagetable scan for dirty
> bits
> + * @vbo: The buffer object to scan
> + *
> + * Scans the pagetable for dirty bits. Clear those bits and modify
> the
> + * dirty structure with the results. This function may change the
> + * dirty-tracking method.
> + */
> +static void vmw_bo_dirty_scan_pagetable(struct vmw_buffer_object
> *vbo)
> +{
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> + pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
> + struct address_space *mapping = vbo->base.bdev->dev_mapping;
> + pgoff_t num_marked;
> +
> + num_marked = apply_as_clean(mapping,
> + offset, dirty->bitmap_size,
> + offset, &dirty->bitmap[0],
> + &dirty->start, &dirty->end);
> + if (num_marked == 0)
> + dirty->change_count++;
> + else
> + dirty->change_count = 0;
> +
> + if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
> + dirty->change_count = 0;
> + dirty->method = VMW_BO_DIRTY_MKWRITE;
> + apply_as_wrprotect(mapping,
> + offset, dirty->bitmap_size);
> + apply_as_clean(mapping,
> + offset, dirty->bitmap_size,
> + offset, &dirty->bitmap[0],
> + &dirty->start, &dirty->end);
> + }
> +}
> +
> +/**
> + * vmw_bo_dirty_scan_mkwrite - Reset the mkwrite dirty-tracking
> method
> + * @vbo: The buffer object to scan
> + *
> + * Write-protect pages written to so that consecutive write accesses
> will
> + * trigger a call to mkwrite.
> + *
> + * This function may change the dirty-tracking method.
> + */
> +static void vmw_bo_dirty_scan_mkwrite(struct vmw_buffer_object *vbo)
> +{
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> + unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
> + struct address_space *mapping = vbo->base.bdev->dev_mapping;
> + pgoff_t num_marked;
> +
> + if (dirty->end <= dirty->start)
> + return;
> +
> + num_marked = apply_as_wrprotect(vbo->base.bdev->dev_mapping,
> + dirty->start + offset,
> + dirty->end - dirty->start);
> +
> + if (100UL * num_marked / dirty->bitmap_size >
> + VMW_DIRTY_PERCENTAGE) {
> + dirty->change_count++;
> + } else {
> + dirty->change_count = 0;
> + }
> +
> + if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
> + pgoff_t start = 0;
> + pgoff_t end = dirty->bitmap_size;
> +
> + dirty->method = VMW_BO_DIRTY_PAGETABLE;
> + apply_as_clean(mapping, offset, end, offset, &dirty-
> >bitmap[0],
> + &start, &end);
> + bitmap_clear(&dirty->bitmap[0], 0, dirty->bitmap_size);
> + if (dirty->start < dirty->end)
> + bitmap_set(&dirty->bitmap[0], dirty->start,
> + dirty->end - dirty->start);
> + dirty->change_count = 0;
> + }
> +}
> +
> +
> +/**
> + * vmw_bo_dirty_scan - Scan for dirty pages and add them to the
> dirty
> + * tracking structure
> + * @vbo: The buffer object to scan
> + *
> + * This function may change the dirty tracking method.
> + */
> +void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo)
> +{
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> +
> + if (dirty->method == VMW_BO_DIRTY_PAGETABLE)
> + vmw_bo_dirty_scan_pagetable(vbo);
> + else
> + vmw_bo_dirty_scan_mkwrite(vbo);
> +}
> +
> +/**
> + * vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
> + * @vbo: The buffer object
> + *
> + * This function registers a dirty-tracking user to a buffer object.
> + * A user can be for example a resource or a vma in a special user-
> space
> + * mapping.
> + *
> + * Return: Zero on success, -ENOMEM on memory allocation failure.
> + */
> +int vmw_bo_dirty_add(struct vmw_buffer_object *vbo)
> +{
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> + pgoff_t num_pages = vbo->base.num_pages;
> + size_t size, acc_size;
> + int ret;
> + static struct ttm_operation_ctx ctx = {
> + .interruptible = false,
> + .no_wait_gpu = false
> + };
> +
> + if (dirty) {
> + dirty->ref_count++;
> + return 0;
> + }
> +
> + size = sizeof(*dirty) + BITS_TO_LONGS(num_pages) *
> sizeof(long);
> + acc_size = ttm_round_pot(size);
> + ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
> + if (ret) {
> + VMW_DEBUG_USER("Out of graphics memory for buffer
> object "
> + "dirty tracker.\n");
> + return ret;
> + }
> + dirty = kvzalloc(size, GFP_KERNEL);
> + if (!dirty) {
> + ret = -ENOMEM;
> + goto out_no_dirty;
> + }
> +
> + dirty->size = acc_size;
> + dirty->bitmap_size = num_pages;
> + dirty->start = dirty->bitmap_size;
> + dirty->end = 0;
> + dirty->ref_count = 1;
> + if (num_pages < PAGE_SIZE / sizeof(pte_t)) {
> + dirty->method = VMW_BO_DIRTY_PAGETABLE;
> + } else {
> + struct address_space *mapping = vbo->base.bdev-
> >dev_mapping;
> + pgoff_t offset = drm_vma_node_start(&vbo-
> >base.vma_node);
> +
> + dirty->method = VMW_BO_DIRTY_MKWRITE;
> +
> + /* Write-protect and then pick up already dirty bits */
> + apply_as_wrprotect(mapping, offset, num_pages);
> + apply_as_clean(mapping, offset, num_pages, offset,
> + &dirty->bitmap[0], &dirty->start,
> &dirty->end);
> + }
> +
> + vbo->dirty = dirty;
> +
> + return 0;
> +
> +out_no_dirty:
> + ttm_mem_global_free(&ttm_mem_glob, acc_size);
> + return ret;
> +}
> +
> +/**
> + * vmw_bo_dirty_release - Release a dirty-tracking user from a
> buffer object
> + * @vbo: The buffer object
> + *
> + * This function releases a dirty-tracking user from a buffer
> object.
> + * If the reference count reaches zero, then the dirty-tracking
> object is
> + * freed and the pointer to it cleared.
> + *
> + * Return: Zero on success, -ENOMEM on memory allocation failure.
> + */
> +void vmw_bo_dirty_release(struct vmw_buffer_object *vbo)
> +{
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> +
> + if (dirty && --dirty->ref_count == 0) {
> + size_t acc_size = dirty->size;
> +
> + kvfree(dirty);
> + ttm_mem_global_free(&ttm_mem_glob, acc_size);
> + vbo->dirty = NULL;
> + }
> +}
> +
> +/**
> + * vmw_bo_dirty_transfer_to_res - Pick up a resource's dirty region
> from
> + * its backing mob.
> + * @res: The resource
> + *
> + * This function will pick up all dirty ranges affecting the
> resource from
> + * it's backup mob, and call vmw_resource_dirty_update() once for
> each
> + * range. The transferred ranges will be cleared from the backing
> mob's
> + * dirty tracking.
> + */
> +void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res)
> +{
> + struct vmw_buffer_object *vbo = res->backup;
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> + pgoff_t start, cur, end;
> + unsigned long res_start = res->backup_offset;
> + unsigned long res_end = res->backup_offset + res->backup_size;
> +
> + WARN_ON_ONCE(res_start & ~PAGE_MASK);
> + res_start >>= PAGE_SHIFT;
> + res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
> +
> + if (res_start >= dirty->end || res_end <= dirty->start)
> + return;
> +
> + cur = max(res_start, dirty->start);
> + res_end = max(res_end, dirty->end);
> + while (cur < res_end) {
> + unsigned long num;
> +
> + start = find_next_bit(&dirty->bitmap[0], res_end, cur);
> + if (start >= res_end)
> + break;
> +
> + end = find_next_zero_bit(&dirty->bitmap[0], res_end,
> start + 1);
> + cur = end + 1;
> + num = end - start;
> + bitmap_clear(&dirty->bitmap[0], start, num);
> + vmw_resource_dirty_update(res, start, end);
> + }
> +
> + if (res_start <= dirty->start && res_end > dirty->start)
> + dirty->start = res_end;
> + if (res_start < dirty->end && res_end >= dirty->end)
> + dirty->end = res_start;
> +}
> +
> +/**
> + * vmw_bo_dirty_clear_res - Clear a resource's dirty region from
> + * its backing mob.
> + * @res: The resource
> + *
> + * This function will clar all dirty ranges affecting the resource
s/clar/clear
> from
> + * it's backup mob's dirty tracking.
> + */
> +void vmw_bo_dirty_clear_res(struct vmw_resource *res)
> +{
> + unsigned long res_start = res->backup_offset;
> + unsigned long res_end = res->backup_offset + res->backup_size;
> + struct vmw_buffer_object *vbo = res->backup;
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> +
> + res_start >>= PAGE_SHIFT;
> + res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
> +
> + if (res_start >= dirty->end || res_end <= dirty->start)
> + return;
> +
> + res_start = max(res_start, dirty->start);
> + res_end = min(res_end, dirty->end);
> + bitmap_clear(&dirty->bitmap[0], res_start, res_end -
> res_start);
> +
> + if (res_start <= dirty->start && res_end > dirty->start)
> + dirty->start = res_end;
> + if (res_start < dirty->end && res_end >= dirty->end)
> + dirty->end = res_start;
> +}
> +
> +/* vmw_bo_vm_mkwrite - The vmwgfx page_mkwrite() or pfn_mkwrite()
> callback */
Function documentation format: IMO, since this is a callback, you can get
rid of the above line. Same for vmw_bo_vm_fault.
> +vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf)
> +{
> + struct vm_area_struct *vma = vmf->vma;
> + struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> + vma->vm_private_data;
> + vm_fault_t ret;
> + unsigned long page_offset;
> + struct vmw_buffer_object *vbo =
> + container_of(bo, typeof(*vbo), base);
> +
> + ret = ttm_bo_vm_reserve(bo, vmf);
> + if (ret)
> + return ret;
> +
> + page_offset = vmf->pgoff - drm_vma_node_start(&bo->vma_node);
> + if (unlikely(page_offset >= bo->num_pages)) {
> + ret = VM_FAULT_SIGBUS;
> + goto out_unlock;
> + }
> +
> + if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE &&
> + !test_bit(page_offset, &vbo->dirty->bitmap[0])) {
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> +
> + __set_bit(page_offset, &dirty->bitmap[0]);
> + dirty->start = min(dirty->start, page_offset);
> + dirty->end = max(dirty->end, page_offset + 1);
> + }
> +
> +out_unlock:
> + reservation_object_unlock(bo->resv);
> + return ret;
> +}
> +
> +
> +/* vmw_bo_vm_fault - The vmwgfx fault() callback */
> +vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
> +{
> + struct vm_area_struct *vma = vmf->vma;
> + struct vm_area_struct cvma = *vma;
> + struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
> + vma->vm_private_data;
> + struct vmw_buffer_object *vbo =
> + container_of(bo, struct vmw_buffer_object, base);
> + pgoff_t num_prefault;
> + vm_fault_t ret;
> +
> + ret = ttm_bo_vm_reserve(bo, vmf);
> + if (ret)
> + return ret;
> +
> + /*
> + * This will cause mkwrite() to be called for each pte on
> + * write-enable vmas.
> + */
> + if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> + cvma.vm_flags &= ~VM_WRITE;
> +
> + num_prefault = (vma->vm_flags & VM_RAND_READ) ? 0 :
> + TTM_BO_VM_NUM_PREFAULT;
> + ret = ttm_bo_vm_fault_reserved(vmf, &cvma, num_prefault);
> + if (ret == VM_FAULT_RETRY && !(vmf->flags &
> FAULT_FLAG_RETRY_NOWAIT))
> + return ret;
> +
> + reservation_object_unlock(bo->resv);
> + return ret;
> +}
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> index 88981c4dbae3..d35f4bd32cd9 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> @@ -121,6 +121,10 @@ static void vmw_resource_release(struct kref
> *kref)
> }
> res->backup_dirty = false;
> vmw_resource_mob_detach(res);
> + if (res->dirty)
> + res->func->dirty_free(res);
> + if (res->coherent)
> + vmw_bo_dirty_release(res->backup);
> ttm_bo_unreserve(bo);
> vmw_bo_unreference(&res->backup);
> }
> @@ -210,7 +214,9 @@ int vmw_resource_init(struct vmw_private
> *dev_priv, struct vmw_resource *res,
> res->backup_offset = 0;
> res->backup_dirty = false;
> res->res_dirty = false;
> + res->coherent = false;
> res->used_prio = 3;
> + res->dirty = NULL;
> if (delay_id)
> return 0;
> else
> @@ -397,6 +403,30 @@ static int vmw_resource_do_validate(struct
> vmw_resource *res,
> vmw_resource_mob_attach(res);
> }
>
> + /*
> + * Handle the case where the backup mob is marked coherent but
> + * the resource isn't.
> + */
> + if (func->dirty_alloc && vmw_resource_mob_attached(res) &&
> + !res->coherent) {
> + if (res->backup->dirty && !res->dirty) {
> + ret = func->dirty_alloc(res);
> + if (ret)
> + return ret;
> + } else if (!res->backup->dirty && res->dirty) {
> + func->dirty_free(res);
> + }
> + }
> +
> + /*
> + * Transfer the dirty regions to the resource and update
> + * the resource.
> + */
> + if (res->dirty) {
> + vmw_bo_dirty_transfer_to_res(res);
> + return func->dirty_sync(res);
> + }
> +
> return 0;
>
> out_bind_failed:
> @@ -435,16 +465,28 @@ void vmw_resource_unreserve(struct vmw_resource
> *res,
> if (switch_backup && new_backup != res->backup) {
> if (res->backup) {
> vmw_resource_mob_detach(res);
> + if (res->coherent)
> + vmw_bo_dirty_release(res->backup);
> vmw_bo_unreference(&res->backup);
> }
>
> if (new_backup) {
> res->backup = vmw_bo_reference(new_backup);
> +
> + /*
> + * The validation code should already have
> added a
> + * dirty tracker here.
> + */
> + WARN_ON(res->coherent && !new_backup->dirty);
> +
> vmw_resource_mob_attach(res);
> } else {
> res->backup = NULL;
> }
> + } else if (switch_backup && res->coherent) {
> + vmw_bo_dirty_release(res->backup);
> }
> +
> if (switch_backup)
> res->backup_offset = new_backup_offset;
>
> @@ -1009,3 +1051,18 @@ enum vmw_res_type vmw_res_type(const struct
> vmw_resource *res)
> {
> return res->func->res_type;
> }
> +
> +/**
> + * vmw_resource_update_dirty - Update a resource's dirty tracker
> with a
> + * sequential range of touched backing store memory.
> + * @res: The resource.
> + * @start: The first page touched.
> + * @end: The last page touched + 1.
> + */
> +void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t
> start,
> + pgoff_t end)
> +{
> + if (res->dirty)
> + res->func->dirty_range_add(res, start << PAGE_SHIFT,
> + end << PAGE_SHIFT);
> +}
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> index 984e588c62ca..c85144286cfe 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> @@ -71,6 +71,12 @@ struct vmw_user_resource_conv {
> * @commit_notify: If the resource is a command buffer managed
> resource,
> * callback to notify that a define or remove
> command
> * has been committed to the device.
> + * @dirty_alloc: Allocate a dirty tracker. NULL if dirty-
> tracking is not
> + * supported.
> + * @dirty_free: Free the dirty tracker.
> + * @dirty_sync: Upload the dirty mob contents to the
> resource.
> + * @dirty_add_range: Add a sequential dirty range to the resource
> + * dirty tracker.
> */
> struct vmw_res_func {
> enum vmw_res_type res_type;
> @@ -90,6 +96,11 @@ struct vmw_res_func {
> struct ttm_validate_buffer *val_buf);
> void (*commit_notify)(struct vmw_resource *res,
> enum vmw_cmdbuf_res_state state);
> + int (*dirty_alloc)(struct vmw_resource *res);
> + void (*dirty_free)(struct vmw_resource *res);
> + int (*dirty_sync)(struct vmw_resource *res);
> + void (*dirty_range_add)(struct vmw_resource *res, size_t start,
> + size_t end);
> };
>
> /**
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> index f611b2290a1b..5b0c928bb5ba 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> @@ -33,8 +33,13 @@
> * struct vmw_validation_bo_node - Buffer object validation
> metadata.
> * @base: Metadata used for TTM reservation- and validation.
> * @hash: A hash entry used for the duplicate detection hash table.
> + * @coherent_count: If switching backup buffers, number of new
> coherent
> + * resources that will have this buffer as a backup buffer.
> * @as_mob: Validate as mob.
> * @cpu_blit: Validate for cpu blit access.
> + * @coherent_alloced: In switching backup buffers for coherent
> resources:
I don't see this field (@coherent_alloced) in the struct below; remove this.
> + * The bo dirty tracker has been allocated and needs to be freed if
> + * reverting.
> *
> * Bit fields are used since these structures are allocated and
> freed in
> * large numbers and space conservation is desired.
> @@ -42,6 +47,7 @@
> struct vmw_validation_bo_node {
> struct ttm_validate_buffer base;
> struct drm_hash_item hash;
> + unsigned int coherent_count;
> u32 as_mob : 1;
> u32 cpu_blit : 1;
> };
> @@ -459,6 +465,19 @@ int vmw_validation_res_reserve(struct
> vmw_validation_context *ctx,
> if (ret)
> goto out_unreserve;
> }
> +
> + if (val->switching_backup && val->new_backup &&
> + res->coherent) {
> + struct vmw_validation_bo_node *bo_node =
> + vmw_validation_find_bo_dup(ctx,
> + val-
> >new_backup);
> +
> + if (WARN_ON(!bo_node)) {
> + ret = -EINVAL;
> + goto out_unreserve;
> + }
> + bo_node->coherent_count++;
> + }
> }
>
> return 0;
> @@ -562,6 +581,9 @@ int vmw_validation_bo_validate(struct
> vmw_validation_context *ctx, bool intr)
> int ret;
>
> list_for_each_entry(entry, &ctx->bo_list, base.head) {
> + struct vmw_buffer_object *vbo =
> + container_of(entry->base.bo, typeof(*vbo),
> base);
> +
> if (entry->cpu_blit) {
> struct ttm_operation_ctx ctx = {
> .interruptible = intr,
> @@ -576,6 +598,27 @@ int vmw_validation_bo_validate(struct
> vmw_validation_context *ctx, bool intr)
> }
> if (ret)
> return ret;
> +
> + /*
> + * Rather than having the resource code allocating the
> bo
> + * dirty tracker in resource_unreserve() where we can't
> fail,
> + * Do it here when validating the buffer object.
> + */
> + if (entry->coherent_count) {
> + unsigned int coherent_count = entry-
> >coherent_count;
> +
> + while (coherent_count) {
> + ret = vmw_bo_dirty_add(vbo);
> + if (ret)
> + return ret;
> +
> + coherent_count--;
> + }
> + entry->coherent_count -= coherent_count;
> + }
> +
> + if (vbo->dirty)
> + vmw_bo_dirty_scan(vbo);
> }
> return 0;
> }
> @@ -828,3 +871,34 @@ int vmw_validation_preload_res(struct
> vmw_validation_context *ctx,
> ctx->mem_size_left += size;
> return 0;
> }
> +
> +/**
> + * vmw_validation_bo_backoff - Unreserve buffer objects registered
> with a
> + * validation context
> + * @ctx: The validation context
> + *
> + * This function unreserves the buffer objects previously reserved
> using
> + * vmw_validation_bo_reserve. It's typically used as part of an
> error path
> + */
> +void vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
> +{
> + struct vmw_validation_bo_node *entry;
> +
> + /*
> + * Switching coherent resource backup buffers failed.
> + * Release corresponding buffer object dirty trackers.
> + */
> + list_for_each_entry(entry, &ctx->bo_list, base.head) {
> + if (entry->coherent_count) {
> + unsigned int coherent_count = entry-
> >coherent_count;
> + struct vmw_buffer_object *vbo =
> + container_of(entry->base.bo,
> typeof(*vbo),
> + base);
> +
> + while (coherent_count--)
> + vmw_bo_dirty_release(vbo);
> + }
> + }
> +
> + ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
> +}
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> index 523f6ac5c335..058c7f2fbf83 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.h
> @@ -172,20 +172,6 @@ vmw_validation_bo_reserve(struct
> vmw_validation_context *ctx,
> NULL);
> }
>
> -/**
> - * vmw_validation_bo_backoff - Unreserve buffer objects registered
> with a
> - * validation context
> - * @ctx: The validation context
> - *
> - * This function unreserves the buffer objects previously reserved
> using
> - * vmw_validation_bo_reserve. It's typically used as part of an
> error path
> - */
> -static inline void
> -vmw_validation_bo_backoff(struct vmw_validation_context *ctx)
> -{
> - ttm_eu_backoff_reservation(&ctx->ticket, &ctx->bo_list);
> -}
> -
> /**
> * vmw_validation_bo_fence - Unreserve and fence buffer objects
> registered
> * with a validation context
> @@ -268,4 +254,6 @@ int vmw_validation_preload_res(struct
> vmw_validation_context *ctx,
> unsigned int size);
> void vmw_validation_res_set_dirty(struct vmw_validation_context
> *ctx,
> void *val_private, u32 dirty);
> +void vmw_validation_bo_backoff(struct vmw_validation_context *ctx);
> +
> #endif
> --
> 2.20.1
>
Reviewed-by: Deepak Rawat <[email protected]>
On Fri, 2019-04-12 at 09:04 -0700, Thomas Hellstrom wrote:
> With emulated coherent memory we need to be able to quickly look up
> a resource from the MOB offset. Instead of traversing a linked list
> with
> O(n) worst case, use an RBtree with O(log n) worst case complexity.
>
> Signed-off-by: Thomas Hellstrom <[email protected]>
> ---
> drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 5 ++--
> drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 10 +++----
> drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 33 +++++++++++++++++-----
> --
> 3 files changed, 32 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> index 90ca866640fe..e8bc7a7ac031 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> @@ -464,6 +464,7 @@ void vmw_bo_bo_free(struct ttm_buffer_object *bo)
> struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
>
> WARN_ON(vmw_bo->dirty);
> + WARN_ON(!RB_EMPTY_ROOT(&vmw_bo->res_tree));
> vmw_bo_unmap(vmw_bo);
> kfree(vmw_bo);
> }
> @@ -480,6 +481,7 @@ static void vmw_user_bo_destroy(struct
> ttm_buffer_object *bo)
> struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
>
> WARN_ON(vbo->dirty);
> + WARN_ON(!RB_EMPTY_ROOT(&vbo->res_tree));
> vmw_bo_unmap(vbo);
> ttm_prime_object_kfree(vmw_user_bo, prime);
> }
> @@ -515,8 +517,7 @@ int vmw_bo_init(struct vmw_private *dev_priv,
> memset(vmw_bo, 0, sizeof(*vmw_bo));
> BUILD_BUG_ON(TTM_MAX_BO_PRIORITY <= 3);
> vmw_bo->base.priority = 3;
> -
> - INIT_LIST_HEAD(&vmw_bo->res_list);
> + vmw_bo->res_tree = RB_ROOT;
>
> ret = ttm_bo_init(bdev, &vmw_bo->base, size,
> ttm_bo_type_device, placement,
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> index f05fce52fbb4..81ebcd668038 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> @@ -90,7 +90,7 @@ struct vmw_fpriv {
> /**
> * struct vmw_buffer_object - TTM buffer object with vmwgfx
> additions
> * @base: The TTM buffer object
> - * @res_list: List of resources using this buffer object as a
> backing MOB
> + * @res_tree: RB tree of resources using this buffer object as a
> backing MOB
> * @pin_count: pin depth
> * @dx_query_ctx: DX context if this buffer object is used as a DX
> query MOB
> * @map: Kmap object for semi-persistent mappings
> @@ -99,7 +99,7 @@ struct vmw_fpriv {
> */
> struct vmw_buffer_object {
> struct ttm_buffer_object base;
> - struct list_head res_list;
> + struct rb_root res_tree;
> s32 pin_count;
> /* Not ref-counted. Protected by binding_mutex */
> struct vmw_resource *dx_query_ctx;
> @@ -147,8 +147,8 @@ struct vmw_res_func;
> * pin-count greater than zero. It is not on the resource LRU lists
> and its
> * backup buffer is pinned. Hence it can't be evicted.
> * @func: Method vtable for this resource. Immutable.
> + * @mob_node; Node for the MOB backup rbtree. Protected by @backup
> reserved.
> * @lru_head: List head for the LRU list. Protected by
> @dev_priv::resource_lock.
> - * @mob_head: List head for the MOB backup list. Protected by
> @backup reserved.
> * @binding_head: List head for the context binding list. Protected
> by
> * the @dev_priv::binding_mutex
> * @res_free: The resource destructor.
> @@ -169,8 +169,8 @@ struct vmw_resource {
> unsigned long backup_offset;
> unsigned long pin_count;
> const struct vmw_res_func *func;
> + struct rb_node mob_node;
> struct list_head lru_head;
> - struct list_head mob_head;
> struct list_head binding_head;
> struct vmw_resource_dirty *dirty;
> void (*res_free) (struct vmw_resource *res);
> @@ -743,7 +743,7 @@ void vmw_resource_dirty_update(struct
> vmw_resource *res, pgoff_t start,
> */
> static inline bool vmw_resource_mob_attached(const struct
> vmw_resource *res)
> {
> - return !list_empty(&res->mob_head);
> + return !RB_EMPTY_NODE(&res->mob_node);
> }
>
> /**
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> index d35f4bd32cd9..ff9fe5650468 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> @@ -41,11 +41,24 @@
> void vmw_resource_mob_attach(struct vmw_resource *res)
> {
> struct vmw_buffer_object *backup = res->backup;
> + struct rb_node **new = &backup->res_tree.rb_node, *parent =
> NULL;
>
> lockdep_assert_held(&backup->base.resv->lock.base);
> res->used_prio = (res->res_dirty) ? res->func->dirty_prio :
> res->func->prio;
> - list_add_tail(&res->mob_head, &backup->res_list);
> +
> + while (*new) {
> + struct vmw_resource *this =
> + container_of(*new, struct vmw_resource,
> mob_node);
> +
> + parent = *new;
> + new = (res->backup_offset < this->backup_offset) ?
> + &((*new)->rb_left) : &((*new)->rb_right);
> + }
> +
> + rb_link_node(&res->mob_node, parent, new);
> + rb_insert_color(&res->mob_node, &backup->res_tree);
> +
> vmw_bo_prio_add(backup, res->used_prio);
> }
>
> @@ -59,7 +72,8 @@ void vmw_resource_mob_detach(struct vmw_resource
> *res)
>
> lockdep_assert_held(&backup->base.resv->lock.base);
> if (vmw_resource_mob_attached(res)) {
> - list_del_init(&res->mob_head);
> + rb_erase(&res->mob_node, &backup->res_tree);
> + RB_CLEAR_NODE(&res->mob_node);
> vmw_bo_prio_del(backup, res->used_prio);
> }
> }
> @@ -206,8 +220,8 @@ int vmw_resource_init(struct vmw_private
> *dev_priv, struct vmw_resource *res,
> res->res_free = res_free;
> res->dev_priv = dev_priv;
> res->func = func;
> + RB_CLEAR_NODE(&res->mob_node);
> INIT_LIST_HEAD(&res->lru_head);
> - INIT_LIST_HEAD(&res->mob_head);
> INIT_LIST_HEAD(&res->binding_head);
> res->id = -1;
> res->backup = NULL;
> @@ -755,19 +769,20 @@ int vmw_resource_validate(struct vmw_resource
> *res, bool intr)
> */
> void vmw_resource_unbind_list(struct vmw_buffer_object *vbo)
> {
> -
> - struct vmw_resource *res, *next;
> struct ttm_validate_buffer val_buf = {
> .bo = &vbo->base,
> .num_shared = 0
> };
>
> lockdep_assert_held(&vbo->base.resv->lock.base);
> - list_for_each_entry_safe(res, next, &vbo->res_list, mob_head) {
> - if (!res->func->unbind)
> - continue;
> + while (!RB_EMPTY_ROOT(&vbo->res_tree)) {
> + struct rb_node *node = vbo->res_tree.rb_node;
> + struct vmw_resource *res =
> + container_of(node, struct vmw_resource,
> mob_node);
> +
> + if (!WARN_ON_ONCE(!res->func->unbind))
> + (void) res->func->unbind(res, res->res_dirty,
> &val_buf);
>
> - (void) res->func->unbind(res, res->res_dirty,
> &val_buf);
> res->backup_dirty = true;
> res->res_dirty = false;
> vmw_resource_mob_detach(res);
> --
> 2.20.1
>
Minor nits below, otherwise
Reviewed-by: Deepak Rawat <[email protected]>
On Fri, 2019-04-12 at 09:04 -0700, Thomas Hellstrom wrote:
> Similar to write-coherent resources, make sure that from the user-
> space
> point of view, GPU rendered contents is automatically available for
> reading by the CPU.
>
> Signed-off-by: Thomas Hellstrom <[email protected]>
> ---
> drivers/gpu/drm/ttm/ttm_bo_vm.c | 1 +
> drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 8 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 69 +++++++++++-
> drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 102
> +++++++++++++++++-
> drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 2 +
> drivers/gpu/drm/vmwgfx/vmwgfx_validation.c | 3 +-
> 6 files changed, 176 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index 3bd28fb97124..0065b138f450 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -42,6 +42,7 @@
> #include <linux/uaccess.h>
> #include <linux/mem_encrypt.h>
>
> +
> static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
> struct vm_fault *vmf)
> {
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> index 81ebcd668038..00794415335e 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> @@ -96,6 +96,7 @@ struct vmw_fpriv {
> * @map: Kmap object for semi-persistent mappings
> * @res_prios: Eviction priority counts for attached resources
> * @dirty: structure for user-space dirty-tracking
> + * @cleaning: Current validation sequence is cleaning.
> */
> struct vmw_buffer_object {
> struct ttm_buffer_object base;
> @@ -690,7 +691,8 @@ extern void vmw_resource_unreference(struct
> vmw_resource **p_res);
> extern struct vmw_resource *vmw_resource_reference(struct
> vmw_resource *res);
> extern struct vmw_resource *
> vmw_resource_reference_unless_doomed(struct vmw_resource *res);
> -extern int vmw_resource_validate(struct vmw_resource *res, bool
> intr);
> +extern int vmw_resource_validate(struct vmw_resource *res, bool
> intr,
> + bool dirtying);
> extern int vmw_resource_reserve(struct vmw_resource *res, bool
> interruptible,
> bool no_backup);
> extern bool vmw_resource_needs_backup(const struct vmw_resource
> *res);
> @@ -734,6 +736,8 @@ void vmw_resource_mob_attach(struct vmw_resource
> *res);
> void vmw_resource_mob_detach(struct vmw_resource *res);
> void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t
> start,
> pgoff_t end);
> +int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t
> start,
> + pgoff_t end, pgoff_t *num_prefault);
>
> /**
> * vmw_resource_mob_attached - Whether a resource currently has a
> mob attached
> @@ -1428,6 +1432,8 @@ int vmw_bo_dirty_add(struct vmw_buffer_object
> *vbo);
> void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
> void vmw_bo_dirty_clear_res(struct vmw_resource *res);
> void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
> +void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
> + pgoff_t start, pgoff_t end);
> vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
> vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
>
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> index 87e4a73b1175..773ff30a4b60 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> @@ -153,7 +153,6 @@ static void vmw_bo_dirty_scan_mkwrite(struct
> vmw_buffer_object *vbo)
> }
> }
>
> -
> /**
> * vmw_bo_dirty_scan - Scan for dirty pages and add them to the
> dirty
> * tracking structure
> @@ -171,6 +170,51 @@ void vmw_bo_dirty_scan(struct vmw_buffer_object
> *vbo)
> vmw_bo_dirty_scan_mkwrite(vbo);
> }
>
> +/**
> + * vmw_bo_dirty_pre_unmap - write-protect and pick up dirty pages
> before
> + * an unmap_mapping_range operation.
> + * @vbo: The buffer object,
> + * @start: First page of the range within the buffer object.
> + * @end: Last page of the range within the buffer object + 1.
> + *
> + * If we're using the _PAGETABLE scan method, we may leak dirty
> pages
> + * when calling unmap_mapping_range(). This function makes sure we
> pick
> + * up all dirty pages.
> + */
> +static void vmw_bo_dirty_pre_unmap(struct vmw_buffer_object *vbo,
> + pgoff_t start, pgoff_t end)
> +{
> + struct vmw_bo_dirty *dirty = vbo->dirty;
> + unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
> + struct address_space *mapping = vbo->base.bdev->dev_mapping;
> +
> + if (dirty->method != VMW_BO_DIRTY_PAGETABLE || start >= end)
> + return;
> +
> + apply_as_wrprotect(mapping, start + offset, end - start);
> + apply_as_clean(mapping, start + offset, end - start, offset,
> + &dirty->bitmap[0], &dirty->start, &dirty->end);
> +}
> +
> +/**
> + * vmw_bo_dirty_unmap - Clear all ptes pointing to a range within a
> bo
> + * @vbo: The buffer object,
> + * @start: First page of the range within the buffer object.
> + * @end: Last page of the range within the buffer object + 1.
> + *
> + * This is similar to ttm_bo_unmap_virtual_locked() except it takes
> a subrange.
> + */
> +void vmw_bo_dirty_unmap(struct vmw_buffer_object *vbo,
> + pgoff_t start, pgoff_t end)
> +{
> + unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
> + struct address_space *mapping = vbo->base.bdev->dev_mapping;
> +
> + vmw_bo_dirty_pre_unmap(vbo, start, end);
> + unmap_shared_mapping_range(mapping, (offset + start) <<
> PAGE_SHIFT,
> + (loff_t) (end - start) <<
> PAGE_SHIFT);
> +}
> +
> /**
> * vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
> * @vbo: The buffer object
> @@ -392,6 +436,26 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
> if (ret)
> return ret;
>
> + num_prefault = (vma->vm_flags & VM_RAND_READ) ? 1 :
> + TTM_BO_VM_NUM_PREFAULT;
> +
> + if (vbo->dirty) {
> + pgoff_t allowed_prefault;
> + unsigned long page_offset;
> +
> + page_offset = vmf->pgoff - drm_vma_node_start(&bo-
> >vma_node);
> + if (page_offset >= bo->num_pages ||
> + vmw_resources_clean(vbo, page_offset,
> + page_offset + PAGE_SIZE,
> + &allowed_prefault)) {
> + ret = VM_FAULT_SIGBUS;
> + goto out_unlock;
> + }
> +
> + num_prefault = min(num_prefault, allowed_prefault);
> + }
> +
> +
Extra blank line.
> /*
> * This will cause mkwrite() to be called for each pte on
> * write-enable vmas.
> @@ -399,12 +463,11 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault
> *vmf)
> if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> cvma.vm_flags &= ~VM_WRITE;
>
> - num_prefault = (vma->vm_flags & VM_RAND_READ) ? 0 :
> - TTM_BO_VM_NUM_PREFAULT;
> ret = ttm_bo_vm_fault_reserved(vmf, &cvma, num_prefault);
> if (ret == VM_FAULT_RETRY && !(vmf->flags &
> FAULT_FLAG_RETRY_NOWAIT))
> return ret;
>
> +out_unlock:
> reservation_object_unlock(bo->resv);
> return ret;
> }
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> index ff9fe5650468..30367cb06143 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> @@ -395,7 +395,8 @@ static int vmw_resource_buf_alloc(struct
> vmw_resource *res,
> * should be retried once resources have been freed up.
> */
> static int vmw_resource_do_validate(struct vmw_resource *res,
> - struct ttm_validate_buffer
> *val_buf)
> + struct ttm_validate_buffer
> *val_buf,
> + bool dirtying)
> {
> int ret = 0;
> const struct vmw_res_func *func = res->func;
> @@ -437,6 +438,15 @@ static int vmw_resource_do_validate(struct
> vmw_resource *res,
> * the resource.
> */
> if (res->dirty) {
> + if (dirtying && !res->res_dirty) {
> + pgoff_t start = res->backup_offset >>
> PAGE_SHIFT;
> + pgoff_t end = __KERNEL_DIV_ROUND_UP
> + (res->backup_offset + res->backup_size,
> + PAGE_SIZE);
> +
> + vmw_bo_dirty_unmap(res->backup, start, end);
> + }
> +
> vmw_bo_dirty_transfer_to_res(res);
> return func->dirty_sync(res);
> }
> @@ -680,6 +690,7 @@ static int vmw_resource_do_evict(struct
> ww_acquire_ctx *ticket,
> * to the device.
> * @res: The resource to make visible to the device.
> * @intr: Perform waits interruptible if possible.
> + * @dirtying: Pending GPU operation will dirty the resource
> *
> * On succesful return, any backup DMA buffer pointed to by @res-
> >backup will
> * be reserved and validated.
> @@ -689,7 +700,8 @@ static int vmw_resource_do_evict(struct
> ww_acquire_ctx *ticket,
> * Return: Zero on success, -ERESTARTSYS if interrupted, negative
> error code
> * on failure.
> */
> -int vmw_resource_validate(struct vmw_resource *res, bool intr)
> +int vmw_resource_validate(struct vmw_resource *res, bool intr,
> + bool dirtying)
> {
> int ret;
> struct vmw_resource *evict_res;
> @@ -706,7 +718,7 @@ int vmw_resource_validate(struct vmw_resource
> *res, bool intr)
> if (res->backup)
> val_buf.bo = &res->backup->base;
> do {
> - ret = vmw_resource_do_validate(res, &val_buf);
> + ret = vmw_resource_do_validate(res, &val_buf,
> dirtying);
> if (likely(ret != -EBUSY))
> break;
>
> @@ -1006,7 +1018,7 @@ int vmw_resource_pin(struct vmw_resource *res,
> bool interruptible)
> /* Do we really need to pin the MOB as well? */
> vmw_bo_pin_reserved(vbo, true);
> }
> - ret = vmw_resource_validate(res, interruptible);
> + ret = vmw_resource_validate(res, interruptible, true);
> if (vbo)
> ttm_bo_unreserve(&vbo->base);
> if (ret)
> @@ -1081,3 +1093,85 @@ void vmw_resource_dirty_update(struct
> vmw_resource *res, pgoff_t start,
> res->func->dirty_range_add(res, start << PAGE_SHIFT,
> end << PAGE_SHIFT);
> }
> +
> +/**
> + * vmw_resources_clean - Clean resources intersecting a mob range
> + * @res_tree: Tree of resources attached to the mob
This doesn't match the function signature (the parameter is @vbo, not @res_tree).
> + * @start: The mob page offset starting the range
> + * @end: The mob page offset ending the range
> + * @num_prefault: Returns how many pages including the first have
> been
> + * cleaned and are ok to prefault
> + */
> +int vmw_resources_clean(struct vmw_buffer_object *vbo, pgoff_t
> start,
> + pgoff_t end, pgoff_t *num_prefault)
> +{
> + struct rb_node *cur = vbo->res_tree.rb_node;
> + struct vmw_resource *found = NULL;
> + unsigned long res_start = start << PAGE_SHIFT;
> + unsigned long res_end = end << PAGE_SHIFT;
> + unsigned long last_cleaned = 0;
> +
> + /*
> + * Find the resource with lowest backup_offset that intersects
> the
> + * range.
> + */
> + while (cur) {
> + struct vmw_resource *cur_res =
> + container_of(cur, struct vmw_resource,
> mob_node);
> +
> + if (cur_res->backup_offset >= res_end) {
> + cur = cur->rb_left;
> + } else if (cur_res->backup_offset + cur_res-
> >backup_size <=
> + res_start) {
> + cur = cur->rb_right;
> + } else {
> + found = cur_res;
I didn't look into how the RB tree works, but do you need to break out of
the loop when a resource is found?
> + cur = cur->rb_left;
> + }
> + }
> +
> + /*
> + * In order of increasing backup_offset, clean dirty resorces
> + * intersecting the range.
> + */
> + while (found) {
> + if (found->res_dirty) {
> + int ret;
> +
> + if (!found->func->clean)
> + return -EINVAL;
> +
> + ret = found->func->clean(found);
> + if (ret)
> + return ret;
> +
> + found->res_dirty = false;
> + }
> + last_cleaned = found->backup_offset + found-
> >backup_size;
> + cur = rb_next(&found->mob_node);
> + if (!cur)
> + break;
> +
> + found = container_of(cur, struct vmw_resource,
> mob_node);
> + if (found->backup_offset >= res_end)
> + break;
> + }
> +
> + /*
> + * Set number of pages allowed prefaulting and fence the buffer
> object
> + */
> + *num_prefault = 1;
> + if (last_cleaned > res_start) {
> + struct ttm_buffer_object *bo = &vbo->base;
> +
> + *num_prefault = __KERNEL_DIV_ROUND_UP(last_cleaned -
> res_start,
> + PAGE_SIZE);
> + vmw_bo_fence_single(bo, NULL);
> + if (bo->moving)
> + dma_fence_put(bo->moving);
> + bo->moving = dma_fence_get
> + (reservation_object_get_excl(bo->resv));
> + }
> +
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> index c85144286cfe..3b7438b2d289 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h
> @@ -77,6 +77,7 @@ struct vmw_user_resource_conv {
> * @dirty_sync: Upload the dirty mob contents to the
> resource.
> * @dirty_add_range: Add a sequential dirty range to the resource
> * dirty tracker.
> + * @clean: Clean the resource.
> */
> struct vmw_res_func {
> enum vmw_res_type res_type;
> @@ -101,6 +102,7 @@ struct vmw_res_func {
> int (*dirty_sync)(struct vmw_resource *res);
> void (*dirty_range_add)(struct vmw_resource *res, size_t start,
> size_t end);
> + int (*clean)(struct vmw_resource *res);
> };
>
> /**
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> index 5b0c928bb5ba..81d9d7adc055 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_validation.c
> @@ -644,7 +644,8 @@ int vmw_validation_res_validate(struct
> vmw_validation_context *ctx, bool intr)
> struct vmw_resource *res = val->res;
> struct vmw_buffer_object *backup = res->backup;
>
> - ret = vmw_resource_validate(res, intr);
> + ret = vmw_resource_validate(res, intr, val->dirty_set
> &&
> + val->dirty);
> if (ret) {
> if (ret != -ERESTARTSYS)
> DRM_ERROR("Failed to validate
> resource.\n");
> --
> 2.20.1
>
Minor nits below
Reviewed-by: Deepak Rawat <[email protected]>
On Fri, 2019-04-12 at 16:04 +0000, Thomas Hellstrom wrote:
> Add the callbacks necessary to implement emulated coherent memory for
> surfaces. Add a flag to the gb_surface_create ioctl to indicate that
> surface memory should be coherent.
> Also bump the drm minor version to signal the availability of
> coherent
> surfaces.
>
> Signed-off-by: Thomas Hellstrom <[email protected]>
> ---
> .../device_include/svga3d_surfacedefs.h | 209 +++++++++-
> drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 4 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 390
> +++++++++++++++++-
> include/uapi/drm/vmwgfx_drm.h | 4 +-
> 4 files changed, 600 insertions(+), 7 deletions(-)
>
> diff --git
> a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
> b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
> index f2bfd3d80598..d901206c04e3 100644
> --- a/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
> +++ b/drivers/gpu/drm/vmwgfx/device_include/svga3d_surfacedefs.h
> @@ -1280,7 +1280,6 @@
> svga3dsurface_get_pixel_offset(SVGA3dSurfaceFormat format,
> return offset;
> }
>
> -
> static inline u32
> svga3dsurface_get_image_offset(SVGA3dSurfaceFormat format,
> surf_size_struct baseLevelSize,
> @@ -1375,4 +1374,212 @@
> svga3dsurface_is_screen_target_format(SVGA3dSurfaceFormat format)
> return svga3dsurface_is_dx_screen_target_format(format);
> }
>
> +/**
> + * struct svga3dsurface_mip - Mimpmap level information
> + * @bytes: Bytes required in the backing store of this mipmap level.
> + * @img_stride: Byte stride per image.
> + * @row_stride: Byte stride per block row.
> + * @size: The size of the mipmap.
> + */
> +struct svga3dsurface_mip {
> + size_t bytes;
> + size_t img_stride;
> + size_t row_stride;
> + struct drm_vmw_size size;
> +
> +};
> +
> +/**
> + * struct svga3dsurface_cache - Cached surface information
> + * @desc: Pointer to the surface descriptor
> + * @mip: Array of mipmap level information. Valid size is
> @num_mip_levels.
> + * @mip_chain_bytes: Bytes required in the backing store for the
> whole chain
> + * of mip levels.
> + * @num_mip_levels: Valid size of the @mip array. Number of mipmap
> levels in
> + * a chain.
> + * @num_layers: Number of slices in an array texture or number of
> faces in
> + * a cubemap texture.
> + */
> +struct svga3dsurface_cache {
> + const struct svga3d_surface_desc *desc;
> + struct svga3dsurface_mip mip[DRM_VMW_MAX_MIP_LEVELS];
> + size_t mip_chain_bytes;
> + u32 num_mip_levels;
> + u32 num_layers;
> +};
> +
> +/**
> + * struct svga3dsurface_loc - Surface location
> + * @sub_resource: Surface subresource. Defined as layer *
> num_mip_levels +
> + * mip_level.
> + * @x: X coordinate.
> + * @y: Y coordinate.
> + * @z: Z coordinate.
> + */
> +struct svga3dsurface_loc {
> + u32 sub_resource;
> + u32 x, y, z;
> +};
> +
> +/**
> + * svga3dsurface_subres - Compute the subresource from layer and
> mipmap.
> + * @cache: Surface layout data.
> + * @mip_level: The mipmap level.
> + * @layer: The surface layer (face or array slice).
> + *
> + * Return: The subresource.
> + */
> +static inline u32 svga3dsurface_subres(const struct
> svga3dsurface_cache *cache,
> + u32 mip_level, u32 layer)
> +{
> + return cache->num_mip_levels * layer + mip_level;
> +}
> +
> +/**
> + * svga3dsurface_setup_cache - Build a surface cache entry
> + * @size: The surface base level dimensions.
> + * @format: The surface format.
> + * @num_mip_levels: Number of mipmap levels.
> + * @num_layers: Number of layers.
> + * @cache: Pointer to a struct svga3dsurface_cach object to be
> filled in.
> + */
> +static inline void svga3dsurface_setup_cache(const struct
> drm_vmw_size *size,
> + SVGA3dSurfaceFormat
> format,
> + u32 num_mip_levels,
> + u32 num_layers,
> + u32 num_samples,
> + struct svga3dsurface_cache
> *cache)
> +{
> + const struct svga3d_surface_desc *desc;
> + u32 i;
> +
> + memset(cache, 0, sizeof(*cache));
> + cache->desc = desc = svga3dsurface_get_desc(format);
> + cache->num_mip_levels = num_mip_levels;
> + cache->num_layers = num_layers;
> + for (i = 0; i < cache->num_mip_levels; i++) {
> + struct svga3dsurface_mip *mip = &cache->mip[i];
> +
> + mip->size = svga3dsurface_get_mip_size(*size, i);
> + mip->bytes = svga3dsurface_get_image_buffer_size
> + (desc, &mip->size, 0) * num_samples;
> + mip->row_stride =
> + __KERNEL_DIV_ROUND_UP(mip->size.width,
> + desc->block_size.width) *
> + desc->bytes_per_block * num_samples;
> + mip->img_stride =
> + __KERNEL_DIV_ROUND_UP(mip->size.height,
> + desc->block_size.height)
> *
> + mip->row_stride;
> + cache->mip_chain_bytes += mip->bytes;
> + }
> +}
> +
> +/**
> + * svga3dsurface_get_loc - Get a surface location from an offset
> into the
> + * backing store
> + * @cache: Surface layout data.
> + * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
> + * @offset: Offset into the surface backing store.
> + */
> +static inline void
> +svga3dsurface_get_loc(const struct svga3dsurface_cache *cache,
> + struct svga3dsurface_loc *loc,
> + size_t offset)
> +{
> + u32 layer = offset / cache->mip_chain_bytes;
> + const struct svga3dsurface_mip *mip = &cache->mip[0];
> + const struct svga3d_surface_desc *desc = cache->desc;
> + int i;
> +
> + offset -= layer * cache->mip_chain_bytes;
> + for (i = 0; i < cache->num_mip_levels; ++i, ++mip) {
> + if (mip->bytes > offset)
> + break;
> + offset -= mip->bytes;
> + }
> +
> + loc->sub_resource = svga3dsurface_subres(cache, i, layer);
> + loc->z = offset / mip->img_stride;
> + offset -= loc->z * mip->img_stride;
> + loc->z *= desc->block_size.depth;
> + loc->y = offset / mip->row_stride;
> + offset -= loc->y * mip->row_stride;
> + loc->y *= desc->block_size.height;
> + loc->x = offset / desc->bytes_per_block;
> + loc->x *= desc->block_size.width;
> +}
> +
> +/**
> + * svga3dsurface_inc_loc - Clamp increment a surface location with
> one block
> + * size
> + * in each dimension.
> + * @loc: Pointer to a struct svga3dsurface_loc to be incremented.
> + *
> + * When computing the size of a range as size = end - start, the
> range does not
> + * include the end element. However a location representing the last
> byte
> + * of a touched region in the backing store *is* included in the
> range.
> + * This function motifies such a location to match the end
s/motifies/modifies/
> definition
> + * given as start + size which is the one used in a SVGA3dBox.
> + */
> +static inline void
> +svga3dsurface_inc_loc(const struct svga3dsurface_cache *cache,
> + struct svga3dsurface_loc *loc)
> +{
> + const struct svga3d_surface_desc *desc = cache->desc;
> + u32 mip = loc->sub_resource % cache->num_mip_levels;
> + const struct drm_vmw_size *size = &cache->mip[mip].size;
> +
> + loc->sub_resource++;
> + loc->x += desc->block_size.width;
> + if (loc->x > size->width)
> + loc->x = size->width;
> + loc->y += desc->block_size.height;
> + if (loc->y > size->height)
> + loc->y = size->height;
> + loc->z += desc->block_size.depth;
> + if (loc->z > size->depth)
> + loc->z = size->depth;
> +}
> +
> +/**
> + * svga3dsurface_min_loc - The start location in a subresorce
s/subresorce/subresource/
> + * @cache: Surface layout data.
> + * @sub_resource: The subresource.
> + * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
> + */
> +static inline void
> +svga3dsurface_min_loc(const struct svga3dsurface_cache *cache,
> + u32 sub_resource,
> + struct svga3dsurface_loc *loc)
> +{
> + loc->sub_resource = sub_resource;
> + loc->x = loc->y = loc->z = 0;
> +}
> +
> +/**
> + * svga3dsurface_min_loc - The end location in a subresorce
This should be svga3dsurface_max_loc. Also s/subresorce/subresource/
> + * @cache: Surface layout data.
> + * @sub_resource: The subresource.
> + * @loc: Pointer to a struct svga3dsurface_loc to be filled in.
> + *
> + * Following the end definition given in svga3dsurface_inc_loc(),
> + * Compute the end location of a surface subresource.
> + */
> +static inline void
> +svga3dsurface_max_loc(const struct svga3dsurface_cache *cache,
> + u32 sub_resource,
> + struct svga3dsurface_loc *loc)
> +{
> + const struct drm_vmw_size *size;
> + u32 mip;
> +
> + loc->sub_resource = sub_resource + 1;
> + mip = sub_resource % cache->num_mip_levels;
> + size = &cache->mip[mip].size;
> + loc->x = size->width;
> + loc->y = size->height;
> + loc->z = size->depth;
> +}
> +
> #endif /* _SVGA3D_SURFACEDEFS_H_ */
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> index 00794415335e..630a01d75a41 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> @@ -44,9 +44,9 @@
> #include <linux/sync_file.h>
>
> #define VMWGFX_DRIVER_NAME "vmwgfx"
> -#define VMWGFX_DRIVER_DATE "20180704"
> +#define VMWGFX_DRIVER_DATE "20190328"
> #define VMWGFX_DRIVER_MAJOR 2
> -#define VMWGFX_DRIVER_MINOR 15
> +#define VMWGFX_DRIVER_MINOR 16
> #define VMWGFX_DRIVER_PATCHLEVEL 0
> #define VMWGFX_FILE_PAGE_OFFSET 0x00100000
> #define VMWGFX_FIFO_STATIC_SIZE (1024*1024)
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
> index c40d44f4d9af..f56141529da5 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
> @@ -68,6 +68,20 @@ struct vmw_surface_offset {
> uint32_t bo_offset;
> };
>
> +/**
> + * vmw_surface_dirty - Surface dirty-tracker
> + * @cache: Cached layout information of the surface.
> + * @size: Accounting size for the struct vmw_surface_dirty.
> + * @num_subres: Number of subresources.
> + * @boxes: Array of SVGA3dBoxes indicating dirty regions. One per
> subresource.
> + */
> +struct vmw_surface_dirty {
> + struct svga3dsurface_cache cache;
> + size_t size;
> + u32 num_subres;
> + SVGA3dBox boxes[0];
> +};
> +
> static void vmw_user_surface_free(struct vmw_resource *res);
> static struct vmw_resource *
> vmw_user_surface_base_to_res(struct ttm_base_object *base);
> @@ -96,6 +110,13 @@ vmw_gb_surface_reference_internal(struct
> drm_device *dev,
> struct drm_vmw_gb_surface_ref_ext_rep
> *rep,
> struct drm_file *file_priv);
>
> +static void vmw_surface_dirty_free(struct vmw_resource *res);
> +static int vmw_surface_dirty_alloc(struct vmw_resource *res);
> +static int vmw_surface_dirty_sync(struct vmw_resource *res);
> +static void vmw_surface_dirty_range_add(struct vmw_resource *res,
> size_t start,
> + size_t end);
> +static int vmw_surface_clean(struct vmw_resource *res);
> +
> static const struct vmw_user_resource_conv user_surface_conv = {
> .object_type = VMW_RES_SURFACE,
> .base_obj_to_res = vmw_user_surface_base_to_res,
> @@ -133,7 +154,12 @@ static const struct vmw_res_func
> vmw_gb_surface_func = {
> .create = vmw_gb_surface_create,
> .destroy = vmw_gb_surface_destroy,
> .bind = vmw_gb_surface_bind,
> - .unbind = vmw_gb_surface_unbind
> + .unbind = vmw_gb_surface_unbind,
> + .dirty_alloc = vmw_surface_dirty_alloc,
> + .dirty_free = vmw_surface_dirty_free,
> + .dirty_sync = vmw_surface_dirty_sync,
> + .dirty_range_add = vmw_surface_dirty_range_add,
> + .clean = vmw_surface_clean,
> };
>
> /**
> @@ -641,6 +667,7 @@ static void vmw_user_surface_free(struct
> vmw_resource *res)
> struct vmw_private *dev_priv = srf->res.dev_priv;
> uint32_t size = user_srf->size;
>
> + WARN_ON_ONCE(res->dirty);
> if (user_srf->master)
> drm_master_put(&user_srf->master);
> kfree(srf->offsets);
> @@ -1174,10 +1201,16 @@ static int vmw_gb_surface_bind(struct
> vmw_resource *res,
> cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_SURFACE;
> cmd2->header.size = sizeof(cmd2->body);
> cmd2->body.sid = res->id;
> - res->backup_dirty = false;
> }
> vmw_fifo_commit(dev_priv, submit_size);
>
> + if (res->backup->dirty && res->backup_dirty) {
> + /* We've just made a full upload. Cear dirty regions.
> */
> + vmw_bo_dirty_clear_res(res);
> + }
> +
> + res->backup_dirty = false;
> +
> return 0;
> }
>
> @@ -1642,7 +1675,8 @@ vmw_gb_surface_define_internal(struct
> drm_device *dev,
> }
> }
> } else if (req->base.drm_surface_flags &
> - drm_vmw_surface_flag_create_buffer)
> + (drm_vmw_surface_flag_create_buffer |
> + drm_vmw_surface_flag_coherent))
> ret = vmw_user_bo_alloc(dev_priv, tfile,
> res->backup_size,
> req->base.drm_surface_flags &
> @@ -1656,6 +1690,26 @@ vmw_gb_surface_define_internal(struct
> drm_device *dev,
> goto out_unlock;
> }
>
> + if (req->base.drm_surface_flags &
> drm_vmw_surface_flag_coherent) {
> + struct vmw_buffer_object *backup = res->backup;
> +
> + ttm_bo_reserve(&backup->base, false, false, NULL);
> + if (!res->func->dirty_alloc)
> + ret = -EINVAL;
> + if (!ret)
> + ret = vmw_bo_dirty_add(backup);
> + if (!ret) {
> + res->coherent = true;
> + ret = res->func->dirty_alloc(res);
> + }
> + ttm_bo_unreserve(&backup->base);
> + if (ret) {
> + vmw_resource_unreference(&res);
> + goto out_unlock;
> + }
> +
> + }
> +
> tmp = vmw_resource_reference(res);
> ret = ttm_prime_object_init(tfile, res->backup_size, &user_srf-
> >prime,
> req->base.drm_surface_flags &
> @@ -1764,3 +1818,333 @@ vmw_gb_surface_reference_internal(struct
> drm_device *dev,
>
> return ret;
> }
> +
> +/**
> + * vmw_subres_dirty_add - Add a dirty region to a subresource
> + * @dirty: The surfaces's dirty tracker.
> + * @loc_start: The location corresponding to the start of the
> region.
> + * @loc_end: The location corresponding to the end of the region.
> + *
> + * As we are assuming that @loc_start and @loc_end represent a
> sequential
> + * range of backing store memory, if the region spans multiple lines
> then
> + * regardless of the x coordinate, the full lines are dirtied.
> + * Correspondingly if the region spans multiple z slices, then full
> rather
> + * than partial z slices are dirtied.
> + */
> +static void vmw_subres_dirty_add(struct vmw_surface_dirty *dirty,
> + const struct svga3dsurface_loc
> *loc_start,
> + const struct svga3dsurface_loc
> *loc_end)
> +{
> + const struct svga3dsurface_cache *cache = &dirty->cache;
> + SVGA3dBox *box = &dirty->boxes[loc_start->sub_resource];
> + u32 mip = loc_start->sub_resource % cache->num_mip_levels;
> + const struct drm_vmw_size *size = &cache->mip[mip].size;
> + u32 box_c2 = box->z + box->d;
> +
> + if (WARN_ON(loc_start->sub_resource >= dirty->num_subres))
> + return;
> +
> + if (box->d == 0 || box->z > loc_start->z)
> + box->z = loc_start->z;
> + if (box_c2 < loc_end->z)
> + box->d = loc_end->z - box->z;
> +
> + if (loc_start->z + 1 == loc_end->z) {
> + box_c2 = box->y + box->h;
> + if (box->h == 0 || box->y > loc_start->y)
> + box->y = loc_start->y;
> + if (box_c2 < loc_end->y)
> + box->h = loc_end->y - box->y;
> +
> + if (loc_start->y + 1 == loc_end->y) {
> + box_c2 = box->x + box->w;
> + if (box->w == 0 || box->x > loc_start->x)
> + box->x = loc_start->x;
> + if (box_c2 < loc_end->x)
> + box->w = loc_end->x - box->x;
> + } else {
> + box->x = 0;
> + box->w = size->width;
> + }
> + } else {
> + box->y = 0;
> + box->h = size->height;
> + box->x = 0;
> + box->w = size->width;
> + }
> +}
> +
> +/**
> + * vmw_subres_dirty_full - Mark a full subresource as dirty
> + * @dirty: The surface's dirty tracker.
> + * @subres: The subresource
> + */
> +static void vmw_subres_dirty_full(struct vmw_surface_dirty *dirty,
> u32 subres)
> +{
> + const struct svga3dsurface_cache *cache = &dirty->cache;
> + u32 mip = subres % cache->num_mip_levels;
> + const struct drm_vmw_size *size = &cache->mip[mip].size;
> + SVGA3dBox *box = &dirty->boxes[subres];
> +
> + box->x = 0;
> + box->y = 0;
> + box->z = 0;
> + box->w = size->width;
> + box->h = size->height;
> + box->d = size->depth;
> +}
> +
> +/*
> + * vmw_surface_tex_dirty_range_add - The dirty_add_range callback
> for texture
> + * surfaces.
> + */
> +static void vmw_surface_tex_dirty_range_add(struct vmw_resource
> *res,
> + size_t start, size_t end)
> +{
> + struct vmw_surface_dirty *dirty =
> + (struct vmw_surface_dirty *) res->dirty;
> + size_t backup_end = res->backup_offset + res->backup_size;
> + struct svga3dsurface_loc loc1, loc2;
> + const struct svga3dsurface_cache *cache;
> +
> + start = max(start, res->backup_offset) - res->backup_offset;
> + end = min(end, backup_end) - res->backup_offset;
> + cache = &dirty->cache;
> + svga3dsurface_get_loc(cache, &loc1, start);
> + svga3dsurface_get_loc(cache, &loc2, end - 1);
> + svga3dsurface_inc_loc(cache, &loc2);
> +
> + if (loc1.sub_resource + 1 == loc2.sub_resource) {
> + /* Dirty range covers a single sub-resource */
> + vmw_subres_dirty_add(dirty, &loc1, &loc2);
> + } else {
> + /* Dirty range covers multiple sub-resources */
> + struct svga3dsurface_loc loc_min, loc_max;
> + u32 sub_res = loc1.sub_resource;
> +
> + svga3dsurface_max_loc(cache, loc1.sub_resource,
> &loc_max);
> + vmw_subres_dirty_add(dirty, &loc1, &loc_max);
> + svga3dsurface_min_loc(cache, loc2.sub_resource - 1,
> &loc_min);
> + vmw_subres_dirty_add(dirty, &loc_min, &loc2);
> + for (sub_res = loc1.sub_resource + 1;
> + sub_res < loc2.sub_resource - 1; ++sub_res)
> + vmw_subres_dirty_full(dirty, sub_res);
> + }
> +}
> +
> +/*
> + * vmw_surface_buf_dirty_range_add - The dirty_add_range callback
> for buffer
> + * surfaces.
> + */
> +static void vmw_surface_buf_dirty_range_add(struct vmw_resource
> *res,
> + size_t start, size_t end)
> +{
> + struct vmw_surface_dirty *dirty =
> + (struct vmw_surface_dirty *) res->dirty;
> + const struct svga3dsurface_cache *cache = &dirty->cache;
> + size_t backup_end = res->backup_offset + cache-
> >mip_chain_bytes;
> + SVGA3dBox *box = &dirty->boxes[0];
> + u32 box_c2;
> +
> + box->h = box->d = 1;
> + start = max(start, res->backup_offset) - res->backup_offset;
> + end = min(end, backup_end) - res->backup_offset;
> + box_c2 = box->x + box->w;
> + if (box->w == 0 || box->x > start)
> + box->x = start;
> + if (box_c2 < end)
> + box->w = end - box->x;
> +}
> +
> +/*
> + * vmw_surface_dirty_range_add - The dirty_add_range callback
> for surfaces
> + */
> +static void vmw_surface_dirty_range_add(struct vmw_resource *res,
> size_t start,
> + size_t end)
> +{
> + struct vmw_surface *srf = vmw_res_to_srf(res);
> +
> + if (WARN_ON(end <= res->backup_offset ||
> + start >= res->backup_offset + res->backup_size))
> + return;
> +
> + if (srf->format == SVGA3D_BUFFER)
> + vmw_surface_buf_dirty_range_add(res, start, end);
> + else
> + vmw_surface_tex_dirty_range_add(res, start, end);
> +}
> +
> +/*
> + * vmw_surface_dirty_sync - The surface's dirty_sync callback.
> + */
> +static int vmw_surface_dirty_sync(struct vmw_resource *res)
> +{
> + struct vmw_private *dev_priv = res->dev_priv;
> + bool has_dx = 0;
> + u32 i, num_dirty;
> + struct vmw_surface_dirty *dirty =
> + (struct vmw_surface_dirty *) res->dirty;
> + size_t alloc_size;
> + const struct svga3dsurface_cache *cache = &dirty->cache;
> + struct {
> + SVGA3dCmdHeader header;
> + SVGA3dCmdDXUpdateSubResource body;
> + } *cmd1;
> + struct {
> + SVGA3dCmdHeader header;
> + SVGA3dCmdUpdateGBImage body;
> + } *cmd2;
> + void *cmd;
> +
> + num_dirty = 0;
> + for (i = 0; i < dirty->num_subres; ++i) {
> + const SVGA3dBox *box = &dirty->boxes[i];
> +
> + if (box->d)
> + num_dirty++;
> + }
> +
> + if (!num_dirty)
> + goto out;
> +
> + alloc_size = num_dirty * ((has_dx) ? sizeof(*cmd1) :
> sizeof(*cmd2));
> + cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
> + if (!cmd)
> + return -ENOMEM;
> +
> + cmd1 = cmd;
> + cmd2 = cmd;
> +
> + for (i = 0; i < dirty->num_subres; ++i) {
> + const SVGA3dBox *box = &dirty->boxes[i];
> +
> + if (!box->d)
> + continue;
> +
> + /*
> + * DX_UPDATE_SUBRESOURCE is aware of array surfaces.
> + * UPDATE_GB_IMAGE is not.
> + */
> + if (has_dx) {
> + cmd1->header.id =
> SVGA_3D_CMD_DX_UPDATE_SUBRESOURCE;
> + cmd1->header.size = sizeof(cmd1->body);
> + cmd1->body.sid = res->id;
> + cmd1->body.subResource = i;
> + cmd1->body.box = *box;
> + cmd1++;
> + } else {
> + cmd2->header.id = SVGA_3D_CMD_UPDATE_GB_IMAGE;
> + cmd2->header.size = sizeof(cmd2->body);
> + cmd2->body.image.sid = res->id;
> + cmd2->body.image.face = i / cache-
> >num_mip_levels;
> + cmd2->body.image.mipmap = i -
> + (cache->num_mip_levels * cmd2-
> >body.image.face);
> + cmd2->body.box = *box;
> + cmd2++;
> + }
> +
> + }
> + vmw_fifo_commit(dev_priv, alloc_size);
> + out:
> + memset(&dirty->boxes[0], 0, sizeof(dirty->boxes[0]) *
> + dirty->num_subres);
> +
> + return 0;
> +}
> +
> +/*
> + * vmw_surface_dirty_alloc - The surface's dirty_alloc callback.
> + */
> +static int vmw_surface_dirty_alloc(struct vmw_resource *res)
> +{
> + struct vmw_surface *srf = vmw_res_to_srf(res);
> + struct vmw_surface_dirty *dirty;
> + u32 num_layers = 1;
> + u32 num_mip;
> + u32 num_subres;
> + u32 num_samples;
> + size_t dirty_size, acc_size;
> + static struct ttm_operation_ctx ctx = {
> + .interruptible = false,
> + .no_wait_gpu = false
> + };
> + int ret;
> +
> + if (srf->array_size)
> + num_layers = srf->array_size;
> + else if (srf->flags & SVGA3D_SURFACE_CUBEMAP)
> + num_layers *= SVGA3D_MAX_SURFACE_FACES;
> +
> + num_mip = srf->mip_levels[0];
> + if (!num_mip)
> + num_mip = 1;
> +
> + num_subres = num_layers * num_mip;
> + dirty_size = sizeof(*dirty) + num_subres * sizeof(dirty-
> >boxes[0]);
> + acc_size = ttm_round_pot(dirty_size);
> + ret = ttm_mem_global_alloc(vmw_mem_glob(res->dev_priv),
> + acc_size, &ctx);
> + if (ret) {
> + VMW_DEBUG_USER("Out of graphics memory for surface "
> + "dirty tracker.\n");
> + return ret;
> + }
> +
> + dirty = kvzalloc(dirty_size, GFP_KERNEL);
> + if (!dirty) {
> + ret = -ENOMEM;
> + goto out_no_dirty;
> + }
> +
> + num_samples = max_t(u32, 1, srf->multisample_count);
> + svga3dsurface_setup_cache(&srf->base_size, srf->format,
> num_mip,
> + num_layers, num_samples, &dirty-
> >cache);
> + dirty->num_subres = num_subres;
> + dirty->size = acc_size;
> + res->dirty = (struct vmw_resource_dirty *) dirty;
> +
> + return 0;
> +
> +out_no_dirty:
> + ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
> + return ret;
> +}
> +
> +/*
> + * vmw_surface_dirty_free - The surface's dirty_free callback
> + */
> +static void vmw_surface_dirty_free(struct vmw_resource *res)
> +{
> + struct vmw_surface_dirty *dirty =
> + (struct vmw_surface_dirty *) res->dirty;
> + size_t acc_size = dirty->size;
> +
> + kvfree(dirty);
> + ttm_mem_global_free(vmw_mem_glob(res->dev_priv), acc_size);
> + res->dirty = NULL;
> +}
> +
> +/*
> + * vmw_surface_clean - The surface's clean callback
> + */
> +static int vmw_surface_clean(struct vmw_resource *res)
> +{
> + struct vmw_private *dev_priv = res->dev_priv;
> + size_t alloc_size;
> + struct {
> + SVGA3dCmdHeader header;
> + SVGA3dCmdReadbackGBSurface body;
> + } *cmd;
> +
> + alloc_size = sizeof(*cmd);
> + cmd = VMW_FIFO_RESERVE(dev_priv, alloc_size);
> + if (!cmd)
> + return -ENOMEM;
> +
> + cmd->header.id = SVGA_3D_CMD_READBACK_GB_SURFACE;
> + cmd->header.size = sizeof(cmd->body);
> + cmd->body.sid = res->id;
> + vmw_fifo_commit(dev_priv, alloc_size);
> +
> + return 0;
> +}
> diff --git a/include/uapi/drm/vmwgfx_drm.h
> b/include/uapi/drm/vmwgfx_drm.h
> index 399f58317cff..02cab33f2f25 100644
> --- a/include/uapi/drm/vmwgfx_drm.h
> +++ b/include/uapi/drm/vmwgfx_drm.h
> @@ -891,11 +891,13 @@ struct drm_vmw_shader_arg {
> * surface.
> * @drm_vmw_surface_flag_create_buffer: Create a backup buffer if
> none is
> * given.
> + * @drm_vmw_surface_flag_coherent: Back surface with coherent
> memory.
> */
> enum drm_vmw_surface_flags {
> drm_vmw_surface_flag_shareable = (1 << 0),
> drm_vmw_surface_flag_scanout = (1 << 1),
> - drm_vmw_surface_flag_create_buffer = (1 << 2)
> + drm_vmw_surface_flag_create_buffer = (1 << 2),
> + drm_vmw_surface_flag_coherent = (1 << 3),
> };
>
> /**
> --
> 2.20.1
>
On Mon, 2019-04-22 at 18:54 +0000, Deepak Singh Rawat wrote:
> Hi Thomas,
>
> With minor comments below
>
> Reviewed-by: Deepak Rawat <[email protected]>
>
Thanks for reviewing, Deepak. Some comments below:
> On Fri, 2019-04-12 at 09:04 -0700, Thomas Hellstrom wrote:
> > This infrastructure will, for coherent resources, make sure that
> > from the user-space point of view, data written by the CPU is
> > immediately
> > automatically available to the GPU at resource validation time.
> >
> > Signed-off-by: Thomas Hellstrom <[email protected]>
> > ---
> > drivers/gpu/drm/vmwgfx/Kconfig | 1 +
> > drivers/gpu/drm/vmwgfx/Makefile | 2 +-
> > drivers/gpu/drm/vmwgfx/vmwgfx_bo.c | 5 +-
> > drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 5 +
> > drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 26 +-
> > drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 1 -
> > drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 410
> > ++++++++++++++++++
> > drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 57 +++
> > drivers/gpu/drm/vmwgfx/vmwgfx_resource_priv.h | 11 +
> > drivers/gpu/drm/vmwgfx/vmwgfx_validation.c | 74 ++++
> > drivers/gpu/drm/vmwgfx/vmwgfx_validation.h | 16 +-
> > 11 files changed, 588 insertions(+), 20 deletions(-)
> > create mode 100644 drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> >
> > diff --git a/drivers/gpu/drm/vmwgfx/Kconfig
> > b/drivers/gpu/drm/vmwgfx/Kconfig
> > index 6b28a326f8bb..d5fd81a521f6 100644
> > --- a/drivers/gpu/drm/vmwgfx/Kconfig
> > +++ b/drivers/gpu/drm/vmwgfx/Kconfig
> > @@ -8,6 +8,7 @@ config DRM_VMWGFX
> > select FB_CFB_IMAGEBLIT
> > select DRM_TTM
> > select FB
> > + select AS_DIRTY_HELPERS
> > # Only needed for the transitional use of drm_crtc_init - can
> > be removed
> > # again once vmwgfx sets up the primary plane itself.
> > select DRM_KMS_HELPER
> > diff --git a/drivers/gpu/drm/vmwgfx/Makefile
> > b/drivers/gpu/drm/vmwgfx/Makefile
> > index 8841bd30e1e5..c877a21a0739 100644
> > --- a/drivers/gpu/drm/vmwgfx/Makefile
> > +++ b/drivers/gpu/drm/vmwgfx/Makefile
> > @@ -8,7 +8,7 @@ vmwgfx-y := vmwgfx_execbuf.o vmwgfx_gmr.o
> > vmwgfx_kms.o vmwgfx_drv.o \
> > vmwgfx_cmdbuf_res.o vmwgfx_cmdbuf.o vmwgfx_stdu.o \
> > vmwgfx_cotable.o vmwgfx_so.o vmwgfx_binding.o vmwgfx_msg.o
> > \
> > vmwgfx_simple_resource.o vmwgfx_va.o vmwgfx_blit.o \
> > - vmwgfx_validation.o \
> > + vmwgfx_validation.o vmwgfx_page_dirty.o \
> > ttm_object.o ttm_lock.o
> >
> > obj-$(CONFIG_DRM_VMWGFX) := vmwgfx.o
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> > index c0829d50eecc..90ca866640fe 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_bo.c
> > @@ -463,6 +463,7 @@ void vmw_bo_bo_free(struct ttm_buffer_object
> > *bo)
> > {
> > struct vmw_buffer_object *vmw_bo = vmw_buffer_object(bo);
> >
> > + WARN_ON(vmw_bo->dirty);
> > vmw_bo_unmap(vmw_bo);
> > kfree(vmw_bo);
> > }
> > @@ -476,8 +477,10 @@ void vmw_bo_bo_free(struct ttm_buffer_object
> > *bo)
> > static void vmw_user_bo_destroy(struct ttm_buffer_object *bo)
> > {
> > struct vmw_user_buffer_object *vmw_user_bo =
> > vmw_user_buffer_object(bo);
> > + struct vmw_buffer_object *vbo = &vmw_user_bo->vbo;
> >
> > - vmw_bo_unmap(&vmw_user_bo->vbo);
> > + WARN_ON(vbo->dirty);
>
> Is it possible for user-space to exploit this WARN? If yes then you
> might want to change the logic?
>
Nope, if this WARN hits, then it's due to a bug. I don't want to use
BUG_ON() since the condition is non-fatal.
> > + vmw_bo_unmap(vbo);
> > ttm_prime_object_kfree(vmw_user_bo, prime);
> > }
> >
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> > index 6165fe2c4504..74e94138877e 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
> > @@ -857,6 +857,11 @@ static int vmw_driver_load(struct drm_device
> > *dev, unsigned long chipset)
> > DRM_ERROR("Failed initializing TTM buffer object
> > driver.\n");
> > goto out_no_bdev;
> > }
> > + dev_priv->vm_ops = *dev_priv->bdev.vm_ops;
> > + dev_priv->vm_ops.fault = vmw_bo_vm_fault;
> > + dev_priv->vm_ops.pfn_mkwrite = vmw_bo_vm_mkwrite;
> > + dev_priv->vm_ops.page_mkwrite = vmw_bo_vm_mkwrite;
> > + dev_priv->bdev.vm_ops = &dev_priv->vm_ops;
> >
> > /*
> > * Enable VRAM, but initially don't use it until SVGA is
> > enabled and
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> > index bd6919b90519..f05fce52fbb4 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h
> > @@ -95,6 +95,7 @@ struct vmw_fpriv {
> > * @dx_query_ctx: DX context if this buffer object is used as a DX
> > query MOB
> > * @map: Kmap object for semi-persistent mappings
> > * @res_prios: Eviction priority counts for attached resources
> > + * @dirty: structure for user-space dirty-tracking
> > */
> > struct vmw_buffer_object {
> > struct ttm_buffer_object base;
> > @@ -105,6 +106,7 @@ struct vmw_buffer_object {
> > /* Protected by reservation */
> > struct ttm_bo_kmap_obj map;
> > u32 res_prios[TTM_MAX_BO_PRIORITY];
> > + struct vmw_bo_dirty *dirty;
> > };
> >
> > /**
> > @@ -135,7 +137,8 @@ struct vmw_res_func;
> > * @res_dirty: Resource contains data not yet in the backup
> > buffer.
> > Protected
> > * by resource reserved.
> > * @backup_dirty: Backup buffer contains data not yet in the HW
> > resource.
> > - * Protecte by resource reserved.
> > + * Protected by resource reserved.
> > + * @coherent: Emulate coherency by tracking vm accesses.
> > * @backup: The backup buffer if any. Protected by resource
> > reserved.
> > * @backup_offset: Offset into the backup buffer if any. Protected
> > by resource
> > * reserved. Note that only a few resource types can have a
> > @backup_offset
> > @@ -152,14 +155,16 @@ struct vmw_res_func;
> > * @hw_destroy: Callback to destroy the resource on the device, as
> > part of
> > * resource destruction.
> > */
> > +struct vmw_resource_dirty;
> > struct vmw_resource {
> > struct kref kref;
> > struct vmw_private *dev_priv;
> > int id;
> > u32 used_prio;
> > unsigned long backup_size;
> > - bool res_dirty;
> > - bool backup_dirty;
> > + u32 res_dirty : 1;
> > + u32 backup_dirty : 1;
>
> > Is there a reason you changed res_dirty and backup_dirty from bool to
> > u32? They are still used as bool, right?
I got a comment when the WW mutex patches were reviewed that bool should
be avoided in compound data types, so I'm trying to avoid it in new
code.
>
> > + u32 coherent : 1;
> > struct vmw_buffer_object *backup;
> > unsigned long backup_offset;
> > unsigned long pin_count;
> > @@ -167,6 +172,7 @@ struct vmw_resource {
> > struct list_head lru_head;
> > struct list_head mob_head;
> > struct list_head binding_head;
> > + struct vmw_resource_dirty *dirty;
> > void (*res_free) (struct vmw_resource *res);
> > void (*hw_destroy) (struct vmw_resource *res);
> > };
> > @@ -607,6 +613,9 @@ struct vmw_private {
> >
> > /* Validation memory reservation */
> > struct vmw_validation_mem vvm;
> > +
> > + /* VM operations */
> > + struct vm_operations_struct vm_ops;
> > };
> >
> > static inline struct vmw_surface *vmw_res_to_srf(struct
> > vmw_resource
> > *res)
> > @@ -723,6 +732,8 @@ extern void vmw_resource_evict_all(struct
> > vmw_private *dev_priv);
> > extern void vmw_resource_unbind_list(struct vmw_buffer_object
> > *vbo);
> > void vmw_resource_mob_attach(struct vmw_resource *res);
> > void vmw_resource_mob_detach(struct vmw_resource *res);
> > +void vmw_resource_dirty_update(struct vmw_resource *res, pgoff_t
> > start,
> > + pgoff_t end);
> >
> > /**
> > * vmw_resource_mob_attached - Whether a resource currently has a
> > mob attached
> > @@ -1411,6 +1422,15 @@ int vmw_host_log(const char *log);
> > #define VMW_DEBUG_USER(fmt,
> > ...) \
> > DRM_DEBUG_DRIVER(fmt, ##__VA_ARGS__)
> >
> > +/* Resource dirtying - vmwgfx_page_dirty.c */
> > +void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo);
> > +int vmw_bo_dirty_add(struct vmw_buffer_object *vbo);
> > +void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res);
> > +void vmw_bo_dirty_clear_res(struct vmw_resource *res);
> > +void vmw_bo_dirty_release(struct vmw_buffer_object *vbo);
> > +vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf);
> > +vm_fault_t vmw_bo_vm_mkwrite(struct vm_fault *vmf);
> > +
> > /**
> > * Inline helper functions
> > */
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> > index da3ac0bc2e14..7cb22119f516 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
> > @@ -2483,7 +2483,6 @@ static int
> > vmw_cmd_dx_check_subresource(struct
> > vmw_private *dev_priv,
> > offsetof(typeof(*cmd), sid));
> >
> > cmd = container_of(header, typeof(*cmd), header);
> > -
> > return vmw_cmd_res_check(dev_priv, sw_context, vmw_res_surface,
> > VMW_RES_DIRTY_NONE,
> > user_surface_converter,
> > &cmd->sid, NULL);
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > new file mode 100644
> > index 000000000000..87e4a73b1175
> > --- /dev/null
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > @@ -0,0 +1,410 @@
> > +// SPDX-License-Identifier: GPL-2.0 OR MIT
> > +/*****************************************************************
> > **
> > *******
> > + *
> > + * Copyright 2019 VMware, Inc., Palo Alto, CA., USA
> > + *
> > + * Permission is hereby granted, free of charge, to any person
> > obtaining a
> > + * copy of this software and associated documentation files (the
> > + * "Software"), to deal in the Software without restriction,
> > including
> > + * without limitation the rights to use, copy, modify, merge,
> > publish,
> > + * distribute, sub license, and/or sell copies of the Software,
> > and
> > to
> > + * permit persons to whom the Software is furnished to do so,
> > subject to
> > + * the following conditions:
> > + *
> > + * The above copyright notice and this permission notice
> > (including
> > the
> > + * next paragraph) shall be included in all copies or substantial
> > portions
> > + * of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO
> > EVENT SHALL
> > + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE
> > FOR
> > ANY CLAIM,
> > + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> > TORT OR
> > + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> > SOFTWARE OR THE
> > + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> > + *
> > +
> > *******************************************************************
> > **
> > *****/
> > +#include "vmwgfx_drv.h"
> > +
> > +/*
> > + * Different methods for tracking dirty:
> > + * VMW_BO_DIRTY_PAGETABLE - Scan the pagetable for hardware dirty
> > bits
> > + * VMW_BO_DIRTY_MKWRITE - Write-protect page table entries and
> > record write-
> > + * accesses in the VM mkwrite() callback
> > + */
> > +enum vmw_bo_dirty_method {
> > + VMW_BO_DIRTY_PAGETABLE,
> > + VMW_BO_DIRTY_MKWRITE,
> > +};
> > +
> > +/*
> > + * No dirtied pages at scan trigger a transition to the _MKWRITE
> > method,
> > + * similarly a certain percentage of dirty pages trigger a
> > transition to
> > + * the _PAGETABLE method. How many triggers should we wait for
> > before
> > + * changing method?
> > + */
> > +#define VMW_DIRTY_NUM_CHANGE_TRIGGERS 2
> > +
> > +/* Percentage to trigger a transition to the _PAGETABLE method */
> > +#define VMW_DIRTY_PERCENTAGE 10
> > +
> > +/**
> > + * struct vmw_bo_dirty - Dirty information for buffer objects
> > + * @start: First currently dirty bit
> > + * @end: Last currently dirty bit + 1
> > + * @method: The currently used dirty method
> > + * @change_count: Number of consecutive method change triggers
> > + * @ref_count: Reference count for this structure
> > + * @bitmap_size: The size of the bitmap in bits. Typically equal
> > to
> > the
> > > + * number of pages in the bo.
> > + * @size: The accounting size for this struct.
> > + * @bitmap: A bitmap where each bit represents a page. A set bit
> > means a
> > + * dirty page.
> > + */
> > +struct vmw_bo_dirty {
> > + unsigned long start;
> > + unsigned long end;
> > + enum vmw_bo_dirty_method method;
> > + unsigned int change_count;
> > + unsigned int ref_count;
> > + unsigned long bitmap_size;
> > + size_t size;
> > + unsigned long bitmap[0];
> > +};
> > +
> > +/**
> > + * vmw_bo_dirty_scan_pagetable - Perform a pagetable scan for
> > dirty
> > bits
> > + * @vbo: The buffer object to scan
> > + *
> > + * Scans the pagetable for dirty bits. Clear those bits and modify
> > the
> > + * dirty structure with the results. This function may change the
> > + * dirty-tracking method.
> > + */
> > +static void vmw_bo_dirty_scan_pagetable(struct vmw_buffer_object
> > *vbo)
> > +{
> > + struct vmw_bo_dirty *dirty = vbo->dirty;
> > + pgoff_t offset = drm_vma_node_start(&vbo->base.vma_node);
> > + struct address_space *mapping = vbo->base.bdev->dev_mapping;
> > + pgoff_t num_marked;
> > +
> > + num_marked = apply_as_clean(mapping,
> > + offset, dirty->bitmap_size,
> > + offset, &dirty->bitmap[0],
> > + &dirty->start, &dirty->end);
> > + if (num_marked == 0)
> > + dirty->change_count++;
> > + else
> > + dirty->change_count = 0;
> > +
> > + if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
> > + dirty->change_count = 0;
> > + dirty->method = VMW_BO_DIRTY_MKWRITE;
> > + apply_as_wrprotect(mapping,
> > + offset, dirty->bitmap_size);
> > + apply_as_clean(mapping,
> > + offset, dirty->bitmap_size,
> > + offset, &dirty->bitmap[0],
> > + &dirty->start, &dirty->end);
> > + }
> > +}
> > +
> > +/**
> > + * vmw_bo_dirty_scan_mkwrite - Reset the mkwrite dirty-tracking
> > method
> > + * @vbo: The buffer object to scan
> > + *
> > + * Write-protect pages written to so that consecutive write
> > accesses
> > will
> > + * trigger a call to mkwrite.
> > + *
> > + * This function may change the dirty-tracking method.
> > + */
> > +static void vmw_bo_dirty_scan_mkwrite(struct vmw_buffer_object
> > *vbo)
> > +{
> > + struct vmw_bo_dirty *dirty = vbo->dirty;
> > + unsigned long offset = drm_vma_node_start(&vbo->base.vma_node);
> > + struct address_space *mapping = vbo->base.bdev->dev_mapping;
> > + pgoff_t num_marked;
> > +
> > + if (dirty->end <= dirty->start)
> > + return;
> > +
> > + num_marked = apply_as_wrprotect(vbo->base.bdev->dev_mapping,
> > + dirty->start + offset,
> > + dirty->end - dirty->start);
> > +
> > + if (100UL * num_marked / dirty->bitmap_size >
> > + VMW_DIRTY_PERCENTAGE) {
> > + dirty->change_count++;
> > + } else {
> > + dirty->change_count = 0;
> > + }
> > +
> > + if (dirty->change_count > VMW_DIRTY_NUM_CHANGE_TRIGGERS) {
> > + pgoff_t start = 0;
> > + pgoff_t end = dirty->bitmap_size;
> > +
> > + dirty->method = VMW_BO_DIRTY_PAGETABLE;
> > + apply_as_clean(mapping, offset, end, offset, &dirty-
> > > bitmap[0],
> > + &start, &end);
> > + bitmap_clear(&dirty->bitmap[0], 0, dirty->bitmap_size);
> > + if (dirty->start < dirty->end)
> > + bitmap_set(&dirty->bitmap[0], dirty->start,
> > + dirty->end - dirty->start);
> > + dirty->change_count = 0;
> > + }
> > +}
> > +
> > +
> > +/**
> > + * vmw_bo_dirty_scan - Scan for dirty pages and add them to the
> > dirty
> > + * tracking structure
> > + * @vbo: The buffer object to scan
> > + *
> > + * This function may change the dirty tracking method.
> > + */
> > +void vmw_bo_dirty_scan(struct vmw_buffer_object *vbo)
> > +{
> > + struct vmw_bo_dirty *dirty = vbo->dirty;
> > +
> > + if (dirty->method == VMW_BO_DIRTY_PAGETABLE)
> > + vmw_bo_dirty_scan_pagetable(vbo);
> > + else
> > + vmw_bo_dirty_scan_mkwrite(vbo);
> > +}
> > +
> > +/**
> > + * vmw_bo_dirty_add - Add a dirty-tracking user to a buffer object
> > + * @vbo: The buffer object
> > + *
> > + * This function registers a dirty-tracking user to a buffer
> > object.
> > + * A user can be for example a resource or a vma in a special
> > user-
> > space
> > + * mapping.
> > + *
> > + * Return: Zero on success, -ENOMEM on memory allocation failure.
> > + */
> > +int vmw_bo_dirty_add(struct vmw_buffer_object *vbo)
> > +{
> > + struct vmw_bo_dirty *dirty = vbo->dirty;
> > + pgoff_t num_pages = vbo->base.num_pages;
> > + size_t size, acc_size;
> > + int ret;
> > + static struct ttm_operation_ctx ctx = {
> > + .interruptible = false,
> > + .no_wait_gpu = false
> > + };
> > +
> > + if (dirty) {
> > + dirty->ref_count++;
> > + return 0;
> > + }
> > +
> > + size = sizeof(*dirty) + BITS_TO_LONGS(num_pages) *
> > sizeof(long);
> > + acc_size = ttm_round_pot(size);
> > + ret = ttm_mem_global_alloc(&ttm_mem_glob, acc_size, &ctx);
> > + if (ret) {
> > + VMW_DEBUG_USER("Out of graphics memory for buffer
> > object "
> > + "dirty tracker.\n");
> > + return ret;
> > + }
> > + dirty = kvzalloc(size, GFP_KERNEL);
> > + if (!dirty) {
> > + ret = -ENOMEM;
> > + goto out_no_dirty;
> > + }
> > +
> > + dirty->size = acc_size;
> > + dirty->bitmap_size = num_pages;
> > + dirty->start = dirty->bitmap_size;
> > + dirty->end = 0;
> > + dirty->ref_count = 1;
> > + if (num_pages < PAGE_SIZE / sizeof(pte_t)) {
> > + dirty->method = VMW_BO_DIRTY_PAGETABLE;
> > + } else {
> > + struct address_space *mapping = vbo->base.bdev-
> > > dev_mapping;
> > + pgoff_t offset = drm_vma_node_start(&vbo-
> > > base.vma_node);
> > +
> > + dirty->method = VMW_BO_DIRTY_MKWRITE;
> > +
> > + /* Write-protect and then pick up already dirty bits */
> > + apply_as_wrprotect(mapping, offset, num_pages);
> > + apply_as_clean(mapping, offset, num_pages, offset,
> > + &dirty->bitmap[0], &dirty->start,
> > &dirty->end);
> > + }
> > +
> > + vbo->dirty = dirty;
> > +
> > + return 0;
> > +
> > +out_no_dirty:
> > + ttm_mem_global_free(&ttm_mem_glob, acc_size);
> > + return ret;
> > +}
> > +
> > +/**
> > + * vmw_bo_dirty_release - Release a dirty-tracking user from a
> > buffer object
> > + * @vbo: The buffer object
> > + *
> > + * This function releases a dirty-tracking user from a buffer
> > object.
> > + * If the reference count reaches zero, then the dirty-tracking
> > object is
> > + * freed and the pointer to it cleared.
> > + *
> > > + * This function does not return a value.
> > + */
> > +void vmw_bo_dirty_release(struct vmw_buffer_object *vbo)
> > +{
> > + struct vmw_bo_dirty *dirty = vbo->dirty;
> > +
> > + if (dirty && --dirty->ref_count == 0) {
> > + size_t acc_size = dirty->size;
> > +
> > + kvfree(dirty);
> > + ttm_mem_global_free(&ttm_mem_glob, acc_size);
> > + vbo->dirty = NULL;
> > + }
> > +}
> > +
> > +/**
> > + * vmw_bo_dirty_transfer_to_res - Pick up a resource's dirty
> > region
> > from
> > + * its backing mob.
> > + * @res: The resource
> > + *
> > + * This function will pick up all dirty ranges affecting the
> > resource from
> > > + * its backup mob, and call vmw_resource_dirty_update() once for
> > each
> > + * range. The transferred ranges will be cleared from the backing
> > mob's
> > + * dirty tracking.
> > + */
> > +void vmw_bo_dirty_transfer_to_res(struct vmw_resource *res)
> > +{
> > + struct vmw_buffer_object *vbo = res->backup;
> > + struct vmw_bo_dirty *dirty = vbo->dirty;
> > + pgoff_t start, cur, end;
> > + unsigned long res_start = res->backup_offset;
> > + unsigned long res_end = res->backup_offset + res->backup_size;
> > +
> > + WARN_ON_ONCE(res_start & ~PAGE_MASK);
> > + res_start >>= PAGE_SHIFT;
> > + res_end = DIV_ROUND_UP(res_end, PAGE_SIZE);
> > +
> > + if (res_start >= dirty->end || res_end <= dirty->start)
> > + return;
> > +
> > + cur = max(res_start, dirty->start);
> > + res_end = max(res_end, dirty->end);
> > + while (cur < res_end) {
> > + unsigned long num;
> > +
> > + start = find_next_bit(&dirty->bitmap[0], res_end, cur);
> > + if (start >= res_end)
> > + break;
> > +
> > + end = find_next_zero_bit(&dirty->bitmap[0], res_end,
> > start + 1);
> > + cur = end + 1;
> > + num = end - start;
> > + bitmap_clear(&dirty->bitmap[0], start, num);
> > + vmw_resource_dirty_update(res, start, end);
> > + }
> > +
> > + if (res_start <= dirty->start && res_end > dirty->start)
> > + dirty->start = res_end;
> > + if (res_start < dirty->end && res_end >= dirty->end)
> > + dirty->end = res_start;
> > +}
> > +
> > +/**
> > + * vmw_bo_dirty_clear_res - Clear a resource's dirty region from
> > + * its backing mob.
> > + * @res: The resource
> > + *
> > > + * This function will clear all dirty ranges affecting the
> > resource
>
>
...
Will address the rest of the comments.
/Thomas
On Mon, 2019-04-22 at 20:12 +0000, Deepak Singh Rawat wrote:
> Minor nits below, otherwise
>
> Reviewed-by: Deepak Rawat <[email protected]>
>
> On Fri, 2019-04-12 at 09:04 -0700, Thomas Hellstrom wrote:
> > Similar to write-coherent resources, make sure that from the user-
> > space
> > point of view, GPU rendered contents is automatically available for
> > reading by the CPU.
> >
> > Signed-off-by: Thomas Hellstrom <[email protected]>
> > ---
> >
> > + while (cur) {
> > + struct vmw_resource *cur_res =
> > + container_of(cur, struct vmw_resource,
> > mob_node);
> > +
> > + if (cur_res->backup_offset >= res_end) {
> > + cur = cur->rb_left;
> > + } else if (cur_res->backup_offset + cur_res-
> > > backup_size <=
> > + res_start) {
> > + cur = cur->rb_right;
> > + } else {
> > + found = cur_res;
>
> > I didn't look into how the RB tree works, but do you need to break out of the
> > loop when the resource is found?
No, here we will continue looking for a resource with an even lower
starting offset. I'll add a comment about that.
Thanks,
Thomas