2020-04-22 00:23:08

by Jason Gunthorpe

[permalink] [raw]
Subject: [PATCH hmm 0/5] Adjust hmm_range_fault() API

From: Jason Gunthorpe <[email protected]>

The API is a bit complicated for the uses we actually have, and
disucssions for simplifying have come up a number of times.

This small series removes the customizable pfn format and simplifies the
return code of hmm_range_fault()

All the drivers are adjusted to process in the simplified format.
I would appreciated tested-by's for the two drivers, thanks!

This passes the hmm tester with the following diff:

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index d75e18f2ffd245..a2442efa038c41 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -47,23 +47,8 @@ struct dmirror_bounce {
unsigned long cpages;
};

-#define DPT_SHIFT PAGE_SHIFT
-#define DPT_VALID (1UL << 0)
-#define DPT_WRITE (1UL << 1)
-
#define DPT_XA_TAG_WRITE 3UL

-static const uint64_t dmirror_hmm_flags[HMM_PFN_FLAG_MAX] = {
- [HMM_PFN_VALID] = DPT_VALID,
- [HMM_PFN_WRITE] = DPT_WRITE,
-};
-
-static const uint64_t dmirror_hmm_values[HMM_PFN_VALUE_MAX] = {
- [HMM_PFN_NONE] = 0,
- [HMM_PFN_ERROR] = 0x10,
- [HMM_PFN_SPECIAL] = 0x10,
-};
-
/*
* Data structure to track address ranges and register for mmu interval
* notifier updates.
@@ -175,7 +160,7 @@ static inline struct dmirror_device *dmirror_page_to_device(struct page *page)

static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
- uint64_t *pfns = range->pfns;
+ unsigned long *pfns = range->hmm_pfns;
unsigned long pfn;

for (pfn = (range->start >> PAGE_SHIFT);
@@ -188,15 +173,16 @@ static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
* Since we asked for hmm_range_fault() to populate pages,
* it shouldn't return an error entry on success.
*/
- WARN_ON(*pfns == range->values[HMM_PFN_ERROR]);
+ WARN_ON(*pfns & HMM_PFN_ERROR);
+ WARN_ON(!(*pfns & HMM_PFN_VALID));

- page = hmm_device_entry_to_page(range, *pfns);
+ page = hmm_pfn_to_page(*pfns);
WARN_ON(!page);

entry = page;
- if (*pfns & range->flags[HMM_PFN_WRITE])
+ if (*pfns & HMM_PFN_WRITE)
entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
- else if (range->default_flags & range->flags[HMM_PFN_WRITE])
+ else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
return -EFAULT;
entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
if (xa_is_err(entry))
@@ -260,8 +246,6 @@ static int dmirror_range_fault(struct dmirror *dmirror,
int ret;

while (true) {
- long count;
-
if (time_after(jiffies, timeout)) {
ret = -EBUSY;
goto out;
@@ -269,12 +253,11 @@ static int dmirror_range_fault(struct dmirror *dmirror,

range->notifier_seq = mmu_interval_read_begin(range->notifier);
down_read(&mm->mmap_sem);
- count = hmm_range_fault(range);
+ ret = hmm_range_fault(range);
up_read(&mm->mmap_sem);
- if (count <= 0) {
- if (count == 0 || count == -EBUSY)
+ if (ret) {
+ if (ret == -EBUSY)
continue;
- ret = count;
goto out;
}

@@ -299,16 +282,13 @@ static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
{
struct mm_struct *mm = dmirror->notifier.mm;
unsigned long addr;
- uint64_t pfns[64];
+ unsigned long pfns[64];
struct hmm_range range = {
.notifier = &dmirror->notifier,
- .pfns = pfns,
- .flags = dmirror_hmm_flags,
- .values = dmirror_hmm_values,
- .pfn_shift = DPT_SHIFT,
+ .hmm_pfns = pfns,
.pfn_flags_mask = 0,
- .default_flags = dmirror_hmm_flags[HMM_PFN_VALID] |
- (write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0),
+ .default_flags =
+ HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
.dev_private_owner = dmirror->mdevice,
};
int ret = 0;
@@ -754,19 +734,20 @@ static int dmirror_migrate(struct dmirror *dmirror,
}

static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
- unsigned char *perm, uint64_t entry)
+ unsigned char *perm, unsigned long entry)
{
struct page *page;

- if (entry == range->values[HMM_PFN_ERROR]) {
+ if (entry & HMM_PFN_ERROR) {
*perm = HMM_DMIRROR_PROT_ERROR;
return;
}
- page = hmm_device_entry_to_page(range, entry);
- if (!page) {
+ if (!(entry & HMM_PFN_VALID)) {
*perm = HMM_DMIRROR_PROT_NONE;
return;
}
+
+ page = hmm_pfn_to_page(entry);
if (is_device_private_page(page)) {
/* Is the page migrated to this device or some other? */
if (dmirror->mdevice == dmirror_page_to_device(page))
@@ -777,7 +758,7 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
*perm = HMM_DMIRROR_PROT_ZERO;
else
*perm = HMM_DMIRROR_PROT_NONE;
- if (entry & range->flags[HMM_PFN_WRITE])
+ if (entry & HMM_PFN_WRITE)
*perm |= HMM_DMIRROR_PROT_WRITE;
else
*perm |= HMM_DMIRROR_PROT_READ;
@@ -832,8 +813,6 @@ static int dmirror_range_snapshot(struct dmirror *dmirror,
return ret;

while (true) {
- long count;
-
if (time_after(jiffies, timeout)) {
ret = -EBUSY;
goto out;
@@ -842,12 +821,11 @@ static int dmirror_range_snapshot(struct dmirror *dmirror,
range->notifier_seq = mmu_interval_read_begin(range->notifier);

down_read(&mm->mmap_sem);
- count = hmm_range_fault(range);
+ ret = hmm_range_fault(range);
up_read(&mm->mmap_sem);
- if (count <= 0) {
- if (count == 0 || count == -EBUSY)
+ if (ret) {
+ if (ret == -EBUSY)
continue;
- ret = count;
goto out;
}

@@ -862,7 +840,7 @@ static int dmirror_range_snapshot(struct dmirror *dmirror,

n = (range->end - range->start) >> PAGE_SHIFT;
for (i = 0; i < n; i++)
- dmirror_mkentry(dmirror, range, perm + i, range->pfns[i]);
+ dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

mutex_unlock(&dmirror->mutex);
out:
@@ -878,15 +856,11 @@ static int dmirror_snapshot(struct dmirror *dmirror,
unsigned long size = cmd->npages << PAGE_SHIFT;
unsigned long addr;
unsigned long next;
- uint64_t pfns[64];
+ unsigned long pfns[64];
unsigned char perm[64];
char __user *uptr;
struct hmm_range range = {
- .pfns = pfns,
- .flags = dmirror_hmm_flags,
- .values = dmirror_hmm_values,
- .pfn_shift = DPT_SHIFT,
- .pfn_flags_mask = 0,
+ .hmm_pfns = pfns,
.dev_private_owner = dmirror->mdevice,
};
int ret = 0;
@@ -1097,6 +1071,7 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id)
spin_lock_init(&mdevice->lock);

cdev_init(&mdevice->cdevice, &dmirror_fops);
+ mdevice->cdevice.owner = THIS_MODULE;
ret = cdev_add(&mdevice->cdevice, dev, 1);
if (ret)
return ret;
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 033a12c7ab5b6d..da15471a2bbf9a 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1274,7 +1274,7 @@ TEST_F(hmm2, snapshot)
/* Check what the device saw. */
m = buffer->mirror;
ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
- ASSERT_EQ(m[1], HMM_DMIRROR_PROT_NONE);
+ ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);

Jason Gunthorpe (5):
mm/hmm: make CONFIG_DEVICE_PRIVATE into a select
mm/hmm: make hmm_range_fault return 0 or -1
drm/amdgpu: remove dead code after hmm_range_fault()
mm/hmm: remove HMM_PFN_SPECIAL
mm/hmm: remove the customizable pfn format from hmm_range_fault

Documentation/vm/hmm.rst | 28 ++--
arch/powerpc/Kconfig | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 56 +++----
drivers/gpu/drm/nouveau/Kconfig | 2 +-
drivers/gpu/drm/nouveau/nouveau_dmem.c | 60 ++++++--
drivers/gpu/drm/nouveau/nouveau_dmem.h | 4 +-
drivers/gpu/drm/nouveau/nouveau_svm.c | 59 ++++----
include/linux/hmm.h | 109 +++++---------
mm/Kconfig | 7 +-
mm/hmm.c | 185 +++++++++++-------------
10 files changed, 229 insertions(+), 283 deletions(-)

--
2.26.0


2020-04-22 00:23:12

by Jason Gunthorpe

[permalink] [raw]
Subject: [PATCH hmm 3/5] drm/amdgpu: remove dead code after hmm_range_fault()

From: Jason Gunthorpe <[email protected]>

Since amdgpu does not use the snapshot mode of hmm_range_fault() a
successful return already proves that all entries in the pfns are
HMM_PFN_VALID, there is no need to check the return result of
hmm_device_entry_to_page().

Signed-off-by: Jason Gunthorpe <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index efc1329a019127..bff8e64701a547 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -862,17 +862,13 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
goto out_free_pfns;
}

- for (i = 0; i < ttm->num_pages; i++) {
- /* FIXME: The pages cannot be touched outside the notifier_lock */
+ /*
+ * Due to default_flags, all pages are HMM_PFN_VALID or
+ * hmm_range_fault() fails. FIXME: The pages cannot be touched outside
+ * the notifier_lock, and mmu_interval_read_retry() must be done first.
+ */
+ for (i = 0; i < ttm->num_pages; i++)
pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
- if (unlikely(!pages[i])) {
- pr_err("Page fault failed for pfn[%lu] = 0x%llx\n",
- i, range->pfns[i]);
- r = -ENOMEM;
-
- goto out_free_pfns;
- }
- }

gtt->range = range;
mmput(mm);
--
2.26.0

2020-04-22 00:23:18

by Jason Gunthorpe

[permalink] [raw]
Subject: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

From: Jason Gunthorpe <[email protected]>

Presumably the intent here was that hmm_range_fault() could put the data
into some HW specific format and thus avoid some work. However, nothing
actually does that, and it isn't clear how anything actually could do that
as hmm_range_fault() provides CPU addresses which must be DMA mapped.

Perhaps there is some special HW that does not need DMA mapping, but we
don't have any examples of this, and the theoretical performance win of
avoiding an extra scan over the pfns array doesn't seem worth the
complexity. Plus pfns needs to be scanned anyhow to sort out any
DEVICE_PRIVATE pages.

This version replaces the uint64_t with an usigned long containing a pfn
and fix flags. On input flags is filled with the HMM_PFN_REQ_* values, on
successful output it is filled with HMM_PFN_* values, describing the state
of the pages.

amdgpu is simple to convert, it doesn't use snapshot and doesn't use
per-page flags.

nouveau uses only 16 hmm_pte entries at most (ie fits in a few cache
lines), and it sweeps over its pfns array a couple of times anyhow.

Signed-off-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
---
Documentation/vm/hmm.rst | 26 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 35 ++----
drivers/gpu/drm/nouveau/nouveau_dmem.c | 60 +++++++--
drivers/gpu/drm/nouveau/nouveau_dmem.h | 4 +-
drivers/gpu/drm/nouveau/nouveau_svm.c | 52 ++++----
include/linux/hmm.h | 99 ++++++---------
mm/hmm.c | 160 +++++++++++-------------
7 files changed, 204 insertions(+), 232 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 9924f2caa0184c..73a9b8c858e5d9 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -185,9 +185,6 @@ The usage pattern is::
range.start = ...;
range.end = ...;
range.pfns = ...;
- range.flags = ...;
- range.values = ...;
- range.pfn_shift = ...;

if (!mmget_not_zero(interval_sub->notifier.mm))
return -EFAULT;
@@ -229,15 +226,10 @@ The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specif
fault or snapshot policy for the whole range instead of having to set them
for each entry in the pfns array.

-For instance, if the device flags for range.flags are::
+For instance if the device driver wants pages for a range with at least read
+permission, it sets::

- range.flags[HMM_PFN_VALID] = (1 << 63);
- range.flags[HMM_PFN_WRITE] = (1 << 62);
-
-and the device driver wants pages for a range with at least read permission,
-it sets::
-
- range->default_flags = (1 << 63);
+ range->default_flags = HMM_PFN_REQ_VALID;
range->pfn_flags_mask = 0;

and calls hmm_range_fault() as described above. This will fill fault all pages
@@ -246,18 +238,18 @@ in the range with at least read permission.
Now let's say the driver wants to do the same except for one page in the range for
which it wants to have write permission. Now driver set::

- range->default_flags = (1 << 63);
- range->pfn_flags_mask = (1 << 62);
- range->pfns[index_of_write] = (1 << 62);
+ range->default_flags = HMM_PFN_REQ_VALID;
+ range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+ range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;

With this, HMM will fault in all pages with at least read (i.e., valid) and for the
address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
write permission i.e., if the CPU pte does not have write permission set then HMM
will call handle_mm_fault().

-Note that HMM will populate the pfns array with write permission for any page
-that is mapped with CPU write permission no matter what values are set
-in default_flags or pfn_flags_mask.
+After hmm_range_fault completes the flag bits are set to the current state of
+the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
+writable.


Represent and manage device memory from core kernel point of view
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 449083f9f8a2bf..bcfa8c26647d5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -766,17 +766,6 @@ struct amdgpu_ttm_tt {
};

#ifdef CONFIG_DRM_AMDGPU_USERPTR
-/* flags used by HMM internal, not related to CPU/GPU PTE flags */
-static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
- (1 << 0), /* HMM_PFN_VALID */
- (1 << 1), /* HMM_PFN_WRITE */
-};
-
-static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
- 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
- 0, /* HMM_PFN_NONE */
-};
-
/**
* amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
* memory and start HMM tracking CPU page table update
@@ -815,18 +804,15 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
goto out;
}
range->notifier = &bo->notifier;
- range->flags = hmm_range_flags;
- range->values = hmm_range_values;
- range->pfn_shift = PAGE_SHIFT;
range->start = bo->notifier.interval_tree.start;
range->end = bo->notifier.interval_tree.last + 1;
- range->default_flags = hmm_range_flags[HMM_PFN_VALID];
+ range->default_flags = HMM_PFN_REQ_FAULT;
if (!amdgpu_ttm_tt_is_readonly(ttm))
- range->default_flags |= range->flags[HMM_PFN_WRITE];
+ range->default_flags |= HMM_PFN_REQ_WRITE;

- range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
- GFP_KERNEL);
- if (unlikely(!range->pfns)) {
+ range->hmm_pfns = kvmalloc_array(ttm->num_pages,
+ sizeof(*range->hmm_pfns), GFP_KERNEL);
+ if (unlikely(!range->hmm_pfns)) {
r = -ENOMEM;
goto out_free_ranges;
}
@@ -867,7 +853,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
* the notifier_lock, and mmu_interval_read_retry() must be done first.
*/
for (i = 0; i < ttm->num_pages; i++)
- pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
+ pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]);

gtt->range = range;
mmput(mm);
@@ -877,7 +863,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
out_unlock:
up_read(&mm->mmap_sem);
out_free_pfns:
- kvfree(range->pfns);
+ kvfree(range->hmm_pfns);
out_free_ranges:
kfree(range);
out:
@@ -902,7 +888,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%lx\n",
gtt->userptr, ttm->num_pages);

- WARN_ONCE(!gtt->range || !gtt->range->pfns,
+ WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
"No user pages to check\n");

if (gtt->range) {
@@ -912,7 +898,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
*/
r = mmu_interval_read_retry(gtt->range->notifier,
gtt->range->notifier_seq);
- kvfree(gtt->range->pfns);
+ kvfree(gtt->range->hmm_pfns);
kfree(gtt->range);
gtt->range = NULL;
}
@@ -1003,8 +989,7 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)

for (i = 0; i < ttm->num_pages; i++) {
if (ttm->pages[i] !=
- hmm_device_entry_to_page(gtt->range,
- gtt->range->pfns[i]))
+ hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
break;
}

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index ad89e09a0be39a..07876fb0e1d665 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -672,27 +672,61 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
return ret;
}

-void
-nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
- struct hmm_range *range)
+void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
+ u64 *ioctl_addr)
{
unsigned long i, npages;

+ /*
+ * The ioctl_addr prepared here is passed through nvif_object_ioctl()
+ * to an eventual DMA map on some call chain like:
+ * nouveau_svm_fault():
+ * args.i.m.method = NVIF_VMM_V0_PFNMAP
+ * nouveau_range_fault()
+ * nvif_object_ioctl()
+ * client->driver->ioctl()
+ * struct nvif_driver nvif_driver_nvkm:
+ * .ioctl = nvkm_client_ioctl
+ * nvkm_ioctl()
+ * nvkm_ioctl_path()
+ * nvkm_ioctl_v0[type].func(..)
+ * nvkm_ioctl_mthd()
+ * nvkm_object_mthd()
+ * struct nvkm_object_func nvkm_uvmm:
+ * .mthd = nvkm_uvmm_mthd
+ * nvkm_uvmm_mthd()
+ * nvkm_uvmm_mthd_pfnmap()
+ * nvkm_vmm_pfn_map()
+ * nvkm_vmm_ptes_get_map()
+ * func == gp100_vmm_pgt_pfn
+ * struct nvkm_vmm_desc_func gp100_vmm_desc_spt:
+ * .pfn = gp100_vmm_pgt_pfn
+ * nvkm_vmm_iter()
+ * REF_PTES == func == gp100_vmm_pgt_pfn()
+ * dma_map_page()
+ *
+ * This is all just encoding the internal hmm reprensetation into a
+ * different nouveau internal representation.
+ */
npages = (range->end - range->start) >> PAGE_SHIFT;
for (i = 0; i < npages; ++i) {
struct page *page;
- uint64_t addr;

- page = hmm_device_entry_to_page(range, range->pfns[i]);
- if (page == NULL)
- continue;
-
- if (!is_device_private_page(page))
+ if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
+ ioctl_addr[i] = 0;
continue;
+ }

- addr = nouveau_dmem_page_addr(page);
- range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
- range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
- range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
+ page = hmm_pfn_to_page(range->hmm_pfns[i]);
+ if (is_device_private_page(page))
+ ioctl_addr[i] = nouveau_dmem_page_addr(page) |
+ NVIF_VMM_PFNMAP_V0_V |
+ NVIF_VMM_PFNMAP_V0_VRAM;
+ else
+ ioctl_addr[i] = page_to_phys(page) |
+ NVIF_VMM_PFNMAP_V0_V |
+ NVIF_VMM_PFNMAP_V0_HOST;
+ if (range->hmm_pfns[i] & HMM_PFN_WRITE)
+ ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W;
}
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.h b/drivers/gpu/drm/nouveau/nouveau_dmem.h
index 92394be5d64923..4607c0be7dd062 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.h
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.h
@@ -38,8 +38,8 @@ int nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
unsigned long start,
unsigned long end);

-void nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
- struct hmm_range *range);
+void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
+ u64 *ioctl_addr);
#else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
static inline void nouveau_dmem_init(struct nouveau_drm *drm) {}
static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {}
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index cf0d9bd61bebf9..d1059bce091192 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -369,18 +369,6 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
return ret;
}

-static const u64
-nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
- [HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V,
- [HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W,
-};
-
-static const u64
-nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = {
- [HMM_PFN_ERROR ] = ~NVIF_VMM_PFNMAP_V0_V,
- [HMM_PFN_NONE ] = NVIF_VMM_PFNMAP_V0_NONE,
-};
-
/* Issue fault replay for GPU to retry accesses that faulted previously. */
static void
nouveau_svm_fault_replay(struct nouveau_svm *svm)
@@ -520,7 +508,8 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {

static int nouveau_range_fault(struct nouveau_svmm *svmm,
struct nouveau_drm *drm, void *data, u32 size,
- u64 *pfns, struct svm_notifier *notifier)
+ unsigned long hmm_pfns[], u64 *ioctl_addr,
+ struct svm_notifier *notifier)
{
unsigned long timeout =
jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
@@ -529,10 +518,8 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
.notifier = &notifier->notifier,
.start = notifier->notifier.interval_tree.start,
.end = notifier->notifier.interval_tree.last + 1,
- .pfns = pfns,
- .flags = nouveau_svm_pfn_flags,
- .values = nouveau_svm_pfn_values,
- .pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT,
+ .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+ .hmm_pfns = hmm_pfns,
};
struct mm_struct *mm = notifier->notifier.mm;
int ret;
@@ -542,12 +529,15 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
return -EBUSY;

range.notifier_seq = mmu_interval_read_begin(range.notifier);
- range.default_flags = 0;
- range.pfn_flags_mask = -1UL;
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range);
up_read(&mm->mmap_sem);
if (ret) {
+ /*
+ * FIXME: the input PFN_REQ flags are destroyed on
+ * -EBUSY, we need to regenerate them, also for the
+ * other continue below
+ */
if (ret == -EBUSY)
continue;
return ret;
@@ -562,7 +552,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
break;
}

- nouveau_dmem_convert_pfn(drm, &range);
+ nouveau_hmm_convert_pfn(drm, &range, ioctl_addr);

svmm->vmm->vmm.object.client->super = true;
ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
@@ -589,6 +579,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
} i;
u64 phys[16];
} args;
+ unsigned long hmm_pfns[ARRAY_SIZE(args.phys)];
struct vm_area_struct *vma;
u64 inst, start, limit;
int fi, fn, pi, fill;
@@ -704,12 +695,17 @@ nouveau_svm_fault(struct nvif_notify *notify)
* access flags.
*XXX: atomic?
*/
- if (buffer->fault[fn]->access != 0 /* READ. */ &&
- buffer->fault[fn]->access != 3 /* PREFETCH. */) {
- args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V |
- NVIF_VMM_PFNMAP_V0_W;
- } else {
- args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V;
+ switch (buffer->fault[fn]->access) {
+ case 0: /* READ. */
+ hmm_pfns[pi++] = HMM_PFN_REQ_FAULT;
+ break;
+ case 3: /* PREFETCH. */
+ hmm_pfns[pi++] = 0;
+ break;
+ default:
+ hmm_pfns[pi++] = HMM_PFN_REQ_FAULT |
+ HMM_PFN_REQ_WRITE;
+ break;
}
args.i.p.size = pi << PAGE_SHIFT;

@@ -737,7 +733,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
fill = (buffer->fault[fn ]->addr -
buffer->fault[fn - 1]->addr) >> PAGE_SHIFT;
while (--fill)
- args.phys[pi++] = NVIF_VMM_PFNMAP_V0_NONE;
+ hmm_pfns[pi++] = 0;
}

SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)",
@@ -753,7 +749,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
ret = nouveau_range_fault(
svmm, svm->drm, &args,
sizeof(args.i) + pi * sizeof(args.phys[0]),
- args.phys, &notifier);
+ hmm_pfns, args.phys, &notifier);
mmu_interval_notifier_remove(&notifier.notifier);
}
mmput(mm);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 81c302c884c0e3..e72529786f16f8 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -19,45 +19,45 @@
#include <linux/mmu_notifier.h>

/*
- * hmm_pfn_flag_e - HMM flag enums
+ * On output:
+ * 0 - The page is faultable and a future call with
+ * HMM_PFN_REQ_FAULT could succeed.
+ * HMM_PFN_VALID - the pfn field points to a valid PFN. This PFN is at
+ * least readable. If dev_private_owner is !NULL then this could
+ * point at a DEVICE_PRIVATE page.
+ * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
+ * HMM_PFN_ERROR - accessing the pfn is impossible and the device should
+ * fail. ie poisoned memory, special pages, no vma, etc
*
- * Flags:
- * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
- * HMM_PFN_WRITE: CPU page table has write permission set
- *
- * The driver provides a flags array for mapping page protections to device
- * PTE bits. If the driver valid bit for an entry is bit 3,
- * i.e., (entry & (1 << 3)), then the driver must provide
- * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
- * Same logic apply to all flags. This is the same idea as vm_page_prot in vma
- * except that this is per device driver rather than per architecture.
+ * On input:
+ * 0 - Return the current state of the page, do not fault it.
+ * HMM_PFN_REQ_FAULT - The output must have HMM_PFN_VALID or hmm_range_fault()
+ * will fail
+ * HMM_PFN_REQ_WRITE - The output must have HMM_PFN_WRITE or hmm_range_fault()
+ * will fail. Must be combined with HMM_PFN_REQ_FAULT.
*/
-enum hmm_pfn_flag_e {
- HMM_PFN_VALID = 0,
- HMM_PFN_WRITE,
- HMM_PFN_FLAG_MAX
+enum hmm_pfn_flags {
+ HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
+ HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
+ HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
+
+ HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
+ HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
+
+ HMM_PFN_FLAGS = HMM_PFN_VALID | HMM_PFN_WRITE | HMM_PFN_ERROR,
};

/*
- * hmm_pfn_value_e - HMM pfn special value
- *
- * Flags:
- * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
- * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
+ * hmm_pfn_to_page() - return struct page pointed to by a device entry
*
- * Driver provides values for none entry, error entry, and special entry.
- * Driver can alias (i.e., use same value) error and special, but
- * it should not alias none with error or special.
- *
- * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
- * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
- * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
+ * This must be called under the caller 'user_lock' after a successful
+ * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID
+ * already.
*/
-enum hmm_pfn_value_e {
- HMM_PFN_ERROR,
- HMM_PFN_NONE,
- HMM_PFN_VALUE_MAX
-};
+static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn)
+{
+ return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS);
+}

/*
* struct hmm_range - track invalidation lock on virtual address range
@@ -66,12 +66,9 @@ enum hmm_pfn_value_e {
* @notifier_seq: result of mmu_interval_read_begin()
* @start: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
- * @pfns: array of pfns (big enough for the range)
- * @flags: pfn flags to match device driver page table
- * @values: pfn value for some special case (none, special, error, ...)
+ * @hmm_pfns: array of pfns (big enough for the range)
* @default_flags: default flags for the range (write, read, ... see hmm doc)
* @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
- * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
* @dev_private_owner: owner of device private pages
*/
struct hmm_range {
@@ -79,36 +76,12 @@ struct hmm_range {
unsigned long notifier_seq;
unsigned long start;
unsigned long end;
- uint64_t *pfns;
- const uint64_t *flags;
- const uint64_t *values;
- uint64_t default_flags;
- uint64_t pfn_flags_mask;
- uint8_t pfn_shift;
+ unsigned long *hmm_pfns;
+ unsigned long default_flags;
+ unsigned long pfn_flags_mask;
void *dev_private_owner;
};

-/*
- * hmm_device_entry_to_page() - return struct page pointed to by a device entry
- * @range: range use to decode device entry value
- * @entry: device entry value to get corresponding struct page from
- * Return: struct page pointer if entry is a valid, NULL otherwise
- *
- * If the device entry is valid (ie valid flag set) then return the struct page
- * matching the entry value. Otherwise return NULL.
- */
-static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
- uint64_t entry)
-{
- if (entry == range->values[HMM_PFN_NONE])
- return NULL;
- if (entry == range->values[HMM_PFN_ERROR])
- return NULL;
- if (!(entry & range->flags[HMM_PFN_VALID]))
- return NULL;
- return pfn_to_page(entry >> range->pfn_shift);
-}
-
/*
* Please see Documentation/vm/hmm.rst for how to use the range API.
*/
diff --git a/mm/hmm.c b/mm/hmm.c
index 2693393dc14b30..c1c96d4cc0554c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -37,28 +37,13 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};

-/*
- * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
- * @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the device entry
- * Return: valid device entry for the pfn
- */
-static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
-{
- return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
-}
-
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
- struct hmm_range *range, enum hmm_pfn_value_e value)
+ struct hmm_range *range, unsigned long cpu_flags)
{
- uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i = (addr - range->start) >> PAGE_SHIFT;

- i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
- pfns[i] = range->values[value];
-
+ range->hmm_pfns[i] = cpu_flags;
return 0;
}

@@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}

static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- uint64_t pfns, uint64_t cpu_flags)
+ unsigned long pfn_req_flags,
+ unsigned long cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;

@@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
* waste to have the user pre-fill the pfn arrays with a default
* flags value.
*/
- pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+ pfn_req_flags &= range->pfn_flags_mask;
+ pfn_req_flags |= range->default_flags;

/* We aren't ask to do anything ... */
- if (!(pfns & range->flags[HMM_PFN_VALID]))
+ if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
return 0;

/* Need to write fault ? */
- if ((pfns & range->flags[HMM_PFN_WRITE]) &&
- !(cpu_flags & range->flags[HMM_PFN_WRITE]))
+ if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
+ !(cpu_flags & HMM_PFN_WRITE))
return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;

/* If CPU page table is not valid then we need to fault */
- if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
+ if (!(cpu_flags & HMM_PFN_VALID))
return HMM_NEED_FAULT;
return 0;
}

static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- const uint64_t *pfns, unsigned long npages,
- uint64_t cpu_flags)
+ const unsigned long hmm_pfns[], unsigned long npages,
+ unsigned long cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault = 0;
@@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
* hmm_pte_need_fault() will always return 0.
*/
if (!((range->default_flags | range->pfn_flags_mask) &
- range->flags[HMM_PFN_VALID]))
+ HMM_PFN_REQ_FAULT))
return 0;

for (i = 0; i < npages; ++i) {
- required_fault |=
- hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
+ required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
+ cpu_flags);
if (required_fault == HMM_NEED_ALL_BITS)
return required_fault;
}
@@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
unsigned long i, npages;
- uint64_t *pfns;
+ unsigned long *hmm_pfns;

i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
- required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
+ hmm_pfns = &range->hmm_pfns[i];
+ required_fault =
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
if (!walk->vma) {
if (required_fault)
return -EFAULT;
@@ -174,44 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
}
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
- return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
+ return hmm_pfns_fill(addr, end, range, 0);
}

-static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
+static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
+ pmd_t pmd)
{
if (pmd_protnone(pmd))
return 0;
- return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
- unsigned long end, uint64_t *pfns, pmd_t pmd)
+ unsigned long end, unsigned long hmm_pfns[],
+ pmd_t pmd)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
unsigned int required_fault;
- uint64_t cpu_flags;
+ unsigned long cpu_flags;

npages = (end - addr) >> PAGE_SHIFT;
cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
required_fault =
- hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);

pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+ hmm_pfns[i] = pfn | cpu_flags;
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
- unsigned long end, uint64_t *pfns, pmd_t pmd);
+ unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline bool hmm_is_device_private_entry(struct hmm_range *range,
@@ -222,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
range->dev_private_owner;
}

-static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
+static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
+ pte_t pte)
{
if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
return 0;
- return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long end, pmd_t *pmdp, pte_t *ptep,
- uint64_t *pfn)
+ unsigned long *hmm_pfn)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
- uint64_t cpu_flags;
+ unsigned long cpu_flags;
pte_t pte = *ptep;
- uint64_t orig_pfn = *pfn;
+ uint64_t pfn_req_flags = *hmm_pfn;

if (pte_none(pte)) {
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *pfn = range->values[HMM_PFN_NONE];
+ *hmm_pfn = 0;
return 0;
}

@@ -258,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* the PFN even if not present.
*/
if (hmm_is_device_private_entry(range, entry)) {
- *pfn = hmm_device_entry_from_pfn(range,
- device_private_entry_to_pfn(entry));
- *pfn |= range->flags[HMM_PFN_VALID];
+ cpu_flags = HMM_PFN_VALID;
if (is_write_device_private_entry(entry))
- *pfn |= range->flags[HMM_PFN_WRITE];
+ cpu_flags |= HMM_PFN_WRITE;
+ *hmm_pfn = device_private_entry_to_pfn(entry) |
+ cpu_flags;
return 0;
}

- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (!required_fault) {
- *pfn = range->values[HMM_PFN_NONE];
+ *hmm_pfn = 0;
return 0;
}

@@ -288,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
}

cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault)
goto fault;

@@ -297,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* fall through and treat it like a normal page.
*/
if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
- if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
+ if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
}
- *pfn = range->values[HMM_PFN_ERROR];
+ *hmm_pfn = HMM_PFN_ERROR;
return 0;
}

- *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ *hmm_pfn = pte_pfn(pte) | cpu_flags;
return 0;

fault:
@@ -321,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+ unsigned long *hmm_pfns =
+ &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
unsigned long npages = (end - start) >> PAGE_SHIFT;
unsigned long addr = start;
pte_t *ptep;
@@ -333,16 +324,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return hmm_vma_walk_hole(start, end, -1, walk);

if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(walk->mm, pmdp);
return -EBUSY;
}
- return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+ return hmm_pfns_fill(start, end, range, 0);
}

if (!pmd_present(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
@@ -362,7 +353,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;

- return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
+ return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
}

/*
@@ -372,16 +363,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
* recover.
*/
if (pmd_bad(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}

ptep = pte_offset_map(pmdp, addr);
- for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
+ for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
int r;

- r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
if (r) {
/* hmm_vma_handle_pte() did pte_unmap() */
return r;
@@ -393,13 +384,12 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
+ pud_t pud)
{
if (!pud_present(pud))
return 0;
- return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
@@ -427,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
- uint64_t *pfns, cpu_flags;
+ unsigned long *hmm_pfns;
+ unsigned long cpu_flags;

if (!pud_present(pud)) {
spin_unlock(ptl);
@@ -436,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,

i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
+ hmm_pfns = &range->hmm_pfns[i];

cpu_flags = pud_to_hmm_pfn_flags(range, pud);
- required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
npages, cpu_flags);
if (required_fault) {
spin_unlock(ptl);
@@ -448,8 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,

pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn)
- pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
- cpu_flags;
+ hmm_pfns[i] = pfn | cpu_flags;
goto out_unlock;
}

@@ -473,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- uint64_t orig_pfn, cpu_flags;
unsigned int required_fault;
+ unsigned long pfn_req_flags;
+ unsigned long cpu_flags;
spinlock_t *ptl;
pte_t entry;

@@ -482,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
entry = huge_ptep_get(pte);

i = (start - range->start) >> PAGE_SHIFT;
- orig_pfn = range->pfns[i];
+ pfn_req_flags = range->hmm_pfns[i];
cpu_flags = pte_to_hmm_pfn_flags(range, entry);
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
spin_unlock(ptl);
return hmm_vma_fault(addr, end, required_fault, walk);
@@ -492,8 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,

pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
- cpu_flags;
+ range->hmm_pfns[i] = pfn | cpu_flags;
+
spin_unlock(ptl);
return 0;
}
@@ -524,7 +516,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
* failure.
*/
if (hmm_range_need_fault(hmm_vma_walk,
- range->pfns +
+ range->hmm_pfns +
((start - range->start) >> PAGE_SHIFT),
(end - start) >> PAGE_SHIFT, 0))
return -EFAULT;
--
2.26.0

2020-04-22 00:23:32

by Jason Gunthorpe

[permalink] [raw]
Subject: [PATCH hmm 4/5] mm/hmm: remove HMM_PFN_SPECIAL

From: Jason Gunthorpe <[email protected]>

This is just an alias for HMM_PFN_ERROR, nothing cares that the error was
because of a special page vs any other error case.

Signed-off-by: Jason Gunthorpe <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 -
drivers/gpu/drm/nouveau/nouveau_svm.c | 1 -
include/linux/hmm.h | 8 --------
mm/hmm.c | 2 +-
4 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index bff8e64701a547..449083f9f8a2bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -775,7 +775,6 @@ static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
0, /* HMM_PFN_NONE */
- 0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
};

/**
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index c68e9317cf0740..cf0d9bd61bebf9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -379,7 +379,6 @@ static const u64
nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = {
[HMM_PFN_ERROR ] = ~NVIF_VMM_PFNMAP_V0_V,
[HMM_PFN_NONE ] = NVIF_VMM_PFNMAP_V0_NONE,
- [HMM_PFN_SPECIAL] = ~NVIF_VMM_PFNMAP_V0_V,
};

/* Issue fault replay for GPU to retry accesses that faulted previously. */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 0df27dd03d53d7..81c302c884c0e3 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -44,10 +44,6 @@ enum hmm_pfn_flag_e {
* Flags:
* HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
* HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
- * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
- * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not
- * be mirrored by a device, because the entry will never have HMM_PFN_VALID
- * set and the pfn value is undefined.
*
* Driver provides values for none entry, error entry, and special entry.
* Driver can alias (i.e., use same value) error and special, but
@@ -56,12 +52,10 @@ enum hmm_pfn_flag_e {
* HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
* hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
* hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
- * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one
*/
enum hmm_pfn_value_e {
HMM_PFN_ERROR,
HMM_PFN_NONE,
- HMM_PFN_SPECIAL,
HMM_PFN_VALUE_MAX
};

@@ -110,8 +104,6 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
return NULL;
if (entry == range->values[HMM_PFN_ERROR])
return NULL;
- if (entry == range->values[HMM_PFN_SPECIAL])
- return NULL;
if (!(entry & range->flags[HMM_PFN_VALID]))
return NULL;
return pfn_to_page(entry >> range->pfn_shift);
diff --git a/mm/hmm.c b/mm/hmm.c
index 4c7c396655b528..2693393dc14b30 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -301,7 +301,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *pfn = range->values[HMM_PFN_SPECIAL];
+ *pfn = range->values[HMM_PFN_ERROR];
return 0;
}

--
2.26.0

2020-04-22 00:24:10

by Jason Gunthorpe

[permalink] [raw]
Subject: [PATCH hmm 2/5] mm/hmm: make hmm_range_fault return 0 or -1

From: Jason Gunthorpe <[email protected]>

hmm_vma_walk->last is supposed to be updated after every write to the
pfns, so that it can be returned by hmm_range_fault(). However, this is
not done consistently. Fortunately nothing checks the return code of
hmm_range_fault() for anything other than error.

More importantly last must be set before returning -EBUSY as it is used to
prevent reading an output pfn as an input flags when the loop restarts.

For clarity and simplicity make hmm_range_fault() return 0 or -ERRNO. Only
set last when returning -EBUSY.

Signed-off-by: Jason Gunthorpe <[email protected]>
---
Documentation/vm/hmm.rst | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++--
drivers/gpu/drm/nouveau/nouveau_svm.c | 6 +++---
include/linux/hmm.h | 2 +-
mm/hmm.c | 25 +++++++++----------------
5 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 4e3e9362afeb10..9924f2caa0184c 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -161,7 +161,7 @@ device must complete the update before the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use::

- long hmm_range_fault(struct hmm_range *range);
+ int hmm_range_fault(struct hmm_range *range);

It will trigger a page fault on missing or read-only entries if write access is
requested (see below). Page faults use the generic mm page fault code path just
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 6309ff72bd7876..efc1329a019127 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -852,12 +852,12 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
down_read(&mm->mmap_sem);
r = hmm_range_fault(range);
up_read(&mm->mmap_sem);
- if (unlikely(r <= 0)) {
+ if (unlikely(r)) {
/*
* FIXME: This timeout should encompass the retry from
* mmu_interval_read_retry() as well.
*/
- if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout))
+ if ((r == -EBUSY) && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 645fedd77e21b4..c68e9317cf0740 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -536,7 +536,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT,
};
struct mm_struct *mm = notifier->notifier.mm;
- long ret;
+ int ret;

while (true) {
if (time_after(jiffies, timeout))
@@ -548,8 +548,8 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range);
up_read(&mm->mmap_sem);
- if (ret <= 0) {
- if (ret == 0 || ret == -EBUSY)
+ if (ret) {
+ if (ret == -EBUSY)
continue;
return ret;
}
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7475051100c782..0df27dd03d53d7 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -120,7 +120,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
/*
* Please see Documentation/vm/hmm.rst for how to use the range API.
*/
-long hmm_range_fault(struct hmm_range *range);
+int hmm_range_fault(struct hmm_range *range);

/*
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 280585833adfc1..4c7c396655b528 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -174,7 +174,6 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
}
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
- hmm_vma_walk->last = addr;
return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
}

@@ -207,7 +206,6 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
- hmm_vma_walk->last = end;
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -386,13 +384,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
if (r) {
/* hmm_vma_handle_pte() did pte_unmap() */
- hmm_vma_walk->last = addr;
return r;
}
}
pte_unmap(ptep - 1);
-
- hmm_vma_walk->last = addr;
return 0;
}

@@ -455,7 +450,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
for (i = 0; i < npages; ++i, ++pfn)
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
- hmm_vma_walk->last = end;
goto out_unlock;
}

@@ -500,7 +494,6 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
- hmm_vma_walk->last = end;
spin_unlock(ptl);
return 0;
}
@@ -537,7 +530,6 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
return -EFAULT;

hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
- hmm_vma_walk->last = end;

/* Skip this vma and continue processing the next vma. */
return 1;
@@ -555,9 +547,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
* hmm_range_fault - try to fault some address in a virtual address range
* @range: argument structure
*
- * Return: the number of valid pages in range->pfns[] (from range start
- * address), which may be zero. On error one of the following status codes
- * can be returned:
+ * Return: 0 or -ERRNO with one of the following status codes:
*
* -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
* (e.g., device file vma).
@@ -572,7 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
* This is similar to get_user_pages(), except that it can read the page tables
* without mutating them (ie causing faults).
*/
-long hmm_range_fault(struct hmm_range *range)
+int hmm_range_fault(struct hmm_range *range)
{
struct hmm_vma_walk hmm_vma_walk = {
.range = range,
@@ -590,10 +580,13 @@ long hmm_range_fault(struct hmm_range *range)
return -EBUSY;
ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
&hmm_walk_ops, &hmm_vma_walk);
+ /*
+ * When -EBUSY is returned the loop restarts with
+ * hmm_vma_walk.last set to an address that has not been stored
+ * in pfns. All entries < last in the pfn array are set to their
+ * output, and all >= are still at their input values.
+ */
} while (ret == -EBUSY);
-
- if (ret)
- return ret;
- return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
--
2.26.0

2020-04-22 00:26:21

by Jason Gunthorpe

[permalink] [raw]
Subject: [PATCH hmm 1/5] mm/hmm: make CONFIG_DEVICE_PRIVATE into a select

From: Jason Gunthorpe <[email protected]>

There is no reason for a user to select this or not directly - it should
be selected by drivers that are going to use the feature, similar to how
CONFIG_HMM_MIRROR works.

Currently all drivers provide a feature kconfig that will disable use of
DEVICE_PRIVATE in that driver, allowing users to avoid enabling this if
they don't want the overhead.

Signed-off-by: Jason Gunthorpe <[email protected]>
---
arch/powerpc/Kconfig | 2 +-
drivers/gpu/drm/nouveau/Kconfig | 2 +-
mm/Kconfig | 7 +------
3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 924c541a926008..8de52aefdc74cc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -455,7 +455,7 @@ config PPC_TRANSACTIONAL_MEM
config PPC_UV
bool "Ultravisor support"
depends on KVM_BOOK3S_HV_POSSIBLE
- depends on DEVICE_PRIVATE
+ select DEVICE_PRIVATE
default n
help
This option paravirtualizes the kernel to run in POWER platforms that
diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index d6e4ae1ef7053a..af5793f3e7c2cf 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -86,10 +86,10 @@ config DRM_NOUVEAU_BACKLIGHT

config DRM_NOUVEAU_SVM
bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
- depends on DEVICE_PRIVATE
depends on DRM_NOUVEAU
depends on MMU
depends on STAGING
+ select DEVICE_PRIVATE
select HMM_MIRROR
select MMU_NOTIFIER
default n
diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c358c..7ca36bf5f5058e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -805,15 +805,10 @@ config HMM_MIRROR
depends on MMU

config DEVICE_PRIVATE
- bool "Unaddressable device memory (GPU memory, ...)"
+ bool
depends on ZONE_DEVICE
select DEV_PAGEMAP_OPS

- help
- Allows creation of struct pages to represent unaddressable device
- memory; i.e., memory that is only accessible from the device (or
- group of devices). You likely also want to select HMM_MIRROR.
-
config FRAME_VECTOR
bool

--
2.26.0

2020-04-22 05:52:21

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH hmm 1/5] mm/hmm: make CONFIG_DEVICE_PRIVATE into a select

On Tue, Apr 21, 2020 at 09:21:42PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <[email protected]>
>
> There is no reason for a user to select this or not directly - it should
> be selected by drivers that are going to use the feature, similar to how
> CONFIG_HMM_MIRROR works.
>
> Currently all drivers provide a feature kconfig that will disable use of
> DEVICE_PRIVATE in that driver, allowing users to avoid enabling this if
> they don't want the overhead.
>
> Signed-off-by: Jason Gunthorpe <[email protected]>

Looks good,

Reviewed-by: Christoph Hellwig <[email protected]>

2020-04-22 05:54:11

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH hmm 2/5] mm/hmm: make hmm_range_fault return 0 or -1

On Tue, Apr 21, 2020 at 09:21:43PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <[email protected]>
>
> hmm_vma_walk->last is supposed to be updated after every write to the
> pfns, so that it can be returned by hmm_range_fault(). However, this is
> not done consistently. Fortunately nothing checks the return code of
> hmm_range_fault() for anything other than error.
>
> More importantly last must be set before returning -EBUSY as it is used to
> prevent reading an output pfn as an input flags when the loop restarts.
>
> For clarity and simplicity make hmm_range_fault() return 0 or -ERRNO. Only
> set last when returning -EBUSY.
>
> Signed-off-by: Jason Gunthorpe <[email protected]>
> ---
> Documentation/vm/hmm.rst | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++--
> drivers/gpu/drm/nouveau/nouveau_svm.c | 6 +++---
> include/linux/hmm.h | 2 +-
> mm/hmm.c | 25 +++++++++----------------
> 5 files changed, 16 insertions(+), 23 deletions(-)
>
> diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
> index 4e3e9362afeb10..9924f2caa0184c 100644
> --- a/Documentation/vm/hmm.rst
> +++ b/Documentation/vm/hmm.rst
> @@ -161,7 +161,7 @@ device must complete the update before the driver callback returns.
> When the device driver wants to populate a range of virtual addresses, it can
> use::
>
> - long hmm_range_fault(struct hmm_range *range);
> + int hmm_range_fault(struct hmm_range *range);
>
> It will trigger a page fault on missing or read-only entries if write access is
> requested (see below). Page faults use the generic mm page fault code path just
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 6309ff72bd7876..efc1329a019127 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -852,12 +852,12 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> down_read(&mm->mmap_sem);
> r = hmm_range_fault(range);
> up_read(&mm->mmap_sem);
> - if (unlikely(r <= 0)) {
> + if (unlikely(r)) {
> /*
> * FIXME: This timeout should encompass the retry from
> * mmu_interval_read_retry() as well.
> */
> - if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout))
> + if ((r == -EBUSY) && !time_after(jiffies, timeout))

Please also kill the superflous inner braces here.

> + * Return: 0 or -ERRNO with one of the following status codes:

Maybe say something like:

* Returns 0 on success or one of the following error codes:

Otherwise this looks good:

Reviewed-by: Christoph Hellwig <[email protected]>

2020-04-22 05:55:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH hmm 4/5] mm/hmm: remove HMM_PFN_SPECIAL

On Tue, Apr 21, 2020 at 09:21:45PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <[email protected]>
>
> This is just an alias for HMM_PFN_ERROR, nothing cares that the error was
> because of a special page vs any other error case.

Looks good,

Reviewed-by: Christoph Hellwig <[email protected]>

2020-04-22 06:07:36

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault



On Tue, Apr 21, 2020 at 09:21:46PM -0300, Jason Gunthorpe wrote:
> +void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
> + u64 *ioctl_addr)
> {
> unsigned long i, npages;
>
> + /*
> + * The ioctl_addr prepared here is passed through nvif_object_ioctl()
> + * to an eventual DMA map on some call chain like:
> + * nouveau_svm_fault():
> + * args.i.m.method = NVIF_VMM_V0_PFNMAP
> + * nouveau_range_fault()
> + * nvif_object_ioctl()
> + * client->driver->ioctl()
> + * struct nvif_driver nvif_driver_nvkm:
> + * .ioctl = nvkm_client_ioctl
> + * nvkm_ioctl()
> + * nvkm_ioctl_path()
> + * nvkm_ioctl_v0[type].func(..)
> + * nvkm_ioctl_mthd()
> + * nvkm_object_mthd()
> + * struct nvkm_object_func nvkm_uvmm:
> + * .mthd = nvkm_uvmm_mthd
> + * nvkm_uvmm_mthd()
> + * nvkm_uvmm_mthd_pfnmap()
> + * nvkm_vmm_pfn_map()
> + * nvkm_vmm_ptes_get_map()
> + * func == gp100_vmm_pgt_pfn
> + * struct nvkm_vmm_desc_func gp100_vmm_desc_spt:
> + * .pfn = gp100_vmm_pgt_pfn
> + * nvkm_vmm_iter()
> + * REF_PTES == func == gp100_vmm_pgt_pfn()
> + * dma_map_page()
> + *
> + * This is all just encoding the internal hmm reprensetation into a
> + * different nouveau internal representation.
> + */

Nice callchain from hell.. Unfortunately such "code listings" tend to
get out of date very quickly, so I'm not sure it is worth keeping in
the code. What would be really worthile is consolidating the two
different sets of defines (NVIF_VMM_PFNMAP_V0_ vs NVKM_VMM_PFN_)
to make the code a little easier to follow.

> npages = (range->end - range->start) >> PAGE_SHIFT;
> for (i = 0; i < npages; ++i) {
> struct page *page;
>
> + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
> + ioctl_addr[i] = 0;
> continue;
> + }

Can't we rely on the caller pre-zeroing the array?

> + page = hmm_pfn_to_page(range->hmm_pfns[i]);
> + if (is_device_private_page(page))
> + ioctl_addr[i] = nouveau_dmem_page_addr(page) |
> + NVIF_VMM_PFNMAP_V0_V |
> + NVIF_VMM_PFNMAP_V0_VRAM;
> + else
> + ioctl_addr[i] = page_to_phys(page) |
> + NVIF_VMM_PFNMAP_V0_V |
> + NVIF_VMM_PFNMAP_V0_HOST;
> + if (range->hmm_pfns[i] & HMM_PFN_WRITE)
> + ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W;

Now that this routine isn't really device memory specific any more, I
wonder if it should move to nouveau_svm.c.

2020-04-22 12:41:36

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

On Wed, Apr 22, 2020 at 08:03:29AM +0200, Christoph Hellwig wrote:
>
>
> On Tue, Apr 21, 2020 at 09:21:46PM -0300, Jason Gunthorpe wrote:
> > +void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
> > + u64 *ioctl_addr)
> > {
> > unsigned long i, npages;
> >
> > + /*
> > + * The ioctl_addr prepared here is passed through nvif_object_ioctl()
> > + * to an eventual DMA map on some call chain like:
> > + * nouveau_svm_fault():
> > + * args.i.m.method = NVIF_VMM_V0_PFNMAP
> > + * nouveau_range_fault()
> > + * nvif_object_ioctl()
> > + * client->driver->ioctl()
> > + * struct nvif_driver nvif_driver_nvkm:
> > + * .ioctl = nvkm_client_ioctl
> > + * nvkm_ioctl()
> > + * nvkm_ioctl_path()
> > + * nvkm_ioctl_v0[type].func(..)
> > + * nvkm_ioctl_mthd()
> > + * nvkm_object_mthd()
> > + * struct nvkm_object_func nvkm_uvmm:
> > + * .mthd = nvkm_uvmm_mthd
> > + * nvkm_uvmm_mthd()
> > + * nvkm_uvmm_mthd_pfnmap()
> > + * nvkm_vmm_pfn_map()
> > + * nvkm_vmm_ptes_get_map()
> > + * func == gp100_vmm_pgt_pfn
> > + * struct nvkm_vmm_desc_func gp100_vmm_desc_spt:
> > + * .pfn = gp100_vmm_pgt_pfn
> > + * nvkm_vmm_iter()
> > + * REF_PTES == func == gp100_vmm_pgt_pfn()
> > + * dma_map_page()
> > + *
> > + * This is all just encoding the internal hmm reprensetation into a
> > + * different nouveau internal representation.
> > + */
>
> Nice callchain from hell.. Unfortunately such "code listings" tend to
> get out of date very quickly, so I'm not sure it is worth keeping in
> the code. What would be really worthile is consolidating the two
> different sets of defines (NVIF_VMM_PFNMAP_V0_ vs NVKM_VMM_PFN_)
> to make the code a little easier to follow.

I was mainly concerned that this function is using hmm properly,
becuase it sure looks like it is just forming the CPU physical address
into a HW specific data. But it turns out it is just an internal data
for some other code and the dma_map is impossibly far away

It took forever to find, I figured I'd leave a hint for the next poor
soul that has to look at this..

Also, I think it shows there is no 'performance' argument here, if
this path needs more performance the above should be cleaned
before we abuse hmm_range_fault.

Put it in the commit message instead?

> > npages = (range->end - range->start) >> PAGE_SHIFT;
> > for (i = 0; i < npages; ++i) {
> > struct page *page;
> >
> > + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
> > + ioctl_addr[i] = 0;
> > continue;
> > + }
>
> Can't we rely on the caller pre-zeroing the array?

This ends up as args.phys in nouveau_svm_fault - I didn't see a
zeroing?

I think it makes sense that this routine fully sets the output array
and does not assume pre-initialize

> > + page = hmm_pfn_to_page(range->hmm_pfns[i]);
> > + if (is_device_private_page(page))
> > + ioctl_addr[i] = nouveau_dmem_page_addr(page) |
> > + NVIF_VMM_PFNMAP_V0_V |
> > + NVIF_VMM_PFNMAP_V0_VRAM;
> > + else
> > + ioctl_addr[i] = page_to_phys(page) |
> > + NVIF_VMM_PFNMAP_V0_V |
> > + NVIF_VMM_PFNMAP_V0_HOST;
> > + if (range->hmm_pfns[i] & HMM_PFN_WRITE)
> > + ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W;
>
> Now that this routine isn't really device memory specific any more, I
> wonder if it should move to nouveau_svm.c.

Yes, if we expose nouveau_dmem_page_addr(), I will try it

Thanks,
Jason

2020-04-22 17:54:24

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

[+Philip Yang]

Am 2020-04-21 um 8:21 p.m. schrieb Jason Gunthorpe:
> From: Jason Gunthorpe <[email protected]>
>
> Presumably the intent here was that hmm_range_fault() could put the data
> into some HW specific format and thus avoid some work. However, nothing
> actually does that, and it isn't clear how anything actually could do that
> as hmm_range_fault() provides CPU addresses which must be DMA mapped.
>
> Perhaps there is some special HW that does not need DMA mapping, but we
> don't have any examples of this, and the theoretical performance win of
> avoiding an extra scan over the pfns array doesn't seem worth the
> complexity. Plus pfns needs to be scanned anyhow to sort out any
> DEVICE_PRIVATE pages.
>
> This version replaces the uint64_t with an usigned long containing a pfn
> and fix flags. On input flags is filled with the HMM_PFN_REQ_* values, on
> successful output it is filled with HMM_PFN_* values, describing the state
> of the pages.
>
> amdgpu is simple to convert, it doesn't use snapshot and doesn't use
> per-page flags.
>
> nouveau uses only 16 hmm_pte entries at most (ie fits in a few cache
> lines), and it sweeps over its pfns array a couple of times anyhow.
>
> Signed-off-by: Jason Gunthorpe <[email protected]>
> Signed-off-by: Christoph Hellwig <[email protected]>

Hi Jason,

I pointed out a typo in the documentation inline. Other than that, the
series is

Acked-by: Felix Kuehling <[email protected]>

I'll try to build it and run some basic tests later.


> ---
> Documentation/vm/hmm.rst | 26 ++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 35 ++----
> drivers/gpu/drm/nouveau/nouveau_dmem.c | 60 +++++++--
> drivers/gpu/drm/nouveau/nouveau_dmem.h | 4 +-
> drivers/gpu/drm/nouveau/nouveau_svm.c | 52 ++++----
> include/linux/hmm.h | 99 ++++++---------
> mm/hmm.c | 160 +++++++++++-------------
> 7 files changed, 204 insertions(+), 232 deletions(-)
>
> diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
> index 9924f2caa0184c..73a9b8c858e5d9 100644
> --- a/Documentation/vm/hmm.rst
> +++ b/Documentation/vm/hmm.rst
> @@ -185,9 +185,6 @@ The usage pattern is::
> range.start = ...;
> range.end = ...;
> range.pfns = ...;
> - range.flags = ...;
> - range.values = ...;
> - range.pfn_shift = ...;
>
> if (!mmget_not_zero(interval_sub->notifier.mm))
> return -EFAULT;
> @@ -229,15 +226,10 @@ The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specif
> fault or snapshot policy for the whole range instead of having to set them
> for each entry in the pfns array.
>
> -For instance, if the device flags for range.flags are::
> +For instance if the device driver wants pages for a range with at least read
> +permission, it sets::
>
> - range.flags[HMM_PFN_VALID] = (1 << 63);
> - range.flags[HMM_PFN_WRITE] = (1 << 62);
> -
> -and the device driver wants pages for a range with at least read permission,
> -it sets::
> -
> - range->default_flags = (1 << 63);
> + range->default_flags = HMM_PFN_REQ_VALID;

This should be HMM_PFN_REQ_FAULT.


> range->pfn_flags_mask = 0;
>
> and calls hmm_range_fault() as described above. This will fill fault all pages
> @@ -246,18 +238,18 @@ in the range with at least read permission.
> Now let's say the driver wants to do the same except for one page in the range for
> which it wants to have write permission. Now driver set::
>
> - range->default_flags = (1 << 63);
> - range->pfn_flags_mask = (1 << 62);
> - range->pfns[index_of_write] = (1 << 62);
> + range->default_flags = HMM_PFN_REQ_VALID;

HMM_PFN_REQ_FAULT

Regards,
  Felix


> + range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
> + range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
>
> With this, HMM will fault in all pages with at least read (i.e., valid) and for the
> address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
> write permission i.e., if the CPU pte does not have write permission set then HMM
> will call handle_mm_fault().
>
> -Note that HMM will populate the pfns array with write permission for any page
> -that is mapped with CPU write permission no matter what values are set
> -in default_flags or pfn_flags_mask.
> +After hmm_range_fault completes the flag bits are set to the current state of
> +the page tables, ie HMM_PFN_VALID | HMM_PFN_WRITE will be set if the page is
> +writable.
>
>
> Represent and manage device memory from core kernel point of view
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 449083f9f8a2bf..bcfa8c26647d5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -766,17 +766,6 @@ struct amdgpu_ttm_tt {
> };
>
> #ifdef CONFIG_DRM_AMDGPU_USERPTR
> -/* flags used by HMM internal, not related to CPU/GPU PTE flags */
> -static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
> - (1 << 0), /* HMM_PFN_VALID */
> - (1 << 1), /* HMM_PFN_WRITE */
> -};
> -
> -static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
> - 0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
> - 0, /* HMM_PFN_NONE */
> -};
> -
> /**
> * amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
> * memory and start HMM tracking CPU page table update
> @@ -815,18 +804,15 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> goto out;
> }
> range->notifier = &bo->notifier;
> - range->flags = hmm_range_flags;
> - range->values = hmm_range_values;
> - range->pfn_shift = PAGE_SHIFT;
> range->start = bo->notifier.interval_tree.start;
> range->end = bo->notifier.interval_tree.last + 1;
> - range->default_flags = hmm_range_flags[HMM_PFN_VALID];
> + range->default_flags = HMM_PFN_REQ_FAULT;
> if (!amdgpu_ttm_tt_is_readonly(ttm))
> - range->default_flags |= range->flags[HMM_PFN_WRITE];
> + range->default_flags |= HMM_PFN_REQ_WRITE;
>
> - range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
> - GFP_KERNEL);
> - if (unlikely(!range->pfns)) {
> + range->hmm_pfns = kvmalloc_array(ttm->num_pages,
> + sizeof(*range->hmm_pfns), GFP_KERNEL);
> + if (unlikely(!range->hmm_pfns)) {
> r = -ENOMEM;
> goto out_free_ranges;
> }
> @@ -867,7 +853,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> * the notifier_lock, and mmu_interval_read_retry() must be done first.
> */
> for (i = 0; i < ttm->num_pages; i++)
> - pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
> + pages[i] = hmm_pfn_to_page(range->hmm_pfns[i]);
>
> gtt->range = range;
> mmput(mm);
> @@ -877,7 +863,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> out_unlock:
> up_read(&mm->mmap_sem);
> out_free_pfns:
> - kvfree(range->pfns);
> + kvfree(range->hmm_pfns);
> out_free_ranges:
> kfree(range);
> out:
> @@ -902,7 +888,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
> DRM_DEBUG_DRIVER("user_pages_done 0x%llx pages 0x%lx\n",
> gtt->userptr, ttm->num_pages);
>
> - WARN_ONCE(!gtt->range || !gtt->range->pfns,
> + WARN_ONCE(!gtt->range || !gtt->range->hmm_pfns,
> "No user pages to check\n");
>
> if (gtt->range) {
> @@ -912,7 +898,7 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
> */
> r = mmu_interval_read_retry(gtt->range->notifier,
> gtt->range->notifier_seq);
> - kvfree(gtt->range->pfns);
> + kvfree(gtt->range->hmm_pfns);
> kfree(gtt->range);
> gtt->range = NULL;
> }
> @@ -1003,8 +989,7 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
>
> for (i = 0; i < ttm->num_pages; i++) {
> if (ttm->pages[i] !=
> - hmm_device_entry_to_page(gtt->range,
> - gtt->range->pfns[i]))
> + hmm_pfn_to_page(gtt->range->hmm_pfns[i]))
> break;
> }
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index ad89e09a0be39a..07876fb0e1d665 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -672,27 +672,61 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
> return ret;
> }
>
> -void
> -nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> - struct hmm_range *range)
> +void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
> + u64 *ioctl_addr)
> {
> unsigned long i, npages;
>
> + /*
> + * The ioctl_addr prepared here is passed through nvif_object_ioctl()
> + * to an eventual DMA map on some call chain like:
> + * nouveau_svm_fault():
> + * args.i.m.method = NVIF_VMM_V0_PFNMAP
> + * nouveau_range_fault()
> + * nvif_object_ioctl()
> + * client->driver->ioctl()
> + * struct nvif_driver nvif_driver_nvkm:
> + * .ioctl = nvkm_client_ioctl
> + * nvkm_ioctl()
> + * nvkm_ioctl_path()
> + * nvkm_ioctl_v0[type].func(..)
> + * nvkm_ioctl_mthd()
> + * nvkm_object_mthd()
> + * struct nvkm_object_func nvkm_uvmm:
> + * .mthd = nvkm_uvmm_mthd
> + * nvkm_uvmm_mthd()
> + * nvkm_uvmm_mthd_pfnmap()
> + * nvkm_vmm_pfn_map()
> + * nvkm_vmm_ptes_get_map()
> + * func == gp100_vmm_pgt_pfn
> + * struct nvkm_vmm_desc_func gp100_vmm_desc_spt:
> + * .pfn = gp100_vmm_pgt_pfn
> + * nvkm_vmm_iter()
> + * REF_PTES == func == gp100_vmm_pgt_pfn()
> + * dma_map_page()
> + *
> + * This is all just encoding the internal hmm reprensetation into a
> + * different nouveau internal representation.
> + */
> npages = (range->end - range->start) >> PAGE_SHIFT;
> for (i = 0; i < npages; ++i) {
> struct page *page;
> - uint64_t addr;
>
> - page = hmm_device_entry_to_page(range, range->pfns[i]);
> - if (page == NULL)
> - continue;
> -
> - if (!is_device_private_page(page))
> + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
> + ioctl_addr[i] = 0;
> continue;
> + }
>
> - addr = nouveau_dmem_page_addr(page);
> - range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
> - range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
> - range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
> + page = hmm_pfn_to_page(range->hmm_pfns[i]);
> + if (is_device_private_page(page))
> + ioctl_addr[i] = nouveau_dmem_page_addr(page) |
> + NVIF_VMM_PFNMAP_V0_V |
> + NVIF_VMM_PFNMAP_V0_VRAM;
> + else
> + ioctl_addr[i] = page_to_phys(page) |
> + NVIF_VMM_PFNMAP_V0_V |
> + NVIF_VMM_PFNMAP_V0_HOST;
> + if (range->hmm_pfns[i] & HMM_PFN_WRITE)
> + ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W;
> }
> }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.h b/drivers/gpu/drm/nouveau/nouveau_dmem.h
> index 92394be5d64923..4607c0be7dd062 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.h
> @@ -38,8 +38,8 @@ int nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
> unsigned long start,
> unsigned long end);
>
> -void nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> - struct hmm_range *range);
> +void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range,
> + u64 *ioctl_addr);
> #else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
> static inline void nouveau_dmem_init(struct nouveau_drm *drm) {}
> static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
> index cf0d9bd61bebf9..d1059bce091192 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
> @@ -369,18 +369,6 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
> return ret;
> }
>
> -static const u64
> -nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
> - [HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V,
> - [HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W,
> -};
> -
> -static const u64
> -nouveau_svm_pfn_values[HMM_PFN_VALUE_MAX] = {
> - [HMM_PFN_ERROR ] = ~NVIF_VMM_PFNMAP_V0_V,
> - [HMM_PFN_NONE ] = NVIF_VMM_PFNMAP_V0_NONE,
> -};
> -
> /* Issue fault replay for GPU to retry accesses that faulted previously. */
> static void
> nouveau_svm_fault_replay(struct nouveau_svm *svm)
> @@ -520,7 +508,8 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
>
> static int nouveau_range_fault(struct nouveau_svmm *svmm,
> struct nouveau_drm *drm, void *data, u32 size,
> - u64 *pfns, struct svm_notifier *notifier)
> + unsigned long hmm_pfns[], u64 *ioctl_addr,
> + struct svm_notifier *notifier)
> {
> unsigned long timeout =
> jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
> @@ -529,10 +518,8 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
> .notifier = &notifier->notifier,
> .start = notifier->notifier.interval_tree.start,
> .end = notifier->notifier.interval_tree.last + 1,
> - .pfns = pfns,
> - .flags = nouveau_svm_pfn_flags,
> - .values = nouveau_svm_pfn_values,
> - .pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT,
> + .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
> + .hmm_pfns = hmm_pfns,
> };
> struct mm_struct *mm = notifier->notifier.mm;
> int ret;
> @@ -542,12 +529,15 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
> return -EBUSY;
>
> range.notifier_seq = mmu_interval_read_begin(range.notifier);
> - range.default_flags = 0;
> - range.pfn_flags_mask = -1UL;
> down_read(&mm->mmap_sem);
> ret = hmm_range_fault(&range);
> up_read(&mm->mmap_sem);
> if (ret) {
> + /*
> + * FIXME: the input PFN_REQ flags are destroyed on
> + * -EBUSY, we need to regenerate them, also for the
> + * other continue below
> + */
> if (ret == -EBUSY)
> continue;
> return ret;
> @@ -562,7 +552,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
> break;
> }
>
> - nouveau_dmem_convert_pfn(drm, &range);
> + nouveau_hmm_convert_pfn(drm, &range, ioctl_addr);
>
> svmm->vmm->vmm.object.client->super = true;
> ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
> @@ -589,6 +579,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
> } i;
> u64 phys[16];
> } args;
> + unsigned long hmm_pfns[ARRAY_SIZE(args.phys)];
> struct vm_area_struct *vma;
> u64 inst, start, limit;
> int fi, fn, pi, fill;
> @@ -704,12 +695,17 @@ nouveau_svm_fault(struct nvif_notify *notify)
> * access flags.
> *XXX: atomic?
> */
> - if (buffer->fault[fn]->access != 0 /* READ. */ &&
> - buffer->fault[fn]->access != 3 /* PREFETCH. */) {
> - args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V |
> - NVIF_VMM_PFNMAP_V0_W;
> - } else {
> - args.phys[pi++] = NVIF_VMM_PFNMAP_V0_V;
> + switch (buffer->fault[fn]->access) {
> + case 0: /* READ. */
> + hmm_pfns[pi++] = HMM_PFN_REQ_FAULT;
> + break;
> + case 3: /* PREFETCH. */
> + hmm_pfns[pi++] = 0;
> + break;
> + default:
> + hmm_pfns[pi++] = HMM_PFN_REQ_FAULT |
> + HMM_PFN_REQ_WRITE;
> + break;
> }
> args.i.p.size = pi << PAGE_SHIFT;
>
> @@ -737,7 +733,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
> fill = (buffer->fault[fn ]->addr -
> buffer->fault[fn - 1]->addr) >> PAGE_SHIFT;
> while (--fill)
> - args.phys[pi++] = NVIF_VMM_PFNMAP_V0_NONE;
> + hmm_pfns[pi++] = 0;
> }
>
> SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)",
> @@ -753,7 +749,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
> ret = nouveau_range_fault(
> svmm, svm->drm, &args,
> sizeof(args.i) + pi * sizeof(args.phys[0]),
> - args.phys, &notifier);
> + hmm_pfns, args.phys, &notifier);
> mmu_interval_notifier_remove(&notifier.notifier);
> }
> mmput(mm);
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 81c302c884c0e3..e72529786f16f8 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -19,45 +19,45 @@
> #include <linux/mmu_notifier.h>
>
> /*
> - * hmm_pfn_flag_e - HMM flag enums
> + * On output:
> + * 0 - The page is faultable and a future call with
> + * HMM_PFN_REQ_FAULT could succeed.
> + * HMM_PFN_VALID - the pfn field points to a valid PFN. This PFN is at
> + * least readable. If dev_private_owner is !NULL then this could
> + * point at a DEVICE_PRIVATE page.
> + * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
> + * HMM_PFN_ERROR - accessing the pfn is impossible and the device should
> + * fail. ie poisoned memory, special pages, no vma, etc
> *
> - * Flags:
> - * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
> - * HMM_PFN_WRITE: CPU page table has write permission set
> - *
> - * The driver provides a flags array for mapping page protections to device
> - * PTE bits. If the driver valid bit for an entry is bit 3,
> - * i.e., (entry & (1 << 3)), then the driver must provide
> - * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3.
> - * Same logic apply to all flags. This is the same idea as vm_page_prot in vma
> - * except that this is per device driver rather than per architecture.
> + * On input:
> + * 0 - Return the current state of the page, do not fault it.
> + * HMM_PFN_REQ_FAULT - The output must have HMM_PFN_VALID or hmm_range_fault()
> + * will fail
> + * HMM_PFN_REQ_WRITE - The output must have HMM_PFN_WRITE or hmm_range_fault()
> + * will fail. Must be combined with HMM_PFN_REQ_FAULT.
> */
> -enum hmm_pfn_flag_e {
> - HMM_PFN_VALID = 0,
> - HMM_PFN_WRITE,
> - HMM_PFN_FLAG_MAX
> +enum hmm_pfn_flags {
> + HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
> + HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
> + HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
> +
> + HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
> + HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
> +
> + HMM_PFN_FLAGS = HMM_PFN_VALID | HMM_PFN_WRITE | HMM_PFN_ERROR,
> };
>
> /*
> - * hmm_pfn_value_e - HMM pfn special value
> - *
> - * Flags:
> - * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
> - * HMM_PFN_NONE: corresponding CPU page table entry is pte_none()
> + * hmm_pfn_to_page() - return struct page pointed to by a device entry
> *
> - * Driver provides values for none entry, error entry, and special entry.
> - * Driver can alias (i.e., use same value) error and special, but
> - * it should not alias none with error or special.
> - *
> - * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be:
> - * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous,
> - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry,
> + * This must be called under the caller 'user_lock' after a successful
> + * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID
> + * already.
> */
> -enum hmm_pfn_value_e {
> - HMM_PFN_ERROR,
> - HMM_PFN_NONE,
> - HMM_PFN_VALUE_MAX
> -};
> +static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn)
> +{
> + return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS);
> +}
>
> /*
> * struct hmm_range - track invalidation lock on virtual address range
> @@ -66,12 +66,9 @@ enum hmm_pfn_value_e {
> * @notifier_seq: result of mmu_interval_read_begin()
> * @start: range virtual start address (inclusive)
> * @end: range virtual end address (exclusive)
> - * @pfns: array of pfns (big enough for the range)
> - * @flags: pfn flags to match device driver page table
> - * @values: pfn value for some special case (none, special, error, ...)
> + * @hmm_pfns: array of pfns (big enough for the range)
> * @default_flags: default flags for the range (write, read, ... see hmm doc)
> * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
> - * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
> * @dev_private_owner: owner of device private pages
> */
> struct hmm_range {
> @@ -79,36 +76,12 @@ struct hmm_range {
> unsigned long notifier_seq;
> unsigned long start;
> unsigned long end;
> - uint64_t *pfns;
> - const uint64_t *flags;
> - const uint64_t *values;
> - uint64_t default_flags;
> - uint64_t pfn_flags_mask;
> - uint8_t pfn_shift;
> + unsigned long *hmm_pfns;
> + unsigned long default_flags;
> + unsigned long pfn_flags_mask;
> void *dev_private_owner;
> };
>
> -/*
> - * hmm_device_entry_to_page() - return struct page pointed to by a device entry
> - * @range: range use to decode device entry value
> - * @entry: device entry value to get corresponding struct page from
> - * Return: struct page pointer if entry is a valid, NULL otherwise
> - *
> - * If the device entry is valid (ie valid flag set) then return the struct page
> - * matching the entry value. Otherwise return NULL.
> - */
> -static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
> - uint64_t entry)
> -{
> - if (entry == range->values[HMM_PFN_NONE])
> - return NULL;
> - if (entry == range->values[HMM_PFN_ERROR])
> - return NULL;
> - if (!(entry & range->flags[HMM_PFN_VALID]))
> - return NULL;
> - return pfn_to_page(entry >> range->pfn_shift);
> -}
> -
> /*
> * Please see Documentation/vm/hmm.rst for how to use the range API.
> */
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 2693393dc14b30..c1c96d4cc0554c 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -37,28 +37,13 @@ enum {
> HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
> };
>
> -/*
> - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
> - * @range: range use to encode HMM pfn value
> - * @pfn: pfn value for which to create the device entry
> - * Return: valid device entry for the pfn
> - */
> -static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
> - unsigned long pfn)
> -{
> - return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
> -}
> -
> static int hmm_pfns_fill(unsigned long addr, unsigned long end,
> - struct hmm_range *range, enum hmm_pfn_value_e value)
> + struct hmm_range *range, unsigned long cpu_flags)
> {
> - uint64_t *pfns = range->pfns;
> - unsigned long i;
> + unsigned long i = (addr - range->start) >> PAGE_SHIFT;
>
> - i = (addr - range->start) >> PAGE_SHIFT;
> for (; addr < end; addr += PAGE_SIZE, i++)
> - pfns[i] = range->values[value];
> -
> + range->hmm_pfns[i] = cpu_flags;
> return 0;
> }
>
> @@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
> }
>
> static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
> - uint64_t pfns, uint64_t cpu_flags)
> + unsigned long pfn_req_flags,
> + unsigned long cpu_flags)
> {
> struct hmm_range *range = hmm_vma_walk->range;
>
> @@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
> * waste to have the user pre-fill the pfn arrays with a default
> * flags value.
> */
> - pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
> + pfn_req_flags &= range->pfn_flags_mask;
> + pfn_req_flags |= range->default_flags;
>
> /* We aren't ask to do anything ... */
> - if (!(pfns & range->flags[HMM_PFN_VALID]))
> + if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
> return 0;
>
> /* Need to write fault ? */
> - if ((pfns & range->flags[HMM_PFN_WRITE]) &&
> - !(cpu_flags & range->flags[HMM_PFN_WRITE]))
> + if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
> + !(cpu_flags & HMM_PFN_WRITE))
> return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
>
> /* If CPU page table is not valid then we need to fault */
> - if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
> + if (!(cpu_flags & HMM_PFN_VALID))
> return HMM_NEED_FAULT;
> return 0;
> }
>
> static unsigned int
> hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
> - const uint64_t *pfns, unsigned long npages,
> - uint64_t cpu_flags)
> + const unsigned long hmm_pfns[], unsigned long npages,
> + unsigned long cpu_flags)
> {
> struct hmm_range *range = hmm_vma_walk->range;
> unsigned int required_fault = 0;
> @@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
> * hmm_pte_need_fault() will always return 0.
> */
> if (!((range->default_flags | range->pfn_flags_mask) &
> - range->flags[HMM_PFN_VALID]))
> + HMM_PFN_REQ_FAULT))
> return 0;
>
> for (i = 0; i < npages; ++i) {
> - required_fault |=
> - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
> + required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
> + cpu_flags);
> if (required_fault == HMM_NEED_ALL_BITS)
> return required_fault;
> }
> @@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
> struct hmm_range *range = hmm_vma_walk->range;
> unsigned int required_fault;
> unsigned long i, npages;
> - uint64_t *pfns;
> + unsigned long *hmm_pfns;
>
> i = (addr - range->start) >> PAGE_SHIFT;
> npages = (end - addr) >> PAGE_SHIFT;
> - pfns = &range->pfns[i];
> - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
> + hmm_pfns = &range->hmm_pfns[i];
> + required_fault =
> + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
> if (!walk->vma) {
> if (required_fault)
> return -EFAULT;
> @@ -174,44 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
> }
> if (required_fault)
> return hmm_vma_fault(addr, end, required_fault, walk);
> - return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
> + return hmm_pfns_fill(addr, end, range, 0);
> }
>
> -static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
> +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
> + pmd_t pmd)
> {
> if (pmd_protnone(pmd))
> return 0;
> - return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
> - range->flags[HMM_PFN_WRITE] :
> - range->flags[HMM_PFN_VALID];
> + return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
> }
>
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
> - unsigned long end, uint64_t *pfns, pmd_t pmd)
> + unsigned long end, unsigned long hmm_pfns[],
> + pmd_t pmd)
> {
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> unsigned long pfn, npages, i;
> unsigned int required_fault;
> - uint64_t cpu_flags;
> + unsigned long cpu_flags;
>
> npages = (end - addr) >> PAGE_SHIFT;
> cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
> required_fault =
> - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
> + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
> if (required_fault)
> return hmm_vma_fault(addr, end, required_fault, walk);
>
> pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
> - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
> + hmm_pfns[i] = pfn | cpu_flags;
> return 0;
> }
> #else /* CONFIG_TRANSPARENT_HUGEPAGE */
> /* stub to allow the code below to compile */
> int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
> - unsigned long end, uint64_t *pfns, pmd_t pmd);
> + unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> static inline bool hmm_is_device_private_entry(struct hmm_range *range,
> @@ -222,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
> range->dev_private_owner;
> }
>
> -static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
> +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
> + pte_t pte)
> {
> if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
> return 0;
> - return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
> - range->flags[HMM_PFN_WRITE] :
> - range->flags[HMM_PFN_VALID];
> + return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
> }
>
> static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> unsigned long end, pmd_t *pmdp, pte_t *ptep,
> - uint64_t *pfn)
> + unsigned long *hmm_pfn)
> {
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> unsigned int required_fault;
> - uint64_t cpu_flags;
> + unsigned long cpu_flags;
> pte_t pte = *ptep;
> - uint64_t orig_pfn = *pfn;
> + uint64_t pfn_req_flags = *hmm_pfn;
>
> if (pte_none(pte)) {
> - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
> + required_fault =
> + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
> if (required_fault)
> goto fault;
> - *pfn = range->values[HMM_PFN_NONE];
> + *hmm_pfn = 0;
> return 0;
> }
>
> @@ -258,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> * the PFN even if not present.
> */
> if (hmm_is_device_private_entry(range, entry)) {
> - *pfn = hmm_device_entry_from_pfn(range,
> - device_private_entry_to_pfn(entry));
> - *pfn |= range->flags[HMM_PFN_VALID];
> + cpu_flags = HMM_PFN_VALID;
> if (is_write_device_private_entry(entry))
> - *pfn |= range->flags[HMM_PFN_WRITE];
> + cpu_flags |= HMM_PFN_WRITE;
> + *hmm_pfn = device_private_entry_to_pfn(entry) |
> + cpu_flags;
> return 0;
> }
>
> - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
> + required_fault =
> + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
> if (!required_fault) {
> - *pfn = range->values[HMM_PFN_NONE];
> + *hmm_pfn = 0;
> return 0;
> }
>
> @@ -288,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> }
>
> cpu_flags = pte_to_hmm_pfn_flags(range, pte);
> - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
> + required_fault =
> + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
> if (required_fault)
> goto fault;
>
> @@ -297,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> * fall through and treat it like a normal page.
> */
> if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
> - if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
> + if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
> pte_unmap(ptep);
> return -EFAULT;
> }
> - *pfn = range->values[HMM_PFN_ERROR];
> + *hmm_pfn = HMM_PFN_ERROR;
> return 0;
> }
>
> - *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
> + *hmm_pfn = pte_pfn(pte) | cpu_flags;
> return 0;
>
> fault:
> @@ -321,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> {
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> - uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
> + unsigned long *hmm_pfns =
> + &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
> unsigned long npages = (end - start) >> PAGE_SHIFT;
> unsigned long addr = start;
> pte_t *ptep;
> @@ -333,16 +324,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> return hmm_vma_walk_hole(start, end, -1, walk);
>
> if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
> - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
> hmm_vma_walk->last = addr;
> pmd_migration_entry_wait(walk->mm, pmdp);
> return -EBUSY;
> }
> - return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
> + return hmm_pfns_fill(start, end, range, 0);
> }
>
> if (!pmd_present(pmd)) {
> - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> return -EFAULT;
> return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> }
> @@ -362,7 +353,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
> goto again;
>
> - return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
> + return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
> }
>
> /*
> @@ -372,16 +363,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> * recover.
> */
> if (pmd_bad(pmd)) {
> - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
> + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
> return -EFAULT;
> return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> }
>
> ptep = pte_offset_map(pmdp, addr);
> - for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
> + for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
> int r;
>
> - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
> + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
> if (r) {
> /* hmm_vma_handle_pte() did pte_unmap() */
> return r;
> @@ -393,13 +384,12 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>
> #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
> defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
> -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
> +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
> + pud_t pud)
> {
> if (!pud_present(pud))
> return 0;
> - return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
> - range->flags[HMM_PFN_WRITE] :
> - range->flags[HMM_PFN_VALID];
> + return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
> }
>
> static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
> @@ -427,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
> if (pud_huge(pud) && pud_devmap(pud)) {
> unsigned long i, npages, pfn;
> unsigned int required_fault;
> - uint64_t *pfns, cpu_flags;
> + unsigned long *hmm_pfns;
> + unsigned long cpu_flags;
>
> if (!pud_present(pud)) {
> spin_unlock(ptl);
> @@ -436,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
>
> i = (addr - range->start) >> PAGE_SHIFT;
> npages = (end - addr) >> PAGE_SHIFT;
> - pfns = &range->pfns[i];
> + hmm_pfns = &range->hmm_pfns[i];
>
> cpu_flags = pud_to_hmm_pfn_flags(range, pud);
> - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
> + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
> npages, cpu_flags);
> if (required_fault) {
> spin_unlock(ptl);
> @@ -448,8 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
>
> pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> for (i = 0; i < npages; ++i, ++pfn)
> - pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
> - cpu_flags;
> + hmm_pfns[i] = pfn | cpu_flags;
> goto out_unlock;
> }
>
> @@ -473,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> struct vm_area_struct *vma = walk->vma;
> - uint64_t orig_pfn, cpu_flags;
> unsigned int required_fault;
> + unsigned long pfn_req_flags;
> + unsigned long cpu_flags;
> spinlock_t *ptl;
> pte_t entry;
>
> @@ -482,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
> entry = huge_ptep_get(pte);
>
> i = (start - range->start) >> PAGE_SHIFT;
> - orig_pfn = range->pfns[i];
> + pfn_req_flags = range->hmm_pfns[i];
> cpu_flags = pte_to_hmm_pfn_flags(range, entry);
> - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
> + required_fault =
> + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
> if (required_fault) {
> spin_unlock(ptl);
> return hmm_vma_fault(addr, end, required_fault, walk);
> @@ -492,8 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>
> pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
> for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
> - range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
> - cpu_flags;
> + range->hmm_pfns[i] = pfn | cpu_flags;
> +
> spin_unlock(ptl);
> return 0;
> }
> @@ -524,7 +516,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
> * failure.
> */
> if (hmm_range_need_fault(hmm_vma_walk,
> - range->pfns +
> + range->hmm_pfns +
> ((start - range->start) >> PAGE_SHIFT),
> (end - start) >> PAGE_SHIFT, 0))
> return -EFAULT;

2020-04-22 19:11:21

by Ralph Campbell

[permalink] [raw]
Subject: Re: [PATCH hmm 0/5] Adjust hmm_range_fault() API


On 4/21/20 5:21 PM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <[email protected]>
>
> The API is a bit complicated for the uses we actually have, and
> disucssions for simplifying have come up a number of times.
>
> This small series removes the customizable pfn format and simplifies the
> return code of hmm_range_fault()
>
> All the drivers are adjusted to process in the simplified format.
> I would appreciated tested-by's for the two drivers, thanks!

For nouveau you can add:
Tested-by: Ralph Campbell <[email protected]>

2020-04-23 06:15:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

On Wed, Apr 22, 2020 at 09:39:11AM -0300, Jason Gunthorpe wrote:
> > Nice callchain from hell.. Unfortunately such "code listings" tend to
> > get out of date very quickly, so I'm not sure it is worth keeping in
> > the code. What would be really worthile is consolidating the two
> > different sets of defines (NVIF_VMM_PFNMAP_V0_ vs NVKM_VMM_PFN_)
> > to make the code a little easier to follow.
>
> I was mainly concerned that this function is using hmm properly,
> becuase it sure looks like it is just forming the CPU physical address
> into a HW specific data. But it turns out it is just an internal data
> for some other code and the dma_map is impossibly far away
>
> It took forever to find, I figured I'd leave a hint for the next poor
> soul that has to look at this..
>
> Also, I think it shows there is no 'performance' argument here, if
> this path needs more performance the above should be cleaned
> before we abuse hmm_range_fault.
>
> Put it in the commit message instead?

Yes, the graph itself sounds reasonable for the commit log as a point
of time information.

> > > npages = (range->end - range->start) >> PAGE_SHIFT;
> > > for (i = 0; i < npages; ++i) {
> > > struct page *page;
> > >
> > > + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
> > > + ioctl_addr[i] = 0;
> > > continue;
> > > + }
> >
> > Can't we rely on the caller pre-zeroing the array?
>
> This ends up as args.phys in nouveau_svm_fault - I didn't see a
> zeroing?
>
> I think it makes sense that this routine fully sets the output array
> and does not assume pre-initialize

Ok.

2020-04-29 19:42:55

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH hmm 2/5] mm/hmm: make hmm_range_fault return 0 or -1

On Wed, Apr 22, 2020 at 07:52:29AM +0200, Christoph Hellwig wrote:
> On Tue, Apr 21, 2020 at 09:21:43PM -0300, Jason Gunthorpe wrote:
> > From: Jason Gunthorpe <[email protected]>
> >
> > hmm_vma_walk->last is supposed to be updated after every write to the
> > pfns, so that it can be returned by hmm_range_fault(). However, this is
> > not done consistently. Fortunately nothing checks the return code of
> > hmm_range_fault() for anything other than error.
> >
> > More importantly last must be set before returning -EBUSY as it is used to
> > prevent reading an output pfn as an input flags when the loop restarts.
> >
> > For clarity and simplicity make hmm_range_fault() return 0 or -ERRNO. Only
> > set last when returning -EBUSY.
> >
> > Signed-off-by: Jason Gunthorpe <[email protected]>
> > Documentation/vm/hmm.rst | 2 +-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++--
> > drivers/gpu/drm/nouveau/nouveau_svm.c | 6 +++---
> > include/linux/hmm.h | 2 +-
> > mm/hmm.c | 25 +++++++++----------------
> > 5 files changed, 16 insertions(+), 23 deletions(-)
> >
> > diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
> > index 4e3e9362afeb10..9924f2caa0184c 100644
> > +++ b/Documentation/vm/hmm.rst
> > @@ -161,7 +161,7 @@ device must complete the update before the driver callback returns.
> > When the device driver wants to populate a range of virtual addresses, it can
> > use::
> >
> > - long hmm_range_fault(struct hmm_range *range);
> > + int hmm_range_fault(struct hmm_range *range);
> >
> > It will trigger a page fault on missing or read-only entries if write access is
> > requested (see below). Page faults use the generic mm page fault code path just
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index 6309ff72bd7876..efc1329a019127 100644
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -852,12 +852,12 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
> > down_read(&mm->mmap_sem);
> > r = hmm_range_fault(range);
> > up_read(&mm->mmap_sem);
> > - if (unlikely(r <= 0)) {
> > + if (unlikely(r)) {
> > /*
> > * FIXME: This timeout should encompass the retry from
> > * mmu_interval_read_retry() as well.
> > */
> > - if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout))
> > + if ((r == -EBUSY) && !time_after(jiffies, timeout))
>
> Please also kill the superflous inner braces here.
>
> > + * Return: 0 or -ERRNO with one of the following status codes:
>
> Maybe say something like:
>
> * Returns 0 on success or one of the following error codes:
>
> Otherwise this looks good:

Got it, thanks

Jason

2020-04-29 22:45:13

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

On Wed, Apr 22, 2020 at 01:52:32PM -0400, Felix Kuehling wrote:
> [+Philip Yang]
>
> Am 2020-04-21 um 8:21 p.m. schrieb Jason Gunthorpe:
> > From: Jason Gunthorpe <[email protected]>
> >
> > Presumably the intent here was that hmm_range_fault() could put the data
> > into some HW specific format and thus avoid some work. However, nothing
> > actually does that, and it isn't clear how anything actually could do that
> > as hmm_range_fault() provides CPU addresses which must be DMA mapped.
> >
> > Perhaps there is some special HW that does not need DMA mapping, but we
> > don't have any examples of this, and the theoretical performance win of
> > avoiding an extra scan over the pfns array doesn't seem worth the
> > complexity. Plus pfns needs to be scanned anyhow to sort out any
> > DEVICE_PRIVATE pages.
> >
> > This version replaces the uint64_t with an usigned long containing a pfn
> > and fix flags. On input flags is filled with the HMM_PFN_REQ_* values, on
> > successful output it is filled with HMM_PFN_* values, describing the state
> > of the pages.
> >
> > amdgpu is simple to convert, it doesn't use snapshot and doesn't use
> > per-page flags.
> >
> > nouveau uses only 16 hmm_pte entries at most (ie fits in a few cache
> > lines), and it sweeps over its pfns array a couple of times anyhow.
> >
> > Signed-off-by: Jason Gunthorpe <[email protected]>
> > Signed-off-by: Christoph Hellwig <[email protected]>
>
> Hi Jason,
>
> I pointed out a typo in the documentation inline. Other than that, the
> series is
>
> Acked-by: Felix Kuehling <[email protected]>
>
> I'll try to build it and run some basic tests later.

Got it, thanks! Let me know if there are problems

Jason