Subject: [PATCH v1 7/9] lib: add support for device coherent type in test_hmm

Device Coherent type uses device memory that is coherently accesible by
the CPU. This could be shown as SP (special purpose) memory range
at the BIOS-e820 memory enumeration. If no SP memory is supported in
system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
0x140000000 physical address. Ex.
efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000

Signed-off-by: Alex Sierra <[email protected]>
---
lib/test_hmm.c | 191 +++++++++++++++++++++++++++++++++-----------
lib/test_hmm_uapi.h | 15 ++--
2 files changed, 153 insertions(+), 53 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 45334df28d7b..9e2cc0cd4412 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -471,6 +471,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+ int ret = -ENOMEM;

devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
@@ -553,7 +554,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
}
spin_unlock(&mdevice->lock);

- return true;
+ return 0;

err_release:
mutex_unlock(&mdevice->devmem_lock);
@@ -562,7 +563,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
err_devmem:
kfree(devmem);

- return false;
+ return ret;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
@@ -571,8 +572,10 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
struct page *rpage;

/*
- * This is a fake device so we alloc real system memory to store
- * our device memory.
+ * For ZONE_DEVICE private type, this is a fake device so we alloc real
+ * system memory to store our device memory.
+ * For ZONE_DEVICE coherent type we use the actual dpage to store the data
+ * and ignore rpage.
*/
rpage = alloc_page(GFP_HIGHUSER);
if (!rpage)
@@ -623,12 +626,17 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
* unallocated pte_none() or read-only zero page.
*/
spage = migrate_pfn_to_page(*src);
+ if (spage && is_zone_device_page(spage))
+ pr_err("page already in device spage pfn: 0x%lx\n",
+ page_to_pfn(spage));
+ WARN_ON(spage && is_zone_device_page(spage));

dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;

- rpage = dpage->zone_device_data;
+ rpage = is_device_private_page(dpage) ? dpage->zone_device_data :
+ dpage;
if (spage)
copy_highpage(rpage, spage);
else
@@ -640,8 +648,11 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
* the simulated device memory and that page holds the pointer
* to the mirror.
*/
+ rpage = dpage->zone_device_data;
rpage->zone_device_data = dmirror;

+ pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
+ page_to_pfn(spage), page_to_pfn(dpage));
*dst = migrate_pfn(page_to_pfn(dpage)) |
MIGRATE_PFN_LOCKED;
if ((*src & MIGRATE_PFN_WRITE) ||
@@ -721,10 +732,13 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
continue;

/*
- * Store the page that holds the data so the page table
- * doesn't have to deal with ZONE_DEVICE private pages.
+ * For ZONE_DEVICE private pages we store the page that
+ * holds the data so the page table doesn't have to deal it.
+ * For ZONE_DEVICE coherent pages we store the actual page, since
+ * the CPU has coherent access to the page.
*/
- entry = dpage->zone_device_data;
+ entry = is_device_private_page(dpage) ? dpage->zone_device_data :
+ dpage;
if (*dst & MIGRATE_PFN_WRITE)
entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
@@ -804,8 +818,110 @@ static int dmirror_exclusive(struct dmirror *dmirror,
return ret;
}

-static int dmirror_migrate(struct dmirror *dmirror,
- struct hmm_dmirror_cmd *cmd)
+static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
+ struct dmirror *dmirror)
+{
+ const unsigned long *src = args->src;
+ unsigned long *dst = args->dst;
+ unsigned long start = args->start;
+ unsigned long end = args->end;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE,
+ src++, dst++) {
+ struct page *dpage, *spage;
+
+ spage = migrate_pfn_to_page(*src);
+ if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ WARN_ON(!is_device_page(spage));
+ spage = is_device_private_page(spage) ? spage->zone_device_data :
+ spage;
+ dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
+ if (!dpage)
+ continue;
+ pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
+ page_to_pfn(spage), page_to_pfn(dpage));
+
+ lock_page(dpage);
+ xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
+ copy_highpage(dpage, spage);
+ *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
+ if (*src & MIGRATE_PFN_WRITE)
+ *dst |= MIGRATE_PFN_WRITE;
+ }
+ return 0;
+}
+
+static int dmirror_migrate_to_system(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
+{
+ unsigned long start, end, addr;
+ unsigned long size = cmd->npages << PAGE_SHIFT;
+ struct mm_struct *mm = dmirror->notifier.mm;
+ struct vm_area_struct *vma;
+ unsigned long src_pfns[64];
+ unsigned long dst_pfns[64];
+ struct migrate_vma args;
+ unsigned long next;
+ int ret;
+
+ start = cmd->addr;
+ end = start + size;
+ if (end < start)
+ return -EINVAL;
+
+ /* Since the mm is for the mirrored process, get a reference first. */
+ if (!mmget_not_zero(mm))
+ return -EINVAL;
+
+ mmap_read_lock(mm);
+ for (addr = start; addr < end; addr = next) {
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start ||
+ !(vma->vm_flags & VM_READ)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
+ if (next > vma->vm_end)
+ next = vma->vm_end;
+
+ args.vma = vma;
+ args.src = src_pfns;
+ args.dst = dst_pfns;
+ args.start = addr;
+ args.end = next;
+ args.pgmap_owner = dmirror->mdevice;
+ args.flags = (dmirror->mdevice->zone_device_type ==
+ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+
+ ret = migrate_vma_setup(&args);
+ if (ret)
+ goto out;
+
+ pr_debug("Migrating from device mem to sys mem\n");
+ dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
+
+ migrate_vma_pages(&args);
+ migrate_vma_finalize(&args);
+ }
+ mmap_read_unlock(mm);
+ mmput(mm);
+
+ return ret;
+
+out:
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return ret;
+}
+
+static int dmirror_migrate_to_device(struct dmirror *dmirror,
+ struct hmm_dmirror_cmd *cmd)
{
unsigned long start, end, addr;
unsigned long size = cmd->npages << PAGE_SHIFT;
@@ -849,6 +965,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
if (ret)
goto out;

+ pr_debug("Migrating from sys mem to device mem\n");
dmirror_migrate_alloc_and_copy(&args, dmirror);
migrate_vma_pages(&args);
dmirror_migrate_finalize_and_map(&args, dmirror);
@@ -857,7 +974,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
mmap_read_unlock(mm);
mmput(mm);

- /* Return the migrated data for verification. */
+ /* Return the migrated data for verification. only for pages in device zone */
ret = dmirror_bounce_init(&bounce, start, size);
if (ret)
return ret;
@@ -894,9 +1011,15 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
}

page = hmm_pfn_to_page(entry);
- if (is_device_private_page(page)) {
- /* Is the page migrated to this device or some other? */
- if (dmirror->mdevice == dmirror_page_to_device(page))
+ if (is_device_page(page)) {
+ /* Is page ZONE_DEVICE coherent? */
+ if (!is_device_private_page(page))
+ *perm = HMM_DMIRROR_PROT_DEV_COHERENT;
+ /*
+ * Is page ZONE_DEVICE private migrated to
+ * this device or some other?
+ */
+ else if (dmirror->mdevice == dmirror_page_to_device(page))
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
else
*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
@@ -1096,8 +1219,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_write(dmirror, &cmd);
break;

- case HMM_DMIRROR_MIGRATE:
- ret = dmirror_migrate(dmirror, &cmd);
+ case HMM_DMIRROR_MIGRATE_TO_DEV:
+ ret = dmirror_migrate_to_device(dmirror, &cmd);
+ break;
+
+ case HMM_DMIRROR_MIGRATE_TO_SYS:
+ ret = dmirror_migrate_to_system(dmirror, &cmd);
break;

case HMM_DMIRROR_EXCLUSIVE:
@@ -1152,38 +1279,6 @@ static void dmirror_devmem_free(struct page *page)
spin_unlock(&mdevice->lock);
}

-static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
- struct dmirror *dmirror)
-{
- const unsigned long *src = args->src;
- unsigned long *dst = args->dst;
- unsigned long start = args->start;
- unsigned long end = args->end;
- unsigned long addr;
-
- for (addr = start; addr < end; addr += PAGE_SIZE,
- src++, dst++) {
- struct page *dpage, *spage;
-
- spage = migrate_pfn_to_page(*src);
- if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
- continue;
- spage = spage->zone_device_data;
-
- dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
- if (!dpage)
- continue;
-
- lock_page(dpage);
- xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
- copy_highpage(dpage, spage);
- *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
- if (*src & MIGRATE_PFN_WRITE)
- *dst |= MIGRATE_PFN_WRITE;
- }
- return 0;
-}
-
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
struct migrate_vma args;
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index 77f81e6314eb..af15c35a90f4 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -19,6 +19,7 @@
* @npages: (in) number of pages to read/write
* @cpages: (out) number of pages copied
* @faults: (out) number of device page faults seen
+ * @zone_device_type: (out) zone device memory type
*/
struct hmm_dmirror_cmd {
__u64 addr;
@@ -32,11 +33,12 @@ struct hmm_dmirror_cmd {
/* Expose the address space of the calling process through hmm device file */
#define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd)
#define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
-#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd)

/*
* Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -51,6 +53,8 @@ struct hmm_dmirror_cmd {
* device the ioctl() is made
* HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
* other device
+ * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device
+ * the ioctl() is made
*/
enum {
HMM_DMIRROR_PROT_ERROR = 0xFF,
@@ -62,6 +66,7 @@ enum {
HMM_DMIRROR_PROT_ZERO = 0x10,
HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
+ HMM_DMIRROR_PROT_DEV_COHERENT = 0x40,
};

enum {
--
2.32.0



2021-11-25 02:14:56

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH v1 7/9] lib: add support for device coherent type in test_hmm

Am 2021-11-15 um 2:30 p.m. schrieb Alex Sierra:
> Device Coherent type uses device memory that is coherently accesible by
> the CPU. This could be shown as SP (special purpose) memory range
> at the BIOS-e820 memory enumeration. If no SP memory is supported in
> system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.
>
> Currently, test_hmm only supports two different SP ranges of at least
> 256MB size. This could be specified in the kernel parameter variable
> efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
> 0x140000000 physical address. Ex.
> efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000
>
> Signed-off-by: Alex Sierra <[email protected]>
> ---
> lib/test_hmm.c | 191 +++++++++++++++++++++++++++++++++-----------
> lib/test_hmm_uapi.h | 15 ++--
> 2 files changed, 153 insertions(+), 53 deletions(-)
>
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 45334df28d7b..9e2cc0cd4412 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -471,6 +471,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> unsigned long pfn_first;
> unsigned long pfn_last;
> void *ptr;
> + int ret = -ENOMEM;
>
> devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
> if (!devmem)
> @@ -553,7 +554,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> }
> spin_unlock(&mdevice->lock);
>
> - return true;
> + return 0;

You're changing the meaning of the return value, but you're not updating
the callers. I think at least dmirror_devmem_alloc_page will be broken
by this change.


>
> err_release:
> mutex_unlock(&mdevice->devmem_lock);
> @@ -562,7 +563,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> err_devmem:
> kfree(devmem);
>
> - return false;
> + return ret;
> }
>
> static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
> @@ -571,8 +572,10 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
> struct page *rpage;
>
> /*
> - * This is a fake device so we alloc real system memory to store
> - * our device memory.
> + * For ZONE_DEVICE private type, this is a fake device so we alloc real
> + * system memory to store our device memory.
> + * For ZONE_DEVICE coherent type we use the actual dpage to store the data
> + * and ignore rpage.
> */
> rpage = alloc_page(GFP_HIGHUSER);
> if (!rpage)
> @@ -623,12 +626,17 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> * unallocated pte_none() or read-only zero page.
> */
> spage = migrate_pfn_to_page(*src);
> + if (spage && is_zone_device_page(spage))
> + pr_err("page already in device spage pfn: 0x%lx\n",
> + page_to_pfn(spage));
> + WARN_ON(spage && is_zone_device_page(spage));
You can write WARN_ON inside the if-condition:

if (WARN_ON(spage && is_zone_device_page(spage))
...

But I don't see why you need both pr_err and WARN_ON. You can add a
custom message by using WARN instead of WARN_ON:

WARN(spage && is_zone_device_page(spage),
"page already in device spage pfn: 0x%lx\n",
page_to_pfn(spage));


>
> dpage = dmirror_devmem_alloc_page(mdevice);
> if (!dpage)
> continue;
>
> - rpage = dpage->zone_device_data;
> + rpage = is_device_private_page(dpage) ? dpage->zone_device_data :
> + dpage;
> if (spage)
> copy_highpage(rpage, spage);
> else
> @@ -640,8 +648,11 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> * the simulated device memory and that page holds the pointer
> * to the mirror.
> */
> + rpage = dpage->zone_device_data;
> rpage->zone_device_data = dmirror;
>
> + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
> + page_to_pfn(spage), page_to_pfn(dpage));
> *dst = migrate_pfn(page_to_pfn(dpage)) |
> MIGRATE_PFN_LOCKED;
> if ((*src & MIGRATE_PFN_WRITE) ||
> @@ -721,10 +732,13 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
> continue;
>
> /*
> - * Store the page that holds the data so the page table
> - * doesn't have to deal with ZONE_DEVICE private pages.
> + * For ZONE_DEVICE private pages we store the page that
> + * holds the data so the page table doesn't have to deal it.
> + * For ZONE_DEVICE coherent pages we store the actual page, since
> + * the CPU has coherent access to the page.
> */

I find this explanation confusing. The comment in
dmirror_devmem_alloc_page is much clearer. I think all we need here is
that dpage->zone_device_data points to the backing page for
device_private pages. Device_coherent struct pages don't have a backing
page because they represent a real CPU-accessible page already.


> - entry = dpage->zone_device_data;
> + entry = is_device_private_page(dpage) ? dpage->zone_device_data :
> + dpage;

I see this in a few places. Maybe better to wrap that in a helper
function or macro. Something like this:

#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
(page)->zone_device_data : (page))


> if (*dst & MIGRATE_PFN_WRITE)
> entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
> entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
> @@ -804,8 +818,110 @@ static int dmirror_exclusive(struct dmirror *dmirror,
> return ret;
> }
>
> -static int dmirror_migrate(struct dmirror *dmirror,
> - struct hmm_dmirror_cmd *cmd)
> +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> + struct dmirror *dmirror)
> +{
> + const unsigned long *src = args->src;
> + unsigned long *dst = args->dst;
> + unsigned long start = args->start;
> + unsigned long end = args->end;
> + unsigned long addr;
> +
> + for (addr = start; addr < end; addr += PAGE_SIZE,
> + src++, dst++) {
> + struct page *dpage, *spage;
> +
> + spage = migrate_pfn_to_page(*src);
> + if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> + continue;
> +
> + WARN_ON(!is_device_page(spage));
> + spage = is_device_private_page(spage) ? spage->zone_device_data :
> + spage;
> + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> + if (!dpage)
> + continue;
> + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
> + page_to_pfn(spage), page_to_pfn(dpage));
> +
> + lock_page(dpage);
> + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> + copy_highpage(dpage, spage);
> + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> + if (*src & MIGRATE_PFN_WRITE)
> + *dst |= MIGRATE_PFN_WRITE;
> + }
> + return 0;
> +}
> +
> +static int dmirror_migrate_to_system(struct dmirror *dmirror,
> + struct hmm_dmirror_cmd *cmd)
> +{
> + unsigned long start, end, addr;
> + unsigned long size = cmd->npages << PAGE_SHIFT;
> + struct mm_struct *mm = dmirror->notifier.mm;
> + struct vm_area_struct *vma;
> + unsigned long src_pfns[64];
> + unsigned long dst_pfns[64];
> + struct migrate_vma args;
> + unsigned long next;
> + int ret;
> +
> + start = cmd->addr;
> + end = start + size;
> + if (end < start)
> + return -EINVAL;
> +
> + /* Since the mm is for the mirrored process, get a reference first. */
> + if (!mmget_not_zero(mm))
> + return -EINVAL;
> +
> + mmap_read_lock(mm);
> + for (addr = start; addr < end; addr = next) {
> + vma = find_vma(mm, addr);
> + if (!vma || addr < vma->vm_start ||
> + !(vma->vm_flags & VM_READ)) {
> + ret = -EINVAL;
> + goto out;
> + }
> + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
> + if (next > vma->vm_end)
> + next = vma->vm_end;
> +
> + args.vma = vma;
> + args.src = src_pfns;
> + args.dst = dst_pfns;
> + args.start = addr;
> + args.end = next;
> + args.pgmap_owner = dmirror->mdevice;
> + args.flags = (dmirror->mdevice->zone_device_type ==
> + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
> + MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
> + MIGRATE_VMA_SELECT_DEVICE_COHERENT;
> +
> + ret = migrate_vma_setup(&args);
> + if (ret)
> + goto out;
> +
> + pr_debug("Migrating from device mem to sys mem\n");
> + dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
> +
> + migrate_vma_pages(&args);
> + migrate_vma_finalize(&args);
> + }
> + mmap_read_unlock(mm);
> + mmput(mm);
> +
> + return ret;
> +
> +out:
> + mmap_read_unlock(mm);
> + mmput(mm);
> + return ret;
> +}
> +
> +static int dmirror_migrate_to_device(struct dmirror *dmirror,
> + struct hmm_dmirror_cmd *cmd)
> {
> unsigned long start, end, addr;
> unsigned long size = cmd->npages << PAGE_SHIFT;
> @@ -849,6 +965,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
> if (ret)
> goto out;
>
> + pr_debug("Migrating from sys mem to device mem\n");
> dmirror_migrate_alloc_and_copy(&args, dmirror);
> migrate_vma_pages(&args);
> dmirror_migrate_finalize_and_map(&args, dmirror);
> @@ -857,7 +974,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
> mmap_read_unlock(mm);
> mmput(mm);
>
> - /* Return the migrated data for verification. */
> + /* Return the migrated data for verification. only for pages in device zone */
> ret = dmirror_bounce_init(&bounce, start, size);
> if (ret)
> return ret;
> @@ -894,9 +1011,15 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
> }
>
> page = hmm_pfn_to_page(entry);
> - if (is_device_private_page(page)) {
> - /* Is the page migrated to this device or some other? */
> - if (dmirror->mdevice == dmirror_page_to_device(page))
> + if (is_device_page(page)) {
> + /* Is page ZONE_DEVICE coherent? */
> + if (!is_device_private_page(page))
> + *perm = HMM_DMIRROR_PROT_DEV_COHERENT;

Does it make sense to distinguish between DEV_COHERENT_LOCAL and
DEV_COHERENT_REMOTE as well?


> + /*
> + * Is page ZONE_DEVICE private migrated to
> + * this device or some other?
> + */
> + else if (dmirror->mdevice == dmirror_page_to_device(page))
> *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
> else
> *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
> @@ -1096,8 +1219,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
> ret = dmirror_write(dmirror, &cmd);
> break;
>
> - case HMM_DMIRROR_MIGRATE:
> - ret = dmirror_migrate(dmirror, &cmd);
> + case HMM_DMIRROR_MIGRATE_TO_DEV:
> + ret = dmirror_migrate_to_device(dmirror, &cmd);
> + break;
> +
> + case HMM_DMIRROR_MIGRATE_TO_SYS:
> + ret = dmirror_migrate_to_system(dmirror, &cmd);
> break;
>
> case HMM_DMIRROR_EXCLUSIVE:
> @@ -1152,38 +1279,6 @@ static void dmirror_devmem_free(struct page *page)
> spin_unlock(&mdevice->lock);
> }
>
> -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> - struct dmirror *dmirror)
> -{
> - const unsigned long *src = args->src;
> - unsigned long *dst = args->dst;
> - unsigned long start = args->start;
> - unsigned long end = args->end;
> - unsigned long addr;
> -
> - for (addr = start; addr < end; addr += PAGE_SIZE,
> - src++, dst++) {
> - struct page *dpage, *spage;
> -
> - spage = migrate_pfn_to_page(*src);
> - if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> - continue;
> - spage = spage->zone_device_data;
> -
> - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> - if (!dpage)
> - continue;
> -
> - lock_page(dpage);
> - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> - copy_highpage(dpage, spage);
> - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> - if (*src & MIGRATE_PFN_WRITE)
> - *dst |= MIGRATE_PFN_WRITE;
> - }
> - return 0;
> -}
> -
> static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
> {
> struct migrate_vma args;
> diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
> index 77f81e6314eb..af15c35a90f4 100644
> --- a/lib/test_hmm_uapi.h
> +++ b/lib/test_hmm_uapi.h
> @@ -19,6 +19,7 @@
> * @npages: (in) number of pages to read/write
> * @cpages: (out) number of pages copied
> * @faults: (out) number of device page faults seen
> + * @zone_device_type: (out) zone device memory type
> */
> struct hmm_dmirror_cmd {
> __u64 addr;
> @@ -32,11 +33,12 @@ struct hmm_dmirror_cmd {
> /* Expose the address space of the calling process through hmm device file */
> #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd)
> #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd)
>
> /*
> * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
> @@ -51,6 +53,8 @@ struct hmm_dmirror_cmd {
> * device the ioctl() is made
> * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
> * other device
> + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device
> + * the ioctl() is made
> */
> enum {
> HMM_DMIRROR_PROT_ERROR = 0xFF,
> @@ -62,6 +66,7 @@ enum {
> HMM_DMIRROR_PROT_ZERO = 0x10,
> HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
> HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
> + HMM_DMIRROR_PROT_DEV_COHERENT = 0x40,

Does it make sense to distinguish between DEV_COHERENT_LOCAL and
DEV_COHERENT_REMOTE as well?

Regards,
  Felix


> };
>
> enum {