This patch series implements coherent device memory (CDM) using
ZONE_DEVICE and adds new helpers to the HMM framework to support this
new kind of ZONE_DEVICE memory. This is on top of HMM v19 and you can
find a branch here:
https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-cdm
It needs more special casing as it behaves differently from regular
ZONE_DEVICE (persistent memory). Unlike the unaddressable memory type
added with the HMM patchset, the CDM type can be accessed by the CPU.
Because of this, any page can be migrated to CDM memory, whether
private anonymous or shared memory (file backed or not).
It is still missing some features, such as allowing a device fault to
directly allocate device memory (the intention is to add new fields to
the vm_fault struct for this).
This is mostly untested, but I am posting it now because I believe we
want to start a discussion on design considerations. This differs from
the NUMA approach by adding yet another type to ZONE_DEVICE with more
special casing. While it is a rather small patchset, I might have
missed some code paths that require more special casing (I and others
need to audit mm to make sure that every time mm is confronted with
such a page it behaves as we want).
So I believe the question is: do we want to keep adding new types to
ZONE_DEVICE and special casing each of them, or is a NUMA-like
approach better?
My personal belief is that the hierarchy of memory is getting deeper
(DDR, HBM stacked memory, persistent memory, device memory, ...) and
it may make sense to try to mirror this complexity within mm concepts.
Generalizing the NUMA abstraction is probably the best starting point
for this. I know there are strong feelings against changing NUMA, so I
believe now is the time to pick a direction.
Note that I don't think choosing one means we will be stuck with it;
as long as we don't expose anything new (syscalls) to userspace and
hide things behind driver APIs, we keep our options open to change
direction later on.
Nonetheless, we need to make progress on this, as the hardware is
right around the corner and it would be a shame if we could not
leverage it with Linux.
Jérôme Glisse (3):
mm/cache-coherent-device-memory: new type of ZONE_DEVICE
mm/hmm: add new helper to hotplug CDM memory region
mm/migrate: memory migration using a device DMA engine
include/linux/hmm.h | 10 +-
include/linux/ioport.h | 1 +
include/linux/memory_hotplug.h | 8 +
include/linux/memremap.h | 26 +++
include/linux/migrate.h | 40 ++---
mm/Kconfig | 9 +
mm/gup.c | 1 +
mm/hmm.c | 78 +++++++--
mm/memcontrol.c | 25 ++-
mm/memory.c | 18 ++
mm/migrate.c | 376 ++++++++++++++++++++++-------------------
11 files changed, 380 insertions(+), 212 deletions(-)
--
2.7.4
Platforms with an advanced system bus (like CAPI or CCIX) allow device
memory to be accessible from the CPU in a cache coherent fashion. Add
a new type of ZONE_DEVICE to represent such memory. The use cases are
the same as for the unaddressable device memory, but without all the
corner cases.
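
To illustrate the intent, here is a small hypothetical sketch (not part
of the patch) of how a code path could use the helper added below;
can_pin_page() is a made-up name for this example:

#include <linux/memremap.h>

static bool can_pin_page(struct page *page)
{
	/*
	 * Cache coherent device memory must stay evictable, so refuse
	 * to pin it (see the MEMORY_DEVICE_CACHE_COHERENT documentation
	 * added by this patch).
	 */
	if (is_device_cache_coherent_page(page))
		return false;
	return true;
}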
Signed-off-by: Jérôme Glisse <[email protected]>
---
include/linux/ioport.h | 1 +
include/linux/memory_hotplug.h | 8 ++++++++
include/linux/memremap.h | 26 ++++++++++++++++++++++++++
mm/Kconfig | 9 +++++++++
mm/gup.c | 1 +
mm/memcontrol.c | 25 +++++++++++++++++++++++--
mm/memory.c | 18 ++++++++++++++++++
mm/migrate.c | 12 +++++++++++-
8 files changed, 97 insertions(+), 3 deletions(-)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index ec619dc..55cba87 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -131,6 +131,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY = 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_MEMORY_UNADDRESSABLE = 6,
+ IORES_DESC_DEVICE_MEMORY_CACHE_COHERENT = 7,
};
/* helpers to define resources */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e60f203..7c587ce 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -36,11 +36,19 @@ struct resource;
* page must be treated as an opaque object, rather than a "normal" struct page.
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_CACHE_COHERENT:
+ * Device memory that is cache coherent from both device and CPU point of view.
+ * This is used on platforms that have an advanced system bus (like CAPI or
+ * CCIX). A driver can hotplug the device memory using ZONE_DEVICE with this
+ * memory type. Any page of a process can be migrated to such memory. However,
+ * no one should be allowed to pin such memory so that it can always be evicted.
*/
enum memory_type {
MEMORY_NORMAL = 0,
MEMORY_DEVICE_PERSISTENT,
MEMORY_DEVICE_UNADDRESSABLE,
+ MEMORY_DEVICE_CACHE_COHERENT,
};
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 3a9494e..6029ddf 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,6 +41,8 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
* page_fault()
* page_free()
*
+ * For MEMORY_DEVICE_CACHE_COHERENT only the page_free() callback matters.
+ *
* Additional notes about MEMORY_DEVICE_UNADDRESSABLE may be found in
* include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
* explanation in include/linux/memory_hotplug.h.
@@ -99,12 +101,26 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap);
struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+static inline bool is_device_persistent_page(const struct page *page)
+{
+ /* See MEMORY_DEVICE_UNADDRESSABLE in include/linux/memory_hotplug.h */
+ return ((page_zonenum(page) == ZONE_DEVICE) &&
+ (page->pgmap->type == MEMORY_DEVICE_PERSISTENT));
+}
+
static inline bool is_device_unaddressable_page(const struct page *page)
{
/* See MEMORY_DEVICE_UNADDRESSABLE in include/linux/memory_hotplug.h */
return ((page_zonenum(page) == ZONE_DEVICE) &&
(page->pgmap->type == MEMORY_DEVICE_UNADDRESSABLE));
}
+
+static inline bool is_device_cache_coherent_page(const struct page *page)
+{
+ /* See MEMORY_DEVICE_CACHE_COHERENT in include/linux/memory_hotplug.h */
+ return ((page_zonenum(page) == ZONE_DEVICE) &&
+ (page->pgmap->type == MEMORY_DEVICE_CACHE_COHERENT));
+}
#else
static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
@@ -124,10 +140,20 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
return NULL;
}
+static inline bool is_device_persistent_page(const struct page *page)
+{
+ return false;
+}
+
static inline bool is_device_unaddressable_page(const struct page *page)
{
return false;
}
+
+static inline bool is_device_cache_coherent_page(const struct page *page)
+{
+ return false;
+}
#endif
/**
diff --git a/mm/Kconfig b/mm/Kconfig
index 96dcf61..5c7b0ec 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -744,6 +744,15 @@ config DEVICE_UNADDRESSABLE
i.e., memory that is only accessible from the device (or group of
devices).
+config DEVICE_CACHE_COHERENT
+ bool "Cache coherent device memory (GPU memory, ...)"
+ depends on ZONE_DEVICE
+
+ help
+ Allow creating struct pages for cache coherent device memory,
+ which is only possible with an advanced system bus like CAPI
+ or CCIX.
+
config FRAME_VECTOR
bool
diff --git a/mm/gup.c b/mm/gup.c
index 4039ec2..4d54220 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -121,6 +121,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
page = pte_page(pte);
else
goto no_page;
+ pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 712a687..fd188cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4388,6 +4388,7 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+ MC_TARGET_DEVICE,
};
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
@@ -4395,8 +4396,22 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
{
struct page *page = vm_normal_page(vma, addr, ptent);
- if (!page || !page_mapped(page))
+ if (!page || !page_mapped(page)) {
+ if (pte_devmap(pte)) {
+ struct dev_pagemap *pgmap = NULL;
+
+ page = pte_page(ptent);
+ if (!is_device_cache_coherent_page(page))
+ return NULL;
+
+ pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+ if (pgmap) {
+ get_page(page);
+ return page;
+ }
+ }
return NULL;
+ }
if (PageAnon(page)) {
if (!(mc.flags & MOVE_ANON))
return NULL;
@@ -4611,6 +4626,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+ if (is_device_cache_coherent_page(page))
+ ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
@@ -4896,12 +4913,16 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
+ bool device = false;
swp_entry_t ent;
if (!mc.precharge)
break;
switch (get_mctgt_type(vma, addr, ptent, &target)) {
+ case MC_TARGET_DEVICE:
+ device = true;
+ /* fall through */
case MC_TARGET_PAGE:
page = target.page;
/*
@@ -4912,7 +4933,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
*/
if (PageTransCompound(page))
goto put;
- if (isolate_lru_page(page))
+ if (!device && isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
diff --git a/mm/memory.c b/mm/memory.c
index d68c653..bf41258 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -979,6 +979,24 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
+ } else if (pte_devmap(pte)) {
+ struct dev_pagemap *pgmap = NULL;
+
+ page = pte_page(pte);
+
+ /*
+ * Cache coherent device memory behaves like a regular page and
+ * not like a persistent memory page. For more information see
+ * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h.
+ */
+ if (is_device_cache_coherent_page(page)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+ if (pgmap) {
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
+ }
+ }
}
out_set_pte:
diff --git a/mm/migrate.c b/mm/migrate.c
index cbaa4f2..2497357 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -241,6 +241,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_mksoft_dirty(pte);
+ } else if (is_device_cache_coherent_page(new)) {
+ pte = pte_mkdevmap(pte);
+ flush_dcache_page(new);
} else
flush_dcache_page(new);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
@@ -2300,7 +2303,8 @@ static bool migrate_vma_check_page(struct page *page)
/* Page from ZONE_DEVICE have one extra reference */
if (is_zone_device_page(page)) {
- if (is_device_unaddressable_page(page)) {
+ if (is_device_unaddressable_page(page) ||
+ is_device_cache_coherent_page(page)) {
extra++;
} else
/* Other ZONE_DEVICE memory type are not supported */
@@ -2617,6 +2621,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
+ } else if (is_device_cache_coherent_page(newpage)) {
+ /*
+ * Anything can be migrated to a device cache
+ * coherent page.
+ */
+ continue;
} else {
/*
* Other types of ZONE_DEVICE page are not
--
2.7.4
Unlike unaddressable memory, coherent device memory has a real
resource associated with it on the system (as the CPU can address
it). Add a new helper to hotplug such memory within the HMM
framework.
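
To show the intended usage, here is a rough driver-side sketch
(illustrative only; my_devmem_ops and the resource describing the
device memory are placeholders, not part of this patch):

#include <linux/hmm.h>

static int my_driver_register_cdm(struct device *dev,
				  const struct hmm_devmem_ops *my_devmem_ops,
				  struct resource *res)
{
	struct hmm_devmem *devmem;

	/* Hotplug the CPU addressable device memory as ZONE_DEVICE CDM. */
	devmem = hmm_devmem_add_resource(my_devmem_ops, dev, res);
	if (IS_ERR(devmem))
		return PTR_ERR(devmem);

	/* Pages in [devmem->pfn_first, devmem->pfn_last) are now usable. */
	return 0;
}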
Signed-off-by: Jérôme Glisse <[email protected]>
---
include/linux/hmm.h | 3 +++
mm/hmm.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 374e5fd..e4fda18 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -392,6 +392,9 @@ struct hmm_devmem {
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
struct device *device,
unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res);
void hmm_devmem_remove(struct hmm_devmem *devmem);
int hmm_devmem_fault_range(struct hmm_devmem *devmem,
diff --git a/mm/hmm.c b/mm/hmm.c
index ff8ec59..28c7fcb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1038,6 +1038,63 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
}
EXPORT_SYMBOL(hmm_devmem_add);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res)
+{
+ struct hmm_devmem *devmem;
+ int ret;
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = res;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+
+ devmem->resource->desc = IORES_DESC_DEVICE_MEMORY_CACHE_COHERENT;
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_devm_add_action;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add_resource);
+
/*
* hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
*
@@ -1051,6 +1108,7 @@ void hmm_devmem_remove(struct hmm_devmem *devmem)
{
resource_size_t start, size;
struct device *device;
+ bool cdm = false;
if (!devmem)
return;
@@ -1059,11 +1117,13 @@ void hmm_devmem_remove(struct hmm_devmem *devmem)
start = devmem->resource->start;
size = resource_size(devmem->resource);
+ cdm = devmem->resource->desc == IORES_DESC_DEVICE_MEMORY_CACHE_COHERENT;
hmm_devmem_ref_kill(&devmem->ref);
hmm_devmem_ref_exit(&devmem->ref);
hmm_devmem_pages_remove(devmem);
- devm_release_mem_region(device, start, size);
+ if (!cdm)
+ devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);
--
2.7.4
This reuses most of the migrate_vma() infrastructure and generalizes
it so that any array of pages can be moved using a device DMA engine.
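
To show how a driver is expected to use the new entry point, here is a
rough sketch (illustrative only; the callback bodies and the way src[]
gets filled are placeholders):

static void my_alloc_and_copy(struct migrate_dma_ctx *ctx)
{
	/* Allocate destination pages, fill ctx->dst[], start device DMA. */
}

static void my_finalize_and_map(struct migrate_dma_ctx *ctx)
{
	/* Wait for DMA completion and update device page tables. */
}

static const struct migrate_dma_ops my_migrate_ops = {
	.alloc_and_copy		= my_alloc_and_copy,
	.finalize_and_map	= my_finalize_and_map,
};

/*
 * src[] must already hold migrate pfn entries for valid, locked pages
 * (MIGRATE_PFN_VALID | MIGRATE_PFN_LOCKED); dst[] must be the same size.
 */
static int my_migrate_pages(unsigned long *src, unsigned long *dst,
			    unsigned long npages)
{
	struct migrate_dma_ctx ctx = {
		.ops	= &my_migrate_ops,
		.src	= src,
		.dst	= dst,
		.npages	= npages,
	};

	return migrate_dma(&ctx);
}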
Signed-off-by: Jérôme Glisse <[email protected]>
---
include/linux/hmm.h | 7 +-
include/linux/migrate.h | 40 +++---
mm/hmm.c | 16 +--
mm/migrate.c | 364 +++++++++++++++++++++++++-----------------------
4 files changed, 219 insertions(+), 208 deletions(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index e4fda18..eff17d3 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -398,14 +398,11 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
void hmm_devmem_remove(struct hmm_devmem *devmem);
int hmm_devmem_fault_range(struct hmm_devmem *devmem,
+ struct migrate_dma_ctx *migrate_ctx,
struct vm_area_struct *vma,
- const struct migrate_vma_ops *ops,
- unsigned long *src,
- unsigned long *dst,
unsigned long start,
unsigned long addr,
- unsigned long end,
- void *private);
+ unsigned long end);
/*
* hmm_devmem_page_set_drvdata - set per-page driver data field
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7dd875a..fa7f53a 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -141,7 +141,8 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
#define MIGRATE_PFN_WRITE (1UL << 3)
#define MIGRATE_PFN_DEVICE (1UL << 4)
#define MIGRATE_PFN_ERROR (1UL << 5)
-#define MIGRATE_PFN_SHIFT 6
+#define MIGRATE_PFN_LRU (1UL << 6)
+#define MIGRATE_PFN_SHIFT 7
static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
{
@@ -155,8 +156,10 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
}
+struct migrate_dma_ctx;
+
/*
- * struct migrate_vma_ops - migrate operation callback
+ * struct migrate_dma_ops - migrate operation callback
*
* @alloc_and_copy: alloc destination memory and copy source memory to it
* @finalize_and_map: allow caller to map the successfully migrated pages
@@ -212,28 +215,25 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
* THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
* ENTRIES OR BAD THINGS WILL HAPPEN !
*/
-struct migrate_vma_ops {
- void (*alloc_and_copy)(struct vm_area_struct *vma,
- const unsigned long *src,
- unsigned long *dst,
- unsigned long start,
- unsigned long end,
- void *private);
- void (*finalize_and_map)(struct vm_area_struct *vma,
- const unsigned long *src,
- const unsigned long *dst,
- unsigned long start,
- unsigned long end,
- void *private);
+struct migrate_dma_ops {
+ void (*alloc_and_copy)(struct migrate_dma_ctx *ctx);
+ void (*finalize_and_map)(struct migrate_dma_ctx *ctx);
+};
+
+struct migrate_dma_ctx {
+ const struct migrate_dma_ops *ops;
+ unsigned long *dst;
+ unsigned long *src;
+ unsigned long cpages;
+ unsigned long npages;
};
-int migrate_vma(const struct migrate_vma_ops *ops,
+int migrate_vma(struct migrate_dma_ctx *ctx,
struct vm_area_struct *vma,
unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private);
+ unsigned long end);
+int migrate_dma(struct migrate_dma_ctx *migrate_ctx);
+
#endif /* CONFIG_MIGRATION */
diff --git a/mm/hmm.c b/mm/hmm.c
index 28c7fcb..c14aca5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1131,14 +1131,11 @@ EXPORT_SYMBOL(hmm_devmem_remove);
* hmm_devmem_fault_range() - migrate back a virtual range of memory
*
* @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
+ * @migrate_ctx: migrate context structure
* @vma: virtual memory area containing the range to be migrated
- * @ops: migration callback for allocating destination memory and copying
- * @src: array of unsigned long containing source pfns
- * @dst: array of unsigned long containing destination pfns
* @start: start address of the range to migrate (inclusive)
* @addr: fault address (must be inside the range)
* @end: end address of the range to migrate (exclusive)
- * @private: pointer passed back to each of the callback
* Returns: 0 on success, VM_FAULT_SIGBUS on error
*
* This is a wrapper around migrate_vma() which checks the migration status
@@ -1149,16 +1146,15 @@ EXPORT_SYMBOL(hmm_devmem_remove);
* This is a helper intendend to be used by the ZONE_DEVICE fault handler.
*/
int hmm_devmem_fault_range(struct hmm_devmem *devmem,
+ struct migrate_dma_ctx *migrate_ctx,
struct vm_area_struct *vma,
- const struct migrate_vma_ops *ops,
- unsigned long *src,
- unsigned long *dst,
unsigned long start,
unsigned long addr,
- unsigned long end,
- void *private)
+ unsigned long end)
{
- if (migrate_vma(ops, vma, start, end, src, dst, private))
+ unsigned long *dst = migrate_ctx->dst;
+
+ if (migrate_vma(migrate_ctx, vma, start, end))
return VM_FAULT_SIGBUS;
if (dst[(addr - start) >> PAGE_SHIFT] & MIGRATE_PFN_ERROR)
diff --git a/mm/migrate.c b/mm/migrate.c
index 2497357..5f252d6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2100,27 +2100,17 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
#endif /* CONFIG_NUMA */
-struct migrate_vma {
- struct vm_area_struct *vma;
- unsigned long *dst;
- unsigned long *src;
- unsigned long cpages;
- unsigned long npages;
- unsigned long start;
- unsigned long end;
-};
-
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
struct mm_walk *walk)
{
- struct migrate_vma *migrate = walk->private;
+ struct migrate_dma_ctx *migrate_ctx = walk->private;
unsigned long addr;
for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
- migrate->cpages++;
- migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = 0;
+ migrate_ctx->cpages++;
+ migrate_ctx->dst[migrate_ctx->npages] = 0;
+ migrate_ctx->src[migrate_ctx->npages++] = 0;
}
return 0;
@@ -2131,7 +2121,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
unsigned long end,
struct mm_walk *walk)
{
- struct migrate_vma *migrate = walk->private;
+ struct migrate_dma_ctx *migrate_ctx = walk->private;
struct mm_struct *mm = walk->vma->vm_mm;
unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
@@ -2155,7 +2145,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pfn = pte_pfn(pte);
if (pte_none(pte)) {
- migrate->cpages++;
+ migrate_ctx->cpages++;
mpfn = pfn = 0;
goto next;
}
@@ -2178,7 +2168,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_write_device_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
- page = vm_normal_page(migrate->vma, addr, pte);
+ page = vm_normal_page(walk->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -2200,7 +2190,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
* can't be dropped from it).
*/
get_page(page);
- migrate->cpages++;
+ migrate_ctx->cpages++;
/*
* Optimize for the common case where page is only mapped once
@@ -2231,8 +2221,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
}
next:
- migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = mpfn;
+ migrate_ctx->dst[migrate_ctx->npages] = 0;
+ migrate_ctx->src[migrate_ctx->npages++] = mpfn;
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(ptep - 1, ptl);
@@ -2252,7 +2242,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
* valid page, it updates the src array and takes a reference on the page, in
* order to pin the page until we lock it and unmap it.
*/
-static void migrate_vma_collect(struct migrate_vma *migrate)
+static void migrate_vma_collect(struct migrate_dma_ctx *migrate_ctx,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
{
struct mm_walk mm_walk;
@@ -2261,30 +2254,24 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
mm_walk.pte_hole = migrate_vma_collect_hole;
mm_walk.hugetlb_entry = NULL;
mm_walk.test_walk = NULL;
- mm_walk.vma = migrate->vma;
- mm_walk.mm = migrate->vma->vm_mm;
- mm_walk.private = migrate;
-
- mmu_notifier_invalidate_range_start(mm_walk.mm,
- migrate->start,
- migrate->end);
- walk_page_range(migrate->start, migrate->end, &mm_walk);
- mmu_notifier_invalidate_range_end(mm_walk.mm,
- migrate->start,
- migrate->end);
-
- migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.private = migrate_ctx;
+
+ mmu_notifier_invalidate_range_start(mm_walk.mm, start, end);
+ walk_page_range(start, end, &mm_walk);
+ mmu_notifier_invalidate_range_end(mm_walk.mm, start, end);
}
/*
- * migrate_vma_check_page() - check if page is pinned or not
+ * migrate_dma_check_page() - check if page is pinned or not
* @page: struct page to check
*
* Pinned pages cannot be migrated. This is the same test as in
* migrate_page_move_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_dma_check_page(struct page *page)
{
/*
* One extra ref because caller holds an extra reference, either from
@@ -2318,34 +2305,31 @@ static bool migrate_vma_check_page(struct page *page)
}
/*
- * migrate_vma_prepare() - lock pages and isolate them from the lru
- * @migrate: migrate struct containing all migration information
+ * migrate_dma_prepare() - lock pages and isolate them from the lru
+ * @migrate_ctx: migrate struct containing all migration information
*
* This locks pages that have been collected by migrate_vma_collect(). Once each
* page is locked it is isolated from the lru (for non-device pages). Finally,
* the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
* migrated by concurrent kernel threads.
*/
-static void migrate_vma_prepare(struct migrate_vma *migrate)
+static unsigned long migrate_dma_prepare(struct migrate_dma_ctx *migrate_ctx)
{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- unsigned long addr, i, restore = 0;
+ const unsigned long npages = migrate_ctx->npages;
+ unsigned long i, restore = 0;
bool allow_drain = true;
lru_add_drain();
for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
- bool remap = true;
+ struct page *page = migrate_pfn_to_page(migrate_ctx->src[i]);
if (!page)
continue;
- if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
- remap = false;
+ if (!(migrate_ctx->src[i] & MIGRATE_PFN_LOCKED)) {
lock_page(page);
- migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ migrate_ctx->src[i] |= MIGRATE_PFN_LOCKED;
}
/* ZONE_DEVICE pages are not on LRU */
@@ -2357,64 +2341,34 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
}
if (isolate_lru_page(page)) {
- if (remap) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
- } else {
- migrate->src[i] = 0;
- unlock_page(page);
- migrate->cpages--;
- put_page(page);
- }
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->cpages--;
+ restore++;
continue;
}
/* Drop the reference we took in collect */
+ migrate_ctx->src[i] |= MIGRATE_PFN_LRU;
put_page(page);
}
- if (!migrate_vma_check_page(page)) {
- if (remap) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
-
- if (!is_zone_device_page(page)) {
- get_page(page);
- putback_lru_page(page);
- }
- } else {
- migrate->src[i] = 0;
- unlock_page(page);
- migrate->cpages--;
-
- if (!is_zone_device_page(page))
- putback_lru_page(page);
- else
- put_page(page);
- }
+ /*
+ * This is not the final check, it is an early check to avoid
+ * unnecessary work if the page is pinned.
+ */
+ if (!migrate_dma_check_page(page)) {
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->cpages--;
+ restore++;
}
}
- for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- remove_migration_pte(page, migrate->vma, addr, page);
-
- migrate->src[i] = 0;
- unlock_page(page);
- put_page(page);
- restore--;
- }
+ return restore;
}
/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
+ * migrate_dma_unmap() - replace page mapping with special migration pte entry
+ * @migrate_ctx: migrate struct containing migration context information
*
* Replace page mapping (CPU page table pte) with a special migration pte entry
* and check again if it has been pinned. Pinned pages are restored because we
@@ -2423,17 +2377,16 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
* This is the last step before we call the device driver callback to allocate
* destination memory and copy contents of original page over to new page.
*/
-static void migrate_vma_unmap(struct migrate_vma *migrate)
+static unsigned long migrate_dma_unmap(struct migrate_dma_ctx *migrate_ctx)
{
int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- unsigned long addr, i, restore = 0;
+ const unsigned long npages = migrate_ctx->npages;
+ unsigned long i, restore = 0;
for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(migrate_ctx->src[i]);
- if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ if (!page || !(migrate_ctx->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (page_mapped(page)) {
@@ -2442,41 +2395,24 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
goto restore;
}
- if (migrate_vma_check_page(page))
+ if (migrate_dma_check_page(page))
continue;
restore:
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->cpages--;
restore++;
}
- for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- remove_migration_ptes(page, page, false);
-
- migrate->src[i] = 0;
- unlock_page(page);
- restore--;
-
- if (is_zone_device_page(page))
- put_page(page);
- else
- putback_lru_page(page);
- }
+ return restore;
}
-static void migrate_vma_insert_page(struct migrate_vma *migrate,
+static void migrate_vma_insert_page(struct vm_area_struct *vma,
unsigned long addr,
struct page *page,
unsigned long *src,
unsigned long *dst)
{
- struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
struct mem_cgroup *memcg;
spinlock_t *ptl;
@@ -2579,33 +2515,35 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}
/*
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
+ * migrate_dma_pages() - migrate meta-data from src page to dst page
+ * @migrate_ctx: migrate struct containing migration context information
*
* This migrates struct page meta-data from source struct page to destination
* struct page. This effectively finishes the migration from source page to the
* destination page.
*/
-static void migrate_vma_pages(struct migrate_vma *migrate)
+static void migrate_dma_pages(struct migrate_dma_ctx *migrate_ctx,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
+ const unsigned long npages = migrate_ctx->npages;
unsigned long addr, i;
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ for (i = 0, addr = start; i < npages; i++, addr += PAGE_SIZE) {
+ struct page *newpage = migrate_pfn_to_page(migrate_ctx->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate_ctx->src[i]);
struct address_space *mapping;
int r;
if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
- } else if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ } else if (vma && !(migrate_ctx->src[i] & MIGRATE_PFN_MIGRATE)) {
if (!page)
- migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i],
- &migrate->dst[i]);
+ migrate_vma_insert_page(vma, addr, newpage,
+ &migrate_ctx->src[i],
+ &migrate_ctx->dst[i]);
continue;
}
@@ -2618,7 +2556,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
* migrating to un-addressable device memory.
*/
if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (is_device_cache_coherent_page(newpage)) {
@@ -2632,19 +2570,19 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
* Other types of ZONE_DEVICE page are not
* supported.
*/
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
}
r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate_ctx->src[i] &= ~MIGRATE_PFN_MIGRATE;
}
}
/*
- * migrate_vma_finalize() - restore CPU page table entry
+ * migrate_dma_finalize() - restore CPU page table entry
* @migrate: migrate struct containing all migration information
*
* This replaces the special migration pte entry with either a mapping to the
@@ -2654,14 +2592,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
* This also unlocks the pages and puts them back on the lru, or drops the extra
* refcount, for device pages.
*/
-static void migrate_vma_finalize(struct migrate_vma *migrate)
+static void migrate_dma_finalize(struct migrate_dma_ctx *migrate_ctx)
{
- const unsigned long npages = migrate->npages;
+ const unsigned long npages = migrate_ctx->npages;
unsigned long i;
for (i = 0; i < npages; i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *newpage = migrate_pfn_to_page(migrate_ctx->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate_ctx->src[i]);
if (!page) {
if (newpage) {
@@ -2671,7 +2609,7 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
continue;
}
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (!(migrate_ctx->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (newpage) {
unlock_page(newpage);
put_page(newpage);
@@ -2681,7 +2619,6 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
remove_migration_ptes(page, newpage, false);
unlock_page(page);
- migrate->cpages--;
if (is_zone_device_page(page))
put_page(page);
@@ -2698,16 +2635,42 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
+static void migrate_vma_restore(struct migrate_dma_ctx *migrate_ctx,
+ struct vm_area_struct *vma,
+ unsigned long restore,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long addr = start, i = 0;
+
+ for (; i < migrate_ctx->npages && restore; addr += PAGE_SIZE, i++) {
+ bool lru = migrate_ctx->src[i] & MIGRATE_PFN_LRU;
+ struct page *page;
+
+ page = migrate_pfn_to_page(migrate_ctx->src[i]);
+ if (!page || (migrate_ctx->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_ptes(page, page, false);
+
+ migrate_ctx->src[i] = 0;
+ unlock_page(page);
+ restore--;
+
+ if (!lru)
+ put_page(page);
+ else
+ putback_lru_page(page);
+ }
+}
+
/*
* migrate_vma() - migrate a range of memory inside vma
*
- * @ops: migration callback for allocating destination memory and copying
+ * @migrate_ctx: migrate context structure
* @vma: virtual memory area containing the range to be migrated
* @start: start address of the range to migrate (inclusive)
* @end: end address of the range to migrate (exclusive)
- * @src: array of hmm_pfn_t containing source pfns
- * @dst: array of hmm_pfn_t containing destination pfns
- * @private: pointer passed back to each of the callback
* Returns: 0 on success, error code otherwise
*
* This function tries to migrate a range of memory virtual address range, using
@@ -2749,50 +2712,45 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
* Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
* unsigned long entries.
*/
-int migrate_vma(const struct migrate_vma_ops *ops,
+int migrate_vma(struct migrate_dma_ctx *migrate_ctx,
struct vm_area_struct *vma,
unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private)
+ unsigned long end)
{
- struct migrate_vma migrate;
+ unsigned long npages, restore;
/* Sanity check the arguments */
start &= PAGE_MASK;
end &= PAGE_MASK;
if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
return -EINVAL;
- if (!vma || !ops || !src || !dst || start >= end)
+ if (!vma || !migrate_ctx || !migrate_ctx->src || !migrate_ctx->dst)
return -EINVAL;
- if (start < vma->vm_start || start >= vma->vm_end)
+ if (start >= end || start < vma->vm_start || start >= vma->vm_end)
return -EINVAL;
if (end <= vma->vm_start || end > vma->vm_end)
return -EINVAL;
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
- migrate.src = src;
- migrate.dst = dst;
- migrate.start = start;
- migrate.npages = 0;
- migrate.cpages = 0;
- migrate.end = end;
- migrate.vma = vma;
+ migrate_ctx->npages = 0;
+ migrate_ctx->cpages = 0;
+ npages = (end - start) >> PAGE_SHIFT;
+ memset(migrate_ctx->src, 0, sizeof(*migrate_ctx->src) * npages);
/* Collect, and try to unmap source pages */
- migrate_vma_collect(&migrate);
- if (!migrate.cpages)
+ migrate_vma_collect(migrate_ctx, vma, start, end);
+ if (!migrate_ctx->cpages)
return 0;
/* Lock and isolate page */
- migrate_vma_prepare(&migrate);
- if (!migrate.cpages)
+ restore = migrate_dma_prepare(migrate_ctx);
+ migrate_vma_restore(migrate_ctx, vma, restore, start, end);
+ if (!migrate_ctx->cpages)
return 0;
/* Unmap pages */
- migrate_vma_unmap(&migrate);
- if (!migrate.cpages)
+ restore = migrate_dma_unmap(migrate_ctx);
+ migrate_vma_restore(migrate_ctx, vma, restore, start, end);
+ if (!migrate_ctx->cpages)
return 0;
/*
@@ -2803,16 +2761,76 @@ int migrate_vma(const struct migrate_vma_ops *ops,
* Note that migration can fail in migrate_vma_struct_page() for each
* individual page.
*/
- ops->alloc_and_copy(vma, src, dst, start, end, private);
+ migrate_ctx->ops->alloc_and_copy(migrate_ctx);
/* This does the real migration of struct page */
- migrate_vma_pages(&migrate);
+ migrate_dma_pages(migrate_ctx, vma, start, end);
- ops->finalize_and_map(vma, src, dst, start, end, private);
+ migrate_ctx->ops->finalize_and_map(migrate_ctx);
/* Unlock and remap pages */
- migrate_vma_finalize(&migrate);
+ migrate_dma_finalize(migrate_ctx);
return 0;
}
EXPORT_SYMBOL(migrate_vma);
+
+/*
+ * migrate_dma() - migrate an array of pages using a device DMA engine
+ *
+ * @migrate_ctx: migrate context structure
+ *
+ * The context structure must have its src field pointing to an array of
+ * migrate pfn entries, each corresponding to a valid and locked page. The
+ * dst field must be an array as big as src; it is used during migration
+ * to store the destination pfns.
+ *
+ */
+int migrate_dma(struct migrate_dma_ctx *migrate_ctx)
+{
+ unsigned long i;
+
+ /* Sanity check the arguments */
+ if (!migrate_ctx->ops || !migrate_ctx->src || !migrate_ctx->dst)
+ return -EINVAL;
+
+ /* Below code should be hidden behind some DEBUG config */
+ for (i = 0; i < migrate_ctx->npages; ++i) {
+ const unsigned long mask = MIGRATE_PFN_VALID |
+ MIGRATE_PFN_LOCKED;
+
+ if (!(migrate_ctx->src[i] & mask))
+ return -EINVAL;
+ }
+
+ /* Lock and isolate page */
+ migrate_dma_prepare(migrate_ctx);
+ if (!migrate_ctx->cpages)
+ return 0;
+
+ /* Unmap pages */
+ migrate_dma_unmap(migrate_ctx);
+ if (!migrate_ctx->cpages)
+ return 0;
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the callback.
+ *
+ * Note that migration can fail in migrate_vma_struct_page() for each
+ * individual page.
+ */
+ migrate_ctx->ops->alloc_and_copy(migrate_ctx);
+
+ /* This does the real migration of struct page */
+ migrate_dma_pages(migrate_ctx, NULL, 0, 0);
+
+ migrate_ctx->ops->finalize_and_map(migrate_ctx);
+
+ /* Unlock and remap pages */
+ migrate_dma_finalize(migrate_ctx);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_dma);
--
2.7.4
On Fri, 2017-04-07 at 16:28 -0400, Jérôme Glisse wrote:
> This patch series implements coherent device memory (CDM) using
> ZONE_DEVICE and adds new helpers to the HMM framework to support this
> new kind of ZONE_DEVICE memory. This is on top of HMM v19 and you can
> find a branch here:
>
> https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-cdm
>
> It needs more special casing as it behaves differently from regular
> ZONE_DEVICE (persistent memory). Unlike the unaddressable memory type
> added with the HMM patchset, the CDM type can be accessed by the CPU.
> Because of this, any page can be migrated to CDM memory, whether
> private anonymous or shared memory (file backed or not).
>
> It is still missing some features, such as allowing a device fault to
> directly allocate device memory (the intention is to add new fields to
> the vm_fault struct for this).
>
>
> This is mostly untested, but I am posting it now because I believe we
> want to start a discussion on design considerations. This differs from
> the NUMA approach by adding yet another type to ZONE_DEVICE with more
> special casing. While it is a rather small patchset, I might have
> missed some code paths that require more special casing (I and others
> need to audit mm to make sure that every time mm is confronted with
> such a page it behaves as we want).
>
> So I believe the question is: do we want to keep adding new types to
> ZONE_DEVICE and special casing each of them, or is a NUMA-like
> approach better?
>
>
> My personal belief is that the hierarchy of memory is getting deeper
> (DDR, HBM stacked memory, persistent memory, device memory, ...) and
> it may make sense to try to mirror this complexity within mm concepts.
> Generalizing the NUMA abstraction is probably the best starting point
> for this. I know there are strong feelings against changing NUMA, so I
> believe now is the time to pick a direction.
Thanks for all your hard work and effort on this.
I agree that NUMA is the best representation and that in the end we
want the mm to manage coherent memory. The device memory is very
similar to NUMA memory: it is cache coherent and can be simultaneously
accessed from both sides. Like you say, this will evolve; my current
design proposal is at
https://github.com/bsingharora/linux/commits/balbir/cdmv1
with HMM patches (v17) on top. The relevant commits are
c0750c30070e8537ca2ee3ddfce3c0bac7eaab26
dcb3ff6d7900ff644d08a3d1892b6c0ab6982021
9041c3fee859b40c1f9d3e60fd48e0f64ee69abb
b26b6e9f3b078a606a0eaada08bc187b96d966a5
I intend to rebase and repost them. The core motivation of this approach,
compared to Anshuman's approach (https://lwn.net/Articles/704403/), is
avoiding allocator changes; there are, however, mempolicy changes. Creating
N_COHERENT_MEMORY exclusive to N_MEMORY allows us to avoid changes in
the allocator paths, with the changes being controlled by mempolicy, where
an explicit node allocation works via changes to policy_zonelist() and
policy_nodemask(). This also isolates coherent memory from kswapd and other
background processes, but direct reclaim, direct compaction, etc. are
expected to work. The reason for the isolation is performance: it prevents
wrong allocations from ending up on device memory. There is no strict
requirement, though; one could easily use migration to move misplaced memory.
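
To make that concrete, here is a rough sketch of what the
policy_nodemask() side of it could look like (illustrative only, not
the actual code from the branch above; N_COHERENT_MEMORY is the
proposed new node state):

static nodemask_t *policy_nodemask_cdm(gfp_t gfp, struct mempolicy *policy,
				       nodemask_t *scratch)
{
	nodemask_t *nodes = policy_nodemask(gfp, policy);

	/* An explicit bind to a coherent device node is honored as-is. */
	if (nodes && nodes_intersects(*nodes, node_states[N_COHERENT_MEMORY]))
		return nodes;

	/* Otherwise keep default allocations off coherent device nodes. */
	nodes_andnot(*scratch, nodes ? *nodes : node_states[N_MEMORY],
		     node_states[N_COHERENT_MEMORY]);
	return scratch;
}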
From an HMM perspective, we still find HMM useful for migration,
specifically your migrate_vma() API and the newly proposed migrate_dma()
API that is part of this patchset. I think for isolation we prefer the
NUMA approach. We do find HMM useful for hardware that does not have
coherency, but for coherent devices we prefer the NUMA approach.
With HMM we'll start seeing ZONE_DEVICE pages mapped into user space,
and that will mean a thorough audit of all code paths to make sure we
are ready for such use cases and to enable them, like you've done with
patch 1. I've done a quick evaluation to check for features like
migration (page cache migration), fault handling to the right location
(direct page cache allocation in the coherent memory), mlock handling,
RSS accounting, memcg enforcement for pages not on LRU, etc.
>
> Note that I don't think choosing one means we will be stuck with it;
> as long as we don't expose anything new (syscalls) to userspace and
> hide things behind driver APIs, we keep our options open to change
> direction later on.
>
I agree, but I think user space will need to adapt. For example, using
malloc on a coherent device will not work; user space will need to
have a driver-supported way of accessing coherent memory.
> Nonetheless, we need to make progress on this, as the hardware is
> right around the corner and it would be a shame if we could not
> leverage it with Linux.
>
>
I agree 100%
Balbir Singh.
(Had sent this to you directly. Reposting for the whole cc list.)
On Fri, Apr 07, 2017 at 04:28:53PM -0400, Jérôme Glisse wrote:
>--- a/include/linux/migrate.h
>+++ b/include/linux/migrate.h
>@@ -212,28 +215,25 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
> * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
> * ENTRIES OR BAD THINGS WILL HAPPEN !
> */
>-struct migrate_vma_ops {
>- void (*alloc_and_copy)(struct vm_area_struct *vma,
>- const unsigned long *src,
>- unsigned long *dst,
>- unsigned long start,
>- unsigned long end,
>- void *private);
>- void (*finalize_and_map)(struct vm_area_struct *vma,
>- const unsigned long *src,
>- const unsigned long *dst,
>- unsigned long start,
>- unsigned long end,
>- void *private);
>+struct migrate_dma_ops {
>+ void (*alloc_and_copy)(struct migrate_dma_ctx *ctx);
>+ void (*finalize_and_map)(struct migrate_dma_ctx *ctx);
>+};
>+
>+struct migrate_dma_ctx {
>+ const struct migrate_dma_ops *ops;
>+ unsigned long *dst;
>+ unsigned long *src;
>+ unsigned long cpages;
>+ unsigned long npages;
Could you add this so we can still pass arguments to the callbacks?
void *private;
> };
>
>-int migrate_vma(const struct migrate_vma_ops *ops,
>+int migrate_vma(struct migrate_dma_ctx *ctx,
> struct vm_area_struct *vma,
> unsigned long start,
>- unsigned long end,
>- unsigned long *src,
>- unsigned long *dst,
>- void *private);
>+ unsigned long end);
>+int migrate_dma(struct migrate_dma_ctx *migrate_ctx);
>+
>
> #endif /* CONFIG_MIGRATION */
>
...%<...
>--- a/mm/migrate.c
>+++ b/mm/migrate.c
>@@ -2803,16 +2761,76 @@ int migrate_vma(const struct migrate_vma_ops *ops,
> * Note that migration can fail in migrate_vma_struct_page() for each
> * individual page.
> */
>- ops->alloc_and_copy(vma, src, dst, start, end, private);
>+ migrate_ctx->ops->alloc_and_copy(migrate_ctx);
>
> /* This does the real migration of struct page */
>- migrate_vma_pages(&migrate);
>+ migrate_dma_pages(migrate_ctx, vma, start, end);
>
>- ops->finalize_and_map(vma, src, dst, start, end, private);
>+ migrate_ctx->ops->finalize_and_map(migrate_ctx);
>
> /* Unlock and remap pages */
>- migrate_vma_finalize(&migrate);
>+ migrate_dma_finalize(migrate_ctx);
>
> return 0;
> }
> EXPORT_SYMBOL(migrate_vma);
>+
>+/*
>+ * migrate_dma() - migrate an array of pages using a device DMA engine
>+ *
>+ * @migrate_ctx: migrate context structure
>+ *
>+ * The context structure must have its src fields pointing to an array of
>+ * migrate pfn entry each corresponding to a valid page and each page being
>+ * lock. The dst entry must by an array as big as src, it will be use during
>+ * migration to store the destination pfn.
>+ *
>+ */
>+int migrate_dma(struct migrate_dma_ctx *migrate_ctx)
>+{
>+ unsigned long i;
>+
>+ /* Sanity check the arguments */
>+ if (!migrate_ctx->ops || !migrate_ctx->src || !migrate_ctx->dst)
>+ return -EINVAL;
>+
>+ /* Below code should be hidden behind some DEBUG config */
>+ for (i = 0; i < migrate_ctx->npages; ++i) {
>+ const unsigned long mask = MIGRATE_PFN_VALID |
>+ MIGRATE_PFN_LOCKED;
This line is before the pages are locked. I think it should be
MIGRATE_PFN_MIGRATE;
>+
>+ if (!(migrate_ctx->src[i] & mask))
>+ return -EINVAL;
>+ }
>+
>+ /* Lock and isolate page */
>+ migrate_dma_prepare(migrate_ctx);
>+ if (!migrate_ctx->cpages)
>+ return 0;
>+
>+ /* Unmap pages */
>+ migrate_dma_unmap(migrate_ctx);
>+ if (!migrate_ctx->cpages)
>+ return 0;
>+
>+ /*
>+ * At this point pages are locked and unmapped, and thus they have
>+ * stable content and can safely be copied to destination memory that
>+ * is allocated by the callback.
>+ *
>+ * Note that migration can fail in migrate_vma_struct_page() for each
>+ * individual page.
>+ */
>+ migrate_ctx->ops->alloc_and_copy(migrate_ctx);
>+
>+ /* This does the real migration of struct page */
>+ migrate_dma_pages(migrate_ctx, NULL, 0, 0);
>+
>+ migrate_ctx->ops->finalize_and_map(migrate_ctx);
>+
>+ /* Unlock and remap pages */
>+ migrate_dma_finalize(migrate_ctx);
>+
>+ return 0;
>+}
>+EXPORT_SYMBOL(migrate_dma);
--
Reza Arbab