Subject: [PATCH v1 1/9] mm: add zone device coherent type memory support

Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI
or CXL). Any page of a process can be migrated to such memory. However,
no one should be allowed to pin such memory so that it can always be
evicted.

Signed-off-by: Alex Sierra <[email protected]>
---
include/linux/memremap.h | 8 ++++++++
include/linux/mm.h | 9 +++++++++
mm/memcontrol.c | 6 +++---
mm/memory-failure.c | 6 +++++-
mm/memremap.c | 5 ++++-
mm/migrate.c | 21 +++++++++++++--------
6 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8..ff4d398edf35 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -39,6 +39,13 @@ struct vmem_altmap {
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/vm/hmm.rst.
*
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
* MEMORY_DEVICE_FS_DAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In support of coordinating page
@@ -59,6 +66,7 @@ struct vmem_altmap {
enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+ MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52aba448f..d23b6ebd2177 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
return false;
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
case MEMORY_DEVICE_FS_DAX:
return true;
default:
@@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

+static inline bool is_device_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ is_zone_device_page(page) &&
+ (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ page->pgmap->type == MEMORY_DEVICE_COHERENT);
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6da5020a8656..e0275ebe1198 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_COHERENT
+ * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page))
+ if (is_device_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3e6449f2102a..51b55fc5172c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto unlock;
}

- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
/*
* TODO: Handle HMM pages which may need coordination
* with device-side memory.
*/
goto unlock;
+ default:
+ break;
}

/*
diff --git a/mm/memremap.c b/mm/memremap.c
index ed593bf87109..94d6a1e01d42 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT ||
pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_dec(&devmap_managed_key);
}
@@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT ||
pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_inc(&devmap_managed_key);
}
@@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)

switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
WARN(1, "Device private memory not supported\n");
return ERR_PTR(-EINVAL);
@@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_devmap_managed_page(struct page *page)
{
/* notify page idle for dax */
- if (!is_device_private_page(page)) {
+ if (!is_device_page(page)) {
wake_up_var(&page->_refcount);
return;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 1852d787e6ab..f74422a42192 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* Device private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
- expected_count += is_device_private_page(page);
+ expected_count += is_device_page(page);
if (mapping)
expected_count += thp_nr_pages(page) + page_has_private(page);

@@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
* FIXME proper solution is to rework migration_entry_wait() so
* it does not need to take a reference on page.
*/
- return is_device_private_page(page);
+ return is_device_page(page);
}

/* For file back page */
@@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
* handle_pte_fault()
* do_anonymous_page()
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
@@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_readable_device_private_entry(
page_to_pfn(page));
entry = swp_entry_to_pte(swp_entry);
+ } else if (is_device_page(page)) {
+ entry = pte_mkold(mk_pte(page,
+ READ_ONCE(vma->vm_page_prot)));
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
} else {
/*
- * For now we only support migrating to un-addressable
- * device memory.
+ * We support migrating to private and coherent types
+ * for device zone memory.
*/
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
goto abort;
@@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mapping = page_mapping(page);

if (is_zone_device_page(newpage)) {
- if (is_device_private_page(newpage)) {
+ if (is_device_page(newpage)) {
/*
- * For now only support private anonymous when
- * migrating to un-addressable device memory.
+ * For now only support private and coherent
+ * anonymous when migrating to device memory.
*/
if (mapping) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
--
2.32.0



2021-11-17 09:50:46

by [email protected]

[permalink] [raw]
Subject: Re: [PATCH v1 1/9] mm: add zone device coherent type memory support

On Mon, Nov 15, 2021 at 01:30:18PM -0600, Alex Sierra wrote:
> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
> * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
> * target for charge migration. if @target is not NULL, the entry is stored
> * in target->ent.
> - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
> - * (so ZONE_DEVICE page and thus not on the lru).
> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_COHERENT
> + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).

Please avoid the overly long line. But I don't think we we need to mention
the exact enum, but rather do something like:

* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
* thus not on the lru.

> + switch (pgmap->type) {
> + case MEMORY_DEVICE_PRIVATE:
> + case MEMORY_DEVICE_COHERENT:
> /*
> * TODO: Handle HMM pages which may need coordination
> * with device-side memory.

This might be a good opportunity for doing a s/HMM/device/ here.

2021-11-18 07:19:46

by Alistair Popple

[permalink] [raw]
Subject: Re: [PATCH v1 1/9] mm: add zone device coherent type memory support

On Tuesday, 16 November 2021 6:30:18 AM AEDT Alex Sierra wrote:
> Device memory that is cache coherent from device and CPU point of view.
> This is used on platforms that have an advanced system bus (like CAPI
> or CXL). Any page of a process can be migrated to such memory. However,
> no one should be allowed to pin such memory so that it can always be
> evicted.
>
> Signed-off-by: Alex Sierra <[email protected]>
> ---
> include/linux/memremap.h | 8 ++++++++
> include/linux/mm.h | 9 +++++++++
> mm/memcontrol.c | 6 +++---
> mm/memory-failure.c | 6 +++++-
> mm/memremap.c | 5 ++++-
> mm/migrate.c | 21 +++++++++++++--------
> 6 files changed, 42 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index c0e9d35889e8..ff4d398edf35 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -39,6 +39,13 @@ struct vmem_altmap {
> * A more complete discussion of unaddressable memory may be found in
> * include/linux/hmm.h and Documentation/vm/hmm.rst.
> *
> + * MEMORY_DEVICE_COHERENT:
> + * Device memory that is cache coherent from device and CPU point of view. This
> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
> + * type. Any page of a process can be migrated to such memory. However no one
> + * should be allowed to pin such memory so that it can always be evicted.
> + *
> * MEMORY_DEVICE_FS_DAX:
> * Host memory that has similar access semantics as System RAM i.e. DMA
> * coherent and supports page pinning. In support of coordinating page
> @@ -59,6 +66,7 @@ struct vmem_altmap {
> enum memory_type {
> /* 0 is reserved to catch uninitialized type fields */
> MEMORY_DEVICE_PRIVATE = 1,
> + MEMORY_DEVICE_COHERENT,
> MEMORY_DEVICE_FS_DAX,
> MEMORY_DEVICE_GENERIC,
> MEMORY_DEVICE_PCI_P2PDMA,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 73a52aba448f..d23b6ebd2177 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
> return false;
> switch (page->pgmap->type) {
> case MEMORY_DEVICE_PRIVATE:
> + case MEMORY_DEVICE_COHERENT:
> case MEMORY_DEVICE_FS_DAX:
> return true;
> default:
> @@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
> page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> }
>
> +static inline bool is_device_page(const struct page *page)
> +{
> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> + is_zone_device_page(page) &&
> + (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
> + page->pgmap->type == MEMORY_DEVICE_COHERENT);
> +}
> +
> static inline bool is_pci_p2pdma_page(const struct page *page)
> {
> return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6da5020a8656..e0275ebe1198 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
> * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
> * target for charge migration. if @target is not NULL, the entry is stored
> * in target->ent.
> - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
> - * (so ZONE_DEVICE page and thus not on the lru).
> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_COHERENT
> + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
> * For now we such page is charge like a regular page would be as for all
> * intent and purposes it is just special memory taking the place of a
> * regular page.
> @@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
> */
> if (page_memcg(page) == mc.from) {
> ret = MC_TARGET_PAGE;
> - if (is_device_private_page(page))
> + if (is_device_page(page))
> ret = MC_TARGET_DEVICE;
> if (target)
> target->page = page;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3e6449f2102a..51b55fc5172c 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
> goto unlock;
> }
>
> - if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
> + switch (pgmap->type) {
> + case MEMORY_DEVICE_PRIVATE:
> + case MEMORY_DEVICE_COHERENT:
> /*
> * TODO: Handle HMM pages which may need coordination
> * with device-side memory.
> */
> goto unlock;
> + default:
> + break;
> }
>
> /*
> diff --git a/mm/memremap.c b/mm/memremap.c
> index ed593bf87109..94d6a1e01d42 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
> static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
> {
> if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> + pgmap->type == MEMORY_DEVICE_COHERENT ||
> pgmap->type == MEMORY_DEVICE_FS_DAX)
> static_branch_dec(&devmap_managed_key);
> }
> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
> static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
> {
> if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> + pgmap->type == MEMORY_DEVICE_COHERENT ||
> pgmap->type == MEMORY_DEVICE_FS_DAX)
> static_branch_inc(&devmap_managed_key);
> }
> @@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>
> switch (pgmap->type) {
> case MEMORY_DEVICE_PRIVATE:
> + case MEMORY_DEVICE_COHERENT:
> if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
> WARN(1, "Device private memory not supported\n");
> return ERR_PTR(-EINVAL);
> @@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
> void free_devmap_managed_page(struct page *page)
> {
> /* notify page idle for dax */
> - if (!is_device_private_page(page)) {
> + if (!is_device_page(page)) {
> wake_up_var(&page->_refcount);
> return;
> }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 1852d787e6ab..f74422a42192 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
> * Device private pages have an extra refcount as they are
> * ZONE_DEVICE pages.
> */
> - expected_count += is_device_private_page(page);
> + expected_count += is_device_page(page);
> if (mapping)
> expected_count += thp_nr_pages(page) + page_has_private(page);
>
> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
> * FIXME proper solution is to rework migration_entry_wait() so
> * it does not need to take a reference on page.
> */

Note that I have posted a patch to fix this - see
https://lore.kernel.org/all/[email protected]/. This
looks ok for now assuming coherent pages can never be pinned.

However that raises a question - what happens when something calls
get_user_pages() on a pfn pointing to a coherent device page? I can't see
anything in this series that prevents pinning of coherent device pages, so we
can't just assume they aren't pinned.

In the case of device-private pages this is enforced by the fact they never
have present pte's, so any attempt to GUP them results in a fault. But if I'm
understanding this series correctly that won't be the case for coherent device
pages right?

> - return is_device_private_page(page);
> + return is_device_page(page);
> }
>
> /* For file back page */
> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
> * handle_pte_fault()
> * do_anonymous_page()
> * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> - * private page.
> + * private or coherent page.
> */
> static void migrate_vma_insert_page(struct migrate_vma *migrate,
> unsigned long addr,
> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
> swp_entry = make_readable_device_private_entry(
> page_to_pfn(page));
> entry = swp_entry_to_pte(swp_entry);
> + } else if (is_device_page(page)) {

How about adding an explicit `is_device_coherent_page()` helper? It would make
the test more explicit that this is expected to handle just coherent pages and
I bet there will be future changes that need to differentiate between private
and coherent pages anyway.

> + entry = pte_mkold(mk_pte(page,
> + READ_ONCE(vma->vm_page_prot)));
> + if (vma->vm_flags & VM_WRITE)
> + entry = pte_mkwrite(pte_mkdirty(entry));
> } else {
> /*
> - * For now we only support migrating to un-addressable
> - * device memory.
> + * We support migrating to private and coherent types
> + * for device zone memory.
> */
> pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> goto abort;
> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> mapping = page_mapping(page);
>
> if (is_zone_device_page(newpage)) {
> - if (is_device_private_page(newpage)) {
> + if (is_device_page(newpage)) {
> /*
> - * For now only support private anonymous when
> - * migrating to un-addressable device memory.
> + * For now only support private and coherent
> + * anonymous when migrating to device memory.
> */
> if (mapping) {
> migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>





2021-11-18 18:59:25

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH v1 1/9] mm: add zone device coherent type memory support


Am 2021-11-18 um 1:53 a.m. schrieb Alistair Popple:
> On Tuesday, 16 November 2021 6:30:18 AM AEDT Alex Sierra wrote:
>> Device memory that is cache coherent from device and CPU point of view.
>> This is used on platforms that have an advanced system bus (like CAPI
>> or CXL). Any page of a process can be migrated to such memory. However,
>> no one should be allowed to pin such memory so that it can always be
>> evicted.
>>
>> Signed-off-by: Alex Sierra <[email protected]>
>> ---
>> include/linux/memremap.h | 8 ++++++++
>> include/linux/mm.h | 9 +++++++++
>> mm/memcontrol.c | 6 +++---
>> mm/memory-failure.c | 6 +++++-
>> mm/memremap.c | 5 ++++-
>> mm/migrate.c | 21 +++++++++++++--------
>> 6 files changed, 42 insertions(+), 13 deletions(-)
>>
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index c0e9d35889e8..ff4d398edf35 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -39,6 +39,13 @@ struct vmem_altmap {
>> * A more complete discussion of unaddressable memory may be found in
>> * include/linux/hmm.h and Documentation/vm/hmm.rst.
>> *
>> + * MEMORY_DEVICE_COHERENT:
>> + * Device memory that is cache coherent from device and CPU point of view. This
>> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
>> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
>> + * type. Any page of a process can be migrated to such memory. However no one
>> + * should be allowed to pin such memory so that it can always be evicted.
>> + *
>> * MEMORY_DEVICE_FS_DAX:
>> * Host memory that has similar access semantics as System RAM i.e. DMA
>> * coherent and supports page pinning. In support of coordinating page
>> @@ -59,6 +66,7 @@ struct vmem_altmap {
>> enum memory_type {
>> /* 0 is reserved to catch uninitialized type fields */
>> MEMORY_DEVICE_PRIVATE = 1,
>> + MEMORY_DEVICE_COHERENT,
>> MEMORY_DEVICE_FS_DAX,
>> MEMORY_DEVICE_GENERIC,
>> MEMORY_DEVICE_PCI_P2PDMA,
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 73a52aba448f..d23b6ebd2177 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>> return false;
>> switch (page->pgmap->type) {
>> case MEMORY_DEVICE_PRIVATE:
>> + case MEMORY_DEVICE_COHERENT:
>> case MEMORY_DEVICE_FS_DAX:
>> return true;
>> default:
>> @@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
>> page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>> }
>>
>> +static inline bool is_device_page(const struct page *page)
>> +{
>> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
>> + is_zone_device_page(page) &&
>> + (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> + page->pgmap->type == MEMORY_DEVICE_COHERENT);
>> +}
>> +
>> static inline bool is_pci_p2pdma_page(const struct page *page)
>> {
>> return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 6da5020a8656..e0275ebe1198 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
>> * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>> * target for charge migration. if @target is not NULL, the entry is stored
>> * in target->ent.
>> - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
>> - * (so ZONE_DEVICE page and thus not on the lru).
>> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_COHERENT
>> + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
>> * For now we such page is charge like a regular page would be as for all
>> * intent and purposes it is just special memory taking the place of a
>> * regular page.
>> @@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>> */
>> if (page_memcg(page) == mc.from) {
>> ret = MC_TARGET_PAGE;
>> - if (is_device_private_page(page))
>> + if (is_device_page(page))
>> ret = MC_TARGET_DEVICE;
>> if (target)
>> target->page = page;
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 3e6449f2102a..51b55fc5172c 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>> goto unlock;
>> }
>>
>> - if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
>> + switch (pgmap->type) {
>> + case MEMORY_DEVICE_PRIVATE:
>> + case MEMORY_DEVICE_COHERENT:
>> /*
>> * TODO: Handle HMM pages which may need coordination
>> * with device-side memory.
>> */
>> goto unlock;
>> + default:
>> + break;
>> }
>>
>> /*
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index ed593bf87109..94d6a1e01d42 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>> static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>> {
>> if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> + pgmap->type == MEMORY_DEVICE_COHERENT ||
>> pgmap->type == MEMORY_DEVICE_FS_DAX)
>> static_branch_dec(&devmap_managed_key);
>> }
>> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>> static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>> {
>> if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> + pgmap->type == MEMORY_DEVICE_COHERENT ||
>> pgmap->type == MEMORY_DEVICE_FS_DAX)
>> static_branch_inc(&devmap_managed_key);
>> }
>> @@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>>
>> switch (pgmap->type) {
>> case MEMORY_DEVICE_PRIVATE:
>> + case MEMORY_DEVICE_COHERENT:
>> if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
>> WARN(1, "Device private memory not supported\n");
>> return ERR_PTR(-EINVAL);
>> @@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>> void free_devmap_managed_page(struct page *page)
>> {
>> /* notify page idle for dax */
>> - if (!is_device_private_page(page)) {
>> + if (!is_device_page(page)) {
>> wake_up_var(&page->_refcount);
>> return;
>> }
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 1852d787e6ab..f74422a42192 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>> * Device private pages have an extra refcount as they are
>> * ZONE_DEVICE pages.
>> */
>> - expected_count += is_device_private_page(page);
>> + expected_count += is_device_page(page);
>> if (mapping)
>> expected_count += thp_nr_pages(page) + page_has_private(page);
>>
>> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
>> * FIXME proper solution is to rework migration_entry_wait() so
>> * it does not need to take a reference on page.
>> */
> Note that I have posted a patch to fix this - see
> https://lore.kernel.org/all/[email protected]/ This
> looks ok for now assuming coherent pages can never be pinned.
>
> However that raises a question - what happens when something calls
> get_user_pages() on a pfn pointing to a coherent device page? I can't see
> anything in this series that prevents pinning of coherent device pages, so we
> can't just assume they aren't pinned.

I agree. I think we need to depend on your patch to go in first.

I'm also wondering if we need to do something to prevent get_user_pages
from pinning device pages. And by "pin", I think migrate_vma_check_page
is not talking about FOLL_PIN, but any get_user_pages call. As far as I
can tell, there should be nothing fundamentally wrong with pinning
device pages for a short time. But I think we'll want to avoid
FOLL_LONGTERM because that would affect our memory manager's ability to
evict device memory.


>
> In the case of device-private pages this is enforced by the fact they never
> have present pte's, so any attempt to GUP them results in a fault. But if I'm
> understanding this series correctly that won't be the case for coherent device
> pages right?

Right.

Regards,
  Felix


>
>> - return is_device_private_page(page);
>> + return is_device_page(page);
>> }
>>
>> /* For file back page */
>> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>> * handle_pte_fault()
>> * do_anonymous_page()
>> * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
>> - * private page.
>> + * private or coherent page.
>> */
>> static void migrate_vma_insert_page(struct migrate_vma *migrate,
>> unsigned long addr,
>> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>> swp_entry = make_readable_device_private_entry(
>> page_to_pfn(page));
>> entry = swp_entry_to_pte(swp_entry);
>> + } else if (is_device_page(page)) {
> How about adding an explicit `is_device_coherent_page()` helper? It would make
> the test more explicit that this is expected to handle just coherent pages and
> I bet there will be future changes that need to differentiate between private
> and coherent pages anyway.
>
>> + entry = pte_mkold(mk_pte(page,
>> + READ_ONCE(vma->vm_page_prot)));
>> + if (vma->vm_flags & VM_WRITE)
>> + entry = pte_mkwrite(pte_mkdirty(entry));
>> } else {
>> /*
>> - * For now we only support migrating to un-addressable
>> - * device memory.
>> + * We support migrating to private and coherent types
>> + * for device zone memory.
>> */
>> pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
>> goto abort;
>> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>> mapping = page_mapping(page);
>>
>> if (is_zone_device_page(newpage)) {
>> - if (is_device_private_page(newpage)) {
>> + if (is_device_page(newpage)) {
>> /*
>> - * For now only support private anonymous when
>> - * migrating to un-addressable device memory.
>> + * For now only support private and coherent
>> + * anonymous when migrating to device memory.
>> */
>> if (mapping) {
>> migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>
>
>

2021-11-22 02:40:50

by Alistair Popple

[permalink] [raw]
Subject: Re: [PATCH v1 1/9] mm: add zone device coherent type memory support

> >> diff --git a/mm/migrate.c b/mm/migrate.c
> >> index 1852d787e6ab..f74422a42192 100644
> >> --- a/mm/migrate.c
> >> +++ b/mm/migrate.c
> >> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
> >> * Device private pages have an extra refcount as they are
> >> * ZONE_DEVICE pages.
> >> */
> >> - expected_count += is_device_private_page(page);
> >> + expected_count += is_device_page(page);
> >> if (mapping)
> >> expected_count += thp_nr_pages(page) + page_has_private(page);
> >>
> >> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
> >> * FIXME proper solution is to rework migration_entry_wait() so
> >> * it does not need to take a reference on page.
> >> */
> > Note that I have posted a patch to fix this - see
> > https://lore.kernel.org/all/[email protected]/ This
> > looks ok for now assuming coherent pages can never be pinned.
> >
> > However that raises a question - what happens when something calls
> > get_user_pages() on a pfn pointing to a coherent device page? I can't see
> > anything in this series that prevents pinning of coherent device pages, so we
> > can't just assume they aren't pinned.
>
> I agree. I think we need to depend on your patch to go in first.
>
> I'm also wondering if we need to do something to prevent get_user_pages
> from pinning device pages. And by "pin", I think migrate_vma_check_page
> is not talking about FOLL_PIN, but any get_user_pages call. As far as I
> can tell, there should be nothing fundamentally wrong with pinning
> device pages for a short time. But I think we'll want to avoid
> FOLL_LONGTERM because that would affect our memory manager's ability to
> evict device memory.

Right, so long as my fix goes in I don't think there is anything wrong with
pinning device public pages. Agree that we should avoid FOLL_LONGTERM pins for
device memory though. I think the way to do that is update is_pinnable_page()
so we treat device pages the same as other unpinnable pages ie. long-term pins
will migrate the page.

> >
> > In the case of device-private pages this is enforced by the fact they never
> > have present pte's, so any attempt to GUP them results in a fault. But if I'm
> > understanding this series correctly that won't be the case for coherent device
> > pages right?
>
> Right.
>
> Regards,
> Felix
>
>
> >
> >> - return is_device_private_page(page);
> >> + return is_device_page(page);
> >> }
> >>
> >> /* For file back page */
> >> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
> >> * handle_pte_fault()
> >> * do_anonymous_page()
> >> * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> >> - * private page.
> >> + * private or coherent page.
> >> */
> >> static void migrate_vma_insert_page(struct migrate_vma *migrate,
> >> unsigned long addr,
> >> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
> >> swp_entry = make_readable_device_private_entry(
> >> page_to_pfn(page));
> >> entry = swp_entry_to_pte(swp_entry);
> >> + } else if (is_device_page(page)) {
> > How about adding an explicit `is_device_coherent_page()` helper? It would make
> > the test more explicit that this is expected to handle just coherent pages and
> > I bet there will be future changes that need to differentiate between private
> > and coherent pages anyway.
> >
> >> + entry = pte_mkold(mk_pte(page,
> >> + READ_ONCE(vma->vm_page_prot)));
> >> + if (vma->vm_flags & VM_WRITE)
> >> + entry = pte_mkwrite(pte_mkdirty(entry));
> >> } else {
> >> /*
> >> - * For now we only support migrating to un-addressable
> >> - * device memory.
> >> + * We support migrating to private and coherent types
> >> + * for device zone memory.
> >> */
> >> pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> >> goto abort;
> >> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> >> mapping = page_mapping(page);
> >>
> >> if (is_zone_device_page(newpage)) {
> >> - if (is_device_private_page(newpage)) {
> >> + if (is_device_page(newpage)) {
> >> /*
> >> - * For now only support private anonymous when
> >> - * migrating to un-addressable device memory.
> >> + * For now only support private and coherent
> >> + * anonymous when migrating to device memory.
> >> */
> >> if (mapping) {
> >> migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>
> >
> >
>





2021-11-22 17:17:01

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH v1 1/9] mm: add zone device coherent type memory support

Am 2021-11-21 um 9:40 p.m. schrieb Alistair Popple:
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 1852d787e6ab..f74422a42192 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>>>> * Device private pages have an extra refcount as they are
>>>> * ZONE_DEVICE pages.
>>>> */
>>>> - expected_count += is_device_private_page(page);
>>>> + expected_count += is_device_page(page);
>>>> if (mapping)
>>>> expected_count += thp_nr_pages(page) + page_has_private(page);
>>>>
>>>> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
>>>> * FIXME proper solution is to rework migration_entry_wait() so
>>>> * it does not need to take a reference on page.
>>>> */
>>> Note that I have posted a patch to fix this - see
>>> https://lore.kernel.org/all/[email protected]/ This
>>> looks ok for now assuming coherent pages can never be pinned.
>>>
>>> However that raises a question - what happens when something calls
>>> get_user_pages() on a pfn pointing to a coherent device page? I can't see
>>> anything in this series that prevents pinning of coherent device pages, so we
>>> can't just assume they aren't pinned.
>> I agree. I think we need to depend on your patch to go in first.
>>
>> I'm also wondering if we need to do something to prevent get_user_pages
>> from pinning device pages. And by "pin", I think migrate_vma_check_page
>> is not talking about FOLL_PIN, but any get_user_pages call. As far as I
>> can tell, there should be nothing fundamentally wrong with pinning
>> device pages for a short time. But I think we'll want to avoid
>> FOLL_LONGTERM because that would affect our memory manager's ability to
>> evict device memory.
> Right, so long as my fix goes in I don't think there is anything wrong with
> pinning device public pages. Agree that we should avoid FOLL_LONGTERM pins for
> device memory though. I think the way to do that is update is_pinnable_page()
> so we treat device pages the same as other unpinnable pages ie. long-term pins
> will migrate the page.

I'm trying to understand check_and_migrate_movable_pages in gup.c. It
doesn't look like the right way to migrate device pages. We may have to
do something different there as well. So instead of changing
is_pinnable_page, it maybe better to explicitly check for is_device_page
or is_device_coherent_page in check_and_migrate_movable_pages to migrate
it correctly, or just fail outright.

Thanks,
  Felix


>
>>> In the case of device-private pages this is enforced by the fact they never
>>> have present pte's, so any attempt to GUP them results in a fault. But if I'm
>>> understanding this series correctly that won't be the case for coherent device
>>> pages right?
>> Right.
>>
>> Regards,
>> Felix
>>
>>
>>>> - return is_device_private_page(page);
>>>> + return is_device_page(page);
>>>> }
>>>>
>>>> /* For file back page */
>>>> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>>>> * handle_pte_fault()
>>>> * do_anonymous_page()
>>>> * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
>>>> - * private page.
>>>> + * private or coherent page.
>>>> */
>>>> static void migrate_vma_insert_page(struct migrate_vma *migrate,
>>>> unsigned long addr,
>>>> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>>>> swp_entry = make_readable_device_private_entry(
>>>> page_to_pfn(page));
>>>> entry = swp_entry_to_pte(swp_entry);
>>>> + } else if (is_device_page(page)) {
>>> How about adding an explicit `is_device_coherent_page()` helper? It would make
>>> the test more explicit that this is expected to handle just coherent pages and
>>> I bet there will be future changes that need to differentiate between private
>>> and coherent pages anyway.
>>>
>>>> + entry = pte_mkold(mk_pte(page,
>>>> + READ_ONCE(vma->vm_page_prot)));
>>>> + if (vma->vm_flags & VM_WRITE)
>>>> + entry = pte_mkwrite(pte_mkdirty(entry));
>>>> } else {
>>>> /*
>>>> - * For now we only support migrating to un-addressable
>>>> - * device memory.
>>>> + * We support migrating to private and coherent types
>>>> + * for device zone memory.
>>>> */
>>>> pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
>>>> goto abort;
>>>> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>>>> mapping = page_mapping(page);
>>>>
>>>> if (is_zone_device_page(newpage)) {
>>>> - if (is_device_private_page(newpage)) {
>>>> + if (is_device_page(newpage)) {
>>>> /*
>>>> - * For now only support private anonymous when
>>>> - * migrating to un-addressable device memory.
>>>> + * For now only support private and coherent
>>>> + * anonymous when migrating to device memory.
>>>> */
>>>> if (mapping) {
>>>> migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>>>
>
>

2021-11-23 07:10:18

by Alistair Popple

[permalink] [raw]
Subject: Re: [PATCH v1 1/9] mm: add zone device coherent type memory support

On Tuesday, 23 November 2021 4:16:55 AM AEDT Felix Kuehling wrote:

[...]

> > Right, so long as my fix goes in I don't think there is anything wrong with
> > pinning device public pages. Agree that we should avoid FOLL_LONGTERM pins for
> > device memory though. I think the way to do that is update is_pinnable_page()
> > so we treat device pages the same as other unpinnable pages ie. long-term pins
> > will migrate the page.
>
> I'm trying to understand check_and_migrate_movable_pages in gup.c. It
> doesn't look like the right way to migrate device pages. We may have to
> do something different there as well. So instead of changing
> is_pinnable_page, it maybe better to explicitly check for is_device_page
> or is_device_coherent_page in check_and_migrate_movable_pages to migrate
> it correctly, or just fail outright.

Yes, I think you're right. I was thinking check_and_migrate_movable_pages()
would work for coherent device pages. Now I see it won't because it assumes
they are lru pages and it tries to isolate them which will never succeed
because device pages aren't on a lru.

I think migrating them is the right thing to do for FOLL_LONGTERM though.

- Alistair

> Thanks,
> Felix
>
> >
> >>> In the case of device-private pages this is enforced by the fact they never
> >>> have present pte's, so any attempt to GUP them results in a fault. But if I'm
> >>> understanding this series correctly that won't be the case for coherent device
> >>> pages right?
> >> Right.
> >>
> >> Regards,
> >> Felix
> >>
> >>
> >>>> - return is_device_private_page(page);
> >>>> + return is_device_page(page);
> >>>> }
> >>>>
> >>>> /* For file back page */
> >>>> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
> >>>> * handle_pte_fault()
> >>>> * do_anonymous_page()
> >>>> * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> >>>> - * private page.
> >>>> + * private or coherent page.
> >>>> */
> >>>> static void migrate_vma_insert_page(struct migrate_vma *migrate,
> >>>> unsigned long addr,
> >>>> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
> >>>> swp_entry = make_readable_device_private_entry(
> >>>> page_to_pfn(page));
> >>>> entry = swp_entry_to_pte(swp_entry);
> >>>> + } else if (is_device_page(page)) {
> >>> How about adding an explicit `is_device_coherent_page()` helper? It would make
> >>> the test more explicit that this is expected to handle just coherent pages and
> >>> I bet there will be future changes that need to differentiate between private
> >>> and coherent pages anyway.
> >>>
> >>>> + entry = pte_mkold(mk_pte(page,
> >>>> + READ_ONCE(vma->vm_page_prot)));
> >>>> + if (vma->vm_flags & VM_WRITE)
> >>>> + entry = pte_mkwrite(pte_mkdirty(entry));
> >>>> } else {
> >>>> /*
> >>>> - * For now we only support migrating to un-addressable
> >>>> - * device memory.
> >>>> + * We support migrating to private and coherent types
> >>>> + * for device zone memory.
> >>>> */
> >>>> pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> >>>> goto abort;
> >>>> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
> >>>> mapping = page_mapping(page);
> >>>>
> >>>> if (is_zone_device_page(newpage)) {
> >>>> - if (is_device_private_page(newpage)) {
> >>>> + if (is_device_page(newpage)) {
> >>>> /*
> >>>> - * For now only support private anonymous when
> >>>> - * migrating to un-addressable device memory.
> >>>> + * For now only support private and coherent
> >>>> + * anonymous when migrating to device memory.
> >>>> */
> >>>> if (mapping) {
> >>>> migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> >>>>
> >
> >
>