2022-02-08 14:05:39

by Christoph Hellwig

[permalink] [raw]
Subject: start sorting out the ZONE_DEVICE refcount mess

Hi all,

this series removes the offset by one refcount for ZONE_DEVICE pages
that are freed back to the driver owning them, which is just device
private ones for now, but also the planned device coherent pages
and the ehanced p2p ones pending.

It does not address the fsdax pages yet, which will be attacked in a
follow on series.

Diffstat:
arch/arm64/mm/mmu.c | 1
arch/powerpc/kvm/book3s_hv_uvmem.c | 1
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1
drivers/gpu/drm/drm_cache.c | 2
drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 -
drivers/gpu/drm/nouveau/nouveau_svm.c | 1
drivers/infiniband/core/rw.c | 1
drivers/nvdimm/pmem.h | 1
drivers/nvme/host/pci.c | 1
drivers/nvme/target/io-cmd-bdev.c | 1
fs/Kconfig | 2
fs/fuse/virtio_fs.c | 1
include/linux/hmm.h | 9 ----
include/linux/memremap.h | 22 +++++++++-
include/linux/mm.h | 59 ++++-------------------------
lib/test_hmm.c | 4 +
mm/Kconfig | 4 -
mm/internal.h | 2
mm/memcontrol.c | 11 +----
mm/memremap.c | 63 ++++++++++++++++---------------
mm/migrate.c | 6 --
mm/swap.c | 49 ++----------------------
23 files changed, 90 insertions(+), 157 deletions(-)


2022-02-08 16:02:49

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

Move the check for the actual pgmap types that need the free at refcount
one behavior into the out of line helper, and thus avoid the need to
pull memremap.h into mm.h.

Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/arm64/mm/mmu.c | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
drivers/gpu/drm/drm_cache.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 +
drivers/gpu/drm/nouveau/nouveau_svm.c | 1 +
drivers/infiniband/core/rw.c | 1 +
drivers/nvdimm/pmem.h | 1 +
drivers/nvme/host/pci.c | 1 +
drivers/nvme/target/io-cmd-bdev.c | 1 +
fs/fuse/virtio_fs.c | 1 +
include/linux/memremap.h | 18 ++++++++++++++++++
include/linux/mm.h | 20 --------------------
lib/test_hmm.c | 1 +
mm/memremap.c | 6 +++++-
14 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index acfae9b41cc8c9..580abae6c0b93f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -17,6 +17,7 @@
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/memremap.h>
#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ea68f3b3a4e9cb..6d643b4b791d87 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -25,6 +25,7 @@

#include <linux/hashtable.h>
#include <linux/mmu_notifier.h>
+#include <linux/memremap.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/atomic.h>
diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index f19d9acbe95936..50b8a088f763a6 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -27,11 +27,11 @@
/*
* Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
*/
-
#include <linux/dma-buf-map.h>
#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/cc_platform.h>
+#include <linux/ioport.h>
#include <xen/xen.h>

#include <drm/drm_cache.h>
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index e886a3b9e08c7d..a5cdfbe32b5e54 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -39,6 +39,7 @@

#include <linux/sched/mm.h>
#include <linux/hmm.h>
+#include <linux/memremap.h>
#include <linux/migrate.h>

/*
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 266809e511e2c1..090b9b47708cca 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -35,6 +35,7 @@
#include <linux/sched/mm.h>
#include <linux/sort.h>
#include <linux/hmm.h>
+#include <linux/memremap.h>
#include <linux/rmap.h>

struct nouveau_svm {
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 5a3bd41b331c93..4d98f931a13ddd 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -2,6 +2,7 @@
/*
* Copyright (c) 2016 HGST, a Western Digital Company.
*/
+#include <linux/memremap.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
#include <linux/pci-p2pdma.h>
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 59cfe13ea8a85c..1f51a23614299b 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -3,6 +3,7 @@
#define __NVDIMM_PMEM_H__
#include <linux/page-flags.h>
#include <linux/badblocks.h>
+#include <linux/memremap.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/fs.h>
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6a99ed68091589..ab15bc72710dbe 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
+#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 70ca9dfc1771a9..a141446db1bea3 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
+#include <linux/memremap.h>
#include <linux/module.h>
#include "nvmet.h"

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 9d737904d07c0b..86b7dbb6a0d43e 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -8,6 +8,7 @@
#include <linux/dax.h>
#include <linux/pci.h>
#include <linux/pfn_t.h>
+#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1fafcc38acbad6..514ab46f597e5c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_
+
+#include <linux/mm.h>
#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>
@@ -129,6 +131,22 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
return 1 << pgmap->vmemmap_shift;
}

+static inline bool is_device_private_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+}
+
+static inline bool is_pci_p2pdma_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
+}
+
#ifdef CONFIG_ZONE_DEVICE
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 26baadcef4556b..80fccfe31c3444 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -23,7 +23,6 @@
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
-#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
@@ -1101,9 +1100,6 @@ static inline bool put_devmap_managed_page(struct page *page)
return false;
if (!is_zone_device_page(page))
return false;
- if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
- page->pgmap->type != MEMORY_DEVICE_FS_DAX)
- return false;
return __put_devmap_managed_page(page);
}

@@ -1114,22 +1110,6 @@ static inline bool put_devmap_managed_page(struct page *page)
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

-static inline bool is_device_private_page(const struct page *page)
-{
- return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
- IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
- is_zone_device_page(page) &&
- page->pgmap->type == MEMORY_DEVICE_PRIVATE;
-}
-
-static inline bool is_pci_p2pdma_page(const struct page *page)
-{
- return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
- IS_ENABLED(CONFIG_PCI_P2PDMA) &&
- is_zone_device_page(page) &&
- page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
-}
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 396beee6b061d4..e5fc14ba71f33e 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
+#include <linux/memremap.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
diff --git a/mm/memremap.c b/mm/memremap.c
index f41233a67edb12..a0ece2344c2cab 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -4,7 +4,7 @@
#include <linux/io.h>
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
-#include <linux/mm.h>
+#include <linux/memremap.h>
#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/mmzone.h>
@@ -504,6 +504,10 @@ void free_devmap_managed_page(struct page *page)

bool __put_devmap_managed_page(struct page *page)
{
+ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ return false;
+
/*
* devmap page refcounts are 1-based, rather than 0-based: if
* refcount is 1, then the page is free and the refcount is
--
2.30.2


2022-02-09 02:42:17

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

On Mon, Feb 7, 2022 at 3:49 PM Dan Williams <[email protected]> wrote:
>
> On Sun, Feb 6, 2022 at 10:33 PM Christoph Hellwig <[email protected]> wrote:
> >
> > Move the check for the actual pgmap types that need the free at refcount
> > one behavior into the out of line helper, and thus avoid the need to
> > pull memremap.h into mm.h.
>
> Looks good to me assuming the compile bots agree.
>
> Reviewed-by: Dan Williams <[email protected]>

Yeah, same as Logan:

mm/memcontrol.c: In function ‘get_mctgt_type’:
mm/memcontrol.c:5724:29: error: implicit declaration of function
‘is_device_private_page’; did you mean
‘is_device_private_entry’? [-Werror=implicit-function-declaration]
5724 | if (is_device_private_page(page))
| ^~~~~~~~~~~~~~~~~~~~~~
| is_device_private_entry

...needs:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d1e97a54ae53..0ac7515c85f9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -62,6 +62,7 @@
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/memremap.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>

2022-02-09 07:10:02

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>


Am 2022-02-07 um 01:32 schrieb Christoph Hellwig:
> Move the check for the actual pgmap types that need the free at refcount
> one behavior into the out of line helper, and thus avoid the need to
> pull memremap.h into mm.h.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

The amdkfd part looks good to me.

It looks like this patch is not based on Alex Sierra's coherent memory
series. He added two new helpers is_device_coherent_page and
is_dev_private_or_coherent_page that would need to be moved along with
is_device_private_page and is_pci_p2pdma_page.

Acked-by: Felix Kuehling <[email protected]>


> ---
> arch/arm64/mm/mmu.c | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> drivers/gpu/drm/drm_cache.c | 2 +-
> drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 +
> drivers/gpu/drm/nouveau/nouveau_svm.c | 1 +
> drivers/infiniband/core/rw.c | 1 +
> drivers/nvdimm/pmem.h | 1 +
> drivers/nvme/host/pci.c | 1 +
> drivers/nvme/target/io-cmd-bdev.c | 1 +
> fs/fuse/virtio_fs.c | 1 +
> include/linux/memremap.h | 18 ++++++++++++++++++
> include/linux/mm.h | 20 --------------------
> lib/test_hmm.c | 1 +
> mm/memremap.c | 6 +++++-
> 14 files changed, 34 insertions(+), 22 deletions(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index acfae9b41cc8c9..580abae6c0b93f 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -17,6 +17,7 @@
> #include <linux/mman.h>
> #include <linux/nodemask.h>
> #include <linux/memblock.h>
> +#include <linux/memremap.h>
> #include <linux/memory.h>
> #include <linux/fs.h>
> #include <linux/io.h>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ea68f3b3a4e9cb..6d643b4b791d87 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -25,6 +25,7 @@
>
> #include <linux/hashtable.h>
> #include <linux/mmu_notifier.h>
> +#include <linux/memremap.h>
> #include <linux/mutex.h>
> #include <linux/types.h>
> #include <linux/atomic.h>
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index f19d9acbe95936..50b8a088f763a6 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -27,11 +27,11 @@
> /*
> * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
> */
> -
> #include <linux/dma-buf-map.h>
> #include <linux/export.h>
> #include <linux/highmem.h>
> #include <linux/cc_platform.h>
> +#include <linux/ioport.h>
> #include <xen/xen.h>
>
> #include <drm/drm_cache.h>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index e886a3b9e08c7d..a5cdfbe32b5e54 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -39,6 +39,7 @@
>
> #include <linux/sched/mm.h>
> #include <linux/hmm.h>
> +#include <linux/memremap.h>
> #include <linux/migrate.h>
>
> /*
> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
> index 266809e511e2c1..090b9b47708cca 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
> @@ -35,6 +35,7 @@
> #include <linux/sched/mm.h>
> #include <linux/sort.h>
> #include <linux/hmm.h>
> +#include <linux/memremap.h>
> #include <linux/rmap.h>
>
> struct nouveau_svm {
> diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
> index 5a3bd41b331c93..4d98f931a13ddd 100644
> --- a/drivers/infiniband/core/rw.c
> +++ b/drivers/infiniband/core/rw.c
> @@ -2,6 +2,7 @@
> /*
> * Copyright (c) 2016 HGST, a Western Digital Company.
> */
> +#include <linux/memremap.h>
> #include <linux/moduleparam.h>
> #include <linux/slab.h>
> #include <linux/pci-p2pdma.h>
> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
> index 59cfe13ea8a85c..1f51a23614299b 100644
> --- a/drivers/nvdimm/pmem.h
> +++ b/drivers/nvdimm/pmem.h
> @@ -3,6 +3,7 @@
> #define __NVDIMM_PMEM_H__
> #include <linux/page-flags.h>
> #include <linux/badblocks.h>
> +#include <linux/memremap.h>
> #include <linux/types.h>
> #include <linux/pfn_t.h>
> #include <linux/fs.h>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a99ed68091589..ab15bc72710dbe 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -15,6 +15,7 @@
> #include <linux/init.h>
> #include <linux/interrupt.h>
> #include <linux/io.h>
> +#include <linux/memremap.h>
> #include <linux/mm.h>
> #include <linux/module.h>
> #include <linux/mutex.h>
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 70ca9dfc1771a9..a141446db1bea3 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -6,6 +6,7 @@
> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> #include <linux/blkdev.h>
> #include <linux/blk-integrity.h>
> +#include <linux/memremap.h>
> #include <linux/module.h>
> #include "nvmet.h"
>
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 9d737904d07c0b..86b7dbb6a0d43e 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -8,6 +8,7 @@
> #include <linux/dax.h>
> #include <linux/pci.h>
> #include <linux/pfn_t.h>
> +#include <linux/memremap.h>
> #include <linux/module.h>
> #include <linux/virtio.h>
> #include <linux/virtio_fs.h>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 1fafcc38acbad6..514ab46f597e5c 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -1,6 +1,8 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #ifndef _LINUX_MEMREMAP_H_
> #define _LINUX_MEMREMAP_H_
> +
> +#include <linux/mm.h>
> #include <linux/range.h>
> #include <linux/ioport.h>
> #include <linux/percpu-refcount.h>
> @@ -129,6 +131,22 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
> return 1 << pgmap->vmemmap_shift;
> }
>
> +static inline bool is_device_private_page(const struct page *page)
> +{
> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> + IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
> + is_zone_device_page(page) &&
> + page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> +}
> +
> +static inline bool is_pci_p2pdma_page(const struct page *page)
> +{
> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> + IS_ENABLED(CONFIG_PCI_P2PDMA) &&
> + is_zone_device_page(page) &&
> + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
> +}
> +
> #ifdef CONFIG_ZONE_DEVICE
> void *memremap_pages(struct dev_pagemap *pgmap, int nid);
> void memunmap_pages(struct dev_pagemap *pgmap);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 26baadcef4556b..80fccfe31c3444 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -23,7 +23,6 @@
> #include <linux/err.h>
> #include <linux/page-flags.h>
> #include <linux/page_ref.h>
> -#include <linux/memremap.h>
> #include <linux/overflow.h>
> #include <linux/sizes.h>
> #include <linux/sched.h>
> @@ -1101,9 +1100,6 @@ static inline bool put_devmap_managed_page(struct page *page)
> return false;
> if (!is_zone_device_page(page))
> return false;
> - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
> - page->pgmap->type != MEMORY_DEVICE_FS_DAX)
> - return false;
> return __put_devmap_managed_page(page);
> }
>
> @@ -1114,22 +1110,6 @@ static inline bool put_devmap_managed_page(struct page *page)
> }
> #endif /* CONFIG_DEV_PAGEMAP_OPS */
>
> -static inline bool is_device_private_page(const struct page *page)
> -{
> - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> - IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
> - is_zone_device_page(page) &&
> - page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> -}
> -
> -static inline bool is_pci_p2pdma_page(const struct page *page)
> -{
> - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> - IS_ENABLED(CONFIG_PCI_P2PDMA) &&
> - is_zone_device_page(page) &&
> - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
> -}
> -
> /* 127: arbitrary random number, small enough to assemble well */
> #define folio_ref_zero_or_close_to_overflow(folio) \
> ((unsigned int) folio_ref_count(folio) + 127u <= 127u)
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 396beee6b061d4..e5fc14ba71f33e 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -12,6 +12,7 @@
> #include <linux/kernel.h>
> #include <linux/cdev.h>
> #include <linux/device.h>
> +#include <linux/memremap.h>
> #include <linux/mutex.h>
> #include <linux/rwsem.h>
> #include <linux/sched.h>
> diff --git a/mm/memremap.c b/mm/memremap.c
> index f41233a67edb12..a0ece2344c2cab 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -4,7 +4,7 @@
> #include <linux/io.h>
> #include <linux/kasan.h>
> #include <linux/memory_hotplug.h>
> -#include <linux/mm.h>
> +#include <linux/memremap.h>
> #include <linux/pfn_t.h>
> #include <linux/swap.h>
> #include <linux/mmzone.h>
> @@ -504,6 +504,10 @@ void free_devmap_managed_page(struct page *page)
>
> bool __put_devmap_managed_page(struct page *page)
> {
> + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
> + page->pgmap->type != MEMORY_DEVICE_FS_DAX)
> + return false;
> +
> /*
> * devmap page refcounts are 1-based, rather than 0-based: if
> * refcount is 1, then the page is free and the refcount is

2022-02-09 08:05:32

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

ZONE_DEVICE struct pages have an extra reference count that complicates
the code for put_page() and several places in the kernel that need to
check the reference count to see that a page is not being used (gup,
compaction, migration, etc.). Clean up the code so the reference count
doesn't need to be treated specially for ZONE_DEVICE pages.

Note that this excludes the special idle page wakeup for fsdax pages,
which still happens at refcount 1. This is a separate issue and will
be sorted out later. Given that only fsdax pages require the
notifiacation when the refcount hits 1 now, the PAGEMAP_OPS Kconfig
symbol can go away and be replaced with a FS_DAX check for this hook
in the put_page fastpath.

Based on an earlier patch from Ralph Campbell <[email protected]>.

Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/powerpc/kvm/book3s_hv_uvmem.c | 1 -
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 -
drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 -
fs/Kconfig | 1 -
include/linux/memremap.h | 12 +++--
include/linux/mm.h | 6 +--
lib/test_hmm.c | 1 -
mm/Kconfig | 4 --
mm/internal.h | 2 +
mm/memcontrol.c | 11 ++---
mm/memremap.c | 57 ++++++++----------------
mm/migrate.c | 6 ---
mm/swap.c | 16 ++-----
13 files changed, 36 insertions(+), 83 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e414ca44839fd1..8b6438fa18fc2b 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -712,7 +712,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)

dpage = pfn_to_page(uvmem_pfn);
dpage->zone_device_data = pvt;
- get_page(dpage);
lock_page(dpage);
return dpage;
out_clear:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index cb835f95a76e66..e27ca375876230 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -225,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
page = pfn_to_page(pfn);
svm_range_bo_ref(prange->svm_bo);
page->zone_device_data = prange->svm_bo;
- get_page(page);
lock_page(page);
}

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index a5cdfbe32b5e54..7ba66ad68a8a1e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -326,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
return NULL;
}

- get_page(page);
lock_page(page);
return page;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index 7a2b11c0b8036d..05efea674bffa0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,7 +48,6 @@ config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
- select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
select FS_IOMAP
select DAX
help
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 514ab46f597e5c..d6a114dd5ea8b7 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -68,9 +68,9 @@ enum memory_type {

struct dev_pagemap_ops {
/*
- * Called once the page refcount reaches 1. (ZONE_DEVICE pages never
- * reach 0 refcount unless there is a refcount bug. This allows the
- * device driver to implement its own memory management.)
+ * Called once the page refcount reaches 0. The reference count will be
+ * reset to one by the core code after the method is called to prepare
+ * for handing out the page again.
*/
void (*page_free)(struct page *page);

@@ -133,16 +133,14 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)

static inline bool is_device_private_page(const struct page *page)
{
- return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
- IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

static inline bool is_pci_p2pdma_page(const struct page *page)
{
- return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
- IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+ return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80fccfe31c3444..ff9f149ca2017e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1090,7 +1090,7 @@ static inline bool is_zone_movable_page(const struct page *page)
return page_zonenum(page) == ZONE_MOVABLE;
}

-#ifdef CONFIG_DEV_PAGEMAP_OPS
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);

bool __put_devmap_managed_page(struct page *page);
@@ -1103,12 +1103,12 @@ static inline bool put_devmap_managed_page(struct page *page)
return __put_devmap_managed_page(page);
}

-#else /* CONFIG_DEV_PAGEMAP_OPS */
+#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
static inline bool put_devmap_managed_page(struct page *page)
{
return false;
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */

/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e5fc14ba71f33e..cfe63204783918 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -566,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
}

dpage->zone_device_data = rpage;
- get_page(dpage);
lock_page(dpage);
return dpage;

diff --git a/mm/Kconfig b/mm/Kconfig
index 3326ee3903f330..a1901ae6d06293 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -776,9 +776,6 @@ config ZONE_DEVICE

If FS_DAX is enabled, then say Y.

-config DEV_PAGEMAP_OPS
- bool
-
#
# Helpers to mirror range of the CPU page tables of a process into device page
# tables.
@@ -790,7 +787,6 @@ config HMM_MIRROR
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ZONE_DEVICE
- select DEV_PAGEMAP_OPS

help
Allows creation of struct pages to represent unaddressable device
diff --git a/mm/internal.h b/mm/internal.h
index d80300392a194f..a67222d17e5987 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -718,4 +718,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);

+void free_zone_device_page(struct page *page);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09d342c7cbd0d9..d1e97a54ae535e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5503,17 +5503,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return NULL;

/*
- * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
- * a device and because they are not accessible by CPU they are store
- * as special swap entry in the CPU page table.
+ * Handle device private pages that are not accessible by the CPU, but
+ * stored as special swap entries in the page table.
*/
if (is_device_private_entry(ent)) {
page = pfn_swap_entry_to_page(ent);
- /*
- * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
- * a refcount of 1 when free (unlike normal page)
- */
- if (!page_ref_add_unless(page, 1, 1))
+ if (!get_page_unless_zero(page))
return NULL;
return page;
}
diff --git a/mm/memremap.c b/mm/memremap.c
index a0ece2344c2cab..fef5734d5e4933 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
+#include "internal.h"

static DEFINE_XARRAY(pgmap_array);

@@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif

-#ifdef CONFIG_DEV_PAGEMAP_OPS
+#ifdef CONFIG_FS_DAX
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);

static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_dec(&devmap_managed_key);
}

static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_inc(&devmap_managed_key);
}
#else
@@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+#endif /* CONFIG_FS_DAX */

static void pgmap_array_delete(struct range *range)
{
@@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
return (range->start + range_len(range)) >> PAGE_SHIFT;
}

-static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
-{
- if (pfn % (1024 << pgmap->vmemmap_shift))
- cond_resched();
- return pfn + pgmap_vmemmap_nr(pgmap);
-}
-
static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
{
return (pfn_end(pgmap, range_id) -
pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}

-#define for_each_device_pfn(pfn, map, i) \
- for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
- pfn = pfn_next(map, pfn))
-
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
struct range *range = &pgmap->ranges[range_id];
@@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)

void memunmap_pages(struct dev_pagemap *pgmap)
{
- unsigned long pfn;
int i;

percpu_ref_kill(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
- for_each_device_pfn(pfn, pgmap, i)
- put_page(pfn_to_page(pfn));
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
wait_for_completion(&pgmap->done);
percpu_ref_exit(&pgmap->ref);

@@ -464,14 +450,10 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
}
EXPORT_SYMBOL_GPL(get_dev_pagemap);

-#ifdef CONFIG_DEV_PAGEMAP_OPS
-void free_devmap_managed_page(struct page *page)
+void free_zone_device_page(struct page *page)
{
- /* notify page idle for dax */
- if (!is_device_private_page(page)) {
- wake_up_var(&page->_refcount);
+ if (WARN_ON_ONCE(!is_device_private_page(page)))
return;
- }

__ClearPageWaiters(page);

@@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page)
*/
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+
+ /*
+ * Reset the page count to 1 to prepare for handing out the page again.
+ */
+ set_page_count(page, 1);
}

+#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_page(struct page *page)
{
- if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
- page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
return false;

/*
- * devmap page refcounts are 1-based, rather than 0-based: if
+ * fsdax page refcounts are 1-based, rather than 0-based: if
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
- switch (page_ref_dec_return(page)) {
- case 1:
- free_devmap_managed_page(page);
- break;
- case 0:
- __put_page(page);
- break;
- }
+ if (page_ref_dec_return(page) == 1)
+ wake_up_var(&page->_refcount);
return true;
}
EXPORT_SYMBOL(__put_devmap_managed_page);
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+#endif /* CONFIG_FS_DAX */
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781b8..8e0370a73f8a43 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -341,14 +341,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
{
int expected_count = 1;

- /*
- * Device private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
if (mapping)
expected_count += compound_nr(page) + page_has_private(page);
-
return expected_count;
}

diff --git a/mm/swap.c b/mm/swap.c
index 25b55c56614311..c84d6817043257 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -114,17 +114,9 @@ static void __put_compound_page(struct page *page)

void __put_page(struct page *page)
{
- if (is_zone_device_page(page)) {
- put_dev_pagemap(page->pgmap);
-
- /*
- * The page belongs to the device that created pgmap. Do
- * not return it to page allocator.
- */
- return;
- }
-
- if (unlikely(PageCompound(page)))
+ if (unlikely(is_zone_device_page(page)))
+ free_zone_device_page(page);
+ else if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
__put_single_page(page);
@@ -933,7 +925,7 @@ void release_pages(struct page **pages, int nr)
if (put_devmap_managed_page(page))
continue;
if (put_page_testzero(page))
- put_dev_pagemap(page->pgmap);
+ free_zone_device_page(page);
continue;
}

--
2.30.2


2022-02-09 08:10:54

by Logan Gunthorpe

[permalink] [raw]
Subject: Re: start sorting out the ZONE_DEVICE refcount mess



On 2022-02-06 11:32 p.m., Christoph Hellwig wrote:
> Hi all,
>
> this series removes the offset by one refcount for ZONE_DEVICE pages
> that are freed back to the driver owning them, which is just device
> private ones for now, but also the planned device coherent pages
> and the ehanced p2p ones pending.
>
> It does not address the fsdax pages yet, which will be attacked in a
> follow on series.
>
> Diffstat:
> arch/arm64/mm/mmu.c | 1
> arch/powerpc/kvm/book3s_hv_uvmem.c | 1
> drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1
> drivers/gpu/drm/drm_cache.c | 2
> drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 -
> drivers/gpu/drm/nouveau/nouveau_svm.c | 1
> drivers/infiniband/core/rw.c | 1
> drivers/nvdimm/pmem.h | 1
> drivers/nvme/host/pci.c | 1
> drivers/nvme/target/io-cmd-bdev.c | 1
> fs/Kconfig | 2
> fs/fuse/virtio_fs.c | 1
> include/linux/hmm.h | 9 ----
> include/linux/memremap.h | 22 +++++++++-
> include/linux/mm.h | 59 ++++-------------------------
> lib/test_hmm.c | 4 +
> mm/Kconfig | 4 -
> mm/internal.h | 2
> mm/memcontrol.c | 11 +----
> mm/memremap.c | 63 ++++++++++++++++---------------
> mm/migrate.c | 6 --
> mm/swap.c | 49 ++----------------------
> 23 files changed, 90 insertions(+), 157 deletions(-)

Looks good to me. I was wondering about the location of some of this
code, so it's nice to see it cleaned up. Except for the one minor issue
I noted on patch 6, it all looks good to me. I've reviewed all the
patches and tested the series under my p2pdma series.

Reviewed-by: Logan Gunthorpe <[email protected]>

Logan

2022-02-09 09:35:23

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

On Sun, Feb 6, 2022 at 10:33 PM Christoph Hellwig <[email protected]> wrote:
[..]
> @@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page)
> */
> page->mapping = NULL;
> page->pgmap->ops->page_free(page);
> +
> + /*
> + * Reset the page count to 1 to prepare for handing out the page again.
> + */
> + set_page_count(page, 1);

Interesting. I had expected that to really fix the refcount problem
that fs/dax.c would need to start taking real page references as pages
were added to a mapping, just like page cache.

This looks ok to me, and passes my tests. So given I'm still working
my way back to fixing the references properly I'm ok for this hack to
replace the more broken hack that is there presently.

Reviewed-by: Dan Williams <[email protected]>

2022-02-09 11:02:32

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

On Sun, Feb 6, 2022 at 10:33 PM Christoph Hellwig <[email protected]> wrote:
>
> Move the check for the actual pgmap types that need the free at refcount
> one behavior into the out of line helper, and thus avoid the need to
> pull memremap.h into mm.h.

Looks good to me assuming the compile bots agree.

Reviewed-by: Dan Williams <[email protected]>

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> arch/arm64/mm/mmu.c | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> drivers/gpu/drm/drm_cache.c | 2 +-
> drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 +
> drivers/gpu/drm/nouveau/nouveau_svm.c | 1 +
> drivers/infiniband/core/rw.c | 1 +
> drivers/nvdimm/pmem.h | 1 +
> drivers/nvme/host/pci.c | 1 +
> drivers/nvme/target/io-cmd-bdev.c | 1 +
> fs/fuse/virtio_fs.c | 1 +
> include/linux/memremap.h | 18 ++++++++++++++++++
> include/linux/mm.h | 20 --------------------
> lib/test_hmm.c | 1 +
> mm/memremap.c | 6 +++++-
> 14 files changed, 34 insertions(+), 22 deletions(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index acfae9b41cc8c9..580abae6c0b93f 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -17,6 +17,7 @@
> #include <linux/mman.h>
> #include <linux/nodemask.h>
> #include <linux/memblock.h>
> +#include <linux/memremap.h>
> #include <linux/memory.h>
> #include <linux/fs.h>
> #include <linux/io.h>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ea68f3b3a4e9cb..6d643b4b791d87 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -25,6 +25,7 @@
>
> #include <linux/hashtable.h>
> #include <linux/mmu_notifier.h>
> +#include <linux/memremap.h>
> #include <linux/mutex.h>
> #include <linux/types.h>
> #include <linux/atomic.h>
> diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
> index f19d9acbe95936..50b8a088f763a6 100644
> --- a/drivers/gpu/drm/drm_cache.c
> +++ b/drivers/gpu/drm/drm_cache.c
> @@ -27,11 +27,11 @@
> /*
> * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
> */
> -
> #include <linux/dma-buf-map.h>
> #include <linux/export.h>
> #include <linux/highmem.h>
> #include <linux/cc_platform.h>
> +#include <linux/ioport.h>
> #include <xen/xen.h>
>
> #include <drm/drm_cache.h>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> index e886a3b9e08c7d..a5cdfbe32b5e54 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
> @@ -39,6 +39,7 @@
>
> #include <linux/sched/mm.h>
> #include <linux/hmm.h>
> +#include <linux/memremap.h>
> #include <linux/migrate.h>
>
> /*
> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
> index 266809e511e2c1..090b9b47708cca 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
> @@ -35,6 +35,7 @@
> #include <linux/sched/mm.h>
> #include <linux/sort.h>
> #include <linux/hmm.h>
> +#include <linux/memremap.h>
> #include <linux/rmap.h>
>
> struct nouveau_svm {
> diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
> index 5a3bd41b331c93..4d98f931a13ddd 100644
> --- a/drivers/infiniband/core/rw.c
> +++ b/drivers/infiniband/core/rw.c
> @@ -2,6 +2,7 @@
> /*
> * Copyright (c) 2016 HGST, a Western Digital Company.
> */
> +#include <linux/memremap.h>
> #include <linux/moduleparam.h>
> #include <linux/slab.h>
> #include <linux/pci-p2pdma.h>
> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
> index 59cfe13ea8a85c..1f51a23614299b 100644
> --- a/drivers/nvdimm/pmem.h
> +++ b/drivers/nvdimm/pmem.h
> @@ -3,6 +3,7 @@
> #define __NVDIMM_PMEM_H__
> #include <linux/page-flags.h>
> #include <linux/badblocks.h>
> +#include <linux/memremap.h>
> #include <linux/types.h>
> #include <linux/pfn_t.h>
> #include <linux/fs.h>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a99ed68091589..ab15bc72710dbe 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -15,6 +15,7 @@
> #include <linux/init.h>
> #include <linux/interrupt.h>
> #include <linux/io.h>
> +#include <linux/memremap.h>
> #include <linux/mm.h>
> #include <linux/module.h>
> #include <linux/mutex.h>
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 70ca9dfc1771a9..a141446db1bea3 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -6,6 +6,7 @@
> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> #include <linux/blkdev.h>
> #include <linux/blk-integrity.h>
> +#include <linux/memremap.h>
> #include <linux/module.h>
> #include "nvmet.h"
>
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index 9d737904d07c0b..86b7dbb6a0d43e 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -8,6 +8,7 @@
> #include <linux/dax.h>
> #include <linux/pci.h>
> #include <linux/pfn_t.h>
> +#include <linux/memremap.h>
> #include <linux/module.h>
> #include <linux/virtio.h>
> #include <linux/virtio_fs.h>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 1fafcc38acbad6..514ab46f597e5c 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -1,6 +1,8 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #ifndef _LINUX_MEMREMAP_H_
> #define _LINUX_MEMREMAP_H_
> +
> +#include <linux/mm.h>
> #include <linux/range.h>
> #include <linux/ioport.h>
> #include <linux/percpu-refcount.h>
> @@ -129,6 +131,22 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
> return 1 << pgmap->vmemmap_shift;
> }
>
> +static inline bool is_device_private_page(const struct page *page)
> +{
> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> + IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
> + is_zone_device_page(page) &&
> + page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> +}
> +
> +static inline bool is_pci_p2pdma_page(const struct page *page)
> +{
> + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> + IS_ENABLED(CONFIG_PCI_P2PDMA) &&
> + is_zone_device_page(page) &&
> + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
> +}
> +
> #ifdef CONFIG_ZONE_DEVICE
> void *memremap_pages(struct dev_pagemap *pgmap, int nid);
> void memunmap_pages(struct dev_pagemap *pgmap);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 26baadcef4556b..80fccfe31c3444 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -23,7 +23,6 @@
> #include <linux/err.h>
> #include <linux/page-flags.h>
> #include <linux/page_ref.h>
> -#include <linux/memremap.h>
> #include <linux/overflow.h>
> #include <linux/sizes.h>
> #include <linux/sched.h>
> @@ -1101,9 +1100,6 @@ static inline bool put_devmap_managed_page(struct page *page)
> return false;
> if (!is_zone_device_page(page))
> return false;
> - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
> - page->pgmap->type != MEMORY_DEVICE_FS_DAX)
> - return false;
> return __put_devmap_managed_page(page);
> }
>
> @@ -1114,22 +1110,6 @@ static inline bool put_devmap_managed_page(struct page *page)
> }
> #endif /* CONFIG_DEV_PAGEMAP_OPS */
>
> -static inline bool is_device_private_page(const struct page *page)
> -{
> - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> - IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
> - is_zone_device_page(page) &&
> - page->pgmap->type == MEMORY_DEVICE_PRIVATE;
> -}
> -
> -static inline bool is_pci_p2pdma_page(const struct page *page)
> -{
> - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> - IS_ENABLED(CONFIG_PCI_P2PDMA) &&
> - is_zone_device_page(page) &&
> - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
> -}
> -
> /* 127: arbitrary random number, small enough to assemble well */
> #define folio_ref_zero_or_close_to_overflow(folio) \
> ((unsigned int) folio_ref_count(folio) + 127u <= 127u)
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 396beee6b061d4..e5fc14ba71f33e 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -12,6 +12,7 @@
> #include <linux/kernel.h>
> #include <linux/cdev.h>
> #include <linux/device.h>
> +#include <linux/memremap.h>
> #include <linux/mutex.h>
> #include <linux/rwsem.h>
> #include <linux/sched.h>
> diff --git a/mm/memremap.c b/mm/memremap.c
> index f41233a67edb12..a0ece2344c2cab 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -4,7 +4,7 @@
> #include <linux/io.h>
> #include <linux/kasan.h>
> #include <linux/memory_hotplug.h>
> -#include <linux/mm.h>
> +#include <linux/memremap.h>
> #include <linux/pfn_t.h>
> #include <linux/swap.h>
> #include <linux/mmzone.h>
> @@ -504,6 +504,10 @@ void free_devmap_managed_page(struct page *page)
>
> bool __put_devmap_managed_page(struct page *page)
> {
> + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
> + page->pgmap->type != MEMORY_DEVICE_FS_DAX)
> + return false;
> +
> /*
> * devmap page refcounts are 1-based, rather than 0-based: if
> * refcount is 1, then the page is free and the refcount is
> --
> 2.30.2
>

2022-02-09 11:51:27

by Logan Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>



On 2022-02-06 11:32 p.m., Christoph Hellwig wrote:
> Move the check for the actual pgmap types that need the free at refcount
> one behavior into the out of line helper, and thus avoid the need to
> pull memremap.h into mm.h.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

I've noticed mm/memcontrol.c uses is_device_private_page() and also
needs a memremap.h include added to compile with my configuration.

Logan

2022-02-09 12:39:37

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

On Wed, Feb 09, 2022 at 07:23:45AM +0100, Christoph Hellwig wrote:
> On Tue, Feb 08, 2022 at 07:30:11PM -0800, Dan Williams wrote:
> > Interesting. I had expected that to really fix the refcount problem
> > that fs/dax.c would need to start taking real page references as pages
> > were added to a mapping, just like page cache.
>
> I think we should do that eventually. But I think this series that
> just attacks the device private type and extends to the device coherent
> and p2p enhacements is a good first step to stop the proliferation of
> the one off refcount and to allow to deal with the fsdax pages in another
> more focuessed series.

It is nice, but the other series are still impacted by the fsdax mess
- they still stuff pages into ptes without proper refcounts and have
to carry nonsense to dance around this problem.

I certainly would be unhappy if the amd driver, for instance, gained
the fsdax problem as well and started pushing 4k pages into PMDs.

Jason

2022-02-09 13:45:08

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

On Tue, Feb 08, 2022 at 07:30:11PM -0800, Dan Williams wrote:
> Interesting. I had expected that to really fix the refcount problem
> that fs/dax.c would need to start taking real page references as pages
> were added to a mapping, just like page cache.

I think we should do that eventually. But I think this series that
just attacks the device private type and extends to the device coherent
and p2p enhacements is a good first step to stop the proliferation of
the one off refcount and to allow to deal with the fsdax pages in another
more focuessed series.

2022-02-09 13:45:10

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

On Tue, Feb 08, 2022 at 03:53:14PM -0800, Dan Williams wrote:
> Yeah, same as Logan:
>
> mm/memcontrol.c: In function ‘get_mctgt_type’:
> mm/memcontrol.c:5724:29: error: implicit declaration of function
> ‘is_device_private_page’; did you mean
> ‘is_device_private_entry’? [-Werror=implicit-function-declaration]
> 5724 | if (is_device_private_page(page))
> | ^~~~~~~~~~~~~~~~~~~~~~
> | is_device_private_entry
>
> ...needs:

Yeah, the buildbot also complained. I've fixed this up locally now.

2022-02-09 15:38:38

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

On Wed, Feb 09, 2022 at 08:29:56AM -0400, Jason Gunthorpe wrote:
> It is nice, but the other series are still impacted by the fsdax mess
> - they still stuff pages into ptes without proper refcounts and have
> to carry nonsense to dance around this problem.
>
> I certainly would be unhappy if the amd driver, for instance, gained
> the fsdax problem as well and started pushing 4k pages into PMDs.

As said before: I think this all needs to be fixed. But I'd rather
fix it gradually and I think this series is a nice step forward.
After that we can look at the pte mappings.

2022-02-09 17:19:37

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH 7/8] mm: remove the extra ZONE_DEVICE struct page refcount

On Wed, Feb 09, 2022 at 02:53:51PM +0100, Christoph Hellwig wrote:
> On Wed, Feb 09, 2022 at 08:29:56AM -0400, Jason Gunthorpe wrote:
> > It is nice, but the other series are still impacted by the fsdax mess
> > - they still stuff pages into ptes without proper refcounts and have
> > to carry nonsense to dance around this problem.
> >
> > I certainly would be unhappy if the amd driver, for instance, gained
> > the fsdax problem as well and started pushing 4k pages into PMDs.
>
> As said before: I think this all needs to be fixed. But I'd rather
> fix it gradually and I think this series is a nice step forward.
> After that we can look at the pte mappings.

Right, I agree with this

Jason

2022-02-09 18:12:29

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

On Mon, Feb 07, 2022 at 04:19:29PM -0500, Felix Kuehling wrote:
>
> Am 2022-02-07 um 01:32 schrieb Christoph Hellwig:
>> Move the check for the actual pgmap types that need the free at refcount
>> one behavior into the out of line helper, and thus avoid the need to
>> pull memremap.h into mm.h.
>>
>> Signed-off-by: Christoph Hellwig <[email protected]>
>
> The amdkfd part looks good to me.
>
> It looks like this patch is not based on Alex Sierra's coherent memory
> series. He added two new helpers is_device_coherent_page and
> is_dev_private_or_coherent_page that would need to be moved along with
> is_device_private_page and is_pci_p2pdma_page.

FYI, here is a branch that contains a rebase of the coherent memory
related patches on top of this series:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/pgmap-refcount

I don't have a good way to test this, but I'll at least let the build bot
finish before sending it out (probably tomorrow).

2022-02-10 02:11:47

by Alistair Popple

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

On Thursday, 10 February 2022 4:48:36 AM AEDT Christoph Hellwig wrote:
> On Mon, Feb 07, 2022 at 04:19:29PM -0500, Felix Kuehling wrote:
> >
> > Am 2022-02-07 um 01:32 schrieb Christoph Hellwig:
> >> Move the check for the actual pgmap types that need the free at refcount
> >> one behavior into the out of line helper, and thus avoid the need to
> >> pull memremap.h into mm.h.
> >>
> >> Signed-off-by: Christoph Hellwig <[email protected]>
> >
> > The amdkfd part looks good to me.
> >
> > It looks like this patch is not based on Alex Sierra's coherent memory
> > series. He added two new helpers is_device_coherent_page and
> > is_dev_private_or_coherent_page that would need to be moved along with
> > is_device_private_page and is_pci_p2pdma_page.
>
> FYI, here is a branch that contains a rebase of the coherent memory
> related patches on top of this series:
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/pgmap-refcount
>
> I don't have a good way to test this, but I'll at least let the build bot
> finish before sending it out (probably tomorrow).

Thanks, I ran up hmm-test which revealed a few minor problems with the rebase.
Fixes below.

---

diff --git a/mm/gup.c b/mm/gup.c
index cbb49abb7992..8e85c9fb8df4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2007,7 +2007,6 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
if (!ret && list_empty(&movable_page_list) && !isolation_error_count)
return nr_pages;

- ret = 0;
unpin_pages:
for (i = 0; i < nr_pages; i++)
if (!pages[i])
diff --git a/mm/migrate.c b/mm/migrate.c
index f909f5a92757..1ae3e99baa50 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2686,12 +2686,11 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_readable_device_private_entry(
page_to_pfn(page));
entry = swp_entry_to_pte(swp_entry);
- } else {
- if (is_zone_device_page(page) &&
- is_device_coherent_page(page)) {
+ } else if (is_zone_device_page(page) &&
+ !is_device_coherent_page(page)) {
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
goto abort;
- }
+ } else {
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));




2022-02-10 06:54:55

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>

On Thu, Feb 10, 2022 at 01:10:47PM +1100, Alistair Popple wrote:
> diff --git a/mm/gup.c b/mm/gup.c
> index cbb49abb7992..8e85c9fb8df4 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2007,7 +2007,6 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
> if (!ret && list_empty(&movable_page_list) && !isolation_error_count)
> return nr_pages;
>
> - ret = 0;
> unpin_pages:

This isn't quite correct as ret is initially set to -EFAULT now. I'll
fix it by removing the early ret initialization and always using the
goto. I've also added another refactoring patch for this messy function.

I've folded the inversion of the is_device_coherent_page check in
migrate.c in as well, thanks!

2022-02-13 11:29:15

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH 6/8] mm: don't include <linux/memremap.h> in <linux/mm.h>


Am 2022-02-09 um 12:48 schrieb Christoph Hellwig:
> On Mon, Feb 07, 2022 at 04:19:29PM -0500, Felix Kuehling wrote:
>> Am 2022-02-07 um 01:32 schrieb Christoph Hellwig:
>>> Move the check for the actual pgmap types that need the free at refcount
>>> one behavior into the out of line helper, and thus avoid the need to
>>> pull memremap.h into mm.h.
>>>
>>> Signed-off-by: Christoph Hellwig <[email protected]>
>> The amdkfd part looks good to me.
>>
>> It looks like this patch is not based on Alex Sierra's coherent memory
>> series. He added two new helpers is_device_coherent_page and
>> is_dev_private_or_coherent_page that would need to be moved along with
>> is_device_private_page and is_pci_p2pdma_page.
> FYI, here is a branch that contains a rebase of the coherent memory
> related patches on top of this series:
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/pgmap-refcount
>
> I don't have a good way to test this, but I'll at least let the build bot
> finish before sending it out (probably tomorrow).

Thank you for taking care of this rebase! Alex tested it on one of our
coherent memory systems and it passed our tests.

I see you also included these rebased patches in your latest 27-patch
series. I'll try to review the changes in more detail over the weekend.

Regards,
  Felix