In order to keep pfn_to_page() a simple offset calculation, the 'struct
page' memmap needs to be mapped and initialized in advance of any usage
of a page. This poses a problem for large memory systems as it delays
full availability of memory resources for 10s to 100s of seconds.
For typical 'System RAM' the problem is mitigated by the fact that large
memory allocations tend to happen after the kernel has fully initialized
and userspace services / applications are launched. A small amount, 2GB
of memory, is initialized up front. The remainder is initialized in the
background and freed to the page allocator over time.
Unfortunately, that scheme is not directly reusable for persistent
memory and dax because userspace has visibility into the entire resource
pool and can directly access any offset at any time. In other words,
there is no allocator indirection where the kernel can satisfy requests
with arbitrary pages as they become initialized.
That said, we can approximate the optimization by performing the
initialization in the background: allow the kernel to fully boot the
platform, start up pmem block devices, mount filesystems in dax mode,
and only incur the delay at the first userspace dax fault.
With this change an 8-socket system was observed to initialize pmem
namespaces in ~4 seconds, whereas it previously took ~4 minutes.
These patches apply on top of the HMM + devm_memremap_pages() reworks
[1]. Andrew, once the reviews come back, please consider this series for
-mm as well.
[1]: https://lkml.org/lkml/2018/6/19/108
---
Dan Williams (9):
mm: Plumb dev_pagemap instead of vmem_altmap to memmap_init_zone()
mm: Enable asynchronous __add_pages() and vmemmap_populate_hugepages()
mm: Teach memmap_init_zone() to initialize ZONE_DEVICE pages
mm: Multithread ZONE_DEVICE initialization
mm: Allow an external agent to wait for memmap initialization
filesystem-dax: Make mount time pfn validation a debug check
libnvdimm, pmem: Initialize the memmap in the background
device-dax: Initialize the memmap in the background
libnvdimm, namespace: Publish page structure init state / control
Huaisheng Ye (4):
nvdimm/pmem: check the validity of the pointer pfn
nvdimm/pmem-dax: check the validity of the pointer pfn
s390/block/dcssblk: check the validity of the pointer pfn
fs/dax: Assign NULL to pfn of dax_direct_access if useless
arch/ia64/mm/init.c | 5 +
arch/powerpc/mm/mem.c | 5 +
arch/s390/mm/init.c | 8 +
arch/sh/mm/init.c | 5 +
arch/x86/mm/init_32.c | 8 +
arch/x86/mm/init_64.c | 27 +++--
drivers/dax/Kconfig | 10 ++
drivers/dax/dax-private.h | 2
drivers/dax/device-dax.h | 2
drivers/dax/device.c | 16 +++
drivers/dax/pmem.c | 5 +
drivers/dax/super.c | 64 +++++++-----
drivers/nvdimm/nd.h | 2
drivers/nvdimm/pfn_devs.c | 54 ++++++++--
drivers/nvdimm/pmem.c | 17 ++-
drivers/nvdimm/pmem.h | 1
drivers/s390/block/dcssblk.c | 5 +
fs/dax.c | 10 +-
include/linux/memmap_async.h | 55 ++++++++++
include/linux/memory_hotplug.h | 18 ++-
include/linux/memremap.h | 31 ++++++
include/linux/mm.h | 8 +
kernel/memremap.c | 85 ++++++++-------
mm/memory_hotplug.c | 73 ++++++++++---
mm/page_alloc.c | 215 +++++++++++++++++++++++++++++++++------
mm/sparse-vmemmap.c | 56 ++++++++--
tools/testing/nvdimm/pmem-dax.c | 11 ++
27 files changed, 610 insertions(+), 188 deletions(-)
create mode 100644 include/linux/memmap_async.h
In preparation for teaching memmap_init_zone() how to initialize
ZONE_DEVICE pages, pass in dev_pagemap.
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
include/linux/memory_hotplug.h | 3 ++-
include/linux/mm.h | 2 +-
kernel/memremap.c | 2 +-
mm/memory_hotplug.c | 4 ++--
mm/page_alloc.c | 5 ++++-
5 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4e9828cda7a2..e60085b2824d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -14,6 +14,7 @@ struct mem_section;
struct memory_block;
struct resource;
struct vmem_altmap;
+struct dev_pagemap;
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -326,7 +327,7 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online);
extern int arch_add_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap, bool want_memblock);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap);
+ unsigned long nr_pages, struct dev_pagemap *pgmap);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern bool is_memblock_offlined(struct memory_block *mem);
extern void remove_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0fbb9ffe380..319d01372efa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2140,7 +2140,7 @@ static inline void zero_resv_unavail(void) {}
extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
- enum memmap_context, struct vmem_altmap *);
+ enum memmap_context, struct dev_pagemap *);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index ecee37b44aa1..58327259420d 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -244,7 +244,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
if (!error)
move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, altmap);
+ align_size >> PAGE_SHIFT, pgmap);
}
mem_hotplug_done();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7deb49f69e27..aae4e6cc65e9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -779,7 +779,7 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
}
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages, struct dev_pagemap *pgmap)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -805,7 +805,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMMAP_HOTPLUG, altmap);
+ MEMMAP_HOTPLUG, pgmap);
set_zone_contiguous(zone);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..545a5860cce7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5459,10 +5459,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
- struct vmem_altmap *altmap)
+ struct dev_pagemap *pgmap)
{
unsigned long end_pfn = start_pfn + size;
pg_data_t *pgdat = NODE_DATA(nid);
+ struct vmem_altmap *altmap = NULL;
unsigned long pfn;
unsigned long nr_initialised = 0;
struct page *page;
@@ -5477,6 +5478,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* Honor reservation requested by the driver for this ZONE_DEVICE
* memory
*/
+ if (pgmap && pgmap->altmap_valid)
+ altmap = &pgmap->altmap;
if (altmap && start_pfn == altmap->base_pfn)
start_pfn += altmap->reserve;
Rather than run a loop over the freshly initialized pages in
devm_memremap_pages() *after* arch_add_memory() returns, teach
memmap_init_zone() to return the pages fully initialized. This is in
preparation for multi-threading the page initialization work, but it
also has a straight-line performance benefit: it avoids another loop of
cache misses across a large (100s of GBs to TBs) address range.
Cc: Andrew Morton <[email protected]>
Cc: Logan Gunthorpe <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
kernel/memremap.c | 16 +---------------
mm/page_alloc.c | 19 +++++++++++++++++++
2 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b861fe909932..85e4a7c576b2 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -173,8 +173,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
struct vmem_altmap *altmap = pgmap->altmap_valid ?
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
- unsigned long pfn, pgoff, order;
pgprot_t pgprot = PAGE_KERNEL;
+ unsigned long pgoff, order;
int error, nid, is_ram;
if (!pgmap->ref || !kill)
@@ -251,20 +251,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
if (error)
goto err_add_memory;
- for_each_device_pfn(pfn, pgmap) {
- struct page *page = pfn_to_page(pfn);
-
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer. It is a bug if a ZONE_DEVICE page is ever
- * freed or placed on a driver-private list. Seed the
- * storage with LIST_POISON* values.
- */
- list_del(&page->lru);
- page->pgmap = pgmap;
- percpu_ref_get(pgmap->ref);
- }
-
pgmap->kill = kill;
error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
pgmap);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f83682ef006e..fb45cfeb4a50 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5548,6 +5548,25 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+
+ if (is_zone_device_page(page)) {
+ if (WARN_ON_ONCE(!pgmap))
+ continue;
+
+ /* skip invalid device pages */
+ if (altmap && (pfn < (altmap->base_pfn
+ + vmem_altmap_offset(altmap))))
+ continue;
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer. It is a bug if a ZONE_DEVICE page is ever
+ * freed or placed on a driver-private list. Seed the
+ * storage with poison.
+ */
+ page->lru.prev = LIST_POISON2;
+ page->pgmap = pgmap;
+ percpu_ref_get(pgmap->ref);
+ }
}
}
On large / multi-socket persistent memory systems it can potentially
take minutes to initialize the memmap. Even though such systems have
multiple persistent memory namespaces that are registered
asynchronously, they serialize on the mem_hotplug_begin() lock.
The method for hiding memmap initialization in the typical memory case
cannot be directly reused for persistent memory. In the typical /
volatile memory case, pages are freed to the memory allocator in the
background as they become initialized. For persistent memory the aim is
to push everything to the background, but since it is dax mapped there
is no way to redirect applications to limit their usage to the
initialized set. I.e. any address may be directly accessed at any time.
The bulk of the work is memmap_init_zone(). Splitting the work into
threads yields a 1.5x to 2x improvement in the time it takes to
initialize a 128GB namespace. However, the work is still serialized when
there are multiple namespaces, and it is ultimately limited by
memory-media write bandwidth. So, this commit is only a preparation step
towards ultimately moving all memmap initialization completely into the
background.
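As a rough illustration of the partitioning, here is a stand-alone
userspace sketch (not kernel code; the pfn and size are made up) that
mirrors the step / rem split the patch performs in memmap_init_zone(),
with the last thread absorbing the remainder:

#include <stdio.h>

#define NR_MEMMAP_THREADS 16

int main(void)
{
	/* example: 128GB of 4K pages starting at an arbitrary pfn */
	unsigned long start_pfn = 0x1080000;
	unsigned long size = (128UL << 30) >> 12;	/* in pfns */
	unsigned long step = size / NR_MEMMAP_THREADS;
	unsigned long rem = size % NR_MEMMAP_THREADS;
	int i;

	for (i = 0; i < NR_MEMMAP_THREADS; i++) {
		/* the last thread also initializes the remainder */
		unsigned long nr = step + (i == NR_MEMMAP_THREADS - 1 ? rem : 0);

		printf("thread %2d: pfn [%#lx, %#lx)\n",
				i, start_pfn, start_pfn + nr);
		start_pfn += nr;
	}
	return 0;
}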
Cc: Andrew Morton <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
include/linux/memmap_async.h | 17 +++++
mm/page_alloc.c | 145 ++++++++++++++++++++++++++++--------------
2 files changed, 113 insertions(+), 49 deletions(-)
diff --git a/include/linux/memmap_async.h b/include/linux/memmap_async.h
index 11aa9f3a523e..d2011681a910 100644
--- a/include/linux/memmap_async.h
+++ b/include/linux/memmap_async.h
@@ -2,12 +2,24 @@
#ifndef __LINUX_MEMMAP_ASYNC_H
#define __LINUX_MEMMAP_ASYNC_H
#include <linux/async.h>
+#include <linux/ioport.h>
+struct dev_pagemap;
struct vmem_altmap;
+/*
+ * Regardless of how many threads we request here the workqueue core may
+ * limit based on the amount of other concurrent 'async' work in the
+ * system, see WQ_MAX_ACTIVE
+ */
+#define NR_MEMMAP_THREADS 16
+
struct memmap_init_env {
struct vmem_altmap *altmap;
+ struct dev_pagemap *pgmap;
bool want_memblock;
+ unsigned long zone;
+ int context;
int nid;
};
@@ -19,6 +31,11 @@ struct memmap_init_memmap {
int result;
};
+struct memmap_init_pages {
+ struct resource res;
+ struct memmap_init_env *env;
+};
+
struct memmap_async_state {
struct memmap_init_env env;
struct memmap_init_memmap memmap;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fb45cfeb4a50..6d0ed17cf305 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -38,6 +38,7 @@
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
+#include <linux/memmap_async.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
@@ -5455,6 +5456,68 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
ASYNC_DOMAIN_EXCLUSIVE(memmap_init_domain);
+static void __meminit memmap_init_one(unsigned long pfn, unsigned long zone,
+ int nid, enum memmap_context context, struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone, nid);
+ if (context == MEMMAP_HOTPLUG)
+ SetPageReserved(page);
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations to
+ * reserve their blocks rather than leaking throughout the
+ * address space during boot when many long-lived kernel
+ * allocations are made.
+ *
+ * bitmap is created for zone's valid pfn range. but memmap can
+ * be created for invalid pages (for alignment) check here not
+ * to call set_pageblock_migratetype() against pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
+ */
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+
+ if (is_zone_device_page(page)) {
+ struct vmem_altmap *altmap = &pgmap->altmap;
+
+ if (WARN_ON_ONCE(!pgmap))
+ return;
+
+ /* skip invalid device pages */
+ if (pgmap->altmap_valid && (pfn < (altmap->base_pfn
+ + vmem_altmap_offset(altmap))))
+ return;
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back
+ * pointer. It is a bug if a ZONE_DEVICE page is ever
+ * freed or placed on a driver-private list. Seed the
+ * storage with poison.
+ */
+ page->lru.prev = LIST_POISON2;
+ page->pgmap = pgmap;
+ percpu_ref_get(pgmap->ref);
+ }
+}
+
+static void __ref memmap_init_async(void *data, async_cookie_t cookie)
+{
+ struct memmap_init_pages *args = data;
+ struct memmap_init_env *env = args->env;
+ struct resource *res = &args->res;
+ unsigned long pfn;
+
+ for (pfn = PHYS_PFN(res->start); pfn < PHYS_PFN(res->end+1); pfn++)
+ memmap_init_one(pfn, env->zone, env->nid, env->context,
+ env->pgmap);
+}
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
@@ -5469,7 +5532,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
struct vmem_altmap *altmap = NULL;
unsigned long pfn;
unsigned long nr_initialised = 0;
- struct page *page;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
struct memblock_region *r = NULL, *tmp;
#endif
@@ -5486,14 +5548,43 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (altmap && start_pfn == altmap->base_pfn)
start_pfn += altmap->reserve;
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (context != MEMMAP_EARLY) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
+ ASYNC_DOMAIN_EXCLUSIVE(local);
+ struct memmap_init_pages args[NR_MEMMAP_THREADS];
+ struct memmap_init_env env = {
+ .nid = nid,
+ .zone = zone,
+ .pgmap = pgmap,
+ .context = context,
+ };
+ unsigned long step, rem;
+ int i;
+
+ size = end_pfn - start_pfn;
+ step = size / NR_MEMMAP_THREADS;
+ rem = size % NR_MEMMAP_THREADS;
+ for (i = 0; i < NR_MEMMAP_THREADS; i++) {
+ struct memmap_init_pages *t = &args[i];
+
+ t->env = &env;
+ t->res.start = PFN_PHYS(start_pfn);
+ t->res.end = PFN_PHYS(start_pfn + step) - 1;
+ if (i == NR_MEMMAP_THREADS-1)
+ t->res.end += PFN_PHYS(rem);
+
+ async_schedule_domain(memmap_init_async, t, &local);
+
+ start_pfn += step;
+ }
+ async_synchronize_full_domain(&local);
+ return;
+ }
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
@@ -5522,51 +5613,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
}
#endif
-
-not_early:
- page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
- SetPageReserved(page);
-
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
- */
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
-
- if (is_zone_device_page(page)) {
- if (WARN_ON_ONCE(!pgmap))
- continue;
-
- /* skip invalid device pages */
- if (altmap && (pfn < (altmap->base_pfn
- + vmem_altmap_offset(altmap))))
- continue;
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back
- * pointer. It is a bug if a ZONE_DEVICE page is ever
- * freed or placed on a driver-private list. Seed the
- * storage with poison.
- */
- page->lru.prev = LIST_POISON2;
- page->pgmap = pgmap;
- percpu_ref_get(pgmap->ref);
- }
+ memmap_init_one(pfn, zone, nid, context, NULL);
}
}
From: Huaisheng Ye <[email protected]>
direct_access() needs to check the pfn pointer for validity before
assigning to it. If pfn is NULL, there is no need to calculate the value.
Suggested-by: Dan Williams <[email protected]>
Signed-off-by: Huaisheng Ye <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
tools/testing/nvdimm/pmem-dax.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index b53596ad601b..d4cb5281b30e 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -33,7 +33,8 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
*kaddr = pmem->virt_addr + offset;
page = vmalloc_to_page(pmem->virt_addr + offset);
- *pfn = page_to_pfn_t(page);
+ if (pfn)
+ *pfn = page_to_pfn_t(page);
pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n",
__func__, pmem, pgoff, page_to_pfn(page));
@@ -41,7 +42,8 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
}
*kaddr = pmem->virt_addr + offset;
- *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+ if (pfn)
+ *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
/*
* If badblocks are present, limit known good range to the
Applications may want to know that page structure initialization is
complete rather than be subject to delays at the first DAX fault. Also,
page structure initialization consumes CPU resources and impacts
application performance, so an environment may want to wait before
considering the system fully initialized.
Provide a sysfs attribute that displays the current state and, when
written with 'sync', waits for memmap initialization to complete.
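For illustration, a minimal userspace sketch of how a launcher could
block on 'sync' and then read back the state (the sysfs path is
hypothetical; the attribute is created under the namespace's nd_pfn
device):

#include <stdio.h>

int main(void)
{
	/* hypothetical path; the attribute lives under the nd_pfn device */
	const char *path = "/sys/bus/nd/devices/pfn0.1/memmap_state";
	char state[16];
	FILE *f = fopen(path, "r+");

	if (!f) {
		perror(path);
		return 1;
	}
	/* writing "sync" blocks until the active init threads finish */
	fputs("sync", f);
	fflush(f);
	/* reposition before switching from writing to reading */
	rewind(f);
	if (fgets(state, sizeof(state), f))
		printf("memmap_state: %s", state);	/* "idle" once done */
	fclose(f);
	return 0;
}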
Cc: Ross Zwisler <[email protected]>
Cc: Vishal Verma <[email protected]>
Cc: Dave Jiang <[email protected]>
Cc: Johannes Thumshirn <[email protected]>
Cc: Jeff Moyer <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
drivers/nvdimm/pfn_devs.c | 53 +++++++++++++++++++++++++++++++++++----------
mm/page_alloc.c | 1 +
2 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 147c62e2ef2b..00f1792d070c 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -1,15 +1,6 @@
-/*
- * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2013-2018 Intel Corporation. All rights reserved. */
+#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
#include <linux/blkdev.h>
#include <linux/device.h>
@@ -103,6 +94,43 @@ static ssize_t mode_store(struct device *dev,
}
static DEVICE_ATTR_RW(mode);
+static ssize_t memmap_state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ struct memmap_async_state *async = &nd_pfn->async;
+
+ return sprintf(buf, "%s\n", bitmap_weight(async->active,
+ NR_MEMMAP_THREADS) ? "active" : "idle");
+}
+
+static ssize_t memmap_state_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int i;
+ struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
+ struct memmap_async_state *async = &nd_pfn->async;
+
+ if (strcmp(buf, "sync") == 0)
+ /* pass */;
+ else if (strcmp(buf, "sync\n") == 0)
+ /* pass */;
+ else
+ return -EINVAL;
+
+ for (i = 0; i < NR_MEMMAP_THREADS; i++) {
+ struct memmap_init_pages *thread = &async->page_init[i];
+
+ if (!test_bit(i, async->active))
+ continue;
+ async_synchronize_cookie_domain(thread->cookie,
+ &memmap_init_domain);
+ }
+
+ return len;
+}
+static DEVICE_ATTR_RW(memmap_state);
+
static ssize_t align_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -279,6 +307,7 @@ static struct attribute *nd_pfn_attributes[] = {
&dev_attr_resource.attr,
&dev_attr_size.attr,
&dev_attr_supported_alignments.attr,
+ &dev_attr_memmap_state.attr,
NULL,
};
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1466dd82bc2..90414c1d2ca8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5456,6 +5456,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
}
ASYNC_DOMAIN_EXCLUSIVE(memmap_init_domain);
+EXPORT_SYMBOL_GPL(memmap_init_domain);
static void __meminit memmap_init_one(unsigned long pfn, unsigned long zone,
int nid, enum memmap_context context, struct dev_pagemap *pgmap)
Given that nd_dax devices share device data with nd_pfn devices, arrange
for the nd_pfn memmap_async_state instance to be registered by the
devm_memremap_pages() call in the dax-pmem driver. Then, provide the
generic dev_pagemap instance to the device-dax driver so that it can
utilize memmap_sync() before dax-mapping pfns.
Cc: Dave Jiang <[email protected]>
Cc: Ross Zwisler <[email protected]>
Cc: Vishal Verma <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
drivers/dax/dax-private.h | 2 ++
drivers/dax/device-dax.h | 2 +-
drivers/dax/device.c | 16 +++++++++++++++-
drivers/dax/pmem.c | 5 ++++-
4 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index b6fc4f04636d..35bda544b334 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -25,6 +25,7 @@
* @align: allocation and mapping alignment for child dax devices
* @res: physical address range of the region
* @pfn_flags: identify whether the pfns are paged back or not
+ * @pgmap: backing page map for the device address range
*/
struct dax_region {
int id;
@@ -35,6 +36,7 @@ struct dax_region {
unsigned int align;
struct resource res;
unsigned long pfn_flags;
+ struct dev_pagemap *pgmap;
};
/**
diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h
index 688b051750bd..1a2da8072a6e 100644
--- a/drivers/dax/device-dax.h
+++ b/drivers/dax/device-dax.h
@@ -19,7 +19,7 @@ struct dax_region;
void dax_region_put(struct dax_region *dax_region);
struct dax_region *alloc_dax_region(struct device *parent,
int region_id, struct resource *res, unsigned int align,
- void *addr, unsigned long flags);
+ void *addr, struct dev_pagemap *pgmap, unsigned long flags);
struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
int id, struct resource *res, int count);
#endif /* __DEVICE_DAX_H__ */
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index de2f8297a210..2802d21a6e26 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -10,6 +10,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
+#include <linux/memmap_async.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
@@ -101,7 +102,7 @@ static void dax_region_unregister(void *region)
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
struct resource *res, unsigned int align, void *addr,
- unsigned long pfn_flags)
+ struct dev_pagemap *pgmap, unsigned long pfn_flags)
{
struct dax_region *dax_region;
@@ -130,6 +131,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
dax_region->id = region_id;
ida_init(&dax_region->ida);
dax_region->align = align;
+ dax_region->pgmap = pgmap;
dax_region->dev = parent;
dax_region->base = addr;
if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
@@ -244,6 +246,15 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
return -1;
}
+static void dax_pfn_sync(struct dax_region *dax_region, pfn_t pfn,
+ unsigned long size)
+{
+ struct dev_pagemap *pgmap = dax_region->pgmap;
+ struct memmap_async_state *async = pgmap->async;
+
+ memmap_sync(pfn, PHYS_PFN(size), async);
+}
+
static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
struct device *dev = &dev_dax->dev;
@@ -273,6 +284,7 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
}
pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ dax_pfn_sync(dax_region, pfn, PAGE_SIZE);
rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
@@ -328,6 +340,7 @@ static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
}
pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ dax_pfn_sync(dax_region, pfn, PMD_SIZE);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
vmf->flags & FAULT_FLAG_WRITE);
@@ -379,6 +392,7 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
}
pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ dax_pfn_sync(dax_region, pfn, PUD_SIZE);
return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
vmf->flags & FAULT_FLAG_WRITE);
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 54cba20c8ba6..a05be7a03d02 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -11,6 +11,7 @@
* General Public License for more details.
*/
#include <linux/percpu-refcount.h>
+#include <linux/memmap_async.h>
#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/pfn_t.h>
@@ -110,6 +111,7 @@ static int dax_pmem_probe(struct device *dev)
return rc;
dax_pmem->pgmap.ref = &dax_pmem->ref;
+ dax_pmem->pgmap.async = &nd_pfn->async;
addr = devm_memremap_pages(dev, &dax_pmem->pgmap, dax_pmem_percpu_kill);
if (IS_ERR(addr))
return PTR_ERR(addr);
@@ -123,7 +125,8 @@ static int dax_pmem_probe(struct device *dev)
return -EINVAL;
dax_region = alloc_dax_region(dev, region_id, &res,
- le32_to_cpu(pfn_sb->align), addr, PFN_DEV|PFN_MAP);
+ le32_to_cpu(pfn_sb->align), addr, &dax_pmem->pgmap,
+ PFN_DEV|PFN_MAP);
if (!dax_region)
return -ENOMEM;
Arrange for the pmem driver to call memmap_sync() when it is asked to
produce a valid pfn. The infrastructure is housed in the 'nd_pfn'
device, which implies that the async init support only exists for
platform-defined persistent memory, not the legacy / debug memmap=ss!nn
facility.
Another reason to restrict the capability to the 'nd_pfn' device case is
that nd_pfn devices have sysfs infrastructure to communicate the
memmap initialization state to userspace.
The sysfs publication of memmap init state is saved for a later patch.
Cc: Ross Zwisler <[email protected]>
Cc: Vishal Verma <[email protected]>
Cc: Dave Jiang <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
drivers/nvdimm/nd.h | 2 ++
drivers/nvdimm/pmem.c | 16 ++++++++++++----
drivers/nvdimm/pmem.h | 1 +
tools/testing/nvdimm/pmem-dax.c | 7 ++++++-
4 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 32e0364b48b9..ee4f76fb0cb5 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -12,6 +12,7 @@
*/
#ifndef __ND_H__
#define __ND_H__
+#include <linux/memmap_async.h>
#include <linux/libnvdimm.h>
#include <linux/badblocks.h>
#include <linux/blkdev.h>
@@ -208,6 +209,7 @@ struct nd_pfn {
unsigned long npfns;
enum nd_pfn_mode mode;
struct nd_pfn_sb *pfn_sb;
+ struct memmap_async_state async;
struct nd_namespace_common *ndns;
};
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c430536320a5..a1158181adc2 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -22,6 +22,7 @@
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/memmap_async.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
@@ -228,8 +229,13 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
PFN_PHYS(nr_pages))))
return -EIO;
*kaddr = pmem->virt_addr + offset;
- if (pfn)
+ if (pfn) {
+ struct dev_pagemap *pgmap = &pmem->pgmap;
+ struct memmap_async_state *async = pgmap->async;
+
*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+ memmap_sync(*pfn, nr_pages, async);
+ }
/*
* If badblocks are present, limit known good range to the
@@ -310,13 +316,15 @@ static void fsdax_pagefree(struct page *page, void *data)
wake_up_var(&page->_refcount);
}
-static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
+static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap,
+ struct memmap_async_state *async)
{
dev_pagemap_get_ops();
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
return -ENOMEM;
pgmap->type = MEMORY_DEVICE_FS_DAX;
pgmap->page_free = fsdax_pagefree;
+ pgmap->async = async;
return 0;
}
@@ -379,7 +387,7 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter;
if (is_nd_pfn(dev)) {
- if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+ if (setup_pagemap_fsdax(dev, &pmem->pgmap, &nd_pfn->async))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap,
pmem_freeze_queue);
@@ -393,7 +401,7 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) {
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
pmem->pgmap.altmap_valid = false;
- if (setup_pagemap_fsdax(dev, &pmem->pgmap))
+ if (setup_pagemap_fsdax(dev, &pmem->pgmap, NULL))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap,
pmem_freeze_queue);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index a64ebc78b5df..93d226ea1006 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NVDIMM_PMEM_H__
#define __NVDIMM_PMEM_H__
+#include <linux/memmap_async.h>
#include <linux/badblocks.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index d4cb5281b30e..63151b75615c 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -42,8 +42,13 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
}
*kaddr = pmem->virt_addr + offset;
- if (pfn)
+ if (pfn) {
+ struct dev_pagemap *pgmap = &pmem->pgmap;
+ struct memmap_async_state *async = pgmap->async;
+
*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+ memmap_sync(*pfn, nr_pages, async);
+ }
/*
* If badblocks are present, limit known good range to the
From: Huaisheng Ye <[email protected]>
direct_access() needs to check the pfn pointer for validity before
assigning to it. If pfn is NULL, there is no need to calculate the value.
Signed-off-by: Huaisheng Ye <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
drivers/s390/block/dcssblk.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index ed607288e696..a645b2c93c34 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -923,8 +923,9 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
dev_sz = dev_info->end - dev_info->start + 1;
*kaddr = (void *) dev_info->start + offset;
- *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset),
- PFN_DEV|PFN_SPECIAL);
+ if (pfn)
+ *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset),
+ PFN_DEV|PFN_SPECIAL);
return (dev_sz - offset) / PAGE_SIZE;
}
Now that memmap_init_zone() knows how to split the init work into
multiple threads, allow the tracking for those threads to be handled
via a passed-in 'struct memmap_async_state' instance.
This infrastructure allows devm_memremap_pages() users, like the pmem
driver, to track memmap initialization in the background and to use
memmap_sync() when performing an operation that may result in a
pfn_to_page(), like dax mapping a pfn into userspace.
The approach mirrors what is done for background memmap initialization
and defers waiting for initialization to complete until the first
userspace consumer arrives.
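The per-thread pfn ranges are recorded in a multi-order radix tree keyed
by pfn, using the foreach_order_pgoff() / order_at() helpers that this
patch moves into memremap.h. Purely to illustrate that arithmetic, here
is a stand-alone userspace sketch (example numbers only) showing how a
range decomposes into naturally aligned power-of-2 chunks:

#include <stdio.h>

static unsigned long rounddown_pow_of_two(unsigned long n)
{
	return 1UL << (63 - __builtin_clzl(n));	/* assumes 64-bit long */
}

/* largest naturally aligned power-of-2 order that fits at this offset */
static unsigned long order_at(unsigned long start_pfn, unsigned long nr_pages,
		unsigned long pgoff)
{
	unsigned long mask;

	if (pgoff == nr_pages)
		return ~0UL;
	mask = (start_pfn + pgoff) | rounddown_pow_of_two(nr_pages - pgoff);
	return __builtin_ctzl(mask);
}

int main(void)
{
	/* example range: 0x180 pages starting at pfn 0x110 */
	unsigned long start_pfn = 0x110, nr_pages = 0x180;
	unsigned long pgoff, order;

	for (pgoff = 0; (order = order_at(start_pfn, nr_pages, pgoff)) != ~0UL;
			pgoff += 1UL << order)
		printf("pfn %#lx: order %lu (%#lx pages)\n",
				start_pfn + pgoff, order, 1UL << order);
	return 0;
}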
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: "Jérôme Glisse" <[email protected]>
Cc: Logan Gunthorpe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
include/linux/memmap_async.h | 10 ++++
include/linux/memremap.h | 29 ++++++++++++
kernel/memremap.c | 65 ++++++++++++++++-----------
mm/page_alloc.c | 102 +++++++++++++++++++++++++++++++++++++-----
4 files changed, 169 insertions(+), 37 deletions(-)
diff --git a/include/linux/memmap_async.h b/include/linux/memmap_async.h
index d2011681a910..4633eca9290e 100644
--- a/include/linux/memmap_async.h
+++ b/include/linux/memmap_async.h
@@ -3,6 +3,9 @@
#define __LINUX_MEMMAP_ASYNC_H
#include <linux/async.h>
#include <linux/ioport.h>
+#include <linux/async.h>
+#include <linux/pfn_t.h>
+#include <linux/radix-tree.h>
struct dev_pagemap;
struct vmem_altmap;
@@ -32,14 +35,21 @@ struct memmap_init_memmap {
};
struct memmap_init_pages {
+ int id;
struct resource res;
+ async_cookie_t cookie;
struct memmap_init_env *env;
};
struct memmap_async_state {
struct memmap_init_env env;
struct memmap_init_memmap memmap;
+ struct memmap_init_pages page_init[NR_MEMMAP_THREADS];
+ unsigned long active[BITS_TO_LONGS(NR_MEMMAP_THREADS)];
+ struct radix_tree_root pfn_to_thread;
};
extern struct async_domain memmap_init_domain;
+extern void memmap_sync(pfn_t pfn, unsigned long nr_pages,
+ struct memmap_async_state *async);
#endif /* __LINUX_MEMMAP_ASYNC_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bfdc7363b13b..a2313fadd686 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_
+#include <linux/pfn.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>
@@ -101,6 +102,7 @@ typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
pmd_t *pmdp);
typedef void (*dev_page_free_t)(struct page *page, void *data);
+struct memmap_async_state;
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
* @page_fault: callback when CPU fault on an unaddressable device page
@@ -112,6 +114,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data);
* @dev: host device of the mapping for debug
* @data: private data pointer for page_free()
* @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @async: async memmap init context
*/
struct dev_pagemap {
dev_page_fault_t page_fault;
@@ -124,8 +127,34 @@ struct dev_pagemap {
struct device *dev;
void *data;
enum memory_type type;
+ struct memmap_async_state *async;
};
+static inline unsigned long order_at(struct resource *res, unsigned long pgoff)
+{
+ unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
+ unsigned long nr_pages, mask;
+
+ nr_pages = PHYS_PFN(resource_size(res));
+ if (nr_pages == pgoff)
+ return ULONG_MAX;
+
+ /*
+ * What is the largest aligned power-of-2 range available from
+ * this resource pgoff to the end of the resource range,
+ * considering the alignment of the current pgoff?
+ */
+ mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
+ if (!mask)
+ return ULONG_MAX;
+
+ return find_first_bit(&mask, BITS_PER_LONG);
+}
+
+#define foreach_order_pgoff(res, order, pgoff) \
+ for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
+ pgoff += 1UL << order, order = order_at((res), pgoff))
+
#ifdef CONFIG_ZONE_DEVICE
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
void (*kill)(struct percpu_ref *));
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 85e4a7c576b2..18719a596be5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -7,6 +7,7 @@
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/memmap_async.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/wait_bit.h>
@@ -16,31 +17,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-static unsigned long order_at(struct resource *res, unsigned long pgoff)
-{
- unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
- unsigned long nr_pages, mask;
-
- nr_pages = PHYS_PFN(resource_size(res));
- if (nr_pages == pgoff)
- return ULONG_MAX;
-
- /*
- * What is the largest aligned power-of-2 range available from
- * this resource pgoff to the end of the resource range,
- * considering the alignment of the current pgoff?
- */
- mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff);
- if (!mask)
- return ULONG_MAX;
-
- return find_first_bit(&mask, BITS_PER_LONG);
-}
-
-#define foreach_order_pgoff(res, order, pgoff) \
- for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
- pgoff += 1UL << order, order = order_at((res), pgoff))
-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
int device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
@@ -113,15 +89,46 @@ static unsigned long pfn_next(unsigned long pfn)
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+static void kill_memmap_async(struct memmap_async_state *async)
+{
+ struct radix_tree_iter iter;
+ void *slot;
+ int i;
+
+ if (!async)
+ return;
+
+ for (i = 0; i < NR_MEMMAP_THREADS; i++) {
+ async_cookie_t cookie;
+
+ if (!test_bit(i, async->active))
+ continue;
+
+ cookie = async->page_init[i].cookie;
+ async_synchronize_cookie_domain(cookie+1, &memmap_init_domain);
+ }
+ radix_tree_for_each_slot(slot, &async->pfn_to_thread, &iter, 0)
+ radix_tree_delete(&async->pfn_to_thread, iter.index);
+}
+
static void devm_memremap_pages_release(void *data)
{
struct dev_pagemap *pgmap = data;
struct device *dev = pgmap->dev;
struct resource *res = &pgmap->res;
resource_size_t align_start, align_size;
+ struct memmap_async_state *async = pgmap->async;
unsigned long pfn;
+ /*
+ * Once the pgmap is killed pgmap owners must disallow new
+ * direct_access / page mapping requests. I.e. memmap_sync()
+ * users must not race the teardown of the async->pfn_to_thread
+ * radix.
+ */
pgmap->kill(pgmap->ref);
+ kill_memmap_async(async);
+
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
@@ -240,7 +247,13 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
struct zone *zone;
error = arch_add_memory(nid, align_start, align_size, altmap,
- false, NULL);
+ false, pgmap->async);
+ if (error == -EWOULDBLOCK) {
+ /* fall back to synchronous */
+ pgmap->async = NULL;
+ error = arch_add_memory(nid, align_start, align_size,
+ altmap, false, NULL);
+ }
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
if (!error)
move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d0ed17cf305..d1466dd82bc2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/async.h>
+#include <linux/pfn_t.h>
#include <linux/nmi.h>
#include <asm/sections.h>
@@ -5510,12 +5511,80 @@ static void __ref memmap_init_async(void *data, async_cookie_t cookie)
{
struct memmap_init_pages *args = data;
struct memmap_init_env *env = args->env;
+ struct dev_pagemap *pgmap = env->pgmap;
+ struct memmap_async_state *async = pgmap ? pgmap->async : NULL;
struct resource *res = &args->res;
unsigned long pfn;
+ if (async)
+ async_synchronize_cookie_domain(async->memmap.cookie+1,
+ &memmap_init_domain);
+
for (pfn = PHYS_PFN(res->start); pfn < PHYS_PFN(res->end+1); pfn++)
memmap_init_one(pfn, env->zone, env->nid, env->context,
- env->pgmap);
+ pgmap);
+ if (async)
+ clear_bit(args->id, async->active);
+}
+
+void memmap_sync(pfn_t pfn, unsigned long nr_pages,
+ struct memmap_async_state *async)
+{
+ struct memmap_init_pages *args, *start, *end;
+ unsigned long raw_pfn = pfn_t_to_pfn(pfn);
+
+ if (!async || !pfn_t_has_page(pfn)
+ || !bitmap_weight(async->active, NR_MEMMAP_THREADS))
+ return;
+
+ start = radix_tree_lookup(&async->pfn_to_thread, raw_pfn);
+ end = radix_tree_lookup(&async->pfn_to_thread, raw_pfn + nr_pages - 1);
+ if (!start || !end) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ for (args = start; args <= end; args++) {
+ int id = args - &async->page_init[0];
+
+ async_synchronize_cookie_domain(args->cookie+1,
+ &memmap_init_domain);
+ pr_debug("%s: pfn: %#lx nr: %ld thread: %d\n",
+ __func__, raw_pfn, nr_pages, id);
+ }
+}
+EXPORT_SYMBOL_GPL(memmap_sync);
+
+static bool run_memmap_init(struct memmap_init_pages *thread,
+ struct memmap_async_state *async, struct async_domain *domain)
+{
+ struct resource *res = &thread->res;
+ unsigned long pgoff;
+ int order;
+
+ if (!async) {
+ async_schedule_domain(memmap_init_async, thread, domain);
+ return false;
+ }
+
+ thread->cookie = async_schedule_domain(memmap_init_async,
+ thread, domain);
+ set_bit(thread->id, async->active);
+ foreach_order_pgoff(res, order, pgoff) {
+ int rc = __radix_tree_insert(&async->pfn_to_thread,
+ PHYS_PFN(res->start) + pgoff, order, thread);
+ if (rc) {
+ /*
+ * Mark all threads inactive, and by returning
+ * false we'll sync all threads before returning
+ * from memmap_init_zone().
+ */
+ memset(async->active, 0, sizeof(unsigned long)
+ * BITS_TO_LONGS(NR_MEMMAP_THREADS));
+ return false;
+ }
+ }
+ return true;
}
/*
@@ -5554,33 +5623,44 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* function. They do not exist on hotplugged memory.
*/
ASYNC_DOMAIN_EXCLUSIVE(local);
- struct memmap_init_pages args[NR_MEMMAP_THREADS];
- struct memmap_init_env env = {
- .nid = nid,
- .zone = zone,
- .pgmap = pgmap,
- .context = context,
- };
+ struct memmap_async_state *async = pgmap ? pgmap->async : NULL;
+ struct memmap_init_pages _args[NR_MEMMAP_THREADS];
+ struct memmap_init_pages *args = async ? async->page_init : _args;
+ struct async_domain *domain;
+ struct memmap_init_env _env;
+ struct memmap_init_env *env = async ? &async->env : &_env;
unsigned long step, rem;
+ bool sync = !async;
int i;
+ domain = async ? &memmap_init_domain : &local;
+ env->pgmap = pgmap;
+ env->nid = nid;
+ env->zone = zone;
+ env->context = context;
+
size = end_pfn - start_pfn;
step = size / NR_MEMMAP_THREADS;
rem = size % NR_MEMMAP_THREADS;
+ if (async)
+ INIT_RADIX_TREE(&async->pfn_to_thread, GFP_KERNEL);
for (i = 0; i < NR_MEMMAP_THREADS; i++) {
struct memmap_init_pages *t = &args[i];
- t->env = &env;
+ t->id = i;
+ t->env = env;
t->res.start = PFN_PHYS(start_pfn);
t->res.end = PFN_PHYS(start_pfn + step) - 1;
if (i == NR_MEMMAP_THREADS-1)
t->res.end += PFN_PHYS(rem);
- async_schedule_domain(memmap_init_async, t, &local);
+ if (!run_memmap_init(t, async, domain))
+ sync = true;
start_pfn += step;
}
- async_synchronize_full_domain(&local);
+ if (sync)
+ async_synchronize_full_domain(domain);
return;
}
From: Huaisheng Ye <[email protected]>
Some functions within fs/dax don't need the pfn from direct_access().
Passing NULL as the pfn argument of dax_direct_access() is simpler and
more intuitive than supplying a useless local variable.
Signed-off-by: Huaisheng Ye <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
fs/dax.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 641192808bb6..28264ff4e343 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -647,7 +647,6 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
{
void *vto, *kaddr;
pgoff_t pgoff;
- pfn_t pfn;
long rc;
int id;
@@ -656,7 +655,7 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
@@ -1052,15 +1051,13 @@ int __dax_zero_page_range(struct block_device *bdev,
pgoff_t pgoff;
long rc, id;
void *kaddr;
- pfn_t pfn;
rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
- &pfn);
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
@@ -1116,7 +1113,6 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ssize_t map_len;
pgoff_t pgoff;
void *kaddr;
- pfn_t pfn;
if (fatal_signal_pending(current)) {
ret = -EINTR;
@@ -1128,7 +1124,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
break;
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
- &kaddr, &pfn);
+ &kaddr, NULL);
if (map_len < 0) {
ret = map_len;
break;
Do not ask dax_direct_access() to retrieve a pfn in the
DAX_DRIVER_DEBUG=n case. This avoids an early call to memmap_sync() in
the driver.
Now that QUEUE_FLAG_DAX usage has been fixed, the validation of the pfn
is only useful for dax driver developers. It is safe to assume that
pmem, dcssblk, and device-mapper-dax are correct with respect to dax
operation, so only retrieve the pfn for debug builds when qualifying a
new dax driver, if one ever arrives.
This moves the first consumption of a pfn from ->direct_access() to the
first dax mapping fault, rather than the initial filesystem mount, i.e.
more time for memmap init to run in the background.
Cc: Jan Kara <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Ross Zwisler <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
drivers/dax/Kconfig | 10 ++++++++
drivers/dax/super.c | 64 ++++++++++++++++++++++++++++++++-------------------
2 files changed, 50 insertions(+), 24 deletions(-)
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index e0700bf4893a..b32f8827b983 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -9,6 +9,16 @@ menuconfig DAX
if DAX
+config DAX_DRIVER_DEBUG
+ bool "DAX: driver debug"
+ help
+ Enable validation of the page frame objects returned from a
+ driver's 'direct_access' operation. This validation is
+ performed relative to the requirements of the FS_DAX and
+ FS_DAX_LIMITED configuration options. If you are validating
+ the implementation of a dax device driver say Y otherwise
+ say N.
+
config DEV_DAX
tristate "Device DAX: direct access mapping device"
depends on TRANSPARENT_HUGEPAGE
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 903d9c473749..87b1c55b7c7a 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -72,6 +72,41 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
#endif
+static bool validate_dax_pfn(pfn_t *pfn)
+{
+ bool dax_enabled = false;
+
+ /*
+ * Unless debugging a new dax driver, or new dax architecture
+ * support there is no need to check the pfn. Delay the kernel's
+ * first need for a dax pfn until first userspace dax fault.
+ */
+ if (!pfn)
+ return true;
+
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(*pfn)) {
+ /*
+ * An arch that has enabled the pmem api should also
+ * have its drivers support pfn_t_devmap()
+ *
+ * This is a developer warning and should not trigger in
+ * production. dax_flush() will crash since it depends
+ * on being able to do (page_address(pfn_to_page())).
+ */
+ WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
+ dax_enabled = true;
+ } else if (pfn_t_devmap(*pfn)) {
+ struct dev_pagemap *pgmap;
+
+ pgmap = get_dev_pagemap(pfn_t_to_pfn(*pfn), NULL);
+ if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
+ dax_enabled = true;
+ put_dev_pagemap(pgmap);
+ }
+
+ return dax_enabled;
+}
+
/**
* __bdev_dax_supported() - Check if the device supports dax for filesystem
* @bdev: block device to check
@@ -85,11 +120,10 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
{
struct dax_device *dax_dev;
- bool dax_enabled = false;
+ pfn_t _pfn, *pfn;
pgoff_t pgoff;
int err, id;
void *kaddr;
- pfn_t pfn;
long len;
char buf[BDEVNAME_SIZE];
@@ -113,8 +147,10 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
return false;
}
+	pfn = IS_ENABLED(CONFIG_DAX_DRIVER_DEBUG) ? &_pfn : NULL;
+
id = dax_read_lock();
- len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+ len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, pfn);
dax_read_unlock(id);
put_dax(dax_dev);
@@ -125,27 +161,7 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
return false;
}
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
- /*
- * An arch that has enabled the pmem api should also
- * have its drivers support pfn_t_devmap()
- *
- * This is a developer warning and should not trigger in
- * production. dax_flush() will crash since it depends
- * on being able to do (page_address(pfn_to_page())).
- */
- WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
- dax_enabled = true;
- } else if (pfn_t_devmap(pfn)) {
- struct dev_pagemap *pgmap;
-
- pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
- if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
- dax_enabled = true;
- put_dev_pagemap(pgmap);
- }
-
- if (!dax_enabled) {
+ if (!validate_dax_pfn(pfn)) {
pr_debug("%s: error: dax support not enabled\n",
bdevname(bdev, buf));
return false;
In preparation for allowing all ZONE_DEVICE page init to happen in the
background, enable multiple vmemmap_populate_hugepages() invocations to
run in parallel.
To date the big memory-hotplug lock has been used to serialize changes
to the linear map and vmemmap. Finer-grained locking is needed to
prevent two parallel invocations of vmemmap_populate_hugepages() from
colliding.
Given that populating the vmemmap has architecture-specific implications,
this new asynchronous support is only added for the x86_64
arch_add_memory(); all other implementations indicate no support for
async operations by returning -EWOULDBLOCK.
Cc: Tony Luck <[email protected]>
Cc: Fenghua Yu <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Martin Schwidefsky <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Yoshinori Sato <[email protected]>
Cc: Rich Felker <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Andrew Morton <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
arch/ia64/mm/init.c | 5 ++-
arch/powerpc/mm/mem.c | 5 ++-
arch/s390/mm/init.c | 8 +++--
arch/sh/mm/init.c | 5 ++-
arch/x86/mm/init_32.c | 8 +++--
arch/x86/mm/init_64.c | 27 ++++++++++------
drivers/nvdimm/pfn_devs.c | 1 +
include/linux/memmap_async.h | 28 ++++++++++++++++
include/linux/memory_hotplug.h | 15 ++++++---
include/linux/memremap.h | 2 +
include/linux/mm.h | 6 ++-
kernel/memremap.c | 4 +-
mm/memory_hotplug.c | 69 ++++++++++++++++++++++++++++++----------
mm/page_alloc.c | 3 ++
mm/sparse-vmemmap.c | 56 +++++++++++++++++++++++++-------
15 files changed, 184 insertions(+), 58 deletions(-)
create mode 100644 include/linux/memmap_async.h
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 18278b448530..d331488dd76f 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -649,12 +649,15 @@ mem_init (void)
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
+ if (async)
+ return -EWOULDBLOCK;
+
ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 5c8530d0c611..3205a361e37a 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -118,12 +118,15 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
}
int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int rc;
+ if (async)
+ return -EWOULDBLOCK;
+
resize_hpt_for_hotplug(memblock_phys_mem_size());
start = (unsigned long)__va(start);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 3fa3e5323612..ee87085a3a58 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -223,17 +223,21 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
+ if (async)
+ return -EWOULDBLOCK;
+
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
+ rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock,
+ async);
if (rc)
vmem_remove_mapping(start, size);
return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 4034035fbede..534303de3ec2 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -430,12 +430,15 @@ void free_initrd_mem(unsigned long start, unsigned long end)
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
+ if (async)
+ return -EWOULDBLOCK;
+
/* We only have ZONE_NORMAL, so this is easy.. */
ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
if (unlikely(ret))
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 979e0a02cbe1..1be538746010 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -852,12 +852,16 @@ void __init mem_init(void)
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ if (async)
+ return -EWOULDBLOCK;
+
+ return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock,
+ async);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a688617c727e..40bd9ba052fe 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -784,11 +784,13 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock)
+ struct vmem_altmap *altmap, bool want_memblock,
+ struct memmap_async_state *async)
{
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock,
+ async);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
@@ -799,14 +801,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
}
int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
init_memory_mapping(start, start + size);
- return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock,
+ async);
}
#define PAGE_INUSE 0xFD
@@ -1412,26 +1415,30 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
{
unsigned long addr;
unsigned long next;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
+ pgd_t *pgd = NULL;
+ p4d_t *p4d = NULL;
+ pud_t *pud = NULL;
pmd_t *pmd;
for (addr = start; addr < end; addr = next) {
next = pmd_addr_end(addr, end);
- pgd = vmemmap_pgd_populate(addr, node);
+ pgd = vmemmap_pgd_populate(addr, node, pgd);
if (!pgd)
return -ENOMEM;
- p4d = vmemmap_p4d_populate(pgd, addr, node);
+ p4d = vmemmap_p4d_populate(pgd, addr, node, p4d);
if (!p4d)
return -ENOMEM;
- pud = vmemmap_pud_populate(p4d, addr, node);
+ pud = vmemmap_pud_populate(p4d, addr, node, pud);
if (!pud)
return -ENOMEM;
+ /*
+ * No lock required here as sections do not collide
+ * below the pud level.
+ */
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
void *p;
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 3f7ad5bc443e..147c62e2ef2b 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -577,6 +577,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
memcpy(altmap, &__altmap, sizeof(*altmap));
altmap->free = PHYS_PFN(offset - SZ_8K);
altmap->alloc = 0;
+ spin_lock_init(&altmap->lock);
pgmap->altmap_valid = true;
} else
return -ENXIO;
diff --git a/include/linux/memmap_async.h b/include/linux/memmap_async.h
new file mode 100644
index 000000000000..11aa9f3a523e
--- /dev/null
+++ b/include/linux/memmap_async.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MEMMAP_ASYNC_H
+#define __LINUX_MEMMAP_ASYNC_H
+#include <linux/async.h>
+
+struct vmem_altmap;
+
+struct memmap_init_env {
+ struct vmem_altmap *altmap;
+ bool want_memblock;
+ int nid;
+};
+
+struct memmap_init_memmap {
+ struct memmap_init_env *env;
+ async_cookie_t cookie;
+ int start_sec;
+ int end_sec;
+ int result;
+};
+
+struct memmap_async_state {
+ struct memmap_init_env env;
+ struct memmap_init_memmap memmap;
+};
+
+extern struct async_domain memmap_init_domain;
+#endif /* __LINUX_MEMMAP_ASYNC_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e60085b2824d..7565b2675863 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -15,6 +15,7 @@ struct memory_block;
struct resource;
struct vmem_altmap;
struct dev_pagemap;
+struct memmap_async_state;
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -116,18 +117,21 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct vmem_altmap *altmap, bool want_memblock,
+ struct memmap_async_state *async);
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
static inline int add_pages(int nid, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock,
+ async);
}
#else /* ARCH_HAS_ADD_PAGES */
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct vmem_altmap *altmap, bool want_memblock,
+ struct memmap_async_state *async);
#endif /* ARCH_HAS_ADD_PAGES */
#ifdef CONFIG_NUMA
@@ -325,7 +329,8 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
extern int add_memory(int nid, u64 start, u64 size);
extern int add_memory_resource(int nid, struct resource *resource, bool online);
extern int arch_add_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct vmem_altmap *altmap, bool want_memblock,
+ struct memmap_async_state *async);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct dev_pagemap *pgmap);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 71f5e7c7dfb9..bfdc7363b13b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -16,6 +16,7 @@ struct device;
* @free: free pages set aside in the mapping for memmap storage
* @align: pages reserved to meet allocation alignments
* @alloc: track pages consumed, private to vmemmap_populate()
+ * @lock: enable parallel allocations
*/
struct vmem_altmap {
const unsigned long base_pfn;
@@ -23,6 +24,7 @@ struct vmem_altmap {
unsigned long free;
unsigned long align;
unsigned long alloc;
+ spinlock_t lock;
};
/*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 319d01372efa..0fac83ff21c5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2654,9 +2654,9 @@ void sparse_mem_maps_populate_node(struct page **map_map,
struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
struct vmem_altmap *altmap);
-pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
-p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
-pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
+pgd_t *vmemmap_pgd_populate(unsigned long addr, int node, pgd_t *);
+p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node, p4d_t *);
+pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node, pud_t *);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
void *vmemmap_alloc_block(unsigned long size, int node);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 58327259420d..b861fe909932 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -235,12 +235,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap,
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
error = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL, false);
+ align_size >> PAGE_SHIFT, NULL, false, NULL);
} else {
struct zone *zone;
error = arch_add_memory(nid, align_start, align_size, altmap,
- false);
+ false, NULL);
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
if (!error)
move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aae4e6cc65e9..18f8e2c49089 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,8 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/memmap_async.h>
+#include <linux/async.h>
#include <linux/compaction.h>
#include <asm/tlbflush.h>
@@ -264,6 +266,32 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
}
+static void __ref section_init_async(void *data, async_cookie_t cookie)
+{
+ unsigned long i;
+ struct memmap_init_memmap *args = data;
+ struct memmap_init_env *env = args->env;
+ int start_sec = args->start_sec, end_sec = args->end_sec, err;
+
+ args->result = 0;
+ for (i = start_sec; i <= end_sec; i++) {
+ err = __add_section(env->nid, section_nr_to_pfn(i), env->altmap,
+ env->want_memblock);
+
+ /*
+ * EEXIST is finally dealt with by ioresource collision
+ * check. see add_memory() => register_memory_resource()
+ * Warning will be printed if there is collision.
+ */
+ if (err && (err != -EEXIST)) {
+ args->result = err;
+ break;
+ }
+ args->result = 0;
+ cond_resched();
+ }
+}
+
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -272,11 +300,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
*/
int __ref __add_pages(int nid, unsigned long phys_start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ bool want_memblock, struct memmap_async_state *async)
{
- unsigned long i;
int err = 0;
int start_sec, end_sec;
+ struct memmap_init_env _env, *env;
+ struct memmap_init_memmap _args, *args;
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
@@ -289,28 +318,32 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
if (altmap->base_pfn != phys_start_pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
altmap->alloc = 0;
}
- for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, section_nr_to_pfn(i), altmap,
- want_memblock);
+ env = async ? &async->env : &_env;
+ args = async ? &async->memmap : &_args;
- /*
- * EEXIST is finally dealt with by ioresource collision
- * check. see add_memory() => register_memory_resource()
- * Warning will be printed if there is collision.
- */
- if (err && (err != -EEXIST))
- break;
- err = 0;
- cond_resched();
+ env->nid = nid;
+ env->altmap = altmap;
+ env->want_memblock = want_memblock;
+
+ args->env = env;
+ args->end_sec = end_sec;
+ args->start_sec = start_sec;
+
+ if (async)
+ args->cookie = async_schedule_domain(section_init_async, args,
+ &memmap_init_domain);
+ else {
+ /* call the 'async' routine synchronously */
+ section_init_async(args, 0);
+ err = args->result;
}
+
vmemmap_populate_print_last();
-out:
return err;
}
@@ -1135,7 +1168,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
}
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, NULL, true);
+ ret = arch_add_memory(nid, start, size, NULL, true, NULL);
if (ret < 0)
goto error;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 545a5860cce7..f83682ef006e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
+#include <linux/async.h>
#include <linux/nmi.h>
#include <asm/sections.h>
@@ -5452,6 +5453,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
#endif
}
+ASYNC_DOMAIN_EXCLUSIVE(memmap_init_domain);
+
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bd0276d5f66b..9cdd82fb595d 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -93,6 +93,7 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
+ lockdep_assert_held(&altmap->lock);
return altmap->base_pfn + altmap->reserve + altmap->alloc
+ altmap->align;
}
@@ -101,6 +102,7 @@ static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
unsigned long allocated = altmap->alloc + altmap->align;
+ lockdep_assert_held(&altmap->lock);
if (altmap->free > allocated)
return altmap->free - allocated;
return 0;
@@ -124,16 +126,20 @@ void * __meminit altmap_alloc_block_buf(unsigned long size,
return NULL;
}
+ spin_lock(&altmap->lock);
pfn = vmem_altmap_next_pfn(altmap);
nr_pfns = size >> PAGE_SHIFT;
nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
nr_align = ALIGN(pfn, nr_align) - pfn;
- if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
+ if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) {
+ spin_unlock(&altmap->lock);
return NULL;
+ }
altmap->alloc += nr_pfns;
altmap->align += nr_align;
pfn += nr_align;
+ spin_unlock(&altmap->lock);
pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
__func__, pfn, altmap->alloc, altmap->align, nr_pfns);
@@ -188,39 +194,63 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
return pmd;
}
-pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
+static DEFINE_MUTEX(vmemmap_pgd_lock);
+static DEFINE_MUTEX(vmemmap_p4d_lock);
+static DEFINE_MUTEX(vmemmap_pud_lock);
+
+pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node,
+ pud_t *pud)
{
- pud_t *pud = pud_offset(p4d, addr);
+ pud_t *new = pud_offset(p4d, addr);
+
+ if (new == pud)
+ return pud;
+ pud = new;
+ mutex_lock(&vmemmap_pud_lock);
if (pud_none(*pud)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pud_populate(&init_mm, pud, p);
}
+ mutex_unlock(&vmemmap_pud_lock);
return pud;
}
-p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
+p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node,
+ p4d_t *p4d)
{
- p4d_t *p4d = p4d_offset(pgd, addr);
+ p4d_t *new = p4d_offset(pgd, addr);
+
+ if (new == p4d)
+ return p4d;
+ p4d = new;
+ mutex_lock(&vmemmap_p4d_lock);
if (p4d_none(*p4d)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
p4d_populate(&init_mm, p4d, p);
}
+ mutex_unlock(&vmemmap_p4d_lock);
return p4d;
}
-pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node, pgd_t *pgd)
{
- pgd_t *pgd = pgd_offset_k(addr);
+ pgd_t *new = pgd_offset_k(addr);
+
+ if (new == pgd)
+ return pgd;
+ pgd = new;
+ mutex_lock(&vmemmap_pgd_lock);
if (pgd_none(*pgd)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pgd_populate(&init_mm, pgd, p);
}
+ mutex_unlock(&vmemmap_pgd_lock);
return pgd;
}
@@ -228,20 +258,20 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
unsigned long end, int node)
{
unsigned long addr = start;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
+ pgd_t *pgd = NULL;
+ p4d_t *p4d = NULL;
+ pud_t *pud = NULL;
pmd_t *pmd;
pte_t *pte;
for (; addr < end; addr += PAGE_SIZE) {
- pgd = vmemmap_pgd_populate(addr, node);
+ pgd = vmemmap_pgd_populate(addr, node, pgd);
if (!pgd)
return -ENOMEM;
- p4d = vmemmap_p4d_populate(pgd, addr, node);
+ p4d = vmemmap_p4d_populate(pgd, addr, node, p4d);
if (!p4d)
return -ENOMEM;
- pud = vmemmap_pud_populate(p4d, addr, node);
+ pud = vmemmap_pud_populate(p4d, addr, node, pud);
if (!pud)
return -ENOMEM;
pmd = vmemmap_pmd_populate(pud, addr, node);
From: Huaisheng Ye <[email protected]>
The direct_access() path needs to check whether the pfn pointer is NULL
before assigning to it. If pfn is NULL, there is no need to calculate the value.
Signed-off-by: Huaisheng Ye <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
drivers/nvdimm/pmem.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e8ac6f244d2b..c430536320a5 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -228,7 +228,8 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
PFN_PHYS(nr_pages))))
return -EIO;
*kaddr = pmem->virt_addr + offset;
- *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+ if (pfn)
+ *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
/*
* If badblocks are present, limit known good range to the
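For illustration only, here is a hedged sketch of the kind of caller these checks enable: one that needs just the kernel address and passes NULL for pfn. The helper name and error handling are hypothetical; dax_direct_access() is used with its current signature in this tree.

#include <linux/dax.h>
#include <linux/mm.h>
#include <linux/string.h>

/*
 * Hypothetical caller: it only needs the kernel mapping, so it passes a
 * NULL pfn pointer and lets the backend skip the pfn_t calculation.
 */
static int example_copy_from_dax(struct dax_device *dax_dev, pgoff_t pgoff,
		void *dst, size_t len)
{
	void *kaddr;
	long avail;

	if (len > PAGE_SIZE)
		return -EINVAL;

	avail = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
	if (avail < 0)
		return avail;
	if (!avail)
		return -ERANGE;

	memcpy(dst, kaddr, len);
	return 0;
}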
On Wed, Jul 04, 2018 at 11:50:13PM -0700, Dan Williams wrote:
> +static ssize_t memmap_state_store(struct device *dev,
> + struct device_attribute *attr, const char *buf, size_t len)
> +{
> + int i;
> + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
> + struct memmap_async_state *async = &nd_pfn->async;
> +
> + if (strcmp(buf, "sync") == 0)
> + /* pass */;
> + else if (strcmp(buf, "sync\n") == 0)
> + /* pass */;
> + else
> + return -EINVAL;
Hmm what about:
if (strncmp(buf, "sync", 4))
return -EINVAL;
This collapses 6 lines into 4.
--
Johannes Thumshirn Storage
[email protected] +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
On Thu, Jul 5, 2018 at 1:29 AM, Johannes Thumshirn <[email protected]> wrote:
> On Wed, Jul 04, 2018 at 11:50:13PM -0700, Dan Williams wrote:
>> +static ssize_t memmap_state_store(struct device *dev,
>> + struct device_attribute *attr, const char *buf, size_t len)
>> +{
>> + int i;
>> + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
>> + struct memmap_async_state *async = &nd_pfn->async;
>> +
>> + if (strcmp(buf, "sync") == 0)
>> + /* pass */;
>> + else if (strcmp(buf, "sync\n") == 0)
>> + /* pass */;
>> + else
>> + return -EINVAL;
>
> Hmm what about:
>
> if (strncmp(buf, "sync", 4))
> return -EINVAL;
>
> This collapses 6 lines into 4.
...but that also allows 'echo "syncAndThenSomeGarbage" >
/sys/.../memmap_state' to succeed.
On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
> ...but that also allows 'echo "syncAndThenSomeGarbage" >
> /sys/.../memmap_state' to succeed.
Yep it does :-(.
Damn
--
Johannes Thumshirn Storage
[email protected] +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
> On Thu, Jul 5, 2018 at 1:29 AM, Johannes Thumshirn <[email protected]> wrote:
> > On Wed, Jul 04, 2018 at 11:50:13PM -0700, Dan Williams wrote:
> >> +static ssize_t memmap_state_store(struct device *dev,
> >> + struct device_attribute *attr, const char *buf, size_t len)
> >> +{
> >> + int i;
> >> + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
> >> + struct memmap_async_state *async = &nd_pfn->async;
> >> +
> >> + if (strcmp(buf, "sync") == 0)
> >> + /* pass */;
> >> + else if (strcmp(buf, "sync\n") == 0)
> >> + /* pass */;
> >> + else
> >> + return -EINVAL;
> >
> > Hmm what about:
> >
> > if (strncmp(buf, "sync", 4))
> > return -EINVAL;
> >
> > This collapses 6 lines into 4.
>
> ...but that also allows 'echo "syncAndThenSomeGarbage" >
> /sys/.../memmap_state' to succeed.
if (strncmp(buf, "sync", 4))
return -EINVAL;
if (buf[4] != '\0' && buf[4] != '\n')
return -EINVAL;
On Thu, Jul 5, 2018 at 12:49 PM, Matthew Wilcox <[email protected]> wrote:
> On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
>> On Thu, Jul 5, 2018 at 1:29 AM, Johannes Thumshirn <[email protected]> wrote:
>> > On Wed, Jul 04, 2018 at 11:50:13PM -0700, Dan Williams wrote:
>> >> +static ssize_t memmap_state_store(struct device *dev,
>> >> + struct device_attribute *attr, const char *buf, size_t len)
>> >> +{
>> >> + int i;
>> >> + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
>> >> + struct memmap_async_state *async = &nd_pfn->async;
>> >> +
>> >> + if (strcmp(buf, "sync") == 0)
>> >> + /* pass */;
>> >> + else if (strcmp(buf, "sync\n") == 0)
>> >> + /* pass */;
>> >> + else
>> >> + return -EINVAL;
>> >
>> > Hmm what about:
>> >
>> > if (strncmp(buf, "sync", 4))
>> > return -EINVAL;
>> >
>> > This collapses 6 lines into 4.
>>
>> ...but that also allows 'echo "syncAndThenSomeGarbage" >
>> /sys/.../memmap_state' to succeed.
>
> if (strncmp(buf, "sync", 4))
> return -EINVAL;
> if (buf[4] != '\0' && buf[4] != '\n')
> return -EINVAL;
>
Not sure that's a win either, I'd rather just:
+ if (strcmp(buf, "sync") == 0 || strcmp(buf, "sync\n") == 0)
+ /* pass */;
+ else
+ return -EINVAL;
If we're trying to save those 2 lines.
Dan Williams <[email protected]> writes:
> On Thu, Jul 5, 2018 at 12:49 PM, Matthew Wilcox <[email protected]> wrote:
>> On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
>>> On Thu, Jul 5, 2018 at 1:29 AM, Johannes Thumshirn <[email protected]> wrote:
>>> > On Wed, Jul 04, 2018 at 11:50:13PM -0700, Dan Williams wrote:
>>> >> +static ssize_t memmap_state_store(struct device *dev,
>>> >> + struct device_attribute *attr, const char *buf, size_t len)
>>> >> +{
>>> >> + int i;
>>> >> + struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
>>> >> + struct memmap_async_state *async = &nd_pfn->async;
>>> >> +
>>> >> + if (strcmp(buf, "sync") == 0)
>>> >> + /* pass */;
>>> >> + else if (strcmp(buf, "sync\n") == 0)
>>> >> + /* pass */;
>>> >> + else
>>> >> + return -EINVAL;
>>> >
>>> > Hmm what about:
>>> >
>>> > if (strncmp(buf, "sync", 4))
>>> > return -EINVAL;
>>> >
>>> > This collapses 6 lines into 4.
>>>
>>> ...but that also allows 'echo "syncAndThenSomeGarbage" >
>>> /sys/.../memmap_state' to succeed.
>>
>> if (strncmp(buf, "sync", 4))
>> return -EINVAL;
>> if (buf[4] != '\0' && buf[4] != '\n')
>> return -EINVAL;
>>
>
> Not sure that's a win either, I'd rather just:
>
> + if (strcmp(buf, "sync") == 0 || strcmp(buf, "sync\n") == 0)
> + /* pass */;
> + else
> + return -EINVAL;
>
> If we're trying to save those 2 lines.
WFM. I don't like that I had to go digging around in sysfs
documentation to convince myself that strcmp was safe, but I guess
that's my problem. ;-)
Cheers,
Jeff
On Thu, 5 Jul 2018 16:49:41 +0200 Johannes Thumshirn <[email protected]> wrote:
> On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
> > ...but that also allows 'echo "syncAndThenSomeGarbage" >
> > /sys/.../memmap_state' to succeed.
>
> Yep it does :-(.
>
> Damn
sysfs_streq()
On Thu, Jul 5, 2018 at 1:24 PM, Andrew Morton <[email protected]> wrote:
> On Thu, 5 Jul 2018 16:49:41 +0200 Johannes Thumshirn <[email protected]> wrote:
>
>> On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
>> > ...but that also allows 'echo "syncAndThenSomeGarbage" >
>> > /sys/.../memmap_state' to succeed.
>>
>> Yep it does :-(.
>>
>> Damn
>
> sysfs_streq()
Nice... /me stares down a long list of needed cleanups in the
libnvdimm sysfs implementation with that gem.
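For reference, a minimal sketch of what the accepted-token check in memmap_state_store() could reduce to with sysfs_streq(). The attribute, struct, and to_nd_pfn_safe() names come from the quoted patch; the flush helper at the end is a placeholder, not a real function.

static ssize_t memmap_state_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
	struct memmap_async_state *async = &nd_pfn->async;

	/* sysfs_streq(), from <linux/string.h>, tolerates a trailing newline */
	if (!sysfs_streq(buf, "sync"))
		return -EINVAL;

	/* placeholder: synchronously complete the pending memmap init work */
	memmap_sync_placeholder(async);

	return len;
}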
On Thu, Jul 05, 2018 at 01:24:55PM -0700, Andrew Morton wrote:
> On Thu, 5 Jul 2018 16:49:41 +0200 Johannes Thumshirn <[email protected]> wrote:
>
> > On Thu, Jul 05, 2018 at 07:46:05AM -0700, Dan Williams wrote:
> > > ...but that also allows 'echo "syncAndThenSomeGarbage" >
> > > /sys/.../memmap_state' to succeed.
> >
> > Yep it does :-(.
> >
> > Damn
>
> sysfs_streq()
Thanks! I didn't know that one existed.
It's kind of a shame that back in 2008 we realised this was a problem and
decided to solve it this way, instead of realising that no driver actually
cares whether there's a \n or not and simply stripping the \n off before
the driver gets to see it. Probably too late to fix that now.
On Thu, Jul 05, 2018 at 01:34:01PM -0700, Dan Williams wrote:
> >
> > sysfs_streq()
>
> Nice... /me stares down a long list of needed cleanups in the
> libnvdimm sysfs implementation with that gem.
Cool. I think not only libnvdimm would profit from this. /me looks
into scsi and nvme now.
--
Johannes Thumshirn Storage
[email protected] +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
On Wed 04-07-18 23:49:02, Dan Williams wrote:
> In order to keep pfn_to_page() a simple offset calculation the 'struct
> page' memmap needs to be mapped and initialized in advance of any usage
> of a page. This poses a problem for large memory systems as it delays
> full availability of memory resources for 10s to 100s of seconds.
>
> For typical 'System RAM' the problem is mitigated by the fact that large
> memory allocations tend to happen after the kernel has fully initialized
> and userspace services / applications are launched. A small amount, 2GB
> of memory, is initialized up front. The remainder is initialized in the
> background and freed to the page allocator over time.
>
> Unfortunately, that scheme is not directly reusable for persistent
> memory and dax because userspace has visibility to the entire resource
> pool and can choose to access any offset directly at its choosing. In
> other words there is no allocator indirection where the kernel can
> satisfy requests with arbitrary pages as they become initialized.
>
> That said, we can approximate the optimization by performing the
> initialization in the background, allow the kernel to fully boot the
> platform, start up pmem block devices, mount filesystems in dax mode,
> and only incur the delay at the first userspace dax fault.
>
> With this change an 8 socket system was observed to initialize pmem
> namespaces in ~4 seconds whereas it was previously taking ~4 minutes.
>
> These patches apply on top of the HMM + devm_memremap_pages() reworks
> [1]. Andrew, once the reviews come back, please consider this series for
> -mm as well.
>
> [1]: https://lkml.org/lkml/2018/6/19/108
One question: Why not (in addition to background initialization) have
->direct_access() initialize a block of struct pages around the pfn it
needs if it finds it's not initialized yet? That would make devices usable
immediately without waiting for init to complete...
Honza
>
> ---
>
> Dan Williams (9):
> mm: Plumb dev_pagemap instead of vmem_altmap to memmap_init_zone()
> mm: Enable asynchronous __add_pages() and vmemmap_populate_hugepages()
> mm: Teach memmap_init_zone() to initialize ZONE_DEVICE pages
> mm: Multithread ZONE_DEVICE initialization
> mm: Allow an external agent to wait for memmap initialization
> filesystem-dax: Make mount time pfn validation a debug check
> libnvdimm, pmem: Initialize the memmap in the background
> device-dax: Initialize the memmap in the background
> libnvdimm, namespace: Publish page structure init state / control
>
> Huaisheng Ye (4):
> nvdimm/pmem: check the validity of the pointer pfn
> nvdimm/pmem-dax: check the validity of the pointer pfn
> s390/block/dcssblk: check the validity of the pointer pfn
> fs/dax: Assign NULL to pfn of dax_direct_access if useless
>
>
> arch/ia64/mm/init.c | 5 +
> arch/powerpc/mm/mem.c | 5 +
> arch/s390/mm/init.c | 8 +
> arch/sh/mm/init.c | 5 +
> arch/x86/mm/init_32.c | 8 +
> arch/x86/mm/init_64.c | 27 +++--
> drivers/dax/Kconfig | 10 ++
> drivers/dax/dax-private.h | 2
> drivers/dax/device-dax.h | 2
> drivers/dax/device.c | 16 +++
> drivers/dax/pmem.c | 5 +
> drivers/dax/super.c | 64 +++++++-----
> drivers/nvdimm/nd.h | 2
> drivers/nvdimm/pfn_devs.c | 54 ++++++++--
> drivers/nvdimm/pmem.c | 17 ++-
> drivers/nvdimm/pmem.h | 1
> drivers/s390/block/dcssblk.c | 5 +
> fs/dax.c | 10 +-
> include/linux/memmap_async.h | 55 ++++++++++
> include/linux/memory_hotplug.h | 18 ++-
> include/linux/memremap.h | 31 ++++++
> include/linux/mm.h | 8 +
> kernel/memremap.c | 85 ++++++++-------
> mm/memory_hotplug.c | 73 ++++++++++---
> mm/page_alloc.c | 215 +++++++++++++++++++++++++++++++++------
> mm/sparse-vmemmap.c | 56 ++++++++--
> tools/testing/nvdimm/pmem-dax.c | 11 ++
> 27 files changed, 610 insertions(+), 188 deletions(-)
> create mode 100644 include/linux/memmap_async.h
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Mon, Jul 9, 2018 at 5:56 AM, Jan Kara <[email protected]> wrote:
> On Wed 04-07-18 23:49:02, Dan Williams wrote:
>> In order to keep pfn_to_page() a simple offset calculation the 'struct
>> page' memmap needs to be mapped and initialized in advance of any usage
>> of a page. This poses a problem for large memory systems as it delays
>> full availability of memory resources for 10s to 100s of seconds.
>>
>> For typical 'System RAM' the problem is mitigated by the fact that large
>> memory allocations tend to happen after the kernel has fully initialized
>> and userspace services / applications are launched. A small amount, 2GB
>> of memory, is initialized up front. The remainder is initialized in the
>> background and freed to the page allocator over time.
>>
>> Unfortunately, that scheme is not directly reusable for persistent
>> memory and dax because userspace has visibility to the entire resource
>> pool and can choose to access any offset directly at its choosing. In
>> other words there is no allocator indirection where the kernel can
>> satisfy requests with arbitrary pages as they become initialized.
>>
>> That said, we can approximate the optimization by performing the
>> initialization in the background, allow the kernel to fully boot the
>> platform, start up pmem block devices, mount filesystems in dax mode,
>> and only incur the delay at the first userspace dax fault.
>>
>> With this change an 8 socket system was observed to initialize pmem
>> namespaces in ~4 seconds whereas it was previously taking ~4 minutes.
>>
>> These patches apply on top of the HMM + devm_memremap_pages() reworks
>> [1]. Andrew, once the reviews come back, please consider this series for
>> -mm as well.
>>
>> [1]: https://lkml.org/lkml/2018/6/19/108
>
> One question: Why not (in addition to background initialization) have
> ->direct_access() initialize a block of struct pages around the pfn it
> needs if it finds it's not initialized yet? That would make devices usable
> immediately without waiting for init to complete...
Hmm, yes, relatively immediately... it would depend on the granularity at
which we can reliably steal initialization work from the background
thread. I'll give it a shot; I'm thinking of dividing each thread's work
into 64 sub-units and tracking those units with a bitmap.
The worst case init time then becomes the time to initialize the pages
for a range that is namespace-size / (NR_MEMMAP_THREADS * 64).
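To make that concrete, here is a rough sketch of the kind of per-thread bookkeeping being described. Every identifier below is illustrative; none of it is from the posted series, and init_pages_placeholder() stands in for the real struct page initialization.

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/kernel.h>

#define MEMMAP_SUBUNITS 64		/* sub-units per background thread */

/* illustrative per-thread bookkeeping for stealable memmap init work */
struct memmap_thread_work {
	unsigned long start_pfn;	/* first pfn covered by this thread */
	unsigned long nr_pfns;		/* total pfns covered by this thread */
	DECLARE_BITMAP(done, MEMMAP_SUBUNITS);
};

/*
 * Claim and initialize the sub-unit covering @pfn, whether called from
 * the background thread or from a fault path that cannot wait; a fault
 * then waits for roughly nr_pfns / MEMMAP_SUBUNITS pages instead of the
 * whole namespace.
 */
static void memmap_steal_init(struct memmap_thread_work *work,
		unsigned long pfn)
{
	unsigned long chunk = DIV_ROUND_UP(work->nr_pfns, MEMMAP_SUBUNITS);
	unsigned int unit = (pfn - work->start_pfn) / chunk;

	/* test_and_set_bit() is atomic, so no extra locking is needed */
	if (!test_and_set_bit(unit, work->done))
		init_pages_placeholder(work->start_pfn + unit * chunk, chunk);
}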