Here is another version of the same trivial pmem driver, because two
obviously aren't enough. The first patch is the same pmem driver
that Ross posted a short time ago, just modified to use platform_devices
to find the persistant memory region instead of hardconding it in the
Kconfig. This allows to keep pmem.c separate from any discovery mechanism,
but still allow auto-discovery.
The other two patches are a heavily rewritten version of the code that
Intel gave to various storage vendors to discover the type 12 (and earlier
type 6) nvdimms, which I massaged into a form that is hopefully suitable
for mainline.
Note that pmem.c really is the minimal version as I think we need something
included ASAP. We'll eventually need to be able to do other I/O from and
to it, and as most people know everyone has their own preferre method to
do it, which I'd like to discuss once we have the basic driver in.
This has been tested on a real NVDIMM on a system with a type 12
capable BIOS.
Changes since V2:
- dropped support for the memmap= kernel command line override
- dropped the not needed memblock_reserve call
- merged various cleanups from Boaz in pmem.c
Changes since V1:
- s/E820_PROTECTED_KERN/E820_PMEM/g
- map the persistent memory as uncached
- better kernel parameter description
- various typo fixes
- MODULE_LICENSE fix
Various recent BIOSes support NVDIMMs or ADR using a non-standard
e820 memory type, and Intel supplied reference Linux code using this
type to various vendors.
Wire this e820 table type up to export platform devices for the pmem
driver so that we can use it in Linux.
Based on arlier work from Dave Jiang <[email protected]> and
Dan Williams <[email protected]>.
Signed-off-by: Christoph Hellwig <[email protected]>
Tested-by: Ross Zwisler <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
Acked-by: Dan Williams <[email protected]>
---
arch/x86/Kconfig | 10 ++++++++
arch/x86/include/uapi/asm/e820.h | 10 ++++++++
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/e820.c | 28 ++++++++++++++++-----
arch/x86/kernel/pmem.c | 53 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 96 insertions(+), 6 deletions(-)
create mode 100644 arch/x86/kernel/pmem.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..c0e8ee3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
source "mm/Kconfig"
+config X86_PMEM_LEGACY
+ bool "Support non-standard NVDIMMs and ADR protected memory"
+ help
+ Treat memory marked using the non-standard e820 type of 12 as used
+ by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+ The kernel will offer these regions to the pmem driver so
+ they can be used for persistent storage.
+
+ Say Y if unsure.
+
config HIGHPTE
bool "Allocate 3rd-level pagetables from highmem"
depends on HIGHMEM
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33..ce0d0bf 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
#define E820_NVS 4
#define E820_UNUSABLE 5
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot. The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
+ *
+ * Note that older platforms also used 6 for the same type of memory,
+ * but newer versions switched to 12 as 6 was assigned differently. Some
+ * time they will learn..
+ */
+#define E820_PRAM 12
/*
* reserved RAM used by kernel itself
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70..971f18c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201de..e26ca56 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
case E820_UNUSABLE:
printk(KERN_CONT "unusable");
break;
+ case E820_PRAM:
+ printk(KERN_CONT "persistent (type %u)", type);
+ break;
default:
printk(KERN_CONT "type %u", type);
break;
@@ -688,8 +691,15 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
register_nosave_region(pfn, PFN_UP(ei->addr));
pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+
+ switch (ei->type) {
+ case E820_RAM:
+ case E820_PRAM:
+ case E820_RESERVED_KERN:
+ break;
+ default:
register_nosave_region(PFN_UP(ei->addr), pfn);
+ }
if (pfn >= limit_pfn)
break;
@@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
@@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
unsigned long start_pfn;
unsigned long end_pfn;
- if (ei->type != type)
+ /*
+ * Persistent memory is accounted as ram for purposes of
+ * establishing max_pfn and mem_map.
+ */
+ if (ei->type != E820_RAM && ei->type != E820_PRAM)
continue;
start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +798,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
}
unsigned long __init e820_end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+ return e820_end_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820_end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT));
}
static void early_panic(char *msg)
@@ -907,6 +921,7 @@ static inline const char *e820_type_to_string(int e820_type)
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
case E820_UNUSABLE: return "Unusable memory";
+ case E820_PRAM: return "Persistent RAM";
default: return "reserved";
}
}
@@ -941,7 +956,8 @@ void __init e820_reserve_resources(void)
* pcibios_resource_survey()
*/
if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
- res->flags |= IORESOURCE_BUSY;
+ if (e820.map[i].type != E820_PRAM)
+ res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 0000000..89ab53c
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+ struct platform_device *pdev;
+ int error;
+
+ pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+ if (!pdev)
+ return;
+
+ error = platform_device_add_resources(pdev, res, 1);
+ if (error)
+ goto out_put_pdev;
+
+ error = platform_device_add(pdev);
+ if (error)
+ goto out_put_pdev;
+ return;
+
+out_put_pdev:
+ dev_warn(&pdev->dev, "failed to add pmem device!\n");
+ platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->type == E820_PRAM) {
+ struct resource res = {
+ .flags = IORESOURCE_MEM,
+ .start = ei->addr,
+ .end = ei->addr + ei->size - 1,
+ };
+ register_pmem_device(&res);
+ }
+ }
+
+ return 0;
+}
+device_initcall(register_pmem_devices);
--
1.9.1
From: Ross Zwisler <[email protected]>
PMEM is a new driver that presents a reserved range of memory as a
block device. This is useful for developing with NV-DIMMs, and
can be used with volatile memory as a development platform.
This patch contains the initial driver from Ross Zwisler, with
various changes from Boaz Harrosh and me.
Signed-off-by: Ross Zwisler <[email protected]>
[hch: convert to use a platform_device for discovery, fix partition
support, merged various patches from Boaz]
Signed-off-by: Christoph Hellwig <[email protected]>
Tested-by: Ross Zwisler <[email protected]>
Acked-by: Dan Williams <[email protected]>
---
MAINTAINERS | 6 ++
drivers/block/Kconfig | 11 +++
drivers/block/Makefile | 1 +
drivers/block/pmem.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 278 insertions(+)
create mode 100644 drivers/block/pmem.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 1de6afa..4517613 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8071,6 +8071,12 @@ S: Maintained
F: Documentation/blockdev/ramdisk.txt
F: drivers/block/brd.c
+PERSISTENT MEMORY DRIVER
+M: Ross Zwisler <[email protected]>
+L: [email protected]
+S: Supported
+F: drivers/block/pmem.c
+
RANDOM NUMBER DRIVER
M: "Theodore Ts'o" <[email protected]>
S: Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..34753cf 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,17 @@ config BLK_DEV_RAM_DAX
and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
+config BLK_DEV_PMEM
+ tristate "Persistent memory block device support"
+ help
+ Saying Y here will allow you to use a contiguous range of reserved
+ memory as one or more block devices.
+
+ To compile this driver as a module, choose M here: the module will be
+ called pmem.
+
+ If unsure, say N.
+
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media"
depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o
obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
+obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 0000000..0232ab7
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,260 @@
+/*
+ * Persistent Memory Driver
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <[email protected]>.
+ * Copyright (c) 2015, Boaz Harrosh <[email protected]>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define PMEM_MINORS 16
+
+struct pmem_device {
+ struct request_queue *pmem_queue;
+ struct gendisk *pmem_disk;
+
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ void *virt_addr;
+ size_t size;
+};
+
+static int pmem_major;
+static atomic_t pmem_index;
+
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+ unsigned int len, unsigned int off, int rw,
+ sector_t sector)
+{
+ void *mem = kmap_atomic(page);
+ size_t pmem_off = sector << 9;
+
+ if (rw == READ) {
+ memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+ }
+
+ kunmap_atomic(mem);
+}
+
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ int rw;
+ struct bio_vec bvec;
+ sector_t sector;
+ struct bvec_iter iter;
+ int err = 0;
+
+ if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
+ err = -EIO;
+ goto out;
+ }
+
+ BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+ rw = bio_data_dir(bio);
+ sector = bio->bi_iter.bi_sector;
+ bio_for_each_segment(bvec, bio, iter) {
+ pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
+ rw, sector);
+ sector += bvec.bv_len >> 9;
+ }
+
+out:
+ bio_endio(bio, err);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+ pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ page_endio(page, rw & WRITE, 0);
+ return 0;
+}
+
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ size_t offset = sector << 9;
+
+ if (!pmem)
+ return -ENODEV;
+
+ *kaddr = pmem->virt_addr + offset;
+ *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+
+ return pmem->size - offset;
+}
+
+static const struct block_device_operations pmem_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = pmem_rw_page,
+ .direct_access = pmem_direct_access,
+};
+
+static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
+{
+ struct pmem_device *pmem;
+ struct gendisk *disk;
+ int idx, err;
+
+ err = -ENOMEM;
+ pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+ if (!pmem)
+ goto out;
+
+ pmem->phys_addr = res->start;
+ pmem->size = resource_size(res);
+
+ err = -EINVAL;
+ if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
+ dev_warn(dev, "could not reserve region [0x%llx:0x%zx]\n",
+ pmem->phys_addr, pmem->size);
+ goto out_free_dev;
+ }
+
+ /*
+ * Map the memory as non-cachable, as we can't write back the contents
+ * of the CPU caches in case of a crash.
+ */
+ err = -ENOMEM;
+ pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+ if (!pmem->virt_addr)
+ goto out_release_region;
+
+ pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!pmem->pmem_queue)
+ goto out_unmap;
+
+ blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+ blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
+ blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+
+ disk = alloc_disk(PMEM_MINORS);
+ if (!disk)
+ goto out_free_queue;
+
+ idx = atomic_inc_return(&pmem_index) - 1;
+
+ disk->major = pmem_major;
+ disk->first_minor = PMEM_MINORS * idx;
+ disk->fops = &pmem_fops;
+ disk->private_data = pmem;
+ disk->queue = pmem->pmem_queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(disk->disk_name, "pmem%d", idx);
+ disk->driverfs_dev = dev;
+ set_capacity(disk, pmem->size >> 9);
+ pmem->pmem_disk = disk;
+
+ add_disk(disk);
+ return pmem;
+
+out_free_queue:
+ blk_cleanup_queue(pmem->pmem_queue);
+out_unmap:
+ iounmap(pmem->virt_addr);
+out_release_region:
+ release_mem_region(pmem->phys_addr, pmem->size);
+out_free_dev:
+ kfree(pmem);
+out:
+ return ERR_PTR(err);
+}
+
+static void pmem_free(struct pmem_device *pmem)
+{
+ del_gendisk(pmem->pmem_disk);
+ put_disk(pmem->pmem_disk);
+ blk_cleanup_queue(pmem->pmem_queue);
+ iounmap(pmem->virt_addr);
+ release_mem_region(pmem->phys_addr, pmem->size);
+ kfree(pmem);
+}
+
+static int pmem_probe(struct platform_device *pdev)
+{
+ struct pmem_device *pmem;
+ struct resource *res;
+
+ if (WARN_ON(pdev->num_resources > 1))
+ return -ENXIO;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENXIO;
+
+ pmem = pmem_alloc(&pdev->dev, res);
+ if (IS_ERR(pmem))
+ return PTR_ERR(pmem);
+
+ platform_set_drvdata(pdev, pmem);
+ return 0;
+}
+
+static int pmem_remove(struct platform_device *pdev)
+{
+ struct pmem_device *pmem = platform_get_drvdata(pdev);
+
+ pmem_free(pmem);
+ return 0;
+}
+
+static struct platform_driver pmem_driver = {
+ .probe = pmem_probe,
+ .remove = pmem_remove,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "pmem",
+ },
+};
+
+static int __init pmem_init(void)
+{
+ int error;
+
+ pmem_major = register_blkdev(0, "pmem");
+ if (pmem_major < 0)
+ return pmem_major;
+
+ error = platform_driver_register(&pmem_driver);
+ if (error)
+ unregister_blkdev(pmem_major, "pmem");
+ return error;
+}
+
+static void pmem_exit(void)
+{
+ platform_driver_unregister(&pmem_driver);
+ unregister_blkdev(pmem_major, "pmem");
+}
+
+MODULE_AUTHOR("Ross Zwisler <[email protected]>");
+MODULE_LICENSE("GPL v2");
+
+module_init(pmem_init);
+module_exit(pmem_exit);
--
1.9.1
Finally with these fixes I'm able to define memmap=!
regions in NUMA machines. Any combination cross or
not cross NUMA boundary.
And not only the memmap=! regions had problems also
the real type-12 NvDIMMs had the same NUMA problems.
Now it all works.
Also
I have kept the "don't merge PRAM" regions for ease
of emulated NUMA setups.
Also I have reverted the change Ch did to
e820_mark_nosave_regions. From comment above the function
and if I'm reading register_nosave_region() correctly,
We certainly do not want the system to try and save any
pmem to swap or hibernate. (Actually it will be the
opposite right). Can we actually define swap on a /dev/pmemX ? ;-)
Signed-off-by: Boaz Harrosh <[email protected]>
---
Documentation/kernel-parameters.txt | 6 ++++++
arch/x86/kernel/e820.c | 20 +++++++++-----------
2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a6..c87122d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
or
memmap=0x10000$0x18690000
+ memmap=nn[KMG]!ss[KMG]
+ [KNL,X86] Mark specific memory as protected.
+ Region of memory to be used, from ss to ss+nn.
+ The memory region may be marked as e820 type 12 (0xc)
+ and is NVDIMM or ADR memory.
+
memory_corruption_check=0/1 [X86]
Some BIOSes seem to corrupt the first 64k of
memory when doing things like suspend/resume.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e26ca56..3572a22 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -346,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
* continue building up new bios map based on this
* information
*/
- if (current_type != last_type) {
+ if (current_type != last_type || current_type == E820_PRAM) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
@@ -692,14 +692,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
pfn = PFN_DOWN(ei->addr + ei->size);
- switch (ei->type) {
- case E820_RAM:
- case E820_PRAM:
- case E820_RESERVED_KERN:
- break;
- default:
+ if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
register_nosave_region(PFN_UP(ei->addr), pfn);
- }
if (pfn >= limit_pfn)
break;
@@ -880,6 +874,9 @@ static int __init parse_memmap_one(char *p)
} else if (*p == '$') {
start_at = memparse(p+1, &p);
e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else if (*p == '!') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_PRAM);
} else
e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
@@ -955,9 +952,10 @@ void __init e820_reserve_resources(void)
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
- if (e820.map[i].type != E820_PRAM)
- res->flags |= IORESOURCE_BUSY;
+ if (((e820.map[i].type != E820_RESERVED) &&
+ (e820.map[i].type != E820_PRAM)) ||
+ res->start < (1ULL<<20)) {
+ res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
--
1.9.3
On 04/01/2015 10:12 AM, Christoph Hellwig wrote:
> From: Ross Zwisler <[email protected]>
>
> PMEM is a new driver that presents a reserved range of memory as a
> block device. This is useful for developing with NV-DIMMs, and
> can be used with volatile memory as a development platform.
>
> This patch contains the initial driver from Ross Zwisler, with
> various changes from Boaz Harrosh and me.
>
> Signed-off-by: Ross Zwisler <[email protected]>
> [hch: convert to use a platform_device for discovery, fix partition
> support, merged various patches from Boaz]
> Signed-off-by: Christoph Hellwig <[email protected]>
> Tested-by: Ross Zwisler <[email protected]>
> Acked-by: Dan Williams <[email protected]>
> ---
> MAINTAINERS | 6 ++
> drivers/block/Kconfig | 11 +++
> drivers/block/Makefile | 1 +
> drivers/block/pmem.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 278 insertions(+)
> create mode 100644 drivers/block/pmem.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1de6afa..4517613 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8071,6 +8071,12 @@ S: Maintained
> F: Documentation/blockdev/ramdisk.txt
> F: drivers/block/brd.c
>
> +PERSISTENT MEMORY DRIVER
> +M: Ross Zwisler <[email protected]>
> +L: [email protected]
> +S: Supported
> +F: drivers/block/pmem.c
> +
> RANDOM NUMBER DRIVER
> M: "Theodore Ts'o" <[email protected]>
> S: Maintained
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 1b8094d..34753cf 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -404,6 +404,17 @@ config BLK_DEV_RAM_DAX
> and will prevent RAM block device backing store memory from being
> allocated from highmem (only a problem for highmem systems).
>
> +config BLK_DEV_PMEM
> + tristate "Persistent memory block device support"
> + help
> + Saying Y here will allow you to use a contiguous range of reserved
> + memory as one or more block devices.
> +
> + To compile this driver as a module, choose M here: the module will be
> + called pmem.
> +
> + If unsure, say N.
> +
> config CDROM_PKTCDVD
> tristate "Packet writing on CD/DVD media"
> depends on !UML
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index 02b688d..9cc6c18 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o
> obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
> obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
> obj-$(CONFIG_BLK_DEV_RAM) += brd.o
> +obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
> obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
> obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
> obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> new file mode 100644
> index 0000000..0232ab7
> --- /dev/null
> +++ b/drivers/block/pmem.c
> @@ -0,0 +1,260 @@
> +/*
> + * Persistent Memory Driver
> + * Copyright (c) 2014, Intel Corporation.
> + * Copyright (c) 2015, Christoph Hellwig <[email protected]>.
> + * Copyright (c) 2015, Boaz Harrosh <[email protected]>.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <asm/cacheflush.h>
> +#include <linux/blkdev.h>
> +#include <linux/hdreg.h>
> +#include <linux/init.h>
> +#include <linux/platform_device.h>
> +#include <linux/module.h>
> +#include <linux/moduleparam.h>
> +#include <linux/slab.h>
> +
> +#define PMEM_MINORS 16
> +
> +struct pmem_device {
> + struct request_queue *pmem_queue;
> + struct gendisk *pmem_disk;
> +
> + /* One contiguous memory region per device */
> + phys_addr_t phys_addr;
> + void *virt_addr;
> + size_t size;
> +};
> +
> +static int pmem_major;
> +static atomic_t pmem_index;
> +
> +static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
> + unsigned int len, unsigned int off, int rw,
> + sector_t sector)
> +{
> + void *mem = kmap_atomic(page);
> + size_t pmem_off = sector << 9;
> +
> + if (rw == READ) {
> + memcpy(mem + off, pmem->virt_addr + pmem_off, len);
> + flush_dcache_page(page);
> + } else {
> + flush_dcache_page(page);
> + memcpy(pmem->virt_addr + pmem_off, mem + off, len);
> + }
> +
> + kunmap_atomic(mem);
> +}
> +
> +static void pmem_make_request(struct request_queue *q, struct bio *bio)
> +{
> + struct block_device *bdev = bio->bi_bdev;
> + struct pmem_device *pmem = bdev->bd_disk->private_data;
> + int rw;
> + struct bio_vec bvec;
> + sector_t sector;
> + struct bvec_iter iter;
> + int err = 0;
> +
> + if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
> + err = -EIO;
> + goto out;
> + }
> +
> + BUG_ON(bio->bi_rw & REQ_DISCARD);
> +
> + rw = bio_data_dir(bio);
OK cool I was afraid since you did not like my unlikely patch
that you might have missed this fix.
> + sector = bio->bi_iter.bi_sector;
> + bio_for_each_segment(bvec, bio, iter) {
> + pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
> + rw, sector);
> + sector += bvec.bv_len >> 9;
> + }
> +
> +out:
> + bio_endio(bio, err);
> +}
> +
> +static int pmem_rw_page(struct block_device *bdev, sector_t sector,
> + struct page *page, int rw)
> +{
> + struct pmem_device *pmem = bdev->bd_disk->private_data;
> +
> + pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
> + page_endio(page, rw & WRITE, 0);
You might as well rw & WRITE also for the above pmem_do_bvec call.
If there can be such problem than the rw == READ inside
pmem_do_bvec will fail as well.
> + return 0;
> +}
> +
> +static long pmem_direct_access(struct block_device *bdev, sector_t sector,
> + void **kaddr, unsigned long *pfn, long size)
> +{
> + struct pmem_device *pmem = bdev->bd_disk->private_data;
> + size_t offset = sector << 9;
> +
> + if (!pmem)
> + return -ENODEV;
> +
> + *kaddr = pmem->virt_addr + offset;
> + *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
> +
> + return pmem->size - offset;
> +}
> +
> +static const struct block_device_operations pmem_fops = {
> + .owner = THIS_MODULE,
> + .rw_page = pmem_rw_page,
> + .direct_access = pmem_direct_access,
> +};
> +
> +static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
> +{
> + struct pmem_device *pmem;
> + struct gendisk *disk;
> + int idx, err;
> +
> + err = -ENOMEM;
> + pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
> + if (!pmem)
> + goto out;
> +
> + pmem->phys_addr = res->start;
> + pmem->size = resource_size(res);
> +
> + err = -EINVAL;
> + if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
> + dev_warn(dev, "could not reserve region [0x%llx:0x%zx]\n",
> + pmem->phys_addr, pmem->size);
> + goto out_free_dev;
> + }
> +
> + /*
> + * Map the memory as non-cachable, as we can't write back the contents
> + * of the CPU caches in case of a crash.
> + */
> + err = -ENOMEM;
> + pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
> + if (!pmem->virt_addr)
> + goto out_release_region;
You have removed the pmem_mapmem() helper and embedded all this here.
The patch to add page-structs that I'm posting on top of
this, uses the abstraction to switch between implementations.
I'll add the split-out to the page-structs patch.
Am heavy testing this set (Together with my small fix to e820)
And will report tomorrow if the lab is able to survive the night.
Thanks
Boaz
> +
> + pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
> + if (!pmem->pmem_queue)
> + goto out_unmap;
> +
> + blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
> + blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
> + blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
> +
> + disk = alloc_disk(PMEM_MINORS);
> + if (!disk)
> + goto out_free_queue;
> +
> + idx = atomic_inc_return(&pmem_index) - 1;
> +
> + disk->major = pmem_major;
> + disk->first_minor = PMEM_MINORS * idx;
> + disk->fops = &pmem_fops;
> + disk->private_data = pmem;
> + disk->queue = pmem->pmem_queue;
> + disk->flags = GENHD_FL_EXT_DEVT;
> + sprintf(disk->disk_name, "pmem%d", idx);
> + disk->driverfs_dev = dev;
> + set_capacity(disk, pmem->size >> 9);
> + pmem->pmem_disk = disk;
> +
> + add_disk(disk);
> + return pmem;
> +
> +out_free_queue:
> + blk_cleanup_queue(pmem->pmem_queue);
> +out_unmap:
> + iounmap(pmem->virt_addr);
> +out_release_region:
> + release_mem_region(pmem->phys_addr, pmem->size);
> +out_free_dev:
> + kfree(pmem);
> +out:
> + return ERR_PTR(err);
> +}
> +
> +static void pmem_free(struct pmem_device *pmem)
> +{
> + del_gendisk(pmem->pmem_disk);
> + put_disk(pmem->pmem_disk);
> + blk_cleanup_queue(pmem->pmem_queue);
> + iounmap(pmem->virt_addr);
> + release_mem_region(pmem->phys_addr, pmem->size);
> + kfree(pmem);
> +}
> +
> +static int pmem_probe(struct platform_device *pdev)
> +{
> + struct pmem_device *pmem;
> + struct resource *res;
> +
> + if (WARN_ON(pdev->num_resources > 1))
> + return -ENXIO;
> +
> + res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> + if (!res)
> + return -ENXIO;
> +
> + pmem = pmem_alloc(&pdev->dev, res);
> + if (IS_ERR(pmem))
> + return PTR_ERR(pmem);
> +
> + platform_set_drvdata(pdev, pmem);
> + return 0;
> +}
> +
> +static int pmem_remove(struct platform_device *pdev)
> +{
> + struct pmem_device *pmem = platform_get_drvdata(pdev);
> +
> + pmem_free(pmem);
> + return 0;
> +}
> +
> +static struct platform_driver pmem_driver = {
> + .probe = pmem_probe,
> + .remove = pmem_remove,
> + .driver = {
> + .owner = THIS_MODULE,
> + .name = "pmem",
> + },
> +};
> +
> +static int __init pmem_init(void)
> +{
> + int error;
> +
> + pmem_major = register_blkdev(0, "pmem");
> + if (pmem_major < 0)
> + return pmem_major;
> +
> + error = platform_driver_register(&pmem_driver);
> + if (error)
> + unregister_blkdev(pmem_major, "pmem");
> + return error;
> +}
> +
> +static void pmem_exit(void)
> +{
> + platform_driver_unregister(&pmem_driver);
> + unregister_blkdev(pmem_major, "pmem");
> +}
> +
> +MODULE_AUTHOR("Ross Zwisler <[email protected]>");
> +MODULE_LICENSE("GPL v2");
> +
> +module_init(pmem_init);
> +module_exit(pmem_exit);
>
On Wed, Apr 01, 2015 at 05:25:22PM +0300, Boaz Harrosh wrote:
> pfn = PFN_DOWN(ei->addr + ei->size);
>
> - switch (ei->type) {
> - case E820_RAM:
> - case E820_PRAM:
> - case E820_RESERVED_KERN:
> - break;
> - default:
> + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
> register_nosave_region(PFN_UP(ei->addr), pfn);
> - }
I guess this makes sense - if the content is persistent already we don't need
to save it.
> - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
> - if (e820.map[i].type != E820_PRAM)
> - res->flags |= IORESOURCE_BUSY;
> + if (((e820.map[i].type != E820_RESERVED) &&
> + (e820.map[i].type != E820_PRAM)) ||
> + res->start < (1ULL<<20)) {
So now we also trigger for PRAM regions under 1ULL<<20, was that the
intentional change? Honestly I don't really understand this 1ULL<<20
magic here even for the existing case. Guess this is magic from the
old ISA PC days?
> + res->flags |= IORESOURCE_BUSY;
Guess this is the real change, and I'd love to understand why this
makes a difference for you. IORESOURCE_BUSY is checked almost never,
and is intented to mean it's a driver mapping.
On Wed, Apr 01, 2015 at 06:18:54PM +0300, Boaz Harrosh wrote:
> You might as well rw & WRITE also for the above pmem_do_bvec call.
> If there can be such problem than the rw == READ inside
> pmem_do_bvec will fail as well.
Might be worth to pass a bool to pmem_do_bvec, but we're getting into
serious bikeshed territory here..
* Christoph Hellwig <[email protected]> wrote:
> On Wed, Apr 01, 2015 at 05:25:22PM +0300, Boaz Harrosh wrote:
> > pfn = PFN_DOWN(ei->addr + ei->size);
> >
> > - switch (ei->type) {
> > - case E820_RAM:
> > - case E820_PRAM:
> > - case E820_RESERVED_KERN:
> > - break;
> > - default:
> > + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
> > register_nosave_region(PFN_UP(ei->addr), pfn);
> > - }
>
> I guess this makes sense - if the content is persistent already we don't need
> to save it.
>
> > - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
> > - if (e820.map[i].type != E820_PRAM)
> > - res->flags |= IORESOURCE_BUSY;
> > + if (((e820.map[i].type != E820_RESERVED) &&
> > + (e820.map[i].type != E820_PRAM)) ||
> > + res->start < (1ULL<<20)) {
>
> So now we also trigger for PRAM regions under 1ULL<<20, was that the
> intentional change? Honestly I don't really understand this 1ULL<<20
> magic here even for the existing case. Guess this is magic from the
> old ISA PC days?
>
> > + res->flags |= IORESOURCE_BUSY;
>
> Guess this is the real change, and I'd love to understand why this
> makes a difference for you. IORESOURCE_BUSY is checked almost
> never, and is intented to mean it's a driver mapping.
So assuming this works on your test setup I'm inclined to squash
Boaz's fixes into the original patch, assuming you see no outright bug
in them. Anything else can be done as delta improvements.
Agreed?
Thanks,
Ingo
On Thu, Apr 02, 2015 at 11:37:40AM +0200, Ingo Molnar wrote:
> So assuming this works on your test setup I'm inclined to squash
> Boaz's fixes into the original patch, assuming you see no outright bug
> in them. Anything else can be done as delta improvements.
It looks sensible, but I'd really like to understand the changes a bit
better. In the meantime I'll test them on my setup.
On Thu, Apr 02, 2015 at 11:37:40AM +0200, Ingo Molnar wrote:
> So assuming this works on your test setup
Boaz's changes work fine here.
On 04/02/2015 12:30 PM, Christoph Hellwig wrote:
> On Wed, Apr 01, 2015 at 05:25:22PM +0300, Boaz Harrosh wrote:
>> pfn = PFN_DOWN(ei->addr + ei->size);
>>
>> - switch (ei->type) {
>> - case E820_RAM:
>> - case E820_PRAM:
>> - case E820_RESERVED_KERN:
>> - break;
>> - default:
>> + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
>> register_nosave_region(PFN_UP(ei->addr), pfn);
>> - }
>
> I guess this makes sense - if the content is persistent already we don't need
> to save it.
>
Yes
>> - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
>> - if (e820.map[i].type != E820_PRAM)
>> - res->flags |= IORESOURCE_BUSY;
>> + if (((e820.map[i].type != E820_RESERVED) &&
>> + (e820.map[i].type != E820_PRAM)) ||
>> + res->start < (1ULL<<20)) {
>
> So now we also trigger for PRAM regions under 1ULL<<20, was that the
> intentional change? Honestly I don't really understand this 1ULL<<20
> magic here even for the existing case. Guess this is magic from the
> old ISA PC days?
>
OK so by now I have had a chance to test all cases and figure out
what happened. I was running on your V1 and then V2. And had huge
problems with NUMA. The thing that actually fixed everything and
brought the system back to life, was already in your V3.
It was the remove of reserve_pmem() and the call to memblock_reserve()
and init_memory_mapping() from within. It was making a mess.
So your V3 was already running nice. But I already fixed everything
on my side by the time you sent V3, and what I sent here is the
diff from what I had and your V3.
But these are all good fixes still. (Though not fatal as V2 was)
3 small fixes here:
* Adding back the memmap=! now that everything works perfectly
as expected.
* The one above that fixes register_nosave_region
and ...
>> + res->flags |= IORESOURCE_BUSY;
>
> Guess this is the real change, and I'd love to understand why this
> makes a difference for you. IORESOURCE_BUSY is checked almost never,
> and is intented to mean it's a driver mapping.
This here is a very minor thing. I have lots of experience with this
code and with the internals of insert_resource() from my old e820.c
fixes (Which are BTW still relevant but no longer needed for any
current platform)
So what happens here is just the adding of the IORESOURCE_BUSY flag.
Actually all these resources are already in the resource list and this
code is just changing the flag. So if you are not changing the flag there
is just no point in calling insert_resource(). It will change nothing.
>From what I understand from the history of this file and from prints
that I put in this file and at the resource manager. The logic is that
it wants to make all E820_XXX busy so to not let any driver try and claim them.
And Also the code does not want to allow any kind of e820 resource below the 1M
be allowed for drivers, reserved or otherwise.
Any E820_RESERVED regions it allows for driver use by not setting IORESOURCE_BUSY.
Your code and mine are effectively the same only that I optimize out the call to
insert_resource() since the flags were not changed.
Testing status:
We had some birth problems with the new system and the APIs that changed so the
testing rig broke half through the night. But we have wrinkled out the last
problems and the machines are pumping away full steam, NUMA and everything.
So far it looks good. I will very soon now, Send you a tested-by: That
confirms these patches are just as good as what we had until now out-of-tree.
(I'm running with my page-struct patches on top of your pmem driver. Will publish
trees soon)
Thank you for your patience
Boaz
Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
Gitweb: http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
Author: Christoph Hellwig <[email protected]>
AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
x86/mm: Add support for the non-standard protected e820 type
Various recent BIOSes support NVDIMMs or ADR using a
non-standard e820 memory type, and Intel supplied reference
Linux code using this type to various vendors.
Wire this e820 table type up to export platform devices for the
pmem driver so that we can use it in Linux.
Based on earlier work from:
Dave Jiang <[email protected]>
Dan Williams <[email protected]>
Includes fixes for NUMA regions from Boaz Harrosh.
Tested-by: Ross Zwisler <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Dan Williams <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Boaz Harrosh <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
[ Minor cleanups. ]
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/kernel-parameters.txt | 6 +++++
arch/x86/Kconfig | 10 +++++++
arch/x86/include/uapi/asm/e820.h | 10 +++++++
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/e820.c | 26 +++++++++++++-----
arch/x86/kernel/pmem.c | 53 +++++++++++++++++++++++++++++++++++++
6 files changed, 100 insertions(+), 6 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a6..c87122d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
or
memmap=0x10000$0x18690000
+ memmap=nn[KMG]!ss[KMG]
+ [KNL,X86] Mark specific memory as protected.
+ Region of memory to be used, from ss to ss+nn.
+ The memory region may be marked as e820 type 12 (0xc)
+ and is NVDIMM or ADR memory.
+
memory_corruption_check=0/1 [X86]
Some BIOSes seem to corrupt the first 64k of
memory when doing things like suspend/resume.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca..9e3bcd6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
source "mm/Kconfig"
+config X86_PMEM_LEGACY
+ bool "Support non-standard NVDIMMs and ADR protected memory"
+ help
+ Treat memory marked using the non-standard e820 type of 12 as used
+ by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+ The kernel will offer these regions to the 'pmem' driver so
+ they can be used for persistent storage.
+
+ Say Y if unsure.
+
config HIGHPTE
bool "Allocate 3rd-level pagetables from highmem"
depends on HIGHMEM
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33..960a8a9 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
#define E820_NVS 4
#define E820_UNUSABLE 5
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot. The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ * but newer versions switched to 12 as 6 was assigned differently. Some
+ * time they will learn... )
+ */
+#define E820_PRAM 12
/*
* reserved RAM used by kernel itself
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70..971f18c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201de..11cc7d5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
case E820_UNUSABLE:
printk(KERN_CONT "unusable");
break;
+ case E820_PRAM:
+ printk(KERN_CONT "persistent (type %u)", type);
+ break;
default:
printk(KERN_CONT "type %u", type);
break;
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
* continue building up new bios map based on this
* information
*/
- if (current_type != last_type) {
+ if (current_type != last_type || current_type == E820_PRAM) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
register_nosave_region(pfn, PFN_UP(ei->addr));
pfn = PFN_DOWN(ei->addr + ei->size);
+
if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
register_nosave_region(PFN_UP(ei->addr), pfn);
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
unsigned long start_pfn;
unsigned long end_pfn;
- if (ei->type != type)
+ /*
+ * Persistent memory is accounted as ram for purposes of
+ * establishing max_pfn and mem_map.
+ */
+ if (ei->type != E820_RAM && ei->type != E820_PRAM)
continue;
start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
}
unsigned long __init e820_end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+ return e820_end_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820_end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+ return e820_end_pfn(1UL << (32-PAGE_SHIFT));
}
static void early_panic(char *msg)
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
} else if (*p == '$') {
start_at = memparse(p+1, &p);
e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else if (*p == '!') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_PRAM);
} else
e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
case E820_UNUSABLE: return "Unusable memory";
+ case E820_PRAM: return "Persistent RAM";
default: return "reserved";
}
}
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+ if (((e820.map[i].type != E820_RESERVED) &&
+ (e820.map[i].type != E820_PRAM)) ||
+ res->start < (1ULL<<20)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 0000000..3420c87
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+ struct platform_device *pdev;
+ int error;
+
+ pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+ if (!pdev)
+ return;
+
+ error = platform_device_add_resources(pdev, res, 1);
+ if (error)
+ goto out_put_pdev;
+
+ error = platform_device_add(pdev);
+ if (error)
+ goto out_put_pdev;
+ return;
+
+out_put_pdev:
+ dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
+ platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->type == E820_PRAM) {
+ struct resource res = {
+ .flags = IORESOURCE_MEM,
+ .start = ei->addr,
+ .end = ei->addr + ei->size - 1,
+ };
+ register_pmem_device(&res);
+ }
+ }
+
+ return 0;
+}
+device_initcall(register_pmem_devices);
Commit-ID: 9e853f2313e5eb163cb1ea461b23c2332cf6438a
Gitweb: http://git.kernel.org/tip/9e853f2313e5eb163cb1ea461b23c2332cf6438a
Author: Ross Zwisler <[email protected]>
AuthorDate: Wed, 1 Apr 2015 09:12:19 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 1 Apr 2015 17:03:56 +0200
drivers/block/pmem: Add a driver for persistent memory
PMEM is a new driver that presents a reserved range of memory as
a block device. This is useful for developing with NV-DIMMs,
and can be used with volatile memory as a development platform.
This patch contains the initial driver from Ross Zwisler, with
various changes: converted it to use a platform_device for
discovery, fixed partition support and merged various patches
from Boaz Harrosh.
Tested-by: Ross Zwisler <[email protected]>
Signed-off-by: Ross Zwisler <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Dan Williams <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Boaz Harrosh <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
[ Minor cleanups. ]
Signed-off-by: Ingo Molnar <[email protected]>
---
MAINTAINERS | 6 ++
drivers/block/Kconfig | 11 +++
drivers/block/Makefile | 1 +
drivers/block/pmem.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 281 insertions(+)
diff --git a/MAINTAINERS b/MAINTAINERS
index 1de6afa..4517613 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8071,6 +8071,12 @@ S: Maintained
F: Documentation/blockdev/ramdisk.txt
F: drivers/block/brd.c
+PERSISTENT MEMORY DRIVER
+M: Ross Zwisler <[email protected]>
+L: [email protected]
+S: Supported
+F: drivers/block/pmem.c
+
RANDOM NUMBER DRIVER
M: "Theodore Ts'o" <[email protected]>
S: Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1b8094d..eb1fed5 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -404,6 +404,17 @@ config BLK_DEV_RAM_DAX
and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
+config BLK_DEV_PMEM
+ tristate "Persistent memory block device support"
+ help
+ Saying Y here will allow you to use a contiguous range of reserved
+ memory as one or more persistent block devices.
+
+ To compile this driver as a module, choose M here: the module will be
+ called 'pmem'.
+
+ If unsure, say N.
+
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media"
depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..9cc6c18 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o
obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
+obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 0000000..988f384
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,263 @@
+/*
+ * Persistent Memory Driver
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <[email protected]>.
+ * Copyright (c) 2015, Boaz Harrosh <[email protected]>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define PMEM_MINORS 16
+
+struct pmem_device {
+ struct request_queue *pmem_queue;
+ struct gendisk *pmem_disk;
+
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ void *virt_addr;
+ size_t size;
+};
+
+static int pmem_major;
+static atomic_t pmem_index;
+
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+ unsigned int len, unsigned int off, int rw,
+ sector_t sector)
+{
+ void *mem = kmap_atomic(page);
+ size_t pmem_off = sector << 9;
+
+ if (rw == READ) {
+ memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+ }
+
+ kunmap_atomic(mem);
+}
+
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ int rw;
+ struct bio_vec bvec;
+ sector_t sector;
+ struct bvec_iter iter;
+ int err = 0;
+
+ if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
+ err = -EIO;
+ goto out;
+ }
+
+ BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+ rw = bio_data_dir(bio);
+ sector = bio->bi_iter.bi_sector;
+ bio_for_each_segment(bvec, bio, iter) {
+ pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
+ rw, sector);
+ sector += bvec.bv_len >> 9;
+ }
+
+out:
+ bio_endio(bio, err);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+ pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ page_endio(page, rw & WRITE, 0);
+
+ return 0;
+}
+
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ size_t offset = sector << 9;
+
+ if (!pmem)
+ return -ENODEV;
+
+ *kaddr = pmem->virt_addr + offset;
+ *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+
+ return pmem->size - offset;
+}
+
+static const struct block_device_operations pmem_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = pmem_rw_page,
+ .direct_access = pmem_direct_access,
+};
+
+static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
+{
+ struct pmem_device *pmem;
+ struct gendisk *disk;
+ int idx, err;
+
+ err = -ENOMEM;
+ pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+ if (!pmem)
+ goto out;
+
+ pmem->phys_addr = res->start;
+ pmem->size = resource_size(res);
+
+ err = -EINVAL;
+ if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
+ dev_warn(dev, "could not reserve region [0x%llx:0x%zx]\n",
+ pmem->phys_addr, pmem->size);
+ goto out_free_dev;
+ }
+
+ /*
+ * Map the memory as non-cachable, as we can't write back the contents
+ * of the CPU caches in case of a crash.
+ */
+ err = -ENOMEM;
+ pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+ if (!pmem->virt_addr)
+ goto out_release_region;
+
+ pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!pmem->pmem_queue)
+ goto out_unmap;
+
+ blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+ blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
+ blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+
+ disk = alloc_disk(PMEM_MINORS);
+ if (!disk)
+ goto out_free_queue;
+
+ idx = atomic_inc_return(&pmem_index) - 1;
+
+ disk->major = pmem_major;
+ disk->first_minor = PMEM_MINORS * idx;
+ disk->fops = &pmem_fops;
+ disk->private_data = pmem;
+ disk->queue = pmem->pmem_queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(disk->disk_name, "pmem%d", idx);
+ disk->driverfs_dev = dev;
+ set_capacity(disk, pmem->size >> 9);
+ pmem->pmem_disk = disk;
+
+ add_disk(disk);
+
+ return pmem;
+
+out_free_queue:
+ blk_cleanup_queue(pmem->pmem_queue);
+out_unmap:
+ iounmap(pmem->virt_addr);
+out_release_region:
+ release_mem_region(pmem->phys_addr, pmem->size);
+out_free_dev:
+ kfree(pmem);
+out:
+ return ERR_PTR(err);
+}
+
+static void pmem_free(struct pmem_device *pmem)
+{
+ del_gendisk(pmem->pmem_disk);
+ put_disk(pmem->pmem_disk);
+ blk_cleanup_queue(pmem->pmem_queue);
+ iounmap(pmem->virt_addr);
+ release_mem_region(pmem->phys_addr, pmem->size);
+ kfree(pmem);
+}
+
+static int pmem_probe(struct platform_device *pdev)
+{
+ struct pmem_device *pmem;
+ struct resource *res;
+
+ if (WARN_ON(pdev->num_resources > 1))
+ return -ENXIO;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENXIO;
+
+ pmem = pmem_alloc(&pdev->dev, res);
+ if (IS_ERR(pmem))
+ return PTR_ERR(pmem);
+
+ platform_set_drvdata(pdev, pmem);
+
+ return 0;
+}
+
+static int pmem_remove(struct platform_device *pdev)
+{
+ struct pmem_device *pmem = platform_get_drvdata(pdev);
+
+ pmem_free(pmem);
+ return 0;
+}
+
+static struct platform_driver pmem_driver = {
+ .probe = pmem_probe,
+ .remove = pmem_remove,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "pmem",
+ },
+};
+
+static int __init pmem_init(void)
+{
+ int error;
+
+ pmem_major = register_blkdev(0, "pmem");
+ if (pmem_major < 0)
+ return pmem_major;
+
+ error = platform_driver_register(&pmem_driver);
+ if (error)
+ unregister_blkdev(pmem_major, "pmem");
+ return error;
+}
+module_init(pmem_init);
+
+static void pmem_exit(void)
+{
+ platform_driver_unregister(&pmem_driver);
+ unregister_blkdev(pmem_major, "pmem");
+}
+module_exit(pmem_exit);
+
+MODULE_AUTHOR("Ross Zwisler <[email protected]>");
+MODULE_LICENSE("GPL v2");
Hi Christoph, Ingo
Please consider this small patch below just a small print at module
load/unload so to know at user systems how things progressed.
As it is now, we know nothing. For any other disk kind we have two
tuns of prints.
---
From: Boaz Harrosh <[email protected]>
Date: Thu, 2 Apr 2015 16:43:48 +0300
Subject: [PATCH] pmem: Add prints at module load and unload
When debugging people's systems it is helpful
to see what went on. The load and unload of pmem is
an important event.
The importance is the number of loaded devices and
error status. The exact device's addresses created
we can see from the other prints at e820 so no need
to duplicate this information.
While at it fix up a small issue with rw flags.
Signed-off-by: Boaz Harrosh <[email protected]>
---
drivers/block/pmem.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 988f384..3f15fbc 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
{
struct pmem_device *pmem = bdev->bd_disk->private_data;
+ rw &= WRITE;
pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
- page_endio(page, rw & WRITE, 0);
+ page_endio(page, rw, 0);
return 0;
}
@@ -248,6 +249,9 @@ static int __init pmem_init(void)
error = platform_driver_register(&pmem_driver);
if (error)
unregister_blkdev(pmem_major, "pmem");
+
+ pr_info("pmem: init %d devices => %d\n",
+ atomic_read(&pmem_index), error);
return error;
}
module_init(pmem_init);
@@ -256,6 +260,7 @@ static void pmem_exit(void)
{
platform_driver_unregister(&pmem_driver);
unregister_blkdev(pmem_major, "pmem");
+ pr_info("pmem: exit\n");
}
module_exit(pmem_exit);
--
1.9.3
On Thu, Apr 2, 2015 at 8:31 AM, Boaz Harrosh <[email protected]> wrote:
> Hi Christoph, Ingo
>
> Please consider this small patch below just a small print at module
> load/unload so to know at user systems how things progressed.
> As it is now, we know nothing. For any other disk kind we have two
> tuns of prints.
>
> ---
> From: Boaz Harrosh <[email protected]>
> Date: Thu, 2 Apr 2015 16:43:48 +0300
> Subject: [PATCH] pmem: Add prints at module load and unload
>
> When debugging people's systems it is helpful
> to see what went on. The load and unload of pmem is
> an important event.
>
> The importance is the number of loaded devices and
> error status. The exact device's addresses created
> we can see from the other prints at e820 so no need
> to duplicate this information.
>
> While at it fix up a small issue with rw flags.
Separate patch for fixes please.
>
> Signed-off-by: Boaz Harrosh <[email protected]>
> ---
> drivers/block/pmem.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
> index 988f384..3f15fbc 100644
> --- a/drivers/block/pmem.c
> +++ b/drivers/block/pmem.c
> @@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
> {
> struct pmem_device *pmem = bdev->bd_disk->private_data;
>
> + rw &= WRITE;
> pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
> - page_endio(page, rw & WRITE, 0);
> + page_endio(page, rw, 0);
>
> return 0;
> }
> @@ -248,6 +249,9 @@ static int __init pmem_init(void)
> error = platform_driver_register(&pmem_driver);
> if (error)
> unregister_blkdev(pmem_major, "pmem");
> +
> + pr_info("pmem: init %d devices => %d\n",
> + atomic_read(&pmem_index), error);
If anything I think these should be dev_dbg().
On 04/02/2015 06:39 PM, Dan Williams wrote:
> On Thu, Apr 2, 2015 at 8:31 AM, Boaz Harrosh <[email protected]> wrote:
>> Hi Christoph, Ingo
>>
>> Please consider this small patch below just a small print at module
>> load/unload so to know at user systems how things progressed.
>> As it is now, we know nothing. For any other disk kind we have two
>> tuns of prints.
>>
>> ---
>> From: Boaz Harrosh <[email protected]>
>> Date: Thu, 2 Apr 2015 16:43:48 +0300
>> Subject: [PATCH] pmem: Add prints at module load and unload
>>
>> When debugging people's systems it is helpful
>> to see what went on. The load and unload of pmem is
>> an important event.
>>
>> The importance is the number of loaded devices and
>> error status. The exact device's addresses created
>> we can see from the other prints at e820 so no need
>> to duplicate this information.
>>
>> While at it fix up a small issue with rw flags.
>
> Separate patch for fixes please.
Sigh, OK I was preparing this and hopping it would just
be SQUASHED into the original patch but Ingo bit me to it
and already submitted the patches.
>
>>
>> Signed-off-by: Boaz Harrosh <[email protected]>
>> ---
>> drivers/block/pmem.c | 7 ++++++-
>> 1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
>> index 988f384..3f15fbc 100644
>> --- a/drivers/block/pmem.c
>> +++ b/drivers/block/pmem.c
>> @@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>> {
>> struct pmem_device *pmem = bdev->bd_disk->private_data;
>>
>> + rw &= WRITE;
>> pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
>> - page_endio(page, rw & WRITE, 0);
>> + page_endio(page, rw, 0);
>>
>> return 0;
>> }
>> @@ -248,6 +249,9 @@ static int __init pmem_init(void)
>> error = platform_driver_register(&pmem_driver);
>> if (error)
>> unregister_blkdev(pmem_major, "pmem");
>> +
>> + pr_info("pmem: init %d devices => %d\n",
>> + atomic_read(&pmem_index), error);
>
> If anything I think these should be dev_dbg().
We do not have a dev at any of this point, and it does not
belong to any specific device. Also I would like this
_info and not _dbg so to always have it, also for production.
See the chatter for a single SCSI disk the minimum we need
is just the small print that tells all that we need (for now)
Please consider as is?
Thanks
Boaz
On Thu, Apr 2, 2015 at 8:47 AM, Boaz Harrosh <[email protected]> wrote:
> On 04/02/2015 06:39 PM, Dan Williams wrote:
>> On Thu, Apr 2, 2015 at 8:31 AM, Boaz Harrosh <[email protected]> wrote:
>>> Hi Christoph, Ingo
>>>
>>> Please consider this small patch below just a small print at module
>>> load/unload so to know at user systems how things progressed.
>>> As it is now, we know nothing. For any other disk kind we have two
>>> tuns of prints.
>>>
>>> ---
>>> From: Boaz Harrosh <[email protected]>
>>> Date: Thu, 2 Apr 2015 16:43:48 +0300
>>> Subject: [PATCH] pmem: Add prints at module load and unload
>>>
>>> When debugging people's systems it is helpful
>>> to see what went on. The load and unload of pmem is
>>> an important event.
>>>
>>> The importance is the number of loaded devices and
>>> error status. The exact device's addresses created
>>> we can see from the other prints at e820 so no need
>>> to duplicate this information.
>>>
>>> While at it fix up a small issue with rw flags.
>>
>> Separate patch for fixes please.
>
> Sigh, OK I was preparing this and hopping it would just
> be SQUASHED into the original patch but Ingo bit me to it
> and already submitted the patches.
>
>>
>>>
>>> Signed-off-by: Boaz Harrosh <[email protected]>
>>> ---
>>> drivers/block/pmem.c | 7 ++++++-
>>> 1 file changed, 6 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
>>> index 988f384..3f15fbc 100644
>>> --- a/drivers/block/pmem.c
>>> +++ b/drivers/block/pmem.c
>>> @@ -91,8 +91,9 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>>> {
>>> struct pmem_device *pmem = bdev->bd_disk->private_data;
>>>
>>> + rw &= WRITE;
>>> pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
>>> - page_endio(page, rw & WRITE, 0);
>>> + page_endio(page, rw, 0);
>>>
>>> return 0;
>>> }
>>> @@ -248,6 +249,9 @@ static int __init pmem_init(void)
>>> error = platform_driver_register(&pmem_driver);
>>> if (error)
>>> unregister_blkdev(pmem_major, "pmem");
>>> +
>>> + pr_info("pmem: init %d devices => %d\n",
>>> + atomic_read(&pmem_index), error);
>>
>> If anything I think these should be dev_dbg().
>
> We do not have a dev at any of this point, and it does not
> belong to any specific device.
Ah, true this is prior to the driver attaching... that said it seems
more relevant to print from probe() (where we do have a device) than
init where the device may remain idle due to some other policy.
> Also I would like this
> _info and not _dbg so to always have it, also for production.
> See the chatter for a single SCSI disk the minimum we need
> is just the small print that tells all that we need (for now)
Not sure we want to follow so closely in the footsteps of SCSI's chattiness.
On Thu, Apr 02, 2015 at 09:01:14AM -0700, Dan Williams wrote:
> >> If anything I think these should be dev_dbg().
> >
> > We do not have a dev at any of this point, and it does not
> > belong to any specific device.
>
> Ah, true this is prior to the driver attaching... that said it seems
> more relevant to print from probe() (where we do have a device) than
> init where the device may remain idle due to some other policy.
>
> > Also I would like this
> > _info and not _dbg so to always have it, also for production.
> > See the chatter for a single SCSI disk the minimum we need
> > is just the small print that tells all that we need (for now)
>
> Not sure we want to follow so closely in the footsteps of SCSI's chattiness.
Defintively not! A single dev_info at ->probe time sounds ok, something
like:
dev_info(dev, "registering region [0x%pa:0x%zx]\n",
&pmem->phys_addr, pmem->size);
but there are plenty other drivers not that chatty, e.g. virtio and we're
doing just fine for them.
On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
<[email protected]> wrote:
> Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Gitweb: http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Author: Christoph Hellwig <[email protected]>
> AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>
> x86/mm: Add support for the non-standard protected e820 type
>
> Various recent BIOSes support NVDIMMs or ADR using a
> non-standard e820 memory type, and Intel supplied reference
> Linux code using this type to various vendors.
>
> Wire this e820 table type up to export platform devices for the
> pmem driver so that we can use it in Linux.
This scares me a bit. Do we know that the upcoming ACPI 6.0
enumeration mechanism *won't* use e820 type 12? If it will, what
stops a non-legacy device from being incorrectly claimed as a legacy
device?
--Andy
>
> Based on earlier work from:
>
> Dave Jiang <[email protected]>
> Dan Williams <[email protected]>
>
> Includes fixes for NUMA regions from Boaz Harrosh.
>
> Tested-by: Ross Zwisler <[email protected]>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Dan Williams <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Boaz Harrosh <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Keith Busch <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: [email protected]
> Link: http://lkml.kernel.org/r/[email protected]
> [ Minor cleanups. ]
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> Documentation/kernel-parameters.txt | 6 +++++
> arch/x86/Kconfig | 10 +++++++
> arch/x86/include/uapi/asm/e820.h | 10 +++++++
> arch/x86/kernel/Makefile | 1 +
> arch/x86/kernel/e820.c | 26 +++++++++++++-----
> arch/x86/kernel/pmem.c | 53 +++++++++++++++++++++++++++++++++++++
> 6 files changed, 100 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index bfcb1a6..c87122d 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
> or
> memmap=0x10000$0x18690000
>
> + memmap=nn[KMG]!ss[KMG]
> + [KNL,X86] Mark specific memory as protected.
> + Region of memory to be used, from ss to ss+nn.
> + The memory region may be marked as e820 type 12 (0xc)
> + and is NVDIMM or ADR memory.
> +
> memory_corruption_check=0/1 [X86]
> Some BIOSes seem to corrupt the first 64k of
> memory when doing things like suspend/resume.
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index b7d31ca..9e3bcd6 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
>
> source "mm/Kconfig"
>
> +config X86_PMEM_LEGACY
> + bool "Support non-standard NVDIMMs and ADR protected memory"
> + help
> + Treat memory marked using the non-standard e820 type of 12 as used
> + by the Intel Sandy Bridge-EP reference BIOS as protected memory.
> + The kernel will offer these regions to the 'pmem' driver so
> + they can be used for persistent storage.
> +
> + Say Y if unsure.
> +
> config HIGHPTE
> bool "Allocate 3rd-level pagetables from highmem"
> depends on HIGHMEM
> diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
> index d993e33..960a8a9 100644
> --- a/arch/x86/include/uapi/asm/e820.h
> +++ b/arch/x86/include/uapi/asm/e820.h
> @@ -33,6 +33,16 @@
> #define E820_NVS 4
> #define E820_UNUSABLE 5
>
> +/*
> + * This is a non-standardized way to represent ADR or NVDIMM regions that
> + * persist over a reboot. The kernel will ignore their special capabilities
> + * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
> + *
> + * ( Note that older platforms also used 6 for the same type of memory,
> + * but newer versions switched to 12 as 6 was assigned differently. Some
> + * time they will learn... )
> + */
> +#define E820_PRAM 12
>
> /*
> * reserved RAM used by kernel itself
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index cdb1b70..971f18c 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
> obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
> obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
> obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
> +obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
>
> obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
>
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 46201de..11cc7d5 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
> case E820_UNUSABLE:
> printk(KERN_CONT "unusable");
> break;
> + case E820_PRAM:
> + printk(KERN_CONT "persistent (type %u)", type);
> + break;
> default:
> printk(KERN_CONT "type %u", type);
> break;
> @@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
> * continue building up new bios map based on this
> * information
> */
> - if (current_type != last_type) {
> + if (current_type != last_type || current_type == E820_PRAM) {
> if (last_type != 0) {
> new_bios[new_bios_entry].size =
> change_point[chgidx]->addr - last_addr;
> @@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
> register_nosave_region(pfn, PFN_UP(ei->addr));
>
> pfn = PFN_DOWN(ei->addr + ei->size);
> +
> if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
> register_nosave_region(PFN_UP(ei->addr), pfn);
>
> @@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> /*
> * Find the highest page frame number we have available
> */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> {
> int i;
> unsigned long last_pfn = 0;
> @@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> unsigned long start_pfn;
> unsigned long end_pfn;
>
> - if (ei->type != type)
> + /*
> + * Persistent memory is accounted as ram for purposes of
> + * establishing max_pfn and mem_map.
> + */
> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> continue;
>
> start_pfn = ei->addr >> PAGE_SHIFT;
> @@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> }
> unsigned long __init e820_end_of_ram_pfn(void)
> {
> - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
> + return e820_end_pfn(MAX_ARCH_PFN);
> }
>
> unsigned long __init e820_end_of_low_ram_pfn(void)
> {
> - return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
> + return e820_end_pfn(1UL << (32-PAGE_SHIFT));
> }
>
> static void early_panic(char *msg)
> @@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
> } else if (*p == '$') {
> start_at = memparse(p+1, &p);
> e820_add_region(start_at, mem_size, E820_RESERVED);
> + } else if (*p == '!') {
> + start_at = memparse(p+1, &p);
> + e820_add_region(start_at, mem_size, E820_PRAM);
> } else
> e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
>
> @@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
> case E820_ACPI: return "ACPI Tables";
> case E820_NVS: return "ACPI Non-volatile Storage";
> case E820_UNUSABLE: return "Unusable memory";
> + case E820_PRAM: return "Persistent RAM";
> default: return "reserved";
> }
> }
> @@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
> * pci device BAR resource and insert them later in
> * pcibios_resource_survey()
> */
> - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
> + if (((e820.map[i].type != E820_RESERVED) &&
> + (e820.map[i].type != E820_PRAM)) ||
> + res->start < (1ULL<<20)) {
> res->flags |= IORESOURCE_BUSY;
> insert_resource(&iomem_resource, res);
> }
> diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
> new file mode 100644
> index 0000000..3420c87
> --- /dev/null
> +++ b/arch/x86/kernel/pmem.c
> @@ -0,0 +1,53 @@
> +/*
> + * Copyright (c) 2015, Christoph Hellwig.
> + */
> +#include <linux/memblock.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +#include <asm/e820.h>
> +#include <asm/page_types.h>
> +#include <asm/setup.h>
> +
> +static __init void register_pmem_device(struct resource *res)
> +{
> + struct platform_device *pdev;
> + int error;
> +
> + pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
> + if (!pdev)
> + return;
> +
> + error = platform_device_add_resources(pdev, res, 1);
> + if (error)
> + goto out_put_pdev;
> +
> + error = platform_device_add(pdev);
> + if (error)
> + goto out_put_pdev;
> + return;
> +
> +out_put_pdev:
> + dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
> + platform_device_put(pdev);
> +}
> +
> +static __init int register_pmem_devices(void)
> +{
> + int i;
> +
> + for (i = 0; i < e820.nr_map; i++) {
> + struct e820entry *ei = &e820.map[i];
> +
> + if (ei->type == E820_PRAM) {
> + struct resource res = {
> + .flags = IORESOURCE_MEM,
> + .start = ei->addr,
> + .end = ei->addr + ei->size - 1,
> + };
> + register_pmem_device(&res);
> + }
> + }
> +
> + return 0;
> +}
> +device_initcall(register_pmem_devices);
--
Andy Lutomirski
AMA Capital Management, LLC
* Andy Lutomirski <[email protected]> wrote:
> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
> <[email protected]> wrote:
> > Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> > Gitweb: http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> > Author: Christoph Hellwig <[email protected]>
> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> > Committer: Ingo Molnar <[email protected]>
> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
> >
> > x86/mm: Add support for the non-standard protected e820 type
> >
> > Various recent BIOSes support NVDIMMs or ADR using a
> > non-standard e820 memory type, and Intel supplied reference
> > Linux code using this type to various vendors.
> >
> > Wire this e820 table type up to export platform devices for the
> > pmem driver so that we can use it in Linux.
>
> This scares me a bit. Do we know that the upcoming ACPI 6.0
> enumeration mechanism *won't* use e820 type 12? [...]
So I know nothing about it, but I'd be surprised if e820 was touched
at all, as e820 isn't really well suited to enumerate more complex
resources, and it appears pmem wants to grow into complex directions?
Thanks,
Ingo
On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <[email protected]> wrote:
>
> * Andy Lutomirski <[email protected]> wrote:
>
>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
>> <[email protected]> wrote:
>> > Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> > Gitweb: http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> > Author: Christoph Hellwig <[email protected]>
>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
>> > Committer: Ingo Molnar <[email protected]>
>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>> >
>> > x86/mm: Add support for the non-standard protected e820 type
>> >
>> > Various recent BIOSes support NVDIMMs or ADR using a
>> > non-standard e820 memory type, and Intel supplied reference
>> > Linux code using this type to various vendors.
>> >
>> > Wire this e820 table type up to export platform devices for the
>> > pmem driver so that we can use it in Linux.
>>
>> This scares me a bit. Do we know that the upcoming ACPI 6.0
>> enumeration mechanism *won't* use e820 type 12? [...]
>
> So I know nothing about it, but I'd be surprised if e820 was touched
> at all, as e820 isn't really well suited to enumerate more complex
> resources, and it appears pmem wants to grow into complex directions?
I hope so, but I have no idea what the ACPI committee's schemes are.
We could require pmem.enable_legacy_e820=Y to load the driver for now
if we're concerned about it.
--Andy
>
> Thanks,
>
> Ingo
--
Andy Lutomirski
AMA Capital Management, LLC
On Wed, Apr 1, 2015 at 12:12 AM, Christoph Hellwig <[email protected]> wrote:
> Various recent BIOSes support NVDIMMs or ADR using a non-standard
> e820 memory type, and Intel supplied reference Linux code using this
> type to various vendors.
>
> Wire this e820 table type up to export platform devices for the pmem
> driver so that we can use it in Linux.
>
> Based on arlier work from Dave Jiang <[email protected]> and
> Dan Williams <[email protected]>.
>
...
> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> /*
> * Find the highest page frame number we have available
> */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> {
> int i;
> unsigned long last_pfn = 0;
> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> unsigned long start_pfn;
> unsigned long end_pfn;
>
> - if (ei->type != type)
> + /*
> + * Persistent memory is accounted as ram for purposes of
> + * establishing max_pfn and mem_map.
> + */
> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> continue;
>
> start_pfn = ei->addr >> PAGE_SHIFT;
> @@ -784,12 +798,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> }
> unsigned long __init e820_end_of_ram_pfn(void)
> {
> - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
> + return e820_end_pfn(MAX_ARCH_PFN);
> }
>
> unsigned long __init e820_end_of_low_ram_pfn(void)
> {
> - return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
> + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT));
> }
You are using iomap_nocache to access pmem.
Do you still need to account E820_PRAM to get those end_of_ram ?
You should not need that to help to set kernel mapping.
So please drop those not needed changes.
Thanks
Yinghai
On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
<[email protected]> wrote:
> Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Gitweb: http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> Author: Christoph Hellwig <[email protected]>
> AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> Committer: Ingo Molnar <[email protected]>
> CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>
> x86/mm: Add support for the non-standard protected e820 type
..
> @@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
> register_nosave_region(pfn, PFN_UP(ei->addr));
>
> pfn = PFN_DOWN(ei->addr + ei->size);
> +
> if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
> register_nosave_region(PFN_UP(ei->addr), pfn);
>
related?
> @@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> /*
> * Find the highest page frame number we have available
> */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> {
> int i;
> unsigned long last_pfn = 0;
> @@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> unsigned long start_pfn;
> unsigned long end_pfn;
>
> - if (ei->type != type)
> + /*
> + * Persistent memory is accounted as ram for purposes of
> + * establishing max_pfn and mem_map.
> + */
> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> continue;
>
> start_pfn = ei->addr >> PAGE_SHIFT;
> @@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> }
> unsigned long __init e820_end_of_ram_pfn(void)
> {
> - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
> + return e820_end_pfn(MAX_ARCH_PFN);
> }
>
> unsigned long __init e820_end_of_low_ram_pfn(void)
> {
> - return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
> + return e820_end_pfn(1UL << (32-PAGE_SHIFT));
> }
>
> static void early_panic(char *msg)
>
those eno_of_ram changes are not needed, as it will not used for
kernel mapping setup.
Thanks
Yinghai
On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
:
> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> /*
> * Find the highest page frame number we have available
> */
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> {
> int i;
> unsigned long last_pfn = 0;
> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> unsigned long start_pfn;
> unsigned long end_pfn;
>
> - if (ei->type != type)
> + /*
> + * Persistent memory is accounted as ram for purposes of
> + * establishing max_pfn and mem_map.
> + */
> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> continue;
Should we also delete this code, accounting E820_PRAM as ram, along with
the deletion of reserve_pmem() in this version?
Thanks,
-Toshi
On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
> On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> :
>> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>> /*
>> * Find the highest page frame number we have available
>> */
>> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>> {
>> int i;
>> unsigned long last_pfn = 0;
>> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>> unsigned long start_pfn;
>> unsigned long end_pfn;
>>
>> - if (ei->type != type)
>> + /*
>> + * Persistent memory is accounted as ram for purposes of
>> + * establishing max_pfn and mem_map.
>> + */
>> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
>> continue;
>
> Should we also delete this code, accounting E820_PRAM as ram, along with
> the deletion of reserve_pmem() in this version?
should revert those end_of_ram change as attached.
On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
> > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> > :
> >> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> >> /*
> >> * Find the highest page frame number we have available
> >> */
> >> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> >> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> >> {
> >> int i;
> >> unsigned long last_pfn = 0;
> >> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> >> unsigned long start_pfn;
> >> unsigned long end_pfn;
> >>
> >> - if (ei->type != type)
> >> + /*
> >> + * Persistent memory is accounted as ram for purposes of
> >> + * establishing max_pfn and mem_map.
> >> + */
> >> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> >> continue;
> >
> > Should we also delete this code, accounting E820_PRAM as ram, along with
> > the deletion of reserve_pmem() in this version?
>
> should revert those end_of_ram change as attached.
I confirmed that the pmem driver works with the change, and last_pfn is
updated as expected.
Thanks,
-Toshi
* Toshi Kani <[email protected]> wrote:
> On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
> > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> > > :
> > >> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> > >> /*
> > >> * Find the highest page frame number we have available
> > >> */
> > >> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > >> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> > >> {
> > >> int i;
> > >> unsigned long last_pfn = 0;
> > >> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > >> unsigned long start_pfn;
> > >> unsigned long end_pfn;
> > >>
> > >> - if (ei->type != type)
> > >> + /*
> > >> + * Persistent memory is accounted as ram for purposes of
> > >> + * establishing max_pfn and mem_map.
> > >> + */
> > >> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> > >> continue;
> > >
> > > Should we also delete this code, accounting E820_PRAM as ram, along with
> > > the deletion of reserve_pmem() in this version?
> >
> > should revert those end_of_ram change as attached.
>
> I confirmed that the pmem driver works with the change, and last_pfn is
> updated as expected.
Could someone please send the fix with a changelog, etc?
Thanks,
Ingo
On Sat, Apr 4, 2015 at 2:40 AM, Ingo Molnar <[email protected]> wrote:
>
> * Toshi Kani <[email protected]> wrote:
>
>> On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
>> > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
>> > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>> >
>> > should revert those end_of_ram change as attached.
>>
>> I confirmed that the pmem driver works with the change, and last_pfn is
>> updated as expected.
>
> Could someone please send the fix with a changelog, etc?
>
Why just fold those change into that commit.
Or you can just drop the patch and ask Christoph to resubmit updated
patch again.
I asked Christoph to remove reserved_pmem, and he agreed to do that
http://lkml.iu.edu/hypermail/linux/kernel/1503.3/02919.html
http://lkml.iu.edu/hypermail/linux/kernel/1503.3/03119.html
but sadly, he did not put me on the CC list for while sending updated patch.
and next day you picked it up to tip/pmem branch.
otherwise we could find the problem early
and he even did not put my name on the changelog :-(
with that, I could find the email early too..
Yinghai
On 04/02/2015 07:44 PM, Christoph Hellwig wrote:
> On Thu, Apr 02, 2015 at 09:01:14AM -0700, Dan Williams wrote:
>>>> If anything I think these should be dev_dbg().
>>>
>>> We do not have a dev at any of this point, and it does not
>>> belong to any specific device.
>>
>> Ah, true this is prior to the driver attaching... that said it seems
>> more relevant to print from probe() (where we do have a device) than
>> init where the device may remain idle due to some other policy.
>>
>>> Also I would like this
>>> _info and not _dbg so to always have it, also for production.
>>> See the chatter for a single SCSI disk the minimum we need
>>> is just the small print that tells all that we need (for now)
>>
>> Not sure we want to follow so closely in the footsteps of SCSI's chattiness.
>
> Defintively not! A single dev_info at ->probe time sounds ok, something
> like:
>
> dev_info(dev, "registering region [0x%pa:0x%zx]\n",
> &pmem->phys_addr, pmem->size);
>
> but there are plenty other drivers not that chatty, e.g. virtio and we're
> doing just fine for them.
>
For me, my print is just the exact amount.
So I will see in my dmesg:
[ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
...
[ +0.000537] pmem: init 2 devices => 0
So I have all the information. And I know the driver was actually loaded
successfully on the expected two regions.
Your print above will give me two prints with information I already have.
All I want to know from users logs is that the driver was actually loaded
successful or not. And when it was unloaded. I think this is the bear minimum
that gives me all the information I need. Less then this I would be missing
a very important event.
Thanks
Boaz
On 04/03/2015 08:12 PM, Yinghai Lu wrote:
> On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
>> On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>> :
>>> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>>> /*
>>> * Find the highest page frame number we have available
>>> */
>>> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>>> {
>>> int i;
>>> unsigned long last_pfn = 0;
>>> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>> unsigned long start_pfn;
>>> unsigned long end_pfn;
>>>
>>> - if (ei->type != type)
>>> + /*
>>> + * Persistent memory is accounted as ram for purposes of
>>> + * establishing max_pfn and mem_map.
>>> + */
>>> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
>>> continue;
>>
>> Should we also delete this code, accounting E820_PRAM as ram, along with
>> the deletion of reserve_pmem() in this version?
>
Hi Yinghai, Toshi
In my old patches I did not have these updates as well, and everything
was very much usable, for a long time.
However. I actually liked these changes in Christoph's patches and
thought they should stay, here is why.
Today I will be sending patches to make pmem be supported with
page-struct as an optional alternative to the use of ioremap.
This is for advanced users that wants to RDMA direct_IO and so
on directly out of pmem.
At one point we had a BUG in some mm/memory.c code that was checking max_pfn.
Actually that was a bug and we do not go through this code anymore. And between
us that global variable max_pfn is a bad hack. But I kind of like to have it as
long as it is used. So code that wants to protect by max_pfn can still accept
pmem memory submitted to it.
I have tried to audit the Kernel use of max_pfn and I do not see how
this can hurt? I do see were it would theoretically help.
Think of a system that looks like this as a memory map:
1. VM (Volitile mem)
2. PM
3. VM
4. PM
Which is what is returned by current and planned NUMA implementations.
So pmem region-2 will be covered by max_pfn. But pmem region 4 will not.
If any code checks for max_pfn it will be OK with pmem-2 but *not* with
pmem-4. This is highly unexpected.
I think the all max_pfn should be killed ASAP, but until it is then
it will not hurt for pmem to be covered.
Thanks
Boaz
On Sun, Apr 5, 2015 at 2:18 AM, Boaz Harrosh <[email protected]> wrote:
> On 04/03/2015 08:12 PM, Yinghai Lu wrote:
>> On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
>>> On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
>>> :
>>>> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
>>>> /*
>>>> * Find the highest page frame number we have available
>>>> */
>>>> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>>> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
>>>> {
>>>> int i;
>>>> unsigned long last_pfn = 0;
>>>> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
>>>> unsigned long start_pfn;
>>>> unsigned long end_pfn;
>>>>
>>>> - if (ei->type != type)
>>>> + /*
>>>> + * Persistent memory is accounted as ram for purposes of
>>>> + * establishing max_pfn and mem_map.
>>>> + */
>>>> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
>>>> continue;
>>>
>>> Should we also delete this code, accounting E820_PRAM as ram, along with
>>> the deletion of reserve_pmem() in this version?
>>
>
> Hi Yinghai, Toshi
>
> In my old patches I did not have these updates as well, and everything
> was very much usable, for a long time.
>
> However. I actually liked these changes in Christoph's patches and
> thought they should stay, here is why.
>
> Today I will be sending patches to make pmem be supported with
> page-struct as an optional alternative to the use of ioremap.
> This is for advanced users that wants to RDMA direct_IO and so
> on directly out of pmem.
but it is not related. Should just remove those lines.
And even his original changes about memblock is not needed.
| You did not modify memblock_x86_fill() to treat
| E820_PRAM as E820_RAM, so memblock will not have any
| entry for E820_PRAM, so you do not need to call memblock_reserve
| there.
|
| And the same time, init_memory_mapping() will call
| init_range_memory_mapping/for_each_mem_pfn_range() to
| set kernel mapping for memory range in memblock only.
| So here calling init_memory_mapping will not do anything.
| then just drop calling to that init_memory_mapping.
| --- so will not kernel mapping pmem, is that what you intended to have?
|
| After those two changes, You do not need this reserve_pmem at all.
| Just drop it.
On 04/05/2015 11:06 PM, Yinghai Lu wrote:
> On Sun, Apr 5, 2015 at 2:18 AM, Boaz Harrosh <[email protected]> wrote:
<>
>> Hi Yinghai, Toshi
>>
>> In my old patches I did not have these updates as well, and everything
>> was very much usable, for a long time.
>>
>> However. I actually liked these changes in Christoph's patches and
>> thought they should stay, here is why.
>>
>> Today I will be sending patches to make pmem be supported with
>> page-struct as an optional alternative to the use of ioremap.
>> This is for advanced users that wants to RDMA direct_IO and so
>> on directly out of pmem.
>
> but it is not related. Should just remove those lines.
>
> And even his original changes about memblock is not needed.
>
Never mind. Has I said it hit us once in the passed but do to
a bug. I do not mind you can remove the max_pfn thing I can do
without it.
Thanks
Boaz
> | You did not modify memblock_x86_fill() to treat
> | E820_PRAM as E820_RAM, so memblock will not have any
> | entry for E820_PRAM, so you do not need to call memblock_reserve
> | there.
> |
> | And the same time, init_memory_mapping() will call
> | init_range_memory_mapping/for_each_mem_pfn_range() to
> | set kernel mapping for memory range in memblock only.
> | So here calling init_memory_mapping will not do anything.
> | then just drop calling to that init_memory_mapping.
> | --- so will not kernel mapping pmem, is that what you intended to have?
> |
> | After those two changes, You do not need this reserve_pmem at all.
> | Just drop it.
>
* Yinghai Lu <[email protected]> wrote:
> On Sat, Apr 4, 2015 at 2:40 AM, Ingo Molnar <[email protected]> wrote:
> >
> > * Toshi Kani <[email protected]> wrote:
> >
> >> On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> >> > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
> >> > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> >> >
> >> > should revert those end_of_ram change as attached.
> >>
> >> I confirmed that the pmem driver works with the change, and last_pfn is
> >> updated as expected.
> >
> > Could someone please send the fix with a changelog, etc?
> >
>
> Why just fold those change into that commit.
Because we try to avoid doing rebases of otherwise tested trees, and
because fixes will outline the thinking behind the code as well.
Thanks,
Ingo
On Fri, Apr 03, 2015 at 10:12:39AM -0700, Yinghai Lu wrote:
> > Should we also delete this code, accounting E820_PRAM as ram, along with
> > the deletion of reserve_pmem() in this version?
>
> should revert those end_of_ram change as attached.
This works fine for me:
Tested-by: Christoph Hellwig <[email protected]>
> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
But I'd prefer not to re-add the argument here, it only obsfucates the
code.
On Sat, 2015-04-04 at 11:40 +0200, Ingo Molnar wrote:
> * Toshi Kani <[email protected]> wrote:
>
> > On Fri, 2015-04-03 at 10:12 -0700, Yinghai Lu wrote:
> > > On Fri, Apr 3, 2015 at 9:14 AM, Toshi Kani <[email protected]> wrote:
> > > > On Wed, 2015-04-01 at 09:12 +0200, Christoph Hellwig wrote:
> > > > :
> > > >> @@ -748,7 +758,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
> > > >> /*
> > > >> * Find the highest page frame number we have available
> > > >> */
> > > >> -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > > >> +static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
> > > >> {
> > > >> int i;
> > > >> unsigned long last_pfn = 0;
> > > >> @@ -759,7 +769,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
> > > >> unsigned long start_pfn;
> > > >> unsigned long end_pfn;
> > > >>
> > > >> - if (ei->type != type)
> > > >> + /*
> > > >> + * Persistent memory is accounted as ram for purposes of
> > > >> + * establishing max_pfn and mem_map.
> > > >> + */
> > > >> + if (ei->type != E820_RAM && ei->type != E820_PRAM)
> > > >> continue;
> > > >
> > > > Should we also delete this code, accounting E820_PRAM as ram, along with
> > > > the deletion of reserve_pmem() in this version?
> > >
> > > should revert those end_of_ram change as attached.
> >
> > I confirmed that the pmem driver works with the change, and last_pfn is
> > updated as expected.
>
> Could someone please send the fix with a changelog, etc?
OK, I will send a patch for the fix, with suggested update from
Christoph of not to re-add 'type' argument to e820_end_pfn().
Yinghai, I will add your sign-off to the patch. Let me know if you have
any concern.
Thanks,
-Toshi
On Mon, 2015-04-06 at 11:26 -0700, Yinghai Lu wrote:
> On Mon, Apr 6, 2015 at 10:29 AM, Toshi Kani <[email protected]> wrote:
> > On Sat, 2015-04-04 at 11:40 +0200, Ingo Molnar wrote:
>
> > OK, I will send a patch for the fix, with suggested update from
> > Christoph of not to re-add 'type' argument to e820_end_pfn().
> >
> > Yinghai, I will add your sign-off to the patch. Let me know if you have
> > any concern.
>
> I think we should restore all to original.
>
> e820_end_pfn(unsigned long limit_pfn, unsigned type)
> e820_end_of_ram_pfn
> e820_end_of_low_ram_pfn
>
> otherwise it will cause confusing, because last two really have ram_pfn,
> and the first one does not has ram in the function name.
Good point. I will revert back to the original.
Thanks,
-Toshi
On Mon, Apr 6, 2015 at 10:29 AM, Toshi Kani <[email protected]> wrote:
> On Sat, 2015-04-04 at 11:40 +0200, Ingo Molnar wrote:
> OK, I will send a patch for the fix, with suggested update from
> Christoph of not to re-add 'type' argument to e820_end_pfn().
>
> Yinghai, I will add your sign-off to the patch. Let me know if you have
> any concern.
I think we should restore all to original.
e820_end_pfn(unsigned long limit_pfn, unsigned type)
e820_end_of_ram_pfn
e820_end_of_low_ram_pfn
otherwise it will cause confusing, because last two really have ram_pfn,
and the first one does not has ram in the function name.
Thanks
Yinghai
On Sun, Apr 05, 2015 at 11:50:06AM +0300, Boaz Harrosh wrote:
> [ +0.000537] pmem: init 2 devices => 0
>
> So I have all the information. And I know the driver was actually loaded
> successfully on the expected two regions.
The second number will always be 0, so no point in printing it.
Also device can be hotplugged at runtime, e.g. your magic PCIe device,
so iff you really want to print anything ->probe is the place for it.
But I still don't think we need it, once booted you can trivially look
up the information in sysfs.
On 04/07/2015 06:19 PM, Christoph Hellwig wrote:
> On Sun, Apr 05, 2015 at 11:50:06AM +0300, Boaz Harrosh wrote:
>> [ +0.000537] pmem: init 2 devices => 0
>>
>> So I have all the information. And I know the driver was actually loaded
>> successfully on the expected two regions.
>
> The second number will always be 0, so no point in printing it.
Why it can be any error that was collected in the load process its
the error we are returning to the Kernel from pmem_init()
Any none zero will mean module will not load and modprobe will
return with an exit code.
(Errors from probe() will be returned here)
> Also device can be hotplugged at runtime, e.g. your magic PCIe device,
> so iff you really want to print anything ->probe is the place for it.
>
Yes but for now it is only very much static. We could add it later
when that happens, no?
> But I still don't think we need it, once booted you can trivially look
> up the information in sysfs.
>
Its not for the systems I can probe around in sysfs and or just ls /dev/pmem*
It is postmortem when all I have is the logs and say a stack trace.
For us in the lab it is very important, all we need is the single
line on load/unload. But at probe time is fine as well, just more
chatty as you were complaining.
I will post two versions of this right away, I did it and forgot
to post.
Thanks
Boaz
Hi Christoph, Ingo
It is important in the lab for postmortem analysis to know if
pmem driver loaded and/or unloaded. And the return code from this
operation.
I submit two versions [A] more chatty and version [B]. Both give me
the info I need.
I like [B] because [A] prints more lines, and also the driver might not
load at the end and we would still not see it from [A]'s prints.
But it does not matter that much just take any one you guys like
better.
Here are the commit logs:
---
[PATCH 1A] pmem: Add prints at pmem_probe/remove
Add small prints at creation/remove of pmem devices.
So we can see in dmesg logs when users loaded/unloaded
the pmem driver and what devices were created.
The prints will look like this:
Printed by e820 on load:
[ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
...
Printed by modprobe pmem:
[ +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
[ +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
...
Printed by modprobe -r pmem:
[ +16.299145] pmem pmem.1.auto: remove
[ +0.011155] pmem pmem.0.auto: remove
Signed-off-by: Boaz Harrosh <[email protected]>
---
[PATCH 1B] pmem: Add prints at module load/unload
When debugging people's systems it is helpful
to see what went on. The load and unload of pmem is
an important event.
The importance is the number of loaded devices and
error status. The exact device's addresses created
we can see from the other prints at e820 so no need
to duplicate this information.
After this patch the important info at dmesg will be:
Printed by the e820 code on load:
[ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
...
Printed by pmem modprobe:
[ +0.000537] pmem: init 2 devices => 0
...
Printed by pmem modprobe -r:
[ +0.000537] pmem: exit
Signed-off-by: Boaz Harrosh <[email protected]>
---
Thanks
Boaz
Add small prints at creation/remove of pmem devices.
So we can see in dmesg logs when users loaded/unloaded
the pmem driver and what devices were created.
The prints will look like this:
Printed by e820 on load:
[ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
...
Printed by modprobe pmem:
[ +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
[ +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
...
Printed by modprobe -r pmem:
[ +16.299145] pmem pmem.1.auto: remove
[ +0.011155] pmem pmem.0.auto: remove
Signed-off-by: Boaz Harrosh <[email protected]>
---
drivers/block/pmem.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 988f384..36017f1 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -216,6 +216,8 @@ static int pmem_probe(struct platform_device *pdev)
return PTR_ERR(pmem);
platform_set_drvdata(pdev, pmem);
+ dev_info(&pdev->dev, "probe [%pa:0x%zx]\n",
+ &pmem->phys_addr, pmem->size);
return 0;
}
@@ -224,6 +226,7 @@ static int pmem_remove(struct platform_device *pdev)
{
struct pmem_device *pmem = platform_get_drvdata(pdev);
+ dev_info(&pdev->dev, "remove\n");
pmem_free(pmem);
return 0;
}
--
1.9.3
When debugging people's systems it is helpful
to see what went on. The load and unload of pmem is
an important event.
The importance is the number of loaded devices and
error status. The exact device's addresses created
we can see from the other prints at e820 so no need
to duplicate this information.
After this patch the important info at dmesg will be:
Printed by the e820 code on load:
[ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
[ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
...
Printed by pmem modprobe:
[ +0.000537] pmem: init 2 devices => 0
...
Printed by pmem modprobe -r:
[ +0.000537] pmem: exit
Signed-off-by: Boaz Harrosh <[email protected]>
---
drivers/block/pmem.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
index 988f384..44d3f33 100644
--- a/drivers/block/pmem.c
+++ b/drivers/block/pmem.c
@@ -248,6 +248,9 @@ static int __init pmem_init(void)
error = platform_driver_register(&pmem_driver);
if (error)
unregister_blkdev(pmem_major, "pmem");
+
+ pr_info("pmem: init %d devices => %d\n",
+ atomic_read(&pmem_index), error);
return error;
}
module_init(pmem_init);
@@ -256,6 +259,7 @@ static void pmem_exit(void)
{
platform_driver_unregister(&pmem_driver);
unregister_blkdev(pmem_major, "pmem");
+ pr_info("pmem: exit\n");
}
module_exit(pmem_exit);
--
1.9.3
On Tue, Apr 07, 2015 at 06:46:15PM +0300, Boaz Harrosh wrote:
> Hi Christoph, Ingo
>
> It is important in the lab for postmortem analysis to know if
> pmem driver loaded and/or unloaded. And the return code from this
> operation.
>
> I submit two versions [A] more chatty and version [B]. Both give me
> the info I need.
>
> I like [B] because [A] prints more lines, and also the driver might not
> load at the end and we would still not see it from [A]'s prints.
>
> But it does not matter that much just take any one you guys like
> better.
>
> Here are the commit logs:
> ---
> [PATCH 1A] pmem: Add prints at pmem_probe/remove
>
> Add small prints at creation/remove of pmem devices.
> So we can see in dmesg logs when users loaded/unloaded
> the pmem driver and what devices were created.
>
> The prints will look like this:
> Printed by e820 on load:
> [ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
> [ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
> ...
> Printed by modprobe pmem:
> [ +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
> [ +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
> ...
> Printed by modprobe -r pmem:
> [ +16.299145] pmem pmem.1.auto: remove
> [ +0.011155] pmem pmem.0.auto: remove
>
> Signed-off-by: Boaz Harrosh <[email protected]>
Don't polute the kernel logs with "chatty" things like this, just
trigger off of the block device uevent if you really want to know if the
block device is still around or not.
thanks,
greg k-h
On 04/13/2015 12:05 PM, Greg KH wrote:
> On Tue, Apr 07, 2015 at 06:46:15PM +0300, Boaz Harrosh wrote:
>> Hi Christoph, Ingo
>>
>> It is important in the lab for postmortem analysis to know if
>> pmem driver loaded and/or unloaded. And the return code from this
>> operation.
>>
>> I submit two versions [A] more chatty and version [B]. Both give me
>> the info I need.
>>
>> I like [B] because [A] prints more lines, and also the driver might not
>> load at the end and we would still not see it from [A]'s prints.
>>
>> But it does not matter that much just take any one you guys like
>> better.
>>
>> Here are the commit logs:
>> ---
>> [PATCH 1A] pmem: Add prints at pmem_probe/remove
>>
>> Add small prints at creation/remove of pmem devices.
>> So we can see in dmesg logs when users loaded/unloaded
>> the pmem driver and what devices were created.
>>
>> The prints will look like this:
>> Printed by e820 on load:
>> [ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
>> [ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
>> ...
>> Printed by modprobe pmem:
>> [ +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
>> [ +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
>> ...
>> Printed by modprobe -r pmem:
>> [ +16.299145] pmem pmem.1.auto: remove
>> [ +0.011155] pmem pmem.0.auto: remove
>>
>> Signed-off-by: Boaz Harrosh <[email protected]>
>
> Don't polute the kernel logs with "chatty" things like this,
Why do you say this is chatty. With [B] This is a single line of print
on modprobe. With [A] it is a print per device (Is why I like [B])
Compare to all the other block-devices in the system, say scsi, that print
bunch of info for each device, this is very very minimalistic.
> just
> trigger off of the block device uevent if you really want to know if the
> block device is still around or not.
>
Again I do not need this for run time. At run time I have two tons of ways
to check and see. BTW a uevent is already triggered for insertion as part
of regular block core operation.
I need this at dmesg for when analyzing users logs, say when a crash happens.
I need to see what/when drivers were loaded/unloaded. It is common practice
in dmseg for block devices to leave foot prints.
Sigh, do you not believe that this single line in dmseg makes my life much
easier?
> thanks,
> greg k-h
Thanks
Boaz
On Mon, Apr 13, 2015 at 03:05:27PM +0300, Boaz Harrosh wrote:
> On 04/13/2015 12:05 PM, Greg KH wrote:
> > On Tue, Apr 07, 2015 at 06:46:15PM +0300, Boaz Harrosh wrote:
> >> Hi Christoph, Ingo
> >>
> >> It is important in the lab for postmortem analysis to know if
> >> pmem driver loaded and/or unloaded. And the return code from this
> >> operation.
> >>
> >> I submit two versions [A] more chatty and version [B]. Both give me
> >> the info I need.
> >>
> >> I like [B] because [A] prints more lines, and also the driver might not
> >> load at the end and we would still not see it from [A]'s prints.
> >>
> >> But it does not matter that much just take any one you guys like
> >> better.
> >>
> >> Here are the commit logs:
> >> ---
> >> [PATCH 1A] pmem: Add prints at pmem_probe/remove
> >>
> >> Add small prints at creation/remove of pmem devices.
> >> So we can see in dmesg logs when users loaded/unloaded
> >> the pmem driver and what devices were created.
> >>
> >> The prints will look like this:
> >> Printed by e820 on load:
> >> [ +0.000000] user: [mem 0x0000000100000000-0x000000015fffffff] persistent (type 12)
> >> [ +0.000000] user: [mem 0x0000000160000000-0x00000001dfffffff] persistent (type 12)
> >> ...
> >> Printed by modprobe pmem:
> >> [ +0.003065] pmem pmem.0.auto: probe [0x0000000100000000:0x60000000]
> >> [ +0.001816] pmem pmem.1.auto: probe [0x0000000160000000:0x80000000]
> >> ...
> >> Printed by modprobe -r pmem:
> >> [ +16.299145] pmem pmem.1.auto: remove
> >> [ +0.011155] pmem pmem.0.auto: remove
> >>
> >> Signed-off-by: Boaz Harrosh <[email protected]>
> >
> > Don't polute the kernel logs with "chatty" things like this,
>
> Why do you say this is chatty. With [B] This is a single line of print
> on modprobe. With [A] it is a print per device (Is why I like [B])
> Compare to all the other block-devices in the system, say scsi, that print
> bunch of info for each device, this is very very minimalistic.
>
> > just
> > trigger off of the block device uevent if you really want to know if the
> > block device is still around or not.
> >
>
> Again I do not need this for run time. At run time I have two tons of ways
> to check and see. BTW a uevent is already triggered for insertion as part
> of regular block core operation.
>
> I need this at dmesg for when analyzing users logs, say when a crash happens.
> I need to see what/when drivers were loaded/unloaded. It is common practice
> in dmseg for block devices to leave foot prints.
>
> Sigh, do you not believe that this single line in dmseg makes my life much
> easier?
it might, as grepping kernel logs is a "lazy" way of doing things. We
are working hard to make it so that each and every device found in the
system does _not_ print out kernel log messages, as they really are
pretty pointless and useless overall for the 99.99% of the time.
Anyway, I'm not the maintainer here of this driver, so I don't have the
final say, but really, if you are relying on kernel log messages for
specific things to happen in your system, you are doing it wrong as they
can change and disappear in any future kernel release, they are NOT an
api.
thanks,
greg k-h
On 04/13/2015 03:36 PM, Greg KH wrote:
> if you are relying on kernel log messages for
> specific things to happen in your system, you are doing it wrong as they
> can change and disappear in any future kernel release, they are NOT an
> api.
>
Again. I am not doing anything with these messages. Yes messages are not
API. I do not need them for a live and running system, at all.
I need it for a dead system. for a long dead system. A system that has failed
3 days ago and all I have is the system/application logs and the Kernel logs.
I guess I'm antic and you guys have other means for these things, I wish to
learn then. So far all we are asking to keep are the logs, Is there something
else I need to store from a *past* running system?
Until now this gave me a very good place to start my investigation. With
empty log files, how do I then proceed?
> thanks,
> greg k-h
It looks like everyone agrees with you though. I wish I knew something you
guys know, I do not see how I can analyze failing systems without logs.
[I promise this is my last email on the subject]
Thanks
Boaz
On Mon, Apr 13, 2015 at 04:20:37PM +0300, Boaz Harrosh wrote:
> On 04/13/2015 03:36 PM, Greg KH wrote:
> > if you are relying on kernel log messages for
> > specific things to happen in your system, you are doing it wrong as they
> > can change and disappear in any future kernel release, they are NOT an
> > api.
> >
>
> Again. I am not doing anything with these messages. Yes messages are not
> API. I do not need them for a live and running system, at all.
>
> I need it for a dead system. for a long dead system. A system that has failed
> 3 days ago and all I have is the system/application logs and the Kernel logs.
>
> I guess I'm antic and you guys have other means for these things, I wish to
> learn then. So far all we are asking to keep are the logs, Is there something
> else I need to store from a *past* running system?
>
> Until now this gave me a very good place to start my investigation. With
> empty log files, how do I then proceed?
What would this simple "the device was removed" log message help you out
with on a dead and old system?
Same goes for the "device found!" message, how does that help anyone
out? If they do help in some manner with debugging, then make them
debugging messages, and enable them on your systems when doing testing.
But for general purpose systems, if they don't do anything except be
noisy, might as well turn them off, right?
thanks,
greg k-h
On Thu, Apr 2, 2015 at 12:51 PM, Andy Lutomirski <[email protected]> wrote:
> On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <[email protected]> wrote:
>>
>> * Andy Lutomirski <[email protected]> wrote:
>>
>>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
>>> <[email protected]> wrote:
>>> > Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>>> > Gitweb: http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>>> > Author: Christoph Hellwig <[email protected]>
>>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
>>> > Committer: Ingo Molnar <[email protected]>
>>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>>> >
>>> > x86/mm: Add support for the non-standard protected e820 type
>>> >
>>> > Various recent BIOSes support NVDIMMs or ADR using a
>>> > non-standard e820 memory type, and Intel supplied reference
>>> > Linux code using this type to various vendors.
>>> >
>>> > Wire this e820 table type up to export platform devices for the
>>> > pmem driver so that we can use it in Linux.
>>>
>>> This scares me a bit. Do we know that the upcoming ACPI 6.0
>>> enumeration mechanism *won't* use e820 type 12? [...]
>>
>> So I know nothing about it, but I'd be surprised if e820 was touched
>> at all, as e820 isn't really well suited to enumerate more complex
>> resources, and it appears pmem wants to grow into complex directions?
>
> I hope so, but I have no idea what the ACPI committee's schemes are.
>
> We could require pmem.enable_legacy_e820=Y to load the driver for now
> if we're concerned about it.
>
ACPI 6.0 is out:
http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
AFAICT from a quick read, ACPI 6.0 systems will show EFI type 14
(EfiPersistentMemory), ACPI type 7 (AddressRangePersistentMemory) and
e820 type 7.
Type 12 is still "OEM defined". See table 15-312. Maybe I'm reading
this wrong.
*However*, ACPI 6.0 unsurprisingly also has a real enumeration
mechanism for NVDIMMs and such, and those should take precedence.
So this driver could plausibly be safe even on ACPI 6.0 systems.
Someone from one of the relevant vendors should probably confirm that.
I'm still a bit nervous, though.
--Andy
> -----Original Message-----
> From: [email protected] [mailto:linux-kernel-
> [email protected]] On Behalf Of Andy Lutomirski
> Sent: Thursday, April 16, 2015 5:31 PM
> To: Ingo Molnar
> Subject: Re: [tip:x86/pmem] x86/mm: Add support for the non-standard
> protected e820 type
>
> On Thu, Apr 2, 2015 at 12:51 PM, Andy Lutomirski <[email protected]>
> wrote:
> > On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <[email protected]> wrote:
> >>
> >> * Andy Lutomirski <[email protected]> wrote:
> >>
> >>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
> >>> <[email protected]> wrote:
> >>> > Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> >>> > Gitweb:
> http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
> >>> > Author: Christoph Hellwig <[email protected]>
> >>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
> >>> > Committer: Ingo Molnar <[email protected]>
> >>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
> >>> >
> >>> > x86/mm: Add support for the non-standard protected e820 type
> >>> >
> >>> > Various recent BIOSes support NVDIMMs or ADR using a
> >>> > non-standard e820 memory type, and Intel supplied reference
> >>> > Linux code using this type to various vendors.
> >>> >
> >>> > Wire this e820 table type up to export platform devices for the
> >>> > pmem driver so that we can use it in Linux.
> >>>
> >>> This scares me a bit. Do we know that the upcoming ACPI 6.0
> >>> enumeration mechanism *won't* use e820 type 12? [...]
> >>
> >> So I know nothing about it, but I'd be surprised if e820 was touched
> >> at all, as e820 isn't really well suited to enumerate more complex
> >> resources, and it appears pmem wants to grow into complex directions?
> >
> > I hope so, but I have no idea what the ACPI committee's schemes are.
> >
> > We could require pmem.enable_legacy_e820=Y to load the driver for now
> > if we're concerned about it.
> >
>
> ACPI 6.0 is out:
>
> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
>
> AFAICT from a quick read, ACPI 6.0 systems will show EFI type 14
> (EfiPersistentMemory), ACPI type 7 (AddressRangePersistentMemory) and
> e820 type 7.
>
> Type 12 is still "OEM defined". See table 15-312. Maybe I'm reading
> this wrong.
>
> *However*, ACPI 6.0 unsurprisingly also has a real enumeration
> mechanism for NVDIMMs and such, and those should take precedence.
>
> So this driver could plausibly be safe even on ACPI 6.0 systems.
> Someone from one of the relevant vendors should probably confirm that.
> I'm still a bit nervous, though.
That value was set aside on behalf of this pre-standard usage, to
keep future ACPI revisions from standardizing it for anything else.
The kernel should be just as safe in continuing to recognize that
value as it is now.
New legacy BIOS systems should follow ACPI 6.0 going forward and
report type 7 in their E820 tables, along with meeting the other
ACPI 6.0 requirements.
UEFI systems don't provide an E820 table; that's an artificial
creation by the bootloader (e.g., grub2) or the kernel with its
add_efi_memmmap parameter. These systems will just report the
EFI memory type 14. There was no pre-standard EFI type
identified that needed to be blocked out.
Any software creating a fake E820 table (grub2, etc.) should
map EFI type 14 to E820 type 7.
---
Robert Elliott, HP Server Storage
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
On Thu, Apr 16, 2015 at 5:55 PM, Elliott, Robert (Server Storage)
<[email protected]> wrote:
>
>
>> -----Original Message-----
>> From: [email protected] [mailto:linux-kernel-
>> [email protected]] On Behalf Of Andy Lutomirski
>> Sent: Thursday, April 16, 2015 5:31 PM
>> To: Ingo Molnar
>> Subject: Re: [tip:x86/pmem] x86/mm: Add support for the non-standard
>> protected e820 type
>>
>> On Thu, Apr 2, 2015 at 12:51 PM, Andy Lutomirski <[email protected]>
>> wrote:
>> > On Thu, Apr 2, 2015 at 12:13 PM, Ingo Molnar <[email protected]> wrote:
>> >>
>> >> * Andy Lutomirski <[email protected]> wrote:
>> >>
>> >>> On Thu, Apr 2, 2015 at 5:31 AM, tip-bot for Christoph Hellwig
>> >>> <[email protected]> wrote:
>> >>> > Commit-ID: ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> >>> > Gitweb:
>> http://git.kernel.org/tip/ec776ef6bbe1734c29cd6bd05219cd93b2731bd4
>> >>> > Author: Christoph Hellwig <[email protected]>
>> >>> > AuthorDate: Wed, 1 Apr 2015 09:12:18 +0200
>> >>> > Committer: Ingo Molnar <[email protected]>
>> >>> > CommitDate: Wed, 1 Apr 2015 17:02:43 +0200
>> >>> >
>> >>> > x86/mm: Add support for the non-standard protected e820 type
>> >>> >
>> >>> > Various recent BIOSes support NVDIMMs or ADR using a
>> >>> > non-standard e820 memory type, and Intel supplied reference
>> >>> > Linux code using this type to various vendors.
>> >>> >
>> >>> > Wire this e820 table type up to export platform devices for the
>> >>> > pmem driver so that we can use it in Linux.
>> >>>
>> >>> This scares me a bit. Do we know that the upcoming ACPI 6.0
>> >>> enumeration mechanism *won't* use e820 type 12? [...]
>> >>
>> >> So I know nothing about it, but I'd be surprised if e820 was touched
>> >> at all, as e820 isn't really well suited to enumerate more complex
>> >> resources, and it appears pmem wants to grow into complex directions?
>> >
>> > I hope so, but I have no idea what the ACPI committee's schemes are.
>> >
>> > We could require pmem.enable_legacy_e820=Y to load the driver for now
>> > if we're concerned about it.
>> >
>>
>> ACPI 6.0 is out:
>>
>> http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
>>
>> AFAICT from a quick read, ACPI 6.0 systems will show EFI type 14
>> (EfiPersistentMemory), ACPI type 7 (AddressRangePersistentMemory) and
>> e820 type 7.
>>
>> Type 12 is still "OEM defined". See table 15-312. Maybe I'm reading
>> this wrong.
>>
>> *However*, ACPI 6.0 unsurprisingly also has a real enumeration
>> mechanism for NVDIMMs and such, and those should take precedence.
>>
>> So this driver could plausibly be safe even on ACPI 6.0 systems.
>> Someone from one of the relevant vendors should probably confirm that.
>> I'm still a bit nervous, though.
>
> That value was set aside on behalf of this pre-standard usage, to
> keep future ACPI revisions from standardizing it for anything else.
> The kernel should be just as safe in continuing to recognize that
> value as it is now.
>
> New legacy BIOS systems should follow ACPI 6.0 going forward and
> report type 7 in their E820 tables, along with meeting the other
> ACPI 6.0 requirements.
>
> UEFI systems don't provide an E820 table; that's an artificial
> creation by the bootloader (e.g., grub2) or the kernel with its
> add_efi_memmmap parameter. These systems will just report the
> EFI memory type 14. There was no pre-standard EFI type
> identified that needed to be blocked out.
>
> Any software creating a fake E820 table (grub2, etc.) should
> map EFI type 14 to E820 type 7.
Great!
Off-topic: do you know what an NVDIMM control block is? Are we really
supposed to access NVDIMM registers using MMIO? If so, that must have
been a beast to get right in the memory controller. Do you know if
any vendor has published docs for this stuff?
(I'm trying to figure out whether we're supposed to use SMBUS cycles
for any purpose so I can either fix my driver or relegate it to
legacy-only status.)
--Andy