2015-05-04 20:57:44

by Tony Luck

[permalink] [raw]
Subject: [PATCH 0/3] Find mirrored memory, use for boot time allocations

UEFI published the spec that describes the attribute bit we need to
find out which memory ranges are mirrored. So time to post the real
version of this series.

These patches are against 4.1-rc1 ... I think there are a couple of
trivial conflicts with the current mmotm.

Tony Luck (3):
mm/memblock: Add extra "flag" to memblock to allow selection of memory
based on attribute
mm/memblock: Allocate boot time data structures from mirrored memory
x86, mirror: x86 enabling - find mirrored memory ranges

arch/s390/kernel/crash_dump.c | 4 +-
arch/sparc/mm/init_64.c | 4 +-
arch/x86/kernel/check.c | 2 +-
arch/x86/kernel/e820.c | 2 +-
arch/x86/kernel/setup.c | 3 ++
arch/x86/mm/init_32.c | 2 +-
arch/x86/platform/efi/efi.c | 21 ++++++++
include/linux/efi.h | 3 ++
include/linux/memblock.h | 43 ++++++++++------
mm/cma.c | 4 +-
mm/memblock.c | 113 ++++++++++++++++++++++++++++++++----------
mm/memtest.c | 2 +-
mm/nobootmem.c | 12 ++++-
13 files changed, 162 insertions(+), 53 deletions(-)

--
2.1.4


2015-05-04 20:58:09

by Tony Luck

[permalink] [raw]
Subject: [PATCH 2/3] mm/memblock: Allocate boot time data structures from mirrored memory

Try to allocate all boot time kernel data structures from mirrored
memory. If we run out of mirrored memory, print warnings but fall
back to using non-mirrored memory to make sure that we still boot.

Signed-off-by: Tony Luck <[email protected]>
---
include/linux/memblock.h | 8 ++++++
mm/memblock.c | 71 ++++++++++++++++++++++++++++++++++++++++++------
mm/nobootmem.c | 10 ++++++-
3 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 1d448879caae..20bf3dfab564 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -22,6 +22,7 @@

/* Definition of memblock flags. */
#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */
+#define MEMBLOCK_MIRROR 0x2 /* mirrored region */

struct memblock_region {
phys_addr_t base;
@@ -75,6 +76,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
+u32 memblock_has_mirror(void);

/* Low level functions */
int memblock_add_range(struct memblock_type *type,
@@ -155,6 +158,11 @@ static inline bool movable_node_is_enabled(void)
}
#endif

+static inline bool memblock_is_mirror(struct memblock_region *m)
+{
+ return m->flags & MEMBLOCK_MIRROR;
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index ac3c94fff97c..7a0769555474 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
#ifdef CONFIG_MOVABLE_NODE
bool movable_node_enabled __initdata_memblock = false;
#endif
+static bool memblock_have_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;

+u32 __init_memblock memblock_has_mirror(void)
+{
+ return memblock_have_mirror ? MEMBLOCK_MIRROR : 0;
+}
+
/* inline so we don't get a warning when pr_debug is compiled out */
static __init_memblock const char *
memblock_type_name(struct memblock_type *type)
@@ -257,8 +263,19 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(size, align, start, end,
+ phys_addr_t ret;
+ u32 flag = memblock_has_mirror();
+
+ ret = memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE, flag);
+
+ if (!ret && flag) {
+ pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);
+ ret = memblock_find_in_range_node(size, align, start, end,
NUMA_NO_NODE, 0);
+ }
+
+ return ret;
}

static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -784,6 +801,21 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
}

/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+ memblock_have_mirror = true;
+
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+
+/**
* __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
@@ -837,6 +869,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, u32 flags,
if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;

+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -942,6 +978,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, u32 flags,
if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;

+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -1092,7 +1132,17 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,

phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid, 0);
+ u32 flag = memblock_has_mirror();
+ phys_addr_t ret;
+
+again:
+ ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid, flag);
+
+ if (!ret && flag) {
+ flag = 0;
+ goto again;
+ }
+ return ret;
}

phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1161,6 +1211,7 @@ static void * __init memblock_virt_alloc_internal(
{
phys_addr_t alloc;
void *ptr;
+ u32 flag = memblock_has_mirror();

if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
@@ -1181,13 +1232,13 @@ static void * __init memblock_virt_alloc_internal(

again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid, 0);
+ nid, flag);
if (alloc)
goto done;

if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE, 0);
+ max_addr, NUMA_NO_NODE, flag);
if (alloc)
goto done;
}
@@ -1195,10 +1246,15 @@ again:
if (min_addr) {
min_addr = 0;
goto again;
- } else {
- goto error;
}

+ if (flag) {
+ flag = 0;
+ pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);
+ goto again;
+ }
+
+ return NULL;
done:
memblock_reserve(alloc, size);
ptr = phys_to_virt(alloc);
@@ -1213,9 +1269,6 @@ done:
kmemleak_alloc(ptr, size, 0, 0);

return ptr;
-
-error:
- return NULL;
}

/**
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index a4903046bcba..35423c935a46 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,11 +37,19 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
{
void *ptr;
u64 addr;
+ u32 flag = memblock_has_mirror();

if (limit > memblock.current_limit)
limit = memblock.current_limit;

- addr = memblock_find_in_range_node(size, align, goal, limit, nid, 0);
+again:
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid, flag);
+
+ if (flag && !addr) {
+ flag = 0;
+ pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);
+ goto again;
+ }
if (!addr)
return NULL;

--
2.1.4

2015-05-04 20:58:16

by Tony Luck

[permalink] [raw]
Subject: [PATCH 3/3] x86, mirror: x86 enabling - find mirrored memory ranges

UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory
address ranges. See UEFI 2.5 spec pages 157-158:

http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf

On EFI enabled systems scan the memory map and tell memblock about
any mirrored ranges.

Signed-off-by: Tony Luck <[email protected]>
---
arch/x86/kernel/setup.c | 3 +++
arch/x86/platform/efi/efi.c | 21 +++++++++++++++++++++
include/linux/efi.h | 3 +++
3 files changed, 27 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d74ac33290ae..ac85a1775661 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1103,6 +1103,9 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();

+ if (efi_enabled(EFI_BOOT))
+ efi_find_mirror();
+
/*
* The EFI specification says that boot service code won't be called
* after ExitBootServices(). This is, in fact, a lie.
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 02744df576d5..31635dc5bca4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
now->tv_nsec = 0;
}

+void __init efi_find_mirror(void)
+{
+ void *p;
+ unsigned long long mirror_size = 0, total_size = 0;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ total_size += size;
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ memblock_mark_mirror(start, size);
+ mirror_size += size;
+ }
+ }
+ if (mirror_size)
+ pr_info("Memory: %lldM/%lldM mirrored memory\n",
+ mirror_size>>20, total_size>>20);
+}
+
/*
* Tell the kernel about the EFI memory map. This might include
* more than the max 128 entries that can fit in the e820 legacy
diff --git a/include/linux/efi.h b/include/linux/efi.h
index af5be0368dec..3f13903346a2 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -96,6 +96,8 @@ typedef struct {
#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */
#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */
#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */
+#define EFI_MEMORY_MORE_RELIABLE \
+ ((u64)0x0000000000010000ULL) /* higher reliability */
#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */
#define EFI_MEMORY_DESCRIPTOR_VERSION 1

@@ -864,6 +866,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos
extern void efi_late_init(void);
extern void efi_free_boot_services(void);
extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
+extern void efi_find_mirror (void);
#else
static inline void efi_late_init(void) {}
static inline void efi_free_boot_services(void) {}
--
2.1.4

2015-05-04 20:57:59

by Tony Luck

[permalink] [raw]
Subject: [PATCH 1/3] mm/memblock: Add extra "flag" to memblock to allow selection of memory based on attribute

No functional changes

Signed-off-by: Tony Luck <[email protected]>
---
arch/s390/kernel/crash_dump.c | 4 ++--
arch/sparc/mm/init_64.c | 4 ++--
arch/x86/kernel/check.c | 2 +-
arch/x86/kernel/e820.c | 2 +-
arch/x86/mm/init_32.c | 2 +-
include/linux/memblock.h | 35 +++++++++++++++++--------------
mm/cma.c | 4 ++--
mm/memblock.c | 48 +++++++++++++++++++++++++------------------
mm/memtest.c | 2 +-
mm/nobootmem.c | 4 ++--
10 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 9f73c8059022..1b117a2a60af 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -33,11 +33,11 @@ static struct memblock_type oldmem_type = {
};

#define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid) \
- for (i = 0, __next_mem_range(&i, nid, &memblock.physmem, \
+ for (i = 0, __next_mem_range(&i, nid, 0, &memblock.physmem, \
&oldmem_type, p_start, \
p_end, p_nid); \
i != (u64)ULLONG_MAX; \
- __next_mem_range(&i, nid, &memblock.physmem, \
+ __next_mem_range(&i, nid, 0, &memblock.physmem, \
&oldmem_type, \
p_start, p_end, p_nid))

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 4ca0d6ba5ec8..0ac21f765142 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1952,7 +1952,7 @@ static phys_addr_t __init available_memory(void)
phys_addr_t pa_start, pa_end;
u64 i;

- for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, 0, &pa_start, &pa_end, NULL)
available = available + (pa_end - pa_start);

return available;
@@ -1971,7 +1971,7 @@ static void __init reduce_memory(phys_addr_t limit_ram)
if (limit_ram >= avail_ram)
return;

- for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, 0, &pa_start, &pa_end, NULL) {
phys_addr_t region_size = pa_end - pa_start;
phys_addr_t clip_start = pa_start;

diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6..46c8bc62f840 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)

corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);

- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, 0, &start, &end, NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db2283..ea75ec76ceea 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1123,7 +1123,7 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}

- for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, 0, &start, &end, NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a..6455c9f86bc8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
phys_addr_t start, end;
u64 i;

- for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
start_pfn, end_pfn);
unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9497ec7c77ea..1d448879caae 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -61,7 +61,7 @@ extern bool movable_node_enabled;

phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
- int nid);
+ int nid, u32 flag);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -85,11 +85,11 @@ int memblock_remove_range(struct memblock_type *type,
phys_addr_t base,
phys_addr_t size);

-void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range(u64 *idx, int nid, u32 flags, struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);

-void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range_rev(u64 *idx, int nid, u32 flags, struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);

@@ -100,16 +100,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flag: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
-#define for_each_mem_range(i, type_a, type_b, nid, \
+#define for_each_mem_range(i, type_a, type_b, nid, flag, \
p_start, p_end, p_nid) \
- for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \
+ for (i = 0, __next_mem_range(&i, nid, flag, type_a, type_b, \
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
- __next_mem_range(&i, nid, type_a, type_b, \
+ __next_mem_range(&i, nid, flag, type_a, type_b, \
p_start, p_end, p_nid))

/**
@@ -119,17 +120,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flag: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
-#define for_each_mem_range_rev(i, type_a, type_b, nid, \
+#define for_each_mem_range_rev(i, type_a, type_b, nid, flag, \
p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
- __next_mem_range_rev(&i, nid, type_a, type_b, \
+ __next_mem_range_rev(&i, nid, flag, type_a, type_b,\
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
- __next_mem_range_rev(&i, nid, type_a, type_b, \
+ __next_mem_range_rev(&i, nid, flag, type_a, type_b, \
p_start, p_end, p_nid))

#ifdef CONFIG_MOVABLE_NODE
@@ -181,13 +183,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flag: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*/
-#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \
+#define for_each_free_mem_range(i, nid, flag, p_start, p_end, p_nid) \
for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
- nid, p_start, p_end, p_nid)
+ nid, flag, p_start, p_end, p_nid)

/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +199,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flag: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock in reverse
* order. Available as soon as memblock is initialized.
*/
-#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
- for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
- nid, p_start, p_end, p_nid)
+#define for_each_free_mem_range_reverse(i, nid, flag, p_start, p_end, p_nid) \
+ for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
+ nid, flag, p_start, p_end, p_nid)

static inline void memblock_set_region_flags(struct memblock_region *r,
unsigned long flags)
@@ -273,7 +277,8 @@ static inline bool memblock_bottom_up(void) { return false; }
#define MEMBLOCK_ALLOC_ACCESSIBLE 0

phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end);
+ phys_addr_t start, phys_addr_t end,
+ u32 flag);
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/mm/cma.c b/mm/cma.c
index 3a7a67b93394..69792c7c0c4c 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -316,13 +316,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range(size, alignment,
- highmem_start, limit);
+ highmem_start, limit, 0);
limit = highmem_start;
}

if (!addr) {
addr = memblock_alloc_range(size, alignment, base,
- limit);
+ limit, 0);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b567ed79..ac3c94fff97c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -107,6 +107,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flag: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -115,12 +116,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ u32 flag)
{
phys_addr_t this_start, this_end, cand;
u64 i;

- for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, nid, flag, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);

@@ -139,6 +141,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flag: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -147,12 +150,13 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ u32 flag)
{
phys_addr_t this_start, this_end, cand;
u64 i;

- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range_reverse(i, nid, flag, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);

@@ -174,6 +178,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flag: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -190,7 +195,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, u32 flag)
{
phys_addr_t kernel_end, ret;

@@ -215,7 +220,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,

/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid);
+ size, align, nid, flag);
if (ret)
return ret;

@@ -233,7 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
"memory hotunplug may be affected\n");
}

- return __memblock_find_range_top_down(start, end, size, align, nid);
+ return __memblock_find_range_top_down(start, end, size, align, nid, flag);
}

/**
@@ -253,7 +258,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t align)
{
return memblock_find_in_range_node(size, align, start, end,
- NUMA_NO_NODE);
+ NUMA_NO_NODE, 0);
}

static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -782,6 +787,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +809,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, u32 flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -895,6 +901,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +910,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* Reverse of __next_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, u32 flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -1050,14 +1057,14 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,

static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, u32 flag)
{
phys_addr_t found;

if (!align)
align = SMP_CACHE_BYTES;

- found = memblock_find_in_range_node(size, align, start, end, nid);
+ found = memblock_find_in_range_node(size, align, start, end, nid, flag);
if (found && !memblock_reserve(found, size)) {
/*
* The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1077,27 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
}

phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end)
+ phys_addr_t start, phys_addr_t end,
+ u32 flag)
{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, flag);
}

static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
- int nid)
+ int nid, u32 flag)
{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flag);
}

phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid, 0);
}

phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, 0);
}

phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1173,13 +1181,13 @@ static void * __init memblock_virt_alloc_internal(

again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid);
+ nid, 0);
if (alloc)
goto done;

if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE);
+ max_addr, NUMA_NO_NODE, 0);
if (alloc)
goto done;
}
diff --git a/mm/memtest.c b/mm/memtest.c
index 1997d934b13b..2c5254d9ab3c 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
u64 i;
phys_addr_t this_start, this_end;

- for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, 0, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b50468333e..a4903046bcba 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;

- addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid, 0);
if (!addr)
return NULL;

@@ -121,7 +121,7 @@ static unsigned long __init free_low_memory_core_early(void)

memblock_clear_hotplug(0, -1);

- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, 0, &start, &end, NULL)
count += __free_memory_core(start, end);

#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
--
2.1.4

2015-05-06 23:29:19

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 0/3] Find mirrored memory, use for boot time allocations

On Mon, 4 May 2015 13:52:23 -0700 Tony Luck <[email protected]> wrote:

> UEFI published the spec that describes the attribute bit we need to
> find out which memory ranges are mirrored. So time to post the real
> version of this series.

Can we please have an explanation for why we're doing this? Reading
further I see that the intent is to put kernel data structures into
mirrored memory. Why is this a good thing?

2015-05-06 23:29:33

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/memblock: Add extra "flag" to memblock to allow selection of memory based on attribute

On Wed, 29 Apr 2015 11:31:24 -0700 Tony Luck <[email protected]> wrote:

> No functional changes
>
> ...
>
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -61,7 +61,7 @@ extern bool movable_node_enabled;
>
> phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
> phys_addr_t start, phys_addr_t end,
> - int nid);
> + int nid, u32 flag);

Sometimes this is called "flag", other times it is called "flags". Can
we please be consistent? "flags" seems to be the way to go.

Also, memblock_region.flags has type unsigned long, but you've used
u32. ulong seems better.

2015-05-06 23:30:20

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm/memblock: Allocate boot time data structures from mirrored memory

On Tue, 3 Feb 2015 14:38:02 -0800 Tony Luck <[email protected]> wrote:

> Try to allocate all boot time kernel data structures from mirrored
> memory. If we run out of mirrored memory print warnings, but fall
> back to using non-mirrored memory to make sure that we still boot.
>
> ...
>
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 1d448879caae..20bf3dfab564 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -22,6 +22,7 @@
>
> /* Definition of memblock flags. */
> #define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */
> +#define MEMBLOCK_MIRROR 0x2 /* mirrored region */

It would be nice to make these an enum. Then all those literal "0"'s
which were added in [1/3] become MEMBLOCK_NONE, which is
self-documenting.

>
> ...
>
> +static inline bool memblock_is_mirror(struct memblock_region *m)
> +{
> + return m->flags & MEMBLOCK_MIRROR;
> +}
> +
>
> ...
>
> +u32 __init_memblock memblock_has_mirror(void)
> +{
> + return memblock_have_mirror ? MEMBLOCK_MIRROR : 0;
> +}

hm, these are very similar. But I guess they're different enough.

Grammatically, a function called "memblock_has_mirror()" should return a
bool. This guy is misnamed. "memblock_mirror_flag()"?


> /* inline so we don't get a warning when pr_debug is compiled out */
> static __init_memblock const char *
> memblock_type_name(struct memblock_type *type)
> @@ -257,8 +263,19 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
> phys_addr_t end, phys_addr_t size,
> phys_addr_t align)
> {
> - return memblock_find_in_range_node(size, align, start, end,
> + phys_addr_t ret;
> + u32 flag = memblock_has_mirror();
> +
> + ret = memblock_find_in_range_node(size, align, start, end,
> + NUMA_NO_NODE, flag);
> +
> + if (!ret && flag) {
> + pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);

This printk will warn on some configs. Print a phys_addr_t with %pap.
I think. See huge comment over lib/vsprintf.c:pointer(). There are
other instances of this.

> + ret = memblock_find_in_range_node(size, align, start, end,
> NUMA_NO_NODE, 0);
> + }
> +
> + return ret;
> }
>
> ...
>
> phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
> {
> - return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid, 0);
> + u32 flag = memblock_has_mirror();
> + phys_addr_t ret;
> +
> +again:
> + ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid, flag);
> +
> + if (!ret && flag) {
> + flag = 0;
> + goto again;
> + }

What's going on here? This is where we're falling back to
non-mirrored. But it's happening silently? Should it warn, or is that
handled elsewhere?

This function isn't specific to mirrored memory - for any future flags,
falling back to flags==0 may not be the desired behavior. What do we
do then? I guess

if (!ret && (flag & MEMBLOCK_MIRROR)) {
flag &= ~MEMBLOCK_MIRROR;
goto again;
}

yes?

That can be done later if needed, I suppose.

> + return ret;
> }
>
>
> ...
>
> @@ -1181,13 +1232,13 @@ static void * __init memblock_virt_alloc_internal(
>
> again:
> alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
> - nid, 0);
> + nid, flag);
> if (alloc)
> goto done;
>
> if (nid != NUMA_NO_NODE) {
> alloc = memblock_find_in_range_node(size, align, min_addr,
> - max_addr, NUMA_NO_NODE, 0);
> + max_addr, NUMA_NO_NODE, flag);
> if (alloc)
> goto done;
> }
> @@ -1195,10 +1246,15 @@ again:
> if (min_addr) {
> min_addr = 0;
> goto again;
> - } else {
> - goto error;
> }
>
> + if (flag) {
> + flag = 0;
> + pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);

printk warning.

Please don't torture people who use 80-col displays!

> + goto again;
> + }
> +
> + return NULL;
>
> ...
>
> @@ -37,11 +37,19 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> {
> void *ptr;
> u64 addr;
> + u32 flag = memblock_has_mirror();
>
> if (limit > memblock.current_limit)
> limit = memblock.current_limit;
>
> - addr = memblock_find_in_range_node(size, align, goal, limit, nid, 0);
> +again:
> + addr = memblock_find_in_range_node(size, align, goal, limit, nid, flag);
> +
> + if (flag && !addr) {
> + flag = 0;
> + pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);

dittoes.

> + goto again;
> + }
> if (!addr)
> return NULL;
>

2015-05-06 23:30:32

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 3/3] x86, mirror: x86 enabling - find mirrored memory ranges

On Tue, 3 Feb 2015 14:40:19 -0800 Tony Luck <[email protected]> wrote:

> UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory
> address ranges. See UEFI 2.5 spec pages 157-158:
>
> http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf
>
> On EFI enabled systems scan the memory map and tell memblock about
> any mirrored ranges.
>
> ...
>
> --- a/arch/x86/platform/efi/efi.c
> +++ b/arch/x86/platform/efi/efi.c
> @@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
> now->tv_nsec = 0;
> }
>
> +void __init efi_find_mirror(void)
> +{
> + void *p;
> + unsigned long long mirror_size = 0, total_size = 0;
> +
> + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
> + efi_memory_desc_t *md = p;
> + unsigned long long start = md->phys_addr;
> + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;

efi_memory_desc_t uses u64 for all this stuff. Was there a reason for
using ull instead?

> + total_size += size;
> + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
> + memblock_mark_mirror(start, size);
> + mirror_size += size;
> + }
> + }
> + if (mirror_size)
> + pr_info("Memory: %lldM/%lldM mirrored memory\n",
> + mirror_size>>20, total_size>>20);
> +}
> +
>
> ...
>

2015-05-07 17:41:48

by Tony Luck

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm/memblock: Allocate boot time data structures from mirrored memory

On Wed, May 6, 2015 at 4:30 PM, Andrew Morton <[email protected]> wrote:
> Grammatically, a function called "memblock_has_mirror()" should return a
> bool. This guy is misnamed. "memblock_mirror_flag()"?

My misnaming is worse than that ... the intent here is to check whether
there is any mirrored memory in the system ... i.e. should we go looking
around among memblocks for mirrored memory - or is that a futile quest.
Most systems won't have any mirrored memory - so we won't want to spam the
console logs with a ton of messages about not being able to allocate
mirrored memory.

I'll rename it to "system_has_mirror_memory()".

I'll fix all the other things too and re-spin. Keeping to 80 columns
might be challenging in some places.

-Tony

2015-05-07 21:24:49

by Tony Luck

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm/memblock: Allocate boot time data structures from mirrored memory

On Wed, May 6, 2015 at 4:30 PM, Andrew Morton <[email protected]> wrote:
>> + if (!ret && flag) {
>> + pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);
>
> This printk will warn on some configs. Print a phys_addr_t with %pap.
> I think. See huge comment over lib/vsprintf.c:pointer().

The comment may be huge - but it seems to lie about phys_addr_t :-(

I changed to %pap and got:

mm/memblock.c: In function ‘memblock_find_in_range’:
mm/memblock.c:276:3: warning: format ‘%p’ expects argument of type
‘void *’, but argument 2 has type ‘phys_addr_t’ [-Wformat=]
pr_warn("Could not allocate %pap bytes of mirrored memory\n",

<linux/types.h> says:
#ifdef CONFIG_PHYS_ADDR_T_64BIT
typedef u64 phys_addr_t;
#else
typedef u32 phys_addr_t;
#endif

So my original %lld would indeed have barfed on 32-bit builds ... but
%pap doesn't seem to be the right answer either.

-Tony

2015-05-07 21:30:37

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm/memblock: Allocate boot time data structures from mirrored memory

On Thu, 7 May 2015 14:24:46 -0700 Tony Luck <[email protected]> wrote:

> On Wed, May 6, 2015 at 4:30 PM, Andrew Morton <[email protected]> wrote:
> >> + if (!ret && flag) {
> >> + pr_warn("Could not allocate %lld bytes of mirrored memory\n", size);
> >
> > This printk will warn on some configs. Print a phys_addr_t with %pap.
> > I think. See huge comment over lib/vsprintf.c:pointer().
>
> The comment may be huge - but it seems to lie about phys_addr_t :-(
>
> I changed to %pap and got:
>
> mm/memblock.c: In function 'memblock_find_in_range':
> mm/memblock.c:276:3: warning: format '%p' expects argument of type
> 'void *', but argument 2 has type 'phys_addr_t' [-Wformat=]
> pr_warn("Could not allocate %pap bytes of mirrored memory\n",

Use "&size" rather than "size". All the %p extensions require a
pointer to the thing-to-be-printed.