From: "Yinghai Lu <[email protected]>"
please check the patches regarding with early_res and bootmem
and at last it will make use early_res instead of bootmem with x86 64bits
the first two are needed for some amd_bus.c/intel_bus.c cleaning up patches too.
so put other x86/pci related into this series
hope that is ok to Jesse.
-v2: allocate vmemmap on one node together, and also seperate early_res
Thanks
Yinghai
fix for error that is introduced by
| x86: Use find_e820() instead of hard coded trampoline address
it should end with PAGE_SIZE + PAGE_SIZE
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/e820.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 05ed7ab..a1a7876 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -733,13 +733,13 @@ struct early_res {
};
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
- { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
#endif
{}
--
1.6.0.2
it is above 0x100, so if mmconf is not enable, need to skip it
Reported-by: Jens Axboe <[email protected]>
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/intel_bus.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
index b7a55dc..baf283a 100644
--- a/arch/x86/pci/intel_bus.c
+++ b/arch/x86/pci/intel_bus.c
@@ -49,6 +49,10 @@ static void __devinit pci_root_bus_res(struct pci_dev *dev)
u64 mmioh_base, mmioh_end;
int bus_base, bus_end;
+ /* some sys doesn't get mmconf enabled */
+ if (dev->cfg_size < 0x200)
+ return;
+
if (pci_root_num >= PCI_ROOT_NR) {
printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n");
return;
--
1.6.0.2
Signed-off-by: Yinghai Lu <[email protected]>
---
mm/bootmem.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d14868..8caf744 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -171,6 +171,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
struct page *page;
+ unsigned long *map;
unsigned long start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
@@ -188,10 +189,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
bdata - bootmem_node_data, start, end, aligned);
+ map = bdata->node_bootmem_map;
while (start < end) {
- unsigned long *map, idx, vec;
+ unsigned long idx, vec;
- map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
vec = ~map[idx / BITS_PER_LONG];
--
1.6.0.2
for AMD Fam10h, it we read mmconf from MSR early, we should just trust it
because we check it and correct it already.
so skip the reject check there.
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/mmconfig-shared.c | 10 +++++++++-
1 files changed, 9 insertions(+), 1 deletions(-)
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index b19d1e5..bcce99b 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -168,6 +168,7 @@ static const char __init *pci_mmcfg_intel_945(void)
return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
}
+static int __initdata amd_fam10h_mmconf_found_via_hostbridge;
static const char __init *pci_mmcfg_amd_fam10h(void)
{
u32 low, high, address;
@@ -215,6 +216,8 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
return NULL;
}
+ amd_fam10h_mmconf_found_via_hostbridge = 1;
+
return "AMD Family 10h NB";
}
@@ -606,7 +609,12 @@ static void __init __pci_mmcfg_init(int early)
if (!known_bridge)
acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
- pci_mmcfg_reject_broken(early);
+ /*
+ * if it is amd fam10h, and that is read from msr,
+ * we don't need check them again.
+ */
+ if (!amd_fam10h_mmconf_found_via_hostbridge)
+ pci_mmcfg_reject_broken(early);
if (list_empty(&pci_mmcfg_list))
return;
--
1.6.0.2
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/cpu/mtrr/cleanup.c | 180 +++---------------------------------
arch/x86/kernel/mmconf-fam10h_64.c | 7 +-
arch/x86/pci/amd_bus.c | 70 ++------------
include/linux/range.h | 22 +++++
kernel/Makefile | 2 +-
kernel/range.c | 154 ++++++++++++++++++++++++++++++
6 files changed, 205 insertions(+), 230 deletions(-)
create mode 100644 include/linux/range.h
create mode 100644 kernel/range.c
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698..669da09 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
+#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -34,11 +34,6 @@
#include "mtrr.h"
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
-static struct res_range __initdata range[RANGE_NUM];
+static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-static int __init
-add_range(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- /* Out of slots: */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- int i;
-
- /* Try to merge it with old one: */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* Need to add it: */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* Find the new spare: */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
-static int __init clean_sort_range(struct res_range *range, int az)
-{
- int i, j, k = az - 1, nr_range = 0;
-
- for (i = 0; i < k; i++) {
- if (range[i].end)
- continue;
- for (j = k; j > i; j--) {
- if (range[j].end) {
- k = j;
- break;
- }
- }
- if (j == i)
- break;
- range[i].start = range[k].start;
- range[i].end = range[k].end;
- range[k].start = 0;
- range[k].end = 0;
- k--;
- }
- /* count it */
- for (i = 0; i < az; i++) {
- if (!range[i].end) {
- nr_range = i;
- break;
- }
- }
-
- /* sort them */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
- return nr_range;
-}
-
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
@@ -223,13 +77,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size - 1);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end + 1);
}
@@ -252,10 +106,10 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size - 1);
}
if (extra_remove_size)
- subtract_range(range, extra_remove_base,
+ subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size - 1);
if (debug_print) {
@@ -263,7 +117,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end + 1);
}
}
@@ -273,20 +127,16 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end + 1);
}
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct res_range range_new[RANGE_NUM];
+ static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, nr_range, 0,
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
(1ULL<<(20 - PAGE_SHIFT)) - 1);
/* Sort the ranges: */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ sort_range(range, nr_range);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15f..7182580 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dmi.h>
+#include <linux/range.h>
+
#include <asm/pci-direct.h>
#include <linux/sort.h>
#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-struct range {
- u64 start;
- u64 end;
-};
-
static int __cpuinit cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 95ecbd4..2356ea1 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,6 +2,8 @@
#include <linux/pci.h>
#include <linux/topology.h>
#include <linux/cpu.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
#ifdef CONFIG_X86_64
@@ -17,58 +19,6 @@
#ifdef CONFIG_X86_64
-#define RANGE_NUM 16
-
-struct res_range {
- size_t start;
- size_t end;
-};
-
-static void __init update_range(struct res_range *range, size_t start,
- size_t end)
-{
- int i;
- int j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
struct pci_hostbridge_probe {
u32 bus;
u32 slot;
@@ -111,6 +61,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
}
+#define RANGE_NUM 16
+
/**
* early_fill_mp_bus_to_node()
* called before pcibios_scan_root and pci_scan_bus
@@ -132,7 +84,7 @@ static int __init early_fill_mp_bus_info(void)
struct resource *res;
size_t start;
size_t end;
- struct res_range range[RANGE_NUM];
+ struct range range[RANGE_NUM];
u64 val;
u32 address;
@@ -226,7 +178,7 @@ static int __init early_fill_mp_bus_info(void)
if (end > 0xffff)
end = 0xffff;
update_res(info, start, end, IORESOURCE_IO, 1);
- update_range(range, start, end);
+ subtract_range(range, RANGE_NUM, start, end);
}
/* add left over io port range to def node/link, [0, 0xffff] */
/* find the position */
@@ -256,14 +208,14 @@ static int __init early_fill_mp_bus_info(void)
end = (val & 0xffffff800000ULL);
printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
if (end < (1ULL<<32))
- update_range(range, 0, end - 1);
+ subtract_range(range, RANGE_NUM, 0, end - 1);
/* get mmconfig */
get_pci_mmcfg_amd_fam10h_range();
/* need to take out mmconf range */
if (fam10h_mmconf_end) {
printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
- update_range(range, fam10h_mmconf_start, fam10h_mmconf_end);
+ subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end);
}
/* mmio resource */
@@ -318,7 +270,7 @@ static int __init early_fill_mp_bus_info(void)
/* we got a hole */
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
- update_range(range, start, endx);
+ subtract_range(range, RANGE_NUM, start, endx);
printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
@@ -334,7 +286,7 @@ static int __init early_fill_mp_bus_info(void)
}
update_res(info, start, end, IORESOURCE_MEM, 1);
- update_range(range, start, end);
+ subtract_range(range, RANGE_NUM, start, end);
printk(KERN_CONT "\n");
}
@@ -349,7 +301,7 @@ static int __init early_fill_mp_bus_info(void)
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
- update_range(range, 1ULL<<32, end - 1);
+ subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1);
}
/*
diff --git a/include/linux/range.h b/include/linux/range.h
new file mode 100644
index 0000000..0789f14
--- /dev/null
+++ b/include/linux/range.h
@@ -0,0 +1,22 @@
+#ifndef _LINUX_RANGE_H
+#define _LINUX_RANGE_H
+
+struct range {
+ u64 start;
+ u64 end;
+};
+
+int add_range(struct range *range, int az, int nr_range,
+ u64 start, u64 end);
+
+
+int add_range_with_merge(struct range *range, int az, int nr_range,
+ u64 start, u64 end);
+
+void subtract_range(struct range *range, int az, u64 start, u64 end);
+
+int clean_sort_range(struct range *range, int az);
+
+void sort_range(struct range *range, int nr_range);
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75..ad47330 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
- async.o
+ async.o range.o
obj-y += groups.o
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 0000000..46a10c8
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,154 @@
+/*
+ * Range add and subtract
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+
+#include <linux/range.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
+{
+ /* Out of slots: */
+ if (nr_range >= az)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+int add_range_with_merge(struct range *range, int az, int nr_range,
+ u64 start, u64 end)
+{
+ int i;
+
+ /* Try to merge it with old one: */
+ for (i = 0; i < nr_range; i++) {
+ u64 final_start, final_end;
+ u64 common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(range[i].start, start);
+ final_end = max(range[i].end, end);
+
+ range[i].start = final_start;
+ range[i].end = final_end;
+ return nr_range;
+ }
+
+ /* Need to add it: */
+ return add_range(range, az, nr_range, start, end);
+}
+
+void subtract_range(struct range *range, int az, u64 start, u64 end)
+{
+ int i, j;
+
+ for (j = 0; j < az; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end + 1) {
+ range[j].start = end + 1;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start - 1) {
+ range[j].end = start - 1;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* Find the new spare: */
+ for (i = 0; i < az; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < az) {
+ range[i].end = range[j].end;
+ range[i].start = end + 1;
+ } else {
+ printk(KERN_ERR "run of slot in ranges\n");
+ }
+ range[j].end = start - 1;
+ continue;
+ }
+ }
+}
+
+static int cmp_range(const void *x1, const void *x2)
+{
+ const struct range *r1 = x1;
+ const struct range *r2 = x2;
+ s64 start1, start2;
+
+ start1 = r1->start;
+ start2 = r2->start;
+
+ return start1 - start2;
+}
+
+int clean_sort_range(struct range *range, int az)
+{
+ int i, j, k = az - 1, nr_range = 0;
+
+ for (i = 0; i < k; i++) {
+ if (range[i].end)
+ continue;
+ for (j = k; j > i; j--) {
+ if (range[j].end) {
+ k = j;
+ break;
+ }
+ }
+ if (j == i)
+ break;
+ range[i].start = range[k].start;
+ range[i].end = range[k].end;
+ range[k].start = 0;
+ range[k].end = 0;
+ k--;
+ }
+ /* count it */
+ for (i = 0; i < az; i++) {
+ if (!range[i].end) {
+ nr_range = i;
+ break;
+ }
+ }
+
+ /* sort them */
+ sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+
+ return nr_range;
+}
+
+void sort_range(struct range *range, int nr_range)
+{
+ /* sort them */
+ sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+}
--
1.6.0.2
fend off wrong range
Signed-off-by: Yinghai Lu <[email protected]>
---
kernel/range.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/kernel/range.c b/kernel/range.c
index 46a10c8..71e0021 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -13,6 +13,9 @@
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
{
+ if (start > end)
+ return nr_range;
+
/* Out of slots: */
if (nr_range >= az)
return nr_range;
@@ -30,6 +33,9 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
{
int i;
+ if (start > end)
+ return nr_range;
+
/* Try to merge it with old one: */
for (i = 0; i < nr_range; i++) {
u64 final_start, final_end;
@@ -59,6 +65,9 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
{
int i, j;
+ if (start > end)
+ return;
+
for (j = 0; j < az; j++) {
if (!range[j].end)
continue;
--
1.6.0.2
prepare to enable 32bit intel and amd bus
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/bus_numa.c | 16 ++++++++--------
arch/x86/pci/bus_numa.h | 4 ++--
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 145df00..72ea29d 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -51,8 +51,8 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
}
}
-void __init update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge)
+void __init update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge)
{
int i;
struct resource *res;
@@ -65,20 +65,20 @@ void __init update_res(struct pci_root_info *info, size_t start,
/* try to merge it with old one */
for (i = 0; i < info->res_num; i++) {
- size_t final_start, final_end;
- size_t common_start, common_end;
+ resource_size_t final_start, final_end;
+ resource_size_t common_start, common_end;
res = &info->res[i];
if (res->flags != flags)
continue;
- common_start = max((size_t)res->start, start);
- common_end = min((size_t)res->end, end);
+ common_start = max(res->start, start);
+ common_end = min(res->end, end);
if (common_start > common_end + 1)
continue;
- final_start = min((size_t)res->start, start);
- final_end = max((size_t)res->end, end);
+ final_start = min(res->start, start);
+ final_end = max(res->end, end);
res->start = final_start;
res->end = final_end;
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index adbc23f..374ecc5 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -22,6 +22,6 @@ extern int pci_root_num;
extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
extern int found_all_numa_early;
-extern void update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge);
+extern void update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge);
#endif
--
1.6.0.2
found MSI amd k8 based laptops is hiding [0x70000000, 0x80000000) RAM from
e820.
enable amd one chain even for all.
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/amd_bus.c | 7 ++++---
arch/x86/pci/bus_numa.c | 5 -----
arch/x86/pci/bus_numa.h | 1 -
3 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 2356ea1..e467071 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -87,11 +87,12 @@ static int __init early_fill_mp_bus_info(void)
struct range range[RANGE_NUM];
u64 val;
u32 address;
+ int found;
if (!early_pci_allowed())
return -1;
- found_all_numa_early = 0;
+ found = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
u32 id;
u16 device;
@@ -105,12 +106,12 @@ static int __init early_fill_mp_bus_info(void)
device = (id>>16) & 0xffff;
if (pci_probes[i].vendor == vendor &&
pci_probes[i].device == device) {
- found_all_numa_early = 1;
+ found = 1;
break;
}
}
- if (!found_all_numa_early)
+ if (!found)
return 0;
pci_root_num = 0;
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 72ea29d..7ef0970 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -5,7 +5,6 @@
int pci_root_num;
struct pci_root_info pci_root_info[PCI_ROOT_NR];
-int found_all_numa_early;
void x86_pci_root_bus_res_quirks(struct pci_bus *b)
{
@@ -21,10 +20,6 @@ void x86_pci_root_bus_res_quirks(struct pci_bus *b)
if (!pci_root_num)
return;
- /* for amd, if only one root bus, don't need to do anything */
- if (pci_root_num < 2 && found_all_numa_early)
- return;
-
for (i = 0; i < pci_root_num; i++) {
if (pci_root_info[i].bus_min == b->number)
break;
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index 374ecc5..f63e802 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -20,7 +20,6 @@ struct pci_root_info {
#define PCI_ROOT_NR 4
extern int pci_root_num;
extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
-extern int found_all_numa_early;
extern void update_res(struct pci_root_info *info, resource_size_t start,
resource_size_t end, unsigned long flags, int merge);
--
1.6.0.2
prepare to enable it for 32bit
-v2: remove not needed cast
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/amd_bus.c | 16 ++++++++--------
1 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e467071..e8bb553 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -82,8 +82,8 @@ static int __init early_fill_mp_bus_info(void)
struct pci_root_info *info;
u32 reg;
struct resource *res;
- size_t start;
- size_t end;
+ u64 start;
+ u64 end;
struct range range[RANGE_NUM];
u64 val;
u32 address;
@@ -173,7 +173,7 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/* kernel only handle 16 bit only */
if (end > 0xffff)
@@ -207,7 +207,7 @@ static int __init early_fill_mp_bus_info(void)
address = MSR_K8_TOP_MEM1;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
if (end < (1ULL<<32))
subtract_range(range, RANGE_NUM, 0, end - 1);
@@ -246,7 +246,7 @@ static int __init early_fill_mp_bus_info(void)
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/*
* some sick allocation would have range overlap with fam10h
* mmconf range, so need to update start and end.
@@ -272,13 +272,13 @@ static int __init early_fill_mp_bus_info(void)
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
subtract_range(range, RANGE_NUM, start, endx);
- printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx);
+ printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
}
if (changed) {
if (start <= end) {
- printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end);
+ printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", start, end);
} else {
printk(KERN_CONT "%s\n", endx?"":" ==> none");
continue;
@@ -301,7 +301,7 @@ static int __init early_fill_mp_bus_info(void)
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1);
}
--
1.6.0.2
prepare for 32bit pci root bus
-v2: hpa said we should compare with (resource_size_t)~0
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/amd_bus.c | 8 +++++---
arch/x86/pci/bus_numa.c | 3 +++
arch/x86/pci/intel_bus.c | 5 ++++-
include/linux/range.h | 8 ++++++++
4 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index e8bb553..9bcb6fe 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -201,7 +201,7 @@ static int __init early_fill_mp_bus_info(void)
memset(range, 0, sizeof(range));
/* 0xfd00000000-0xffffffffff for HT */
- range[0].end = (0xfdULL<<32) - 1;
+ range[0].end = cap_resource((0xfdULL<<32) - 1);
/* need to take out [0, TOM) for RAM*/
address = MSR_K8_TOP_MEM1;
@@ -286,7 +286,8 @@ static int __init early_fill_mp_bus_info(void)
}
}
- update_res(info, start, end, IORESOURCE_MEM, 1);
+ update_res(info, cap_resource(start), cap_resource(end),
+ IORESOURCE_MEM, 1);
subtract_range(range, RANGE_NUM, start, end);
printk(KERN_CONT "\n");
}
@@ -321,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, cap_resource(range[i].start),
+ cap_resource(range[i].end),
IORESOURCE_MEM, 1);
}
}
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 7ef0970..411955f 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -55,6 +55,9 @@ void __init update_res(struct pci_root_info *info, resource_size_t start,
if (start > end)
return;
+ if (start == (resource_size_t)~0)
+ return;
+
if (!merge)
goto addit;
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
index baf283a..1f15df9 100644
--- a/arch/x86/pci/intel_bus.c
+++ b/arch/x86/pci/intel_bus.c
@@ -6,6 +6,8 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/init.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
#include "bus_numa.h"
@@ -85,7 +87,8 @@ static void __devinit pci_root_bus_res(struct pci_dev *dev)
mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
- update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
+ update_res(info, cap_resource(mmioh_base), cap_resource(mmioh_end),
+ IORESOURCE_MEM, 0);
print_ioh_resources(info);
}
diff --git a/include/linux/range.h b/include/linux/range.h
index 0789f14..17a23d2 100644
--- a/include/linux/range.h
+++ b/include/linux/range.h
@@ -19,4 +19,12 @@ int clean_sort_range(struct range *range, int az);
void sort_range(struct range *range, int nr_range);
+
+static inline resource_size_t cap_resource(u64 val)
+{
+ if (val > (resource_size_t)~0)
+ return (resource_size_t)~0;
+ else
+ return val;
+}
#endif
--
1.6.0.2
should be good for 32bit too.
-v3: cast res->start
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/pci/Makefile | 3 +--
arch/x86/pci/amd_bus.c | 14 +-------------
arch/x86/pci/bus_numa.h | 4 ++--
arch/x86/pci/i386.c | 4 ----
arch/x86/pci/intel_bus.c | 2 +-
5 files changed, 5 insertions(+), 22 deletions(-)
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 564b008..30e55d7 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -14,8 +14,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-y += common.o early.o
-obj-y += amd_bus.o
-obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o
+obj-y += amd_bus.o bus_numa.o intel_bus.o
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 9bcb6fe..ef4b3e7 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -6,9 +6,7 @@
#include <asm/pci_x86.h>
-#ifdef CONFIG_X86_64
#include <asm/pci-direct.h>
-#endif
#include "bus_numa.h"
@@ -17,8 +15,6 @@
* also get peer root bus resource for io,mmio
*/
-#ifdef CONFIG_X86_64
-
struct pci_hostbridge_probe {
u32 bus;
u32 slot;
@@ -342,21 +338,13 @@ static int __init early_fill_mp_bus_info(void)
printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n",
busnum, j,
(res->flags & IORESOURCE_IO)?"io port":"mmio",
- res->start, res->end);
+ (u64)res->start, (u64)res->end);
}
}
return 0;
}
-#else /* !CONFIG_X86_64 */
-
-static int __init early_fill_mp_bus_info(void) { return 0; }
-
-#endif /* !CONFIG_X86_64 */
-
-/* common 32/64 bit code */
-
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
static void enable_pci_io_ecs(void *unused)
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
index f63e802..08d8e15 100644
--- a/arch/x86/pci/bus_numa.h
+++ b/arch/x86/pci/bus_numa.h
@@ -1,5 +1,5 @@
-#ifdef CONFIG_X86_64
-
+#ifndef __BUS_NUMA_H
+#define __BUS_NUMA_H
/*
* sub bus (transparent) will use entres from 3 to store extra from
* root, so need to make sure we have enough slot there, Should we
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 5dc9e8c..f4e8481 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -257,10 +257,6 @@ void __init pcibios_resource_survey(void)
*/
fs_initcall(pcibios_assign_resources);
-void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
-{
-}
-
/*
* If we set up a device for bus mastering, we need to check the latency
* timer as certain crappy BIOSes forget to set it properly.
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
index 1f15df9..6167be9 100644
--- a/arch/x86/pci/intel_bus.c
+++ b/arch/x86/pci/intel_bus.c
@@ -30,7 +30,7 @@ static inline void print_ioh_resources(struct pci_root_info *info)
busnum, i,
(res->flags & IORESOURCE_IO) ? "io port" :
"mmio",
- res->start, res->end);
+ (u64)res->start, (u64)res->end);
}
}
--
1.6.0.2
simplify setup_node_mem, do use bootmem from other node.
instead just find_e820_area in early_node_mem.
so we can keep the boundary between early_res and boot mem more clear.
and only call civertion one time instead of for all nodes.
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/setup.c | 1 +
arch/x86/mm/init_32.c | 1 -
arch/x86/mm/init_64.c | 3 +-
arch/x86/mm/numa_64.c | 62 +++++++++++++++-------------------------------
4 files changed, 22 insertions(+), 45 deletions(-)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f7b8b98..3ab0bf4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -942,6 +942,7 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
#ifdef CONFIG_X86_64
/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c973f8e..3c4fe2a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -764,7 +764,6 @@ static unsigned long __init setup_node_bootmem(int nodeid,
printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
nodeid, bootmap, bootmap + bootmap_size);
free_bootmem_with_active_regions(nodeid, end_pfn);
- early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
return bootmap + bootmap_size;
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5198b9b..1ea79ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -578,13 +578,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70..3232148 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -164,18 +164,21 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long align)
{
unsigned long mem = find_e820_area(start, end, size, align);
- void *ptr;
if (mem != -1L)
return __va(mem);
- ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
- if (ptr == NULL) {
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+
+ start = __pa(MAX_DMA_ADDRESS);
+ end = max_low_pfn_mapped << PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
+ if (mem != -1L)
+ return __va(mem);
+
+ printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
size, nodeid);
- return NULL;
- }
- return ptr;
+
+ return NULL;
}
/* Initialize bootmem allocator for a node */
@@ -211,8 +214,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
+ reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
+ nid = phys_to_nid(nodedata_phys);
+ if (nid != nodeid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
@@ -227,11 +234,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
* of alloc_bootmem, that could clash with reserved range
*/
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
- nid = phys_to_nid(nodedata_phys);
- if (nid == nodeid)
- bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
- else
- bootmap_start = roundup(start, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
@@ -239,18 +242,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
- if (nodedata_phys < start || nodedata_phys >= end) {
- /*
- * only need to free it if it is from other node
- * bootmem
- */
- if (nid != nodeid)
- free_bootmem(nodedata_phys, pgdat_size);
- }
+ free_early(nodedata_phys, nodedata_phys + pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
+ reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
+ "BOOTMAP");
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
@@ -259,31 +257,11 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
bootmap_start, bootmap_start + bootmap_size - 1,
bootmap_pages);
-
- free_bootmem_with_active_regions(nodeid, end);
-
- /*
- * convert early reserve to bootmem reserve earlier
- * otherwise early_node_mem could use early reserved mem
- * on previous node
- */
- early_res_to_bootmem(start, end);
-
- /*
- * in some case early_node_mem could use alloc_bootmem
- * to get range on other node, don't reserve that again
- */
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
- pgdat_size, BOOTMEM_DEFAULT);
nid = phys_to_nid(bootmap_start);
if (nid != nodeid)
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
+ free_bootmem_with_active_regions(nodeid, end);
node_set_online(nodeid);
}
--
1.6.0.2
to prepare allocate early res array from fine_e820_area
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/e820.c | 47 ++++++++++++++++++++++++++++++++---------------
1 files changed, 32 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a1a7876..291f6d2 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -724,14 +724,18 @@ core_initcall(e820_mark_nvs_memory);
/*
* Early reserved memory areas.
*/
-#define MAX_EARLY_RES 32
+/*
+ * need to make sure this one is bigger enough before
+ * find_e820_area could be used
+ */
+#define MAX_EARLY_RES_X 32
struct early_res {
u64 start, end;
- char name[16];
+ char name[15];
char overlap_ok;
};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
/*
@@ -745,12 +749,22 @@ static struct early_res early_res[MAX_EARLY_RES] __initdata = {
{}
};
+static int max_early_res __initdata = MAX_EARLY_RES_X;
+static struct early_res *early_res __initdata = &early_res_x[0];
+static int early_res_count __initdata =
+#ifdef CONFIG_X86_32
+ 2
+#else
+ 1
+#endif
+ ;
+
static int __init find_overlapped_early(u64 start, u64 end)
{
int i;
struct early_res *r;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
r = &early_res[i];
if (end > r->start && start < r->end)
break;
@@ -768,13 +782,14 @@ static void __init drop_range(int i)
{
int j;
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ for (j = i + 1; j < max_early_res && early_res[j].end; j++)
;
memmove(&early_res[i], &early_res[i + 1],
(j - 1 - i) * sizeof(struct early_res));
early_res[j - 1].end = 0;
+ early_res_count--;
}
/*
@@ -793,9 +808,9 @@ static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
struct early_res *r;
u64 lower_start, lower_end;
u64 upper_start, upper_end;
- char name[16];
+ char name[15];
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
r = &early_res[i];
/* Continue past non-overlapping ranges */
@@ -851,7 +866,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name,
struct early_res *r;
i = find_overlapped_early(start, end);
- if (i >= MAX_EARLY_RES)
+ if (i >= max_early_res)
panic("Too many early reservations");
r = &early_res[i];
if (r->end)
@@ -864,6 +879,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name,
r->overlap_ok = overlap_ok;
if (name)
strncpy(r->name, name, sizeof(r->name) - 1);
+ early_res_count++;
}
/*
@@ -916,7 +932,7 @@ void __init free_early(u64 start, u64 end)
i = find_overlapped_early(start, end);
r = &early_res[i];
- if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+ if (i >= max_early_res || r->end != end || r->start != start)
panic("free_early on not reserved area: %llx-%llx!",
start, end - 1);
@@ -927,14 +943,15 @@ void __init early_res_to_bootmem(u64 start, u64 end)
{
int i, count;
u64 final_start, final_end;
+ int idx = 0;
count = 0;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
count++;
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
+ printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+ count - idx, max_early_res, start, end);
+ for (i = idx; i < count; i++) {
struct early_res *r = &early_res[i];
printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
r->start, r->end, r->name);
@@ -961,7 +978,7 @@ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
again:
i = find_overlapped_early(addr, addr + size);
r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
+ if (i < max_early_res && r->end) {
*addrp = addr = round_up(r->end, align);
changed = 1;
goto again;
@@ -978,7 +995,7 @@ static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
int changed = 0;
again:
last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
if (last > r->start && addr < r->start) {
size = r->start - addr;
--
1.6.0.2
use early_res_count to track the num, and use find_e820 to get new buffer.
and copy from old to new one.
also clear early_res to prevent later invalid using
-v2 _check_and_double_early_res should take new start
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/e820.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 53 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 291f6d2..949d688 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -908,6 +908,48 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
__reserve_early(start, end, name, 1);
}
+static void __init __check_and_double_early_res(u64 start)
+{
+ u64 end, size, mem;
+ struct early_res *new;
+
+ /* do we have enough slots left ? */
+ if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
+ return;
+
+ /* double it */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ size = sizeof(struct early_res) * max_early_res * 2;
+ mem = find_e820_area(start, end, size, sizeof(struct early_res));
+
+ if (mem == -1ULL)
+ panic("can not find more space for early_res array");
+
+ new = __va(mem);
+ /* save the first one for own */
+ new[0].start = mem;
+ new[0].end = mem + size;
+ new[0].overlap_ok = 0;
+ /* copy old to new */
+ if (early_res == early_res_x) {
+ memcpy(&new[1], &early_res[0],
+ sizeof(struct early_res) * max_early_res);
+ memset(&new[max_early_res+1], 0,
+ sizeof(struct early_res) * (max_early_res - 1));
+ early_res_count++;
+ } else {
+ memcpy(&new[1], &early_res[1],
+ sizeof(struct early_res) * (max_early_res - 1));
+ memset(&new[max_early_res], 0,
+ sizeof(struct early_res) * max_early_res);
+ }
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = new;
+ max_early_res *= 2;
+ printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
+ max_early_res, mem, mem + size - 1);
+}
+
/*
* Most early reservations come here.
*
@@ -921,6 +963,8 @@ void __init reserve_early(u64 start, u64 end, char *name)
if (start >= end)
return;
+ __check_and_double_early_res(end);
+
drop_overlaps_that_are_ok(start, end);
__reserve_early(start, end, name, 0);
}
@@ -949,6 +993,10 @@ void __init early_res_to_bootmem(u64 start, u64 end)
for (i = 0; i < max_early_res && early_res[i].end; i++)
count++;
+ /* need to skip first one ?*/
+ if (early_res != early_res_x)
+ idx = 1;
+
printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
count - idx, max_early_res, start, end);
for (i = idx; i < count; i++) {
@@ -966,6 +1014,11 @@ void __init early_res_to_bootmem(u64 start, u64 end)
reserve_bootmem_generic(final_start, final_end - final_start,
BOOTMEM_DEFAULT);
}
+ /* clear them */
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ early_res_count = 0;
}
/* Check for already reserved areas */
--
1.6.0.2
so we could double check if we have enough low pages later
-v2: fix errors checkpatch.pl reported
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/mm/init_64.c | 2 +
include/linux/bootmem.h | 2 +
mm/bootmem.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 96 insertions(+), 0 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ea79ad..f9530eb 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -654,6 +654,8 @@ void __init mem_init(void)
long codesize, reservedpages, datasize, initsize;
unsigned long absent_pages;
+ print_bootmem_free();
+
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index b10ec49..3446bed 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -55,6 +55,8 @@ extern void free_bootmem_node(pg_data_t *pgdat,
extern void free_bootmem(unsigned long addr, unsigned long size);
extern void free_bootmem_late(unsigned long addr, unsigned long size);
+void print_bootmem_free(void);
+
/*
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
* the architecture-specific code should honor this).
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8caf744..954a7e7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -268,6 +268,98 @@ static void __init __free(bootmem_data_t *bdata,
BUG();
}
+static void __init print_all_bootmem_free_core(bootmem_data_t *bdata)
+{
+ int aligned;
+ unsigned long *map;
+ unsigned long start, end, count = 0;
+ unsigned long free_start = -1UL, free_end = 0;
+
+ if (!bdata->node_bootmem_map)
+ return;
+
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+
+ /*
+ * If the start is aligned to the machines wordsize, we might
+ * be able to count it in bulks of that order.
+ */
+ aligned = !(start & (BITS_PER_LONG - 1));
+
+ printk(KERN_DEBUG "nid=%td start=0x%010lx end=0x%010lx aligned=%d\n",
+ bdata - bootmem_node_data, start, end, aligned);
+ map = bdata->node_bootmem_map;
+
+ while (start < end) {
+ unsigned long idx, vec;
+
+ idx = start - bdata->node_min_pfn;
+ vec = ~map[idx / BITS_PER_LONG];
+
+ if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+ if (free_start == -1UL) {
+ free_start = idx;
+ free_end = free_start + BITS_PER_LONG;
+ } else {
+ if (free_end == idx) {
+ free_end += BITS_PER_LONG;
+ } else {
+ /* there is gap, print old */
+ printk(KERN_DEBUG " free [0x%010lx - 0x%010lx]\n",
+ free_start + bdata->node_min_pfn,
+ free_end + bdata->node_min_pfn);
+ free_start = idx;
+ free_end = idx + BITS_PER_LONG;
+ }
+ }
+ count += BITS_PER_LONG;
+ } else {
+ unsigned long off = 0;
+
+ while (vec && off < BITS_PER_LONG) {
+ if (vec & 1) {
+ if (free_start == -1UL) {
+ free_start = idx + off;
+ free_end = free_start + 1;
+ } else {
+ if (free_end == (idx + off)) {
+ free_end++;
+ } else {
+ /* there is gap, print old */
+ printk(KERN_DEBUG " free [0x%010lx - 0x%010lx]\n",
+ free_start + bdata->node_min_pfn,
+ free_end + bdata->node_min_pfn);
+ free_start = idx + off;
+ free_end = free_start + 1;
+ }
+ }
+ count++;
+ }
+ vec >>= 1;
+ off++;
+ }
+ }
+ start += BITS_PER_LONG;
+ }
+
+ /* last one */
+ if (free_start != -1UL)
+ printk(KERN_DEBUG " free [0x%010lx - 0x%010lx]\n",
+ free_start + bdata->node_min_pfn,
+ free_end + bdata->node_min_pfn);
+ printk(KERN_DEBUG " total free 0x%010lx\n", count);
+}
+
+void __init print_bootmem_free(void)
+{
+ bootmem_data_t *bdata;
+
+ list_for_each_entry(bdata, &bdata_list, list) {
+ print_all_bootmem_free_core(bdata);
+ }
+}
+
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
unsigned long eidx, int flags)
{
--
1.6.0.2
so we could put pgdata for the node high, and later sparse
vmmap will get the section nr that need.
with this patch will make <4g ram will not use sparse vmmap
before this patch, will get, before swiotlb try get bootmem
[ 0.000000] nid=1 start=0 end=2080000 aligned=1
[ 0.000000] free [10 - 96]
[ 0.000000] free [b12 - 1000]
[ 0.000000] free [359f - 38a3]
[ 0.000000] free [38b5 - 3a00]
[ 0.000000] free [41e01 - 42000]
[ 0.000000] free [73dde - 73e00]
[ 0.000000] free [73fdd - 74000]
[ 0.000000] free [741dd - 74200]
[ 0.000000] free [743dd - 74400]
[ 0.000000] free [745dd - 74600]
[ 0.000000] free [747dd - 74800]
[ 0.000000] free [749dd - 74a00]
[ 0.000000] free [74bdd - 74c00]
[ 0.000000] free [74ddd - 74e00]
[ 0.000000] free [74fdd - 75000]
[ 0.000000] free [751dd - 75200]
[ 0.000000] free [753dd - 75400]
[ 0.000000] free [755dd - 75600]
[ 0.000000] free [757dd - 75800]
[ 0.000000] free [759dd - 75a00]
[ 0.000000] free [75bdd - 7bf5f]
[ 0.000000] free [7f730 - 7f750]
[ 0.000000] free [100000 - 2080000]
[ 0.000000] total free 1f87170
[ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000
[ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000
with this patch will get: before swiotlb try get bootmem
[ 0.000000] nid=1 start=0 end=2080000 aligned=1
[ 0.000000] free [a - 96]
[ 0.000000] free [702 - 1000]
[ 0.000000] free [359f - 3600]
[ 0.000000] free [37de - 3800]
[ 0.000000] free [39dd - 3a00]
[ 0.000000] free [3bdd - 3c00]
[ 0.000000] free [3ddd - 3e00]
[ 0.000000] free [3fdd - 4000]
[ 0.000000] free [41dd - 4200]
[ 0.000000] free [43dd - 4400]
[ 0.000000] free [45dd - 4600]
[ 0.000000] free [47dd - 4800]
[ 0.000000] free [49dd - 4a00]
[ 0.000000] free [4bdd - 4c00]
[ 0.000000] free [4ddd - 4e00]
[ 0.000000] free [4fdd - 5000]
[ 0.000000] free [51dd - 5200]
[ 0.000000] free [53dd - 5400]
[ 0.000000] free [55dd - 7bf5f]
[ 0.000000] free [7f730 - 7f750]
[ 0.000000] free [100428 - 100600]
[ 0.000000] free [13ea01 - 13ec00]
[ 0.000000] free [170800 - 2080000]
[ 0.000000] total free 1f87170
[ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
[ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000
[ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000
so will get enough space below 4G, aka pfn 0x100000
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/mm/numa_64.c | 23 ++++++++++++++++++-----
1 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3232148..02f13cb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,14 +163,27 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
unsigned long end, unsigned long size,
unsigned long align)
{
- unsigned long mem = find_e820_area(start, end, size, align);
+ unsigned long mem;
+ /*
+ * put it on high as possible
+ * something will go with NODE_DATA
+ */
+ if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+ end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
-
- start = __pa(MAX_DMA_ADDRESS);
- end = max_low_pfn_mapped << PAGE_SHIFT;
+ /* extend the search scope */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ else
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
--
1.6.0.2
64bit NUMA already make enough space under 4G with new early_node_mem
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/include/asm/pci.h | 2 ++
arch/x86/include/asm/pci_64.h | 2 --
arch/x86/kernel/pci-dma.c | 13 ++++++++++---
arch/x86/kernel/setup.c | 7 -------
4 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c20..b4a00dd 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -124,6 +124,8 @@ extern void pci_iommu_alloc(void);
#include "pci_64.h"
#endif
+void dma32_reserve_bootmem(void);
+
/* implement the pci_ DMA API in terms of the generic device dma_ one */
#include <asm-generic/pci-dma-compat.h>
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f..fe15cfb 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
int reg, int len, u32 value);
-extern void dma32_reserve_bootmem(void);
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e2..1aa966c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
static __initdata void *dma32_bootmem_ptr;
static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
@@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void)
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#else
+void __init dma32_reserve_bootmem(void)
+{
+}
+static void __init dma32_free_bootmem(void)
+{
+}
+
#endif
void __init pci_iommu_alloc(void)
{
-#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
-#endif
+
if (pci_swiotlb_detect())
goto out;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3ab0bf4..2c67cab 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -944,14 +944,7 @@ void __init setup_arch(char **cmdline_p)
initmem_init(0, max_pfn, acpi, k8);
early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-#ifdef CONFIG_X86_64
- /*
- * dma32_reserve_bootmem() allocates bootmem which may conflict
- * with the crashkernel command line, so do that after
- * reserve_crashkernel()
- */
dma32_reserve_bootmem();
-#endif
reserve_ibft_region();
--
1.6.0.2
finally we can use early_res to replace bootmem for x86_64 now.
still can use CONFIG_NO_BOOTMEM to enable it or not
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/Kconfig | 7 ++
arch/x86/include/asm/e820.h | 6 ++
arch/x86/kernel/e820.c | 157 +++++++++++++++++++++++++++++++---
arch/x86/kernel/setup.c | 2 +
arch/x86/mm/init_64.c | 7 ++-
arch/x86/mm/numa_64.c | 20 ++++-
include/linux/bootmem.h | 7 ++
include/linux/mm.h | 5 +
include/linux/mmzone.h | 2 +
mm/bootmem.c | 195 ++++++++++++++++++++++++++++++++++++++++++-
mm/page_alloc.c | 59 +++++++++++++-
mm/percpu.c | 3 +
mm/sparse-vmemmap.c | 2 +-
13 files changed, 448 insertions(+), 24 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cbc1b98..ffb025c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -567,6 +567,13 @@ config PARAVIRT_DEBUG
Enable to debug paravirt_ops internals. Specifically, BUG if
a paravirt_op is missing when it is called.
+config NO_BOOTMEM
+ default y
+ bool "Disable Bootmem code"
+ depends on X86_64
+ ---help---
+ use early_res directly instead of bootmem before slab is ready.
+
config MEMTEST
bool "Memtest"
---help---
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 761249e..7d72e5f 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -117,6 +117,12 @@ extern void free_early(u64 start, u64 end);
extern void early_res_to_bootmem(u64 start, u64 end);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+void reserve_early_without_check(u64 start, u64 end, char *name);
+u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+ u64 size, u64 align);
+#include <linux/range.h>
+int get_free_all_memory_range(struct range **rangep, int nodeid);
+
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
extern int e820_find_active_region(const struct e820entry *ei,
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 949d688..6b801e0 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -969,6 +969,25 @@ void __init reserve_early(u64 start, u64 end, char *name)
__reserve_early(start, end, name, 0);
}
+void __init reserve_early_without_check(u64 start, u64 end, char *name)
+{
+ struct early_res *r;
+
+ if (start >= end)
+ return;
+
+ __check_and_double_early_res(end);
+
+ r = &early_res[early_res_count];
+
+ r->start = start;
+ r->end = end;
+ r->overlap_ok = 0;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+ early_res_count++;
+}
+
void __init free_early(u64 start, u64 end)
{
struct early_res *r;
@@ -983,6 +1002,92 @@ void __init free_early(u64 start, u64 end)
drop_range(i);
}
+#ifdef CONFIG_NO_BOOTMEM
+static void __init subtract_early_res(struct range *range, int az)
+{
+ int i, count;
+ u64 final_start, final_end;
+ int idx = 0;
+
+ count = 0;
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
+ count++;
+
+ /* need to skip first one ?*/
+ if (early_res != early_res_x)
+ idx = 1;
+
+#if 1
+ printk(KERN_INFO "Subtract (%d early reservations)\n", count);
+#endif
+ for (i = idx; i < count; i++) {
+ struct early_res *r = &early_res[i];
+#if 0
+ printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i,
+ r->start, r->end, r->name);
+#endif
+ final_start = PFN_DOWN(r->start);
+ final_end = PFN_UP(r->end);
+ if (final_start >= final_end) {
+#if 0
+ printk(KERN_CONT "\n");
+#endif
+ continue;
+ }
+#if 0
+ printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n",
+ final_start, final_end);
+#endif
+ subtract_range(range, az, final_start, final_end - 1);
+ }
+
+}
+
+int __init get_free_all_memory_range(struct range **rangep, int nodeid)
+{
+ int i, count;
+ u64 start = 0, end;
+ u64 size;
+ u64 mem;
+ struct range *range;
+ int nr_range;
+
+ count = 0;
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
+ count++;
+
+ count *= 2;
+
+ size = sizeof(struct range) * count;
+ if (max_pfn_mapped > MAX_DMA32_PFN)
+ start = MAX_DMA32_PFN << PAGE_SHIFT;
+ end = max_pfn_mapped << PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, sizeof(struct range));
+ if (mem == -1ULL)
+ panic("can not find more space for range free");
+
+ range = __va(mem);
+ /* use early_node_map[] and early_res to get range array at first */
+ memset(range, 0, size);
+ nr_range = 0;
+
+ /* need to go over early_node_map to find out good range for node */
+ nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
+ subtract_early_res(range, count);
+ nr_range = clean_sort_range(range, count);
+
+ /* need to clear it ? */
+ if (nodeid == MAX_NUMNODES) {
+ memset(&early_res[0], 0,
+ sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ }
+
+ *rangep = range;
+ return nr_range;
+}
+#else
void __init early_res_to_bootmem(u64 start, u64 end)
{
int i, count;
@@ -1020,6 +1125,7 @@ void __init early_res_to_bootmem(u64 start, u64 end)
max_early_res = 0;
early_res_count = 0;
}
+#endif
/* Check for already reserved areas */
static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
@@ -1075,6 +1181,35 @@ again:
/*
* Find a free area with specified alignment in a specific range.
+ * only with the area.between start to end is active range from early_node_map
+ * so they are good as RAM
+ */
+u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+ u64 size, u64 align)
+{
+ u64 addr, last;
+
+ addr = round_up(ei_start, align);
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ goto out;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ goto out;
+ if (last > end)
+ goto out;
+
+ return addr;
+
+out:
+ return -1ULL;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
*/
u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
{
@@ -1082,24 +1217,20 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ u64 addr;
+ u64 ei_start, ei_last;
if (ei->type != E820_RAM)
continue;
- addr = round_up(ei->addr, align);
+
ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
+
+ if (addr == -1ULL)
continue;
+
return addr;
}
return -1ULL;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2c67cab..824fef7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -942,7 +942,9 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
+#ifndef CONFIG_NO_BOOTMEM
early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+#endif
dma32_reserve_bootmem();
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f9530eb..f13e5bd 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -571,6 +571,7 @@ kernel_physical_mapping_init(unsigned long start,
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
+#ifndef CONFIG_NO_BOOTMEM
unsigned long bootmap_size, bootmap;
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -582,8 +583,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
- e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
+#else
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#endif
}
#endif
@@ -654,7 +657,9 @@ void __init mem_init(void)
long codesize, reservedpages, datasize, initsize;
unsigned long absent_pages;
+#ifndef CONFIG_NO_BOOTMEM
print_bootmem_free();
+#endif
pci_iommu_alloc();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 02f13cb..a20e170 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -198,11 +198,13 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
- unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+ unsigned long start_pfn, last_pfn, nodedata_phys;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- unsigned long bootmap_start, nodedata_phys;
- void *bootmap;
int nid;
+#ifndef CONFIG_NO_BOOTMEM
+ unsigned long bootmap_start, bootmap_pages, bootmap_size;
+ void *bootmap;
+#endif
if (!end)
return;
@@ -216,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
start = roundup(start, ZONE_ALIGN);
- printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+ printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
start_pfn = start >> PAGE_SHIFT;
@@ -235,10 +237,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+ NODE_DATA(nodeid)->node_id = nodeid;
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
+#ifndef CONFIG_NO_BOOTMEM
+ NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+
/*
* Find a place for the bootmem map
* nodedata_phys could be on other nodes by alloc_bootmem,
@@ -275,6 +280,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
free_bootmem_with_active_regions(nodeid, end);
+#endif
node_set_online(nodeid);
}
@@ -733,6 +739,10 @@ unsigned long __init numa_free_all_bootmem(void)
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
+#ifdef CONFIG_NO_BOOTMEM
+ pages += free_all_memory_core_early(MAX_NUMNODES);
+#endif
+
return pages;
}
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3446bed..383daa8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -23,6 +23,7 @@ extern unsigned long max_pfn;
extern unsigned long saved_max_pfn;
#endif
+#ifndef CONFIG_NO_BOOTMEM
/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
@@ -37,6 +38,7 @@ typedef struct bootmem_data {
} bootmem_data_t;
extern bootmem_data_t bootmem_node_data[];
+#endif
extern unsigned long bootmem_bootmap_pages(unsigned long);
@@ -46,6 +48,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
unsigned long endpfn);
extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
+unsigned long free_all_memory_core_early(int nodeid);
extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
extern unsigned long free_all_bootmem(void);
@@ -86,6 +89,10 @@ extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
+void *__alloc_bootmem_node_high(pg_data_t *pgdat,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal);
extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..c94ce6c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -12,6 +12,7 @@
#include <linux/prio_tree.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
+#include <linux/range.h>
struct mempolicy;
struct anon_vma;
@@ -1047,6 +1048,10 @@ extern void get_pfn_range_for_nid(unsigned int nid,
extern unsigned long find_min_pfn_with_active_regions(void);
extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
+int add_from_early_node_map(struct range *range, int az,
+ int nr_range, int nid);
+void *__alloc_memory_core_early(int nodeid, u64 size, u64 align,
+ u64 goal, u64 limit);
typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
extern void sparse_memory_present_with_active_regions(int nid);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 30fe668..eae8387 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -620,7 +620,9 @@ typedef struct pglist_data {
struct page_cgroup *node_page_cgroup;
#endif
#endif
+#ifndef CONFIG_NO_BOOTMEM
struct bootmem_data *bdata;
+#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Must be held any time you expect node_start_pfn, node_present_pages
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 954a7e7..9a9c8e4 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/kmemleak.h>
+#include <linux/range.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
unsigned long saved_max_pfn;
#endif
+#ifndef CONFIG_NO_BOOTMEM
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
min_low_pfn = start;
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-
+#endif
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
}
}
+#ifdef CONFIG_NO_BOOTMEM
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+ int i;
+ unsigned long start_aligned, end_aligned;
+ int order = ilog2(BITS_PER_LONG);
+
+ start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+ end_aligned = end & ~(BITS_PER_LONG - 1);
+
+ if (end_aligned <= start_aligned) {
+#if 1
+ printk(KERN_DEBUG " %lx - %lx\n", start, end);
+#endif
+ for (i = start; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ return;
+ }
+
+#if 1
+ printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
+ start, start_aligned, end_aligned, end);
+#endif
+ for (i = start; i < start_aligned; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+ __free_pages_bootmem(pfn_to_page(i), order);
+
+ for (i = end_aligned; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+ int i;
+ u64 start, end;
+ unsigned long count = 0;
+ struct range *range = NULL;
+ int nr_range;
+
+ nr_range = get_free_all_memory_range(&range, nodeid);
+
+ for (i = 0; i < nr_range; i++) {
+ start = range[i].start;
+ end = range[i].end + 1;
+ count += end - start;
+ __free_pages_memory(start, end);
+ }
+
+ return count;
+}
+#else
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
@@ -228,6 +284,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
+#endif
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -238,7 +295,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+#ifdef CONFIG_NO_BOOTMEM
+ /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+ return 0;
+#else
return free_all_bootmem_core(pgdat->bdata);
+#endif
}
/**
@@ -248,9 +310,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
*/
unsigned long __init free_all_bootmem(void)
{
+#ifdef CONFIG_NO_BOOTMEM
+ return free_all_memory_core_early(NODE_DATA(0)->node_id);
+#else
return free_all_bootmem_core(NODE_DATA(0)->bdata);
+#endif
}
+#ifndef CONFIG_NO_BOOTMEM
static void __init __free(bootmem_data_t *bdata,
unsigned long sidx, unsigned long eidx)
{
@@ -437,6 +504,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
}
BUG();
}
+#endif
/**
* free_bootmem_node - mark a page range as usable
@@ -451,6 +519,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ free_early(physaddr, physaddr + size);
+#if 0
+ printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
+#endif
+#else
unsigned long start, end;
kmemleak_free_part(__va(physaddr), size);
@@ -459,6 +533,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
end = PFN_DOWN(physaddr + size);
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+#endif
}
/**
@@ -472,6 +547,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ free_early(addr, addr + size);
+#if 0
+ printk(KERN_DEBUG "free %lx %lx\n", addr, size);
+#endif
+#else
unsigned long start, end;
kmemleak_free_part(__va(addr), size);
@@ -480,6 +561,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
end = PFN_DOWN(addr + size);
mark_bootmem(start, end, 0, 0);
+#endif
}
/**
@@ -496,12 +578,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
+#ifdef CONFIG_NO_BOOTMEM
+ panic("no bootmem");
+ return 0;
+#else
unsigned long start, end;
start = PFN_DOWN(physaddr);
end = PFN_UP(physaddr + size);
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+#endif
}
/**
@@ -517,14 +604,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
{
+#ifdef CONFIG_NO_BOOTMEM
+ panic("no bootmem");
+ return 0;
+#else
unsigned long start, end;
start = PFN_DOWN(addr);
end = PFN_UP(addr + size);
return mark_bootmem(start, end, 1, flags);
+#endif
}
+#ifndef CONFIG_NO_BOOTMEM
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
@@ -675,12 +768,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
#endif
return NULL;
}
+#endif
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
{
+#ifdef CONFIG_NO_BOOTMEM
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+ if (ptr)
+ return ptr;
+
+ if (goal != 0) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+#else
bootmem_data_t *bdata;
void *region;
@@ -706,6 +820,7 @@ restart:
}
return NULL;
+#endif
}
/**
@@ -724,7 +839,13 @@ restart:
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem_nopanic(size, align, goal, 0);
+ unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+ limit = -1UL;
+#endif
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
}
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -758,9 +879,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem(size, align, goal, 0);
+ unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+ limit = -1UL;
+#endif
+
+ return ___alloc_bootmem(size, align, goal, limit);
}
+#ifndef CONFIG_NO_BOOTMEM
static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
@@ -777,6 +905,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
return ___alloc_bootmem(size, align, goal, limit);
}
+#endif
/**
* __alloc_bootmem_node - allocate boot memory from a specific node
@@ -799,7 +928,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ return __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+#else
return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+#endif
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+ unsigned long end_pfn;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ /* update goal according ...MAX_DMA32_PFN */
+ end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+ (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+ void *ptr;
+ unsigned long new_goal;
+
+ new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ new_goal, -1ULL);
+#else
+ ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+ new_goal, 0);
+#endif
+ if (ptr)
+ return ptr;
+ }
+#endif
+
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+
}
#ifdef CONFIG_SPARSEMEM
@@ -813,6 +981,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
+#ifdef CONFIG_NO_BOOTMEM
+ unsigned long pfn, goal, limit;
+
+ pfn = section_nr_to_pfn(section_nr);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+ return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+ SMP_CACHE_BYTES, goal, limit);
+#else
bootmem_data_t *bdata;
unsigned long pfn, goal, limit;
@@ -822,6 +1000,7 @@ void * __init alloc_bootmem_section(unsigned long size,
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+#endif
}
#endif
@@ -833,11 +1012,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+#else
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+#endif
if (ptr)
return ptr;
@@ -888,6 +1072,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ return __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+#else
return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
+#endif
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e86965..92b6751 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3430,6 +3430,59 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+int __init add_from_early_node_map(struct range *range, int az,
+ int nr_range, int nid)
+{
+ int i;
+ u64 start, end;
+
+ /* need to go over early_node_map to find out good range for node */
+ for_each_active_range_index_in_nid(i, nid) {
+ start = early_node_map[i].start_pfn;
+ end = early_node_map[i].end_pfn;
+ nr_range = add_range(range, az, nr_range, start, end - 1);
+ }
+ return nr_range;
+}
+
+void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ int i;
+ void *ptr;
+
+ /* need to go over early_node_map to find out good range for node */
+ for_each_active_range_index_in_nid(i, nid) {
+ u64 addr;
+ u64 ei_start, ei_last;
+
+ ei_last = early_node_map[i].end_pfn;
+ ei_last <<= PAGE_SHIFT;
+ ei_start = early_node_map[i].start_pfn;
+ ei_start <<= PAGE_SHIFT;
+ addr = find_early_area(ei_start, ei_last,
+ goal, limit, size, align);
+
+ if (addr == -1ULL)
+ continue;
+
+#if 0
+ printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
+ nid,
+ ei_start, ei_last, goal, limit, size,
+ align, addr);
+#endif
+
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ reserve_early_without_check(addr, addr + size, "BOOTMEM");
+ return ptr;
+ }
+
+ return NULL;
+}
+
+
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
int i;
@@ -4462,7 +4515,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
+struct pglist_data __refdata contig_page_data = {
+#ifndef CONFIG_NO_BOOTMEM
+ .bdata = &bootmem_node_data[0]
+#endif
+ };
EXPORT_SYMBOL(contig_page_data);
#endif
diff --git a/mm/percpu.c b/mm/percpu.c
index 442010c..48bb212 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1927,7 +1927,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
+#ifndef CONFIG_NO_BOOTMEM
+ /* fix partial free ! */
free_fn(ptr + size_sum, ai->unit_size - size_sum);
+#endif
}
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bd..9506c39 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,7 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node(NODE_DATA(node), size, align, goal);
+ return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
}
--
1.6.0.2
could save some buf instead of applying one by one
could help that system that is going to use early_res instead of bootmem
less entries in early_res make search more faster on system with more memory.
Signed-off-by: Yinghai Lu <[email protected]>
---
mm/sparse.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 66 insertions(+), 18 deletions(-)
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab..0cdaf0b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
static unsigned long * __init
-sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long count)
{
unsigned long section_nr;
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
* this problem.
*/
section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
- return alloc_bootmem_section(usemap_size(), section_nr);
+ return alloc_bootmem_section(usemap_size() * count, section_nr);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#else
static unsigned long * __init
-sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long count)
{
return NULL;
}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
+static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long usemap_count, int nodeid)
{
- unsigned long *usemap;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
-
- usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
- if (usemap)
- return usemap;
+ void *usemap;
+ unsigned long pnum;
+ int size = usemap_size();
- usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
+ usemap_count);
if (usemap) {
- check_usemap_section_nr(nid, usemap);
- return usemap;
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ }
+ return;
}
- /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
- nid = 0;
+ usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
+ if (usemap) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ check_usemap_section_nr(nodeid, usemap_map[pnum]);
+ }
+ return;
+ }
printk(KERN_WARNING "%s: allocation failed\n", __func__);
- return NULL;
}
#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -396,6 +411,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -407,6 +423,9 @@ void __init sparse_init(void)
unsigned long *usemap;
unsigned long **usemap_map;
int size;
+ int nodeid_begin = 0;
+ unsigned long pnum_begin = 0;
+ unsigned long usemap_count;
/*
* map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +444,39 @@ void __init sparse_init(void)
panic("can not allocate usemap_map\n");
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
if (!present_section_nr(pnum))
continue;
- usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ usemap_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ usemap_count++;
+ continue;
+ }
+ /* ok, we need to take cake of from pnum_begin to pnum - 1*/
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
+ usemap_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ usemap_count = 1;
}
+ /* ok, last chunk */
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
+ usemap_count, nodeid_begin);
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
--
1.6.0.2
add vmemmap_alloc_block_buf for mem map only.
it will fallback old wayif can not get that big.
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/mm/init_64.c | 2 +-
include/linux/mm.h | 7 +++
mm/sparse-vmemmap.c | 75 ++++++++++++++++++++++++++++++++-
mm/sparse.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 192 insertions(+), 3 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f13e5bd..21090d8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -961,7 +961,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
if (pmd_none(*pmd)) {
pte_t entry;
- p = vmemmap_alloc_block(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
if (!p)
return -ENOMEM;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c94ce6c..814cb8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1323,12 +1323,19 @@ extern int randomize_va_space;
const char * arch_vma_name(struct vm_area_struct *vma);
void print_vma_addr(char *prefix, unsigned long rip);
+void sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nodeid);
+
struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
void *vmemmap_alloc_block(unsigned long size, int node);
+void *vmemmap_alloc_block_buf(unsigned long size, int node);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
int vmemmap_populate_basepages(struct page *start_page,
unsigned long pages, int node);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 9506c39..5a4bc7a 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -43,6 +43,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
}
+static void *buf;
+static void *buf_end;
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
__pa(MAX_DMA_ADDRESS));
}
+/* need to make sure size is all the same during early stage */
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+{
+ void *ptr;
+
+ if (!buf)
+ return vmemmap_alloc_block(size, node);
+
+ /* take the from buf */
+ ptr = (void *)ALIGN((unsigned long)buf, size);
+ if (ptr + size > buf_end)
+ return vmemmap_alloc_block(size, node);
+
+ buf = ptr + size;
+
+ return ptr;
+}
+
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,56 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
return map;
}
+
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+ void *buf_start;
+
+ size = ALIGN(size, PMD_SIZE);
+ buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
+ PMD_SIZE, __pa(MAX_DMA_ADDRESS));
+
+ if (buf_start) {
+ buf = buf_start;
+ buf_end = buf_start + size * map_count;
+ }
+
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+
+ if (buf_start) {
+ /* need to free left buf */
+#ifdef CONFIG_NO_BOOTMEM
+ free_early(__pa(buf_start), __pa(buf_end));
+ if (buf_start < buf) {
+ char name[15];
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, 15, "MEMMAP %d", nodeid);
+ reserve_early_without_check(__pa(buf_start), __pa(buf),
+ name);
+ }
+#else
+ free_bootmem(__pa(buf), buf_end - buf);
+#endif
+ buf = NULL;
+ buf_end = NULL;
+ }
+}
diff --git a/mm/sparse.c b/mm/sparse.c
index 0cdaf0b..b27f759 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -390,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
return map;
}
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ void *map;
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+
+ map = alloc_remap(nodeid, size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ size = PAGE_ALIGN(size);
+ map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ /* fallback */
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
+ map_count, nodeid);
+}
+
+#ifndef CONFIG_X86_64
static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -407,6 +464,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
ms->section_mem_map = 0;
return NULL;
}
+#endif
void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
@@ -420,12 +478,14 @@ void __init sparse_init(void)
{
unsigned long pnum;
struct page *map;
+ struct page **map_map;
unsigned long *usemap;
unsigned long **usemap_map;
- int size;
+ int size, size2;
int nodeid_begin = 0;
unsigned long pnum_begin = 0;
unsigned long usemap_count;
+ unsigned long map_count;
/*
* map is using big page (aka 2M in x86 64 bit)
@@ -478,6 +538,48 @@ void __init sparse_init(void)
sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
usemap_count, nodeid_begin);
+#ifdef CONFIG_X86_64
+ size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
+ map_map = alloc_bootmem(size2);
+ if (!map_map)
+ panic("can not allocate map_map\n");
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ map_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ map_count++;
+ continue;
+ }
+ /* ok, we need to take cake of from pnum_begin to pnum - 1*/
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
+ map_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ map_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
+ map_count, nodeid_begin);
+#endif
+
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
continue;
@@ -486,7 +588,11 @@ void __init sparse_init(void)
if (!usemap)
continue;
+#ifdef CONFIG_X86_64
+ map = map_map[pnum];
+#else
map = sparse_early_mem_map_alloc(pnum);
+#endif
if (!map)
continue;
@@ -496,6 +602,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
+#ifdef CONFIG_X86_64
+ free_bootmem(__pa(map_map), size2);
+#endif
free_bootmem(__pa(usemap_map), size);
}
--
1.6.0.2
so make interface more consistent with early_res.
later we can share some code with early_res.
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/cpu/mtrr/cleanup.c | 32 ++++++++++++++++----------------
arch/x86/kernel/e820.c | 2 +-
arch/x86/pci/amd_bus.c | 24 ++++++++++++++----------
kernel/range.c | 20 ++++++++++----------
mm/bootmem.c | 2 +-
mm/page_alloc.c | 2 +-
6 files changed, 43 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 669da09..06130b5 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -78,13 +78,13 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
- base, base + size - 1);
+ base, base + size);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end + 1);
+ range[i].start, range[i].end);
}
/* Take out UC ranges: */
@@ -106,11 +106,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, RANGE_NUM, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size);
}
if (extra_remove_size)
subtract_range(range, RANGE_NUM, extra_remove_base,
- extra_remove_base + extra_remove_size - 1);
+ extra_remove_base + extra_remove_size);
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
@@ -118,7 +118,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
if (!range[i].end)
continue;
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end + 1);
+ range[i].start, range[i].end);
}
}
@@ -128,7 +128,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end + 1);
+ range[i].start, range[i].end);
}
return nr_range;
@@ -142,7 +142,7 @@ static unsigned long __init sum_ranges(struct range *range, int nr_range)
int i;
for (i = 0; i < nr_range; i++)
- sum += range[i].end + 1 - range[i].start;
+ sum += range[i].end - range[i].start;
return sum;
}
@@ -489,7 +489,7 @@ x86_setup_var_mtrrs(struct range *range, int nr_range,
/* Write the range: */
for (i = 0; i < nr_range; i++) {
set_var_mtrr_range(&var_state, range[i].start,
- range[i].end - range[i].start + 1);
+ range[i].end - range[i].start);
}
/* Write the last range: */
@@ -720,7 +720,7 @@ int __init mtrr_cleanup(unsigned address_bits)
* and fixed mtrrs should take effect before var mtrr for it:
*/
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
- (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ 1ULL<<(20 - PAGE_SHIFT));
/* Sort the ranges: */
sort_range(range, nr_range);
@@ -939,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
nr_range = 0;
if (mtrr_tom2) {
range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
- range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
- if (highest_pfn < range[nr_range].end + 1)
- highest_pfn = range[nr_range].end + 1;
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
nr_range++;
}
nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -953,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* Check the holes: */
for (i = 0; i < nr_range - 1; i++) {
- if (range[i].end + 1 < range[i+1].start)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
range[i+1].start);
}
/* Check the top: */
i = nr_range - 1;
- if (range[i].end + 1 < end_pfn)
- total_trim_size += real_trim_memory(range[i].end + 1,
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
end_pfn);
if (total_trim_size) {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 6b801e0..3d3f33f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1038,7 +1038,7 @@ static void __init subtract_early_res(struct range *range, int az)
printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n",
final_start, final_end);
#endif
- subtract_range(range, az, final_start, final_end - 1);
+ subtract_range(range, az, final_start, final_end);
}
}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index ef4b3e7..05cb9ae 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -145,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
def_link = (reg >> 8) & 0x03;
memset(range, 0, sizeof(range));
- range[0].end = 0xffff;
+ add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
/* io port resource */
for (i = 0; i < 4; i++) {
reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -175,7 +175,7 @@ static int __init early_fill_mp_bus_info(void)
if (end > 0xffff)
end = 0xffff;
update_res(info, start, end, IORESOURCE_IO, 1);
- subtract_range(range, RANGE_NUM, start, end);
+ subtract_range(range, RANGE_NUM, start, end + 1);
}
/* add left over io port range to def node/link, [0, 0xffff] */
/* find the position */
@@ -190,14 +190,16 @@ static int __init early_fill_mp_bus_info(void)
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, range[i].start, range[i].end - 1,
IORESOURCE_IO, 1);
}
}
memset(range, 0, sizeof(range));
/* 0xfd00000000-0xffffffffff for HT */
- range[0].end = cap_resource((0xfdULL<<32) - 1);
+ end = cap_resource((0xfdULL<<32) - 1);
+ end++;
+ add_range(range, RANGE_NUM, 0, 0, end);
/* need to take out [0, TOM) for RAM*/
address = MSR_K8_TOP_MEM1;
@@ -205,14 +207,15 @@ static int __init early_fill_mp_bus_info(void)
end = (val & 0xffffff800000ULL);
printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
if (end < (1ULL<<32))
- subtract_range(range, RANGE_NUM, 0, end - 1);
+ subtract_range(range, RANGE_NUM, 0, end);
/* get mmconfig */
get_pci_mmcfg_amd_fam10h_range();
/* need to take out mmconf range */
if (fam10h_mmconf_end) {
printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
- subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end);
+ subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
+ fam10h_mmconf_end + 1);
}
/* mmio resource */
@@ -267,7 +270,8 @@ static int __init early_fill_mp_bus_info(void)
/* we got a hole */
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
- subtract_range(range, RANGE_NUM, start, endx);
+ subtract_range(range, RANGE_NUM, start,
+ endx + 1);
printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
@@ -284,7 +288,7 @@ static int __init early_fill_mp_bus_info(void)
update_res(info, cap_resource(start), cap_resource(end),
IORESOURCE_MEM, 1);
- subtract_range(range, RANGE_NUM, start, end);
+ subtract_range(range, RANGE_NUM, start, end + 1);
printk(KERN_CONT "\n");
}
@@ -299,7 +303,7 @@ static int __init early_fill_mp_bus_info(void)
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
- subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1);
+ subtract_range(range, RANGE_NUM, 1ULL<<32, end);
}
/*
@@ -319,7 +323,7 @@ static int __init early_fill_mp_bus_info(void)
continue;
update_res(info, cap_resource(range[i].start),
- cap_resource(range[i].end),
+ cap_resource(range[i].end - 1),
IORESOURCE_MEM, 1);
}
}
diff --git a/kernel/range.c b/kernel/range.c
index 71e0021..74e2e61 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -13,7 +13,7 @@
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
{
- if (start > end)
+ if (start >= end)
return nr_range;
/* Out of slots: */
@@ -33,7 +33,7 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
{
int i;
- if (start > end)
+ if (start >= end)
return nr_range;
/* Try to merge it with old one: */
@@ -46,7 +46,7 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
common_start = max(range[i].start, start);
common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
+ if (common_start > common_end)
continue;
final_start = min(range[i].start, start);
@@ -65,7 +65,7 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
{
int i, j;
- if (start > end)
+ if (start >= end)
return;
for (j = 0; j < az; j++) {
@@ -79,15 +79,15 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
}
if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
+ range[j].start < end) {
+ range[j].start = end;
continue;
}
if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
+ range[j].end > start) {
+ range[j].end = start;
continue;
}
@@ -99,11 +99,11 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
}
if (i < az) {
range[i].end = range[j].end;
- range[i].start = end + 1;
+ range[i].start = end;
} else {
printk(KERN_ERR "run of slot in ranges\n");
}
- range[j].end = start - 1;
+ range[j].end = start;
continue;
}
}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9a9c8e4..e2a8fdc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -215,7 +215,7 @@ unsigned long __init free_all_memory_core_early(int nodeid)
for (i = 0; i < nr_range; i++) {
start = range[i].start;
- end = range[i].end + 1;
+ end = range[i].end;
count += end - start;
__free_pages_memory(start, end);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 92b6751..b086008 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3440,7 +3440,7 @@ int __init add_from_early_node_map(struct range *range, int az,
for_each_active_range_index_in_nid(i, nid) {
start = early_node_map[i].start_pfn;
end = early_node_map[i].end_pfn;
- nr_range = add_range(range, az, nr_range, start, end - 1);
+ nr_range = add_range(range, az, nr_range, start, end);
}
return nr_range;
}
--
1.6.0.2
so prepare to make one more clean early_res.c
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/e820.c | 22 ++--------------------
arch/x86/kernel/head32.c | 12 ++++++++++++
arch/x86/kernel/head64.c | 2 ++
3 files changed, 16 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3d3f33f..c138dbe 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -735,29 +735,11 @@ struct early_res {
char name[15];
char overlap_ok;
};
-static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
-#endif
-
- {}
-};
+static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
static int max_early_res __initdata = MAX_EARLY_RES_X;
static struct early_res *early_res __initdata = &early_res_x[0];
-static int early_res_count __initdata =
-#ifdef CONFIG_X86_32
- 2
-#else
- 1
-#endif
- ;
+static int early_res_count __initdata;
static int __init find_overlapped_early(u64 start, u64 end)
{
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94..2e13544 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,6 +29,18 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
+ reserve_early_overlap_ok(0, PAGE_SIZE, "BIOS data page");
+
+#ifdef CONFIG_X86_TRAMPOLINE
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
+ "EX TRAMPOLINE");
+#endif
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5a9896..452b7c4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,6 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
+ reserve_early_overlap_ok(0, PAGE_SIZE, "BIOS data page");
+
reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
#ifdef CONFIG_BLK_DEV_INITRD
--
1.6.0.2
to make e820.c smaller.
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/include/asm/e820.h | 13 +-
arch/x86/include/asm/early_res.h | 20 ++
arch/x86/kernel/Makefile | 2 +-
arch/x86/kernel/e820.c | 539 +-------------------------------------
arch/x86/kernel/early_res.c | 537 +++++++++++++++++++++++++++++++++++++
5 files changed, 560 insertions(+), 551 deletions(-)
create mode 100644 arch/x86/include/asm/early_res.h
create mode 100644 arch/x86/kernel/early_res.c
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 7d72e5f..efad699 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -109,19 +109,8 @@ static inline void early_memtest(unsigned long start, unsigned long end)
extern unsigned long end_user_pfn;
-extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
-extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
-extern void reserve_early(u64 start, u64 end, char *name);
-extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
-extern void free_early(u64 start, u64 end);
-extern void early_res_to_bootmem(u64 start, u64 end);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
-
-void reserve_early_without_check(u64 start, u64 end, char *name);
-u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
- u64 size, u64 align);
-#include <linux/range.h>
-int get_free_all_memory_range(struct range **rangep, int nodeid);
+#include <asm/early_res.h>
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h
new file mode 100644
index 0000000..2d43b16
--- /dev/null
+++ b/arch/x86/include/asm/early_res.h
@@ -0,0 +1,20 @@
+#ifndef _ASM_X86_EARLY_RES_H
+#define _ASM_X86_EARLY_RES_H
+#ifdef __KERNEL__
+
+extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
+extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
+extern void reserve_early(u64 start, u64 end, char *name);
+extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
+extern void free_early(u64 start, u64 end);
+extern void early_res_to_bootmem(u64 start, u64 end);
+
+void reserve_early_without_check(u64 start, u64 end, char *name);
+u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+ u64 size, u64 align);
+#include <linux/range.h>
+int get_free_all_memory_range(struct range **rangep, int nodeid);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_EARLY_RES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d87f09b..f5fb9f0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
-obj-y += bootflag.o e820.o
+obj-y += bootflag.o e820.o early_res.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index c138dbe..27a756e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,14 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/firmware-map.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
#include <asm/e820.h>
+#include <asm/early_res.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/trampoline.h>
/*
* The e820 map is the map that gets modified e.g. with command line parameters
@@ -722,536 +715,6 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
- * Early reserved memory areas.
- */
-/*
- * need to make sure this one is bigger enough before
- * find_e820_area could be used
- */
-#define MAX_EARLY_RES_X 32
-
-struct early_res {
- u64 start, end;
- char name[15];
- char overlap_ok;
-};
-static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
-
-static int max_early_res __initdata = MAX_EARLY_RES_X;
-static struct early_res *early_res __initdata = &early_res_x[0];
-static int early_res_count __initdata;
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
-
- for (i = 0; i < max_early_res && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- break;
- }
-
- return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
- int j;
-
- for (j = i + 1; j < max_early_res && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
- early_res_count--;
-}
-
-/*
- * Split any existing ranges that:
- * 1) are marked 'overlap_ok', and
- * 2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range. Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
- */
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
-{
- int i;
- struct early_res *r;
- u64 lower_start, lower_end;
- u64 upper_start, upper_end;
- char name[15];
-
- for (i = 0; i < max_early_res && early_res[i].end; i++) {
- r = &early_res[i];
-
- /* Continue past non-overlapping ranges */
- if (end <= r->start || start >= r->end)
- continue;
-
- /*
- * Leave non-ok overlaps as is; let caller
- * panic "Overlapping early reservations"
- * when it hits this overlap.
- */
- if (!r->overlap_ok)
- return;
-
- /*
- * We have an ok overlap. We will drop it from the early
- * reservation map, and add back in any non-overlapping
- * portions (lower or upper) as separate, overlap_ok,
- * non-overlapping ranges.
- */
-
- /* 1. Note any non-overlapping (lower or upper) ranges. */
- strncpy(name, r->name, sizeof(name) - 1);
-
- lower_start = lower_end = 0;
- upper_start = upper_end = 0;
- if (r->start < start) {
- lower_start = r->start;
- lower_end = start;
- }
- if (r->end > end) {
- upper_start = end;
- upper_end = r->end;
- }
-
- /* 2. Drop the original ok overlapping range */
- drop_range(i);
-
- i--; /* resume for-loop on copied down entry */
-
- /* 3. Add back in any non-overlapping ranges. */
- if (lower_end)
- reserve_early_overlap_ok(lower_start, lower_end, name);
- if (upper_end)
- reserve_early_overlap_ok(upper_start, upper_end, name);
- }
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
- int overlap_ok)
-{
- int i;
- struct early_res *r;
-
- i = find_overlapped_early(start, end);
- if (i >= max_early_res)
- panic("Too many early reservations");
- r = &early_res[i];
- if (r->end)
- panic("Overlapping early reservations "
- "%llx-%llx %s to %llx-%llx %s\n",
- start, end - 1, name?name:"", r->start,
- r->end - 1, r->name);
- r->start = start;
- r->end = end;
- r->overlap_ok = overlap_ok;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
- early_res_count++;
-}
-
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first. We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 1);
-}
-
-static void __init __check_and_double_early_res(u64 start)
-{
- u64 end, size, mem;
- struct early_res *new;
-
- /* do we have enough slots left ? */
- if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
- return;
-
- /* double it */
- end = max_pfn_mapped << PAGE_SHIFT;
- size = sizeof(struct early_res) * max_early_res * 2;
- mem = find_e820_area(start, end, size, sizeof(struct early_res));
-
- if (mem == -1ULL)
- panic("can not find more space for early_res array");
-
- new = __va(mem);
- /* save the first one for own */
- new[0].start = mem;
- new[0].end = mem + size;
- new[0].overlap_ok = 0;
- /* copy old to new */
- if (early_res == early_res_x) {
- memcpy(&new[1], &early_res[0],
- sizeof(struct early_res) * max_early_res);
- memset(&new[max_early_res+1], 0,
- sizeof(struct early_res) * (max_early_res - 1));
- early_res_count++;
- } else {
- memcpy(&new[1], &early_res[1],
- sizeof(struct early_res) * (max_early_res - 1));
- memset(&new[max_early_res], 0,
- sizeof(struct early_res) * max_early_res);
- }
- memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
- early_res = new;
- max_early_res *= 2;
- printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
- max_early_res, mem, mem + size - 1);
-}
-
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
- if (start >= end)
- return;
-
- __check_and_double_early_res(end);
-
- drop_overlaps_that_are_ok(start, end);
- __reserve_early(start, end, name, 0);
-}
-
-void __init reserve_early_without_check(u64 start, u64 end, char *name)
-{
- struct early_res *r;
-
- if (start >= end)
- return;
-
- __check_and_double_early_res(end);
-
- r = &early_res[early_res_count];
-
- r->start = start;
- r->end = end;
- r->overlap_ok = 0;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
- early_res_count++;
-}
-
-void __init free_early(u64 start, u64 end)
-{
- struct early_res *r;
- int i;
-
- i = find_overlapped_early(start, end);
- r = &early_res[i];
- if (i >= max_early_res || r->end != end || r->start != start)
- panic("free_early on not reserved area: %llx-%llx!",
- start, end - 1);
-
- drop_range(i);
-}
-
-#ifdef CONFIG_NO_BOOTMEM
-static void __init subtract_early_res(struct range *range, int az)
-{
- int i, count;
- u64 final_start, final_end;
- int idx = 0;
-
- count = 0;
- for (i = 0; i < max_early_res && early_res[i].end; i++)
- count++;
-
- /* need to skip first one ?*/
- if (early_res != early_res_x)
- idx = 1;
-
-#if 1
- printk(KERN_INFO "Subtract (%d early reservations)\n", count);
-#endif
- for (i = idx; i < count; i++) {
- struct early_res *r = &early_res[i];
-#if 0
- printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i,
- r->start, r->end, r->name);
-#endif
- final_start = PFN_DOWN(r->start);
- final_end = PFN_UP(r->end);
- if (final_start >= final_end) {
-#if 0
- printk(KERN_CONT "\n");
-#endif
- continue;
- }
-#if 0
- printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n",
- final_start, final_end);
-#endif
- subtract_range(range, az, final_start, final_end);
- }
-
-}
-
-int __init get_free_all_memory_range(struct range **rangep, int nodeid)
-{
- int i, count;
- u64 start = 0, end;
- u64 size;
- u64 mem;
- struct range *range;
- int nr_range;
-
- count = 0;
- for (i = 0; i < max_early_res && early_res[i].end; i++)
- count++;
-
- count *= 2;
-
- size = sizeof(struct range) * count;
- if (max_pfn_mapped > MAX_DMA32_PFN)
- start = MAX_DMA32_PFN << PAGE_SHIFT;
- end = max_pfn_mapped << PAGE_SHIFT;
- mem = find_e820_area(start, end, size, sizeof(struct range));
- if (mem == -1ULL)
- panic("can not find more space for range free");
-
- range = __va(mem);
- /* use early_node_map[] and early_res to get range array at first */
- memset(range, 0, size);
- nr_range = 0;
-
- /* need to go over early_node_map to find out good range for node */
- nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
- subtract_early_res(range, count);
- nr_range = clean_sort_range(range, count);
-
- /* need to clear it ? */
- if (nodeid == MAX_NUMNODES) {
- memset(&early_res[0], 0,
- sizeof(struct early_res) * max_early_res);
- early_res = NULL;
- max_early_res = 0;
- }
-
- *rangep = range;
- return nr_range;
-}
-#else
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
- int i, count;
- u64 final_start, final_end;
- int idx = 0;
-
- count = 0;
- for (i = 0; i < max_early_res && early_res[i].end; i++)
- count++;
-
- /* need to skip first one ?*/
- if (early_res != early_res_x)
- idx = 1;
-
- printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count - idx, max_early_res, start, end);
- for (i = idx; i < count; i++) {
- struct early_res *r = &early_res[i];
- printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
- r->start, r->end, r->name);
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end) {
- printk(KERN_CONT "\n");
- continue;
- }
- printk(KERN_CONT " ==> [%010llx - %010llx]\n",
- final_start, final_end);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
- }
- /* clear them */
- memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
- early_res = NULL;
- max_early_res = 0;
- early_res_count = 0;
-}
-#endif
-
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
- int i;
- u64 addr = *addrp;
- int changed = 0;
- struct early_res *r;
-again:
- i = find_overlapped_early(addr, addr + size);
- r = &early_res[i];
- if (i < max_early_res && r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
-{
- int i;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < max_early_res && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- * only with the area.between start to end is active range from early_node_map
- * so they are good as RAM
- */
-u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
- u64 size, u64 align)
-{
- u64 addr, last;
-
- addr = round_up(ei_start, align);
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- goto out;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- goto out;
- if (last > end)
- goto out;
-
- return addr;
-
-out:
- return -1ULL;
-}
-
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area(ei_start, ei_last, start, end,
- size, align);
-
- if (addr == -1ULL)
- continue;
-
- return addr;
- }
- return -1ULL;
-}
-
-/*
- * Find next free range after *start
- */
-u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
- }
-
- return -1ULL;
-}
-
-/*
* pre allocated 4k and reserved it in e820
*/
u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c
new file mode 100644
index 0000000..d50257b
--- /dev/null
+++ b/arch/x86/kernel/early_res.c
@@ -0,0 +1,537 @@
+/*
+ * early_res, could be used to replace bootmem
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+
+#include <asm/e820.h>
+#include <asm/early_res.h>
+#include <asm/proto.h>
+
+/*
+ * Early reserved memory areas.
+ */
+/*
+ * need to make sure this one is bigger enough before
+ * find_e820_area could be used
+ */
+#define MAX_EARLY_RES_X 32
+
+struct early_res {
+ u64 start, end;
+ char name[15];
+ char overlap_ok;
+};
+static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
+
+static int max_early_res __initdata = MAX_EARLY_RES_X;
+static struct early_res *early_res __initdata = &early_res_x[0];
+static int early_res_count __initdata;
+
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (end > r->start && start < r->end)
+ break;
+ }
+
+ return i;
+}
+
+/*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+ * clearing what had been the last slot.
+ */
+static void __init drop_range(int i)
+{
+ int j;
+
+ for (j = i + 1; j < max_early_res && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+ early_res_count--;
+}
+
+/*
+ * Split any existing ranges that:
+ * 1) are marked 'overlap_ok', and
+ * 2) overlap with the stated range [start, end)
+ * into whatever portion (if any) of the existing range is entirely
+ * below or entirely above the stated range. Drop the portion
+ * of the existing range that overlaps with the stated range,
+ * which will allow the caller of this routine to then add that
+ * stated range without conflicting with any existing range.
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+ u64 lower_start, lower_end;
+ u64 upper_start, upper_end;
+ char name[15];
+
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
+ r = &early_res[i];
+
+ /* Continue past non-overlapping ranges */
+ if (end <= r->start || start >= r->end)
+ continue;
+
+ /*
+ * Leave non-ok overlaps as is; let caller
+ * panic "Overlapping early reservations"
+ * when it hits this overlap.
+ */
+ if (!r->overlap_ok)
+ return;
+
+ /*
+ * We have an ok overlap. We will drop it from the early
+ * reservation map, and add back in any non-overlapping
+ * portions (lower or upper) as separate, overlap_ok,
+ * non-overlapping ranges.
+ */
+
+ /* 1. Note any non-overlapping (lower or upper) ranges. */
+ strncpy(name, r->name, sizeof(name) - 1);
+
+ lower_start = lower_end = 0;
+ upper_start = upper_end = 0;
+ if (r->start < start) {
+ lower_start = r->start;
+ lower_end = start;
+ }
+ if (r->end > end) {
+ upper_start = end;
+ upper_end = r->end;
+ }
+
+ /* 2. Drop the original ok overlapping range */
+ drop_range(i);
+
+ i--; /* resume for-loop on copied down entry */
+
+ /* 3. Add back in any non-overlapping ranges. */
+ if (lower_end)
+ reserve_early_overlap_ok(lower_start, lower_end, name);
+ if (upper_end)
+ reserve_early_overlap_ok(upper_start, upper_end, name);
+ }
+}
+
+static void __init __reserve_early(u64 start, u64 end, char *name,
+ int overlap_ok)
+{
+ int i;
+ struct early_res *r;
+
+ i = find_overlapped_early(start, end);
+ if (i >= max_early_res)
+ panic("Too many early reservations");
+ r = &early_res[i];
+ if (r->end)
+ panic("Overlapping early reservations "
+ "%llx-%llx %s to %llx-%llx %s\n",
+ start, end - 1, name ? name : "", r->start,
+ r->end - 1, r->name);
+ r->start = start;
+ r->end = end;
+ r->overlap_ok = overlap_ok;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+ early_res_count++;
+}
+
+/*
+ * A few early reservtations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first. We don't have any such as of this
+ * writing, but might as well tolerate such if it happens in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 1);
+}
+
+static void __init __check_and_double_early_res(u64 start)
+{
+ u64 end, size, mem;
+ struct early_res *new;
+
+ /* do we have enough slots left ? */
+ if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
+ return;
+
+ /* double it */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ size = sizeof(struct early_res) * max_early_res * 2;
+ mem = find_e820_area(start, end, size, sizeof(struct early_res));
+
+ if (mem == -1ULL)
+ panic("can not find more space for early_res array");
+
+ new = __va(mem);
+ /* save the first one for own */
+ new[0].start = mem;
+ new[0].end = mem + size;
+ new[0].overlap_ok = 0;
+ /* copy old to new */
+ if (early_res == early_res_x) {
+ memcpy(&new[1], &early_res[0],
+ sizeof(struct early_res) * max_early_res);
+ memset(&new[max_early_res+1], 0,
+ sizeof(struct early_res) * (max_early_res - 1));
+ early_res_count++;
+ } else {
+ memcpy(&new[1], &early_res[1],
+ sizeof(struct early_res) * (max_early_res - 1));
+ memset(&new[max_early_res], 0,
+ sizeof(struct early_res) * max_early_res);
+ }
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = new;
+ max_early_res *= 2;
+ printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
+ max_early_res, mem, mem + size - 1);
+}
+
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panic'ing on an overlapping overlap_ok
+ * early reservation.
+ */
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+ if (start >= end)
+ return;
+
+ __check_and_double_early_res(end);
+
+ drop_overlaps_that_are_ok(start, end);
+ __reserve_early(start, end, name, 0);
+}
+
+void __init reserve_early_without_check(u64 start, u64 end, char *name)
+{
+ struct early_res *r;
+
+ if (start >= end)
+ return;
+
+ __check_and_double_early_res(end);
+
+ r = &early_res[early_res_count];
+
+ r->start = start;
+ r->end = end;
+ r->overlap_ok = 0;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+ early_res_count++;
+}
+
+void __init free_early(u64 start, u64 end)
+{
+ struct early_res *r;
+ int i;
+
+ i = find_overlapped_early(start, end);
+ r = &early_res[i];
+ if (i >= max_early_res || r->end != end || r->start != start)
+ panic("free_early on not reserved area: %llx-%llx!",
+ start, end - 1);
+
+ drop_range(i);
+}
+
+#ifdef CONFIG_NO_BOOTMEM
+static void __init subtract_early_res(struct range *range, int az)
+{
+ int i, count;
+ u64 final_start, final_end;
+ int idx = 0;
+
+ count = 0;
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
+ count++;
+
+ /* need to skip first one ?*/
+ if (early_res != early_res_x)
+ idx = 1;
+
+#define DEBUG_PRINT_EARLY_RES 1
+
+#if DEBUG_PRINT_EARLY_RES
+ printk(KERN_INFO "Subtract (%d early reservations)\n", count);
+#endif
+ for (i = idx; i < count; i++) {
+ struct early_res *r = &early_res[i];
+#if DEBUG_PRINT_EARLY_RES
+ printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
+ r->start, r->end, r->name);
+#endif
+ final_start = PFN_DOWN(r->start);
+ final_end = PFN_UP(r->end);
+ if (final_start >= final_end)
+ continue;
+ subtract_range(range, az, final_start, final_end);
+ }
+
+}
+
+int __init get_free_all_memory_range(struct range **rangep, int nodeid)
+{
+ int i, count;
+ u64 start = 0, end;
+ u64 size;
+ u64 mem;
+ struct range *range;
+ int nr_range;
+
+ count = 0;
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
+ count++;
+
+ count *= 2;
+
+ size = sizeof(struct range) * count;
+ if (max_pfn_mapped > MAX_DMA32_PFN)
+ start = MAX_DMA32_PFN << PAGE_SHIFT;
+ end = max_pfn_mapped << PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, sizeof(struct range));
+ if (mem == -1ULL)
+ panic("can not find more space for range free");
+
+ range = __va(mem);
+ /* use early_node_map[] and early_res to get range array at first */
+ memset(range, 0, size);
+ nr_range = 0;
+
+ /* need to go over early_node_map to find out good range for node */
+ nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
+ subtract_early_res(range, count);
+ nr_range = clean_sort_range(range, count);
+
+ /* need to clear it ? */
+ if (nodeid == MAX_NUMNODES) {
+ memset(&early_res[0], 0,
+ sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ }
+
+ *rangep = range;
+ return nr_range;
+}
+#else
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+ int i, count;
+ u64 final_start, final_end;
+ int idx = 0;
+
+ count = 0;
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
+ count++;
+
+ /* need to skip first one ?*/
+ if (early_res != early_res_x)
+ idx = 1;
+
+ printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+ count - idx, max_early_res, start, end);
+ for (i = idx; i < count; i++) {
+ struct early_res *r = &early_res[i];
+ printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
+ r->start, r->end, r->name);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end) {
+ printk(KERN_CONT "\n");
+ continue;
+ }
+ printk(KERN_CONT " ==> [%010llx - %010llx]\n",
+ final_start, final_end);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
+ }
+ /* clear them */
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ early_res_count = 0;
+}
+#endif
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+ int i;
+ u64 addr = *addrp;
+ int changed = 0;
+ struct early_res *r;
+again:
+ i = find_overlapped_early(addr, addr + size);
+ r = &early_res[i];
+ if (i < max_early_res && r->end) {
+ *addrp = addr = round_up(r->end, align);
+ changed = 1;
+ goto again;
+ }
+ return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+ int i;
+ u64 addr = *addrp, last;
+ u64 size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ * only with the area.between start to end is active range from early_node_map
+ * so they are good as RAM
+ */
+u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+ u64 size, u64 align)
+{
+ u64 addr, last;
+
+ addr = round_up(ei_start, align);
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ goto out;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ goto out;
+ if (last > end)
+ goto out;
+
+ return addr;
+
+out:
+ return -1ULL;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr;
+ u64 ei_start, ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+
+ ei_last = ei->addr + ei->size;
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
+
+ if (addr == -1ULL)
+ continue;
+
+ return addr;
+ }
+ return -1ULL;
+}
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+
+ return -1ULL;
+}
+
--
1.6.0.2
prepare to move bck find_e820_area_size back to e820.c
Signed-off-by: Yinghai Lu <[email protected]>
---
arch/x86/kernel/early_res.c | 45 ++++++++++++++++++++++++++++++------------
1 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c
index d50257b..51badaf 100644
--- a/arch/x86/kernel/early_res.c
+++ b/arch/x86/kernel/early_res.c
@@ -474,6 +474,29 @@ out:
return -1ULL;
}
+u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
+ u64 *sizep, u64 align)
+{
+ u64 addr, last;
+
+ addr = round_up(ei_start, align);
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ goto out;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ goto out;
+
+ return addr;
+
+out:
+ return -1ULL;
+}
+
/*
* Find a free area with specified alignment in a specific range.
*/
@@ -511,24 +534,20 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ u64 addr;
+ u64 ei_start, ei_last;
if (ei->type != E820_RAM)
continue;
- addr = round_up(ei->addr, align);
+
ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
+ ei_start = ei->addr;
+ addr = find_early_area_size(ei_start, ei_last, start,
+ sizep, align);
+
+ if (addr == -1ULL)
continue;
+
return addr;
}
--
1.6.0.2
make early_res.c more clean, so later could move it to /kernel
Signed-off: Yinghai Lu <[email protected]>
---
arch/x86/include/asm/e820.h | 2 +
arch/x86/include/asm/early_res.h | 4 +-
arch/x86/kernel/e820.c | 57 ++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/early_res.c | 56 -------------------------------------
4 files changed, 61 insertions(+), 58 deletions(-)
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index efad699..a8299e1 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -109,6 +109,8 @@ static inline void early_memtest(unsigned long start, unsigned long end)
extern unsigned long end_user_pfn;
+extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
+extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
#include <asm/early_res.h>
diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h
index 2d43b16..5a4d2eb 100644
--- a/arch/x86/include/asm/early_res.h
+++ b/arch/x86/include/asm/early_res.h
@@ -2,8 +2,6 @@
#define _ASM_X86_EARLY_RES_H
#ifdef __KERNEL__
-extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
-extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
extern void reserve_early(u64 start, u64 end, char *name);
extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
extern void free_early(u64 start, u64 end);
@@ -12,6 +10,8 @@ extern void early_res_to_bootmem(u64 start, u64 end);
void reserve_early_without_check(u64 start, u64 end, char *name);
u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
u64 size, u64 align);
+u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
+ u64 *sizep, u64 align);
#include <linux/range.h>
int get_free_all_memory_range(struct range **rangep, int nodeid);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 27a756e..acd7be6 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -715,6 +715,63 @@ core_initcall(e820_mark_nvs_memory);
#endif
/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr;
+ u64 ei_start, ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+
+ ei_last = ei->addr + ei->size;
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
+
+ if (addr == -1ULL)
+ continue;
+
+ return addr;
+ }
+ return -1ULL;
+}
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr;
+ u64 ei_start, ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+
+ ei_last = ei->addr + ei->size;
+ ei_start = ei->addr;
+ addr = find_early_area_size(ei_start, ei_last, start,
+ sizep, align);
+
+ if (addr == -1ULL)
+ continue;
+
+ return addr;
+ }
+
+ return -1ULL;
+}
+
+/*
* pre allocated 4k and reserved it in e820
*/
u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c
index 51badaf..8aa0362 100644
--- a/arch/x86/kernel/early_res.c
+++ b/arch/x86/kernel/early_res.c
@@ -497,60 +497,4 @@ out:
return -1ULL;
}
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area(ei_start, ei_last, start, end,
- size, align);
-
- if (addr == -1ULL)
- continue;
-
- return addr;
- }
- return -1ULL;
-}
-
-/*
- * Find next free range after *start
- */
-u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area_size(ei_start, ei_last, start,
- sizep, align);
-
- if (addr == -1ULL)
- continue;
-
- return addr;
- }
-
- return -1ULL;
-}
--
1.6.0.2
On Tue, 22 Dec 2009, Yinghai Lu wrote:
> From: "Yinghai Lu <[email protected]>"
>
> please check the patches regarding with early_res and bootmem
>
> and at last it will make use early_res instead of bootmem with x86 64bits
>
> the first two are needed for some amd_bus.c/intel_bus.c cleaning up patches too.
> so put other x86/pci related into this series
> hope that is ok to Jesse.
>
> -v2: allocate vmemmap on one node together, and also seperate early_res
The point of this patchset is what?
Avoid use of bootmem altogether by x86 bootstrap? If so why?
On 01/04/2010 09:48 AM, Christoph Lameter wrote:
> On Tue, 22 Dec 2009, Yinghai Lu wrote:
>
>> From: "Yinghai Lu <[email protected]>"
>>
>> please check the patches regarding with early_res and bootmem
>>
>> and at last it will make use early_res instead of bootmem with x86 64bits
>>
>> the first two are needed for some amd_bus.c/intel_bus.c cleaning up patches too.
>> so put other x86/pci related into this series
>> hope that is ok to Jesse.
>>
>> -v2: allocate vmemmap on one node together, and also seperate early_res
>
> The point of this patchset is what?
>
> Avoid use of bootmem altogether by x86 bootstrap? If so why?
>
http://lkml.indiana.edu/hypermail/linux/kernel/0910.3/01432.html
YH
On Mon, 4 Jan 2010, Yinghai Lu wrote:
> http://lkml.indiana.edu/hypermail/linux/kernel/0910.3/01432.html
Ahh. Maybe use that material in your introduction somewhere?
On 01/04/2010 01:19 PM, Christoph Lameter wrote:
> On Mon, 4 Jan 2010, Yinghai Lu wrote:
>
>> http://lkml.indiana.edu/hypermail/linux/kernel/0910.3/01432.html
>
> Ahh. Maybe use that material in your introduction somewhere?
sure. next version.
YH
Commit-ID: 9dad0fd5a73d4048dff18069733c0b515f68df74
Gitweb: http://git.kernel.org/tip/9dad0fd5a73d4048dff18069733c0b515f68df74
Author: Yinghai Lu <[email protected]>
AuthorDate: Tue, 22 Dec 2009 15:40:39 -0800
Committer: H. Peter Anvin <[email protected]>
CommitDate: Mon, 4 Jan 2010 13:20:11 -0800
x86: Fix size for ex trampoline with 32bit
fix for error that is introduced by
| x86: Use find_e820() instead of hard coded trampoline address
it should end with PAGE_SIZE + PAGE_SIZE
Signed-off-by: Yinghai Lu <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/kernel/e820.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 05ed7ab..a1a7876 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -733,13 +733,13 @@ struct early_res {
};
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
/*
* But first pinch a few for the stack/trampoline stuff
* FIXME: Don't need the extra page at 4K, but need to fix
* trampoline before removing it. (see the GDT stuff)
*/
- { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
#endif
{}
Commit-ID: a557aae29cf5916295c234d4b10ba3f8f29b8a96
Gitweb: http://git.kernel.org/tip/a557aae29cf5916295c234d4b10ba3f8f29b8a96
Author: Yinghai Lu <[email protected]>
AuthorDate: Tue, 22 Dec 2009 15:40:40 -0800
Committer: H. Peter Anvin <[email protected]>
CommitDate: Mon, 4 Jan 2010 13:21:51 -0800
x86/pci: Intel ioh bus num reg accessing fix
It is above 0x100 (PCI-Express extended register space), so if mmconf
is not enable, we can't access it.
[ hpa: changed the bound from 0x200 to 0x120, which is the tight
bound. ]
Reported-by: Jens Axboe <[email protected]>
Signed-off-by: Yinghai Lu <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/pci/intel_bus.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/pci/intel_bus.c b/arch/x86/pci/intel_bus.c
index b7a55dc..f81a2fa 100644
--- a/arch/x86/pci/intel_bus.c
+++ b/arch/x86/pci/intel_bus.c
@@ -49,6 +49,10 @@ static void __devinit pci_root_bus_res(struct pci_dev *dev)
u64 mmioh_base, mmioh_end;
int bus_base, bus_end;
+ /* some sys doesn't get mmconf enabled */
+ if (dev->cfg_size < 0x120)
+ return;
+
if (pci_root_num >= PCI_ROOT_NR) {
printk(KERN_DEBUG "intel_bus.c: PCI_ROOT_NR is too small\n");
return;