2005-09-12 17:53:24

by Dave Hansen

Subject: [RFC][PATCH 1/2] i386: consolidate discontig functions into normal ones


There are quite a few functions in i386's discontig.c which are
actually NUMA-specific, not discontigmem-specific. They are also
very similar to the generic, flat-memory functions found in setup.c.

This patch takes the versions in setup.c and makes them work
for both the NUMA and non-NUMA cases. In the process, quite a
few nasty #ifdefs and externs can be removed.

One of the main mechanisms to do this is that highstart_pfn
and highend_pfn are now gone, replaced by node_start/end_pfn[].
However, this has no real impact on storage space, because
those arrays are declared with a length of MAX_NUMNODES, which
is 1 when NUMA is off.
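
As an illustration (not part of the patch itself) of why the storage
argument holds: with CONFIG_NUMA off the arrays degenerate to a single
element, and the old scalars fall out as derived values:

    unsigned long node_start_pfn[MAX_NUMNODES]; /* MAX_NUMNODES == 1 when !NUMA */
    unsigned long node_end_pfn[MAX_NUMNODES];

    /* the old globals become:
     *   highstart_pfn -> max_low_pfn  (highmem begins where lowmem ends)
     *   highend_pfn   -> max_pfn      (i.e. node_end_pfn[0] on flat memory)
     */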



---

arch/i386/kernel/signal.c | 0
memhotplug-dave/arch/i386/kernel/setup.c | 167 ++++++++++++++++++----
memhotplug-dave/arch/i386/mm/discontig.c | 232 +++----------------------------
memhotplug-dave/arch/i386/mm/init.c | 43 +++--
4 files changed, 186 insertions(+), 256 deletions(-)

diff -puN arch/i386/mm/init.c~i386-discontig-consolidate0 arch/i386/mm/init.c
--- memhotplug/arch/i386/mm/init.c~i386-discontig-consolidate0 2005-09-12 10:52:26.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/init.c 2005-09-12 10:52:26.000000000 -0700
@@ -44,7 +44,6 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-unsigned long highstart_pfn, highend_pfn;

static int noinline do_test_wp_bit(void);

@@ -310,18 +309,33 @@ void online_page(struct page *page)
add_one_highpage_hotplug(page, page_to_pfn(page));
}

-
-#ifdef CONFIG_NUMA
-extern void set_highmem_pages_init(int);
-#else
-static void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(int bad_ppro)
{
- int pfn;
- for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
- add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ struct zone *zone;
+ struct page *page;
+
+ for_each_zone(zone) {
+ unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ zone_start_pfn = zone->zone_start_pfn;
+ zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+ printk("Initializing %s for node %d (%08lx:%08lx)\n",
+ zone->name, zone->zone_pgdat->node_id,
+ zone_start_pfn, zone_end_pfn);
+
+ for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
+ if (!pfn_valid(node_pfn))
+ continue;
+ page = pfn_to_page(node_pfn);
+ add_one_highpage_init(page, node_pfn, bad_ppro);
+ }
+ }
totalram_pages += totalhigh_pages;
}
-#endif /* CONFIG_FLATMEM */

#else
#define kmap_init() do { } while (0)
@@ -556,11 +570,6 @@ static void __init test_wp_bit(void)

static void __init set_max_mapnr_init(void)
{
-#ifdef CONFIG_HIGHMEM
- num_physpages = highend_pfn;
-#else
- num_physpages = max_low_pfn;
-#endif
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
@@ -594,11 +603,7 @@ void __init mem_init(void)

set_max_mapnr_init();

-#ifdef CONFIG_HIGHMEM
- high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
-#else
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
-#endif

/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
diff -puN arch/i386/mm/discontig.c~i386-discontig-consolidate0 arch/i386/mm/discontig.c
--- memhotplug/arch/i386/mm/discontig.c~i386-discontig-consolidate0 2005-09-12 10:52:26.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/discontig.c 2005-09-12 10:52:26.000000000 -0700
@@ -39,19 +39,6 @@

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
-bootmem_data_t node0_bdata;
-
-/*
- * numa interface - we expect the numa architecture specfic code to have
- * populated the following initialisation.
- *
- * 1) node_online_map - the map of all nodes configured (online) in the system
- * 2) node_start_pfn - the starting page frame number for a node
- * 3) node_end_pfn - the ending page fram number for a node
- */
-unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
-unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
-

#ifdef CONFIG_DISCONTIGMEM
/*
@@ -103,7 +90,6 @@ extern void add_one_highpage_init(struct

extern struct e820map e820;
extern unsigned long init_pg_tables_end;
-extern unsigned long highend_pfn, highstart_pfn;
extern unsigned long max_low_pfn;
extern unsigned long totalram_pages;
extern unsigned long totalhigh_pages;
@@ -119,44 +105,6 @@ void set_pmd_pfn(unsigned long vaddr, un
void *node_remap_end_vaddr[MAX_NUMNODES];
void *node_remap_alloc_vaddr[MAX_NUMNODES];

-/*
- * FLAT - support for basic PC memory model with discontig enabled, essentially
- * a single node with all available processors in it with a flat
- * memory map.
- */
-int __init get_memcfg_numa_flat(void)
-{
- printk("NUMA - single node, flat memory mode\n");
-
- /* Run the memory configuration and find the top of memory. */
- find_max_pfn();
- node_start_pfn[0] = 0;
- node_end_pfn[0] = max_pfn;
- memory_present(0, 0, max_pfn);
-
- /* Indicate there is one node available. */
- nodes_clear(node_online_map);
- node_set_online(0);
- return 1;
-}
-
-/*
- * Find the highest page frame number we have available for the node
- */
-static void __init find_max_pfn_node(int nid)
-{
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
- /*
- * if a user has given mem=XXXX, then we need to make sure
- * that the node _starts_ before that, too, not just ends
- */
- if (node_start_pfn[nid] > max_pfn)
- node_start_pfn[nid] = max_pfn;
- if (node_start_pfn[nid] > node_end_pfn[nid])
- BUG();
-}
-
/* Find the owning node for a pfn. */
int early_pfn_to_nid(unsigned long pfn)
{
@@ -179,6 +127,7 @@ int early_pfn_to_nid(unsigned long pfn)
* node local data in physically node local memory. See setup_memory()
* for details.
*/
+static bootmem_data_t node0_bdata;
static void __init allocate_pgdat(int nid)
{
if (nid && node_has_online_mem(nid))
@@ -186,6 +135,30 @@ static void __init allocate_pgdat(int ni
else {
NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
min_low_pfn += PFN_UP(sizeof(pg_data_t));
+ memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
+ NODE_DATA(0)->bdata = &node0_bdata;
+ }
+}
+
+void setup_numa_kva_remap(void)
+{
+ int nid;
+ for_each_online_node(nid) {
+ if (NODE_DATA(nid))
+ continue;
+ node_remap_start_vaddr[nid] = pfn_to_kaddr(
+ max_low_pfn + node_remap_offset[nid]);
+ /* Init the node remap allocator */
+ node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+ (node_remap_size[nid] * PAGE_SIZE);
+ node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+ ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+ allocate_pgdat(nid);
+ printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+ (ulong) node_remap_start_vaddr[nid],
+ (ulong) pfn_to_kaddr(max_low_pfn
+ + node_remap_offset[nid] + node_remap_size[nid]));
}
}

@@ -220,7 +193,7 @@ void __init remap_numa_kva(void)
}
}

-static unsigned long calculate_numa_remap_pages(void)
+unsigned long calculate_numa_remap_pages(void)
{
int nid;
unsigned long size, reserve_pages = 0;
@@ -281,156 +254,3 @@ static unsigned long calculate_numa_rema
reserve_pages);
return reserve_pages;
}
-
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
-{
- int nid;
- unsigned long system_start_pfn, system_max_low_pfn;
- unsigned long reserve_pages;
-
- /*
- * When mapping a NUMA machine we allocate the node_mem_map arrays
- * from node local memory. They are then mapped directly into KVA
- * between zone normal and vmalloc space. Calculate the size of
- * this space and use it to adjust the boundry between ZONE_NORMAL
- * and ZONE_HIGHMEM.
- */
- find_max_pfn();
- get_memcfg_numa();
-
- reserve_pages = calculate_numa_remap_pages();
-
- /* partially used pages are not usable - thus round upwards */
- system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
-
- system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
- printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
- reserve_pages, max_low_pfn + reserve_pages);
- printk("max_pfn = %ld\n", max_pfn);
-#ifdef CONFIG_HIGHMEM
- highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > system_max_low_pfn)
- highstart_pfn = system_max_low_pfn;
- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
-#endif
- printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(system_max_low_pfn));
- printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
- min_low_pfn, max_low_pfn, highstart_pfn);
-
- printk("Low memory ends at vaddr %08lx\n",
- (ulong) pfn_to_kaddr(max_low_pfn));
- for_each_online_node(nid) {
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- highstart_pfn + node_remap_offset[nid]);
- /* Init the node remap allocator */
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
- allocate_pgdat(nid);
- printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) pfn_to_kaddr(highstart_pfn
- + node_remap_offset[nid] + node_remap_size[nid]));
- }
- printk("High memory starts at vaddr %08lx\n",
- (ulong) pfn_to_kaddr(highstart_pfn));
- vmalloc_earlyreserve = reserve_pages * PAGE_SIZE;
- for_each_online_node(nid)
- find_max_pfn_node(nid);
-
- memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
- NODE_DATA(0)->bdata = &node0_bdata;
- setup_bootmem_allocator();
- return max_low_pfn;
-}
-
-void __init zone_sizes_init(void)
-{
- int nid;
-
- /*
- * Insert nodes into pgdat_list backward so they appear in order.
- * Clobber node 0's links and NULL out pgdat_list before starting.
- */
- pgdat_list = NULL;
- for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) {
- if (!node_online(nid))
- continue;
- NODE_DATA(nid)->pgdat_next = pgdat_list;
- pgdat_list = NODE_DATA(nid);
- }
-
- for_each_online_node(nid) {
- unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
- unsigned long *zholes_size;
- unsigned int max_dma;
-
- unsigned long low = max_low_pfn;
- unsigned long start = node_start_pfn[nid];
- unsigned long high = node_end_pfn[nid];
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
- if (node_has_online_mem(nid)){
- if (start > low) {
-#ifdef CONFIG_HIGHMEM
- BUG_ON(start > high);
- zones_size[ZONE_HIGHMEM] = high - start;
-#endif
- } else {
- if (low < max_dma)
- zones_size[ZONE_DMA] = low;
- else {
- BUG_ON(max_dma > low);
- BUG_ON(low > high);
- zones_size[ZONE_DMA] = max_dma;
- zones_size[ZONE_NORMAL] = low - max_dma;
-#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = high - low;
-#endif
- }
- }
- }
-
- zholes_size = get_zholes_size(nid);
-
- free_area_init_node(nid, NODE_DATA(nid), zones_size, start,
- zholes_size);
- }
- return;
-}
-
-void __init set_highmem_pages_init(int bad_ppro)
-{
-#ifdef CONFIG_HIGHMEM
- struct zone *zone;
- struct page *page;
-
- for_each_zone(zone) {
- unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
-
- if (!is_highmem(zone))
- continue;
-
- zone_start_pfn = zone->zone_start_pfn;
- zone_end_pfn = zone_start_pfn + zone->spanned_pages;
-
- printk("Initializing %s for node %d (%08lx:%08lx)\n",
- zone->name, zone->zone_pgdat->node_id,
- zone_start_pfn, zone_end_pfn);
-
- for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page, node_pfn, bad_ppro);
- }
- }
- totalram_pages += totalhigh_pages;
-#endif
-}
diff -puN mm/page_alloc.c~i386-discontig-consolidate0 mm/page_alloc.c
diff -puN mm/bootmem.c~i386-discontig-consolidate0 mm/bootmem.c
diff -puN arch/i386/kernel/setup.c~i386-discontig-consolidate0 arch/i386/kernel/setup.c
--- memhotplug/arch/i386/kernel/setup.c~i386-discontig-consolidate0 2005-09-12 10:52:26.000000000 -0700
+++ memhotplug-dave/arch/i386/kernel/setup.c 2005-09-12 10:52:26.000000000 -0700
@@ -51,6 +51,7 @@
#include <asm/apic.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
+#include <asm/mmzone.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/sections.h>
@@ -365,6 +366,37 @@ static void __init probe_roms(void)
}
}

+/*
+ * numa interface - we expect the numa architecture specific code to have
+ * populated the following initialisation.
+ *
+ * 1) node_online_map - the map of all nodes configured (online) in the system
+ * 2) node_start_pfn - the starting page frame number for a node
+ * 3) node_end_pfn - the ending page frame number for a node
+ */
+unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
+unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
+bootmem_data_t node0_bdata;
+
+/*
+ * FLAT - support for basic PC memory model with discontig enabled, essentially
+ * a single node with all available processors in it with a flat
+ * memory map.
+ */
+int __init get_memcfg_numa_flat(void)
+{
+ printk("NUMA - single node, flat memory mode\n");
+
+ /* Run the memory configuration and find the top of memory. */
+ node_start_pfn[0] = 0;
+ node_end_pfn[0] = max_pfn;
+
+ /* Indicate there is one node available. */
+ nodes_clear(node_online_map);
+ node_set_online(0);
+ return 1;
+}
+
static void __init limit_regions(unsigned long long size)
{
unsigned long long current_addr = 0;
@@ -1111,59 +1143,132 @@ static void __init reserve_ebda_region(v
reserve_bootmem(addr, PAGE_SIZE);
}

-#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init setup_bootmem_allocator(void);
-static unsigned long __init setup_memory(void)
+static void __init find_max_pfn_node(int nid)
{
+ if (node_end_pfn[nid] > max_pfn)
+ node_end_pfn[nid] = max_pfn;
/*
- * partially used pages are not usable - thus
- * we are rounding upwards:
+ * if a user has given mem=XXXX, then we need to make sure
+ * that the node _starts_ before that, too, not just ends
*/
- min_low_pfn = PFN_UP(init_pg_tables_end);
+ if (node_start_pfn[nid] > max_pfn)
+ node_start_pfn[nid] = max_pfn;
+ if (node_start_pfn[nid] > node_end_pfn[nid])
+ BUG();
+}

+unsigned long calculate_numa_remap_pages(void);
+void setup_numa_kva_remap(void);
+void __init setup_bootmem_allocator(void);
+unsigned long __init setup_memory(void)
+{
+ int nid;
+ unsigned long reserve_pages;
+
+ /*
+ * When mapping a NUMA machine we allocate the node_mem_map arrays
+ * from node local memory. They are then mapped directly into KVA
+ * between zone normal and vmalloc space. Calculate the size of
+ * this space and use it to adjust the boundary between ZONE_NORMAL
+ * and ZONE_HIGHMEM.
+ */
find_max_pfn();
+ get_memcfg_numa();
+ for_each_online_node(nid)
+ num_physpages = max(num_physpages, node_end_pfn[nid]);

- max_low_pfn = find_max_low_pfn();
+ reserve_pages = calculate_numa_remap_pages();

+ /* partially used pages are not usable - thus round upwards */
+ min_low_pfn = PFN_UP(init_pg_tables_end);
+ max_low_pfn = find_max_low_pfn() - reserve_pages;
+
+ if (reserve_pages)
+ printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
+ reserve_pages, max_low_pfn + reserve_pages);
+ printk(KERN_DEBUG "max_pfn = %ld\n", max_pfn);
#ifdef CONFIG_HIGHMEM
- highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > max_low_pfn) {
- highstart_pfn = max_low_pfn;
- }
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
+ pages_to_mb(max_pfn - max_low_pfn));
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
- pages_to_mb(max_low_pfn));
+ pages_to_mb(max_low_pfn - min_low_pfn));
+ printk(KERN_DEBUG "min_low_pfn = %ld, max_low_pfn = %ld\n",
+ min_low_pfn, max_low_pfn);
+
+ printk(KERN_NOTICE "Low memory ends at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(max_low_pfn));
+ setup_numa_kva_remap();
+ printk("High memory starts at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(max_low_pfn));
+ vmalloc_earlyreserve = reserve_pages * PAGE_SIZE;
+ for_each_online_node(nid)
+ find_max_pfn_node(nid);

setup_bootmem_allocator();
-
return max_low_pfn;
}

-void __init zone_sizes_init(void)
+static inline unsigned long max_hardware_dma_pfn(void)
+{
+ return virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+}
+static inline unsigned long nid_size_pages(int nid)
+{
+ return node_end_pfn[nid] - node_start_pfn[nid];
+}
+static inline int nid_starts_in_highmem(int nid)
+{
+ return node_start_pfn[nid] >= max_low_pfn;
+}
+
+void __init nid_zone_sizes_init(int nid)
{
unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
- unsigned int max_dma, low;
+ unsigned long max_dma;
+ unsigned long start = node_start_pfn[nid];
+ unsigned long end = node_end_pfn[nid];

- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- low = max_low_pfn;
+ if (node_has_online_mem(nid)){
+ if (nid_starts_in_highmem(nid)) {
+ zones_size[ZONE_HIGHMEM] = nid_size_pages(nid);
+ } else {
+ max_dma = min(max_hardware_dma_pfn(), max_low_pfn);
+ zones_size[ZONE_DMA] = max_dma;
+ zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+ zones_size[ZONE_HIGHMEM] = end - max_low_pfn;
+ }
+ }

- if (low < max_dma)
- zones_size[ZONE_DMA] = low;
- else {
- zones_size[ZONE_DMA] = max_dma;
- zones_size[ZONE_NORMAL] = low - max_dma;
-#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = highend_pfn - low;
-#endif
+ free_area_init_node(nid, NODE_DATA(nid), zones_size, start,
+ get_zholes_size(nid));
+}
+
+void __init init_pgdat_list(void)
+{
+ int nid;
+
+ /*
+ * Insert nodes into pgdat_list backward so they appear in order.
+ * Clobber node 0's links and NULL out pgdat_list before starting.
+ */
+ pgdat_list = NULL;
+ for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) {
+ if (!node_online(nid))
+ continue;
+ NODE_DATA(nid)->pgdat_next = pgdat_list;
+ pgdat_list = NODE_DATA(nid);
}
- free_area_init(zones_size);
}
-#else
-extern unsigned long __init setup_memory(void);
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init zone_sizes_init(void)
+{
+ int nid;
+
+ init_pgdat_list();
+ for_each_online_node(nid)
+ nid_zone_sizes_init(nid);
+}

void __init setup_bootmem_allocator(void)
{
diff -puN include/asm-i386/dma.h~i386-discontig-consolidate0 include/asm-i386/dma.h
diff -puN include/linux/mmzone.h~i386-discontig-consolidate0 include/linux/mmzone.h
diff -puN include/asm-i386/mmzone.h~i386-discontig-consolidate0 include/asm-i386/mmzone.h
diff -puN init/main.c~i386-discontig-consolidate0 init/main.c
diff -puN mm/memory.c~i386-discontig-consolidate0 mm/memory.c
diff -puN drivers/media/dvb/dvb-usb/dvb-usb-init.c~i386-discontig-consolidate0 drivers/media/dvb/dvb-usb/dvb-usb-init.c
diff -puN include/asm-i386/page.h~i386-discontig-consolidate0 include/asm-i386/page.h
diff -puN arch/i386/kernel/numaq.c~i386-discontig-consolidate0 arch/i386/kernel/numaq.c
diff -puN arch/i386/kernel/signal.c~i386-discontig-consolidate0 arch/i386/kernel/signal.c
diff -L ser -puN /dev/null /dev/null
_


2005-09-12 17:54:56

by Dave Hansen

Subject: [RFC][PATCH 2/2] i386: move NUMA code into numa.c


This patch removes the final dependencies of CONFIG_NUMA on
discontig.c. This is done by moving the NUMA KVA remap
code from discontig.c into a new file: numa.c



---

memhotplug-dave/arch/i386/kernel/setup.c | 2
memhotplug-dave/arch/i386/mm/Makefile | 3
memhotplug-dave/arch/i386/mm/discontig.c | 177 ----------------------
memhotplug-dave/arch/i386/mm/init.c | 7
memhotplug-dave/arch/i386/mm/numa.c | 167 ++++++++++++++++++++
memhotplug-dave/include/asm-i386/mmzone.h | 7
memhotplug-dave/include/asm-i386/pgtable-3level.h | 1
7 files changed, 177 insertions(+), 187 deletions(-)

diff -puN /dev/null arch/i386/mm/numa.c
--- /dev/null 2005-03-30 22:36:15.000000000 -0800
+++ memhotplug-dave/arch/i386/mm/numa.c 2005-09-12 10:18:29.000000000 -0700
@@ -0,0 +1,167 @@
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/numa.h>
+
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+
+unsigned long node_remap_start_pfn[MAX_NUMNODES];
+unsigned long node_remap_size[MAX_NUMNODES];
+unsigned long node_remap_offset[MAX_NUMNODES];
+void *node_remap_start_vaddr[MAX_NUMNODES];
+
+void *node_remap_end_vaddr[MAX_NUMNODES];
+void *node_remap_alloc_vaddr[MAX_NUMNODES];
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+/*
+ * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
+ * method. For node zero take this from the bottom of memory, for
+ * subsequent nodes place them at node_remap_start_vaddr which contains
+ * node local data in physically node local memory. See setup_memory()
+ * for details.
+ */
+static bootmem_data_t node0_bdata;
+static void __init allocate_pgdat(int nid)
+{
+ if (nid && node_has_online_mem(nid))
+ NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
+ else {
+ NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
+ min_low_pfn += PFN_UP(sizeof(pg_data_t));
+ memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
+ NODE_DATA(0)->bdata = &node0_bdata;
+ }
+}
+
+void setup_numa_kva_remap(void)
+{
+ int nid;
+ for_each_online_node(nid) {
+ if (NODE_DATA(nid))
+ continue;
+ node_remap_start_vaddr[nid] = pfn_to_kaddr(
+ max_low_pfn + node_remap_offset[nid]);
+ /* Init the node remap allocator */
+ node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+ (node_remap_size[nid] * PAGE_SIZE);
+ node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+ ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+ allocate_pgdat(nid);
+ printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+ (ulong) node_remap_start_vaddr[nid],
+ (ulong) pfn_to_kaddr(max_low_pfn
+ + node_remap_offset[nid] + node_remap_size[nid]));
+ }
+}
+
+void *alloc_remap(int nid, unsigned long size)
+{
+ void *allocation = node_remap_alloc_vaddr[nid];
+
+ size = ALIGN(size, L1_CACHE_BYTES);
+
+ if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+ return 0;
+
+ node_remap_alloc_vaddr[nid] += size;
+ memset(allocation, 0, size);
+
+ return allocation;
+}
+
+void __init remap_numa_kva(void)
+{
+ void *vaddr;
+ unsigned long pfn;
+ int node;
+
+ for_each_online_node(node) {
+ for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
+ vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+ set_pmd_pfn((ulong) vaddr,
+ node_remap_start_pfn[node] + pfn,
+ PAGE_KERNEL_LARGE);
+ }
+ }
+}
+
+unsigned long calculate_numa_remap_pages(void)
+{
+ int nid;
+ unsigned long size, reserve_pages = 0;
+ unsigned long pfn;
+
+ for_each_online_node(nid) {
+ /*
+ * The acpi/srat node info can show hot-add memory zones
+ * where memory could be added but not currently present.
+ */
+ if (node_start_pfn[nid] > max_pfn)
+ continue;
+ if (node_end_pfn[nid] > max_pfn)
+ node_end_pfn[nid] = max_pfn;
+
+ /* ensure the remap includes space for the pgdat. */
+ size = node_remap_size[nid] + sizeof(pg_data_t);
+
+ /* convert size to large (pmd size) pages, rounding up */
+ size = (size + PMD_SIZE - 1) / PMD_SIZE;
+ /* now the roundup is correct, convert to PAGE_SIZE pages */
+ size = size * PTRS_PER_PTE;
+
+ /*
+ * Validate the region we are allocating only contains valid
+ * pages.
+ */
+ for (pfn = node_end_pfn[nid] - size;
+ pfn < node_end_pfn[nid]; pfn++)
+ if (!page_is_ram(pfn))
+ break;
+
+ if (pfn != node_end_pfn[nid])
+ size = 0;
+
+ printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
+ size, nid);
+ node_remap_size[nid] = size;
+ node_remap_offset[nid] = reserve_pages;
+ reserve_pages += size;
+ printk("Shrinking node %d from %ld pages to %ld pages\n",
+ nid, node_end_pfn[nid], node_end_pfn[nid] - size);
+
+ if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
+ /*
+ * Align node_end_pfn[] and node_remap_start_pfn[] to
+ * pmd boundary. remap_numa_kva will barf otherwise.
+ */
+ printk("Shrinking node %d further by %ld pages for proper alignment\n",
+ nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
+ size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
+ }
+
+ node_end_pfn[nid] -= size;
+ node_remap_start_pfn[nid] = node_end_pfn[nid];
+ }
+ printk("Reserving total of %ld pages for numa KVA remap\n",
+ reserve_pages);
+ return reserve_pages;
+}
+
+/* Find the owning node for a pfn. */
+int early_pfn_to_nid(unsigned long pfn)
+{
+ int nid;
+
+ for_each_node(nid) {
+ if (node_end_pfn[nid] == 0)
+ break;
+ if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn)
+ return nid;
+ }
+
+ return 0;
+}
diff -puN arch/i386/mm/discontig.c~i386-numa.c arch/i386/mm/discontig.c
--- memhotplug/arch/i386/mm/discontig.c~i386-numa.c 2005-09-12 10:18:29.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/discontig.c 2005-09-12 10:18:29.000000000 -0700
@@ -32,15 +32,10 @@
#include <linux/module.h>
#include <linux/kexec.h>

-#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
#include <bios_ebda.h>

-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-#ifdef CONFIG_DISCONTIGMEM
/*
* 4) physnode_map - the mapping between a pfn and owning node
* physnode_map keeps track of the physical memory layout of a generic
@@ -82,175 +77,3 @@ unsigned long node_memmap_size_bytes(int

return (nr_pages + 1) * sizeof(struct page);
}
-#endif
-
-extern unsigned long find_max_low_pfn(void);
-extern void find_max_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
-
-extern struct e820map e820;
-extern unsigned long init_pg_tables_end;
-extern unsigned long max_low_pfn;
-extern unsigned long totalram_pages;
-extern unsigned long totalhigh_pages;
-
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-unsigned long node_remap_start_pfn[MAX_NUMNODES];
-unsigned long node_remap_size[MAX_NUMNODES];
-unsigned long node_remap_offset[MAX_NUMNODES];
-void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-void *node_remap_end_vaddr[MAX_NUMNODES];
-void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/* Find the owning node for a pfn. */
-int early_pfn_to_nid(unsigned long pfn)
-{
- int nid;
-
- for_each_node(nid) {
- if (node_end_pfn[nid] == 0)
- break;
- if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn)
- return nid;
- }
-
- return 0;
-}
-
-/*
- * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
- * method. For node zero take this from the bottom of memory, for
- * subsequent nodes place them at node_remap_start_vaddr which contains
- * node local data in physically node local memory. See setup_memory()
- * for details.
- */
-static bootmem_data_t node0_bdata;
-static void __init allocate_pgdat(int nid)
-{
- if (nid && node_has_online_mem(nid))
- NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
- else {
- NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
- min_low_pfn += PFN_UP(sizeof(pg_data_t));
- memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
- NODE_DATA(0)->bdata = &node0_bdata;
- }
-}
-
-void setup_numa_kva_remap(void)
-{
- int nid;
- for_each_online_node(nid) {
- if (NODE_DATA(nid))
- continue;
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- max_low_pfn + node_remap_offset[nid]);
- /* Init the node remap allocator */
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-
- allocate_pgdat(nid);
- printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) pfn_to_kaddr(max_low_pfn
- + node_remap_offset[nid] + node_remap_size[nid]));
- }
-}
-
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
- return 0;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-void __init remap_numa_kva(void)
-{
- void *vaddr;
- unsigned long pfn;
- int node;
-
- for_each_online_node(node) {
- for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
- vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
- set_pmd_pfn((ulong) vaddr,
- node_remap_start_pfn[node] + pfn,
- PAGE_KERNEL_LARGE);
- }
- }
-}
-
-unsigned long calculate_numa_remap_pages(void)
-{
- int nid;
- unsigned long size, reserve_pages = 0;
- unsigned long pfn;
-
- for_each_online_node(nid) {
- /*
- * The acpi/srat node info can show hot-add memroy zones
- * where memory could be added but not currently present.
- */
- if (node_start_pfn[nid] > max_pfn)
- continue;
- if (node_end_pfn[nid] > max_pfn)
- node_end_pfn[nid] = max_pfn;
-
- /* ensure the remap includes space for the pgdat. */
- size = node_remap_size[nid] + sizeof(pg_data_t);
-
- /* convert size to large (pmd size) pages, rounding up */
- size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- /* now the roundup is correct, convert to PAGE_SIZE pages */
- size = size * PTRS_PER_PTE;
-
- /*
- * Validate the region we are allocating only contains valid
- * pages.
- */
- for (pfn = node_end_pfn[nid] - size;
- pfn < node_end_pfn[nid]; pfn++)
- if (!page_is_ram(pfn))
- break;
-
- if (pfn != node_end_pfn[nid])
- size = 0;
-
- printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
- size, nid);
- node_remap_size[nid] = size;
- node_remap_offset[nid] = reserve_pages;
- reserve_pages += size;
- printk("Shrinking node %d from %ld pages to %ld pages\n",
- nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
- if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
- /*
- * Align node_end_pfn[] and node_remap_start_pfn[] to
- * pmd boundary. remap_numa_kva will barf otherwise.
- */
- printk("Shrinking node %d further by %ld pages for proper alignment\n",
- nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
- size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
- }
-
- node_end_pfn[nid] -= size;
- node_remap_start_pfn[nid] = node_end_pfn[nid];
- }
- printk("Reserving total of %ld pages for numa KVA remap\n",
- reserve_pages);
- return reserve_pages;
-}
diff -puN arch/i386/mm/Makefile~i386-numa.c arch/i386/mm/Makefile
--- memhotplug/arch/i386/mm/Makefile~i386-numa.c 2005-09-12 10:18:29.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/Makefile 2005-09-12 10:18:29.000000000 -0700
@@ -4,7 +4,8 @@

obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o

-obj-$(CONFIG_NUMA) += discontig.o
+obj-$(CONFIG_DISCONTIGMEM) += discontig.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
diff -puN include/linux/mmzone.h~i386-numa.c include/linux/mmzone.h
diff -puN include/asm-i386/mmzone.h~i386-numa.c include/asm-i386/mmzone.h
--- memhotplug/include/asm-i386/mmzone.h~i386-numa.c 2005-09-12 10:18:29.000000000 -0700
+++ memhotplug-dave/include/asm-i386/mmzone.h 2005-09-12 10:18:29.000000000 -0700
@@ -38,10 +38,15 @@ static inline void get_memcfg_numa(void)
}

extern int early_pfn_to_nid(unsigned long pfn);
-
+extern void __init remap_numa_kva(void);
+extern unsigned long calculate_numa_remap_pages(void);
+extern void setup_numa_kva_remap(void);
#else /* !CONFIG_NUMA */
#define get_memcfg_numa get_memcfg_numa_flat
#define get_zholes_size(n) (0)
+#define remap_numa_kva() do {} while (0)
+#define setup_numa_kva_remap() do {} while (0)
+#define calculate_numa_remap_pages() (0)
#endif /* CONFIG_NUMA */

#ifdef CONFIG_DISCONTIGMEM
diff -puN include/asm-i386/pgtable-3level.h~i386-numa.c include/asm-i386/pgtable-3level.h
--- memhotplug/include/asm-i386/pgtable-3level.h~i386-numa.c 2005-09-12 10:18:29.000000000 -0700
+++ memhotplug-dave/include/asm-i386/pgtable-3level.h 2005-09-12 10:18:29.000000000 -0700
@@ -65,6 +65,7 @@ static inline void set_pte(pte_t *ptep,
set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
#define set_pud(pudptr,pudval) \
(*(pudptr) = (pudval))
+extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);

/*
* Pentium-II erratum A13: in PAE mode we explicitly have to flush
diff -puN arch/i386/kernel/setup.c~i386-numa.c arch/i386/kernel/setup.c
--- memhotplug/arch/i386/kernel/setup.c~i386-numa.c 2005-09-12 10:18:29.000000000 -0700
+++ memhotplug-dave/arch/i386/kernel/setup.c 2005-09-12 10:19:48.000000000 -0700
@@ -1157,8 +1157,6 @@ static void __init find_max_pfn_node(int
BUG();
}

-unsigned long calculate_numa_remap_pages(void);
-void setup_numa_kva_remap(void);
void __init setup_bootmem_allocator(void);
unsigned long __init setup_memory(void)
{
diff -puN arch/i386/mm/init.c~i386-numa.c arch/i386/mm/init.c
--- memhotplug/arch/i386/mm/init.c~i386-numa.c 2005-09-12 10:18:29.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/init.c 2005-09-12 10:18:29.000000000 -0700
@@ -36,6 +36,7 @@
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
+#include <asm/mmzone.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -347,12 +348,6 @@ unsigned long long __PAGE_KERNEL = _PAGE
EXPORT_SYMBOL(__PAGE_KERNEL);
unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;

-#ifdef CONFIG_NUMA
-extern void __init remap_numa_kva(void);
-#else
-#define remap_numa_kva() do {} while (0)
-#endif
-
static void __init pagetable_init (void)
{
unsigned long vaddr;
_

2005-09-12 19:03:33

by Mika Penttilä

Subject: Re: [RFC][PATCH 1/2] i386: consolidate discontig functions into normal ones

Dave Hansen wrote:

>There are quite a few functions in i386's discontig.c which are
>actually NUMA-specific, not discontigmem-specific. They are also
>very similar to the generic, flat-memory functions found in setup.c.
>
>This patch takes the versions in setup.c and makes them work
>for both the NUMA and non-NUMA cases. In the process, quite a
>few nasty #ifdefs and externs can be removed.
>
>One of the main mechanisms to do this is that highstart_pfn
>and highend_pfn are now gone, replaced by node_start/end_pfn[].
>However, this has no real impact on storage space, because
>those arrays are declared with a length of MAX_NUMNODES, which
>is 1 when NUMA is off.
>
>
>
>
I think you allocate remap pages for nothing in the flatmem case for
node0...those aren't used for the mem map in !NUMA.

--Mika

2005-09-12 19:08:54

by Dave Hansen

Subject: Re: [RFC][PATCH 1/2] i386: consolidate discontig functions into normal ones

On Mon, 2005-09-12 at 22:04 +0300, Mika Penttilä wrote:
> I think you allocate remap pages for nothing in the flatmem case for
> node0...those aren't used for the mem map in !NUMA.

I believe that is fixed up in the second patch. It should compile to a
do{}while(0) version instead of making a real call.
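
Concretely, with the !CONFIG_NUMA stubs the second patch adds to
include/asm-i386/mmzone.h:

    #define setup_numa_kva_remap()          do {} while (0)
    #define calculate_numa_remap_pages()    (0)

a flat-memory build sees reserve_pages == 0 in setup_memory(), so no
remap space is carved out for node 0.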

-- Dave

2005-09-12 19:20:48

by Mika Penttilä

Subject: Re: [RFC][PATCH 1/2] i386: consolidate discontig functions into normal ones

Dave Hansen wrote:

>On Mon, 2005-09-12 at 22:04 +0300, Mika Penttilä wrote:
>
>
>>I think you allocate remap pages for nothing in the flatmem case for
>>node0...those aren't used for the mem map in !NUMA.
>>
>>
>
>I believe that is fixed up in the second patch. It should compile to a
>do{}while(0) version instead of making a real call.
>
>-- Dave
>
>
>
>
Oh, yes, indeed it is.
Thanks,
Mika