local_mapnr() is unused, except in an alpha header. Keep the alpha
one, kill the rest.
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/include/asm-i386/mmzone.h | 6 ------
memhotplug-dave/include/asm-m32r/mmzone.h | 6 ------
memhotplug-dave/include/asm-parisc/mmzone.h | 6 ------
memhotplug-dave/include/asm-ppc64/mmzone.h | 3 ---
memhotplug-dave/include/asm-x86_64/mmzone.h | 2 --
5 files changed, 23 deletions(-)
diff -puN include/asm-i386/mmzone.h~C0-kill-local_mapnr include/asm-i386/mmzone.h
--- memhotplug/include/asm-i386/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
+++ memhotplug-dave/include/asm-i386/mmzone.h 2005-08-18 14:59:43.000000000 -0700
@@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned lo
__pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
})
-#define local_mapnr(kvaddr) \
-({ \
- unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \
- (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \
-})
-
/* XXX: FIXME -- wli */
#define kern_addr_valid(kaddr) (0)
diff -puN include/asm-m32r/mmzone.h~C0-kill-local_mapnr include/asm-m32r/mmzone.h
--- memhotplug/include/asm-m32r/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
+++ memhotplug-dave/include/asm-m32r/mmzone.h 2005-08-18 14:59:43.000000000 -0700
@@ -21,12 +21,6 @@ extern struct pglist_data *node_data[];
__pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \
})
-#define local_mapnr(kvaddr) \
-({ \
- unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \
- (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \
-})
-
#define pfn_to_page(pfn) \
({ \
unsigned long __pfn = pfn; \
diff -puN include/asm-parisc/mmzone.h~C0-kill-local_mapnr include/asm-parisc/mmzone.h
--- memhotplug/include/asm-parisc/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
+++ memhotplug-dave/include/asm-parisc/mmzone.h 2005-08-18 14:59:43.000000000 -0700
@@ -27,12 +27,6 @@ extern struct node_map_data node_data[];
})
#define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid))
-#define local_mapnr(kvaddr) \
-({ \
- unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \
- (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \
-})
-
#define pfn_to_page(pfn) \
({ \
unsigned long __pfn = (pfn); \
diff -puN include/asm-ppc64/mmzone.h~C0-kill-local_mapnr include/asm-ppc64/mmzone.h
--- memhotplug/include/asm-ppc64/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
+++ memhotplug-dave/include/asm-ppc64/mmzone.h 2005-08-18 14:59:43.000000000 -0700
@@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned lon
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn)
-#define local_mapnr(kvaddr) \
- ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr))
-
#ifdef CONFIG_DISCONTIGMEM
/*
diff -puN include/asm-x86_64/mmzone.h~C0-kill-local_mapnr include/asm-x86_64/mmzone.h
--- memhotplug/include/asm-x86_64/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
+++ memhotplug-dave/include/asm-x86_64/mmzone.h 2005-08-18 14:59:43.000000000 -0700
@@ -38,8 +38,6 @@ static inline __attribute__((pure)) int
#ifdef CONFIG_DISCONTIGMEM
-#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
-#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr))
/* AK: this currently doesn't deal with invalid addresses. We'll see
if the 2.5 kernel doesn't pass them
_
Adds the functions necessary for non-NUMA hot-add of highmem
to an existing zone on i386.
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/arch/i386/mm/discontig.c | 4 +-
memhotplug-dave/arch/i386/mm/init.c | 61 ++++++++++++++++++++++++++++---
2 files changed, 58 insertions(+), 7 deletions(-)
diff -puN arch/i386/mm/discontig.c~D1-i386-hotplug-functions arch/i386/mm/discontig.c
--- memhotplug/arch/i386/mm/discontig.c~D1-i386-hotplug-functions 2005-08-18 14:59:50.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/discontig.c 2005-08-18 14:59:50.000000000 -0700
@@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int
extern unsigned long find_max_low_pfn(void);
extern void find_max_pfn(void);
-extern void one_highpage_init(struct page *, int, int);
+extern void add_one_highpage_init(struct page *, int, int);
extern struct e820map e820;
extern unsigned long init_pg_tables_end;
@@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int b
if (!pfn_valid(node_pfn))
continue;
page = pfn_to_page(node_pfn);
- one_highpage_init(page, node_pfn, bad_ppro);
+ add_one_highpage_init(page, node_pfn, bad_ppro);
}
}
totalram_pages += totalhigh_pages;
diff -puN arch/i386/mm/init.c~D1-i386-hotplug-functions arch/i386/mm/init.c
--- memhotplug/arch/i386/mm/init.c~D1-i386-hotplug-functions 2005-08-18 14:59:50.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/init.c 2005-08-18 14:59:50.000000000 -0700
@@ -27,6 +27,7 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/efi.h>
+#include <linux/memory_hotplug.h>
#include <asm/processor.h>
#include <asm/system.h>
@@ -265,17 +266,48 @@ static void __init permanent_kmaps_init(
pkmap_page_table = pte;
}
-void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
+/* Give one highmem page to the page allocator and count it in totalhigh_pages. */
+void __devinit free_new_highpage(struct page *page)
+{
+ set_page_count(page, 1);
+ __free_page(page);
+ totalhigh_pages++;
+}
+
+/* Boot-time setup of one highmem page: free usable RAM, keep the rest reserved. */
+void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
ClearPageReserved(page);
- set_page_count(page, 1);
- __free_page(page);
- totalhigh_pages++;
+ free_new_highpage(page);
} else
SetPageReserved(page);
}
+int add_one_highpage_hotplug(struct page *page, int pfn)
+{
+ free_new_highpage(page);
+ totalram_pages++;
+#ifdef CONFIG_FLATMEM
+ max_mapnr = max(pfn, max_mapnr);
+#endif
+ num_physpages++;
+ return 0;
+}
+
+/*
+ * Not currently handling the NUMA case.
+ * Assuming single node and all memory that
+ * has been added dynamically that would be
+ * onlined here is in HIGHMEM
+ */
+void online_page(struct page *page)
+{
+ ClearPageReserved(page);
+ add_one_highpage_hotplug(page, page_to_pfn(page));
+}
+
+
#ifdef CONFIG_NUMA
extern void set_highmem_pages_init(int);
#else
@@ -283,7 +312,7 @@ static void __init set_highmem_pages_ini
{
int pfn;
for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
- one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+ add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
totalram_pages += totalhigh_pages;
}
#endif /* CONFIG_FLATMEM */
@@ -614,6 +643,28 @@ void __init mem_init(void)
#endif
}
+/*
+ * this is for the non-NUMA, single node SMP system case.
+ * Specifically, in the case of x86, we will always add
+ * memory to the highmem for now.
+ */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+int add_memory(u64 start, u64 size)
+{
+ struct pglist_data *pgdata = &contig_page_data;
+ struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ return __add_pages(zone, start_pfn, nr_pages);
+}
+
+int remove_memory(u64 start, u64 size)
+{
+ return -EINVAL;
+}
+#endif
+
kmem_cache_t *pgd_cache;
kmem_cache_t *pmd_cache;
_
If a zone is empty at boot-time and then hot-added to later,
it needs to run the same init code that would have been run
on it at boot.
This patch breaks out zone table and per-cpu-pages functions
for use by the hotplug code. You can almost see all of the
free_area_init_core() function on one page now. :)
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/mm/page_alloc.c | 98 +++++++++++++++++++++++-----------------
1 files changed, 58 insertions(+), 40 deletions(-)
diff -puN mm/page_alloc.c~C1-pcp_zone_init mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~C1-pcp_zone_init 2005-09-02 12:12:32.000000000 -0700
+++ memhotplug-dave/mm/page_alloc.c 2005-09-02 12:17:34.000000000 -0700
@@ -1865,6 +1865,60 @@ void __init setup_per_cpu_pageset()
#endif
+static __devinit
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+ int i;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(zone_size_pages);
+ zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, zone->wait_table_size
+ * sizeof(wait_queue_head_t));
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+}
+
+static __devinit void zone_pcp_init(struct zone *zone)
+{
+ int cpu;
+ unsigned long batch = zone_batchsize(zone);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+ }
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone->name, zone->present_pages, batch);
+}
+
+static void init_currently_empty_zone(struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long size)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ zone_wait_table_init(zone, size);
+ pgdat->nr_zones = zone_idx(zone) + 1;
+
+ zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+ zone->zone_start_pfn = zone_start_pfn;
+
+ memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+ zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1874,8 +1928,8 @@ void __init setup_per_cpu_pageset()
static void __init free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
- unsigned long i, j;
- int cpu, nid = pgdat->node_id;
+ unsigned long j;
+ int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
pgdat->nr_zones = 0;
@@ -1885,7 +1939,6 @@ static void __init free_area_init_core(s
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
- unsigned long batch;
realsize = size = zones_size[j];
if (zholes_size)
@@ -1905,19 +1958,7 @@ static void __init free_area_init_core(s
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- batch = zone_batchsize(zone);
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone->pageset[cpu] = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
+ zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
@@ -1928,32 +1969,9 @@ static void __init free_area_init_core(s
if (!size)
continue;
- /*
- * The per-page waitqueue mechanism uses hashed waitqueues
- * per zone.
- */
- zone->wait_table_size = wait_table_size(size);
- zone->wait_table_bits =
- wait_table_bits(zone->wait_table_size);
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, zone->wait_table_size
- * sizeof(wait_queue_head_t));
-
- for(i = 0; i < zone->wait_table_size; ++i)
- init_waitqueue_head(zone->wait_table + i);
-
- pgdat->nr_zones = j+1;
-
- zone->zone_mem_map = pfn_to_page(zone_start_pfn);
- zone->zone_start_pfn = zone_start_pfn;
-
- memmap_init(size, nid, j, zone_start_pfn);
-
zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+ init_currently_empty_zone(zone, zone_start_pfn, size);
zone_start_pfn += size;
-
- zone_init_free_lists(pgdat, zone, zone->spanned_pages);
}
}
_
When doing memory hotplug operations, the size of existing zones can
obviously change. This means that zone->zone_{start_pfn,spanned_pages}
can change.
There are currently no locks that protect these structure members.
However, they are rarely accessed at runtime. Outside of swsusp, the
only place that I can find is bad_range().
So, split bad_range() up into two pieces: one that needs to be locked
and another that doesn't.
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/mm/page_alloc.c | 26 +++++++++++++++++++++-----
1 files changed, 21 insertions(+), 5 deletions(-)
diff -puN mm/page_alloc.c~C5.1-bad_range-rework mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~C5.1-bad_range-rework 2005-08-18 14:59:45.000000000 -0700
+++ memhotplug-dave/mm/page_alloc.c 2005-08-18 14:59:45.000000000 -0700
@@ -77,21 +77,37 @@ int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
-/*
- * Temporary debugging check for pages not lying within a given zone.
- */
-static int bad_range(struct zone *zone, struct page *page)
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
return 1;
if (page_to_pfn(page) < zone->zone_start_pfn)
return 1;
+
+ return 0;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
#ifdef CONFIG_HOLES_IN_ZONE
if (!pfn_valid(page_to_pfn(page)))
- return 1;
+ return 0;
#endif
if (zone != page_zone(page))
+ return 0;
+
+ return 1;
+}
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+ if (page_outside_zone_boundaries(zone, page))
return 1;
+ if (!page_is_consistent(zone, page))
+ return 1;
+
return 0;
}
_
Here is a set of ppc64 specific patches that at least allow
compilation/booting with the following configurations:
FLATMEM
SPARSEMEM
SPARSEMEM + MEMORY_HOTPLUG
Signed-off-by: Mike Kravetz <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/arch/ppc64/mm/init.c | 77 +++++++++++++++++++++++++++
memhotplug-dave/include/asm-ppc64/abs_addr.h | 2
2 files changed, 79 insertions(+)
diff -puN arch/ppc64/mm/init.c~D2-ppc64-hotplug-functions arch/ppc64/mm/init.c
--- memhotplug/arch/ppc64/mm/init.c~D2-ppc64-hotplug-functions 2005-08-18 14:59:50.000000000 -0700
+++ memhotplug-dave/arch/ppc64/mm/init.c 2005-08-18 14:59:50.000000000 -0700
@@ -870,3 +870,80 @@ pgprot_t phys_mem_access_prot(struct fil
return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+ ClearPageReserved(page);
+ free_cold_page(page);
+ totalram_pages++;
+ num_physpages++;
+}
+
+/*
+ * This works only for the non-NUMA case. Later, we'll need a lookup
+ * to convert from real physical addresses to nid, that doesn't use
+ * pfn_to_nid().
+ */
+int __devinit add_memory(u64 start, u64 size)
+{
+ struct pglist_data *pgdata = NODE_DATA(0);
+ struct zone *zone;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ /* this should work for most non-highmem platforms */
+ zone = pgdata->node_zones;
+
+ return __add_pages(zone, start_pfn, nr_pages);
+
+ return 0;
+}
+
+/*
+ * First pass at this code will check to determine if the remove
+ * request is within the RMO. Do not allow removal within the RMO.
+ */
+int __devinit remove_memory(u64 start, u64 size)
+{
+ struct zone *zone;
+ unsigned long start_pfn, end_pfn, nr_pages;
+
+ start_pfn = start >> PAGE_SHIFT;
+ nr_pages = size >> PAGE_SHIFT;
+ end_pfn = start_pfn + nr_pages;
+
+ printk("%s(): Attempting to remove memoy in range "
+ "%lx to %lx\n", __func__, start, start+size);
+ /*
+ * check for range within RMO
+ */
+ zone = page_zone(pfn_to_page(start_pfn));
+
+ printk("%s(): memory will be removed from "
+ "the %s zone\n", __func__, zone->name);
+
+ /*
+ * not handling removing memory ranges that
+ * overlap multiple zones yet
+ */
+ if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages))
+ goto overlap;
+
+ /* make sure it is NOT in RMO */
+ if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) {
+ printk("%s(): range to be removed must NOT be in RMO!\n",
+ __func__);
+ goto in_rmo;
+ }
+
+ return __remove_pages(zone, start_pfn, nr_pages);
+
+overlap:
+ printk("%s(): memory range to be removed overlaps "
+ "multiple zones!!!\n", __func__);
+in_rmo:
+ return -1;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
pgdat->node_size_lock is basically only needed in one place in the
normal code: show_mem(), which is the arch-specific sysrq-m printing
function.
Strictly speaking, the architectures not doing memory hotplug do
not need this locking in show_mem(). However, they are all included
for completeness. This should also make any future consolidation
of all of the implementations a little more straightforward.
This lock is also held in the sparsemem code during a memory removal,
as sections are invalidated. This is the place where pfn_valid() is
made false for a memory area that's being removed. The lock is
only required when doing pfn_valid() operations on memory for which
the caller does not already hold a reference on the page, such as in
show_mem().
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/arch/alpha/mm/numa.c | 3 ++
memhotplug-dave/arch/i386/mm/pgtable.c | 3 ++
memhotplug-dave/arch/ia64/mm/discontig.c | 7 ++++-
memhotplug-dave/arch/m32r/mm/init.c | 9 +++++-
memhotplug-dave/arch/parisc/mm/init.c | 3 ++
memhotplug-dave/arch/ppc64/mm/init.c | 6 ++++
memhotplug-dave/include/linux/memory_hotplug.h | 34 +++++++++++++++++++++++++
memhotplug-dave/include/linux/mmzone.h | 12 ++++++++
memhotplug-dave/mm/page_alloc.c | 1
9 files changed, 76 insertions(+), 2 deletions(-)
diff -puN arch/alpha/mm/numa.c~C5.2-pgdat_size_lock arch/alpha/mm/numa.c
--- memhotplug/arch/alpha/mm/numa.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/arch/alpha/mm/numa.c 2005-09-02 12:42:10.000000000 -0700
@@ -371,6 +371,8 @@ show_mem(void)
show_free_areas();
printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_online_node(nid) {
+ unsigned long flags;
+ pgdat_resize_lock(NODE_DATA(nid), &flags);
i = node_spanned_pages(nid);
while (i-- > 0) {
struct page *page = nid_page_nr(nid, i);
@@ -384,6 +386,7 @@ show_mem(void)
else
shared += page_count(page) - 1;
}
+ pgdat_resize_unlock(NODE_DATA(nid), &flags);
}
printk("%ld pages of RAM\n",total);
printk("%ld free pages\n",free);
diff -puN arch/i386/mm/pgtable.c~C5.2-pgdat_size_lock arch/i386/mm/pgtable.c
--- memhotplug/arch/i386/mm/pgtable.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/arch/i386/mm/pgtable.c 2005-09-02 12:42:10.000000000 -0700
@@ -31,11 +31,13 @@ void show_mem(void)
pg_data_t *pgdat;
unsigned long i;
struct page_state ps;
+ unsigned long flags;
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_pgdat(pgdat) {
+ pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
page = pgdat_page_nr(pgdat, i);
total++;
@@ -48,6 +50,7 @@ void show_mem(void)
else if (page_count(page))
shared += page_count(page) - 1;
}
+ pgdat_resize_unlock(pgdat, &flags);
}
printk(KERN_INFO "%d pages of RAM\n", total);
printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
diff -puN arch/ia64/mm/discontig.c~C5.2-pgdat_size_lock arch/ia64/mm/discontig.c
--- memhotplug/arch/ia64/mm/discontig.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/arch/ia64/mm/discontig.c 2005-09-02 13:43:06.000000000 -0700
@@ -524,9 +524,13 @@ void show_mem(void)
show_free_areas();
printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_pgdat(pgdat) {
- unsigned long present = pgdat->node_present_pages;
+ unsigned long present;
+ unsigned long flags;
int shared = 0, cached = 0, reserved = 0;
+
printk("Node ID: %d\n", pgdat->node_id);
+ pgdat_resize_lock(pgdat, &flags);
+ present = pgdat->node_present_pages;
for(i = 0; i < pgdat->node_spanned_pages; i++) {
struct page *page = pgdat_page_nr(pgdat, i);
if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
@@ -538,6 +542,7 @@ void show_mem(void)
else if (page_count(page))
shared += page_count(page)-1;
}
+ pgdat_resize_unlock(pgdat, &flags);
total_present += present;
total_reserved += reserved;
total_cached += cached;
diff -puN arch/m32r/mm/init.c~C5.2-pgdat_size_lock arch/m32r/mm/init.c
--- memhotplug/arch/m32r/mm/init.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/arch/m32r/mm/init.c 2005-09-02 12:42:10.000000000 -0700
@@ -48,6 +48,8 @@ void show_mem(void)
show_free_areas();
printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
for_each_pgdat(pgdat) {
+ unsigned long flags;
+ pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
page = pgdat_page_nr(pgdat, i);
total++;
@@ -60,6 +62,7 @@ void show_mem(void)
else if (page_count(page))
shared += page_count(page) - 1;
}
+ pgdat_resize_unlock(pgdat, &flags);
}
printk("%d pages of RAM\n", total);
printk("%d pages of HIGHMEM\n",highmem);
@@ -150,10 +153,14 @@ int __init reservedpages_count(void)
int reservedpages, nid, i;
reservedpages = 0;
- for_each_online_node(nid)
+ for_each_online_node(nid) {
+ unsigned long flags;
+ pgdat_resize_lock(NODE_DATA(nid), &flags);
for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
if (PageReserved(nid_page_nr(nid, i)))
reservedpages++;
+ pgdat_resize_unlock(NODE_DATA(nid), &flags);
+ }
return reservedpages;
}
diff -puN arch/parisc/mm/init.c~C5.2-pgdat_size_lock arch/parisc/mm/init.c
--- memhotplug/arch/parisc/mm/init.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/arch/parisc/mm/init.c 2005-09-02 12:42:10.000000000 -0700
@@ -505,7 +505,9 @@ void show_mem(void)
for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
struct page *p;
+ unsigned long flags;
+ pgdat_resize_lock(NODE_DATA(i), &flags);
p = nid_page_nr(i, j) - node_start_pfn(i);
total++;
@@ -517,6 +519,7 @@ void show_mem(void)
free++;
else
shared += page_count(p) - 1;
+ pgdat_resize_unlock(NODE_DATA(i), &flags);
}
}
#endif
diff -puN arch/ppc64/mm/init.c~C5.2-pgdat_size_lock arch/ppc64/mm/init.c
--- memhotplug/arch/ppc64/mm/init.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/arch/ppc64/mm/init.c 2005-09-02 13:43:08.000000000 -0700
@@ -104,6 +104,8 @@ void show_mem(void)
show_free_areas();
printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_pgdat(pgdat) {
+ unsigned long flags;
+ pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; i++) {
page = pgdat_page_nr(pgdat, i);
total++;
@@ -114,6 +116,7 @@ void show_mem(void)
else if (page_count(page))
shared += page_count(page) - 1;
}
+ pgdat_resize_unlock(pgdat, &flags);
}
printk("%ld pages of RAM\n", total);
printk("%ld reserved pages\n", reserved);
@@ -648,11 +651,14 @@ void __init mem_init(void)
#endif
for_each_pgdat(pgdat) {
+ unsigned long flags;
+ pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; i++) {
page = pgdat_page_nr(pgdat, i);
if (PageReserved(page))
reservedpages++;
}
+ pgdat_resize_unlock(pgdat, &flags);
}
codesize = (unsigned long)&_etext - (unsigned long)&_stext;
diff -puN /dev/null include/linux/memory_hotplug.h
--- /dev/null 2005-03-30 22:36:15.000000000 -0800
+++ memhotplug-dave/include/linux/memory_hotplug.h 2005-09-02 13:43:13.000000000 -0700
@@ -0,0 +1,34 @@
+#ifndef __LINUX_MEMORY_HOTPLUG_H
+#define __LINUX_MEMORY_HOTPLUG_H
+
+#include <linux/mmzone.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * pgdat resizing functions
+ */
+static inline
+void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
+{
+ spin_lock_irqsave(&pgdat->node_size_lock, *flags);
+}
+static inline /* counterpart of pgdat_resize_lock(): drops node_size_lock, restores IRQ state from *flags */
+void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
+{
+ spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_init(struct pglist_data *pgdat)
+{
+ spin_lock_init(&pgdat->node_size_lock);
+}
+#else /* ! CONFIG_MEMORY_HOTPLUG */
+/*
+ * Stub functions for when hotplug is off
+ */
+static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
+#endif
+#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff -puN include/linux/mmzone.h~C5.2-pgdat_size_lock include/linux/mmzone.h
--- memhotplug/include/linux/mmzone.h~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/include/linux/mmzone.h 2005-09-02 13:43:13.000000000 -0700
@@ -273,6 +273,16 @@ typedef struct pglist_data {
struct page *node_mem_map;
#endif
struct bootmem_data *bdata;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Must be held any time you expect node_start_pfn, node_present_pages
+ * or node_spanned_pages to stay constant. Holding this will also
+ * guarantee that any pfn_valid() stays that way.
+ *
+ * Nests above zone->lock and zone->size_seqlock.
+ */
+ spinlock_t node_size_lock;
+#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
@@ -293,6 +303,8 @@ typedef struct pglist_data {
#endif
#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
+#include <linux/memory_hotplug.h>
+
extern struct pglist_data *pgdat_list;
void __get_zone_counts(unsigned long *active, unsigned long *inactive,
diff -puN mm/page_alloc.c~C5.2-pgdat_size_lock mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~C5.2-pgdat_size_lock 2005-09-02 12:42:10.000000000 -0700
+++ memhotplug-dave/mm/page_alloc.c 2005-09-02 13:43:13.000000000 -0700
@@ -1948,6 +1948,7 @@ static void __init free_area_init_core(s
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
_
This adds generic memory add/remove and supporting functions
for memory hotplug into a new file as well as a memory hotplug
kernel config option.
Individual architecture patches will follow.
For now, disable memory hotplug when swsusp is enabled. There's
a lot of churn there right now. We'll fix it up properly once
it calms down.
Signed-off-by: Matt Tolentino <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/drivers/base/Makefile | 1
memhotplug-dave/drivers/base/init.c | 2
memhotplug-dave/drivers/base/memory.c | 455 +++++++++++++++++++++++++
memhotplug-dave/include/linux/memory.h | 94 +++++
memhotplug-dave/include/linux/memory_hotplug.h | 35 +
memhotplug-dave/include/linux/mm.h | 1
memhotplug-dave/mm/Kconfig | 8
memhotplug-dave/mm/Makefile | 2
memhotplug-dave/mm/memory_hotplug.c | 178 +++++++++
memhotplug-dave/mm/page_alloc.c | 4
10 files changed, 777 insertions(+), 3 deletions(-)
diff -puN drivers/base/Makefile~D0-sysfs-memory-class drivers/base/Makefile
--- memhotplug/drivers/base/Makefile~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/drivers/base/Makefile 2005-08-18 14:59:48.000000000 -0700
@@ -7,6 +7,7 @@ obj-y := core.o sys.o bus.o dd.o \
obj-y += power/
obj-$(CONFIG_FW_LOADER) += firmware_class.o
obj-$(CONFIG_NUMA) += node.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
ifeq ($(CONFIG_DEBUG_DRIVER),y)
EXTRA_CFLAGS += -DDEBUG
diff -puN drivers/base/init.c~D0-sysfs-memory-class drivers/base/init.c
--- memhotplug/drivers/base/init.c~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/drivers/base/init.c 2005-08-18 14:59:48.000000000 -0700
@@ -9,6 +9,7 @@
#include <linux/device.h>
#include <linux/init.h>
+#include <linux/memory.h>
extern int devices_init(void);
extern int buses_init(void);
@@ -39,5 +40,6 @@ void __init driver_init(void)
platform_bus_init();
system_bus_init();
cpu_dev_init();
+ memory_dev_init();
attribute_container_init();
}
diff -puN /dev/null drivers/base/memory.c
--- /dev/null 2005-03-30 22:36:15.000000000 -0800
+++ memhotplug-dave/drivers/base/memory.c 2005-08-18 14:59:48.000000000 -0700
@@ -0,0 +1,455 @@
+/*
+ * drivers/base/memory.c - basic Memory class support
+ *
+ * Written by Matt Tolentino <[email protected]>
+ * Dave Hansen <[email protected]>
+ *
+ * This file provides the necessary infrastructure to represent
+ * a SPARSEMEM-memory-model system's physical memory in /sysfs.
+ * All arch-independent code that assumes MEMORY_HOTPLUG requires
+ * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
+ */
+
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h> /* capable() */
+#include <linux/topology.h>
+#include <linux/device.h>
+#include <linux/memory.h>
+#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define MEMORY_CLASS_NAME "memory"
+
+struct sysdev_class memory_sysdev_class = {
+ set_kset_name(MEMORY_CLASS_NAME),
+};
+EXPORT_SYMBOL(memory_sysdev_class);
+
+static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj)
+{
+ return MEMORY_CLASS_NAME;
+}
+
+static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp,
+ int num_envp, char *buffer, int buffer_size)
+{
+ int retval = 0;
+
+ return retval;
+}
+
+static struct kset_hotplug_ops memory_hotplug_ops = {
+ .name = memory_hotplug_name,
+ .hotplug = memory_hotplug,
+};
+
+static struct notifier_block *memory_chain;
+
+int register_memory_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&memory_chain, nb);
+}
+
+void unregister_memory_notifier(struct notifier_block *nb)
+{
+ notifier_chain_unregister(&memory_chain, nb);
+}
+
+/*
+ * register_memory - Setup a sysfs device for a memory block
+ */
+int
+register_memory(struct memory_block *memory, struct mem_section *section,
+ struct node *root)
+{
+ int error;
+
+ memory->sysdev.cls = &memory_sysdev_class;
+ memory->sysdev.id = __section_nr(section);
+
+ error = sysdev_register(&memory->sysdev);
+
+ if (root && !error)
+ error = sysfs_create_link(&root->sysdev.kobj,
+ &memory->sysdev.kobj,
+ kobject_name(&memory->sysdev.kobj));
+
+ return error;
+}
+
+void
+unregister_memory(struct memory_block *memory, struct mem_section *section,
+ struct node *root)
+{
+ BUG_ON(memory->sysdev.cls != &memory_sysdev_class);
+ BUG_ON(memory->sysdev.id != __section_nr(section));
+
+ sysdev_unregister(&memory->sysdev);
+ if (root)
+ sysfs_remove_link(&root->sysdev.kobj,
+ kobject_name(&memory->sysdev.kobj));
+}
+
+/*
+ * use this as the physical section index that this memsection
+ * uses.
+ */
+
+static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf)
+{
+ struct memory_block *mem =
+ container_of(dev, struct memory_block, sysdev);
+ return sprintf(buf, "%08lx\n", mem->phys_index);
+}
+
+/*
+ * online, offline, going offline, etc.
+ */
+static ssize_t show_mem_state(struct sys_device *dev, char *buf)
+{
+ struct memory_block *mem =
+ container_of(dev, struct memory_block, sysdev);
+ ssize_t len = 0;
+
+ /*
+ * We can probably put these states in a nice little array
+ * so that they're not open-coded
+ */
+ switch (mem->state) {
+ case MEM_ONLINE:
+ len = sprintf(buf, "online\n");
+ break;
+ case MEM_OFFLINE:
+ len = sprintf(buf, "offline\n");
+ break;
+ case MEM_GOING_OFFLINE:
+ len = sprintf(buf, "going-offline\n");
+ break;
+ default:
+ len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
+ mem->state);
+ WARN_ON(1);
+ break;
+ }
+
+ return len;
+}
+
+static inline int memory_notify(unsigned long val, void *v)
+{
+ return notifier_call_chain(&memory_chain, val, v);
+}
+
+/*
+ * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
+ * OK to have direct references to sparsemem variables in here.
+ */
+static int
+memory_block_action(struct memory_block *mem, unsigned long action)
+{
+ int i;
+ unsigned long psection;
+ unsigned long start_pfn, start_paddr;
+ struct page *first_page;
+ int ret;
+ int old_state = mem->state;
+
+ psection = mem->phys_index;
+ first_page = pfn_to_page(psection << PFN_SECTION_SHIFT);
+
+ /*
+ * The probe routines leave the pages reserved, just
+ * as the bootmem code does. Make sure they're still
+ * that way.
+ */
+ if (action == MEM_ONLINE) {
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ if (PageReserved(first_page+i))
+ continue;
+
+ printk(KERN_WARNING "section number %ld page number %d "
+ "not reserved, was it already online? \n",
+ psection, i);
+ return -EBUSY;
+ }
+ }
+
+ switch (action) {
+ case MEM_ONLINE:
+ start_pfn = page_to_pfn(first_page);
+ ret = online_pages(start_pfn, PAGES_PER_SECTION);
+ break;
+ case MEM_OFFLINE:
+ mem->state = MEM_GOING_OFFLINE;
+ memory_notify(MEM_GOING_OFFLINE, NULL);
+ start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
+ ret = remove_memory(start_paddr,
+ PAGES_PER_SECTION << PAGE_SHIFT);
+ if (ret) {
+ mem->state = old_state;
+ break;
+ }
+ memory_notify(MEM_MAPPING_INVALID, NULL);
+ break;
+ default:
+ printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n",
+ __FUNCTION__, mem, action, action);
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+ /*
+ * For now, only notify on successful memory operations
+ */
+ if (!ret)
+ memory_notify(action, NULL);
+
+ return ret;
+}
+
+static int memory_block_change_state(struct memory_block *mem,
+ unsigned long to_state, unsigned long from_state_req)
+{
+ int ret = 0;
+ down(&mem->state_sem);
+
+ if (mem->state != from_state_req) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = memory_block_action(mem, to_state);
+ if (!ret)
+ mem->state = to_state;
+
+out:
+ up(&mem->state_sem);
+ return ret;
+}
+
+static ssize_t
+store_mem_state(struct sys_device *dev, const char *buf, size_t count)
+{
+ struct memory_block *mem;
+ unsigned int phys_section_nr;
+ int ret = -EINVAL;
+
+ mem = container_of(dev, struct memory_block, sysdev);
+ phys_section_nr = mem->phys_index;
+
+ if (!valid_section_nr(phys_section_nr))
+ goto out;
+
+ if (!strncmp(buf, "online", min((int)count, 6)))
+ ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+ else if(!strncmp(buf, "offline", min((int)count, 7)))
+ ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
+out:
+ if (ret)
+ return ret;
+ return count;
+}
+
+/*
+ * phys_device is a bad name for this. What I really want
+ * is a way to differentiate between memory ranges that
+ * are part of physical devices that constitute
+ * a complete removable unit or fru.
+ * i.e. do these ranges belong to the same physical device,
+ * s.t. if I offline all of these sections I can then
+ * remove the physical device?
+ */
+static ssize_t show_phys_device(struct sys_device *dev, char *buf)
+{
+ struct memory_block *mem =
+ container_of(dev, struct memory_block, sysdev);
+ return sprintf(buf, "%d\n", mem->phys_device);
+}
+
+SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL);
+SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
+SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
+
+#define mem_create_simple_file(mem, attr_name) \
+ sysdev_create_file(&mem->sysdev, &attr_##attr_name)
+#define mem_remove_simple_file(mem, attr_name) \
+ sysdev_remove_file(&mem->sysdev, &attr_##attr_name)
+
+/*
+ * Block size attribute stuff
+ */
+static ssize_t
+print_block_size(struct class *class, char *buf)
+{
+ return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE);
+}
+
+static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
+
+static int block_size_init(void)
+{
+ sysfs_create_file(&memory_sysdev_class.kset.kobj,
+ &class_attr_block_size_bytes.attr);
+ return 0;
+}
+
+/*
+ * Some architectures will have custom drivers to do this, and
+ * will not need to do it from userspace. The fake hot-add code
+ * as well as ppc64 will do all of their discovery in userspace
+ * and will require this interface.
+ */
+#ifdef CONFIG_ARCH_MEMORY_PROBE
+static ssize_t
+memory_probe_store(struct class *class, const char __user *buf, size_t count)
+{
+ u64 phys_addr;
+ int ret;
+
+ phys_addr = simple_strtoull(buf, NULL, 0);
+
+ ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+
+ if (ret)
+ count = ret;
+
+ return count;
+}
+static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
+
+static int memory_probe_init(void)
+{
+ sysfs_create_file(&memory_sysdev_class.kset.kobj,
+ &class_attr_probe.attr);
+ return 0;
+}
+#else
+#define memory_probe_init(...) do {} while (0)
+#endif
+
+/*
+ * Note that phys_device is optional. It is here to allow for
+ * differentiation between which *physical* devices each
+ * section belongs to...
+ */
+
+int add_memory_block(unsigned long node_id, struct mem_section *section,
+ unsigned long state, int phys_device)
+{
+ size_t size = sizeof(struct memory_block);
+ struct memory_block *mem = kmalloc(size, GFP_KERNEL);
+ int ret = 0;
+
+ if (!mem)
+ return -ENOMEM;
+
+ memset(mem, 0, size);
+
+ mem->phys_index = __section_nr(section);
+ mem->state = state;
+ init_MUTEX(&mem->state_sem);
+ mem->phys_device = phys_device;
+
+ ret = register_memory(mem, section, NULL);
+ if (!ret)
+ ret = mem_create_simple_file(mem, phys_index);
+ if (!ret)
+ ret = mem_create_simple_file(mem, state);
+ if (!ret)
+ ret = mem_create_simple_file(mem, phys_device);
+
+ return ret;
+}
+
+/*
+ * For now, we have a linear search to go find the appropriate
+ * memory_block corresponding to a particular phys_index. If
+ * this gets to be a real problem, we can always use a radix
+ * tree or something here.
+ *
+ * This could be made generic for all sysdev classes.
+ */
+struct memory_block *find_memory_block(struct mem_section *section)
+{
+ struct kobject *kobj;
+ struct sys_device *sysdev;
+ struct memory_block *mem;
+ char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
+
+ /*
+ * This only works because we know that section == sysdev->id
+ * slightly redundant with sysdev_register()
+ */
+ sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section));
+
+ kobj = kset_find_obj(&memory_sysdev_class.kset, name);
+ if (!kobj)
+ return NULL;
+
+ sysdev = container_of(kobj, struct sys_device, kobj);
+ mem = container_of(sysdev, struct memory_block, sysdev);
+
+ return mem;
+}
+
+int remove_memory_block(unsigned long node_id, struct mem_section *section,
+ int phys_device)
+{
+ struct memory_block *mem;
+
+ mem = find_memory_block(section);
+ mem_remove_simple_file(mem, phys_index);
+ mem_remove_simple_file(mem, state);
+ mem_remove_simple_file(mem, phys_device);
+ unregister_memory(mem, section, NULL);
+
+ return 0;
+}
+
+/*
+ * need an interface for the VM to add new memory regions,
+ * but without onlining it.
+ */
+int register_new_memory(struct mem_section *section)
+{
+ return add_memory_block(0, section, MEM_OFFLINE, 0);
+}
+
+int unregister_memory_section(struct mem_section *section)
+{
+ if (!valid_section(section))
+ return -EINVAL;
+
+ return remove_memory_block(0, section, 0);
+}
+
+/*
+ * Initialize the sysfs support for memory devices...
+ */
+int __init memory_dev_init(void)
+{
+ unsigned int i;
+ int ret;
+
+ memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops;
+ ret = sysdev_class_register(&memory_sysdev_class);
+
+ /*
+ * Create entries for memory sections that were found
+ * during boot and have been initialized
+ */
+ for (i = 0; i < NR_MEM_SECTIONS; i++) {
+ if (!valid_section_nr(i))
+ continue;
+ add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
+ }
+
+ memory_probe_init();
+ block_size_init();
+
+ return ret;
+}
diff -puN /dev/null include/linux/memory.h
--- /dev/null 2005-03-30 22:36:15.000000000 -0800
+++ memhotplug-dave/include/linux/memory.h 2005-08-18 14:59:48.000000000 -0700
@@ -0,0 +1,94 @@
+/*
+ * include/linux/memory.h - generic memory definition
+ *
+ * This is mainly for topological representation. We define the
+ * basic "struct memory_block" here, which can be embedded in per-arch
+ * definitions or NUMA information.
+ *
+ * Basic handling of the devices is done in drivers/base/memory.c
+ * and system devices are handled in drivers/base/sys.c.
+ *
+ * Memory blocks are exported via sysfs in the class/memory/devices/
+ * directory.
+ *
+ */
+#ifndef _LINUX_MEMORY_H_
+#define _LINUX_MEMORY_H_
+
+#include <linux/sysdev.h>
+#include <linux/node.h>
+#include <linux/compiler.h>
+
+#include <asm/semaphore.h>
+
+struct memory_block {
+ unsigned long phys_index;
+ unsigned long state;
+ /*
+ * This serializes all state change requests. It isn't
+ * held during creation because the control files are
+ * created long after the critical areas during
+ * initialization.
+ */
+ struct semaphore state_sem;
+ int phys_device; /* to which fru does this belong? */
+ void *hw; /* optional pointer to fw/hw data */
+ int (*phys_callback)(struct memory_block *);
+ struct sys_device sysdev;
+};
+
+/* These states are exposed to userspace as text strings in sysfs */
+#define MEM_ONLINE (1<<0) /* exposed to userspace */
+#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */
+#define MEM_OFFLINE (1<<2) /* exposed to userspace */
+
+/*
+ * All of these states are currently kernel-internal for notifying
+ * kernel components and architectures.
+ *
+ * For MEM_MAPPING_INVALID, all notifier chains with priority >0
+ * are called before pfn_to_page() becomes invalid. The priority=0
+ * entry is reserved for the function that actually makes
+ * pfn_to_page() stop working. Any notifiers that want to be called
+ * after that should have priority <0.
+ */
+#define MEM_MAPPING_INVALID (1<<3)
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline int memory_dev_init(void)
+{
+ return 0;
+}
+static inline int register_memory_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+static inline void unregister_memory_notifier(struct notifier_block *nb)
+{
+}
+#else
+extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
+extern int register_new_memory(struct mem_section *);
+extern int unregister_memory_section(struct mem_section *);
+extern int memory_dev_init(void);
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
+
+extern int invalidate_phys_mapping(unsigned long, unsigned long);
+struct notifier_block;
+
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+extern struct sysdev_class memory_sysdev_class;
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#define hotplug_memory_notifier(fn, pri) { \
+ static struct notifier_block fn##_mem_nb = \
+ { .notifier_call = fn, .priority = pri }; \
+ register_memory_notifier(&fn##_mem_nb); \
+}
+
+#endif /* _LINUX_MEMORY_H_ */
diff -puN include/linux/memory_hotplug.h~D0-sysfs-memory-class include/linux/memory_hotplug.h
--- memhotplug/include/linux/memory_hotplug.h~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/include/linux/memory_hotplug.h 2005-08-18 14:59:48.000000000 -0700
@@ -3,6 +3,8 @@
#include <linux/mmzone.h>
#include <linux/spinlock.h>
+#include <linux/mmzone.h>
+#include <linux/notifier.h>
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -46,6 +48,19 @@ static inline void zone_seqlock_init(str
{
seqlock_init(&zone->span_seqlock);
}
+extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
+extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
+extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+/* need some defines for these for archs that don't support it */
+extern void online_page(struct page *page);
+/* VM interface that may be used by firmware interface */
+extern int add_memory(u64 start, u64 size);
+extern int remove_memory(u64 start, u64 size);
+extern int online_pages(unsigned long, unsigned long);
+
+/* reasonably generic interface to expand the physical pages in a zone */
+extern int __add_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages);
#else /* ! CONFIG_MEMORY_HOTPLUG */
/*
* Stub functions for when hotplug is off
@@ -65,5 +80,25 @@ static inline int zone_span_seqretry(str
static inline void zone_span_writelock(struct zone *zone) {}
static inline void zone_span_writeunlock(struct zone *zone) {}
static inline void zone_seqlock_init(struct zone *zone) {}
+
+static inline int mhp_notimplemented(const char *func)
+{
+ printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
+ dump_stack();
+ return -ENOSYS;
+}
+
+static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ return mhp_notimplemented(__FUNCTION__);
+}
#endif /* ! CONFIG_MEMORY_HOTPLUG */
+static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__);
+ dump_stack();
+ return -ENOSYS;
+}
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff -puN include/linux/mm.h~D0-sysfs-memory-class include/linux/mm.h
--- memhotplug/include/linux/mm.h~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/include/linux/mm.h 2005-08-18 14:59:48.000000000 -0700
@@ -791,6 +791,7 @@ extern void free_area_init_node(int nid,
unsigned long * zones_size, unsigned long zone_start_pfn,
unsigned long *zholes_size);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
+extern void setup_per_zone_pages_min(void);
extern void mem_init(void);
extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
diff -puN mm/Kconfig~D0-sysfs-memory-class mm/Kconfig
--- memhotplug/mm/Kconfig~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/mm/Kconfig 2005-08-18 14:59:48.000000000 -0700
@@ -111,3 +111,11 @@ config SPARSEMEM_STATIC
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+ depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
diff -puN mm/Makefile~D0-sysfs-memory-class mm/Makefile
--- memhotplug/mm/Makefile~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/mm/Makefile 2005-08-18 14:59:48.000000000 -0700
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff -puN /dev/null mm/memory_hotplug.c
--- /dev/null 2005-03-30 22:36:15.000000000 -0800
+++ memhotplug-dave/mm/memory_hotplug.c 2005-08-18 14:59:48.000000000 -0700
@@ -0,0 +1,178 @@
+/*
+ * linux/mm/memory_hotplug.c
+ *
+ * Copyright (C)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/pagevec.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+ struct page *page, *ret;
+ unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+ page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+ if (page)
+ goto got_map_page;
+
+ ret = vmalloc(memmap_size);
+ if (ret)
+ goto got_map_ptr;
+
+ return NULL;
+got_map_page:
+ ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+ memset(ret, 0, memmap_size);
+
+ return ret;
+}
+
+extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+ unsigned long size);
+static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nr_pages = PAGES_PER_SECTION;
+ int nid = pgdat->node_id;
+ int zone_type;
+
+ zone_type = zone - pgdat->node_zones;
+ memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+ zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+}
+
+extern int sparse_add_one_section(struct zone *, unsigned long,
+ struct page *mem_map);
+int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nr_pages = PAGES_PER_SECTION;
+ struct page *memmap;
+ int ret;
+
+ /*
+ * This can potentially allocate memory, and does its own
+ * internal locking.
+ */
+ sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id);
+
+ pgdat_resize_lock(pgdat, &flags);
+ memmap = __kmalloc_section_memmap(nr_pages);
+ ret = sparse_add_one_section(zone, phys_start_pfn, memmap);
+ pgdat_resize_unlock(pgdat, &flags);
+
+ if (ret <= 0) {
+ /* the mem_map didn't get used */
+ if (memmap >= (struct page *)VMALLOC_START &&
+ memmap < (struct page *)VMALLOC_END)
+ vfree(memmap);
+ else
+ free_pages((unsigned long)memmap,
+ get_order(sizeof(struct page) * nr_pages));
+ }
+
+ if (ret < 0)
+ return ret;
+
+ __add_zone(zone, phys_start_pfn);
+ return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+/*
+ * Reasonably generic function for adding memory. It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i;
+ int err = 0;
+
+ for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) {
+ err = __add_section(zone, phys_start_pfn + i);
+
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static void grow_zone_span(struct zone *zone,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long old_zone_end_pfn;
+
+ zone_span_writelock(zone);
+
+ old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+ /* never shrink: measure from the new start to the furthest end */
+ zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
+ zone->zone_start_pfn;
+
+ zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long old_pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+ /* span runs from the (possibly lowered) start to the furthest end */
+ pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
+ pgdat->node_start_pfn;
+}
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long i;
+ unsigned long flags;
+ unsigned long onlined_pages = 0;
+ struct zone *zone;
+
+ /*
+ * This doesn't need a lock to do pfn_to_page().
+ * The section can't be removed here because of the
+ * memory_block->state_sem.
+ */
+ zone = page_zone(pfn_to_page(pfn));
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ grow_zone_span(zone, pfn, pfn + nr_pages);
+ grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pfn_to_page(pfn + i);
+ online_page(page);
+ onlined_pages++;
+ }
+ zone->present_pages += onlined_pages;
+
+ return 0;
+}
diff -puN mm/page_alloc.c~D0-sysfs-memory-class mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~D0-sysfs-memory-class 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/mm/page_alloc.c 2005-08-18 14:59:48.000000000 -0700
@@ -1663,7 +1663,7 @@ static void __init calculate_zone_totalp
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
struct page *page;
@@ -2385,7 +2385,7 @@ static void setup_per_zone_lowmem_reserv
* that the pages_{min,low,high} values for each zone are set correctly
* with respect to min_free_kbytes.
*/
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
_
This basically keeps us from having to extern __kmalloc_section_memmap().
The vaddr_in_vmalloc_area() helper could go in a vmalloc header, but
that header gets hard to work with, because it needs some arch-specific
macros. Just stick it in here for now, instead of creating another header.
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/mm/memory_hotplug.c | 43 --------------------
memhotplug-dave/mm/sparse.c | 74 +++++++++++++++++++++++++++++++++---
2 files changed, 70 insertions(+), 47 deletions(-)
diff -puN mm/memory_hotplug.c~D0.6-move_memmap_kmalloc_to_sparse.c mm/memory_hotplug.c
--- memhotplug/mm/memory_hotplug.c~D0.6-move_memmap_kmalloc_to_sparse.c 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/mm/memory_hotplug.c 2005-08-18 14:59:48.000000000 -0700
@@ -24,28 +24,6 @@
#include <asm/tlbflush.h>
-static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
-{
- struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * nr_pages;
-
- page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
- if (page)
- goto got_map_page;
-
- ret = vmalloc(memmap_size);
- if (ret)
- goto got_map_ptr;
-
- return NULL;
-got_map_page:
- ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
- memset(ret, 0, memmap_size);
-
- return ret;
-}
-
extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
unsigned long size);
static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
@@ -60,7 +38,7 @@ static void __add_zone(struct zone *zone
zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
}
-extern int sparse_add_one_section(struct zone *, unsigned long,
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
struct page *mem_map);
int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
@@ -69,26 +47,7 @@ int __add_section(struct zone *zone, uns
struct page *memmap;
int ret;
- /*
- * This can potentially allocate memory, and does its own
- * internal locking.
- */
- sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id);
-
- pgdat_resize_lock(pgdat, &flags);
- memmap = __kmalloc_section_memmap(nr_pages);
ret = sparse_add_one_section(zone, phys_start_pfn, memmap);
- pgdat_resize_unlock(pgdat, &flags);
-
- if (ret <= 0) {
- /* the mem_map didn't get used */
- if (memmap >= (struct page *)VMALLOC_START &&
- memmap < (struct page *)VMALLOC_END)
- vfree(memmap);
- else
- free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * nr_pages));
- }
if (ret < 0)
return ret;
diff -puN mm/sparse.c~D0.6-move_memmap_kmalloc_to_sparse.c mm/sparse.c
--- memhotplug/mm/sparse.c~D0.6-move_memmap_kmalloc_to_sparse.c 2005-08-18 14:59:48.000000000 -0700
+++ memhotplug-dave/mm/sparse.c 2005-08-18 14:59:48.000000000 -0700
@@ -5,8 +5,10 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
#include <asm/dma.h>
/*
@@ -164,6 +166,45 @@ static struct page *sparse_early_mem_map
return NULL;
}
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+ struct page *page, *ret;
+ unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+ page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+ if (page)
+ goto got_map_page;
+
+ ret = vmalloc(memmap_size);
+ if (ret)
+ goto got_map_ptr;
+
+ return NULL;
+got_map_page:
+ ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+ memset(ret, 0, memmap_size);
+
+ return ret;
+}
+
+static int vaddr_in_vmalloc_area(void *addr)
+{
+ if (addr >= (void *)VMALLOC_START &&
+ addr < (void *)VMALLOC_END)
+ return 1;
+ return 0;
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+ if (vaddr_in_vmalloc_area(memmap))
+ vfree(memmap);
+ else
+ free_pages((unsigned long)memmap,
+ get_order(sizeof(struct page) * nr_pages));
+}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -189,14 +230,37 @@ void sparse_init(void)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages)
{
- struct mem_section *ms = __pfn_to_section(start_pfn);
+ unsigned long section_nr = pfn_to_section_nr(start_pfn);
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct mem_section *ms;
+ struct page *memmap;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * no locking for this, because it does its own
+ * plus, it does a kmalloc
+ */
+ sparse_index_init(section_nr, pgdat->node_id);
+ memmap = __kmalloc_section_memmap(nr_pages);
- if (ms->section_mem_map & SECTION_MARKED_PRESENT)
- return -EEXIST;
+ pgdat_resize_lock(pgdat, &flags);
+ ms = __pfn_to_section(start_pfn);
+ if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+ ret = -EEXIST;
+ goto out;
+ }
ms->section_mem_map |= SECTION_MARKED_PRESENT;
- return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+ ret = sparse_init_one_section(ms, section_nr, memmap);
+out:
+ pgdat_resize_unlock(pgdat, &flags);
+ /* unused map (incl. the -EEXIST case) is freed after dropping the lock */
+ if (ret <= 0)
+ __kfree_section_memmap(memmap, nr_pages);
+ return ret;
}
_
A little helper that we use in the hotplug code.
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/include/linux/mmzone.h | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+)
diff -puN include/linux/mmzone.h~C3-__section_nr include/linux/mmzone.h
--- memhotplug/include/linux/mmzone.h~C3-__section_nr 2005-08-18 14:59:45.000000000 -0700
+++ memhotplug-dave/include/linux/mmzone.h 2005-08-18 14:59:45.000000000 -0700
@@ -511,6 +511,31 @@ static inline struct mem_section *__nr_t
}
/*
+ * Although written for the SPARSEMEM_EXTREME case, this happens
+ * to also work for the flat array case because
+ * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ */
+static inline int __section_nr(struct mem_section* ms)
+{
+ unsigned long root_nr;
+ struct mem_section* root;
+
+ for (root_nr = 0;
+ root_nr < NR_MEM_SECTIONS;
+ root_nr += SECTIONS_PER_ROOT) {
+ root = __nr_to_section(root_nr);
+
+ if (!root)
+ continue;
+
+ if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
+ break;
+ }
+ /* root_nr already counts sections (loop step), so do not rescale it */
+ return root_nr + (ms - root);
+}
+
+/*
* We use the lower bits of the mem_map pointer to store
* a little bit of information. There should be at least
* 3 bits here due to 32-bit alignment.
_
From: IWAMOTO Toshihiro <[email protected]>
> I found the tests does not work well with Dave's patchset.
> I've found the followings:
>
> - setup_per_zone_pages_min() calls should be added in
> capture_page_range() and online_pages()
> - lru_add_drain() should be called before try_to_migrate_pages()
The following patch deals with the first item.
Signed-off-by: IWAMOTO Toshihiro <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/mm/memory_hotplug.c | 2 ++
1 files changed, 2 insertions(+)
diff -puN mm/memory_hotplug.c~D0.7-call_setup_per_zone_pages_min_after_memory_size_change mm/memory_hotplug.c
--- memhotplug/mm/memory_hotplug.c~D0.7-call_setup_per_zone_pages_min_after_memory_size_change 2005-08-18 14:59:49.000000000 -0700
+++ memhotplug-dave/mm/memory_hotplug.c 2005-08-18 14:59:49.000000000 -0700
@@ -133,5 +133,7 @@ int online_pages(unsigned long pfn, unsi
}
zone->present_pages += onlined_pages;
+ setup_per_zone_pages_min();
+
return 0;
}
_
See the "fixup bad_range()" patch for more information, but this
actually creates the lock to protect things making assumptions
about a zone's size staying constant at runtime.
Signed-off-by: Dave Hansen <[email protected]>
---
memhotplug-dave/include/linux/memory_hotplug.h | 39 +++++++++++++++++++++++--
memhotplug-dave/include/linux/mmzone.h | 15 +++++++++
memhotplug-dave/mm/page_alloc.c | 19 ++++++++----
3 files changed, 66 insertions(+), 7 deletions(-)
diff -puN include/linux/memory_hotplug.h~C6-zone-span_seqlock include/linux/memory_hotplug.h
--- memhotplug/include/linux/memory_hotplug.h~C6-zone-span_seqlock 2005-09-02 12:42:11.000000000 -0700
+++ memhotplug-dave/include/linux/memory_hotplug.h 2005-09-02 13:43:10.000000000 -0700
@@ -16,13 +16,36 @@ void pgdat_resize_lock(struct pglist_dat
static inline
void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
{
- spin_lock_irqrestore(&pgdat->node_size_lock, *flags);
+ spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
}
static inline
void pgdat_resize_init(struct pglist_data *pgdat)
{
spin_lock_init(&pgdat->node_size_lock);
}
+/*
+ * Zone resizing functions
+ */
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+ return read_seqbegin(&zone->span_seqlock);
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+ return read_seqretry(&zone->span_seqlock, iv);
+}
+static inline void zone_span_writelock(struct zone *zone)
+{
+ write_seqlock(&zone->span_seqlock);
+}
+static inline void zone_span_writeunlock(struct zone *zone)
+{
+ write_sequnlock(&zone->span_seqlock);
+}
+static inline void zone_seqlock_init(struct zone *zone)
+{
+ seqlock_init(&zone->span_seqlock);
+}
#else /* ! CONFIG_MEMORY_HOTPLUG */
/*
* Stub functions for when hotplug is off
@@ -30,5 +53,17 @@ void pgdat_resize_init(struct pglist_dat
static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
-#endif
+
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+ return 0;
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+ return 0;
+}
+static inline void zone_span_writelock(struct zone *zone) {}
+static inline void zone_span_writeunlock(struct zone *zone) {}
+static inline void zone_seqlock_init(struct zone *zone) {}
+#endif /* ! CONFIG_MEMORY_HOTPLUG */
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff -puN include/linux/mmzone.h~C6-zone-span_seqlock include/linux/mmzone.h
--- memhotplug/include/linux/mmzone.h~C6-zone-span_seqlock 2005-09-02 12:42:11.000000000 -0700
+++ memhotplug-dave/include/linux/mmzone.h 2005-09-02 12:42:11.000000000 -0700
@@ -12,6 +12,7 @@
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
+#include <linux/seqlock.h>
#include <asm/atomic.h>
/* Free memory management - zoned buddy allocator. */
@@ -137,6 +138,10 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* see spanned/present_pages for more description */
+ seqlock_t span_seqlock;
+#endif
struct free_area free_area[MAX_ORDER];
@@ -220,6 +225,16 @@ struct zone {
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
+ /*
+ * zone_start_pfn, spanned_pages and present_pages are all
+ * protected by span_seqlock. It is a seqlock because it has
+ * to be read outside of zone->lock, and it is done in the main
+ * allocator path. But, it is written quite infrequently.
+ *
+ * The lock is declared along with zone->lock because it is
+ * frequently read in proximity to zone->lock. It's good to
+ * give them a chance of being in the same cacheline.
+ */
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
diff -puN mm/page_alloc.c~C6-zone-span_seqlock mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~C6-zone-span_seqlock 2005-09-02 12:42:11.000000000 -0700
+++ memhotplug-dave/mm/page_alloc.c 2005-09-02 13:43:10.000000000 -0700
@@ -32,6 +32,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
@@ -79,12 +80,19 @@ unsigned long __initdata nr_all_pages;
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
- if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
- return 1;
- if (page_to_pfn(page) < zone->zone_start_pfn)
- return 1;
+ int ret = 0;
+ unsigned seq;
+ unsigned long pfn = page_to_pfn(page);
- return 0;
+ do {
+ seq = zone_span_seqbegin(zone);
+ if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+ ret = 1;
+ else if (pfn < zone->zone_start_pfn)
+ ret = 1;
+ } while (zone_span_seqretry(zone, seq));
+
+ return ret;
}
static int page_is_consistent(struct zone *zone, struct page *page)
@@ -1970,6 +1978,7 @@ static void __init free_area_init_core(s
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
+ zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
_
Dave Hansen <[email protected]> wrote:
>
> + for (i = 0; i < PAGES_PER_SECTION; i++) {
> + if (PageReserved(first_page+i))
> + continue;
How intimate do these patches get with PageReserved()? Bear in mind that
we're slowly working toward making PageReserved go away.
On Fri, 2005-09-02 at 15:13 -0700, Andrew Morton wrote:
> Dave Hansen <[email protected]> wrote:
> >
> > + for (i = 0; i < PAGES_PER_SECTION; i++) {
> > + if (PageReserved(first_page+i))
> > + continue;
>
> How intimate do these patches get with PageReserved()? Bear in mind that
> we're slowly working toward making PageReserved go away.
It's basically the same way that the init code uses it. When
initialized, a struct page has it set. In theory, an architecture could
decide to keep the bit set when it is doing online_pages(). However, I
don't think any do that today. Nobody would really notice if we killed
that. That check could probably instead be something like
page_is_ram().
-- Dave
On Friday 02 September 2005 23:56, Dave Hansen wrote:
>
> A little helper that we use in the hotplug code.
>
> Signed-off-by: Dave Hansen <[email protected]>
> ---
>
> memhotplug-dave/include/linux/mmzone.h | 25 +++++++++++++++++++++++++
> 1 files changed, 25 insertions(+)
>
> diff -puN include/linux/mmzone.h~C3-__section_nr include/linux/mmzone.h
> --- memhotplug/include/linux/mmzone.h~C3-__section_nr 2005-08-18 14:59:45.000000000 -0700
> +++ memhotplug-dave/include/linux/mmzone.h 2005-08-18 14:59:45.000000000 -0700
> @@ -511,6 +511,31 @@ static inline struct mem_section *__nr_t
> }
>
> /*
> + * Although written for the SPARSEMEM_EXTREME case, this happens
> + * to also work for the flat array case becase
> + * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
> + */
> +static inline int __section_nr(struct mem_section* ms)
> +{
> + unsigned long root_nr;
> + struct mem_section* root;
> +
> + for (root_nr = 0;
> + root_nr < NR_MEM_SECTIONS;
> + root_nr += SECTIONS_PER_ROOT) {
> + root = __nr_to_section(root_nr);
> +
> + if (!root)
> + continue;
> +
> + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
> + break;
> + }
> +
> + return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
> +}
> +
> +/*
isn't it too much for the inlining?
--
vda
Dave Hansen <[email protected]> wrote:
>
> --- memhotplug/include/asm-x86_64/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
> +++ memhotplug-dave/include/asm-x86_64/mmzone.h 2005-08-18 14:59:43.000000000 -0700
> @@ -38,8 +38,6 @@ static inline __attribute__((pure)) int
>
> #ifdef CONFIG_DISCONTIGMEM
>
> -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
> -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr))
>
> /* AK: this currently doesn't deal with invalid addresses. We'll see
> if the 2.5 kernel doesn't pass them
> _
What's this bit doing here? It breaks the x86_64 build all over the place.
I'll drop that chunk and see how we go...
On Wed, 2005-09-07 at 02:37 -0700, Andrew Morton wrote:
> Dave Hansen <[email protected]> wrote:
> >
> > --- memhotplug/include/asm-x86_64/mmzone.h~C0-kill-local_mapnr 2005-08-18 14:59:43.000000000 -0700
> > +++ memhotplug-dave/include/asm-x86_64/mmzone.h 2005-08-18 14:59:43.000000000 -0700
> > @@ -38,8 +38,6 @@ static inline __attribute__((pure)) int
> >
> > #ifdef CONFIG_DISCONTIGMEM
> >
> > -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT)
> > -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr))
> >
> > /* AK: this currently doesn't deal with invalid addresses. We'll see
> > if the 2.5 kernel doesn't pass them
> > _
>
> What's this bit doing here? It breaks the x86_64 build all over the place.
>
> I'll drop that chunk and see how we go...
That could have easily been some merge borkage on my part. I don't
think that hunk is valid, so dropping it is the right move.
-- Dave