2006-02-10 14:21:41

by Yasunori Goto

Subject: [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)


This patch initializes the wait table and zonelists for a new pgdat.
When a new node is added, free_area_init_node() is called to initialize
its pgdat, but in that case the wait table must be allocated by kmalloc
(not bootmem). The zonelists are also accessed constantly by every other
process, so stop_machine_run() is used to update them safely.


Signed-off-by: Dave Hansen <[email protected]>
Signed-off-by: Hiroyuki Kamezawa <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

Index: pgdat2/mm/page_alloc.c
===================================================================
--- pgdat2.orig/mm/page_alloc.c 2006-02-10 17:02:22.000000000 +0900
+++ pgdat2/mm/page_alloc.c 2006-02-10 17:02:34.000000000 +0900
@@ -37,6 +37,7 @@
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
+#include <linux/stop_machine.h>

#include <asm/tlbflush.h>
#include "internal.h"
@@ -2071,18 +2072,24 @@ void __init setup_per_cpu_pageset(void)
static __meminit
void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
- int i;
+ int i, hotadd = (system_state == SYSTEM_RUNNING);
struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long allocsize;

/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
+ if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
+ zone_size_pages = PAGES_PER_SECTION << 2;
zone->wait_table_size = wait_table_size(zone_size_pages);
zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, zone->wait_table_size
- * sizeof(wait_queue_head_t));
+ allocsize = zone->wait_table_size * sizeof(wait_queue_head_t);
+ if (hotadd)
+ zone->wait_table = kmalloc(allocsize, GFP_KERNEL);
+ else
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, allocsize);

for(i = 0; i < zone->wait_table_size; ++i)
init_waitqueue_head(zone->wait_table + i);
@@ -2111,7 +2118,6 @@ static __meminit void init_currently_emp
{
struct pglist_data *pgdat = zone->zone_pgdat;

- zone_wait_table_init(zone, size);
pgdat->nr_zones = zone_idx(zone) + 1;

zone->zone_mem_map = pfn_to_page(zone_start_pfn);
@@ -2120,6 +2126,7 @@ static __meminit void init_currently_emp
memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);

zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+ zone->spanned_pages = size;
}

/*
@@ -2175,6 +2182,7 @@ void __meminit free_area_init_core(struc
continue;

zonetable_add(zone, nid, j, zone_start_pfn, size);
+ zone_wait_table_init(zone, size);
init_currently_empty_zone(zone, zone_start_pfn, size);
zone_start_pfn += size;
}
@@ -2818,3 +2826,54 @@ void *__init alloc_large_system_hash(con

return table;
}
+
+static inline int zone_previously_initialized(struct zone *zone)
+{
+ if (zone->wait_table_size)
+ return 1;
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __build_all_zonelists(void *dummy)
+{
+ int i;
+ for_each_online_node(i)
+ build_zonelists(NODE_DATA(i));
+ /* XXX: Cpuset must be updated when node is hotplugged. */
+ return 0;
+}
+
+DEFINE_SPINLOCK(zone_init_lock);
+int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long size_pages)
+{
+ int ret = 0;
+ unsigned long flags;
+ spin_lock_irqsave(&zone_init_lock, flags);
+ if (zone_previously_initialized(zone)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ zone_wait_table_init(zone, size_pages);
+ printk(KERN_DEBUG "hot add zone init %lx %lx.....\n",
+ phys_start_pfn, size_pages);
+ init_currently_empty_zone(zone, phys_start_pfn, size_pages);
+ zone_pcp_init(zone);
+
+ /*
+ * This is an awfully blunt way to do this. But, the
+ * zonelists are accessed many times over large areas
+ * of performance-critical code in the allocator.
+ * That makes it very hard to get a conventional lock
+ * to work. Think of this as a rw lock with a huge
+ * write cost.
+ */
+ stop_machine_run(__build_all_zonelists, zone->zone_pgdat, NR_CPUS);
+out:
+ spin_unlock_irqrestore(&zone_init_lock, flags);
+ return ret;
+}
+#endif
Index: pgdat2/include/linux/mmzone.h
===================================================================
--- pgdat2.orig/include/linux/mmzone.h 2006-02-10 16:59:51.000000000 +0900
+++ pgdat2/include/linux/mmzone.h 2006-02-10 17:02:34.000000000 +0900
@@ -403,7 +403,9 @@ static inline struct zone *next_zone(str

static inline int populated_zone(struct zone *zone)
{
- return (!!zone->present_pages);
+ /* When hot-dadd, present page is 0 at this point.
+ So check spanned_pages instead of present_pages */
+ return (!!zone->spanned_pages);
}

static inline int is_highmem_idx(int idx)
Index: pgdat2/mm/memory_hotplug.c
===================================================================
--- pgdat2.orig/mm/memory_hotplug.c 2006-02-10 16:59:51.000000000 +0900
+++ pgdat2/mm/memory_hotplug.c 2006-02-10 17:02:34.000000000 +0900
@@ -48,6 +48,8 @@ static int __add_section(struct zone *zo

ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

+ hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
if (ret < 0)
return ret;


--
Yasunori Goto



2006-02-10 16:32:22

by Dave Hansen

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

On Fri, 2006-02-10 at 23:20 +0900, Yasunori Goto wrote:
>
> static __meminit
> void zone_wait_table_init(struct zone *zone, unsigned long
> zone_size_pages)
> {
> - int i;
> + int i, hotadd = (system_state == SYSTEM_RUNNING);
> struct pglist_data *pgdat = zone->zone_pgdat;
> + unsigned long allocsize;
>
> /*
> * The per-page waitqueue mechanism uses hashed waitqueues
> * per zone.
> */
> + if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
> + zone_size_pages = PAGES_PER_SECTION << 2;

I don't think I understand this calculation. You online only 4 sections
worth of pages?

-- Dave

2006-02-10 16:33:32

by Dave Hansen

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

On Fri, 2006-02-10 at 23:20 +0900, Yasunori Goto wrote:
> This patch initializes the wait table and zonelists for a new pgdat.
> When a new node is added, free_area_init_node() is called to initialize
> its pgdat, but in that case the wait table must be allocated by kmalloc
> (not bootmem). The zonelists are also accessed constantly by every other
> process, so stop_machine_run() is used to update them safely.

I do notice that you're not using init_currently_empty_zone() to
initialize currently empty zones. Why?

-- Dave

2006-02-10 21:59:27

by Joel Schopp

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

> + /* When hot-dadd, present page is 0 at this point.
> + So check spanned_pages instead of present_pages */
> + return (!!zone->spanned_pages);
> }

Typo in comment.

2006-02-11 04:16:30

by Yasunori Goto

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)



> On Fri, 2006-02-10 at 23:20 +0900, Yasunori Goto wrote:
> >
> > static __meminit
> > void zone_wait_table_init(struct zone *zone, unsigned long
> > zone_size_pages)
> > {
> > - int i;
> > + int i, hotadd = (system_state == SYSTEM_RUNNING);
> > struct pglist_data *pgdat = zone->zone_pgdat;
> > + unsigned long allocsize;
> >
> > /*
> > * The per-page waitqueue mechanism uses hashed waitqueues
> > * per zone.
> > */
> > + if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
> > + zone_size_pages = PAGES_PER_SECTION << 2;
>
> I don't think I understand this calculation. You online only 4 sections
> worth of pages?

Ummmmm.
I realize that I've forgotten many things about this patch
because it sat in storage for so long.
At least this part does look strange, indeed.
I need to shake my brain to recall it. :-(


--
Yasunori Goto



2006-02-11 10:59:05

by Kamezawa Hiroyuki

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

Yasunori Goto wrote:
>
>> On Fri, 2006-02-10 at 23:20 +0900, Yasunori Goto wrote:
>>> static __meminit
>>> void zone_wait_table_init(struct zone *zone, unsigned long
>>> zone_size_pages)
>>> {
>>> - int i;
>>> + int i, hotadd = (system_state == SYSTEM_RUNNING);
>>> struct pglist_data *pgdat = zone->zone_pgdat;
>>> + unsigned long allocsize;
>>>
>>> /*
>>> * The per-page waitqueue mechanism uses hashed waitqueues
>>> * per zone.
>>> */
>>> + if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
>>> + zone_size_pages = PAGES_PER_SECTION << 2;
>> I don't think I understand this calculation. You online only 4 sections
>> worth of pages?
>
> Ummmmm.
> I realize that I've forgotten many things about this patch
> because it sat in storage for so long.
> At least this part does look strange, indeed.
> I need to shake my brain to recall it. :-(

Ah, I'm not sure, but it was because I didn't have a patch for resizing the
zone's wait table, and resizing it looked impossible. The code above was just
a quick hack for the case where a zone is initialized with only 1 section.
How large should it be? Or should it be resizable? That needs to be discussed.
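
For reference, the stock wait_table_size() in mm/page_alloc.c of this era looks
roughly like this (reproduced from memory, so the clamp values may differ
slightly between versions):
==
#define PAGES_PER_WAITQUEUE	256

static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return max(size, 4UL);
}
==
So a zone onlined with a single section gets a tiny table: on x86-64 with the
default 128MB sections, PAGES_PER_SECTION is 32768, which yields only
32768 / 256 = 128 hash entries. The PAGES_PER_SECTION << 2 hack above merely
bumps that to 512.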

-- Kame


2006-02-14 07:35:36

by Yasunori Goto

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

Sorry for the late response....

> On Fri, 2006-02-10 at 23:20 +0900, Yasunori Goto wrote:
> > This patch initializes the wait table and zonelists for a new pgdat.
> > When a new node is added, free_area_init_node() is called to initialize
> > its pgdat, but in that case the wait table must be allocated by kmalloc
> > (not bootmem). The zonelists are also accessed constantly by every other
> > process, so stop_machine_run() is used to update them safely.
>
> I do notice that you're not using init_currently_empty_zone() to
> initialize currently empty zones. Why?

Did you mean this part of free_area_init_core()?
Now I recall how this works.

:
if (!size)
continue;

zonetable_add(zone, nid, j, zone_start_pfn, size);
zone_wait_table_init(zone, size);
init_currently_empty_zone(zone, zone_start_pfn, size);
zone_start_pfn += size;
:

All of zone_size[] is set to zero by new_pgdat_init(),
so init_currently_empty_zone() is indeed not called at this point.

However, when __add_section() is later called for the first section on
the node, hot_add_zone_init() is called, and it in turn calls
init_currently_empty_zone():

--- pgdat2.orig/mm/memory_hotplug.c 2006-02-10 16:59:51.000000000 +0900
+++ pgdat2/mm/memory_hotplug.c 2006-02-10 17:02:34.000000000 +0900
@@ -48,6 +48,8 @@ static int __add_section(struct zone *zo

ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

+ hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
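
In call-sequence form, onlining the first section of a new node looks roughly
like this (a sketch; the entry points above __add_section() depend on each
arch's add_memory() implementation):
==
add_memory(nid, start, size)                  /* arch-specific */
  -> __add_pages(zone, phys_start_pfn, nr_pages)
       -> __add_section(zone, phys_start_pfn, nr_pages)
            -> sparse_add_one_section(zone, phys_start_pfn, nr_pages)
            -> hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION)
                 -> zone_wait_table_init()    /* kmalloc path, hotadd */
                 -> init_currently_empty_zone()
                 -> zone_pcp_init()
                 -> stop_machine_run(__build_all_zonelists, ...)
==
For every later section, zone_previously_initialized() sees the non-zero
wait_table_size and hot_add_zone_init() bails out with -EEXIST.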

Thanks.

--
Yasunori Goto


2006-02-14 13:25:32

by Yasunori Goto

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

> Yasunori Goto wrote:
> >
> >> On Fri, 2006-02-10 at 23:20 +0900, Yasunori Goto wrote:
> >>> static __meminit
> >>> void zone_wait_table_init(struct zone *zone, unsigned long
> >>> zone_size_pages)
> >>> {
> >>> - int i;
> >>> + int i, hotadd = (system_state == SYSTEM_RUNNING);
> >>> struct pglist_data *pgdat = zone->zone_pgdat;
> >>> + unsigned long allocsize;
> >>>
> >>> /*
> >>> * The per-page waitqueue mechanism uses hashed waitqueues
> >>> * per zone.
> >>> */
> >>> + if (hotadd && (zone_size_pages == PAGES_PER_SECTION))
> >>> + zone_size_pages = PAGES_PER_SECTION << 2;
> >> I don't think I understand this calculation. You online only 4 sections
> >> worth of pages?
> >
> > Ummmmm.
> > I realize that I've forgotten many things about this patch
> > because it sat in storage for so long.
> > At least this part does look strange, indeed.
> > I need to shake my brain to recall it. :-(
>
> Ah, I'm not sure, but it was because I didn't have a patch for resizing the
> zone's wait table, and resizing it looked impossible. The code above was just
> a quick hack for the case where a zone is initialized with only 1 section.
> How large should it be? Or should it be resizable? That needs to be discussed.

OK. How about this version of the patch? I changed wait_table_size() for it.
If there is no objection, I'll continue....

Thanks.

-----------------------------

This patch initializes the wait table and zonelists for a new pgdat.
When a new node is added, free_area_init_node() is called to initialize
its pgdat, but in that case the wait table must be allocated by kmalloc
(not bootmem). The zonelists are also accessed constantly by every other
process, so stop_machine_run() is used to update them safely.


Signed-off-by: Dave Hansen <[email protected]>
Signed-off-by: Hiroyuki Kamezawa <[email protected]>
Signed-off-by: Yasunori Goto <[email protected]>

Index: pgdat2/mm/page_alloc.c
===================================================================
--- pgdat2.orig/mm/page_alloc.c 2006-02-14 18:50:37.000000000 +0900
+++ pgdat2/mm/page_alloc.c 2006-02-14 22:11:53.000000000 +0900
@@ -37,6 +37,7 @@
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
+#include <linux/stop_machine.h>

#include <asm/tlbflush.h>
#include "internal.h"
@@ -1789,6 +1790,24 @@ static inline unsigned long wait_table_s

pages /= PAGES_PER_WAITQUEUE;

+ /*
+ * In the memory hotplug case, the number of pages covers just
+ * the first section, but there is no information about how
+ * much memory will be added in the future, and this hash
+ * table cannot be reallocated later. Since hotplug implies
+ * more memory is coming, a bigger table is probably better,
+ * but kmalloc may fail for anything above an order-3
+ * allocation. So take the largest wait_table size that still
+ * fits in an order-3 (8-page) allocation.
+ */
+ if (system_state == SYSTEM_RUNNING) {
+ unsigned long queue_head_size = 1;
+ while (queue_head_size < sizeof(wait_queue_head_t))
+ queue_head_size <<= 1;
+
+ pages = (1 << (PAGE_SHIFT + 3)) / queue_head_size;
+ }
+
while (size < pages)
size <<= 1;

@@ -2071,8 +2090,9 @@ void __init setup_per_cpu_pageset(void)
static __meminit
void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
- int i;
+ int i, hotadd = (system_state == SYSTEM_RUNNING);
struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long allocsize;

/*
* The per-page waitqueue mechanism uses hashed waitqueues
@@ -2080,9 +2100,12 @@ void zone_wait_table_init(struct zone *z
*/
zone->wait_table_size = wait_table_size(zone_size_pages);
zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
- zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, zone->wait_table_size
- * sizeof(wait_queue_head_t));
+ allocsize = zone->wait_table_size * sizeof(wait_queue_head_t);
+ if (hotadd)
+ zone->wait_table = kmalloc(allocsize, GFP_KERNEL);
+ else
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, allocsize);

for(i = 0; i < zone->wait_table_size; ++i)
init_waitqueue_head(zone->wait_table + i);
@@ -2120,6 +2143,7 @@ static __meminit void init_currently_emp
memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);

zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+ zone->spanned_pages = size;
}

/*
@@ -2818,3 +2842,53 @@ void *__init alloc_large_system_hash(con

return table;
}
+
+static inline int zone_previously_initialized(struct zone *zone)
+{
+ if (zone->wait_table_size)
+ return 1;
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __build_all_zonelists(void *dummy)
+{
+ int i;
+ for_each_online_node(i)
+ build_zonelists(NODE_DATA(i));
+ /* XXX: Cpuset must be updated when node is hotplugged. */
+ return 0;
+}
+
+DEFINE_SPINLOCK(zone_init_lock);
+int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long size_pages)
+{
+ int ret = 0;
+ unsigned long flags;
+ spin_lock_irqsave(&zone_init_lock, flags);
+ if (zone_previously_initialized(zone)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ printk(KERN_DEBUG "hot add zone init %lx %lx.....\n",
+ phys_start_pfn, size_pages);
+ init_currently_empty_zone(zone, phys_start_pfn, size_pages);
+ zone_pcp_init(zone);
+
+ /*
+ * This is an awfully blunt way to do this. But, the
+ * zonelists are accessed many times over large areas
+ * of performance-critical code in the allocator.
+ * That makes it very hard to get a conventional lock
+ * to work. Think of this as a rw lock with a huge
+ * write cost.
+ */
+ stop_machine_run(__build_all_zonelists, zone->zone_pgdat, NR_CPUS);
+out:
+ spin_unlock_irqrestore(&zone_init_lock, flags);
+ return ret;
+}
+#endif
Index: pgdat2/include/linux/mmzone.h
===================================================================
--- pgdat2.orig/include/linux/mmzone.h 2006-02-14 18:14:59.000000000 +0900
+++ pgdat2/include/linux/mmzone.h 2006-02-14 18:50:44.000000000 +0900
@@ -403,7 +403,9 @@ static inline struct zone *next_zone(str

static inline int populated_zone(struct zone *zone)
{
- return (!!zone->present_pages);
+ /* When hot-added, present_pages is 0 at this point,
+ so check spanned_pages instead of present_pages. */
+ return (!!zone->spanned_pages);
}

static inline int is_highmem_idx(int idx)
Index: pgdat2/mm/memory_hotplug.c
===================================================================
--- pgdat2.orig/mm/memory_hotplug.c 2006-02-14 18:50:12.000000000 +0900
+++ pgdat2/mm/memory_hotplug.c 2006-02-14 22:10:12.000000000 +0900
@@ -48,6 +48,8 @@ static int __add_section(struct zone *zo

ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

+ hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
if (ret < 0)
return ret;


--
Yasunori Goto



2006-02-15 01:06:32

by Kamezawa Hiroyuki

Subject: Re: [Lhms-devel] [RFC/PATCH: 002/010] Memory hotplug for new nodes with pgdat allocation. (Wait table and zonelists initialization)

Yasunori Goto wrote:
> + if (system_state == SYSTEM_RUNNING) {
> + unsigned long queue_head_size = 1;
> + while (queue_head_size < sizeof(wait_queue_head_t))
> + queue_head_size <<= 1;
> +
> + pages = (1 << (PAGE_SHIFT + 3)) / queue_head_size;
> + }
> +
We have to kmalloc() the wait_table after this, and
I don't think kmalloc() will always succeed in allocating multiple contiguous pages.
How about allocating the wait_table like this?
==
size = 4096; /* 4096 is the current maximum table size */
while (size) {
	wait_table = kmalloc(size * sizeof(wait_queue_head_t), GFP_KERNEL);
	if (wait_table)
		break;
	size = size >> 1;
}
zone->wait_table_size = size;
zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
zone->wait_table = wait_table;
==
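
To put numbers on this: wait_queue_head_t is a couple of dozen bytes, so a full
4096-entry table is a multi-page allocation (roughly order 5 with 4KB pages),
which is exactly the kind of request that can fail on a fragmented machine.
Folded into the hot-add branch of zone_wait_table_init(), the retry could look
like this (an untested sketch; hotadd, allocsize and the field names are as in
the patch above):
==
if (hotadd) {
	unsigned long nr = 4096;	/* current wait_table_size() maximum */

	do {
		allocsize = nr * sizeof(wait_queue_head_t);
		zone->wait_table = kmalloc(allocsize, GFP_KERNEL);
	} while (!zone->wait_table && (nr >>= 1));

	if (!zone->wait_table)
		return;			/* XXX: should propagate -ENOMEM */

	/* the hash size/bits must match what was actually allocated */
	zone->wait_table_size = nr;
	zone->wait_table_bits = wait_table_bits(nr);
}
==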


-- Kame