Memory balloon drivers can allocate a large amount of memory which
is not movable but could be freed to accomodate memory hotplug remove.
Prior to calling the memory hotplug notifier chain the memory in the
pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
isolation will not proceed, causing the memory removal for that page
range to fail.
Rather than failing pageblock isolation if the the migrateteype is not
MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
are owned by a registered balloon driver (or other entity) using a
notifier chain. If all of the non-movable pages are owned by a balloon,
they can be freed later through the memory notifier chain and the range
can still be isolated in set_migratetype_isolate().
Signed-off-by: Robert Jennings <[email protected]>
---
drivers/base/memory.c | 19 +++++++++++++++++++
include/linux/memory.h | 26 ++++++++++++++++++++++++++
mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++++++++++-------
3 files changed, 83 insertions(+), 7 deletions(-)
Index: b/drivers/base/memory.c
===================================================================
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
}
EXPORT_SYMBOL(unregister_memory_notifier);
+static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
+
+int register_memory_isolate_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&memory_isolate_chain, nb);
+}
+EXPORT_SYMBOL(register_memory_isolate_notifier);
+
+void unregister_memory_isolate_notifier(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
+}
+EXPORT_SYMBOL(unregister_memory_isolate_notifier);
+
/*
* register_memory - Setup a sysfs device for a memory block
*/
@@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
return blocking_notifier_call_chain(&memory_chain, val, v);
}
+int memory_isolate_notify(unsigned long val, void *v)
+{
+ return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
+}
+
/*
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
* OK to have direct references to sparsemem variables in here.
Index: b/include/linux/memory.h
===================================================================
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -50,6 +50,18 @@ struct memory_notify {
int status_change_nid;
};
+/*
+ * During pageblock isolation, count the number of pages in the
+ * range [start_pfn, start_pfn + nr_pages)
+ */
+#define MEM_ISOLATE_COUNT (1<<0)
+
+struct memory_isolate_notify {
+ unsigned long start_pfn;
+ unsigned int nr_pages;
+ unsigned int pages_found;
+};
+
struct notifier_block;
struct mem_section;
@@ -76,14 +88,28 @@ static inline int memory_notify(unsigned
{
return 0;
}
+static inline int register_memory_isolate_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
+{
+}
+static inline int memory_isolate_notify(unsigned long val, void *v)
+{
+ return 0;
+}
#else
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
+extern int register_memory_isolate_notifier(struct notifier_block *nb);
+extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
extern int register_new_memory(int, struct mem_section *);
extern int unregister_memory_section(struct mem_section *);
extern int memory_dev_init(void);
extern int remove_memory_block(unsigned long, struct mem_section *, int);
extern int memory_notify(unsigned long val, void *v);
+extern int memory_isolate_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(struct mem_section *);
#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
enum mem_add_context { BOOT, HOTPLUG };
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
+#include <linux/memory.h>
#include <trace/events/kmem.h>
#include <asm/tlbflush.h>
@@ -4985,23 +4986,53 @@ void set_pageblock_flags_group(struct pa
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
+ unsigned long flags, pfn, iter;
+ unsigned long immobile = 0;
+ struct memory_isolate_notify arg;
+ int notifier_ret;
int ret = -EBUSY;
int zone_idx;
zone = page_zone(page);
zone_idx = zone_idx(zone);
+
spin_lock_irqsave(&zone->lock, flags);
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
+ zone_idx == ZONE_MOVABLE) {
+ ret = 0;
+ goto out;
+ }
+
+ pfn = page_to_pfn(page);
+ arg.start_pfn = pfn;
+ arg.nr_pages = pageblock_nr_pages;
+ arg.pages_found = 0;
+
/*
- * In future, more migrate types will be able to be isolation target.
+ * The pageblock can be isolated even if the migrate type is
+ * not *_MOVABLE. The memory isolation notifier chain counts
+ * the number of pages in this pageblock that can be freed later
+ * through the memory notifier chain. If all of the pages are
+ * accounted for, isolation can continue.
*/
- if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
- zone_idx != ZONE_MOVABLE)
+ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+ notifier_ret = notifier_to_errno(notifier_ret);
+ if (notifier_ret || !arg.pages_found)
goto out;
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
- ret = 0;
+
+ for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
+ if (page_count(pfn_to_page(iter)))
+ immobile++;
+
+ if (arg.pages_found == immobile)
+ ret = 0;
+
out:
+ if (!ret) {
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
drain_all_pages();
The Collaborative Memory Manager (CMM) module allocates individual pages
over time that are not migratable. On a long running system this can
severely impact the ability to find enough pages to support a hotplug
memory remove operation.
This patch adds a memory isolation notifier and a memory hotplug notifier.
The memory isolation notifier will return the number of pages found
in the range specified. This is used to determine if all of the used
pages in a pageblock are owned by the balloon (or other entities in
the notifier chain). The hotplug notifier will free pages in the range
which is to be removed. The priority of this hotplug notifier is low
so that it will be called near last, this helps avoids removing loaned
pages in operations that fail due to other handlers.
CMM activity will be halted when hotplug remove operations are active
and resume activity after a delay period to allow the hypervisor time
to adjust.
Signed-off-by: Robert Jennings <[email protected]>
---
Minor update to cmm_count_pages() to account for changes in
struct memory_isolate_notify.
---
arch/powerpc/platforms/pseries/cmm.c | 207 ++++++++++++++++++++++++++++++++++-
1 file changed, 201 insertions(+), 6 deletions(-)
Index: b/arch/powerpc/platforms/pseries/cmm.c
===================================================================
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -38,19 +38,28 @@
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
+#include <linux/memory.h>
#include "plpar_wrappers.h"
#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15
static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
static unsigned int oom_kb = CMM_OOM_KB;
static unsigned int cmm_debug = CMM_DEBUG;
static unsigned int cmm_disabled = CMM_DISABLE;
@@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION);
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
"[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+ "before activity resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
"[Default=" __stringify(CMM_OOM_KB) "]");
@@ -88,6 +101,8 @@ struct cmm_page_array {
static unsigned long loaned_pages;
static unsigned long loaned_pages_target;
static unsigned long oom_freed_pages;
+static atomic_t hotplug_active = ATOMIC_INIT(0);
+static atomic_t hotplug_occurred = ATOMIC_INIT(0);
static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);
@@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
cmm_dbg("Begin request for %ld pages\n", nr);
while (nr) {
+ if (atomic_read(&hotplug_active))
+ break;
+
addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!addr)
@@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
if (!pa || pa->index >= CMM_NR_PAGES) {
/* Need a new page for the page list. */
spin_unlock(&cmm_lock);
- npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC |
+ __GFP_MOVABLE);
if (!npa) {
pr_info("%s: Can not allocate new page list\n", __func__);
free_page(addr);
@@ -273,9 +293,23 @@ static int cmm_thread(void *dummy)
while (1) {
timeleft = msleep_interruptible(delay * 1000);
- if (kthread_should_stop() || timeleft) {
- loaned_pages_target = loaned_pages;
+ if (kthread_should_stop() || timeleft)
break;
+
+ if (atomic_read(&hotplug_active)) {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
+ }
+
+ if (atomic_dec_if_positive(&hotplug_occurred) >= 0) {
+ cmm_dbg("Hotplug operation has occurred, loaning "
+ "activity suspended for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay * 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
}
cmm_get_mpp();
@@ -405,6 +439,159 @@ static struct notifier_block cmm_reboot_
};
/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+ unsigned long idx;
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ continue;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+ atomic_set(&hotplug_occurred, 1);
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ atomic_set(&hotplug_active, 1);
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ atomic_set(&hotplug_active, 0);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
* cmm_init - Module initialization
*
* Return value:
@@ -426,18 +613,24 @@ static int cmm_init(void)
if ((rc = cmm_sysfs_register(&cmm_sysdev)))
goto out_reboot_notifier;
+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
if (cmm_disabled)
return rc;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr)) {
rc = PTR_ERR(cmm_thread_ptr);
- goto out_unregister_sysfs;
+ goto out_unregister_notifier;
}
return rc;
-out_unregister_sysfs:
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_unregister_sysfs(&cmm_sysdev);
out_reboot_notifier:
unregister_reboot_notifier(&cmm_reboot_nb);
@@ -458,6 +651,8 @@ static void cmm_exit(void)
kthread_stop(cmm_thread_ptr);
unregister_oom_notifier(&cmm_oom_nb);
unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_free_pages(loaned_pages);
cmm_unregister_sysfs(&cmm_sysdev);
}
Hi,
I am currently working on the s390 port for the cmm + hotplug
patch, and I'm a little confused about the memory allocation
policy, see below. Is it correct that the balloon cannot grow
into ZONE_MOVABLE, while the pages for the balloon page list
can?
Robert Jennings wrote:
> @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
> cmm_dbg("Begin request for %ld pages\n", nr);
>
> while (nr) {
> + if (atomic_read(&hotplug_active))
> + break;
> +
> addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
> __GFP_NORETRY | __GFP_NOMEMALLOC);
> if (!addr)
> @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
> if (!pa || pa->index >= CMM_NR_PAGES) {
> /* Need a new page for the page list. */
> spin_unlock(&cmm_lock);
> - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
> - __GFP_NORETRY | __GFP_NOMEMALLOC);
> + npa = (struct cmm_page_array *)__get_free_page(
> + GFP_NOIO | __GFP_NOWARN |
> + __GFP_NORETRY | __GFP_NOMEMALLOC |
> + __GFP_MOVABLE);
> if (!npa) {
> pr_info("%s: Can not allocate new page list\n", __func__);
> free_page(addr);
Why is the __GFP_MOVABLE added here, for the page list alloc, and not
above for the balloon page alloc?
--
Regards,
Gerald
* Gerald Schaefer ([email protected]) wrote:
> Hi,
>
> I am currently working on the s390 port for the cmm + hotplug
> patch, and I'm a little confused about the memory allocation
> policy, see below. Is it correct that the balloon cannot grow
> into ZONE_MOVABLE, while the pages for the balloon page list
> can?
>
> Robert Jennings wrote:
>> @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
>> cmm_dbg("Begin request for %ld pages\n", nr);
>>
>> while (nr) {
>> + if (atomic_read(&hotplug_active))
>> + break;
>> +
>> addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
>> __GFP_NORETRY | __GFP_NOMEMALLOC);
>> if (!addr)
>> @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
>> if (!pa || pa->index >= CMM_NR_PAGES) {
>> /* Need a new page for the page list. */
>> spin_unlock(&cmm_lock);
>> - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
>> - __GFP_NORETRY | __GFP_NOMEMALLOC);
>> + npa = (struct cmm_page_array *)__get_free_page(
>> + GFP_NOIO | __GFP_NOWARN |
>> + __GFP_NORETRY | __GFP_NOMEMALLOC |
>> + __GFP_MOVABLE);
>> if (!npa) {
>> pr_info("%s: Can not allocate new page list\n", __func__);
>> free_page(addr);
>
> Why is the __GFP_MOVABLE added here, for the page list alloc, and not
> above for the balloon page alloc?
The pages allocated as __GFP_MOVABLE are used to store the list of pages
allocated by the balloon. They reference virtual addresses and it would
be fine for the kernel to migrate the physical pages for those, the
balloon would not notice this.
The pages loaned by the balloon are not allocated with __GFP_MOVABLE
because we will inform the hypervisor which page has been loaned by
Linux according to the physical address. Migration of those physical
pages would invalidate the loan, so we do not mark them as movable.
Regards,
Robert Jennings
On Fri, 2 Oct 2009 13:44:58 -0500
Robert Jennings <[email protected]> wrote:
> Memory balloon drivers can allocate a large amount of memory which
> is not movable but could be freed to accomodate memory hotplug remove.
>
> Prior to calling the memory hotplug notifier chain the memory in the
> pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> isolation will not proceed, causing the memory removal for that page
> range to fail.
>
> Rather than failing pageblock isolation if the the migrateteype is not
> MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> are owned by a registered balloon driver (or other entity) using a
> notifier chain. If all of the non-movable pages are owned by a balloon,
> they can be freed later through the memory notifier chain and the range
> can still be isolated in set_migratetype_isolate().
The patch looks sane enough to me.
I expect that if the powerpc and s390 guys want to work on CMM over the
next couple of months, they'd like this patch merged into 2.6.32. It's
a bit larger and more involved than one would like, but I guess we can
do that if suitable people (Mel? Kamezawa?) have had a close look and
are OK with it.
What do people think?
Has it been carefully compile- and run-time tested with
CONFIG_MEMORY_HOTPLUG_SPARSE=n?
On Fri, 2 Oct 2009 13:44:58 -0500
Robert Jennings <[email protected]> wrote:
> Memory balloon drivers can allocate a large amount of memory which
> is not movable but could be freed to accomodate memory hotplug remove.
>
> Prior to calling the memory hotplug notifier chain the memory in the
> pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> isolation will not proceed, causing the memory removal for that page
> range to fail.
>
> Rather than failing pageblock isolation if the the migrateteype is not
> MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> are owned by a registered balloon driver (or other entity) using a
> notifier chain. If all of the non-movable pages are owned by a balloon,
> they can be freed later through the memory notifier chain and the range
> can still be isolated in set_migratetype_isolate().
>
> Signed-off-by: Robert Jennings <[email protected]>
>
> ---
> drivers/base/memory.c | 19 +++++++++++++++++++
> include/linux/memory.h | 26 ++++++++++++++++++++++++++
> mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++++++++++-------
> 3 files changed, 83 insertions(+), 7 deletions(-)
>
> Index: b/drivers/base/memory.c
> ===================================================================
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> }
> EXPORT_SYMBOL(unregister_memory_notifier);
>
> +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> +
IIUC, this notifier is called under zone->lock.
please ATOMIC_NOTIFIER_HEAD().
> +int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(register_memory_isolate_notifier);
> +
> +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> +
> /*
> * register_memory - Setup a sysfs device for a memory block
> */
> @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> return blocking_notifier_call_chain(&memory_chain, val, v);
> }
>
> +int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> +}
> +
> /*
> * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> * OK to have direct references to sparsemem variables in here.
> Index: b/include/linux/memory.h
> ===================================================================
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -50,6 +50,18 @@ struct memory_notify {
> int status_change_nid;
> };
>
> +/*
> + * During pageblock isolation, count the number of pages in the
> + * range [start_pfn, start_pfn + nr_pages)
> + */
> +#define MEM_ISOLATE_COUNT (1<<0)
> +
> +struct memory_isolate_notify {
> + unsigned long start_pfn;
> + unsigned int nr_pages;
> + unsigned int pages_found;
> +};
Could you add commentary for each field ?
> +
> struct notifier_block;
> struct mem_section;
>
> @@ -76,14 +88,28 @@ static inline int memory_notify(unsigned
> {
> return 0;
> }
> +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return 0;
> +}
> +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> +}
> +static inline int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return 0;
> +}
> #else
> extern int register_memory_notifier(struct notifier_block *nb);
> extern void unregister_memory_notifier(struct notifier_block *nb);
> +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> extern int register_new_memory(int, struct mem_section *);
> extern int unregister_memory_section(struct mem_section *);
> extern int memory_dev_init(void);
> extern int remove_memory_block(unsigned long, struct mem_section *, int);
> extern int memory_notify(unsigned long val, void *v);
> +extern int memory_isolate_notify(unsigned long val, void *v);
> extern struct memory_block *find_memory_block(struct mem_section *);
> #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> enum mem_add_context { BOOT, HOTPLUG };
> Index: b/mm/page_alloc.c
> ===================================================================
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,6 +48,7 @@
> #include <linux/page_cgroup.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemleak.h>
> +#include <linux/memory.h>
> #include <trace/events/kmem.h>
>
> #include <asm/tlbflush.h>
> @@ -4985,23 +4986,53 @@ void set_pageblock_flags_group(struct pa
> int set_migratetype_isolate(struct page *page)
> {
> struct zone *zone;
> - unsigned long flags;
> + unsigned long flags, pfn, iter;
> + unsigned long immobile = 0;
> + struct memory_isolate_notify arg;
> + int notifier_ret;
> int ret = -EBUSY;
> int zone_idx;
>
> zone = page_zone(page);
> zone_idx = zone_idx(zone);
> +
> spin_lock_irqsave(&zone->lock, flags);
> + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
> + zone_idx == ZONE_MOVABLE) {
> + ret = 0;
> + goto out;
> + }
> +
> + pfn = page_to_pfn(page);
> + arg.start_pfn = pfn;
> + arg.nr_pages = pageblock_nr_pages;
> + arg.pages_found = 0;
> +
> /*
> - * In future, more migrate types will be able to be isolation target.
> + * The pageblock can be isolated even if the migrate type is
> + * not *_MOVABLE. The memory isolation notifier chain counts
> + * the number of pages in this pageblock that can be freed later
> + * through the memory notifier chain. If all of the pages are
> + * accounted for, isolation can continue.
> */
Could add explanation like this ?
==
Later, for example, when memory hotplug notifier runs, these pages reported as
"can be isoalted" should be isolated(freed) by callbacks.
==
> - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> - zone_idx != ZONE_MOVABLE)
> + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> + notifier_ret = notifier_to_errno(notifier_ret);
> + if (notifier_ret || !arg.pages_found)
> goto out;
> - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> - ret = 0;
> +
> + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> + if (page_count(pfn_to_page(iter)))
> + immobile++;
> +
> + if (arg.pages_found == immobile)
> + ret = 0;
> +
I can't understand this part. Does this mean all pages under this pageblock
are used by balloon driver ?
IOW, memory is hotpluggable only when all pages under this pageblock is used
by balloon ?
Hmm. Can't we do this kind of check..?
==
for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
page = pfn_to_page(iter);
if (!page_count(page) || PageLRU(page)) // This page is movable.
continue;
immobile++
}
==
Then, if a page is luckyly on LRU, we have more chances.
(This check can fail if a page is on percpu pagevec etc...)
Thanks,
-Kame
> out:
> + if (!ret) {
> + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> + }
> +
> spin_unlock_irqrestore(&zone->lock, flags);
> if (!ret)
> drain_all_pages();
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>
On Fri, Oct 02, 2009 at 01:44:58PM -0500, Robert Jennings wrote:
> Memory balloon drivers can allocate a large amount of memory which
> is not movable but could be freed to accomodate memory hotplug remove.
>
> Prior to calling the memory hotplug notifier chain the memory in the
> pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> isolation will not proceed, causing the memory removal for that page
> range to fail.
>
> Rather than failing pageblock isolation if the the migrateteype is not
s/the the migrateteype/the migratetype/
> MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> are owned by a registered balloon driver (or other entity) using a
> notifier chain. If all of the non-movable pages are owned by a balloon,
> they can be freed later through the memory notifier chain and the range
> can still be isolated in set_migratetype_isolate().
>
> Signed-off-by: Robert Jennings <[email protected]>
>
> ---
> drivers/base/memory.c | 19 +++++++++++++++++++
> include/linux/memory.h | 26 ++++++++++++++++++++++++++
> mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++++++++++-------
> 3 files changed, 83 insertions(+), 7 deletions(-)
>
> Index: b/drivers/base/memory.c
> ===================================================================
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> }
> EXPORT_SYMBOL(unregister_memory_notifier);
>
> +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> +
> +int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(register_memory_isolate_notifier);
> +
> +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> +
> /*
> * register_memory - Setup a sysfs device for a memory block
> */
> @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> return blocking_notifier_call_chain(&memory_chain, val, v);
> }
>
> +int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> +}
> +
> /*
> * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> * OK to have direct references to sparsemem variables in here.
> Index: b/include/linux/memory.h
> ===================================================================
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -50,6 +50,18 @@ struct memory_notify {
> int status_change_nid;
> };
>
> +/*
> + * During pageblock isolation, count the number of pages in the
> + * range [start_pfn, start_pfn + nr_pages)
> + */
>
The comment could have been slightly better. The count of pages in the
range is nr_pages - memory_holes but what you're counting is the number
of pages owned by the balloon driver in the notification chain.
> +#define MEM_ISOLATE_COUNT (1<<0)
> +
> +struct memory_isolate_notify {
> + unsigned long start_pfn;
> + unsigned int nr_pages;
> + unsigned int pages_found;
> +};
> +
> struct notifier_block;
> struct mem_section;
>
> @@ -76,14 +88,28 @@ static inline int memory_notify(unsigned
> {
> return 0;
> }
> +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return 0;
> +}
> +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> +}
> +static inline int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return 0;
> +}
> #else
> extern int register_memory_notifier(struct notifier_block *nb);
> extern void unregister_memory_notifier(struct notifier_block *nb);
> +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> extern int register_new_memory(int, struct mem_section *);
> extern int unregister_memory_section(struct mem_section *);
> extern int memory_dev_init(void);
> extern int remove_memory_block(unsigned long, struct mem_section *, int);
> extern int memory_notify(unsigned long val, void *v);
> +extern int memory_isolate_notify(unsigned long val, void *v);
> extern struct memory_block *find_memory_block(struct mem_section *);
> #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> enum mem_add_context { BOOT, HOTPLUG };
> Index: b/mm/page_alloc.c
> ===================================================================
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,6 +48,7 @@
> #include <linux/page_cgroup.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemleak.h>
> +#include <linux/memory.h>
> #include <trace/events/kmem.h>
>
> #include <asm/tlbflush.h>
> @@ -4985,23 +4986,53 @@ void set_pageblock_flags_group(struct pa
> int set_migratetype_isolate(struct page *page)
> {
> struct zone *zone;
> - unsigned long flags;
> + unsigned long flags, pfn, iter;
> + unsigned long immobile = 0;
> + struct memory_isolate_notify arg;
> + int notifier_ret;
> int ret = -EBUSY;
> int zone_idx;
>
> zone = page_zone(page);
> zone_idx = zone_idx(zone);
> +
> spin_lock_irqsave(&zone->lock, flags);
> + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
> + zone_idx == ZONE_MOVABLE) {
> + ret = 0;
> + goto out;
> + }
> +
> + pfn = page_to_pfn(page);
> + arg.start_pfn = pfn;
> + arg.nr_pages = pageblock_nr_pages;
> + arg.pages_found = 0;
> +
> /*
> - * In future, more migrate types will be able to be isolation target.
> + * The pageblock can be isolated even if the migrate type is
> + * not *_MOVABLE. The memory isolation notifier chain counts
> + * the number of pages in this pageblock that can be freed later
> + * through the memory notifier chain. If all of the pages are
> + * accounted for, isolation can continue.
This comment could have been clearer as well
* It may be possible to isolate a pageblock even if the migratetype is
* not MIGRATE_MOVABLE. The memory isolation notifier chain is used by
* balloon drivers to return the number of pages in a range that are held
* by the balloon driver to shrink memory. If all the pages are accounted
* for by balloons or are free, isolation can continue
> */
> - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> - zone_idx != ZONE_MOVABLE)
> + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> + notifier_ret = notifier_to_errno(notifier_ret);
> + if (notifier_ret || !arg.pages_found)
> goto out;
> - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> - ret = 0;
> +
> + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> + if (page_count(pfn_to_page(iter)))
> + immobile++;
> +
This part here is not safe when CONFIG_HOLES_IN_ZONE is set. You need to
make it something like
for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
if (!pfn_valid_within(pfn))
continue;
if (page_count(pfn_to_page(iter)))
immobile++;
}
You shouldn't need to run pfn_valid() as you're always starting from a valid
page and never going outside MAX_ORDER_NR_PAGES in this iterator.
> + if (arg.pages_found == immobile)
> + ret = 0;
> +
Ok, so if all pages in a range that are in use match the count returned
by the balloon, then it's ok to isolate.
> out:
> + if (!ret) {
> + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> + }
> +
> spin_unlock_irqrestore(&zone->lock, flags);
> if (!ret)
> drain_all_pages();
>
The patch looks more or less sane. It would be nice to have a follow-on patch
that clarified some details but it's not necessary. The pfn_valid_within()
should be done as a follow-on patch. I haven't actually tested this but
otherwise it looks ok. Once the pfn_valid_within() is sorted out, it has
my Ack.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* KAMEZAWA Hiroyuki ([email protected]) wrote:
> On Fri, 2 Oct 2009 13:44:58 -0500
> Robert Jennings <[email protected]> wrote:
>
> > Memory balloon drivers can allocate a large amount of memory which
> > is not movable but could be freed to accomodate memory hotplug remove.
> >
> > Prior to calling the memory hotplug notifier chain the memory in the
> > pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> > isolation will not proceed, causing the memory removal for that page
> > range to fail.
> >
> > Rather than failing pageblock isolation if the the migrateteype is not
> > MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> > are owned by a registered balloon driver (or other entity) using a
> > notifier chain. If all of the non-movable pages are owned by a balloon,
> > they can be freed later through the memory notifier chain and the range
> > can still be isolated in set_migratetype_isolate().
> >
> > Signed-off-by: Robert Jennings <[email protected]>
> >
> > ---
> > drivers/base/memory.c | 19 +++++++++++++++++++
> > include/linux/memory.h | 26 ++++++++++++++++++++++++++
> > mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++++++++++-------
> > 3 files changed, 83 insertions(+), 7 deletions(-)
> >
> > Index: b/drivers/base/memory.c
> > ===================================================================
> > --- a/drivers/base/memory.c
> > +++ b/drivers/base/memory.c
> > @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> > }
> > EXPORT_SYMBOL(unregister_memory_notifier);
> >
> > +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> > +
>
> IIUC, this notifier is called under zone->lock.
> please ATOMIC_NOTIFIER_HEAD().
I will correct this.
> > +int register_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> > +}
> > +EXPORT_SYMBOL(register_memory_isolate_notifier);
> > +
> > +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> > +}
> > +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> > +
> > /*
> > * register_memory - Setup a sysfs device for a memory block
> > */
> > @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> > return blocking_notifier_call_chain(&memory_chain, val, v);
> > }
> >
> > +int memory_isolate_notify(unsigned long val, void *v)
> > +{
> > + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> > +}
> > +
> > /*
> > * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> > * OK to have direct references to sparsemem variables in here.
> > Index: b/include/linux/memory.h
> > ===================================================================
> > --- a/include/linux/memory.h
> > +++ b/include/linux/memory.h
> > @@ -50,6 +50,18 @@ struct memory_notify {
> > int status_change_nid;
> > };
> >
> > +/*
> > + * During pageblock isolation, count the number of pages in the
> > + * range [start_pfn, start_pfn + nr_pages)
> > + */
> > +#define MEM_ISOLATE_COUNT (1<<0)
> > +
> > +struct memory_isolate_notify {
> > + unsigned long start_pfn;
> > + unsigned int nr_pages;
> > + unsigned int pages_found;
> > +};
>
> Could you add commentary for each field ?
This will be documented in the next version of the patch.
> > +
> > struct notifier_block;
> > struct mem_section;
> >
> > @@ -76,14 +88,28 @@ static inline int memory_notify(unsigned
> > {
> > return 0;
> > }
> > +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + return 0;
> > +}
> > +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > +}
> > +static inline int memory_isolate_notify(unsigned long val, void *v)
> > +{
> > + return 0;
> > +}
> > #else
> > extern int register_memory_notifier(struct notifier_block *nb);
> > extern void unregister_memory_notifier(struct notifier_block *nb);
> > +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> > +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> > extern int register_new_memory(int, struct mem_section *);
> > extern int unregister_memory_section(struct mem_section *);
> > extern int memory_dev_init(void);
> > extern int remove_memory_block(unsigned long, struct mem_section *, int);
> > extern int memory_notify(unsigned long val, void *v);
> > +extern int memory_isolate_notify(unsigned long val, void *v);
> > extern struct memory_block *find_memory_block(struct mem_section *);
> > #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> > enum mem_add_context { BOOT, HOTPLUG };
> > Index: b/mm/page_alloc.c
> > ===================================================================
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -48,6 +48,7 @@
> > #include <linux/page_cgroup.h>
> > #include <linux/debugobjects.h>
> > #include <linux/kmemleak.h>
> > +#include <linux/memory.h>
> > #include <trace/events/kmem.h>
> >
> > #include <asm/tlbflush.h>
> > @@ -4985,23 +4986,53 @@ void set_pageblock_flags_group(struct pa
> > int set_migratetype_isolate(struct page *page)
> > {
> > struct zone *zone;
> > - unsigned long flags;
> > + unsigned long flags, pfn, iter;
> > + unsigned long immobile = 0;
> > + struct memory_isolate_notify arg;
> > + int notifier_ret;
> > int ret = -EBUSY;
> > int zone_idx;
> >
> > zone = page_zone(page);
> > zone_idx = zone_idx(zone);
> > +
> > spin_lock_irqsave(&zone->lock, flags);
> > + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
> > + zone_idx == ZONE_MOVABLE) {
> > + ret = 0;
> > + goto out;
> > + }
> > +
> > + pfn = page_to_pfn(page);
> > + arg.start_pfn = pfn;
> > + arg.nr_pages = pageblock_nr_pages;
> > + arg.pages_found = 0;
> > +
> > /*
> > - * In future, more migrate types will be able to be isolation target.
> > + * The pageblock can be isolated even if the migrate type is
> > + * not *_MOVABLE. The memory isolation notifier chain counts
> > + * the number of pages in this pageblock that can be freed later
> > + * through the memory notifier chain. If all of the pages are
> > + * accounted for, isolation can continue.
> > */
>
> Could add explanation like this ?
> ==
> Later, for example, when memory hotplug notifier runs, these pages reported as
> "can be isoalted" should be isolated(freed) by callbacks.
> ==
I'll change this comment. You and Mel both gave me good direction here.
> > - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> > - zone_idx != ZONE_MOVABLE)
> > + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> > + notifier_ret = notifier_to_errno(notifier_ret);
> > + if (notifier_ret || !arg.pages_found)
> > goto out;
> > - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> > - ret = 0;
> > +
> > + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> > + if (page_count(pfn_to_page(iter)))
> > + immobile++;
> > +
> > + if (arg.pages_found == immobile)
> > + ret = 0;
> > +
>
> I can't understand this part. Does this mean all pages under this pageblock
> are used by balloon driver ?
> IOW, memory is hotpluggable only when all pages under this pageblock is used
> by balloon ?
>
>
> Hmm. Can't we do this kind of check..?
> ==
> for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
> page = pfn_to_page(iter);
> if (!page_count(page) || PageLRU(page)) // This page is movable.
> continue;
> immobile++
> }
> ==
> Then, if a page is luckyly on LRU, we have more chances.
> (This check can fail if a page is on percpu pagevec etc...)
>
> Thanks,
> -Kame
I like this change, it will be in my next patch after I finish testing.
Thank you for your review.
> > out:
> > + if (!ret) {
> > + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> > + }
> > +
> > spin_unlock_irqrestore(&zone->lock, flags);
> > if (!ret)
> > drain_all_pages();
> >
> > --
* Mel Gorman ([email protected]) wrote:
> On Fri, Oct 02, 2009 at 01:44:58PM -0500, Robert Jennings wrote:
> > Memory balloon drivers can allocate a large amount of memory which
> > is not movable but could be freed to accomodate memory hotplug remove.
> >
> > Prior to calling the memory hotplug notifier chain the memory in the
> > pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> > isolation will not proceed, causing the memory removal for that page
> > range to fail.
> >
> > Rather than failing pageblock isolation if the the migrateteype is not
>
> s/the the migrateteype/the migratetype/
>
> > MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> > are owned by a registered balloon driver (or other entity) using a
> > notifier chain. If all of the non-movable pages are owned by a balloon,
> > they can be freed later through the memory notifier chain and the range
> > can still be isolated in set_migratetype_isolate().
> >
> > Signed-off-by: Robert Jennings <[email protected]>
> >
> > ---
> > drivers/base/memory.c | 19 +++++++++++++++++++
> > include/linux/memory.h | 26 ++++++++++++++++++++++++++
> > mm/page_alloc.c | 45 ++++++++++++++++++++++++++++++++++++++-------
> > 3 files changed, 83 insertions(+), 7 deletions(-)
> >
> > Index: b/drivers/base/memory.c
> > ===================================================================
> > --- a/drivers/base/memory.c
> > +++ b/drivers/base/memory.c
> > @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> > }
> > EXPORT_SYMBOL(unregister_memory_notifier);
> >
> > +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> > +
> > +int register_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> > +}
> > +EXPORT_SYMBOL(register_memory_isolate_notifier);
> > +
> > +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> > +}
> > +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> > +
> > /*
> > * register_memory - Setup a sysfs device for a memory block
> > */
> > @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> > return blocking_notifier_call_chain(&memory_chain, val, v);
> > }
> >
> > +int memory_isolate_notify(unsigned long val, void *v)
> > +{
> > + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> > +}
> > +
> > /*
> > * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> > * OK to have direct references to sparsemem variables in here.
> > Index: b/include/linux/memory.h
> > ===================================================================
> > --- a/include/linux/memory.h
> > +++ b/include/linux/memory.h
> > @@ -50,6 +50,18 @@ struct memory_notify {
> > int status_change_nid;
> > };
> >
> > +/*
> > + * During pageblock isolation, count the number of pages in the
> > + * range [start_pfn, start_pfn + nr_pages)
> > + */
> >
>
> The comment could have been slightly better. The count of pages in the
> range is nr_pages - memory_holes but what you're counting is the number
> of pages owned by the balloon driver in the notification chain.
Right, it is misleading. I'll fix this.
> > +#define MEM_ISOLATE_COUNT (1<<0)
> > +
> > +struct memory_isolate_notify {
> > + unsigned long start_pfn;
> > + unsigned int nr_pages;
> > + unsigned int pages_found;
> > +};
> > +
> > struct notifier_block;
> > struct mem_section;
> >
> > @@ -76,14 +88,28 @@ static inline int memory_notify(unsigned
> > {
> > return 0;
> > }
> > +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + return 0;
> > +}
> > +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > +}
> > +static inline int memory_isolate_notify(unsigned long val, void *v)
> > +{
> > + return 0;
> > +}
> > #else
> > extern int register_memory_notifier(struct notifier_block *nb);
> > extern void unregister_memory_notifier(struct notifier_block *nb);
> > +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> > +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> > extern int register_new_memory(int, struct mem_section *);
> > extern int unregister_memory_section(struct mem_section *);
> > extern int memory_dev_init(void);
> > extern int remove_memory_block(unsigned long, struct mem_section *, int);
> > extern int memory_notify(unsigned long val, void *v);
> > +extern int memory_isolate_notify(unsigned long val, void *v);
> > extern struct memory_block *find_memory_block(struct mem_section *);
> > #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> > enum mem_add_context { BOOT, HOTPLUG };
> > Index: b/mm/page_alloc.c
> > ===================================================================
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -48,6 +48,7 @@
> > #include <linux/page_cgroup.h>
> > #include <linux/debugobjects.h>
> > #include <linux/kmemleak.h>
> > +#include <linux/memory.h>
> > #include <trace/events/kmem.h>
> >
> > #include <asm/tlbflush.h>
> > @@ -4985,23 +4986,53 @@ void set_pageblock_flags_group(struct pa
> > int set_migratetype_isolate(struct page *page)
> > {
> > struct zone *zone;
> > - unsigned long flags;
> > + unsigned long flags, pfn, iter;
> > + unsigned long immobile = 0;
> > + struct memory_isolate_notify arg;
> > + int notifier_ret;
> > int ret = -EBUSY;
> > int zone_idx;
> >
> > zone = page_zone(page);
> > zone_idx = zone_idx(zone);
> > +
> > spin_lock_irqsave(&zone->lock, flags);
> > + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
> > + zone_idx == ZONE_MOVABLE) {
> > + ret = 0;
> > + goto out;
> > + }
> > +
> > + pfn = page_to_pfn(page);
> > + arg.start_pfn = pfn;
> > + arg.nr_pages = pageblock_nr_pages;
> > + arg.pages_found = 0;
> > +
> > /*
> > - * In future, more migrate types will be able to be isolation target.
> > + * The pageblock can be isolated even if the migrate type is
> > + * not *_MOVABLE. The memory isolation notifier chain counts
> > + * the number of pages in this pageblock that can be freed later
> > + * through the memory notifier chain. If all of the pages are
> > + * accounted for, isolation can continue.
>
> This comment could have been clearer as well
>
> * It may be possible to isolate a pageblock even if the migratetype is
> * not MIGRATE_MOVABLE. The memory isolation notifier chain is used by
> * balloon drivers to return the number of pages in a range that are held
> * by the balloon driver to shrink memory. If all the pages are accounted
> * for by balloons or are free, isolation can continue
* It may be possible to isolate a pageblock even if the migratetype is
* not MIGRATE_MOVABLE. The memory isolation notifier chain is used by
* balloon drivers to return the number of pages in a range that are held
* by the balloon driver to shrink memory. If all the pages are accounted
* for by balloons, are free, or on the LRU, isolation can continue.
* Later, for example, when memory hotplug notifier runs, these pages
* reported as "can be isolated" should be isolated(freed) by the balloon
* driver through the memory notifier chain.
> > */
> > - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> > - zone_idx != ZONE_MOVABLE)
> > + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> > + notifier_ret = notifier_to_errno(notifier_ret);
> > + if (notifier_ret || !arg.pages_found)
> > goto out;
> > - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> > - ret = 0;
> > +
> > + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> > + if (page_count(pfn_to_page(iter)))
> > + immobile++;
> > +
>
> This part here is not safe when CONFIG_HOLES_IN_ZONE is set. You need to
> make it something like
>
> for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
> if (!pfn_valid_within(pfn))
> continue;
>
> if (page_count(pfn_to_page(iter)))
> immobile++;
> }
>
> You shouldn't need to run pfn_valid() as you're always starting from a valid
> page and never going outside MAX_ORDER_NR_PAGES in this iterator.
>
I will make this change.
> > + if (arg.pages_found == immobile)
> > + ret = 0;
> > +
>
> Ok, so if all pages in a range that are in use match the count returned
> by the balloon, then it's ok to isolate.
Correct.
> > out:
> > + if (!ret) {
> > + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> > + }
> > +
> > spin_unlock_irqrestore(&zone->lock, flags);
> > if (!ret)
> > drain_all_pages();
> >
>
> The patch looks more or less sane. It would be nice to have a follow-on patch
> that clarified some details but it's not necessary. The pfn_valid_within()
> should be done as a follow-on patch. I haven't actually tested this but
> otherwise it looks ok. Once the pfn_valid_within() is sorted out, it has
> my Ack.
I'll be sending out a new revision of this patch rather than a
follow-on due to other changes (change from BLOCKING_NOTIFIER_HEAD to
ATOMIC_NOTIFIER_HEAD) and I will include changes discussed here. Thank
you for the review.
* Andrew Morton ([email protected]) wrote:
> On Fri, 2 Oct 2009 13:44:58 -0500
> Robert Jennings <[email protected]> wrote:
>
> > Memory balloon drivers can allocate a large amount of memory which
> > is not movable but could be freed to accomodate memory hotplug remove.
> >
> > Prior to calling the memory hotplug notifier chain the memory in the
> > pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> > isolation will not proceed, causing the memory removal for that page
> > range to fail.
> >
> > Rather than failing pageblock isolation if the the migrateteype is not
> > MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> > are owned by a registered balloon driver (or other entity) using a
> > notifier chain. If all of the non-movable pages are owned by a balloon,
> > they can be freed later through the memory notifier chain and the range
> > can still be isolated in set_migratetype_isolate().
>
> The patch looks sane enough to me.
>
> I expect that if the powerpc and s390 guys want to work on CMM over the
> next couple of months, they'd like this patch merged into 2.6.32. It's
> a bit larger and more involved than one would like, but I guess we can
> do that if suitable people (Mel? Kamezawa?) have had a close look and
> are OK with it.
>
> What do people think?
I'd love to get it in 2.6.32 if that's possible. I have gone over the
comments from Mel and Kamezawa I produced a new patchset. I just
finished testing it (and I also tested with
CONFIG_MEMORY_HOTPLUG_SPARSE=n) and it will be posted shortly.
>
> Has it been carefully compile- and run-time tested with
> CONFIG_MEMORY_HOTPLUG_SPARSE=n?
Yes, I have compiled the kernel CONFIG_MEMORY_HOTPLUG_SPARSE=n and made
sure that we didn't have any problems.
--Robert Jennings
On Fri, Oct 09, 2009 at 03:23:04PM -0500, Robert Jennings wrote:
> * Andrew Morton ([email protected]) wrote:
> > On Fri, 2 Oct 2009 13:44:58 -0500
> > Robert Jennings <[email protected]> wrote:
> >
> > > Memory balloon drivers can allocate a large amount of memory which
> > > is not movable but could be freed to accomodate memory hotplug remove.
> > >
> > > Prior to calling the memory hotplug notifier chain the memory in the
> > > pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> > > isolation will not proceed, causing the memory removal for that page
> > > range to fail.
> > >
> > > Rather than failing pageblock isolation if the the migrateteype is not
> > > MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock
> > > are owned by a registered balloon driver (or other entity) using a
> > > notifier chain. If all of the non-movable pages are owned by a balloon,
> > > they can be freed later through the memory notifier chain and the range
> > > can still be isolated in set_migratetype_isolate().
> >
> > The patch looks sane enough to me.
> >
> > I expect that if the powerpc and s390 guys want to work on CMM over the
> > next couple of months, they'd like this patch merged into 2.6.32. It's
> > a bit larger and more involved than one would like, but I guess we can
> > do that if suitable people (Mel? Kamezawa?) have had a close look and
> > are OK with it.
> >
> > What do people think?
>
> I'd love to get it in 2.6.32 if that's possible. I have gone over the
> comments from Mel and Kamezawa I produced a new patchset. I just
> finished testing it (and I also tested with
> CONFIG_MEMORY_HOTPLUG_SPARSE=n) and it will be posted shortly.
>
As you have tested this recently, would you be willing to post the
results? While it's not a requirement of the patch, it would be nice to have
an idea of how the effectiveness of memory hot-remove is improved when used
with the powerpc balloon. This might convince others developers for balloons
to register with the notifier.
Total aside, I'm not overly clear why so much of balloon driver logic is
in drivers and not in the core. At a casual glance, it would appear that
balloon logic could be improved by combining it with similar logic as is
used for lumpy reclaim. This comment is not intended to hurt the patch,
but for the people working on CMM to consider if it hasn't been
considered already.
> > Has it been carefully compile- and run-time tested with
> > CONFIG_MEMORY_HOTPLUG_SPARSE=n?
>
> Yes, I have compiled the kernel CONFIG_MEMORY_HOTPLUG_SPARSE=n and made
> sure that we didn't have any problems.
>
The pfn_valid_within() was the biggie as far as the core is concerned. That
sort of mistake causes fairly mad-looking oops. To be perfectly honest,
I didn't review the powerpc-specific portion assuming that people are
testing that and that there are developers more familiar with the area.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Fri, Oct 09, 2009 at 21:43:26 +0100, Mal Gorman wrote:
> As you have tested this recently, would you be willing to post the
> results? While it's not a requirement of the patch, it would be nice to have
> an idea of how the effectiveness of memory hot-remove is improved when used
> with the powerpc balloon. This might convince others developers for balloons
> to register with the notifier.
I did ten test runs without my patches and ten test runs with my patches
on a 2.6.32-rc3 kernel.
Without the patch:
6 out of 10 memory-remove operations without the patch removed 1 LMB
(64Mb), the rest of the memory-remove attempts failed to remove any LMBs.
With the patch:
All of the memory-remove operations removed some LMBs. The average
removed was just over 11 LMBs (704Mb) per attempt.
Linux was given 2Gb of memory. During the test runs the average memory in
use was 140Mb, not including cache and buffers, and the average amount
consumed by the balloon was 1217Mb. The system was idle while the
memory remove operation was performed. After each attempt the system
was rebooted and allowed ~10 minutes to settle after boot.
With a 2Gb configuration on POWER the LMB size is 64Mb. The drmgr command
(part of powerpc-utils) was used to remove memory by LBM, just as an
end-user would. Below is a list of the runs and the number of LMBs
removed.
Stock kernel (v2.6.32-rc3)
--------------------------
LMBs Used kb Loaned kb
removed
0 135232 1257280
0 151168 1231744
1 152128 1234176
1 150976 1239232
1 151808 1232064
0 136064 1249152
0 137088 1246976
1 135296 1289984
1 136384 1263104
1 152960 1243904
=======================
0.60 143910 1248762 Average
0.49 7929 16960 StdDev
Patched kernel
--------------------------
LMBs Used kb Loaned kb
removed
12 134336 1294336
10 152192 1250432
9 152832 1235520
15 153152 1237952
12 152320 1232704
13 135360 1252224
11 154176 1237056
10 153920 1243264
10 150720 1236416
13 151040 1230848
=======================
11.50 149005 1245075 Average
1.75 7158 17738 StdDev
Regards,
Robert Jennings
Robert Jennings wrote:
>>> @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
>>> cmm_dbg("Begin request for %ld pages\n", nr);
>>>
>>> while (nr) {
>>> + if (atomic_read(&hotplug_active))
>>> + break;
>>> +
>>> addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
>>> __GFP_NORETRY | __GFP_NOMEMALLOC);
>>> if (!addr)
>>> @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
>>> if (!pa || pa->index >= CMM_NR_PAGES) {
>>> /* Need a new page for the page list. */
>>> spin_unlock(&cmm_lock);
>>> - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
>>> - __GFP_NORETRY | __GFP_NOMEMALLOC);
>>> + npa = (struct cmm_page_array *)__get_free_page(
>>> + GFP_NOIO | __GFP_NOWARN |
>>> + __GFP_NORETRY | __GFP_NOMEMALLOC |
>>> + __GFP_MOVABLE);
>>> if (!npa) {
>>> pr_info("%s: Can not allocate new page list\n", __func__);
>>> free_page(addr);
>> Why is the __GFP_MOVABLE added here, for the page list alloc, and not
>> above for the balloon page alloc?
>
> The pages allocated as __GFP_MOVABLE are used to store the list of pages
> allocated by the balloon. They reference virtual addresses and it would
> be fine for the kernel to migrate the physical pages for those, the
> balloon would not notice this.
Does page migration really work for kernel pages that were allocated
with __get_free_page()? I was wondering if we can do this on s390, where
we have a 1:1 mapping of kernel virtual to physical addresses, but
looking at migrate_pages() and friends, it seems that kernel pages
w/o mapping and rmap should not be migrateable at all. Any thoughts from
the memory migration experts?
BTW, since we have real memory hotplug support on s390, allowing us
to add and remove memory chunks to/from ZONE_MOVABLE, this basically
makes cmm ballooning in ZONE_NORMAL obsolete, so we decided not to
support memory hotplug aware cmm on s390.
Regards,
Gerald
On Thu, 15 Oct 2009, Gerald Schaefer wrote:
> > The pages allocated as __GFP_MOVABLE are used to store the list of pages
> > allocated by the balloon. They reference virtual addresses and it would
> > be fine for the kernel to migrate the physical pages for those, the
> > balloon would not notice this.
>
> Does page migration really work for kernel pages that were allocated
> with __get_free_page()? I was wondering if we can do this on s390, where
> we have a 1:1 mapping of kernel virtual to physical addresses, but
> looking at migrate_pages() and friends, it seems that kernel pages
> w/o mapping and rmap should not be migrateable at all. Any thoughts from
> the memory migration experts?
page migration only works for pages where we have some way of accounting
for all the references to a page. This usually mean using reverse mappings
(anon list, radix trees and page tables).