Memory balloon drivers can allocate a large amount of memory which
is not movable but could be freed to accommodate memory hotplug remove.
Prior to calling the memory hotplug notifier chain the memory in the
pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
isolation will not proceed, causing the memory removal for that page
range to fail.
Rather than immediately failing pageblock isolation if the
migratetype is not MIGRATE_MOVABLE, this patch checks if all of the
pages in the pageblock are owned by a registered balloon driver using a
notifier chain. If all of the non-movable pages are owned by a balloon,
they can be freed later through the memory notifier chain and the range
can still be isolated in set_migratetype_isolate().
Signed-off-by: Robert Jennings <[email protected]>
---
drivers/base/memory.c | 19 +++++++++++++++++++
include/linux/memory.h | 22 ++++++++++++++++++++++
mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
3 files changed, 82 insertions(+), 8 deletions(-)
Index: b/drivers/base/memory.c
===================================================================
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
}
EXPORT_SYMBOL(unregister_memory_notifier);
+static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
+
+int register_memory_isolate_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&memory_isolate_chain, nb);
+}
+EXPORT_SYMBOL(register_memory_isolate_notifier);
+
+void unregister_memory_isolate_notifier(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
+}
+EXPORT_SYMBOL(unregister_memory_isolate_notifier);
+
/*
* register_memory - Setup a sysfs device for a memory block
*/
@@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
return blocking_notifier_call_chain(&memory_chain, val, v);
}
+int memory_isolate_notify(unsigned long val, void *v)
+{
+ return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
+}
+
/*
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
* OK to have direct references to sparsemem variables in here.
Index: b/include/linux/memory.h
===================================================================
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -50,6 +50,14 @@ struct memory_notify {
int status_change_nid;
};
+#define MEM_ISOLATE_COUNT (1<<0)
+
+struct memory_isolate_notify {
+ unsigned long start_addr;
+ unsigned int nr_pages;
+ unsigned int pages_found;
+};
+
struct notifier_block;
struct mem_section;
@@ -76,14 +84,28 @@ static inline int memory_notify(unsigned
{
return 0;
}
+static inline int register_memory_isolate_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
+{
+}
+static inline int memory_isolate_notify(unsigned long val, void *v)
+{
+ return 0;
+}
#else
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
+extern int register_memory_isolate_notifier(struct notifier_block *nb);
+extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
extern int register_new_memory(int, struct mem_section *);
extern int unregister_memory_section(struct mem_section *);
extern int memory_dev_init(void);
extern int remove_memory_block(unsigned long, struct mem_section *, int);
extern int memory_notify(unsigned long val, void *v);
+extern int memory_isolate_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(struct mem_section *);
#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
enum mem_add_context { BOOT, HOTPLUG };
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
+#include <linux/memory.h>
#include <trace/events/kmem.h>
#include <asm/tlbflush.h>
@@ -4985,23 +4986,55 @@ void set_pageblock_flags_group(struct pa
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
+ unsigned long flags, pfn, iter;
+ long immobile = 0;
+ struct memory_isolate_notify arg;
+ int notifier_ret;
int ret = -EBUSY;
int zone_idx;
zone = page_zone(page);
zone_idx = zone_idx(zone);
+
+ pfn = page_to_pfn(page);
+ arg.start_addr = (unsigned long)page_address(page);
+ arg.nr_pages = pageblock_nr_pages;
+ arg.pages_found = 0;
+
spin_lock_irqsave(&zone->lock, flags);
/*
* In future, more migrate types will be able to be isolation target.
*/
- if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
- zone_idx != ZONE_MOVABLE)
- goto out;
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
- ret = 0;
-out:
+ do {
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
+ zone_idx == ZONE_MOVABLE) {
+ ret = 0;
+ break;
+ }
+
+ /*
+ * If all of the pages in a zone are used by a balloon,
+ * the range can still be isolated. The balloon will
+ * free these pages from the memory notifier chain.
+ */
+ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+ notifier_ret = notifier_to_errno(notifier_ret);
+ if (notifier_ret || !arg.pages_found)
+ break;
+
+ for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
+ if (page_count(pfn_to_page(iter)))
+ immobile++;
+
+ if (arg.pages_found == immobile)
+ ret = 0;
+ } while (0);
+
+ if (!ret) {
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
drain_all_pages();
The Collaborative Memory Manager (CMM) module allocates individual pages
over time that are not migratable. On a long running system this can
severely impact the ability to find enough pages to support a hotplug
memory remove operation.
This patch adds a memory isolation notifier and a memory hotplug notifier.
The memory isolation notifier will return the number of pages found in
the range specified. This is used to determine if all of the pages in
a pageblock are owned by the balloon. The hotplug notifier will free
pages in the range which is to be removed. The priority of the hotplug
notifier is low so that it will be called near last; this helps avoid
removing loaned pages in operations that fail due to other handlers.
CMM activity will be halted when hotplug remove operations are active
and resume activity after a delay period to allow the hypervisor time
to adjust.
Signed-off-by: Robert Jennings <[email protected]>
---
arch/powerpc/platforms/pseries/cmm.c | 211 ++++++++++++++++++++++++++++++++++-
1 file changed, 205 insertions(+), 6 deletions(-)
Index: b/arch/powerpc/platforms/pseries/cmm.c
===================================================================
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -38,19 +38,28 @@
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
+#include <linux/memory.h>
#include "plpar_wrappers.h"
#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15
static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
static unsigned int oom_kb = CMM_OOM_KB;
static unsigned int cmm_debug = CMM_DEBUG;
static unsigned int cmm_disabled = CMM_DISABLE;
@@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION);
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
"[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(hotplug_delay, "Delay (in seconds) after memory hotplug remove "
+ "before activity resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
"[Default=" __stringify(CMM_OOM_KB) "]");
@@ -88,6 +101,8 @@ struct cmm_page_array {
static unsigned long loaned_pages;
static unsigned long loaned_pages_target;
static unsigned long oom_freed_pages;
+static atomic_t hotplug_active = ATOMIC_INIT(0);
+static atomic_t hotplug_occurred = ATOMIC_INIT(0);
static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);
@@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
cmm_dbg("Begin request for %ld pages\n", nr);
while (nr) {
+ if (atomic_read(&hotplug_active))
+ break;
+
addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!addr)
@@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
if (!pa || pa->index >= CMM_NR_PAGES) {
/* Need a new page for the page list. */
spin_unlock(&cmm_lock);
- npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC |
+ __GFP_MOVABLE);
if (!npa) {
pr_info("%s: Can not allocate new page list\n", __func__);
free_page(addr);
@@ -273,9 +293,23 @@ static int cmm_thread(void *dummy)
while (1) {
timeleft = msleep_interruptible(delay * 1000);
- if (kthread_should_stop() || timeleft) {
- loaned_pages_target = loaned_pages;
+ if (kthread_should_stop() || timeleft)
break;
+
+ if (atomic_read(&hotplug_active)) {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
+ }
+
+ if (atomic_dec_if_positive(&hotplug_occurred) >= 0) {
+ cmm_dbg("Hotplug operation has occurred, loaning "
+ "activity suspended for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay * 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
}
cmm_get_mpp();
@@ -405,6 +439,163 @@ static struct notifier_block cmm_reboot_
};
/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long idx;
+ unsigned long start;
+ unsigned long end;
+
+ start = marg->start_addr;
+ end = start + ((unsigned long)marg->nr_pages << PAGE_SHIFT);
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ continue;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+ atomic_set(&hotplug_occurred, 1);
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ atomic_set(&hotplug_active, 1);
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ atomic_set(&hotplug_active, 0);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
* cmm_init - Module initialization
*
* Return value:
@@ -426,18 +617,24 @@ static int cmm_init(void)
if ((rc = cmm_sysfs_register(&cmm_sysdev)))
goto out_reboot_notifier;
+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
if (cmm_disabled)
return rc;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr)) {
rc = PTR_ERR(cmm_thread_ptr);
- goto out_unregister_sysfs;
+ goto out_unregister_notifier;
}
return rc;
-out_unregister_sysfs:
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_unregister_sysfs(&cmm_sysdev);
out_reboot_notifier:
unregister_reboot_notifier(&cmm_reboot_nb);
@@ -458,6 +655,8 @@ static void cmm_exit(void)
kthread_stop(cmm_thread_ptr);
unregister_oom_notifier(&cmm_oom_nb);
unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_free_pages(loaned_pages);
cmm_unregister_sysfs(&cmm_sysdev);
}
Robert Jennings wrote:
> Memory balloon drivers can allocate a large amount of memory which
> is not movable but could be freed to accommodate memory hotplug remove.
>
> Prior to calling the memory hotplug notifier chain the memory in the
> pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> isolation will not proceed, causing the memory removal for that page
> range to fail.
>
> Rather than immediately failing pageblock isolation if the the
> migrateteype is not MIGRATE_MOVABLE, this patch checks if all of the
> pages in the pageblock are owned by a registered balloon driver using a
> notifier chain. If all of the non-movable pages are owned by a balloon,
> they can be freed later through the memory notifier chain and the range
> can still be isolated in set_migratetype_isolate().
>
> Signed-off-by: Robert Jennings <[email protected]>
>
> ---
> drivers/base/memory.c | 19 +++++++++++++++++++
> include/linux/memory.h | 22 ++++++++++++++++++++++
> mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
> 3 files changed, 82 insertions(+), 8 deletions(-)
>
> Index: b/drivers/base/memory.c
> ===================================================================
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> }
> EXPORT_SYMBOL(unregister_memory_notifier);
>
> +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> +
> +int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(register_memory_isolate_notifier);
> +
> +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> +
> /*
> * register_memory - Setup a sysfs device for a memory block
> */
> @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> return blocking_notifier_call_chain(&memory_chain, val, v);
> }
>
> +int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> +}
> +
> /*
> * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> * OK to have direct references to sparsemem variables in here.
> Index: b/include/linux/memory.h
> ===================================================================
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -50,6 +50,14 @@ struct memory_notify {
> int status_change_nid;
> };
>
> +#define MEM_ISOLATE_COUNT (1<<0)
> +
> +struct memory_isolate_notify {
> + unsigned long start_addr;
> + unsigned int nr_pages;
> + unsigned int pages_found;
> +};
> +
> struct notifier_block;
> struct mem_section;
>
> @@ -76,14 +84,28 @@ static inline int memory_notify(unsigned
> {
> return 0;
> }
> +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return 0;
> +}
> +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> +}
> +static inline int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return 0;
> +}
> #else
> extern int register_memory_notifier(struct notifier_block *nb);
> extern void unregister_memory_notifier(struct notifier_block *nb);
> +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> extern int register_new_memory(int, struct mem_section *);
> extern int unregister_memory_section(struct mem_section *);
> extern int memory_dev_init(void);
> extern int remove_memory_block(unsigned long, struct mem_section *, int);
> extern int memory_notify(unsigned long val, void *v);
> +extern int memory_isolate_notify(unsigned long val, void *v);
> extern struct memory_block *find_memory_block(struct mem_section *);
> #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> enum mem_add_context { BOOT, HOTPLUG };
> Index: b/mm/page_alloc.c
> ===================================================================
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,6 +48,7 @@
> #include <linux/page_cgroup.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemleak.h>
> +#include <linux/memory.h>
> #include <trace/events/kmem.h>
>
> #include <asm/tlbflush.h>
> @@ -4985,23 +4986,55 @@ void set_pageblock_flags_group(struct pa
> int set_migratetype_isolate(struct page *page)
> {
> struct zone *zone;
> - unsigned long flags;
> + unsigned long flags, pfn, iter;
> + long immobile = 0;
> + struct memory_isolate_notify arg;
> + int notifier_ret;
> int ret = -EBUSY;
> int zone_idx;
>
> zone = page_zone(page);
> zone_idx = zone_idx(zone);
> +
> + pfn = page_to_pfn(page);
> + arg.start_addr = (unsigned long)page_address(page);
> + arg.nr_pages = pageblock_nr_pages;
> + arg.pages_found = 0;
> +
> spin_lock_irqsave(&zone->lock, flags);
> /*
> * In future, more migrate types will be able to be isolation target.
> */
> - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> - zone_idx != ZONE_MOVABLE)
> - goto out;
> - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> - ret = 0;
> -out:
> + do {
> + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE &&
> + zone_idx == ZONE_MOVABLE) {
> + ret = 0;
> + break;
> + }
> +
> + /*
> + * If all of the pages in a zone are used by a balloon,
> + * the range can be still be isolated. The balloon will
> + * free these pages from the memory notifier chain.
> + */
> + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> + notifier_ret = notifier_to_errno(ret);
Should this be
notifier_ret = notifier_to_errno(notifier_ret);
-Nathan
> + if (notifier_ret || !arg.pages_found)
> + break;
> +
> + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> + if (page_count(pfn_to_page(iter)))
> + immobile++;
> +
> + if (arg.pages_found == immobile)
> + ret = 0;
> + } while (0);
> +
> + if (!ret) {
> + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> + }
> +
> spin_unlock_irqrestore(&zone->lock, flags);
> if (!ret)
> drain_all_pages();
> _______________________________________________
> Linuxppc-dev mailing list
> [email protected]
> https://lists.ozlabs.org/listinfo/linuxppc-dev
On Thu, Oct 01, 2009 at 02:53:11PM -0500, Robert Jennings wrote:
> Memory balloon drivers can allocate a large amount of memory which
> is not movable but could be freed to accommodate memory hotplug remove.
>
> Prior to calling the memory hotplug notifier chain the memory in the
> pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> isolation will not proceed, causing the memory removal for that page
> range to fail.
>
> Rather than immediately failing pageblock isolation if the the
> migrateteype is not MIGRATE_MOVABLE, this patch checks if all of the
> pages in the pageblock are owned by a registered balloon driver using a
> notifier chain. If all of the non-movable pages are owned by a balloon,
> they can be freed later through the memory notifier chain and the range
> can still be isolated in set_migratetype_isolate().
>
> Signed-off-by: Robert Jennings <[email protected]>
>
> ---
> drivers/base/memory.c | 19 +++++++++++++++++++
> include/linux/memory.h | 22 ++++++++++++++++++++++
> mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
> 3 files changed, 82 insertions(+), 8 deletions(-)
>
> Index: b/drivers/base/memory.c
> ===================================================================
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> }
> EXPORT_SYMBOL(unregister_memory_notifier);
>
> +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> +
> +int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(register_memory_isolate_notifier);
> +
> +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> +}
> +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> +
> /*
> * register_memory - Setup a sysfs device for a memory block
> */
> @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> return blocking_notifier_call_chain(&memory_chain, val, v);
> }
>
> +int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> +}
> +
> /*
> * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> * OK to have direct references to sparsemem variables in here.
> Index: b/include/linux/memory.h
> ===================================================================
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -50,6 +50,14 @@ struct memory_notify {
> int status_change_nid;
> };
>
> +#define MEM_ISOLATE_COUNT (1<<0)
> +
This needs a comment explaining that that this is an action to count the
number of pages within a range that have been isolated within a range of
pages and not a default value for "nr_pages" in the next structure.
> +struct memory_isolate_notify {
> + unsigned long start_addr;
> + unsigned int nr_pages;
> + unsigned int pages_found;
> +};
Is there any particular reason you used virtual address of the mapped
page instead of PFN? I am guessing at this point that the balloon driver
is based on addresses but the code that populates the structure more
commonly deals with PFNs. Outside of debugging code, page_address is
rarely used in mm/page_alloc.c .
It's picky but it feels more natural to me to have the structure have
start_pfn and nr_pages or start_addr and end_addr but not a mix of both.
> +
> struct notifier_block;
> struct mem_section;
>
> @@ -76,14 +84,28 @@ static inline int memory_notify(unsigned
> {
> return 0;
> }
> +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> +{
> + return 0;
> +}
> +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> +{
> +}
> +static inline int memory_isolate_notify(unsigned long val, void *v)
> +{
> + return 0;
> +}
> #else
> extern int register_memory_notifier(struct notifier_block *nb);
> extern void unregister_memory_notifier(struct notifier_block *nb);
> +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> extern int register_new_memory(int, struct mem_section *);
> extern int unregister_memory_section(struct mem_section *);
> extern int memory_dev_init(void);
> extern int remove_memory_block(unsigned long, struct mem_section *, int);
> extern int memory_notify(unsigned long val, void *v);
> +extern int memory_isolate_notify(unsigned long val, void *v);
> extern struct memory_block *find_memory_block(struct mem_section *);
> #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> enum mem_add_context { BOOT, HOTPLUG };
> Index: b/mm/page_alloc.c
> ===================================================================
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,6 +48,7 @@
> #include <linux/page_cgroup.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemleak.h>
> +#include <linux/memory.h>
> #include <trace/events/kmem.h>
>
> #include <asm/tlbflush.h>
> @@ -4985,23 +4986,55 @@ void set_pageblock_flags_group(struct pa
> int set_migratetype_isolate(struct page *page)
> {
> struct zone *zone;
> - unsigned long flags;
> + unsigned long flags, pfn, iter;
> + long immobile = 0;
So, the count in the structure is unsigned long, but long here. Why the
difference in types?
> + struct memory_isolate_notify arg;
> + int notifier_ret;
> int ret = -EBUSY;
> int zone_idx;
>
> zone = page_zone(page);
> zone_idx = zone_idx(zone);
> +
> + pfn = page_to_pfn(page);
> + arg.start_addr = (unsigned long)page_address(page);
> + arg.nr_pages = pageblock_nr_pages;
> + arg.pages_found = 0;
> +
> spin_lock_irqsave(&zone->lock, flags);
> /*
> * In future, more migrate types will be able to be isolation target.
> */
> - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> - zone_idx != ZONE_MOVABLE)
> - goto out;
> - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> - ret = 0;
> -out:
> + do {
> + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE &&
> + zone_idx == ZONE_MOVABLE) {
So, this condition requires the zone be MOVABLE and the migrate type
be movable. That prevents MIGRATE_RESERVE regions in ZONE_MOVABLE being
off-lined even though they can likely be off-lined. It also prevents
MIGRATE_MOVABLE sections in other zones being off-lined.
Did you mean || here instead of && ?
Might want to expand the comment explaining this condition instead of
leaving it in the old location which is confusing.
> + ret = 0;
> + break;
> + }
Why do you wrap all this in a do {} while(0) instead of preserving the
out: label and using goto?
> +
> + /*
> + * If all of the pages in a zone are used by a balloon,
> + * the range can be still be isolated. The balloon will
> + * free these pages from the memory notifier chain.
> + */
> + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> + notifier_ret = notifier_to_errno(ret);
> + if (notifier_ret || !arg.pages_found)
> + break;
> +
> + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> + if (page_count(pfn_to_page(iter)))
> + immobile++;
> +
> + if (arg.pages_found == immobile)
and here you compare a signed with an unsigned type. Probably harmless
but why do it?
> + ret = 0;
> + } while (0);
> +
So the out label would go here and you'd get rid of the do {} while(0)
loop.
> + if (!ret) {
> + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> + }
> +
> spin_unlock_irqrestore(&zone->lock, flags);
> if (!ret)
> drain_all_pages();
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Nathan Fontenot ([email protected]) wrote:
> Robert Jennings wrote:
>> Memory balloon drivers can allocate a large amount of memory which
>> is not movable but could be freed to accommodate memory hotplug remove.
>>
>> Prior to calling the memory hotplug notifier chain the memory in the
>> pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
>> isolation will not proceed, causing the memory removal for that page
>> range to fail.
>>
>> Rather than immediately failing pageblock isolation if the the
>> migrateteype is not MIGRATE_MOVABLE, this patch checks if all of the
>> pages in the pageblock are owned by a registered balloon driver using a
>> notifier chain. If all of the non-movable pages are owned by a balloon,
>> they can be freed later through the memory notifier chain and the range
>> can still be isolated in set_migratetype_isolate().
>>
>> Signed-off-by: Robert Jennings <[email protected]>
>>
>> ---
>> drivers/base/memory.c | 19 +++++++++++++++++++
>> include/linux/memory.h | 22 ++++++++++++++++++++++
>> mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
>> 3 files changed, 82 insertions(+), 8 deletions(-)
>>
<snip>
>> Index: b/mm/page_alloc.c
>> ===================================================================
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -48,6 +48,7 @@
>> #include <linux/page_cgroup.h>
>> #include <linux/debugobjects.h>
>> #include <linux/kmemleak.h>
>> +#include <linux/memory.h>
>> #include <trace/events/kmem.h>
>> #include <asm/tlbflush.h>
>> @@ -4985,23 +4986,55 @@ void set_pageblock_flags_group(struct pa
>> int set_migratetype_isolate(struct page *page)
>> {
>> struct zone *zone;
>> - unsigned long flags;
>> + unsigned long flags, pfn, iter;
>> + long immobile = 0;
>> + struct memory_isolate_notify arg;
>> + int notifier_ret;
>> int ret = -EBUSY;
>> int zone_idx;
>> zone = page_zone(page);
>> zone_idx = zone_idx(zone);
>> +
>> + pfn = page_to_pfn(page);
>> + arg.start_addr = (unsigned long)page_address(page);
>> + arg.nr_pages = pageblock_nr_pages;
>> + arg.pages_found = 0;
>> +
>> spin_lock_irqsave(&zone->lock, flags);
>> /*
>> * In future, more migrate types will be able to be isolation target.
>> */
>> - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
>> - zone_idx != ZONE_MOVABLE)
>> - goto out;
>> - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
>> - move_freepages_block(zone, page, MIGRATE_ISOLATE);
>> - ret = 0;
>> -out:
>> + do {
>> + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE &&
>> + zone_idx == ZONE_MOVABLE) {
>> + ret = 0;
>> + break;
>> + }
>> +
>> + /*
>> + * If all of the pages in a zone are used by a balloon,
>> + * the range can still be isolated. The balloon will
>> + * free these pages from the memory notifier chain.
>> + */
>> + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
>> + notifier_ret = notifier_to_errno(ret);
>
> Should this be
>
> notifier_ret = notifier_to_errno(notifier_ret);
>
> -Nathan
I'll correct this. Thanks
* Mel Gorman ([email protected]) wrote:
> On Thu, Oct 01, 2009 at 02:53:11PM -0500, Robert Jennings wrote:
> > Memory balloon drivers can allocate a large amount of memory which
> > is not movable but could be freed to accommodate memory hotplug remove.
> >
> > Prior to calling the memory hotplug notifier chain the memory in the
> > pageblock is isolated. If the migrate type is not MIGRATE_MOVABLE the
> > isolation will not proceed, causing the memory removal for that page
> > range to fail.
> >
> > Rather than immediately failing pageblock isolation if the
> > migratetype is not MIGRATE_MOVABLE, this patch checks if all of the
> > pages in the pageblock are owned by a registered balloon driver using a
> > notifier chain. If all of the non-movable pages are owned by a balloon,
> > they can be freed later through the memory notifier chain and the range
> > can still be isolated in set_migratetype_isolate().
> >
> > Signed-off-by: Robert Jennings <[email protected]>
> >
> > ---
> > drivers/base/memory.c | 19 +++++++++++++++++++
> > include/linux/memory.h | 22 ++++++++++++++++++++++
> > mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++--------
> > 3 files changed, 82 insertions(+), 8 deletions(-)
> >
> > Index: b/drivers/base/memory.c
> > ===================================================================
> > --- a/drivers/base/memory.c
> > +++ b/drivers/base/memory.c
> > @@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
> > }
> > EXPORT_SYMBOL(unregister_memory_notifier);
> >
> > +static BLOCKING_NOTIFIER_HEAD(memory_isolate_chain);
> > +
> > +int register_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + return blocking_notifier_chain_register(&memory_isolate_chain, nb);
> > +}
> > +EXPORT_SYMBOL(register_memory_isolate_notifier);
> > +
> > +void unregister_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + blocking_notifier_chain_unregister(&memory_isolate_chain, nb);
> > +}
> > +EXPORT_SYMBOL(unregister_memory_isolate_notifier);
> > +
> > /*
> > * register_memory - Setup a sysfs device for a memory block
> > */
> > @@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
> > return blocking_notifier_call_chain(&memory_chain, val, v);
> > }
> >
> > +int memory_isolate_notify(unsigned long val, void *v)
> > +{
> > + return blocking_notifier_call_chain(&memory_isolate_chain, val, v);
> > +}
> > +
> > /*
> > * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
> > * OK to have direct references to sparsemem variables in here.
> > Index: b/include/linux/memory.h
> > ===================================================================
> > --- a/include/linux/memory.h
> > +++ b/include/linux/memory.h
> > @@ -50,6 +50,14 @@ struct memory_notify {
> > int status_change_nid;
> > };
> >
> > +#define MEM_ISOLATE_COUNT (1<<0)
> > +
>
> This needs a comment explaining that this is an action to count the
> number of pages within a range that have been isolated, and not a
> default value for "nr_pages" in the next structure.
I'll provide a clear explanation for this.
> > +struct memory_isolate_notify {
> > + unsigned long start_addr;
> > + unsigned int nr_pages;
> > + unsigned int pages_found;
> > +};
>
> Is there any particular reason you used virtual address of the mapped
> page instead of PFN? I am guessing at this point that the balloon driver
> is based on addresses but the code that populates the structure more
> commonly deals with PFNs. Outside of debugging code, page_address is
> rarely used in mm/page_alloc.c .
>
> It's picky but it feels more natural to me to have the structure have
> start_pfn and nr_pages or start_addr and end_addr but not a mix of both.
Changing this to use start_pfn and nr_pages, this will also match
struct memory_notify. Thanks for the review of this patch.
> > +
> > struct notifier_block;
> > struct mem_section;
> >
> > @@ -76,14 +84,28 @@ static inline int memory_notify(unsigned
> > {
> > return 0;
> > }
> > +static inline int register_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > + return 0;
> > +}
> > +static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
> > +{
> > +}
> > +static inline int memory_isolate_notify(unsigned long val, void *v)
> > +{
> > + return 0;
> > +}
> > #else
> > extern int register_memory_notifier(struct notifier_block *nb);
> > extern void unregister_memory_notifier(struct notifier_block *nb);
> > +extern int register_memory_isolate_notifier(struct notifier_block *nb);
> > +extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> > extern int register_new_memory(int, struct mem_section *);
> > extern int unregister_memory_section(struct mem_section *);
> > extern int memory_dev_init(void);
> > extern int remove_memory_block(unsigned long, struct mem_section *, int);
> > extern int memory_notify(unsigned long val, void *v);
> > +extern int memory_isolate_notify(unsigned long val, void *v);
> > extern struct memory_block *find_memory_block(struct mem_section *);
> > #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
> > enum mem_add_context { BOOT, HOTPLUG };
> > Index: b/mm/page_alloc.c
> > ===================================================================
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -48,6 +48,7 @@
> > #include <linux/page_cgroup.h>
> > #include <linux/debugobjects.h>
> > #include <linux/kmemleak.h>
> > +#include <linux/memory.h>
> > #include <trace/events/kmem.h>
> >
> > #include <asm/tlbflush.h>
> > @@ -4985,23 +4986,55 @@ void set_pageblock_flags_group(struct pa
> > int set_migratetype_isolate(struct page *page)
> > {
> > struct zone *zone;
> > - unsigned long flags;
> > + unsigned long flags, pfn, iter;
> > + long immobile = 0;
>
> So, the count in the structure is unsigned long, but long here. Why the
> difference in types?
No good reason, both will be unsigned long when I repost the patch.
> > + struct memory_isolate_notify arg;
> > + int notifier_ret;
> > int ret = -EBUSY;
> > int zone_idx;
> >
> > zone = page_zone(page);
> > zone_idx = zone_idx(zone);
> > +
> > + pfn = page_to_pfn(page);
> > + arg.start_addr = (unsigned long)page_address(page);
> > + arg.nr_pages = pageblock_nr_pages;
> > + arg.pages_found = 0;
> > +
> > spin_lock_irqsave(&zone->lock, flags);
> > /*
> > * In future, more migrate types will be able to be isolation target.
> > */
> > - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
> > - zone_idx != ZONE_MOVABLE)
> > - goto out;
> > - set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > - move_freepages_block(zone, page, MIGRATE_ISOLATE);
> > - ret = 0;
> > -out:
> > + do {
> > + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE &&
> > + zone_idx == ZONE_MOVABLE) {
>
> So, this condition requires the zone be MOVABLE and the migrate type
> be movable. That prevents MIGRATE_RESERVE regions in ZONE_MOVABLE being
> off-lined even though they can likely be off-lined. It also prevents
> MIGRATE_MOVABLE sections in other zones being off-lined.
>
> Did you mean || here instead of && ?
>
> Might want to expand the comment explaining this condition instead of
> leaving it in the old location which is confusing.
I will fix the logic and clean up the comments here.
> > + ret = 0;
> > + break;
> > + }
>
> Why do you wrap all this in a do {} while(0) instead of preserving the
> out: label and using goto?
I've put this back to how it was.
> > +
> > + /*
> > + * If all of the pages in a zone are used by a balloon,
> > + * the range can still be isolated. The balloon will
> > + * free these pages from the memory notifier chain.
> > + */
> > + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
> > + notifier_ret = notifier_to_errno(ret);
> > + if (notifier_ret || !arg.pages_found)
> > + break;
> > +
> > + for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++)
> > + if (page_count(pfn_to_page(iter)))
> > + immobile++;
> > +
> > + if (arg.pages_found == immobile)
>
> and here you compare a signed with an unsigned type. Probably harmless
> but why do it?
This is corrected by making both variables unsigned longs.
> > + ret = 0;
> > + } while (0);
> > +
>
> So the out label would go here and you'd get rid of the do {} while(0)
> loop.
Fixed.
> > + if (!ret) {
> > + set_pageblock_migratetype(page, MIGRATE_ISOLATE);
> > + move_freepages_block(zone, page, MIGRATE_ISOLATE);
> > + }
> > +
> > spin_unlock_irqrestore(&zone->lock, flags);
> > if (!ret)
> > drain_all_pages();