2009-10-09 20:38:45

by Robert Jennings

[permalink] [raw]
Subject: [PATCH 1/2][v3] mm: add notifier in pageblock isolation for balloon drivers

Memory balloon drivers can allocate a large amount of memory which
is not movable but could be freed to accomodate memory hotplug remove.

Prior to calling the memory hotplug notifier chain the memory in
the pageblock is isolated. Currently, if the migrate type is not
MIGRATE_MOVABLE the isolation will not proceed, causing the memory
removal for that page range to fail.

Rather than failing pageblock isolation if the migrateteype is not
MIGRATE_MOVABLE, this patch checks if all of the pages in the pageblock,
and not on the LRU, are owned by a registered balloon driver (or other
entity) using a notifier chain. If all of the non-movable pages are
owned by a balloon, they can be freed later through the memory notifier
chain and the range can still be isolated in set_migratetype_isolate().

Signed-off-by: Robert Jennings <[email protected]>

---

Testing:
* With the patch memory remove succeeds for pageblocks containing balloon
allocations. The balloon driver frees pages in the ranges being
taken offline. Prior to the patch, pageblocks with balloon allocations
could not be taken offline.
* Tested with CONFIG_MEMORY_HOTPLUG_SPARSE=n to ensure that the patches
did not affect that configuration.

Changes since v2:
* comments cleaned up based on patch reviews.
* documented variables in struct memory_isolate_notify.
* made search for active pages in set_isolate_pageblock safe if
CONFIG_HOLES_IN_ZONE is set.
* changed notifier chain from BLOCKING to ATOMIC.
* added check for pages on an LRU to be considered movable.

drivers/base/memory.c | 19 ++++++++++++++++
include/linux/memory.h | 27 +++++++++++++++++++++++
mm/page_alloc.c | 57 +++++++++++++++++++++++++++++++++++++++++++------
3 files changed, 96 insertions(+), 7 deletions(-)

Index: b/drivers/base/memory.c
===================================================================
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -63,6 +63,20 @@ void unregister_memory_notifier(struct n
}
EXPORT_SYMBOL(unregister_memory_notifier);

+static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);
+
+int register_memory_isolate_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&memory_isolate_chain, nb);
+}
+EXPORT_SYMBOL(register_memory_isolate_notifier);
+
+void unregister_memory_isolate_notifier(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
+}
+EXPORT_SYMBOL(unregister_memory_isolate_notifier);
+
/*
* register_memory - Setup a sysfs device for a memory block
*/
@@ -157,6 +171,11 @@ int memory_notify(unsigned long val, voi
return blocking_notifier_call_chain(&memory_chain, val, v);
}

+int memory_isolate_notify(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
+}
+
/*
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
* OK to have direct references to sparsemem variables in here.
Index: b/include/linux/memory.h
===================================================================
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -50,6 +50,19 @@ struct memory_notify {
int status_change_nid;
};

+/*
+ * During pageblock isolation, count the number of pages within the
+ * range [start_pfn, start_pfn + nr_pages) which are owned by code
+ * in the notifier chain.
+ */
+#define MEM_ISOLATE_COUNT (1<<0)
+
+struct memory_isolate_notify {
+ unsigned long start_pfn; /* Start of range to check */
+ unsigned int nr_pages; /* # pages in range to check */
+ unsigned int pages_found; /* # pages owned found by callbacks */
+};
+
struct notifier_block;
struct mem_section;

@@ -76,14 +89,28 @@ static inline int memory_notify(unsigned
{
return 0;
}
+static inline int register_memory_isolate_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
+{
+}
+static inline int memory_isolate_notify(unsigned long val, void *v)
+{
+ return 0;
+}
#else
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
+extern int register_memory_isolate_notifier(struct notifier_block *nb);
+extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
extern int register_new_memory(int, struct mem_section *);
extern int unregister_memory_section(struct mem_section *);
extern int memory_dev_init(void);
extern int remove_memory_block(unsigned long, struct mem_section *, int);
extern int memory_notify(unsigned long val, void *v);
+extern int memory_isolate_notify(unsigned long val, void *v);
extern struct memory_block *find_memory_block(struct mem_section *);
#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
enum mem_add_context { BOOT, HOTPLUG };
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
+#include <linux/memory.h>
#include <trace/events/kmem.h>

#include <asm/tlbflush.h>
@@ -5003,23 +5004,65 @@ void set_pageblock_flags_group(struct pa
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- unsigned long flags;
+ struct page *curr_page;
+ unsigned long flags, pfn, iter;
+ unsigned long immobile = 0;
+ struct memory_isolate_notify arg;
+ int notifier_ret;
int ret = -EBUSY;
int zone_idx;

zone = page_zone(page);
zone_idx = zone_idx(zone);
+
spin_lock_irqsave(&zone->lock, flags);
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
+ zone_idx == ZONE_MOVABLE) {
+ ret = 0;
+ goto out;
+ }
+
+ pfn = page_to_pfn(page);
+ arg.start_pfn = pfn;
+ arg.nr_pages = pageblock_nr_pages;
+ arg.pages_found = 0;
+
/*
- * In future, more migrate types will be able to be isolation target.
+ * It may be possible to isolate a pageblock even if the
+ * migratetype is not MIGRATE_MOVABLE. The memory isolation
+ * notifier chain is used by balloon drivers to return the
+ * number of pages in a range that are held by the balloon
+ * driver to shrink memory. If all the pages are accounted for
+ * by balloons, are free, or on the LRU, isolation can continue.
+ * Later, for example, when memory hotplug notifier runs, these
+ * pages reported as "can be isolated" should be isolated(freed)
+ * by the balloon driver through the memory notifier chain.
*/
- if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
- zone_idx != ZONE_MOVABLE)
+ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+ notifier_ret = notifier_to_errno(notifier_ret);
+ if (notifier_ret || !arg.pages_found)
goto out;
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
- ret = 0;
+
+ for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+
+ curr_page = pfn_to_page(iter);
+ if (!page_count(curr_page) || PageLRU(curr_page))
+ continue;
+
+ immobile++;
+ }
+
+ if (arg.pages_found == immobile)
+ ret = 0;
+
out:
+ if (!ret) {
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
if (!ret)
drain_all_pages();


2009-10-09 20:42:09

by Robert Jennings

[permalink] [raw]
Subject: [PATCH 2/2][v3] powerpc: Make the CMM memory hotplug aware

The Collaborative Memory Manager (CMM) module allocates individual pages
over time that are not migratable. On a long running system this can
severely impact the ability to find enough pages to support a hotplug
memory remove operation.

This patch adds a memory isolation notifier and a memory hotplug notifier.
The memory isolation notifier will return the number of pages found
in the range specified. This is used to determine if all of the used
pages in a pageblock are owned by the balloon (or other entities in
the notifier chain). The hotplug notifier will free pages in the range
which is to be removed. The priority of this hotplug notifier is low
so that it will be called near last, this helps avoids removing loaned
pages in operations that fail due to other handlers.

CMM activity will be halted when hotplug remove operations are active
and resume activity after a delay period to allow the hypervisor time
to adjust.

Signed-off-by: Robert Jennings <[email protected]>

---

I'm not entirely sure of the ettiquette, but there are no changes to
this patch. I'm resending to keep it with the changes to the parent
patch.

arch/powerpc/platforms/pseries/cmm.c | 207 ++++++++++++++++++++++++++++++++++-
1 file changed, 201 insertions(+), 6 deletions(-)

Index: b/arch/powerpc/platforms/pseries/cmm.c
===================================================================
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -38,19 +38,28 @@
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
+#include <linux/memory.h>

#include "plpar_wrappers.h"

#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15

static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
static unsigned int oom_kb = CMM_OOM_KB;
static unsigned int cmm_debug = CMM_DEBUG;
static unsigned int cmm_disabled = CMM_DISABLE;
@@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION);
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
"[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+ "before activity resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
"[Default=" __stringify(CMM_OOM_KB) "]");
@@ -88,6 +101,8 @@ struct cmm_page_array {
static unsigned long loaned_pages;
static unsigned long loaned_pages_target;
static unsigned long oom_freed_pages;
+static atomic_t hotplug_active = ATOMIC_INIT(0);
+static atomic_t hotplug_occurred = ATOMIC_INIT(0);

static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);
@@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
cmm_dbg("Begin request for %ld pages\n", nr);

while (nr) {
+ if (atomic_read(&hotplug_active))
+ break;
+
addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!addr)
@@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
if (!pa || pa->index >= CMM_NR_PAGES) {
/* Need a new page for the page list. */
spin_unlock(&cmm_lock);
- npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC |
+ __GFP_MOVABLE);
if (!npa) {
pr_info("%s: Can not allocate new page list\n", __func__);
free_page(addr);
@@ -273,9 +293,23 @@ static int cmm_thread(void *dummy)
while (1) {
timeleft = msleep_interruptible(delay * 1000);

- if (kthread_should_stop() || timeleft) {
- loaned_pages_target = loaned_pages;
+ if (kthread_should_stop() || timeleft)
break;
+
+ if (atomic_read(&hotplug_active)) {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
+ }
+
+ if (atomic_dec_if_positive(&hotplug_occurred) >= 0) {
+ cmm_dbg("Hotplug operation has occurred, loaning "
+ "activity suspended for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay * 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
}

cmm_get_mpp();
@@ -405,6 +439,159 @@ static struct notifier_block cmm_reboot_
};

/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+ unsigned long idx;
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ continue;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+ atomic_set(&hotplug_occurred, 1);
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ atomic_set(&hotplug_active, 1);
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ atomic_set(&hotplug_active, 0);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
* cmm_init - Module initialization
*
* Return value:
@@ -426,18 +613,24 @@ static int cmm_init(void)
if ((rc = cmm_sysfs_register(&cmm_sysdev)))
goto out_reboot_notifier;

+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
if (cmm_disabled)
return rc;

cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr)) {
rc = PTR_ERR(cmm_thread_ptr);
- goto out_unregister_sysfs;
+ goto out_unregister_notifier;
}

return rc;

-out_unregister_sysfs:
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_unregister_sysfs(&cmm_sysdev);
out_reboot_notifier:
unregister_reboot_notifier(&cmm_reboot_nb);
@@ -458,6 +651,8 @@ static void cmm_exit(void)
kthread_stop(cmm_thread_ptr);
unregister_oom_notifier(&cmm_oom_nb);
unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_free_pages(loaned_pages);
cmm_unregister_sysfs(&cmm_sysdev);
}

2009-10-12 05:09:19

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH 2/2][v3] powerpc: Make the CMM memory hotplug aware

On Fri, 2009-10-09 at 15:41 -0500, Robert Jennings wrote:
> The Collaborative Memory Manager (CMM) module allocates individual pages
> over time that are not migratable. On a long running system this can
> severely impact the ability to find enough pages to support a hotplug
> memory remove operation.
>
> This patch adds a memory isolation notifier and a memory hotplug notifier.
> The memory isolation notifier will return the number of pages found
> in the range specified. This is used to determine if all of the used
> pages in a pageblock are owned by the balloon (or other entities in
> the notifier chain). The hotplug notifier will free pages in the range
> which is to be removed. The priority of this hotplug notifier is low
> so that it will be called near last, this helps avoids removing loaned
> pages in operations that fail due to other handlers.
>
> CMM activity will be halted when hotplug remove operations are active
> and resume activity after a delay period to allow the hypervisor time
> to adjust.
>
> Signed-off-by: Robert Jennings <[email protected]>

Do you need me to merge that via the powerpc tree after the relevant
generic parts go in ? This is 2.6.33 material ?

> +module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
> + "before activity resumes. "
> + "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");

What is the above ? That sounds scary :-)

> module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
> MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
> "[Default=" __stringify(CMM_OOM_KB) "]");
> @@ -88,6 +101,8 @@ struct cmm_page_array {
> static unsigned long loaned_pages;
> static unsigned long loaned_pages_target;
> static unsigned long oom_freed_pages;
> +static atomic_t hotplug_active = ATOMIC_INIT(0);
> +static atomic_t hotplug_occurred = ATOMIC_INIT(0);

That sounds like a hand made lock with atomics... rarely a good idea,
tends to miss appropriate barriers etc...

> static struct cmm_page_array *cmm_page_list;
> static DEFINE_SPINLOCK(cmm_lock);
> @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
> cmm_dbg("Begin request for %ld pages\n", nr);
>
> while (nr) {
> + if (atomic_read(&hotplug_active))
> + break;
> +

Ok so I'm not familiar with that whole memory hotplug stuff, so the code
might be right, but wouldn't the above be racy anyways in case hotplug
just becomes active after this statement ?

Shouldn't you use a mutex_trylock instead ? That has clearer semantics
and will provide the appropriate memory barriers.

> addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
> __GFP_NORETRY | __GFP_NOMEMALLOC);
> if (!addr)
> @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
> if (!pa || pa->index >= CMM_NR_PAGES) {
> /* Need a new page for the page list. */
> spin_unlock(&cmm_lock);
> - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
> - __GFP_NORETRY | __GFP_NOMEMALLOC);
> + npa = (struct cmm_page_array *)__get_free_page(
> + GFP_NOIO | __GFP_NOWARN |
> + __GFP_NORETRY | __GFP_NOMEMALLOC |
> + __GFP_MOVABLE);
> if (!npa) {
> pr_info("%s: Can not allocate new page list\n", __func__);
> free_page(addr);
> @@ -273,9 +293,23 @@ static int cmm_thread(void *dummy)
> while (1) {
> timeleft = msleep_interruptible(delay * 1000);
>
> - if (kthread_should_stop() || timeleft) {
> - loaned_pages_target = loaned_pages;
> + if (kthread_should_stop() || timeleft)
> break;
> +
> + if (atomic_read(&hotplug_active)) {
> + cmm_dbg("Hotplug operation in progress, activity "
> + "suspended\n");
> + continue;
> + }
> +
> + if (atomic_dec_if_positive(&hotplug_occurred) >= 0) {
> + cmm_dbg("Hotplug operation has occurred, loaning "
> + "activity suspended for %d seconds.\n",
> + hotplug_delay);
> + timeleft = msleep_interruptible(hotplug_delay * 1000);
> + if (kthread_should_stop() || timeleft)
> + break;
> + continue;
> }

I have less problems with hotplug_occured but if you use a
mutex_trylock, overall, you can turn the above into a normal int instead
of an atomic.

../..

> +static int cmm_memory_cb(struct notifier_block *self,
> + unsigned long action, void *arg)
> +{
> + int ret = 0;
> +
> + switch (action) {
> + case MEM_GOING_OFFLINE:
> + atomic_set(&hotplug_active, 1);

So that would become a mutex_lock(). Added advantage is that
it would wait for a current CMM operation to complete.

Cheers,
Ben.

2009-10-12 19:24:28

by Robert Jennings

[permalink] [raw]
Subject: Re: [PATCH 2/2][v3] powerpc: Make the CMM memory hotplug aware

* Benjamin Herrenschmidt ([email protected]) wrote:
> On Fri, 2009-10-09 at 15:41 -0500, Robert Jennings wrote:
> > The Collaborative Memory Manager (CMM) module allocates individual pages
> > over time that are not migratable. On a long running system this can
> > severely impact the ability to find enough pages to support a hotplug
> > memory remove operation.
> >
> > This patch adds a memory isolation notifier and a memory hotplug notifier.
> > The memory isolation notifier will return the number of pages found
> > in the range specified. This is used to determine if all of the used
> > pages in a pageblock are owned by the balloon (or other entities in
> > the notifier chain). The hotplug notifier will free pages in the range
> > which is to be removed. The priority of this hotplug notifier is low
> > so that it will be called near last, this helps avoids removing loaned
> > pages in operations that fail due to other handlers.
> >
> > CMM activity will be halted when hotplug remove operations are active
> > and resume activity after a delay period to allow the hypervisor time
> > to adjust.
> >
> > Signed-off-by: Robert Jennings <[email protected]>
>
> Do you need me to merge that via the powerpc tree after the relevant
> generic parts go in ? This is 2.6.33 material ?

I didn't know how this part works honestly, this is the first time I've
pushed patches with dependencies like this. Andrew Morton had pulled
an earlier version of this and the mm hotplug related changes for 2.6.32.

> > +module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
> > +MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
> > + "before activity resumes. "
> > + "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
>
> What is the above ? That sounds scary :-)

I'm changing this to read "Delay (in seconds) after memory hotplug
remove before loaning resumes." in order to clear this us. This is a
period where loaning from the balloon is paused after the hotplug
completes. This gives the userspace tools time to mark the sections
as isolated and unusable with the hypervisor and the hypervisor to take
this into account regarding the loaning levels it requests of the OS.

> > module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
> > MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
> > "[Default=" __stringify(CMM_OOM_KB) "]");
> > @@ -88,6 +101,8 @@ struct cmm_page_array {
> > static unsigned long loaned_pages;
> > static unsigned long loaned_pages_target;
> > static unsigned long oom_freed_pages;
> > +static atomic_t hotplug_active = ATOMIC_INIT(0);
> > +static atomic_t hotplug_occurred = ATOMIC_INIT(0);
>
> That sounds like a hand made lock with atomics... rarely a good idea,
> tends to miss appropriate barriers etc...
>

I have changes this so that we have a mutex held during the memory
hotplug remove. The hotplug_occurred flag is now and integer protected
by the mutex; it is used to provide the delay after the hotplug remove
completes.

> > static struct cmm_page_array *cmm_page_list;
> > static DEFINE_SPINLOCK(cmm_lock);
> > @@ -110,6 +125,9 @@ static long cmm_alloc_pages(long nr)
> > cmm_dbg("Begin request for %ld pages\n", nr);
> >
> > while (nr) {
> > + if (atomic_read(&hotplug_active))
> > + break;
> > +
>
> Ok so I'm not familiar with that whole memory hotplug stuff, so the code
> might be right, but wouldn't the above be racy anyways in case hotplug
> just becomes active after this statement ?
>
> Shouldn't you use a mutex_trylock instead ? That has clearer semantics
> and will provide the appropriate memory barriers.

I have changed this to use a mutex in the same location. This allows
the allocation of pages to terminate early during a hotplug remove
operation.

If hotplug becomes active after this check in cmm_alloc_pages() one page
will be allocated to the balloon, but this page will not belong to the
memory range going offline because the pageblock will have already been
isolated. After allocating the page we might need to wait on the lock to
add the page to the list if hotplug is removing pages from the balloon.
After one page is added, cmm_alloc_pages() will exit early when it checks
to see if hotplug is active or if it has occurred.

I wanted to keep the section locked by cmm_lock small, so that we can
safely respond to the hotplug notifier as quickly as possible while
minimizing memory pressure.

There are no checks in cmm_free_pages() to have it abort early during
hotplug memory remove with the rationale that it's good for hotplug
to allow the balloon to shrink even if it means holding the cmm_lock a
bit longer.

> > addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
> > __GFP_NORETRY | __GFP_NOMEMALLOC);
> > if (!addr)
> > @@ -119,8 +137,10 @@ static long cmm_alloc_pages(long nr)
> > if (!pa || pa->index >= CMM_NR_PAGES) {
> > /* Need a new page for the page list. */
> > spin_unlock(&cmm_lock);
> > - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
> > - __GFP_NORETRY | __GFP_NOMEMALLOC);
> > + npa = (struct cmm_page_array *)__get_free_page(
> > + GFP_NOIO | __GFP_NOWARN |
> > + __GFP_NORETRY | __GFP_NOMEMALLOC |
> > + __GFP_MOVABLE);
> > if (!npa) {
> > pr_info("%s: Can not allocate new page list\n", __func__);
> > free_page(addr);
> > @@ -273,9 +293,23 @@ static int cmm_thread(void *dummy)
> > while (1) {
> > timeleft = msleep_interruptible(delay * 1000);
> >
> > - if (kthread_should_stop() || timeleft) {
> > - loaned_pages_target = loaned_pages;
> > + if (kthread_should_stop() || timeleft)
> > break;
> > +
> > + if (atomic_read(&hotplug_active)) {
> > + cmm_dbg("Hotplug operation in progress, activity "
> > + "suspended\n");
> > + continue;
> > + }
> > +
> > + if (atomic_dec_if_positive(&hotplug_occurred) >= 0) {
> > + cmm_dbg("Hotplug operation has occurred, loaning "
> > + "activity suspended for %d seconds.\n",
> > + hotplug_delay);
> > + timeleft = msleep_interruptible(hotplug_delay * 1000);
> > + if (kthread_should_stop() || timeleft)
> > + break;
> > + continue;
> > }
>
> I have less problems with hotplug_occured but if you use a
> mutex_trylock, overall, you can turn the above into a normal int instead
> of an atomic.

Changed to a mutex to indicate that a hotplug operation is active and an
int protected by the mutex to indicate that a hotplug operation has
occurred.

> ../..
>
> > +static int cmm_memory_cb(struct notifier_block *self,
> > + unsigned long action, void *arg)
> > +{
> > + int ret = 0;
> > +
> > + switch (action) {
> > + case MEM_GOING_OFFLINE:
> > + atomic_set(&hotplug_active, 1);
>
> So that would become a mutex_lock(). Added advantage is that
> it would wait for a current CMM operation to complete.

I've added the mutex but the scope will not prevent hotplug from
starting before the current CMM operation has completed. This allows us
to abort the allocation. The important globals for managing the list of
pages are still covered by the cmm_lock spinlock.

I've tested the patch and I'll send it out shortly.

2009-10-12 19:49:59

by Robert Jennings

[permalink] [raw]
Subject: [PATCH 2/2][v4] powerpc: Make the CMM memory hotplug aware

The Collaborative Memory Manager (CMM) module allocates individual pages
over time that are not migratable. On a long running system this can
severely impact the ability to find enough pages to support a hotplug
memory remove operation.

This patch adds a memory isolation notifier and a memory hotplug notifier.
The memory isolation notifier will return the number of pages found
in the range specified. This is used to determine if all of the used
pages in a pageblock are owned by the balloon (or other entities in
the notifier chain). The hotplug notifier will free pages in the range
which is to be removed. The priority of this hotplug notifier is low
so that it will be called near last, this helps avoids removing loaned
pages in operations that fail due to other handlers.

CMM activity will be halted when hotplug remove operations are active
and resume activity after a delay period to allow the hypervisor time
to adjust.

Signed-off-by: Robert Jennings <[email protected]>

---

Changes since v3:
* Changed from atomic to mutex for hotplug state tracking.
* Clarified documentation for the new module parameter description.

Changes since v2:
* None, resent with parent patch to keep them together.

arch/powerpc/platforms/pseries/cmm.c | 221 ++++++++++++++++++++++++++++++++++-
1 file changed, 215 insertions(+), 6 deletions(-)

Index: b/arch/powerpc/platforms/pseries/cmm.c
===================================================================
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -38,19 +38,28 @@
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
+#include <linux/memory.h>

#include "plpar_wrappers.h"

#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15

static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
static unsigned int oom_kb = CMM_OOM_KB;
static unsigned int cmm_debug = CMM_DEBUG;
static unsigned int cmm_disabled = CMM_DISABLE;
@@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION);
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
"[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+ "before loaning resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
"[Default=" __stringify(CMM_OOM_KB) "]");
@@ -92,6 +105,9 @@ static unsigned long oom_freed_pages;
static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);

+static DEFINE_MUTEX(hotplug_mutex);
+static int hotplug_occurred; /* protected by the hotplug mutex */
+
static struct task_struct *cmm_thread_ptr;

/**
@@ -110,6 +126,17 @@ static long cmm_alloc_pages(long nr)
cmm_dbg("Begin request for %ld pages\n", nr);

while (nr) {
+ /* Exit if a hotplug operation is in progress or occurred */
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ mutex_unlock(&hotplug_mutex);
+ break;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ break;
+ }
+
addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!addr)
@@ -119,8 +146,10 @@ static long cmm_alloc_pages(long nr)
if (!pa || pa->index >= CMM_NR_PAGES) {
/* Need a new page for the page list. */
spin_unlock(&cmm_lock);
- npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC |
+ __GFP_MOVABLE);
if (!npa) {
pr_info("%s: Can not allocate new page list\n", __func__);
free_page(addr);
@@ -273,9 +302,28 @@ static int cmm_thread(void *dummy)
while (1) {
timeleft = msleep_interruptible(delay * 1000);

- if (kthread_should_stop() || timeleft) {
- loaned_pages_target = loaned_pages;
+ if (kthread_should_stop() || timeleft)
break;
+
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ hotplug_occurred = 0;
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Hotplug operation has occurred, "
+ "loaning activity suspended "
+ "for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay *
+ 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
}

cmm_get_mpp();
@@ -405,6 +453,159 @@ static struct notifier_block cmm_reboot_
};

/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+ unsigned long idx;
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ continue;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&hotplug_mutex);
+ hotplug_occurred = 1;
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
* cmm_init - Module initialization
*
* Return value:
@@ -426,18 +627,24 @@ static int cmm_init(void)
if ((rc = cmm_sysfs_register(&cmm_sysdev)))
goto out_reboot_notifier;

+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
if (cmm_disabled)
return rc;

cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr)) {
rc = PTR_ERR(cmm_thread_ptr);
- goto out_unregister_sysfs;
+ goto out_unregister_notifier;
}

return rc;

-out_unregister_sysfs:
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_unregister_sysfs(&cmm_sysdev);
out_reboot_notifier:
unregister_reboot_notifier(&cmm_reboot_nb);
@@ -458,6 +665,8 @@ static void cmm_exit(void)
kthread_stop(cmm_thread_ptr);
unregister_oom_notifier(&cmm_oom_nb);
unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_free_pages(loaned_pages);
cmm_unregister_sysfs(&cmm_sysdev);
}

2009-10-22 18:23:43

by Robert Jennings

[permalink] [raw]
Subject: [PATCH 2/2][v5] powerpc: Make the CMM memory hotplug aware

The Collaborative Memory Manager (CMM) module allocates individual pages
over time that are not migratable. On a long running system this can
severely impact the ability to find enough pages to support a hotplug
memory remove operation.

This patch adds a memory isolation notifier and a memory hotplug notifier.
The memory isolation notifier will return the number of pages found
in the range specified. This is used to determine if all of the used
pages in a pageblock are owned by the balloon (or other entities in
the notifier chain). The hotplug notifier will free pages in the range
which is to be removed. The priority of this hotplug notifier is low
so that it will be called near last, this helps avoids removing loaned
pages in operations that fail due to other handlers.

CMM activity will be halted when hotplug remove operations are active
and resume activity after a delay period to allow the hypervisor time
to adjust.

Signed-off-by: Robert Jennings <[email protected]>

---
Changes since v4:
* The structures for recording loaned pages are not allocated as MOVABLE
* The structures for recording loaned pages are removed from sections
being taken offline by moving their contents to a newly allocated page.

Changes since v3:
* Changed from atomic to mutex for hotplug state tracking.
* Clarified documentation for the new module parameter description.

Changes since v2:
* None, resent with parent patch to keep them together.

arch/powerpc/platforms/pseries/cmm.c | 254 ++++++++++++++++++++++++++++++++++-
1 file changed, 248 insertions(+), 6 deletions(-)

Index: b/arch/powerpc/platforms/pseries/cmm.c
===================================================================
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -38,19 +38,28 @@
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
+#include <linux/memory.h>

#include "plpar_wrappers.h"

#define CMM_DRIVER_VERSION "1.0.0"
#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
#define CMM_DEBUG 0
#define CMM_DISABLE 0
#define CMM_OOM_KB 1024
#define CMM_MIN_MEM_MB 256
#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15

static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
static unsigned int oom_kb = CMM_OOM_KB;
static unsigned int cmm_debug = CMM_DEBUG;
static unsigned int cmm_disabled = CMM_DISABLE;
@@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION);
module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
"[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+ "before loaning resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
"[Default=" __stringify(CMM_OOM_KB) "]");
@@ -92,6 +105,9 @@ static unsigned long oom_freed_pages;
static struct cmm_page_array *cmm_page_list;
static DEFINE_SPINLOCK(cmm_lock);

+static DEFINE_MUTEX(hotplug_mutex);
+static int hotplug_occurred; /* protected by the hotplug mutex */
+
static struct task_struct *cmm_thread_ptr;

/**
@@ -110,6 +126,17 @@ static long cmm_alloc_pages(long nr)
cmm_dbg("Begin request for %ld pages\n", nr);

while (nr) {
+ /* Exit if a hotplug operation is in progress or occurred */
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ mutex_unlock(&hotplug_mutex);
+ break;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ break;
+ }
+
addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
__GFP_NORETRY | __GFP_NOMEMALLOC);
if (!addr)
@@ -119,8 +146,9 @@ static long cmm_alloc_pages(long nr)
if (!pa || pa->index >= CMM_NR_PAGES) {
/* Need a new page for the page list. */
spin_unlock(&cmm_lock);
- npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_NOMEMALLOC);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
if (!npa) {
pr_info("%s: Can not allocate new page list\n", __func__);
free_page(addr);
@@ -273,9 +301,28 @@ static int cmm_thread(void *dummy)
while (1) {
timeleft = msleep_interruptible(delay * 1000);

- if (kthread_should_stop() || timeleft) {
- loaned_pages_target = loaned_pages;
+ if (kthread_should_stop() || timeleft)
break;
+
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ hotplug_occurred = 0;
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Hotplug operation has occurred, "
+ "loaning activity suspended "
+ "for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay *
+ 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
}

cmm_get_mpp();
@@ -405,6 +452,193 @@ static struct notifier_block cmm_reboot_
};

/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+ unsigned long idx;
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ if ((unsigned long)pa >= start && (unsigned long)pa < end)
+ marg->pages_found++;
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last, *npa;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ /* Search the page list for pages in the range to be offlined */
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ continue;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+
+ /* Search for page list structures in the range to be offlined */
+ pa_last = NULL;
+ pa_curr = cmm_page_list;
+ while (pa_curr) {
+ if (((unsigned long)pa_curr >= start_page) &&
+ ((unsigned long)pa_curr < end_page)) {
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!npa) {
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Failed to allocate memory for list "
+ "management. Memory hotplug "
+ "failed.\n");
+ return ENOMEM;
+ }
+ memcpy(npa, pa_curr, PAGE_SIZE);
+ if (pa_curr == cmm_page_list)
+ cmm_page_list = npa;
+ if (pa_last)
+ pa_last->next = npa;
+ free_page((unsigned long) pa_curr);
+ freed++;
+ pa_curr = npa;
+ }
+
+ pa_last = pa_curr;
+ pa_curr = pa_curr->next;
+ }
+
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&hotplug_mutex);
+ hotplug_occurred = 1;
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ if (ret)
+ ret = notifier_from_errno(ret);
+ else
+ ret = NOTIFY_OK;
+
+ return ret;
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
* cmm_init - Module initialization
*
* Return value:
@@ -426,18 +660,24 @@ static int cmm_init(void)
if ((rc = cmm_sysfs_register(&cmm_sysdev)))
goto out_reboot_notifier;

+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
if (cmm_disabled)
return rc;

cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (IS_ERR(cmm_thread_ptr)) {
rc = PTR_ERR(cmm_thread_ptr);
- goto out_unregister_sysfs;
+ goto out_unregister_notifier;
}

return rc;

-out_unregister_sysfs:
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_unregister_sysfs(&cmm_sysdev);
out_reboot_notifier:
unregister_reboot_notifier(&cmm_reboot_nb);
@@ -458,6 +698,8 @@ static void cmm_exit(void)
kthread_stop(cmm_thread_ptr);
unregister_oom_notifier(&cmm_oom_nb);
unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
cmm_free_pages(loaned_pages);
cmm_unregister_sysfs(&cmm_sysdev);
}