Features and fixes:
- new version of memory hotplug patch which supports
among others memory allocation policies during errors
(try until success or stop at first error),
- this version of patch was tested with tmem
(selfballooning and frontswap) and works
very well with it,
- some other minor fixes.
Signed-off-by: Daniel Kiper <[email protected]>
---
drivers/xen/Kconfig | 10 ++
drivers/xen/balloon.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 230 insertions(+), 11 deletions(-)
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 07bec09..8f880aa 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -9,6 +9,16 @@ config XEN_BALLOON
the system to expand the domain's memory allocation, or alternatively
return unneeded memory to the system.
+config XEN_BALLOON_MEMORY_HOTPLUG
+ bool "Memory hotplug support for Xen balloon driver"
+ default n
+ depends on XEN_BALLOON && MEMORY_HOTPLUG
+ help
+ Memory hotplug support for Xen balloon driver allows expanding memory
+ available for the system above limit declared at system startup.
+ It is very useful on critical systems which require long
+ run without rebooting.
+
config XEN_SCRUB_PAGES
bool "Scrub pages before returning them to system"
depends on XEN_BALLOON
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index b1e199c..e43e928 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -6,6 +6,12 @@
* Copyright (c) 2003, B Dragovic
* Copyright (c) 2003-2004, M Williamson, K Fraser
* Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ * Copyright (c) 2010 Daniel Kiper
+ *
+ * Memory hotplug support was written by Daniel Kiper. Work on
+ * it was sponsored by Google under Google Summer of Code 2010
+ * program. Jeremy Fitzhardinge from Xen.org was the mentor for
+ * this project.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
@@ -44,6 +50,7 @@
#include <linux/list.h>
#include <linux/sysdev.h>
#include <linux/gfp.h>
+#include <linux/memory.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -80,6 +87,9 @@ enum bp_state {
BP_HUNGRY
};
+#define MH_POLICY_TRY_UNTIL_SUCCESS 0
+#define MH_POLICY_STOP_AT_FIRST_ERROR 1
+
struct balloon_stats {
/* We aim for 'current allocation' == 'target allocation'. */
unsigned long current_pages;
@@ -89,6 +99,10 @@ struct balloon_stats {
unsigned long balloon_high;
unsigned long schedule_delay;
unsigned long max_schedule_delay;
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+ unsigned long boot_max_pfn;
+ unsigned long mh_policy;
+#endif
};
static DEFINE_MUTEX(balloon_mutex);
@@ -206,18 +220,199 @@ static void update_schedule_delay(enum bp_state state)
balloon_stats.schedule_delay = new_schedule_delay;
}
-static unsigned long current_target(void)
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+static struct resource *allocate_memory_resource(unsigned long nr_pages)
+{
+ resource_size_t r_min, r_size;
+ struct resource *r;
+
+ /*
+ * Look for first unused memory region starting at page
+ * boundary. Skip last memory section created at boot time
+ * because it may contains unused memory pages with PG_reserved
+ * bit not set (online_pages() require PG_reserved bit set).
+ */
+
+ r = kzalloc(sizeof(struct resource), GFP_KERNEL);
+
+ if (!r)
+ return NULL;
+
+ r->name = "System RAM";
+ r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
+ r_size = nr_pages << PAGE_SHIFT;
+
+ if (allocate_resource(&iomem_resource, r, r_size, r_min,
+ ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
+ kfree(r);
+ return NULL;
+ }
+
+ return r;
+}
+
+static struct resource *adjust_memory_resource(struct resource *r, unsigned long nr_pages)
+{
+ int rc;
+
+ if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) {
+ rc = release_resource(r);
+ BUG_ON(rc < 0);
+ kfree(r);
+ return NULL;
+ }
+
+ rc = adjust_resource(r, r->start, r->end + 1 - r->start -
+ (nr_pages << PAGE_SHIFT));
+
+ BUG_ON(rc < 0);
+
+ return r;
+}
+
+static int allocate_additional_memory(struct resource *r, unsigned long nr_pages)
+{
+ int rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+ unsigned long flags, i, pfn, pfn_start;
+
+ if (!nr_pages)
+ return 0;
+
+ pfn_start = PFN_UP(r->end) - nr_pages;
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn)
+ frame_list[i] = pfn;
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+
+ if (rc <= 0)
+ return (rc < 0) ? rc : -ENOMEM;
+
+ for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) {
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+ set_phys_to_machine(pfn, frame_list[i]);
+ }
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ return rc;
+}
+
+static void hotplug_allocated_memory(struct resource *r)
{
- unsigned long target = balloon_stats.target_pages;
+ int nid, rc;
+ resource_size_t r_size;
+ struct memory_block *mem;
+ unsigned long pfn;
+
+ r_size = r->end + 1 - r->start;
+ nid = memory_add_physaddr_to_nid(r->start);
+
+ rc = add_registered_memory(nid, r->start, r_size);
+
+ if (rc) {
+ pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n",
+ __func__, rc);
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ return;
+ }
+
+ if (xen_pv_domain())
+ for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)
+ if (!PageHighMem(pfn_to_page(pfn))) {
+ rc = HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0);
+ BUG_ON(rc);
+ }
- target = min(target,
- balloon_stats.current_pages +
- balloon_stats.balloon_low +
- balloon_stats.balloon_high);
+ rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);
- return target;
+ if (rc) {
+ pr_err("%s: online_pages: Failed: %i\n", __func__, rc);
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ return;
+ }
+
+ for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) {
+ mem = find_memory_block(__pfn_to_section(pfn));
+ BUG_ON(!mem);
+ BUG_ON(!present_section_nr(mem->phys_index));
+ mutex_lock(&mem->state_mutex);
+ mem->state = MEM_ONLINE;
+ mutex_unlock(&mem->state_mutex);
+ }
+
+ balloon_stats.current_pages += r_size >> PAGE_SHIFT;
}
+static enum bp_state request_additional_memory(long credit)
+{
+ int rc;
+ static struct resource *r;
+ static unsigned long pages_left;
+
+ if ((credit <= 0 || balloon_stats.balloon_low ||
+ balloon_stats.balloon_high) && !r)
+ return BP_DONE;
+
+ if (!r) {
+ r = allocate_memory_resource(credit);
+
+ if (!r)
+ return BP_ERROR;
+
+ pages_left = credit;
+ }
+
+ rc = allocate_additional_memory(r, pages_left);
+
+ if (rc < 0) {
+ if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS)
+ return BP_ERROR;
+
+ r = adjust_memory_resource(r, pages_left);
+
+ if (!r)
+ return BP_ERROR;
+ } else {
+ pages_left -= rc;
+
+ if (pages_left)
+ return BP_HUNGRY;
+ }
+
+ hotplug_allocated_memory(r);
+
+ r = NULL;
+
+ return BP_DONE;
+}
+#else
+static enum bp_state request_additional_memory(long credit)
+{
+ if (balloon_stats.balloon_low && balloon_stats.balloon_high &&
+ balloon_stats.target_pages > balloon_stats.current_pages)
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ return BP_DONE;
+}
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
+
static enum bp_state increase_reservation(unsigned long nr_pages)
{
enum bp_state state = BP_DONE;
@@ -352,15 +547,17 @@ static enum bp_state decrease_reservation(unsigned long nr_pages)
*/
static void balloon_process(struct work_struct *work)
{
- enum bp_state rc, state = BP_DONE;
+ enum bp_state rc, state;
long credit;
mutex_lock(&balloon_mutex);
do {
- credit = current_target() - balloon_stats.current_pages;
+ credit = balloon_stats.target_pages - balloon_stats.current_pages;
- if (credit > 0) {
+ state = request_additional_memory(credit);
+
+ if (credit > 0 && state == BP_DONE) {
rc = increase_reservation(credit);
state = (rc == BP_ERROR) ? BP_ERROR : state;
}
@@ -450,6 +647,11 @@ static int __init balloon_init(void)
balloon_stats.schedule_delay = 1;
balloon_stats.max_schedule_delay = 32;
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+ balloon_stats.boot_max_pfn = max_pfn;
+ balloon_stats.mh_policy = MH_POLICY_STOP_AT_FIRST_ERROR;
+#endif
+
register_balloon(&balloon_sysdev);
/*
@@ -506,6 +708,10 @@ BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
static SYSDEV_ULONG_ATTR(schedule_delay, 0644, balloon_stats.schedule_delay);
static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay);
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+static SYSDEV_ULONG_ATTR(memory_hotplug_policy, 0644, balloon_stats.mh_policy);
+#endif
+
static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
char *buf)
{
@@ -568,7 +774,10 @@ static struct sysdev_attribute *balloon_attrs[] = {
&attr_target_kb,
&attr_target,
&attr_schedule_delay.attr,
- &attr_max_schedule_delay.attr
+ &attr_max_schedule_delay.attr,
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+ &attr_memory_hotplug_policy.attr
+#endif
};
static struct attribute *balloon_info_attrs[] = {
--
1.5.6.5
On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote:
> +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> +{
> + resource_size_t r_min, r_size;
> + struct resource *r;
> +
> + /*
> + * Look for first unused memory region starting at page
> + * boundary. Skip last memory section created at boot time
> + * because it may contains unused memory pages with PG_reserved
> + * bit not set (online_pages() require PG_reserved bit set).
> + */
Could you elaborate on this comment a bit? I think it's covering both
the "PAGE_SIZE" argument to allocate_resource() and something else, but
I'm not quite sure.
> + r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> +
> + if (!r)
> + return NULL;
> +
> + r->name = "System RAM";
> + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
Did you do this for alignment reasons? It might be a better idea to
just make a nice sparsemem function to do alignment.
> + r_size = nr_pages << PAGE_SHIFT;
> +
> + if (allocate_resource(&iomem_resource, r, r_size, r_min,
> + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> + kfree(r);
> + return NULL;
> + }
> +
> + return r;
> +}
This function should probably be made generic. I bet some more
hypervisors come around and want to use this. They generally won't care
where the memory goes, and the kernel can allocate a spot for them.
...
> +static void hotplug_allocated_memory(struct resource *r)
> {
> - unsigned long target = balloon_stats.target_pages;
> + int nid, rc;
> + resource_size_t r_size;
> + struct memory_block *mem;
> + unsigned long pfn;
> +
> + r_size = r->end + 1 - r->start;
> + nid = memory_add_physaddr_to_nid(r->start);
> +
> + rc = add_registered_memory(nid, r->start, r_size);
> +
> + if (rc) {
> + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n",
> + __func__, rc);
> + balloon_stats.target_pages = balloon_stats.current_pages;
> + return;
> + }
> +
> + if (xen_pv_domain())
> + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)
> + if (!PageHighMem(pfn_to_page(pfn))) {
> + rc = HYPERVISOR_update_va_mapping(
> + (unsigned long)__va(pfn << PAGE_SHIFT),
> + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0);
> + BUG_ON(rc);
> + }
>
> - target = min(target,
> - balloon_stats.current_pages +
> - balloon_stats.balloon_low +
> - balloon_stats.balloon_high);
> + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);
The memory hotplug code is a bit weird at first glance. It has two
distinct stages: first, add_memory() is called from
architecture-specific code. Later, online_pages() is called, but that
part is driven by userspace.
For all the other hotplug users online_pages() is done later, and
triggered by userspace. The idea is that you could have memory sitting
in "added, but offline" state which would be trivial to remove if
someone else needed it, but also trivial to add without the need to
allocate additional memory.
In other words, I think you can take everything from and including
online_pages() down in the function and take it out. Use a udev hotplug
rule to online it immediately if that's what you want.
> - return target;
> + if (rc) {
> + pr_err("%s: online_pages: Failed: %i\n", __func__, rc);
> + balloon_stats.target_pages = balloon_stats.current_pages;
> + return;
> + }
> +
> + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) {
> + mem = find_memory_block(__pfn_to_section(pfn));
> + BUG_ON(!mem);
> + BUG_ON(!present_section_nr(mem->phys_index));
> + mutex_lock(&mem->state_mutex);
> + mem->state = MEM_ONLINE;
> + mutex_unlock(&mem->state_mutex);
> + }
> +
> + balloon_stats.current_pages += r_size >> PAGE_SHIFT;
> }
>
> +static enum bp_state request_additional_memory(long credit)
> +{
> + int rc;
> + static struct resource *r;
> + static unsigned long pages_left;
> +
> + if ((credit <= 0 || balloon_stats.balloon_low ||
> + balloon_stats.balloon_high) && !r)
> + return BP_DONE;
> +
> + if (!r) {
> + r = allocate_memory_resource(credit);
> +
> + if (!r)
> + return BP_ERROR;
> +
> + pages_left = credit;
> + }
'r' is effectively a global variable here. Could you give it a more
proper name? Maybe "last add location" or something. It might even
make sense to move it up in to the global scope to make it _much_ more
clear that it's not just locally used.
> + rc = allocate_additional_memory(r, pages_left);
> +
> + if (rc < 0) {
> + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS)
> + return BP_ERROR;
> +
> + r = adjust_memory_resource(r, pages_left);
> +
> + if (!r)
> + return BP_ERROR;
> + } else {
> + pages_left -= rc;
> +
> + if (pages_left)
> + return BP_HUNGRY;
> + }
> +
> + hotplug_allocated_memory(r);
> +
> + r = NULL;
> +
> + return BP_DONE;
> +}
Could you explain a bit about why you chose to do this stuff with memory
resources? Is that the only visibility that you have in to what memory
the guest actually has?
What troubles did you run in to when you did
add_memory(0, balloon_stats.boot_max_pfn, credit);
?
It's just that all the other memory hotplug users are _told_ by the
hardware where to put things. Is that not the case here?
-- Dave
On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote:
> On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote:
> > +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> > +{
> > + resource_size_t r_min, r_size;
> > + struct resource *r;
> > +
> > + /*
> > + * Look for first unused memory region starting at page
> > + * boundary. Skip last memory section created at boot time
> > + * because it may contains unused memory pages with PG_reserved
> > + * bit not set (online_pages() require PG_reserved bit set).
> > + */
>
> Could you elaborate on this comment a bit? I think it's covering both
> the "PAGE_SIZE" argument to allocate_resource() and something else, but
> I'm not quite sure.
Yes, you are right. Alignment to PAGE_SIZE is done by allocate_resource().
Additionally, r_min (calculated below) sets the lower limit at which hotplugged
memory could be installed (due to the PG_reserved bit requirement set up by
online_pages()). Later, r_min is passed as an argument to the allocate_resource() call.
> > + r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> > +
> > + if (!r)
> > + return NULL;
> > +
> > + r->name = "System RAM";
> > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
>
> Did you do this for alignment reasons? It might be a better idea to
> just make a nice sparsemem function to do alignment.
Please look above.
> > + r_size = nr_pages << PAGE_SHIFT;
> > +
> > + if (allocate_resource(&iomem_resource, r, r_size, r_min,
> > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> > + kfree(r);
> > + return NULL;
> > + }
> > +
> > + return r;
> > +}
>
> This function should probably be made generic. I bet some more
> hypervisors come around and want to use this. They generally won't care
> where the memory goes, and the kernel can allocate a spot for them.
Yes, you are right. I think about new project in which
this function will be generic and then I would move it to
some more generic place. Now, I think it should stay here.
> ...
> > +static void hotplug_allocated_memory(struct resource *r)
> > {
> > - unsigned long target = balloon_stats.target_pages;
> > + int nid, rc;
> > + resource_size_t r_size;
> > + struct memory_block *mem;
> > + unsigned long pfn;
> > +
> > + r_size = r->end + 1 - r->start;
> > + nid = memory_add_physaddr_to_nid(r->start);
> > +
> > + rc = add_registered_memory(nid, r->start, r_size);
> > +
> > + if (rc) {
> > + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n",
> > + __func__, rc);
> > + balloon_stats.target_pages = balloon_stats.current_pages;
> > + return;
> > + }
> > +
> > + if (xen_pv_domain())
> > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)
> > + if (!PageHighMem(pfn_to_page(pfn))) {
> > + rc = HYPERVISOR_update_va_mapping(
> > + (unsigned long)__va(pfn << PAGE_SHIFT),
> > + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0);
> > + BUG_ON(rc);
> > + }
> >
> > - target = min(target,
> > - balloon_stats.current_pages +
> > - balloon_stats.balloon_low +
> > - balloon_stats.balloon_high);
> > + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);
>
> The memory hotplug code is a bit weird at first glance. It has two
> distinct stages: first, add_memory() is called from
> architecture-specific code. Later, online_pages() is called, but that
> part is driven by userspace.
>
> For all the other hotplug users online_pages() is done later, and
> triggered by userspace. The idea is that you could have memory sitting
> in "added, but offline" state which would be trivial to remove if
> someone else needed it, but also trivial to add without the need to
> allocate additional memory.
>
> In other words, I think you can take everything from and including
> online_pages() down in the function and take it out. Use a udev hotplug
> rule to online it immediately if that's what you want.
I agree. I discussed this problem a bit with Jeremy, too. However,
there are some problems with implementing that solution now. First of all, it is
possible to online hotplugged memory using the sysfs interface only in chunks
called sections. This means that it is not possible to online once again a section
which was onlined earlier partially populated and now contains new pages
to online. In this situation the sysfs interface emits an "Invalid argument" error.
In theory it should be possible to offline and then online the whole section
once again; however, if memory from this section was used, it is not possible
to do that. This means that those properties do not allow hotplugging memory
in the guest at a finer granularity than a section, and the sysfs interface is too
inflexible to be used in that solution. That is why I decided to online hotplugged
memory using an API which does not have those limitations.
I think that two solutions are possible (in order of preference) to cope
with that problem and move onlining to the user domain:
- migrate to a new sysfs interface for onlining which allows
address space description when onlining memory,
- create a new onlining sysfs interface for Xen only (I think that the
first solution is much better because it will be generic).
However, those solutions require time to implement, and that is
why, I think, my solution (let's call it a workaround) should
be implemented until the new sysfs interface is available.
> > - return target;
> > + if (rc) {
> > + pr_err("%s: online_pages: Failed: %i\n", __func__, rc);
> > + balloon_stats.target_pages = balloon_stats.current_pages;
> > + return;
> > + }
> > +
> > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) {
> > + mem = find_memory_block(__pfn_to_section(pfn));
> > + BUG_ON(!mem);
> > + BUG_ON(!present_section_nr(mem->phys_index));
> > + mutex_lock(&mem->state_mutex);
> > + mem->state = MEM_ONLINE;
> > + mutex_unlock(&mem->state_mutex);
> > + }
> > +
> > + balloon_stats.current_pages += r_size >> PAGE_SHIFT;
> > }
> >
> > +static enum bp_state request_additional_memory(long credit)
> > +{
> > + int rc;
> > + static struct resource *r;
> > + static unsigned long pages_left;
> > +
> > + if ((credit <= 0 || balloon_stats.balloon_low ||
> > + balloon_stats.balloon_high) && !r)
> > + return BP_DONE;
> > +
> > + if (!r) {
> > + r = allocate_memory_resource(credit);
> > +
> > + if (!r)
> > + return BP_ERROR;
> > +
> > + pages_left = credit;
> > + }
>
> 'r' is effectively a global variable here. Could you give it a more
> proper name? Maybe "last add location" or something. It might even
> make sense to move it up in to the global scope to make it _much_ more
> clear that it's not just locally used.
This is used between subsequent calls (memory is allocated in chunks
and onlined when all requested memory is allocated) to store the address
space for allocation. That is why it is static. It is used only in this
function, and if required, a pointer to this variable is passed
to the called functions. I think that it should stay local to this function
and not pollute the global variable namespace. However, I will rename it to
something more meaningful.
> > + rc = allocate_additional_memory(r, pages_left);
> > +
> > + if (rc < 0) {
> > + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS)
> > + return BP_ERROR;
> > +
> > + r = adjust_memory_resource(r, pages_left);
> > +
> > + if (!r)
> > + return BP_ERROR;
> > + } else {
> > + pages_left -= rc;
> > +
> > + if (pages_left)
> > + return BP_HUNGRY;
> > + }
> > +
> > + hotplug_allocated_memory(r);
> > +
> > + r = NULL;
> > +
> > + return BP_DONE;
> > +}
>
> Could you explain a bit about why you chose to do this stuff with memory
> resources? Is that the only visibility that you have in to what memory
> the guest actually has?
That depends on balloon_stats.mh_policy. If it is MH_POLICY_TRY_UNTIL_SUCCESS
and a memory allocation error occurs, then *r is not changed and attempts
are made to allocate more memory. If it is MH_POLICY_STOP_AT_FIRST_ERROR
and a memory allocation error occurs, then *r is aligned to the currently
successfully allocated memory and the memory is onlined (if size != 0).
> What troubles did you run in to when you did
>
> add_memory(0, balloon_stats.boot_max_pfn, credit);
>
> ?
>
> It's just that all the other memory hotplug users are _told_ by the
> hardware where to put things. Is that not the case here?
No. On bare metal the BIOS/firmware takes care of the proper address space and,
when everything goes OK, notifies the operating system about the memory hotplug.
In Xen (and probably on other platforms to come) that is not the case, and the
memory should be allocated earlier by the guest. However, before doing
that, I think, the address space should be reserved (there is a small chance that
something will request an overlapping address space; however, I think it is
better to be safe than sorry). Additionally, IIRC, add_memory() requires
that the underlying memory is available before its call. That is why I
decided to create add_registered_memory(), which does all the things which are
done by add_memory() excluding the address space reservation, which is done
by allocate_memory_resource() before memory allocation.
Daniel
On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote:
> On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote:
> > On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote:
> > > +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> > > +{
> > > + resource_size_t r_min, r_size;
> > > + struct resource *r;
> > > +
> > > + /*
> > > + * Look for first unused memory region starting at page
> > > + * boundary. Skip last memory section created at boot time
> > > + * because it may contains unused memory pages with PG_reserved
> > > + * bit not set (online_pages() require PG_reserved bit set).
> > > + */
> >
> > Could you elaborate on this comment a bit? I think it's covering both
> > the "PAGE_SIZE" argument to allocate_resource() and something else, but
> > I'm not quite sure.
>
> Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource().
> Additionally, r_min (calculated below) sets lower limit at which hoplugged
> memory could be installed (due to PG_reserved bit requirment set up by
> online_pages()). Later r_min is put as an argument to allocate_resource() call.
OK, and you'll update the comment on that?
> > > + r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> > > +
> > > + if (!r)
> > > + return NULL;
> > > +
> > > + r->name = "System RAM";
> > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
> >
> > Did you do this for alignment reasons? It might be a better idea to
> > just make a nice sparsemem function to do alignment.
>
> Please look above.
You spoke about page alignment up there. Why is this section-aligned?
Should we make an "align to section" function in generic sparsemem code?
> > > + r_size = nr_pages << PAGE_SHIFT;
> > > +
> > > + if (allocate_resource(&iomem_resource, r, r_size, r_min,
> > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> > > + kfree(r);
> > > + return NULL;
> > > + }
> > > +
> > > + return r;
> > > +}
> >
> > This function should probably be made generic. I bet some more
> > hypervisors come around and want to use this. They generally won't care
> > where the memory goes, and the kernel can allocate a spot for them.
>
> Yes, you are right. I think about new project in which
> this function will be generic and then I would move it to
> some more generic place. Now, I think it should stay here.
Please move it to generic code. It doesn't belong in Xen code.
> > In other words, I think you can take everything from and including
> > online_pages() down in the function and take it out. Use a udev hotplug
> > rule to online it immediately if that's what you want.
>
> I agree. I discussed a bit about this problem with Jeremy, too. However,
> there are some problems to implement that solution now. First of all it is
> possible to online hotplugged memory using sysfs interface only in chunks
> called sections. It means that it is not possible online once again section
> which was onlined ealier partialy populated and now it contains new pages
> to online. In this situation sysfs interface emits Invalid argument error.
> In theory it should be possible to offline and then online whole section
> once again, however, if memory from this section was used is not possible
> to do that. It means that those properties does not allow hotplug memory
> in guest in finer granulity than section and sysfs interface is too inflexible
> to be used in that solution. That is why I decided to online hoplugged memory
> using API which does not have those limitations.
Sure, you have to _online_ the whole thing at once, but you don't have
to actually make the pages available. You also don't need to hook in to
the memory resource code like you're doing. It's sufficient to just try
and add the memory. If you get -EEXIST, then you can't add it there, so
move up and try again.
int xen_balloon_add_memory(u64 size)
{
unsigned long top_of_mem = max_pfn;
top_of_mem = section_align_up(top_of_mem);
while (1) {
int ret = add_memory(nid, top_of_mem, size);
if (ret == -EEXIST)
continue;
// error handling...
break;
}
return...;
}
As for telling the hypervisor where you've mapped things, that should be
done in arch_add_memory().
When it comes down to online_page(), you don't want your pages freed
back in to the buddy allocator, you want them put in to the balloon.
So, take the __free_page() in online_page(), and put a Xen hook in
there.
+void __attribute__((weak)) arch_free_hotplug_page(struct page *page)
+{
+ __free_page(page);
+}
void online_page(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
...
- __free_page(page);
+ arch_free_hotplug_page(page);
}
Then, have Xen override it:
void arch_free_hotplug_page(struct page *page)
{
if (xen_need_to_inflate_balloon())
put_page_in_balloon(page);
else
__free_page(page);
}
> Additionally, IIRC, add_memory() requires
> that underlying memory is available before its call.
No, that's not correct. s390's memory isn't available until after it
calls vmem_add_mapping(). See arch/s390/mm/init.c
> That is why I
> decided to create add_registered_memory() which do all things which are
> done by add_memory() excluding address space reservation which is done
> by allocate_memory_resource() before memory allocation.
The added memory itself isn't touched until online_pages() time. That's
yet another reason it was separated out logically in the code.
-- Dave
On Thu, Feb 03, 2011 at 05:30:33PM +0100, Daniel Kiper wrote:
> Features and fixes:
> - new version of memory hotplug patch which supports
> among others memory allocation policies during errors
> (try until success or stop at first error),
> - this version of patch was tested with tmem
> (selfballooning and frontswap) and works
> very well with it,
> - some other minor fixes.
>
> Signed-off-by: Daniel Kiper <[email protected]>
> ---
> drivers/xen/Kconfig | 10 ++
> drivers/xen/balloon.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++---
> 2 files changed, 230 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index 07bec09..8f880aa 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -9,6 +9,16 @@ config XEN_BALLOON
> the system to expand the domain's memory allocation, or alternatively
> return unneeded memory to the system.
>
> +config XEN_BALLOON_MEMORY_HOTPLUG
> + bool "Memory hotplug support for Xen balloon driver"
> + default n
> + depends on XEN_BALLOON && MEMORY_HOTPLUG
> + help
> + Memory hotplug support for Xen balloon driver allows expanding memory
> + available for the system above limit declared at system startup.
> + It is very useful on critical systems which require long
> + run without rebooting.
> +
> config XEN_SCRUB_PAGES
> bool "Scrub pages before returning them to system"
> depends on XEN_BALLOON
> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index b1e199c..e43e928 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -6,6 +6,12 @@
> * Copyright (c) 2003, B Dragovic
> * Copyright (c) 2003-2004, M Williamson, K Fraser
> * Copyright (c) 2005 Dan M. Smith, IBM Corporation
> + * Copyright (c) 2010 Daniel Kiper
> + *
> + * Memory hotplug support was written by Daniel Kiper. Work on
> + * it was sponsored by Google under Google Summer of Code 2010
> + * program. Jeremy Fitzhardinge from Xen.org was the mentor for
> + * this project.
> *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License version 2
> @@ -44,6 +50,7 @@
> #include <linux/list.h>
> #include <linux/sysdev.h>
> #include <linux/gfp.h>
> +#include <linux/memory.h>
>
> #include <asm/page.h>
> #include <asm/pgalloc.h>
> @@ -80,6 +87,9 @@ enum bp_state {
> BP_HUNGRY
> };
>
> +#define MH_POLICY_TRY_UNTIL_SUCCESS 0
> +#define MH_POLICY_STOP_AT_FIRST_ERROR 1
Cool.
> +
> struct balloon_stats {
> /* We aim for 'current allocation' == 'target allocation'. */
> unsigned long current_pages;
> @@ -89,6 +99,10 @@ struct balloon_stats {
> unsigned long balloon_high;
> unsigned long schedule_delay;
> unsigned long max_schedule_delay;
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> + unsigned long boot_max_pfn;
> + unsigned long mh_policy;
> +#endif
> };
>
> static DEFINE_MUTEX(balloon_mutex);
> @@ -206,18 +220,199 @@ static void update_schedule_delay(enum bp_state state)
> balloon_stats.schedule_delay = new_schedule_delay;
> }
>
> -static unsigned long current_target(void)
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> +{
> + resource_size_t r_min, r_size;
> + struct resource *r;
> +
> + /*
> + * Look for first unused memory region starting at page
> + * boundary. Skip last memory section created at boot time
> + * because it may contains unused memory pages with PG_reserved
> + * bit not set (online_pages() require PG_reserved bit set).
> + */
> +
> + r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> +
> + if (!r)
> + return NULL;
> +
> + r->name = "System RAM";
> + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
> + r_size = nr_pages << PAGE_SHIFT;
> +
> + if (allocate_resource(&iomem_resource, r, r_size, r_min,
> + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> + kfree(r);
> + return NULL;
> + }
> +
> + return r;
> +}
> +
> +static struct resource *adjust_memory_resource(struct resource *r, unsigned long nr_pages)
> +{
> + int rc;
> +
> + if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) {
Will this actually occur? Say I called 'allocate_additional_memory' with 512
and got -ENOMEM (so the hypercall failed completely). The mh->policy is
MH_POLICY_STOP_AT_FIRST_ERROR. So we end up here. Assume the r_min is
0x100000000, then r->start is 0x100000000 and r->end is 0x100200000.
So:
100200001 - (200000) == 0x100000000 ?
> + rc = release_resource(r);
> + BUG_ON(rc < 0);
> + kfree(r);
> + return NULL;
> + }
> +
> + rc = adjust_resource(r, r->start, r->end + 1 - r->start -
> + (nr_pages << PAGE_SHIFT));
If we wanted 512 pages, and only got 256 and want to adjust the region, we
would want it be:
0x100000000 -> 0x100100000 right?
So with the third argument that comes out to be:
0x100200000 + 1 - 0x100000000 - (100000) = 100001
which is just one page above what we requested?
> +
> + BUG_ON(rc < 0);
Can we just do WARN_ON, and return NULL instead (and also release the resource)?
> +
> + return r;
> +}
> +
> +static int allocate_additional_memory(struct resource *r, unsigned long nr_pages)
> +{
> + int rc;
> + struct xen_memory_reservation reservation = {
> + .address_bits = 0,
> + .extent_order = 0,
> + .domid = DOMID_SELF
> + };
> + unsigned long flags, i, pfn, pfn_start;
> +
> + if (!nr_pages)
> + return 0;
> +
> + pfn_start = PFN_UP(r->end) - nr_pages;
> +
> + if (nr_pages > ARRAY_SIZE(frame_list))
> + nr_pages = ARRAY_SIZE(frame_list);
> +
> + for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn)
> + frame_list[i] = pfn;
> +
> + set_xen_guest_handle(reservation.extent_start, frame_list);
> + reservation.nr_extents = nr_pages;
> +
> + spin_lock_irqsave(&xen_reservation_lock, flags);
> +
> + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
> +
> + if (rc <= 0)
> + return (rc < 0) ? rc : -ENOMEM;
> +
So if we populated some of them (say we wanted 512, but only did 64),
don't we want to do the loop below? Also you look to be forgetting to
do a spin_unlock_irqrestore if you quit here.
> + for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) {
> + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
> + phys_to_machine_mapping_valid(pfn));
> + set_phys_to_machine(pfn, frame_list[i]);
> + }
> +
> + spin_unlock_irqrestore(&xen_reservation_lock, flags);
> +
> + return rc;
> +}
> +
> +static void hotplug_allocated_memory(struct resource *r)
> {
> - unsigned long target = balloon_stats.target_pages;
> + int nid, rc;
> + resource_size_t r_size;
> + struct memory_block *mem;
> + unsigned long pfn;
> +
> + r_size = r->end + 1 - r->start;
Why bump it by one byte?
> + nid = memory_add_physaddr_to_nid(r->start);
> +
> + rc = add_registered_memory(nid, r->start, r_size);
> +
> + if (rc) {
> + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n",
> + __func__, rc);
> + balloon_stats.target_pages = balloon_stats.current_pages;
> + return;
> + }
> +
> + if (xen_pv_domain())
> + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)
I think you want the r->start to be PFN_UP just in case the r->start is not page
aligned. Though I am not sure it would even happen anymore, as M A Young
found the culprit that made it possible for us to setup memory regions
non-aligned and that is fixed now (in 2.6.38).
> + if (!PageHighMem(pfn_to_page(pfn))) {
> + rc = HYPERVISOR_update_va_mapping(
> + (unsigned long)__va(pfn << PAGE_SHIFT),
> + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0);
> + BUG_ON(rc);
> + }
>
> - target = min(target,
> - balloon_stats.current_pages +
> - balloon_stats.balloon_low +
> - balloon_stats.balloon_high);
> + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);
>
> - return target;
> + if (rc) {
> + pr_err("%s: online_pages: Failed: %i\n", __func__, rc);
> + balloon_stats.target_pages = balloon_stats.current_pages;
> + return;
> + }
> +
> + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) {
Ditto. Can you do PFN_UP(r->start)?
> + mem = find_memory_block(__pfn_to_section(pfn));
> + BUG_ON(!mem);
> + BUG_ON(!present_section_nr(mem->phys_index));
> + mutex_lock(&mem->state_mutex);
> + mem->state = MEM_ONLINE;
> + mutex_unlock(&mem->state_mutex);
> + }
> +
> + balloon_stats.current_pages += r_size >> PAGE_SHIFT;
> }
>
> +static enum bp_state request_additional_memory(long credit)
> +{
> + int rc;
> + static struct resource *r;
> + static unsigned long pages_left;
> +
> + if ((credit <= 0 || balloon_stats.balloon_low ||
> + balloon_stats.balloon_high) && !r)
> + return BP_DONE;
> +
> + if (!r) {
> + r = allocate_memory_resource(credit);
> +
> + if (!r)
> + return BP_ERROR;
> +
> + pages_left = credit;
> + }
> +
> + rc = allocate_additional_memory(r, pages_left);
> +
> + if (rc < 0) {
> + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS)
> + return BP_ERROR;
> +
> + r = adjust_memory_resource(r, pages_left);
> +
> + if (!r)
> + return BP_ERROR;
Say we failed the hypercall completely and got -ENOMEM from the 'allocate_additional_memory'.
I presume the adjust_memory_resource at this point would have deleted 'r', which means
that (!r) and we return BP_ERROR.
But that means that we aren't following the MH_POLICY_STOP_AT_FIRST_ERROR as
the balloon_process will retry again and the again, and again??
> + } else {
> + pages_left -= rc;
> +
So say I request 512 pages (mh_policy is MH_POLICY_STOP_AT_FIRST_ERROR),
but only got 256. I adjust the pages_left to be 256 and then
> + if (pages_left)
> + return BP_HUNGRY;
we return BP_HUNGRY. That makes 'balloon_process' retry with 512 pages, and we
keep on trying and call "allocate_additional_memory", which fails once more
(returns 256), and we end up returning BP_HUNGRY, and retry... and so on.
Would it make sense to have a check here for the MH_POLICY_STOP_AT_FIRST_ERROR
and if so call the adjust_memory_resource as well?
> + }
> +
> + hotplug_allocated_memory(r);
> +
> + r = NULL;
> +
> + return BP_DONE;
> +}
> +#else
> +static enum bp_state request_additional_memory(long credit)
> +{
> + if (balloon_stats.balloon_low && balloon_stats.balloon_high &&
> + balloon_stats.target_pages > balloon_stats.current_pages)
> + balloon_stats.target_pages = balloon_stats.current_pages;
> + return BP_DONE;
> +}
> +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
> +
> static enum bp_state increase_reservation(unsigned long nr_pages)
> {
> enum bp_state state = BP_DONE;
> @@ -352,15 +547,17 @@ static enum bp_state decrease_reservation(unsigned long nr_pages)
> */
> static void balloon_process(struct work_struct *work)
> {
> - enum bp_state rc, state = BP_DONE;
> + enum bp_state rc, state;
> long credit;
>
> mutex_lock(&balloon_mutex);
>
> do {
> - credit = current_target() - balloon_stats.current_pages;
> + credit = balloon_stats.target_pages - balloon_stats.current_pages;
>
> - if (credit > 0) {
> + state = request_additional_memory(credit);
> +
> + if (credit > 0 && state == BP_DONE) {
> rc = increase_reservation(credit);
> state = (rc == BP_ERROR) ? BP_ERROR : state;
> }
> @@ -450,6 +647,11 @@ static int __init balloon_init(void)
> balloon_stats.schedule_delay = 1;
> balloon_stats.max_schedule_delay = 32;
>
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> + balloon_stats.boot_max_pfn = max_pfn;
> + balloon_stats.mh_policy = MH_POLICY_STOP_AT_FIRST_ERROR;
> +#endif
> +
> register_balloon(&balloon_sysdev);
>
> /*
> @@ -506,6 +708,10 @@ BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
> static SYSDEV_ULONG_ATTR(schedule_delay, 0644, balloon_stats.schedule_delay);
> static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay);
>
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +static SYSDEV_ULONG_ATTR(memory_hotplug_policy, 0644, balloon_stats.mh_policy);
> +#endif
> +
> static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
> char *buf)
> {
> @@ -568,7 +774,10 @@ static struct sysdev_attribute *balloon_attrs[] = {
> &attr_target_kb,
> &attr_target,
> &attr_schedule_delay.attr,
> - &attr_max_schedule_delay.attr
> + &attr_max_schedule_delay.attr,
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> + &attr_memory_hotplug_policy.attr
> +#endif
> };
>
> static struct attribute *balloon_info_attrs[] = {
> --
> 1.5.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote:
> On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote:
> > On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote:
> > > On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote:
> > > > +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> > > > +{
> > > > + resource_size_t r_min, r_size;
> > > > + struct resource *r;
> > > > +
> > > > + /*
> > > > + * Look for first unused memory region starting at page
> > > > + * boundary. Skip last memory section created at boot time
> > > > + * because it may contains unused memory pages with PG_reserved
> > > > + * bit not set (online_pages() require PG_reserved bit set).
> > > > + */
> > >
> > > Could you elaborate on this comment a bit? I think it's covering both
> > > the "PAGE_SIZE" argument to allocate_resource() and something else, but
> > > I'm not quite sure.
> >
> > Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource().
> > Additionally, r_min (calculated below) sets lower limit at which hoplugged
> > memory could be installed (due to PG_reserved bit requirment set up by
> > online_pages()). Later r_min is put as an argument to allocate_resource() call.
>
> OK, and you'll update the comment on that?
>
> > > > + r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> > > > +
> > > > + if (!r)
> > > > + return NULL;
> > > > +
> > > > + r->name = "System RAM";
> > > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> > > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
> > >
> > > Did you do this for alignment reasons? It might be a better idea to
> > > just make a nice sparsemem function to do alignment.
> >
> > Please look above.
>
> You spoke about page alignment up there. Why is this section-aligned?
> Should we make an "align to section" function in generic sparsemem code?
>
> > > > + r_size = nr_pages << PAGE_SHIFT;
> > > > +
> > > > + if (allocate_resource(&iomem_resource, r, r_size, r_min,
> > > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> > > > + kfree(r);
> > > > + return NULL;
> > > > + }
> > > > +
> > > > + return r;
> > > > +}
> > >
> > > This function should probably be made generic. I bet some more
> > > hypervisors come around and want to use this. They generally won't care
> > > where the memory goes, and the kernel can allocate a spot for them.
> >
> > Yes, you are right. I think about new project in which
> > this function will be generic and then I would move it to
> > some more generic place. Now, I think it should stay here.
>
> Please move it to generic code. It doesn't belong in Xen code.
>
> > > In other words, I think you can take everything from and including
> > > online_pages() down in the function and take it out. Use a udev hotplug
> > > rule to online it immediately if that's what you want.
> >
> > I agree. I discussed a bit about this problem with Jeremy, too. However,
> > there are some problems to implement that solution now. First of all it is
> > possible to online hotplugged memory using sysfs interface only in chunks
> > called sections. It means that it is not possible online once again section
> > which was onlined ealier partialy populated and now it contains new pages
> > to online. In this situation sysfs interface emits Invalid argument error.
> > In theory it should be possible to offline and then online whole section
> > once again, however, if memory from this section was used is not possible
> > to do that. It means that those properties does not allow hotplug memory
> > in guest in finer granulity than section and sysfs interface is too inflexible
> > to be used in that solution. That is why I decided to online hoplugged memory
> > using API which does not have those limitations.
>
> Sure, you have to _online_ the whole thing at once, but you don't have
> to actually make the pages available. You also don't need to hook in to
> the memory resource code like you're doing. It's sufficient to just try
> and add the memory. If you get -EEXIST, then you can't add it there, so
> move up and try again.
>
> int xen_balloon_add_memory(u64 size)
> {
> unsigned long top_of_mem = max_pfn;
> top_of_mem = section_align_up(top_of_mem);
>
> while (1) {
> int ret = add_memory(nid, top_of_mem, size);
> if (ret == -EEXIST)
> continue;
> // error handling...
> break;
> }
> return...;
> }
>
> As for telling the hypervisor where you've mapped things, that should be
> done in arch_add_memory().
>
> When it comes down to online_page(), you don't want your pages freed
> back in to the buddy allocator, you want them put in to the balloon.
> So, take the __free_page() in online_page(), and put a Xen hook in
> there.
>
> +void __attribute__((weak)) arch_free_hotplug_page(struct page *page)
> +{
> + __free_page(page);
> +}
I somehow have a vague recollection that the __weak was frowned upon? The issues
were that when you compile a pv-ops kernel it can run as baremetal so the..
>
> void online_page(struct page *page)
> {
> unsigned long pfn = page_to_pfn(page);
> ...
> - __free_page(page);
> + arch_free_hotplug_page(page);
> }
>
> Then, have Xen override it:
>
> void arch_free_hotplug_page(struct page *page)
> {
> if (xen_need_to_inflate_balloon())
> put_page_in_balloon(page);
> else
> __free_page(page);
call above would get called even on baremetal (and would require the header
file arch/x86/include/memory_hotplug.h to pull in header file from the balloon
driver). If we are going to pursue this it might be prudent to follow what we did for MSI:
1525bf0d8f059a38c6e79353583854e1981b2e67
294ee6f89cfd629e276f632a6003a0fad7785dce
b5401a96b59475c1c878439caecb8c521bdfd4ad
On Thu, 2011-02-10 at 12:01 -0500, Konrad Rzeszutek Wilk wrote:
> On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote:
> > On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote:
> > > I agree. I discussed a bit about this problem with Jeremy, too. However,
> > > there are some problems to implement that solution now. First of all it is
> > > possible to online hotplugged memory using sysfs interface only in chunks
> > > called sections. It means that it is not possible online once again section
> > > which was onlined ealier partialy populated and now it contains new pages
> > > to online. In this situation sysfs interface emits Invalid argument error.
> > > In theory it should be possible to offline and then online whole section
> > > once again, however, if memory from this section was used is not possible
> > > to do that. It means that those properties does not allow hotplug memory
> > > in guest in finer granulity than section and sysfs interface is too inflexible
> > > to be used in that solution. That is why I decided to online hoplugged memory
> > > using API which does not have those limitations.
> >
> > Sure, you have to _online_ the whole thing at once, but you don't have
> > to actually make the pages available. You also don't need to hook in to
> > the memory resource code like you're doing. It's sufficient to just try
> > and add the memory. If you get -EEXIST, then you can't add it there, so
> > move up and try again.
> >
> > int xen_balloon_add_memory(u64 size)
> > {
> > unsigned long top_of_mem = max_pfn;
> > top_of_mem = section_align_up(top_of_mem);
> >
> > while (1) {
> > int ret = add_memory(nid, top_of_mem, size);
> > if (ret == -EEXIST)
> > continue;
> > // error handling...
> > break;
> > }
> > return...;
> > }
> >
> > As for telling the hypervisor where you've mapped things, that should be
> > done in arch_add_memory().
> >
> > When it comes down to online_page(), you don't want your pages freed
> > back in to the buddy allocator, you want them put in to the balloon.
> > So, take the __free_page() in online_page(), and put a Xen hook in
> > there.
> >
> > +void __attribute__((weak)) arch_free_hotplug_page(struct page *page)
> > +{
> > + __free_page(page);
> > +}
>
> I somehow have a vague recollection that the __weak was frowned upon? The issues
> were that when you compile a pv-ops kernel it can run as baremetal so the..
There are a bunch of alternatives to using 'weak'. Any of those would
probably be fine as well. Anything that allows us to check whether the
page should go back in to the allocator or the balloon.
> > void online_page(struct page *page)
> > {
> > unsigned long pfn = page_to_pfn(page);
> > ...
> > - __free_page(page);
> > + arch_free_hotplug_page(page);
> > }
> >
> > Then, have Xen override it:
> >
> > void arch_free_hotplug_page(struct page *page)
> > {
> > if (xen_need_to_inflate_balloon())
> > put_page_in_balloon(page);
> > else
> > __free_page(page);
>
> call above would get called even on baremetal (and would require the header
> file arch/x86/include/memory_hotplug.h to pull in header file from the balloon
> driver). If we are going to persue this it might be prudent to follow what we did for MSI:
>
> 1525bf0d8f059a38c6e79353583854e1981b2e67
> 294ee6f89cfd629e276f632a6003a0fad7785dce
> b5401a96b59475c1c878439caecb8c521bdfd4ad
That looks a bit complicated for what I'm trying to do here, but
whatever works.
-- Dave
On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote:
> On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote:
> > On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote:
> > > On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote:
> > > > +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> > > > +{
> > > > + resource_size_t r_min, r_size;
> > > > + struct resource *r;
> > > > +
> > > > + /*
> > > > + * Look for first unused memory region starting at page
> > > > + * boundary. Skip last memory section created at boot time
> > > > + * because it may contains unused memory pages with PG_reserved
> > > > + * bit not set (online_pages() require PG_reserved bit set).
> > > > + */
> > >
> > > Could you elaborate on this comment a bit? I think it's covering both
> > > the "PAGE_SIZE" argument to allocate_resource() and something else, but
> > > I'm not quite sure.
> >
> > Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource().
> > Additionally, r_min (calculated below) sets lower limit at which hoplugged
> > memory could be installed (due to PG_reserved bit requirment set up by
> > online_pages()). Later r_min is put as an argument to allocate_resource() call.
>
> OK, and you'll update the comment on that?
OK.
> > > > + r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> > > > +
> > > > + if (!r)
> > > > + return NULL;
> > > > +
> > > > + r->name = "System RAM";
> > > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> > > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));
> > >
> > > Did you do this for alignment reasons? It might be a better idea to
> > > just make a nice sparsemem function to do alignment.
> >
> > Please look above.
>
> You spoke about page alignment up there. Why is this section-aligned?
> Should we make an "align to section" function in generic sparsemem code?
It is done because all pages in relevant section starting from max_pfn
to the end of that section do not have PG_reserved bit set. It was tested
on Linux Kernel Ver. 2.6.32.x, however, I am going to do some tests on
current Linus tree. Currently, I do not expect that "align to section"
function is required by others.
> > > > + r_size = nr_pages << PAGE_SHIFT;
> > > > +
> > > > + if (allocate_resource(&iomem_resource, r, r_size, r_min,
> > > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> > > > + kfree(r);
> > > > + return NULL;
> > > > + }
> > > > +
> > > > + return r;
> > > > +}
> > >
> > > This function should probably be made generic. I bet some more
> > > hypervisors come around and want to use this. They generally won't care
> > > where the memory goes, and the kernel can allocate a spot for them.
> >
> > Yes, you are right. I think about new project in which
> > this function will be generic and then I would move it to
> > some more generic place. Now, I think it should stay here.
>
> Please move it to generic code. It doesn't belong in Xen code.
OK.
> > > In other words, I think you can take everything from and including
> > > online_pages() down in the function and take it out. Use a udev hotplug
> > > rule to online it immediately if that's what you want.
> >
> > I agree. I discussed a bit about this problem with Jeremy, too. However,
> > there are some problems to implement that solution now. First of all it is
> > possible to online hotplugged memory using sysfs interface only in chunks
> > called sections. It means that it is not possible online once again section
> > which was onlined ealier partialy populated and now it contains new pages
> > to online. In this situation sysfs interface emits Invalid argument error.
> > In theory it should be possible to offline and then online whole section
> > once again, however, if memory from this section was used is not possible
> > to do that. It means that those properties does not allow hotplug memory
> > in guest in finer granulity than section and sysfs interface is too inflexible
> > to be used in that solution. That is why I decided to online hoplugged memory
> > using API which does not have those limitations.
>
> Sure, you have to _online_ the whole thing at once, but you don't have
> to actually make the pages available. You also don't need to hook in to
> the memory resource code like you're doing. It's sufficient to just try
> and add the memory. If you get -EEXIST, then you can't add it there, so
> move up and try again.
>
> int xen_balloon_add_memory(u64 size)
> {
> unsigned long top_of_mem = max_pfn;
> top_of_mem = section_align_up(top_of_mem);
>
> while (1) {
> int ret = add_memory(nid, top_of_mem, size);
> if (ret == -EEXIST)
> continue;
> // error handling...
> break;
> }
> return...;
> }
>
> As for telling the hypervisor where you've mapped things, that should be
> done in arch_add_memory().
>
> When it comes down to online_page(), you don't want your pages freed
> back in to the buddy allocator, you want them put in to the balloon.
> So, take the __free_page() in online_page(), and put a Xen hook in
> there.
>
> +void __attribute__((weak)) arch_free_hotplug_page(struct page *page)
> +{
> + __free_page(page);
> +}
I think that this function should be registered dynamically at init
stage by specific balloon driver (in this case Xen balloon driver).
> void online_page(struct page *page)
> {
> unsigned long pfn = page_to_pfn(page);
> ...
> - __free_page(page);
> + arch_free_hotplug_page(page);
> }
>
> Then, have Xen override it:
>
> void arch_free_hotplug_page(struct page *page)
> {
> if (xen_need_to_inflate_balloon())
> put_page_in_balloon(page);
> else
> __free_page(page);
> }
The whole idea looks quite interesting. I will try to migrate
to that solution.
> > Additionally, IIRC, add_memory() requires
> > that underlying memory is available before its call.
>
> No, that's not correct. s390's memory isn't available until after it
> calls vmem_add_mapping(). See arch/s390/mm/init.c
I was right to some extent. First versions of memory hotplug code were
written on the base of Linux Kernel Ver. 2.6.32.x. Tests done on those
versions showed that add_memory() required that underlying memory should
be available before its call. However, after short investigation it came
out that there are some issues with some Xen calls. Those issues do
not exist in the current Linus tree.
Daniel
On Sat, 2011-02-12 at 00:13 +0100, Daniel Kiper wrote:
> On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote:
> > You spoke about page alignment up there. Why is this section-aligned?
> > Should we make an "align to section" function in generic sparsemem code?
>
> It is done because all pages in relevant section starting from max_pfn
> to the end of that section do not have PG_reserved bit set. It was tested
> on Linux Kernel Ver. 2.6.32.x, however, I am going to do some tests on
> current Linus tree. Currently, I do not expect that "align to section"
> function is required by others.
It doesn't matter if it gets used by anybody else. It's a generic
function that fits in well with the other sparsemem code. It should go
there.
...
> > As for telling the hypervisor where you've mapped things, that should be
> > done in arch_add_memory().
> >
> > When it comes down to online_page(), you don't want your pages freed
> > back in to the buddy allocator, you want them put in to the balloon.
> > So, take the __free_page() in online_page(), and put a Xen hook in
> > there.
> >
> > +void __attribute__((weak)) arch_free_hotplug_page(struct page *page)
> > +{
> > + __free_page(page);
> > +}
>
> I think that this function should be registered dynamically at the init
> stage by the specific balloon driver (in this case the Xen balloon driver).
That sounds fine to me. I guess we could use some of the subarch stuff
or the pv_ops structure to do it as well. This isn't exactly a hot
path, either, so I'm not worried about it being some kind of
conditional.
Really, anything that allows us to divert pages over to the Xen balloon
driver rather than the buddy allocator is probably just fine.
> > > Additionally, IIRC, add_memory() requires
> > > that underlying memory is available before its call.
> >
> > No, that's not correct. s390's memory isn't available until after it
> > calls vmem_add_mapping(). See arch/s390/mm/init.c
>
> I was right to some extent. The first versions of the memory hotplug code were
> written on the basis of Linux Kernel Ver. 2.6.32.x. Tests done on those
> versions showed that add_memory() required that the underlying memory be
> available before its call. However, after a short investigation it came
> out that there are some issues with some Xen calls. Those issues do
> not exist in the current Linus tree.
Sounds good, I'm looking forward to your next patch.
-- Dave
On Thu, Feb 10, 2011 at 11:53:16AM -0500, Konrad Rzeszutek Wilk wrote:
> On Thu, Feb 03, 2011 at 05:30:33PM +0100, Daniel Kiper wrote:
[...]
> > +static struct resource *adjust_memory_resource(struct resource *r, unsigned long nr_pages)
> > +{
> > + int rc;
> > +
> > + if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) {
>
> Will this actually occur? Say I called 'allocate_additional_memory' with 512
> and got -ENOMEM (so the hypercall failed completely). The mh->policy
> MH_POLICY_STOP_AT_FIRST_ERROR. So we end up here. Assume the r_min is
> 0x100000000, then r->start is 0x100000000 and r->end is 0x100200000.
>
> So:
> 100200001 - (200000) == 0x100000000 ?
r->end always points to the last byte of the currently allocated resource.
It means that: r->end == r->start + size - 1
> > + rc = release_resource(r);
> > + BUG_ON(rc < 0);
> > + kfree(r);
> > + return NULL;
> > + }
> > +
> > + rc = adjust_resource(r, r->start, r->end + 1 - r->start -
> > + (nr_pages << PAGE_SHIFT));
>
> If we wanted 512 pages, and only got 256 and want to adjust the region, we
> would want it be:
> 0x100000000 -> 0x100100000 right?
>
> So with the third argument that comes out to be:
>
> 0x100200000 + 1 - 0x100000000 - (100000) = 100001
>
> which is just one page above what we requested?
Please, look above.
> > +
> > + BUG_ON(rc < 0);
>
> Can we just do WARN_ON, and return NULL instead (and also release the resource)?
I will rethink that once again.
> > +
> > + return r;
> > +}
> > +
> > +static int allocate_additional_memory(struct resource *r, unsigned long nr_pages)
> > +{
> > + int rc;
> > + struct xen_memory_reservation reservation = {
> > + .address_bits = 0,
> > + .extent_order = 0,
> > + .domid = DOMID_SELF
> > + };
> > + unsigned long flags, i, pfn, pfn_start;
> > +
> > + if (!nr_pages)
> > + return 0;
> > +
> > + pfn_start = PFN_UP(r->end) - nr_pages;
> > +
> > + if (nr_pages > ARRAY_SIZE(frame_list))
> > + nr_pages = ARRAY_SIZE(frame_list);
> > +
> > + for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn)
> > + frame_list[i] = pfn;
> > +
> > + set_xen_guest_handle(reservation.extent_start, frame_list);
> > + reservation.nr_extents = nr_pages;
> > +
> > + spin_lock_irqsave(&xen_reservation_lock, flags);
> > +
> > + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
> > +
> > + if (rc <= 0)
> > + return (rc < 0) ? rc : -ENOMEM;
> > +
>
> So if we populated some of them (say we want to 512, but only did 64),
> don't we want to do the loop below? Also you look to be forgetting to
> do a spin_unlock_irqrestore if you quit here.
The loop which you mentioned is skipped only when HYPERVISOR_memory_op()
does not allocate anything (0 pages) or something went wrong and
an error code was returned.
The spin_lock_irqsave()/spin_unlock_irqrestore() calls should be removed,
as was done in increase_reservation()/decrease_reservation().
I overlooked those calls. Thanks.
> > + for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) {
> > + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
> > + phys_to_machine_mapping_valid(pfn));
> > + set_phys_to_machine(pfn, frame_list[i]);
> > + }
> > +
> > + spin_unlock_irqrestore(&xen_reservation_lock, flags);
> > +
> > + return rc;
> > +}
> > +
> > +static void hotplug_allocated_memory(struct resource *r)
> > {
> > - unsigned long target = balloon_stats.target_pages;
> > + int nid, rc;
> > + resource_size_t r_size;
> > + struct memory_block *mem;
> > + unsigned long pfn;
> > +
> > + r_size = r->end + 1 - r->start;
>
> Why bump it by one byte?
Please, look above.
> > + nid = memory_add_physaddr_to_nid(r->start);
> > +
> > + rc = add_registered_memory(nid, r->start, r_size);
> > +
> > + if (rc) {
> > + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n",
> > + __func__, rc);
> > + balloon_stats.target_pages = balloon_stats.current_pages;
> > + return;
> > + }
> > +
> > + if (xen_pv_domain())
> > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)
>
> I think you want the r->start to be PFN_UP just in case r->start is not page
> aligned. Though I am not sure it would even happen anymore, as M A Young
> found the culprit that made it possible for us to set up memory regions
> non-aligned, and that is fixed now (in 2.6.38).
r->start is always page aligned because allocate_resource()
always returns a page-aligned resource (this is forced by its arguments).
> > + if (!PageHighMem(pfn_to_page(pfn))) {
> > + rc = HYPERVISOR_update_va_mapping(
> > + (unsigned long)__va(pfn << PAGE_SHIFT),
> > + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0);
> > + BUG_ON(rc);
> > + }
> >
> > - target = min(target,
> > - balloon_stats.current_pages +
> > - balloon_stats.balloon_low +
> > - balloon_stats.balloon_high);
> > + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);
> >
> > - return target;
> > + if (rc) {
> > + pr_err("%s: online_pages: Failed: %i\n", __func__, rc);
> > + balloon_stats.target_pages = balloon_stats.current_pages;
> > + return;
> > + }
> > +
> > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) {
>
> Ditto. Can you do PFN_UP(r->start)?
Please, look above.
> > + mem = find_memory_block(__pfn_to_section(pfn));
> > + BUG_ON(!mem);
> > + BUG_ON(!present_section_nr(mem->phys_index));
> > + mutex_lock(&mem->state_mutex);
> > + mem->state = MEM_ONLINE;
> > + mutex_unlock(&mem->state_mutex);
> > + }
> > +
> > + balloon_stats.current_pages += r_size >> PAGE_SHIFT;
> > }
> >
> > +static enum bp_state request_additional_memory(long credit)
> > +{
> > + int rc;
> > + static struct resource *r;
> > + static unsigned long pages_left;
> > +
> > + if ((credit <= 0 || balloon_stats.balloon_low ||
> > + balloon_stats.balloon_high) && !r)
> > + return BP_DONE;
> > +
> > + if (!r) {
> > + r = allocate_memory_resource(credit);
> > +
> > + if (!r)
> > + return BP_ERROR;
> > +
> > + pages_left = credit;
> > + }
> > +
> > + rc = allocate_additional_memory(r, pages_left);
> > +
> > + if (rc < 0) {
> > + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS)
> > + return BP_ERROR;
> > +
> > + r = adjust_memory_resource(r, pages_left);
> > +
> > + if (!r)
> > + return BP_ERROR;
>
> Say we failed the hypercall completely and got -ENOMEM from the 'allocate_additional_memory'.
> I presume the adjust_memory_resource at this point would have deleted 'r', which means
> that (!r) and we return BP_ERROR.
>
> But that means that we aren't following the MH_POLICY_STOP_AT_FIRST_ERROR as
> the balloon_process will retry again and the again, and again??
You are right. It should be return BP_DONE.
> > + } else {
> > + pages_left -= rc;
> > +
>
> So say I request 512 pages (mh_policy is MH_POLICY_STOP_AT_FIRST_ERROR),
> but only got 256. I adjust the pages_left to be 256 and then
> > + if (pages_left)
> > + return BP_HUNGRY;
>
> we return BP_HUNGRY. That makes 'balloon_process' retry with 512 pages, and we
> keep on trying and call "allocate_additional_memory", which fails once more
> (returns 256), and we end up returning BP_HUNGRY, and retry... and so on.
>
> Would it make sense to have a check here for MH_POLICY_STOP_AT_FIRST_ERROR
> and, if so, call adjust_memory_resource as well?
Here it is OK. The first time, allocate_additional_memory() returns 256, which
is OK, and the next time, if more memory is not available, it returns rc < 0,
which forces execution of "if (rc < 0) {..." (as expected).
Daniel