Currently ballooned out pages are mapped to 0 and have INVALID_P2M_ENTRY
in the p2m. These ballooned out pages are used to map foreign grants
by gntdev and blkback (see alloc_xenballooned_pages).
Allocate a page per cpu and map all the ballooned out pages to the
corresponding mfn. Set the p2m accordingly. This way reading from a
ballooned out page won't cause a kernel crash (see
http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html).
Signed-off-by: Stefano Stabellini <[email protected]>
CC: [email protected]
CC: [email protected]
---
drivers/xen/balloon.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++--
include/xen/balloon.h | 3 ++
2 files changed, 58 insertions(+), 3 deletions(-)
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 930fb68..b9260dd 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -36,6 +36,7 @@
* IN THE SOFTWARE.
*/
+#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/errno.h>
@@ -50,6 +51,7 @@
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/percpu-defs.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -88,6 +90,8 @@ EXPORT_SYMBOL_GPL(balloon_stats);
/* We increase/decrease in batches which fit in a page */
static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
+static DEFINE_PER_CPU(struct page *, balloon_scratch_page);
+
#ifdef CONFIG_HIGHMEM
#define inc_totalhigh_pages() (totalhigh_pages++)
@@ -423,7 +427,8 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
if (xen_pv_domain() && !PageHighMem(page)) {
ret = HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
- __pte_ma(0), 0);
+ pfn_pte(page_to_pfn(__get_cpu_var(balloon_scratch_page)),
+ PAGE_KERNEL_RO), 0);
BUG_ON(ret);
}
#endif
@@ -436,7 +441,8 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
pfn = mfn_to_pfn(frame_list[i]);
- __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ __set_phys_to_machine(pfn,
+ pfn_to_mfn(page_to_pfn(__get_cpu_var(balloon_scratch_page))));
balloon_append(pfn_to_page(pfn));
}
@@ -491,6 +497,18 @@ static void balloon_process(struct work_struct *work)
mutex_unlock(&balloon_mutex);
}
+struct page* get_balloon_scratch_page(void)
+{
+ struct page *ret = get_cpu_var(balloon_scratch_page);
+ BUG_ON(ret == NULL);
+ return ret;
+}
+
+void put_balloon_scratch_page(void)
+{
+ put_cpu_var(balloon_scratch_page);
+}
+
/* Resets the Xen limit, sets new target, and kicks off processing. */
void balloon_set_new_target(unsigned long target)
{
@@ -584,13 +602,47 @@ static void __init balloon_add_region(unsigned long start_pfn,
}
}
+static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (long)hcpu;
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (per_cpu(balloon_scratch_page, cpu) != NULL)
+ break;
+ per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL);
+ if (per_cpu(balloon_scratch_page, cpu) == NULL) {
+ pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu);
+ return NOTIFY_BAD;
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block balloon_cpu_notifier __cpuinitdata = {
+ .notifier_call = balloon_cpu_notify,
+};
+
static int __init balloon_init(void)
{
- int i;
+ int i, cpu;
if (!xen_domain())
return -ENODEV;
+ for_each_online_cpu(cpu)
+ {
+ per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL);
+ if (per_cpu(balloon_scratch_page, cpu) == NULL) {
+ pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu);
+ return -ENOMEM;
+ }
+ }
+ register_cpu_notifier(&balloon_cpu_notifier);
+
pr_info("xen/balloon: Initialising balloon driver.\n");
balloon_stats.current_pages = xen_pv_domain()
diff --git a/include/xen/balloon.h b/include/xen/balloon.h
index cc2e1a7..7a819b7 100644
--- a/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@ -29,6 +29,9 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages,
bool highmem);
void free_xenballooned_pages(int nr_pages, struct page **pages);
+struct page* get_balloon_scratch_page(void);
+void put_balloon_scratch_page(void);
+
struct device;
#ifdef CONFIG_XEN_SELFBALLOONING
extern int register_xen_selfballooning(struct device *dev);
--
1.7.2.5
On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> + unsigned long action, void *hcpu)
> +{
> + int cpu = (long)hcpu;
> + switch (action) {
> + case CPU_UP_PREPARE:
> + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> + break;
Thinking about this a bit more -- do we know what happens to the per-cpu
area for a CPU which is unplugged and then reintroduced? Is it preserved
or is it reset?
If it is reset then this gets more complicated :-( We might be able to
use the core mm page reference count, so that when the last reference is
removed the page is automatically reclaimed. We can obviously take a
reference whenever we add a mapping of the trade page, but I'm not sure
we are always on the path which removes such mappings... Even then you
could waste pages for some potentially large amount of time each time
you replug a VCPU.
Urg, I really hope the per-cpu area is preserved!
Ian.
On Tue, Jul 23, 2013 at 07:00:09PM +0100, Ian Campbell wrote:
> On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> > +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> > + unsigned long action, void *hcpu)
> > +{
> > + int cpu = (long)hcpu;
> > + switch (action) {
> > + case CPU_UP_PREPARE:
> > + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> > + break;
>
> Thinking about this a bit more -- do we know what happens to the per-cpu
> area for a CPU which is unplugged and then reintroduced? Is it preserved
> or is it reset?
>
> If it is reset then this gets more complicated :-( We might be able to
> use the core mm page reference count, so that when the last reference is
> removed the page is automatically reclaimed. We can obviously take a
> reference whenever we add a mapping of the trade page, but I'm not sure
> we are always on the path which removes such mappings... Even then you
> could waste pages for some potentially large amount of time each time
> you replug a VCPU.
>
> Urg, I really hope the per-cpu area is preserved!
It is. During bootup time you see this:
[ 0.000000] smpboot: Allowing 128 CPUs, 96 hotplug CPU
[ 0.000000] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:128 nr_node_ids:1
which means that all of the per_CPU are shrunk down to 128 (from
CONFIG_NR_CPUS=512 was built with) and stays for the lifetime of the kernel.
You might have to clear it when the vCPU comes back up though - otherwise you
will have garbage.
Or you can use the zalloc_cpumask_var_node which will allocate a dynamic
version of this. (based on the possible_cpus - so in this case 128).
>
> Ian.
>
On Tue, 23 Jul 2013, Konrad Rzeszutek Wilk wrote:
> On Tue, Jul 23, 2013 at 07:00:09PM +0100, Ian Campbell wrote:
> > On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> > > +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> > > + unsigned long action, void *hcpu)
> > > +{
> > > + int cpu = (long)hcpu;
> > > + switch (action) {
> > > + case CPU_UP_PREPARE:
> > > + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> > > + break;
> >
> > Thinking about this a bit more -- do we know what happens to the per-cpu
> > area for a CPU which is unplugged and then reintroduced? Is it preserved
> > or is it reset?
> >
> > If it is reset then this gets more complicated :-( We might be able to
> > use the core mm page reference count, so that when the last reference is
> > removed the page is automatically reclaimed. We can obviously take a
> > reference whenever we add a mapping of the trade page, but I'm not sure
> > we are always on the path which removes such mappings... Even then you
> > could waste pages for some potentially large amount of time each time
> > you replug a VCPU.
> >
> > Urg, I really hope the per-cpu area is preserved!
>
> It is. During bootup time you see this:
>
> [ 0.000000] smpboot: Allowing 128 CPUs, 96 hotplug CPU
> [ 0.000000] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:128 nr_node_ids:1
>
> which means that all of the per_CPU are shrunk down to 128 (from
> CONFIG_NR_CPUS=512 was built with) and stays for the lifetime of the kernel.
>
> You might have to clear it when the vCPU comes back up though - otherwise you
> will have garbage.
I don't see anything in the hotplug code that would modify the value of
the per_cpu area of offline cpus.
On Wed, Jul 24, 2013 at 12:05:05PM +0100, Stefano Stabellini wrote:
> On Tue, 23 Jul 2013, Konrad Rzeszutek Wilk wrote:
> > On Tue, Jul 23, 2013 at 07:00:09PM +0100, Ian Campbell wrote:
> > > On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> > > > +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> > > > + unsigned long action, void *hcpu)
> > > > +{
> > > > + int cpu = (long)hcpu;
> > > > + switch (action) {
> > > > + case CPU_UP_PREPARE:
> > > > + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> > > > + break;
> > >
> > > Thinking about this a bit more -- do we know what happens to the per-cpu
> > > area for a CPU which is unplugged and then reintroduced? Is it preserved
> > > or is it reset?
> > >
> > > If it is reset then this gets more complicated :-( We might be able to
> > > use the core mm page reference count, so that when the last reference is
> > > removed the page is automatically reclaimed. We can obviously take a
> > > reference whenever we add a mapping of the trade page, but I'm not sure
> > > we are always on the path which removes such mappings... Even then you
> > > could waste pages for some potentially large amount of time each time
> > > you replug a VCPU.
> > >
> > > Urg, I really hope the per-cpu area is preserved!
> >
> > It is. During bootup time you see this:
> >
> > [ 0.000000] smpboot: Allowing 128 CPUs, 96 hotplug CPU
> > [ 0.000000] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:128 nr_node_ids:1
> >
> > which means that all of the per_CPU are shrunk down to 128 (from
> > CONFIG_NR_CPUS=512 was built with) and stays for the lifetime of the kernel.
> >
> > You might have to clear it when the vCPU comes back up though - otherwise you
> > will have garbage.
>
> I don't see anything in the hotplug code that would modify the value of
> the per_cpu area of offline cpus.
You might have never onlined the CPUs and the kernel is built with DEBUG options
which poison the page.
Anyhow, doing a memset seems like a prudent thing to do? Perhaps when
built with CONFIG_DEBUG_XENFS you add poison values to it?
On 23/07/13 18:27, Stefano Stabellini wrote:
> Currently ballooned out pages are mapped to 0 and have INVALID_P2M_ENTRY
> in the p2m. These ballooned out pages are used to map foreign grants
> by gntdev and blkback (see alloc_xenballooned_pages).
>
> Allocate a page per cpu and map all the ballooned out pages to the
> corresponding mfn. Set the p2m accordingly. This way reading from a
> ballooned out page won't cause a kernel crash (see
> http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html).
Reviewed-by: David Vrabel <[email protected]>
A number of users of DEFINE_PER_CPU() initialize it with
for_each_possible_cpu() without registering a cpu notifier, so I think
there is no risk that offlining a CPU clears its per-cpu data and the
code as-is is fine.
David
On Wed, 2013-07-24 at 10:58 -0400, Konrad Rzeszutek Wilk wrote:
> On Wed, Jul 24, 2013 at 12:05:05PM +0100, Stefano Stabellini wrote:
> > On Tue, 23 Jul 2013, Konrad Rzeszutek Wilk wrote:
> > > On Tue, Jul 23, 2013 at 07:00:09PM +0100, Ian Campbell wrote:
> > > > On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> > > > > +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> > > > > + unsigned long action, void *hcpu)
> > > > > +{
> > > > > + int cpu = (long)hcpu;
> > > > > + switch (action) {
> > > > > + case CPU_UP_PREPARE:
> > > > > + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> > > > > + break;
> > > >
> > > > Thinking about this a bit more -- do we know what happens to the per-cpu
> > > > area for a CPU which is unplugged and then reintroduced? Is it preserved
> > > > or is it reset?
> > > >
> > > > If it is reset then this gets more complicated :-( We might be able to
> > > > use the core mm page reference count, so that when the last reference is
> > > > removed the page is automatically reclaimed. We can obviously take a
> > > > reference whenever we add a mapping of the trade page, but I'm not sure
> > > > we are always on the path which removes such mappings... Even then you
> > > > could waste pages for some potentially large amount of time each time
> > > > you replug a VCPU.
> > > >
> > > > Urg, I really hope the per-cpu area is preserved!
> > >
> > > It is. During bootup time you see this:
> > >
> > > [ 0.000000] smpboot: Allowing 128 CPUs, 96 hotplug CPU
> > > [ 0.000000] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:128 nr_node_ids:1
> > >
> > > which means that all of the per_CPU are shrunk down to 128 (from
> > > CONFIG_NR_CPUS=512 was built with) and stays for the lifetime of the kernel.
> > >
> > > You might have to clear it when the vCPU comes back up though - otherwise you
> > > will have garbage.
> >
> > I don't see anything in the hotplug code that would modify the value of
> > the per_cpu area of offline cpus.
>
> You might have never onlined the CPUs and the kernel is built with DEBUG options
> which poison the page.
>
> Anyhow, doing a memset seems like a prudent thing to do? Perhaps when
> built with CONFIG_DEBUG_XENFS you add poison values to it?
The point is that the patches need for the per-cpu areas to *not* be
reinitialised over a vcpu unplug+plug, otherwise we will leak the
original page when we allocate the new one on plug.
We can't just free the page on vcpu unplug because it might still be in
use.
Ian.
On Thu, Jul 25, 2013 at 04:31:07AM +0100, Ian Campbell wrote:
> On Wed, 2013-07-24 at 10:58 -0400, Konrad Rzeszutek Wilk wrote:
> > On Wed, Jul 24, 2013 at 12:05:05PM +0100, Stefano Stabellini wrote:
> > > On Tue, 23 Jul 2013, Konrad Rzeszutek Wilk wrote:
> > > > On Tue, Jul 23, 2013 at 07:00:09PM +0100, Ian Campbell wrote:
> > > > > On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> > > > > > +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> > > > > > + unsigned long action, void *hcpu)
> > > > > > +{
> > > > > > + int cpu = (long)hcpu;
> > > > > > + switch (action) {
> > > > > > + case CPU_UP_PREPARE:
> > > > > > + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> > > > > > + break;
> > > > >
> > > > > Thinking about this a bit more -- do we know what happens to the per-cpu
> > > > > area for a CPU which is unplugged and then reintroduced? Is it preserved
> > > > > or is it reset?
> > > > >
> > > > > If it is reset then this gets more complicated :-( We might be able to
> > > > > use the core mm page reference count, so that when the last reference is
> > > > > removed the page is automatically reclaimed. We can obviously take a
> > > > > reference whenever we add a mapping of the trade page, but I'm not sure
> > > > > we are always on the path which removes such mappings... Even then you
> > > > > could waste pages for some potentially large amount of time each time
> > > > > you replug a VCPU.
> > > > >
> > > > > Urg, I really hope the per-cpu area is preserved!
> > > >
> > > > It is. During bootup time you see this:
> > > >
> > > > [ 0.000000] smpboot: Allowing 128 CPUs, 96 hotplug CPU
> > > > [ 0.000000] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:128 nr_node_ids:1
> > > >
> > > > which means that all of the per_CPU are shrunk down to 128 (from
> > > > CONFIG_NR_CPUS=512 was built with) and stays for the lifetime of the kernel.
> > > >
> > > > You might have to clear it when the vCPU comes back up though - otherwise you
> > > > will have garbage.
> > >
> > > I don't see anything in the hotplug code that would modify the value of
> > > the per_cpu area of offline cpus.
> >
> > You might have never onlined the CPUs and the kernel is built with DEBUG options
> > which poison the page.
> >
> > Anyhow, doing a memset seems like a prudent thing to do? Perhaps when
> > built with CONFIG_DEBUG_XENFS you add poison values to it?
>
> The point is that the patches need for the per-cpu areas to *not* be
> reinitialised over a vcpu unplug+plug, otherwise we will leak the
> original page when we allocate the new one on plug.
OK.
>
> We can't just free the page on vcpu unplug because it might still be in
> use.
I am still worried about before-the-cpu-is-up-the-per-cpu-has-garbage case.
We could add code in the boot-before-smp (so when there is only one CPU) to
do:
for_each_possible_cpu(cpu)
memset(__per_cpu(some_memory),0,sizeof...);
and then I think it satisfies your concerns and mine?
>
> Ian.
>
On Mon, 29 Jul 2013, Konrad Rzeszutek Wilk wrote:
> On Thu, Jul 25, 2013 at 04:31:07AM +0100, Ian Campbell wrote:
> > On Wed, 2013-07-24 at 10:58 -0400, Konrad Rzeszutek Wilk wrote:
> > > On Wed, Jul 24, 2013 at 12:05:05PM +0100, Stefano Stabellini wrote:
> > > > On Tue, 23 Jul 2013, Konrad Rzeszutek Wilk wrote:
> > > > > On Tue, Jul 23, 2013 at 07:00:09PM +0100, Ian Campbell wrote:
> > > > > > On Tue, 2013-07-23 at 18:27 +0100, Stefano Stabellini wrote:
> > > > > > > +static int __cpuinit balloon_cpu_notify(struct notifier_block *self,
> > > > > > > + unsigned long action, void *hcpu)
> > > > > > > +{
> > > > > > > + int cpu = (long)hcpu;
> > > > > > > + switch (action) {
> > > > > > > + case CPU_UP_PREPARE:
> > > > > > > + if (per_cpu(balloon_scratch_page, cpu) != NULL)
> > > > > > > + break;
> > > > > >
> > > > > > Thinking about this a bit more -- do we know what happens to the per-cpu
> > > > > > area for a CPU which is unplugged and then reintroduced? Is it preserved
> > > > > > or is it reset?
> > > > > >
> > > > > > If it is reset then this gets more complicated :-( We might be able to
> > > > > > use the core mm page reference count, so that when the last reference is
> > > > > > removed the page is automatically reclaimed. We can obviously take a
> > > > > > reference whenever we add a mapping of the trade page, but I'm not sure
> > > > > > we are always on the path which removes such mappings... Even then you
> > > > > > could waste pages for some potentially large amount of time each time
> > > > > > you replug a VCPU.
> > > > > >
> > > > > > Urg, I really hope the per-cpu area is preserved!
> > > > >
> > > > > It is. During bootup time you see this:
> > > > >
> > > > > [ 0.000000] smpboot: Allowing 128 CPUs, 96 hotplug CPU
> > > > > [ 0.000000] setup_percpu: NR_CPUS:512 nr_cpumask_bits:512 nr_cpu_ids:128 nr_node_ids:1
> > > > >
> > > > > which means that all of the per_CPU are shrunk down to 128 (from
> > > > > CONFIG_NR_CPUS=512 was built with) and stays for the lifetime of the kernel.
> > > > >
> > > > > You might have to clear it when the vCPU comes back up though - otherwise you
> > > > > will have garbage.
> > > >
> > > > I don't see anything in the hotplug code that would modify the value of
> > > > the per_cpu area of offline cpus.
> > >
> > > You might have never onlined the CPUs and the kernel is built with DEBUG options
> > > which poison the page.
> > >
> > > Anyhow, doing a memset seems like a prudent thing to do? Perhaps when
> > > built with CONFIG_DEBUG_XENFS you add poison values to it?
> >
> > The point is that the patches need for the per-cpu areas to *not* be
> > reinitialised over a vcpu unplug+plug, otherwise we will leak the
> > original page when we allocate the new one on plug.
>
> OK.
> >
> > We can't just free the page on vcpu unplug because it might still be in
> > use.
>
> I am still worried about before-the-cpu-is-up-the-per-cpu-has-garbage case.
> We could add code in the boot-before-smp (so when there is only one CPU) to
> do:
>
> for_each_possible_cpu(cpu)
> memset(__per_cpu(some_memory),0,sizeof...);
>
> and then I think it satisfies your concerns and mine?
OK, I'll add an early_initcall.