2010-04-04 21:52:12

by Dmitry Torokhov

Subject: [PATCH] VMware Balloon driver

This is a standalone version of the VMware Balloon driver. Unlike the
previous version, which tried to integrate the VMware ballooning transport
into the virtio subsystem and use the stock virtio_balloon driver, this one
implements both the controlling thread/algorithm and the hypervisor
transport.

We are submitting a standalone driver because the KVM maintainer (Avi
Kivity) expressed the opinion (rightly) that our transport does not fit
well into the virtqueue paradigm and thus it does not make much sense to
integrate with virtio.

Signed-off-by: Dmitry Torokhov <[email protected]>
---

arch/x86/kernel/cpu/vmware.c | 2
drivers/misc/Kconfig | 16 +
drivers/misc/Makefile | 1
drivers/misc/vmware_balloon.c | 745 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 764 insertions(+), 0 deletions(-)
create mode 100644 drivers/misc/vmware_balloon.c


diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97..dfdb4db 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
*/

#include <linux/dmi.h>
+#include <linux/module.h>
#include <asm/div64.h>
#include <asm/vmware.h>
#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)

return 0;
}
+EXPORT_SYMBOL(vmware_platform);

/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 2191c8d..0d0d625 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -311,6 +311,22 @@ config TI_DAC7512
This driver can also be built as a module. If so, the module
will be calles ti_dac7512.

+config VMWARE_BALLOON
+ tristate "VMware Balloon Driver"
+ depends on X86
+ help
+ This is VMware physical memory management driver which acts
+ like a "balloon" that can be inflated to reclaim physical pages
+ by reserving them in the guest and invalidating them in the
+ monitor, freeing up the underlying machine pages so they can
+ be allocated to other guests. The balloon can also be deflated
+ to allow the guest to use more physical memory.
+
+ If unsure, say N.
+
+ To compile this driver as a module, choose M here: the
+ module will be called vmware_balloon.
+
source "drivers/misc/c2port/Kconfig"
source "drivers/misc/eeprom/Kconfig"
source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 27c4843..7b6f7ee 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_C2PORT) += c2port/
obj-$(CONFIG_IWMC3200TOP) += iwmc3200top/
obj-y += eeprom/
obj-y += cb710/
+obj-$(CONFIG_VMWARE_BALLOON) += vmware_balloon.o
diff --git a/drivers/misc/vmware_balloon.c b/drivers/misc/vmware_balloon.c
new file mode 100644
index 0000000..6164355
--- /dev/null
+++ b/drivers/misc/vmware_balloon.c
@@ -0,0 +1,745 @@
+/*
+ * VMware Balloon driver.
+ *
+ * Copyright (C) 2000-2010, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Maintained by: Dmitry Torokhov <[email protected]>
+ */
+
+/*
+ * This is VMware physical memory management driver for Linux. The driver
+ * acts like a "balloon" that can be inflated to reclaim physical pages by
+ * reserving them in the guest and invalidating them in the monitor,
+ * freeing up the underlying machine pages so they can be allocated to
+ * other guests. The balloon can also be deflated to allow the guest to
+ * use more physical memory. Higher level policies can control the sizes
+ * of balloons in VMs in order to manage physical memory resources.
+ */
+
+//#define DEBUG
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/vmware.h>
+
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
+MODULE_VERSION("1.2.1.0-K");
+MODULE_ALIAS("dmi:*:svnVMware*:*");
+MODULE_ALIAS("vmware_vmmemctl");
+MODULE_LICENSE("GPL");
+
+#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U
+
+#define VMW_BALLOON_RATE_ALLOC_MIN 512U
+#define VMW_BALLOON_RATE_ALLOC_MAX 2048U
+#define VMW_BALLOON_RATE_ALLOC_INC 16U
+
+#define VMW_BALLOON_RATE_FREE_MIN 512U
+#define VMW_BALLOON_RATE_FREE_MAX 16384U
+#define VMW_BALLOON_RATE_FREE_INC 16U
+
+/*
+ * When guest is under memory pressure, use a reduced page allocation
+ * rate for next several cycles.
+ */
+#define VMW_BALLOON_SLOW_CYCLES 4
+
+/*
+ * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
+ * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+ * __GFP_NOWARN, to suppress page allocation failure warnings.
+ */
+#define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM|__GFP_NOWARN)
+
+/*
+ * Use GFP_HIGHUSER when executing in a separate kernel thread
+ * context and allocation can sleep. This is less stressful to
+ * the guest memory system, since it allows the thread to block
+ * while memory is reclaimed, and won't take pages from emergency
+ * low-memory pools.
+ */
+#define VMW_PAGE_ALLOC_CANSLEEP (GFP_HIGHUSER)
+
+/* Maximum number of page allocations without yielding processor */
+#define VMW_BALLOON_YIELD_THRESHOLD 1024
+
+#define VMW_BALLOON_HV_PORT 0x5670
+#define VMW_BALLOON_HV_MAGIC 0x456c6d6f
+#define VMW_BALLOON_PROTOCOL_VERSION 2
+#define VMW_BALLOON_GUEST_ID 1 /* Linux */
+
+#define VMW_BALLOON_CMD_START 0
+#define VMW_BALLOON_CMD_GET_TARGET 1
+#define VMW_BALLOON_CMD_LOCK 2
+#define VMW_BALLOON_CMD_UNLOCK 3
+#define VMW_BALLOON_CMD_GUEST_ID 4
+
+/* error codes */
+#define VMW_BALLOON_SUCCESS 0
+#define VMW_BALLOON_FAILURE -1
+#define VMW_BALLOON_ERROR_CMD_INVALID 1
+#define VMW_BALLOON_ERROR_PPN_INVALID 2
+#define VMW_BALLOON_ERROR_PPN_LOCKED 3
+#define VMW_BALLOON_ERROR_PPN_UNLOCKED 4
+#define VMW_BALLOON_ERROR_PPN_PINNED 5
+#define VMW_BALLOON_ERROR_PPN_NOTNEEDED 6
+#define VMW_BALLOON_ERROR_RESET 7
+#define VMW_BALLOON_ERROR_BUSY 8
+
+#define VMWARE_BALLOON_CMD(cmd, data, result) \
+({ \
+ unsigned long __stat, __dummy1, __dummy2; \
+ __asm__ __volatile__ ("inl (%%dx)" : \
+ "=a"(__stat), \
+ "=c"(__dummy1), \
+ "=d"(__dummy2), \
+ "=b"(result) : \
+ "0"(VMW_BALLOON_HV_MAGIC), \
+ "1"(VMW_BALLOON_CMD_##cmd), \
+ "2"(VMW_BALLOON_HV_PORT), \
+ "3"(data) : \
+ "memory"); \
+ result &= -1UL; \
+ __stat & -1UL; \
+})
+
+#define STATS_INC(stat) (stat)++
+
+struct vmballoon_stats {
+ unsigned int timer;
+
+ /* allocation statistics */
+ unsigned int alloc;
+ unsigned int alloc_fail;
+ unsigned int sleep_alloc;
+ unsigned int sleep_alloc_fail;
+ unsigned int refused_alloc;
+ unsigned int refused_free;
+ unsigned int free;
+
+ /* monitor operations */
+ unsigned int lock;
+ unsigned int lock_fail;
+ unsigned int unlock;
+ unsigned int unlock_fail;
+ unsigned int target;
+ unsigned int target_fail;
+ unsigned int start;
+ unsigned int start_fail;
+ unsigned int guest_type;
+ unsigned int guest_type_fail;
+};
+
+struct vmballoon {
+
+ /* list of reserved physical pages */
+ struct list_head pages;
+
+ /* transient list of non-balloonable pages */
+ struct list_head refused_pages;
+
+ /* balloon size in pages */
+ unsigned int size;
+ unsigned int target;
+
+ /* reset flag */
+ bool reset_required;
+
+ /* adjustment rates (pages per second) */
+ unsigned int rate_alloc;
+ unsigned int rate_free;
+
+ /* slowdown page allocations for next few cycles */
+ unsigned int slow_allocation_cycles;
+
+ /* statistics */
+ struct vmballoon_stats stats;
+
+ struct sysinfo sysinfo;
+
+ struct delayed_work dwork;
+};
+
+static struct vmballoon balloon;
+static struct workqueue_struct *vmballoon_wq;
+
+static bool vmballoon_send_start(struct vmballoon *b)
+{
+ unsigned long status, dummy;
+
+ STATS_INC(b->stats.start);
+
+ status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_PROTOCOL_VERSION, dummy);
+ if (status == VMW_BALLOON_SUCCESS)
+ return true;
+
+ pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+ STATS_INC(b->stats.start_fail);
+ return false;
+}
+
+static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
+{
+ switch (status) {
+ case VMW_BALLOON_SUCCESS:
+ return true;
+
+ case VMW_BALLOON_ERROR_RESET:
+ b->reset_required = true;
+ /* fall through */
+
+ default:
+ return false;
+ }
+}
+
+static bool vmballoon_send_guest_id(struct vmballoon *b)
+{
+ unsigned long status, dummy;
+
+ status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
+
+ STATS_INC(b->stats.guest_type);
+
+ if (vmballoon_check_status(b, status))
+ return true;
+
+ pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+ STATS_INC(b->stats.guest_type_fail);
+ return false;
+}
+
+static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
+{
+ unsigned long status;
+ unsigned long target;
+ unsigned long limit;
+ u32 limit32;
+
+ /*
+ * si_meminfo() is cheap. Moreover, we want to provide dynamic
+ * max balloon size later. So let us call si_meminfo() every
+ * iteration.
+ */
+ si_meminfo(&b->sysinfo);
+ limit = b->sysinfo.totalram;
+
+ /* Ensure limit fits in 32-bits */
+ limit32 = (u32)limit;
+ if (limit != limit32)
+ return false;
+
+ /* update stats */
+ STATS_INC(b->stats.target);
+
+ status = VMWARE_BALLOON_CMD(GET_TARGET, limit, target);
+ if (vmballoon_check_status(b, status)) {
+ *new_target = target;
+ return true;
+ }
+
+ pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+ STATS_INC(b->stats.target_fail);
+ return false;
+}
+
+static bool vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn)
+{
+ unsigned long status, dummy;
+ u32 pfn32;
+
+ pfn32 = (u32)pfn;
+ if (pfn32 != pfn)
+ return false;
+
+ STATS_INC(b->stats.lock);
+
+ status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy);
+ if (vmballoon_check_status(b, status))
+ return true;
+
+ pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
+ STATS_INC(b->stats.lock_fail);
+ return false;
+}
+
+static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
+{
+ unsigned long status, dummy;
+ u32 pfn32;
+
+ pfn32 = (u32)pfn;
+ if (pfn32 != pfn)
+ return false;
+
+ STATS_INC(b->stats.unlock);
+
+ status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy);
+ if (vmballoon_check_status(b, status))
+ return true;
+
+ pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
+ STATS_INC(b->stats.unlock_fail);
+ return false;
+}
+
+static void vmballoon_pop(struct vmballoon *b)
+{
+ struct page *page, *next;
+ unsigned int count = 0;
+
+ list_for_each_entry_safe(page, next, &b->pages, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ STATS_INC(b->stats.free);
+ b->size--;
+
+ if (++count >= b->rate_free) {
+ count = 0;
+ cond_resched();
+ }
+ }
+}
+
+static void vmballoon_reset(struct vmballoon *b)
+{
+ /* free all pages, skipping monitor unlock */
+ vmballoon_pop(b);
+
+ if (vmballoon_send_start(b)) {
+ b->reset_required = false;
+ if (!vmballoon_send_guest_id(b))
+ pr_err("failed to send guest ID to the host\n");
+ }
+}
+
+static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep)
+{
+ struct page *page;
+ gfp_t flags;
+ bool locked = false;
+
+ do {
+ if (!can_sleep)
+ STATS_INC(b->stats.alloc);
+ else
+ STATS_INC(b->stats.sleep_alloc);
+
+ flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP;
+ page = alloc_page(flags);
+ if (!page) {
+ if (!can_sleep)
+ STATS_INC(b->stats.alloc_fail);
+ else
+ STATS_INC(b->stats.sleep_alloc_fail);
+ return -ENOMEM;
+ }
+
+ /* inform monitor */
+ locked = vmballoon_send_lock_page(b, page_to_pfn(page));
+ if (!locked) {
+ if (b->reset_required) {
+ __free_page(page);
+ return -EIO;
+ }
+
+ /* place on list of non-balloonable pages, retry allocation */
+ list_add(&page->lru, &b->refused_pages);
+ STATS_INC(b->stats.refused_alloc);
+ }
+ } while (!locked);
+
+ /* track allocated page */
+ list_add(&page->lru, &b->pages);
+
+ /* update balloon size */
+ b->size++;
+
+ return 0;
+}
+
+static int vmballoon_release_page(struct vmballoon *b, struct page *page)
+{
+ if (!vmballoon_send_unlock_page(b, page_to_pfn(page)))
+ return -EIO;
+
+ list_del(&page->lru);
+
+ /* deallocate page */
+ __free_page(page);
+ STATS_INC(b->stats.free);
+
+ /* update balloon size */
+ b->size--;
+
+ return 0;
+}
+
+static void vmballoon_release_refused_pages(struct vmballoon *b)
+{
+ struct page *page, *next;
+
+ /* free all non-balloonable "refused" pages */
+ list_for_each_entry_safe(page, next, &b->refused_pages, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ STATS_INC(b->stats.refused_free);
+ }
+}
+
+static void vmballoon_inflate(struct vmballoon *b)
+{
+ unsigned int goal;
+ unsigned int rate;
+ unsigned int i;
+ unsigned int allocations = 0;
+ int error = 0;
+ bool alloc_can_sleep = false;
+
+ pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
+
+ /*
+ * First try NOSLEEP page allocations to inflate balloon.
+ *
+ * If we do not throttle nosleep allocations, we can drain all
+ * free pages in the guest quickly (if the balloon target is high).
+ * As a side-effect, draining free pages helps to inform (force)
+ * the guest to start swapping if balloon target is not met yet,
+ * which is a desired behavior. However, balloon driver can consume
+ * all available CPU cycles if too many pages are allocated in a
+ * second. Therefore, we throttle nosleep allocations even when
+ * the guest is not under memory pressure. OTOH, if we have already
+ * predicted that the guest is under memory pressure, then we
+ * slowdown page allocations considerably.
+ */
+
+ goal = b->target - b->size;
+ /*
+ * Start with no sleep allocation rate which may be higher
+ * than sleeping allocation rate.
+ */
+ rate = b->slow_allocation_cycles ?
+ b->rate_alloc : VMW_BALLOON_NOSLEEP_ALLOC_MAX;
+
+ pr_debug("%s - goal: %d, no-sleep rate: %d, sleep rate: %d\n",
+ __func__, goal, rate, b->rate_alloc);
+
+ for (i = 0; i < goal; i++) {
+
+ error = vmballoon_reserve_page(b, alloc_can_sleep);
+ if (error) {
+ if (error != -ENOMEM) {
+ /*
+ * Not a page allocation failure, stop this
+ * cycle. Maybe we'll get new target from
+ * the host soon.
+ */
+ break;
+ }
+
+ if (alloc_can_sleep) {
+ /*
+ * CANSLEEP page allocation failed, so guest
+ * is under severe memory pressure. Quickly
+ * decrease allocation rate.
+ */
+ b->rate_alloc = max(b->rate_alloc / 2,
+ VMW_BALLOON_RATE_ALLOC_MIN);
+ break;
+ }
+
+ /*
+ * NOSLEEP page allocation failed, so the guest is
+ * under memory pressure. Let us slow down page
+ * allocations for next few cycles so that the guest
+ * gets out of memory pressure. Also, if we already
+ * allocated b->rate_alloc pages, let's pause,
+ * otherwise switch to sleeping allocations.
+ */
+ b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
+
+ if (i >= b->rate_alloc)
+ break;
+
+ alloc_can_sleep = true;
+ /* Lower rate for sleeping allocations. */
+ rate = b->rate_alloc;
+ }
+
+ if (++allocations > VMW_BALLOON_YIELD_THRESHOLD) {
+ cond_resched();
+ allocations = 0;
+ }
+
+ if (i >= rate) {
+ /* We allocated enough pages, let's take a break. */
+ break;
+ }
+ }
+
+ /*
+ * We reached our goal without failures so try increasing
+ * allocation rate.
+ */
+ if (error == 0 && i >= b->rate_alloc) {
+ unsigned int mult = i / b->rate_alloc;
+
+ b->rate_alloc =
+ min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
+ VMW_BALLOON_RATE_ALLOC_MAX);
+ }
+
+ vmballoon_release_refused_pages(b);
+}
+
+static void vmballoon_deflate(struct vmballoon *b)
+{
+ struct page *page, *next;
+ unsigned int i = 0;
+ unsigned int goal;
+ int error;
+
+ pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
+
+ /* limit deallocation rate */
+ goal = min(b->size - b->target, b->rate_free);
+
+ pr_debug("%s - goal: %d, rate: %d\n", __func__, goal, b->rate_free);
+
+ /* free pages to reach target */
+ list_for_each_entry_safe(page, next, &b->pages, lru) {
+ error = vmballoon_release_page(b, page);
+ if (error) {
+ /* quickly decrease rate in case of error */
+ b->rate_free = max(b->rate_free / 2,
+ VMW_BALLOON_RATE_FREE_MIN);
+ return;
+ }
+
+ if (++i >= goal)
+ break;
+ }
+
+ /* slowly increase rate if there were no errors */
+ b->rate_free = min(b->rate_free + VMW_BALLOON_RATE_FREE_INC,
+ VMW_BALLOON_RATE_FREE_MAX);
+}
+
+static void vmballoon_work(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
+ unsigned int target;
+
+ STATS_INC(b->stats.timer);
+
+ if (b->reset_required)
+ vmballoon_reset(b);
+
+ if (b->slow_allocation_cycles > 0)
+ b->slow_allocation_cycles--;
+
+ if (vmballoon_send_get_target(b, &target)) {
+ /* update target, adjust size */
+ b->target = target;
+
+ if (b->size < target)
+ vmballoon_inflate(b);
+ else if (b->size > target)
+ vmballoon_deflate(b);
+ }
+
+ queue_delayed_work(vmballoon_wq, dwork, round_jiffies_relative(HZ));
+}
+
+/*
+ * PROCFS Interface
+ */
+#ifdef CONFIG_PROC_FS
+
+static struct proc_dir_entry *vmballoon_pde;
+
+static int vmballoon_proc_show(struct seq_file *f, void *offset)
+{
+ struct vmballoon *b = f->private;
+ struct vmballoon_stats *stats = &b->stats;
+
+ /* format size info */
+ seq_printf(f,
+ "target: %8d pages\n"
+ "current: %8d pages\n",
+ b->target, b->size);
+
+ /* format rate info */
+ seq_printf(f,
+ "rateNoSleepAlloc: %8d pages/sec\n"
+ "rateSleepAlloc: %8d pages/sec\n"
+ "rateFree: %8d pages/sec\n",
+ VMW_BALLOON_NOSLEEP_ALLOC_MAX,
+ b->rate_alloc, b->rate_free);
+
+ seq_printf(f,
+ "\n"
+ "timer: %8u\n"
+ "start: %8u (%4u failed)\n"
+ "guestType: %8u (%4u failed)\n"
+ "lock: %8u (%4u failed)\n"
+ "unlock: %8u (%4u failed)\n"
+ "target: %8u (%4u failed)\n"
+ "primNoSleepAlloc: %8u (%4u failed)\n"
+ "primCanSleepAlloc: %8u (%4u failed)\n"
+ "primFree: %8u\n"
+ "errAlloc: %8u\n"
+ "errFree: %8u\n",
+ stats->timer,
+ stats->start, stats->start_fail,
+ stats->guest_type, stats->guest_type_fail,
+ stats->lock, stats->lock_fail,
+ stats->unlock, stats->unlock_fail,
+ stats->target, stats->target_fail,
+ stats->alloc, stats->alloc_fail,
+ stats->sleep_alloc, stats->sleep_alloc_fail,
+ stats->free,
+ stats->refused_alloc, stats->refused_free);
+
+ return 0;
+}
+
+static int vmballoon_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, vmballoon_proc_show, PDE(inode)->data);
+}
+
+static const struct file_operations vmballoon_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = vmballoon_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init vmballoon_procfs_init(struct vmballoon *b)
+{
+ vmballoon_pde = proc_create_data("vmmemctl", S_IFREG | S_IRUGO, NULL,
+ &vmballoon_proc_fops, b);
+ if (!vmballoon_pde) {
+ pr_err("failed to create proc entry\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit vmballoon_procfs_exit(void)
+{
+ remove_proc_entry("vmmemctl", NULL);
+}
+
+#else
+
+static inline int vmballoon_procfs_init(struct vmballoon *b)
+{
+ return 0;
+}
+
+static inline void vmballoon_procfs_exit(void)
+{
+}
+
+#endif /* CONFIG_PROC_FS */
+
+static int __init vmballoon_init(void)
+{
+ int error;
+
+ /*
+ * Check if we are running on VMware's hypervisor and bail out
+ * if we are not.
+ */
+ if (!vmware_platform())
+ return -ENODEV;
+
+ vmballoon_wq = create_freezeable_workqueue("vmmemctl");
+ if (!vmballoon_wq) {
+ pr_err("failed to create workqueue\n");
+ return -ENOMEM;
+ }
+
+ /* initialize global state */
+ memset(&balloon, 0, sizeof(balloon));
+ INIT_LIST_HEAD(&balloon.pages);
+ INIT_LIST_HEAD(&balloon.refused_pages);
+
+ /* initialize rates */
+ balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
+ balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
+
+ INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
+
+ /*
+ * Start balloon.
+ */
+ if (!vmballoon_send_start(&balloon)) {
+ pr_err("failed to send start command to the host\n");
+ error = -EIO;
+ goto fail;
+ }
+
+ if (!vmballoon_send_guest_id(&balloon)) {
+ pr_err("failed to send guest ID to the host\n");
+ error = -EIO;
+ goto fail;
+ }
+
+ error = vmballoon_procfs_init(&balloon);
+ if (error)
+ goto fail;
+
+ queue_delayed_work(vmballoon_wq, &balloon.dwork, 0);
+
+ return 0;
+
+fail:
+ destroy_workqueue(vmballoon_wq);
+ return error;
+}
+module_init(vmballoon_init)
+
+static void __exit vmballoon_exit(void)
+{
+ cancel_delayed_work_sync(&balloon.dwork);
+ destroy_workqueue(vmballoon_wq);
+
+ vmballoon_procfs_exit();
+
+ /*
+ * Deallocate all reserved memory, and reset connection with monitor.
+ * Reset connection before deallocating memory to avoid potential for
+ * additional spurious resets from guest touching deallocated pages.
+ */
+ vmballoon_send_start(&balloon);
+ vmballoon_pop(&balloon);
+}
+module_exit(vmballoon_exit)


2010-04-05 21:24:49

by Andrew Morton

Subject: Re: [PATCH] VMware Balloon driver

On Sun, 4 Apr 2010 14:52:02 -0700
Dmitry Torokhov <[email protected]> wrote:

> This is standalone version of VMware Balloon driver. Unlike previous
> version, that tried to integrate VMware ballooning transport into virtio
> subsystem, and use stock virtio_ballon driver, this one implements both
> controlling thread/algorithm and hypervisor transport.
>
> We are submitting standalone driver because KVM maintainer (Avi Kivity)
> expressed opinion (rightly) that our transport does not fit well into
> virtqueue paradigm and thus it does not make much sense to integrate
> with virtio.
>

I think I've forgotten what balloon drivers do. Are they as nasty a
hack as I remember believing them to be?

A summary of what this code sets out to do, and how it does it would be
useful.

Also please explain the applicability of this driver. Will xen use it?
kvm? Out-of-tree code?

The code implements a user-visible API (in /proc, at least). Please
fully describe the proposed interface(s) in the changelog so we can
review and understand that proposal.

>
> ...
>
> +static bool vmballoon_send_start(struct vmballoon *b)
> +{
> + unsigned long status, dummy;
> +
> + STATS_INC(b->stats.start);
> +
> + status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_PROTOCOL_VERSION, dummy);
> + if (status == VMW_BALLOON_SUCCESS)
> + return true;
> +
> + pr_debug("%s - failed, hv returns %ld\n", __func__, status);

The code refers to something called "hv". I suspect that's stale?

> + STATS_INC(b->stats.start_fail);
> + return false;
> +}
> +
> +static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
> +{
> + switch (status) {
> + case VMW_BALLOON_SUCCESS:
> + return true;
> +
> + case VMW_BALLOON_ERROR_RESET:
> + b->reset_required = true;
> + /* fall through */
> +
> + default:
> + return false;
> + }
> +}
> +
> +static bool vmballoon_send_guest_id(struct vmballoon *b)
> +{
> + unsigned long status, dummy;
> +
> + status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
> +
> + STATS_INC(b->stats.guest_type);
> +
> + if (vmballoon_check_status(b, status))
> + return true;
> +
> + pr_debug("%s - failed, hv returns %ld\n", __func__, status);
> + STATS_INC(b->stats.guest_type_fail);
> + return false;
> +}

The lack of comments makes it all a bit hard to take in.

>
> ...
>
> +static int __init vmballoon_init(void)
> +{
> + int error;
> +
> + /*
> + * Check if we are running on VMware's hypervisor and bail out
> + * if we are not.
> + */
> + if (!vmware_platform())
> + return -ENODEV;
> +
> + vmballoon_wq = create_freezeable_workqueue("vmmemctl");
> + if (!vmballoon_wq) {
> + pr_err("failed to create workqueue\n");
> + return -ENOMEM;
> + }
> +
> + /* initialize global state */
> + memset(&balloon, 0, sizeof(balloon));

The memset seems to be unneeded.

> + INIT_LIST_HEAD(&balloon.pages);
> + INIT_LIST_HEAD(&balloon.refused_pages);
> +
> + /* initialize rates */
> + balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
> + balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
> +
> + INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
> +
> + /*
> + * Start balloon.
> + */
> + if (!vmballoon_send_start(&balloon)) {
> + pr_err("failed to send start command to the host\n");
> + error = -EIO;
> + goto fail;
> + }
> +
> + if (!vmballoon_send_guest_id(&balloon)) {
> + pr_err("failed to send guest ID to the host\n");
> + error = -EIO;
> + goto fail;
> + }
> +
> + error = vmballoon_procfs_init(&balloon);
> + if (error)
> + goto fail;
> +
> + queue_delayed_work(vmballoon_wq, &balloon.dwork, 0);
> +
> + return 0;
> +
> +fail:
> + destroy_workqueue(vmballoon_wq);
> + return error;
> +}
>
> ...
>

Oh well, ho hum. Help is needed on working out what to do about this,
please.

Congrats on the new job, btw ;)

2010-04-05 22:03:15

by Jeremy Fitzhardinge

Subject: Re: [PATCH] VMware Balloon driver

On 04/05/2010 02:24 PM, Andrew Morton wrote:
> I think I've forgotten what balloon drivers do. Are they as nasty a
> hack as I remember believing them to be?
>

(I haven't looked at Dmitry's patch yet, so this is from the Xen
perspective.)

In the simplest form, they just look like a driver which allocates a
pile of pages, and the underlying memory gets returned to the
hypervisor. When you want the memory back, it reattaches memory to the
pageframes and releases the memory back to the kernel. This allows a
virtual machine to shrink with respect to its original size.

Going the other way - expanding beyond the memory allocation - is a bit
trickier because you need to get some new page structures from
somewhere. We don't do this in Xen yet, but I've done some experiments
with hotplug memory to implement this. Or a simpler approach is to fake
up some reserved E820 ranges to grow into.
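
To make the simple form concrete, the guts of such a driver is really just
a pair of loops like the sketch below (hv_return_page()/hv_reclaim_page()
are stand-ins for whatever hypercall the hypervisor actually provides; this
is an illustration, not Xen's or VMware's real code):

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/mm.h>

static LIST_HEAD(ballooned_pages);

/* Inflate: the guest gives up nr pages to the hypervisor. */
static int balloon_inflate(unsigned int nr)
{
	while (nr--) {
		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NOWARN);

		if (!page)
			return -ENOMEM;
		hv_return_page(page_to_pfn(page));	/* hypothetical hypercall */
		list_add(&page->lru, &ballooned_pages);
	}
	return 0;
}

/* Deflate: reattach memory to the pageframes and give them back to Linux. */
static void balloon_deflate(unsigned int nr)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, &ballooned_pages, lru) {
		if (!nr--)
			break;
		hv_reclaim_page(page_to_pfn(page));	/* hypothetical hypercall */
		list_del(&page->lru);
		__free_page(page);
	}
}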


> A summary of what this code sets out to do, and how it does it would be
> useful.
>
> Also please explain the applicability of this driver. Will xen use it?
> kvm? Out-of-tree code?
>
The basic idea of the driver is to allow a guest system to give up
memory it isn't using so it can be reused by other virtual machines (or
the host itself).

Xen and KVM already have equivalents in the kernel. Now that I've had a
quick look at Dmitry's patch, it's certainly along the same lines as the
Xen code, but it isn't clear to me how much code they could end up
sharing. There's a couple of similar-looking loops, but the bulk of the
code appears to be VMware specific.

One area that would be very useful as common code would be some kind of
policy engine to drive the balloon driver. That is, something that can
look at the VM's state and say "we really have a couple hundred MB of
excess memory we could happily give back to the host". And - very
important - "don't go below X MB, because then we'll die in a flaming
swap storm".

At the moment this is driven by vendor-specific tools with heuristics of
varying degrees of sophistication (which could be as simple as
absolutely manual control). The problem has two sides because there's
the decision made by guests on how much memory they can afford to give
up, and also on the host side who knows what the system-wide memory
pressures are. And it can be affected by hypervisor-specific features,
such as whether pages can be transparently shared between domains,
demand-faulted from swap, etc.

And Dan Magenheimer is playing with a more fine-grained mechanism where
a guest kernel can draw on spare host memory without actually committing
that memory to the guest, which allows memory to be reallocated on the
fly with more fluidity.

> The code implements a user-visible API (in /proc, at least). Please
> fully describe the proposed interface(s) in the changelog so we can
> review and understand that proposal.
>

It seems to me that sysfs would be a better match. It would be nice to
try and avoid gratuitous differences.

>> ...
>>
>> +static bool vmballoon_send_start(struct vmballoon *b)
>> +{
>> + unsigned long status, dummy;
>> +
>> + STATS_INC(b->stats.start);
>> +
>> + status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_PROTOCOL_VERSION, dummy);
>> + if (status == VMW_BALLOON_SUCCESS)
>> + return true;
>> +
>> + pr_debug("%s - failed, hv returns %ld\n", __func__, status);
>>
> The code refers to something called "hv". I suspect that's stale?
>

hv = hypervisor

J

2010-04-05 22:18:07

by Andrew Morton

Subject: Re: [PATCH] VMware Balloon driver

On Mon, 05 Apr 2010 15:03:08 -0700
Jeremy Fitzhardinge <[email protected]> wrote:

> On 04/05/2010 02:24 PM, Andrew Morton wrote:
> > I think I've forgotten what balloon drivers do. Are they as nasty a
> > hack as I remember believing them to be?
> >
>
> (I haven't looked at Dmitry's patch yet, so this is from the Xen
> perspective.)
>
> In the simplest form, they just look like a driver which allocates a
> pile of pages, and the underlying memory gets returned to the
> hypervisor. When you want the memory back, it reattaches memory to the
> pageframes and releases the memory back to the kernel. This allows a
> virtual machine to shrink with respect to its original size.
>
> Going the other way - expanding beyond the memory allocation - is a bit
> trickier because you need to get some new page structures from
> somewhere. We don't do this in Xen yet, but I've done some experiments
> with hotplug memory to implement this. Or a simpler approach is to fake
> up some reserved E820 ranges to grow into.
>

Lots of stuff for Dmitry to add to his changelog ;)

> > A summary of what this code sets out to do, and how it does it would be
> > useful.
> >
> > Also please explain the applicability of this driver. Will xen use it?
> > kvm? Out-of-tree code?
> >
> The basic idea of the driver is to allow a guest system to give up
> memory it isn't using so it can be reused by other virtual machines (or
> the host itself).

So... does this differ in any fundamental way from what hibernation
does, via shrink_all_memory()?

2010-04-05 22:27:07

by Avi Kivity

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 01:17 AM, Andrew Morton wrote:
>> The basic idea of the driver is to allow a guest system to give up
>> memory it isn't using so it can be reused by other virtual machines (or
>> the host itself).
>>
> So... does this differ in any fundamental way from what hibernation
> does, via shrink_all_memory()?
>

Just the _all_ bit, and the fact that we need to report the freed page
numbers to the hypervisor.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2010-04-05 22:41:33

by Andrew Morton

Subject: Re: [PATCH] VMware Balloon driver

On Tue, 06 Apr 2010 01:26:11 +0300
Avi Kivity <[email protected]> wrote:

> On 04/06/2010 01:17 AM, Andrew Morton wrote:
> >> The basic idea of the driver is to allow a guest system to give up
> >> memory it isn't using so it can be reused by other virtual machines (or
> >> the host itself).
> >>
> > So... does this differ in any fundamental way from what hibernation
> > does, via shrink_all_memory()?
> >
>
> Just the _all_ bit, and the fact that we need to report the freed page
> numbers to the hypervisor.
>

So... why not tweak that, rather than implementing some parallel thing?

2010-04-05 22:58:40

by Dmitry Torokhov

Subject: Re: [PATCH] VMware Balloon driver

On Mon, Apr 05, 2010 at 02:24:19PM -0700, Andrew Morton wrote:
> On Sun, 4 Apr 2010 14:52:02 -0700
> Dmitry Torokhov <[email protected]> wrote:
>
> > This is standalone version of VMware Balloon driver. Unlike previous
> > version, that tried to integrate VMware ballooning transport into virtio
> > subsystem, and use stock virtio_ballon driver, this one implements both
> > controlling thread/algorithm and hypervisor transport.
> >
> > We are submitting standalone driver because KVM maintainer (Avi Kivity)
> > expressed opinion (rightly) that our transport does not fit well into
> > virtqueue paradigm and thus it does not make much sense to integrate
> > with virtio.
> >
>
> I think I've forgotten what balloon drivers do. Are they as nasty a
> hack as I remember believing them to be?
>
> A summary of what this code sets out to do, and how it does it would be
> useful.
>

Jeremy provided a very good writeup; I will also expand the changelog in
the next version.

> Also please explain the applicability of this driver. Will xen use it?
> kvm? Out-of-tree code?

The driver is expected to be used on the VMware platform - mainly ESX.
Originally we tried to converge with KVM and use virtio and the stock
virtio_balloon driver, but Avi mentioned that our code emulating a
virtqueue was larger than the balloon code itself and thus using virtio
did not make much sense.

>
> The code implements a user-visible API (in /proc, at least). Please
> fully describe the proposed interface(s) in the changelog so we can
> review and understand that proposal.

OK.

>
> >
> > ...
> >
> > +static bool vmballoon_send_start(struct vmballoon *b)
> > +{
> > + unsigned long status, dummy;
> > +
> > + STATS_INC(b->stats.start);
> > +
> > + status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_PROTOCOL_VERSION, dummy);
> > + if (status == VMW_BALLOON_SUCCESS)
> > + return true;
> > +
> > + pr_debug("%s - failed, hv returns %ld\n", __func__, status);
>
> The code refers to something called "hv". I suspect that's stale?
>
> > + STATS_INC(b->stats.start_fail);
> > + return false;
> > +}
> > +
> > +static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
> > +{
> > + switch (status) {
> > + case VMW_BALLOON_SUCCESS:
> > + return true;
> > +
> > + case VMW_BALLOON_ERROR_RESET:
> > + b->reset_required = true;
> > + /* fall through */
> > +
> > + default:
> > + return false;
> > + }
> > +}
> > +
> > +static bool vmballoon_send_guest_id(struct vmballoon *b)
> > +{
> > + unsigned long status, dummy;
> > +
> > + status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
> > +
> > + STATS_INC(b->stats.guest_type);
> > +
> > + if (vmballoon_check_status(b, status))
> > + return true;
> > +
> > + pr_debug("%s - failed, hv returns %ld\n", __func__, status);
> > + STATS_INC(b->stats.guest_type_fail);
> > + return false;
> > +}
>
> The lack of comments makes it all a bit hard to take in.

OK, I will address lack of comments.

>
> >
> > ...
> >
> > +static int __init vmballoon_init(void)
> > +{
> > + int error;
> > +
> > + /*
> > + * Check if we are running on VMware's hypervisor and bail out
> > + * if we are not.
> > + */
> > + if (!vmware_platform())
> > + return -ENODEV;
> > +
> > + vmballoon_wq = create_freezeable_workqueue("vmmemctl");
> > + if (!vmballoon_wq) {
> > + pr_err("failed to create workqueue\n");
> > + return -ENOMEM;
> > + }
> > +
> > + /* initialize global state */
> > + memset(&balloon, 0, sizeof(balloon));
>
> The memset seems to be unneeded.

OK.

>
> > + INIT_LIST_HEAD(&balloon.pages);
> > + INIT_LIST_HEAD(&balloon.refused_pages);
> > +
> > + /* initialize rates */
> > + balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
> > + balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
> > +
> > + INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
> > +
> > + /*
> > + * Start balloon.
> > + */
> > + if (!vmballoon_send_start(&balloon)) {
> > + pr_err("failed to send start command to the host\n");
> > + error = -EIO;
> > + goto fail;
> > + }
> > +
> > + if (!vmballoon_send_guest_id(&balloon)) {
> > + pr_err("failed to send guest ID to the host\n");
> > + error = -EIO;
> > + goto fail;
> > + }
> > +
> > + error = vmballoon_procfs_init(&balloon);
> > + if (error)
> > + goto fail;
> > +
> > + queue_delayed_work(vmballoon_wq, &balloon.dwork, 0);
> > +
> > + return 0;
> > +
> > +fail:
> > + destroy_workqueue(vmballoon_wq);
> > + return error;
> > +}
> >
> > ...
> >
>
> Oh well, ho hum. Help is needed on working out what to do about this,
> please.
>
> Congrats on the new job, btw ;)

Thanks ;). BTW, please send input stuff to my gmail address still.

--
Dmitry

2010-04-05 23:01:19

by Dmitry Torokhov

Subject: Re: [PATCH] VMware Balloon driver

On Mon, Apr 05, 2010 at 03:40:23PM -0700, Andrew Morton wrote:
> On Tue, 06 Apr 2010 01:26:11 +0300
> Avi Kivity <[email protected]> wrote:
>
> > On 04/06/2010 01:17 AM, Andrew Morton wrote:
> > >> The basic idea of the driver is to allow a guest system to give up
> > >> memory it isn't using so it can be reused by other virtual machines (or
> > >> the host itself).
> > >>
> > > So... does this differ in any fundamental way from what hibernation
> > > does, via shrink_all_memory()?
> > >
> >
> > Just the _all_ bit, and the fact that we need to report the freed page
> > numbers to the hypervisor.
> >
>
> So... why not tweak that, rather than implementing some parallel thing?

I guess the main difference is that freeing memory is not the primary
goal; we want to make sure that the guest does not use some of its memory
without notifying the hypervisor first.

--
Dmitry

2010-04-05 23:05:05

by Dan Magenheimer

Subject: RE: [PATCH] VMware Balloon driver

> > On 04/06/2010 01:17 AM, Andrew Morton wrote:
> > >> The basic idea of the driver is to allow a guest system to give up
> > >> memory it isn't using so it can be reused by other virtual
> machines (or
> > >> the host itself).
> > >>
> > > So... does this differ in any fundamental way from what
> hibernation
> > > does, via shrink_all_memory()?
> > >
> >
> > Just the _all_ bit, and the fact that we need to report the freed
> page
> > numbers to the hypervisor.
> >
>
> So... why not tweak that, rather than implementing some parallel
> thing?

I think Avi was being facetious ("_all_"). Hibernation assumes
everything in the machine is going to stop for a while. Ballooning
assumes that the machine has a lower memory need for a while, but
is otherwise fully operational. Think of it as hot-plug memory
at page granularity.

Historically, all OS's had a (relatively) fixed amount of memory
and, since it was fixed in size, there was no sense wasting any of it.
In a virtualized world, OS's should be trained to be much more
flexible as one virtual machine's "waste" could/should be another
virtual machine's "want". Ballooning is currently the mechanism
for this; it places memory pressure on the OS to encourage it
to get by with less memory. Unfortunately, it is difficult even
within an OS to determine what memory is wasted and what memory
might be used imminently... because LRU is only an approximation of
the future. Hypervisors have an even more difficult problem not
only because they must infer this information from external events,
but they can double the problem if they infer the opposite of what
the OS actually does.

As Jeremy mentioned, Transcendent Memory (and its Linux implementations
"cleancache" and "frontswap") allows a guest kernel to give up memory
for the broader good while still retaining a probability that it
can get the same data back quickly. This results in more memory
fluidity. Transcendent Memory ("tmem") still uses ballooning as
the mechanism to create memory pressure... it just provides an
insurance policy for that memory pressure.

Avi will point out that it is not clear that KVM can make use of
or benefit from tmem, but we need not repeat that discussion here.

Dan

2010-04-05 23:15:36

by Andrew Morton

Subject: Re: [PATCH] VMware Balloon driver

On Mon, 5 Apr 2010 16:03:48 -0700 (PDT)
Dan Magenheimer <[email protected]> wrote:

> > > On 04/06/2010 01:17 AM, Andrew Morton wrote:
> > > >> The basic idea of the driver is to allow a guest system to give up
> > > >> memory it isn't using so it can be reused by other virtual
> > machines (or
> > > >> the host itself).
> > > >>
> > > > So... does this differ in any fundamental way from what
> > hibernation
> > > > does, via shrink_all_memory()?
> > > >
> > >
> > > Just the _all_ bit, and the fact that we need to report the freed
> > page
> > > numbers to the hypervisor.
> > >
> >
> > So... why not tweak that, rather than implementing some parallel
> > thing?
>
> I think Avi was being facetious ("_all_"). Hibernation assumes
> everything in the machine is going to stop for awhile. Ballooning
> assumes that the machine has lower memory need for awhile, but
> is otherwise fully operational.

shrink_all_memory() doesn't require that processes be stopped.

If the existing code doesn't exactly match virtualisation's
requirements, it can be changed.

> Think of it as hot-plug memory
> at a page granularity.

hotplug is different because it targets particular physical pages. For
this requirement any old page will do. Preferably one which won't be
needed soon, yes?

2010-04-05 23:28:27

by Dmitry Torokhov

Subject: Re: [PATCH] VMware Balloon driver

On Mon, Apr 05, 2010 at 04:11:10PM -0700, Andrew Morton wrote:
> On Mon, 5 Apr 2010 16:03:48 -0700 (PDT)
> Dan Magenheimer <[email protected]> wrote:
>
> > > > On 04/06/2010 01:17 AM, Andrew Morton wrote:
> > > > >> The basic idea of the driver is to allow a guest system to give up
> > > > >> memory it isn't using so it can be reused by other virtual
> > > machines (or
> > > > >> the host itself).
> > > > >>
> > > > > So... does this differ in any fundamental way from what
> > > hibernation
> > > > > does, via shrink_all_memory()?
> > > > >
> > > >
> > > > Just the _all_ bit, and the fact that we need to report the freed
> > > page
> > > > numbers to the hypervisor.
> > > >
> > >
> > > So... why not tweak that, rather than implementing some parallel
> > > thing?
> >
> > I think Avi was being facetious ("_all_"). Hibernation assumes
> > everything in the machine is going to stop for awhile. Ballooning
> > assumes that the machine has lower memory need for awhile, but
> > is otherwise fully operational.
>
> shrink_all_memory() doesn't require that processes be stopped.
>
> If the existing code doesn't exactly match virtualisation's
> requirements, it can be changed.
>
> > Think of it as hot-plug memory
> > at a page granularity.
>
> hotplug is different because it targets particular physical pages. For
> this requirement any old page will do. Preferably one which won't be
> needed soon, yes?

The best page would be not an old page but an unused page.

We do rely on the standard mechanisms to find pages that can be freed to
inflate the balloon, but once pages are allocated they are not available
until released. In the case of shrinking memory, it can be allocated and
used again as soon as we wake up (if the shrink was done in the course of
a hibernation sequence).

--
Dmitry

2010-04-05 23:28:47

by Jeremy Fitzhardinge

Subject: Re: [PATCH] VMware Balloon driver

On 04/05/2010 03:17 PM, Andrew Morton wrote:
> On Mon, 05 Apr 2010 15:03:08 -0700
> Jeremy Fitzhardinge<[email protected]> wrote:
>
>
>> On 04/05/2010 02:24 PM, Andrew Morton wrote:
>>
>>> I think I've forgotten what balloon drivers do. Are they as nasty a
>>> hack as I remember believing them to be?
>>>
>>>
>> (I haven't looked at Dmitry's patch yet, so this is from the Xen
>> perspective.)
>>
>> In the simplest form, they just look like a driver which allocates a
>> pile of pages, and the underlying memory gets returned to the
>> hypervisor. When you want the memory back, it reattaches memory to the
>> pageframes and releases the memory back to the kernel. This allows a
>> virtual machine to shrink with respect to its original size.
>>
>> Going the other way - expanding beyond the memory allocation - is a bit
>> trickier because you need to get some new page structures from
>> somewhere. We don't do this in Xen yet, but I've done some experiments
>> with hotplug memory to implement this. Or a simpler approach is to fake
>> up some reserved E820 ranges to grow into.
>>
>>
> Lots of stuff for Dmitry to add to his changelog ;)
>
>
>>> A summary of what this code sets out to do, and how it does it would be
>>> useful.
>>>
>>> Also please explain the applicability of this driver. Will xen use it?
>>> kvm? Out-of-tree code?
>>>
>>>
>> The basic idea of the driver is to allow a guest system to give up
>> memory it isn't using so it can be reused by other virtual machines (or
>> the host itself).
>>
> So... does this differ in any fundamental way from what hibernation
> does, via shrink_all_memory()?
>

Note that we're using shrink and grow in opposite senses.
shrink_all_memory() is trying to free as much kernel memory as possible,
which to the virtual machine's host looks like the guest is growing
(since it has claimed more memory for its own use). A balloon "shrink"
appears to Linux as allocated memory (ie, locking down memory within
Linux to make it available to the rest of the system).

The fact that shrink_all_memory() has much deeper insight into the
current state of the vm subsystem is interesting; it has much more to
work with than a simple alloc/free page. Does it actively try to
reclaim cold, unlikely-to-be-used stuff first? It appears it does, to
my mm/-naive eye.

I guess a way to use it in the short term is to have a loop of the form:

while (guest_size > target) {
	shrink_all_memory(guest_size - target);			/* force pages to be freed */
	while ((p = alloc_page(GFP_NOWAIT | __GFP_NORETRY)))	/* vacuum up pages */
		release_page_to_hypervisor(p);
	/* twiddle thumbs */
}

...assuming the allocation would tend to pick up the pages that
shrink_all_memory just freed.

Or ideally, have a form of shrink_all_memory() which causes pages to
become unused, but rather than freeing them returns them to the caller.
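
Something with roughly this shape, say (the name is invented):

	/* reclaim up to nr_pages, but put them on 'claimed' instead of freeing them */
	unsigned long shrink_and_claim_pages(unsigned long nr_pages,
					     struct list_head *claimed);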

And is there some way to get the vm subsystem to provide backpressure:
"I'm getting desperately short of memory!"? Experience has shown that
administrators often accidentally over-shrink their domains and
effectively kill them. Sometimes due to bad UI - entering the wrong
units - but also because they just don't know what the actual memory
demands are. Or they change over time.

Thanks,
J

2010-04-05 23:35:51

by Andrew Morton

Subject: Re: [PATCH] VMware Balloon driver

On Mon, 05 Apr 2010 16:28:38 -0700
Jeremy Fitzhardinge <[email protected]> wrote:

> And is there some way to get the vm subsystem to provide backpressure:
> "I'm getting desperately short of memory!"?

Not really. One could presumably pull dopey tricks by hooking into
slab shrinker registration or even ->writepage(). But cooking up
something explicit doesn't sound too hard - the trickiest bit would be
actually defining what it should do.
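
For instance (sketch only, against the current shrinker interface;
balloon_notify_pressure() is a made-up hook), one could register a shrinker
whose only job is to tell the balloon driver that reclaim is under way:

#include <linux/mm.h>

static int balloon_pressure_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan)				/* the VM is actively reclaiming */
		balloon_notify_pressure(nr_to_scan);
	return 0;				/* we maintain no cache of our own */
}

static struct shrinker balloon_pressure_shrinker = {
	.shrink	= balloon_pressure_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&balloon_pressure_shrinker) at init time */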

2010-04-06 00:27:39

by Dan Magenheimer

Subject: RE: [PATCH] VMware Balloon driver

> From: Andrew Morton [mailto:[email protected]]
> Sent: Monday, April 05, 2010 5:35 PM
> To: Jeremy Fitzhardinge
> Cc: Dmitry Torokhov; [email protected]; pv-
> [email protected]; Avi Kivity; Dan Magenheimer
> Subject: Re: [PATCH] VMware Balloon driver
>
> On Mon, 05 Apr 2010 16:28:38 -0700
> Jeremy Fitzhardinge <[email protected]> wrote:
>
> > And is there some way to get the vm subsystem to provide
> backpressure:
> > "I'm getting desperately short of memory!"?
>
> Not really. One could presumably pull dopey tricks by hooking into
> slab shrinker registration or even ->writepage(). But cooking up
> something explicit doesn't sound too hard - the trickiest bit would be
> actually defining what it should do.

Sorry, I don't mean to be too self-serving. And I am far less
an expert in Linux mm code than others involved in this discussion.

But this backpressure metric is one thing that frontswap provides.
It also provides an "insurance policy" for "desperately short
of memory". It is the "yin" to the "yang" of cleancache.

If I understand the swap subsystem correctly, there IS NO
"getting desperately short of memory" except when a swap
device is unavailable or, more likely, too darn slow.

Frontswap writes synchronously to pseudo-RAM (tmem, in the
case of Xen) instead of a slow asynchronous swap device. It
hooks directly into swap_writepage()/swap_readpage() in
a very clean, well-defined (not dopey) way.
So -- I think -- it is a perfect feedback mechanism to
tell a balloon driver (or equivalent), "I need more memory"
while covering the short-term need until the balloon driver
(and/or hypervisor) can respond.
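
Very roughly, the hook has this shape (illustration only;
frontswap_put_page() is a placeholder for the tmem-backed call and the
wrapper name is invented, so this is not the actual posted code):

#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>

static int swap_writepage_frontswap_first(struct page *page,
					   struct writeback_control *wbc)
{
	if (frontswap_put_page(page) == 0) {
		/* stored synchronously in pseudo-RAM; no block I/O issued */
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
		return 0;
	}
	/* otherwise fall back to the normal asynchronous swap-device write */
	return swap_writepage(page, wbc);
}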

It works today with Xen, and Nitin Gupta is working on an
in-kernel memory compression backend for it. And Chris Mason
and I think it may also be a fine interface for SSD-used-
as-RAM-extension.

So please consider frontswap and cleancache before "cooking
up something [else] explicit"... these were previously part
of Transcendent Memory postings*, but I have revised them to
be more useful, well-defined, and standalone (from Xen/tmem)
and will be re-posting the revised versions soon.

Dan

* See:
http://lwn.net/Articles/340080/
http://lkml.indiana.edu/hypermail/linux/kernel/0912.2/01322.html
OLS 2009 proceedings
LCA 2010 proceedings

2010-04-06 16:28:16

by Avi Kivity

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 01:40 AM, Andrew Morton wrote:
> On Tue, 06 Apr 2010 01:26:11 +0300
> Avi Kivity<[email protected]> wrote:
>
>
>> On 04/06/2010 01:17 AM, Andrew Morton wrote:
>>
>>>> The basic idea of the driver is to allow a guest system to give up
>>>> memory it isn't using so it can be reused by other virtual machines (or
>>>> the host itself).
>>>>
>>>>
>>> So... does this differ in any fundamental way from what hibernation
>>> does, via shrink_all_memory()?
>>>
>>>
>> Just the _all_ bit, and the fact that we need to report the freed page
>> numbers to the hypervisor.
>>
>>
> So... why not tweak that, rather than implementing some parallel thing?
>

That's maybe 5 lines of code. Most of the code is focused on
interpreting requests from the hypervisor and replying with the page
numbers.

--
error compiling committee.c: too many arguments to function

2010-04-06 16:54:30

by Avi Kivity

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 01:58 AM, Dmitry Torokhov wrote:
>
>> Also please explain the applicability of this driver. Will xen use it?
>> kvm? Out-of-tree code?
>>
> The driver is expected to be used on VMware platform - mainly ESX.
> Originally we tried to converge with KVM and use virtio and
> stock virtio_balloon driver but Avi mentioned that our code emulating
> virtqueue was more than balloon code itself and thus using virtio did
> not make nuch sense.
>

Yeah. If we wanted commonality, we could make a balloon_core.c that
contains the common code. IMO that's premature, but perhaps there's
some meat there (like suspend/resume support and /proc//sys interface).

--
error compiling committee.c: too many arguments to function

2010-04-06 17:03:57

by Avi Kivity

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 02:34 AM, Andrew Morton wrote:
> On Mon, 05 Apr 2010 16:28:38 -0700
> Jeremy Fitzhardinge<[email protected]> wrote:
>
>
>> And is there some way to get the vm subsystem to provide backpressure:
>> "I'm getting desperately short of memory!"?
>>
> Not really. One could presumably pull dopey tricks by hooking into
> slab shrinker registration or even ->writepage(). But cooking up
> something explicit doesn't sound too hard - the trickiest bit would be
> actually defining what it should do.
>

The oft-suggested approach is to look at the I/O load from guests and
give more memory to those that are thrashing. Of course not all I/O is
directly due to memory pressure.

--
error compiling committee.c: too many arguments to function

2010-04-06 17:06:46

by Dmitry Torokhov

Subject: Re: [PATCH] VMware Balloon driver

On Tue, Apr 06, 2010 at 09:32:47AM -0700, Avi Kivity wrote:
> On 04/06/2010 01:58 AM, Dmitry Torokhov wrote:
> >
> >> Also please explain the applicability of this driver. Will xen use it?
> >> kvm? Out-of-tree code?
> >>
> > The driver is expected to be used on VMware platform - mainly ESX.
> > Originally we tried to converge with KVM and use virtio and
> > stock virtio_balloon driver but Avi mentioned that our code emulating
> > virtqueue was more than balloon code itself and thus using virtio did
> > not make nuch sense.
> >
>
> Yeah. If we wanted commonality, we could make a balloon_core.c that
> contains the common code. IMO that's premature, but perhaps there's

I am really not sure if it makes much sense. Ripping out virtdev/virtqueue
from virtio_balloon leaves pretty much nothing.

> some meat there (like suspend/resume support and /proc//sys interface).

We do not need any special suspend/resume support - the freezeable
workqueue is stopped when suspending.

Thanks.

--
Dmitry

2010-04-06 17:28:39

by Dan Magenheimer

Subject: RE: [PATCH] VMware Balloon driver

> From: Avi Kivity [mailto:[email protected]]
> Sent: Tuesday, April 06, 2010 10:31 AM
> >
> >> And is there some way to get the vm subsystem to provide
> backpressure:
> >> "I'm getting desperately short of memory!"?
> >>
> > Not really. One could presumably pull dopey tricks by hooking into
> > slab shrinker registration or even ->writepage(). But cooking up
> > something explicit doesn't sound too hard - the trickiest bit would
> be
> > actually defining what it should do.
>
> The oft-suggested approach is to look at the I/O load from guests and
> give more memory to those that are thrashing. Of course not all I/O is
> directly due to memory pressure.

Which is why it is very useful to be able to differentiate between:
1) refault I/O (due to pagecache too small, and PFRA choices)
2) swap I/O (due to memory pressure)
3) normal file dirty writes (due to an app's need for persistence)

Again, the cleancache and frontswap hooks and APIs separate these
out nicely.

Dan "who worries he is sounding like a broken record"

2010-04-06 17:42:23

by Avi Kivity

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 08:06 PM, Dmitry Torokhov wrote:
>
>> some meat there (like suspend/resume support and /proc//sys interface).
>>
> We do not need any special suspend/resume support - the freezeable
> workqueue is stopped when suspending.
>

Ah, virtio_balloon should do the same.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2010-04-06 18:26:02

by Jeremy Fitzhardinge

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 09:32 AM, Avi Kivity wrote:
> Yeah. If we wanted commonality, we could make a balloon_core.c that
> contains the common code. IMO that's premature, but perhaps there's
> some meat there (like suspend/resume support and /proc//sys interface).

I think it would be useful to have common:

1. User and kernel mode ABIs for controlling ballooning. It assumes
that the different balloon implementations are sufficiently
similar in semantics. (Once there's a kernel ABI, adding a
common user ABI is trivial.)
2. Policy driving the ballooning driver, at least from the guest
side. That is, some good metrics from the vm subsystem about
memory pressure (both positive and negative), and something to
turn those metrics into requests to the balloon driver.

1) is not a huge amount of code, but something consistent would be
nice. 2) is something we've been missing and is a bit of an open
question/research project anyway.
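
For (1), the kernel side could be as small as a single sysfs knob that the
various balloon drivers all consume; a sketch (attribute name, location and
the lock-free global are all arbitrary choices here):

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static unsigned long balloon_target_pages;	/* read by whichever balloon driver is loaded */

static ssize_t target_show(struct kobject *kobj, struct kobj_attribute *attr,
			   char *buf)
{
	return sprintf(buf, "%lu\n", balloon_target_pages);
}

static ssize_t target_store(struct kobject *kobj, struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (strict_strtoul(buf, 10, &balloon_target_pages))
		return -EINVAL;
	return count;
}

static struct kobj_attribute balloon_target_attr =
	__ATTR(target, 0644, target_show, target_store);

/* sysfs_create_file(kernel_kobj, &balloon_target_attr.attr) at module init */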

J

2010-04-06 18:36:33

by Avi Kivity

Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 09:25 PM, Jeremy Fitzhardinge wrote:
> On 04/06/2010 09:32 AM, Avi Kivity wrote:
>> Yeah. If we wanted commonality, we could make a balloon_core.c that
>> contains the common code. IMO that's premature, but perhaps there's
>> some meat there (like suspend/resume support and /proc//sys interface).
>
> I think it would be useful to have common:
>
> 1. User and kernel mode ABIs for controlling ballooning. It assumes
> that the different balloon implementations are sufficiently
> similar in semantics. (Once there's a kernel ABI, adding a
> common user ABI is trivial.)
> 2. Policy driving the ballooning driver, at least from the guest
> side. That is, some good metrics from the vm subsystem about
> memory pressure (both positive and negative), and something to
> turn those metrics into requests to the balloon driver.
>
> 1) is not a huge amount of code, but something consistent would be
> nice. 2) is something we've been missing and is a bit of an open
> question/research project anyway.

3) Code that attempts to reclaim 2MB pages when possible

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

2010-04-06 19:18:44

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] VMware Balloon driver

On 04/06/2010 11:36 AM, Avi Kivity wrote:
> On 04/06/2010 09:25 PM, Jeremy Fitzhardinge wrote:
>> On 04/06/2010 09:32 AM, Avi Kivity wrote:
>>> Yeah. If we wanted commonality, we could make a balloon_core.c that
>>> contains the common code. IMO that's premature, but perhaps there's
>>> some meat there (like suspend/resume support and /proc//sys interface).
>>
>> I think it would be useful to have common:
>>
>> 1. User and kernel mode ABIs for controlling ballooning. It assumes
>> that the different balloon implementations are sufficiently
>> similar in semantics. (Once there's a kernel ABI, adding a
>> common user ABI is trivial.)
>> 2. Policy driving the ballooning driver, at least from the guest
>> side. That is, some good metrics from the vm subsystem about
>> memory pressure (both positive and negative), and something to
>> turn those metrics into requests to the balloon driver.
>>
>> 1) is not a huge amount of code, but something consistent would be
>> nice. 2) is something we've been missing and is a bit of an open
>> question/research project anyway.
>
> 3) Code that attempts to reclaim 2MB pages when possible

Yes. Ballooning in 4k units is a bit silly.

J
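
As a rough sketch of what 2MB ballooning could look like on x86 (order-9
allocations with 4k base pages; balloon_alloc_chunk() is an invented helper,
not code from any of the drivers discussed here) -- the matching release
would use __free_pages(page, order):

#include <linux/gfp.h>
#include <linux/mm.h>

#define BALLOON_HUGE_ORDER	9	/* 2MB with 4k base pages on x86 */

/* Try a 2MB chunk first, fall back to a single 4k page. */
static struct page *balloon_alloc_chunk(unsigned int *order)
{
	struct page *page;

	page = alloc_pages(__GFP_HIGHMEM | __GFP_NOWARN | __GFP_NORETRY,
			   BALLOON_HUGE_ORDER);
	if (page) {
		*order = BALLOON_HUGE_ORDER;
		return page;
	}

	/* No contiguous 2MB chunk available right now. */
	*order = 0;
	return alloc_page(__GFP_HIGHMEM | __GFP_NOWARN | __GFP_NORETRY);
}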

2010-04-06 23:21:15

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH] VMware Balloon driver

On Mon, 2010-04-05 at 16:28 -0700, Jeremy Fitzhardinge wrote:
> I guess a way to use it in the short term is to have a loop of the form:
>
> while (guest_size> target) {
> shrink_all_memory(guest_size - target); /* force pages to be free */
> while (p = alloc_page(GFP_NORETRY)) /* vacuum up pages */
> release_page_to_hypervisor(p);
> /* twiddle thumbs */
> }

We also need to remember to consolidate the Xen and virtio-balloon
drivers. They both have their own GFP flags, for instance, but I think
they actually want the exact same thing. They could probably also share
that snippet, right?

-- Dave
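
For what it's worth, the shared definition might be as small as the sketch
below. BALLOON_GFP_FLAGS and balloon_alloc_page() are invented names, and the
exact flag set would need to be checked against both existing drivers.

#include <linux/gfp.h>
#include <linux/mm.h>

/* Invented name; roughly the union of what the existing drivers ask for. */
#define BALLOON_GFP_FLAGS (GFP_HIGHUSER | __GFP_NORETRY | \
			   __GFP_NOMEMALLOC | __GFP_NOWARN)

static inline struct page *balloon_alloc_page(void)
{
	/* Don't dip into emergency reserves or trigger the OOM killer
	 * just to grow the balloon. */
	return alloc_page(BALLOON_GFP_FLAGS);
}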

2010-04-08 05:30:38

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] VMware Balloon driver

Hi!

>>> 1) is not a huge amount of code, but something consistent would be
>>> nice. 2) is something we've been missing and is a bit of an open
>>> question/research project anyway.
>>
>> 3) Code that attempts to reclaim 2MB pages when possible
>
> Yes. Ballooning in 4k units is a bit silly.

Does it make sense to treat ballooning as a form of memory hotplug?

--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2010-04-08 07:18:48

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH] VMware Balloon driver

On 04/08/2010 08:30 AM, Pavel Machek wrote:
> Hi!
>
>
>>>> 1) is not a huge amount of code, but something consistent would be
>>>> nice. 2) is something we've been missing and is a bit of an open
>>>> question/research project anyway.
>>>>
>>> 3) Code that attempts to reclaim 2MB pages when possible
>>>
>> Yes. Ballooning in 4k units is a bit silly.
>>
> Does it make sense to treat ballooning as a form of memory hotplug?
>

It's a fine granularity form of memory hotplug, yes.

--
error compiling committee.c: too many arguments to function

2010-04-08 17:02:08

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] VMware Balloon driver

On 04/07/2010 10:30 PM, Pavel Machek wrote:
> Hi!
>
>
>>>> 1) is not a huge amount of code, but something consistent would be
>>>> nice. 2) is something we've been missing and is a bit of an open
>>>> question/research project anyway.
>>>>
>>> 3) Code that attempts to reclaim 2MB pages when possible
>>>
>> Yes. Ballooning in 4k units is a bit silly.
>>
> Does it make sense to treat ballooning as a form of memory hotplug?
>

It has some similarities. The main difference is granularity;
ballooning works in pages (typically 4k, but 2M probably makes more
sense), whereas memory hotplug works in DIMM-like sizes (256MB+).
That's way too coarse for us; a domain might only have 256MB or less to
start with.

I experimented with a sort of hybrid scheme, in which I used hotplug
memory to add new struct pages to the system, but only incrementally
populated the underlying pages with the balloon driver. That worked
pretty well, but it doesn't fit very well with how memory hotplug works
(at least when I last looked at it a couple of years ago).

J
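
Schematically, the hybrid approach could look like the sketch below.
add_memory() is the real memory-hotplug entry point;
balloon_register_unpopulated_range() is an invented placeholder for the
balloon-side bookkeeping.

#include <linux/memory_hotplug.h>

/* Invented placeholder for the balloon-side bookkeeping. */
static int balloon_register_unpopulated_range(u64 start, u64 size);

/*
 * Hotplug creates struct pages for a new range; the balloon driver
 * decides how much of it is actually backed by host memory and
 * populates it incrementally as the host grants pages.
 */
static int hybrid_grow(int nid, u64 start, u64 size)
{
	int err;

	/* Sets up struct pages and zone bookkeeping for the range;
	 * the guest must not touch the memory itself yet. */
	err = add_memory(nid, start, size);
	if (err)
		return err;

	return balloon_register_unpopulated_range(start, size);
}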

2010-04-15 21:00:33

by Dmitry Torokhov

[permalink] [raw]
Subject: [PATCH v2] VMware Balloon driver

This is a standalone version of the VMware Balloon driver. Ballooning is a
technique that allows the hypervisor to dynamically limit the amount of memory
available to the guest (with guest cooperation). In the overcommit scenario,
when the hypervisor detects that it needs to shuffle some memory, it instructs
the driver to allocate a certain number of pages, and the underlying memory
gets returned to the hypervisor. Later the hypervisor may return memory to the
guest by reattaching memory to the pageframes and instructing the driver to
"deflate" the balloon.

Signed-off-by: Dmitry Torokhov <[email protected]>
---

Unlike previous version, that tried to integrate VMware ballooning transport
into virtio subsystem, and use stock virtio_ballon driver, this one implements
both controlling thread/algorithm and hypervisor transport.

We are submitting standalone driver because KVM maintainer (Avi Kivity)
expressed opinion (rightly) that our transport does not fit well into
virtqueue paradigm and thus it does not make much sense to integrate
with virtio.

There were also some concerns about whether the current ballooning technique
is the right approach. If a better framework for achieving this appears we
are prepared to evaluate it and switch over, but in the meantime we'd like
to get this driver upstream.

Changes since v1:
- added comments throughout the code;
- exported stats moved from /proc to debugfs;
- better changelog.

arch/x86/kernel/cpu/vmware.c | 2
drivers/misc/Kconfig | 16 +
drivers/misc/Makefile | 1
drivers/misc/vmware_balloon.c | 808 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 827 insertions(+), 0 deletions(-)
create mode 100644 drivers/misc/vmware_balloon.c


diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97..dfdb4db 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
*/

#include <linux/dmi.h>
+#include <linux/module.h>
#include <asm/div64.h>
#include <asm/vmware.h>
#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)

return 0;
}
+EXPORT_SYMBOL(vmware_platform);

/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 2191c8d..0d0d625 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -311,6 +311,22 @@ config TI_DAC7512
This driver can also be built as a module. If so, the module
will be calles ti_dac7512.

+config VMWARE_BALLOON
+ tristate "VMware Balloon Driver"
+ depends on X86
+ help
+ This is VMware physical memory management driver which acts
+ like a "balloon" that can be inflated to reclaim physical pages
+ by reserving them in the guest and invalidating them in the
+ monitor, freeing up the underlying machine pages so they can
+ be allocated to other guests. The balloon can also be deflated
+ to allow the guest to use more physical memory.
+
+ If unsure, say N.
+
+ To compile this driver as a module, choose M here: the
+ module will be called vmware_balloon.
+
source "drivers/misc/c2port/Kconfig"
source "drivers/misc/eeprom/Kconfig"
source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 27c4843..7b6f7ee 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_C2PORT) += c2port/
obj-$(CONFIG_IWMC3200TOP) += iwmc3200top/
obj-y += eeprom/
obj-y += cb710/
+obj-$(CONFIG_VMWARE_BALLOON) += vmware_balloon.o
diff --git a/drivers/misc/vmware_balloon.c b/drivers/misc/vmware_balloon.c
new file mode 100644
index 0000000..90bba04
--- /dev/null
+++ b/drivers/misc/vmware_balloon.c
@@ -0,0 +1,808 @@
+/*
+ * VMware Balloon driver.
+ *
+ * Copyright (C) 2000-2010, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Maintained by: Dmitry Torokhov <[email protected]>
+ */
+
+/*
+ * This is VMware physical memory management driver for Linux. The driver
+ * acts like a "balloon" that can be inflated to reclaim physical pages by
+ * reserving them in the guest and invalidating them in the monitor,
+ * freeing up the underlying machine pages so they can be allocated to
+ * other guests. The balloon can also be deflated to allow the guest to
+ * use more physical memory. Higher level policies can control the sizes
+ * of balloons in VMs in order to manage physical memory resources.
+ */
+
+//#define DEBUG
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <asm/vmware.h>
+
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
+MODULE_VERSION("1.2.1.0-K");
+MODULE_ALIAS("dmi:*:svnVMware*:*");
+MODULE_ALIAS("vmware_vmmemctl");
+MODULE_LICENSE("GPL");
+
+#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U
+
+#define VMW_BALLOON_RATE_ALLOC_MIN 512U
+#define VMW_BALLOON_RATE_ALLOC_MAX 2048U
+#define VMW_BALLOON_RATE_ALLOC_INC 16U
+
+#define VMW_BALLOON_RATE_FREE_MIN 512U
+#define VMW_BALLOON_RATE_FREE_MAX 16384U
+#define VMW_BALLOON_RATE_FREE_INC 16U
+
+/*
+ * When guest is under memory pressure, use a reduced page allocation
+ * rate for next several cycles.
+ */
+#define VMW_BALLOON_SLOW_CYCLES 4
+
+/*
+ * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
+ * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+ * __GFP_NOWARN, to suppress page allocation failure warnings.
+ */
+#define VMW_PAGE_ALLOC_NOSLEEP (__GFP_HIGHMEM|__GFP_NOWARN)
+
+/*
+ * Use GFP_HIGHUSER when executing in a separate kernel thread
+ * context and allocation can sleep. This is less stressful to
+ * the guest memory system, since it allows the thread to block
+ * while memory is reclaimed, and won't take pages from emergency
+ * low-memory pools.
+ */
+#define VMW_PAGE_ALLOC_CANSLEEP (GFP_HIGHUSER)
+
+/* Maximum number of page allocations without yielding processor */
+#define VMW_BALLOON_YIELD_THRESHOLD 1024
+
+#define VMW_BALLOON_HV_PORT 0x5670
+#define VMW_BALLOON_HV_MAGIC 0x456c6d6f
+#define VMW_BALLOON_PROTOCOL_VERSION 2
+#define VMW_BALLOON_GUEST_ID 1 /* Linux */
+
+#define VMW_BALLOON_CMD_START 0
+#define VMW_BALLOON_CMD_GET_TARGET 1
+#define VMW_BALLOON_CMD_LOCK 2
+#define VMW_BALLOON_CMD_UNLOCK 3
+#define VMW_BALLOON_CMD_GUEST_ID 4
+
+/* error codes */
+#define VMW_BALLOON_SUCCESS 0
+#define VMW_BALLOON_FAILURE -1
+#define VMW_BALLOON_ERROR_CMD_INVALID 1
+#define VMW_BALLOON_ERROR_PPN_INVALID 2
+#define VMW_BALLOON_ERROR_PPN_LOCKED 3
+#define VMW_BALLOON_ERROR_PPN_UNLOCKED 4
+#define VMW_BALLOON_ERROR_PPN_PINNED 5
+#define VMW_BALLOON_ERROR_PPN_NOTNEEDED 6
+#define VMW_BALLOON_ERROR_RESET 7
+#define VMW_BALLOON_ERROR_BUSY 8
+
+#define VMWARE_BALLOON_CMD(cmd, data, result) \
+({ \
+ unsigned long __stat, __dummy1, __dummy2; \
+ __asm__ __volatile__ ("inl (%%dx)" : \
+ "=a"(__stat), \
+ "=c"(__dummy1), \
+ "=d"(__dummy2), \
+ "=b"(result) : \
+ "0"(VMW_BALLOON_HV_MAGIC), \
+ "1"(VMW_BALLOON_CMD_##cmd), \
+ "2"(VMW_BALLOON_HV_PORT), \
+ "3"(data) : \
+ "memory"); \
+ result &= -1UL; \
+ __stat & -1UL; \
+})
+
+#define STATS_INC(stat) (stat)++
+
+struct vmballoon_stats {
+ unsigned int timer;
+
+	/* allocation statistics */
+ unsigned int alloc;
+ unsigned int alloc_fail;
+ unsigned int sleep_alloc;
+ unsigned int sleep_alloc_fail;
+ unsigned int refused_alloc;
+ unsigned int refused_free;
+ unsigned int free;
+
+ /* monitor operations */
+ unsigned int lock;
+ unsigned int lock_fail;
+ unsigned int unlock;
+ unsigned int unlock_fail;
+ unsigned int target;
+ unsigned int target_fail;
+ unsigned int start;
+ unsigned int start_fail;
+ unsigned int guest_type;
+ unsigned int guest_type_fail;
+};
+
+struct vmballoon {
+
+ /* list of reserved physical pages */
+ struct list_head pages;
+
+ /* transient list of non-balloonable pages */
+ struct list_head refused_pages;
+
+ /* balloon size in pages */
+ unsigned int size;
+ unsigned int target;
+
+ /* reset flag */
+ bool reset_required;
+
+ /* adjustment rates (pages per second) */
+ unsigned int rate_alloc;
+ unsigned int rate_free;
+
+ /* slowdown page allocations for next few cycles */
+ unsigned int slow_allocation_cycles;
+
+ /* statistics */
+ struct vmballoon_stats stats;
+
+ /* debugfs file exporting statistics */
+ struct dentry *dbg_entry;
+
+ struct sysinfo sysinfo;
+
+ struct delayed_work dwork;
+};
+
+static struct vmballoon balloon;
+static struct workqueue_struct *vmballoon_wq;
+
+/*
+ * Send "start" command to the host, communicating supported version
+ * of the protocol.
+ */
+static bool vmballoon_send_start(struct vmballoon *b)
+{
+ unsigned long status, dummy;
+
+ STATS_INC(b->stats.start);
+
+ status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_PROTOCOL_VERSION, dummy);
+ if (status == VMW_BALLOON_SUCCESS)
+ return true;
+
+ pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+ STATS_INC(b->stats.start_fail);
+ return false;
+}
+
+static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
+{
+ switch (status) {
+ case VMW_BALLOON_SUCCESS:
+ return true;
+
+ case VMW_BALLOON_ERROR_RESET:
+ b->reset_required = true;
+ /* fall through */
+
+ default:
+ return false;
+ }
+}
+
+/*
+ * Communicate guest type to the host so that it can adjust ballooning
+ * algorithm to the one most appropriate for the guest. This command
+ * is normally issued after sending "start" command and is part of
+ * standard reset sequence.
+ */
+static bool vmballoon_send_guest_id(struct vmballoon *b)
+{
+ unsigned long status, dummy;
+
+ status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
+
+ STATS_INC(b->stats.guest_type);
+
+ if (vmballoon_check_status(b, status))
+ return true;
+
+ pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+ STATS_INC(b->stats.guest_type_fail);
+ return false;
+}
+
+/*
+ * Retrieve desired balloon size from the host.
+ */
+static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
+{
+ unsigned long status;
+ unsigned long target;
+ unsigned long limit;
+ u32 limit32;
+
+ /*
+ * si_meminfo() is cheap. Moreover, we want to provide dynamic
+ * max balloon size later. So let us call si_meminfo() every
+ * iteration.
+ */
+ si_meminfo(&b->sysinfo);
+ limit = b->sysinfo.totalram;
+
+ /* Ensure limit fits in 32-bits */
+ limit32 = (u32)limit;
+ if (limit != limit32)
+ return false;
+
+ /* update stats */
+ STATS_INC(b->stats.target);
+
+ status = VMWARE_BALLOON_CMD(GET_TARGET, limit, target);
+ if (vmballoon_check_status(b, status)) {
+ *new_target = target;
+ return true;
+ }
+
+ pr_debug("%s - failed, hv returns %ld\n", __func__, status);
+ STATS_INC(b->stats.target_fail);
+ return false;
+}
+
+/*
+ * Notify the host about allocated page so that host can use it without
+ * fear that guest will need it. Host may reject some pages, we need to
+ * check the return value and maybe submit a different page.
+ */
+static bool vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn)
+{
+ unsigned long status, dummy;
+ u32 pfn32;
+
+ pfn32 = (u32)pfn;
+ if (pfn32 != pfn)
+ return false;
+
+ STATS_INC(b->stats.lock);
+
+ status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy);
+ if (vmballoon_check_status(b, status))
+ return true;
+
+ pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
+ STATS_INC(b->stats.lock_fail);
+ return false;
+}
+
+/*
+ * Notify the host that guest intends to release given page back into
+ * the pool of available (to the guest) pages.
+ */
+static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
+{
+ unsigned long status, dummy;
+ u32 pfn32;
+
+ pfn32 = (u32)pfn;
+ if (pfn32 != pfn)
+ return false;
+
+ STATS_INC(b->stats.unlock);
+
+ status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy);
+ if (vmballoon_check_status(b, status))
+ return true;
+
+ pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
+ STATS_INC(b->stats.unlock_fail);
+ return false;
+}
+
+/*
+ * Quickly release all pages allocated for the balloon. This function is
+ * called when host decides to "reset" balloon for one reason or another.
+ * Unlike normal "deflate" we do not (shall not) notify host of the pages
+ * being released.
+ */
+static void vmballoon_pop(struct vmballoon *b)
+{
+ struct page *page, *next;
+ unsigned int count = 0;
+
+ list_for_each_entry_safe(page, next, &b->pages, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ STATS_INC(b->stats.free);
+ b->size--;
+
+ if (++count >= b->rate_free) {
+ count = 0;
+ cond_resched();
+ }
+ }
+}
+
+/*
+ * Perform standard reset sequence by popping the balloon (in case it
+ * is not empty) and then restarting protocol. This operation normally
+ * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
+ */
+static void vmballoon_reset(struct vmballoon *b)
+{
+ /* free all pages, skipping monitor unlock */
+ vmballoon_pop(b);
+
+ if (vmballoon_send_start(b)) {
+ b->reset_required = false;
+ if (!vmballoon_send_guest_id(b))
+ pr_err("failed to send guest ID to the host\n");
+ }
+}
+
+/*
+ * Allocate (or reserve) a page for the balloon and notify the host. If host
+ * refuses the page put it on "refuse" list and allocate another one until host
+ * is satisfied. "Refused" pages are released at the end of inflation cycle
+ * (when we allocate b->rate_alloc pages).
+ */
+static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep)
+{
+ struct page *page;
+ gfp_t flags;
+ bool locked = false;
+
+ do {
+ if (!can_sleep)
+ STATS_INC(b->stats.alloc);
+ else
+ STATS_INC(b->stats.sleep_alloc);
+
+ flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP;
+ page = alloc_page(flags);
+ if (!page) {
+ if (!can_sleep)
+ STATS_INC(b->stats.alloc_fail);
+ else
+ STATS_INC(b->stats.sleep_alloc_fail);
+ return -ENOMEM;
+ }
+
+ /* inform monitor */
+ locked = vmballoon_send_lock_page(b, page_to_pfn(page));
+ if (!locked) {
+ if (b->reset_required) {
+ __free_page(page);
+ return -EIO;
+ }
+
+ /* place on list of non-balloonable pages, retry allocation */
+ list_add(&page->lru, &b->refused_pages);
+ STATS_INC(b->stats.refused_alloc);
+ }
+ } while (!locked);
+
+ /* track allocated page */
+ list_add(&page->lru, &b->pages);
+
+ /* update balloon size */
+ b->size++;
+
+ return 0;
+}
+
+/*
+ * Release the page allocated for the balloon. Note that we first notify
+ * the host so it can make sure the page will be available for the guest
+ * to use, if needed.
+ */
+static int vmballoon_release_page(struct vmballoon *b, struct page *page)
+{
+ if (!vmballoon_send_unlock_page(b, page_to_pfn(page)))
+ return -EIO;
+
+ list_del(&page->lru);
+
+ /* deallocate page */
+ __free_page(page);
+ STATS_INC(b->stats.free);
+
+ /* update balloon size */
+ b->size--;
+
+ return 0;
+}
+
+/*
+ * Release pages that were allocated while attempting to inflate the
+ * balloon but were refused by the host for one reason or another.
+ */
+static void vmballoon_release_refused_pages(struct vmballoon *b)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, &b->refused_pages, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ STATS_INC(b->stats.refused_free);
+ }
+}
+
+/*
+ * Inflate the balloon towards its target size. Note that we try to limit
+ * the rate of allocation to make sure we are not choking the rest of the
+ * system.
+ */
+static void vmballoon_inflate(struct vmballoon *b)
+{
+ unsigned int goal;
+ unsigned int rate;
+ unsigned int i;
+ unsigned int allocations = 0;
+ int error = 0;
+ bool alloc_can_sleep = false;
+
+ pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
+
+ /*
+ * First try NOSLEEP page allocations to inflate balloon.
+ *
+ * If we do not throttle nosleep allocations, we can drain all
+ * free pages in the guest quickly (if the balloon target is high).
+ * As a side-effect, draining free pages helps to inform (force)
+ * the guest to start swapping if balloon target is not met yet,
+ * which is a desired behavior. However, balloon driver can consume
+ * all available CPU cycles if too many pages are allocated in a
+ * second. Therefore, we throttle nosleep allocations even when
+ * the guest is not under memory pressure. OTOH, if we have already
+ * predicted that the guest is under memory pressure, then we
+ * slowdown page allocations considerably.
+ */
+
+ goal = b->target - b->size;
+ /*
+ * Start with no sleep allocation rate which may be higher
+ * than sleeping allocation rate.
+ */
+ rate = b->slow_allocation_cycles ?
+ b->rate_alloc : VMW_BALLOON_NOSLEEP_ALLOC_MAX;
+
+ pr_debug("%s - goal: %d, no-sleep rate: %d, sleep rate: %d\n",
+ __func__, goal, rate, b->rate_alloc);
+
+ for (i = 0; i < goal; i++) {
+
+ error = vmballoon_reserve_page(b, alloc_can_sleep);
+ if (error) {
+ if (error != -ENOMEM) {
+ /*
+ * Not a page allocation failure, stop this
+ * cycle. Maybe we'll get new target from
+ * the host soon.
+ */
+ break;
+ }
+
+ if (alloc_can_sleep) {
+ /*
+ * CANSLEEP page allocation failed, so guest
+ * is under severe memory pressure. Quickly
+ * decrease allocation rate.
+ */
+ b->rate_alloc = max(b->rate_alloc / 2,
+ VMW_BALLOON_RATE_ALLOC_MIN);
+ break;
+ }
+
+ /*
+ * NOSLEEP page allocation failed, so the guest is
+ * under memory pressure. Let us slow down page
+ * allocations for next few cycles so that the guest
+ * gets out of memory pressure. Also, if we already
+ * allocated b->rate_alloc pages, let's pause,
+ * otherwise switch to sleeping allocations.
+ */
+ b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
+
+ if (i >= b->rate_alloc)
+ break;
+
+ alloc_can_sleep = true;
+ /* Lower rate for sleeping allocations. */
+ rate = b->rate_alloc;
+ }
+
+ if (++allocations > VMW_BALLOON_YIELD_THRESHOLD) {
+ cond_resched();
+ allocations = 0;
+ }
+
+ if (i >= rate) {
+ /* We allocated enough pages, let's take a break. */
+ break;
+ }
+ }
+
+ /*
+ * We reached our goal without failures so try increasing
+ * allocation rate.
+ */
+ if (error == 0 && i >= b->rate_alloc) {
+ unsigned int mult = i / b->rate_alloc;
+
+ b->rate_alloc =
+ min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
+ VMW_BALLOON_RATE_ALLOC_MAX);
+ }
+
+ vmballoon_release_refused_pages(b);
+}
+
+/*
+ * Decrease the size of the balloon allowing guest to use more memory.
+ */
+static void vmballoon_deflate(struct vmballoon *b)
+{
+ struct page *page, *next;
+ unsigned int i = 0;
+ unsigned int goal;
+ int error;
+
+ pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
+
+ /* limit deallocation rate */
+ goal = min(b->size - b->target, b->rate_free);
+
+ pr_debug("%s - goal: %d, rate: %d\n", __func__, goal, b->rate_free);
+
+ /* free pages to reach target */
+ list_for_each_entry_safe(page, next, &b->pages, lru) {
+ error = vmballoon_release_page(b, page);
+ if (error) {
+ /* quickly decrease rate in case of error */
+ b->rate_free = max(b->rate_free / 2,
+ VMW_BALLOON_RATE_FREE_MIN);
+ return;
+ }
+
+ if (++i >= goal)
+ break;
+ }
+
+ /* slowly increase rate if there were no errors */
+ b->rate_free = min(b->rate_free + VMW_BALLOON_RATE_FREE_INC,
+ VMW_BALLOON_RATE_FREE_MAX);
+}
+
+/*
+ * Balloon work function: reset protocol, if needed, get the new size and
+ * adjust balloon as needed. Repeat in 1 sec.
+ */
+static void vmballoon_work(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
+ unsigned int target;
+
+ STATS_INC(b->stats.timer);
+
+ if (b->reset_required)
+ vmballoon_reset(b);
+
+ if (b->slow_allocation_cycles > 0)
+ b->slow_allocation_cycles--;
+
+ if (vmballoon_send_get_target(b, &target)) {
+ /* update target, adjust size */
+ b->target = target;
+
+ if (b->size < target)
+ vmballoon_inflate(b);
+ else if (b->size > target)
+ vmballoon_deflate(b);
+ }
+
+ queue_delayed_work(vmballoon_wq, dwork, round_jiffies_relative(HZ));
+}
+
+/*
+ * DEBUGFS Interface
+ */
+#ifdef CONFIG_DEBUG_FS
+
+static int vmballoon_debug_show(struct seq_file *f, void *offset)
+{
+ struct vmballoon *b = f->private;
+ struct vmballoon_stats *stats = &b->stats;
+
+ /* format size info */
+ seq_printf(f,
+ "target: %8d pages\n"
+ "current: %8d pages\n",
+ b->target, b->size);
+
+ /* format rate info */
+ seq_printf(f,
+ "rateNoSleepAlloc: %8d pages/sec\n"
+ "rateSleepAlloc: %8d pages/sec\n"
+ "rateFree: %8d pages/sec\n",
+ VMW_BALLOON_NOSLEEP_ALLOC_MAX,
+ b->rate_alloc, b->rate_free);
+
+ seq_printf(f,
+ "\n"
+ "timer: %8u\n"
+ "start: %8u (%4u failed)\n"
+ "guestType: %8u (%4u failed)\n"
+ "lock: %8u (%4u failed)\n"
+ "unlock: %8u (%4u failed)\n"
+ "target: %8u (%4u failed)\n"
+ "primNoSleepAlloc: %8u (%4u failed)\n"
+ "primCanSleepAlloc: %8u (%4u failed)\n"
+ "primFree: %8u\n"
+ "errAlloc: %8u\n"
+ "errFree: %8u\n",
+ stats->timer,
+ stats->start, stats->start_fail,
+ stats->guest_type, stats->guest_type_fail,
+ stats->lock, stats->lock_fail,
+ stats->unlock, stats->unlock_fail,
+ stats->target, stats->target_fail,
+ stats->alloc, stats->alloc_fail,
+ stats->sleep_alloc, stats->sleep_alloc_fail,
+ stats->free,
+ stats->refused_alloc, stats->refused_free);
+
+ return 0;
+}
+
+static int vmballoon_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, vmballoon_debug_show, inode->i_private);
+}
+
+static const struct file_operations vmballoon_debug_fops = {
+ .owner = THIS_MODULE,
+ .open = vmballoon_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init vmballoon_debugfs_init(struct vmballoon *b)
+{
+ int error;
+
+ b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
+ &vmballoon_debug_fops);
+ if (IS_ERR(b->dbg_entry)) {
+ error = PTR_ERR(b->dbg_entry);
+ pr_err("failed to create debugfs entry, error: %d\n", error);
+ return error;
+ }
+
+ return 0;
+}
+
+static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
+{
+ debugfs_remove(b->dbg_entry);
+}
+
+#else
+
+static inline int vmballoon_debugfs_init(struct vmballoon *b)
+{
+ return 0;
+}
+
+static inline int vmballoon_debugfs_init(struct vmballoon *b)
+{
+}
+
+#endif	/* CONFIG_DEBUG_FS */
+
+static int __init vmballoon_init(void)
+{
+ int error;
+
+ /*
+ * Check if we are running on VMware's hypervisor and bail out
+ * if we are not.
+ */
+ if (!vmware_platform())
+ return -ENODEV;
+
+ vmballoon_wq = create_freezeable_workqueue("vmmemctl");
+ if (!vmballoon_wq) {
+ pr_err("failed to create workqueue\n");
+ return -ENOMEM;
+ }
+
+ /* initialize global state */
+ memset(&balloon, 0, sizeof(balloon));
+ INIT_LIST_HEAD(&balloon.pages);
+ INIT_LIST_HEAD(&balloon.refused_pages);
+
+ /* initialize rates */
+ balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
+ balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
+
+ INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
+
+ /*
+ * Start balloon.
+ */
+ if (!vmballoon_send_start(&balloon)) {
+ pr_err("failed to send start command to the host\n");
+ error = -EIO;
+ goto fail;
+ }
+
+ if (!vmballoon_send_guest_id(&balloon)) {
+ pr_err("failed to send guest ID to the host\n");
+ error = -EIO;
+ goto fail;
+ }
+
+ error = vmballoon_debugfs_init(&balloon);
+ if (error)
+ goto fail;
+
+ queue_delayed_work(vmballoon_wq, &balloon.dwork, 0);
+
+ return 0;
+
+fail:
+ destroy_workqueue(vmballoon_wq);
+ return error;
+}
+module_init(vmballoon_init);
+
+static void __exit vmballoon_exit(void)
+{
+ cancel_delayed_work_sync(&balloon.dwork);
+ destroy_workqueue(vmballoon_wq);
+
+ vmballoon_debugfs_exit(&balloon);
+
+ /*
+ * Deallocate all reserved memory, and reset connection with monitor.
+ * Reset connection before deallocating memory to avoid potential for
+ * additional spurious resets from guest touching deallocated pages.
+ */
+ vmballoon_send_start(&balloon);
+ vmballoon_pop(&balloon);
+}
+module_exit(vmballoon_exit);

2010-04-21 19:59:38

by Dmitry Torokhov

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Thu, Apr 15, 2010 at 02:00:30PM -0700, Dmitry Torokhov wrote:
> This is standalone version of VMware Balloon driver. Ballooning is a
> technique that allows hypervisor dynamically limit the amount of memory
> available to the guest (with guest cooperation). In the overcommit
> scenario, when hypervisor set detects that it needs to shuffle some memory,
> it instructs the driver to allocate certain number of pages, and the
> underlying memory gets returned to the hypervisor. Later hypervisor may
> return memory to the guest by reattaching memory to the pageframes and
> instructing the driver to "deflate" balloon.
>
> Signed-off-by: Dmitry Torokhov <[email protected]>

Andrew,

Do you see any issues with the driver? Will you be the one picking it
up and queueing for mainline?

Thanks,

Dmitry



2010-04-21 20:18:53

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Wed, 21 Apr 2010 12:59:35 -0700
Dmitry Torokhov <[email protected]> wrote:

> Do you see any issues with the driver?

Only my near-complete cluelessness on the whole topic.

> Will you be the one picking it
> up and queueing for mainline?

Spose so.

2010-04-21 20:52:10

by Dmitry Torokhov

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Wed, Apr 21, 2010 at 01:18:36PM -0700, Andrew Morton wrote:
> On Wed, 21 Apr 2010 12:59:35 -0700
> Dmitry Torokhov <[email protected]> wrote:
>
> > Do you see any issues with the driver?
>
> Only my near-complete cluelessness on the whole topic.
>
> > Will you be the one picking it
> > up and queueing for mainline?
>
> Spose so.

Good. I don't suppose we have a chance making into .34? Being a
completely new driver and all...

Thanks,

Dmitry

2010-04-21 21:14:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Wed, 21 Apr 2010 13:52:08 -0700
Dmitry Torokhov <[email protected]> wrote:

> On Wed, Apr 21, 2010 at 01:18:36PM -0700, Andrew Morton wrote:
> > On Wed, 21 Apr 2010 12:59:35 -0700
> > Dmitry Torokhov <[email protected]> wrote:
> >
> > > Do you see any issues with the driver?
> >
> > Only my near-complete cluelessness on the whole topic.
> >
> > > Will you be the one picking it
> > > up and queueing for mainline?
> >
> > Spose so.
>
> > Good. I don't suppose we have a chance of making it into .34? Being a
> > completely new driver and all...

It's foggy. Is there a good-sounding reason for pushing it in this
late?

2010-04-21 23:55:37

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Thu, 15 Apr 2010 14:00:31 -0700
Dmitry Torokhov <[email protected]> wrote:

> This is a standalone version of the VMware Balloon driver. Ballooning is a
> technique that allows the hypervisor to dynamically limit the amount of memory
> available to the guest (with guest cooperation). In the overcommit
> scenario, when the hypervisor detects that it needs to shuffle some memory,
> it instructs the driver to allocate a certain number of pages, and the
> underlying memory gets returned to the hypervisor. Later the hypervisor may
> return memory to the guest by reattaching memory to the page frames and
> instructing the driver to "deflate" the balloon.
>
> Signed-off-by: Dmitry Torokhov <[email protected]>
> ---
>
> Unlike previous version, that tried to integrate VMware ballooning transport
> into virtio subsystem, and use stock virtio_ballon driver, this one implements
> both controlling thread/algorithm and hypervisor transport.
>
> We are submitting standalone driver because KVM maintainer (Avi Kivity)
> expressed opinion (rightly) that our transport does not fit well into
> virtqueue paradigm and thus it does not make much sense to integrate
> with virtio.
>
> There were also some concerns whether current ballooning technique is
> the right thing. If there appears a better framework to achieve this we
> are prepared to evaluate and switch to using it, but in the meantime
> we'd like to get this driver upstream.
>
> Changes since v1:
> - added comments throughout the code;
> - exported stats moved from /proc to debugfs;
> - better changelog.
>
>
> ...
>
> +#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U
> +
> +#define VMW_BALLOON_RATE_ALLOC_MIN 512U
> +#define VMW_BALLOON_RATE_ALLOC_MAX 2048U
> +#define VMW_BALLOON_RATE_ALLOC_INC 16U
> +
> +#define VMW_BALLOON_RATE_FREE_MIN 512U
> +#define VMW_BALLOON_RATE_FREE_MAX 16384U
> +#define VMW_BALLOON_RATE_FREE_INC 16U

hum. What do these do and what units are they in? Needs a comment?

>
> ...
>
> +#define VMWARE_BALLOON_CMD(cmd, data, result) \
> +({ \
> + unsigned long __stat, __dummy1, __dummy2; \
> + __asm__ __volatile__ ("inl (%%dx)" : \
> + "=a"(__stat), \
> + "=c"(__dummy1), \
> + "=d"(__dummy2), \
> + "=b"(result) : \
> + "0"(VMW_BALLOON_HV_MAGIC), \
> + "1"(VMW_BALLOON_CMD_##cmd), \
> + "2"(VMW_BALLOON_HV_PORT), \
> + "3"(data) : \
> + "memory"); \
> + result &= -1UL; \
> + __stat & -1UL; \
> +})

This is OK for both x86_32 and x86_64?

Was it actually intended that this driver be enabled for 32-bit?

> +#define STATS_INC(stat) (stat)++
> +
> +struct vmballoon_stats {
> + unsigned int timer;
> +
> + /* allocation statistics */
> + unsigned int alloc;
> + unsigned int alloc_fail;
> + unsigned int sleep_alloc;
> + unsigned int sleep_alloc_fail;
> + unsigned int refused_alloc;
> + unsigned int refused_free;
> + unsigned int free;
> +
> + /* monitor operations */
> + unsigned int lock;
> + unsigned int lock_fail;
> + unsigned int unlock;
> + unsigned int unlock_fail;
> + unsigned int target;
> + unsigned int target_fail;
> + unsigned int start;
> + unsigned int start_fail;
> + unsigned int guest_type;
> + unsigned int guest_type_fail;
> +};
> +
> +struct vmballoon {
> +
> + /* list of reserved physical pages */
> + struct list_head pages;
> +
> + /* transient list of non-balloonable pages */
> + struct list_head refused_pages;
> +
> + /* balloon size in pages */
> + unsigned int size;
> + unsigned int target;
> +
> + /* reset flag */
> + bool reset_required;
> +
> + /* adjustment rates (pages per second) */
> + unsigned int rate_alloc;
> + unsigned int rate_free;
> +
> + /* slowdown page allocations for next few cycles */
> + unsigned int slow_allocation_cycles;
> +
> + /* statistics */
> + struct vmballoon_stats stats;
> +
> + /* debugfs file exporting statistics */
> + struct dentry *dbg_entry;
> +
> + struct sysinfo sysinfo;
> +
> + struct delayed_work dwork;
> +};

afaict all the stats stuff is useless if CONFIG_DEBUG_FS=n. Perhaps in
that case the vmballoon.stats field should be omitted and STATS_INC
be made a no-op?

>
> ...
>

2010-04-22 00:00:51

by Dmitry Torokhov

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Wed, Apr 21, 2010 at 04:54:49PM -0700, Andrew Morton wrote:
> On Thu, 15 Apr 2010 14:00:31 -0700
> Dmitry Torokhov <[email protected]> wrote:
>
> > This is a standalone version of the VMware Balloon driver. Ballooning is a
> > technique that allows the hypervisor to dynamically limit the amount of memory
> > available to the guest (with guest cooperation). In the overcommit
> > scenario, when the hypervisor detects that it needs to shuffle some memory,
> > it instructs the driver to allocate a certain number of pages, and the
> > underlying memory gets returned to the hypervisor. Later the hypervisor may
> > return memory to the guest by reattaching memory to the page frames and
> > instructing the driver to "deflate" the balloon.
> >
> > Signed-off-by: Dmitry Torokhov <[email protected]>
> > ---
> >
> > Unlike previous version, that tried to integrate VMware ballooning transport
> > into virtio subsystem, and use stock virtio_ballon driver, this one implements
> > both controlling thread/algorithm and hypervisor transport.
> >
> > We are submitting standalone driver because KVM maintainer (Avi Kivity)
> > expressed opinion (rightly) that our transport does not fit well into
> > virtqueue paradigm and thus it does not make much sense to integrate
> > with virtio.
> >
> > There were also some concerns whether current ballooning technique is
> > the right thing. If there appears a better framework to achieve this we
> > are prepared to evaluate and switch to using it, but in the meantime
> > we'd like to get this driver upstream.
> >
> > Changes since v1:
> > - added comments throughout the code;
> > - exported stats moved from /proc to debugfs;
> > - better changelog.
> >
> >
> > ...
> >
> > +#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U
> > +
> > +#define VMW_BALLOON_RATE_ALLOC_MIN 512U
> > +#define VMW_BALLOON_RATE_ALLOC_MAX 2048U
> > +#define VMW_BALLOON_RATE_ALLOC_INC 16U
> > +
> > +#define VMW_BALLOON_RATE_FREE_MIN 512U
> > +#define VMW_BALLOON_RATE_FREE_MAX 16384U
> > +#define VMW_BALLOON_RATE_FREE_INC 16U
>
> hum. What do these do and what units are they in? Needs a comment?

These control the inflating/deflating rate of the balloon, measured in
pages/sec. I will add a comment.
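
Roughly, each limit bounds how much work one worker cycle may do; a minimal
sketch, assuming vmballoon_reserve_page() as a stand-in per-page allocation
helper returning 0 on success (the name is not taken from the hunks above):

/*
 * Sketch only: one inflate pass, bounded by rate_alloc pages.  Uses only
 * the vmballoon fields quoted above; vmballoon_reserve_page() is a
 * hypothetical helper assumed to return 0 on success.
 */
static void example_inflate_pass(struct vmballoon *b)
{
	unsigned int goal, i;

	if (b->target <= b->size)
		return;

	/* never allocate more than rate_alloc pages in one cycle */
	goal = min(b->target - b->size, b->rate_alloc);

	for (i = 0; i < goal; i++) {
		if (vmballoon_reserve_page(b, true /* can sleep */))
			break;	/* failed; the caller would lower rate_alloc */
	}
}

Deflation is bounded by rate_free in the same way.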

>
> >
> > ...
> >
> > +#define VMWARE_BALLOON_CMD(cmd, data, result) \
> > +({ \
> > + unsigned long __stat, __dummy1, __dummy2; \
> > + __asm__ __volatile__ ("inl (%%dx)" : \
> > + "=a"(__stat), \
> > + "=c"(__dummy1), \
> > + "=d"(__dummy2), \
> > + "=b"(result) : \
> > + "0"(VMW_BALLOON_HV_MAGIC), \
> > + "1"(VMW_BALLOON_CMD_##cmd), \
> > + "2"(VMW_BALLOON_HV_PORT), \
> > + "3"(data) : \
> > + "memory"); \
> > + result &= -1UL; \
> > + __stat & -1UL; \
> > +})
>
> This is OK for both x86_32 and x86_64?

Yes it is.

>
> Was it actually intended that this driver be enabled for 32-bit?
>

Yes.
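
For what it's worth, "unsigned long" matches the native register width on
both, so the same inline asm and the "& -1UL" masking need no #ifdef. A
minimal usage sketch, assuming the GET_TARGET command name and the
VMW_BALLOON_SUCCESS status value from the full driver (neither is quoted
in the hunks above):

/*
 * Sketch only: issue one hypervisor command.  "unsigned long" is 32 bits
 * wide on x86_32 and 64 bits wide on x86_64, so the macro works unchanged
 * on both.  GET_TARGET and VMW_BALLOON_SUCCESS are assumed names from the
 * full driver, not shown in the hunks quoted here.
 */
static bool example_get_target(struct vmballoon *b)
{
	unsigned long status, target;

	status = VMWARE_BALLOON_CMD(GET_TARGET, b->size, target);
	if (status == VMW_BALLOON_SUCCESS) {
		b->target = target;
		return true;
	}
	return false;
}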

> > +#define STATS_INC(stat) (stat)++
> > +
> > +struct vmballoon_stats {
> > + unsigned int timer;
> > +
> > + /* allocation statistics */
> > + unsigned int alloc;
> > + unsigned int alloc_fail;
> > + unsigned int sleep_alloc;
> > + unsigned int sleep_alloc_fail;
> > + unsigned int refused_alloc;
> > + unsigned int refused_free;
> > + unsigned int free;
> > +
> > + /* monitor operations */
> > + unsigned int lock;
> > + unsigned int lock_fail;
> > + unsigned int unlock;
> > + unsigned int unlock_fail;
> > + unsigned int target;
> > + unsigned int target_fail;
> > + unsigned int start;
> > + unsigned int start_fail;
> > + unsigned int guest_type;
> > + unsigned int guest_type_fail;
> > +};
> > +
> > +struct vmballoon {
> > +
> > + /* list of reserved physical pages */
> > + struct list_head pages;
> > +
> > + /* transient list of non-balloonable pages */
> > + struct list_head refused_pages;
> > +
> > + /* balloon size in pages */
> > + unsigned int size;
> > + unsigned int target;
> > +
> > + /* reset flag */
> > + bool reset_required;
> > +
> > + /* adjustment rates (pages per second) */
> > + unsigned int rate_alloc;
> > + unsigned int rate_free;
> > +
> > + /* slowdown page allocations for next few cycles */
> > + unsigned int slow_allocation_cycles;
> > +
> > + /* statistics */
> > + struct vmballoon_stats stats;
> > +
> > + /* debugfs file exporting statistics */
> > + struct dentry *dbg_entry;
> > +
> > + struct sysinfo sysinfo;
> > +
> > + struct delayed_work dwork;
> > +};
>
> afaict all the stats stuff is useless if CONFIG_DEBUG_FS=n. Perhaps in
> that case the vmballoon.stats field should be omitted and STATS_INC
> be made a no-op?
>

OK, will do.

Thanks Andrew.

--
Dmitry

2010-04-22 00:09:24

by Dmitry Torokhov

[permalink] [raw]
Subject: Re: [PATCH v2] VMware Balloon driver

On Wed, Apr 21, 2010 at 02:13:25PM -0700, Andrew Morton wrote:
> On Wed, 21 Apr 2010 13:52:08 -0700
> Dmitry Torokhov <[email protected]> wrote:
>
> > On Wed, Apr 21, 2010 at 01:18:36PM -0700, Andrew Morton wrote:
> > > On Wed, 21 Apr 2010 12:59:35 -0700
> > > Dmitry Torokhov <[email protected]> wrote:
> > >
> > > > Do you see any issues with the driver?
> > >
> > > Only my near-complete cluelessness on the whole topic.
> > >
> > > > Will you be the one picking it
> > > > up and queueing for mainline?
> > >
> > > Spose so.
> >
> > Good. I don't suppose we have a chance of making it into .34? Being a
> > completely new driver and all...
>
> It's foggy. Is there a good-sounding reason for pushing it in this
> late?
>

We want to get the driver accepted into distributions so that users do not
have to deal with an out-of-tree module, and many distributions have an
"upstream first" requirement.

The driver has been shipping for a number of years, and users running on
the VMware platform will have it installed as part of VMware Tools even if
it does not come from a distribution, so there should not be any
additional risk in pulling the driver into mainline. The driver will
only activate if the host is VMware, so everyone else should not be affected
at all.

Thanks,

Dmitry

2010-04-22 01:02:51

by Dmitry Torokhov

[permalink] [raw]
Subject: Re: [Pv-drivers] [PATCH v2] VMware Balloon driver

On Wed, Apr 21, 2010 at 05:00:49PM -0700, Dmitry Torokhov wrote:
> On Wed, Apr 21, 2010 at 04:54:49PM -0700, Andrew Morton wrote:
> > On Thu, 15 Apr 2010 14:00:31 -0700
> > Dmitry Torokhov <[email protected]> wrote:
> >
> > > ...
> >
> > afaict all the stats stuff is useless if CONFIG_DEBUG_FS=n. Perhaps in
> > that case the vmballoon.stats field should be omitted and STATS_INC
> > be made a no-op?
> >
>
> OK, will do.
>
> Thanks Andrew.
>

OK, so here is the incremental patch addressing your comments. Or do you
want the entire thing resent?

Thanks.

--
Dmitry


vmware-balloon: miscellaneous fixes

- document rate allocation constants
- do not compile statistics code when debugfs is disabled
- fix compilation error when debugfs is disabled

Signed-off-by: Dmitry Torokhov <[email protected]>
---

drivers/misc/vmware_balloon.c | 38 +++++++++++++++++++++++++++++++-------
1 files changed, 31 insertions(+), 7 deletions(-)


diff --git a/drivers/misc/vmware_balloon.c b/drivers/misc/vmware_balloon.c
index 90bba04..e7161c4 100644
--- a/drivers/misc/vmware_balloon.c
+++ b/drivers/misc/vmware_balloon.c
@@ -50,12 +50,28 @@ MODULE_ALIAS("dmi:*:svnVMware*:*");
MODULE_ALIAS("vmware_vmmemctl");
MODULE_LICENSE("GPL");

+/*
+ * Various constants controlling the rate of inflating/deflating the balloon,
+ * measured in pages.
+ */
+
+/*
+ * Rate of allocating memory when there is no memory pressure
+ * (driver performs non-sleeping allocations).
+ */
#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U

+/*
+ * Rates of memory allocation when the guest experiences memory pressure
+ * (driver performs sleeping allocations).
+ */
#define VMW_BALLOON_RATE_ALLOC_MIN 512U
#define VMW_BALLOON_RATE_ALLOC_MAX 2048U
#define VMW_BALLOON_RATE_ALLOC_INC 16U

+/*
+ * Rates for releasing pages while deflating the balloon.
+ */
#define VMW_BALLOON_RATE_FREE_MIN 512U
#define VMW_BALLOON_RATE_FREE_MAX 16384U
#define VMW_BALLOON_RATE_FREE_INC 16U
@@ -85,6 +101,10 @@ MODULE_LICENSE("GPL");
/* Maximum number of page allocations without yielding processor */
#define VMW_BALLOON_YIELD_THRESHOLD 1024

+
+/*
+ * Hypervisor communication port definitions.
+ */
#define VMW_BALLOON_HV_PORT 0x5670
#define VMW_BALLOON_HV_MAGIC 0x456c6d6f
#define VMW_BALLOON_PROTOCOL_VERSION 2
@@ -125,8 +145,7 @@ MODULE_LICENSE("GPL");
__stat & -1UL; \
})

-#define STATS_INC(stat) (stat)++
-
+#ifdef CONFIG_DEBUG_FS
struct vmballoon_stats {
unsigned int timer;

@@ -152,6 +171,11 @@ struct vmballoon_stats {
unsigned int guest_type_fail;
};

+#define STATS_INC(stat) (stat)++
+#else
+#define STATS_INC(stat)
+#endif
+
struct vmballoon {

/* list of reserved physical pages */
@@ -174,11 +198,13 @@ struct vmballoon {
/* slowdown page allocations for next few cycles */
unsigned int slow_allocation_cycles;

+#ifdef CONFIG_DEBUG_FS
/* statistics */
struct vmballoon_stats stats;

/* debugfs file exporting statistics */
struct dentry *dbg_entry;
+#endif

struct sysinfo sysinfo;

@@ -637,7 +663,7 @@ static void vmballoon_work(struct work_struct *work)
}

/*
- * PROCFS Interface
+ * DEBUGFS Interface
*/
#ifdef CONFIG_DEBUG_FS

@@ -727,11 +753,11 @@ static inline int vmballoon_debugfs_init(struct vmballoon *b)
return 0;
}

-static inline void vmballoon_debugfs_exit(void)
+static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}

-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_DEBUG_FS */

static int __init vmballoon_init(void)
{
@@ -750,8 +776,6 @@ static int __init vmballoon_init(void)
return -ENOMEM;
}

- /* initialize global state */
- memset(&balloon, 0, sizeof(balloon));
INIT_LIST_HEAD(&balloon.pages);
INIT_LIST_HEAD(&balloon.refused_pages);