2021-03-18 20:15:57

by Sunil Muthuswamy

[permalink] [raw]
Subject: [PATCH v4] x86/Hyper-V: Support for free page reporting

Linux has support for free page reporting now (36e66c554b5c) for
virtualized environment. On Hyper-V when virtually backed VMs are
configured, Hyper-V will advertise cold memory discard capability,
when supported. This patch adds the support to hook into the free
page reporting infrastructure and leverage the Hyper-V cold memory
discard hint hypercall to report/free these pages back to the host.

Signed-off-by: Sunil Muthuswamy <[email protected]>
Tested-by: Matheus Castello <[email protected]>
---
In V2:
- Addressed feedback comments
- Added page reporting config option tied to hyper-v balloon config

In V3:
- Addressed feedback from Vitaly

In V4:
- Queried and cached the Hyper-V extended capability for the lifetime
of the VM
- Addressed feedback from Michael Kelley.
---
arch/x86/hyperv/hv_init.c | 45 +++++++++++++++-
arch/x86/kernel/cpu/mshyperv.c | 9 ++--
drivers/hv/Kconfig | 1 +
drivers/hv/hv_balloon.c | 89 +++++++++++++++++++++++++++++++
include/asm-generic/hyperv-tlfs.h | 35 +++++++++++-
include/asm-generic/mshyperv.h | 3 +-
6 files changed, 174 insertions(+), 8 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 9d100257b3af..3937c56145b5 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -498,6 +498,8 @@ void __init hyperv_init(void)
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif

+ /* Query the VMs extended capability once, so that it can be cached. */
+ hv_query_ext_cap(0);
return;

remove_cpuhp_state:
@@ -601,7 +603,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);

enum hv_isolation_type hv_get_isolation_type(void)
{
- if (!(ms_hyperv.features_b & HV_ISOLATION))
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
return HV_ISOLATION_TYPE_NONE;
return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
}
@@ -612,3 +614,44 @@ bool hv_is_isolation_supported(void)
return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
+
+/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
+bool hv_query_ext_cap(u64 cap_query)
+{
+ /*
+ * The address of the 'hv_extended_cap' variable will be used as an
+ * output parameter to the hypercall below and so it should be
+ * compatible with 'virt_to_phys'. Which means, it's address should be
+ * directly mapped. Use 'static' to keep it compatible; stack variables
+ * can be virtually mapped, making them imcompatible with
+ * 'virt_to_phys'.
+ * Hypercall input/output addresses should also be 8-byte aligned.
+ */
+ static u64 hv_extended_cap __aligned(8);
+ static bool hv_extended_cap_queried;
+ u64 status;
+
+ /*
+ * Querying extended capabilities is an extended hypercall. Check if the
+ * partition supports extended hypercall, first.
+ */
+ if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
+ return false;
+
+ /* Extended capabilities do not change at runtime. */
+ if (hv_extended_cap_queried)
+ return hv_extended_cap & cap_query;
+
+ status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
+ &hv_extended_cap);
+ hv_extended_cap_queried = true;
+ status &= HV_HYPERCALL_RESULT_MASK;
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
+ status);
+ return false;
+ }
+
+ return hv_extended_cap & cap_query;
+}
+EXPORT_SYMBOL_GPL(hv_query_ext_cap);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index cebed535ec56..3546d3e21787 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -265,12 +265,13 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
- ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);

- pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+ pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ ms_hyperv.misc_features);

ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -316,7 +317,7 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}

- if (ms_hyperv.features_b & HV_ISOLATION) {
+ if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);

diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 79e5356a737a..66c794d92391 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -23,6 +23,7 @@ config HYPERV_UTILS
config HYPERV_BALLOON
tristate "Microsoft Hyper-V Balloon driver"
depends on HYPERV
+ select PAGE_REPORTING
help
Select this option to enable Hyper-V Balloon driver.

diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 2f776d78e3c1..58af84e30144 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -21,6 +21,7 @@
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/percpu_counter.h>
+#include <linux/page_reporting.h>

#include <linux/hyperv.h>
#include <asm/hyperv-tlfs.h>
@@ -563,6 +564,8 @@ struct hv_dynmem_device {
* The negotiated version agreed by host.
*/
__u32 version;
+
+ struct page_reporting_dev_info pr_dev_info;
};

static struct hv_dynmem_device dm_device;
@@ -1568,6 +1571,89 @@ static void balloon_onchannelcallback(void *context)

}

+/* Hyper-V only supports reporting 2MB pages or higher */
+#define HV_MIN_PAGE_REPORTING_ORDER 9
+#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER)
+static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+ struct scatterlist *sgl, unsigned int nents)
+{
+ unsigned long flags;
+ struct hv_memory_hint *hint;
+ int i;
+ u64 status;
+ struct scatterlist *sg;
+
+ WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+ WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN);
+ local_irq_save(flags);
+ hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ if (!hint) {
+ local_irq_restore(flags);
+ return -ENOSPC;
+ }
+
+ hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
+ hint->reserved = 0;
+ for_each_sg(sgl, sg, nents, i) {
+ union hv_gpa_page_range *range;
+
+ range = &hint->ranges[i];
+ range->address_space = 0;
+ /* page reporting only reports 2MB pages or higher */
+ range->page.largepage = 1;
+ range->page.additional_pages =
+ (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1;
+ range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
+ range->base_large_pfn =
+ page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER;
+ }
+
+ status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
+ hint, NULL);
+ local_irq_restore(flags);
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ pr_err("Cold memory discard hypercall failed with status %llx\n",
+ status);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void enable_page_reporting(void)
+{
+ int ret;
+
+ /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */
+ if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) {
+ pr_debug("Cold memory discard is only supported on 2MB pages and above\n");
+ return;
+ }
+
+ if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
+ pr_debug("Cold memory discard hint not supported by Hyper-V\n");
+ return;
+ }
+
+ BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
+ dm_device.pr_dev_info.report = hv_free_page_report;
+ ret = page_reporting_register(&dm_device.pr_dev_info);
+ if (ret < 0) {
+ dm_device.pr_dev_info.report = NULL;
+ pr_err("Failed to enable cold memory discard: %d\n", ret);
+ } else {
+ pr_info("Cold memory discard hint enabled\n");
+ }
+}
+
+static void disable_page_reporting(void)
+{
+ if (dm_device.pr_dev_info.report) {
+ page_reporting_unregister(&dm_device.pr_dev_info);
+ dm_device.pr_dev_info.report = NULL;
+ }
+}
+
static int balloon_connect_vsp(struct hv_device *dev)
{
struct dm_version_request version_req;
@@ -1713,6 +1799,7 @@ static int balloon_probe(struct hv_device *dev,
if (ret != 0)
return ret;

+ enable_page_reporting();
dm_device.state = DM_INITIALIZED;

dm_device.thread =
@@ -1727,6 +1814,7 @@ static int balloon_probe(struct hv_device *dev,
probe_error:
dm_device.state = DM_INIT_ERROR;
dm_device.thread = NULL;
+ disable_page_reporting();
vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
@@ -1749,6 +1837,7 @@ static int balloon_remove(struct hv_device *dev)
cancel_work_sync(&dm->ha_wrk.wrk);

kthread_stop(dm->thread);
+ disable_page_reporting();
vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 9cf10837d005..515c3fb06ab3 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -89,9 +89,9 @@
#define HV_ACCESS_STATS BIT(8)
#define HV_DEBUGGING BIT(11)
#define HV_CPU_MANAGEMENT BIT(12)
+#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20)
#define HV_ISOLATION BIT(22)

-
/*
* TSC page layout.
*/
@@ -159,11 +159,18 @@ struct ms_hyperv_tsc_page {
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0

+/* Extended hypercalls */
+#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001
+#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003
+
#define HV_FLUSH_ALL_PROCESSORS BIT(0)
#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)

+/* Extended capability bits */
+#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
+
enum HV_GENERIC_SET_FORMAT {
HV_GENERIC_SET_SPARSE_4K,
HV_GENERIC_SET_ALL,
@@ -408,8 +415,10 @@ struct hv_guest_mapping_flush {
* by the bitwidth of "additional_pages" in union hv_gpa_page_range.
*/
#define HV_MAX_FLUSH_PAGES (2048)
+#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB 0
+#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB 1

-/* HvFlushGuestPhysicalAddressList hypercall */
+/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */
union hv_gpa_page_range {
u64 address_space;
struct {
@@ -417,6 +426,12 @@ union hv_gpa_page_range {
u64 largepage:1;
u64 basepfn:52;
} page;
+ struct {
+ u64 reserved:12;
+ u64 page_size:1;
+ u64 reserved1:8;
+ u64 base_large_pfn:43;
+ };
};

/*
@@ -774,4 +789,20 @@ struct hv_input_unmap_device_interrupt {
#define HV_SOURCE_SHADOW_NONE 0x0
#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1

+/*
+ * The whole argument should fit in a page to be able to pass to the hypervisor
+ * in one hypercall.
+ */
+#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \
+ sizeof(union hv_gpa_page_range))
+
+/* HvExtCallMemoryHeatHint hypercall */
+#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2
+struct hv_memory_hint {
+ u64 type:2;
+ u64 reserved:62;
+ union hv_gpa_page_range ranges[];
+} __packed;
+
#endif
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 2d1b6cd9f000..63c0e579bf6d 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -27,7 +27,7 @@

struct ms_hyperv_info {
u32 features;
- u32 features_b;
+ u32 priv_high;
u32 misc_features;
u32 hints;
u32 nested_features;
@@ -179,6 +179,7 @@ bool hv_is_hibernation_supported(void);
enum hv_isolation_type hv_get_isolation_type(void);
bool hv_is_isolation_supported(void);
void hyperv_cleanup(void);
+bool hv_query_ext_cap(u64 cap_query);
#else /* CONFIG_HYPERV */
static inline bool hv_is_hyperv_initialized(void) { return false; }
static inline bool hv_is_hibernation_supported(void) { return false; }
--
2.17.1


2021-03-19 19:25:14

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [PATCH v4] x86/Hyper-V: Support for free page reporting

From: Sunil Muthuswamy <[email protected]> Sent: Thursday, March 18, 2021 1:14 PM
>
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
>
> Signed-off-by: Sunil Muthuswamy <[email protected]>
> Tested-by: Matheus Castello <[email protected]>
> ---
> In V2:
> - Addressed feedback comments
> - Added page reporting config option tied to hyper-v balloon config
>
> In V3:
> - Addressed feedback from Vitaly
>
> In V4:
> - Queried and cached the Hyper-V extended capability for the lifetime
> of the VM
> - Addressed feedback from Michael Kelley.
> ---
> arch/x86/hyperv/hv_init.c | 45 +++++++++++++++-
> arch/x86/kernel/cpu/mshyperv.c | 9 ++--
> drivers/hv/Kconfig | 1 +
> drivers/hv/hv_balloon.c | 89 +++++++++++++++++++++++++++++++
> include/asm-generic/hyperv-tlfs.h | 35 +++++++++++-
> include/asm-generic/mshyperv.h | 3 +-
> 6 files changed, 174 insertions(+), 8 deletions(-)
>

[snip]

> +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
> +bool hv_query_ext_cap(u64 cap_query)
> +{
> + /*
> + * The address of the 'hv_extended_cap' variable will be used as an
> + * output parameter to the hypercall below and so it should be
> + * compatible with 'virt_to_phys'. Which means, it's address should be
> + * directly mapped. Use 'static' to keep it compatible; stack variables
> + * can be virtually mapped, making them imcompatible with
> + * 'virt_to_phys'.
> + * Hypercall input/output addresses should also be 8-byte aligned.
> + */
> + static u64 hv_extended_cap __aligned(8);
> + static bool hv_extended_cap_queried;
> + u64 status;
> +
> + /*
> + * Querying extended capabilities is an extended hypercall. Check if the
> + * partition supports extended hypercall, first.
> + */
> + if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
> + return false;
> +
> + /* Extended capabilities do not change at runtime. */
> + if (hv_extended_cap_queried)
> + return hv_extended_cap & cap_query;
> +
> + status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
> + &hv_extended_cap);
> + hv_extended_cap_queried = true;

What's the strategy for this flag in the unlikely event that the hypercall fails?
It doesn't seem right to have hv_query_ext_cap() fail, but leave the
static flag set to true. Just move that line down to after the status check
has succeeded?

> + status &= HV_HYPERCALL_RESULT_MASK;
> + if (status != HV_STATUS_SUCCESS) {
> + pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
> + status);
> + return false;
> + }
> +
> + return hv_extended_cap & cap_query;
> +}
> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);

Other than the above about the flag when the hypercall fails,
everything else looks good.

Michael

2021-03-19 21:23:19

by Sunil Muthuswamy

[permalink] [raw]
Subject: RE: [PATCH v4] x86/Hyper-V: Support for free page reporting

> What's the strategy for this flag in the unlikely event that the hypercall fails?
> It doesn't seem right to have hv_query_ext_cap() fail, but leave the
> static flag set to true. Just move that line down to after the status check
> has succeeded?

That call should not fail in any normal circumstances. The current idea was to
avoid repeating the same call on persistent failure. But, since we don't expect
the query capability to be called in any kind of hot path, I am ok moving this
down.

>
> Other than the above about the flag when the hypercall fails,
> everything else looks good.
Thanks for the review.

2021-03-19 21:34:25

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [PATCH v4] x86/Hyper-V: Support for free page reporting

From: Sunil Muthuswamy <[email protected]> Sent: Friday, March 19, 2021 2:21 PM
>
> > What's the strategy for this flag in the unlikely event that the hypercall fails?
> > It doesn't seem right to have hv_query_ext_cap() fail, but leave the
> > static flag set to true. Just move that line down to after the status check
> > has succeeded?
>
> That call should not fail in any normal circumstances. The current idea was to
> avoid repeating the same call on persistent failure.

OK, I can see that as a valid strategy. And the assumption is that a failed
hypercall would leave hv_extended_cap unmodified and hence all zeros.

I'm OK with this approach if you want to keep it. But perhaps add a short
comment about the intent so it doesn't look like a bug. :-)

Michael

> But, since we don't expect
> the query capability to be called in any kind of hot path, I am ok moving this
> down.
>
> >
> > Other than the above about the flag when the hypercall fails,
> > everything else looks good.
> Thanks for the review.

2021-03-22 11:20:06

by Wei Liu

[permalink] [raw]
Subject: Re: [PATCH v4] x86/Hyper-V: Support for free page reporting

On Fri, Mar 19, 2021 at 09:30:50PM +0000, Michael Kelley wrote:
> From: Sunil Muthuswamy <[email protected]> Sent: Friday, March 19, 2021 2:21 PM
> >
> > > What's the strategy for this flag in the unlikely event that the hypercall fails?
> > > It doesn't seem right to have hv_query_ext_cap() fail, but leave the
> > > static flag set to true. Just move that line down to after the status check
> > > has succeeded?
> >
> > That call should not fail in any normal circumstances. The current idea was to
> > avoid repeating the same call on persistent failure.
>
> OK, I can see that as a valid strategy. And the assumption is that a failed
> hypercall would leave hv_extended_cap unmodified and hence all zeros.
>
> I'm OK with this approach if you want to keep it. But perhaps add a short
> comment about the intent so it doesn't look like a bug. :-)
>

Sunil, if you can send an updated version of your patch by either
providing a comment or moving the code around, I can queue it up for
hyperv-next.

I think adding a comment is perhaps the easier thing to do.

Wei.