2020-05-19 18:40:45

by Sunil Muthuswamy

[permalink] [raw]
Subject: [PATCH] x86/Hyper-V: Support for free page reporting

Linux has support for free page reporting now (36e66c554b5c) for
virtualized environment. On Hyper-V when virtually backed VMs are
configured, Hyper-V will advertise cold memory discard capability,
when supported. This patch adds the support to hook into the free
page reporting infrastructure and leverage the Hyper-V cold memory
discard hint hypercall to report/free these pages back to the host.

Signed-off-by: Sunil Muthuswamy <[email protected]>
---
First patch mail bounced backed. Sending it again with the email
addresses fixed.
---
arch/x86/hyperv/hv_init.c | 24 ++++++++
arch/x86/kernel/cpu/mshyperv.c | 6 +-
drivers/hv/hv_balloon.c | 93 +++++++++++++++++++++++++++++++
include/asm-generic/hyperv-tlfs.h | 29 ++++++++++
include/asm-generic/mshyperv.h | 2 +
5 files changed, 152 insertions(+), 2 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 624f5d9b0f79..925e2f7eb82c 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -506,3 +506,27 @@ bool hv_is_hibernation_supported(void)
return acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
+
+u64 hv_query_ext_cap(void)
+{
+ u64 *cap;
+ unsigned long flags;
+ u64 ext_cap = 0;
+
+ /*
+ * Querying extended capabilities is an extended hypercall. Check if the
+ * partition supports extended hypercall, first.
+ */
+ if (!(ms_hyperv.b_features & HV_ENABLE_EXTENDED_HYPERCALLS))
+ return 0;
+
+ local_irq_save(flags);
+ cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
+ HV_STATUS_SUCCESS)
+ ext_cap = *cap;
+
+ local_irq_restore(flags);
+ return ext_cap;
+}
+EXPORT_SYMBOL_GPL(hv_query_ext_cap);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index ebf34c7bc8bc..2de3f692c8bf 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -224,11 +224,13 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.b_features = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);

- pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+ pr_info("Hyper-V: features 0x%x, additional features: 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.b_features, ms_hyperv.hints,
+ ms_hyperv.misc_features);

ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 32e3bc0aa665..77be31094556 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -21,6 +21,7 @@
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/percpu_counter.h>
+#include <linux/page_reporting.h>

#include <linux/hyperv.h>
#include <asm/hyperv-tlfs.h>
@@ -563,6 +564,10 @@ struct hv_dynmem_device {
* The negotiated version agreed by host.
*/
__u32 version;
+
+#ifdef CONFIG_PAGE_REPORTING
+ struct page_reporting_dev_info pr_dev_info;
+#endif
};

static struct hv_dynmem_device dm_device;
@@ -1565,6 +1570,83 @@ static void balloon_onchannelcallback(void *context)

}

+#ifdef CONFIG_PAGE_REPORTING
+static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
+ struct scatterlist *sgl, unsigned int nents)
+{
+ unsigned long flags;
+ struct hv_memory_hint *hint;
+ int i;
+ u64 status;
+ struct scatterlist *sg;
+
+ WARN_ON(nents > HV_MAX_GPA_PAGE_RANGES);
+ local_irq_save(flags);
+ hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ if (!hint) {
+ local_irq_restore(flags);
+ return -ENOSPC;
+ }
+
+ hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
+ hint->reserved = 0;
+ for (i = 0, sg = sgl; sg; sg = sg_next(sg), i++) {
+ int order;
+ union hv_gpa_page_range *range;
+
+ order = get_order(sg->length);
+ range = &hint->ranges[i];
+ range->address_space = 0;
+ range->page.largepage = 1;
+ range->page.additional_pages = (1ull << (order - 9)) - 1;
+ range->base_large_pfn = page_to_pfn(sg_page(sg)) >> 9;
+ }
+
+ status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
+ hint, NULL);
+ local_irq_restore(flags);
+ status &= HV_HYPERCALL_RESULT_MASK;
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("Cold memory discard hypercall failed with status %llx\n",
+ status);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int enable_page_reporting(void)
+{
+ int ret;
+
+ if (!(hv_query_ext_cap() &
+ HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
+ pr_info("Cold memory discard hint not supported by Hyper-V\n");
+ return 0;
+ }
+
+ BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MAX_GPA_PAGE_RANGES);
+ dm_device.pr_dev_info.report = hv_free_page_report;
+ ret = page_reporting_register(&dm_device.pr_dev_info);
+ if (ret < 0) {
+ dm_device.pr_dev_info.report = NULL;
+ pr_err("Failed to enable cold memory discard: %d\n", ret);
+ } else {
+ pr_info("Cold memory discard hint enabled\n");
+ }
+
+ return ret;
+}
+
+static void disable_page_reporting(void)
+{
+ if (dm_device.pr_dev_info.report) {
+ page_reporting_unregister(&dm_device.pr_dev_info);
+ dm_device.pr_dev_info.report = NULL;
+ }
+}
+#endif //CONFIG_PAGE_REPORTING
+
static int balloon_connect_vsp(struct hv_device *dev)
{
struct dm_version_request version_req;
@@ -1710,6 +1792,11 @@ static int balloon_probe(struct hv_device *dev,
if (ret != 0)
return ret;

+#ifdef CONFIG_PAGE_REPORTING
+ if (enable_page_reporting() < 0)
+ goto probe_error;
+#endif
+
dm_device.state = DM_INITIALIZED;

dm_device.thread =
@@ -1728,6 +1815,9 @@ static int balloon_probe(struct hv_device *dev,
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
restore_online_page_callback(&hv_online_page);
+#endif
+#ifdef CONFIG_PAGE_REPORTING
+ disable_page_reporting();
#endif
return ret;
}
@@ -1750,6 +1840,9 @@ static int balloon_remove(struct hv_device *dev)
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
restore_online_page_callback(&hv_online_page);
+#endif
+#ifdef CONFIG_PAGE_REPORTING
+ disable_page_reporting();
#endif
spin_lock_irqsave(&dm_device.ha_lock, flags);
list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index 262fae9526b1..75aeea0e7f9b 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -89,6 +89,7 @@
#define HV_ACCESS_STATS BIT(8)
#define HV_DEBUGGING BIT(11)
#define HV_CPU_POWER_MANAGEMENT BIT(12)
+#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20)


/*
@@ -149,11 +150,18 @@ struct ms_hyperv_tsc_page {
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0

+/* Extended hypercalls */
+#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001
+#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003
+
#define HV_FLUSH_ALL_PROCESSORS BIT(0)
#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)

+/* Extended capability bits */
+#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
+
enum HV_GENERIC_SET_FORMAT {
HV_GENERIC_SET_SPARSE_4K,
HV_GENERIC_SET_ALL,
@@ -371,6 +379,12 @@ union hv_gpa_page_range {
u64 largepage:1;
u64 basepfn:52;
} page;
+ struct {
+ u64:12;
+ u64 page_size:1;
+ u64 reserved:8;
+ u64 base_large_pfn:43;
+ };
};

/*
@@ -490,4 +504,19 @@ struct hv_set_vp_registers_input {
} element[];
} __packed;

+/*
+ * The whole argument should fit in a page to be able to pass to the hypervisor
+ * in one hypercall.
+ */
+#define HV_MAX_GPA_PAGE_RANGES ((PAGE_SIZE - sizeof(u64)) / \
+ sizeof(union hv_gpa_page_range))
+
+/* HvExtCallMemoryHeatHint hypercall */
+#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2
+struct hv_memory_hint {
+ u64 type:2;
+ u64 reserved:62;
+ union hv_gpa_page_range ranges[1];
+};
+
#endif
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 1c4fd950f091..c664af4a7503 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -27,6 +27,7 @@

struct ms_hyperv_info {
u32 features;
+ u32 b_features;
u32 misc_features;
u32 hints;
u32 nested_features;
@@ -169,6 +170,7 @@ bool hv_is_hyperv_initialized(void);
bool hv_is_hibernation_supported(void);
void hyperv_cleanup(void);
void hv_setup_sched_clock(void *sched_clock);
+u64 hv_query_ext_cap(void);
#else /* CONFIG_HYPERV */
static inline bool hv_is_hyperv_initialized(void) { return false; }
static inline bool hv_is_hibernation_supported(void) { return false; }
--
2.17.1


2020-05-20 09:02:30

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH] x86/Hyper-V: Support for free page reporting

Sunil Muthuswamy <[email protected]> writes:

> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
>
> Signed-off-by: Sunil Muthuswamy <[email protected]>
> ---
> First patch mail bounced backed. Sending it again with the email
> addresses fixed.
> ---
> arch/x86/hyperv/hv_init.c | 24 ++++++++
> arch/x86/kernel/cpu/mshyperv.c | 6 +-
> drivers/hv/hv_balloon.c | 93 +++++++++++++++++++++++++++++++
> include/asm-generic/hyperv-tlfs.h | 29 ++++++++++
> include/asm-generic/mshyperv.h | 2 +
> 5 files changed, 152 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 624f5d9b0f79..925e2f7eb82c 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -506,3 +506,27 @@ bool hv_is_hibernation_supported(void)
> return acpi_sleep_state_supported(ACPI_STATE_S4);
> }
> EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
> +
> +u64 hv_query_ext_cap(void)
> +{

As the only usage of this function looks like

if (!(hv_query_ext_cap() & HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT))

I would've change the interface to

bool hv_query_ext_cap(u64 cap)

so the usage would look like

if (!(hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT))
...

> + u64 *cap;
> + unsigned long flags;
> + u64 ext_cap = 0;
> +
> + /*
> + * Querying extended capabilities is an extended hypercall. Check if the
> + * partition supports extended hypercall, first.
> + */
> + if (!(ms_hyperv.b_features & HV_ENABLE_EXTENDED_HYPERCALLS))
> + return 0;
> +
> + local_irq_save(flags);
> + cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> + HV_STATUS_SUCCESS)
> + ext_cap = *cap;
> +
> + local_irq_restore(flags);
> + return ext_cap;
> +}
> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index ebf34c7bc8bc..2de3f692c8bf 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -224,11 +224,13 @@ static void __init ms_hyperv_init_platform(void)
> * Extract the features and hints
> */
> ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> + ms_hyperv.b_features = cpuid_ebx(HYPERV_CPUID_FEATURES);
> ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
> ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
>
> - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> + pr_info("Hyper-V: features 0x%x, additional features: 0x%x, hints 0x%x, misc 0x%x\n",
> + ms_hyperv.features, ms_hyperv.b_features, ms_hyperv.hints,
> + ms_hyperv.misc_features);

HYPERV_CPUID_FEATURES(0x40000003) EAX and EBX correspond to Partition
Privilege Flags (TLFS), I'd suggest to take the opportunity and rename
this to something like 'privilege flags low=0x%x high=0x%x'.

Also, I don't quite like 'ms_hyperv.b_features' as I'll always have to
look at what it's being assigned to understand what it holds. I'd even
suggest to rename ms_hyperv.features to ms_hyperv.priv_low and
ms_hyperv.b_features tp ms_hyperv.priv_high. Or maybe even better, pack
them to the same 'u64 ms_hyperv.privileges'.

>
> ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
> ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 32e3bc0aa665..77be31094556 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -21,6 +21,7 @@
> #include <linux/memory.h>
> #include <linux/notifier.h>
> #include <linux/percpu_counter.h>
> +#include <linux/page_reporting.h>
>
> #include <linux/hyperv.h>
> #include <asm/hyperv-tlfs.h>
> @@ -563,6 +564,10 @@ struct hv_dynmem_device {
> * The negotiated version agreed by host.
> */
> __u32 version;
> +
> +#ifdef CONFIG_PAGE_REPORTING
> + struct page_reporting_dev_info pr_dev_info;
> +#endif
> };
>
> static struct hv_dynmem_device dm_device;
> @@ -1565,6 +1570,83 @@ static void balloon_onchannelcallback(void *context)
>
> }
>
> +#ifdef CONFIG_PAGE_REPORTING
> +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> + struct scatterlist *sgl, unsigned int nents)
> +{
> + unsigned long flags;
> + struct hv_memory_hint *hint;
> + int i;
> + u64 status;
> + struct scatterlist *sg;
> +
> + WARN_ON(nents > HV_MAX_GPA_PAGE_RANGES);
> + local_irq_save(flags);
> + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (!hint) {
> + local_irq_restore(flags);
> + return -ENOSPC;
> + }
> +
> + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> + hint->reserved = 0;
> + for (i = 0, sg = sgl; sg; sg = sg_next(sg), i++) {
> + int order;
> + union hv_gpa_page_range *range;
> +
> + order = get_order(sg->length);
> + range = &hint->ranges[i];
> + range->address_space = 0;
> + range->page.largepage = 1;

Why is largepage always '1'?

> + range->page.additional_pages = (1ull << (order - 9)) - 1;
> + range->base_large_pfn = page_to_pfn(sg_page(sg)) >> 9;

What is '9'? Could you please define it through PAGE_*/HPAGE_* macro?

> + }
> +
> + status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
> + hint, NULL);
> + local_irq_restore(flags);
> + status &= HV_HYPERCALL_RESULT_MASK;
> + if (status != HV_STATUS_SUCCESS) {

Nit: you could've just used

if (status & HV_HYPERCALL_RESULT_MASK != HV_STATUS_SUCCESS) {
...

> + pr_err("Cold memory discard hypercall failed with status %llx\n",
> + status);
> + return -1;

-EFAULT or something like it maybe?

> + }
> +
> + return 0;
> +}
> +
> +static int enable_page_reporting(void)
> +{
> + int ret;
> +
> + if (!(hv_query_ext_cap() &
> + HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
> + pr_info("Cold memory discard hint not supported by Hyper-V\n");
> + return 0;
> + }
> +
> + BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MAX_GPA_PAGE_RANGES);
> + dm_device.pr_dev_info.report = hv_free_page_report;
> + ret = page_reporting_register(&dm_device.pr_dev_info);
> + if (ret < 0) {
> + dm_device.pr_dev_info.report = NULL;
> + pr_err("Failed to enable cold memory discard: %d\n", ret);
> + } else {
> + pr_info("Cold memory discard hint enabled\n");
> + }
> +
> + return ret;
> +}
> +
> +static void disable_page_reporting(void)
> +{
> + if (dm_device.pr_dev_info.report) {
> + page_reporting_unregister(&dm_device.pr_dev_info);
> + dm_device.pr_dev_info.report = NULL;
> + }
> +}
> +#endif //CONFIG_PAGE_REPORTING
> +
> static int balloon_connect_vsp(struct hv_device *dev)
> {
> struct dm_version_request version_req;
> @@ -1710,6 +1792,11 @@ static int balloon_probe(struct hv_device *dev,
> if (ret != 0)
> return ret;
>
> +#ifdef CONFIG_PAGE_REPORTING
> + if (enable_page_reporting() < 0)
> + goto probe_error;

Why? The hyperv-balloon driver itself may still be functional and you
already set dm_device.pr_dev_info.report to NULL.

> +#endif
> +
> dm_device.state = DM_INITIALIZED;
>
> dm_device.thread =
> @@ -1728,6 +1815,9 @@ static int balloon_probe(struct hv_device *dev,
> #ifdef CONFIG_MEMORY_HOTPLUG
> unregister_memory_notifier(&hv_memory_nb);
> restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> + disable_page_reporting();
> #endif
> return ret;
> }
> @@ -1750,6 +1840,9 @@ static int balloon_remove(struct hv_device *dev)
> #ifdef CONFIG_MEMORY_HOTPLUG
> unregister_memory_notifier(&hv_memory_nb);
> restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> + disable_page_reporting();
> #endif
> spin_lock_irqsave(&dm_device.ha_lock, flags);
> list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
> index 262fae9526b1..75aeea0e7f9b 100644
> --- a/include/asm-generic/hyperv-tlfs.h
> +++ b/include/asm-generic/hyperv-tlfs.h
> @@ -89,6 +89,7 @@
> #define HV_ACCESS_STATS BIT(8)
> #define HV_DEBUGGING BIT(11)
> #define HV_CPU_POWER_MANAGEMENT BIT(12)
> +#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20)
>
>
> /*
> @@ -149,11 +150,18 @@ struct ms_hyperv_tsc_page {
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
>
> +/* Extended hypercalls */
> +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001
> +#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003
> +
> #define HV_FLUSH_ALL_PROCESSORS BIT(0)
> #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
> #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
> #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
>
> +/* Extended capability bits */
> +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
> +
> enum HV_GENERIC_SET_FORMAT {
> HV_GENERIC_SET_SPARSE_4K,
> HV_GENERIC_SET_ALL,
> @@ -371,6 +379,12 @@ union hv_gpa_page_range {

There is a comment before this structure:

/* HvFlushGuestPhysicalAddressList hypercall */

which is now obsolete.

> u64 largepage:1;
> u64 basepfn:52;
> } page;
> + struct {
> + u64:12;

What is this unnamed member? Another 'reserved', 'pad'? Let's name it.

> + u64 page_size:1;
> + u64 reserved:8;
> + u64 base_large_pfn:43;
> + };

Please name this structure in the union.

> };
>
> /*
> @@ -490,4 +504,19 @@ struct hv_set_vp_registers_input {
> } element[];
> } __packed;
>
> +/*
> + * The whole argument should fit in a page to be able to pass to the hypervisor
> + * in one hypercall.
> + */
> +#define HV_MAX_GPA_PAGE_RANGES ((PAGE_SIZE - sizeof(u64)) / \
> + sizeof(union hv_gpa_page_range))
> +

The name HV_MAX_GPA_PAGE_RANGES sounds too generic and I think this is
specific to the HvExtCallMemoryHeatHint hypercall as other hypercalls
may have a different header length.

> +/* HvExtCallMemoryHeatHint hypercall */
> +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2
> +struct hv_memory_hint {
> + u64 type:2;
> + u64 reserved:62;
> + union hv_gpa_page_range ranges[1];

Why '[1]' and not '[]'? If it was '[]' you could've used 'sizeof(struct
hv_memory_hint)' in HV_MAX_GPA_PAGE_RANGES macro definition instead of
'sizeof(u64)'.

> +};
> +
> #endif
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index 1c4fd950f091..c664af4a7503 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -27,6 +27,7 @@
>
> struct ms_hyperv_info {
> u32 features;
> + u32 b_features;
> u32 misc_features;
> u32 hints;
> u32 nested_features;
> @@ -169,6 +170,7 @@ bool hv_is_hyperv_initialized(void);
> bool hv_is_hibernation_supported(void);
> void hyperv_cleanup(void);
> void hv_setup_sched_clock(void *sched_clock);
> +u64 hv_query_ext_cap(void);
> #else /* CONFIG_HYPERV */
> static inline bool hv_is_hyperv_initialized(void) { return false; }
> static inline bool hv_is_hibernation_supported(void) { return false; }

--
Vitaly

2020-05-20 09:04:38

by Wei Liu

[permalink] [raw]
Subject: Re: [PATCH] x86/Hyper-V: Support for free page reporting

On Tue, May 19, 2020 at 06:37:57PM +0000, Sunil Muthuswamy wrote:
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
>
> Signed-off-by: Sunil Muthuswamy <[email protected]>
> ---
> First patch mail bounced backed. Sending it again with the email
> addresses fixed.
> ---
> arch/x86/hyperv/hv_init.c | 24 ++++++++
> arch/x86/kernel/cpu/mshyperv.c | 6 +-
> drivers/hv/hv_balloon.c | 93 +++++++++++++++++++++++++++++++
> include/asm-generic/hyperv-tlfs.h | 29 ++++++++++
> include/asm-generic/mshyperv.h | 2 +
> 5 files changed, 152 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 624f5d9b0f79..925e2f7eb82c 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -506,3 +506,27 @@ bool hv_is_hibernation_supported(void)
> return acpi_sleep_state_supported(ACPI_STATE_S4);
> }
> EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
> +
> +u64 hv_query_ext_cap(void)
> +{
> + u64 *cap;
> + unsigned long flags;
> + u64 ext_cap = 0;
> +
> + /*
> + * Querying extended capabilities is an extended hypercall. Check if the
> + * partition supports extended hypercall, first.
> + */
> + if (!(ms_hyperv.b_features & HV_ENABLE_EXTENDED_HYPERCALLS))
> + return 0;
> +
> + local_irq_save(flags);
> + cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);

The cast here is not strictly needed.

> + if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> + HV_STATUS_SUCCESS)

You're using the input page as the output parameter. Ideally we should
introduce hyperv_pcpu_output_arg page, but that would waste one page per
cpu just for this one call.

So for now I think this setup is fine, but I would like to add the
following comment.

/*
* Repurpose the input_arg page to accept output from Hyper-V for
* now because this is the only call that needs output from the
* hypervisor. It should be fixed properly by introducing an
* output_arg page once we have more places that require output.
*/

> + ext_cap = *cap;
> +
> + local_irq_restore(flags);
> + return ext_cap;
> +}
> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index ebf34c7bc8bc..2de3f692c8bf 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -224,11 +224,13 @@ static void __init ms_hyperv_init_platform(void)
> * Extract the features and hints
> */
> ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> + ms_hyperv.b_features = cpuid_ebx(HYPERV_CPUID_FEATURES);
> ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
> ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
>
> - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> + pr_info("Hyper-V: features 0x%x, additional features: 0x%x, hints 0x%x, misc 0x%x\n",
> + ms_hyperv.features, ms_hyperv.b_features, ms_hyperv.hints,
> + ms_hyperv.misc_features);
>
> ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
> ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 32e3bc0aa665..77be31094556 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -21,6 +21,7 @@
> #include <linux/memory.h>
> #include <linux/notifier.h>
> #include <linux/percpu_counter.h>
> +#include <linux/page_reporting.h>
>
> #include <linux/hyperv.h>
> #include <asm/hyperv-tlfs.h>
> @@ -563,6 +564,10 @@ struct hv_dynmem_device {
> * The negotiated version agreed by host.
> */
> __u32 version;
> +
> +#ifdef CONFIG_PAGE_REPORTING
> + struct page_reporting_dev_info pr_dev_info;
> +#endif
> };
>
> static struct hv_dynmem_device dm_device;
> @@ -1565,6 +1570,83 @@ static void balloon_onchannelcallback(void *context)
>
> }
>
> +#ifdef CONFIG_PAGE_REPORTING
> +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> + struct scatterlist *sgl, unsigned int nents)
> +{
> + unsigned long flags;
> + struct hv_memory_hint *hint;
> + int i;
> + u64 status;
> + struct scatterlist *sg;
> +
> + WARN_ON(nents > HV_MAX_GPA_PAGE_RANGES);

Should we return -ENOSPC here?

> + local_irq_save(flags);
> + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (!hint) {
> + local_irq_restore(flags);
> + return -ENOSPC;
> + }
> +
> + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> + hint->reserved = 0;
> + for (i = 0, sg = sgl; sg; sg = sg_next(sg), i++) {
> + int order;
> + union hv_gpa_page_range *range;
> +

Unfortunately I can't find the semantics of this hypercall in TLFS 6, so
I have a few questions here.

> + order = get_order(sg->length);
> + range = &hint->ranges[i];
> + range->address_space = 0;

I guess this means all address spaces?

> + range->page.largepage = 1;

What effect does this have? What if the page is a 4k page?

> + range->page.additional_pages = (1ull << (order - 9)) - 1;

What is 9 here? Is there a macro name *ORDER that you can use?

Wei.

2020-05-20 09:09:29

by Wei Liu

[permalink] [raw]
Subject: Re: [PATCH] x86/Hyper-V: Support for free page reporting

On Wed, May 20, 2020 at 10:59:22AM +0200, Vitaly Kuznetsov wrote:
> Sunil Muthuswamy <[email protected]> writes:
[...]
> > +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> > diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> > index ebf34c7bc8bc..2de3f692c8bf 100644
> > --- a/arch/x86/kernel/cpu/mshyperv.c
> > +++ b/arch/x86/kernel/cpu/mshyperv.c
> > @@ -224,11 +224,13 @@ static void __init ms_hyperv_init_platform(void)
> > * Extract the features and hints
> > */
> > ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> > + ms_hyperv.b_features = cpuid_ebx(HYPERV_CPUID_FEATURES);
> > ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
> > ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
> >
> > - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> > - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> > + pr_info("Hyper-V: features 0x%x, additional features: 0x%x, hints 0x%x, misc 0x%x\n",
> > + ms_hyperv.features, ms_hyperv.b_features, ms_hyperv.hints,
> > + ms_hyperv.misc_features);
>
> HYPERV_CPUID_FEATURES(0x40000003) EAX and EBX correspond to Partition
> Privilege Flags (TLFS), I'd suggest to take the opportunity and rename
> this to something like 'privilege flags low=0x%x high=0x%x'.
>
> Also, I don't quite like 'ms_hyperv.b_features' as I'll always have to
> look at what it's being assigned to understand what it holds. I'd even
> suggest to rename ms_hyperv.features to ms_hyperv.priv_low and
> ms_hyperv.b_features tp ms_hyperv.priv_high. Or maybe even better, pack
> them to the same 'u64 ms_hyperv.privileges'.

+1 for this. :-)

Wei.

2020-05-20 22:36:02

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH] x86/Hyper-V: Support for free page reporting

Hi Sunil,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on next-20200519]
[cannot apply to tip/auto-latest linus/master tip/x86/core asm-generic/master linux/master v5.7-rc6 v5.7-rc5 v5.7-rc4 v5.7-rc6]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url: https://github.com/0day-ci/linux/commits/Sunil-Muthuswamy/x86-Hyper-V-Support-for-free-page-reporting/20200520-024140
base: fb57b1fabcb28f358901b2df90abd2b48abc1ca8
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.1-193-gb8fad4bc-dirty
# save the attached .config to linux build tree
make C=1 ARCH=x86_64 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <[email protected]>


sparse warnings: (new ones prefixed by >>)

>> drivers/hv/hv_balloon.c:1585:43: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected void const [noderef] <asn:3> *__vpp_verify @@ got const [noderef] <asn:3> *__vpp_verify @@
>> drivers/hv/hv_balloon.c:1585:43: sparse: expected void const [noderef] <asn:3> *__vpp_verify
>> drivers/hv/hv_balloon.c:1585:43: sparse: got void [noderef] <asn:3> **

vim +1585 drivers/hv/hv_balloon.c

1572
1573 #ifdef CONFIG_PAGE_REPORTING
1574 static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
1575 struct scatterlist *sgl, unsigned int nents)
1576 {
1577 unsigned long flags;
1578 struct hv_memory_hint *hint;
1579 int i;
1580 u64 status;
1581 struct scatterlist *sg;
1582
1583 WARN_ON(nents > HV_MAX_GPA_PAGE_RANGES);
1584 local_irq_save(flags);
> 1585 hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
1586 if (!hint) {
1587 local_irq_restore(flags);
1588 return -ENOSPC;
1589 }
1590
1591 hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
1592 hint->reserved = 0;
1593 for (i = 0, sg = sgl; sg; sg = sg_next(sg), i++) {
1594 int order;
1595 union hv_gpa_page_range *range;
1596
1597 order = get_order(sg->length);
1598 range = &hint->ranges[i];
1599 range->address_space = 0;
1600 range->page.largepage = 1;
1601 range->page.additional_pages = (1ull << (order - 9)) - 1;
1602 range->base_large_pfn = page_to_pfn(sg_page(sg)) >> 9;
1603 }
1604
1605 status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
1606 hint, NULL);
1607 local_irq_restore(flags);
1608 status &= HV_HYPERCALL_RESULT_MASK;
1609 if (status != HV_STATUS_SUCCESS) {
1610 pr_err("Cold memory discard hypercall failed with status %llx\n",
1611 status);
1612 return -1;
1613 }
1614
1615 return 0;
1616 }
1617

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]


Attachments:
(No filename) (3.18 kB)
.config.gz (73.18 kB)
Download all attachments

2020-05-22 16:43:04

by Sunil Muthuswamy

[permalink] [raw]
Subject: RE: [EXTERNAL] Re: [PATCH] x86/Hyper-V: Support for free page reporting

> > + if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> > + HV_STATUS_SUCCESS)
>
> You're using the input page as the output parameter. Ideally we should
> introduce hyperv_pcpu_output_arg page, but that would waste one page per
> cpu just for this one call.
>
> So for now I think this setup is fine, but I would like to add the
> following comment.
>
> /*
> * Repurpose the input_arg page to accept output from Hyper-V for
> * now because this is the only call that needs output from the
> * hypervisor. It should be fixed properly by introducing an
> * output_arg page once we have more places that require output.
> */

Sounds good. Will add it in v2.

> > +#ifdef CONFIG_PAGE_REPORTING
> > +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> > + struct scatterlist *sgl, unsigned int nents)
> > +{
> > + unsigned long flags;
> > + struct hv_memory_hint *hint;
> > + int i;
> > + u64 status;
> > + struct scatterlist *sg;
> > +
> > + WARN_ON(nents > HV_MAX_GPA_PAGE_RANGES);
>
> Should we return -ENOSPC here?

This is more of an assert because PAGE_REPORTING_CAPACITY is set to 32 and has
already been checked that it is < HV_MAX_GPA_PAGE_RANGES in enable_page_reporting.

> > + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> > + hint->reserved = 0;
> > + for (i = 0, sg = sgl; sg; sg = sg_next(sg), i++) {
> > + int order;
> > + union hv_gpa_page_range *range;
> > +
>
> Unfortunately I can't find the semantics of this hypercall in TLFS 6, so
> I have a few questions here.

This structure is not specific to this hypercall.

>
> > + order = get_order(sg->length);
> > + range = &hint->ranges[i];
> > + range->address_space = 0;
>
> I guess this means all address spaces?

'address_space' is being used here just as a zero initializer for the union. I think the use
of 'address_space' in the definition of hv_gpa_page_range is not appropriate. This struct is
defined in the TLFS 6.0 with the same name, if you want to look it up.

>
> > + range->page.largepage = 1;
>
> What effect does this have? What if the page is a 4k page?

Page reporting infrastructure doesn't report 4k pages today, but only huge pages (see
PAGE_REPORTING_MIN_ORDER in page_reporting.h). Additionally, the Hyper-V hypervisor
only supports reporting of 2M pages and above. The current code assumes that the minimum
order will be 9 i.e 2M pages and above.
If we feel that this could change in the future, or an implementation detail that should be
protected against, I can add some checks in hv_balloon.c. But, in that case, the ballon driver
should be able to query the page reporting min order somehow, which it is currently, since it is
private.
Alexander, do you have any suggestions or feedback here?

>
> > + range->page.additional_pages = (1ull << (order - 9)) - 1;
>
> What is 9 here? Is there a macro name *ORDER that you can use?

It is to determine the count of 2M pages. I will define a macro.

>
> Wei.

2020-05-22 19:15:00

by Sunil Muthuswamy

[permalink] [raw]
Subject: RE: [EXTERNAL] Re: [PATCH] x86/Hyper-V: Support for free page reporting

> As the only usage of this function looks like
> if (!(hv_query_ext_cap() & HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT))
>
> I would've change the interface to
>
> bool hv_query_ext_cap(u64 cap)
>
> so the usage would look like
>
> if (!(hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT))

Good idea. Will do in v2.

> > ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> > + ms_hyperv.b_features = cpuid_ebx(HYPERV_CPUID_FEATURES);
> > ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
> > ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
> >
> > - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> > - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> > + pr_info("Hyper-V: features 0x%x, additional features: 0x%x, hints 0x%x, misc 0x%x\n",
> > + ms_hyperv.features, ms_hyperv.b_features, ms_hyperv.hints,
> > + ms_hyperv.misc_features);
>
> HYPERV_CPUID_FEATURES(0x40000003) EAX and EBX correspond to Partition
> Privilege Flags (TLFS), I'd suggest to take the opportunity and rename
> this to something like 'privilege flags low=0x%x high=0x%x'.
>
> Also, I don't quite like 'ms_hyperv.b_features' as I'll always have to
> look at what it's being assigned to understand what it holds. I'd even
> suggest to rename ms_hyperv.features to ms_hyperv.priv_low and
> ms_hyperv.b_features tp ms_hyperv.priv_high. Or maybe even better, pack
> them to the same 'u64 ms_hyperv.privileges'.

Good idea. I will make the change to rename this to 'priv_high' in v2. I like the idea of
combining 'features' & 'priv_high' to a u64, but that would be a cleanup change and a
separate patch.

>
> Why is largepage always '1'?
I have responded to a similar question by Wei. Page reporting only supports huge pages
and, so does the Hyper-V hypervisor. Let's follow this there.

>
> > + range->page.additional_pages = (1ull << (order - 9)) - 1;
> > + range->base_large_pfn = page_to_pfn(sg_page(sg)) >> 9;
>
> What is '9'? Could you please define it through PAGE_*/HPAGE_* macro?
Yes, I will define a macro. Essentially, it is to get a count of 2M pages.

> Nit: you could've just used
>
> if (status & HV_HYPERCALL_RESULT_MASK != HV_STATUS_SUCCESS) {
Sure, coming in v2.

> ...
>
> > + pr_err("Cold memory discard hypercall failed with status %llx\n",
> > + status);
> > + return -1;
>
> -EFAULT or something like it maybe?
Coming in v2.

> > +#ifdef CONFIG_PAGE_REPORTING
> > + if (enable_page_reporting() < 0)
> > + goto probe_error;
>
> Why? The hyperv-balloon driver itself may still be functional and you
> already set dm_device.pr_dev_info.report to NULL.
An error here would reflect an internal error and should not happen and
it was to make it easy to catch such errors, which are otherwise difficult
with just a print. But, the code should follow the general spirit. I will change
this in v2.

>
> > enum HV_GENERIC_SET_FORMAT {
> > HV_GENERIC_SET_SPARSE_4K,
> > HV_GENERIC_SET_ALL,
> > @@ -371,6 +379,12 @@ union hv_gpa_page_range {
>
> There is a comment before this structure:
>
> /* HvFlushGuestPhysicalAddressList hypercall */
>
> which is now obsolete.

I will add that it also applies to 'HvExtCallMemoryHeatHint' hypercall also.

>
> > u64 largepage:1;
> > u64 basepfn:52;
> > } page;
> > + struct {
> > + u64:12;
>
> What is this unnamed member? Another 'reserved', 'pad'? Let's name it.

Sure, coming in v2.

>
> > + u64 page_size:1;
> > + u64 reserved:8;
> > + u64 base_large_pfn:43;
> > + };
>
> Please name this structure in the union.

Sure, coming in v2.

> > + */
> > +#define HV_MAX_GPA_PAGE_RANGES ((PAGE_SIZE - sizeof(u64)) / \
> > + sizeof(union hv_gpa_page_range))
> > +
>
> The name HV_MAX_GPA_PAGE_RANGES sounds too generic and I think this is
> specific to the HvExtCallMemoryHeatHint hypercall as other hypercalls
> may have a different header length.
>

Good idea to rename. Coming in v2.

> > +/* HvExtCallMemoryHeatHint hypercall */
> > +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2
> > +struct hv_memory_hint {
> > + u64 type:2;
> > + u64 reserved:62;
> > + union hv_gpa_page_range ranges[1];
>
> Why '[1]' and not '[]'? If it was '[]' you could've used 'sizeof(struct
> hv_memory_hint)' in HV_MAX_GPA_PAGE_RANGES macro definition instead of
> 'sizeof(u64)'.
>
Good idea, coming in v2.

- Sunil

2020-05-22 20:14:11

by Alexander Duyck

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH] x86/Hyper-V: Support for free page reporting

On Fri, May 22, 2020 at 9:42 AM Sunil Muthuswamy <[email protected]> wrote:
>

[...]

> >
> > > + order = get_order(sg->length);
> > > + range = &hint->ranges[i];
> > > + range->address_space = 0;
> >
> > I guess this means all address spaces?
>
> 'address_space' is being used here just as a zero initializer for the union. I think the use
> of 'address_space' in the definition of hv_gpa_page_range is not appropriate. This struct is
> defined in the TLFS 6.0 with the same name, if you want to look it up.
>
> >
> > > + range->page.largepage = 1;
> >
> > What effect does this have? What if the page is a 4k page?
>
> Page reporting infrastructure doesn't report 4k pages today, but only huge pages (see
> PAGE_REPORTING_MIN_ORDER in page_reporting.h). Additionally, the Hyper-V hypervisor
> only supports reporting of 2M pages and above. The current code assumes that the minimum
> order will be 9 i.e 2M pages and above.
> If we feel that this could change in the future, or an implementation detail that should be
> protected against, I can add some checks in hv_balloon.c. But, in that case, the ballon driver
> should be able to query the page reporting min order somehow, which it is currently, since it is
> private.
> Alexander, do you have any suggestions or feedback here?

For now we are keeping the limit to order 9 for a couple reasons. The
first being that we don't want to trigger the breaking apart of
transparent huge pages on the host, and the second being for
performance as it is better to report larger pages rather than smaller
ones.

If we were to enable bringing the value down lower we would likely
make it a part of the page reporting dev info structure and would have
to be set at initialization time. That way the device itself could
configure the minimum value. I don't see the value itself being
lowered without adding an option like that since it would likely cause
issues for several different reasons going forward though. If nothing
else you could do a BUILD_BUG_ON that would assert if
PAGE_REPORTING_MIN_ORDER was anything other than the same size as the
"large_page" size referenced above.

> >
> > > + range->page.additional_pages = (1ull << (order - 9)) - 1;
> >
> > What is 9 here? Is there a macro name *ORDER that you can use?
>
> It is to determine the count of 2M pages. I will define a macro.

Is this the only spot where you are using order? Instead of converting
the length to an order why not just divide it by the 2M page size? I
would think that would be faster than having to do something like
having to call get_order on the length? Also instead of using "9" it
might make more sense if you have a define somewhere that says what
"large_page" size actually is. Then you could just divide by that
which should be translated into a shift which is fast and cheap.

2020-05-23 00:42:38

by Michael Kelley (LINUX)

[permalink] [raw]
Subject: RE: [EXTERNAL] Re: [PATCH] x86/Hyper-V: Support for free page reporting

From: Sunil Muthuswamy <[email protected]> Sent: Friday, May 22, 2020 9:40 AM
>
> > > + if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> > > + HV_STATUS_SUCCESS)
> >
> > You're using the input page as the output parameter. Ideally we should
> > introduce hyperv_pcpu_output_arg page, but that would waste one page per
> > cpu just for this one call.
> >
> > So for now I think this setup is fine, but I would like to add the
> > following comment.
> >
> > /*
> > * Repurpose the input_arg page to accept output from Hyper-V for
> > * now because this is the only call that needs output from the
> > * hypervisor. It should be fixed properly by introducing an
> > * output_arg page once we have more places that require output.
> > */
>
> Sounds good. Will add it in v2.
>

Note that the only real requirement for the output parameter to hypercalls
is that it not cross a page boundary. Since '*cap' is only 64-bits, you can
declare it as a static variable or even as a local on the stack. It will
naturally be aligned (or can add __aligned(8) to be explicit??), so it won't
cross a page boundary. Then you can skip using the per-cpu input arg
altogether, along with the associated local_irq_save()/restore().

Michael

2020-11-25 02:52:03

by Matheus Castello

[permalink] [raw]
Subject: Re: [PATCH] x86/Hyper-V: Support for free page reporting

Hi Sunil,

first thank you very much for your work here. I am using this patch in
the latest mainline releases with WSL 2 and it is helping me a lot. Let
me know if you're still interested in continuing to work on this patch.
It would be great to have it included in the mainline. If you need any
help let me know too. I'm including my test tag. Thanks!

Em 5/19/2020 3:37 PM, Sunil Muthuswamy escreveu:
> Linux has support for free page reporting now (36e66c554b5c) for
> virtualized environment. On Hyper-V when virtually backed VMs are
> configured, Hyper-V will advertise cold memory discard capability,
> when supported. This patch adds the support to hook into the free
> page reporting infrastructure and leverage the Hyper-V cold memory
> discard hint hypercall to report/free these pages back to the host.
>
> Signed-off-by: Sunil Muthuswamy <[email protected]>
> Reported-by: kbuild test robot <[email protected]>
> ---
> First patch mail bounced backed. Sending it again with the email
> addresses fixed.
> ---
> arch/x86/hyperv/hv_init.c | 24 ++++++++
> arch/x86/kernel/cpu/mshyperv.c | 6 +-
> drivers/hv/hv_balloon.c | 93 +++++++++++++++++++++++++++++++
> include/asm-generic/hyperv-tlfs.h | 29 ++++++++++
> include/asm-generic/mshyperv.h | 2 +
> 5 files changed, 152 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 624f5d9b0f79..925e2f7eb82c 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -506,3 +506,27 @@ bool hv_is_hibernation_supported(void)
> return acpi_sleep_state_supported(ACPI_STATE_S4);
> }
> EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
> +
> +u64 hv_query_ext_cap(void)
> +{
> + u64 *cap;
> + unsigned long flags;
> + u64 ext_cap = 0;
> +
> + /*
> + * Querying extended capabilities is an extended hypercall. Check if the
> + * partition supports extended hypercall, first.
> + */
> + if (!(ms_hyperv.b_features & HV_ENABLE_EXTENDED_HYPERCALLS))
> + return 0;
> +
> + local_irq_save(flags);
> + cap = *(u64 **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, cap) ==
> + HV_STATUS_SUCCESS)
> + ext_cap = *cap;
> +
> + local_irq_restore(flags);
> + return ext_cap;
> +}
> +EXPORT_SYMBOL_GPL(hv_query_ext_cap);
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index ebf34c7bc8bc..2de3f692c8bf 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -224,11 +224,13 @@ static void __init ms_hyperv_init_platform(void)
> * Extract the features and hints
> */
> ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
> + ms_hyperv.b_features = cpuid_ebx(HYPERV_CPUID_FEATURES);
> ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
> ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
>
> - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
> - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
> + pr_info("Hyper-V: features 0x%x, additional features: 0x%x, hints 0x%x, misc 0x%x\n",
> + ms_hyperv.features, ms_hyperv.b_features, ms_hyperv.hints,
> + ms_hyperv.misc_features);
>
> ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
> ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 32e3bc0aa665..77be31094556 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -21,6 +21,7 @@
> #include <linux/memory.h>
> #include <linux/notifier.h>
> #include <linux/percpu_counter.h>
> +#include <linux/page_reporting.h>
>
> #include <linux/hyperv.h>
> #include <asm/hyperv-tlfs.h>
> @@ -563,6 +564,10 @@ struct hv_dynmem_device {
> * The negotiated version agreed by host.
> */
> __u32 version;
> +
> +#ifdef CONFIG_PAGE_REPORTING
> + struct page_reporting_dev_info pr_dev_info;
> +#endif
> };
>
> static struct hv_dynmem_device dm_device;
> @@ -1565,6 +1570,83 @@ static void balloon_onchannelcallback(void *context)
>
> }
>
> +#ifdef CONFIG_PAGE_REPORTING
> +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
> + struct scatterlist *sgl, unsigned int nents)
> +{
> + unsigned long flags;
> + struct hv_memory_hint *hint;
> + int i;
> + u64 status;
> + struct scatterlist *sg;
> +
> + WARN_ON(nents > HV_MAX_GPA_PAGE_RANGES);
> + local_irq_save(flags);
> + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + if (!hint) {
> + local_irq_restore(flags);
> + return -ENOSPC;
> + }
> +
> + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
> + hint->reserved = 0;
> + for (i = 0, sg = sgl; sg; sg = sg_next(sg), i++) {
> + int order;
> + union hv_gpa_page_range *range;
> +
> + order = get_order(sg->length);
> + range = &hint->ranges[i];
> + range->address_space = 0;
> + range->page.largepage = 1;
> + range->page.additional_pages = (1ull << (order - 9)) - 1;
> + range->base_large_pfn = page_to_pfn(sg_page(sg)) >> 9;
> + }
> +
> + status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
> + hint, NULL);
> + local_irq_restore(flags);
> + status &= HV_HYPERCALL_RESULT_MASK;
> + if (status != HV_STATUS_SUCCESS) {
> + pr_err("Cold memory discard hypercall failed with status %llx\n",
> + status);
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int enable_page_reporting(void)
> +{
> + int ret;
> +
> + if (!(hv_query_ext_cap() &
> + HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
> + pr_info("Cold memory discard hint not supported by Hyper-V\n");
> + return 0;
> + }
> +
> + BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MAX_GPA_PAGE_RANGES);
> + dm_device.pr_dev_info.report = hv_free_page_report;
> + ret = page_reporting_register(&dm_device.pr_dev_info);
> + if (ret < 0) {
> + dm_device.pr_dev_info.report = NULL;
> + pr_err("Failed to enable cold memory discard: %d\n", ret);
> + } else {
> + pr_info("Cold memory discard hint enabled\n");
> + }
> +
> + return ret;
> +}
> +
> +static void disable_page_reporting(void)
> +{
> + if (dm_device.pr_dev_info.report) {
> + page_reporting_unregister(&dm_device.pr_dev_info);
> + dm_device.pr_dev_info.report = NULL;
> + }
> +}
> +#endif //CONFIG_PAGE_REPORTING
> +
> static int balloon_connect_vsp(struct hv_device *dev)
> {
> struct dm_version_request version_req;
> @@ -1710,6 +1792,11 @@ static int balloon_probe(struct hv_device *dev,
> if (ret != 0)
> return ret;
>
> +#ifdef CONFIG_PAGE_REPORTING
> + if (enable_page_reporting() < 0)
> + goto probe_error;
> +#endif
> +
> dm_device.state = DM_INITIALIZED;
>
> dm_device.thread =
> @@ -1728,6 +1815,9 @@ static int balloon_probe(struct hv_device *dev,
> #ifdef CONFIG_MEMORY_HOTPLUG
> unregister_memory_notifier(&hv_memory_nb);
> restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> + disable_page_reporting();
> #endif
> return ret;
> }
> @@ -1750,6 +1840,9 @@ static int balloon_remove(struct hv_device *dev)
> #ifdef CONFIG_MEMORY_HOTPLUG
> unregister_memory_notifier(&hv_memory_nb);
> restore_online_page_callback(&hv_online_page);
> +#endif
> +#ifdef CONFIG_PAGE_REPORTING
> + disable_page_reporting();
> #endif
> spin_lock_irqsave(&dm_device.ha_lock, flags);
> list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
> index 262fae9526b1..75aeea0e7f9b 100644
> --- a/include/asm-generic/hyperv-tlfs.h
> +++ b/include/asm-generic/hyperv-tlfs.h
> @@ -89,6 +89,7 @@
> #define HV_ACCESS_STATS BIT(8)
> #define HV_DEBUGGING BIT(11)
> #define HV_CPU_POWER_MANAGEMENT BIT(12)
> +#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20)
>
>
> /*
> @@ -149,11 +150,18 @@ struct ms_hyperv_tsc_page {
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
> #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
>
> +/* Extended hypercalls */
> +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001
> +#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003
> +
> #define HV_FLUSH_ALL_PROCESSORS BIT(0)
> #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
> #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
> #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
>
> +/* Extended capability bits */
> +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8)
> +
> enum HV_GENERIC_SET_FORMAT {
> HV_GENERIC_SET_SPARSE_4K,
> HV_GENERIC_SET_ALL,
> @@ -371,6 +379,12 @@ union hv_gpa_page_range {
> u64 largepage:1;
> u64 basepfn:52;
> } page;
> + struct {
> + u64:12;
> + u64 page_size:1;
> + u64 reserved:8;
> + u64 base_large_pfn:43;
> + };
> };
>
> /*
> @@ -490,4 +504,19 @@ struct hv_set_vp_registers_input {
> } element[];
> } __packed;
>
> +/*
> + * The whole argument should fit in a page to be able to pass to the hypervisor
> + * in one hypercall.
> + */
> +#define HV_MAX_GPA_PAGE_RANGES ((PAGE_SIZE - sizeof(u64)) / \
> + sizeof(union hv_gpa_page_range))
> +
> +/* HvExtCallMemoryHeatHint hypercall */
> +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2
> +struct hv_memory_hint {
> + u64 type:2;
> + u64 reserved:62;
> + union hv_gpa_page_range ranges[1];
> +};
> +
> #endif
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index 1c4fd950f091..c664af4a7503 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -27,6 +27,7 @@
>
> struct ms_hyperv_info {
> u32 features;
> + u32 b_features;
> u32 misc_features;
> u32 hints;
> u32 nested_features;
> @@ -169,6 +170,7 @@ bool hv_is_hyperv_initialized(void);
> bool hv_is_hibernation_supported(void);
> void hyperv_cleanup(void);
> void hv_setup_sched_clock(void *sched_clock);
> +u64 hv_query_ext_cap(void);
> #else /* CONFIG_HYPERV */
> static inline bool hv_is_hyperv_initialized(void) { return false; }
> static inline bool hv_is_hibernation_supported(void) { return false; }
>

Tested-by: Matheus Castello <[email protected]>