From: KY Srinivasan <kys@microsoft.com>
To: Vitaly Kuznetsov <vkuznets@redhat.com>,
        "devel@linuxdriverproject.org" <devel@linuxdriverproject.org>,
        "x86@kernel.org" <x86@kernel.org>
CC: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
        "Haiyang Zhang" <haiyangz@microsoft.com>,
        Stephen Hemminger <sthemmin@microsoft.com>,
        Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@redhat.com>,
        "H. Peter Anvin" <hpa@zytor.com>, Steven Rostedt <rostedt@goodmis.org>,
        "Jork Loeser" <Jork.Loeser@microsoft.com>
Subject: RE: [PATCH 6/7] x86/hyper-v: use hypercall for remove TLB flush
Thread-Topic: [PATCH 6/7] x86/hyper-v: use hypercall for remove TLB flush
Thread-Index: AQHSr5HvFlY0JcZSoUGew8hIo1ql6aG7rYqw
Date: Sat, 8 Apr 2017 16:47:27 +0000
Message-ID: <BN6PR03MB24818610B89DBF4B91099892A00F0@BN6PR03MB2481.namprd03.prod.outlook.com>
References: <20170407112701.17157-1-vkuznets@redhat.com>
 <20170407112701.17157-7-vkuznets@redhat.com>
In-Reply-To: <20170407112701.17157-7-vkuznets@redhat.com>
Accept-Language: en-US
Content-Language: en-US
spamdiagnosticoutput: 1:99
spamdiagnosticmetadata: NSPM
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
X-MS-Exchange-CrossTenant-originalarrivaltime: 08 Apr 2017 16:47:27.3762
 (UTC)
X-MS-Exchange-CrossTenant-fromentityheader: Hosted
X-MS-Exchange-CrossTenant-id: 72f988bf-86f1-41af-91ab-2d7cd011db47
X-MS-Exchange-Transport-CrossTenantHeadersStamped: BLUPR03MB1412
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Transfer-Encoding: 8bit
X-MIME-Autoconverted: from quoted-printable to 8bit by mail.home.local id v38GmTec010902
Content-Length: 8413
Lines: 268


> -----Original Message-----
> From: Vitaly Kuznetsov [mailto:vkuznets@redhat.com]
> Sent: Friday, April 7, 2017 4:27 AM
> To: devel@linuxdriverproject.org; x86@kernel.org
> Cc: linux-kernel@vger.kernel.org; KY Srinivasan <kys@microsoft.com>;
> Haiyang Zhang <haiyangz@microsoft.com>; Stephen Hemminger
> <sthemmin@microsoft.com>; Thomas Gleixner <tglx@linutronix.de>; Ingo
> Molnar <mingo@redhat.com>; H. Peter Anvin <hpa@zytor.com>; Steven
> Rostedt <rostedt@goodmis.org>; Jork Loeser <Jork.Loeser@microsoft.com>
> Subject: [PATCH 6/7] x86/hyper-v: use hypercall for remove TLB flush
> 
> Hyper-V host can suggest us to use hypercall for doing remote TLB flush,
> this is supposed to work faster than IPIs.
> 
> Implementation details: to do HvFlushVirtualAddress{Space,List} hypercalls
> we need to put the input somewhere in memory and we don't really want to
> have memory allocation on each call so we pre-allocate per cpu memory
> areas
> on boot. These areas are of fixed size, limit them with an arbitrary number
> of 16 (16 gvas are able to specify 16 * 4096 pages).
> 
> pv_ops patching is happening very early so we need to separate
> hyperv_setup_mmu_ops() and hyper_alloc_mmu().
> 
> It is possible and easy to implement local TLB flushing too and there is
> even a hint for that. However, I don't see a room for optimization on the
> host side as both hypercall and native tlb flush will result in vmexit. The
> hint is also not set on modern Hyper-V versions.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> ---
>  arch/x86/hyperv/Makefile           |   2 +-
>  arch/x86/hyperv/hv_init.c          |   2 +
>  arch/x86/hyperv/mmu.c              | 128
> +++++++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/mshyperv.h    |   2 +
>  arch/x86/include/uapi/asm/hyperv.h |   7 ++
>  arch/x86/kernel/cpu/mshyperv.c     |   1 +
>  6 files changed, 141 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/hyperv/mmu.c
> 
> diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
> index 171ae09..367a820 100644
> --- a/arch/x86/hyperv/Makefile
> +++ b/arch/x86/hyperv/Makefile
> @@ -1 +1 @@
> -obj-y		:= hv_init.o
> +obj-y		:= hv_init.o mmu.o
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 1c14088..2cf8a98 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -163,6 +163,8 @@ void hyperv_init(void)
>  	hypercall_msr.guest_physical_address =
> vmalloc_to_pfn(hv_hypercall_pg);
>  	wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
> 
> +	hyper_alloc_mmu();
> +
>  	/*
>  	 * Register Hyper-V specific clocksource.
>  	 */
> diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
> new file mode 100644
> index 0000000..fb487cb
> --- /dev/null
> +++ b/arch/x86/hyperv/mmu.c
> @@ -0,0 +1,128 @@
> +#include <linux/types.h>
> +#include <linux/hyperv.h>
> +#include <linux/slab.h>
> +#include <asm/mshyperv.h>
> +#include <asm/tlbflush.h>
> +#include <asm/msr.h>
> +#include <asm/fpu/api.h>
> +
> +/*
> + * Arbitrary number; we need to pre-allocate per-cpu struct for doing TLB
> + * flush hypercalls and we need to pick a size. '16' means we'll be able
> + * to flush 16 * 4096 pages (256MB) with one hypercall.
> + */
> +#define HV_MMU_MAX_GVAS 16

Did you experiment with different sizes here?
> +
> +/* HvFlushVirtualAddressSpace*, HvFlushVirtualAddressList hypercalls */
> +struct hv_flush_pcpu {
> +	struct {
> +		__u64 address_space;
> +		__u64 flags;
> +		__u64 processor_mask;
> +		__u64 gva_list[HV_MMU_MAX_GVAS];
> +	} flush;
> +
> +	spinlock_t lock;
> +};
> +
We may be supporting more than 64 CPUs in this hypercall. I am going to inquire with
the Windows folks and get back to you.

> +static struct hv_flush_pcpu __percpu *pcpu_flush;
> +
> +static void hyperv_flush_tlb_others(const struct cpumask *cpus,
> +				    struct mm_struct *mm, unsigned long
> start,
> +				    unsigned long end)
> +{
> +	struct hv_flush_pcpu *flush;
> +	unsigned long cur, flags;
> +	u64 status = -1ULL;
> +	int cpu, vcpu, gva_n;
> +
> +	if (!pcpu_flush || !hv_hypercall_pg)
> +		goto do_native;
> +
> +	if (cpumask_empty(cpus))
> +		return;
> +
> +	flush = this_cpu_ptr(pcpu_flush);
> +	spin_lock_irqsave(&flush->lock, flags);
> +
> +	flush->flush.address_space = virt_to_phys(mm->pgd);
> +	flush->flush.processor_mask = 0;
> +	if (cpumask_equal(cpus, cpu_present_mask)) {
> +		flush->flush.flags = HV_FLUSH_ALL_PROCESSORS;
> +	} else {
> +		flush->flush.flags = 0;
> +		for_each_cpu(cpu, cpus) {
> +			vcpu = vmbus_cpu_number_to_vp_number(cpu);
> +			if (vcpu != -1 && vcpu < 64)
> +				flush->flush.processor_mask |= 1 << vcpu;
> +			else
> +				goto unlock_do_native;
> +		}
> +	}
> +
> +	if (end == TLB_FLUSH_ALL) {
> +		flush->flush.flags =
> HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
> +		status =
> hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
> +					 &flush->flush, NULL);
> +	} else {
> +		cur = start;
> +more_gvas:
> +		gva_n = 0;
> +
> +		do {
> +			flush->flush.gva_list[gva_n] = cur & PAGE_MASK;
> +			/*
> +			 * Lower 12 bits encode the number of additional
> +			 * pages to flush (in addition to the 'cur' page).
> +			 */
> +			if (end >= cur + PAGE_SIZE * PAGE_SIZE)
> +				flush->flush.gva_list[gva_n] |=
> ~PAGE_MASK;
> +			else if (end > cur)
> +				flush->flush.gva_list[gva_n] |=
> +					(end - cur - 1) >> PAGE_SHIFT;
> +
> +			cur += PAGE_SIZE * PAGE_SIZE;
> +			++gva_n;
> +
> +		} while (cur < end && gva_n < HV_MMU_MAX_GVAS);
> +
> +		status =
> hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
> +					     gva_n, &flush->flush, NULL);
> +
> +		if (!(status & 0xffff) && cur < end)
> +			goto more_gvas;
> +	}
> +
> +unlock_do_native:
> +	spin_unlock_irqrestore(&flush->lock, flags);
> +
> +	if (!(status & 0xffff))
> +		return;
> +do_native:
> +	native_flush_tlb_others(cpus, mm, start, end);
> +}
> +
> +void hyperv_setup_mmu_ops(void)
> +{
> +	if (ms_hyperv.hints &
> HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
> +		pr_info("Hyper-V: Using hypercall for remote TLB flush\n");
> +		pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
> +	}
> +}
> +
> +void hyper_alloc_mmu(void)
> +{
> +	int cpu;
> +	struct hv_flush_pcpu *flush;
> +
> +	if (ms_hyperv.hints &
> HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED) {
> +		pcpu_flush = alloc_percpu(struct hv_flush_pcpu);
> +		if (!pcpu_flush)
> +			return;
> +
> +		for_each_possible_cpu(cpu) {
> +			flush = per_cpu_ptr(pcpu_flush, cpu);
> +			spin_lock_init(&flush->lock);
> +		}
> +	}
> +}
> diff --git a/arch/x86/include/asm/mshyperv.h
> b/arch/x86/include/asm/mshyperv.h
> index 1293c84..a5041c3 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -301,6 +301,8 @@ static inline int
> vmbus_cpu_number_to_vp_number(int cpu_number)
>  }
> 
>  void hyperv_init(void);
> +void hyperv_setup_mmu_ops(void);
> +void hyper_alloc_mmu(void);
>  void hyperv_report_panic(struct pt_regs *regs);
>  bool hv_is_hypercall_page_setup(void);
>  void hyperv_cleanup(void);
> diff --git a/arch/x86/include/uapi/asm/hyperv.h
> b/arch/x86/include/uapi/asm/hyperv.h
> index c87e900..3d44036 100644
> --- a/arch/x86/include/uapi/asm/hyperv.h
> +++ b/arch/x86/include/uapi/asm/hyperv.h
> @@ -239,6 +239,8 @@
>  		(~((1ull <<
> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
> 
>  /* Declare the various hypercall operations. */
> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE	0x0002
> +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST	0x0003
>  #define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
>  #define HVCALL_POST_MESSAGE			0x005c
>  #define HVCALL_SIGNAL_EVENT			0x005d
> @@ -256,6 +258,11 @@
>  #define HV_PROCESSOR_POWER_STATE_C2		2
>  #define HV_PROCESSOR_POWER_STATE_C3		3
> 
> +#define HV_FLUSH_ALL_PROCESSORS			0x00000001
> +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES	0x00000002
> +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY	0x00000004
> +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT	0x00000008
> +
>  /* Hypercall interface */
>  union hv_hypercall_input {
>  	u64 as_uint64;
> diff --git a/arch/x86/kernel/cpu/mshyperv.c
> b/arch/x86/kernel/cpu/mshyperv.c
> index 04cb8d3..fc228d8 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -233,6 +233,7 @@ static void __init ms_hyperv_init_platform(void)
>  	 * Setup the hook to get control post apic initialization.
>  	 */
>  	x86_platform.apic_post_init = hyperv_init;
> +	hyperv_setup_mmu_ops();
>  #endif
>  }
> 
> --
> 2.9.3