Hypercall code page is specified in the Hyper-V TLFS to be an overlay
page, ie., guest chooses a GPA and the host _places_ a page at that
location, making it visible to the guest and the existing page becomes
inaccessible. Similarly when disabled, the host should _remove_ the
overlay and the old page should become visible to the guest.
Currently KVM directly patches the hypercall code into the guest chosen
GPA. Since the guest seldom moves the hypercall code page around, it
doesn't see any problems even though we are corrupting the exiting data
in that GPA.
VSM API introduces more complex overlay workflows during VTL switches
where the guest starts to expect that the existing page is intact. This
means we need a more generic approach to handling overlay pages: add a
new exit reason KVM_EXIT_HYPERV_OVERLAY that exits to userspace with the
expectation that a page gets overlaid there.
In the interest of maintaing userspace exposed behaviour, add a new KVM
capability to allow the VMMs to enable this if they can handle the
hypercall page in userspace.
Signed-off-by: Siddharth Chandrasekaran <[email protected]>
CR: https://code.amazon.com/reviews/CR-49011379
---
arch/x86/include/asm/kvm_host.h | 4 ++++
arch/x86/kvm/hyperv.c | 25 ++++++++++++++++++++++---
arch/x86/kvm/x86.c | 5 +++++
include/uapi/linux/kvm.h | 10 ++++++++++
4 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3768819693e5..2b560e77f8bc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -925,6 +925,10 @@ struct kvm_hv {
struct hv_partition_assist_pg *hv_pa_pg;
struct kvm_hv_syndbg hv_syndbg;
+
+ struct {
+ u64 overlay_hcall_page:1;
+ } flags;
};
struct msr_bitmap_range {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f98370a39936..e7d9d3bb39dc 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -191,6 +191,21 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
srcu_read_unlock(&kvm->irq_srcu, idx);
}
+static void overlay_exit(struct kvm_vcpu *vcpu, u32 msr, u64 gpa,
+ u32 data_len, const u8 *data)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_OVERLAY;
+ hv_vcpu->exit.u.overlay.msr = msr;
+ hv_vcpu->exit.u.overlay.gpa = gpa;
+ hv_vcpu->exit.u.overlay.data_len = data_len;
+ if (data_len)
+ memcpy(hv_vcpu->exit.u.overlay.data, data, data_len);
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
{
struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
@@ -1246,9 +1261,13 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
/* ret */
((unsigned char *)instructions)[i++] = 0xc3;
- addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
- if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
- return 1;
+ if (kvm->arch.hyperv.flags.overlay_hcall_page) {
+ overlay_exit(vcpu, msr, data, (u32)i, instructions);
+ } else {
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
+ return 1;
+ }
hv->hv_hypercall = data;
break;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eca63625aee4..b3e497343e5c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3745,6 +3745,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE:
case KVM_CAP_SYS_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
@@ -5357,6 +5358,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.bus_lock_detection_enabled = true;
r = 0;
break;
+ case KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE:
+ kvm->arch.hyperv.flags.overlay_hcall_page = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f6afee209620..37b0715da4fd 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -185,10 +185,13 @@ struct kvm_s390_cmma_log {
__u64 values;
};
+#define KVM_EXIT_HV_OVERLAY_DATA_SIZE 64
+
struct kvm_hyperv_exit {
#define KVM_EXIT_HYPERV_SYNIC 1
#define KVM_EXIT_HYPERV_HCALL 2
#define KVM_EXIT_HYPERV_SYNDBG 3
+#define KVM_EXIT_HYPERV_OVERLAY 4
__u32 type;
__u32 pad1;
union {
@@ -213,6 +216,12 @@ struct kvm_hyperv_exit {
__u64 recv_page;
__u64 pending_page;
} syndbg;
+ struct {
+ __u32 msr;
+ __u32 data_len;
+ __u64 gpa;
+ __u8 data[KVM_EXIT_HV_OVERLAY_DATA_SIZE];
+ } overlay;
} u;
};
@@ -1078,6 +1087,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_DIRTY_LOG_RING 192
#define KVM_CAP_X86_BUS_LOCK_EXIT 193
#define KVM_CAP_PPC_DAWR1 194
+#define KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE 195
#ifdef KVM_CAP_IRQ_ROUTING
--
2.17.1
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On 23.04.21 11:03, Siddharth Chandrasekaran wrote:
> Hypercall code page is specified in the Hyper-V TLFS to be an overlay
> page, ie., guest chooses a GPA and the host _places_ a page at that
> location, making it visible to the guest and the existing page becomes
> inaccessible. Similarly when disabled, the host should _remove_ the
> overlay and the old page should become visible to the guest.
>
> Currently KVM directly patches the hypercall code into the guest chosen
> GPA. Since the guest seldom moves the hypercall code page around, it
> doesn't see any problems even though we are corrupting the exiting data
> in that GPA.
>
> VSM API introduces more complex overlay workflows during VTL switches
> where the guest starts to expect that the existing page is intact. This
> means we need a more generic approach to handling overlay pages: add a
> new exit reason KVM_EXIT_HYPERV_OVERLAY that exits to userspace with the
> expectation that a page gets overlaid there.
I can see how that may get interesting for other overlay pages later,
but this one in particular is just an MSR write, no? Is there any reason
we can't just use the user space MSR handling logic instead?
What's missing then is a way to pull the hcall page contents from KVM.
But even there I'm not convinced that KVM should be the reference point
for its contents. Isn't user space in an as good position to assemble it?
>
> In the interest of maintaing userspace exposed behaviour, add a new KVM
> capability to allow the VMMs to enable this if they can handle the
> hypercall page in userspace.
>
> Signed-off-by: Siddharth Chandrasekaran <[email protected]>
>
> CR: https://code.amazon.com/reviews/CR-49011379
Please remove this line from upstream submissions :).
> ---
> arch/x86/include/asm/kvm_host.h | 4 ++++
> arch/x86/kvm/hyperv.c | 25 ++++++++++++++++++++++---
> arch/x86/kvm/x86.c | 5 +++++
> include/uapi/linux/kvm.h | 10 ++++++++++
You're modifying / adding a user space API. Please make sure to update
the documentation in Documentation/virt/kvm/api.rst when you do that.
Alex
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
Siddharth Chandrasekaran <[email protected]> writes:
> Hypercall code page is specified in the Hyper-V TLFS to be an overlay
> page, ie., guest chooses a GPA and the host _places_ a page at that
> location, making it visible to the guest and the existing page becomes
> inaccessible. Similarly when disabled, the host should _remove_ the
> overlay and the old page should become visible to the guest.
>
> Currently KVM directly patches the hypercall code into the guest chosen
> GPA. Since the guest seldom moves the hypercall code page around, it
> doesn't see any problems even though we are corrupting the exiting data
> in that GPA.
>
> VSM API introduces more complex overlay workflows during VTL switches
> where the guest starts to expect that the existing page is intact. This
> means we need a more generic approach to handling overlay pages: add a
> new exit reason KVM_EXIT_HYPERV_OVERLAY that exits to userspace with the
> expectation that a page gets overlaid there.
>
> In the interest of maintaing userspace exposed behaviour, add a new KVM
> capability to allow the VMMs to enable this if they can handle the
> hypercall page in userspace.
>
> Signed-off-by: Siddharth Chandrasekaran <[email protected]>
>
> CR: https://code.amazon.com/reviews/CR-49011379
This line wasn't supposed to go to the upstream patch, was it? :-)
> ---
> arch/x86/include/asm/kvm_host.h | 4 ++++
> arch/x86/kvm/hyperv.c | 25 ++++++++++++++++++++++---
> arch/x86/kvm/x86.c | 5 +++++
> include/uapi/linux/kvm.h | 10 ++++++++++
> 4 files changed, 41 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3768819693e5..2b560e77f8bc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -925,6 +925,10 @@ struct kvm_hv {
>
> struct hv_partition_assist_pg *hv_pa_pg;
> struct kvm_hv_syndbg hv_syndbg;
> +
> + struct {
> + u64 overlay_hcall_page:1;
> + } flags;
Do you plan to add more flags here? If not, I'd suggest we use a simple
boolean instead of the whole 'flags' structure.
> };
>
> struct msr_bitmap_range {
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index f98370a39936..e7d9d3bb39dc 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -191,6 +191,21 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
> srcu_read_unlock(&kvm->irq_srcu, idx);
> }
>
> +static void overlay_exit(struct kvm_vcpu *vcpu, u32 msr, u64 gpa,
> + u32 data_len, const u8 *data)
> +{
> + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> +
> + hv_vcpu->exit.type = KVM_EXIT_HYPERV_OVERLAY;
> + hv_vcpu->exit.u.overlay.msr = msr;
> + hv_vcpu->exit.u.overlay.gpa = gpa;
> + hv_vcpu->exit.u.overlay.data_len = data_len;
> + if (data_len)
> + memcpy(hv_vcpu->exit.u.overlay.data, data, data_len);
It seems this exit to userspace has double meaning:
1) Please put an overlay page at GPA ... (are we sure we will never need
more than one page?)
2) Do something else depending on the MSR which triggered the write (are
we sure all such exits are going to be triggered by an MSR write?)
and I'm wondering if it would be possible to actually limit
KVM_EXIT_HYPERV_OVERLAY to 'put an overlay page' and do the rest somehow
differently.
In particularly, I think we can still do hypercall page patching
directly from KVM after overlay page setup. With VTL, when the logic is
more complex, do you expect it to be implemented primarily in userspace?
> +
> + kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
> +}
> +
> static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
> {
> struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
> @@ -1246,9 +1261,13 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
> /* ret */
> ((unsigned char *)instructions)[i++] = 0xc3;
>
> - addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
> - if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
> - return 1;
> + if (kvm->arch.hyperv.flags.overlay_hcall_page) {
> + overlay_exit(vcpu, msr, data, (u32)i, instructions);
> + } else {
> + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
> + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
> + return 1;
> + }
> hv->hv_hypercall = data;
> break;
> }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eca63625aee4..b3e497343e5c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3745,6 +3745,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_HYPERV_TLBFLUSH:
> case KVM_CAP_HYPERV_SEND_IPI:
> case KVM_CAP_HYPERV_CPUID:
> + case KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE:
> case KVM_CAP_SYS_HYPERV_CPUID:
> case KVM_CAP_PCI_SEGMENT:
> case KVM_CAP_DEBUGREGS:
> @@ -5357,6 +5358,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> kvm->arch.bus_lock_detection_enabled = true;
> r = 0;
> break;
> + case KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE:
> + kvm->arch.hyperv.flags.overlay_hcall_page = true;
> + r = 0;
> + break;
> default:
> r = -EINVAL;
> break;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f6afee209620..37b0715da4fd 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -185,10 +185,13 @@ struct kvm_s390_cmma_log {
> __u64 values;
> };
>
> +#define KVM_EXIT_HV_OVERLAY_DATA_SIZE 64
Could you please elaborate on why you think 64 bytes is going to be
enough? (like what structures we'll be passing here for VTL)
> +
> struct kvm_hyperv_exit {
> #define KVM_EXIT_HYPERV_SYNIC 1
> #define KVM_EXIT_HYPERV_HCALL 2
> #define KVM_EXIT_HYPERV_SYNDBG 3
> +#define KVM_EXIT_HYPERV_OVERLAY 4
Please document this in Documentation/virt/kvm/api.rst
> __u32 type;
> __u32 pad1;
> union {
> @@ -213,6 +216,12 @@ struct kvm_hyperv_exit {
> __u64 recv_page;
> __u64 pending_page;
> } syndbg;
> + struct {
> + __u32 msr;
> + __u32 data_len;
> + __u64 gpa;
> + __u8 data[KVM_EXIT_HV_OVERLAY_DATA_SIZE];
... in partucular, please document the meaning of 'data' (in case it
needs to be here).
> + } overlay;
> } u;
> };
>
> @@ -1078,6 +1087,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_DIRTY_LOG_RING 192
> #define KVM_CAP_X86_BUS_LOCK_EXIT 193
> #define KVM_CAP_PPC_DAWR1 194
> +#define KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE 195
>
> #ifdef KVM_CAP_IRQ_ROUTING
--
Vitaly
On Fri, Apr 23, 2021 at 11:24:04AM +0200, Alexander Graf wrote:
>
>
> On 23.04.21 11:03, Siddharth Chandrasekaran wrote:
> > Hypercall code page is specified in the Hyper-V TLFS to be an overlay
> > page, ie., guest chooses a GPA and the host _places_ a page at that
> > location, making it visible to the guest and the existing page becomes
> > inaccessible. Similarly when disabled, the host should _remove_ the
> > overlay and the old page should become visible to the guest.
> >
> > Currently KVM directly patches the hypercall code into the guest chosen
> > GPA. Since the guest seldom moves the hypercall code page around, it
> > doesn't see any problems even though we are corrupting the exiting data
> > in that GPA.
> >
> > VSM API introduces more complex overlay workflows during VTL switches
> > where the guest starts to expect that the existing page is intact. This
> > means we need a more generic approach to handling overlay pages: add a
> > new exit reason KVM_EXIT_HYPERV_OVERLAY that exits to userspace with the
> > expectation that a page gets overlaid there.
>
> I can see how that may get interesting for other overlay pages later, but
> this one in particular is just an MSR write, no? Is there any reason we
> can't just use the user space MSR handling logic instead?
>
> What's missing then is a way to pull the hcall page contents from KVM. But
> even there I'm not convinced that KVM should be the reference point for its
> contents. Isn't user space in an as good position to assemble it?
Makes sense. Let me explore that route and get back to you.
> >
> > In the interest of maintaing userspace exposed behaviour, add a new KVM
> > capability to allow the VMMs to enable this if they can handle the
> > hypercall page in userspace.
> >
> > Signed-off-by: Siddharth Chandrasekaran <[email protected]>
> >
> > CR: https://code.amazon.com/reviews/CR-49011379
>
> Please remove this line from upstream submissions :).
I noticed it a bit late (a tooling gap). You shouldn't see this in any
of my future patches.
> > ---
> > arch/x86/include/asm/kvm_host.h | 4 ++++
> > arch/x86/kvm/hyperv.c | 25 ++++++++++++++++++++++---
> > arch/x86/kvm/x86.c | 5 +++++
> > include/uapi/linux/kvm.h | 10 ++++++++++
>
> You're modifying / adding a user space API. Please make sure to update the
> documentation in Documentation/virt/kvm/api.rst when you do that.
Ack. Will add it.
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On 23/04/21 11:24, Alexander Graf wrote:
> I can see how that may get interesting for other overlay pages later,
> but this one in particular is just an MSR write, no? Is there any reason
> we can't just use the user space MSR handling logic instead?
>
> What's missing then is a way to pull the hcall page contents from KVM.
> But even there I'm not convinced that KVM should be the reference point
> for its contents. Isn't user space in an as good position to assemble it?
In theory userspace doesn't know how KVM wishes to implement the
hypercall page, especially if Xen hypercalls are enabled as well.
But userspace has two plausible ways to get the page contents:
1) add a ioctl to write the hypercall page contents to an arbitrary
userspace address
2) after userspace updates the memslots to add the overlay page at the
right place, use KVM_SET_MSR from userspace (which won't be filtered
because it's host initiated)
The second has the advantage of not needing any new code at all, but
it's a bit more ugly.
Paolo
On 23.04.21 11:50, Paolo Bonzini wrote:
> CAUTION: This email originated from outside of the organization. Do not
> click links or open attachments unless you can confirm the sender and
> know thecontent is safe.
>
>
>
> On 23/04/21 11:24, Alexander Graf wrote:
>> I can see how that may get interesting for other overlay pages later,
>> but this one in particular is just an MSR write, no? Is there any reason
>> we can't just use the user space MSR handling logic instead?
>>
>> What's missing then is a way to pull the hcall page contents from KVM.
>> But even there I'm not convinced that KVM should be the reference point
>> for its contents. Isn't user space in an as good position to assemble it?
>
> In theory userspace doesn't know how KVM wishes to implement the
> hypercall page, especially if Xen hypercalls are enabled as well.
I'm not sure I agree with that sentiment :). User space is the one that
sets the xen compat mode. All we need to do is declare the ORing as part
of the KVM ABI. Which we effectively are doing already, because it's
part of the ABI to the guest, no?
>
> But userspace has two plausible ways to get the page contents:
>
> 1) add a ioctl to write the hypercall page contents to an arbitrary
> userspace address
>
> 2) after userspace updates the memslots to add the overlay page at the
> right place, use KVM_SET_MSR from userspace (which won't be filtered
> because it's host initiated)
>
> The second has the advantage of not needing any new code at all, but
> it's a bit more ugly.
The more of all of that hyper-v code we can have live in user space, the
happier I am :).
Alex
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On Fri, Apr 23, 2021 at 11:36:27AM +0200, Vitaly Kuznetsov wrote:
> Siddharth Chandrasekaran <[email protected]> writes:
> > Hypercall code page is specified in the Hyper-V TLFS to be an overlay
> > page, ie., guest chooses a GPA and the host _places_ a page at that
> > location, making it visible to the guest and the existing page becomes
> > inaccessible. Similarly when disabled, the host should _remove_ the
> > overlay and the old page should become visible to the guest.
> >
> > Currently KVM directly patches the hypercall code into the guest chosen
> > GPA. Since the guest seldom moves the hypercall code page around, it
> > doesn't see any problems even though we are corrupting the exiting data
> > in that GPA.
> >
> > VSM API introduces more complex overlay workflows during VTL switches
> > where the guest starts to expect that the existing page is intact. This
> > means we need a more generic approach to handling overlay pages: add a
> > new exit reason KVM_EXIT_HYPERV_OVERLAY that exits to userspace with the
> > expectation that a page gets overlaid there.
> >
> > In the interest of maintaing userspace exposed behaviour, add a new KVM
> > capability to allow the VMMs to enable this if they can handle the
> > hypercall page in userspace.
> >
> > Signed-off-by: Siddharth Chandrasekaran <[email protected]>
> >
> > CR: https://code.amazon.com/reviews/CR-49011379
>
> This line wasn't supposed to go to the upstream patch, was it? :-)
Yes. Will be removed in future :)
> > ---
> > arch/x86/include/asm/kvm_host.h | 4 ++++
> > arch/x86/kvm/hyperv.c | 25 ++++++++++++++++++++++---
> > arch/x86/kvm/x86.c | 5 +++++
> > include/uapi/linux/kvm.h | 10 ++++++++++
> > 4 files changed, 41 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 3768819693e5..2b560e77f8bc 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -925,6 +925,10 @@ struct kvm_hv {
> >
> > struct hv_partition_assist_pg *hv_pa_pg;
> > struct kvm_hv_syndbg hv_syndbg;
> > +
> > + struct {
> > + u64 overlay_hcall_page:1;
> > + } flags;
>
> Do you plan to add more flags here? If not, I'd suggest we use a simple
> boolean instead of the whole 'flags' structure.
No nothing in particular, was just being futuristic. I'll change it too bool.
> > };
> >
> > struct msr_bitmap_range {
> > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> > index f98370a39936..e7d9d3bb39dc 100644
> > --- a/arch/x86/kvm/hyperv.c
> > +++ b/arch/x86/kvm/hyperv.c
> > @@ -191,6 +191,21 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
> > srcu_read_unlock(&kvm->irq_srcu, idx);
> > }
> >
> > +static void overlay_exit(struct kvm_vcpu *vcpu, u32 msr, u64 gpa,
> > + u32 data_len, const u8 *data)
> > +{
> > + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
> > +
> > + hv_vcpu->exit.type = KVM_EXIT_HYPERV_OVERLAY;
> > + hv_vcpu->exit.u.overlay.msr = msr;
> > + hv_vcpu->exit.u.overlay.gpa = gpa;
> > + hv_vcpu->exit.u.overlay.data_len = data_len;
> > + if (data_len)
> > + memcpy(hv_vcpu->exit.u.overlay.data, data, data_len);
>
> It seems this exit to userspace has double meaning:
> 1) Please put an overlay page at GPA ... (are we sure we will never need
> more than one page?)
> 2) Do something else depending on the MSR which triggered the write (are
> we sure all such exits are going to be triggered by an MSR write?)
Currently there are 4 types of overlay pages defined in the Hyper-V TLFS:
- Hypercall code page
- Synic message page
- Synic event page
- VP Assist page
All of them are single pages and are MSR triggered. The userspace is
responsible for overlaying the page and copying the initial data if-any
to that page.
> and I'm wondering if it would be possible to actually limit
> KVM_EXIT_HYPERV_OVERLAY to 'put an overlay page' and do the rest somehow
> differently.
>
> In particularly, I think we can still do hypercall page patching
> directly from KVM after overlay page setup.
After we have given up control to the userspace, we don't have a way to
write to the page that got overlaid before the guest execution resumes
(as it expects the page to be ready after the MSR write).
The other alternative is to expose a new ioctl to write this data but
that seems excessive considering what we are trying to achieve here.
> With VTL, when the logic is
> more complex, do you expect it to be implemented primarily in userspace?
During VTL switches, we plan to exit to user space to re-overlay the
right per-VTL page.
> > +
> > + kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
> > +}
> > +
> > static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
> > {
> > struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
> > @@ -1246,9 +1261,13 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
> > /* ret */
> > ((unsigned char *)instructions)[i++] = 0xc3;
> >
> > - addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
> > - if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
> > - return 1;
> > + if (kvm->arch.hyperv.flags.overlay_hcall_page) {
> > + overlay_exit(vcpu, msr, data, (u32)i, instructions);
> > + } else {
> > + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
> > + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
> > + return 1;
> > + }
> > hv->hv_hypercall = data;
> > break;
> > }
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index eca63625aee4..b3e497343e5c 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3745,6 +3745,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> > case KVM_CAP_HYPERV_TLBFLUSH:
> > case KVM_CAP_HYPERV_SEND_IPI:
> > case KVM_CAP_HYPERV_CPUID:
> > + case KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE:
> > case KVM_CAP_SYS_HYPERV_CPUID:
> > case KVM_CAP_PCI_SEGMENT:
> > case KVM_CAP_DEBUGREGS:
> > @@ -5357,6 +5358,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> > kvm->arch.bus_lock_detection_enabled = true;
> > r = 0;
> > break;
> > + case KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE:
> > + kvm->arch.hyperv.flags.overlay_hcall_page = true;
> > + r = 0;
> > + break;
> > default:
> > r = -EINVAL;
> > break;
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index f6afee209620..37b0715da4fd 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -185,10 +185,13 @@ struct kvm_s390_cmma_log {
> > __u64 values;
> > };
> >
> > +#define KVM_EXIT_HV_OVERLAY_DATA_SIZE 64
>
> Could you please elaborate on why you think 64 bytes is going to be
> enough? (like what structures we'll be passing here for VTL)
With VSM, we need 36 bytes. That number was rounded up to next power of
2 for future needs.
> > +
> > struct kvm_hyperv_exit {
> > #define KVM_EXIT_HYPERV_SYNIC 1
> > #define KVM_EXIT_HYPERV_HCALL 2
> > #define KVM_EXIT_HYPERV_SYNDBG 3
> > +#define KVM_EXIT_HYPERV_OVERLAY 4
>
> Please document this in Documentation/virt/kvm/api.rst
>
> > __u32 type;
> > __u32 pad1;
> > union {
> > @@ -213,6 +216,12 @@ struct kvm_hyperv_exit {
> > __u64 recv_page;
> > __u64 pending_page;
> > } syndbg;
> > + struct {
> > + __u32 msr;
> > + __u32 data_len;
> > + __u64 gpa;
> > + __u8 data[KVM_EXIT_HV_OVERLAY_DATA_SIZE];
>
> ... in partucular, please document the meaning of 'data' (in case it
> needs to be here).
Ack.
> > + } overlay;
> > } u;
> > };
> >
> > @@ -1078,6 +1087,7 @@ struct kvm_ppc_resize_hpt {
> > #define KVM_CAP_DIRTY_LOG_RING 192
> > #define KVM_CAP_X86_BUS_LOCK_EXIT 193
> > #define KVM_CAP_PPC_DAWR1 194
> > +#define KVM_CAP_HYPERV_OVERLAY_HCALL_PAGE 195
> >
> > #ifdef KVM_CAP_IRQ_ROUTING
>
> --
Sid.
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On 23/04/21 11:58, Alexander Graf wrote:
>> In theory userspace doesn't know how KVM wishes to implement the
>> hypercall page, especially if Xen hypercalls are enabled as well.
>
> I'm not sure I agree with that sentiment :). User space is the one that
> sets the xen compat mode. All we need to do is declare the ORing as part
> of the KVM ABI. Which we effectively are doing already, because it's
> part of the ABI to the guest, no?
Good point. But it may change in the future based on KVM_ENABLE_CAP or
whatever, and duplicating code between userspace and kernel is ugly. We
already have too many unwritten conventions around CPUID, MSRs, get/set
state ioctls, etc.
That said, this definitely tilts the balance against adding an ioctl to
write the hypercall page contents. Userspace can either use the
KVM_SET_MSR or assemble it on its own, and one of the two should be okay.
Paolo
>>
>> But userspace has two plausible ways to get the page contents:
>>
>> 1) add a ioctl to write the hypercall page contents to an arbitrary
>> userspace address
>>
>> 2) after userspace updates the memslots to add the overlay page at the
>> right place, use KVM_SET_MSR from userspace (which won't be filtered
>> because it's host initiated)
>>
>> The second has the advantage of not needing any new code at all, but
>> it's a bit more ugly.
>
> The more of all of that hyper-v code we can have live in user space, the
> happier I am :).
On 23.04.21 12:15, Paolo Bonzini wrote:
> CAUTION: This email originated from outside of the organization. Do not
> click links or open attachments unless you can confirm the sender and
> know thecontent is safe.
>
>
>
> On 23/04/21 11:58, Alexander Graf wrote:
>>> In theory userspace doesn't know how KVM wishes to implement the
>>> hypercall page, especially if Xen hypercalls are enabled as well.
>>
>> I'm not sure I agree with that sentiment :). User space is the one that
>> sets the xen compat mode. All we need to do is declare the ORing as part
>> of the KVM ABI. Which we effectively are doing already, because it's
>> part of the ABI to the guest, no?
>
> Good point. But it may change in the future based on KVM_ENABLE_CAP or
> whatever, and duplicating code between userspace and kernel is ugly. We
> already have too many unwritten conventions around CPUID, MSRs, get/set
> state ioctls, etc.
Yes, I agree. So we can just declare that there won't be any changes to
the hcall page in-kernel handling code going forward, no? :)
If you want to support a new CAP, support an actual overlay page first -
and thus actually respect the TLFS.
> That said, this definitely tilts the balance against adding an ioctl to
> write the hypercall page contents. Userspace can either use the
> KVM_SET_MSR or assemble it on its own, and one of the two should be okay.
Sounds great. And in the future if we need to move the Xen offset, we
should rather make the Xen offsetting a parameter from user space.
Alex
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
On Fri, Apr 23, 2021 at 12:18:31PM +0200, Alexander Graf wrote:
> On 23.04.21 12:15, Paolo Bonzini wrote:
> > On 23/04/21 11:58, Alexander Graf wrote:
> > > > In theory userspace doesn't know how KVM wishes to implement the
> > > > hypercall page, especially if Xen hypercalls are enabled as well.
> > >
> > > I'm not sure I agree with that sentiment :). User space is the one that
> > > sets the xen compat mode. All we need to do is declare the ORing as part
> > > of the KVM ABI. Which we effectively are doing already, because it's
> > > part of the ABI to the guest, no?
> >
> > Good point. But it may change in the future based on KVM_ENABLE_CAP or
> > whatever, and duplicating code between userspace and kernel is ugly. We
> > already have too many unwritten conventions around CPUID, MSRs, get/set
> > state ioctls, etc.
>
> Yes, I agree. So we can just declare that there won't be any changes to the
> hcall page in-kernel handling code going forward, no? :)
>
> If you want to support a new CAP, support an actual overlay page first - and
> thus actually respect the TLFS.
>
> > That said, this definitely tilts the balance against adding an ioctl to
> > write the hypercall page contents. Userspace can either use the
> > KVM_SET_MSR or assemble it on its own, and one of the two should be okay.
>
> Sounds great. And in the future if we need to move the Xen offset, we should
> rather make the Xen offsetting a parameter from user space.
Okay, assembling the hypercall page contents in user space is possible
but doesn't help us much:
1. It is best to keep the instruction patching at one place; the
kernel is already doing it (which we cannot remove).
2. It is not possible to assemble all overlay pages in user space. For
instance, we cannot assemble the VP assist page. The hypercall code
page is really a special case.
So I'd side with the KVM_SET_MSR approach and have a convention that all
overlay page requests would be trapped to user space first - where the
page get overlaid - and then user space forwards the MSR write to kernel
so it can do a kvm_vcpu_write_guest() if needed. IMO, this allows best
flexibility.
~ Sid.
Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879