2020-08-07 08:48:15

by Chenyi Qiang

[permalink] [raw]
Subject: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

The PKS MSR is passed through to the guest directly. Configure the MSR to
match the L0/L1 settings so that a nested VM runs PKS properly.

Signed-off-by: Chenyi Qiang <[email protected]>
---
arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/vmcs12.c | 2 ++
arch/x86/kvm/vmx/vmcs12.h | 6 +++++-
arch/x86/kvm/vmx/vmx.c | 10 ++++++++++
arch/x86/kvm/vmx/vmx.h | 1 +
5 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index df2c2e733549..1f9823d21ecd 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
MSR_IA32_PRED_CMD,
MSR_TYPE_W);

+ if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PKRS,
+ MSR_TYPE_R | MSR_TYPE_W);
+
kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

return true;
@@ -2427,6 +2433,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
+ vmcs_write64(GUEST_IA32_PKRS, vmcs12->guest_ia32_pkrs);
}

if (nested_cpu_has_xsaves(vmcs12))
@@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
+ vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);

/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
@@ -2849,6 +2864,10 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
vmcs12->host_ia32_perf_global_ctrl)))
return -EINVAL;

+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS) &&
+ CC(!kvm_pkrs_valid(vmcs12->host_ia32_pkrs)))
+ return -EINVAL;
+
#ifdef CONFIG_X86_64
ia32e = !!(vcpu->arch.efer & EFER_LMA);
#else
@@ -2998,6 +3017,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
if (nested_check_guest_non_reg_state(vmcs12))
return -EINVAL;

+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS) &&
+ CC(!kvm_pkrs_valid(vmcs12->guest_ia32_pkrs)))
+ return -EINVAL;
+
return 0;
}

@@ -3271,6 +3294,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (kvm_mpx_supported() &&
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))
+ vmx->nested.vmcs01_guest_pkrs = vmcs_read64(GUEST_IA32_PKRS);

/*
* Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
@@ -3865,6 +3891,7 @@ static bool is_vmcs12_ext_field(unsigned long field)
case GUEST_IDTR_BASE:
case GUEST_PENDING_DBG_EXCEPTIONS:
case GUEST_BNDCFGS:
+ case GUEST_IA32_PKRS:
return true;
default:
break;
@@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
if (kvm_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ if (kvm_cpu_cap_has(X86_FEATURE_PKS))
+ vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);

vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}
@@ -4151,6 +4180,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->host_ia32_perf_global_ctrl));

+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PKRS)
+ vmcs_write64(GUEST_IA32_PKRS, vmcs12->host_ia32_pkrs);
+
/* Set L1 segment info according to Intel SDM
27.5.2 Loading Host Segment and Descriptor-Table Registers */
seg = (struct kvm_segment) {
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index c8e51c004f78..df7b2143b807 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -61,9 +61,11 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(GUEST_PDPTR2, guest_pdptr2),
FIELD64(GUEST_PDPTR3, guest_pdptr3),
FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+ FIELD64(GUEST_IA32_PKRS, guest_ia32_pkrs),
FIELD64(HOST_IA32_PAT, host_ia32_pat),
FIELD64(HOST_IA32_EFER, host_ia32_efer),
FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+ FIELD64(HOST_IA32_PKRS, host_ia32_pkrs),
FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
FIELD(EXCEPTION_BITMAP, exception_bitmap),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 80232daf00ff..009b4c317375 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -69,7 +69,9 @@ struct __packed vmcs12 {
u64 vm_function_control;
u64 eptp_list_address;
u64 pml_address;
- u64 padding64[3]; /* room for future expansion */
+ u64 guest_ia32_pkrs;
+ u64 host_ia32_pkrs;
+ u64 padding64[1]; /* room for future expansion */
/*
* To allow migration of L1 (complete with its L2 guests) between
* machines of different natural widths (32 or 64 bit), we cannot have
@@ -256,6 +258,8 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(vm_function_control, 296);
CHECK_OFFSET(eptp_list_address, 304);
CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(guest_ia32_pkrs, 320);
+ CHECK_OFFSET(host_ia32_pkrs, 328);
CHECK_OFFSET(cr0_guest_host_mask, 344);
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5cdf4d3848fb..3c3fb554c3fc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7178,6 +7178,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ cr4_fixed1_update(X86_CR4_PKS, ecx, feature_bit(PKS));

#undef cr4_fixed1_update
}
@@ -7197,6 +7198,15 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
}
}
+
+ if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PKS)) {
+ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PKRS;
+ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_LOAD_IA32_PKRS;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_IA32_PKRS;
+ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_LOAD_IA32_PKRS;
+ }
}

static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 639798e4a6ca..2819d3e030b9 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -176,6 +176,7 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
u64 vmcs01_guest_bndcfgs;
+ u64 vmcs01_guest_pkrs;

/* to migrate it to L1 if L2 writes to L1's CR8 directly */
int l1_tpr_threshold;
--
2.17.1


2020-08-11 00:07:07

by Jim Mattson

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
>
> PKS MSR passes through guest directly. Configure the MSR to match the
> L0/L1 settings so that nested VM runs PKS properly.
>
> Signed-off-by: Chenyi Qiang <[email protected]>
> ---
> arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/vmx/vmcs12.c | 2 ++
> arch/x86/kvm/vmx/vmcs12.h | 6 +++++-
> arch/x86/kvm/vmx/vmx.c | 10 ++++++++++
> arch/x86/kvm/vmx/vmx.h | 1 +
> 5 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index df2c2e733549..1f9823d21ecd 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> MSR_IA32_PRED_CMD,
> MSR_TYPE_W);
>
> + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
> + nested_vmx_disable_intercept_for_msr(
> + msr_bitmap_l1, msr_bitmap_l0,
> + MSR_IA32_PKRS,
> + MSR_TYPE_R | MSR_TYPE_W);

What if L1 intercepts only *reads* of MSR_IA32_PKRS?

> kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
>
> return true;

> @@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
> if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
> vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
> +
> + if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&

Is the above check superfluous? I would assume that the L1 guest can't
set VM_ENTRY_LOAD_IA32_PKRS unless this is true.

> + (!vmx->nested.nested_run_pending ||
> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);

This doesn't seem right to me. On the target of a live migration, with
L2 active at the time the snapshot was taken (i.e.,
vmx->nested.nested_run_pending=0), it looks like we're going to try to
overwrite the current L2 PKRS value with L1's PKRS value (except that
in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
0). Am I missing something?

> vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>
> /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the


> @@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> if (kvm_mpx_supported())
> vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
> + if (kvm_cpu_cap_has(X86_FEATURE_PKS))

Shouldn't we be checking to see if the *virtual* CPU supports PKS
before writing anything into vmcs12->guest_ia32_pkrs?

> + vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);
>
> vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
> }

2020-08-12 15:01:11

by Sean Christopherson

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

On Mon, Aug 10, 2020 at 05:05:36PM -0700, Jim Mattson wrote:
> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
> >
> > PKS MSR passes through guest directly. Configure the MSR to match the
> > L0/L1 settings so that nested VM runs PKS properly.
> >
> > Signed-off-by: Chenyi Qiang <[email protected]>
> > ---
> > arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
> > arch/x86/kvm/vmx/vmcs12.c | 2 ++
> > arch/x86/kvm/vmx/vmcs12.h | 6 +++++-
> > arch/x86/kvm/vmx/vmx.c | 10 ++++++++++
> > arch/x86/kvm/vmx/vmx.h | 1 +
> > 5 files changed, 50 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > index df2c2e733549..1f9823d21ecd 100644
> > --- a/arch/x86/kvm/vmx/nested.c
> > +++ b/arch/x86/kvm/vmx/nested.c
> > @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> > MSR_IA32_PRED_CMD,
> > MSR_TYPE_W);
> >
> > + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
> > + nested_vmx_disable_intercept_for_msr(
> > + msr_bitmap_l1, msr_bitmap_l0,
> > + MSR_IA32_PKRS,
> > + MSR_TYPE_R | MSR_TYPE_W);
>
> What if L1 intercepts only *reads* of MSR_IA32_PKRS?

nested_vmx_disable_intercept_for_msr() handles merging L1's desires, the
(MSR_TYPE_R | MSR_TYPE_W) param is effectively L0's desire for L2.

2020-08-12 18:34:35

by Jim Mattson

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

On Wed, Aug 12, 2020 at 8:00 AM Sean Christopherson
<[email protected]> wrote:
>
> On Mon, Aug 10, 2020 at 05:05:36PM -0700, Jim Mattson wrote:
> > On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
> > >
> > > PKS MSR passes through guest directly. Configure the MSR to match the
> > > L0/L1 settings so that nested VM runs PKS properly.
> > >
> > > Signed-off-by: Chenyi Qiang <[email protected]>
> > > ---
> > > arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
> > > arch/x86/kvm/vmx/vmcs12.c | 2 ++
> > > arch/x86/kvm/vmx/vmcs12.h | 6 +++++-
> > > arch/x86/kvm/vmx/vmx.c | 10 ++++++++++
> > > arch/x86/kvm/vmx/vmx.h | 1 +
> > > 5 files changed, 50 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> > > index df2c2e733549..1f9823d21ecd 100644
> > > --- a/arch/x86/kvm/vmx/nested.c
> > > +++ b/arch/x86/kvm/vmx/nested.c
> > > @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
> > > MSR_IA32_PRED_CMD,
> > > MSR_TYPE_W);
> > >
> > > + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
> > > + nested_vmx_disable_intercept_for_msr(
> > > + msr_bitmap_l1, msr_bitmap_l0,
> > > + MSR_IA32_PKRS,
> > > + MSR_TYPE_R | MSR_TYPE_W);
> >
> > What if L1 intercepts only *reads* of MSR_IA32_PKRS?
>
> nested_vmx_disable_intercept_for_msr() handles merging L1's desires, the
> (MSR_TYPE_R | MSR_TYPE_W) param is effectively L0's desire for L2.

I should know better than to assume that a function in kvm actually
does anything like what its name implies, but I never seem to learn.
:-(

Thanks!

2020-08-13 04:55:23

by Chenyi Qiang

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM



On 8/11/2020 8:05 AM, Jim Mattson wrote:
> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
>>
>> PKS MSR passes through guest directly. Configure the MSR to match the
>> L0/L1 settings so that nested VM runs PKS properly.
>>
>> Signed-off-by: Chenyi Qiang <[email protected]>
>> ---
>> arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++++++++++++++++
>> arch/x86/kvm/vmx/vmcs12.c | 2 ++
>> arch/x86/kvm/vmx/vmcs12.h | 6 +++++-
>> arch/x86/kvm/vmx/vmx.c | 10 ++++++++++
>> arch/x86/kvm/vmx/vmx.h | 1 +
>> 5 files changed, 50 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
>> index df2c2e733549..1f9823d21ecd 100644
>> --- a/arch/x86/kvm/vmx/nested.c
>> +++ b/arch/x86/kvm/vmx/nested.c
>> @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>> MSR_IA32_PRED_CMD,
>> MSR_TYPE_W);
>>
>> + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
>> + nested_vmx_disable_intercept_for_msr(
>> + msr_bitmap_l1, msr_bitmap_l0,
>> + MSR_IA32_PKRS,
>> + MSR_TYPE_R | MSR_TYPE_W);
>
> What if L1 intercepts only *reads* of MSR_IA32_PKRS?
>
>> kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
>>
>> return true;
>
>> @@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>> if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
>> !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
>> vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
>> +
>> + if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&
>
> Is the above check superfluous? I would assume that the L1 guest can't
> set VM_ENTRY_LOAD_IA32_PKRS unless this is true.
>

I enforce this check to ensure that the vmcs_write to GUEST_IA32_PKRS
executes without error. If it were deleted, the vmcs_write to
GUEST_IA32_PKRS might be executed even when PKS is unsupported.

>> + (!vmx->nested.nested_run_pending ||
>> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
>> + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
>
> This doesn't seem right to me. On the target of a live migration, with
> L2 active at the time the snapshot was taken (i.e.,
> vmx->nested.nested_run_pending=0), it looks like we're going to try to
> overwrite the current L2 PKRS value with L1's PKRS value (except that
> in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
> 0). Am I missing something?
>

We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
Because L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
PKRS to L2.

>> vmx_set_rflags(vcpu, vmcs12->guest_rflags);
>>
>> /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
>
>
>> @@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
>> vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>> if (kvm_mpx_supported())
>> vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
>> + if (kvm_cpu_cap_has(X86_FEATURE_PKS))
>
> Shouldn't we be checking to see if the *virtual* CPU supports PKS
> before writing anything into vmcs12->guest_ia32_pkrs?
>

Yes, it's reasonable.

>> + vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);
>>
>> vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
>> }

2020-08-13 17:54:06

by Jim Mattson

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

On Wed, Aug 12, 2020 at 9:54 PM Chenyi Qiang <[email protected]> wrote:
>
>
>
> On 8/11/2020 8:05 AM, Jim Mattson wrote:
> > On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
> >>
> >> PKS MSR passes through guest directly. Configure the MSR to match the
> >> L0/L1 settings so that nested VM runs PKS properly.
> >>
> >> Signed-off-by: Chenyi Qiang <[email protected]>
> >> ---

> >> + (!vmx->nested.nested_run_pending ||
> >> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> >> + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
> >
> > This doesn't seem right to me. On the target of a live migration, with
> > L2 active at the time the snapshot was taken (i.e.,
> > vmx->nested.nested_run_pending=0), it looks like we're going to try to
> > overwrite the current L2 PKRS value with L1's PKRS value (except that
> > in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
> > 0). Am I missing something?
> >
>
> We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
> Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
> PKRS to L2.

I'm thinking of the case where vmx->nested.nested_run_pending is
false, and we are processing a KVM_SET_NESTED_STATE ioctl, yet
VM_ENTRY_LOAD_IA32_PKRS *is* set in the vmcs12.

2020-08-14 11:32:20

by Chenyi Qiang

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM



On 8/14/2020 1:52 AM, Jim Mattson wrote:
> On Wed, Aug 12, 2020 at 9:54 PM Chenyi Qiang <[email protected]> wrote:
>>
>>
>>
>> On 8/11/2020 8:05 AM, Jim Mattson wrote:
>>> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
>>>>
>>>> PKS MSR passes through guest directly. Configure the MSR to match the
>>>> L0/L1 settings so that nested VM runs PKS properly.
>>>>
>>>> Signed-off-by: Chenyi Qiang <[email protected]>
>>>> ---
>
>>>> + (!vmx->nested.nested_run_pending ||
>>>> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
>>>> + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
>>>
>>> This doesn't seem right to me. On the target of a live migration, with
>>> L2 active at the time the snapshot was taken (i.e.,
>>> vmx->nested.nested_run_pending=0), it looks like we're going to try to
>>> overwrite the current L2 PKRS value with L1's PKRS value (except that
>>> in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
>>> 0). Am I missing something?
>>>
>>
>> We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
>> Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
>> PKRS to L2.
>
> I'm thinking of the case where vmx->nested.nested_run_pending is
> false, and we are processing a KVM_SET_NESTED_STATE ioctl, yet
> VM_ENTRY_LOAD_IA32_PKRS *is* set in the vmcs12.
>

Oh, I missed this case. What I'm still confused about is that the
restoration of GUEST_IA32_DEBUGCTL and GUEST_BNDCFGS has the same
issue, right? Or am I missing something?

2020-08-14 19:35:56

by Jim Mattson

[permalink] [raw]
Subject: Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

On Fri, Aug 14, 2020 at 3:09 AM Chenyi Qiang <[email protected]> wrote:
>
>
>
> On 8/14/2020 1:52 AM, Jim Mattson wrote:
> > On Wed, Aug 12, 2020 at 9:54 PM Chenyi Qiang <[email protected]> wrote:
> >>
> >>
> >>
> >> On 8/11/2020 8:05 AM, Jim Mattson wrote:
> >>> On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang <[email protected]> wrote:
> >>>>
> >>>> PKS MSR passes through guest directly. Configure the MSR to match the
> >>>> L0/L1 settings so that nested VM runs PKS properly.
> >>>>
> >>>> Signed-off-by: Chenyi Qiang <[email protected]>
> >>>> ---
> >
> >>>> + (!vmx->nested.nested_run_pending ||
> >>>> + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
> >>>> + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);
> >>>
> >>> This doesn't seem right to me. On the target of a live migration, with
> >>> L2 active at the time the snapshot was taken (i.e.,
> >>> vmx->nested.nested_run_pending=0), it looks like we're going to try to
> >>> overwrite the current L2 PKRS value with L1's PKRS value (except that
> >>> in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
> >>> 0). Am I missing something?
> >>>
> >>
> >> We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS.
> >> Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's
> >> PKRS to L2.
> >
> > I'm thinking of the case where vmx->nested.nested_run_pending is
> > false, and we are processing a KVM_SET_NESTED_STATE ioctl, yet
> > VM_ENTRY_LOAD_IA32_PKRS *is* set in the vmcs12.
> >
>
> Oh, I miss this case. What I'm still confused here is that the
> restoration for GUEST_IA32_DEBUGCTL and GUEST_BNDCFGS have the same
> issue, right? or I miss something.

I take it back. This does work, assuming that userspace calls
KVM_SET_MSRS before calling KVM_SET_NESTED_STATE. Assuming L2 is
active when the checkpoint is taken, the MSR values saved will be the
L2 values. When restoring the MSRs with KVM_SET_MSRS, the L2 MSR
values will be written into vmcs01. They don't belong there, but we're
never going to launch vmcs01 with those MSR values. Instead, when
userspace calls KVM_SET_NESTED_STATE, those values will be transferred
first to the vmcs01_<msr> fields of the vmx->nested struct, and then
to vmcs02.

This is subtle, and I don't think it's documented anywhere that
KVM_SET_NESTED_STATE must be called after KVM_SET_MSRS. In fact, there
are probably a number of dependencies among the various KVM_SET_*
functions that aren't documented anywhere.