This patch fixes bug https://bugzilla.kernel.org/show_bug.cgi?id=61411
TPR shadow/threshold feature is important to speed up the Windows guest.
Besides, it is a required feature for certain VMMs.
We map the virtual APIC page address and TPR threshold from the L1 VMCS. If
a TPR_BELOW_THRESHOLD VM exit is triggered by the L2 guest and L1 is interested
in it, we inject it into the L1 VMM for handling.
Signed-off-by: Wanpeng Li <[email protected]>
---
arch/x86/kvm/vmx.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a3845b8..f60846c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2331,7 +2331,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
- CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
@@ -6937,7 +6937,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_MCE_DURING_VMENTRY:
return 0;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
- return 1;
+ return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
@@ -7058,6 +7058,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
+ if (is_guest_mode(vcpu))
+ return;
+
if (irr == -1 || tpr < irr) {
vmcs_write32(TPR_THRESHOLD, 0);
return;
@@ -7962,14 +7965,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (!vmx->rdtscp_enabled)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
/* Take the following fields only from vmcs12 */
- exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ exec_control &= ~(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
+ struct page *virtual_apic_page;
/*
* Translate L1 physical address to host physical
* address for vmcs02. Keep the page pinned, so this
@@ -7992,6 +7995,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
+
+ virtual_apic_page = nested_get_page(vcpu,
+ vmcs12->virtual_apic_page_addr);
+ if (vmcs_read64(VIRTUAL_APIC_PAGE_ADDR) !=
+ page_to_phys(virtual_apic_page))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ page_to_phys(virtual_apic_page));
+ nested_release_page(virtual_apic_page);
+
} else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -8002,6 +8014,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
/*
* Set host-state according to L0's settings (vmcs12 is irrelevant here)
--
1.9.1
Il 30/07/2014 14:04, Wanpeng Li ha scritto:
> @@ -7962,14 +7965,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> if (!vmx->rdtscp_enabled)
> exec_control &= ~SECONDARY_EXEC_RDTSCP;
> /* Take the following fields only from vmcs12 */
> - exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
> - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
> + exec_control &= ~(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
> SECONDARY_EXEC_APIC_REGISTER_VIRT);
This change is wrong. You don't have to take L0's "virtualize APIC
accesses" setting into account, because while running L2 you cannot
modify L1's CR8 (only the virtual nested one).
> +
> + virtual_apic_page = nested_get_page(vcpu,
> + vmcs12->virtual_apic_page_addr);
> + if (vmcs_read64(VIRTUAL_APIC_PAGE_ADDR) !=
> + page_to_phys(virtual_apic_page))
> + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
> + page_to_phys(virtual_apic_page));
> + nested_release_page(virtual_apic_page);
> +
You cannot release this page here. You need to do exactly the same
thing that is done for apic_access_page.
One thing:
> + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
> + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
I think you can just do this write unconditionally, since most
hypervisors will enable this. Also, you probably can add the tpr
threshold field to the read-write fields for shadow VMCS.
Paolo
Hi Paolo,
On Wed, Jul 30, 2014 at 05:20:58PM +0200, Paolo Bonzini wrote:
>Il 30/07/2014 14:04, Wanpeng Li ha scritto:
>> @@ -7962,14 +7965,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>> if (!vmx->rdtscp_enabled)
>> exec_control &= ~SECONDARY_EXEC_RDTSCP;
>> /* Take the following fields only from vmcs12 */
>> - exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
>> - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
>> + exec_control &= ~(SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
>> SECONDARY_EXEC_APIC_REGISTER_VIRT);
>
>This change is wrong. You don't have to take L0's "virtualize APIC
>accesses" setting into account, because while running L2 you cannot
>modify L1's CR8 (only the virtual nested one).
>
Agreed.
>> +
>> + virtual_apic_page = nested_get_page(vcpu,
>> + vmcs12->virtual_apic_page_addr);
>> + if (vmcs_read64(VIRTUAL_APIC_PAGE_ADDR) !=
>> + page_to_phys(virtual_apic_page))
>> + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
>> + page_to_phys(virtual_apic_page));
>> + nested_release_page(virtual_apic_page);
>> +
>
>You cannot release this page here. You need to the exactly the same
>thing that is done for apic_access_page.
>
Agreed.
>One thing:
>
>> + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
>> + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
>
>I think you can just do this write unconditionally, since most
>hypervisors will enable this. Also, you probably can add the tpr
What will happen if a hypervisor doesn't enable it? I will make it
cleaner in version two.
>threshold field to the read-write fields for shadow VMCS.
>
Agreed.
Regards,
Wanpeng Li
>Paolo
>--
>To unsubscribe from this list: send the line "unsubscribe kvm" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
Il 31/07/2014 10:03, Wanpeng Li ha scritto:
>> One thing:
>>
>>> + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
>>> + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
>>
>> I think you can just do this write unconditionally, since most
>> hypervisors will enable this. Also, you probably can add the tpr
>
> What will happen if a hypervisor doesn't enable it? I make it more
> cleaner in version two.
TPR_THRESHOLD will likely be written as zero, but the processor will
never use it anyway. It's just a small optimization because
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) will almost always be true.
Paolo
>> threshold field to the read-write fields for shadow VMCS.
>
> Agreed.
>
> Regards,
> Wanpeng Li
Paolo Bonzini wrote on 2014-07-31:
> Il 31/07/2014 10:03, Wanpeng Li ha scritto:
>>> One thing:
>>>
>>>> + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
>>>> + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
>>>
>>> I think you can just do this write unconditionally, since most
>>> hypervisors will enable this. Also, you probably can add the tpr
>>
>> What will happen if a hypervisor doesn't enable it? I make it more
>> cleaner in version two.
>
> TPR_THRESHOLD will be likely written as zero, but the processor will
> never use it anyway. It's just a small optimization because
> nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) will almost always be true.
Theoretically, you are right. But we should not expect all VMMs to follow it. It is not worth violating the SDM just to save the cost of two or three instructions.
>
> Paolo
>
>>> threshold field to the read-write fields for shadow VMCS.
>>
>> Agreed.
>>
>> Regards,
>> Wanpeng Li
Best regards,
Yang