Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757124AbcCRLVa (ORCPT ); Fri, 18 Mar 2016 07:21:30 -0400 Received: from mail-wm0-f48.google.com ([74.125.82.48]:33631 "EHLO mail-wm0-f48.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751783AbcCRLVU (ORCPT ); Fri, 18 Mar 2016 07:21:20 -0400 Subject: Re: [PART1 RFC v3 06/12] KVM: x86: Detect and Initialize AVIC support To: Suravee Suthikulpanit , rkrcmar@redhat.com, joro@8bytes.org, bp@alien8.de, gleb@kernel.org, alex.williamson@redhat.com References: <1458281388-14452-1-git-send-email-Suravee.Suthikulpanit@amd.com> <1458281388-14452-7-git-send-email-Suravee.Suthikulpanit@amd.com> Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, wei@redhat.com, sherry.hurwitz@amd.com From: Paolo Bonzini Message-ID: <56EBE4A9.4080708@redhat.com> Date: Fri, 18 Mar 2016 12:21:13 +0100 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.6.0 MIME-Version: 1.0 In-Reply-To: <1458281388-14452-7-git-send-email-Suravee.Suthikulpanit@amd.com> Content-Type: text/plain; charset=windows-1252 Content-Transfer-Encoding: 8bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 13210 Lines: 450 On 18/03/2016 07:09, Suravee Suthikulpanit wrote: > This patch introduces AVIC-related data structure, and AVIC > initialization code. > > There are three main data structures for AVIC: > * Virtual APIC (vAPIC) backing page (per-VCPU) > * Physical APIC ID table (per-VM) > * Logical APIC ID table (per-VM) > > Currently, AVIC is disabled by default. Users can manually > enable AVIC via kernel boot option kvm-amd.avic=1 or during > kvm-amd module loading with parameter avic=1. > > Signed-off-by: Suravee Suthikulpanit > --- > arch/x86/include/asm/kvm_host.h | 8 ++ > arch/x86/include/asm/svm.h | 3 + > arch/x86/kvm/svm.c | 232 +++++++++++++++++++++++++++++++++++++++- > 3 files changed, 242 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 87eac2a..2698e40 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -754,6 +754,14 @@ struct kvm_arch { > > bool irqchip_split; > u8 nr_reserved_ioapic_pins; > + > + /* Struct members for AVIC */ > + struct hlist_node hnode; Unused, please remove. > + struct kvm *kvm; Not needed, please use container_of instead like struct kvm_arch *ka; ... struct kvm *kvm = container_of(ka, struct kvm, arch) > + u32 ldr_mode; > + u32 avic_tag; Where is avic_tag used? > + struct page *avic_log_apic_id_table_page; Remove apic_ if you want to keep the variable name shorter, but please spell logical and physical in full. > + struct page *avic_phy_apic_id_table_page; > }; > > struct kvm_vm_stat { > diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h > index 66e26a0..287e635 100644 > --- a/arch/x86/include/asm/svm.h > +++ b/arch/x86/include/asm/svm.h > @@ -116,6 +116,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { > #define V_INTR_MASKING_SHIFT 24 > #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) > > +#define AVIC_ENABLE_SHIFT 31 > +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) > + > #define SVM_INTERRUPT_SHADOW_MASK 1 > > #define SVM_IOIO_STR_SHIFT 2 > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c13a64b..621c948 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -14,6 +14,9 @@ > * the COPYING file in the top-level directory. > * > */ > + > +#define pr_fmt(fmt) "SVM: " fmt > + > #include > > #include "irq.h" > @@ -78,6 +81,11 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); > #define TSC_RATIO_MIN 0x0000000000000001ULL > #define TSC_RATIO_MAX 0x000000ffffffffffULL > > +#define AVIC_HPA_MASK ~((0xFFFULL << 52) || 0xFFF) > + > +/* NOTE: Current max index allowed for physical APIC ID table is 255 */ > +#define AVIC_PHY_APIC_ID_MAX 0xFF > + > static bool erratum_383_found __read_mostly; > > static const u32 host_save_user_msrs[] = { > @@ -162,8 +170,20 @@ struct vcpu_svm { > > /* cached guest cpuid flags for faster access */ > bool nrips_enabled : 1; > + > + struct page *avic_backing_page; > + u64 *avic_phy_apic_id_cache; Spelling same as above. > + bool avic_was_running; > }; > > +#define AVIC_LOG_APIC_ID__GUEST_PHY_APIC_ID_MSK (0xFF) MSK->MASK LOG_APIC->LOGICAL PHY_APIC->PHYSICAL > +#define AVIC_LOG_APIC_ID__VALID_MSK (1 << 31) > + > +#define AVIC_PHY_APIC_ID__HOST_PHY_APIC_ID_MSK (0xFFULL) > +#define AVIC_PHY_APIC_ID__BACKING_PAGE_MSK (0xFFFFFFFFFFULL << 12) > +#define AVIC_PHY_APIC_ID__IS_RUN_MSK (1ULL << 62) IS_RUN->IS_RUNNING > +#define AVIC_PHY_APIC_ID__VALID_MSK (1ULL << 63) > + > static DEFINE_PER_CPU(u64, current_tsc_ratio); > #define TSC_RATIO_DEFAULT 0x0100000000ULL > > @@ -205,6 +225,10 @@ module_param(npt, int, S_IRUGO); > static int nested = true; > module_param(nested, int, S_IRUGO); > > +/* enable / disable AVIC */ > +static int avic; > +module_param(avic, int, S_IRUGO); > + > static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); > static void svm_flush_tlb(struct kvm_vcpu *vcpu); > static void svm_complete_interrupts(struct vcpu_svm *svm); > @@ -234,6 +258,18 @@ enum { > /* TPR and CR2 are always written before VMRUN */ > #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) > > +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL > + > +static inline bool svm_vcpu_avic_enabled(struct vcpu_svm *svm) > +{ > + return (avic && (svm->vmcb->control.int_ctl & AVIC_ENABLE_MASK)); > +} > + > +static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) > +{ > + svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; > +} > + > static inline void mark_all_dirty(struct vmcb *vmcb) > { > vmcb->control.clean = 0; > @@ -923,6 +959,13 @@ static __init int svm_hardware_setup(void) > } else > kvm_disable_tdp(); > > + if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC))) > + avic = false; > + > + if (avic) { > + printk(KERN_INFO "kvm: AVIC enabled\n"); > + } > + > return 0; > > err: > @@ -1000,6 +1043,24 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) > mark_dirty(svm->vmcb, VMCB_INTERCEPTS); > } > > +static void avic_init_vmcb(struct vcpu_svm *svm) > +{ > + struct vmcb *vmcb = svm->vmcb; > + struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; > + phys_addr_t bpa = page_to_phys(svm->avic_backing_page); > + phys_addr_t lpa = page_to_phys(vm_data->avic_log_apic_id_table_page); > + phys_addr_t ppa = page_to_phys(vm_data->avic_phy_apic_id_table_page); > + > + if (!vmcb) > + return; > + > + vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; > + vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX; > + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; > +} > + > static void init_vmcb(struct vcpu_svm *svm) > { > struct vmcb_control_area *control = &svm->vmcb->control; > @@ -1113,6 +1174,139 @@ static void init_vmcb(struct vcpu_svm *svm) > mark_all_dirty(svm->vmcb); > > enable_gif(svm); > + > + if (avic) > + avic_init_vmcb(svm); > +} > + > +static u64 *avic_get_phy_apic_id_entry(struct kvm_vcpu *vcpu, int index) > +{ > + u64 *avic_phy_ait; u64 *avic_physical_id_table; > + struct kvm_arch *vm_data = &vcpu->kvm->arch; > + > + /* Note: APIC ID = 0xff is used for broadcast. > + * APIC ID > 0xff is reserved. > + */ > + if (index >= 0xff) > + return NULL; > + > + avic_phy_ait = page_address(vm_data->avic_phy_apic_id_table_page); > + > + return &avic_phy_ait[index]; > +} > + > +static int avic_init_backing_page(struct kvm_vcpu *vcpu) > +{ > + u64 *entry, new_entry; > + int id = vcpu->vcpu_id; > + struct vcpu_svm *svm = to_svm(vcpu); > + > + if (id >= AVIC_PHY_APIC_ID_MAX) > + return -EINVAL; > + > + if (!svm->vcpu.arch.apic->regs) > + return -EINVAL; > + > + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); > + > + avic_init_vmcb(svm); > + > + /* Setting AVIC backing page address in the phy APIC ID table */ > + entry = avic_get_phy_apic_id_entry(vcpu, id); > + if (!entry) > + return -EINVAL; > + > + new_entry = READ_ONCE(*entry); > + new_entry = (page_to_phys(svm->avic_backing_page) & > + AVIC_PHY_APIC_ID__BACKING_PAGE_MSK) | > + AVIC_PHY_APIC_ID__VALID_MSK; > + WRITE_ONCE(*entry, new_entry); > + > + svm->avic_phy_apic_id_cache = entry; > + > + return 0; > +} > + > +static void avic_vm_uninit(struct kvm *kvm) > +{ > + unsigned long flags; > + struct kvm_arch *vm_data = &kvm->arch; > + > + if (vm_data->avic_log_apic_id_table_page) > + __free_page(vm_data->avic_log_apic_id_table_page); > + if (vm_data->avic_phy_apic_id_table_page) > + __free_page(vm_data->avic_phy_apic_id_table_page); > +} > + > +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + > + if (!avic) > + return; > + > + if (svm->avic_phy_apic_id_cache) > + svm->avic_phy_apic_id_cache = NULL; > +} > + > +static atomic_t avic_tag_gen = ATOMIC_INIT(1); > + > +static inline u32 avic_get_next_tag(void) > +{ > + u32 tag = atomic_read(&avic_tag_gen); > + > + atomic_inc(&avic_tag_gen); > + return tag; This should be atomic_inc_return, otherwise it's not thread-safe. > +} > + > +static int avic_vm_init(struct kvm *kvm) > +{ > + int err = -ENOMEM; > + struct kvm_arch *vm_data = &kvm->arch; > + struct page *p_page; > + struct page *l_page; > + > + /** > + * Note: > + * AVIC hardware walks the nested page table to check permissions, > + * but does not use the SPA address specified in the leaf page > + * table entry since it uses address in the AVIC_BACKING_PAGE pointer > + * field of the VMCB. Therefore, we set up the > + * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. > + */ > + mutex_lock(&kvm->slots_lock); > + err = __x86_set_memory_region(kvm, > + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + APIC_DEFAULT_PHYS_BASE, > + PAGE_SIZE); > + mutex_unlock(&kvm->slots_lock); Please use x86_set_memory_region, since you don't have to do anything else while the lock is taken. Paolo > + if (err) > + goto free_avic; > + kvm->arch.apic_access_page_done = true; > + > + /* Allocating physical APIC ID table (4KB) */ > + p_page = alloc_page(GFP_KERNEL); > + if (!p_page) > + goto free_avic; > + > + vm_data->avic_phy_apic_id_table_page = p_page; > + clear_page(page_address(p_page)); > + > + /* Allocating logical APIC ID table (4KB) */ > + l_page = alloc_page(GFP_KERNEL); > + if (!l_page) > + goto free_avic; > + > + vm_data->avic_log_apic_id_table_page = l_page; > + clear_page(page_address(l_page)); > + > + vm_data->avic_tag = avic_get_next_tag(); > + > + return 0; > + > +free_avic: > + avic_vm_uninit(kvm); > + return err; > } > > static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > @@ -1131,6 +1325,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); > kvm_register_write(vcpu, VCPU_REGS_RDX, eax); > + > + if (svm_vcpu_avic_enabled(svm) && !init_event) > + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); > } > > static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > @@ -1169,6 +1366,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > if (!hsave_page) > goto free_page3; > > + if (avic) { > + err = avic_init_backing_page(&svm->vcpu); > + if (err) { > + avic_vcpu_uninit(&svm->vcpu); > + goto free_page4; > + } > + } > + > svm->nested.hsave = page_address(hsave_page); > > svm->msrpm = page_address(msrpm_pages); > @@ -1187,6 +1392,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > > return &svm->vcpu; > > +free_page4: > + __free_page(hsave_page); > free_page3: > __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); > free_page2: > @@ -1209,6 +1416,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) > __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); > __free_page(virt_to_page(svm->nested.hsave)); > __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); > + avic_vcpu_uninit(vcpu); > kvm_vcpu_uninit(vcpu); > kmem_cache_free(kvm_vcpu_cache, svm); > } > @@ -3371,6 +3579,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) > pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); > pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); > pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); > + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); > pr_err("%-20s%08x\n", "event_inj:", control->event_inj); > pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); > pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); > @@ -3602,11 +3811,27 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) > > static bool svm_get_enable_apicv(void) > { > - return false; > + return avic; > } > > +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) > +{ > +} > + > +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) > +{ > +} > + > +/* Note: Currently only used by Hyper-V. */ > static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) > { > + struct vcpu_svm *svm = to_svm(vcpu); > + struct vmcb *vmcb = svm->vmcb; > + > + if (!avic) > + return; > + > + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; > } > > static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) > @@ -4318,6 +4543,9 @@ static struct kvm_x86_ops svm_x86_ops = { > .vcpu_free = svm_free_vcpu, > .vcpu_reset = svm_vcpu_reset, > > + .vm_init = avic_vm_init, > + .vm_uninit = avic_vm_uninit, > + > .prepare_guest_switch = svm_prepare_guest_switch, > .vcpu_load = svm_vcpu_load, > .vcpu_put = svm_vcpu_put, > @@ -4375,6 +4603,8 @@ static struct kvm_x86_ops svm_x86_ops = { > .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, > .load_eoi_exitmap = svm_load_eoi_exitmap, > .sync_pir_to_irr = svm_sync_pir_to_irr, > + .hwapic_irr_update = svm_hwapic_irr_update, > + .hwapic_isr_update = svm_hwapic_isr_update, > > .set_tss_addr = svm_set_tss_addr, > .get_tdp_level = get_npt_level, >