Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753418AbcCGQld (ORCPT ); Mon, 7 Mar 2016 11:41:33 -0500 Received: from mail-wm0-f50.google.com ([74.125.82.50]:34825 "EHLO mail-wm0-f50.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753236AbcCGQlW (ORCPT ); Mon, 7 Mar 2016 11:41:22 -0500 Subject: Re: [PART1 RFC v2 05/10] KVM: x86: Detect and Initialize AVIC support To: Suravee Suthikulpanit , rkrcmar@redhat.com, joro@8bytes.org, bp@alien8.de, gleb@kernel.org, alex.williamson@redhat.com References: <1457124368-2025-1-git-send-email-Suravee.Suthikulpanit@amd.com> <1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com> Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, wei@redhat.com, sherry.hurwitz@amd.com From: Paolo Bonzini Message-ID: <56DDAF2D.3060703@redhat.com> Date: Mon, 7 Mar 2016 17:41:17 +0100 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.6.0 MIME-Version: 1.0 In-Reply-To: <1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com> Content-Type: text/plain; charset=windows-1252 Content-Transfer-Encoding: 8bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12682 Lines: 462 On 04/03/2016 21:46, Suravee Suthikulpanit wrote: > @@ -162,6 +170,37 @@ struct vcpu_svm { > > /* cached guest cpuid flags for faster access */ > bool nrips_enabled : 1; > + > + struct page *avic_bk_page; > + void *in_kernel_lapic_regs; > +}; > + > +struct __attribute__ ((__packed__)) > +svm_avic_log_ait_entry { > + u32 guest_phy_apic_id : 8, > + res : 23, > + valid : 1; > +}; > + > +struct __attribute__ ((__packed__)) > +svm_avic_phy_ait_entry { > + u64 host_phy_apic_id : 8, > + res1 : 4, > + bk_pg_ptr : 40, > + res2 : 10, > + is_running : 1, > + valid : 1; > +}; Please don't use bitfields. 
> +/* Note: This structure is per VM */ > +struct svm_vm_data { > + atomic_t count; > + u32 ldr_mode; > + u32 avic_max_vcpu_id; > + u32 avic_tag; > + > + struct page *avic_log_ait_page; > + struct page *avic_phy_ait_page; You can put these directly in kvm_arch. Do not use abbreviations: struct page *avic_logical_apic_id_table_page; struct page *avic_physical_apic_id_table_page; > @@ -1000,6 +1057,41 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) > mark_dirty(svm->vmcb, VMCB_INTERCEPTS); > } > > +static void avic_init_vmcb(struct vcpu_svm *svm) > +{ > + void *vapic_bkpg; > + struct vmcb *vmcb = svm->vmcb; > + struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data; > + phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page)); > + phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page)); > + phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page)); Please use page_to_phys. > + if (!vmcb) > + return; > + > + pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n", > + __func__, (unsigned long long)bpa, > + (unsigned long long)lpa, (unsigned long long)ppa); No pr_debug. > + > + /* > + * Here we copy the value from the emulated LAPIC register > + * to the vAPIC backign page, and then flip regs frame. > + */ > + if (!svm->in_kernel_lapic_regs) > + svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs; > + > + vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page)); Please use kmap instead. > + memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE); > + svm->vcpu.arch.apic->regs = vapic_bkpg; Can you explain the flipping logic, and why you cannot just use the existing apic.regs? > + vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK; No abbreviations ("avic_backing_page"). 
> + vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX; > + vmcb->control.avic_enable = 1; > +} > + > static void init_vmcb(struct vcpu_svm *svm) > { > struct vmcb_control_area *control = &svm->vmcb->control; > @@ -1113,6 +1205,293 @@ static void init_vmcb(struct vcpu_svm *svm) > mark_all_dirty(svm->vmcb); > > enable_gif(svm); > + > + if (avic) > + avic_init_vmcb(svm); > +} > + > +static struct svm_avic_phy_ait_entry * > +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index) > +{ > + struct svm_avic_phy_ait_entry *avic_phy_ait; > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + > + if (!vm_data) > + return NULL; > + > + /* Note: APIC ID = 0xff is used for broadcast. > + * APIC ID > 0xff is reserved. > + */ > + if (index >= 0xff) > + return NULL; > + > + avic_phy_ait = page_address(vm_data->avic_phy_ait_page); > + > + return &avic_phy_ait[index]; > +} > + > +struct svm_avic_log_ait_entry * > +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat) > +{ > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + int index; > + struct svm_avic_log_ait_entry *avic_log_ait; > + > + if (!vm_data) > + return NULL; > + > + if (is_flat) { /* flat */ > + if (mda > 7) > + return NULL; > + index = mda; > + } else { /* cluster */ > + int apic_id = mda & 0xf; > + int cluster_id = (mda & 0xf0) >> 8; > + > + if (apic_id > 4 || cluster_id >= 0xf) > + return NULL; > + index = (cluster_id << 2) + apic_id; > + } > + avic_log_ait = (struct svm_avic_log_ait_entry *) > + page_address(vm_data->avic_log_ait_page); > + > + return &avic_log_ait[index]; > +} Instead of these functions, create a complete function to handle APIC_ID and APIC_LDR writes. Then use kmap/kunmap instead of page_address. 
> +static inline void > +avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val) > +{ > + void *avic_bk = page_address(svm->avic_bk_page); > + > + *((u32 *) (avic_bk + reg_off)) = val; > +} Unused function. > +static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset) > +{ > + char *tmp = (char *)page_address(svm->avic_bk_page); > + > + return (u32 *)(tmp+offset); > +} I think you should be able to use kvm_apic_get_reg instead. > +static int avic_init_bk_page(struct kvm_vcpu *vcpu) No abbreviations (but again, please try reusing apic.regs). > +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id) > +{ > + int ret = 0, i; > + bool realloc = false; > + struct kvm_vcpu *vcpu; > + struct kvm *kvm = svm->vcpu.kvm; > + struct svm_vm_data *vm_data = kvm->arch.arch_data; > + > + mutex_lock(&kvm->slots_lock); > + > + /* Check if we have already allocated vAPIC backing > + * page for this vCPU. If not, we need to realloc > + * a new one and re-assign all other vCPU. > + */ > + if (kvm->arch.apic_access_page_done && > + (id > vm_data->avic_max_vcpu_id)) { > + kvm_for_each_vcpu(i, vcpu, kvm) > + avic_unalloc_bk_page(vcpu); > + > + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + 0, 0); > + realloc = true; > + vm_data->avic_max_vcpu_id = 0; > + } > + > + /* > + * We are allocating vAPIC backing page > + * upto the max vCPU ID > + */ > + if (id >= vm_data->avic_max_vcpu_id) { > + ret = __x86_set_memory_region(kvm, > + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + APIC_DEFAULT_PHYS_BASE, > + PAGE_SIZE * (id + 1)); Why is this necessary? The APIC access page is a peculiarity of Intel processors (and the special memslot for it only needs to map 0xfee00000 to 0xfee00fff; after that there is the MSI area). > + if (ret) > + goto out; > + > + vm_data->avic_max_vcpu_id = id; > + } > + > + /* Reinit vAPIC backing page for exisinting vcpus */ > + if (realloc) > + kvm_for_each_vcpu(i, vcpu, kvm) > + avic_init_bk_page(vcpu); Why is this necessary? 
> + avic_init_bk_page(&svm->vcpu); > + > + kvm->arch.apic_access_page_done = true; > + > +out: > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +static void avic_vm_uninit(struct kvm *kvm) > +{ > + struct svm_vm_data *vm_data = kvm->arch.arch_data; > + > + if (!vm_data) > + return; > + > + if (vm_data->avic_log_ait_page) > + __free_page(vm_data->avic_log_ait_page); > + if (vm_data->avic_phy_ait_page) > + __free_page(vm_data->avic_phy_ait_page); > + kfree(vm_data); > + kvm->arch.arch_data = NULL; > +} > + > +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + > + if (!avic) > + return; > + > + avic_unalloc_bk_page(vcpu); > + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; > + svm->in_kernel_lapic_regs = NULL; > + > + if (vm_data && > + (atomic_read(&vm_data->count) == 0 || > + atomic_dec_and_test(&vm_data->count))) > + avic_vm_uninit(vcpu->kvm); Add a new kvm_x86_ops callback .free_vcpus and call it from kvm_free_vcpus; then count is not necessary. We probably should use it for Intel too. The calls to x86_set_memory_region in kvm_arch_destroy_vm are hideous. > +} > + > +static atomic_t avic_tag_gen = ATOMIC_INIT(1); > + > +static inline u32 avic_get_next_tag(void) > +{ > + u32 tag = atomic_read(&avic_tag_gen); > + > + atomic_inc(&avic_tag_gen); > + return tag; > +} > + > +static int avic_vm_init(struct kvm *kvm) > +{ > + int err = -ENOMEM; > + struct svm_vm_data *vm_data; > + struct page *avic_phy_ait_page; > + struct page *avic_log_ait_page; This is probably also best moved to a new callback .init_vm (called from kvm_arch_init_vm). 
> + vm_data = kzalloc(sizeof(struct svm_vm_data), > + GFP_KERNEL); > + if (!vm_data) > + return err; > + > + kvm->arch.arch_data = vm_data; > + atomic_set(&vm_data->count, 0); > + > + /* Allocating physical APIC ID table (4KB) */ > + avic_phy_ait_page = alloc_page(GFP_KERNEL); > + if (!avic_phy_ait_page) > + goto free_avic; > + > + vm_data->avic_phy_ait_page = avic_phy_ait_page; > + clear_page(page_address(avic_phy_ait_page)); You can use get_zeroed_page and free_page, instead of alloc_page/clear_page/__free_page. Thanks, Paolo > + /* Allocating logical APIC ID table (4KB) */ > + avic_log_ait_page = alloc_page(GFP_KERNEL); > + if (!avic_log_ait_page) > + goto free_avic; > + > + vm_data->avic_log_ait_page = avic_log_ait_page; > + clear_page(page_address(avic_log_ait_page)); > + > + vm_data->avic_tag = avic_get_next_tag(); > + > + return 0; > + > +free_avic: > + avic_vm_uninit(kvm); > + return err; > +} > + > +static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id) > +{ > + int err; > + struct svm_vm_data *vm_data = NULL; > + > + /* Note: svm_vm_data is per VM */ > + if (!kvm->arch.arch_data) { > + err = avic_vm_init(kvm); > + if (err) > + return err; > + } > + > + err = avic_alloc_bk_page(svm, id); > + if (err) { > + avic_vcpu_uninit(&svm->vcpu); > + return err; > + } > + > + vm_data = kvm->arch.arch_data; > + atomic_inc(&vm_data->count); > + > + return 0; > } > > static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > @@ -1131,6 +1510,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); > kvm_register_write(vcpu, VCPU_REGS_RDX, eax); > + > + if (avic && !init_event) > + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); > } > > static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > @@ -1169,6 +1551,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > if (!hsave_page) > goto free_page3; > > + if (avic) { > + err = 
avic_vcpu_init(kvm, svm, id); > + if (err) > + goto free_page4; > + } > + > svm->nested.hsave = page_address(hsave_page); > > svm->msrpm = page_address(msrpm_pages); > @@ -1187,6 +1575,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > > return &svm->vcpu; > > +free_page4: > + __free_page(hsave_page); > free_page3: > __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); > free_page2: > @@ -1209,6 +1599,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) > __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); > __free_page(virt_to_page(svm->nested.hsave)); > __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); > + avic_vcpu_uninit(vcpu); > kvm_vcpu_uninit(vcpu); > kmem_cache_free(kvm_vcpu_cache, svm); > } > @@ -3382,6 +3773,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) > pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); > pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); > pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); > + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); > pr_err("%-20s%08x\n", "event_inj:", control->event_inj); > pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); > pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); > @@ -3613,11 +4005,31 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) > > static bool svm_get_enable_apicv(void) > { > - return false; > + return avic; > +} > + > +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) > +{ > } > > +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) > +{ > +} > + > +/* Note: Currently only used by Hyper-V. 
*/ > static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) > { > + struct vcpu_svm *svm = to_svm(vcpu); > + struct vmcb *vmcb = svm->vmcb; > + > + if (!avic) > + return; > + > + if (!svm->in_kernel_lapic_regs) > + return; > + > + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; > + vmcb->control.avic_enable = 0; > } > > static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) > @@ -4387,6 +4799,8 @@ static struct kvm_x86_ops svm_x86_ops = { > .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, > .load_eoi_exitmap = svm_load_eoi_exitmap, > .sync_pir_to_irr = svm_sync_pir_to_irr, > + .hwapic_irr_update = svm_hwapic_irr_update, > + .hwapic_isr_update = svm_hwapic_isr_update, > > .set_tss_addr = svm_set_tss_addr, > .get_tdp_level = get_npt_level, >