Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753418AbcCGQld (ORCPT ); Mon, 7 Mar 2016 11:41:33 -0500 Received: from mail-wm0-f50.google.com ([74.125.82.50]:34825 "EHLO mail-wm0-f50.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753236AbcCGQlW (ORCPT ); Mon, 7 Mar 2016 11:41:22 -0500 Subject: Re: [PART1 RFC v2 05/10] KVM: x86: Detect and Initialize AVIC support To: Suravee Suthikulpanit , rkrcmar@redhat.com, joro@8bytes.org, bp@alien8.de, gleb@kernel.org, alex.williamson@redhat.com References: <1457124368-2025-1-git-send-email-Suravee.Suthikulpanit@amd.com> <1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com> Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, wei@redhat.com, sherry.hurwitz@amd.com From: Paolo Bonzini Message-ID: <56DDAF2D.3060703@redhat.com> Date: Mon, 7 Mar 2016 17:41:17 +0100 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.6.0 MIME-Version: 1.0 In-Reply-To: <1457124368-2025-6-git-send-email-Suravee.Suthikulpanit@amd.com> Content-Type: text/plain; charset=windows-1252 Content-Transfer-Encoding: 8bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12682 Lines: 462 On 04/03/2016 21:46, Suravee Suthikulpanit wrote: > @@ -162,6 +170,37 @@ struct vcpu_svm { > > /* cached guest cpuid flags for faster access */ > bool nrips_enabled : 1; > + > + struct page *avic_bk_page; > + void *in_kernel_lapic_regs; > +}; > + > +struct __attribute__ ((__packed__)) > +svm_avic_log_ait_entry { > + u32 guest_phy_apic_id : 8, > + res : 23, > + valid : 1; > +}; > + > +struct __attribute__ ((__packed__)) > +svm_avic_phy_ait_entry { > + u64 host_phy_apic_id : 8, > + res1 : 4, > + bk_pg_ptr : 40, > + res2 : 10, > + is_running : 1, > + valid : 1; > +}; Please don't use bitfields. 
> +/* Note: This structure is per VM */ > +struct svm_vm_data { > + atomic_t count; > + u32 ldr_mode; > + u32 avic_max_vcpu_id; > + u32 avic_tag; > + > + struct page *avic_log_ait_page; > + struct page *avic_phy_ait_page; You can put these directly in kvm_arch. Do not use abbreviations: struct page *avic_logical_apic_id_table_page; struct page *avic_physical_apic_id_table_page; > @@ -1000,6 +1057,41 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) > mark_dirty(svm->vmcb, VMCB_INTERCEPTS); > } > > +static void avic_init_vmcb(struct vcpu_svm *svm) > +{ > + void *vapic_bkpg; > + struct vmcb *vmcb = svm->vmcb; > + struct svm_vm_data *vm_data = svm->vcpu.kvm->arch.arch_data; > + phys_addr_t bpa = PFN_PHYS(page_to_pfn(svm->avic_bk_page)); > + phys_addr_t lpa = PFN_PHYS(page_to_pfn(vm_data->avic_log_ait_page)); > + phys_addr_t ppa = PFN_PHYS(page_to_pfn(vm_data->avic_phy_ait_page)); Please use page_to_phys. > + if (!vmcb) > + return; > + > + pr_debug("%s: bpa=%#llx, lpa=%#llx, ppa=%#llx\n", > + __func__, (unsigned long long)bpa, > + (unsigned long long)lpa, (unsigned long long)ppa); No pr_debug. > + > + /* > + * Here we copy the value from the emulated LAPIC register > + * to the vAPIC backign page, and then flip regs frame. > + */ > + if (!svm->in_kernel_lapic_regs) > + svm->in_kernel_lapic_regs = svm->vcpu.arch.apic->regs; > + > + vapic_bkpg = pfn_to_kaddr(page_to_pfn(svm->avic_bk_page)); Please use kmap instead. > + memcpy(vapic_bkpg, svm->in_kernel_lapic_regs, PAGE_SIZE); > + svm->vcpu.arch.apic->regs = vapic_bkpg; Can you explain the flipping logic, and why you cannot just use the existing apic.regs? > + vmcb->control.avic_bk_page = bpa & AVIC_HPA_MASK; No abbreviations ("avic_backing_page"). 
> + vmcb->control.avic_log_apic_id = lpa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id = ppa & AVIC_HPA_MASK; > + vmcb->control.avic_phy_apic_id |= AVIC_PHY_APIC_ID_MAX; > + vmcb->control.avic_enable = 1; > +} > + > static void init_vmcb(struct vcpu_svm *svm) > { > struct vmcb_control_area *control = &svm->vmcb->control; > @@ -1113,6 +1205,293 @@ static void init_vmcb(struct vcpu_svm *svm) > mark_all_dirty(svm->vmcb); > > enable_gif(svm); > + > + if (avic) > + avic_init_vmcb(svm); > +} > + > +static struct svm_avic_phy_ait_entry * > +avic_get_phy_ait_entry(struct kvm_vcpu *vcpu, int index) > +{ > + struct svm_avic_phy_ait_entry *avic_phy_ait; > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + > + if (!vm_data) > + return NULL; > + > + /* Note: APIC ID = 0xff is used for broadcast. > + * APIC ID > 0xff is reserved. > + */ > + if (index >= 0xff) > + return NULL; > + > + avic_phy_ait = page_address(vm_data->avic_phy_ait_page); > + > + return &avic_phy_ait[index]; > +} > + > +struct svm_avic_log_ait_entry * > +avic_get_log_ait_entry(struct kvm_vcpu *vcpu, u8 mda, bool is_flat) > +{ > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + int index; > + struct svm_avic_log_ait_entry *avic_log_ait; > + > + if (!vm_data) > + return NULL; > + > + if (is_flat) { /* flat */ > + if (mda > 7) > + return NULL; > + index = mda; > + } else { /* cluster */ > + int apic_id = mda & 0xf; > + int cluster_id = (mda & 0xf0) >> 8; > + > + if (apic_id > 4 || cluster_id >= 0xf) > + return NULL; > + index = (cluster_id << 2) + apic_id; > + } > + avic_log_ait = (struct svm_avic_log_ait_entry *) > + page_address(vm_data->avic_log_ait_page); > + > + return &avic_log_ait[index]; > +} Instead of these functions, create a complete function to handle APIC_ID and APIC_LDR writes. Then use kmap/kunmap instead of page_address. 
> +static inline void > +avic_set_bk_page_entry(struct vcpu_svm *svm, int reg_off, u32 val) > +{ > + void *avic_bk = page_address(svm->avic_bk_page); > + > + *((u32 *) (avic_bk + reg_off)) = val; > +} Unused function. > +static inline u32 *avic_get_bk_page_entry(struct vcpu_svm *svm, u32 offset) > +{ > + char *tmp = (char *)page_address(svm->avic_bk_page); > + > + return (u32 *)(tmp+offset); > +} I think you should be able to use kvm_apic_get_reg instead. > +static int avic_init_bk_page(struct kvm_vcpu *vcpu) No abbreviations (but again, please try reusing apic.regs). > +static int avic_alloc_bk_page(struct vcpu_svm *svm, int id) > +{ > + int ret = 0, i; > + bool realloc = false; > + struct kvm_vcpu *vcpu; > + struct kvm *kvm = svm->vcpu.kvm; > + struct svm_vm_data *vm_data = kvm->arch.arch_data; > + > + mutex_lock(&kvm->slots_lock); > + > + /* Check if we have already allocated vAPIC backing > + * page for this vCPU. If not, we need to realloc > + * a new one and re-assign all other vCPU. > + */ > + if (kvm->arch.apic_access_page_done && > + (id > vm_data->avic_max_vcpu_id)) { > + kvm_for_each_vcpu(i, vcpu, kvm) > + avic_unalloc_bk_page(vcpu); > + > + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + 0, 0); > + realloc = true; > + vm_data->avic_max_vcpu_id = 0; > + } > + > + /* > + * We are allocating vAPIC backing page > + * upto the max vCPU ID > + */ > + if (id >= vm_data->avic_max_vcpu_id) { > + ret = __x86_set_memory_region(kvm, > + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, > + APIC_DEFAULT_PHYS_BASE, > + PAGE_SIZE * (id + 1)); Why is this necessary? The APIC access page is a peculiarity of Intel processors (and the special memslot for it only needs to map 0xfee00000 to 0xfee00fff; after that there is the MSI area). > + if (ret) > + goto out; > + > + vm_data->avic_max_vcpu_id = id; > + } > + > + /* Reinit vAPIC backing page for exisinting vcpus */ > + if (realloc) > + kvm_for_each_vcpu(i, vcpu, kvm) > + avic_init_bk_page(vcpu); Why is this necessary? 
> + avic_init_bk_page(&svm->vcpu); > + > + kvm->arch.apic_access_page_done = true; > + > +out: > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +static void avic_vm_uninit(struct kvm *kvm) > +{ > + struct svm_vm_data *vm_data = kvm->arch.arch_data; > + > + if (!vm_data) > + return; > + > + if (vm_data->avic_log_ait_page) > + __free_page(vm_data->avic_log_ait_page); > + if (vm_data->avic_phy_ait_page) > + __free_page(vm_data->avic_phy_ait_page); > + kfree(vm_data); > + kvm->arch.arch_data = NULL; > +} > + > +static void avic_vcpu_uninit(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_svm *svm = to_svm(vcpu); > + struct svm_vm_data *vm_data = vcpu->kvm->arch.arch_data; > + > + if (!avic) > + return; > + > + avic_unalloc_bk_page(vcpu); > + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; > + svm->in_kernel_lapic_regs = NULL; > + > + if (vm_data && > + (atomic_read(&vm_data->count) == 0 || > + atomic_dec_and_test(&vm_data->count))) > + avic_vm_uninit(vcpu->kvm); Add a new kvm_x86_ops callback .free_vcpus and call it from kvm_free_vcpus; then count is not necessary. We probably should use it for Intel too. The calls to x86_set_memory_region in kvm_arch_destroy_vm are hideous. > +} > + > +static atomic_t avic_tag_gen = ATOMIC_INIT(1); > + > +static inline u32 avic_get_next_tag(void) > +{ > + u32 tag = atomic_read(&avic_tag_gen); > + > + atomic_inc(&avic_tag_gen); > + return tag; > +} > + > +static int avic_vm_init(struct kvm *kvm) > +{ > + int err = -ENOMEM; > + struct svm_vm_data *vm_data; > + struct page *avic_phy_ait_page; > + struct page *avic_log_ait_page; This is probably also best moved to a new callback .init_vm (called from kvm_arch_init_vm). 
> + vm_data = kzalloc(sizeof(struct svm_vm_data), > + GFP_KERNEL); > + if (!vm_data) > + return err; > + > + kvm->arch.arch_data = vm_data; > + atomic_set(&vm_data->count, 0); > + > + /* Allocating physical APIC ID table (4KB) */ > + avic_phy_ait_page = alloc_page(GFP_KERNEL); > + if (!avic_phy_ait_page) > + goto free_avic; > + > + vm_data->avic_phy_ait_page = avic_phy_ait_page; > + clear_page(page_address(avic_phy_ait_page)); You can use get_zeroed_page and free_page, instead of alloc_page/clear_page/__free_page. Thanks, Paolo > + /* Allocating logical APIC ID table (4KB) */ > + avic_log_ait_page = alloc_page(GFP_KERNEL); > + if (!avic_log_ait_page) > + goto free_avic; > + > + vm_data->avic_log_ait_page = avic_log_ait_page; > + clear_page(page_address(avic_log_ait_page)); > + > + vm_data->avic_tag = avic_get_next_tag(); > + > + return 0; > + > +free_avic: > + avic_vm_uninit(kvm); > + return err; > +} > + > +static int avic_vcpu_init(struct kvm *kvm, struct vcpu_svm *svm, int id) > +{ > + int err; > + struct svm_vm_data *vm_data = NULL; > + > + /* Note: svm_vm_data is per VM */ > + if (!kvm->arch.arch_data) { > + err = avic_vm_init(kvm); > + if (err) > + return err; > + } > + > + err = avic_alloc_bk_page(svm, id); > + if (err) { > + avic_vcpu_uninit(&svm->vcpu); > + return err; > + } > + > + vm_data = kvm->arch.arch_data; > + atomic_inc(&vm_data->count); > + > + return 0; > } > > static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > @@ -1131,6 +1510,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); > kvm_register_write(vcpu, VCPU_REGS_RDX, eax); > + > + if (avic && !init_event) > + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); > } > > static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > @@ -1169,6 +1551,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > if (!hsave_page) > goto free_page3; > > + if (avic) { > + err = 
avic_vcpu_init(kvm, svm, id); > + if (err) > + goto free_page4; > + } > + > svm->nested.hsave = page_address(hsave_page); > > svm->msrpm = page_address(msrpm_pages); > @@ -1187,6 +1575,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) > > return &svm->vcpu; > > +free_page4: > + __free_page(hsave_page); > free_page3: > __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); > free_page2: > @@ -1209,6 +1599,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) > __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); > __free_page(virt_to_page(svm->nested.hsave)); > __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); > + avic_vcpu_uninit(vcpu); > kvm_vcpu_uninit(vcpu); > kmem_cache_free(kvm_vcpu_cache, svm); > } > @@ -3382,6 +3773,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) > pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); > pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); > pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); > + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); > pr_err("%-20s%08x\n", "event_inj:", control->event_inj); > pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); > pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); > @@ -3613,11 +4005,31 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) > > static bool svm_get_enable_apicv(void) > { > - return false; > + return avic; > +} > + > +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) > +{ > } > > +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) > +{ > +} > + > +/* Note: Currently only used by Hyper-V. 
*/ > static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) > { > + struct vcpu_svm *svm = to_svm(vcpu); > + struct vmcb *vmcb = svm->vmcb; > + > + if (!avic) > + return; > + > + if (!svm->in_kernel_lapic_regs) > + return; > + > + svm->vcpu.arch.apic->regs = svm->in_kernel_lapic_regs; > + vmcb->control.avic_enable = 0; > } > > static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) > @@ -4387,6 +4799,8 @@ static struct kvm_x86_ops svm_x86_ops = { > .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, > .load_eoi_exitmap = svm_load_eoi_exitmap, > .sync_pir_to_irr = svm_sync_pir_to_irr, > + .hwapic_irr_update = svm_hwapic_irr_update, > + .hwapic_isr_update = svm_hwapic_isr_update, > > .set_tss_addr = svm_set_tss_addr, > .get_tdp_level = get_npt_level, >