2021-11-22 17:58:30

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 0/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Commit 63f5a1909f9e ("KVM: x86: Alert userspace that KVM_SET_CPUID{,2}
after KVM_RUN is broken") officially deprecated KVM_SET_CPUID{,2} ioctls
after first successful KVM_RUN and promissed to make this sequence forbiden
in 5.16. TO fulfil the promise 'hyperv_features' selftest needs to be fixed
first.

Vitaly Kuznetsov (2):
KVM: selftests: Avoid KVM_SET_CPUID2 after KVM_RUN in hyperv_features
test
KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

arch/x86/kvm/mmu/mmu.c | 20 +--
arch/x86/kvm/x86.c | 27 ++++
.../selftests/kvm/x86_64/hyperv_features.c | 140 +++++++++---------
3 files changed, 101 insertions(+), 86 deletions(-)

--
2.33.1



2021-11-22 17:58:34

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 1/2] KVM: selftests: Avoid KVM_SET_CPUID2 after KVM_RUN in hyperv_features test

hyperv_features's sole purpose is to test access to various Hyper-V MSRs
and hypercalls with different CPUID data. As KVM_SET_CPUID2 after KVM_RUN
is deprecated and soon-to-be forbidden, avoid it by re-creating test VM
for each sub-test.

Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
.../selftests/kvm/x86_64/hyperv_features.c | 140 +++++++++---------
1 file changed, 71 insertions(+), 69 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
index 91d88aaa9899..672915ce73d8 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -165,10 +165,10 @@ static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid,
vcpu_set_cpuid(vm, VCPU_ID, cpuid);
}

-static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
- struct kvm_cpuid2 *best)
+static void guest_test_msrs_access(void)
{
struct kvm_run *run;
+ struct kvm_vm *vm;
struct ucall uc;
int stage = 0, r;
struct kvm_cpuid_entry2 feat = {
@@ -180,11 +180,34 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
struct kvm_cpuid_entry2 dbg = {
.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
};
- struct kvm_enable_cap cap = {0};
-
- run = vcpu_state(vm, VCPU_ID);
+ struct kvm_cpuid2 *best;
+ vm_vaddr_t msr_gva;
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
+ .args = {1}
+ };
+ struct msr_data *msr;

while (true) {
+ vm = vm_create_default(VCPU_ID, 0, guest_msr);
+
+ msr_gva = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
+ msr = addr_gva2hva(vm, msr_gva);
+
+ vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
+ vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+ best = kvm_get_supported_hv_cpuid();
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+
+ run = vcpu_state(vm, VCPU_ID);
+
switch (stage) {
case 0:
/*
@@ -315,6 +338,7 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
* capability enabled and guest visible CPUID bit unset.
*/
cap.cap = KVM_CAP_HYPERV_SYNIC2;
+ cap.args[0] = 0;
vcpu_enable_cap(vm, VCPU_ID, &cap);
break;
case 22:
@@ -461,9 +485,9 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,

switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
- TEST_ASSERT(uc.args[1] == stage,
- "Unexpected stage: %ld (%d expected)\n",
- uc.args[1], stage);
+ TEST_ASSERT(uc.args[1] == 0,
+ "Unexpected stage: %ld (0 expected)\n",
+ uc.args[1]);
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -474,13 +498,14 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
}

stage++;
+ kvm_vm_free(vm);
}
}

-static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall,
- void *input, void *output, struct kvm_cpuid2 *best)
+static void guest_test_hcalls_access(void)
{
struct kvm_run *run;
+ struct kvm_vm *vm;
struct ucall uc;
int stage = 0, r;
struct kvm_cpuid_entry2 feat = {
@@ -493,10 +518,38 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
struct kvm_cpuid_entry2 dbg = {
.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
};
-
- run = vcpu_state(vm, VCPU_ID);
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
+ .args = {1}
+ };
+ vm_vaddr_t hcall_page, hcall_params;
+ struct hcall_data *hcall;
+ struct kvm_cpuid2 *best;

while (true) {
+ vm = vm_create_default(VCPU_ID, 0, guest_hcall);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+ /* Hypercall input/output */
+ hcall_page = vm_vaddr_alloc_pages(vm, 2);
+ hcall = addr_gva2hva(vm, hcall_page);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
+
+ hcall_params = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
+
+ vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
+ vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+ vcpu_set_hv_cpuid(vm, VCPU_ID);
+
+ best = kvm_get_supported_hv_cpuid();
+
+ run = vcpu_state(vm, VCPU_ID);
+
switch (stage) {
case 0:
hcall->control = 0xdeadbeef;
@@ -606,9 +659,9 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall

switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
- TEST_ASSERT(uc.args[1] == stage,
- "Unexpected stage: %ld (%d expected)\n",
- uc.args[1], stage);
+ TEST_ASSERT(uc.args[1] == 0,
+ "Unexpected stage: %ld (0 expected)\n",
+ uc.args[1]);
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -619,66 +672,15 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
}

stage++;
+ kvm_vm_free(vm);
}
}

int main(void)
{
- struct kvm_cpuid2 *best;
- struct kvm_vm *vm;
- vm_vaddr_t msr_gva, hcall_page, hcall_params;
- struct kvm_enable_cap cap = {
- .cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
- .args = {1}
- };
-
- /* Test MSRs */
- vm = vm_create_default(VCPU_ID, 0, guest_msr);
-
- msr_gva = vm_vaddr_alloc_page(vm);
- memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
- vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
- vcpu_enable_cap(vm, VCPU_ID, &cap);
-
- vcpu_set_hv_cpuid(vm, VCPU_ID);
-
- best = kvm_get_supported_hv_cpuid();
-
- vm_init_descriptor_tables(vm);
- vcpu_init_descriptor_tables(vm, VCPU_ID);
- vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
-
pr_info("Testing access to Hyper-V specific MSRs\n");
- guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva),
- best);
- kvm_vm_free(vm);
-
- /* Test hypercalls */
- vm = vm_create_default(VCPU_ID, 0, guest_hcall);
-
- vm_init_descriptor_tables(vm);
- vcpu_init_descriptor_tables(vm, VCPU_ID);
- vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
-
- /* Hypercall input/output */
- hcall_page = vm_vaddr_alloc_pages(vm, 2);
- memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
-
- hcall_params = vm_vaddr_alloc_page(vm);
- memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
-
- vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
- vcpu_enable_cap(vm, VCPU_ID, &cap);
-
- vcpu_set_hv_cpuid(vm, VCPU_ID);
-
- best = kvm_get_supported_hv_cpuid();
+ guest_test_msrs_access();

pr_info("Testing access to Hyper-V hypercalls\n");
- guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params),
- addr_gva2hva(vm, hcall_page),
- addr_gva2hva(vm, hcall_page) + getpagesize(),
- best);
-
- kvm_vm_free(vm);
+ guest_test_hcalls_access();
}
--
2.33.1


2021-11-22 17:58:39

by Vitaly Kuznetsov

[permalink] [raw]
Subject: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Commit 63f5a1909f9e ("KVM: x86: Alert userspace that KVM_SET_CPUID{,2}
after KVM_RUN is broken") officially deprecated KVM_SET_CPUID{,2} ioctls
after first successful KVM_RUN and promissed to make this sequence forbiden
in 5.16. It's time to fulfil the promise.

Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 20 +++-----------------
arch/x86/kvm/x86.c | 27 +++++++++++++++++++++++++++
2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3be9beea838d..669e86688cbf 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5032,24 +5032,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);

/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
- * sweep the problem under the rug.
- *
- * KVM's horrific CPUID ABI makes the problem all but impossible to
- * solve, as correctly handling multiple vCPU models (with respect to
- * paging and physical address properties) in a single VM would require
- * tracking all relevant CPUID information in kvm_mmu_page_role. That
- * is very undesirable as it would double the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments), and in practice
- * no sane VMM mucks with the core vCPU model on the fly.
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
- }
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}

void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f..3cfaccc24efb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5125,6 +5125,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid cpuid;

r = -EFAULT;
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
+ * sweep the problem under the rug.
+ *
+ * KVM's horrific CPUID ABI makes the problem all but impossible to
+ * solve, as correctly handling multiple vCPU models (with respect to
+ * paging and physical address properties) in a single VM would require
+ * tracking all relevant CPUID information in kvm_mmu_page_role. That
+ * is very undesirable as it would double the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments), and in practice
+ * no sane VMM mucks with the core vCPU model on the fly.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
@@ -5135,6 +5154,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 cpuid;

r = -EFAULT;
+
+ /*
+ * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
+ * KVM_SET_CPUID case above.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1)
+ goto out;
+
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
--
2.33.1


2021-11-26 12:22:37

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On 11/22/21 18:58, Vitaly Kuznetsov wrote:
> - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
> - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
> - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
> - * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
> - * sweep the problem under the rug.
> - *
> - * KVM's horrific CPUID ABI makes the problem all but impossible to
> - * solve, as correctly handling multiple vCPU models (with respect to
> - * paging and physical address properties) in a single VM would require
> - * tracking all relevant CPUID information in kvm_mmu_page_role. That
> - * is very undesirable as it would double the memory requirements for
> - * gfn_track (see struct kvm_mmu_page_role comments), and in practice
> - * no sane VMM mucks with the core vCPU model on the fly.
> + * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
> + * kvm_arch_vcpu_ioctl().
> */

The second part of the comment still applies to kvm_mmu_after_set_cpuid
more than to kvm_arch_vcpu_ioctl().

> r = -EFAULT;
> [...]
> + if (vcpu->arch.last_vmentry_cpu != -1)
> + goto out;
> +
> if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
> goto out;
> r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);

This should be an EINVAL.

Tweaked and queued nevertheless, thanks.

Paolo


2021-12-27 17:33:00

by Igor Mammedov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Fri, 26 Nov 2021 13:20:28 +0100
Paolo Bonzini <[email protected]> wrote:

> On 11/22/21 18:58, Vitaly Kuznetsov wrote:
> > - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
> > - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
> > - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
> > - * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
> > - * sweep the problem under the rug.
> > - *
> > - * KVM's horrific CPUID ABI makes the problem all but impossible to
> > - * solve, as correctly handling multiple vCPU models (with respect to
> > - * paging and physical address properties) in a single VM would require
> > - * tracking all relevant CPUID information in kvm_mmu_page_role. That
> > - * is very undesirable as it would double the memory requirements for
> > - * gfn_track (see struct kvm_mmu_page_role comments), and in practice
> > - * no sane VMM mucks with the core vCPU model on the fly.
> > + * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
> > + * kvm_arch_vcpu_ioctl().
> > */
>
> The second part of the comment still applies to kvm_mmu_after_set_cpuid
> more than to kvm_arch_vcpu_ioctl().
>
> > r = -EFAULT;
> > [...]
> > + if (vcpu->arch.last_vmentry_cpu != -1)
> > + goto out;
> > +
> > if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
> > goto out;
> > r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
>
> This should be an EINVAL.
>
> Tweaked and queued nevertheless, thanks.

it seems this patch breaks VCPU hotplug, in scenario:

1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)

RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11

>
> Paolo
>


2022-01-02 17:32:00

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On 12/27/21 18:32, Igor Mammedov wrote:
>> Tweaked and queued nevertheless, thanks.
> it seems this patch breaks VCPU hotplug, in scenario:
>
> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
>
> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
>

The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
the data passed to the ioctl is the same that was set before.

Paolo


2022-01-03 08:04:39

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Paolo Bonzini <[email protected]> writes:

> On 12/27/21 18:32, Igor Mammedov wrote:
>>> Tweaked and queued nevertheless, thanks.
>> it seems this patch breaks VCPU hotplug, in scenario:
>>
>> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
>> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
>>
>> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
>>
>
> The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> the data passed to the ioctl is the same that was set before.

Are we sure the data is going to be *exactly* the same? In particular,
when using vCPU fds from the parked list, do we keep the same
APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
different id?

--
Vitaly


2022-01-03 09:41:06

by Igor Mammedov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Mon, 03 Jan 2022 09:04:29 +0100
Vitaly Kuznetsov <[email protected]> wrote:

> Paolo Bonzini <[email protected]> writes:
>
> > On 12/27/21 18:32, Igor Mammedov wrote:
> >>> Tweaked and queued nevertheless, thanks.
> >> it seems this patch breaks VCPU hotplug, in scenario:
> >>
> >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> >>
> >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> >>
> >
> > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> > the data passed to the ioctl is the same that was set before.
>
> Are we sure the data is going to be *exactly* the same? In particular,
> when using vCPU fds from the parked list, do we keep the same
> APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> different id?

If I recall it right, it can be a different ID easily.


2022-01-03 12:57:01

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Igor Mammedov <[email protected]> writes:

> On Mon, 03 Jan 2022 09:04:29 +0100
> Vitaly Kuznetsov <[email protected]> wrote:
>
>> Paolo Bonzini <[email protected]> writes:
>>
>> > On 12/27/21 18:32, Igor Mammedov wrote:
>> >>> Tweaked and queued nevertheless, thanks.
>> >> it seems this patch breaks VCPU hotplug, in scenario:
>> >>
>> >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
>> >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
>> >>
>> >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
>> >>
>> >
>> > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
>> > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
>> > the data passed to the ioctl is the same that was set before.
>>
>> Are we sure the data is going to be *exactly* the same? In particular,
>> when using vCPU fds from the parked list, do we keep the same
>> APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
>> different id?
>
> If I recall it right, it can be a different ID easily.
>

It's broken then. I'd suggest we revert the patch from KVM and think
about the strategy how to proceed. Going forward, we really want to ban
KVM_SET_CPUID{,2} after KVM_RUN (see the comment which my patch moves).
E.g. we can have an 'allowlist' of things which can change (and put
*APICids there) and only fail KVM_SET_CPUID{,2} when we see something
else changing. In QEMU, we can search the parked CPUs list for an entry
with the right *APICid and reuse it only if we manage to find one.

--
Vitaly


2022-01-05 08:17:30

by Igor Mammedov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Mon, 03 Jan 2022 13:56:53 +0100
Vitaly Kuznetsov <[email protected]> wrote:

> Igor Mammedov <[email protected]> writes:
>
> > On Mon, 03 Jan 2022 09:04:29 +0100
> > Vitaly Kuznetsov <[email protected]> wrote:
> >
> >> Paolo Bonzini <[email protected]> writes:
> >>
> >> > On 12/27/21 18:32, Igor Mammedov wrote:
> >> >>> Tweaked and queued nevertheless, thanks.
> >> >> it seems this patch breaks VCPU hotplug, in scenario:
> >> >>
> >> >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> >> >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> >> >>
> >> >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> >> >>
> >> >
> >> > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> >> > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> >> > the data passed to the ioctl is the same that was set before.
> >>
> >> Are we sure the data is going to be *exactly* the same? In particular,
> >> when using vCPU fds from the parked list, do we keep the same
> >> APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> >> different id?
> >
> > If I recall it right, it can be a different ID easily.
> >
>
> It's broken then. I'd suggest we revert the patch from KVM and think
> about the strategy how to proceed.

Can you post a patch, then?

> Going forward, we really want to ban
> KVM_SET_CPUID{,2} after KVM_RUN (see the comment which my patch moves).
> E.g. we can have an 'allowlist' of things which can change (and put
> *APICids there) and only fail KVM_SET_CPUID{,2} when we see something
> else changing.

It might work, at least on QEMU side we do not allow mixing up CPU models
within VM instance (so far). So aside of APICid (and related leafs
(extended APIC ID/possibly other topo related stuff)) nothing else should
change ever when a new vCPU is hotplugged.

> In QEMU, we can search the parked CPUs list for an entry
> with the right *APICid and reuse it only if we manage to find one.
In QEMU, 'parked cpus' fd list is a generic code shared by all supported
archs. And I'm reluctant to push something x86 specific there (it's not
impossible, but it's a crutch to workaround forbidden KVM_SET_CPUID{,2}).


2022-01-05 09:11:19

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On 1/3/22 13:56, Vitaly Kuznetsov wrote:
> 'allowlist' of things which can change (and put
> *APICids there) and only fail KVM_SET_CPUID{,2} when we see something
> else changing.

We could also go the other way and only deny changes that result in
changed CPU caps. That should be easier to implement since we have
already a mapping from CPU capability words to CPUID leaves and registers.

Paolo


2022-01-05 09:12:17

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Igor Mammedov <[email protected]> writes:

> On Mon, 03 Jan 2022 13:56:53 +0100
> Vitaly Kuznetsov <[email protected]> wrote:
>
>> Igor Mammedov <[email protected]> writes:
>>
>> > On Mon, 03 Jan 2022 09:04:29 +0100
>> > Vitaly Kuznetsov <[email protected]> wrote:
>> >
>> >> Paolo Bonzini <[email protected]> writes:
>> >>
>> >> > On 12/27/21 18:32, Igor Mammedov wrote:
>> >> >>> Tweaked and queued nevertheless, thanks.
>> >> >> it seems this patch breaks VCPU hotplug, in scenario:
>> >> >>
>> >> >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
>> >> >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
>> >> >>
>> >> >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
>> >> >>
>> >> >
>> >> > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
>> >> > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
>> >> > the data passed to the ioctl is the same that was set before.
>> >>
>> >> Are we sure the data is going to be *exactly* the same? In particular,
>> >> when using vCPU fds from the parked list, do we keep the same
>> >> APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
>> >> different id?
>> >
>> > If I recall it right, it can be a different ID easily.
>> >
>>
>> It's broken then. I'd suggest we revert the patch from KVM and think
>> about the strategy how to proceed.
>
> Can you post a patch, then?
>

Paolo,

would you like to send a last minute revert to Linus to save 5.16 ...

>> Going forward, we really want to ban
>> KVM_SET_CPUID{,2} after KVM_RUN (see the comment which my patch moves).
>> E.g. we can have an 'allowlist' of things which can change (and put
>> *APICids there) and only fail KVM_SET_CPUID{,2} when we see something
>> else changing.
>
> It might work, at least on QEMU side we do not allow mixing up CPU models
> within VM instance (so far). So aside of APICid (and related leafs
> (extended APIC ID/possibly other topo related stuff)) nothing else should
> change ever when a new vCPU is hotplugged.

or should we just focus on this (or similar) solution (for 5.17 and
[email protected])?

>
>> In QEMU, we can search the parked CPUs list for an entry
>> with the right *APICid and reuse it only if we manage to find one.
> In QEMU, 'parked cpus' fd list is a generic code shared by all supported
> archs. And I'm reluctant to push something x86 specific there (it's not
> impossible, but it's a crutch to workaround forbidden KVM_SET_CPUID{,2}).
>

You can see it this was: vCPU fd corresponds to the particular CPU being
hot plugged/unplugged, not to any CPU in the guest system. The change
may be generic enough then (but it's not going to save existing QEMUs of
course).

--
Vitaly


2022-01-05 10:10:06

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Paolo Bonzini <[email protected]> writes:

> On 1/3/22 13:56, Vitaly Kuznetsov wrote:
>> 'allowlist' of things which can change (and put
>> *APICids there) and only fail KVM_SET_CPUID{,2} when we see something
>> else changing.
>
> We could also go the other way and only deny changes that result in
> changed CPU caps. That should be easier to implement since we have
> already a mapping from CPU capability words to CPUID leaves and registers.
>

Good idea, I'll look into it (if noone beats me to it).

--
Vitaly


2022-01-07 09:02:56

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Vitaly Kuznetsov <[email protected]> writes:

> Paolo Bonzini <[email protected]> writes:
>
>> On 1/3/22 13:56, Vitaly Kuznetsov wrote:
>>> 'allowlist' of things which can change (and put
>>> *APICids there) and only fail KVM_SET_CPUID{,2} when we see something
>>> else changing.
>>
>> We could also go the other way and only deny changes that result in
>> changed CPU caps. That should be easier to implement since we have
>> already a mapping from CPU capability words to CPUID leaves and registers.
>>
>
> Good idea, I'll look into it (if noone beats me to it).

(just getting to it)

On the other hand, e.g. MAXPHYADDR (CPUID 0x80000008.EAX) is not a CPU
cap but it's one of the main reasons why we want to forbid
KVM_SET_CPUID{,2} after KVM_RUN in the first place. I'm also not sure
about allowing PV feature bits changes (KVM, Hyper-V, Xen) and I don't
think there's a need for that.

I'm again leaning towards an allowlist and currently I see only two
candidates:

CPUID.01H.EBX bits 31:24 (initial LAPIC id)
CPUID.0BH.EDX (x2APIC id)

Anything else I'm missing?

--
Vitaly


2022-01-07 18:26:14

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On 1/7/22 10:02, Vitaly Kuznetsov wrote:
>
> I'm again leaning towards an allowlist and currently I see only two
> candidates:
>
> CPUID.01H.EBX bits 31:24 (initial LAPIC id)
> CPUID.0BH.EDX (x2APIC id)
>
> Anything else I'm missing?

I would also ignore completely CPUID leaves 03H, 04H, 0BH, 80000005h,
80000006h, 8000001Dh, 8000001Eh (cache and processor topology), just to
err on the safe side.

We could change kvm_find_cpuid_entry to WARN if any of those leaves are
passed.

Paolo


2022-01-11 08:00:46

by Igor Mammedov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Fri, 7 Jan 2022 19:15:43 +0100
Paolo Bonzini <[email protected]> wrote:

> On 1/7/22 10:02, Vitaly Kuznetsov wrote:
> >
> > I'm again leaning towards an allowlist and currently I see only two
> > candidates:
> >
> > CPUID.01H.EBX bits 31:24 (initial LAPIC id)
> > CPUID.0BH.EDX (x2APIC id)
> >
> > Anything else I'm missing?
>
> I would also ignore completely CPUID leaves 03H, 04H, 0BH, 80000005h,
> 80000006h, 8000001Dh, 8000001Eh (cache and processor topology), just to
> err on the safe side.

on top of that,

1Fh

> We could change kvm_find_cpuid_entry to WARN if any of those leaves are
> passed.
>
> Paolo
>


2022-01-12 13:58:28

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Igor Mammedov <[email protected]> writes:

> On Fri, 7 Jan 2022 19:15:43 +0100
> Paolo Bonzini <[email protected]> wrote:
>
>> On 1/7/22 10:02, Vitaly Kuznetsov wrote:
>> >
>> > I'm again leaning towards an allowlist and currently I see only two
>> > candidates:
>> >
>> > CPUID.01H.EBX bits 31:24 (initial LAPIC id)
>> > CPUID.0BH.EDX (x2APIC id)
>> >
>> > Anything else I'm missing?
>>
>> I would also ignore completely CPUID leaves 03H, 04H, 0BH, 80000005h,
>> 80000006h, 8000001Dh, 8000001Eh (cache and processor topology), just to
>> err on the safe side.
>
> on top of that,
>
> 1Fh
>

The implementation turned out to be a bit more complex as kvm also
mangles CPUIDs so we need to account for that. Could you give the
attached series a spin to see if it works?

--
Vitaly


Attachments:
0001-KVM-x86-Fix-indentation-in-kvm_set_cpuid.patch (1.38 kB)
0002-KVM-x86-Do-runtime-CPUID-update-before-updating-vcpu.patch (3.96 kB)
0003-KVM-x86-Partially-allow-KVM_SET_CPUID-2-after-KVM_RU.patch (4.67 kB)
Download all attachments

2022-01-12 18:39:16

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On 1/12/22 14:58, Vitaly Kuznetsov wrote:
> - best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
> + best = cpuid_entry2_find(entries, nent, 0xD, 1);
> if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
> cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
> best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
>
> - best = kvm_find_kvm_cpuid_features(vcpu);
> + best = __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
> + vcpu->arch.cpuid_nent);
> if (kvm_hlt_in_guest(vcpu->kvm) && best &&

I think this should be __kvm_find_kvm_cpuid_features(vcpu, entries, nent).

>
> + case 0x1:
> + /* Only initial LAPIC id is allowed to change */
> + if (e->eax ^ best->eax || ((e->ebx ^ best->ebx) >> 24) ||
> + e->ecx ^ best->ecx || e->edx ^ best->edx)
> + return -EINVAL;
> + break;

This XOR is a bit weird. In addition the EBX test is checking the wrong
bits (it checks whether 31:24 change and ignores changes to 23:0).

You can write just "(e->ebx & ~0xff000000u) != (best->ebx ~0xff000000u)".

>
> + default:
> + if (e->eax ^ best->eax || e->ebx ^ best->ebx ||
> + e->ecx ^ best->ecx || e->edx ^ best->edx)
> + return -EINVAL;

This one even more so.

Paolo


2022-01-13 09:27:40

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Paolo Bonzini <[email protected]> writes:

> On 1/12/22 14:58, Vitaly Kuznetsov wrote:
>> - best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
>> + best = cpuid_entry2_find(entries, nent, 0xD, 1);
>> if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
>> cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
>> best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
>>
>> - best = kvm_find_kvm_cpuid_features(vcpu);
>> + best = __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
>> + vcpu->arch.cpuid_nent);
>> if (kvm_hlt_in_guest(vcpu->kvm) && best &&
>
> I think this should be __kvm_find_kvm_cpuid_features(vcpu, entries, nent).
>

Of course.

>>
>> + case 0x1:
>> + /* Only initial LAPIC id is allowed to change */
>> + if (e->eax ^ best->eax || ((e->ebx ^ best->ebx) >> 24) ||
>> + e->ecx ^ best->ecx || e->edx ^ best->edx)
>> + return -EINVAL;
>> + break;
>
> This XOR is a bit weird. In addition the EBX test is checking the wrong
> bits (it checks whether 31:24 change and ignores changes to 23:0).

Indeed, however, I've tested CPU hotplug with QEMU trying different
CPUs in random order and surprisingly othing blew up, feels like QEMU
was smart enough to re-use the right fd)

>
> You can write just "(e->ebx & ~0xff000000u) != (best->ebx ~0xff000000u)".
>
>>
>> + default:
>> + if (e->eax ^ best->eax || e->ebx ^ best->ebx ||
>> + e->ecx ^ best->ecx || e->edx ^ best->edx)
>> + return -EINVAL;
>
> This one even more so.

Thanks for the early review, I'm going to prepare a selftest and send
this out.

--
Vitaly


2022-01-13 14:29:09

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Thu, 2022-01-13 at 10:27 +0100, Vitaly Kuznetsov wrote:
> Paolo Bonzini <[email protected]> writes:
>
> > On 1/12/22 14:58, Vitaly Kuznetsov wrote:
> > > - best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
> > > + best = cpuid_entry2_find(entries, nent, 0xD, 1);
> > > if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
> > > cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
> > > best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
> > >
> > > - best = kvm_find_kvm_cpuid_features(vcpu);
> > > + best = __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
> > > + vcpu->arch.cpuid_nent);
> > > if (kvm_hlt_in_guest(vcpu->kvm) && best &&
> >
> > I think this should be __kvm_find_kvm_cpuid_features(vcpu, entries, nent).
> >
>
> Of course.
>
> > > + case 0x1:
> > > + /* Only initial LAPIC id is allowed to change */
> > > + if (e->eax ^ best->eax || ((e->ebx ^ best->ebx) >> 24) ||
> > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
> > > + return -EINVAL;
> > > + break;
> >
> > This XOR is a bit weird. In addition the EBX test is checking the wrong
> > bits (it checks whether 31:24 change and ignores changes to 23:0).
>
> Indeed, however, I've tested CPU hotplug with QEMU trying different
> CPUs in random order and surprisingly othing blew up, feels like QEMU
> was smart enough to re-use the right fd)
>
> > You can write just "(e->ebx & ~0xff000000u) != (best->ebx ~0xff000000u)".
> >
> > > + default:
> > > + if (e->eax ^ best->eax || e->ebx ^ best->ebx ||
> > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
> > > + return -EINVAL;
> >
> > This one even more so.
>
> Thanks for the early review, I'm going to prepare a selftest and send
> this out.
>
I also looked at this recently (due to other reasons) and I found out that
qemu picks a parked vcpu by its vcpu_id which is its initial apic id,
thus apic id related features should not change.

Take a look at 'kvm_get_vcpu' in qemu source.
Maybe old qemu versions didn't do this?

Best regards,
Thanks,
Maxim Levitsky


2022-01-13 14:36:17

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Maxim Levitsky <[email protected]> writes:

> On Thu, 2022-01-13 at 10:27 +0100, Vitaly Kuznetsov wrote:
>> Paolo Bonzini <[email protected]> writes:
>>
>> > On 1/12/22 14:58, Vitaly Kuznetsov wrote:
>> > > - best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
>> > > + best = cpuid_entry2_find(entries, nent, 0xD, 1);
>> > > if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
>> > > cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
>> > > best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
>> > >
>> > > - best = kvm_find_kvm_cpuid_features(vcpu);
>> > > + best = __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
>> > > + vcpu->arch.cpuid_nent);
>> > > if (kvm_hlt_in_guest(vcpu->kvm) && best &&
>> >
>> > I think this should be __kvm_find_kvm_cpuid_features(vcpu, entries, nent).
>> >
>>
>> Of course.
>>
>> > > + case 0x1:
>> > > + /* Only initial LAPIC id is allowed to change */
>> > > + if (e->eax ^ best->eax || ((e->ebx ^ best->ebx) >> 24) ||
>> > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
>> > > + return -EINVAL;
>> > > + break;
>> >
>> > This XOR is a bit weird. In addition the EBX test is checking the wrong
>> > bits (it checks whether 31:24 change and ignores changes to 23:0).
>>
>> Indeed, however, I've tested CPU hotplug with QEMU trying different
>> CPUs in random order and surprisingly othing blew up, feels like QEMU
>> was smart enough to re-use the right fd)
>>
>> > You can write just "(e->ebx & ~0xff000000u) != (best->ebx ~0xff000000u)".
>> >
>> > > + default:
>> > > + if (e->eax ^ best->eax || e->ebx ^ best->ebx ||
>> > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
>> > > + return -EINVAL;
>> >
>> > This one even more so.
>>
>> Thanks for the early review, I'm going to prepare a selftest and send
>> this out.
>>
> I also looked at this recently (due to other reasons) and I found out that
> qemu picks a parked vcpu by its vcpu_id which is its initial apic id,
> thus apic id related features should not change.
>
> Take a look at 'kvm_get_vcpu' in qemu source.
> Maybe old qemu versions didn't do this?

I took Igor's word on this, I didn't check QEMU code :-)

In the v1 I've just sent [L,x2]APIC ids are allowed to change. This
shouldn't screw the MMU (which was the main motivation for forbidding
KVM_SET_CPUID{,2} after KVM_RUN in the first place) but maybe we don't
really need to be so permissive.

--
Vitaly


2022-01-13 14:42:08

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Thu, 2022-01-13 at 15:36 +0100, Vitaly Kuznetsov wrote:
> Maxim Levitsky <[email protected]> writes:
>
> > On Thu, 2022-01-13 at 10:27 +0100, Vitaly Kuznetsov wrote:
> > > Paolo Bonzini <[email protected]> writes:
> > >
> > > > On 1/12/22 14:58, Vitaly Kuznetsov wrote:
> > > > > - best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
> > > > > + best = cpuid_entry2_find(entries, nent, 0xD, 1);
> > > > > if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
> > > > > cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
> > > > > best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
> > > > >
> > > > > - best = kvm_find_kvm_cpuid_features(vcpu);
> > > > > + best = __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
> > > > > + vcpu->arch.cpuid_nent);
> > > > > if (kvm_hlt_in_guest(vcpu->kvm) && best &&
> > > >
> > > > I think this should be __kvm_find_kvm_cpuid_features(vcpu, entries, nent).
> > > >
> > >
> > > Of course.
> > >
> > > > > + case 0x1:
> > > > > + /* Only initial LAPIC id is allowed to change */
> > > > > + if (e->eax ^ best->eax || ((e->ebx ^ best->ebx) >> 24) ||
> > > > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
> > > > > + return -EINVAL;
> > > > > + break;
> > > >
> > > > This XOR is a bit weird. In addition the EBX test is checking the wrong
> > > > bits (it checks whether 31:24 change and ignores changes to 23:0).
> > >
> > > Indeed, however, I've tested CPU hotplug with QEMU trying different
> > > CPUs in random order and surprisingly othing blew up, feels like QEMU
> > > was smart enough to re-use the right fd)
> > >
> > > > You can write just "(e->ebx & ~0xff000000u) != (best->ebx ~0xff000000u)".
> > > >
> > > > > + default:
> > > > > + if (e->eax ^ best->eax || e->ebx ^ best->ebx ||
> > > > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
> > > > > + return -EINVAL;
> > > >
> > > > This one even more so.
> > >
> > > Thanks for the early review, I'm going to prepare a selftest and send
> > > this out.
> > >
> > I also looked at this recently (due to other reasons) and I found out that
> > qemu picks a parked vcpu by its vcpu_id which is its initial apic id,
> > thus apic id related features should not change.
> >
> > Take a look at 'kvm_get_vcpu' in qemu source.
> > Maybe old qemu versions didn't do this?
>
> I took Igor's word on this, I didn't check QEMU code :-)
>
> In the v1 I've just sent [L,x2]APIC ids are allowed to change. This
> shouldn't screw the MMU (which was the main motivation for forbidding
> KVM_SET_CPUID{,2} after KVM_RUN in the first place) but maybe we don't
> really need to be so permissive.
>

For my nested AVIC work I would really want the APIC ID of a VCPU to be read-only
and be equal to vcpu_id.

That simplifies lot of things, and in practice it is hightly likely that no guests
change their APIC id, and likely that qemu doesn't as well.

Best regards,
Maxim Levitsky





2022-01-13 15:00:05

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Maxim Levitsky <[email protected]> writes:

> On Thu, 2022-01-13 at 15:36 +0100, Vitaly Kuznetsov wrote:
>> Maxim Levitsky <[email protected]> writes:
>>
>> > On Thu, 2022-01-13 at 10:27 +0100, Vitaly Kuznetsov wrote:
>> > > Paolo Bonzini <[email protected]> writes:
>> > >
>> > > > On 1/12/22 14:58, Vitaly Kuznetsov wrote:
>> > > > > - best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
>> > > > > + best = cpuid_entry2_find(entries, nent, 0xD, 1);
>> > > > > if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
>> > > > > cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
>> > > > > best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
>> > > > >
>> > > > > - best = kvm_find_kvm_cpuid_features(vcpu);
>> > > > > + best = __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
>> > > > > + vcpu->arch.cpuid_nent);
>> > > > > if (kvm_hlt_in_guest(vcpu->kvm) && best &&
>> > > >
>> > > > I think this should be __kvm_find_kvm_cpuid_features(vcpu, entries, nent).
>> > > >
>> > >
>> > > Of course.
>> > >
>> > > > > + case 0x1:
>> > > > > + /* Only initial LAPIC id is allowed to change */
>> > > > > + if (e->eax ^ best->eax || ((e->ebx ^ best->ebx) >> 24) ||
>> > > > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
>> > > > > + return -EINVAL;
>> > > > > + break;
>> > > >
>> > > > This XOR is a bit weird. In addition the EBX test is checking the wrong
>> > > > bits (it checks whether 31:24 change and ignores changes to 23:0).
>> > >
>> > > Indeed, however, I've tested CPU hotplug with QEMU trying different
>> > > CPUs in random order and surprisingly othing blew up, feels like QEMU
>> > > was smart enough to re-use the right fd)
>> > >
>> > > > You can write just "(e->ebx & ~0xff000000u) != (best->ebx ~0xff000000u)".
>> > > >
>> > > > > + default:
>> > > > > + if (e->eax ^ best->eax || e->ebx ^ best->ebx ||
>> > > > > + e->ecx ^ best->ecx || e->edx ^ best->edx)
>> > > > > + return -EINVAL;
>> > > >
>> > > > This one even more so.
>> > >
>> > > Thanks for the early review, I'm going to prepare a selftest and send
>> > > this out.
>> > >
>> > I also looked at this recently (due to other reasons) and I found out that
>> > qemu picks a parked vcpu by its vcpu_id which is its initial apic id,
>> > thus apic id related features should not change.
>> >
>> > Take a look at 'kvm_get_vcpu' in qemu source.
>> > Maybe old qemu versions didn't do this?
>>
>> I took Igor's word on this, I didn't check QEMU code :-)
>>
>> In the v1 I've just sent [L,x2]APIC ids are allowed to change. This
>> shouldn't screw the MMU (which was the main motivation for forbidding
>> KVM_SET_CPUID{,2} after KVM_RUN in the first place) but maybe we don't
>> really need to be so permissive.
>>
>
> For my nested AVIC work I would really want the APIC ID of a VCPU to be read-only
> and be equal to vcpu_id.
>

Doesn't APIC ID have topology encoded in it?

> That simplifies lot of things, and in practice it is hightly likely that no guests
> change their APIC id, and likely that qemu doesn't as well.

--
Vitaly


2022-01-13 16:26:30

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Thu, Jan 13, 2022, Vitaly Kuznetsov wrote:
> Maxim Levitsky <[email protected]> writes:
> > For my nested AVIC work I would really want the APIC ID of a VCPU to be read-only
> > and be equal to vcpu_id.
> >
>
> Doesn't APIC ID have topology encoded in it?

Yeah, APIC IDs are derived from the topology. From the SDM (this doesn't
talk about core/SMT info, but that's included as well):

The hardware assigned APIC ID is based on system topology and includes encoding
for socket position and cluster information.

The SDM also says:

Some processors permit software to modify the APIC ID. However, the ability of
software to modify the APIC ID is processor model specific.

So I _think_ we could define KVM behavior to ignore writes from the _guest_, but
the APIC_ID == vcpu_id requirement won't fly as userspace expects to be able to
stuff virtual toplogy info into the APIC ID.

2022-01-13 16:30:59

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Thu, 2022-01-13 at 16:26 +0000, Sean Christopherson wrote:
> On Thu, Jan 13, 2022, Vitaly Kuznetsov wrote:
> > Maxim Levitsky <[email protected]> writes:
> > > For my nested AVIC work I would really want the APIC ID of a VCPU to be read-only
> > > and be equal to vcpu_id.
> > >
> >
> > Doesn't APIC ID have topology encoded in it?
>
> Yeah, APIC IDs are derived from the topology. From the SDM (this doesn't
> talk about core/SMT info, but that's included as well):
>
> The hardware assigned APIC ID is based on system topology and includes encoding
> for socket position and cluster information.
>
> The SDM also says:
>
> Some processors permit software to modify the APIC ID. However, the ability of
> software to modify the APIC ID is processor model specific.
>
> So I _think_ we could define KVM behavior to ignore writes from the _guest_, but
> the APIC_ID == vcpu_id requirement won't fly as userspace expects to be able to
> stuff virtual toplogy info into the APIC ID.
>
That is a very good piece of information! Thanks!

Best regards,
Maxim Levitsky


2022-01-13 22:33:47

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Mon, Jan 03, 2022, Igor Mammedov wrote:
> On Mon, 03 Jan 2022 09:04:29 +0100
> Vitaly Kuznetsov <[email protected]> wrote:
>
> > Paolo Bonzini <[email protected]> writes:
> >
> > > On 12/27/21 18:32, Igor Mammedov wrote:
> > >>> Tweaked and queued nevertheless, thanks.
> > >> it seems this patch breaks VCPU hotplug, in scenario:
> > >>
> > >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> > >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> > >>
> > >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> > >>
> > >
> > > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> > > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> > > the data passed to the ioctl is the same that was set before.
> >
> > Are we sure the data is going to be *exactly* the same? In particular,
> > when using vCPU fds from the parked list, do we keep the same
> > APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> > different id?
>
> If I recall it right, it can be a different ID easily.

No, it cannot. KVM doesn't provide a way for userspace to change the APIC ID of
a vCPU after the vCPU is created. x2APIC flat out disallows changing the APIC ID,
and unless there's magic I'm missing, apic_mmio_write() => kvm_lapic_reg_write()
is not reachable from userspace.

The only way for userspace to set the APIC ID is to change vcpu->vcpu_id, and that
can only be done at KVM_VCPU_CREATE.

So, reusing a parked vCPU for hotplug must reuse the same APIC ID. QEMU handles
this by stashing the vcpu_id, a.k.a. APIC ID, when parking a vCPU, and reuses a
parked vCPU if and only if it has the same APIC ID. And because QEMU derives the
APIC ID from topology, that means all the topology CPUID leafs must remain the
same, otherwise the guest is hosed because it will send IPIs to the wrong vCPUs.

static int do_kvm_destroy_vcpu(CPUState *cpu)
{
struct KVMParkedVcpu *vcpu = NULL;

...

vcpu = g_malloc0(sizeof(*vcpu));
vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); <=== stash the APIC ID when parking
vcpu->kvm_fd = cpu->kvm_fd;
QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
err:
return ret;
}

static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
struct KVMParkedVcpu *cpu;

QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
if (cpu->vcpu_id == vcpu_id) { <=== reuse if APIC ID matches
int kvm_fd;

QLIST_REMOVE(cpu, node);
kvm_fd = cpu->kvm_fd;
g_free(cpu);
return kvm_fd;
}
}

return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}

2022-01-14 08:29:28

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Thu, 2022-01-13 at 22:33 +0000, Sean Christopherson wrote:
> On Mon, Jan 03, 2022, Igor Mammedov wrote:
> > On Mon, 03 Jan 2022 09:04:29 +0100
> > Vitaly Kuznetsov <[email protected]> wrote:
> >
> > > Paolo Bonzini <[email protected]> writes:
> > >
> > > > On 12/27/21 18:32, Igor Mammedov wrote:
> > > > > > Tweaked and queued nevertheless, thanks.
> > > > > it seems this patch breaks VCPU hotplug, in scenario:
> > > > >
> > > > > 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> > > > > 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> > > > >
> > > > > RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> > > > >
> > > >
> > > > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> > > > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> > > > the data passed to the ioctl is the same that was set before.
> > >
> > > Are we sure the data is going to be *exactly* the same? In particular,
> > > when using vCPU fds from the parked list, do we keep the same
> > > APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> > > different id?
> >
> > If I recall it right, it can be a different ID easily.
>
> No, it cannot. KVM doesn't provide a way for userspace to change the APIC ID of
> a vCPU after the vCPU is created. x2APIC flat out disallows changing the APIC ID,
> and unless there's magic I'm missing, apic_mmio_write() => kvm_lapic_reg_write()
> is not reachable from userspace.

So after all, it is true that vcpu_id == initial APIC_ID,
and if we don't let guest change it, it will be always like that?


You said that its not true in the other mail in the thread.
I haven't checked it in the code yet, as I never was much worried about userspace changing,
but I will check it soon.

I did a quick look and I see that at least the userspace can call 'kvm_apic_set_state' and it
contains snapshot of all apic registers, including apic id.
However it would be very easy to add a check
there and fail if userspace attempts to
set APIC_ID != vcpu_id.


Best regards,
Maxim Levitsky
>
> The only way for userspace to set the APIC ID is to change vcpu->vcpu_id, and that
> can only be done at KVM_VCPU_CREATE.
>
> So, reusing a parked vCPU for hotplug must reuse the same APIC ID. QEMU handles
> this by stashing the vcpu_id, a.k.a. APIC ID, when parking a vCPU, and reuses a
> parked vCPU if and only if it has the same APIC ID. And because QEMU derives the
> APIC ID from topology, that means all the topology CPUID leafs must remain the
> same, otherwise the guest is hosed because it will send IPIs to the wrong vCPUs.
>
> static int do_kvm_destroy_vcpu(CPUState *cpu)
> {
> struct KVMParkedVcpu *vcpu = NULL;
>
> ...
>
> vcpu = g_malloc0(sizeof(*vcpu));
> vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); <=== stash the APIC ID when parking
> vcpu->kvm_fd = cpu->kvm_fd;
> QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
> err:
> return ret;
> }
>
> static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
> {
> struct KVMParkedVcpu *cpu;
>
> QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
> if (cpu->vcpu_id == vcpu_id) { <=== reuse if APIC ID matches
> int kvm_fd;
>
> QLIST_REMOVE(cpu, node);
> kvm_fd = cpu->kvm_fd;
> g_free(cpu);
> return kvm_fd;
> }
> }
>
> return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> }
>



2022-01-14 08:55:44

by Igor Mammedov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Thu, 13 Jan 2022 22:33:38 +0000
Sean Christopherson <[email protected]> wrote:

> On Mon, Jan 03, 2022, Igor Mammedov wrote:
> > On Mon, 03 Jan 2022 09:04:29 +0100
> > Vitaly Kuznetsov <[email protected]> wrote:
> >
> > > Paolo Bonzini <[email protected]> writes:
> > >
> > > > On 12/27/21 18:32, Igor Mammedov wrote:
> > > >>> Tweaked and queued nevertheless, thanks.
> > > >> it seems this patch breaks VCPU hotplug, in scenario:
> > > >>
> > > >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> > > >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> > > >>
> > > >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> > > >>
> > > >
> > > > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> > > > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> > > > the data passed to the ioctl is the same that was set before.
> > >
> > > Are we sure the data is going to be *exactly* the same? In particular,
> > > when using vCPU fds from the parked list, do we keep the same
> > > APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> > > different id?
> >
> > If I recall it right, it can be a different ID easily.
>
> No, it cannot. KVM doesn't provide a way for userspace to change the APIC ID of
> a vCPU after the vCPU is created. x2APIC flat out disallows changing the APIC ID,
> and unless there's magic I'm missing, apic_mmio_write() => kvm_lapic_reg_write()
> is not reachable from userspace.
>
> The only way for userspace to set the APIC ID is to change vcpu->vcpu_id, and that
> can only be done at KVM_VCPU_CREATE.
>
> So, reusing a parked vCPU for hotplug must reuse the same APIC ID. QEMU handles
> this by stashing the vcpu_id, a.k.a. APIC ID, when parking a vCPU, and reuses a
> parked vCPU if and only if it has the same APIC ID. And because QEMU derives the
> APIC ID from topology, that means all the topology CPUID leafs must remain the
> same, otherwise the guest is hosed because it will send IPIs to the wrong vCPUs.

Indeed, I was wrong.
I just checked all cpu unplug history in qemu. It was introduced in qemu-2.7
and from the very beginning it did stash vcpu_id,
so there is no old QEMU that would re-plug VCPU with different apic_id.
Though tells us nothing about what other userspace implementations might do.

However, a problem of failing KVM_SET_CPUID2 during VCPU re-plug
is still there and re-plug will fail if KVM rejects repeated KVM_SET_CPUID2
even if ioctl called with exactly the same CPUID leafs as the 1st call.


> static int do_kvm_destroy_vcpu(CPUState *cpu)
> {
> struct KVMParkedVcpu *vcpu = NULL;
>
> ...
>
> vcpu = g_malloc0(sizeof(*vcpu));
> vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); <=== stash the APIC ID when parking
> vcpu->kvm_fd = cpu->kvm_fd;
> QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
> err:
> return ret;
> }
>
> static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
> {
> struct KVMParkedVcpu *cpu;
>
> QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
> if (cpu->vcpu_id == vcpu_id) { <=== reuse if APIC ID matches
> int kvm_fd;
>
> QLIST_REMOVE(cpu, node);
> kvm_fd = cpu->kvm_fd;
> g_free(cpu);
> return kvm_fd;
> }
> }
>
> return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> }
>


2022-01-14 09:31:57

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Igor Mammedov <[email protected]> writes:

> On Thu, 13 Jan 2022 22:33:38 +0000
> Sean Christopherson <[email protected]> wrote:
>
>> On Mon, Jan 03, 2022, Igor Mammedov wrote:
>> > On Mon, 03 Jan 2022 09:04:29 +0100
>> > Vitaly Kuznetsov <[email protected]> wrote:
>> >
>> > > Paolo Bonzini <[email protected]> writes:
>> > >
>> > > > On 12/27/21 18:32, Igor Mammedov wrote:
>> > > >>> Tweaked and queued nevertheless, thanks.
>> > > >> it seems this patch breaks VCPU hotplug, in scenario:
>> > > >>
>> > > >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
>> > > >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
>> > > >>
>> > > >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
>> > > >>
>> > > >
>> > > > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
>> > > > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
>> > > > the data passed to the ioctl is the same that was set before.
>> > >
>> > > Are we sure the data is going to be *exactly* the same? In particular,
>> > > when using vCPU fds from the parked list, do we keep the same
>> > > APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
>> > > different id?
>> >
>> > If I recall it right, it can be a different ID easily.
>>
>> No, it cannot. KVM doesn't provide a way for userspace to change the APIC ID of
>> a vCPU after the vCPU is created. x2APIC flat out disallows changing the APIC ID,
>> and unless there's magic I'm missing, apic_mmio_write() => kvm_lapic_reg_write()
>> is not reachable from userspace.
>>
>> The only way for userspace to set the APIC ID is to change vcpu->vcpu_id, and that
>> can only be done at KVM_VCPU_CREATE.
>>
>> So, reusing a parked vCPU for hotplug must reuse the same APIC ID. QEMU handles
>> this by stashing the vcpu_id, a.k.a. APIC ID, when parking a vCPU, and reuses a
>> parked vCPU if and only if it has the same APIC ID. And because QEMU derives the
>> APIC ID from topology, that means all the topology CPUID leafs must remain the
>> same, otherwise the guest is hosed because it will send IPIs to the wrong vCPUs.
>
> Indeed, I was wrong.
> I just checked all cpu unplug history in qemu. It was introduced in qemu-2.7
> and from the very beginning it did stash vcpu_id,
> so there is no old QEMU that would re-plug VCPU with different apic_id.
> Though tells us nothing about what other userspace implementations might do.
>

The genie is out of the bottle already, 5.16 is released with the change
(which was promissed for some time, KVM was complaining with
pr_warn_ratelimited()). I'd be brave and say that if QEMU doesn't need
it then nobody else does (out of curiosity, are there KVM VMMs besides
QEMU which support CPU hotplug out there?).

> However, a problem of failing KVM_SET_CPUID2 during VCPU re-plug
> is still there and re-plug will fail if KVM rejects repeated KVM_SET_CPUID2
> even if ioctl called with exactly the same CPUID leafs as the 1st call.
>

Assuming APIC id change doesn not need to be supported, I can send v2
here with an empty allowlist.

--
Vitaly


2022-01-14 11:22:44

by Igor Mammedov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Fri, 14 Jan 2022 10:31:50 +0100
Vitaly Kuznetsov <[email protected]> wrote:

> Igor Mammedov <[email protected]> writes:
>
> > On Thu, 13 Jan 2022 22:33:38 +0000
> > Sean Christopherson <[email protected]> wrote:
> >
> >> On Mon, Jan 03, 2022, Igor Mammedov wrote:
> >> > On Mon, 03 Jan 2022 09:04:29 +0100
> >> > Vitaly Kuznetsov <[email protected]> wrote:
> >> >
> >> > > Paolo Bonzini <[email protected]> writes:
> >> > >
> >> > > > On 12/27/21 18:32, Igor Mammedov wrote:
> >> > > >>> Tweaked and queued nevertheless, thanks.
> >> > > >> it seems this patch breaks VCPU hotplug, in scenario:
> >> > > >>
> >> > > >> 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> >> > > >> 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> >> > > >>
> >> > > >> RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> >> > > >>
> >> > > >
> >> > > > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> >> > > > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> >> > > > the data passed to the ioctl is the same that was set before.
> >> > >
> >> > > Are we sure the data is going to be *exactly* the same? In particular,
> >> > > when using vCPU fds from the parked list, do we keep the same
> >> > > APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> >> > > different id?
> >> >
> >> > If I recall it right, it can be a different ID easily.
> >>
> >> No, it cannot. KVM doesn't provide a way for userspace to change the APIC ID of
> >> a vCPU after the vCPU is created. x2APIC flat out disallows changing the APIC ID,
> >> and unless there's magic I'm missing, apic_mmio_write() => kvm_lapic_reg_write()
> >> is not reachable from userspace.
> >>
> >> The only way for userspace to set the APIC ID is to change vcpu->vcpu_id, and that
> >> can only be done at KVM_VCPU_CREATE.
> >>
> >> So, reusing a parked vCPU for hotplug must reuse the same APIC ID. QEMU handles
> >> this by stashing the vcpu_id, a.k.a. APIC ID, when parking a vCPU, and reuses a
> >> parked vCPU if and only if it has the same APIC ID. And because QEMU derives the
> >> APIC ID from topology, that means all the topology CPUID leafs must remain the
> >> same, otherwise the guest is hosed because it will send IPIs to the wrong vCPUs.
> >
> > Indeed, I was wrong.
> > I just checked all cpu unplug history in qemu. It was introduced in qemu-2.7
> > and from the very beginning it did stash vcpu_id,
> > so there is no old QEMU that would re-plug VCPU with different apic_id.
> > Though tells us nothing about what other userspace implementations might do.
> >
>
> The genie is out of the bottle already, 5.16 is released with the change
> (which was promissed for some time, KVM was complaining with
> pr_warn_ratelimited()). I'd be brave and say that if QEMU doesn't need
> it then nobody else does (out of curiosity, are there KVM VMMs besides
> QEMU which support CPU hotplug out there?).
>
> > However, a problem of failing KVM_SET_CPUID2 during VCPU re-plug
> > is still there and re-plug will fail if KVM rejects repeated KVM_SET_CPUID2
> > even if ioctl called with exactly the same CPUID leafs as the 1st call.
> >
>
> Assuming APIC id change doesn not need to be supported, I can send v2
> here with an empty allowlist.
As you mentioned in another thread black list would be better
to address Sean's concerns or just revert problematic commit.


2022-01-14 12:25:17

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Igor Mammedov <[email protected]> writes:

> On Fri, 14 Jan 2022 10:31:50 +0100
> Vitaly Kuznetsov <[email protected]> wrote:
>
>> Igor Mammedov <[email protected]> writes:
>>
>>
>> > However, a problem of failing KVM_SET_CPUID2 during VCPU re-plug
>> > is still there and re-plug will fail if KVM rejects repeated KVM_SET_CPUID2
>> > even if ioctl called with exactly the same CPUID leafs as the 1st call.
>> >
>>
>> Assuming APIC id change doesn not need to be supported, I can send v2
>> here with an empty allowlist.
> As you mentioned in another thread black list would be better
> to address Sean's concerns or just revert problematic commit.
>

Personally, I'm leaning towards the blocklist approach even if just for
'documenting' the fact that KVM doesn't correctly handle the
change. Compared to a comment in the code, such approach could help
someone save tons of debugging time (if anyone ever decides do something
weird, like changing MAXPHYADDR on the fly).

--
Vitaly


2022-01-14 22:46:39

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Fri, Jan 14, 2022, Maxim Levitsky wrote:
> On Thu, 2022-01-13 at 22:33 +0000, Sean Christopherson wrote:
> > On Mon, Jan 03, 2022, Igor Mammedov wrote:
> > > On Mon, 03 Jan 2022 09:04:29 +0100
> > > Vitaly Kuznetsov <[email protected]> wrote:
> > >
> > > > Paolo Bonzini <[email protected]> writes:
> > > >
> > > > > On 12/27/21 18:32, Igor Mammedov wrote:
> > > > > > > Tweaked and queued nevertheless, thanks.
> > > > > > it seems this patch breaks VCPU hotplug, in scenario:
> > > > > >
> > > > > > 1. hotunplug existing VCPU (QEMU stores VCPU file descriptor in parked cpus list)
> > > > > > 2. hotplug it again (unsuspecting QEMU reuses stored file descriptor when recreating VCPU)
> > > > > >
> > > > > > RHBZ:https://bugzilla.redhat.com/show_bug.cgi?id=2028337#c11
> > > > > >
> > > > >
> > > > > The fix here would be (in QEMU) to not call KVM_SET_CPUID2 again.
> > > > > However, we need to work around it in KVM, and allow KVM_SET_CPUID2 if
> > > > > the data passed to the ioctl is the same that was set before.
> > > >
> > > > Are we sure the data is going to be *exactly* the same? In particular,
> > > > when using vCPU fds from the parked list, do we keep the same
> > > > APIC/x2APIC id when hotplugging? Or can we actually hotplug with a
> > > > different id?
> > >
> > > If I recall it right, it can be a different ID easily.
> >
> > No, it cannot. KVM doesn't provide a way for userspace to change the APIC ID of
> > a vCPU after the vCPU is created. x2APIC flat out disallows changing the APIC ID,
> > and unless there's magic I'm missing, apic_mmio_write() => kvm_lapic_reg_write()
> > is not reachable from userspace.
>
> So after all, it is true that vcpu_id == initial APIC_ID,
> and if we don't let guest change it, it will be always like that?

Except for kvm_apic_set_state(), which I forgot existed, yes.

> You said that its not true in the other mail in the thread.

I was wrong, I was thinking that userspace could reach kvm_lapic_reg_write(), but
I forgot that there would be no connection without x2apic. But I forgot about
kvm_apic_set_state()...

> I haven't checked it in the code yet, as I never was much worried about
> userspace changing, but I will check it soon.
>
> I did a quick look and I see that at least the userspace can call
> 'kvm_apic_set_state' and it contains snapshot of all apic registers,
> including apic id. However it would be very easy to add a check there and
> fail if userspace attempts to set APIC_ID != vcpu_id.

Yeah, hopefully that doesn't break any userspace. I can't imagine it would,
because if the guest disabled and re-enabled the APIC, kvm_lapic_set_base() would
restore the APIC ID to vcpu_id.

With luck, that's the last hole we need to close...

2022-01-14 22:54:42

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On Fri, Jan 14, 2022, Vitaly Kuznetsov wrote:
> Igor Mammedov <[email protected]> writes:
>
> > On Fri, 14 Jan 2022 10:31:50 +0100
> > Vitaly Kuznetsov <[email protected]> wrote:
> >
> >> Igor Mammedov <[email protected]> writes:
> >>
> >>
> >> > However, a problem of failing KVM_SET_CPUID2 during VCPU re-plug
> >> > is still there and re-plug will fail if KVM rejects repeated KVM_SET_CPUID2
> >> > even if ioctl called with exactly the same CPUID leafs as the 1st call.
> >> >
> >>
> >> Assuming APIC id change doesn not need to be supported, I can send v2
> >> here with an empty allowlist.
> > As you mentioned in another thread black list would be better
> > to address Sean's concerns or just revert problematic commit.
> >
>
> Personally, I'm leaning towards the blocklist approach even if just for
> 'documenting' the fact that KVM doesn't correctly handle the
> change. Compared to a comment in the code, such approach could help
> someone save tons of debugging time (if anyone ever decides do something
> weird, like changing MAXPHYADDR on the fly).

I assume the blocklist approach is let userspace opt into rejecting KVM_SET_CPUID{,2},
but allow all CPUID leafs and sub-leafs to be modified at will by default? I don't
dislike the idea, but I wonder if it's unnecessarily fancy.

What if we instead provide an ioctl/capability to let userspace toggle disabling
of KVM_SET_CPUID{,2}, a la STAC/CLAC to override SMAP? E.g. QEMU could enable
protections after initially creating the vCPU, then temporarily disable protections
only for the hotplug path?

That'd provide solid protections for minimal effort, and if userspace can restrict
the danger zone to one specific path, then userspace can easily do its own auditing
for that one path.

2022-01-17 17:07:08

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Sean Christopherson <[email protected]> writes:

> On Fri, Jan 14, 2022, Vitaly Kuznetsov wrote:
>> Igor Mammedov <[email protected]> writes:
>>
>> > On Fri, 14 Jan 2022 10:31:50 +0100
>> > Vitaly Kuznetsov <[email protected]> wrote:
>> >
>> >> Igor Mammedov <[email protected]> writes:
>> >>
>> >>
>> >> > However, a problem of failing KVM_SET_CPUID2 during VCPU re-plug
>> >> > is still there and re-plug will fail if KVM rejects repeated KVM_SET_CPUID2
>> >> > even if ioctl called with exactly the same CPUID leafs as the 1st call.
>> >> >
>> >>
>> >> Assuming APIC id change doesn not need to be supported, I can send v2
>> >> here with an empty allowlist.
>> > As you mentioned in another thread black list would be better
>> > to address Sean's concerns or just revert problematic commit.
>> >
>>
>> Personally, I'm leaning towards the blocklist approach even if just for
>> 'documenting' the fact that KVM doesn't correctly handle the
>> change. Compared to a comment in the code, such approach could help
>> someone save tons of debugging time (if anyone ever decides do something
>> weird, like changing MAXPHYADDR on the fly).
>
> I assume the blocklist approach is let userspace opt into rejecting KVM_SET_CPUID{,2},
> but allow all CPUID leafs and sub-leafs to be modified at will by
> default?

No, honestly I was thinking about something much simpler: instead of
forbidding KVM_SET_CPUID{,2} after KVM_RUN completely (what we have now
in 5.16), we only forbid to change certain data which we know breaks
some assumptions in MMU, from the comment:
"
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs.
"
It seems that CPU hotplug path doesn't need to change these so we don't
need an opt-in/opt-out, we can just forbid changing certain things for
the time being. Alternatively, we can silently ignore such changes but I
don't quite like it because it would mask bugs in VMMs.

> I don't dislike the idea, but I wonder if it's unnecessarily fancy.
>
> What if we instead provide an ioctl/capability to let userspace toggle disabling
> of KVM_SET_CPUID{,2}, a la STAC/CLAC to override SMAP? E.g. QEMU could enable
> protections after initially creating the vCPU, then temporarily
> disable protections only for the hotplug path?
>
> That'd provide solid protections for minimal effort, and if userspace can restrict
> the danger zone to one specific path, then userspace can easily do its own auditing
> for that one path.

Could work but it seems the protection would only "protect" VMM from
shooting itself in the foot and will likely result in killing the guest
anyway so I'm wondering if it's worth it.

--
Vitaly

2022-01-17 17:13:21

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

On 1/17/22 10:55, Vitaly Kuznetsov wrote:
> No, honestly I was thinking about something much simpler: instead of
> forbidding KVM_SET_CPUID{,2} after KVM_RUN completely (what we have now
> in 5.16), we only forbid to change certain data which we know breaks
> some assumptions in MMU, from the comment:
> "
> * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
> * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
> * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
> * faults due to reusing SPs/SPTEs.
> "
> It seems that CPU hotplug path doesn't need to change these so we don't
> need an opt-in/opt-out, we can just forbid changing certain things for
> the time being. Alternatively, we can silently ignore such changes but I
> don't quite like it because it would mask bugs in VMMs.

I think the version that only allows exactly the same CPUID is the best,
as it leaves less room for future bugs.

Paolo

2022-01-18 02:26:29

by Vitaly Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 2/2] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN

Paolo Bonzini <[email protected]> writes:

> On 1/17/22 10:55, Vitaly Kuznetsov wrote:
>> No, honestly I was thinking about something much simpler: instead of
>> forbidding KVM_SET_CPUID{,2} after KVM_RUN completely (what we have now
>> in 5.16), we only forbid to change certain data which we know breaks
>> some assumptions in MMU, from the comment:
>> "
>> * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
>> * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
>> * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
>> * faults due to reusing SPs/SPTEs.
>> "
>> It seems that CPU hotplug path doesn't need to change these so we don't
>> need an opt-in/opt-out, we can just forbid changing certain things for
>> the time being. Alternatively, we can silently ignore such changes but I
>> don't quite like it because it would mask bugs in VMMs.
>
> I think the version that only allows exactly the same CPUID is the best,
> as it leaves less room for future bugs.
>

Ok, I hear your vote) Will prepare v2.

--
Vitaly