The problem is described in [4/4].
[1/4] is a prerequisite to allow a cleanup in [2/4].
[2/4] hopefully makes [4/4] easier to understand.
[3/4] fixes mistakes from dealing with the mixed mode.
[4/4] allows the hotplug and and add a capability for it.
Radim Krčmář (4):
KVM: x86: use delivery to self in hyperv synic
KVM: x86: replace kvm_apic_id with kvm_{x,x2}apic_id
KVM: x86: make interrupt delivery fast and slow path behave the same
KVM: x86: allow hotplug of VCPU with APIC ID over 0xff
arch/x86/kvm/hyperv.c | 4 ++--
arch/x86/kvm/lapic.c | 62 ++++++++++++++++++++++++++++++++++--------------
arch/x86/kvm/lapic.h | 11 ---------
arch/x86/kvm/x86.c | 1 +
include/uapi/linux/kvm.h | 1 +
5 files changed, 48 insertions(+), 31 deletions(-)
--
2.10.2
There were three calls sites:
- recalculate_apic_map and kvm_apic_match_physical_addr, where it would
only complicate implementation of x2APIC hotplug;
- in apic_debug, where it was still somewhat preserved, but keeping the
old function just for apic_debug was not worth it
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/lapic.c | 41 ++++++++++++++++++++++++++++++-----------
arch/x86/kvm/lapic.h | 11 -----------
2 files changed, 30 insertions(+), 22 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 09edd32b8e42..e645b4bc6437 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+ return apic->vcpu->vcpu_id;
+}
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
switch (map->mode) {
@@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
int i;
- u32 max_id = 255;
+ u32 max_id = 256; /* enough space for any xAPIC ID */
mutex_lock(&kvm->arch.apic_map_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
- max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
@@ -179,16 +189,23 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_lapic **cluster;
u16 mask;
- u32 ldr, aid;
+ u32 ldr;
+ u8 xapic_id;
+ u32 x2apic_id;
if (!kvm_apic_present(vcpu))
continue;
- aid = kvm_apic_id(apic);
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ xapic_id = kvm_xapic_id(apic);
+ x2apic_id = kvm_x2apic_id(apic);
- if (aid <= new->max_apic_id)
- new->phys_map[aid] = apic;
+ if (apic_x2apic_mode(apic) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+ else if (!apic_x2apic_mode(apic))
+ new->phys_map[xapic_id] = apic;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
if (apic_x2apic_mode(apic)) {
new->mode |= KVM_APIC_MODE_X2APIC;
@@ -250,6 +267,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
recalculate_apic_map(apic->vcpu->kvm);
@@ -591,9 +610,9 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
return true;
if (apic_x2apic_mode(apic))
- return mda == kvm_apic_id(apic);
+ return mda == kvm_x2apic_id(apic);
- return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
+ return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic));
}
static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -1907,9 +1926,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
- apic_debug("%s: vcpu=%p, id=%d, base_msr="
+ apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
- vcpu, kvm_apic_id(apic),
+ vcpu, kvm_lapic_get_reg(apic, APIC_ID),
vcpu->arch.apic_base, apic->base_address);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e0c80233b3e1..cb16e6fd2330 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -202,17 +202,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
-static inline u32 kvm_apic_id(struct kvm_lapic *apic)
-{
- /* To avoid a race between apic_base and following APIC_ID update when
- * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
- */
- if (apic_x2apic_mode(apic))
- return apic->vcpu->vcpu_id;
-
- return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
--
2.10.2
Slow path tried to prevent IPIs from x2APIC VCPUs from being delivered
to xAPIC VCPUs and vice-versa. Make slow path behave like fast path,
which never distinguished that.
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/lapic.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e645b4bc6437..42edf1ea2909 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -612,7 +612,7 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
if (apic_x2apic_mode(apic))
return mda == kvm_x2apic_id(apic);
- return mda == SET_APIC_DEST_FIELD(kvm_xapic_id(apic));
+ return mda == kvm_xapic_id(apic);
}
static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -629,7 +629,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
&& (logical_id & mda & 0xffff) != 0;
logical_id = GET_APIC_LOGICAL_ID(logical_id);
- mda = GET_APIC_DEST_FIELD(mda);
switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
@@ -646,9 +645,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
/* The KVM local APIC implementation has two quirks:
*
- * - the xAPIC MDA stores the destination at bits 24-31, while this
- * is not true of struct kvm_lapic_irq's dest_id field. This is
- * just a quirk in the API and is not problematic.
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
*
* - in-kernel IOAPIC messages have to be delivered directly to
* x2APIC, because the kernel does not support interrupt remapping.
@@ -664,13 +663,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
struct kvm_lapic *source, struct kvm_lapic *target)
{
bool ipi = source != NULL;
- bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
- !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
return X2APIC_BROADCAST;
- return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+ return dest_id;
}
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
--
2.10.2
LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
but there is no way to uniquely address it using xAPIC.
>From many possible options, we chose the one that also works on real
hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
xAPIC mode.
KVM intentionally differs from real hardware, because real hardware
(Knights Landing) does just "x2apic_id & 0xff" to decide whether to
accept the interrupt in xAPIC mode and it can deliver one interrupt to
more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
Add a capability to let userspace know that we do something now.
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/lapic.c | 13 +++++++++++--
arch/x86/kvm/x86.c | 1 +
include/uapi/linux/kvm.h | 1 +
3 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 42edf1ea2909..b85985985ac8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -199,10 +199,15 @@ static void recalculate_apic_map(struct kvm *kvm)
xapic_id = kvm_xapic_id(apic);
x2apic_id = kvm_x2apic_id(apic);
- if (apic_x2apic_mode(apic) &&
+ /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
x2apic_id <= new->max_apic_id)
new->phys_map[x2apic_id] = apic;
- else if (!apic_x2apic_mode(apic))
+ /*
+ * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
+ * prevent them from masking VCPUs with APIC ID <= 0xff.
+ */
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
new->phys_map[xapic_id] = apic;
ldr = kvm_lapic_get_reg(apic, APIC_LDR);
@@ -612,6 +617,10 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
if (apic_x2apic_mode(apic))
return mda == kvm_x2apic_id(apic);
+ /* Hotplug hack: LAPIC in xAPIC mode also accepts x2APIC. */
+ if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
+ return true;
+
return mda == kvm_xapic_id(apic);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7770d77c828d..945e8eeb4eb1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2618,6 +2618,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_X86_ALWAYS_ACCEPT_X2APIC_DEST:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e9f5ceffd741..f25efa375255 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -871,6 +871,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_S390_USER_INSTR0 130
#define KVM_CAP_MSI_DEVID 131
#define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_X86_ALWAYS_ACCEPT_X2APIC_DEST 133
#ifdef KVM_CAP_IRQ_ROUTING
--
2.10.2
Interrupt to self can sent without knowing the APIC ID.
Signed-off-by: Radim Krčmář <[email protected]>
---
arch/x86/kvm/hyperv.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 99cde5220e07..313957ec9a9d 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return -ENOENT;
memset(&irq, 0, sizeof(irq));
- irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+ irq.shorthand = APIC_DEST_SELF;
irq.dest_mode = APIC_DEST_PHYSICAL;
irq.delivery_mode = APIC_DM_FIXED;
irq.vector = vector;
irq.level = 1;
- ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
return ret;
}
--
2.10.2
Am 02.12.2016 um 20:44 schrieb Radim Krčmář:
> LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
> VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
> but there is no way to uniquely address it using xAPIC.
>
> From many possible options, we chose the one that also works on real
> hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
> xAPIC mode.
>
> KVM intentionally differs from real hardware, because real hardware
> (Knights Landing) does just "x2apic_id & 0xff" to decide whether to
> accept the interrupt in xAPIC mode and it can deliver one interrupt to
> more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
>
> Add a capability to let userspace know that we do something now.
Should we allow user space to turn it on/off for compatibility handling?
Or do we just not care? (or how will this capability be used later on?)
--
David
Am 02.12.2016 um 20:43 schrieb Radim Krčmář:
> Interrupt to self can sent without knowing the APIC ID.
can _be_ sent?
Looks sane to me.
Reviewed-by: David Hildenbrand <[email protected]>
--
David
2016-12-05 15:37+0100, David Hildenbrand:
> Am 02.12.2016 um 20:44 schrieb Radim Krčmář:
>> LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
>> VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
>> but there is no way to uniquely address it using xAPIC.
>>
>> From many possible options, we chose the one that also works on real
>> hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
>> xAPIC mode.
>>
>> KVM intentionally differs from real hardware, because real hardware
>> (Knights Landing) does just "x2apic_id & 0xff" to decide whether to
>> accept the interrupt in xAPIC mode and it can deliver one interrupt to
>> more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
>>
>> Add a capability to let userspace know that we do something now.
>
> Should we allow user space to turn it on/off for compatibility handling? Or
> do we just not care?
There should be no guest that relies on the previous behavior, so I'd
forgo the toggle, because it would be extra conditions in the code.
I'd add it as a flag to KVM_CAP_X2APIC_API if you have reasons to let
userspace choose.
> (or how will this capability be used later on?)
New userspace should check this capability and disable hotplug of VCPUs
with id over 255 if KVM doesn't support it.
2016-12-05 15:41+0100, David Hildenbrand:
> Am 02.12.2016 um 20:43 schrieb Radim Krčmář:
>> Interrupt to self can sent without knowing the APIC ID.
>
> can _be_ sent?
Yes, thanks, I'll fix it in v2 or when applying your r-b.
Am 05.12.2016 um 17:02 schrieb Radim Krčmář:
> 2016-12-05 15:37+0100, David Hildenbrand:
>> Am 02.12.2016 um 20:44 schrieb Radim Krčmář:
>>> LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
>>> VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
>>> but there is no way to uniquely address it using xAPIC.
>>>
>>> From many possible options, we chose the one that also works on real
>>> hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
>>> xAPIC mode.
>>>
>>> KVM intentionally differs from real hardware, because real hardware
>>> (Knights Landing) does just "x2apic_id & 0xff" to decide whether to
>>> accept the interrupt in xAPIC mode and it can deliver one interrupt to
>>> more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
>>>
>>> Add a capability to let userspace know that we do something now.
>>
>> Should we allow user space to turn it on/off for compatibility handling? Or
>> do we just not care?
>
> There should be no guest that relies on the previous behavior, so I'd
> forgo the toggle, because it would be extra conditions in the code.
> I'd add it as a flag to KVM_CAP_X2APIC_API if you have reasons to let
> userspace choose.
Okay I see. So if existing user space/guests don't break, there is no
reason to make it configurable. I was just not sure if user space might
want to decide whether to act "the old way".
>
>> (or how will this capability be used later on?)
>
> New userspace should check this capability and disable hotplug of VCPUs
> with id over 255 if KVM doesn't support it.
>
Wonder if this is actually a bugfix for allowing KVM_MAX_VCPU_ID to
be > 255. Currently it is somewhat like
"yes, I support VCPU ids with > 255, but no, you can't really hotplug
such CPUs".
(fix for older kernels would then be limiting the VCPU ID to 255 and
not introducing a new capability).
But I am no expert on that topic, so feel free to ignore.
The general idea of this patch makes sense to me (x2apic hack)!
--
David
2016-12-05 19:00+0100, David Hildenbrand:
> Am 05.12.2016 um 17:02 schrieb Radim Krčmář:
>> 2016-12-05 15:37+0100, David Hildenbrand:
>> > Am 02.12.2016 um 20:44 schrieb Radim Krčmář:
>> > > LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
>> > > VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
>> > > but there is no way to uniquely address it using xAPIC.
>> > >
>> > > From many possible options, we chose the one that also works on real
>> > > hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
>> > > xAPIC mode.
>> > >
>> > > KVM intentionally differs from real hardware, because real hardware
>> > > (Knights Landing) does just "x2apic_id & 0xff" to decide whether to
>> > > accept the interrupt in xAPIC mode and it can deliver one interrupt to
>> > > more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
>> > >
>> > > Add a capability to let userspace know that we do something now.
>> >
>> > Should we allow user space to turn it on/off for compatibility handling? Or
>> > do we just not care?
>>
>> There should be no guest that relies on the previous behavior, so I'd
>> forgo the toggle, because it would be extra conditions in the code.
>> I'd add it as a flag to KVM_CAP_X2APIC_API if you have reasons to let
>> userspace choose.
>
> Okay I see. So if existing user space/guests don't break, there is no reason
> to make it configurable. I was just not sure if user space might want to
> decide whether to act "the old way".
I also don't see a reason for userspace to want it disabled -- it just
shouldn't matter even if userspace implements another solution (e.g. it
hotplugs VCPUs in x2APIC mode) or KVM ends up with a better solution.
Any change can break some guest, but I couldn't with anything reasonable
that would be broken.
>> > (or how will this capability be used later on?)
>>
>> New userspace should check this capability and disable hotplug of VCPUs
>> with id over 255 if KVM doesn't support it.
>>
>
> Wonder if this is actually a bugfix for allowing KVM_MAX_VCPU_ID to
> be > 255. Currently it is somewhat like
Good point, it is, for guests that want hotplug. I'll add Fixes: line;
thanks!
> "yes, I support VCPU ids with > 255, but no, you can't really hotplug
> such CPUs".
My bad, offline/online in Linux worked fine so I didn't think enough
about hotplug.
> (fix for older kernels would then be limiting the VCPU ID to 255 and
> not introducing a new capability).
>
> But I am no expert on that topic, so feel free to ignore.
I think the agreement is to embrace compatibility, so we pile new
mistakes to hide known ones.
(Rewriting the past requires far more power than accepting it:
If we didn't force unfixed kernels out of existence, then userspace
couldn't tell if hotplug up to high VCPU ID limit is supported.)
> The general idea of this patch makes sense to me (x2apic hack)!
The situation would be a bit better if xAPIC ID was read-only (we'd
behave more like real-hardware then), but no major OS changes the ID,
which makes it a secondary concern with weird corner-cases.
> I think the agreement is to embrace compatibility, so we pile new
> mistakes to hide known ones.
> (Rewriting the past requires far more power than accepting it:
> If we didn't force unfixed kernels out of existence, then userspace
> couldn't tell if hotplug up to high VCPU ID limit is supported.)
I agree, the question is how old the bug is (you should know better than
me :) ) and if introducing a capability is strictly necessary. Do we
have to do the check in QEMU or can we simply fix implementations out
there silently.
(especially as hotplugging cpuid > 255 doesn't sound like setups wildly
used already today - and it doesn't work ;) ). But as I said, I don't
know the history, so you decide if this check in QEMU is necessary.
Fix all QEMUs (introduce capability check) vs fix all relevant kernels
(limiting VCPU id to 255).
--
David
2016-12-06 10:37+0100, David Hildenbrand:
>> I think the agreement is to embrace compatibility, so we pile new
>> mistakes to hide known ones.
>> (Rewriting the past requires far more power than accepting it:
>> If we didn't force unfixed kernels out of existence, then userspace
>> couldn't tell if hotplug up to high VCPU ID limit is supported.)
>
> I agree, the question is how old the bug is (you should know better than me
> :) )
Just half a year old, since v4.7.
> and if introducing a capability is strictly necessary. Do we have to do
> the check in QEMU or can we simply fix implementations out there silently.
This fix is too big for stable and I don't think that patches outside of
stable get backported much.
> (especially as hotplugging cpuid > 255 doesn't sound like setups wildly used
> already today - and it doesn't work ;) ).
Yes, it seems that no-one using high APIC ID noticed/cared.
> But as I said, I don't know the
> history, so you decide if this check in QEMU is necessary.
QEMU can decide not to check (I actually expect it won't :]).
I think the option to check is worth two lines of code in KVM, though.
> Fix all QEMUs (introduce capability check) vs fix all relevant kernels
> (limiting VCPU id to 255).
APID ID over 255 works without hotplug and has few users, so lowering
the limit would regress cases that are more important, IMO.
On 02/12/2016 20:44, Radim Krčmář wrote:
> LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
> VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
> but there is no way to uniquely address it using xAPIC.
>
> From many possible options, we chose the one that also works on real
> hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
> xAPIC mode.
>
> KVM intentionally differs from real hardware, because real hardware
> (Knights Landing) does just "x2apic_id & 0xff" to decide whether to
> accept the interrupt in xAPIC mode and it can deliver one interrupt to
> more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
>
> Add a capability to let userspace know that we do something now.
I wouldn't even bother with the capability. It's a bit borderline for
stable, but I think it's okay. We can always Cc and let the maintainer
reject it.
Paolo
> Signed-off-by: Radim Krčmář <[email protected]>
> ---
> arch/x86/kvm/lapic.c | 13 +++++++++++--
> arch/x86/kvm/x86.c | 1 +
> include/uapi/linux/kvm.h | 1 +
> 3 files changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 42edf1ea2909..b85985985ac8 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -199,10 +199,15 @@ static void recalculate_apic_map(struct kvm *kvm)
> xapic_id = kvm_xapic_id(apic);
> x2apic_id = kvm_x2apic_id(apic);
>
> - if (apic_x2apic_mode(apic) &&
> + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
> + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
> x2apic_id <= new->max_apic_id)
> new->phys_map[x2apic_id] = apic;
> - else if (!apic_x2apic_mode(apic))
> + /*
> + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
> + * prevent them from masking VCPUs with APIC ID <= 0xff.
> + */
> + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
> new->phys_map[xapic_id] = apic;
>
> ldr = kvm_lapic_get_reg(apic, APIC_LDR);
> @@ -612,6 +617,10 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
> if (apic_x2apic_mode(apic))
> return mda == kvm_x2apic_id(apic);
>
> + /* Hotplug hack: LAPIC in xAPIC mode also accepts x2APIC. */
> + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
> + return true;
> +
> return mda == kvm_xapic_id(apic);
> }
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 7770d77c828d..945e8eeb4eb1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2618,6 +2618,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_DISABLE_QUIRKS:
> case KVM_CAP_SET_BOOT_CPU_ID:
> case KVM_CAP_SPLIT_IRQCHIP:
> + case KVM_CAP_X86_ALWAYS_ACCEPT_X2APIC_DEST:
> #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
> case KVM_CAP_ASSIGN_DEV_IRQ:
> case KVM_CAP_PCI_2_3:
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index e9f5ceffd741..f25efa375255 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -871,6 +871,7 @@ struct kvm_ppc_smmu_info {
> #define KVM_CAP_S390_USER_INSTR0 130
> #define KVM_CAP_MSI_DEVID 131
> #define KVM_CAP_PPC_HTM 132
> +#define KVM_CAP_X86_ALWAYS_ACCEPT_X2APIC_DEST 133
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
>
2016-12-07 13:03+0100, Paolo Bonzini:
> On 02/12/2016 20:44, Radim Krčmář wrote:
>> LAPIC after reset is in xAPIC mode, which poses a problem for hotplug of
>> VCPUs with high APIC ID, because reset VCPU is waiting for INIT/SIPI,
>> but there is no way to uniquely address it using xAPIC.
>>
>> From many possible options, we chose the one that also works on real
>> hardware: accepting interrupts addressed to LAPIC's x2APIC ID even in
>> xAPIC mode.
>>
>> KVM intentionally differs from real hardware, because real hardware
>> (Knights Landing) does just "x2apic_id & 0xff" to decide whether to
>> accept the interrupt in xAPIC mode and it can deliver one interrupt to
>> more than one physical destination, e.g. 0x123 to 0x123 and 0x23.
>>
>> Add a capability to let userspace know that we do something now.
>
> I wouldn't even bother with the capability. It's a bit borderline for
> stable, but I think it's okay. We can always Cc and let the maintainer
> reject it.
Ok, David also mentioned that I should tone down compatibility ...
Thanks to both, I'll prepare v2.