This patchset enables running KVM SMP guests with external interrupts on an
underlying RT-enabled Linux. Previous to this patch, a guest with in-kernel MPIC
emulation could easily panic the kernel due to preemption when delivering IPIs
and external interrupts, because of the openpic spinlock becoming a sleeping
mutex on PREEMPT_RT_FULL Linux.
0001: converts the openpic spinlock to a raw spinlock, in order to circumvent
this behavior. While this change is targeted for a RT enabled Linux, it has no
effect on upstream kvm-ppc, so send it upstream for better future maintenance.
0002: disables in-kernel MPIC emulation for guest running on RT, in order to
prevent a potential DoS attack due to large system latencies. This patch is
targeted to RT (due to CONFIG_PREEMPT_RT_FULL), but it can also be applied on
upstream Linux, with no effect.
- applied & compiled against vanilla 4.0
- applied & compiled against stable-rt 3.18-rt
v2:
- updated commit messages
- change the fix for potentially large latencies from limiting the max number of
VCPUs a guest can have to disabling the in-kernel MPIC
Bogdan Purcareata (2):
powerpc/kvm: Convert openpic lock to raw_spinlock
powerpc/kvm: Disable in-kernel MPIC emulation for PREEMPT_RT_FULL
arch/powerpc/kvm/Kconfig | 1 +
arch/powerpc/kvm/mpic.c | 44 ++++++++++++++++++++++----------------------
2 files changed, 23 insertions(+), 22 deletions(-)
--
2.1.4
The lock in the KVM openpic emulation on PPC is a spinlock_t, meaning it becomes
a sleeping mutex under PREEMPT_RT_FULL. This yields to a situation where this
non-raw lock is grabbed with interrupts already disabled by hard_irq_disable():
kvmppc_prepare_to_enter()
hard_irq_disable()
kvmppc_core_prepare_to_enter()
kvmppc_core_check_exceptions()
kvmppc_booke_irqprio_deliver()
kvmppc_mpic_set_epr()
spin_lock_irqsave()
...
This happens for guest interrupts that go through this openpic emulation code.
The result is a kernel crash on guest enter (include/linux/kvm_host.h:784).
Converting the lock to a raw_spinlock fixes the issue and enables the guest to
run I/O intensive workloads in a SMP configuration. A similar fix can be found
for the i8254 PIT emulation on x86 [1].
[1] https://lkml.org/lkml/2010/1/11/289
v2:
- updated commit message
Signed-off-by: Bogdan Purcareata <[email protected]>
---
arch/powerpc/kvm/mpic.c | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 6249cdc..2f70660 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -196,7 +196,7 @@ struct openpic {
int num_mmio_regions;
gpa_t reg_base;
- spinlock_t lock;
+ raw_spinlock_t lock;
/* Behavior control */
struct fsl_mpic_info *fsl;
@@ -1103,9 +1103,9 @@ static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
}
- spin_unlock(&opp->lock);
+ raw_spin_unlock(&opp->lock);
kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
- spin_lock(&opp->lock);
+ raw_spin_lock(&opp->lock);
break;
}
@@ -1180,12 +1180,12 @@ void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
int cpu = vcpu->arch.irq_cpu_id;
unsigned long flags;
- spin_lock_irqsave(&opp->lock, flags);
+ raw_spin_lock_irqsave(&opp->lock, flags);
if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
- spin_unlock_irqrestore(&opp->lock, flags);
+ raw_spin_unlock_irqrestore(&opp->lock, flags);
}
static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
@@ -1386,9 +1386,9 @@ static int kvm_mpic_read(struct kvm_vcpu *vcpu,
return -EINVAL;
}
- spin_lock_irq(&opp->lock);
+ raw_spin_lock_irq(&opp->lock);
ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
- spin_unlock_irq(&opp->lock);
+ raw_spin_unlock_irq(&opp->lock);
/*
* Technically only 32-bit accesses are allowed, but be nice to
@@ -1427,10 +1427,10 @@ static int kvm_mpic_write(struct kvm_vcpu *vcpu,
return -EOPNOTSUPP;
}
- spin_lock_irq(&opp->lock);
+ raw_spin_lock_irq(&opp->lock);
ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
*(const u32 *)ptr);
- spin_unlock_irq(&opp->lock);
+ raw_spin_unlock_irq(&opp->lock);
pr_debug("%s: addr %llx ret %d val %x\n",
__func__, addr, ret, *(const u32 *)ptr);
@@ -1501,14 +1501,14 @@ static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
if (addr & 3)
return -ENXIO;
- spin_lock_irq(&opp->lock);
+ raw_spin_lock_irq(&opp->lock);
if (type == ATTR_SET)
ret = kvm_mpic_write_internal(opp, addr, *val);
else
ret = kvm_mpic_read_internal(opp, addr, val);
- spin_unlock_irq(&opp->lock);
+ raw_spin_unlock_irq(&opp->lock);
pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
@@ -1545,9 +1545,9 @@ static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
if (attr32 != 0 && attr32 != 1)
return -EINVAL;
- spin_lock_irq(&opp->lock);
+ raw_spin_lock_irq(&opp->lock);
openpic_set_irq(opp, attr->attr, attr32);
- spin_unlock_irq(&opp->lock);
+ raw_spin_unlock_irq(&opp->lock);
return 0;
}
@@ -1592,9 +1592,9 @@ static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
if (attr->attr > MAX_SRC)
return -EINVAL;
- spin_lock_irq(&opp->lock);
+ raw_spin_lock_irq(&opp->lock);
attr32 = opp->src[attr->attr].pending;
- spin_unlock_irq(&opp->lock);
+ raw_spin_unlock_irq(&opp->lock);
if (put_user(attr32, (u32 __user *)(long)attr->addr))
return -EFAULT;
@@ -1670,7 +1670,7 @@ static int mpic_create(struct kvm_device *dev, u32 type)
opp->kvm = dev->kvm;
opp->dev = dev;
opp->model = type;
- spin_lock_init(&opp->lock);
+ raw_spin_lock_init(&opp->lock);
add_mmio_region(opp, &openpic_gbl_mmio);
add_mmio_region(opp, &openpic_tmr_mmio);
@@ -1743,7 +1743,7 @@ int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
if (cpu < 0 || cpu >= MAX_CPU)
return -EPERM;
- spin_lock_irq(&opp->lock);
+ raw_spin_lock_irq(&opp->lock);
if (opp->dst[cpu].vcpu) {
ret = -EEXIST;
@@ -1766,7 +1766,7 @@ int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
out:
- spin_unlock_irq(&opp->lock);
+ raw_spin_unlock_irq(&opp->lock);
return ret;
}
@@ -1796,9 +1796,9 @@ static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
struct openpic *opp = kvm->arch.mpic;
unsigned long flags;
- spin_lock_irqsave(&opp->lock, flags);
+ raw_spin_lock_irqsave(&opp->lock, flags);
openpic_set_irq(opp, irq, level);
- spin_unlock_irqrestore(&opp->lock, flags);
+ raw_spin_unlock_irqrestore(&opp->lock, flags);
/* All code paths we care about don't check for the return value */
return 0;
@@ -1810,14 +1810,14 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct openpic *opp = kvm->arch.mpic;
unsigned long flags;
- spin_lock_irqsave(&opp->lock, flags);
+ raw_spin_lock_irqsave(&opp->lock, flags);
/*
* XXX We ignore the target address for now, as we only support
* a single MSI bank.
*/
openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
- spin_unlock_irqrestore(&opp->lock, flags);
+ raw_spin_unlock_irqrestore(&opp->lock, flags);
/* All code paths we care about don't check for the return value */
return 0;
--
2.1.4
While converting the openpic emulation code to use a raw_spinlock_t enables
guests to run on RT, there's still a performance issue. For interrupts sent in
directed delivery mode with a multiple CPU mask, the emulated openpic will loop
through all of the VCPUs, and for each VCPUs, it call IRQ_check, which will loop
through all the pending interrupts for that VCPU. This is done while holding the
raw_lock, meaning that in all this time the interrupts and preemption are
disabled on the host Linux. A malicious user app can max both these number and
cause a DoS.
This temporary fix is sent for two reasons. First is so that users who want to
use the in-kernel MPIC emulation are aware of the potential latencies, thus
making sure that the hardware MPIC and their usage scenario does not involve
interrupts sent in directed delivery mode, and the number of possible pending
interrupts is kept small. Secondly, this should incentivize the development of a
proper openpic emulation that would be better suited for RT.
Signed-off-by: Bogdan Purcareata <[email protected]>
---
arch/powerpc/kvm/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 11850f3..415499a 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -158,6 +158,7 @@ config KVM_E500MC
config KVM_MPIC
bool "KVM in-kernel MPIC emulation"
depends on KVM && E500
+ depends on !PREEMPT_RT_FULL
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
--
2.1.4
* Bogdan Purcareata | 2015-04-24 15:53:11 [+0000]:
>0001: converts the openpic spinlock to a raw spinlock, in order to circumvent
>0002: disables in-kernel MPIC emulation for guest running on RT, in order to
Scott, I'm asking here for your explicit Acked-by on those two patches.
That you want it *that* way. We have now a nice explanation in #1 :)
Sebastian
On Thu, 2015-05-14 at 17:36 +0200, Sebastian Andrzej Siewior wrote:
> * Bogdan Purcareata | 2015-04-24 15:53:11 [+0000]:
>
> >0001: converts the openpic spinlock to a raw spinlock, in order to circumvent
> >0002: disables in-kernel MPIC emulation for guest running on RT, in order to
>
> Scott, I'm asking here for your explicit Acked-by on those two patches.
> That you want it *that* way. We have now a nice explanation in #1 :)
ACK patch 2 only. Patch 1 is unnecessary as long as we have patch 2,
and the most likely long-term solutions do not involve converting all
the locking in that file to a raw lock.
-Scott
* Bogdan Purcareata | 2015-04-24 15:53:13 [+0000]:
>While converting the openpic emulation code to use a raw_spinlock_t enables
>guests to run on RT, there's still a performance issue. For interrupts sent in
>Signed-off-by: Bogdan Purcareata <[email protected]>
Applied with Scott's Acked-by
Sebastian