The existing KVM mechanism for forwarding of level-triggered interrupts
using resample eventfd doesn't work quite correctly in the case of
interrupts that are handled in a Linux guest as oneshot interrupts
(IRQF_ONESHOT). Such an interrupt is acked to the device in its
threaded irq handler, i.e. later than it is acked to the interrupt
controller (EOI at the end of hardirq), not earlier. The existing KVM
code doesn't take that into account, which results in erroneous extra
interrupts in the guest caused by premature re-assert of an
unacknowledged IRQ by the host.
This patch series fixes this issue (for now on x86 only) by checking if
the interrupt is unmasked when we receive irq ack (EOI) and, in case if
it's masked, postponing resamplefd notify until the guest unmasks it.
Patches 1 and 2 implement the prerequisites needed for KVM irqfd to
know the interrupt mask state. Patch 3 implements the actual fix:
postponing resamplefd notify in KVM irqfd until the irq is unmasked.
Please see individual patches for more details.
Dmytro Maluka (3):
KVM: x86: Move kvm_(un)register_irq_mask_notifier() to generic KVM
KVM: x86: Add kvm_irq_is_masked()
KVM: irqfd: Postpone resamplefd notify for oneshot interrupts
arch/x86/include/asm/kvm_host.h | 11 +-----
arch/x86/kvm/i8259.c | 11 ++++++
arch/x86/kvm/ioapic.c | 11 ++++++
arch/x86/kvm/ioapic.h | 1 +
arch/x86/kvm/irq_comm.c | 34 +++++++++---------
include/linux/kvm_host.h | 13 +++++++
include/linux/kvm_irqfd.h | 14 ++++++++
virt/kvm/eventfd.c | 63 +++++++++++++++++++++++++++++++++
virt/kvm/irqchip.c | 34 ++++++++++++++++++
9 files changed, 164 insertions(+), 28 deletions(-)
--
2.37.0.170.g444d1eabd0-goog
In preparation for implementing postponing resamplefd event until the
interrupt is unmasked, move kvm_(un)register_irq_mask_notifier() from
x86 to arch-independent code to make it usable by irqfd.
Note that calling mask notifiers is still implemented for x86 only, so
registering mask notifiers on non-x86 will have no effect.
Link: https://lore.kernel.org/kvm/[email protected]/
Signed-off-by: Dmytro Maluka <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 10 ----------
arch/x86/kvm/irq_comm.c | 18 ------------------
include/linux/kvm_host.h | 10 ++++++++++
virt/kvm/eventfd.c | 18 ++++++++++++++++++
4 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9217bd6cf0d1..39a867d68721 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1688,16 +1688,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-struct kvm_irq_mask_notifier {
- void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
- int irq;
- struct hlist_node link;
-};
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 0687162c4f22..43e13892ed34 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -234,24 +234,6 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
mutex_unlock(&kvm->irq_lock);
}
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- kimn->irq = irq;
- hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_rcu(&kimn->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
-}
-
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask)
{
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 90a45ef7203b..9e12ef503157 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1581,6 +1581,12 @@ struct kvm_irq_ack_notifier {
void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
int kvm_irq_map_gsi(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *entries, int gsi);
int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);
@@ -1599,6 +1605,10 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
int kvm_request_irq_source_id(struct kvm *kvm);
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2a3ed401ce46..50ddb1d1a7f0 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -520,6 +520,24 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
}
#endif
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
void
kvm_eventfd_init(struct kvm *kvm)
{
--
2.37.0.170.g444d1eabd0-goog
In order to implement postponing resamplefd event until an interrupt is
unmasked, we need not only to track changes of the interrupt mask state
(which is made possible by the previous patch "KVM: x86: Move
kvm_(un)register_irq_mask_notifier() to generic KVM") but also to know
its initial mask state at the time of registering a resamplefd
listener. So implement kvm_irq_is_masked() for that.
Actually, for now it's implemented for x86 only (see below).
The implementation is trickier than I'd like it to be, for 2 reasons:
1. Interrupt (GSI) to irqchip pin mapping is not a 1:1 mapping: an IRQ
may map to multiple pins on different irqchips. I guess the only
reason for that is to support x86 interrupts 0-15 for which we don't
know if the guest uses PIC or IOAPIC. For this reason kvm_set_irq()
delivers interrupt to both, assuming the guest will ignore the
unused one. For the same reason, in kvm_irq_is_masked() we should
also take into account the mask state of both irqchips. We consider
an interrupt unmasked if and only if it is unmasked in at least one
of PIC or IOAPIC, assuming that in the unused one all the interrupts
should be masked.
2. For now ->is_masked() implemented for x86 only, so need to handle
the case when ->is_masked() is not provided by the irqchip. In such
case kvm_irq_is_masked() returns failure, and its caller may fall
back to an assumption that an interrupt is always unmasked.
Link: https://lore.kernel.org/kvm/[email protected]/
Signed-off-by: Dmytro Maluka <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/i8259.c | 11 +++++++++++
arch/x86/kvm/ioapic.c | 11 +++++++++++
arch/x86/kvm/ioapic.h | 1 +
arch/x86/kvm/irq_comm.c | 16 ++++++++++++++++
include/linux/kvm_host.h | 3 +++
virt/kvm/irqchip.c | 34 +++++++++++++++++++++++++++++++++
7 files changed, 77 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 39a867d68721..64618b890700 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1840,6 +1840,7 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
+bool kvm_pic_irq_is_masked(struct kvm_pic *pic, int irq);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e1bb6218bb96..2d1ed3bc7cc5 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -211,6 +211,17 @@ void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
pic_unlock(s);
}
+bool kvm_pic_irq_is_masked(struct kvm_pic *s, int irq)
+{
+ bool ret;
+
+ pic_lock(s);
+ ret = !!(s->pics[irq >> 3].imr & (1 << irq));
+ pic_unlock(s);
+
+ return ret;
+}
+
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 765943d7cfa5..874f68a65c87 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -478,6 +478,17 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
spin_unlock(&ioapic->lock);
}
+bool kvm_ioapic_irq_is_masked(struct kvm_ioapic *ioapic, int irq)
+{
+ bool ret;
+
+ spin_lock(&ioapic->lock);
+ ret = !!ioapic->redirtbl[irq].fields.mask;
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 539333ac4b38..fe1f51319992 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -114,6 +114,7 @@ void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+bool kvm_ioapic_irq_is_masked(struct kvm_ioapic *ioapic, int irq);
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 43e13892ed34..5bff6d6ac54f 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -34,6 +34,13 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
}
+static bool kvm_is_masked_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ return kvm_pic_irq_is_masked(pic, e->irqchip.pin);
+}
+
static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -43,6 +50,13 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
line_status);
}
+static bool kvm_is_masked_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ return kvm_ioapic_irq_is_masked(ioapic, e->irqchip.pin);
+}
+
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
@@ -275,11 +289,13 @@ int kvm_set_routing_entry(struct kvm *kvm,
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
return -EINVAL;
e->set = kvm_set_pic_irq;
+ e->is_masked = kvm_is_masked_pic_irq;
break;
case KVM_IRQCHIP_IOAPIC:
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
return -EINVAL;
e->set = kvm_set_ioapic_irq;
+ e->is_masked = kvm_is_masked_ioapic_irq;
break;
default:
return -EINVAL;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9e12ef503157..e8bfb3b0d4d1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -625,6 +625,8 @@ struct kvm_kernel_irq_routing_entry {
int (*set)(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status);
+ bool (*is_masked)(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm);
union {
struct {
unsigned irqchip;
@@ -1598,6 +1600,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id,
int level, bool line_status);
+int kvm_irq_is_masked(struct kvm *kvm, int irq, bool *masked);
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 58e4f88b2b9f..9252ebedba55 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -97,6 +97,40 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
return ret;
}
+/*
+ * Return value:
+ * = 0 Interrupt mask state successfully written to `masked`
+ * < 0 Failed to read interrupt mask state
+ */
+int kvm_irq_is_masked(struct kvm *kvm, int irq, bool *masked)
+{
+ struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
+ int ret = -1, i, idx;
+
+ /* Not possible to detect if the guest uses the PIC or the
+ * IOAPIC. So assume the interrupt to be unmasked iff it is
+ * unmasked in at least one of both.
+ */
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ i = kvm_irq_map_gsi(kvm, irq_set, irq);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+
+ while (i--) {
+ if (!irq_set[i].is_masked)
+ continue;
+
+ if (!irq_set[i].is_masked(&irq_set[i], kvm)) {
+ *masked = false;
+ return 0;
+ }
+ ret = 0;
+ }
+ if (!ret)
+ *masked = true;
+
+ return ret;
+}
+
static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
{
int i;
--
2.37.0.170.g444d1eabd0-goog
The existing KVM mechanism for forwarding of level-triggered interrupts
using resample eventfd doesn't work quite correctly in the case of
interrupts that are handled in a Linux guest as oneshot interrupts
(IRQF_ONESHOT). Such an interrupt is acked to the device in its
threaded irq handler, i.e. later than it is acked to the interrupt
controller (EOI at the end of hardirq), not earlier.
Linux keeps such interrupt masked until its threaded handler finishes,
to prevent the EOI from re-asserting an unacknowledged interrupt.
However, with KVM + vfio (or whatever is listening on the resamplefd)
we don't check that the interrupt is still masked in the guest at the
moment of EOI. Resamplefd is notified regardless, so vfio prematurely
unmasks the host physical IRQ, thus a new (unwanted) physical interrupt
is generated in the host and queued for injection to the guest.
The fact that the virtual IRQ is still masked doesn't prevent this new
physical IRQ from being propagated to the guest, because:
1. It is not guaranteed that the vIRQ will remain masked by the time
when vfio signals the trigger eventfd.
2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
new pending interrupt is injected by KVM to the guest anyway.
There are observed at least 2 user-visible issues caused by those
extra erroneous pending interrupts for oneshot irq in the guest:
1. System suspend aborted due to a pending wakeup interrupt from
ChromeOS EC (drivers/platform/chrome/cros_ec.c).
2. Annoying "invalid report id data" errors from ELAN0000 touchpad
(drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
every time the touchpad is touched.
This patch fixes the issue on x86 by checking if the interrupt is
unmasked when we receive irq ack (EOI) and, in case if it's masked,
postponing resamplefd notify until the guest unmasks it.
Important notes:
1. It doesn't fix the issue for other archs yet, due to some missing
KVM functionality needed by this patch:
- calling mask notifiers is implemented for x86 only
- irqchip ->is_masked() is implemented for x86 only
2. It introduces an additional spinlock locking in the resample notify
path, since we are no longer just traversing an RCU list of irqfds
but also updating the resampler state. Hopefully this locking won't
noticeably slow down anything for anyone.
Regarding #2, there may be an alternative solution worth considering:
extend KVM irqfd (userspace) API to send mask and unmask notifications
directly to vfio/whatever, in addition to resample notifications, to
let vfio check the irq state on its own. There is already locking on
vfio side (see e.g. vfio_platform_unmask()), so this way we would avoid
introducing any additional locking. Also such mask/unmask notifications
could be useful for other cases.
Link: https://lore.kernel.org/kvm/[email protected]/
Suggested-by: Sean Christopherson <[email protected]>
Signed-off-by: Dmytro Maluka <[email protected]>
---
include/linux/kvm_irqfd.h | 14 ++++++++++++
virt/kvm/eventfd.c | 45 +++++++++++++++++++++++++++++++++++++++
2 files changed, 59 insertions(+)
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index dac047abdba7..01754a1abb9e 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -19,6 +19,16 @@
* resamplefd. All resamplers on the same gsi are de-asserted
* together, so we don't need to track the state of each individual
* user. We can also therefore share the same irq source ID.
+ *
+ * A special case is when the interrupt is still masked at the moment
+ * an irq ack is received. That likely means that the interrupt has
+ * been acknowledged to the interrupt controller but not acknowledged
+ * to the device yet, e.g. it might be a Linux guest's threaded
+ * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
+ * resamplefd is postponed until the guest unmasks the interrupt,
+ * which is detected through the irq mask notifier. This prevents
+ * erroneous extra interrupts caused by premature re-assert of an
+ * unacknowledged interrupt by the resamplefd listener.
*/
struct kvm_kernel_irqfd_resampler {
struct kvm *kvm;
@@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
*/
struct list_head list;
struct kvm_irq_ack_notifier notifier;
+ struct kvm_irq_mask_notifier mask_notifier;
+ bool masked;
+ bool pending;
+ spinlock_t lock;
/*
* Entry in list of kvm->irqfd.resampler_list. Use for sharing
* resamplers among irqfds on the same gsi.
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 50ddb1d1a7f0..9ff47ac33790 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -75,6 +75,44 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
+ spin_lock(&resampler->lock);
+ if (resampler->masked) {
+ resampler->pending = true;
+ spin_unlock(&resampler->lock);
+ return;
+ }
+ spin_unlock(&resampler->lock);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+
+ list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
+ srcu_read_lock_held(&kvm->irq_srcu))
+ eventfd_signal(irqfd->resamplefd, 1);
+
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void
+irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool masked)
+{
+ struct kvm_kernel_irqfd_resampler *resampler;
+ struct kvm *kvm;
+ struct kvm_kernel_irqfd *irqfd;
+ int idx;
+
+ resampler = container_of(kimn,
+ struct kvm_kernel_irqfd_resampler, mask_notifier);
+ kvm = resampler->kvm;
+
+ spin_lock(&resampler->lock);
+ resampler->masked = masked;
+ if (masked || !resampler->pending) {
+ spin_unlock(&resampler->lock);
+ return;
+ }
+ resampler->pending = false;
+ spin_unlock(&resampler->lock);
+
idx = srcu_read_lock(&kvm->irq_srcu);
list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
@@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
if (list_empty(&resampler->list)) {
list_del(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+ kvm_unregister_irq_mask_notifier(kvm, resampler->mask_notifier.irq,
+ &resampler->mask_notifier);
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
kfree(resampler);
@@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
INIT_LIST_HEAD(&resampler->list);
resampler->notifier.gsi = irqfd->gsi;
resampler->notifier.irq_acked = irqfd_resampler_ack;
+ resampler->mask_notifier.func = irqfd_resampler_mask;
+ kvm_irq_is_masked(kvm, irqfd->gsi, &resampler->masked);
+ spin_lock_init(&resampler->lock);
INIT_LIST_HEAD(&resampler->link);
list_add(&resampler->link, &kvm->irqfds.resampler_list);
kvm_register_irq_ack_notifier(kvm,
&resampler->notifier);
+ kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
+ &resampler->mask_notifier);
irqfd->resampler = resampler;
}
--
2.37.0.170.g444d1eabd0-goog
Hi Dmytro,
> -----Original Message-----
> From: Dmytro Maluka <[email protected]>
> Sent: Friday, July 15, 2022 8:59 AM
> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
> <[email protected]>; [email protected]
> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> <[email protected]>; [email protected]; H. Peter Anvin
> <[email protected]>; [email protected]; Eric Auger
> <[email protected]>; Alex Williamson
> <[email protected]>; Liu, Rong L <[email protected]>;
> Zhenyu Wang <[email protected]>; Tomasz Nowicki
> <[email protected]>; Grzegorz Jaszczyk <[email protected]>; Dmitry
> Torokhov <[email protected]>; Dmytro Maluka <[email protected]>
> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for oneshot
> interrupts
>
> The existing KVM mechanism for forwarding of level-triggered interrupts
> using resample eventfd doesn't work quite correctly in the case of
> interrupts that are handled in a Linux guest as oneshot interrupts
> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> threaded irq handler, i.e. later than it is acked to the interrupt
> controller (EOI at the end of hardirq), not earlier.
>
> Linux keeps such interrupt masked until its threaded handler finishes,
> to prevent the EOI from re-asserting an unacknowledged interrupt.
> However, with KVM + vfio (or whatever is listening on the resamplefd)
> we don't check that the interrupt is still masked in the guest at the
> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
> unmasks the host physical IRQ, thus a new (unwanted) physical interrupt
> is generated in the host and queued for injection to the guest.
>
> The fact that the virtual IRQ is still masked doesn't prevent this new
> physical IRQ from being propagated to the guest, because:
>
> 1. It is not guaranteed that the vIRQ will remain masked by the time
> when vfio signals the trigger eventfd.
> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
> new pending interrupt is injected by KVM to the guest anyway.
>
> There are observed at least 2 user-visible issues caused by those
> extra erroneous pending interrupts for oneshot irq in the guest:
>
> 1. System suspend aborted due to a pending wakeup interrupt from
> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
> every time the touchpad is touched.
>
> This patch fixes the issue on x86 by checking if the interrupt is
> unmasked when we receive irq ack (EOI) and, in case if it's masked,
> postponing resamplefd notify until the guest unmasks it.
>
> Important notes:
>
> 1. It doesn't fix the issue for other archs yet, due to some missing
> KVM functionality needed by this patch:
> - calling mask notifiers is implemented for x86 only
> - irqchip ->is_masked() is implemented for x86 only
>
> 2. It introduces an additional spinlock locking in the resample notify
> path, since we are no longer just traversing an RCU list of irqfds
> but also updating the resampler state. Hopefully this locking won't
> noticeably slow down anything for anyone.
>
Instead of using a spinlock waiting for the unmask event, is it possible to call
resampler notify directly when unmask event happens, instead of calling it on
EOI?
> Regarding #2, there may be an alternative solution worth considering:
> extend KVM irqfd (userspace) API to send mask and unmask notifications
> directly to vfio/whatever, in addition to resample notifications, to
> let vfio check the irq state on its own. There is already locking on
> vfio side (see e.g. vfio_platform_unmask()), so this way we would avoid
> introducing any additional locking. Also such mask/unmask notifications
> could be useful for other cases.
>
> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> [email protected]/
> Suggested-by: Sean Christopherson <[email protected]>
> Signed-off-by: Dmytro Maluka <[email protected]>
> ---
> include/linux/kvm_irqfd.h | 14 ++++++++++++
> virt/kvm/eventfd.c | 45
> +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 59 insertions(+)
>
> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> index dac047abdba7..01754a1abb9e 100644
> --- a/include/linux/kvm_irqfd.h
> +++ b/include/linux/kvm_irqfd.h
> @@ -19,6 +19,16 @@
> * resamplefd. All resamplers on the same gsi are de-asserted
> * together, so we don't need to track the state of each individual
> * user. We can also therefore share the same irq source ID.
> + *
> + * A special case is when the interrupt is still masked at the moment
> + * an irq ack is received. That likely means that the interrupt has
> + * been acknowledged to the interrupt controller but not acknowledged
> + * to the device yet, e.g. it might be a Linux guest's threaded
> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
> + * resamplefd is postponed until the guest unmasks the interrupt,
> + * which is detected through the irq mask notifier. This prevents
> + * erroneous extra interrupts caused by premature re-assert of an
> + * unacknowledged interrupt by the resamplefd listener.
> */
> struct kvm_kernel_irqfd_resampler {
> struct kvm *kvm;
> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> */
> struct list_head list;
> struct kvm_irq_ack_notifier notifier;
> + struct kvm_irq_mask_notifier mask_notifier;
> + bool masked;
> + bool pending;
> + spinlock_t lock;
> /*
> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> * resamplers among irqfds on the same gsi.
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 50ddb1d1a7f0..9ff47ac33790 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier
> *kian)
> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
>
> + spin_lock(&resampler->lock);
> + if (resampler->masked) {
> + resampler->pending = true;
> + spin_unlock(&resampler->lock);
> + return;
> + }
> + spin_unlock(&resampler->lock);
> +
> + idx = srcu_read_lock(&kvm->irq_srcu);
> +
> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> + srcu_read_lock_held(&kvm->irq_srcu))
> + eventfd_signal(irqfd->resamplefd, 1);
> +
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> +}
> +
> +static void
> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
> masked)
> +{
> + struct kvm_kernel_irqfd_resampler *resampler;
> + struct kvm *kvm;
> + struct kvm_kernel_irqfd *irqfd;
> + int idx;
> +
> + resampler = container_of(kimn,
> + struct kvm_kernel_irqfd_resampler, mask_notifier);
> + kvm = resampler->kvm;
> +
> + spin_lock(&resampler->lock);
> + resampler->masked = masked;
> + if (masked || !resampler->pending) {
> + spin_unlock(&resampler->lock);
> + return;
> + }
> + resampler->pending = false;
> + spin_unlock(&resampler->lock);
> +
> idx = srcu_read_lock(&kvm->irq_srcu);
>
> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
> kvm_kernel_irqfd *irqfd)
> if (list_empty(&resampler->list)) {
> list_del(&resampler->link);
> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
> + kvm_unregister_irq_mask_notifier(kvm, resampler-
> >mask_notifier.irq,
> + &resampler->mask_notifier);
> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
> kfree(resampler);
> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct
> kvm_irqfd *args)
> INIT_LIST_HEAD(&resampler->list);
> resampler->notifier.gsi = irqfd->gsi;
> resampler->notifier.irq_acked = irqfd_resampler_ack;
> + resampler->mask_notifier.func = irqfd_resampler_mask;
> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
> >masked);
> + spin_lock_init(&resampler->lock);
> INIT_LIST_HEAD(&resampler->link);
>
> list_add(&resampler->link, &kvm->irqfds.resampler_list);
> kvm_register_irq_ack_notifier(kvm,
> &resampler->notifier);
> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> + &resampler->mask_notifier);
> irqfd->resampler = resampler;
> }
>
> --
> 2.37.0.170.g444d1eabd0-goog
Hi Rong,
On 7/26/22 01:44, Liu, Rong L wrote:
> Hi Dmytro,
>
>> -----Original Message-----
>> From: Dmytro Maluka <[email protected]>
>> Sent: Friday, July 15, 2022 8:59 AM
>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
>> <[email protected]>; [email protected]
>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>> <[email protected]>; [email protected]; H. Peter Anvin
>> <[email protected]>; [email protected]; Eric Auger
>> <[email protected]>; Alex Williamson
>> <[email protected]>; Liu, Rong L <[email protected]>;
>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>; Dmitry
>> Torokhov <[email protected]>; Dmytro Maluka <[email protected]>
>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for oneshot
>> interrupts
>>
>> The existing KVM mechanism for forwarding of level-triggered interrupts
>> using resample eventfd doesn't work quite correctly in the case of
>> interrupts that are handled in a Linux guest as oneshot interrupts
>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
>> threaded irq handler, i.e. later than it is acked to the interrupt
>> controller (EOI at the end of hardirq), not earlier.
>>
>> Linux keeps such interrupt masked until its threaded handler finishes,
>> to prevent the EOI from re-asserting an unacknowledged interrupt.
>> However, with KVM + vfio (or whatever is listening on the resamplefd)
>> we don't check that the interrupt is still masked in the guest at the
>> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
>> unmasks the host physical IRQ, thus a new (unwanted) physical interrupt
>> is generated in the host and queued for injection to the guest.
>>
>> The fact that the virtual IRQ is still masked doesn't prevent this new
>> physical IRQ from being propagated to the guest, because:
>>
>> 1. It is not guaranteed that the vIRQ will remain masked by the time
>> when vfio signals the trigger eventfd.
>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
>> new pending interrupt is injected by KVM to the guest anyway.
>>
>> There are observed at least 2 user-visible issues caused by those
>> extra erroneous pending interrupts for oneshot irq in the guest:
>>
>> 1. System suspend aborted due to a pending wakeup interrupt from
>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
>> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
>> every time the touchpad is touched.
>>
>> This patch fixes the issue on x86 by checking if the interrupt is
>> unmasked when we receive irq ack (EOI) and, in case if it's masked,
>> postponing resamplefd notify until the guest unmasks it.
>>
>> Important notes:
>>
>> 1. It doesn't fix the issue for other archs yet, due to some missing
>> KVM functionality needed by this patch:
>> - calling mask notifiers is implemented for x86 only
>> - irqchip ->is_masked() is implemented for x86 only
>>
>> 2. It introduces an additional spinlock locking in the resample notify
>> path, since we are no longer just traversing an RCU list of irqfds
>> but also updating the resampler state. Hopefully this locking won't
>> noticeably slow down anything for anyone.
>>
>
> Instead of using a spinlock waiting for the unmask event, is it possible to call
> resampler notify directly when unmask event happens, instead of calling it on
> EOI?
In this patch, resampler notify is already called directly when unmask
happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
writing to IOREDTBLx register, ioapic_write_indirect() calls
kvm_fire_mask_notifiers() which calls irqfd_resampler_mask() which
notifies the resampler. On EOI we postpone it just by setting
resampler->pending to true, not by waiting. The spinlock is needed
merely to synchronize reading & updating resampler->pending and
resampler->masked values between possibly concurrently running instances
of irqfd_resampler_ack() and/or irqfd_resampler_mask().
Thanks,
Dmytro
>
>> Regarding #2, there may be an alternative solution worth considering:
>> extend KVM irqfd (userspace) API to send mask and unmask notifications
>> directly to vfio/whatever, in addition to resample notifications, to
>> let vfio check the irq state on its own. There is already locking on
>> vfio side (see e.g. vfio_platform_unmask()), so this way we would avoid
>> introducing any additional locking. Also such mask/unmask notifications
>> could be useful for other cases.
>>
>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>> [email protected]/
>> Suggested-by: Sean Christopherson <[email protected]>
>> Signed-off-by: Dmytro Maluka <[email protected]>
>> ---
>> include/linux/kvm_irqfd.h | 14 ++++++++++++
>> virt/kvm/eventfd.c | 45
>> +++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 59 insertions(+)
>>
>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
>> index dac047abdba7..01754a1abb9e 100644
>> --- a/include/linux/kvm_irqfd.h
>> +++ b/include/linux/kvm_irqfd.h
>> @@ -19,6 +19,16 @@
>> * resamplefd. All resamplers on the same gsi are de-asserted
>> * together, so we don't need to track the state of each individual
>> * user. We can also therefore share the same irq source ID.
>> + *
>> + * A special case is when the interrupt is still masked at the moment
>> + * an irq ack is received. That likely means that the interrupt has
>> + * been acknowledged to the interrupt controller but not acknowledged
>> + * to the device yet, e.g. it might be a Linux guest's threaded
>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
>> + * resamplefd is postponed until the guest unmasks the interrupt,
>> + * which is detected through the irq mask notifier. This prevents
>> + * erroneous extra interrupts caused by premature re-assert of an
>> + * unacknowledged interrupt by the resamplefd listener.
>> */
>> struct kvm_kernel_irqfd_resampler {
>> struct kvm *kvm;
>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
>> */
>> struct list_head list;
>> struct kvm_irq_ack_notifier notifier;
>> + struct kvm_irq_mask_notifier mask_notifier;
>> + bool masked;
>> + bool pending;
>> + spinlock_t lock;
>> /*
>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
>> * resamplers among irqfds on the same gsi.
>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>> index 50ddb1d1a7f0..9ff47ac33790 100644
>> --- a/virt/kvm/eventfd.c
>> +++ b/virt/kvm/eventfd.c
>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier
>> *kian)
>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>> resampler->notifier.gsi, 0, false);
>>
>> + spin_lock(&resampler->lock);
>> + if (resampler->masked) {
>> + resampler->pending = true;
>> + spin_unlock(&resampler->lock);
>> + return;
>> + }
>> + spin_unlock(&resampler->lock);
>> +
>> + idx = srcu_read_lock(&kvm->irq_srcu);
>> +
>> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
>> + srcu_read_lock_held(&kvm->irq_srcu))
>> + eventfd_signal(irqfd->resamplefd, 1);
>> +
>> + srcu_read_unlock(&kvm->irq_srcu, idx);
>> +}
>> +
>> +static void
>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
>> masked)
>> +{
>> + struct kvm_kernel_irqfd_resampler *resampler;
>> + struct kvm *kvm;
>> + struct kvm_kernel_irqfd *irqfd;
>> + int idx;
>> +
>> + resampler = container_of(kimn,
>> + struct kvm_kernel_irqfd_resampler, mask_notifier);
>> + kvm = resampler->kvm;
>> +
>> + spin_lock(&resampler->lock);
>> + resampler->masked = masked;
>> + if (masked || !resampler->pending) {
>> + spin_unlock(&resampler->lock);
>> + return;
>> + }
>> + resampler->pending = false;
>> + spin_unlock(&resampler->lock);
>> +
>> idx = srcu_read_lock(&kvm->irq_srcu);
>>
>> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
>> kvm_kernel_irqfd *irqfd)
>> if (list_empty(&resampler->list)) {
>> list_del(&resampler->link);
>> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
>>> mask_notifier.irq,
>> + &resampler->mask_notifier);
>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>> resampler->notifier.gsi, 0, false);
>> kfree(resampler);
>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct
>> kvm_irqfd *args)
>> INIT_LIST_HEAD(&resampler->list);
>> resampler->notifier.gsi = irqfd->gsi;
>> resampler->notifier.irq_acked = irqfd_resampler_ack;
>> + resampler->mask_notifier.func = irqfd_resampler_mask;
>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
>>> masked);
>> + spin_lock_init(&resampler->lock);
>> INIT_LIST_HEAD(&resampler->link);
>>
>> list_add(&resampler->link, &kvm->irqfds.resampler_list);
>> kvm_register_irq_ack_notifier(kvm,
>> &resampler->notifier);
>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
>> + &resampler->mask_notifier);
>> irqfd->resampler = resampler;
>> }
>>
>> --
>> 2.37.0.170.g444d1eabd0-goog
>
On Fri, Jul 15, 2022, Dmytro Maluka wrote:
> In preparation for implementing postponing resamplefd event until the
> interrupt is unmasked, move kvm_(un)register_irq_mask_notifier() from
> x86 to arch-independent code to make it usable by irqfd.
This patch needs to move more than just the helpers, e.g. mask_notifier_list
needs to be in "struct kvm", not "stuct kvm_arch".
arch/arm64/kvm/../../../virt/kvm/eventfd.c: In function ‘kvm_register_irq_mask_notifier’:
arch/arm64/kvm/../../../virt/kvm/eventfd.c:528:51: error: ‘struct kvm_arch’ has no member named ‘mask_notifier_list’
528 | hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
| ^
make[3]: *** [scripts/Makefile.build:249: arch/arm64/kvm/../../../virt/kvm/eventfd.o] Error 1
make[3]: *** Waiting for unfinished jobs....
AR kernel/entry/built-in.a
And kvm_fire_mask_notifiers() should probably be moved as well, otherwise there's
no point in moving the registration to common code.
The other option would be to make the generic functions wrappers around arch-specific
hooks. But IIRC won't this eventually be needed for other architectures?
On Fri, Jul 15, 2022, Dmytro Maluka wrote:
> +static void
> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool masked)
Ugh, I see you're just following the existing "style" in this file. Linus provided
a lengthy explanation of why this style is unwanted[*]. And this file is straight up
obnoxious, e.g. a large number of functions put the return type on a separate line
even though it would fit without any wrap.
My vote is to break from this file's style for this patch, and then do a follow-up
patch to fix all the existing funky wraps.
[*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@mail.gmail.com
On 7/28/22 20:46, Sean Christopherson wrote:
> On Fri, Jul 15, 2022, Dmytro Maluka wrote:
>> In preparation for implementing postponing resamplefd event until the
>> interrupt is unmasked, move kvm_(un)register_irq_mask_notifier() from
>> x86 to arch-independent code to make it usable by irqfd.
>
> This patch needs to move more than just the helpers, e.g. mask_notifier_list
> needs to be in "struct kvm", not "stuct kvm_arch".
>
> arch/arm64/kvm/../../../virt/kvm/eventfd.c: In function ‘kvm_register_irq_mask_notifier’:
> arch/arm64/kvm/../../../virt/kvm/eventfd.c:528:51: error: ‘struct kvm_arch’ has no member named ‘mask_notifier_list’
> 528 | hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
> | ^
> make[3]: *** [scripts/Makefile.build:249: arch/arm64/kvm/../../../virt/kvm/eventfd.o] Error 1
> make[3]: *** Waiting for unfinished jobs....
> AR kernel/entry/built-in.a
Oops, sorry.
> And kvm_fire_mask_notifiers() should probably be moved as well, otherwise there's
> no point in moving the registration to common code.
Good point, we can move it right away, even though it is not called on
other architectures for now.
> The other option would be to make the generic functions wrappers around arch-specific
> hooks. But IIRC won't this eventually be needed for other architectures?
Right, I assume we will eventually need it for ARM at least. Not in the
near future though, and at the moment I have no non-x86 hardware on hand
to implement it for other architectures.
Actually I feel a bit uncomfortable with generic irqfd relying on
kvm_register_irq_mask_notifier() which silently has no effect on other
architectures. Maybe it's better to keep
kvm_(un)register_irq_mask_notifier() in the x86 code, and for the
generic code add a weak version which e.g. just prints a warning like
"irq mask notifiers not implemented on this arch". (Or maybe instead of
weak functions introduce arch-specific hooks as you suggested, and print
such a warning if no hook is provided.) What do you think?
Thanks,
Dmytro
On 7/28/22 20:55, Sean Christopherson wrote:
> On Fri, Jul 15, 2022, Dmytro Maluka wrote:
>> +static void
>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool masked)
>
> Ugh, I see you're just following the existing "style" in this file. Linus provided
> a lengthy explanation of why this style is unwanted[*]. And this file is straight up
> obnoxious, e.g. a large number of functions put the return type on a separate line
> even though it would fit without any wrap.
>
> My vote is to break from this file's style for this patch, and then do a follow-up
> patch to fix all the existing funky wraps.
Ok, I can do that.
Would you also like if I rename resampler->notifier to
resampler->ack_notifier and irqfd_resampler_mask() to
irqfd_resampler_mask_notify() for more clarity?
Thanks,
Dmytro
> [*] https://lore.kernel.org/all/CAHk-=wjoLAYG446ZNHfg=GhjSY6nFmuB_wA8fYd5iLBNXjo9Bw@mail.gmail.com
Hi Dmytro,
> -----Original Message-----
> From: Dmytro Maluka <[email protected]>
> Sent: Tuesday, July 26, 2022 7:08 AM
> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> <[email protected]>; Paolo Bonzini <[email protected]>;
> [email protected]
> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> <[email protected]>; [email protected]; H. Peter Anvin
> <[email protected]>; [email protected]; Eric Auger
> <[email protected]>; Alex Williamson
> <[email protected]>; Zhenyu Wang
> <[email protected]>; Tomasz Nowicki <[email protected]>;
> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> <[email protected]>
> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> oneshot interrupts
>
> Hi Rong,
>
> On 7/26/22 01:44, Liu, Rong L wrote:
> > Hi Dmytro,
> >
> >> -----Original Message-----
> >> From: Dmytro Maluka <[email protected]>
> >> Sent: Friday, July 15, 2022 8:59 AM
> >> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
> >> <[email protected]>; [email protected]
> >> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> >> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> >> <[email protected]>; [email protected]; H. Peter Anvin
> >> <[email protected]>; [email protected]; Eric Auger
> >> <[email protected]>; Alex Williamson
> >> <[email protected]>; Liu, Rong L <[email protected]>;
> >> Zhenyu Wang <[email protected]>; Tomasz Nowicki
> >> <[email protected]>; Grzegorz Jaszczyk <[email protected]>; Dmitry
> >> Torokhov <[email protected]>; Dmytro Maluka <[email protected]>
> >> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> oneshot
> >> interrupts
> >>
> >> The existing KVM mechanism for forwarding of level-triggered
> interrupts
> >> using resample eventfd doesn't work quite correctly in the case of
> >> interrupts that are handled in a Linux guest as oneshot interrupts
> >> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> >> threaded irq handler, i.e. later than it is acked to the interrupt
> >> controller (EOI at the end of hardirq), not earlier.
> >>
> >> Linux keeps such interrupt masked until its threaded handler finishes,
> >> to prevent the EOI from re-asserting an unacknowledged interrupt.
> >> However, with KVM + vfio (or whatever is listening on the resamplefd)
> >> we don't check that the interrupt is still masked in the guest at the
> >> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
> >> unmasks the host physical IRQ, thus a new (unwanted) physical
> interrupt
> >> is generated in the host and queued for injection to the guest.
> >>
> >> The fact that the virtual IRQ is still masked doesn't prevent this new
> >> physical IRQ from being propagated to the guest, because:
> >>
> >> 1. It is not guaranteed that the vIRQ will remain masked by the time
> >> when vfio signals the trigger eventfd.
> >> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> >> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
> >> new pending interrupt is injected by KVM to the guest anyway.
> >>
> >> There are observed at least 2 user-visible issues caused by those
> >> extra erroneous pending interrupts for oneshot irq in the guest:
> >>
> >> 1. System suspend aborted due to a pending wakeup interrupt from
> >> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> >> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
> >> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
> >> every time the touchpad is touched.
> >>
> >> This patch fixes the issue on x86 by checking if the interrupt is
> >> unmasked when we receive irq ack (EOI) and, in case if it's masked,
> >> postponing resamplefd notify until the guest unmasks it.
> >>
> >> Important notes:
> >>
> >> 1. It doesn't fix the issue for other archs yet, due to some missing
> >> KVM functionality needed by this patch:
> >> - calling mask notifiers is implemented for x86 only
> >> - irqchip ->is_masked() is implemented for x86 only
> >>
> >> 2. It introduces an additional spinlock locking in the resample notify
> >> path, since we are no longer just traversing an RCU list of irqfds
> >> but also updating the resampler state. Hopefully this locking won't
> >> noticeably slow down anything for anyone.
> >>
> >
> > Instead of using a spinlock waiting for the unmask event, is it possible
> to call
> > resampler notify directly when unmask event happens, instead of
> calling it on
> > EOI?
>
> In this patch, resampler notify is already called directly when unmask
> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
> writing to IOREDTBLx register, ioapic_write_indirect() calls
> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask() which
> notifies the resampler. On EOI we postpone it just by setting
> resampler->pending to true, not by waiting. The spinlock is needed
> merely to synchronize reading & updating resampler->pending and
> resampler->masked values between possibly concurrently running
> instances
> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
>
> Thanks,
> Dmytro
>
I mean the organization of the code. In current implementation,
kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your patch, why not
call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at the same
place where kvm_fire_mask_notifiers is called), instead of calling it from
kvm_ioapic_update_eoi_one, since what your intention here is to notify
vfio of the end of interrupt at the event of ioapic unmask, instead of
EOI?
> >
> >> Regarding #2, there may be an alternative solution worth considering:
> >> extend KVM irqfd (userspace) API to send mask and unmask
> notifications
> >> directly to vfio/whatever, in addition to resample notifications, to
> >> let vfio check the irq state on its own. There is already locking on
> >> vfio side (see e.g. vfio_platform_unmask()), so this way we would
> avoid
> >> introducing any additional locking. Also such mask/unmask
> notifications
> >> could be useful for other cases.
> >>
> >> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> >> [email protected]/
> >> Suggested-by: Sean Christopherson <[email protected]>
> >> Signed-off-by: Dmytro Maluka <[email protected]>
> >> ---
> >> include/linux/kvm_irqfd.h | 14 ++++++++++++
> >> virt/kvm/eventfd.c | 45
> >> +++++++++++++++++++++++++++++++++++++++
> >> 2 files changed, 59 insertions(+)
> >>
> >> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> >> index dac047abdba7..01754a1abb9e 100644
> >> --- a/include/linux/kvm_irqfd.h
> >> +++ b/include/linux/kvm_irqfd.h
> >> @@ -19,6 +19,16 @@
> >> * resamplefd. All resamplers on the same gsi are de-asserted
> >> * together, so we don't need to track the state of each individual
> >> * user. We can also therefore share the same irq source ID.
> >> + *
> >> + * A special case is when the interrupt is still masked at the moment
> >> + * an irq ack is received. That likely means that the interrupt has
> >> + * been acknowledged to the interrupt controller but not
> acknowledged
> >> + * to the device yet, e.g. it might be a Linux guest's threaded
> >> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
> >> + * resamplefd is postponed until the guest unmasks the interrupt,
> >> + * which is detected through the irq mask notifier. This prevents
> >> + * erroneous extra interrupts caused by premature re-assert of an
> >> + * unacknowledged interrupt by the resamplefd listener.
> >> */
> >> struct kvm_kernel_irqfd_resampler {
> >> struct kvm *kvm;
> >> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> >> */
> >> struct list_head list;
> >> struct kvm_irq_ack_notifier notifier;
> >> + struct kvm_irq_mask_notifier mask_notifier;
> >> + bool masked;
> >> + bool pending;
> >> + spinlock_t lock;
> >> /*
> >> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> >> * resamplers among irqfds on the same gsi.
> >> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> >> index 50ddb1d1a7f0..9ff47ac33790 100644
> >> --- a/virt/kvm/eventfd.c
> >> +++ b/virt/kvm/eventfd.c
> >> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
> kvm_irq_ack_notifier
> >> *kian)
> >> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> >> resampler->notifier.gsi, 0, false);
> >>
> >> + spin_lock(&resampler->lock);
> >> + if (resampler->masked) {
> >> + resampler->pending = true;
> >> + spin_unlock(&resampler->lock);
> >> + return;
> >> + }
> >> + spin_unlock(&resampler->lock);
> >> +
> >> + idx = srcu_read_lock(&kvm->irq_srcu);
> >> +
> >> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> >> + srcu_read_lock_held(&kvm->irq_srcu))
> >> + eventfd_signal(irqfd->resamplefd, 1);
> >> +
> >> + srcu_read_unlock(&kvm->irq_srcu, idx);
> >> +}
> >> +
> >> +static void
> >> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
> >> masked)
> >> +{
> >> + struct kvm_kernel_irqfd_resampler *resampler;
> >> + struct kvm *kvm;
> >> + struct kvm_kernel_irqfd *irqfd;
> >> + int idx;
> >> +
> >> + resampler = container_of(kimn,
> >> + struct kvm_kernel_irqfd_resampler, mask_notifier);
> >> + kvm = resampler->kvm;
> >> +
> >> + spin_lock(&resampler->lock);
> >> + resampler->masked = masked;
> >> + if (masked || !resampler->pending) {
> >> + spin_unlock(&resampler->lock);
> >> + return;
> >> + }
> >> + resampler->pending = false;
> >> + spin_unlock(&resampler->lock);
> >> +
> >> idx = srcu_read_lock(&kvm->irq_srcu);
> >>
> >> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> >> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
> >> kvm_kernel_irqfd *irqfd)
> >> if (list_empty(&resampler->list)) {
> >> list_del(&resampler->link);
> >> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
> >> + kvm_unregister_irq_mask_notifier(kvm, resampler-
> >>> mask_notifier.irq,
> >> + &resampler->mask_notifier);
> >> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> >> resampler->notifier.gsi, 0, false);
> >> kfree(resampler);
> >> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct
> >> kvm_irqfd *args)
> >> INIT_LIST_HEAD(&resampler->list);
> >> resampler->notifier.gsi = irqfd->gsi;
> >> resampler->notifier.irq_acked = irqfd_resampler_ack;
> >> + resampler->mask_notifier.func = irqfd_resampler_mask;
> >> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
> >>> masked);
> >> + spin_lock_init(&resampler->lock);
> >> INIT_LIST_HEAD(&resampler->link);
> >>
> >> list_add(&resampler->link, &kvm->irqfds.resampler_list);
> >> kvm_register_irq_ack_notifier(kvm,
> >> &resampler->notifier);
> >> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> >> + &resampler->mask_notifier);
> >> irqfd->resampler = resampler;
> >> }
> >>
> >> --
> >> 2.37.0.170.g444d1eabd0-goog
> >
Hi Dmytro,
> -----Original Message-----
> From: Dmytro Maluka <[email protected]>
> Sent: Friday, July 15, 2022 8:59 AM
> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
> <[email protected]>; [email protected]
> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> <[email protected]>; [email protected]; H. Peter Anvin
> <[email protected]>; [email protected]; Eric Auger
> <[email protected]>; Alex Williamson
> <[email protected]>; Liu, Rong L <[email protected]>;
> Zhenyu Wang <[email protected]>; Tomasz Nowicki
> <[email protected]>; Grzegorz Jaszczyk <[email protected]>; Dmitry
> Torokhov <[email protected]>; Dmytro Maluka <[email protected]>
> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for oneshot
> interrupts
>
> The existing KVM mechanism for forwarding of level-triggered interrupts
> using resample eventfd doesn't work quite correctly in the case of
> interrupts that are handled in a Linux guest as oneshot interrupts
> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> threaded irq handler, i.e. later than it is acked to the interrupt
> controller (EOI at the end of hardirq), not earlier.
>
> Linux keeps such interrupt masked until its threaded handler finishes,
> to prevent the EOI from re-asserting an unacknowledged interrupt.
> However, with KVM + vfio (or whatever is listening on the resamplefd)
> we don't check that the interrupt is still masked in the guest at the
> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
> unmasks the host physical IRQ, thus a new (unwanted) physical interrupt
> is generated in the host and queued for injection to the guest.
>
> The fact that the virtual IRQ is still masked doesn't prevent this new
> physical IRQ from being propagated to the guest, because:
>
> 1. It is not guaranteed that the vIRQ will remain masked by the time
> when vfio signals the trigger eventfd.
> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
> new pending interrupt is injected by KVM to the guest anyway.
>
> There are observed at least 2 user-visible issues caused by those
> extra erroneous pending interrupts for oneshot irq in the guest:
>
> 1. System suspend aborted due to a pending wakeup interrupt from
> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
> every time the touchpad is touched.
>
> This patch fixes the issue on x86 by checking if the interrupt is
> unmasked when we receive irq ack (EOI) and, in case if it's masked,
> postponing resamplefd notify until the guest unmasks it.
>
> Important notes:
>
> 1. It doesn't fix the issue for other archs yet, due to some missing
> KVM functionality needed by this patch:
> - calling mask notifiers is implemented for x86 only
> - irqchip ->is_masked() is implemented for x86 only
>
> 2. It introduces an additional spinlock locking in the resample notify
> path, since we are no longer just traversing an RCU list of irqfds
> but also updating the resampler state. Hopefully this locking won't
> noticeably slow down anything for anyone.
>
> Regarding #2, there may be an alternative solution worth considering:
> extend KVM irqfd (userspace) API to send mask and unmask notifications
> directly to vfio/whatever, in addition to resample notifications, to
> let vfio check the irq state on its own. There is already locking on
> vfio side (see e.g. vfio_platform_unmask()), so this way we would avoid
> introducing any additional locking. Also such mask/unmask notifications
> could be useful for other cases.
>
> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> [email protected]/
> Suggested-by: Sean Christopherson <[email protected]>
> Signed-off-by: Dmytro Maluka <[email protected]>
> ---
> include/linux/kvm_irqfd.h | 14 ++++++++++++
> virt/kvm/eventfd.c | 45
> +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 59 insertions(+)
>
> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> index dac047abdba7..01754a1abb9e 100644
> --- a/include/linux/kvm_irqfd.h
> +++ b/include/linux/kvm_irqfd.h
> @@ -19,6 +19,16 @@
> * resamplefd. All resamplers on the same gsi are de-asserted
> * together, so we don't need to track the state of each individual
> * user. We can also therefore share the same irq source ID.
> + *
> + * A special case is when the interrupt is still masked at the moment
> + * an irq ack is received. That likely means that the interrupt has
> + * been acknowledged to the interrupt controller but not acknowledged
> + * to the device yet, e.g. it might be a Linux guest's threaded
> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
> + * resamplefd is postponed until the guest unmasks the interrupt,
> + * which is detected through the irq mask notifier. This prevents
> + * erroneous extra interrupts caused by premature re-assert of an
> + * unacknowledged interrupt by the resamplefd listener.
> */
I don't really agree with above statement. Please correct me if I am wrong. In
all modern x86 interrupt infrastructure(lapic/ioapic), for level triggered
interrupt, the sequence is always EOI (LAPIC), which is called interrupt ack in
the context of this discussion, then unmask (IOAPIC). Oneshot interrupt is
different only because the timing of above 2 events are different from a
"normal" level-triggered interrupt. It is like for level interrupt: Hardirq ->
EOI -> Unmask but for oneshot, it is like: hardirq->EOI, then sometime later
threadedirq->unmask. So based on this, I don't think you need to keep track of
whether the interrupt is unmasked or not, just need to call the notifier at the
end of unmask, instead of EOI. And calling notifier at the end of unmask,
instead of EOI won't break non-oneshot case. The only caveat is guest ioapic
update (virq unmask) doesn't cause vmexit. But the assumption is already made
that virq unmask causes vmexit.
> struct kvm_kernel_irqfd_resampler {
> struct kvm *kvm;
> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> */
> struct list_head list;
> struct kvm_irq_ack_notifier notifier;
> + struct kvm_irq_mask_notifier mask_notifier;
> + bool masked;
> + bool pending;
> + spinlock_t lock;
> /*
> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> * resamplers among irqfds on the same gsi.
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 50ddb1d1a7f0..9ff47ac33790 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier
> *kian)
> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
>
> + spin_lock(&resampler->lock);
> + if (resampler->masked) {
> + resampler->pending = true;
> + spin_unlock(&resampler->lock);
> + return;
> + }
> + spin_unlock(&resampler->lock);
> +
> + idx = srcu_read_lock(&kvm->irq_srcu);
> +
> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> + srcu_read_lock_held(&kvm->irq_srcu))
> + eventfd_signal(irqfd->resamplefd, 1);
> +
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> +}
> +
> +static void
> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
> masked)
> +{
> + struct kvm_kernel_irqfd_resampler *resampler;
> + struct kvm *kvm;
> + struct kvm_kernel_irqfd *irqfd;
> + int idx;
> +
> + resampler = container_of(kimn,
> + struct kvm_kernel_irqfd_resampler, mask_notifier);
> + kvm = resampler->kvm;
> +
> + spin_lock(&resampler->lock);
> + resampler->masked = masked;
> + if (masked || !resampler->pending) {
> + spin_unlock(&resampler->lock);
> + return;
> + }
> + resampler->pending = false;
> + spin_unlock(&resampler->lock);
> +
> idx = srcu_read_lock(&kvm->irq_srcu);
>
> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
> kvm_kernel_irqfd *irqfd)
> if (list_empty(&resampler->list)) {
> list_del(&resampler->link);
> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
> + kvm_unregister_irq_mask_notifier(kvm, resampler-
> >mask_notifier.irq,
> + &resampler->mask_notifier);
> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
> kfree(resampler);
> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct
> kvm_irqfd *args)
> INIT_LIST_HEAD(&resampler->list);
> resampler->notifier.gsi = irqfd->gsi;
> resampler->notifier.irq_acked = irqfd_resampler_ack;
> + resampler->mask_notifier.func = irqfd_resampler_mask;
> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
> >masked);
> + spin_lock_init(&resampler->lock);
> INIT_LIST_HEAD(&resampler->link);
>
> list_add(&resampler->link, &kvm->irqfds.resampler_list);
> kvm_register_irq_ack_notifier(kvm,
> &resampler->notifier);
> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> + &resampler->mask_notifier);
> irqfd->resampler = resampler;
> }
>
> --
> 2.37.0.170.g444d1eabd0-goog
On 7/29/22 10:48 PM, Liu, Rong L wrote:
> Hi Dmytro,
>
>> -----Original Message-----
>> From: Dmytro Maluka <[email protected]>
>> Sent: Tuesday, July 26, 2022 7:08 AM
>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
>> <[email protected]>; Paolo Bonzini <[email protected]>;
>> [email protected]
>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>> <[email protected]>; [email protected]; H. Peter Anvin
>> <[email protected]>; [email protected]; Eric Auger
>> <[email protected]>; Alex Williamson
>> <[email protected]>; Zhenyu Wang
>> <[email protected]>; Tomasz Nowicki <[email protected]>;
>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
>> <[email protected]>
>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>> oneshot interrupts
>>
>> Hi Rong,
>>
>> On 7/26/22 01:44, Liu, Rong L wrote:
>>> Hi Dmytro,
>>>
>>>> -----Original Message-----
>>>> From: Dmytro Maluka <[email protected]>
>>>> Sent: Friday, July 15, 2022 8:59 AM
>>>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
>>>> <[email protected]>; [email protected]
>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>>>> <[email protected]>; [email protected]; H. Peter Anvin
>>>> <[email protected]>; [email protected]; Eric Auger
>>>> <[email protected]>; Alex Williamson
>>>> <[email protected]>; Liu, Rong L <[email protected]>;
>>>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
>>>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>; Dmitry
>>>> Torokhov <[email protected]>; Dmytro Maluka <[email protected]>
>>>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>> oneshot
>>>> interrupts
>>>>
>>>> The existing KVM mechanism for forwarding of level-triggered
>> interrupts
>>>> using resample eventfd doesn't work quite correctly in the case of
>>>> interrupts that are handled in a Linux guest as oneshot interrupts
>>>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
>>>> threaded irq handler, i.e. later than it is acked to the interrupt
>>>> controller (EOI at the end of hardirq), not earlier.
>>>>
>>>> Linux keeps such interrupt masked until its threaded handler finishes,
>>>> to prevent the EOI from re-asserting an unacknowledged interrupt.
>>>> However, with KVM + vfio (or whatever is listening on the resamplefd)
>>>> we don't check that the interrupt is still masked in the guest at the
>>>> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
>>>> unmasks the host physical IRQ, thus a new (unwanted) physical
>> interrupt
>>>> is generated in the host and queued for injection to the guest.
>>>>
>>>> The fact that the virtual IRQ is still masked doesn't prevent this new
>>>> physical IRQ from being propagated to the guest, because:
>>>>
>>>> 1. It is not guaranteed that the vIRQ will remain masked by the time
>>>> when vfio signals the trigger eventfd.
>>>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
>>>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
>>>> new pending interrupt is injected by KVM to the guest anyway.
>>>>
>>>> There are observed at least 2 user-visible issues caused by those
>>>> extra erroneous pending interrupts for oneshot irq in the guest:
>>>>
>>>> 1. System suspend aborted due to a pending wakeup interrupt from
>>>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
>>>> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
>>>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
>>>> every time the touchpad is touched.
>>>>
>>>> This patch fixes the issue on x86 by checking if the interrupt is
>>>> unmasked when we receive irq ack (EOI) and, in case if it's masked,
>>>> postponing resamplefd notify until the guest unmasks it.
>>>>
>>>> Important notes:
>>>>
>>>> 1. It doesn't fix the issue for other archs yet, due to some missing
>>>> KVM functionality needed by this patch:
>>>> - calling mask notifiers is implemented for x86 only
>>>> - irqchip ->is_masked() is implemented for x86 only
>>>>
>>>> 2. It introduces an additional spinlock locking in the resample notify
>>>> path, since we are no longer just traversing an RCU list of irqfds
>>>> but also updating the resampler state. Hopefully this locking won't
>>>> noticeably slow down anything for anyone.
>>>>
>>>
>>> Instead of using a spinlock waiting for the unmask event, is it possible
>> to call
>>> resampler notify directly when unmask event happens, instead of
>> calling it on
>>> EOI?
>>
>> In this patch, resampler notify is already called directly when unmask
>> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
>> writing to IOREDTBLx register, ioapic_write_indirect() calls
>> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask() which
>> notifies the resampler. On EOI we postpone it just by setting
>> resampler->pending to true, not by waiting. The spinlock is needed
>> merely to synchronize reading & updating resampler->pending and
>> resampler->masked values between possibly concurrently running
>> instances
>> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
>>
>> Thanks,
>> Dmytro
>>
>
> I mean the organization of the code. In current implementation,
> kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your patch, why not
> call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at the same
> place where kvm_fire_mask_notifiers is called), instead of calling it from
> kvm_ioapic_update_eoi_one, since what your intention here is to notify
> vfio of the end of interrupt at the event of ioapic unmask, instead of
> EOI?
Ah ok, got your point.
That was my initial approach in my PoC patch posted in [1]. But then I
dropped it, for 2 reasons:
1. Upon feedback from Sean I realized that kvm_notify_acked_irq() is
also doing some other important things besides notifying vfio. In
particular, in irqfd_resampler_ack() we also de-assert the vIRQ via
kvm_set_irq(). In case of IOAPIC it means clearing its bit in IRR
register. If we delay that until unmasking, it means that we change
the way how KVM emulates the interrupt controller. That would seem
inconsistent.
Also kvm_notify_acked_irq() notifies the emulated PIT timer via
kvm_pit_ack_irq(). I haven't analyzed how exactly that PIT stuff
works, so I'm not sure if delaying that until unmask wouldn't cause
any unwanted effects.
So the idea is to postpone eventfd_signal() only, to fix interaction
with vfio while keeping the rest of the KVM behavior intact. Because
the KVM job is to emulate the interrupt controller (which it already
does correctly), not the external device which is the job of vfio*.
2. kvm_notify_acked_irq() can't be called under ioapic->lock, so in [1]
I was unlocking ioapic->lock in ioapic_write_indirect() with a naive
assumption that it was as safe as doing it in
kvm_ioapic_update_eoi_one(). That was probably racy, and I hadn't
figured out how to rework it in a race-free way.
[1] https://lore.kernel.org/kvm/[email protected]/
[*] By "vfio" I always mean "vfio or any other resamplefd user".
Thanks,
Dmytro
>
>>>
>>>> Regarding #2, there may be an alternative solution worth considering:
>>>> extend KVM irqfd (userspace) API to send mask and unmask
>> notifications
>>>> directly to vfio/whatever, in addition to resample notifications, to
>>>> let vfio check the irq state on its own. There is already locking on
>>>> vfio side (see e.g. vfio_platform_unmask()), so this way we would
>> avoid
>>>> introducing any additional locking. Also such mask/unmask
>> notifications
>>>> could be useful for other cases.
>>>>
>>>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>>>> [email protected]/
>>>> Suggested-by: Sean Christopherson <[email protected]>
>>>> Signed-off-by: Dmytro Maluka <[email protected]>
>>>> ---
>>>> include/linux/kvm_irqfd.h | 14 ++++++++++++
>>>> virt/kvm/eventfd.c | 45
>>>> +++++++++++++++++++++++++++++++++++++++
>>>> 2 files changed, 59 insertions(+)
>>>>
>>>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
>>>> index dac047abdba7..01754a1abb9e 100644
>>>> --- a/include/linux/kvm_irqfd.h
>>>> +++ b/include/linux/kvm_irqfd.h
>>>> @@ -19,6 +19,16 @@
>>>> * resamplefd. All resamplers on the same gsi are de-asserted
>>>> * together, so we don't need to track the state of each individual
>>>> * user. We can also therefore share the same irq source ID.
>>>> + *
>>>> + * A special case is when the interrupt is still masked at the moment
>>>> + * an irq ack is received. That likely means that the interrupt has
>>>> + * been acknowledged to the interrupt controller but not
>> acknowledged
>>>> + * to the device yet, e.g. it might be a Linux guest's threaded
>>>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
>>>> + * resamplefd is postponed until the guest unmasks the interrupt,
>>>> + * which is detected through the irq mask notifier. This prevents
>>>> + * erroneous extra interrupts caused by premature re-assert of an
>>>> + * unacknowledged interrupt by the resamplefd listener.
>>>> */
>>>> struct kvm_kernel_irqfd_resampler {
>>>> struct kvm *kvm;
>>>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
>>>> */
>>>> struct list_head list;
>>>> struct kvm_irq_ack_notifier notifier;
>>>> + struct kvm_irq_mask_notifier mask_notifier;
>>>> + bool masked;
>>>> + bool pending;
>>>> + spinlock_t lock;
>>>> /*
>>>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
>>>> * resamplers among irqfds on the same gsi.
>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>> index 50ddb1d1a7f0..9ff47ac33790 100644
>>>> --- a/virt/kvm/eventfd.c
>>>> +++ b/virt/kvm/eventfd.c
>>>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
>> kvm_irq_ack_notifier
>>>> *kian)
>>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>>>> resampler->notifier.gsi, 0, false);
>>>>
>>>> + spin_lock(&resampler->lock);
>>>> + if (resampler->masked) {
>>>> + resampler->pending = true;
>>>> + spin_unlock(&resampler->lock);
>>>> + return;
>>>> + }
>>>> + spin_unlock(&resampler->lock);
>>>> +
>>>> + idx = srcu_read_lock(&kvm->irq_srcu);
>>>> +
>>>> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
>>>> + srcu_read_lock_held(&kvm->irq_srcu))
>>>> + eventfd_signal(irqfd->resamplefd, 1);
>>>> +
>>>> + srcu_read_unlock(&kvm->irq_srcu, idx);
>>>> +}
>>>> +
>>>> +static void
>>>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
>>>> masked)
>>>> +{
>>>> + struct kvm_kernel_irqfd_resampler *resampler;
>>>> + struct kvm *kvm;
>>>> + struct kvm_kernel_irqfd *irqfd;
>>>> + int idx;
>>>> +
>>>> + resampler = container_of(kimn,
>>>> + struct kvm_kernel_irqfd_resampler, mask_notifier);
>>>> + kvm = resampler->kvm;
>>>> +
>>>> + spin_lock(&resampler->lock);
>>>> + resampler->masked = masked;
>>>> + if (masked || !resampler->pending) {
>>>> + spin_unlock(&resampler->lock);
>>>> + return;
>>>> + }
>>>> + resampler->pending = false;
>>>> + spin_unlock(&resampler->lock);
>>>> +
>>>> idx = srcu_read_lock(&kvm->irq_srcu);
>>>>
>>>> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
>>>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
>>>> kvm_kernel_irqfd *irqfd)
>>>> if (list_empty(&resampler->list)) {
>>>> list_del(&resampler->link);
>>>> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
>>>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
>>>>> mask_notifier.irq,
>>>> + &resampler->mask_notifier);
>>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>>>> resampler->notifier.gsi, 0, false);
>>>> kfree(resampler);
>>>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct
>>>> kvm_irqfd *args)
>>>> INIT_LIST_HEAD(&resampler->list);
>>>> resampler->notifier.gsi = irqfd->gsi;
>>>> resampler->notifier.irq_acked = irqfd_resampler_ack;
>>>> + resampler->mask_notifier.func = irqfd_resampler_mask;
>>>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
>>>>> masked);
>>>> + spin_lock_init(&resampler->lock);
>>>> INIT_LIST_HEAD(&resampler->link);
>>>>
>>>> list_add(&resampler->link, &kvm->irqfds.resampler_list);
>>>> kvm_register_irq_ack_notifier(kvm,
>>>> &resampler->notifier);
>>>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
>>>> + &resampler->mask_notifier);
>>>> irqfd->resampler = resampler;
>>>> }
>>>>
>>>> --
>>>> 2.37.0.170.g444d1eabd0-goog
>>>
On 7/29/22 11:21 PM, Liu, Rong L wrote:
> Hi Dmytro,
>
>> -----Original Message-----
>> From: Dmytro Maluka <[email protected]>
>> Sent: Friday, July 15, 2022 8:59 AM
>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
>> <[email protected]>; [email protected]
>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>> <[email protected]>; [email protected]; H. Peter Anvin
>> <[email protected]>; [email protected]; Eric Auger
>> <[email protected]>; Alex Williamson
>> <[email protected]>; Liu, Rong L <[email protected]>;
>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>; Dmitry
>> Torokhov <[email protected]>; Dmytro Maluka <[email protected]>
>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for oneshot
>> interrupts
>>
>> The existing KVM mechanism for forwarding of level-triggered interrupts
>> using resample eventfd doesn't work quite correctly in the case of
>> interrupts that are handled in a Linux guest as oneshot interrupts
>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
>> threaded irq handler, i.e. later than it is acked to the interrupt
>> controller (EOI at the end of hardirq), not earlier.
>>
>> Linux keeps such interrupt masked until its threaded handler finishes,
>> to prevent the EOI from re-asserting an unacknowledged interrupt.
>> However, with KVM + vfio (or whatever is listening on the resamplefd)
>> we don't check that the interrupt is still masked in the guest at the
>> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
>> unmasks the host physical IRQ, thus a new (unwanted) physical interrupt
>> is generated in the host and queued for injection to the guest.
>>
>> The fact that the virtual IRQ is still masked doesn't prevent this new
>> physical IRQ from being propagated to the guest, because:
>>
>> 1. It is not guaranteed that the vIRQ will remain masked by the time
>> when vfio signals the trigger eventfd.
>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
>> new pending interrupt is injected by KVM to the guest anyway.
>>
>> There are observed at least 2 user-visible issues caused by those
>> extra erroneous pending interrupts for oneshot irq in the guest:
>>
>> 1. System suspend aborted due to a pending wakeup interrupt from
>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
>> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
>> every time the touchpad is touched.
>>
>> This patch fixes the issue on x86 by checking if the interrupt is
>> unmasked when we receive irq ack (EOI) and, in case if it's masked,
>> postponing resamplefd notify until the guest unmasks it.
>>
>> Important notes:
>>
>> 1. It doesn't fix the issue for other archs yet, due to some missing
>> KVM functionality needed by this patch:
>> - calling mask notifiers is implemented for x86 only
>> - irqchip ->is_masked() is implemented for x86 only
>>
>> 2. It introduces an additional spinlock locking in the resample notify
>> path, since we are no longer just traversing an RCU list of irqfds
>> but also updating the resampler state. Hopefully this locking won't
>> noticeably slow down anything for anyone.
>>
>> Regarding #2, there may be an alternative solution worth considering:
>> extend KVM irqfd (userspace) API to send mask and unmask notifications
>> directly to vfio/whatever, in addition to resample notifications, to
>> let vfio check the irq state on its own. There is already locking on
>> vfio side (see e.g. vfio_platform_unmask()), so this way we would avoid
>> introducing any additional locking. Also such mask/unmask notifications
>> could be useful for other cases.
>>
>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>> [email protected]/
>> Suggested-by: Sean Christopherson <[email protected]>
>> Signed-off-by: Dmytro Maluka <[email protected]>
>> ---
>> include/linux/kvm_irqfd.h | 14 ++++++++++++
>> virt/kvm/eventfd.c | 45
>> +++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 59 insertions(+)
>>
>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
>> index dac047abdba7..01754a1abb9e 100644
>> --- a/include/linux/kvm_irqfd.h
>> +++ b/include/linux/kvm_irqfd.h
>> @@ -19,6 +19,16 @@
>> * resamplefd. All resamplers on the same gsi are de-asserted
>> * together, so we don't need to track the state of each individual
>> * user. We can also therefore share the same irq source ID.
>> + *
>> + * A special case is when the interrupt is still masked at the moment
>> + * an irq ack is received. That likely means that the interrupt has
>> + * been acknowledged to the interrupt controller but not acknowledged
>> + * to the device yet, e.g. it might be a Linux guest's threaded
>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
>> + * resamplefd is postponed until the guest unmasks the interrupt,
>> + * which is detected through the irq mask notifier. This prevents
>> + * erroneous extra interrupts caused by premature re-assert of an
>> + * unacknowledged interrupt by the resamplefd listener.
>> */
>
> I don't really agree with above statement. Please correct me if I am wrong. In
> all modern x86 interrupt infrastructure(lapic/ioapic), for level triggered
> interrupt, the sequence is always EOI (LAPIC), which is called interrupt ack in
> the context of this discussion, then unmask (IOAPIC). Oneshot interrupt is
> different only because the timing of above 2 events are different from a
> "normal" level-triggered interrupt. It is like for level interrupt: Hardirq ->
> EOI -> Unmask but for oneshot, it is like: hardirq->EOI, then sometime later
> threadedirq->unmask. So based on this, I don't think you need to keep track of
> whether the interrupt is unmasked or not, just need to call the notifier at the
> end of unmask, instead of EOI. And calling notifier at the end of unmask,
> instead of EOI won't break non-oneshot case. The only caveat is guest ioapic
> update (virq unmask) doesn't cause vmexit. But the assumption is already made
> that virq unmask causes vmexit.
This doesn't seem true to me. For example, looking at how Linux handles
level-triggered IOAPIC interrupts in handle_fasteoi_irq(), AFAICS
normally it does EOI only, without masking or unmasking.
I believe for regular level-triggered interrupts it's like:
hardirq -> EOI
while for oneshot interrupts:
mask -> hardirq -> EOI -> threadedirq -> unmask
I don't quite get what would be the point of unmasking in addition to an
EOI in the normal case (not in a special case like oneshot interrupt,
where we mask the interrupt in the beginning of hardirq, so need to
unmask it later on). Isn't the whole point of "fast EOI" to minimize
latencies by using just one LAPIC write, no IOAPIC writes?
Thanks,
Dmytro
>
>> struct kvm_kernel_irqfd_resampler {
>> struct kvm *kvm;
>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
>> */
>> struct list_head list;
>> struct kvm_irq_ack_notifier notifier;
>> + struct kvm_irq_mask_notifier mask_notifier;
>> + bool masked;
>> + bool pending;
>> + spinlock_t lock;
>> /*
>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
>> * resamplers among irqfds on the same gsi.
>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>> index 50ddb1d1a7f0..9ff47ac33790 100644
>> --- a/virt/kvm/eventfd.c
>> +++ b/virt/kvm/eventfd.c
>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier
>> *kian)
>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>> resampler->notifier.gsi, 0, false);
>>
>> + spin_lock(&resampler->lock);
>> + if (resampler->masked) {
>> + resampler->pending = true;
>> + spin_unlock(&resampler->lock);
>> + return;
>> + }
>> + spin_unlock(&resampler->lock);
>> +
>> + idx = srcu_read_lock(&kvm->irq_srcu);
>> +
>> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
>> + srcu_read_lock_held(&kvm->irq_srcu))
>> + eventfd_signal(irqfd->resamplefd, 1);
>> +
>> + srcu_read_unlock(&kvm->irq_srcu, idx);
>> +}
>> +
>> +static void
>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
>> masked)
>> +{
>> + struct kvm_kernel_irqfd_resampler *resampler;
>> + struct kvm *kvm;
>> + struct kvm_kernel_irqfd *irqfd;
>> + int idx;
>> +
>> + resampler = container_of(kimn,
>> + struct kvm_kernel_irqfd_resampler, mask_notifier);
>> + kvm = resampler->kvm;
>> +
>> + spin_lock(&resampler->lock);
>> + resampler->masked = masked;
>> + if (masked || !resampler->pending) {
>> + spin_unlock(&resampler->lock);
>> + return;
>> + }
>> + resampler->pending = false;
>> + spin_unlock(&resampler->lock);
>> +
>> idx = srcu_read_lock(&kvm->irq_srcu);
>>
>> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
>> kvm_kernel_irqfd *irqfd)
>> if (list_empty(&resampler->list)) {
>> list_del(&resampler->link);
>> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
>>> mask_notifier.irq,
>> + &resampler->mask_notifier);
>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>> resampler->notifier.gsi, 0, false);
>> kfree(resampler);
>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct
>> kvm_irqfd *args)
>> INIT_LIST_HEAD(&resampler->list);
>> resampler->notifier.gsi = irqfd->gsi;
>> resampler->notifier.irq_acked = irqfd_resampler_ack;
>> + resampler->mask_notifier.func = irqfd_resampler_mask;
>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
>>> masked);
>> + spin_lock_init(&resampler->lock);
>> INIT_LIST_HEAD(&resampler->link);
>>
>> list_add(&resampler->link, &kvm->irqfds.resampler_list);
>> kvm_register_irq_ack_notifier(kvm,
>> &resampler->notifier);
>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
>> + &resampler->mask_notifier);
>> irqfd->resampler = resampler;
>> }
>>
>> --
>> 2.37.0.170.g444d1eabd0-goog
>
On 7/15/22 17:59, Dmytro Maluka wrote:
> The existing KVM mechanism for forwarding of level-triggered interrupts
> using resample eventfd doesn't work quite correctly in the case of
> interrupts that are handled in a Linux guest as oneshot interrupts
> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> threaded irq handler, i.e. later than it is acked to the interrupt
> controller (EOI at the end of hardirq), not earlier.
>
> Linux keeps such interrupt masked until its threaded handler finishes,
> to prevent the EOI from re-asserting an unacknowledged interrupt.
> However, with KVM + vfio (or whatever is listening on the resamplefd)
> we don't check that the interrupt is still masked in the guest at the
> moment of EOI. Resamplefd is notified regardless, so vfio prematurely
> unmasks the host physical IRQ, thus a new (unwanted) physical interrupt
> is generated in the host and queued for injection to the guest.
>
> The fact that the virtual IRQ is still masked doesn't prevent this new
> physical IRQ from being propagated to the guest, because:
>
> 1. It is not guaranteed that the vIRQ will remain masked by the time
> when vfio signals the trigger eventfd.
> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
> new pending interrupt is injected by KVM to the guest anyway.
>
> There are observed at least 2 user-visible issues caused by those
> extra erroneous pending interrupts for oneshot irq in the guest:
>
> 1. System suspend aborted due to a pending wakeup interrupt from
> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> 2. Annoying "invalid report id data" errors from ELAN0000 touchpad
> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
> every time the touchpad is touched.
>
> This patch fixes the issue on x86 by checking if the interrupt is
> unmasked when we receive irq ack (EOI) and, in case if it's masked,
> postponing resamplefd notify until the guest unmasks it.
>
> Important notes:
>
> 1. It doesn't fix the issue for other archs yet, due to some missing
> KVM functionality needed by this patch:
> - calling mask notifiers is implemented for x86 only
> - irqchip ->is_masked() is implemented for x86 only
>
> 2. It introduces an additional spinlock locking in the resample notify
> path, since we are no longer just traversing an RCU list of irqfds
> but also updating the resampler state. Hopefully this locking won't
> noticeably slow down anything for anyone.
>
> Regarding #2, there may be an alternative solution worth considering:
> extend KVM irqfd (userspace) API to send mask and unmask notifications
> directly to vfio/whatever, in addition to resample notifications, to
> let vfio check the irq state on its own. There is already locking on
> vfio side (see e.g. vfio_platform_unmask()), so this way we would avoid
> introducing any additional locking. Also such mask/unmask notifications
> could be useful for other cases.
>
> Link: https://lore.kernel.org/kvm/[email protected]/
> Suggested-by: Sean Christopherson <[email protected]>
> Signed-off-by: Dmytro Maluka <[email protected]>
> ---
> include/linux/kvm_irqfd.h | 14 ++++++++++++
> virt/kvm/eventfd.c | 45 +++++++++++++++++++++++++++++++++++++++
> 2 files changed, 59 insertions(+)
>
> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> index dac047abdba7..01754a1abb9e 100644
> --- a/include/linux/kvm_irqfd.h
> +++ b/include/linux/kvm_irqfd.h
> @@ -19,6 +19,16 @@
> * resamplefd. All resamplers on the same gsi are de-asserted
> * together, so we don't need to track the state of each individual
> * user. We can also therefore share the same irq source ID.
> + *
> + * A special case is when the interrupt is still masked at the moment
> + * an irq ack is received. That likely means that the interrupt has
> + * been acknowledged to the interrupt controller but not acknowledged
> + * to the device yet, e.g. it might be a Linux guest's threaded
> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying through
> + * resamplefd is postponed until the guest unmasks the interrupt,
> + * which is detected through the irq mask notifier. This prevents
> + * erroneous extra interrupts caused by premature re-assert of an
> + * unacknowledged interrupt by the resamplefd listener.
> */
> struct kvm_kernel_irqfd_resampler {
> struct kvm *kvm;
> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> */
> struct list_head list;
> struct kvm_irq_ack_notifier notifier;
> + struct kvm_irq_mask_notifier mask_notifier;
> + bool masked;
> + bool pending;
> + spinlock_t lock;
> /*
> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> * resamplers among irqfds on the same gsi.
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 50ddb1d1a7f0..9ff47ac33790 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
>
> + spin_lock(&resampler->lock);
> + if (resampler->masked) {
> + resampler->pending = true;
> + spin_unlock(&resampler->lock);
> + return;
> + }
> + spin_unlock(&resampler->lock);
> +
> + idx = srcu_read_lock(&kvm->irq_srcu);
> +
> + list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> + srcu_read_lock_held(&kvm->irq_srcu))
> + eventfd_signal(irqfd->resamplefd, 1);
> +
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> +}
> +
> +static void
> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool masked)
> +{
> + struct kvm_kernel_irqfd_resampler *resampler;
> + struct kvm *kvm;
> + struct kvm_kernel_irqfd *irqfd;
> + int idx;
> +
> + resampler = container_of(kimn,
> + struct kvm_kernel_irqfd_resampler, mask_notifier);
> + kvm = resampler->kvm;
> +
> + spin_lock(&resampler->lock);
> + resampler->masked = masked;
> + if (masked || !resampler->pending) {
> + spin_unlock(&resampler->lock);
> + return;
> + }
> + resampler->pending = false;
> + spin_unlock(&resampler->lock);
> +
> idx = srcu_read_lock(&kvm->irq_srcu);
>
> list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
> if (list_empty(&resampler->list)) {
> list_del(&resampler->link);
> kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
> + kvm_unregister_irq_mask_notifier(kvm, resampler->mask_notifier.irq,
> + &resampler->mask_notifier);
> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
> kfree(resampler);
> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
> INIT_LIST_HEAD(&resampler->list);
> resampler->notifier.gsi = irqfd->gsi;
> resampler->notifier.irq_acked = irqfd_resampler_ack;
> + resampler->mask_notifier.func = irqfd_resampler_mask;
> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler->masked);
> + spin_lock_init(&resampler->lock);
> INIT_LIST_HEAD(&resampler->link);
>
> list_add(&resampler->link, &kvm->irqfds.resampler_list);
> kvm_register_irq_ack_notifier(kvm,
> &resampler->notifier);
> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> + &resampler->mask_notifier);
I realized this is a bit racy: we may miss a mask or unmask event just
before kvm_register_irq_ack_notifier(), so irqfd_resampler_ack() may
see an outdated irq mask state.
Moving kvm_register_irq_mask_notifier() before
kvm_register_irq_ack_notifier() isn't enough, since a mask or unmask
may still happen just before kvm_register_irq_mask_notifier().
This race can be avoided by moving also resampler->masked initialization
(kvm_irq_is_masked()) after kvm_register_irq_mask_notifier(). But then
kvm_irq_is_masked() would need to be called under resampler->lock,
which could cause a deadlock, since kvm_irq_is_masked() locks
ioapic->lock while irqfd_resampler_mask() is called under ioapic->lock
(or correspondingly pic->lock).
So for v2 I'm considering replacing kvm_irq_is_masked() with a function
like the following, for registering & initializing mask notifier in a
presumably race-free deadlock-free way. (The below is just a sketch
showing the locking order; the actual code would be more complicated
due to irq -> pin mapping etc.)
void kvm_register_and_fire_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
{
struct kvm_pic *pic = kvm->arch.vpic;
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
bool masked;
mutex_lock(&kvm->irq_lock);
spin_lock(&pic->lock);
spin_lock(&ioapic->lock);
kimn->irq = irq;
hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
masked = kvm_pic_irq_is_masked(pic, irq) &&
kvm_ioapic_irq_is_masked(ioapic, irq);
kimn->func(kimn, masked);
spin_unlock(&ioapic->lock);
spin_unlock(&pic->lock);
mutex_unlock(&kvm->irq_lock);
}
This implies that I'll probably go with moving the mask notifiers stuff
back to x86-specific code, and just add a weak no-op version for other
architectures.
Thanks,
Dmytro
> irqfd->resampler = resampler;
> }
>
On Fri, Jul 29, 2022, Dmytro Maluka wrote:
> On 7/28/22 20:46, Sean Christopherson wrote:
> > On Fri, Jul 15, 2022, Dmytro Maluka wrote:
> >> In preparation for implementing postponing resamplefd event until the
> >> interrupt is unmasked, move kvm_(un)register_irq_mask_notifier() from
> >> x86 to arch-independent code to make it usable by irqfd.
> >
> > This patch needs to move more than just the helpers, e.g. mask_notifier_list
> > needs to be in "struct kvm", not "stuct kvm_arch".
> >
> > arch/arm64/kvm/../../../virt/kvm/eventfd.c: In function ‘kvm_register_irq_mask_notifier’:
> > arch/arm64/kvm/../../../virt/kvm/eventfd.c:528:51: error: ‘struct kvm_arch’ has no member named ‘mask_notifier_list’
> > 528 | hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
> > | ^
> > make[3]: *** [scripts/Makefile.build:249: arch/arm64/kvm/../../../virt/kvm/eventfd.o] Error 1
> > make[3]: *** Waiting for unfinished jobs....
> > AR kernel/entry/built-in.a
>
> Oops, sorry.
>
> > And kvm_fire_mask_notifiers() should probably be moved as well, otherwise there's
> > no point in moving the registration to common code.
>
> Good point, we can move it right away, even though it is not called on
> other architectures for now.
>
> > The other option would be to make the generic functions wrappers around arch-specific
> > hooks. But IIRC won't this eventually be needed for other architectures?
>
> Right, I assume we will eventually need it for ARM at least. Not in the
> near future though, and at the moment I have no non-x86 hardware on hand
> to implement it for other architectures.
>
> Actually I feel a bit uncomfortable with generic irqfd relying on
> kvm_register_irq_mask_notifier() which silently has no effect on other
> architectures. Maybe it's better to keep
> kvm_(un)register_irq_mask_notifier() in the x86 code, and for the
> generic code add a weak version which e.g. just prints a warning like
> "irq mask notifiers not implemented on this arch". (Or maybe instead of
> weak functions introduce arch-specific hooks as you suggested, and print
> such a warning if no hook is provided.) What do you think?
If the entire concept of having mask notifiers is x86 specific, then moving it to
generic code obviously doesn't make sense. But if the concept applies to other
archictectures, then IMO the list belongs in "struct kvm" with generic, common
helpers, even if no other arch calls kvm_fire_mask_notifiers() at this time.
Paolo and/or non-x86 folks, any thoughts?
Hi Dmytro
On 7/15/22 17:59, Dmytro Maluka wrote:
> In order to implement postponing resamplefd event until an interrupt is
> unmasked, we need not only to track changes of the interrupt mask state
> (which is made possible by the previous patch "KVM: x86: Move
> kvm_(un)register_irq_mask_notifier() to generic KVM") but also to know
> its initial mask state at the time of registering a resamplefd
it is not obvious to me why the vIRQ is supposed to be masked at
resamplefd registration time. I would have expected the check to be done
in the resamplefd notifier instead (when the vEOI actually happens)?
Eric
> listener. So implement kvm_irq_is_masked() for that.
>
> Actually, for now it's implemented for x86 only (see below).
>
> The implementation is trickier than I'd like it to be, for 2 reasons:
>
> 1. Interrupt (GSI) to irqchip pin mapping is not a 1:1 mapping: an IRQ
> may map to multiple pins on different irqchips. I guess the only
> reason for that is to support x86 interrupts 0-15 for which we don't
> know if the guest uses PIC or IOAPIC. For this reason kvm_set_irq()
> delivers interrupt to both, assuming the guest will ignore the
> unused one. For the same reason, in kvm_irq_is_masked() we should
> also take into account the mask state of both irqchips. We consider
> an interrupt unmasked if and only if it is unmasked in at least one
> of PIC or IOAPIC, assuming that in the unused one all the interrupts
> should be masked.
>
> 2. For now ->is_masked() implemented for x86 only, so need to handle
> the case when ->is_masked() is not provided by the irqchip. In such
> case kvm_irq_is_masked() returns failure, and its caller may fall
> back to an assumption that an interrupt is always unmasked.
>
> Link: https://lore.kernel.org/kvm/[email protected]/
> Signed-off-by: Dmytro Maluka <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/i8259.c | 11 +++++++++++
> arch/x86/kvm/ioapic.c | 11 +++++++++++
> arch/x86/kvm/ioapic.h | 1 +
> arch/x86/kvm/irq_comm.c | 16 ++++++++++++++++
> include/linux/kvm_host.h | 3 +++
> virt/kvm/irqchip.c | 34 +++++++++++++++++++++++++++++++++
> 7 files changed, 77 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 39a867d68721..64618b890700 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1840,6 +1840,7 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
>
> int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
> void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
> +bool kvm_pic_irq_is_masked(struct kvm_pic *pic, int irq);
>
> void kvm_inject_nmi(struct kvm_vcpu *vcpu);
>
> diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> index e1bb6218bb96..2d1ed3bc7cc5 100644
> --- a/arch/x86/kvm/i8259.c
> +++ b/arch/x86/kvm/i8259.c
> @@ -211,6 +211,17 @@ void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
> pic_unlock(s);
> }
>
> +bool kvm_pic_irq_is_masked(struct kvm_pic *s, int irq)
> +{
> + bool ret;
> +
> + pic_lock(s);
> + ret = !!(s->pics[irq >> 3].imr & (1 << irq));
> + pic_unlock(s);
> +
> + return ret;
> +}
> +
> /*
> * acknowledge interrupt 'irq'
> */
> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
> index 765943d7cfa5..874f68a65c87 100644
> --- a/arch/x86/kvm/ioapic.c
> +++ b/arch/x86/kvm/ioapic.c
> @@ -478,6 +478,17 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
> spin_unlock(&ioapic->lock);
> }
>
> +bool kvm_ioapic_irq_is_masked(struct kvm_ioapic *ioapic, int irq)
> +{
> + bool ret;
> +
> + spin_lock(&ioapic->lock);
> + ret = !!ioapic->redirtbl[irq].fields.mask;
> + spin_unlock(&ioapic->lock);
> +
> + return ret;
> +}
> +
> static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
> {
> int i;
> diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
> index 539333ac4b38..fe1f51319992 100644
> --- a/arch/x86/kvm/ioapic.h
> +++ b/arch/x86/kvm/ioapic.h
> @@ -114,6 +114,7 @@ void kvm_ioapic_destroy(struct kvm *kvm);
> int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
> int level, bool line_status);
> void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
> +bool kvm_ioapic_irq_is_masked(struct kvm_ioapic *ioapic, int irq);
> void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
> void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
> void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> index 43e13892ed34..5bff6d6ac54f 100644
> --- a/arch/x86/kvm/irq_comm.c
> +++ b/arch/x86/kvm/irq_comm.c
> @@ -34,6 +34,13 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
> }
>
> +static bool kvm_is_masked_pic_irq(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm *kvm)
> +{
> + struct kvm_pic *pic = kvm->arch.vpic;
> + return kvm_pic_irq_is_masked(pic, e->irqchip.pin);
> +}
> +
> static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> struct kvm *kvm, int irq_source_id, int level,
> bool line_status)
> @@ -43,6 +50,13 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> line_status);
> }
>
> +static bool kvm_is_masked_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm *kvm)
> +{
> + struct kvm_ioapic *ioapic = kvm->arch.vioapic;
> + return kvm_ioapic_irq_is_masked(ioapic, e->irqchip.pin);
> +}
> +
> int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
> struct kvm_lapic_irq *irq, struct dest_map *dest_map)
> {
> @@ -275,11 +289,13 @@ int kvm_set_routing_entry(struct kvm *kvm,
> if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
> return -EINVAL;
> e->set = kvm_set_pic_irq;
> + e->is_masked = kvm_is_masked_pic_irq;
> break;
> case KVM_IRQCHIP_IOAPIC:
> if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
> return -EINVAL;
> e->set = kvm_set_ioapic_irq;
> + e->is_masked = kvm_is_masked_ioapic_irq;
> break;
> default:
> return -EINVAL;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 9e12ef503157..e8bfb3b0d4d1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -625,6 +625,8 @@ struct kvm_kernel_irq_routing_entry {
> int (*set)(struct kvm_kernel_irq_routing_entry *e,
> struct kvm *kvm, int irq_source_id, int level,
> bool line_status);
> + bool (*is_masked)(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm *kvm);
> union {
> struct {
> unsigned irqchip;
> @@ -1598,6 +1600,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
> int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
> struct kvm *kvm, int irq_source_id,
> int level, bool line_status);
> +int kvm_irq_is_masked(struct kvm *kvm, int irq, bool *masked);
> bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
> void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
> void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 58e4f88b2b9f..9252ebedba55 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -97,6 +97,40 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> return ret;
> }
>
> +/*
> + * Return value:
> + * = 0 Interrupt mask state successfully written to `masked`
> + * < 0 Failed to read interrupt mask state
> + */
> +int kvm_irq_is_masked(struct kvm *kvm, int irq, bool *masked)
> +{
> + struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
> + int ret = -1, i, idx;
> +
> + /* Not possible to detect if the guest uses the PIC or the
> + * IOAPIC. So assume the interrupt to be unmasked iff it is
> + * unmasked in at least one of both.
> + */
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + i = kvm_irq_map_gsi(kvm, irq_set, irq);
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> +
> + while (i--) {
> + if (!irq_set[i].is_masked)
> + continue;
> +
> + if (!irq_set[i].is_masked(&irq_set[i], kvm)) {
> + *masked = false;
> + return 0;
> + }
> + ret = 0;
> + }
> + if (!ret)
> + *masked = true;
> +
> + return ret;
> +}
> +
> static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
> {
> int i;
Hi Eric,
On 8/4/22 19:14, Eric Auger wrote:
> Hi Dmytro
>
> On 7/15/22 17:59, Dmytro Maluka wrote:
>> In order to implement postponing resamplefd event until an interrupt is
>> unmasked, we need not only to track changes of the interrupt mask state
>> (which is made possible by the previous patch "KVM: x86: Move
>> kvm_(un)register_irq_mask_notifier() to generic KVM") but also to know
>> its initial mask state at the time of registering a resamplefd
> it is not obvious to me why the vIRQ is supposed to be masked at
> resamplefd registration time. I would have expected the check to be done
> in the resamplefd notifier instead (when the vEOI actually happens)?
Great point, thanks. Indeed, it's not crucial to know the vIRQ mask
state at the registration time. It might be ok to check it later, at the
first vEOI after registration. So the wording should be changed to
"...but also to know its initial mask state before any mask notifier has
fired".
As I wrote in [1], the initialization of the mask state in this patchset
is actually racy (we may miss mask state change at initialization), and
I was about to send v2 which fixes this race by implementing
kvm_register_and_fire_irq_mask_notifier() as suggested in [1]. Now your
suggestion has made me think wouldn't it be possible to avoid this race
in a simpler way, by checking the vIRQ state directly in the ack (vEOI)
notifier.
It seems it still would be racy, e.g.:
1. ack notifier checks the vIRQ state, it is masked
2. the vIRQ gets unmasked
3. mask notifier fires, but the vIRQ is not flagged as postponed yet, so
mask notifier doesn't notify resamplefd
4. ack notifier flags the vIRQ as postponed and doesn't notify
resamplefd either
=> unmask event lost => the physical IRQ is not unmasked by vfio.
This race could be avoided if the ack notifier checked the vIRQ state
under resampler->lock, i.e. synchronized with the mask notifier. But
that would deadlock, for the same reason as mentioned in [1]: checking
the vIRQ state requires locking KVM irqchip lock (e.g. ioapic->lock on
x86) whereas the mask notifier is called under this lock.
[1] https://lore.kernel.org/lkml/[email protected]/
Thanks,
Dmytro
>
> Eric
>> listener. So implement kvm_irq_is_masked() for that.
>>
>> Actually, for now it's implemented for x86 only (see below).
>>
>> The implementation is trickier than I'd like it to be, for 2 reasons:
>>
>> 1. Interrupt (GSI) to irqchip pin mapping is not a 1:1 mapping: an IRQ
>> may map to multiple pins on different irqchips. I guess the only
>> reason for that is to support x86 interrupts 0-15 for which we don't
>> know if the guest uses PIC or IOAPIC. For this reason kvm_set_irq()
>> delivers interrupt to both, assuming the guest will ignore the
>> unused one. For the same reason, in kvm_irq_is_masked() we should
>> also take into account the mask state of both irqchips. We consider
>> an interrupt unmasked if and only if it is unmasked in at least one
>> of PIC or IOAPIC, assuming that in the unused one all the interrupts
>> should be masked.
>>
>> 2. For now ->is_masked() implemented for x86 only, so need to handle
>> the case when ->is_masked() is not provided by the irqchip. In such
>> case kvm_irq_is_masked() returns failure, and its caller may fall
>> back to an assumption that an interrupt is always unmasked.
>>
>> Link: https://lore.kernel.org/kvm/[email protected]/
>> Signed-off-by: Dmytro Maluka <[email protected]>
>> ---
>> arch/x86/include/asm/kvm_host.h | 1 +
>> arch/x86/kvm/i8259.c | 11 +++++++++++
>> arch/x86/kvm/ioapic.c | 11 +++++++++++
>> arch/x86/kvm/ioapic.h | 1 +
>> arch/x86/kvm/irq_comm.c | 16 ++++++++++++++++
>> include/linux/kvm_host.h | 3 +++
>> virt/kvm/irqchip.c | 34 +++++++++++++++++++++++++++++++++
>> 7 files changed, 77 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 39a867d68721..64618b890700 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1840,6 +1840,7 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
>>
>> int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
>> void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
>> +bool kvm_pic_irq_is_masked(struct kvm_pic *pic, int irq);
>>
>> void kvm_inject_nmi(struct kvm_vcpu *vcpu);
>>
>> diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
>> index e1bb6218bb96..2d1ed3bc7cc5 100644
>> --- a/arch/x86/kvm/i8259.c
>> +++ b/arch/x86/kvm/i8259.c
>> @@ -211,6 +211,17 @@ void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
>> pic_unlock(s);
>> }
>>
>> +bool kvm_pic_irq_is_masked(struct kvm_pic *s, int irq)
>> +{
>> + bool ret;
>> +
>> + pic_lock(s);
>> + ret = !!(s->pics[irq >> 3].imr & (1 << irq));
>> + pic_unlock(s);
>> +
>> + return ret;
>> +}
>> +
>> /*
>> * acknowledge interrupt 'irq'
>> */
>> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
>> index 765943d7cfa5..874f68a65c87 100644
>> --- a/arch/x86/kvm/ioapic.c
>> +++ b/arch/x86/kvm/ioapic.c
>> @@ -478,6 +478,17 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
>> spin_unlock(&ioapic->lock);
>> }
>>
>> +bool kvm_ioapic_irq_is_masked(struct kvm_ioapic *ioapic, int irq)
>> +{
>> + bool ret;
>> +
>> + spin_lock(&ioapic->lock);
>> + ret = !!ioapic->redirtbl[irq].fields.mask;
>> + spin_unlock(&ioapic->lock);
>> +
>> + return ret;
>> +}
>> +
>> static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
>> {
>> int i;
>> diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
>> index 539333ac4b38..fe1f51319992 100644
>> --- a/arch/x86/kvm/ioapic.h
>> +++ b/arch/x86/kvm/ioapic.h
>> @@ -114,6 +114,7 @@ void kvm_ioapic_destroy(struct kvm *kvm);
>> int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
>> int level, bool line_status);
>> void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
>> +bool kvm_ioapic_irq_is_masked(struct kvm_ioapic *ioapic, int irq);
>> void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
>> void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
>> void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
>> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
>> index 43e13892ed34..5bff6d6ac54f 100644
>> --- a/arch/x86/kvm/irq_comm.c
>> +++ b/arch/x86/kvm/irq_comm.c
>> @@ -34,6 +34,13 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
>> return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
>> }
>>
>> +static bool kvm_is_masked_pic_irq(struct kvm_kernel_irq_routing_entry *e,
>> + struct kvm *kvm)
>> +{
>> + struct kvm_pic *pic = kvm->arch.vpic;
>> + return kvm_pic_irq_is_masked(pic, e->irqchip.pin);
>> +}
>> +
>> static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>> struct kvm *kvm, int irq_source_id, int level,
>> bool line_status)
>> @@ -43,6 +50,13 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>> line_status);
>> }
>>
>> +static bool kvm_is_masked_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
>> + struct kvm *kvm)
>> +{
>> + struct kvm_ioapic *ioapic = kvm->arch.vioapic;
>> + return kvm_ioapic_irq_is_masked(ioapic, e->irqchip.pin);
>> +}
>> +
>> int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
>> struct kvm_lapic_irq *irq, struct dest_map *dest_map)
>> {
>> @@ -275,11 +289,13 @@ int kvm_set_routing_entry(struct kvm *kvm,
>> if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
>> return -EINVAL;
>> e->set = kvm_set_pic_irq;
>> + e->is_masked = kvm_is_masked_pic_irq;
>> break;
>> case KVM_IRQCHIP_IOAPIC:
>> if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
>> return -EINVAL;
>> e->set = kvm_set_ioapic_irq;
>> + e->is_masked = kvm_is_masked_ioapic_irq;
>> break;
>> default:
>> return -EINVAL;
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 9e12ef503157..e8bfb3b0d4d1 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -625,6 +625,8 @@ struct kvm_kernel_irq_routing_entry {
>> int (*set)(struct kvm_kernel_irq_routing_entry *e,
>> struct kvm *kvm, int irq_source_id, int level,
>> bool line_status);
>> + bool (*is_masked)(struct kvm_kernel_irq_routing_entry *e,
>> + struct kvm *kvm);
>> union {
>> struct {
>> unsigned irqchip;
>> @@ -1598,6 +1600,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
>> int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
>> struct kvm *kvm, int irq_source_id,
>> int level, bool line_status);
>> +int kvm_irq_is_masked(struct kvm *kvm, int irq, bool *masked);
>> bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
>> void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
>> void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
>> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
>> index 58e4f88b2b9f..9252ebedba55 100644
>> --- a/virt/kvm/irqchip.c
>> +++ b/virt/kvm/irqchip.c
>> @@ -97,6 +97,40 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
>> return ret;
>> }
>>
>> +/*
>> + * Return value:
>> + * = 0 Interrupt mask state successfully written to `masked`
>> + * < 0 Failed to read interrupt mask state
>> + */
>> +int kvm_irq_is_masked(struct kvm *kvm, int irq, bool *masked)
>> +{
>> + struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
>> + int ret = -1, i, idx;
>> +
>> + /* Not possible to detect if the guest uses the PIC or the
>> + * IOAPIC. So assume the interrupt to be unmasked iff it is
>> + * unmasked in at least one of both.
>> + */
>> + idx = srcu_read_lock(&kvm->irq_srcu);
>> + i = kvm_irq_map_gsi(kvm, irq_set, irq);
>> + srcu_read_unlock(&kvm->irq_srcu, idx);
>> +
>> + while (i--) {
>> + if (!irq_set[i].is_masked)
>> + continue;
>> +
>> + if (!irq_set[i].is_masked(&irq_set[i], kvm)) {
>> + *masked = false;
>> + return 0;
>> + }
>> + ret = 0;
>> + }
>> + if (!ret)
>> + *masked = true;
>> +
>> + return ret;
>> +}
>> +
>> static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
>> {
>> int i;
>
Hi Dmytro,
> -----Original Message-----
> From: Dmytro Maluka <[email protected]>
> Sent: Saturday, July 30, 2022 7:35 AM
> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> <[email protected]>; Paolo Bonzini <[email protected]>;
> [email protected]
> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> <[email protected]>; [email protected]; H. Peter Anvin
> <[email protected]>; [email protected]; Eric Auger
> <[email protected]>; Alex Williamson
> <[email protected]>; Zhenyu Wang
> <[email protected]>; Tomasz Nowicki <[email protected]>;
> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> <[email protected]>
> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> oneshot interrupts
>
> On 7/29/22 10:48 PM, Liu, Rong L wrote:
> > Hi Dmytro,
> >
> >> -----Original Message-----
> >> From: Dmytro Maluka <[email protected]>
> >> Sent: Tuesday, July 26, 2022 7:08 AM
> >> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> >> <[email protected]>; Paolo Bonzini <[email protected]>;
> >> [email protected]
> >> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> >> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> >> <[email protected]>; [email protected]; H. Peter Anvin
> >> <[email protected]>; [email protected]; Eric Auger
> >> <[email protected]>; Alex Williamson
> >> <[email protected]>; Zhenyu Wang
> >> <[email protected]>; Tomasz Nowicki <[email protected]>;
> >> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> >> <[email protected]>
> >> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> >> oneshot interrupts
> >>
> >> Hi Rong,
> >>
> >> On 7/26/22 01:44, Liu, Rong L wrote:
> >>> Hi Dmytro,
> >>>
> >>>> -----Original Message-----
> >>>> From: Dmytro Maluka <[email protected]>
> >>>> Sent: Friday, July 15, 2022 8:59 AM
> >>>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
> >>>> <[email protected]>; [email protected]
> >>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> >>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
> Hansen
> >>>> <[email protected]>; [email protected]; H. Peter Anvin
> >>>> <[email protected]>; [email protected]; Eric Auger
> >>>> <[email protected]>; Alex Williamson
> >>>> <[email protected]>; Liu, Rong L <[email protected]>;
> >>>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
> >>>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>;
> Dmitry
> >>>> Torokhov <[email protected]>; Dmytro Maluka
> <[email protected]>
> >>>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> >> oneshot
> >>>> interrupts
> >>>>
> >>>> The existing KVM mechanism for forwarding of level-triggered
> >> interrupts
> >>>> using resample eventfd doesn't work quite correctly in the case of
> >>>> interrupts that are handled in a Linux guest as oneshot interrupts
> >>>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> >>>> threaded irq handler, i.e. later than it is acked to the interrupt
> >>>> controller (EOI at the end of hardirq), not earlier.
> >>>>
> >>>> Linux keeps such interrupt masked until its threaded handler
> finishes,
> >>>> to prevent the EOI from re-asserting an unacknowledged interrupt.
> >>>> However, with KVM + vfio (or whatever is listening on the
> resamplefd)
> >>>> we don't check that the interrupt is still masked in the guest at the
> >>>> moment of EOI. Resamplefd is notified regardless, so vfio
> prematurely
> >>>> unmasks the host physical IRQ, thus a new (unwanted) physical
> >> interrupt
> >>>> is generated in the host and queued for injection to the guest.
> >>>>
> >>>> The fact that the virtual IRQ is still masked doesn't prevent this new
> >>>> physical IRQ from being propagated to the guest, because:
> >>>>
> >>>> 1. It is not guaranteed that the vIRQ will remain masked by the time
> >>>> when vfio signals the trigger eventfd.
> >>>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> >>>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
> >>>> new pending interrupt is injected by KVM to the guest anyway.
> >>>>
> >>>> There are observed at least 2 user-visible issues caused by those
> >>>> extra erroneous pending interrupts for oneshot irq in the guest:
> >>>>
> >>>> 1. System suspend aborted due to a pending wakeup interrupt from
> >>>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> >>>> 2. Annoying "invalid report id data" errors from ELAN0000
> touchpad
> >>>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
> >>>> every time the touchpad is touched.
> >>>>
> >>>> This patch fixes the issue on x86 by checking if the interrupt is
> >>>> unmasked when we receive irq ack (EOI) and, in case if it's masked,
> >>>> postponing resamplefd notify until the guest unmasks it.
> >>>>
> >>>> Important notes:
> >>>>
> >>>> 1. It doesn't fix the issue for other archs yet, due to some missing
> >>>> KVM functionality needed by this patch:
> >>>> - calling mask notifiers is implemented for x86 only
> >>>> - irqchip ->is_masked() is implemented for x86 only
> >>>>
> >>>> 2. It introduces an additional spinlock locking in the resample notify
> >>>> path, since we are no longer just traversing an RCU list of irqfds
> >>>> but also updating the resampler state. Hopefully this locking won't
> >>>> noticeably slow down anything for anyone.
> >>>>
> >>>
> >>> Instead of using a spinlock waiting for the unmask event, is it
> possible
> >> to call
> >>> resampler notify directly when unmask event happens, instead of
> >> calling it on
> >>> EOI?
> >>
> >> In this patch, resampler notify is already called directly when unmask
> >> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
> >> writing to IOREDTBLx register, ioapic_write_indirect() calls
> >> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask() which
> >> notifies the resampler. On EOI we postpone it just by setting
> >> resampler->pending to true, not by waiting. The spinlock is needed
> >> merely to synchronize reading & updating resampler->pending and
> >> resampler->masked values between possibly concurrently running
> >> instances
> >> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
> >>
> >> Thanks,
> >> Dmytro
> >>
> >
> > I mean the organization of the code. In current implementation,
> > kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your
> patch, why not
> > call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at
> the same
> > place where kvm_fire_mask_notifiers is called), instead of calling it
> from
> > kvm_ioapic_update_eoi_one, since what your intention here is to
> notify
> > vfio of the end of interrupt at the event of ioapic unmask, instead of
> > EOI?
>
> Ah ok, got your point.
>
> That was my initial approach in my PoC patch posted in [1]. But then I
> dropped it, for 2 reasons:
>
> 1. Upon feedback from Sean I realized that kvm_notify_acked_irq() is
> also doing some other important things besides notifying vfio. In
> particular, in irqfd_resampler_ack() we also de-assert the vIRQ via
> kvm_set_irq(). In case of IOAPIC it means clearing its bit in IRR
> register. If we delay that until unmasking, it means that we change
> the way how KVM emulates the interrupt controller. That would seem
> inconsistent.
>
Thanks for clarification. I totally agree that it is important to keep the way
how KVM emulates the interrupt controller.
> Also kvm_notify_acked_irq() notifies the emulated PIT timer via
> kvm_pit_ack_irq(). I haven't analyzed how exactly that PIT stuff
> works, so I'm not sure if delaying that until unmask wouldn't cause
> any unwanted effects.
>
> So the idea is to postpone eventfd_signal() only, to fix interaction
> with vfio while keeping the rest of the KVM behavior intact. Because
> the KVM job is to emulate the interrupt controller (which it already
> does correctly), not the external device which is the job of vfio*.
>
I made a mistake in my last post. I mean just to delay the notification of
vfio, but keep the rest of the code as intact as possible.
> 2. kvm_notify_acked_irq() can't be called under ioapic->lock, so in [1]
> I was unlocking ioapic->lock in ioapic_write_indirect() with a naive
> assumption that it was as safe as doing it in
> kvm_ioapic_update_eoi_one(). That was probably racy, and I hadn't
> figured out how to rework it in a race-free way.
>
> [1] https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> [email protected]/
>
> [*] By "vfio" I always mean "vfio or any other resamplefd user".
>
> Thanks,
> Dmytro
>
> >
> >>>
> >>>> Regarding #2, there may be an alternative solution worth
> considering:
> >>>> extend KVM irqfd (userspace) API to send mask and unmask
> >> notifications
> >>>> directly to vfio/whatever, in addition to resample notifications, to
> >>>> let vfio check the irq state on its own. There is already locking on
> >>>> vfio side (see e.g. vfio_platform_unmask()), so this way we would
> >> avoid
> >>>> introducing any additional locking. Also such mask/unmask
> >> notifications
> >>>> could be useful for other cases.
> >>>>
> >>>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> >>>> [email protected]/
> >>>> Suggested-by: Sean Christopherson <[email protected]>
> >>>> Signed-off-by: Dmytro Maluka <[email protected]>
> >>>> ---
> >>>> include/linux/kvm_irqfd.h | 14 ++++++++++++
> >>>> virt/kvm/eventfd.c | 45
> >>>> +++++++++++++++++++++++++++++++++++++++
> >>>> 2 files changed, 59 insertions(+)
> >>>>
> >>>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> >>>> index dac047abdba7..01754a1abb9e 100644
> >>>> --- a/include/linux/kvm_irqfd.h
> >>>> +++ b/include/linux/kvm_irqfd.h
> >>>> @@ -19,6 +19,16 @@
> >>>> * resamplefd. All resamplers on the same gsi are de-asserted
> >>>> * together, so we don't need to track the state of each individual
> >>>> * user. We can also therefore share the same irq source ID.
> >>>> + *
> >>>> + * A special case is when the interrupt is still masked at the
> moment
> >>>> + * an irq ack is received. That likely means that the interrupt has
> >>>> + * been acknowledged to the interrupt controller but not
> >> acknowledged
> >>>> + * to the device yet, e.g. it might be a Linux guest's threaded
> >>>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying
> through
> >>>> + * resamplefd is postponed until the guest unmasks the interrupt,
> >>>> + * which is detected through the irq mask notifier. This prevents
> >>>> + * erroneous extra interrupts caused by premature re-assert of an
> >>>> + * unacknowledged interrupt by the resamplefd listener.
> >>>> */
> >>>> struct kvm_kernel_irqfd_resampler {
> >>>> struct kvm *kvm;
> >>>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> >>>> */
> >>>> struct list_head list;
> >>>> struct kvm_irq_ack_notifier notifier;
> >>>> + struct kvm_irq_mask_notifier mask_notifier;
> >>>> + bool masked;
> >>>> + bool pending;
> >>>> + spinlock_t lock;
> >>>> /*
> >>>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> >>>> * resamplers among irqfds on the same gsi.
> >>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> >>>> index 50ddb1d1a7f0..9ff47ac33790 100644
> >>>> --- a/virt/kvm/eventfd.c
> >>>> +++ b/virt/kvm/eventfd.c
> >>>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
> >> kvm_irq_ack_notifier
> >>>> *kian)
> >>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> >>>> resampler->notifier.gsi, 0, false);
> >>>>
> >>>> + spin_lock(&resampler->lock);
> >>>> + if (resampler->masked) {
> >>>> + resampler->pending = true;
> >>>> + spin_unlock(&resampler->lock);
> >>>> + return;
> >>>> + }
> >>>> + spin_unlock(&resampler->lock);
> >>>> +
> >>>> + idx = srcu_read_lock(&kvm->irq_srcu);
> >>>> +
> >>>> + list_for_each_entry_srcu(irqfd, &resampler->list,
> resampler_link,
> >>>> + srcu_read_lock_held(&kvm->irq_srcu))
> >>>> + eventfd_signal(irqfd->resamplefd, 1);
> >>>> +
> >>>> + srcu_read_unlock(&kvm->irq_srcu, idx);
> >>>> +}
> >>>> +
> >>>> +static void
> >>>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
> >>>> masked)
> >>>> +{
> >>>> + struct kvm_kernel_irqfd_resampler *resampler;
> >>>> + struct kvm *kvm;
> >>>> + struct kvm_kernel_irqfd *irqfd;
> >>>> + int idx;
> >>>> +
> >>>> + resampler = container_of(kimn,
> >>>> + struct kvm_kernel_irqfd_resampler,
> mask_notifier);
> >>>> + kvm = resampler->kvm;
> >>>> +
> >>>> + spin_lock(&resampler->lock);
> >>>> + resampler->masked = masked;
> >>>> + if (masked || !resampler->pending) {
> >>>> + spin_unlock(&resampler->lock);
> >>>> + return;
> >>>> + }
> >>>> + resampler->pending = false;
> >>>> + spin_unlock(&resampler->lock);
> >>>> +
> >>>> idx = srcu_read_lock(&kvm->irq_srcu);
> >>>>
> >>>> list_for_each_entry_srcu(irqfd, &resampler->list,
> resampler_link,
> >>>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
> >>>> kvm_kernel_irqfd *irqfd)
> >>>> if (list_empty(&resampler->list)) {
> >>>> list_del(&resampler->link);
> >>>> kvm_unregister_irq_ack_notifier(kvm, &resampler-
> >notifier);
> >>>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
> >>>>> mask_notifier.irq,
> >>>> + &resampler->mask_notifier);
> >>>> kvm_set_irq(kvm,
> KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> >>>> resampler->notifier.gsi, 0, false);
> >>>> kfree(resampler);
> >>>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm,
> struct
> >>>> kvm_irqfd *args)
> >>>> INIT_LIST_HEAD(&resampler->list);
> >>>> resampler->notifier.gsi = irqfd->gsi;
> >>>> resampler->notifier.irq_acked =
> irqfd_resampler_ack;
> >>>> + resampler->mask_notifier.func =
> irqfd_resampler_mask;
> >>>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
> >>>>> masked);
> >>>> + spin_lock_init(&resampler->lock);
> >>>> INIT_LIST_HEAD(&resampler->link);
> >>>>
> >>>> list_add(&resampler->link, &kvm-
> >irqfds.resampler_list);
> >>>> kvm_register_irq_ack_notifier(kvm,
> >>>> &resampler->notifier);
> >>>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> >>>> + &resampler->mask_notifier);
> >>>> irqfd->resampler = resampler;
> >>>> }
> >>>>
> >>>> --
> >>>> 2.37.0.170.g444d1eabd0-goog
> >>>
Hi Rong,
On 8/10/22 12:02 AM, Liu, Rong L wrote:
> Hi Dmytro,
>
>> -----Original Message-----
>> From: Dmytro Maluka <[email protected]>
>> Sent: Saturday, July 30, 2022 7:35 AM
>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
>> <[email protected]>; Paolo Bonzini <[email protected]>;
>> [email protected]
>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>> <[email protected]>; [email protected]; H. Peter Anvin
>> <[email protected]>; [email protected]; Eric Auger
>> <[email protected]>; Alex Williamson
>> <[email protected]>; Zhenyu Wang
>> <[email protected]>; Tomasz Nowicki <[email protected]>;
>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
>> <[email protected]>
>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>> oneshot interrupts
>>
>> On 7/29/22 10:48 PM, Liu, Rong L wrote:
>>> Hi Dmytro,
>>>
>>>> -----Original Message-----
>>>> From: Dmytro Maluka <[email protected]>
>>>> Sent: Tuesday, July 26, 2022 7:08 AM
>>>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
>>>> <[email protected]>; Paolo Bonzini <[email protected]>;
>>>> [email protected]
>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>>>> <[email protected]>; [email protected]; H. Peter Anvin
>>>> <[email protected]>; [email protected]; Eric Auger
>>>> <[email protected]>; Alex Williamson
>>>> <[email protected]>; Zhenyu Wang
>>>> <[email protected]>; Tomasz Nowicki <[email protected]>;
>>>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
>>>> <[email protected]>
>>>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>>>> oneshot interrupts
>>>>
>>>> Hi Rong,
>>>>
>>>> On 7/26/22 01:44, Liu, Rong L wrote:
>>>>> Hi Dmytro,
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Dmytro Maluka <[email protected]>
>>>>>> Sent: Friday, July 15, 2022 8:59 AM
>>>>>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
>>>>>> <[email protected]>; [email protected]
>>>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>>>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
>> Hansen
>>>>>> <[email protected]>; [email protected]; H. Peter Anvin
>>>>>> <[email protected]>; [email protected]; Eric Auger
>>>>>> <[email protected]>; Alex Williamson
>>>>>> <[email protected]>; Liu, Rong L <[email protected]>;
>>>>>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
>>>>>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>;
>> Dmitry
>>>>>> Torokhov <[email protected]>; Dmytro Maluka
>> <[email protected]>
>>>>>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>>>> oneshot
>>>>>> interrupts
>>>>>>
>>>>>> The existing KVM mechanism for forwarding of level-triggered
>>>> interrupts
>>>>>> using resample eventfd doesn't work quite correctly in the case of
>>>>>> interrupts that are handled in a Linux guest as oneshot interrupts
>>>>>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
>>>>>> threaded irq handler, i.e. later than it is acked to the interrupt
>>>>>> controller (EOI at the end of hardirq), not earlier.
>>>>>>
>>>>>> Linux keeps such interrupt masked until its threaded handler
>> finishes,
>>>>>> to prevent the EOI from re-asserting an unacknowledged interrupt.
>>>>>> However, with KVM + vfio (or whatever is listening on the
>> resamplefd)
>>>>>> we don't check that the interrupt is still masked in the guest at the
>>>>>> moment of EOI. Resamplefd is notified regardless, so vfio
>> prematurely
>>>>>> unmasks the host physical IRQ, thus a new (unwanted) physical
>>>> interrupt
>>>>>> is generated in the host and queued for injection to the guest.
>>>>>>
>>>>>> The fact that the virtual IRQ is still masked doesn't prevent this new
>>>>>> physical IRQ from being propagated to the guest, because:
>>>>>>
>>>>>> 1. It is not guaranteed that the vIRQ will remain masked by the time
>>>>>> when vfio signals the trigger eventfd.
>>>>>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
>>>>>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked, this
>>>>>> new pending interrupt is injected by KVM to the guest anyway.
>>>>>>
>>>>>> There are observed at least 2 user-visible issues caused by those
>>>>>> extra erroneous pending interrupts for oneshot irq in the guest:
>>>>>>
>>>>>> 1. System suspend aborted due to a pending wakeup interrupt from
>>>>>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
>>>>>> 2. Annoying "invalid report id data" errors from ELAN0000
>> touchpad
>>>>>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest dmesg
>>>>>> every time the touchpad is touched.
>>>>>>
>>>>>> This patch fixes the issue on x86 by checking if the interrupt is
>>>>>> unmasked when we receive irq ack (EOI) and, in case if it's masked,
>>>>>> postponing resamplefd notify until the guest unmasks it.
>>>>>>
>>>>>> Important notes:
>>>>>>
>>>>>> 1. It doesn't fix the issue for other archs yet, due to some missing
>>>>>> KVM functionality needed by this patch:
>>>>>> - calling mask notifiers is implemented for x86 only
>>>>>> - irqchip ->is_masked() is implemented for x86 only
>>>>>>
>>>>>> 2. It introduces an additional spinlock locking in the resample notify
>>>>>> path, since we are no longer just traversing an RCU list of irqfds
>>>>>> but also updating the resampler state. Hopefully this locking won't
>>>>>> noticeably slow down anything for anyone.
>>>>>>
>>>>>
>>>>> Instead of using a spinlock waiting for the unmask event, is it
>> possible
>>>> to call
>>>>> resampler notify directly when unmask event happens, instead of
>>>> calling it on
>>>>> EOI?
>>>>
>>>> In this patch, resampler notify is already called directly when unmask
>>>> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
>>>> writing to IOREDTBLx register, ioapic_write_indirect() calls
>>>> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask() which
>>>> notifies the resampler. On EOI we postpone it just by setting
>>>> resampler->pending to true, not by waiting. The spinlock is needed
>>>> merely to synchronize reading & updating resampler->pending and
>>>> resampler->masked values between possibly concurrently running
>>>> instances
>>>> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
>>>>
>>>> Thanks,
>>>> Dmytro
>>>>
>>>
>>> I mean the organization of the code. In current implementation,
>>> kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your
>> patch, why not
>>> call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at
>> the same
>>> place where kvm_fire_mask_notifiers is called), instead of calling it
>> from
>>> kvm_ioapic_update_eoi_one, since what your intention here is to
>> notify
>>> vfio of the end of interrupt at the event of ioapic unmask, instead of
>>> EOI?
>>
>> Ah ok, got your point.
>>
>> That was my initial approach in my PoC patch posted in [1]. But then I
>> dropped it, for 2 reasons:
>>
>> 1. Upon feedback from Sean I realized that kvm_notify_acked_irq() is
>> also doing some other important things besides notifying vfio. In
>> particular, in irqfd_resampler_ack() we also de-assert the vIRQ via
>> kvm_set_irq(). In case of IOAPIC it means clearing its bit in IRR
>> register. If we delay that until unmasking, it means that we change
>> the way how KVM emulates the interrupt controller. That would seem
>> inconsistent.
>>
>
> Thanks for clarification. I totally agree that it is important to keep the way
> how KVM emulates the interrupt controller.
>
>> Also kvm_notify_acked_irq() notifies the emulated PIT timer via
>> kvm_pit_ack_irq(). I haven't analyzed how exactly that PIT stuff
>> works, so I'm not sure if delaying that until unmask wouldn't cause
>> any unwanted effects.
>>
>> So the idea is to postpone eventfd_signal() only, to fix interaction
>> with vfio while keeping the rest of the KVM behavior intact. Because
>> the KVM job is to emulate the interrupt controller (which it already
>> does correctly), not the external device which is the job of vfio*.
>>
>
> I made a mistake in my last post. I mean just to delay the notification of
> vfio, but keep the rest of the code as intact as possible.
Hmm, thanks, until now I wasn't thinking about another possibility: call
delayed eventfd_signal() directly from ioapic_write_indirect().
However, now thinking about that, I don't really see advantages. We wouldn't
need to use mask notifiers then, but we would still need the same logic and
synchronization in irqfd_resampler_ack() for checking whether we should delay
the notification. Moreover, I'm not sure how would I then address the
resampler->masked initialization race issue, which I addressed in v2 by
implementing kvm_register_and_fire_irq_mask_notifier() in [1].
[1] https://lore.kernel.org/lkml/[email protected]/
>
>> 2. kvm_notify_acked_irq() can't be called under ioapic->lock, so in [1]
>> I was unlocking ioapic->lock in ioapic_write_indirect() with a naive
>> assumption that it was as safe as doing it in
>> kvm_ioapic_update_eoi_one(). That was probably racy, and I hadn't
>> figured out how to rework it in a race-free way.
>>
>> [1] https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>> [email protected]/
>>
>> [*] By "vfio" I always mean "vfio or any other resamplefd user".
>>
>> Thanks,
>> Dmytro
>>
>>>
>>>>>
>>>>>> Regarding #2, there may be an alternative solution worth
>> considering:
>>>>>> extend KVM irqfd (userspace) API to send mask and unmask
>>>> notifications
>>>>>> directly to vfio/whatever, in addition to resample notifications, to
>>>>>> let vfio check the irq state on its own. There is already locking on
>>>>>> vfio side (see e.g. vfio_platform_unmask()), so this way we would
>>>> avoid
>>>>>> introducing any additional locking. Also such mask/unmask
>>>> notifications
>>>>>> could be useful for other cases.
>>>>>>
>>>>>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>>>>>> [email protected]/
>>>>>> Suggested-by: Sean Christopherson <[email protected]>
>>>>>> Signed-off-by: Dmytro Maluka <[email protected]>
>>>>>> ---
>>>>>> include/linux/kvm_irqfd.h | 14 ++++++++++++
>>>>>> virt/kvm/eventfd.c | 45
>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>> 2 files changed, 59 insertions(+)
>>>>>>
>>>>>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
>>>>>> index dac047abdba7..01754a1abb9e 100644
>>>>>> --- a/include/linux/kvm_irqfd.h
>>>>>> +++ b/include/linux/kvm_irqfd.h
>>>>>> @@ -19,6 +19,16 @@
>>>>>> * resamplefd. All resamplers on the same gsi are de-asserted
>>>>>> * together, so we don't need to track the state of each individual
>>>>>> * user. We can also therefore share the same irq source ID.
>>>>>> + *
>>>>>> + * A special case is when the interrupt is still masked at the
>> moment
>>>>>> + * an irq ack is received. That likely means that the interrupt has
>>>>>> + * been acknowledged to the interrupt controller but not
>>>> acknowledged
>>>>>> + * to the device yet, e.g. it might be a Linux guest's threaded
>>>>>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying
>> through
>>>>>> + * resamplefd is postponed until the guest unmasks the interrupt,
>>>>>> + * which is detected through the irq mask notifier. This prevents
>>>>>> + * erroneous extra interrupts caused by premature re-assert of an
>>>>>> + * unacknowledged interrupt by the resamplefd listener.
>>>>>> */
>>>>>> struct kvm_kernel_irqfd_resampler {
>>>>>> struct kvm *kvm;
>>>>>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
>>>>>> */
>>>>>> struct list_head list;
>>>>>> struct kvm_irq_ack_notifier notifier;
>>>>>> + struct kvm_irq_mask_notifier mask_notifier;
>>>>>> + bool masked;
>>>>>> + bool pending;
>>>>>> + spinlock_t lock;
>>>>>> /*
>>>>>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
>>>>>> * resamplers among irqfds on the same gsi.
>>>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>>>> index 50ddb1d1a7f0..9ff47ac33790 100644
>>>>>> --- a/virt/kvm/eventfd.c
>>>>>> +++ b/virt/kvm/eventfd.c
>>>>>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
>>>> kvm_irq_ack_notifier
>>>>>> *kian)
>>>>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>>>>>> resampler->notifier.gsi, 0, false);
>>>>>>
>>>>>> + spin_lock(&resampler->lock);
>>>>>> + if (resampler->masked) {
>>>>>> + resampler->pending = true;
>>>>>> + spin_unlock(&resampler->lock);
>>>>>> + return;
>>>>>> + }
>>>>>> + spin_unlock(&resampler->lock);
>>>>>> +
>>>>>> + idx = srcu_read_lock(&kvm->irq_srcu);
>>>>>> +
>>>>>> + list_for_each_entry_srcu(irqfd, &resampler->list,
>> resampler_link,
>>>>>> + srcu_read_lock_held(&kvm->irq_srcu))
>>>>>> + eventfd_signal(irqfd->resamplefd, 1);
>>>>>> +
>>>>>> + srcu_read_unlock(&kvm->irq_srcu, idx);
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn, bool
>>>>>> masked)
>>>>>> +{
>>>>>> + struct kvm_kernel_irqfd_resampler *resampler;
>>>>>> + struct kvm *kvm;
>>>>>> + struct kvm_kernel_irqfd *irqfd;
>>>>>> + int idx;
>>>>>> +
>>>>>> + resampler = container_of(kimn,
>>>>>> + struct kvm_kernel_irqfd_resampler,
>> mask_notifier);
>>>>>> + kvm = resampler->kvm;
>>>>>> +
>>>>>> + spin_lock(&resampler->lock);
>>>>>> + resampler->masked = masked;
>>>>>> + if (masked || !resampler->pending) {
>>>>>> + spin_unlock(&resampler->lock);
>>>>>> + return;
>>>>>> + }
>>>>>> + resampler->pending = false;
>>>>>> + spin_unlock(&resampler->lock);
>>>>>> +
>>>>>> idx = srcu_read_lock(&kvm->irq_srcu);
>>>>>>
>>>>>> list_for_each_entry_srcu(irqfd, &resampler->list,
>> resampler_link,
>>>>>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
>>>>>> kvm_kernel_irqfd *irqfd)
>>>>>> if (list_empty(&resampler->list)) {
>>>>>> list_del(&resampler->link);
>>>>>> kvm_unregister_irq_ack_notifier(kvm, &resampler-
>>> notifier);
>>>>>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
>>>>>>> mask_notifier.irq,
>>>>>> + &resampler->mask_notifier);
>>>>>> kvm_set_irq(kvm,
>> KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>>>>>> resampler->notifier.gsi, 0, false);
>>>>>> kfree(resampler);
>>>>>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm,
>> struct
>>>>>> kvm_irqfd *args)
>>>>>> INIT_LIST_HEAD(&resampler->list);
>>>>>> resampler->notifier.gsi = irqfd->gsi;
>>>>>> resampler->notifier.irq_acked =
>> irqfd_resampler_ack;
>>>>>> + resampler->mask_notifier.func =
>> irqfd_resampler_mask;
>>>>>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
>>>>>>> masked);
>>>>>> + spin_lock_init(&resampler->lock);
>>>>>> INIT_LIST_HEAD(&resampler->link);
>>>>>>
>>>>>> list_add(&resampler->link, &kvm-
>>> irqfds.resampler_list);
>>>>>> kvm_register_irq_ack_notifier(kvm,
>>>>>> &resampler->notifier);
>>>>>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
>>>>>> + &resampler->mask_notifier);
>>>>>> irqfd->resampler = resampler;
>>>>>> }
>>>>>>
>>>>>> --
>>>>>> 2.37.0.170.g444d1eabd0-goog
>>>>>
> -----Original Message-----
> From: Liu, Rong L
> Sent: Tuesday, August 9, 2022 3:02 PM
> To: Dmytro Maluka <[email protected]>; Christopherson,, Sean
> <[email protected]>; Paolo Bonzini <[email protected]>;
> [email protected]
> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> <[email protected]>; [email protected]; H. Peter Anvin
> <[email protected]>; [email protected]; Eric Auger
> <[email protected]>; Alex Williamson
> <[email protected]>; Zhenyu Wang
> <[email protected]>; Tomasz Nowicki <[email protected]>;
> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> <[email protected]>
> Subject: RE: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> oneshot interrupts
>
> Hi Dmytro,
>
> > -----Original Message-----
> > From: Dmytro Maluka <[email protected]>
> > Sent: Saturday, July 30, 2022 7:35 AM
> > To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> > <[email protected]>; Paolo Bonzini <[email protected]>;
> > [email protected]
> > Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> > <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> > <[email protected]>; [email protected]; H. Peter Anvin
> > <[email protected]>; [email protected]; Eric Auger
> > <[email protected]>; Alex Williamson
> > <[email protected]>; Zhenyu Wang
> > <[email protected]>; Tomasz Nowicki <[email protected]>;
> > Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> > <[email protected]>
> > Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> > oneshot interrupts
> >
> > On 7/29/22 10:48 PM, Liu, Rong L wrote:
> > > Hi Dmytro,
> > >
> > >> -----Original Message-----
> > >> From: Dmytro Maluka <[email protected]>
> > >> Sent: Tuesday, July 26, 2022 7:08 AM
> > >> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> > >> <[email protected]>; Paolo Bonzini <[email protected]>;
> > >> [email protected]
> > >> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> > >> <[email protected]>; Borislav Petkov <[email protected]>; Dave
> Hansen
> > >> <[email protected]>; [email protected]; H. Peter Anvin
> > >> <[email protected]>; [email protected]; Eric Auger
> > >> <[email protected]>; Alex Williamson
> > >> <[email protected]>; Zhenyu Wang
> > >> <[email protected]>; Tomasz Nowicki <[email protected]>;
> > >> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> > >> <[email protected]>
> > >> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> > >> oneshot interrupts
> > >>
> > >> Hi Rong,
> > >>
> > >> On 7/26/22 01:44, Liu, Rong L wrote:
> > >>> Hi Dmytro,
> > >>>
> > >>>> -----Original Message-----
> > >>>> From: Dmytro Maluka <[email protected]>
> > >>>> Sent: Friday, July 15, 2022 8:59 AM
> > >>>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
> > >>>> <[email protected]>; [email protected]
> > >>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> > >>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
> > Hansen
> > >>>> <[email protected]>; [email protected]; H. Peter Anvin
> > >>>> <[email protected]>; [email protected]; Eric Auger
> > >>>> <[email protected]>; Alex Williamson
> > >>>> <[email protected]>; Liu, Rong L
> <[email protected]>;
> > >>>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
> > >>>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>;
> > Dmitry
> > >>>> Torokhov <[email protected]>; Dmytro Maluka
> > <[email protected]>
> > >>>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> > >> oneshot
> > >>>> interrupts
> > >>>>
> > >>>> The existing KVM mechanism for forwarding of level-triggered
> > >> interrupts
> > >>>> using resample eventfd doesn't work quite correctly in the case of
> > >>>> interrupts that are handled in a Linux guest as oneshot interrupts
> > >>>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> > >>>> threaded irq handler, i.e. later than it is acked to the interrupt
> > >>>> controller (EOI at the end of hardirq), not earlier.
> > >>>>
> > >>>> Linux keeps such interrupt masked until its threaded handler
> > finishes,
> > >>>> to prevent the EOI from re-asserting an unacknowledged interrupt.
> > >>>> However, with KVM + vfio (or whatever is listening on the
> > resamplefd)
> > >>>> we don't check that the interrupt is still masked in the guest at the
> > >>>> moment of EOI. Resamplefd is notified regardless, so vfio
> > prematurely
> > >>>> unmasks the host physical IRQ, thus a new (unwanted) physical
> > >> interrupt
> > >>>> is generated in the host and queued for injection to the guest.
> > >>>>
> > >>>> The fact that the virtual IRQ is still masked doesn't prevent this
> new
> > >>>> physical IRQ from being propagated to the guest, because:
> > >>>>
> > >>>> 1. It is not guaranteed that the vIRQ will remain masked by the
> time
> > >>>> when vfio signals the trigger eventfd.
> > >>>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> > >>>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked,
> this
> > >>>> new pending interrupt is injected by KVM to the guest anyway.
> > >>>>
> > >>>> There are observed at least 2 user-visible issues caused by those
> > >>>> extra erroneous pending interrupts for oneshot irq in the guest:
> > >>>>
> > >>>> 1. System suspend aborted due to a pending wakeup interrupt
> from
> > >>>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> > >>>> 2. Annoying "invalid report id data" errors from ELAN0000
> > touchpad
> > >>>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest
> dmesg
> > >>>> every time the touchpad is touched.
> > >>>>
> > >>>> This patch fixes the issue on x86 by checking if the interrupt is
> > >>>> unmasked when we receive irq ack (EOI) and, in case if it's masked,
> > >>>> postponing resamplefd notify until the guest unmasks it.
> > >>>>
> > >>>> Important notes:
> > >>>>
> > >>>> 1. It doesn't fix the issue for other archs yet, due to some missing
> > >>>> KVM functionality needed by this patch:
> > >>>> - calling mask notifiers is implemented for x86 only
> > >>>> - irqchip ->is_masked() is implemented for x86 only
> > >>>>
> > >>>> 2. It introduces an additional spinlock locking in the resample
> notify
> > >>>> path, since we are no longer just traversing an RCU list of irqfds
> > >>>> but also updating the resampler state. Hopefully this locking
> won't
> > >>>> noticeably slow down anything for anyone.
> > >>>>
> > >>>
> > >>> Instead of using a spinlock waiting for the unmask event, is it
> > possible
> > >> to call
> > >>> resampler notify directly when unmask event happens, instead of
> > >> calling it on
> > >>> EOI?
> > >>
> > >> In this patch, resampler notify is already called directly when
> unmask
> > >> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
> > >> writing to IOREDTBLx register, ioapic_write_indirect() calls
> > >> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask()
> which
> > >> notifies the resampler. On EOI we postpone it just by setting
> > >> resampler->pending to true, not by waiting. The spinlock is needed
> > >> merely to synchronize reading & updating resampler->pending and
> > >> resampler->masked values between possibly concurrently running
> > >> instances
> > >> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
> > >>
> > >> Thanks,
> > >> Dmytro
> > >>
> > >
> > > I mean the organization of the code. In current implementation,
> > > kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your
> > patch, why not
> > > call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at
> > the same
> > > place where kvm_fire_mask_notifiers is called), instead of calling it
> > from
> > > kvm_ioapic_update_eoi_one, since what your intention here is to
> > notify
> > > vfio of the end of interrupt at the event of ioapic unmask, instead of
> > > EOI?
> >
> > Ah ok, got your point.
> >
> > That was my initial approach in my PoC patch posted in [1]. But then I
> > dropped it, for 2 reasons:
> >
> > 1. Upon feedback from Sean I realized that kvm_notify_acked_irq() is
> > also doing some other important things besides notifying vfio. In
> > particular, in irqfd_resampler_ack() we also de-assert the vIRQ via
> > kvm_set_irq(). In case of IOAPIC it means clearing its bit in IRR
> > register. If we delay that until unmasking, it means that we change
> > the way how KVM emulates the interrupt controller. That would seem
> > inconsistent.
> >
>
> Thanks for clarification. I totally agree that it is important to keep the
> way
> how KVM emulates the interrupt controller.
>
> > Also kvm_notify_acked_irq() notifies the emulated PIT timer via
> > kvm_pit_ack_irq(). I haven't analyzed how exactly that PIT stuff
> > works, so I'm not sure if delaying that until unmask wouldn't cause
> > any unwanted effects.
> >
> > So the idea is to postpone eventfd_signal() only, to fix interaction
> > with vfio while keeping the rest of the KVM behavior intact. Because
> > the KVM job is to emulate the interrupt controller (which it already
> > does correctly), not the external device which is the job of vfio*.
> >
>
> I made a mistake in my last post. I mean just to delay the notification of
> vfio, but keep the rest of the code as intact as possible.
>
I took a closer look at the code and now I got what you mean. I didn't realize
irq_set.set() is actually calls ioapic_set_irq (in ioapic case)
> > 2. kvm_notify_acked_irq() can't be called under ioapic->lock, so in [1]
> > I was unlocking ioapic->lock in ioapic_write_indirect() with a naive
> > assumption that it was as safe as doing it in
> > kvm_ioapic_update_eoi_one(). That was probably racy, and I hadn't
> > figured out how to rework it in a race-free way.
> >
> > [1] https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> > [email protected]/
> >
> > [*] By "vfio" I always mean "vfio or any other resamplefd user".
> >
> > Thanks,
> > Dmytro
> >
> > >
> > >>>
> > >>>> Regarding #2, there may be an alternative solution worth
> > considering:
> > >>>> extend KVM irqfd (userspace) API to send mask and unmask
> > >> notifications
> > >>>> directly to vfio/whatever, in addition to resample notifications, to
> > >>>> let vfio check the irq state on its own. There is already locking on
> > >>>> vfio side (see e.g. vfio_platform_unmask()), so this way we would
> > >> avoid
> > >>>> introducing any additional locking. Also such mask/unmask
> > >> notifications
> > >>>> could be useful for other cases.
> > >>>>
> > >>>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> > >>>> [email protected]/
> > >>>> Suggested-by: Sean Christopherson <[email protected]>
> > >>>> Signed-off-by: Dmytro Maluka <[email protected]>
> > >>>> ---
> > >>>> include/linux/kvm_irqfd.h | 14 ++++++++++++
> > >>>> virt/kvm/eventfd.c | 45
> > >>>> +++++++++++++++++++++++++++++++++++++++
> > >>>> 2 files changed, 59 insertions(+)
> > >>>>
> > >>>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> > >>>> index dac047abdba7..01754a1abb9e 100644
> > >>>> --- a/include/linux/kvm_irqfd.h
> > >>>> +++ b/include/linux/kvm_irqfd.h
> > >>>> @@ -19,6 +19,16 @@
> > >>>> * resamplefd. All resamplers on the same gsi are de-asserted
> > >>>> * together, so we don't need to track the state of each individual
> > >>>> * user. We can also therefore share the same irq source ID.
> > >>>> + *
> > >>>> + * A special case is when the interrupt is still masked at the
> > moment
> > >>>> + * an irq ack is received. That likely means that the interrupt has
> > >>>> + * been acknowledged to the interrupt controller but not
> > >> acknowledged
> > >>>> + * to the device yet, e.g. it might be a Linux guest's threaded
> > >>>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying
> > through
> > >>>> + * resamplefd is postponed until the guest unmasks the interrupt,
> > >>>> + * which is detected through the irq mask notifier. This prevents
> > >>>> + * erroneous extra interrupts caused by premature re-assert of
> an
> > >>>> + * unacknowledged interrupt by the resamplefd listener.
> > >>>> */
> > >>>> struct kvm_kernel_irqfd_resampler {
> > >>>> struct kvm *kvm;
> > >>>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> > >>>> */
> > >>>> struct list_head list;
> > >>>> struct kvm_irq_ack_notifier notifier;
> > >>>> + struct kvm_irq_mask_notifier mask_notifier;
> > >>>> + bool masked;
> > >>>> + bool pending;
> > >>>> + spinlock_t lock;
> > >>>> /*
> > >>>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> > >>>> * resamplers among irqfds on the same gsi.
> > >>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > >>>> index 50ddb1d1a7f0..9ff47ac33790 100644
> > >>>> --- a/virt/kvm/eventfd.c
> > >>>> +++ b/virt/kvm/eventfd.c
> > >>>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
> > >> kvm_irq_ack_notifier
> > >>>> *kian)
> > >>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> > >>>> resampler->notifier.gsi, 0, false);
> > >>>>
> > >>>> + spin_lock(&resampler->lock);
> > >>>> + if (resampler->masked) {
> > >>>> + resampler->pending = true;
> > >>>> + spin_unlock(&resampler->lock);
> > >>>> + return;
> > >>>> + }
> > >>>> + spin_unlock(&resampler->lock);
> > >>>> +
> > >>>> + idx = srcu_read_lock(&kvm->irq_srcu);
> > >>>> +
> > >>>> + list_for_each_entry_srcu(irqfd, &resampler->list,
> > resampler_link,
> > >>>> + srcu_read_lock_held(&kvm->irq_srcu))
> > >>>> + eventfd_signal(irqfd->resamplefd, 1);
> > >>>> +
> > >>>> + srcu_read_unlock(&kvm->irq_srcu, idx);
> > >>>> +}
> > >>>> +
> > >>>> +static void
> > >>>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn,
> bool
> > >>>> masked)
> > >>>> +{
> > >>>> + struct kvm_kernel_irqfd_resampler *resampler;
> > >>>> + struct kvm *kvm;
> > >>>> + struct kvm_kernel_irqfd *irqfd;
> > >>>> + int idx;
> > >>>> +
> > >>>> + resampler = container_of(kimn,
> > >>>> + struct kvm_kernel_irqfd_resampler,
> > mask_notifier);
> > >>>> + kvm = resampler->kvm;
> > >>>> +
> > >>>> + spin_lock(&resampler->lock);
> > >>>> + resampler->masked = masked;
> > >>>> + if (masked || !resampler->pending) {
> > >>>> + spin_unlock(&resampler->lock);
> > >>>> + return;
> > >>>> + }
> > >>>> + resampler->pending = false;
> > >>>> + spin_unlock(&resampler->lock);
> > >>>> +
> > >>>> idx = srcu_read_lock(&kvm->irq_srcu);
> > >>>>
> > >>>> list_for_each_entry_srcu(irqfd, &resampler->list,
> > resampler_link,
> > >>>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
> > >>>> kvm_kernel_irqfd *irqfd)
> > >>>> if (list_empty(&resampler->list)) {
> > >>>> list_del(&resampler->link);
> > >>>> kvm_unregister_irq_ack_notifier(kvm, &resampler-
> > >notifier);
> > >>>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
> > >>>>> mask_notifier.irq,
> > >>>> + &resampler->mask_notifier);
> > >>>> kvm_set_irq(kvm,
> > KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> > >>>> resampler->notifier.gsi, 0, false);
> > >>>> kfree(resampler);
> > >>>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm,
> > struct
> > >>>> kvm_irqfd *args)
> > >>>> INIT_LIST_HEAD(&resampler->list);
> > >>>> resampler->notifier.gsi = irqfd->gsi;
> > >>>> resampler->notifier.irq_acked =
> > irqfd_resampler_ack;
> > >>>> + resampler->mask_notifier.func =
> > irqfd_resampler_mask;
> > >>>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
> > >>>>> masked);
> > >>>> + spin_lock_init(&resampler->lock);
> > >>>> INIT_LIST_HEAD(&resampler->link);
> > >>>>
> > >>>> list_add(&resampler->link, &kvm-
> > >irqfds.resampler_list);
> > >>>> kvm_register_irq_ack_notifier(kvm,
> > >>>> &resampler->notifier);
> > >>>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> > >>>> + &resampler->mask_notifier);
> > >>>> irqfd->resampler = resampler;
> > >>>> }
> > >>>>
> > >>>> --
> > >>>> 2.37.0.170.g444d1eabd0-goog
> > >>>
Hi Dmytro,
> -----Original Message-----
> From: Dmytro Maluka <[email protected]>
> Sent: Tuesday, August 9, 2022 5:57 PM
> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> <[email protected]>; Paolo Bonzini <[email protected]>;
> [email protected]
> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> <[email protected]>; [email protected]; H. Peter Anvin
> <[email protected]>; [email protected]; Eric Auger
> <[email protected]>; Alex Williamson
> <[email protected]>; Zhenyu Wang
> <[email protected]>; Tomasz Nowicki <[email protected]>;
> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> <[email protected]>
> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> oneshot interrupts
>
> Hi Rong,
>
> On 8/10/22 12:02 AM, Liu, Rong L wrote:
> > Hi Dmytro,
> >
> >> -----Original Message-----
> >> From: Dmytro Maluka <[email protected]>
> >> Sent: Saturday, July 30, 2022 7:35 AM
> >> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> >> <[email protected]>; Paolo Bonzini <[email protected]>;
> >> [email protected]
> >> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> >> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
> >> <[email protected]>; [email protected]; H. Peter Anvin
> >> <[email protected]>; [email protected]; Eric Auger
> >> <[email protected]>; Alex Williamson
> >> <[email protected]>; Zhenyu Wang
> >> <[email protected]>; Tomasz Nowicki <[email protected]>;
> >> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> >> <[email protected]>
> >> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> >> oneshot interrupts
> >>
> >> On 7/29/22 10:48 PM, Liu, Rong L wrote:
> >>> Hi Dmytro,
> >>>
> >>>> -----Original Message-----
> >>>> From: Dmytro Maluka <[email protected]>
> >>>> Sent: Tuesday, July 26, 2022 7:08 AM
> >>>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
> >>>> <[email protected]>; Paolo Bonzini <[email protected]>;
> >>>> [email protected]
> >>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> >>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
> Hansen
> >>>> <[email protected]>; [email protected]; H. Peter Anvin
> >>>> <[email protected]>; [email protected]; Eric Auger
> >>>> <[email protected]>; Alex Williamson
> >>>> <[email protected]>; Zhenyu Wang
> >>>> <[email protected]>; Tomasz Nowicki <[email protected]>;
> >>>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
> >>>> <[email protected]>
> >>>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify
> for
> >>>> oneshot interrupts
> >>>>
> >>>> Hi Rong,
> >>>>
> >>>> On 7/26/22 01:44, Liu, Rong L wrote:
> >>>>> Hi Dmytro,
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: Dmytro Maluka <[email protected]>
> >>>>>> Sent: Friday, July 15, 2022 8:59 AM
> >>>>>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
> >>>>>> <[email protected]>; [email protected]
> >>>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
> >>>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
> >> Hansen
> >>>>>> <[email protected]>; [email protected]; H. Peter Anvin
> >>>>>> <[email protected]>; [email protected]; Eric Auger
> >>>>>> <[email protected]>; Alex Williamson
> >>>>>> <[email protected]>; Liu, Rong L
> <[email protected]>;
> >>>>>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
> >>>>>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>;
> >> Dmitry
> >>>>>> Torokhov <[email protected]>; Dmytro Maluka
> >> <[email protected]>
> >>>>>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
> >>>> oneshot
> >>>>>> interrupts
> >>>>>>
> >>>>>> The existing KVM mechanism for forwarding of level-triggered
> >>>> interrupts
> >>>>>> using resample eventfd doesn't work quite correctly in the case
> of
> >>>>>> interrupts that are handled in a Linux guest as oneshot interrupts
> >>>>>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
> >>>>>> threaded irq handler, i.e. later than it is acked to the interrupt
> >>>>>> controller (EOI at the end of hardirq), not earlier.
> >>>>>>
> >>>>>> Linux keeps such interrupt masked until its threaded handler
> >> finishes,
> >>>>>> to prevent the EOI from re-asserting an unacknowledged
> interrupt.
> >>>>>> However, with KVM + vfio (or whatever is listening on the
> >> resamplefd)
> >>>>>> we don't check that the interrupt is still masked in the guest at
> the
> >>>>>> moment of EOI. Resamplefd is notified regardless, so vfio
> >> prematurely
> >>>>>> unmasks the host physical IRQ, thus a new (unwanted) physical
> >>>> interrupt
> >>>>>> is generated in the host and queued for injection to the guest.
> >>>>>>
> >>>>>> The fact that the virtual IRQ is still masked doesn't prevent this
> new
> >>>>>> physical IRQ from being propagated to the guest, because:
> >>>>>>
> >>>>>> 1. It is not guaranteed that the vIRQ will remain masked by the
> time
> >>>>>> when vfio signals the trigger eventfd.
> >>>>>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
> >>>>>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked,
> this
> >>>>>> new pending interrupt is injected by KVM to the guest anyway.
> >>>>>>
> >>>>>> There are observed at least 2 user-visible issues caused by those
> >>>>>> extra erroneous pending interrupts for oneshot irq in the guest:
> >>>>>>
> >>>>>> 1. System suspend aborted due to a pending wakeup interrupt
> from
> >>>>>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
> >>>>>> 2. Annoying "invalid report id data" errors from ELAN0000
> >> touchpad
> >>>>>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest
> dmesg
> >>>>>> every time the touchpad is touched.
> >>>>>>
> >>>>>> This patch fixes the issue on x86 by checking if the interrupt is
> >>>>>> unmasked when we receive irq ack (EOI) and, in case if it's
> masked,
> >>>>>> postponing resamplefd notify until the guest unmasks it.
> >>>>>>
> >>>>>> Important notes:
> >>>>>>
> >>>>>> 1. It doesn't fix the issue for other archs yet, due to some missing
> >>>>>> KVM functionality needed by this patch:
> >>>>>> - calling mask notifiers is implemented for x86 only
> >>>>>> - irqchip ->is_masked() is implemented for x86 only
> >>>>>>
> >>>>>> 2. It introduces an additional spinlock locking in the resample
> notify
> >>>>>> path, since we are no longer just traversing an RCU list of irqfds
> >>>>>> but also updating the resampler state. Hopefully this locking
> won't
> >>>>>> noticeably slow down anything for anyone.
> >>>>>>
> >>>>>
> >>>>> Instead of using a spinlock waiting for the unmask event, is it
> >> possible
> >>>> to call
> >>>>> resampler notify directly when unmask event happens, instead of
> >>>> calling it on
> >>>>> EOI?
> >>>>
> >>>> In this patch, resampler notify is already called directly when
> unmask
> >>>> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
> >>>> writing to IOREDTBLx register, ioapic_write_indirect() calls
> >>>> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask()
> which
> >>>> notifies the resampler. On EOI we postpone it just by setting
> >>>> resampler->pending to true, not by waiting. The spinlock is needed
> >>>> merely to synchronize reading & updating resampler->pending and
> >>>> resampler->masked values between possibly concurrently running
> >>>> instances
> >>>> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
> >>>>
> >>>> Thanks,
> >>>> Dmytro
> >>>>
> >>>
> >>> I mean the organization of the code. In current implementation,
> >>> kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your
> >> patch, why not
> >>> call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at
> >> the same
> >>> place where kvm_fire_mask_notifiers is called), instead of calling it
> >> from
> >>> kvm_ioapic_update_eoi_one, since what your intention here is to
> >> notify
> >>> vfio of the end of interrupt at the event of ioapic unmask, instead of
> >>> EOI?
> >>
> >> Ah ok, got your point.
> >>
> >> That was my initial approach in my PoC patch posted in [1]. But then I
> >> dropped it, for 2 reasons:
> >>
> >> 1. Upon feedback from Sean I realized that kvm_notify_acked_irq() is
> >> also doing some other important things besides notifying vfio. In
> >> particular, in irqfd_resampler_ack() we also de-assert the vIRQ via
> >> kvm_set_irq(). In case of IOAPIC it means clearing its bit in IRR
> >> register. If we delay that until unmasking, it means that we change
> >> the way how KVM emulates the interrupt controller. That would
> seem
> >> inconsistent.
> >>
> >
> > Thanks for clarification. I totally agree that it is important to keep the
> way
> > how KVM emulates the interrupt controller.
> >
> >> Also kvm_notify_acked_irq() notifies the emulated PIT timer via
> >> kvm_pit_ack_irq(). I haven't analyzed how exactly that PIT stuff
> >> works, so I'm not sure if delaying that until unmask wouldn't cause
> >> any unwanted effects.
> >>
> >> So the idea is to postpone eventfd_signal() only, to fix interaction
> >> with vfio while keeping the rest of the KVM behavior intact. Because
> >> the KVM job is to emulate the interrupt controller (which it already
> >> does correctly), not the external device which is the job of vfio*.
> >>
> >
> > I made a mistake in my last post. I mean just to delay the notification
> of
> > vfio, but keep the rest of the code as intact as possible.
>
> Hmm, thanks, until now I wasn't thinking about another possibility: call
> delayed eventfd_signal() directly from ioapic_write_indirect().
>
> However, now thinking about that, I don't really see advantages. We
> wouldn't
> need to use mask notifiers then, but we would still need the same logic
> and
> synchronization in irqfd_resampler_ack() for checking whether we
> should delay
> the notification. Moreover, I'm not sure how would I then address the
> resampler->masked initialization race issue, which I addressed in v2 by
> implementing kvm_register_and_fire_irq_mask_notifier() in [1].
>
> [1] https://lore.kernel.org/lkml/20220805193919.1470653-3-
> [email protected]/
>
I haven't finished reading your v2 patch, especially regarding the racing
conditions. However, I think there are some advantages to let irqchip call the
eventfd_signal, one thing is clearer code organization. The other is that it
seems it is necessary to differentiate the handling among different irqchips,
the goal here is to keep logic around other types of irqchips intact to reduce
code turmoil and fix ioapic? It looks like every irqchip has its own "set_irq"
function (You probably already know this: kvm_set_routing_entry), I think it is
logical to let every type irqchip (it is maybe necessary for other types of irq
too) to decide when to notify vfio.
> >
> >> 2. kvm_notify_acked_irq() can't be called under ioapic->lock, so in [1]
> >> I was unlocking ioapic->lock in ioapic_write_indirect() with a naive
> >> assumption that it was as safe as doing it in
> >> kvm_ioapic_update_eoi_one(). That was probably racy, and I hadn't
> >> figured out how to rework it in a race-free way.
> >>
> >> [1] https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> >> [email protected]/
> >>
> >> [*] By "vfio" I always mean "vfio or any other resamplefd user".
> >>
> >> Thanks,
> >> Dmytro
> >>
> >>>
> >>>>>
> >>>>>> Regarding #2, there may be an alternative solution worth
> >> considering:
> >>>>>> extend KVM irqfd (userspace) API to send mask and unmask
> >>>> notifications
> >>>>>> directly to vfio/whatever, in addition to resample notifications,
> to
> >>>>>> let vfio check the irq state on its own. There is already locking on
> >>>>>> vfio side (see e.g. vfio_platform_unmask()), so this way we would
> >>>> avoid
> >>>>>> introducing any additional locking. Also such mask/unmask
> >>>> notifications
> >>>>>> could be useful for other cases.
> >>>>>>
> >>>>>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
> >>>>>> [email protected]/
> >>>>>> Suggested-by: Sean Christopherson <[email protected]>
> >>>>>> Signed-off-by: Dmytro Maluka <[email protected]>
> >>>>>> ---
> >>>>>> include/linux/kvm_irqfd.h | 14 ++++++++++++
> >>>>>> virt/kvm/eventfd.c | 45
> >>>>>> +++++++++++++++++++++++++++++++++++++++
> >>>>>> 2 files changed, 59 insertions(+)
> >>>>>>
> >>>>>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
> >>>>>> index dac047abdba7..01754a1abb9e 100644
> >>>>>> --- a/include/linux/kvm_irqfd.h
> >>>>>> +++ b/include/linux/kvm_irqfd.h
> >>>>>> @@ -19,6 +19,16 @@
> >>>>>> * resamplefd. All resamplers on the same gsi are de-asserted
> >>>>>> * together, so we don't need to track the state of each individual
> >>>>>> * user. We can also therefore share the same irq source ID.
> >>>>>> + *
> >>>>>> + * A special case is when the interrupt is still masked at the
> >> moment
> >>>>>> + * an irq ack is received. That likely means that the interrupt has
> >>>>>> + * been acknowledged to the interrupt controller but not
> >>>> acknowledged
> >>>>>> + * to the device yet, e.g. it might be a Linux guest's threaded
> >>>>>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying
> >> through
> >>>>>> + * resamplefd is postponed until the guest unmasks the
> interrupt,
> >>>>>> + * which is detected through the irq mask notifier. This prevents
> >>>>>> + * erroneous extra interrupts caused by premature re-assert of
> an
> >>>>>> + * unacknowledged interrupt by the resamplefd listener.
> >>>>>> */
> >>>>>> struct kvm_kernel_irqfd_resampler {
> >>>>>> struct kvm *kvm;
> >>>>>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
> >>>>>> */
> >>>>>> struct list_head list;
> >>>>>> struct kvm_irq_ack_notifier notifier;
> >>>>>> + struct kvm_irq_mask_notifier mask_notifier;
> >>>>>> + bool masked;
> >>>>>> + bool pending;
> >>>>>> + spinlock_t lock;
> >>>>>> /*
> >>>>>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
> >>>>>> * resamplers among irqfds on the same gsi.
> >>>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> >>>>>> index 50ddb1d1a7f0..9ff47ac33790 100644
> >>>>>> --- a/virt/kvm/eventfd.c
> >>>>>> +++ b/virt/kvm/eventfd.c
> >>>>>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
> >>>> kvm_irq_ack_notifier
> >>>>>> *kian)
> >>>>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> >>>>>> resampler->notifier.gsi, 0, false);
> >>>>>>
> >>>>>> + spin_lock(&resampler->lock);
> >>>>>> + if (resampler->masked) {
> >>>>>> + resampler->pending = true;
> >>>>>> + spin_unlock(&resampler->lock);
> >>>>>> + return;
> >>>>>> + }
> >>>>>> + spin_unlock(&resampler->lock);
> >>>>>> +
> >>>>>> + idx = srcu_read_lock(&kvm->irq_srcu);
> >>>>>> +
> >>>>>> + list_for_each_entry_srcu(irqfd, &resampler->list,
> >> resampler_link,
> >>>>>> + srcu_read_lock_held(&kvm->irq_srcu))
> >>>>>> + eventfd_signal(irqfd->resamplefd, 1);
> >>>>>> +
> >>>>>> + srcu_read_unlock(&kvm->irq_srcu, idx);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void
> >>>>>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn,
> bool
> >>>>>> masked)
> >>>>>> +{
> >>>>>> + struct kvm_kernel_irqfd_resampler *resampler;
> >>>>>> + struct kvm *kvm;
> >>>>>> + struct kvm_kernel_irqfd *irqfd;
> >>>>>> + int idx;
> >>>>>> +
> >>>>>> + resampler = container_of(kimn,
> >>>>>> + struct kvm_kernel_irqfd_resampler,
> >> mask_notifier);
> >>>>>> + kvm = resampler->kvm;
> >>>>>> +
> >>>>>> + spin_lock(&resampler->lock);
> >>>>>> + resampler->masked = masked;
> >>>>>> + if (masked || !resampler->pending) {
> >>>>>> + spin_unlock(&resampler->lock);
> >>>>>> + return;
> >>>>>> + }
> >>>>>> + resampler->pending = false;
> >>>>>> + spin_unlock(&resampler->lock);
> >>>>>> +
> >>>>>> idx = srcu_read_lock(&kvm->irq_srcu);
> >>>>>>
> >>>>>> list_for_each_entry_srcu(irqfd, &resampler->list,
> >> resampler_link,
> >>>>>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
> >>>>>> kvm_kernel_irqfd *irqfd)
> >>>>>> if (list_empty(&resampler->list)) {
> >>>>>> list_del(&resampler->link);
> >>>>>> kvm_unregister_irq_ack_notifier(kvm, &resampler-
> >>> notifier);
> >>>>>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
> >>>>>>> mask_notifier.irq,
> >>>>>> + &resampler->mask_notifier);
> >>>>>> kvm_set_irq(kvm,
> >> KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> >>>>>> resampler->notifier.gsi, 0, false);
> >>>>>> kfree(resampler);
> >>>>>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm,
> >> struct
> >>>>>> kvm_irqfd *args)
> >>>>>> INIT_LIST_HEAD(&resampler->list);
> >>>>>> resampler->notifier.gsi = irqfd->gsi;
> >>>>>> resampler->notifier.irq_acked =
> >> irqfd_resampler_ack;
> >>>>>> + resampler->mask_notifier.func =
> >> irqfd_resampler_mask;
> >>>>>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
> >>>>>>> masked);
> >>>>>> + spin_lock_init(&resampler->lock);
> >>>>>> INIT_LIST_HEAD(&resampler->link);
> >>>>>>
> >>>>>> list_add(&resampler->link, &kvm-
> >>> irqfds.resampler_list);
> >>>>>> kvm_register_irq_ack_notifier(kvm,
> >>>>>> &resampler->notifier);
> >>>>>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
> >>>>>> + &resampler->mask_notifier);
> >>>>>> irqfd->resampler = resampler;
> >>>>>> }
> >>>>>>
> >>>>>> --
> >>>>>> 2.37.0.170.g444d1eabd0-goog
> >>>>>
Hi Rong,
On 8/11/22 6:12 PM, Liu, Rong L wrote:
> Hi Dmytro,
>
>> -----Original Message-----
>> From: Dmytro Maluka <[email protected]>
>> Sent: Tuesday, August 9, 2022 5:57 PM
>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
>> <[email protected]>; Paolo Bonzini <[email protected]>;
>> [email protected]
>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>> <[email protected]>; [email protected]; H. Peter Anvin
>> <[email protected]>; [email protected]; Eric Auger
>> <[email protected]>; Alex Williamson
>> <[email protected]>; Zhenyu Wang
>> <[email protected]>; Tomasz Nowicki <[email protected]>;
>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
>> <[email protected]>
>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>> oneshot interrupts
>>
>> Hi Rong,
>>
>> On 8/10/22 12:02 AM, Liu, Rong L wrote:
>>> Hi Dmytro,
>>>
>>>> -----Original Message-----
>>>> From: Dmytro Maluka <[email protected]>
>>>> Sent: Saturday, July 30, 2022 7:35 AM
>>>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
>>>> <[email protected]>; Paolo Bonzini <[email protected]>;
>>>> [email protected]
>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave Hansen
>>>> <[email protected]>; [email protected]; H. Peter Anvin
>>>> <[email protected]>; [email protected]; Eric Auger
>>>> <[email protected]>; Alex Williamson
>>>> <[email protected]>; Zhenyu Wang
>>>> <[email protected]>; Tomasz Nowicki <[email protected]>;
>>>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
>>>> <[email protected]>
>>>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>>>> oneshot interrupts
>>>>
>>>> On 7/29/22 10:48 PM, Liu, Rong L wrote:
>>>>> Hi Dmytro,
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Dmytro Maluka <[email protected]>
>>>>>> Sent: Tuesday, July 26, 2022 7:08 AM
>>>>>> To: Liu, Rong L <[email protected]>; Christopherson,, Sean
>>>>>> <[email protected]>; Paolo Bonzini <[email protected]>;
>>>>>> [email protected]
>>>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>>>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
>> Hansen
>>>>>> <[email protected]>; [email protected]; H. Peter Anvin
>>>>>> <[email protected]>; [email protected]; Eric Auger
>>>>>> <[email protected]>; Alex Williamson
>>>>>> <[email protected]>; Zhenyu Wang
>>>>>> <[email protected]>; Tomasz Nowicki <[email protected]>;
>>>>>> Grzegorz Jaszczyk <[email protected]>; Dmitry Torokhov
>>>>>> <[email protected]>
>>>>>> Subject: Re: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify
>> for
>>>>>> oneshot interrupts
>>>>>>
>>>>>> Hi Rong,
>>>>>>
>>>>>> On 7/26/22 01:44, Liu, Rong L wrote:
>>>>>>> Hi Dmytro,
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Dmytro Maluka <[email protected]>
>>>>>>>> Sent: Friday, July 15, 2022 8:59 AM
>>>>>>>> To: Christopherson,, Sean <[email protected]>; Paolo Bonzini
>>>>>>>> <[email protected]>; [email protected]
>>>>>>>> Cc: Thomas Gleixner <[email protected]>; Ingo Molnar
>>>>>>>> <[email protected]>; Borislav Petkov <[email protected]>; Dave
>>>> Hansen
>>>>>>>> <[email protected]>; [email protected]; H. Peter Anvin
>>>>>>>> <[email protected]>; [email protected]; Eric Auger
>>>>>>>> <[email protected]>; Alex Williamson
>>>>>>>> <[email protected]>; Liu, Rong L
>> <[email protected]>;
>>>>>>>> Zhenyu Wang <[email protected]>; Tomasz Nowicki
>>>>>>>> <[email protected]>; Grzegorz Jaszczyk <[email protected]>;
>>>> Dmitry
>>>>>>>> Torokhov <[email protected]>; Dmytro Maluka
>>>> <[email protected]>
>>>>>>>> Subject: [PATCH 3/3] KVM: irqfd: Postpone resamplefd notify for
>>>>>> oneshot
>>>>>>>> interrupts
>>>>>>>>
>>>>>>>> The existing KVM mechanism for forwarding of level-triggered
>>>>>> interrupts
>>>>>>>> using resample eventfd doesn't work quite correctly in the case
>> of
>>>>>>>> interrupts that are handled in a Linux guest as oneshot interrupts
>>>>>>>> (IRQF_ONESHOT). Such an interrupt is acked to the device in its
>>>>>>>> threaded irq handler, i.e. later than it is acked to the interrupt
>>>>>>>> controller (EOI at the end of hardirq), not earlier.
>>>>>>>>
>>>>>>>> Linux keeps such interrupt masked until its threaded handler
>>>> finishes,
>>>>>>>> to prevent the EOI from re-asserting an unacknowledged
>> interrupt.
>>>>>>>> However, with KVM + vfio (or whatever is listening on the
>>>> resamplefd)
>>>>>>>> we don't check that the interrupt is still masked in the guest at
>> the
>>>>>>>> moment of EOI. Resamplefd is notified regardless, so vfio
>>>> prematurely
>>>>>>>> unmasks the host physical IRQ, thus a new (unwanted) physical
>>>>>> interrupt
>>>>>>>> is generated in the host and queued for injection to the guest.
>>>>>>>>
>>>>>>>> The fact that the virtual IRQ is still masked doesn't prevent this
>> new
>>>>>>>> physical IRQ from being propagated to the guest, because:
>>>>>>>>
>>>>>>>> 1. It is not guaranteed that the vIRQ will remain masked by the
>> time
>>>>>>>> when vfio signals the trigger eventfd.
>>>>>>>> 2. KVM marks this IRQ as pending (e.g. setting its bit in the virtual
>>>>>>>> IRR register of IOAPIC on x86), so after the vIRQ is unmasked,
>> this
>>>>>>>> new pending interrupt is injected by KVM to the guest anyway.
>>>>>>>>
>>>>>>>> There are observed at least 2 user-visible issues caused by those
>>>>>>>> extra erroneous pending interrupts for oneshot irq in the guest:
>>>>>>>>
>>>>>>>> 1. System suspend aborted due to a pending wakeup interrupt
>> from
>>>>>>>> ChromeOS EC (drivers/platform/chrome/cros_ec.c).
>>>>>>>> 2. Annoying "invalid report id data" errors from ELAN0000
>>>> touchpad
>>>>>>>> (drivers/input/mouse/elan_i2c_core.c), flooding the guest
>> dmesg
>>>>>>>> every time the touchpad is touched.
>>>>>>>>
>>>>>>>> This patch fixes the issue on x86 by checking if the interrupt is
>>>>>>>> unmasked when we receive irq ack (EOI) and, in case if it's
>> masked,
>>>>>>>> postponing resamplefd notify until the guest unmasks it.
>>>>>>>>
>>>>>>>> Important notes:
>>>>>>>>
>>>>>>>> 1. It doesn't fix the issue for other archs yet, due to some missing
>>>>>>>> KVM functionality needed by this patch:
>>>>>>>> - calling mask notifiers is implemented for x86 only
>>>>>>>> - irqchip ->is_masked() is implemented for x86 only
>>>>>>>>
>>>>>>>> 2. It introduces an additional spinlock locking in the resample
>> notify
>>>>>>>> path, since we are no longer just traversing an RCU list of irqfds
>>>>>>>> but also updating the resampler state. Hopefully this locking
>> won't
>>>>>>>> noticeably slow down anything for anyone.
>>>>>>>>
>>>>>>>
>>>>>>> Instead of using a spinlock waiting for the unmask event, is it
>>>> possible
>>>>>> to call
>>>>>>> resampler notify directly when unmask event happens, instead of
>>>>>> calling it on
>>>>>>> EOI?
>>>>>>
>>>>>> In this patch, resampler notify is already called directly when
>> unmask
>>>>>> happens: e.g. with IOAPIC, when the guest unmasks the interrupt by
>>>>>> writing to IOREDTBLx register, ioapic_write_indirect() calls
>>>>>> kvm_fire_mask_notifiers() which calls irqfd_resampler_mask()
>> which
>>>>>> notifies the resampler. On EOI we postpone it just by setting
>>>>>> resampler->pending to true, not by waiting. The spinlock is needed
>>>>>> merely to synchronize reading & updating resampler->pending and
>>>>>> resampler->masked values between possibly concurrently running
>>>>>> instances
>>>>>> of irqfd_resampler_ack() and/or irqfd_resampler_mask().
>>>>>>
>>>>>> Thanks,
>>>>>> Dmytro
>>>>>>
>>>>>
>>>>> I mean the organization of the code. In current implementation,
>>>>> kvm_ioapic_update_eoi_one() calls kvm_notify_acked_irq(), in your
>>>> patch, why not
>>>>> call kvm_notify_acked_irq() from ioapic_write_indirect() (roughly at
>>>> the same
>>>>> place where kvm_fire_mask_notifiers is called), instead of calling it
>>>> from
>>>>> kvm_ioapic_update_eoi_one, since what your intention here is to
>>>> notify
>>>>> vfio of the end of interrupt at the event of ioapic unmask, instead of
>>>>> EOI?
>>>>
>>>> Ah ok, got your point.
>>>>
>>>> That was my initial approach in my PoC patch posted in [1]. But then I
>>>> dropped it, for 2 reasons:
>>>>
>>>> 1. Upon feedback from Sean I realized that kvm_notify_acked_irq() is
>>>> also doing some other important things besides notifying vfio. In
>>>> particular, in irqfd_resampler_ack() we also de-assert the vIRQ via
>>>> kvm_set_irq(). In case of IOAPIC it means clearing its bit in IRR
>>>> register. If we delay that until unmasking, it means that we change
>>>> the way how KVM emulates the interrupt controller. That would
>> seem
>>>> inconsistent.
>>>>
>>>
>>> Thanks for clarification. I totally agree that it is important to keep the
>> way
>>> how KVM emulates the interrupt controller.
>>>
>>>> Also kvm_notify_acked_irq() notifies the emulated PIT timer via
>>>> kvm_pit_ack_irq(). I haven't analyzed how exactly that PIT stuff
>>>> works, so I'm not sure if delaying that until unmask wouldn't cause
>>>> any unwanted effects.
>>>>
>>>> So the idea is to postpone eventfd_signal() only, to fix interaction
>>>> with vfio while keeping the rest of the KVM behavior intact. Because
>>>> the KVM job is to emulate the interrupt controller (which it already
>>>> does correctly), not the external device which is the job of vfio*.
>>>>
>>>
>>> I made a mistake in my last post. I mean just to delay the notification
>> of
>>> vfio, but keep the rest of the code as intact as possible.
>>
>> Hmm, thanks, until now I wasn't thinking about another possibility: call
>> delayed eventfd_signal() directly from ioapic_write_indirect().
>>
>> However, now thinking about that, I don't really see advantages. We
>> wouldn't
>> need to use mask notifiers then, but we would still need the same logic
>> and
>> synchronization in irqfd_resampler_ack() for checking whether we
>> should delay
>> the notification. Moreover, I'm not sure how would I then address the
>> resampler->masked initialization race issue, which I addressed in v2 by
>> implementing kvm_register_and_fire_irq_mask_notifier() in [1].
>>
>> [1] https://lore.kernel.org/lkml/20220805193919.1470653-3-
>> [email protected]/
>>
>
> I haven't finished reading your v2 patch, especially regarding the racing
> conditions. However, I think there are some advantages to let irqchip call the
> eventfd_signal, one thing is clearer code organization. The other is that it
> seems it is necessary to differentiate the handling among different irqchips,
> the goal here is to keep logic around other types of irqchips intact to reduce
> code turmoil and fix ioapic? It looks like every irqchip has its own "set_irq"
> function (You probably already know this: kvm_set_routing_entry), I think it is
> logical to let every type irqchip (it is maybe necessary for other types of irq
> too) to decide when to notify vfio.
In a way that is already the case with this patchset. Each irqchip
decides when to call kvm_fire_mask_notifiers(), so it (indirectly)
decides when to send the delayed vfio notification. Actually non-x86
irqchips currently never call kvm_fire_mask_notifiers(), so their
behavior is kept intact.
Regarding code organization, the tricky thing is the synchronization
needed for detecting whether to postpone resamplefd at vEOI or not.
Anyway, v3 will probably implement quite a different approach (to
address the issue pointed out by Marc: we should maintain correct vIRQ
pending state presented to the guest), without postponing resamplefd, so
maybe the implementation will be more in line with your suggestions.
>
>>>
>>>> 2. kvm_notify_acked_irq() can't be called under ioapic->lock, so in [1]
>>>> I was unlocking ioapic->lock in ioapic_write_indirect() with a naive
>>>> assumption that it was as safe as doing it in
>>>> kvm_ioapic_update_eoi_one(). That was probably racy, and I hadn't
>>>> figured out how to rework it in a race-free way.
>>>>
>>>> [1] https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>>>> [email protected]/
>>>>
>>>> [*] By "vfio" I always mean "vfio or any other resamplefd user".
>>>>
>>>> Thanks,
>>>> Dmytro
>>>>
>>>>>
>>>>>>>
>>>>>>>> Regarding #2, there may be an alternative solution worth
>>>> considering:
>>>>>>>> extend KVM irqfd (userspace) API to send mask and unmask
>>>>>> notifications
>>>>>>>> directly to vfio/whatever, in addition to resample notifications,
>> to
>>>>>>>> let vfio check the irq state on its own. There is already locking on
>>>>>>>> vfio side (see e.g. vfio_platform_unmask()), so this way we would
>>>>>> avoid
>>>>>>>> introducing any additional locking. Also such mask/unmask
>>>>>> notifications
>>>>>>>> could be useful for other cases.
>>>>>>>>
>>>>>>>> Link: https://lore.kernel.org/kvm/31420943-8c5f-125c-a5ee-
>>>>>>>> [email protected]/
>>>>>>>> Suggested-by: Sean Christopherson <[email protected]>
>>>>>>>> Signed-off-by: Dmytro Maluka <[email protected]>
>>>>>>>> ---
>>>>>>>> include/linux/kvm_irqfd.h | 14 ++++++++++++
>>>>>>>> virt/kvm/eventfd.c | 45
>>>>>>>> +++++++++++++++++++++++++++++++++++++++
>>>>>>>> 2 files changed, 59 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
>>>>>>>> index dac047abdba7..01754a1abb9e 100644
>>>>>>>> --- a/include/linux/kvm_irqfd.h
>>>>>>>> +++ b/include/linux/kvm_irqfd.h
>>>>>>>> @@ -19,6 +19,16 @@
>>>>>>>> * resamplefd. All resamplers on the same gsi are de-asserted
>>>>>>>> * together, so we don't need to track the state of each individual
>>>>>>>> * user. We can also therefore share the same irq source ID.
>>>>>>>> + *
>>>>>>>> + * A special case is when the interrupt is still masked at the
>>>> moment
>>>>>>>> + * an irq ack is received. That likely means that the interrupt has
>>>>>>>> + * been acknowledged to the interrupt controller but not
>>>>>> acknowledged
>>>>>>>> + * to the device yet, e.g. it might be a Linux guest's threaded
>>>>>>>> + * oneshot interrupt (IRQF_ONESHOT). In this case notifying
>>>> through
>>>>>>>> + * resamplefd is postponed until the guest unmasks the
>> interrupt,
>>>>>>>> + * which is detected through the irq mask notifier. This prevents
>>>>>>>> + * erroneous extra interrupts caused by premature re-assert of
>> an
>>>>>>>> + * unacknowledged interrupt by the resamplefd listener.
>>>>>>>> */
>>>>>>>> struct kvm_kernel_irqfd_resampler {
>>>>>>>> struct kvm *kvm;
>>>>>>>> @@ -28,6 +38,10 @@ struct kvm_kernel_irqfd_resampler {
>>>>>>>> */
>>>>>>>> struct list_head list;
>>>>>>>> struct kvm_irq_ack_notifier notifier;
>>>>>>>> + struct kvm_irq_mask_notifier mask_notifier;
>>>>>>>> + bool masked;
>>>>>>>> + bool pending;
>>>>>>>> + spinlock_t lock;
>>>>>>>> /*
>>>>>>>> * Entry in list of kvm->irqfd.resampler_list. Use for sharing
>>>>>>>> * resamplers among irqfds on the same gsi.
>>>>>>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>>>>>>>> index 50ddb1d1a7f0..9ff47ac33790 100644
>>>>>>>> --- a/virt/kvm/eventfd.c
>>>>>>>> +++ b/virt/kvm/eventfd.c
>>>>>>>> @@ -75,6 +75,44 @@ irqfd_resampler_ack(struct
>>>>>> kvm_irq_ack_notifier
>>>>>>>> *kian)
>>>>>>>> kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>>>>>>>> resampler->notifier.gsi, 0, false);
>>>>>>>>
>>>>>>>> + spin_lock(&resampler->lock);
>>>>>>>> + if (resampler->masked) {
>>>>>>>> + resampler->pending = true;
>>>>>>>> + spin_unlock(&resampler->lock);
>>>>>>>> + return;
>>>>>>>> + }
>>>>>>>> + spin_unlock(&resampler->lock);
>>>>>>>> +
>>>>>>>> + idx = srcu_read_lock(&kvm->irq_srcu);
>>>>>>>> +
>>>>>>>> + list_for_each_entry_srcu(irqfd, &resampler->list,
>>>> resampler_link,
>>>>>>>> + srcu_read_lock_held(&kvm->irq_srcu))
>>>>>>>> + eventfd_signal(irqfd->resamplefd, 1);
>>>>>>>> +
>>>>>>>> + srcu_read_unlock(&kvm->irq_srcu, idx);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +irqfd_resampler_mask(struct kvm_irq_mask_notifier *kimn,
>> bool
>>>>>>>> masked)
>>>>>>>> +{
>>>>>>>> + struct kvm_kernel_irqfd_resampler *resampler;
>>>>>>>> + struct kvm *kvm;
>>>>>>>> + struct kvm_kernel_irqfd *irqfd;
>>>>>>>> + int idx;
>>>>>>>> +
>>>>>>>> + resampler = container_of(kimn,
>>>>>>>> + struct kvm_kernel_irqfd_resampler,
>>>> mask_notifier);
>>>>>>>> + kvm = resampler->kvm;
>>>>>>>> +
>>>>>>>> + spin_lock(&resampler->lock);
>>>>>>>> + resampler->masked = masked;
>>>>>>>> + if (masked || !resampler->pending) {
>>>>>>>> + spin_unlock(&resampler->lock);
>>>>>>>> + return;
>>>>>>>> + }
>>>>>>>> + resampler->pending = false;
>>>>>>>> + spin_unlock(&resampler->lock);
>>>>>>>> +
>>>>>>>> idx = srcu_read_lock(&kvm->irq_srcu);
>>>>>>>>
>>>>>>>> list_for_each_entry_srcu(irqfd, &resampler->list,
>>>> resampler_link,
>>>>>>>> @@ -98,6 +136,8 @@ irqfd_resampler_shutdown(struct
>>>>>>>> kvm_kernel_irqfd *irqfd)
>>>>>>>> if (list_empty(&resampler->list)) {
>>>>>>>> list_del(&resampler->link);
>>>>>>>> kvm_unregister_irq_ack_notifier(kvm, &resampler-
>>>>> notifier);
>>>>>>>> + kvm_unregister_irq_mask_notifier(kvm, resampler-
>>>>>>>>> mask_notifier.irq,
>>>>>>>> + &resampler->mask_notifier);
>>>>>>>> kvm_set_irq(kvm,
>>>> KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
>>>>>>>> resampler->notifier.gsi, 0, false);
>>>>>>>> kfree(resampler);
>>>>>>>> @@ -367,11 +407,16 @@ kvm_irqfd_assign(struct kvm *kvm,
>>>> struct
>>>>>>>> kvm_irqfd *args)
>>>>>>>> INIT_LIST_HEAD(&resampler->list);
>>>>>>>> resampler->notifier.gsi = irqfd->gsi;
>>>>>>>> resampler->notifier.irq_acked =
>>>> irqfd_resampler_ack;
>>>>>>>> + resampler->mask_notifier.func =
>>>> irqfd_resampler_mask;
>>>>>>>> + kvm_irq_is_masked(kvm, irqfd->gsi, &resampler-
>>>>>>>>> masked);
>>>>>>>> + spin_lock_init(&resampler->lock);
>>>>>>>> INIT_LIST_HEAD(&resampler->link);
>>>>>>>>
>>>>>>>> list_add(&resampler->link, &kvm-
>>>>> irqfds.resampler_list);
>>>>>>>> kvm_register_irq_ack_notifier(kvm,
>>>>>>>> &resampler->notifier);
>>>>>>>> + kvm_register_irq_mask_notifier(kvm, irqfd->gsi,
>>>>>>>> + &resampler->mask_notifier);
>>>>>>>> irqfd->resampler = resampler;
>>>>>>>> }
>>>>>>>>
>>>>>>>> --
>>>>>>>> 2.37.0.170.g444d1eabd0-goog
>>>>>>>