2022-03-22 08:15:46

by Gavin Shan

Subject: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

Software Delegated Exception Interface (SDEI) provides a mechanism for
registering and servicing system events. Those system events are high
priority events which must be serviced immediately. It's going to be
used by Asynchronous Page Fault (APF) to deliver notifications from KVM
to the guest. SDEI is defined by the ARM DEN0054C specification.

This introduces the SDEI virtualization infrastructure, where SDEI events
are registered and manipulated by the guest through hypercalls. Once an
SDEI event is raised, KVM delivers it to one specific vCPU. This also
introduces the data structures needed to represent the objects supporting
the feature, as highlighted below. As those objects may need to be
migrated along with the VM, the data structures are partially exposed to
user space.

* kvm_sdei_exposed_event
The exposed events are determined and added by VMM through ioctl
interface. Only the exposed events can be registered from the
guest.

* kvm_sdei_registered_event
The events that have been registered from the guest through the
SDEI_1_0_FN_SDEI_EVENT_REGISTER hypercall.

* kvm_sdei_vcpu_event
The events that have been delivered to the target vCPU.

* kvm_sdei_vcpu
Used to save the preempted context when the SDEI event is serviced
and delivered. After the SDEI event handling is completed, the
execution is resumed from the preempted context.

* kvm_sdei_kvm
Place holder for the exposed and registered events.

SDEI_NOT_SUPPORTED is returned for all SDEI hypercalls for now. They
will be implemented by the subsequent patches.

Signed-off-by: Gavin Shan <[email protected]>
---
arch/arm64/include/asm/kvm_host.h | 3 +
arch/arm64/include/asm/kvm_sdei.h | 171 +++++++++++++
arch/arm64/include/uapi/asm/kvm.h | 1 +
arch/arm64/include/uapi/asm/kvm_sdei_state.h | 72 ++++++
arch/arm64/kvm/Makefile | 2 +-
arch/arm64/kvm/arm.c | 8 +
arch/arm64/kvm/hypercalls.c | 21 ++
arch/arm64/kvm/sdei.c | 244 +++++++++++++++++++
include/uapi/linux/arm_sdei.h | 2 +
9 files changed, 523 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/include/asm/kvm_sdei.h
create mode 100644 arch/arm64/include/uapi/asm/kvm_sdei_state.h
create mode 100644 arch/arm64/kvm/sdei.c

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 031e3a2537fc..5d37e046a458 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -113,6 +113,8 @@ struct kvm_arch {
/* Interrupt controller */
struct vgic_dist vgic;

+ struct kvm_sdei_kvm *sdei;
+
/* Mandated version of PSCI */
u32 psci_version;

@@ -338,6 +340,7 @@ struct kvm_vcpu_arch {
* Anything that is not used directly from assembly code goes
* here.
*/
+ struct kvm_sdei_vcpu *sdei;

/*
* Guest registers we preserve during guest debugging.
diff --git a/arch/arm64/include/asm/kvm_sdei.h b/arch/arm64/include/asm/kvm_sdei.h
new file mode 100644
index 000000000000..6f58a846d05c
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_sdei.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Definitions of various KVM SDEI events.
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * Author(s): Gavin Shan <[email protected]>
+ */
+
+#ifndef __ARM64_KVM_SDEI_H__
+#define __ARM64_KVM_SDEI_H__
+
+#include <uapi/linux/arm_sdei.h>
+#include <uapi/asm/kvm_sdei_state.h>
+#include <linux/bitmap.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct kvm_vcpu;
+
+struct kvm_sdei_exposed_event {
+ struct kvm_sdei_exposed_event_state state;
+ struct kvm *kvm;
+ unsigned int registered_event_count;
+ struct list_head link;
+};
+
+struct kvm_sdei_registered_event {
+ struct kvm_sdei_registered_event_state state;
+ struct kvm *kvm;
+ struct kvm_sdei_exposed_event *exposed_event;
+ unsigned int vcpu_event_count;
+ struct list_head link;
+};
+
+struct kvm_sdei_vcpu_event {
+ struct kvm_sdei_vcpu_event_state state;
+ struct kvm_vcpu *vcpu;
+ struct kvm_sdei_registered_event *registered_event;
+ struct list_head link;
+};
+
+struct kvm_sdei_kvm {
+ spinlock_t lock;
+ unsigned int exposed_event_count;
+ unsigned int registered_event_count;
+ struct list_head exposed_events;
+ struct list_head registered_events;
+};
+
+struct kvm_sdei_vcpu {
+ spinlock_t lock;
+ struct kvm_sdei_vcpu_state state;
+ struct kvm_sdei_vcpu_event *critical_event;
+ struct kvm_sdei_vcpu_event *normal_event;
+ unsigned int critical_event_count;
+ unsigned int normal_event_count;
+ struct list_head critical_events;
+ struct list_head normal_events;
+};
+
+/*
+ * According to the SDEI specification (v1.1), the event number spans
+ * 32 bits and the lower 24 bits are used as the (real) event number.
+ * We don't expect that many SDEI numbers in one system, so we reserve
+ * two bits from the 24-bit real event number to indicate the event
+ * type: physical or virtual. One reserved bit is enough for now, but
+ * two bits are reserved for possible extension in the future.
+ *
+ * The physical events are owned by firmware while the virtual events
+ * are used by VMM and KVM.
+ */
+#define KVM_SDEI_EVENT_NUM_TYPE_SHIFT 22
+#define KVM_SDEI_EVENT_NUM_TYPE_MASK (3UL << KVM_SDEI_EVENT_NUM_TYPE_SHIFT)
+#define KVM_SDEI_EVENT_NUM_TYPE_PHYS 0
+#define KVM_SDEI_EVENT_NUM_TYPE_VIRT 1
+
+static inline bool kvm_sdei_is_virtual(unsigned long num)
+{
+ unsigned long type;
+
+ if (num >> 32)
+ return false;
+
+ type = (num & KVM_SDEI_EVENT_NUM_TYPE_MASK) >>
+ KVM_SDEI_EVENT_NUM_TYPE_SHIFT;
+ if (type != KVM_SDEI_EVENT_NUM_TYPE_VIRT)
+ return false;
+
+ return true;
+}
+
+static inline bool kvm_sdei_is_default(unsigned long num)
+{
+ return num == KVM_SDEI_DEFAULT_EVENT;
+}
+
+static inline bool kvm_sdei_is_supported(unsigned long num)
+{
+ return kvm_sdei_is_default(num) || kvm_sdei_is_virtual(num);
+}
+
+static inline bool kvm_sdei_is_shared(unsigned char type)
+{
+ return type == SDEI_EVENT_TYPE_SHARED;
+}
+
+static inline bool kvm_sdei_is_private(unsigned char type)
+{
+ return type == SDEI_EVENT_TYPE_PRIVATE;
+}
+
+static inline bool kvm_sdei_is_critical(unsigned char priority)
+{
+ return priority == SDEI_EVENT_PRIORITY_CRITICAL;
+}
+
+static inline bool kvm_sdei_is_normal(unsigned char priority)
+{
+ return priority == SDEI_EVENT_PRIORITY_NORMAL;
+}
+
+static inline unsigned int
+kvm_sdei_vcpu_index(struct kvm_vcpu *vcpu,
+ struct kvm_sdei_exposed_event *event)
+{
+ return kvm_sdei_is_private(event->state.type) ? vcpu->vcpu_idx : 0;
+}
+
+/* Accessors for the registered event */
+#define KVM_SDEI_REGISTERED_EVENT_FUNC(field) \
+static inline bool \
+kvm_sdei_is_##field(struct kvm_sdei_registered_event *event, \
+ unsigned int index) \
+{ \
+ return !!test_bit(index, (void *)(event->state.field)); \
+} \
+ \
+static inline bool \
+kvm_sdei_none_##field(struct kvm_sdei_registered_event *event) \
+{ \
+ return bitmap_empty((void *)(event->state.field), \
+ KVM_SDEI_MAX_VCPUS); \
+} \
+ \
+static inline void \
+kvm_sdei_set_##field(struct kvm_sdei_registered_event *event, \
+ unsigned int index) \
+{ \
+ set_bit(index, (void *)(event->state.field)); \
+} \
+ \
+static inline void \
+kvm_sdei_clear_##field(struct kvm_sdei_registered_event *event, \
+ unsigned int index) \
+{ \
+ clear_bit(index, (void *)(event->state.field)); \
+}
+
+KVM_SDEI_REGISTERED_EVENT_FUNC(registered)
+KVM_SDEI_REGISTERED_EVENT_FUNC(enabled)
+KVM_SDEI_REGISTERED_EVENT_FUNC(unregister_pending)
+
+/* APIs */
+void kvm_sdei_init_vm(struct kvm *kvm);
+void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu);
+int kvm_sdei_hypercall(struct kvm_vcpu *vcpu);
+void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu);
+void kvm_sdei_destroy_vm(struct kvm *kvm);
+
+#endif /* __ARM64_KVM_SDEI_H__ */
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 323e251ed37b..33ee95bc088e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -36,6 +36,7 @@
#include <linux/types.h>
#include <asm/ptrace.h>
#include <asm/sve_context.h>
+#include <asm/kvm_sdei_state.h>

#define __KVM_HAVE_GUEST_DEBUG
#define __KVM_HAVE_IRQ_LINE
diff --git a/arch/arm64/include/uapi/asm/kvm_sdei_state.h b/arch/arm64/include/uapi/asm/kvm_sdei_state.h
new file mode 100644
index 000000000000..b14844230117
--- /dev/null
+++ b/arch/arm64/include/uapi/asm/kvm_sdei_state.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Definitions of various KVM SDEI event states.
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * Author(s): Gavin Shan <[email protected]>
+ */
+
+#ifndef _UAPI__ASM_KVM_SDEI_STATE_H
+#define _UAPI__ASM_KVM_SDEI_STATE_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/*
+ * The software signaled event is the default one, which is
+ * defined in v1.1 specification.
+ */
+#define KVM_SDEI_INVALID_EVENT 0xFFFFFFFF
+#define KVM_SDEI_DEFAULT_EVENT 0
+
+#define KVM_SDEI_MAX_VCPUS 512 /* Aligned to 64 */
+#define KVM_SDEI_MAX_EVENTS 128
+
+struct kvm_sdei_exposed_event_state {
+ __u64 num;
+
+ __u8 type;
+ __u8 signaled;
+ __u8 priority;
+ __u8 padding[5];
+ __u64 notifier;
+};
+
+struct kvm_sdei_registered_event_state {
+ __u64 num;
+
+ __u8 route_mode;
+ __u8 padding[3];
+ __u64 route_affinity;
+ __u64 ep_address[KVM_SDEI_MAX_VCPUS];
+ __u64 ep_arg[KVM_SDEI_MAX_VCPUS];
+ __u64 registered[KVM_SDEI_MAX_VCPUS/64];
+ __u64 enabled[KVM_SDEI_MAX_VCPUS/64];
+ __u64 unregister_pending[KVM_SDEI_MAX_VCPUS/64];
+};
+
+struct kvm_sdei_vcpu_event_state {
+ __u64 num;
+
+ __u32 event_count;
+ __u32 padding;
+};
+
+struct kvm_sdei_vcpu_regs_state {
+ __u64 regs[18];
+ __u64 pc;
+ __u64 pstate;
+};
+
+struct kvm_sdei_vcpu_state {
+ __u8 masked;
+ __u8 padding[7];
+ __u64 critical_num;
+ __u64 normal_num;
+ struct kvm_sdei_vcpu_regs_state critical_regs;
+ struct kvm_sdei_vcpu_regs_state normal_regs;
+};
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _UAPI__ASM_KVM_SDEI_STATE_H */
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 91861fd8b897..a12903a21d1f 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o pkvm.o \
- arch_timer.o trng.o\
+ arch_timer.o trng.o sdei.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 4dca6ffd03d4..96fcae5beee4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -38,6 +38,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_sdei.h>
#include <asm/sections.h>

#include <kvm/arm_hypercalls.h>
@@ -152,6 +153,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)

kvm_vgic_early_init(kvm);

+ kvm_sdei_init_vm(kvm);
+
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();

@@ -179,6 +182,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)

kvm_vgic_destroy(kvm);

+ kvm_sdei_destroy_vm(kvm);
+
kvm_destroy_vcpus(kvm);

kvm_unshare_hyp(kvm, kvm + 1);
@@ -330,6 +335,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)

kvm_arm_pvtime_vcpu_init(&vcpu->arch);

+ kvm_sdei_create_vcpu(vcpu);
+
vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;

err = kvm_vgic_vcpu_init(vcpu);
@@ -351,6 +358,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
+ kvm_sdei_destroy_vcpu(vcpu);

kvm_arm_vcpu_destroy(vcpu);
}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 202b8c455724..3c20fee72bb4 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -5,6 +5,7 @@
#include <linux/kvm_host.h>

#include <asm/kvm_emulate.h>
+#include <asm/kvm_sdei.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_psci.h>
@@ -151,6 +152,26 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
case ARM_SMCCC_TRNG_RND32:
case ARM_SMCCC_TRNG_RND64:
return kvm_trng_call(vcpu);
+ case SDEI_1_0_FN_SDEI_VERSION:
+ case SDEI_1_0_FN_SDEI_EVENT_REGISTER:
+ case SDEI_1_0_FN_SDEI_EVENT_ENABLE:
+ case SDEI_1_0_FN_SDEI_EVENT_DISABLE:
+ case SDEI_1_0_FN_SDEI_EVENT_CONTEXT:
+ case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
+ case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
+ case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER:
+ case SDEI_1_0_FN_SDEI_EVENT_STATUS:
+ case SDEI_1_0_FN_SDEI_EVENT_GET_INFO:
+ case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET:
+ case SDEI_1_0_FN_SDEI_PE_MASK:
+ case SDEI_1_0_FN_SDEI_PE_UNMASK:
+ case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
+ case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE:
+ case SDEI_1_1_FN_SDEI_EVENT_SIGNAL:
+ case SDEI_1_0_FN_SDEI_PRIVATE_RESET:
+ case SDEI_1_0_FN_SDEI_SHARED_RESET:
+ case SDEI_1_1_FN_SDEI_FEATURES:
+ return kvm_sdei_hypercall(vcpu);
default:
return kvm_psci_call(vcpu);
}
diff --git a/arch/arm64/kvm/sdei.c b/arch/arm64/kvm/sdei.c
new file mode 100644
index 000000000000..8a9b477b8977
--- /dev/null
+++ b/arch/arm64/kvm/sdei.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SDEI virtualization support.
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * Author(s): Gavin Shan <[email protected]>
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <kvm/arm_hypercalls.h>
+#include <asm/kvm_sdei.h>
+
+static void remove_all_exposed_events(struct kvm *kvm)
+{
+ struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+ struct kvm_sdei_exposed_event *exposed_event, *tmp;
+
+ list_for_each_entry_safe(exposed_event, tmp,
+ &ksdei->exposed_events, link) {
+ ksdei->exposed_event_count--;
+ list_del(&exposed_event->link);
+
+ kfree(exposed_event);
+ }
+}
+
+static void remove_one_registered_event(struct kvm *kvm,
+ struct kvm_sdei_registered_event *registered_event)
+{
+ struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+ struct kvm_sdei_exposed_event *exposed_event;
+
+ exposed_event = registered_event->exposed_event;
+ ksdei->registered_event_count--;
+ exposed_event->registered_event_count--;
+
+ list_del(&registered_event->link);
+ kfree(registered_event);
+}
+
+static void remove_all_registered_events(struct kvm *kvm)
+{
+ struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+ struct kvm_sdei_registered_event *registered_event, *tmp;
+
+ list_for_each_entry_safe(registered_event, tmp,
+ &ksdei->registered_events, link) {
+ remove_one_registered_event(kvm, registered_event);
+ }
+}
+
+static void remove_one_vcpu_event(struct kvm_vcpu *vcpu,
+ struct kvm_sdei_vcpu_event *vcpu_event)
+{
+ struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+ struct kvm_sdei_exposed_event *exposed_event;
+ struct kvm_sdei_registered_event *registered_event;
+
+ registered_event = vcpu_event->registered_event;
+ exposed_event = registered_event->exposed_event;
+ registered_event->vcpu_event_count--;
+ if (kvm_sdei_is_critical(exposed_event->state.priority))
+ vsdei->critical_event_count--;
+ else
+ vsdei->normal_event_count--;
+
+ list_del(&vcpu_event->link);
+ kfree(vcpu_event);
+}
+
+static bool remove_all_vcpu_events(struct kvm_vcpu *vcpu,
+ unsigned long num)
+{
+ struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+ struct kvm_sdei_vcpu_event *vcpu_event, *tmp;
+ bool pending = false;
+
+ list_for_each_entry_safe(vcpu_event, tmp,
+ &vsdei->critical_events, link) {
+ if (!kvm_sdei_is_supported(num)) {
+ remove_one_vcpu_event(vcpu, vcpu_event);
+ continue;
+ }
+
+ if (vcpu_event->state.num != num)
+ continue;
+
+ if (vsdei->critical_event == vcpu_event) {
+ vcpu_event->state.event_count = 1;
+ pending = true;
+ } else {
+ remove_one_vcpu_event(vcpu, vcpu_event);
+ }
+ }
+
+ list_for_each_entry_safe(vcpu_event, tmp,
+ &vsdei->normal_events, link) {
+ if (!kvm_sdei_is_supported(num)) {
+ remove_one_vcpu_event(vcpu, vcpu_event);
+ continue;
+ }
+
+ if (vcpu_event->state.num != num)
+ continue;
+
+ if (vsdei->normal_event == vcpu_event) {
+ vcpu_event->state.event_count = 1;
+ pending = true;
+ } else {
+ remove_one_vcpu_event(vcpu, vcpu_event);
+ }
+ }
+
+ return pending;
+}
+
+int kvm_sdei_hypercall(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+ struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+ u32 func = smccc_get_function(vcpu);
+ bool has_result = true;
+ unsigned long ret;
+
+ /*
+ * We don't have return value for COMPLETE or COMPLETE_AND_RESUME
+ * hypercalls. Otherwise, the restored context will be corrupted.
+ */
+ if (func == SDEI_1_0_FN_SDEI_EVENT_COMPLETE ||
+ func == SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME)
+ has_result = false;
+
+ if (!(ksdei && vsdei)) {
+ ret = SDEI_NOT_SUPPORTED;
+ goto out;
+ }
+
+ switch (func) {
+ case SDEI_1_0_FN_SDEI_VERSION:
+ case SDEI_1_0_FN_SDEI_EVENT_REGISTER:
+ case SDEI_1_0_FN_SDEI_EVENT_ENABLE:
+ case SDEI_1_0_FN_SDEI_EVENT_DISABLE:
+ case SDEI_1_0_FN_SDEI_EVENT_CONTEXT:
+ case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
+ case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
+ case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER:
+ case SDEI_1_0_FN_SDEI_EVENT_STATUS:
+ case SDEI_1_0_FN_SDEI_EVENT_GET_INFO:
+ case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET:
+ case SDEI_1_0_FN_SDEI_PE_MASK:
+ case SDEI_1_0_FN_SDEI_PE_UNMASK:
+ case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
+ case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE:
+ case SDEI_1_1_FN_SDEI_EVENT_SIGNAL:
+ case SDEI_1_0_FN_SDEI_PRIVATE_RESET:
+ case SDEI_1_0_FN_SDEI_SHARED_RESET:
+ case SDEI_1_1_FN_SDEI_FEATURES:
+ default:
+ ret = SDEI_NOT_SUPPORTED;
+ }
+
+out:
+ if (has_result)
+ smccc_set_retval(vcpu, ret, 0, 0, 0);
+
+ return 1;
+}
+
+void kvm_sdei_init_vm(struct kvm *kvm)
+{
+ struct kvm_sdei_kvm *ksdei;
+
+ ksdei = kzalloc(sizeof(*ksdei), GFP_KERNEL_ACCOUNT);
+ if (!ksdei)
+ return;
+
+ spin_lock_init(&ksdei->lock);
+ INIT_LIST_HEAD(&ksdei->exposed_events);
+ INIT_LIST_HEAD(&ksdei->registered_events);
+
+ kvm->arch.sdei = ksdei;
+}
+
+void kvm_sdei_create_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_sdei_vcpu *vsdei;
+
+ if (!kvm->arch.sdei)
+ return;
+
+ vsdei = kzalloc(sizeof(*vsdei), GFP_KERNEL_ACCOUNT);
+ if (!vsdei)
+ return;
+
+ spin_lock_init(&vsdei->lock);
+ vsdei->state.masked = 1;
+ vsdei->state.critical_num = KVM_SDEI_INVALID_EVENT;
+ vsdei->state.normal_num = KVM_SDEI_INVALID_EVENT;
+ vsdei->critical_event = NULL;
+ vsdei->normal_event = NULL;
+ INIT_LIST_HEAD(&vsdei->critical_events);
+ INIT_LIST_HEAD(&vsdei->normal_events);
+
+ vcpu->arch.sdei = vsdei;
+}
+
+void kvm_sdei_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+ struct kvm_sdei_vcpu *vsdei = vcpu->arch.sdei;
+
+ if (ksdei && vsdei) {
+ spin_lock(&ksdei->lock);
+ spin_lock(&vsdei->lock);
+ remove_all_vcpu_events(vcpu, KVM_SDEI_INVALID_EVENT);
+ spin_unlock(&vsdei->lock);
+ spin_unlock(&ksdei->lock);
+
+ kfree(vsdei);
+ vcpu->arch.sdei = NULL;
+ }
+}
+
+void kvm_sdei_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_sdei_kvm *ksdei = kvm->arch.sdei;
+
+ if (ksdei) {
+ spin_lock(&ksdei->lock);
+ remove_all_registered_events(kvm);
+ remove_all_exposed_events(kvm);
+ spin_unlock(&ksdei->lock);
+
+ kfree(ksdei);
+ kvm->arch.sdei = NULL;
+ }
+}
diff --git a/include/uapi/linux/arm_sdei.h b/include/uapi/linux/arm_sdei.h
index af0630ba5437..39bcf6dbbea0 100644
--- a/include/uapi/linux/arm_sdei.h
+++ b/include/uapi/linux/arm_sdei.h
@@ -22,8 +22,10 @@
#define SDEI_1_0_FN_SDEI_PE_UNMASK SDEI_1_0_FN(0x0C)
#define SDEI_1_0_FN_SDEI_INTERRUPT_BIND SDEI_1_0_FN(0x0D)
#define SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE SDEI_1_0_FN(0x0E)
+#define SDEI_1_1_FN_SDEI_EVENT_SIGNAL SDEI_1_0_FN(0x0F)
#define SDEI_1_0_FN_SDEI_PRIVATE_RESET SDEI_1_0_FN(0x11)
#define SDEI_1_0_FN_SDEI_SHARED_RESET SDEI_1_0_FN(0x12)
+#define SDEI_1_1_FN_SDEI_FEATURES SDEI_1_0_FN(0x30)

#define SDEI_VERSION_MAJOR_SHIFT 48
#define SDEI_VERSION_MAJOR_MASK 0x7fff
--
2.23.0


2022-03-23 18:14:38

by Oliver Upton

Subject: Re: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

Hi Gavin,

On Tue, Mar 22, 2022 at 04:06:50PM +0800, Gavin Shan wrote:
> Software Delegated Exception Interface (SDEI) provides a mechanism for
> registering and servicing system events. Those system events are high
> priority events, which must be serviced immediately. It's going to be
> used by Asynchronous Page Fault (APF) to deliver notification from KVM
> to guest. It's noted that SDEI is defined by ARM DEN0054C specification.

I'm guessing that you're using linked lists for stitching all of this
together because the specification provides for 24 bits of event
encoding. However, it seems that there will be a finite number of events
in KVM. So the APF stuff and a software signaled event.

Given that the number of events in KVM is rather small, would it make
more sense to do away with the overhead of linked lists and having the
state just represented in a static or allocated array? I think you can
cram all of the VM scoped event state into a single structure and hang
the implementation off of that.
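
To illustrate (rough sketch only; the names here are invented, not taken
from the series), the VM-scoped state could collapse into something like:

#define KVM_NR_SDEI_EVENTS	2	/* SW-signaled + async PF */

struct kvm_sdei_event {
	unsigned int	num;
	u8		priority;
	bool		registered;
	bool		enabled;
	u64		ep_address;
	u64		ep_arg;
};

struct kvm_sdei {
	spinlock_t		lock;
	struct kvm_sdei_event	events[KVM_NR_SDEI_EVENTS];
};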

> This introduces SDEI virtualization infrastructure where the SDEI events
> are registered and manipulated by the guest through hypercall. The SDEI
> event is delivered to one specific vCPU by KVM once it's raised. This
> introduces data structures to represent the needed objects to support
> the feature, which is highlighted as below. As those objects could be
> migrated between VMs, these data structures are partially exposed to
> user space.
>
> * kvm_sdei_exposed_event
> The exposed events are determined and added by VMM through ioctl
> interface. Only the exposed events can be registered from the
> guest.
>
> * kvm_sdei_registered_event
> The events that have been registered from the guest through the
> SDEI_1_0_FN_SDEI_EVENT_REGISTER hypercall.
>
> * kvm_sdei_vcpu_event
> The events that have been delivered to the target vCPU.
>
> * kvm_sdei_vcpu
> Used to save the preempted context when the SDEI event is serviced
> and delivered. After the SDEI event handling is completed, the
> execution is resumed from the preempted context.
>
> * kvm_sdei_kvm
> Place holder for the exposed and registered events.

It might be a good idea to expand these a bit and move them into
comments on each of the structures.

> The error of SDEI_NOT_SUPPORTED is returned for all SDEI hypercalls for
> now. They will be supported in the subsequent patches.
>
> Signed-off-by: Gavin Shan <[email protected]>
> ---
> arch/arm64/include/asm/kvm_host.h | 3 +
> arch/arm64/include/asm/kvm_sdei.h | 171 +++++++++++++
> arch/arm64/include/uapi/asm/kvm.h | 1 +
> arch/arm64/include/uapi/asm/kvm_sdei_state.h | 72 ++++++
> arch/arm64/kvm/Makefile | 2 +-
> arch/arm64/kvm/arm.c | 8 +
> arch/arm64/kvm/hypercalls.c | 21 ++
> arch/arm64/kvm/sdei.c | 244 +++++++++++++++++++
> include/uapi/linux/arm_sdei.h | 2 +
> 9 files changed, 523 insertions(+), 1 deletion(-)
> create mode 100644 arch/arm64/include/asm/kvm_sdei.h
> create mode 100644 arch/arm64/include/uapi/asm/kvm_sdei_state.h
> create mode 100644 arch/arm64/kvm/sdei.c
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 031e3a2537fc..5d37e046a458 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -113,6 +113,8 @@ struct kvm_arch {
> /* Interrupt controller */
> struct vgic_dist vgic;
>
> + struct kvm_sdei_kvm *sdei;
> +

nit: avoid repeating 'kvm'. struct kvm_sdei should be descriptive enough
:)

> /* Mandated version of PSCI */
> u32 psci_version;
>
> @@ -338,6 +340,7 @@ struct kvm_vcpu_arch {
> * Anything that is not used directly from assembly code goes
> * here.
> */
> + struct kvm_sdei_vcpu *sdei;
>

nit: put your scoping tokens at the beginning of a symbol name, so
'struct kvm_vcpu_sdei'.

[...]

> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> index 202b8c455724..3c20fee72bb4 100644
> --- a/arch/arm64/kvm/hypercalls.c
> +++ b/arch/arm64/kvm/hypercalls.c
> @@ -5,6 +5,7 @@
> #include <linux/kvm_host.h>
>
> #include <asm/kvm_emulate.h>
> +#include <asm/kvm_sdei.h>
>
> #include <kvm/arm_hypercalls.h>
> #include <kvm/arm_psci.h>
> @@ -151,6 +152,26 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
> case ARM_SMCCC_TRNG_RND32:
> case ARM_SMCCC_TRNG_RND64:
> return kvm_trng_call(vcpu);
> + case SDEI_1_0_FN_SDEI_VERSION:
> + case SDEI_1_0_FN_SDEI_EVENT_REGISTER:
> + case SDEI_1_0_FN_SDEI_EVENT_ENABLE:
> + case SDEI_1_0_FN_SDEI_EVENT_DISABLE:
> + case SDEI_1_0_FN_SDEI_EVENT_CONTEXT:
> + case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
> + case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
> + case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER:
> + case SDEI_1_0_FN_SDEI_EVENT_STATUS:
> + case SDEI_1_0_FN_SDEI_EVENT_GET_INFO:
> + case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET:
> + case SDEI_1_0_FN_SDEI_PE_MASK:
> + case SDEI_1_0_FN_SDEI_PE_UNMASK:
> + case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
> + case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE:
> + case SDEI_1_1_FN_SDEI_EVENT_SIGNAL:
> + case SDEI_1_0_FN_SDEI_PRIVATE_RESET:
> + case SDEI_1_0_FN_SDEI_SHARED_RESET:
> + case SDEI_1_1_FN_SDEI_FEATURES:

Consider only adding switch statements for hypercalls when they're
actually implemented.

Additionally, while this isn't directly related to your patch, I do have
a gripe about kvm_hvc_call_handler(). It is really ugly that we
enumerate the specific hypercalls we support, and otherwise fall through
to PSCI.

IMO, a cleaner approach would be to have kvm_hvc_call_handler() simply
route a particular service range/service owner to the appropriate
handler. We can then terminate individual hypercalls in those handlers,
avoiding a catch-all switch like the one we have currently.
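
Roughly (untested sketch; the per-owner handler names are made up):

int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
{
	u32 func = smccc_get_function(vcpu);

	switch (ARM_SMCCC_OWNER_NUM(func)) {
	case ARM_SMCCC_OWNER_STANDARD:
		/* PSCI and SDEI both live here; dispatch on function range */
		return kvm_std_svc_call(vcpu);
	case ARM_SMCCC_OWNER_STANDARD_HYP:
		return kvm_std_hyp_svc_call(vcpu);
	case ARM_SMCCC_OWNER_VENDOR_HYP:
		return kvm_vendor_hyp_svc_call(vcpu);
	default:
		return kvm_smccc_default_call(vcpu);
	}
}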

--
Thanks,
Oliver

2022-03-24 02:11:33

by Oliver Upton

Subject: Re: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

Hi Gavin,

More comments, didn't see exactly how all of these structures are
getting used.

On Tue, Mar 22, 2022 at 04:06:50PM +0800, Gavin Shan wrote:

[...]

> diff --git a/arch/arm64/include/uapi/asm/kvm_sdei_state.h b/arch/arm64/include/uapi/asm/kvm_sdei_state.h
> new file mode 100644
> index 000000000000..b14844230117
> --- /dev/null
> +++ b/arch/arm64/include/uapi/asm/kvm_sdei_state.h
> @@ -0,0 +1,72 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Definitions of various KVM SDEI event states.
> + *
> + * Copyright (C) 2022 Red Hat, Inc.
> + *
> + * Author(s): Gavin Shan <[email protected]>
> + */
> +
> +#ifndef _UAPI__ASM_KVM_SDEI_STATE_H
> +#define _UAPI__ASM_KVM_SDEI_STATE_H
> +
> +#ifndef __ASSEMBLY__
> +#include <linux/types.h>
> +
> +/*
> + * The software signaled event is the default one, which is
> + * defined in v1.1 specification.
> + */
> +#define KVM_SDEI_INVALID_EVENT 0xFFFFFFFF

Isn't the constraint that bit 31 must be zero? (DEN 0054C 4.4 "Event
number allocation")

> +#define KVM_SDEI_DEFAULT_EVENT 0
> +
> +#define KVM_SDEI_MAX_VCPUS 512 /* Aligned to 64 */
> +#define KVM_SDEI_MAX_EVENTS 128

I would *strongly* recommend against having these limits. I find the
vCPU limit especially concerning, because we're making KVM_MAX_VCPUS
ABI, which it definitely is not. Anything that deals with a vCPU should
be accessed through a vCPU FD (and thus agnostic to the maximum number
of vCPUs) to avoid such a complication.

> +struct kvm_sdei_exposed_event_state {
> + __u64 num;
> +
> + __u8 type;
> + __u8 signaled;
> + __u8 priority;
> + __u8 padding[5];
> + __u64 notifier;

Wait, isn't this a kernel function pointer!?

> +};
> +
> +struct kvm_sdei_registered_event_state {

You should fold these fields together with kvm_sdei_exposed_event_state
into a single 'kvm_sdei_event' structure:

> + __u64 num;
> +
> + __u8 route_mode;
> + __u8 padding[3];
> + __u64 route_affinity;

And these shouldn't be UAPI at the VM scope. Each of these properties
could be accessed via a synthetic/'pseudo-firmware' register on a vCPU FD:

> + __u64 ep_address[KVM_SDEI_MAX_VCPUS];
> + __u64 ep_arg[KVM_SDEI_MAX_VCPUS];
> + __u64 registered[KVM_SDEI_MAX_VCPUS/64];
> + __u64 enabled[KVM_SDEI_MAX_VCPUS/64];
> + __u64 unregister_pending[KVM_SDEI_MAX_VCPUS/64];
> +};
> +
> +struct kvm_sdei_vcpu_event_state {
> + __u64 num;
> +
> + __u32 event_count;
> + __u32 padding;
> +};
> +
> +struct kvm_sdei_vcpu_regs_state {
> + __u64 regs[18];
> + __u64 pc;
> + __u64 pstate;
> +};
> +
> +struct kvm_sdei_vcpu_state {

Same goes here, I strongly recommend you try to expose this through the
KVM_{GET,SET}_ONE_REG interface if at all possible since it
significantly reduces the UAPI burden, both on KVM to maintain it and
VMMs to actually use it.
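
From the VMM side it boils down to the usual save/restore loop over
KVM_GET_REG_LIST, e.g. (userspace sketch; the register ID is hypothetical):

	__u64 val;
	struct kvm_one_reg reg = {
		.id   = sdei_vcpu_state_reg_id,	/* hypothetical pseudo-register */
		.addr = (__u64)&val,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);	/* save on the source */
	/* ... */
	ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);	/* restore on the destination */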

--
Thanks,
Oliver

2022-03-24 08:08:17

by Gavin Shan

Subject: Re: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

Hi Oliver,

On 3/23/22 6:43 AM, Oliver Upton wrote:
> On Tue, Mar 22, 2022 at 04:06:50PM +0800, Gavin Shan wrote:
>> Software Delegated Exception Interface (SDEI) provides a mechanism for
>> registering and servicing system events. Those system events are high
>> priority events, which must be serviced immediately. It's going to be
>> used by Asynchronous Page Fault (APF) to deliver notification from KVM
>> to guest. It's noted that SDEI is defined by ARM DEN0054C specification.
>
> I'm guessing that you're using linked lists for stitching all of this
> together because the specification provides for 24 bits of event
> encoding. However, it seems that there will be a finite number of events
> in KVM. So the APF stuff and a software signaled event.
>
> Given that the number of events in KVM is rather small, would it make
> more sense to do away with the overhead of linked lists and having the
> state just represented in a static or allocated array? I think you can
> cram all of the VM scoped event state into a single structure and hang
> the implementation off of that.
>

Yes, the number of events in KVM is small. Including the events for Async
PF and the software signaled event, 8 events would be enough currently.
Meanwhile, there are several types of objects for the various events. Some
of them can be put into a static array, while the rest might need a static
array of pointers to avoid the linked list:

struct kvm_sdei_exposed_event/state      on struct kvm_arch
    size: 24 bytes
    static array, 8 entries
struct kvm_sdei_registered_event/state   on struct kvm_arch
    size: 9KB
    static array of pointers, still need to allocate them dynamically, 8 entries
struct kvm_sdei_vcpu_event/state         on struct kvm_vcpu_arch
    size: 16 bytes
    static array, 8 entries
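
Roughly like below (illustrative sketch only, reusing the structure names
from this series and with KVM_SDEI_MAX_EVENTS reduced to 8):

struct kvm_sdei {
	struct kvm_sdei_exposed_event		exposed_events[KVM_SDEI_MAX_EVENTS];
	struct kvm_sdei_registered_event	*registered_events[KVM_SDEI_MAX_EVENTS];
};

struct kvm_sdei_vcpu {
	struct kvm_sdei_vcpu_event		vcpu_events[KVM_SDEI_MAX_EVENTS];
	/* plus the preempted context, as before */
};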


>> This introduces SDEI virtualization infrastructure where the SDEI events
>> are registered and manipulated by the guest through hypercall. The SDEI
>> event is delivered to one specific vCPU by KVM once it's raised. This
>> introduces data structures to represent the needed objects to support
>> the feature, which is highlighted as below. As those objects could be
>> migrated between VMs, these data structures are partially exposed to
>> user space.
>>
>> * kvm_sdei_exposed_event
>> The exposed events are determined and added by VMM through ioctl
>> interface. Only the exposed events can be registered from the
>> guest.
>>
>> * kvm_sdei_registered_event
>> The events that have been registered from the guest through the
>> SDEI_1_0_FN_SDEI_EVENT_REGISTER hypercall.
>>
>> * kvm_sdei_vcpu_event
>> The events that have been delivered to the target vCPU.
>>
>> * kvm_sdei_vcpu
>> Used to save the preempted context when the SDEI event is serviced
>> and delivered. After the SDEI event handling is completed, the
>> execution is resumed from the preempted context.
>>
>> * kvm_sdei_kvm
>> Place holder for the exposed and registered events.
>
> It might be a good idea to expand these a bit and move them into
> comments on each of the structures.
>

Sure, I will do in next respin.

>> The error of SDEI_NOT_SUPPORTED is returned for all SDEI hypercalls for
>> now. They will be supported in the subsequent patches.
>>
>> Signed-off-by: Gavin Shan <[email protected]>
>> ---
>> arch/arm64/include/asm/kvm_host.h | 3 +
>> arch/arm64/include/asm/kvm_sdei.h | 171 +++++++++++++
>> arch/arm64/include/uapi/asm/kvm.h | 1 +
>> arch/arm64/include/uapi/asm/kvm_sdei_state.h | 72 ++++++
>> arch/arm64/kvm/Makefile | 2 +-
>> arch/arm64/kvm/arm.c | 8 +
>> arch/arm64/kvm/hypercalls.c | 21 ++
>> arch/arm64/kvm/sdei.c | 244 +++++++++++++++++++
>> include/uapi/linux/arm_sdei.h | 2 +
>> 9 files changed, 523 insertions(+), 1 deletion(-)
>> create mode 100644 arch/arm64/include/asm/kvm_sdei.h
>> create mode 100644 arch/arm64/include/uapi/asm/kvm_sdei_state.h
>> create mode 100644 arch/arm64/kvm/sdei.c
>>
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 031e3a2537fc..5d37e046a458 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -113,6 +113,8 @@ struct kvm_arch {
>> /* Interrupt controller */
>> struct vgic_dist vgic;
>>
>> + struct kvm_sdei_kvm *sdei;
>> +
>
> nit: avoid repeating 'kvm'. struct kvm_sdei should be descriptive enough
> :)
>

Indeed, "struct kvm_sdei" is better :)

>> /* Mandated version of PSCI */
>> u32 psci_version;
>>
>> @@ -338,6 +340,7 @@ struct kvm_vcpu_arch {
>> * Anything that is not used directly from assembly code goes
>> * here.
>> */
>> + struct kvm_sdei_vcpu *sdei;
>>
>
> nit: put your scoping tokens at the beginning of a symbol name, so
> 'struct kvm_vcpu_sdei'.
>
> [...]
>

Yep, "struct kvm_vcpu_sdei" is the one I will have in next respin :)

>> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
>> index 202b8c455724..3c20fee72bb4 100644
>> --- a/arch/arm64/kvm/hypercalls.c
>> +++ b/arch/arm64/kvm/hypercalls.c
>> @@ -5,6 +5,7 @@
>> #include <linux/kvm_host.h>
>>
>> #include <asm/kvm_emulate.h>
>> +#include <asm/kvm_sdei.h>
>>
>> #include <kvm/arm_hypercalls.h>
>> #include <kvm/arm_psci.h>
>> @@ -151,6 +152,26 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
>> case ARM_SMCCC_TRNG_RND32:
>> case ARM_SMCCC_TRNG_RND64:
>> return kvm_trng_call(vcpu);
>> + case SDEI_1_0_FN_SDEI_VERSION:
>> + case SDEI_1_0_FN_SDEI_EVENT_REGISTER:
>> + case SDEI_1_0_FN_SDEI_EVENT_ENABLE:
>> + case SDEI_1_0_FN_SDEI_EVENT_DISABLE:
>> + case SDEI_1_0_FN_SDEI_EVENT_CONTEXT:
>> + case SDEI_1_0_FN_SDEI_EVENT_COMPLETE:
>> + case SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME:
>> + case SDEI_1_0_FN_SDEI_EVENT_UNREGISTER:
>> + case SDEI_1_0_FN_SDEI_EVENT_STATUS:
>> + case SDEI_1_0_FN_SDEI_EVENT_GET_INFO:
>> + case SDEI_1_0_FN_SDEI_EVENT_ROUTING_SET:
>> + case SDEI_1_0_FN_SDEI_PE_MASK:
>> + case SDEI_1_0_FN_SDEI_PE_UNMASK:
>> + case SDEI_1_0_FN_SDEI_INTERRUPT_BIND:
>> + case SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE:
>> + case SDEI_1_1_FN_SDEI_EVENT_SIGNAL:
>> + case SDEI_1_0_FN_SDEI_PRIVATE_RESET:
>> + case SDEI_1_0_FN_SDEI_SHARED_RESET:
>> + case SDEI_1_1_FN_SDEI_FEATURES:
>
> Consider only adding switch statements for hypercalls when they're
> actually implemented.
>
> Additionally, while this isn't directly related to your patch, I do have
> a gripe about kvm_hvc_call_handler(). It is really ugly that we
> enumerate the specific hypercalls we support, and otherwise fall through
> to PSCI.
>
> IMO, a cleaner approach would be to have kvm_hvc_call_handler() simply
> route a particular service range/service owner to the appropriate
> handler. We can then terminate individual hypercalls in those handlers,
> avoiding a catch-all switch such as this one is currently.
>

Yes, I agree. I can have a separate patch as preparatory work to
route each range of hypercalls to its owner for further handling.
In this way, the range of SDEI hypercalls can be routed to its own
handler. I will figure it out in the next respin.

Thanks,
Gavin


2022-03-24 18:24:38

by Gavin Shan

Subject: Re: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

Hi Oliver,

On 3/24/22 1:11 AM, Oliver Upton wrote:
> More comments, didn't see exactly how all of these structures are
> getting used.
>

Ok, thanks for your review and comments.

> On Tue, Mar 22, 2022 at 04:06:50PM +0800, Gavin Shan wrote:
>
> [...]
>
>> diff --git a/arch/arm64/include/uapi/asm/kvm_sdei_state.h b/arch/arm64/include/uapi/asm/kvm_sdei_state.h
>> new file mode 100644
>> index 000000000000..b14844230117
>> --- /dev/null
>> +++ b/arch/arm64/include/uapi/asm/kvm_sdei_state.h
>> @@ -0,0 +1,72 @@
>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>> +/*
>> + * Definitions of various KVM SDEI event states.
>> + *
>> + * Copyright (C) 2022 Red Hat, Inc.
>> + *
>> + * Author(s): Gavin Shan <[email protected]>
>> + */
>> +
>> +#ifndef _UAPI__ASM_KVM_SDEI_STATE_H
>> +#define _UAPI__ASM_KVM_SDEI_STATE_H
>> +
>> +#ifndef __ASSEMBLY__
>> +#include <linux/types.h>
>> +
>> +/*
>> + * The software signaled event is the default one, which is
>> + * defined in v1.1 specification.
>> + */
>> +#define KVM_SDEI_INVALID_EVENT 0xFFFFFFFF
>
> Isn't the constraint that bit 31 must be zero? (DEN 0054C 4.4 "Event
> number allocation")
>

Yes, bit 31 of the event number should be zero, so this is an invalid
event number. It is used by struct kvm_sdei_vcpu_state::critical_num
and normal_num to indicate whether an event is being handled on the
corresponding vcpu. When those fields are set to KVM_SDEI_INVALID_EVENT,
no event is being handled on the vcpu.
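
For example, a check like this (illustrative helper, not part of the patch)
tells whether the vcpu is currently handling an event:

static inline bool kvm_sdei_vcpu_handling_event(struct kvm_sdei_vcpu *vsdei)
{
	/* KVM_SDEI_INVALID_EVENT means no event is being handled */
	return vsdei->state.critical_num != KVM_SDEI_INVALID_EVENT ||
	       vsdei->state.normal_num != KVM_SDEI_INVALID_EVENT;
}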

>> +#define KVM_SDEI_DEFAULT_EVENT 0
>> +
>> +#define KVM_SDEI_MAX_VCPUS 512 /* Aligned to 64 */
>> +#define KVM_SDEI_MAX_EVENTS 128
>
> I would *strongly* recommend against having these limits. I find the
> vCPU limit especially concerning, because we're making KVM_MAX_VCPUS
> ABI, which it definitely is not. Anything that deals with a vCPU should
> be accessed through a vCPU FD (and thus agnostic to the maximum number
> of vCPUs) to avoid such a complication.
>

For KVM_SDEI_DEFAULT_EVENT, which corresponds to the software signaled
event: as you suggested on PATCH[15/22], we can't assume its usage.
I will define it as SDEI_SW_SIGNALED_EVENT in uapi/linux/arm_sdei.h.

For KVM_SDEI_MAX_EVENTS, it will be moved from this header file to
kvm_sdei.h after static arrays to hold the data structures or their
pointers are used, as you suggested early for this patch (PATCH[02/22]).

There are two types of (SDEI) events: shared and private. A private
event can be registered independently on each vcpu. It also means
the entry point address and argument, corresponding to @ep_address
and @ep_arg in struct kvm_sdei_registered_event, can differ on
the individual vcpus. However, for the shared event, the registered/enabled
states and the entry point address and argument are the same on all vcpus.
KVM_SDEI_MAX_VCPUS was introduced so that the same data structure can
represent both shared and private events.

If data belonging to a particular vcpu should be accessed through the
vcpu fd, then we need to split or reorganize the data structures as below.

/*
* The events are exposed through ioctl interface or similar
* mechanism (synthetic system registers?) before they can be
* registered. struct kvm_sdei_exposed_event instance is reserved
* from the kvm's static array on receiving the ioctl command
* from VMM.
*/
struct kvm_sdei_exposed_event {
	__u32 num;

	__u8 type;
	__u8 signaled;
	__u8 priority;
	__u8 padding;
};

/*
* The struct kvm_sdei_registered_event instance is allocated or
* reserved from the static array. For the shared event, the instance
* is linked to kvm, but it will be allocated or reserved from vcpu's
* static array and linked to the vcpu if it's a private event.
*
* The instance is only allocated and reserved upon SDEI_EVENT_REGISTER
* hypercall.
*/
struct kvm_sdei_registered_event {
	__u32 num;

#define KVM_SDEI_EVENT_STATE_REGISTERED		(1 << 0)
#define KVM_SDEI_EVENT_STATE_ENABLE		(1 << 1)
#define KVM_SDEI_EVENT_STATE_UNREGISTER_PENDING	(1 << 2)
	__u8 state;
	__u8 route_mode;
	__u8 padding[2];
	__u64 route_affinity;
	__u64 ep_address;
	__u64 ep_arg;
	__u64 notifier;
};

>> +struct kvm_sdei_exposed_event_state {
>> + __u64 num;
>> +
>> + __u8 type;
>> + __u8 signaled;
>> + __u8 priority;
>> + __u8 padding[5];
>> + __u64 notifier;
>
> Wait, isn't this a kernel function pointer!?
>

Yeah, it is a kernel function pointer, used by Async PF to know if
the corresponding event has been handled or not. Async PF can cancel
a previously injected event for performance reasons. Either Async PF
or SDEI needs to migrate it. To keep SDEI transparent enough to Async PF,
SDEI is responsible for its migration.

>> +};
>> +
>> +struct kvm_sdei_registered_event_state {
>
> You should fold these fields together with kvm_sdei_exposed_event_state
> into a single 'kvm_sdei_event' structure:
>

@route_mode and @route_affinity can't be configured or modified until
the event is registered. Besides, they're only valid for shared
events; private events have no routing needs. It means those two fields
should be part of struct kvm_sdei_registered_event instead of
kvm_sdei_exposed_event.


>> + __u64 num;
>> +
>> + __u8 route_mode;
>> + __u8 padding[3];
>> + __u64 route_affinity;
>
> And these shouldn't be UAPI at the VM scope. Each of these properties
> could be accessed via a synthetic/'pseudo-firmware' register on a vCPU FD:
>

They're accessed through the vcpu or kvm fd depending on the event type.
For the VM-owned shared event, they're accessed through the KVM fd. For the
vcpu-owned private event, they're accessed through the vcpu fd.

I'm not sure I catch the idea of a synthetic register, so let me confirm.
If I'm correct, you're talking about the "IMPLEMENTATION DEFINED"
system registers, whose OP0 and CRn are 0b11 and 0b1x11. If two implementation
defined registers can be adopted, I don't think we need to expose anything
through the ABI. All the operations and the needed data could be passed
through those system registers.

SYS_REG_SDEI_COMMAND
	Receives commands, e.g. to expose an event, register an event,
	change the vcpu state, and so on.
SYS_REG_SDEI_DATA
	The data corresponding to the received command.

However, I'm not positive that a synthetic register can be used here. When
Mark Rutland reviewed "PATCH[RFC v1] Async PF support", he pointed out that
implementation defined registers can only be used in a very limited way. At
that time, a set of implementation defined registers was defined to identify
the asynchronous page faults and to access the control data block. However,
the idea was rejected. Later on, Marc recommended SDEI for Async PF.

https://www.spinics.net/lists/kvm-arm/msg40315.html


>> + __u64 ep_address[KVM_SDEI_MAX_VCPUS];
>> + __u64 ep_arg[KVM_SDEI_MAX_VCPUS];
>> + __u64 registered[KVM_SDEI_MAX_VCPUS/64];
>> + __u64 enabled[KVM_SDEI_MAX_VCPUS/64];
>> + __u64 unregister_pending[KVM_SDEI_MAX_VCPUS/64];
>> +};
>> +
>> +struct kvm_sdei_vcpu_event_state {
>> + __u64 num;
>> +
>> + __u32 event_count;
>> + __u32 padding;
>> +};
>> +
>> +struct kvm_sdei_vcpu_regs_state {
>> + __u64 regs[18];
>> + __u64 pc;
>> + __u64 pstate;
>> +};
>> +
>> +struct kvm_sdei_vcpu_state {
>
> Same goes here, I strongly recommend you try to expose this through the
> KVM_{GET,SET}_ONE_REG interface if at all possible since it
> significantly reduces the UAPI burden, both on KVM to maintain it and
> VMMs to actually use it.
>

Yeah, it would be much more convenient to use implementation defined
registers here. However, I'm not positive that we can do this. Please see
the details I provided above :)

Thanks,
Gavin


2022-03-25 18:43:54

by Gavin Shan

Subject: Re: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

Hi Oliver,

On 3/24/22 5:04 PM, Oliver Upton wrote:
> On Thu, Mar 24, 2022 at 02:54:00PM +0800, Gavin Shan wrote:
>>>> +#define KVM_SDEI_DEFAULT_EVENT 0
>>>> +
>>>> +#define KVM_SDEI_MAX_VCPUS 512 /* Aligned to 64 */
>>>> +#define KVM_SDEI_MAX_EVENTS 128
>>>
>>> I would *strongly* recommend against having these limits. I find the
>>> vCPU limit especially concerning, because we're making KVM_MAX_VCPUS
>>> ABI, which it definitely is not. Anything that deals with a vCPU should
>>> be accessed through a vCPU FD (and thus agnostic to the maximum number
>>> of vCPUs) to avoid such a complication.
>>>
>>
>> For KVM_SDEI_DEFAULT_EVENT, which corresponds to the software signaled
>> event. As you suggested on PATCH[15/22], we can't assume its usage.
>> I will define it with SDEI_SW_SIGNALED_EVENT in uapi/linux/arm_sdei.h
>>
>> For KVM_SDEI_MAX_EVENTS, it will be moved from this header file to
>> kvm_sdei.h after static arrays to hold the data structures or their
>> pointers are used, as you suggested early for this patch (PATCH[02/22]).
>>
>> There are two types of (SDEI) events: shared and private. For the private
>> event, it can be registered independently from the vcpus. It also means
>> the address and argument for the entry points, corresponding to @ep_address
>> and @ep_arg in struct kvm_sdei_registered_event, can be different on
>> the individual vcpus. However, all the registered/enabled states and
>> the entry point address and argument are same on all vcpus for the shared
>> event. KVM_SDEI_MAX_VCPUS was introduced to use same data structure to
>> represent both shared and private event.
>
> You're providing a great deal of abstraction around the SDEI
> specification, but I really am unconvinced that KVM needs that. This
> series needs to add support for a single SDEI event (event 0) and async
> PF to follow. Since we are going to support a static set of events under
> KVM I believe a lot of the complexity in this design should fade away.
>

Yeah, I think we can drop the functionality to support the shared
event since both of the needed events are exposed by KVM as private
events. Please refer to the reply below for more details.

>>>> +struct kvm_sdei_exposed_event_state {
>>>> + __u64 num;
>>>> +
>>>> + __u8 type;
>>>> + __u8 signaled;
>>>> + __u8 priority;
>>>> + __u8 padding[5];
>>>> + __u64 notifier;
>>>
>>> Wait, isn't this a kernel function pointer!?
>>>
>>
>> Yeah, it is a kernel function pointer, used by Async PF to know if
>> the corresponding event has been handled or not. Async PF can cancel
>> the previously injected event for performance concerns. Either Async PF
>> or SDEI needs to migrate it. To keep SDEI transparent enough to Async PF,
>> SDEI is responsible for its migration.
>
> But this is a UAPI header, why would we even consider giving userspace a
> window into the kernel like this? Couldn't userspace crash the kernel by
> writing whatever it wants to this field, knowing that it will call it as
> a function pointer?
>
> Security aside, there's no guarantee that a function winds up at the
> same address between compiler versions or kernel versions.
>
> Overall, I don't think that userspace should have the ability to add
> arbitrary SDEI events. KVM takes ownership of its own vendor-specific
> ABI in patch 3/22 by declaring its vendor identifier, so any new events
> we support must remain within KVM. There is going to be some state that
> will need to be migrated for KVM's SDEI events, that ought to be
> surfaced to userspace through the KVM_{GET,SET}_ONE_REG ioctls.
>
> Given that there isn't much userspace involvement to make SDEI
> work, do you think it would be possible to drop the proposed UAPI from
> your series and work on enabling software-signaled SDEI events within
> KVM first? By this I mean a VM running under KVM shouldn't require any
> ioctls to make it work.
>
> In so doing, we can discover exactly what the mechanics look like in KVM
> and only then talk about the necessary UAPI to migrate state. One piece
> of the mechanics that is top of mind which I'd like to see addressed is
> the use of linked lists and the preallocations that have been made in
> structures. KVM will know how many events exist at compile time, so we
> can represent these statically.
>

For @notifier, it is set in the kernel on the source side and migrated to
the kernel on the destination side. It's really a security concern, so I
will drop it in the next respin.

I plan to make the following changes in the next revision (v6).
Please let me know if there is more to be covered:

(1) Drop the code to support the shared (SDEI) events since the needed
events, either software signaled or Async PF event, are all private.
After that, the private event is the only type to be supported. When
the shared event becomes unsupported, we can simply return SDEI_NOT_SUPPORTED
for hypercall SDEI_EVENT_ROUTING_SET and SDEI_SHARED_RESET.

(2) Drop the code to support migration for now. It means the data
structures defined in arch/arm64/include/uapi/asm/kvm_sdei_state.h
will be folded into arch/arm64/include/asm/kvm_sdei.h. Besides,
all the ioctl commands, including KVM_CAP_ARM_SDEI, will be dropped
as well.

(3) As we discussed before, the linked lists will be replaced by the
static arrays or preallocation. The data structures will be amended
like below.

(3.1) In arch/arm64/include/asm/kvm_sdei.h

struct kvm_sdei_exposed_event {
	unsigned int num;
	unsigned char type;		/* reserved, should be 'private' */
	unsigned char signaled;
	unsigned char priority;
};

struct kvm_sdei_registered_event {
	struct kvm_sdei_exposed_event *exposed_event;
	unsigned long ep_address;
	unsigned long ep_arg;
#define KVM_SDEI_EVENT_STATE_REGISTERED		(1UL << 0)
#define KVM_SDEI_EVENT_STATE_ENABLED		(1UL << 1)
#define KVM_SDEI_EVENT_STATE_UNREGISTER_PENDING	(1UL << 2)
	unsigned long state;

	unsigned long event_count;	/* number of events pending for handling */
};

struct kvm_sdei_context {
	struct kvm_sdei_registered_event *registered_event;
	unsigned long regs[18];
	unsigned long pc;
	unsigned long pstate;
};

/* We don't even need a lock */
struct kvm_sdei_vcpu {
	unsigned char masked;
	struct kvm_sdei_registered_event *registered_event;
	struct kvm_sdei_context context[SDEI_EVENT_PRIORITY_CRITICAL + 1];
};

/* struct kvm_sdei won't exist any more */

(3.2) In arch/arm64/kvm/sdei.c

static struct kvm_sdei_exposed_event exposed_events[] = {
	{ .num      = SDEI_EVENT_SW_SIGNALED,	/* defined in include/uapi/linux/arm_sdei.h */
	  .type     = SDEI_EVENT_TYPE_PRIVATE,
	  .signaled = 1,
	  .priority = SDEI_EVENT_PRIORITY_NORMAL,
	},
	{ .num      = SDEI_EVENT_ASYNC_PF,	/* defined in include/asm/kvm_sdei.h */
	  .type     = SDEI_EVENT_TYPE_PRIVATE,
	  .signaled = 1,
	  .priority = SDEI_EVENT_PRIORITY_CRITICAL,
	},
};

In kvm_sdei_create_vcpu(), the events are pre-allocated. They will be destroyed
in kvm_sdei_destroy_vcpu().

	struct kvm_vcpu_arch::sdei =
		kzalloc(sizeof(struct kvm_sdei_vcpu), GFP_KERNEL_ACCOUNT);
	struct kvm_sdei_vcpu::registered_event =
		kcalloc(ARRAY_SIZE(exposed_events),
			sizeof(struct kvm_sdei_registered_event), GFP_KERNEL_ACCOUNT);

In kvm_sdei_hypercall(), SDEI_NOT_SUPPORTED will be returned on hypercall
SDEI_EVENT_ROUTING_SET and SDEI_SHARED_RESET.

(4) Stuff to be considered or discussed in the future: migration and its
mechanism. We probably don't want to support the shared event. The function
of an event is to notify the guest kernel from the host. In this regard, the
shared event can be replaced by the private one. With the above changes in
(1)/(2)/(3), we don't have any VM-scoped properties or states; all the
properties and states are associated with the vCPU. That will help to adopt
'pseudo-firmware' registers and KVM_{GET,SET}_ONE_REG for migration in the
future.

>>>> +};
>>>> +
>>>> +struct kvm_sdei_registered_event_state {
>>>
>>> You should fold these fields together with kvm_sdei_exposed_event_state
>>> into a single 'kvm_sdei_event' structure:
>>>
>>
>> @route_mode and @route_affinity can't be configured or modified until
>> the event is registered. Besides, they're only valid to the shared
>> events. For private events, they don't have the routing needs. It means
>> those two fields would be part of struct kvm_sdei_registered_event instead
>> of kvm_sdei_exposed_event.
>>
>>
>>>> + __u64 num;
>>>> +
>>>> + __u8 route_mode;
>>>> + __u8 padding[3];
>>>> + __u64 route_affinity;
>>>
>>> And these shouldn't be UAPI at the VM scope. Each of these properties
>>> could be accessed via a synthetic/'pseudo-firmware' register on a vCPU FD:
>>>
>>
>> They're accessed through vcpu or kvm fd depending on what type the event
>> is. For the VM-owned shared event, they're accessed through KVM fd. For the
>> vcpu-owned private event, they're accessed through vcpu fd.
>
> Some of the state that you represent in struct kvm_sdei_registered_event_state
> is really per-vCPU state. Any time that there's data available at a vCPU
> granularity userspace should access it with a vCPU FD.
>

Yeah, I used "struct kvm_sdei_registered_event" to represent both shared and
private events. For a shared event, the states are VM scoped. However,
they're vCPU scoped for a private event. As I suggested above, all the states
become vCPU scoped once the private event is the only supported type.

>> I'm not sure if I catch the idea to have a synthetic register and I'm to
>> confirm. If I'm correct, you're talking about the "IMPLEMENTATION DEFINED"
>> system register, whose OP0 and CRn are 0B11 and 0B1x11. If two implementation
>> defined registers can be adopted, I don't think we need to expose anything
>> through ABI. All the operations and the needed data can be passed through
>> the system registers.
>
> No, I'm not talking about the guest-facing interface. I'm talking about
> what gets exposed to userspace to migrate the VM's state. For parts of
> the guest that do not map to an architectural construct, we've defined
> the concept of a firmware pseudo-register. What that really means is we
> expose a register to userspace that is inaccessible from the guest which
> migrates KVM's nonarchitected state. We are abusing the fact that VMMs
> will save/restore whatever registers are reported on KVM_GET_REG_LIST to
> trick it into migrating KVM state, and it has worked quite nicely to
> avoid adding new ioctls for new features. It also means that older VMMs
> are capable of utilizing new KVM features, so long as they obey the
> prescribed rules.
>

Thanks for the details about the 'firmware pseudo register'. Oliver, is
there an existing one in the current KVM implementation? I would like to see
how it's being used. It's definitely a good idea: those non-architectural
CPU properties can be mapped and migrated in a natural way. I'm not sure if
we have a similar mechanism, or 'firmware pseudo registers', for VM scoped
properties?

Thanks,
Gavin

2022-03-25 19:41:08

by Oliver Upton

Subject: Re: [PATCH v5 02/22] KVM: arm64: Add SDEI virtualization infrastructure

On Thu, Mar 24, 2022 at 02:54:00PM +0800, Gavin Shan wrote:
> > > +#define KVM_SDEI_DEFAULT_EVENT 0
> > > +
> > > +#define KVM_SDEI_MAX_VCPUS 512 /* Aligned to 64 */
> > > +#define KVM_SDEI_MAX_EVENTS 128
> >
> > I would *strongly* recommend against having these limits. I find the
> > vCPU limit especially concerning, because we're making KVM_MAX_VCPUS
> > ABI, which it definitely is not. Anything that deals with a vCPU should
> > be accessed through a vCPU FD (and thus agnostic to the maximum number
> > of vCPUs) to avoid such a complication.
> >
>
> For KVM_SDEI_DEFAULT_EVENT, which corresponds to the software signaled
> event. As you suggested on PATCH[15/22], we can't assume its usage.
> I will define it with SDEI_SW_SIGNALED_EVENT in uapi/linux/arm_sdei.h
>
> For KVM_SDEI_MAX_EVENTS, it will be moved from this header file to
> kvm_sdei.h after static arrays to hold the data structures or their
> pointers are used, as you suggested early for this patch (PATCH[02/22]).
>
> There are two types of (SDEI) events: shared and private. For the private
> event, it can be registered independently from the vcpus. It also means
> the address and argument for the entry points, corresponding to @ep_address
> and @ep_arg in struct kvm_sdei_registered_event, can be different on
> the individual vcpus. However, all the registered/enabled states and
> the entry point address and argument are same on all vcpus for the shared
> event. KVM_SDEI_MAX_VCPUS was introduced to use same data structure to
> represent both shared and private event.

You're providing a great deal of abstraction around the SDEI
specification, but I really am unconvinced that KVM needs that. This
series needs to add support for a single SDEI event (event 0) and async
PF to follow. Since we are going to support a static set of events under
KVM I believe a lot of the complexity in this design should fade away.

> > > +struct kvm_sdei_exposed_event_state {
> > > + __u64 num;
> > > +
> > > + __u8 type;
> > > + __u8 signaled;
> > > + __u8 priority;
> > > + __u8 padding[5];
> > > + __u64 notifier;
> >
> > Wait, isn't this a kernel function pointer!?
> >
>
> Yeah, it is a kernel function pointer, used by Async PF to know if
> the corresponding event has been handled or not. Async PF can cancel
> the previously injected event for performance concerns. Either Async PF
> or SDEI needs to migrate it. To keep SDEI transparent enough to Async PF,
> SDEI is responsible for its migration.

But this is a UAPI header, why would we even consider giving userspace a
window into the kernel like this? Couldn't userspace crash the kernel by
writing whatever it wants to this field, knowing that it will call it as
a function pointer?

Security aside, there's no guarantee that a function winds up at the
same address between compiler versions or kernel versions.

Overall, I don't think that userspace should have the ability to add
arbitrary SDEI events. KVM takes ownership of its own vendor-specific
ABI in patch 3/22 by declaring its vendor identifier, so any new events
we support must remain within KVM. There is going to be some state that
will need to be migrated for KVM's SDEI events, that ought to be
surfaced to userspace through the KVM_{GET,SET}_ONE_REG ioctls.

Given that there isn't much userspace involvement to make SDEI
work, do you think it would be possible to drop the proposed UAPI from
your series and work on enabling software-signaled SDEI events within
KVM first? By this I mean a VM running under KVM shouldn't require any
ioctls to make it work.

In so doing, we can discover exactly what the mechanics look like in KVM
and only then talk about the necessary UAPI to migrate state. One piece
of the mechanics that is top of mind which I'd like to see addressed is
the use of linked lists and the preallocations that have been made in
structures. KVM will know how many events exist at compile time, so we
can represent these statically.

> > > +};
> > > +
> > > +struct kvm_sdei_registered_event_state {
> >
> > You should fold these fields together with kvm_sdei_exposed_event_state
> > into a single 'kvm_sdei_event' structure:
> >
>
> @route_mode and @route_affinity can't be configured or modified until
> the event is registered. Besides, they're only valid to the shared
> events. For private events, they don't have the routing needs. It means
> those two fields would be part of struct kvm_sdei_registered_event instead
> of kvm_sdei_exposed_event.
>
>
> > > + __u64 num;
> > > +
> > > + __u8 route_mode;
> > > + __u8 padding[3];
> > > + __u64 route_affinity;
> >
> > And these shouldn't be UAPI at the VM scope. Each of these properties
> > could be accessed via a synthetic/'pseudo-firmware' register on a vCPU FD:
> >
>
> They're accessed through vcpu or kvm fd depending on what type the event
> is. For the VM-owned shared event, they're accessed through KVM fd. For the
> vcpu-owned private event, they're accessed through vcpu fd.

Some of the state that you represent in struct kvm_sdei_registered_event_state
is really per-vCPU state. Any time that there's data available at a vCPU
granularity userspace should access it with a vCPU FD.

> I'm not sure if I catch the idea to have a synthetic register and I'm to
> confirm. If I'm correct, you're talking about the "IMPLEMENTATION DEFINED"
> system register, whose OP0 and CRn are 0B11 and 0B1x11. If two implementation
> defined registers can be adopted, I don't think we need to expose anything
> through ABI. All the operations and the needed data can be passed through
> the system registers.

No, I'm not talking about the guest-facing interface. I'm talking about
what gets exposed to userspace to migrate the VM's state. For parts of
the guest that do not map to an architectural construct, we've defined
the concept of a firmware pseudo-register. What that really means is we
expose a register to userspace that is inaccessible from the guest which
migrates KVM's nonarchitected state. We are abusing the fact that VMMs
will save/restore whatever registers are reported on KVM_GET_REG_LIST to
trick it into migrating KVM state, and it has worked quite nicely to
avoid adding new ioctls for new features. It also means that older VMMs
are capable of utilizing new KVM features, so long as they obey the
prescribed rules.
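
PSCI is the existing precedent: its version is exposed as a firmware
pseudo-register (roughly, from arch/arm64/include/uapi/asm/kvm.h):

#define KVM_REG_ARM_FW			(0x0014 << KVM_REG_ARM_COPROC_SHIFT)
#define KVM_REG_ARM_FW_REG(r)		(KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
					 KVM_REG_ARM_FW | ((r) & 0xffff))
#define KVM_REG_ARM_PSCI_VERSION	KVM_REG_ARM_FW_REG(0)

kvm_arm_{get,set}_fw_reg() in arch/arm64/kvm/psci.c back it, and it shows up
in KVM_GET_REG_LIST, so VMMs migrate it without any dedicated ioctl.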

--
Thanks,
Oliver