Changes since v4:
- Adjust KVM_CAP_HYPERV_SEND_IPI's number [158]
- Add Roman's Reviewed-bys
Using a hypercall for sending IPIs is faster because it allows specifying
any number of vCPUs (even > 64 with a sparse CPU set), so the whole procedure
takes only one VMEXIT.
Same as PV TLB flush, this allows Windows guests having > 64 vCPUs to boot
on KVM when Hyper-V extensions are enabled.
Vitaly Kuznetsov (5):
KVM: x86: hyperv: enforce vp_index < KVM_MAX_VCPUS
KVM: x86: hyperv: optimize 'all cpus' case in kvm_hv_flush_tlb()
KVM: x86: hyperv: use get_vcpu_by_vpidx() in kvm_hv_flush_tlb()
x86/hyper-v: rename ipi_arg_{ex,non_ex} structures
KVM: x86: hyperv: implement PV IPI send hypercalls
Documentation/virtual/kvm/api.txt | 8 ++
arch/x86/hyperv/hv_apic.c | 12 +--
arch/x86/include/asm/hyperv-tlfs.h | 16 +--
arch/x86/kvm/hyperv.c | 211 +++++++++++++++++++++++++++----------
arch/x86/kvm/trace.h | 42 ++++++++
arch/x86/kvm/x86.c | 1 +
include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 6 +-
8 files changed, 224 insertions(+), 73 deletions(-)
--
2.14.4
We can use 'NULL' to represent the 'all cpus' case in
kvm_make_vcpus_request_mask() and avoid building a vCPU mask with
all vCPUs set.
Suggested-by: Radim Krčmář <[email protected]>
Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Roman Kagan <[email protected]>
---
arch/x86/kvm/hyperv.c | 42 +++++++++++++++++++++++-------------------
virt/kvm/kvm_main.c | 6 ++----
2 files changed, 25 insertions(+), 23 deletions(-)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 0cd597b0f754..b45ce136be2f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1325,35 +1325,39 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
cpumask_clear(&hv_current->tlb_lush);
+ if (all_cpus) {
+ kvm_make_vcpus_request_mask(kvm,
+ KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
+ NULL, &hv_current->tlb_lush);
+ goto ret_success;
+ }
+
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
int bank = hv->vp_index / 64, sbank = 0;
- if (!all_cpus) {
- /* Banks >64 can't be represented */
- if (bank >= 64)
- continue;
-
- /* Non-ex hypercalls can only address first 64 vCPUs */
- if (!ex && bank)
- continue;
+ /* Banks >64 can't be represented */
+ if (bank >= 64)
+ continue;
- if (ex) {
- /*
- * Check is the bank of this vCPU is in sparse
- * set and get the sparse bank number.
- */
- sbank = get_sparse_bank_no(valid_bank_mask,
- bank);
+ /* Non-ex hypercalls can only address first 64 vCPUs */
+ if (!ex && bank)
+ continue;
- if (sbank < 0)
- continue;
- }
+ if (ex) {
+ /*
+ * Check is the bank of this vCPU is in sparse
+ * set and get the sparse bank number.
+ */
+ sbank = get_sparse_bank_no(valid_bank_mask, bank);
- if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
+ if (sbank < 0)
continue;
}
+ if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
+ continue;
+
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
* can't analyze it here, flush TLB regardless of the specified
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f83239ac8be1..3340f8128dc8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -218,7 +218,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
me = get_cpu();
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!test_bit(i, vcpu_bitmap))
+ if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
continue;
kvm_make_request(req, vcpu);
@@ -242,12 +242,10 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
cpumask_var_t cpus;
bool called;
- static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
- = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
- called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
+ called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
free_cpumask_var(cpus);
return called;
--
2.14.4
These structures are going to be used from KVM code so let's make
their names reflect their Hyper-V origin.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Roman Kagan <[email protected]>
---
arch/x86/hyperv/hv_apic.c | 12 ++++++------
arch/x86/include/asm/hyperv-tlfs.h | 16 +++++++++-------
2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 402338365651..49284e1506b1 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -93,14 +93,14 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
*/
static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
{
- struct ipi_arg_ex **arg;
- struct ipi_arg_ex *ipi_arg;
+ struct hv_send_ipi_ex **arg;
+ struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
int nr_bank = 0;
int ret = 1;
local_irq_save(flags);
- arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
ipi_arg = *arg;
if (unlikely(!ipi_arg))
@@ -130,8 +130,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
static bool __send_ipi_mask(const struct cpumask *mask, int vector)
{
int cur_cpu, vcpu;
- struct ipi_arg_non_ex **arg;
- struct ipi_arg_non_ex *ipi_arg;
+ struct hv_send_ipi **arg;
+ struct hv_send_ipi *ipi_arg;
int ret = 1;
unsigned long flags;
@@ -148,7 +148,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
return __send_ipi_mask_ex(mask, vector);
local_irq_save(flags);
- arg = (struct ipi_arg_non_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ arg = (struct hv_send_ipi **)this_cpu_ptr(hyperv_pcpu_input_arg);
ipi_arg = *arg;
if (unlikely(!ipi_arg))
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 08e24f552030..d0554409a3de 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -725,19 +725,21 @@ struct hv_enlightened_vmcs {
#define HV_STIMER_AUTOENABLE (1ULL << 3)
#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
-struct ipi_arg_non_ex {
- u32 vector;
- u32 reserved;
- u64 cpu_mask;
-};
-
struct hv_vpset {
u64 format;
u64 valid_bank_mask;
u64 bank_contents[];
};
-struct ipi_arg_ex {
+/* HvCallSendSyntheticClusterIpi hypercall */
+struct hv_send_ipi {
+ u32 vector;
+ u32 reserved;
+ u64 cpu_mask;
+};
+
+/* HvCallSendSyntheticClusterIpiEx hypercall */
+struct hv_send_ipi_ex {
u32 vector;
u32 reserved;
struct hv_vpset vp_set;
--
2.14.4
Using a hypercall for sending IPIs is faster because it allows specifying
any number of vCPUs (even > 64 with a sparse CPU set), so the whole procedure
takes only one VMEXIT.
The current Hyper-V TLFS (v5.0b) claims that the HvCallSendSyntheticClusterIpi
hypercall can't be 'fast' (passing parameters through registers), but
apparently this is not true: Windows always uses it as 'fast', so we need
to support that.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
---
Documentation/virtual/kvm/api.txt | 8 +++
arch/x86/kvm/hyperv.c | 109 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/trace.h | 42 +++++++++++++++
arch/x86/kvm/x86.c | 1 +
include/uapi/linux/kvm.h | 1 +
5 files changed, 161 insertions(+)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 7b83b176c662..832ea72d43c1 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4690,3 +4690,11 @@ This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush
hypercalls:
HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx,
HvFlushVirtualAddressList, HvFlushVirtualAddressListEx.
+
+8.19 KVM_CAP_HYPERV_SEND_IPI
+
+Architectures: x86
+
+This capability indicates that KVM supports paravirtualized Hyper-V IPI send
+hypercalls:
+HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index d1a911132b59..3183cf9bcb63 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1360,6 +1360,101 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+ bool ex, bool fast)
+{
+ struct kvm *kvm = current_vcpu->kvm;
+ struct hv_send_ipi_ex send_ipi_ex;
+ struct hv_send_ipi send_ipi;
+ struct kvm_vcpu *vcpu;
+ unsigned long valid_bank_mask;
+ u64 sparse_banks[64];
+ int sparse_banks_len, bank, i;
+ struct kvm_lapic_irq irq = {.delivery_mode = APIC_DM_FIXED};
+ bool all_cpus;
+
+ if (!ex) {
+ if (!fast) {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
+ sizeof(send_ipi))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = send_ipi.cpu_mask;
+ irq.vector = send_ipi.vector;
+ } else {
+ /* 'reserved' part of hv_send_ipi should be 0 */
+ if (unlikely(ingpa >> 32 != 0))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = outgpa;
+ irq.vector = (u32)ingpa;
+ }
+ all_cpus = false;
+ valid_bank_mask = BIT_ULL(0);
+
+ trace_kvm_hv_send_ipi(irq.vector, sparse_banks[0]);
+ } else {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+ send_ipi_ex.vp_set.format,
+ send_ipi_ex.vp_set.valid_bank_mask);
+
+ irq.vector = send_ipi_ex.vector;
+ valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+ sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+ sizeof(sparse_banks[0]);
+
+ all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+ if (!sparse_banks_len)
+ goto ret_success;
+
+ if (!all_cpus &&
+ kvm_read_guest(kvm,
+ ingpa + offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents),
+ sparse_banks,
+ sparse_banks_len))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ if ((irq.vector < HV_IPI_LOW_VECTOR) ||
+ (irq.vector > HV_IPI_HIGH_VECTOR))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /* We fail only when APIC is disabled */
+ if (!kvm_apic_set_irq(vcpu, &irq, NULL))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+ goto ret_success;
+ }
+
+ for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+ BITS_PER_LONG) {
+
+ for_each_set_bit(i, (unsigned long *)&sparse_banks[bank],
+ BITS_PER_LONG) {
+ u32 vp_index = bank * 64 + i;
+ struct kvm_vcpu *vcpu =
+ get_vcpu_by_vpidx(kvm, vp_index);
+
+ /* Unknown vCPU specified */
+ if (!vcpu)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* We fail only when APIC is disabled */
+ if (!kvm_apic_set_irq(vcpu, &irq, NULL))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+ }
+
+ret_success:
+ return HV_STATUS_SUCCESS;
+}
+
bool kvm_hv_hypercall_enabled(struct kvm *kvm)
{
return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
@@ -1529,6 +1624,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
break;
+ case HVCALL_SEND_IPI:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast);
+ break;
+ case HVCALL_SEND_IPI_EX:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
+ break;
default:
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0f997683404f..0659465a745c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1418,6 +1418,48 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
__entry->valid_bank_mask, __entry->format,
__entry->address_space, __entry->flags)
);
+
+/*
+ * Tracepoints for kvm_hv_send_ipi.
+ */
+TRACE_EVENT(kvm_hv_send_ipi,
+ TP_PROTO(u32 vector, u64 processor_mask),
+ TP_ARGS(vector, processor_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, processor_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->processor_mask = processor_mask;
+ ),
+
+ TP_printk("vector %x processor_mask 0x%llx",
+ __entry->vector, __entry->processor_mask)
+);
+
+TRACE_EVENT(kvm_hv_send_ipi_ex,
+ TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask),
+ TP_ARGS(vector, format, valid_bank_mask),
+
+ TP_STRUCT__entry(
+ __field(u32, vector)
+ __field(u64, format)
+ __field(u64, valid_bank_mask)
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ __entry->format = format;
+ __entry->valid_bank_mask = valid_bank_mask;
+ ),
+
+ TP_printk("vector %x format %llx valid_bank_mask 0x%llx",
+ __entry->vector, __entry->format,
+ __entry->valid_bank_mask)
+);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c83711c0ebe..a5b7ce303b62 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2885,6 +2885,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_VP_INDEX:
case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3cf632839337..55da336e7632 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -951,6 +951,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_HYPERV_TLBFLUSH 155
#define KVM_CAP_S390_HPAGE_1M 156
#define KVM_CAP_NESTED_STATE 157
+#define KVM_CAP_HYPERV_SEND_IPI 158
#ifdef KVM_CAP_IRQ_ROUTING
--
2.14.4
VP_INDEX almost always matches the vCPU id and get_vcpu_by_vpidx() is fast, so
use it instead of traversing the full vCPU list every time.
To support the change, split off get_vcpu_idx_by_vpidx() from
get_vcpu_by_vpidx().
Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Roman Kagan <[email protected]>
---
arch/x86/kvm/hyperv.c | 78 ++++++++++++++++++++-------------------------------
1 file changed, 31 insertions(+), 47 deletions(-)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b45ce136be2f..d1a911132b59 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -127,20 +127,31 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
return 0;
}
-static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+static u32 get_vcpu_idx_by_vpidx(struct kvm *kvm, u32 vpidx)
{
struct kvm_vcpu *vcpu = NULL;
int i;
if (vpidx >= KVM_MAX_VCPUS)
- return NULL;
+ return U32_MAX;
vcpu = kvm_get_vcpu(kvm, vpidx);
if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
- return vcpu;
+ return vpidx;
kvm_for_each_vcpu(i, vcpu, kvm)
if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
- return vcpu;
+ return i;
+ return U32_MAX;
+}
+
+static __always_inline struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm,
+ u32 vpidx)
+{
+ u32 vcpu_idx = get_vcpu_idx_by_vpidx(kvm, vpidx);
+
+ if (vcpu_idx < KVM_MAX_VCPUS)
+ return kvm_get_vcpu(kvm, vcpu_idx);
+
return NULL;
}
@@ -1257,20 +1268,6 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
-static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
-{
- int i = 0, j;
-
- if (!(valid_bank_mask & BIT_ULL(bank_no)))
- return -1;
-
- for (j = 0; j < bank_no; j++)
- if (valid_bank_mask & BIT_ULL(j))
- i++;
-
- return i;
-}
-
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
u16 rep_cnt, bool ex)
{
@@ -1278,11 +1275,10 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
struct kvm_vcpu_hv *hv_current = ¤t_vcpu->arch.hyperv;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- struct kvm_vcpu *vcpu;
unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
- unsigned long valid_bank_mask = 0;
+ unsigned long valid_bank_mask;
u64 sparse_banks[64];
- int sparse_banks_len, i;
+ int sparse_banks_len, bank, i;
bool all_cpus;
if (!ex) {
@@ -1292,6 +1288,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
trace_kvm_hv_flush_tlb(flush.processor_mask,
flush.address_space, flush.flags);
+ valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS;
} else {
@@ -1332,38 +1329,25 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
goto ret_success;
}
- kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
- int bank = hv->vp_index / 64, sbank = 0;
+ for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+ BITS_PER_LONG) {
- /* Banks >64 can't be represented */
- if (bank >= 64)
- continue;
+ for_each_set_bit(i, (unsigned long *)&sparse_banks[bank],
+ BITS_PER_LONG) {
+ u32 vp_index = bank * 64 + i;
+ u32 vcpu_idx = get_vcpu_idx_by_vpidx(kvm, vp_index);
- /* Non-ex hypercalls can only address first 64 vCPUs */
- if (!ex && bank)
- continue;
+ /* A non-existent vCPU was specified */
+ if (vcpu_idx >= KVM_MAX_VCPUS)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (ex) {
/*
- * Check is the bank of this vCPU is in sparse
- * set and get the sparse bank number.
+ * vcpu->arch.cr3 may not be up-to-date for running
+ * vCPUs so we can't analyze it here, flush TLB
+ * regardless of the specified address space.
*/
- sbank = get_sparse_bank_no(valid_bank_mask, bank);
-
- if (sbank < 0)
- continue;
+ __set_bit(vcpu_idx, vcpu_bitmap);
}
-
- if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
- continue;
-
- /*
- * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
- * can't analyze it here, flush TLB regardless of the specified
- * address space.
- */
- __set_bit(i, vcpu_bitmap);
}
kvm_make_vcpus_request_mask(kvm,
--
2.14.4
Hyper-V TLFS (5.0b) states:
> Virtual processors are identified by using an index (VP index). The
> maximum number of virtual processors per partition supported by the
> current implementation of the hypervisor can be obtained through CPUID
> leaf 0x40000005. A virtual processor index must be less than the
> maximum number of virtual processors per partition.
Forbid userspace from setting VP_INDEX to KVM_MAX_VCPUS or above. get_vcpu_by_vpidx()
can now be optimized to bail early when the supplied vpidx is >= KVM_MAX_VCPUS.
Signed-off-by: Vitaly Kuznetsov <[email protected]>
Reviewed-by: Roman Kagan <[email protected]>
---
arch/x86/kvm/hyperv.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01d209ab5481..0cd597b0f754 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -132,8 +132,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
struct kvm_vcpu *vcpu = NULL;
int i;
- if (vpidx < KVM_MAX_VCPUS)
- vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -1044,7 +1046,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
switch (msr) {
case HV_X64_MSR_VP_INDEX:
- if (!host)
+ if (!host || (u32)data >= KVM_MAX_VCPUS)
return 1;
hv->vp_index = (u32)data;
break;
--
2.14.4
On Wed, Aug 22, 2018 at 12:18:32PM +0200, Vitaly Kuznetsov wrote:
> Using hypercall for sending IPIs is faster because this allows to specify
> any number of vCPUs (even > 64 with sparse CPU set), the whole procedure
> will take only one VMEXIT.
>
> Current Hyper-V TLFS (v5.0b) claims that HvCallSendSyntheticClusterIpi
> hypercall can't be 'fast' (passing parameters through registers) but
> apparently this is not true, Windows always uses it as 'fast' so we need
> to support that.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> ---
> Documentation/virtual/kvm/api.txt | 8 +++
> arch/x86/kvm/hyperv.c | 109 ++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/trace.h | 42 +++++++++++++++
> arch/x86/kvm/x86.c | 1 +
> include/uapi/linux/kvm.h | 1 +
> 5 files changed, 161 insertions(+)
Looks like I forgot to respond to this one, sorry.
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 7b83b176c662..832ea72d43c1 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -4690,3 +4690,11 @@ This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush
> hypercalls:
> HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx,
> HvFlushVirtualAddressList, HvFlushVirtualAddressListEx.
> +
> +8.19 KVM_CAP_HYPERV_SEND_IPI
> +
> +Architectures: x86
> +
> +This capability indicates that KVM supports paravirtualized Hyper-V IPI send
> +hypercalls:
> +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index d1a911132b59..3183cf9bcb63 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1360,6 +1360,101 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
> ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
> }
>
> +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
> + bool ex, bool fast)
> +{
> + struct kvm *kvm = current_vcpu->kvm;
> + struct hv_send_ipi_ex send_ipi_ex;
> + struct hv_send_ipi send_ipi;
> + struct kvm_vcpu *vcpu;
> + unsigned long valid_bank_mask;
> + u64 sparse_banks[64];
> + int sparse_banks_len, bank, i;
> + struct kvm_lapic_irq irq = {.delivery_mode = APIC_DM_FIXED};
> + bool all_cpus;
> +
> + if (!ex) {
> + if (!fast) {
> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
> + sizeof(send_ipi))))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + sparse_banks[0] = send_ipi.cpu_mask;
> + irq.vector = send_ipi.vector;
> + } else {
> + /* 'reserved' part of hv_send_ipi should be 0 */
> + if (unlikely(ingpa >> 32 != 0))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + sparse_banks[0] = outgpa;
> + irq.vector = (u32)ingpa;
> + }
> + all_cpus = false;
> + valid_bank_mask = BIT_ULL(0);
> +
> + trace_kvm_hv_send_ipi(irq.vector, sparse_banks[0]);
> + } else {
> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
> + sizeof(send_ipi_ex))))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
> + send_ipi_ex.vp_set.format,
> + send_ipi_ex.vp_set.valid_bank_mask);
> +
> + irq.vector = send_ipi_ex.vector;
> + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
> + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
> + sizeof(sparse_banks[0]);
> +
> + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
> +
> + if (!sparse_banks_len)
> + goto ret_success;
> +
> + if (!all_cpus &&
> + kvm_read_guest(kvm,
> + ingpa + offsetof(struct hv_send_ipi_ex,
> + vp_set.bank_contents),
> + sparse_banks,
> + sparse_banks_len))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + }
> +
> + if ((irq.vector < HV_IPI_LOW_VECTOR) ||
> + (irq.vector > HV_IPI_HIGH_VECTOR))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> +
> + if (all_cpus) {
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + /* We fail only when APIC is disabled */
> + if (!kvm_apic_set_irq(vcpu, &irq, NULL))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
> + }
> + goto ret_success;
> + }
> +
> + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
> + BITS_PER_LONG) {
I think you need exactly 64 rather than BITS_PER_LONG
> +
> + for_each_set_bit(i, (unsigned long *)&sparse_banks[bank],
> + BITS_PER_LONG) {
ditto
> + u32 vp_index = bank * 64 + i;
> + struct kvm_vcpu *vcpu =
> + get_vcpu_by_vpidx(kvm, vp_index);
> +
> + /* Unknown vCPU specified */
> + if (!vcpu)
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
You may have already fired some IPIs, so returning error here without
attempting the remaining vcpus in the request looks inconsistent to me.
I don't see a specified way to report partial success from this
hypercall.
I'd rather continue here.
> +
> + /* We fail only when APIC is disabled */
> + if (!kvm_apic_set_irq(vcpu, &irq, NULL))
> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
Same here.
> + }
> + }
> +
> +ret_success:
> + return HV_STATUS_SUCCESS;
> +}
To my personal taste, this would have been easier to read if the -ex and
non-ex versions were separate functions preparing the arguments (vector
and mask) and calling into a common helper function to send the ipis.
Roman.
Roman Kagan <[email protected]> writes:
> On Wed, Aug 22, 2018 at 12:18:32PM +0200, Vitaly Kuznetsov wrote:
>> Using hypercall for sending IPIs is faster because this allows to specify
>> any number of vCPUs (even > 64 with sparse CPU set), the whole procedure
>> will take only one VMEXIT.
>>
>> Current Hyper-V TLFS (v5.0b) claims that HvCallSendSyntheticClusterIpi
>> hypercall can't be 'fast' (passing parameters through registers) but
>> apparently this is not true, Windows always uses it as 'fast' so we need
>> to support that.
>>
>> Signed-off-by: Vitaly Kuznetsov <[email protected]>
>> ---
>> Documentation/virtual/kvm/api.txt | 8 +++
>> arch/x86/kvm/hyperv.c | 109 ++++++++++++++++++++++++++++++++++++++
>> arch/x86/kvm/trace.h | 42 +++++++++++++++
>> arch/x86/kvm/x86.c | 1 +
>> include/uapi/linux/kvm.h | 1 +
>> 5 files changed, 161 insertions(+)
>
> Looks like I forgot to respond to this one, sorry.
>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index 7b83b176c662..832ea72d43c1 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -4690,3 +4690,11 @@ This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush
>> hypercalls:
>> HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx,
>> HvFlushVirtualAddressList, HvFlushVirtualAddressListEx.
>> +
>> +8.19 KVM_CAP_HYPERV_SEND_IPI
>> +
>> +Architectures: x86
>> +
>> +This capability indicates that KVM supports paravirtualized Hyper-V IPI send
>> +hypercalls:
>> +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> index d1a911132b59..3183cf9bcb63 100644
>> --- a/arch/x86/kvm/hyperv.c
>> +++ b/arch/x86/kvm/hyperv.c
>> @@ -1360,6 +1360,101 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
>> ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
>> }
>>
>> +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
>> + bool ex, bool fast)
>> +{
>> + struct kvm *kvm = current_vcpu->kvm;
>> + struct hv_send_ipi_ex send_ipi_ex;
>> + struct hv_send_ipi send_ipi;
>> + struct kvm_vcpu *vcpu;
>> + unsigned long valid_bank_mask;
>> + u64 sparse_banks[64];
>> + int sparse_banks_len, bank, i;
>> + struct kvm_lapic_irq irq = {.delivery_mode = APIC_DM_FIXED};
>> + bool all_cpus;
>> +
>> + if (!ex) {
>> + if (!fast) {
>> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
>> + sizeof(send_ipi))))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>> + sparse_banks[0] = send_ipi.cpu_mask;
>> + irq.vector = send_ipi.vector;
>> + } else {
>> + /* 'reserved' part of hv_send_ipi should be 0 */
>> + if (unlikely(ingpa >> 32 != 0))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>> + sparse_banks[0] = outgpa;
>> + irq.vector = (u32)ingpa;
>> + }
>> + all_cpus = false;
>> + valid_bank_mask = BIT_ULL(0);
>> +
>> + trace_kvm_hv_send_ipi(irq.vector, sparse_banks[0]);
>> + } else {
>> + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
>> + sizeof(send_ipi_ex))))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>> +
>> + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
>> + send_ipi_ex.vp_set.format,
>> + send_ipi_ex.vp_set.valid_bank_mask);
>> +
>> + irq.vector = send_ipi_ex.vector;
>> + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
>> + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
>> + sizeof(sparse_banks[0]);
>> +
>> + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
>> +
>> + if (!sparse_banks_len)
>> + goto ret_success;
>> +
>> + if (!all_cpus &&
>> + kvm_read_guest(kvm,
>> + ingpa + offsetof(struct hv_send_ipi_ex,
>> + vp_set.bank_contents),
>> + sparse_banks,
>> + sparse_banks_len))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>> + }
>> +
>> + if ((irq.vector < HV_IPI_LOW_VECTOR) ||
>> + (irq.vector > HV_IPI_HIGH_VECTOR))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>> +
>> + if (all_cpus) {
>> + kvm_for_each_vcpu(i, vcpu, kvm) {
>> + /* We fail only when APIC is disabled */
>> + if (!kvm_apic_set_irq(vcpu, &irq, NULL))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>> + }
>> + goto ret_success;
>> + }
>> +
>> + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
>> + BITS_PER_LONG) {
>
> I think you need exactly 64 rather than BITS_PER_LONG
>
>> +
>> + for_each_set_bit(i, (unsigned long *)&sparse_banks[bank],
>> + BITS_PER_LONG) {
>
> ditto
>
Sure, will do.
>> + u32 vp_index = bank * 64 + i;
>> + struct kvm_vcpu *vcpu =
>> + get_vcpu_by_vpidx(kvm, vp_index);
>> +
>> + /* Unknown vCPU specified */
>> + if (!vcpu)
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>
> You may have already fired some IPIs, so returning error here without
> attempting the remaining vcpus in the request looks inconsistent to me.
>
> I don't see a specified way to report partial success from this
> hypercall.
>
> I'd rather continue here.
>
Basically, we have three choices:
- Just bail (what we do now)
- Ignore and continue returning success
- Ignore and continue returning failure.
but none of them is perfect.
We could've pre-validated the set but this is kind of expensive and no
sane OS should be using invalid VP indexes.
>> +
>> + /* We fail only when APIC is disabled */
>> + if (!kvm_apic_set_irq(vcpu, &irq, NULL))
>> + return HV_STATUS_INVALID_HYPERCALL_INPUT;
>
> Same here.
>
This is even worse as this can't be pre-validated reliably without
pausing all vCPUs first.
OK, let's switch to 'ignore and continue returning success'.
>> + }
>> + }
>> +
>> +ret_success:
>> + return HV_STATUS_SUCCESS;
>> +}
>
> To my personal taste, this would have been easier to read if the -ex and
> non-ex versions were separate functions preparing the arguments (vector
> and mask) and calling into a common helper function to send the ipis.
I would agree but I think it was Radim's suggestion to unify ex- and
non-ex versions of kvm_hv_flush_tlb() and sending IPIs is not any
different.
Radim, please let me know if you think we should split them again, I'll
do it for both functions (either in this series or as a follow-up).
Thanks,
--
Vitaly
On 22/08/2018 12:18, Vitaly Kuznetsov wrote:
> cpumask_clear(&hv_current->tlb_lush);
>
> + if (all_cpus) {
> + kvm_make_vcpus_request_mask(kvm,
> + KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
> + NULL, &hv_current->tlb_lush);
> + goto ret_success;
> + }
> +
The cpumask_clear can be pushed below the if. Queued with that change.
Paolo
On 22/08/2018 12:18, Vitaly Kuznetsov wrote:
> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
> use it instead of traversing full vCPU list every time.
... but if it doesn't, the algorithm is now quadratic, isn't it?
Paolo
On 22/08/2018 12:18, Vitaly Kuznetsov wrote:
> Hyper-V TLFS (5.0b) states:
>
>> Virtual processors are identified by using an index (VP index). The
>> maximum number of virtual processors per partition supported by the
>> current implementation of the hypervisor can be obtained through CPUID
>> leaf 0x40000005. A virtual processor index must be less than the
>> maximum number of virtual processors per partition.
>
> Forbid userspace to set VP_INDEX above KVM_MAX_VCPUS. get_vcpu_by_vpidx()
> can now be optimized to bail early when supplied vpidx is >= KVM_MAX_VCPUS.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> Reviewed-by: Roman Kagan <[email protected]>
> ---
> arch/x86/kvm/hyperv.c | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 01d209ab5481..0cd597b0f754 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -132,8 +132,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
> struct kvm_vcpu *vcpu = NULL;
> int i;
>
> - if (vpidx < KVM_MAX_VCPUS)
> - vcpu = kvm_get_vcpu(kvm, vpidx);
> + if (vpidx >= KVM_MAX_VCPUS)
> + return NULL;
> +
> + vcpu = kvm_get_vcpu(kvm, vpidx);
> if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
> return vcpu;
> kvm_for_each_vcpu(i, vcpu, kvm)
> @@ -1044,7 +1046,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
>
> switch (msr) {
> case HV_X64_MSR_VP_INDEX:
> - if (!host)
> + if (!host || (u32)data >= KVM_MAX_VCPUS)
> return 1;
> hv->vp_index = (u32)data;
> break;
>
Queued, thanks.
Paolo
On 22/08/2018 12:18, Vitaly Kuznetsov wrote:
> These structures are going to be used from KVM code so let's make
> their names reflect their Hyper-V origin.
>
> Signed-off-by: Vitaly Kuznetsov <[email protected]>
> Reviewed-by: Roman Kagan <[email protected]>
> ---
> arch/x86/hyperv/hv_apic.c | 12 ++++++------
> arch/x86/include/asm/hyperv-tlfs.h | 16 +++++++++-------
> 2 files changed, 15 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
> index 402338365651..49284e1506b1 100644
> --- a/arch/x86/hyperv/hv_apic.c
> +++ b/arch/x86/hyperv/hv_apic.c
> @@ -93,14 +93,14 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
> */
> static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
> {
> - struct ipi_arg_ex **arg;
> - struct ipi_arg_ex *ipi_arg;
> + struct hv_send_ipi_ex **arg;
> + struct hv_send_ipi_ex *ipi_arg;
> unsigned long flags;
> int nr_bank = 0;
> int ret = 1;
>
> local_irq_save(flags);
> - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
>
> ipi_arg = *arg;
> if (unlikely(!ipi_arg))
> @@ -130,8 +130,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
> static bool __send_ipi_mask(const struct cpumask *mask, int vector)
> {
> int cur_cpu, vcpu;
> - struct ipi_arg_non_ex **arg;
> - struct ipi_arg_non_ex *ipi_arg;
> + struct hv_send_ipi **arg;
> + struct hv_send_ipi *ipi_arg;
> int ret = 1;
> unsigned long flags;
>
> @@ -148,7 +148,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
> return __send_ipi_mask_ex(mask, vector);
>
> local_irq_save(flags);
> - arg = (struct ipi_arg_non_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
> + arg = (struct hv_send_ipi **)this_cpu_ptr(hyperv_pcpu_input_arg);
>
> ipi_arg = *arg;
> if (unlikely(!ipi_arg))
> diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
> index 08e24f552030..d0554409a3de 100644
> --- a/arch/x86/include/asm/hyperv-tlfs.h
> +++ b/arch/x86/include/asm/hyperv-tlfs.h
> @@ -725,19 +725,21 @@ struct hv_enlightened_vmcs {
> #define HV_STIMER_AUTOENABLE (1ULL << 3)
> #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
>
> -struct ipi_arg_non_ex {
> - u32 vector;
> - u32 reserved;
> - u64 cpu_mask;
> -};
> -
> struct hv_vpset {
> u64 format;
> u64 valid_bank_mask;
> u64 bank_contents[];
> };
>
> -struct ipi_arg_ex {
> +/* HvCallSendSyntheticClusterIpi hypercall */
> +struct hv_send_ipi {
> + u32 vector;
> + u32 reserved;
> + u64 cpu_mask;
> +};
> +
> +/* HvCallSendSyntheticClusterIpiEx hypercall */
> +struct hv_send_ipi_ex {
> u32 vector;
> u32 reserved;
> struct hv_vpset vp_set;
>
This doesn't apply anymore, sorry.
Paolo
Paolo Bonzini <[email protected]> writes:
> On 22/08/2018 12:18, Vitaly Kuznetsov wrote:
>> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
>> use it instead of traversing full vCPU list every time.
>
> ... but if it doesn't, the algorithm is now quadratic, isn't it?
Yes,
I even had an implementation with a logarithmic search back in v2, but
we had a discussion with Roman and he convinced me this is overkill
and not currently required. It seems that with QEMU this is indeed true:
vp_index always matches the vcpu id, but in case some other userspace decides
to break this unwritten rule, users may experience a significant slowdown.
--
Vitaly
Vitaly Kuznetsov <[email protected]> writes:
> Paolo Bonzini <[email protected]> writes:
>
>> On 22/08/2018 12:18, Vitaly Kuznetsov wrote:
>>> VP_INDEX almost always matches VCPU id and get_vcpu_by_vpidx() is fast,
>>> use it instead of traversing full vCPU list every time.
>>
>> ... but if it doesn't, the algorithm is now quadratic, isn't it?
>
> Yes,
>
> I even had an implementation with a logarithmic search back in v2, but
> we had a discussion with Roman and he convinced me this is overkill
> and not currently required. It seems that with QEMU this is indeed true:
> vp_index always matches the vcpu id, but in case some other userspace decides
> to break this unwritten rule, users may experience a significant slowdown.
Hi Paolo,
could you please clarify what needs to be done to get this merged? In
particular, are you OK with my comment above or do we need to do
something with it (e.g. get back to the 'logarithmic search' from v2)?
In kvm/queue I can see only 'x86/hyper-v: rename ipi_arg_{ex,non_ex}
structures' patch from this series applied.
Thanks,
--
Vitaly
On 24/09/2018 18:24, Vitaly Kuznetsov wrote:
> Hi Paolo,
>
> could you please clarify what needs to be done to get this merged? In
> particular, are you OK with my comment above or do we need to do
> something with it (e.g. get back to the 'logarithmic search' from v2)?
>
> In kvm/queue I can see only 'x86/hyper-v: rename ipi_arg_{ex,non_ex}
> structures' patch from this series applied.
Hi,
my plan was to apply only 1/2/5 for now. I singled out the rename patch
because that one could be included in 4.19-rc kernels as a cleanup.
Paolo
On Mon, Sep 24, 2018 at 06:55:28PM +0200, Paolo Bonzini wrote:
> On 24/09/2018 18:24, Vitaly Kuznetsov wrote:
> > Hi Paolo,
> >
> > could you please clarify what needs to be done to get this merged? In
> > particular, are you OK with my comment above or do we need to do
> > something with it (e.g. get back to the 'logarithmic search' from v2)?
> >
> > In kvm/queue I can see only 'x86/hyper-v: rename ipi_arg_{ex,non_ex}
> > structures' patch from this series applied.
>
> Hi,
>
> my plan was to apply only 1/2/5 for now. I singled out the rename patch
> because that one could be included in 4.19-rc kernels as a cleanup.
Is this supposed to mean you're not happy with the approach taken in
Vitaly's patch? Can you explain why? I take my part of guilt for it so
I'd like to know, too.
Speaking of the options we have, the choice depends on the assumptions
we take. (And I guess when you spoke of quadratic complexity you
referred to the algorithm to convert the vp_index mask into the KVM cpu
mask.)
If we can assume that in all relevant cases vp_index coincides with the
cpu index (which I think we can) then Vitaly's approach is the most
efficient.
If, on the contrary, we want to optimize for a random mapping between
vp_index and cpu index, then it's probably better instead to iterate
over vcpus and test if their vp_index belongs to the requested mask.
Neither of the above is quadratic.
Dunno if we need to specifically consider intermediate situations.
Anyway, using a heavier vp_index -> cpu index translation looks like
overkill to me.
What do you think?
Thanks,
Roman.
On 25/09/2018 10:57, Roman Kagan wrote:
> Speaking of the options we have, the choice depends on the assumptions
> we take. (And I guess when you spoke of quadratic complexity you
> referred to the algorithm to convert the vp_index mask into the KVM cpu
> mask.)
Right; with Vitaly's patch, if you have a random mapping between
vp_index and cpu index, each loop requires a list walk, and so you have
O(#VMcpus * #IPIcpus) worst case for sending an IPI to #IPIcpus CPUs in
a guest with #VMcpus.
> If we can assume that in all relevant cases vp_index coincides with the
> cpu index (which I think we can) then Vitaly's approach is the most
> efficient.
>
> If, on the opposite, we want to optimize for random mapping between
> vp_index and cpu index, then it's probably better instead to iterate
> over vcpus and test if their vp_index belongs to the requested mask.
Yes, that would work too. Perhaps we can do both? You can have a
kvm->num_mismatched_vp_indexes count to choose between the two.
Paolo
> Neither of the above is quadratic.
> Dunno if we need to specifically consider intermediate situations.
On Tue, Sep 25, 2018 at 11:29:57AM +0200, Paolo Bonzini wrote:
> On 25/09/2018 10:57, Roman Kagan wrote:
> > If we can assume that in all relevant cases vp_index coincides with the
> > cpu index (which I think we can) then Vitaly's approach is the most
> > efficient.
> >
> > If, on the opposite, we want to optimize for random mapping between
> > vp_index and cpu index, then it's probably better instead to iterate
> > over vcpus and test if their vp_index belongs to the requested mask.
>
> Yes, that would work too. Perhaps we can do both? You can have a
> kvm->num_mismatched_vp_indexes count to choose between the two.
Makes sense to me.
Roman.
Roman Kagan <[email protected]> writes:
> On Tue, Sep 25, 2018 at 11:29:57AM +0200, Paolo Bonzini wrote:
>> On 25/09/2018 10:57, Roman Kagan wrote:
>> > If we can assume that in all relevant cases vp_index coincides with the
>> > cpu index (which I think we can) then Vitaly's approach is the most
>> > efficient.
>> >
>> > If, on the opposite, we want to optimize for random mapping between
>> > vp_index and cpu index, then it's probably better instead to iterate
>> > over vcpus and test if their vp_index belongs to the requested mask.
>>
>> Yes, that would work too. Perhaps we can do both? You can have a
>> kvm->num_mismatched_vp_indexes count to choose between the two.
>
> Makes sense to me.
Thanks guys,
I'll try to draft something up for v6.
--
Vitaly