2021-04-26 09:10:08

by Hikaru Nishida

[permalink] [raw]
Subject: [RFC PATCH 0/6] x86/kvm: Virtual suspend time injection support


Hi,

This patch series adds virtual suspend time injection support to KVM.

Before this change, if the host goes into suspended state while the
guest is running, the guest will experience a time jump after the host's
resume. This can confuse some services in the guest since they can't
detect if the system went into suspend or not by comparing
CLOCK_BOOTTIME and CLOCK_MONOTONIC.

To solve this problem, we wanted to add a way to adjust the guest clocks
without actually suspending the guests. However, there was no way to
modify a gap between CLOCK_BOOTTIME and CLOCK_MONOTONIC without actually
suspending the guests. Therefore, this series introduce a new struct
called kvm_host_suspend_time to share the suspend time between host and
guest and a mechanism to inject a suspend time to the guest while
keeping
monotonicity of the clocks.

Could you take a look and let me know how we can improve the patches if
they are doing something wrong?

Thanks,

Hikaru Nishida



Hikaru Nishida (6):
x86/kvm: Reserve KVM_FEATURE_HOST_SUSPEND_TIME and
MSR_KVM_HOST_SUSPEND_TIME
x86/kvm: Add a struct and constants for virtual suspend time injection
x86/kvm: Add CONFIG_KVM_VIRT_SUSPEND_TIMING
x86/kvm: Add a host side support for virtual suspend time injection
x86/kvm: Add CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
x86/kvm: Add a guest side support for virtual suspend time injection

Documentation/virt/kvm/cpuid.rst | 3 +
Documentation/virt/kvm/msr.rst | 29 +++++++++
arch/x86/Kconfig | 13 ++++
arch/x86/include/asm/kvm_host.h | 5 ++
arch/x86/include/asm/kvm_para.h | 9 +++
arch/x86/include/uapi/asm/kvm_para.h | 6 ++
arch/x86/kernel/kvmclock.c | 25 ++++++++
arch/x86/kvm/Kconfig | 13 ++++
arch/x86/kvm/cpuid.c | 4 ++
arch/x86/kvm/x86.c | 89 +++++++++++++++++++++++++++-
include/linux/kvm_host.h | 7 +++
include/linux/timekeeper_internal.h | 4 ++
kernel/time/timekeeping.c | 31 ++++++++++
13 files changed, 237 insertions(+), 1 deletion(-)

--
2.31.1.498.g6c1eba8ee3d-goog


2021-04-26 09:10:49

by Hikaru Nishida

[permalink] [raw]
Subject: [RFC PATCH 3/6] x86/kvm: Add CONFIG_KVM_VIRT_SUSPEND_TIMING

The config option can be used to enable virtual suspend time injection
support on kvm hosts.

Signed-off-by: Hikaru Nishida <[email protected]>
---

arch/x86/kvm/Kconfig | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a788d5120d4d..6cb6795726a2 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -119,4 +119,17 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.

+config KVM_VIRT_SUSPEND_TIMING
+ bool "Virtual suspend time injection"
+ depends on KVM=y
+ default n
+ help
+ This option makes the host's suspension reflected on the guest's clocks.
+ In other words, guest's CLOCK_MONOTONIC will stop and
+ CLOCK_BOOTTIME keeps running during the host's suspension.
+ This feature will only be effective when both guest and host enable
+ this option.
+
+ If unsure, say N.
+
endif # VIRTUALIZATION
--
2.31.1.498.g6c1eba8ee3d-goog

2021-04-26 09:10:53

by Hikaru Nishida

[permalink] [raw]
Subject: [RFC PATCH 4/6] x86/kvm: Add a host side support for virtual suspend time injection

This patch implements virtual suspend time injection support for kvm
hosts.
If this functionality is enabled and the guest requests it, the host
will stop all the clocks observed by the guest during the host's
suspension and report the duration of suspend to the guest through
struct kvm_host_suspend_time to give a chance to adjust CLOCK_BOOTTIME
to the guest. This mechanism can be used to align the guest's clock
behavior to the hosts' ones.

Signed-off-by: Hikaru Nishida <[email protected]>
---

arch/x86/include/asm/kvm_host.h | 5 ++
arch/x86/kvm/cpuid.c | 4 ++
arch/x86/kvm/x86.c | 89 ++++++++++++++++++++++++++++++++-
include/linux/kvm_host.h | 7 +++
kernel/time/timekeeping.c | 3 ++
5 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3768819693e5..6584adaab3bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -994,6 +994,11 @@ struct kvm_arch {

gpa_t wall_clock;

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ struct gfn_to_hva_cache suspend_time;
+ bool suspend_time_injection_enabled;
+#endif /* KVM_VIRT_SUSPEND_TIMING */
+
bool mwait_in_guest;
bool hlt_in_guest;
bool pause_in_guest;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6bd2f8b830e4..62224b7bd7f9 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -814,6 +814,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
(1 << KVM_FEATURE_PV_SCHED_YIELD) |
(1 << KVM_FEATURE_ASYNC_PF_INT);

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ entry->eax |= (1 << KVM_FEATURE_HOST_SUSPEND_TIME);
+#endif
+
if (sched_info_on())
entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eca63625aee4..d919f771ce31 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1285,6 +1285,9 @@ static const u32 emulated_msrs_all[] = {

MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ MSR_KVM_HOST_SUSPEND_TIME,
+#endif

MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
@@ -2020,6 +2023,20 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
return;
}

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+static void kvm_write_suspend_time(struct kvm *kvm, bool updated)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_host_suspend_time st;
+
+ if (!ka->suspend_time_injection_enabled)
+ return;
+
+ st.suspend_time_ns = kvm->suspend_time_ns;
+ kvm_write_guest_cached(kvm, &ka->suspend_time, &st, sizeof(st));
+}
+#endif
+
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
do_shl32_div32(dividend, divisor);
@@ -2653,6 +2670,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v,

if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ kvm_write_suspend_time(v->kvm, true);
+#endif
vcpu->pvclock_set_guest_stopped_request = false;
}

@@ -3229,7 +3249,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

vcpu->arch.msr_kvm_poll_control = data;
break;
-
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ case MSR_KVM_HOST_SUSPEND_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_HOST_SUSPEND_TIME))
+ return 1;
+ if (!(data & KVM_MSR_ENABLED)) {
+ vcpu->kvm->arch.suspend_time_injection_enabled = false;
+ break;
+ }
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->kvm->arch.suspend_time, data & ~KVM_MSR_ENABLED,
+ sizeof(struct kvm_host_suspend_time))) {
+ return 1;
+ }
+ vcpu->kvm->arch.suspend_time_injection_enabled = true;
+ kvm_write_suspend_time(vcpu->kvm, false);
+ break;
+#endif
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
@@ -3535,6 +3571,15 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ case MSR_KVM_HOST_SUSPEND_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_HOST_SUSPEND_TIME))
+ return 1;
+ msr_info->data = vcpu->kvm->arch.suspend_time.gpa;
+ if (vcpu->kvm->arch.suspend_time_injection_enabled)
+ msr_info->data |= KVM_MSR_ENABLED;
+ break;
+#endif
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -11723,6 +11768,48 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+void kvm_arch_timekeeping_inject_sleeptime(const struct timespec64 *delta)
+{
+ struct kvm_vcpu *vcpu;
+ u64 suspend_time_ns;
+ struct kvm *kvm;
+ s64 adj;
+ int i;
+
+ suspend_time_ns = timespec64_to_ns(delta);
+ adj = tsc_khz * (suspend_time_ns / 1000000);
+ /*
+ * Adjust TSCs on all vcpus as if they are stopped during a suspend.
+ * doing a similar thing in kvm_arch_hardware_enable().
+ */
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ if (!kvm->arch.suspend_time_injection_enabled)
+ continue;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Adjustment here is always smaller than the gap
+ * observed by the guest, so subtracting the value
+ * here never rewinds the observed TSC in guests.
+ * This adjustment will be applied on the next
+ * kvm_arch_vcpu_load().
+ */
+ vcpu->arch.tsc_offset_adjustment -= adj;
+ }
+ /*
+ * Move the offset of kvm_clock here as if it is stopped
+ * during the suspension.
+ */
+ kvm->arch.kvmclock_offset -= suspend_time_ns;
+ /* suspend_time is accumulated per VM. */
+ kvm->suspend_time_ns += suspend_time_ns;
+ }
+ mutex_unlock(&kvm_lock);
+}
+#endif
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b65e7204344..e077b9a960fc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -520,6 +520,9 @@ struct kvm {
pid_t userspace_pid;
unsigned int max_halt_poll_ns;
u32 dirty_ring_size;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ u64 suspend_time_ns;
+#endif
};

#define kvm_err(fmt, ...) \
@@ -1522,4 +1525,8 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+void kvm_arch_timekeeping_inject_sleeptime(const struct timespec64 *delta);
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING */
+
#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ff0304de7de9..342a032ad552 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1786,6 +1786,9 @@ void timekeeping_resume(void)
if (inject_sleeptime) {
suspend_timing_needed = false;
__timekeeping_inject_sleeptime(tk, &ts_delta);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
+ kvm_arch_timekeeping_inject_sleeptime(&ts_delta);
+#endif
}

/* Re-base the last cycle value */
--
2.31.1.498.g6c1eba8ee3d-goog

2021-04-26 09:10:59

by Hikaru Nishida

[permalink] [raw]
Subject: [RFC PATCH 6/6] x86/kvm: Add a guest side support for virtual suspend time injection

This patch implements virtual suspend time injection support for kvm
guests. If this functionality is enabled and the host supports
KVM_FEATURE_HOST_SUSPEND_TIME,
the guest will register struct kvm_host_suspend_time through MSR to
monitor how much time we spend during the host's suspension. If the
duration is increased, the guest will advance CLOCK_BOOTTIME to match
with the host's suspension.

Signed-off-by: Hikaru Nishida <[email protected]>

---

arch/x86/include/asm/kvm_para.h | 9 +++++++++
arch/x86/kernel/kvmclock.c | 25 +++++++++++++++++++++++++
include/linux/timekeeper_internal.h | 4 ++++
kernel/time/timekeeping.c | 27 +++++++++++++++++++++++++++
4 files changed, 65 insertions(+)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 338119852512..e552efa931a8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -18,6 +18,15 @@ static inline bool kvm_check_and_clear_guest_paused(void)
}
#endif /* CONFIG_KVM_GUEST */

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+u64 kvm_get_suspend_time(void);
+#else
+static inline u64 kvm_get_suspend_time(void)
+{
+ return 0;
+}
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST */
+
#define KVM_HYPERCALL \
ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1fc0962c89c0..630460beb05a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -49,6 +49,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);

static struct pvclock_vsyscall_time_info
hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+static struct kvm_host_suspend_time suspend_time __bss_decrypted;
+#endif
static struct pvclock_wall_clock wall_clock __bss_decrypted;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
static struct pvclock_vsyscall_time_info *hvclock_mem;
@@ -164,6 +167,20 @@ static int kvm_cs_enable(struct clocksource *cs)
return 0;
}

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/*
+ * kvm_get_suspend_time_and_clear_request - This function
+ * checks how much suspend time is injected by the host.
+ * A returned value is a total time passed during the guest in a suspend
+ * state while this guest is running. (Not a duration of the last host
+ * suspend.)
+ */
+u64 kvm_get_suspend_time(void)
+{
+ return suspend_time.suspend_time_ns;
+}
+#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST */
+
struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
@@ -324,6 +341,14 @@ void __init kvmclock_init(void)
return;
}

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ if (kvm_para_has_feature(KVM_FEATURE_HOST_SUSPEND_TIME)) {
+ /* Register the suspend time structure */
+ wrmsrl(MSR_KVM_HOST_SUSPEND_TIME,
+ slow_virt_to_phys(&suspend_time) | KVM_MSR_ENABLED);
+ }
+#endif
+
if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
kvmclock_setup_percpu, NULL) < 0) {
return;
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 84ff2844df2a..a5fd515f0a9d 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -124,6 +124,10 @@ struct timekeeper {
u32 ntp_err_mult;
/* Flag used to avoid updating NTP twice with same second */
u32 skip_second_overflow;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ /* suspend_time_injected keeps the duration injected through kvm */
+ u64 suspend_time_injected;
+#endif
#ifdef CONFIG_DEBUG_TIMEKEEPING
long last_warning;
/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 342a032ad552..6b89e0d42596 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2114,6 +2114,29 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
return offset;
}

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/*
+ * timekeeping_inject_suspend_time - Inject virtual suspend time
+ * if it is requested by kvm host.
+ * This function should be called under holding timekeeper_lock and
+ * only from timekeeping_advance().
+ */
+static void timekeeping_inject_virtual_suspend_time(struct timekeeper *tk)
+{
+ struct timespec64 delta;
+ u64 suspend_time;
+
+ suspend_time = kvm_get_suspend_time();
+ if (suspend_time <= tk->suspend_time_injected) {
+ /* Sufficient amount of suspend time is already injected. */
+ return;
+ }
+ delta = ns_to_timespec64(suspend_time - tk->suspend_time_injected);
+ __timekeeping_inject_sleeptime(tk, &delta);
+ tk->suspend_time_injected = suspend_time;
+}
+#endif
+
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
@@ -2143,6 +2166,10 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
/* Do some additional sanity checking */
timekeeping_check_update(tk, offset);

+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ timekeeping_inject_virtual_suspend_time(tk);
+#endif
+
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
--
2.31.1.498.g6c1eba8ee3d-goog

2021-04-26 09:11:33

by Hikaru Nishida

[permalink] [raw]
Subject: [RFC PATCH 2/6] x86/kvm: Add a struct and constants for virtual suspend time injection

This patch adds definitions that are needed by both host and guest
to implement virtual suspend time injection.
This patch also adds #include <linux/kvm_host.h> in
kernel/time/timekeeping.c to make necesarily functions which will be
introduced in later patches available.

Signed-off-by: Hikaru Nishida <[email protected]>
---

arch/x86/include/uapi/asm/kvm_para.h | 6 ++++++
kernel/time/timekeeping.c | 1 +
2 files changed, 7 insertions(+)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 950afebfba88..13788b01094f 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -33,6 +33,7 @@
#define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14
#define KVM_FEATURE_MSI_EXT_DEST_ID 15
+#define KVM_FEATURE_HOST_SUSPEND_TIME 16

#define KVM_HINTS_REALTIME 0

@@ -54,6 +55,7 @@
#define MSR_KVM_POLL_CONTROL 0x4b564d05
#define MSR_KVM_ASYNC_PF_INT 0x4b564d06
#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
+#define MSR_KVM_HOST_SUSPEND_TIME 0x4b564d08

struct kvm_steal_time {
__u64 steal;
@@ -64,6 +66,10 @@ struct kvm_steal_time {
__u32 pad[11];
};

+struct kvm_host_suspend_time {
+ __u64 suspend_time_ns;
+};
+
#define KVM_VCPU_PREEMPTED (1 << 0)
#define KVM_VCPU_FLUSH_TLB (1 << 1)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6aee5768c86f..ff0304de7de9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,6 +22,7 @@
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
+#include <linux/kvm_host.h>

#include "tick-internal.h"
#include "ntp_internal.h"
--
2.31.1.498.g6c1eba8ee3d-goog

2021-04-26 09:11:43

by Hikaru Nishida

[permalink] [raw]
Subject: [RFC PATCH 5/6] x86/kvm: Add CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST

The config option can be used to enable virtual suspend time injection
support on kvm guests.

Signed-off-by: Hikaru Nishida <[email protected]>
---

arch/x86/Kconfig | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..fac06534c30a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -825,6 +825,19 @@ config KVM_GUEST
underlying device model, the host provides the guest with
timing infrastructure such as time of day, and system time

+config KVM_VIRT_SUSPEND_TIMING_GUEST
+ bool "Virtual suspend time injection (guest side)"
+ depends on KVM_GUEST
+ default n
+ help
+ This option makes the host's suspension reflected on the guest's clocks.
+ In other words, guest's CLOCK_MONOTONIC will stop and
+ CLOCK_BOOTTIME keeps running during the host's suspension.
+ This feature will only be effective when both guest and host enable
+ this option.
+
+ If unsure, say N.
+
config ARCH_CPUIDLE_HALTPOLL
def_bool n
prompt "Disable host haltpoll when loading haltpoll driver"
--
2.31.1.498.g6c1eba8ee3d-goog

2021-04-26 13:17:36

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [RFC PATCH 0/6] x86/kvm: Virtual suspend time injection support

On Mon, 2021-04-26 at 18:06 +0900, Hikaru Nishida wrote:
> Hi,
>
> This patch series adds virtual suspend time injection support to KVM.
>
> Before this change, if the host goes into suspended state while the
> guest is running, the guest will experience a time jump after the host's
> resume. This can confuse some services in the guest since they can't
> detect if the system went into suspend or not by comparing
> CLOCK_BOOTTIME and CLOCK_MONOTONIC.
>
> To solve this problem, we wanted to add a way to adjust the guest clocks
> without actually suspending the guests. However, there was no way to
> modify a gap between CLOCK_BOOTTIME and CLOCK_MONOTONIC without actually
> suspending the guests. Therefore, this series introduce a new struct
> called kvm_host_suspend_time to share the suspend time between host and
> guest and a mechanism to inject a suspend time to the guest while
> keeping
> monotonicity of the clocks.
>
> Could you take a look and let me know how we can improve the patches if
> they are doing something wrong?
>
> Thanks,
>
> Hikaru Nishida
>

I haven't yet looked at that, but in my experience when I suspend the host
with VMs running, after resume all my VMs complain something about TSC watchdog
and stop using it. The TSC is stable/synchornized, but after resume it does
reset to 0 on all CPUs.

I use INVTSC flag for all my VMs.
I haven't investigated this futher yet.

Just my 0.2 cents.

Best regards,
Maxim Levitsky

>
>
> Hikaru Nishida (6):
> x86/kvm: Reserve KVM_FEATURE_HOST_SUSPEND_TIME and
> MSR_KVM_HOST_SUSPEND_TIME
> x86/kvm: Add a struct and constants for virtual suspend time injection
> x86/kvm: Add CONFIG_KVM_VIRT_SUSPEND_TIMING
> x86/kvm: Add a host side support for virtual suspend time injection
> x86/kvm: Add CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
> x86/kvm: Add a guest side support for virtual suspend time injection
>
> Documentation/virt/kvm/cpuid.rst | 3 +
> Documentation/virt/kvm/msr.rst | 29 +++++++++
> arch/x86/Kconfig | 13 ++++
> arch/x86/include/asm/kvm_host.h | 5 ++
> arch/x86/include/asm/kvm_para.h | 9 +++
> arch/x86/include/uapi/asm/kvm_para.h | 6 ++
> arch/x86/kernel/kvmclock.c | 25 ++++++++
> arch/x86/kvm/Kconfig | 13 ++++
> arch/x86/kvm/cpuid.c | 4 ++
> arch/x86/kvm/x86.c | 89 +++++++++++++++++++++++++++-
> include/linux/kvm_host.h | 7 +++
> include/linux/timekeeper_internal.h | 4 ++
> kernel/time/timekeeping.c | 31 ++++++++++
> 13 files changed, 237 insertions(+), 1 deletion(-)
>


2021-04-26 14:24:15

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [RFC PATCH 6/6] x86/kvm: Add a guest side support for virtual suspend time injection

On Mon, Apr 26 2021 at 18:06, Hikaru Nishida wrote:
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
> +/*
> + * timekeeping_inject_suspend_time - Inject virtual suspend time
> + * if it is requested by kvm host.
> + * This function should be called under holding timekeeper_lock and
> + * only from timekeeping_advance().
> + */
> +static void timekeeping_inject_virtual_suspend_time(struct timekeeper *tk)
> +{
> + struct timespec64 delta;
> + u64 suspend_time;
> +
> + suspend_time = kvm_get_suspend_time();
> + if (suspend_time <= tk->suspend_time_injected) {
> + /* Sufficient amount of suspend time is already injected. */

What's a sufficient amount of suspend time?

> + return;
> + }
> + delta = ns_to_timespec64(suspend_time - tk->suspend_time_injected);
> + __timekeeping_inject_sleeptime(tk, &delta);
> + tk->suspend_time_injected = suspend_time;
> +}
> +#endif
>
> +
> /*
> * timekeeping_advance - Updates the timekeeper to the current time and
> * current NTP tick length
> @@ -2143,6 +2166,10 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode)
> /* Do some additional sanity checking */
> timekeeping_check_update(tk, offset);
>
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST

There are better ways than slapping #ifdefs into the code.

> + timekeeping_inject_virtual_suspend_time(tk);
> +#endif

So this is invoked on every tick? How is that justified?

The changelog is silent about this, but that's true for most of your
changelogs as they describe what the patch is doing and not the WHY,
which is the most important information. Also please do a

grep 'This patch' Documentation/process

the match there will lead you also to documentation about changelogs in
general.

Now to the overall approach, which works only for a subset of host
systems:

Host resumes
timekeeping_resume()

delta = get_suspend_time_if_possible(); <----- !!

kvm_arch_timekeeping_inject_sleeptime(delta)
TSC offset adjustment on all vCPUs
and prepare for magic injection

So this fails to work on host systems which cannot calculate the suspend
time in timekeeping_resume() because the clocksource stops in suspend
and some other source, e.g. RTC, is not accessible at that point in
time. There is a world outside of x86.

So on the host side the notification for the hypervisor has to be in
__timekeeping_inject_sleeptime() obviously.

Also I explicitely said hypervisor as we really don't want anything KVM
only here because there are other hypervisors which might want to have
the same functionality. We're not going to add a per hypervisor call
just because.

Now to the guest side:

Guest is unfrozen

clocksource in guest restarts at the point of freeze (TSC on x86)

All CLOCK ids except CLOCK_MONOTONIC continue from the state of
freeze up to the point where the first tick() after unfreeze
happens in the guest.

Now that first tick does sleep time injection which makes all
clocks except CLOCK_MONOTONIC jump forward by the amount of time
which was spent in suspend on the host.

But why is this gap correct? The first tick after unfreeze might be
up to a jiffie away.

Again the changelog is silent about this.

Also for the guest side there has to be a better way than lazily polling
a potential suspend injection on every tick and imposing the overhead
whether it's needed or not.

That's a one off event and really should be handled by some one off
injection mechanism which then invokes the existing
timekeeping_inject_sleeptime64(). There is no need for special
KVM/hypervisor magic in the core timekeeping code at all.

Seriously, if the only way to handle one off event injection from
hypervisor to guest is by polling, then there is a fundamental design
flaw in KVM or whatever hypervisor.

Thanks,

tglx

2021-04-27 15:18:46

by David Edmondson

[permalink] [raw]
Subject: Re: [RFC PATCH 2/6] x86/kvm: Add a struct and constants for virtual suspend time injection

On Monday, 2021-04-26 at 18:06:41 +09, Hikaru Nishida wrote:

> This patch adds definitions that are needed by both host and guest

s/This patch adds/Add/

> to implement virtual suspend time injection.
> This patch also adds #include <linux/kvm_host.h> in

s/This patch also adds/Also add/

> kernel/time/timekeeping.c to make necesarily functions which will be

s/necesarily/necessary/

> introduced in later patches available.
>
> Signed-off-by: Hikaru Nishida <[email protected]>
> ---
>
> arch/x86/include/uapi/asm/kvm_para.h | 6 ++++++
> kernel/time/timekeeping.c | 1 +
> 2 files changed, 7 insertions(+)
>
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
> index 950afebfba88..13788b01094f 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -33,6 +33,7 @@
> #define KVM_FEATURE_PV_SCHED_YIELD 13
> #define KVM_FEATURE_ASYNC_PF_INT 14
> #define KVM_FEATURE_MSI_EXT_DEST_ID 15
> +#define KVM_FEATURE_HOST_SUSPEND_TIME 16
>
> #define KVM_HINTS_REALTIME 0
>
> @@ -54,6 +55,7 @@
> #define MSR_KVM_POLL_CONTROL 0x4b564d05
> #define MSR_KVM_ASYNC_PF_INT 0x4b564d06
> #define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
> +#define MSR_KVM_HOST_SUSPEND_TIME 0x4b564d08
>
> struct kvm_steal_time {
> __u64 steal;
> @@ -64,6 +66,10 @@ struct kvm_steal_time {
> __u32 pad[11];
> };
>
> +struct kvm_host_suspend_time {
> + __u64 suspend_time_ns;
> +};
> +
> #define KVM_VCPU_PREEMPTED (1 << 0)
> #define KVM_VCPU_FLUSH_TLB (1 << 1)
>
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index 6aee5768c86f..ff0304de7de9 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -22,6 +22,7 @@
> #include <linux/pvclock_gtod.h>
> #include <linux/compiler.h>
> #include <linux/audit.h>
> +#include <linux/kvm_host.h>
>
> #include "tick-internal.h"
> #include "ntp_internal.h"
> --
> 2.31.1.498.g6c1eba8ee3d-goog

dme.
--
Too much information, running through my brain.

2021-04-27 15:32:25

by David Edmondson

[permalink] [raw]
Subject: Re: [RFC PATCH 4/6] x86/kvm: Add a host side support for virtual suspend time injection

On Monday, 2021-04-26 at 18:06:43 +09, Hikaru Nishida wrote:

> This patch implements virtual suspend time injection support for kvm
> hosts.
> If this functionality is enabled and the guest requests it, the host
> will stop all the clocks observed by the guest during the host's
> suspension and report the duration of suspend to the guest through
> struct kvm_host_suspend_time to give a chance to adjust CLOCK_BOOTTIME
> to the guest. This mechanism can be used to align the guest's clock
> behavior to the hosts' ones.
>
> Signed-off-by: Hikaru Nishida <[email protected]>
> ---
>
> arch/x86/include/asm/kvm_host.h | 5 ++
> arch/x86/kvm/cpuid.c | 4 ++
> arch/x86/kvm/x86.c | 89 ++++++++++++++++++++++++++++++++-
> include/linux/kvm_host.h | 7 +++
> kernel/time/timekeeping.c | 3 ++
> 5 files changed, 107 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3768819693e5..6584adaab3bf 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -994,6 +994,11 @@ struct kvm_arch {
>
> gpa_t wall_clock;
>
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + struct gfn_to_hva_cache suspend_time;
> + bool suspend_time_injection_enabled;
> +#endif /* KVM_VIRT_SUSPEND_TIMING */
> +
> bool mwait_in_guest;
> bool hlt_in_guest;
> bool pause_in_guest;
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 6bd2f8b830e4..62224b7bd7f9 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -814,6 +814,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
> (1 << KVM_FEATURE_PV_SCHED_YIELD) |
> (1 << KVM_FEATURE_ASYNC_PF_INT);
>
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + entry->eax |= (1 << KVM_FEATURE_HOST_SUSPEND_TIME);
> +#endif
> +
> if (sched_info_on())
> entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eca63625aee4..d919f771ce31 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1285,6 +1285,9 @@ static const u32 emulated_msrs_all[] = {
>
> MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
> MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + MSR_KVM_HOST_SUSPEND_TIME,
> +#endif
>
> MSR_IA32_TSC_ADJUST,
> MSR_IA32_TSCDEADLINE,
> @@ -2020,6 +2023,20 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
> return;
> }
>
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> +static void kvm_write_suspend_time(struct kvm *kvm, bool updated)

The "updated" argument is not used.

> +{
> + struct kvm_arch *ka = &kvm->arch;
> + struct kvm_host_suspend_time st;
> +
> + if (!ka->suspend_time_injection_enabled)
> + return;
> +
> + st.suspend_time_ns = kvm->suspend_time_ns;
> + kvm_write_guest_cached(kvm, &ka->suspend_time, &st, sizeof(st));
> +}
> +#endif
> +
> static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
> {
> do_shl32_div32(dividend, divisor);
> @@ -2653,6 +2670,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
>
> if (vcpu->pvclock_set_guest_stopped_request) {
> vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + kvm_write_suspend_time(v->kvm, true);
> +#endif
> vcpu->pvclock_set_guest_stopped_request = false;
> }
>
> @@ -3229,7 +3249,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>
> vcpu->arch.msr_kvm_poll_control = data;
> break;
> -
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + case MSR_KVM_HOST_SUSPEND_TIME:
> + if (!guest_pv_has(vcpu, KVM_FEATURE_HOST_SUSPEND_TIME))
> + return 1;
> + if (!(data & KVM_MSR_ENABLED)) {
> + vcpu->kvm->arch.suspend_time_injection_enabled = false;
> + break;
> + }
> + if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
> + &vcpu->kvm->arch.suspend_time, data & ~KVM_MSR_ENABLED,
> + sizeof(struct kvm_host_suspend_time))) {
> + return 1;
> + }
> + vcpu->kvm->arch.suspend_time_injection_enabled = true;
> + kvm_write_suspend_time(vcpu->kvm, false);
> + break;
> +#endif
> case MSR_IA32_MCG_CTL:
> case MSR_IA32_MCG_STATUS:
> case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> @@ -3535,6 +3571,15 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>
> msr_info->data = vcpu->arch.msr_kvm_poll_control;
> break;
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + case MSR_KVM_HOST_SUSPEND_TIME:
> + if (!guest_pv_has(vcpu, KVM_FEATURE_HOST_SUSPEND_TIME))
> + return 1;
> + msr_info->data = vcpu->kvm->arch.suspend_time.gpa;
> + if (vcpu->kvm->arch.suspend_time_injection_enabled)
> + msr_info->data |= KVM_MSR_ENABLED;
> + break;
> +#endif
> case MSR_IA32_P5_MC_ADDR:
> case MSR_IA32_P5_MC_TYPE:
> case MSR_IA32_MCG_CAP:
> @@ -11723,6 +11768,48 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
> }
> EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
>
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> +void kvm_arch_timekeeping_inject_sleeptime(const struct timespec64 *delta)
> +{
> + struct kvm_vcpu *vcpu;
> + u64 suspend_time_ns;
> + struct kvm *kvm;
> + s64 adj;
> + int i;
> +
> + suspend_time_ns = timespec64_to_ns(delta);
> + adj = tsc_khz * (suspend_time_ns / 1000000);
> + /*
> + * Adjust TSCs on all vcpus as if they are stopped during a suspend.
> + * doing a similar thing in kvm_arch_hardware_enable().
> + */
> + mutex_lock(&kvm_lock);
> + list_for_each_entry(kvm, &vm_list, vm_list) {
> + if (!kvm->arch.suspend_time_injection_enabled)
> + continue;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + /*
> + * Adjustment here is always smaller than the gap
> + * observed by the guest, so subtracting the value
> + * here never rewinds the observed TSC in guests.
> + * This adjustment will be applied on the next
> + * kvm_arch_vcpu_load().
> + */
> + vcpu->arch.tsc_offset_adjustment -= adj;
> + }
> + /*
> + * Move the offset of kvm_clock here as if it is stopped
> + * during the suspension.
> + */
> + kvm->arch.kvmclock_offset -= suspend_time_ns;
> + /* suspend_time is accumulated per VM. */
> + kvm->suspend_time_ns += suspend_time_ns;
> + }
> + mutex_unlock(&kvm_lock);
> +}
> +#endif
> +
> EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1b65e7204344..e077b9a960fc 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -520,6 +520,9 @@ struct kvm {
> pid_t userspace_pid;
> unsigned int max_halt_poll_ns;
> u32 dirty_ring_size;
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + u64 suspend_time_ns;
> +#endif
> };
>
> #define kvm_err(fmt, ...) \
> @@ -1522,4 +1525,8 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> /* Max number of entries allowed for each kvm dirty ring */
> #define KVM_DIRTY_RING_MAX_ENTRIES 65536
>
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> +void kvm_arch_timekeeping_inject_sleeptime(const struct timespec64 *delta);
> +#endif /* CONFIG_KVM_VIRT_SUSPEND_TIMING */
> +
> #endif
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index ff0304de7de9..342a032ad552 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -1786,6 +1786,9 @@ void timekeeping_resume(void)
> if (inject_sleeptime) {
> suspend_timing_needed = false;
> __timekeeping_inject_sleeptime(tk, &ts_delta);
> +#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING
> + kvm_arch_timekeeping_inject_sleeptime(&ts_delta);
> +#endif
> }
>
> /* Re-base the last cycle value */
> --
> 2.31.1.498.g6c1eba8ee3d-goog

dme.
--
Stranded starfish have no place to hide.