2017-08-02 14:58:49

by Denis Plotnikov

Subject: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

V4:
* removed the "is stable" function with its vague definition of stability;
there is now a single function which returns the time along with a cycle stamp
* some variables renamed
* some patches split into smaller ones
* atomic64_t usage replaced with atomic_t

V3:
Changing the timekeeper interface for clocksource reading looks like
overkill to achieve the goal of getting a cycle stamp for KVM.
Instead, extend the timekeeping interface and add functions which provide
the necessary data: read the clocksource with a cycle stamp, and check
whether the clocksource is stable.

Use those functions and improve the existing timekeeper functionality to
replace the pvclock_gtod_copy scheme in masterclock data calculation.

V2:
The main goal is to make L2's kvm-clock stable when it is running over L1
with a stable kvm-clock.

The patch series is for the x86 architecture only. If the series is approved
I'll make the changes for the other architectures, but I don't have the
ability to compile and check every single one (help needed).

The patch series does the following:

* change the timekeeper interface to get a cycle stamp value from
the timekeeper
* get rid of the pvclock copy in KVM by using the changed timekeeper
interface: get time and cycles directly from the timekeeper
* make KVM recognize a stable kvm-clock as a stable clocksource
and use the KVM masterclock in this case, which means making
L2 stable when running over a stable L1 kvm-clock (a consumer-side
sketch follows this list)
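
For orientation, the consumer-side flow the series aims at looks roughly
like this (a sketch distilled from patch 05 below, with variable names
shortened; not additional code):

    /*
     * KVM asks the timekeeper for a snapshot and uses the cycle stamp
     * only when the timekeeper says it holds real cycles.
     */
    static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
    {
            struct system_time_snapshot snap;

            ktime_get_snapshot(&snap);

            if (snap.cycles_valid) {
                    *kernel_ns = ktime_to_ns(snap.boot);
                    *cycle_now = snap.cycles;
            }
            return snap.cycles_valid;
    }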

Denis Plotnikov (10):
timekeeper: introduce extended clocksource reading callback
timekeeper: introduce boot field in system_time_snapshot
timekeeper: use the extended reading function on snapshot acquiring
tsc: implement the extended tsc reading function
KVM: x86: switch to masterclock update using timekeeper functionality
timekeeper: add clocksource change notifier
KVM: x86: remove not used pvclock_gtod_copy
pvclock: add parameters to store stamp data in pvclock reading
function
pvclock: add clocksource change notification on changing of tsc stable
bit
kvmclock: implement the extended reading function

arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/include/asm/pvclock.h | 3 +-
arch/x86/kernel/kvmclock.c | 19 +++-
arch/x86/kernel/pvclock.c | 37 +++++-
arch/x86/kernel/tsc.c | 10 ++
arch/x86/kvm/trace.h | 31 ++---
arch/x86/kvm/x86.c | 242 ++++++++--------------------------------
arch/x86/xen/time.c | 2 +-
include/linux/clocksource.h | 11 +-
include/linux/cs_notifier.h | 17 +++
include/linux/timekeeping.h | 5 +
kernel/time/timekeeping.c | 68 ++++++++++-
12 files changed, 219 insertions(+), 228 deletions(-)
create mode 100644 include/linux/cs_notifier.h

--
2.7.4


2017-08-02 14:57:51

by Denis Plotnikov

Subject: [PATCH v4 04/10] tsc: implement the extended tsc reading function

By doing that, add the tsc clocksource to the set of clocksources that
provide valid cycle values to KVM, meaning that KVM can use its masterclock
over it.

This is a part of the work aiming to move to a simpler scheme of
masterclock-related value calculation in KVM.

Signed-off-by: Denis Plotnikov <[email protected]>
---
arch/x86/kernel/tsc.c | 10 ++++++++++
1 file changed, 10 insertions(+)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 796d96b..8786454 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1036,6 +1036,15 @@ static void tsc_cs_tick_stable(struct clocksource *cs)
sched_clock_tick_stable();
}

+static bool tsc_read_with_stamp(struct clocksource *cs,
+ u64 *cycles, u64 *cycles_stamp)
+{
+ u64 tsc = read_tsc(cs);
+ *cycles = tsc;
+ *cycles_stamp = tsc;
+ return true;
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1043,6 +1052,7 @@ static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
+ .read_with_stamp = tsc_read_with_stamp,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
--
2.7.4

2017-08-02 14:58:11

by Denis Plotnikov

Subject: [PATCH v4 10/10] kvmclock: implement the extended reading function

This allows L2 guests to use the masterclock, namely it
provides KVM with the ability to use the masterclock while
running over the kvmclock clocksource in the cases when
it is possible.

This is the final part of the work of teaching KVM
to use the masterclock when running over the kvmclock clocksource.

Signed-off-by: Denis Plotnikov <[email protected]>
---
arch/x86/kernel/kvmclock.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f692579..8c1008f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -82,7 +82,7 @@ static int kvm_set_wallclock(const struct timespec *now)
return -1;
}

-static u64 kvm_clock_read(void)
+static inline u64 __kvm_clock_read(u64 *cycles, u8 *flags)
{
struct pvclock_vcpu_time_info *src;
u64 ret;
@@ -91,10 +91,14 @@ static u64 kvm_clock_read(void)
preempt_disable_notrace();
cpu = smp_processor_id();
src = &hv_clock[cpu].pvti;
- ret = pvclock_clocksource_read(src, NULL, NULL);
+ ret = pvclock_clocksource_read(src, cycles, flags);
preempt_enable_notrace();
return ret;
}
+static u64 kvm_clock_read(void)
+{
+ return __kvm_clock_read(NULL, NULL);
+}

static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
@@ -177,9 +181,20 @@ bool kvm_check_and_clear_guest_paused(void)
return ret;
}

+static bool kvm_clock_read_with_stamp(struct clocksource *cs,
+ u64 *cycles, u64 *cycles_stamp)
+{
+ u8 flags;
+
+ *cycles = __kvm_clock_read(cycles_stamp, &flags);
+
+ return !!(flags & PVCLOCK_TSC_STABLE_BIT);
+}
+
struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
+ .read_with_stamp = kvm_clock_read_with_stamp,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
--
2.7.4

2017-08-02 14:58:00

by Denis Plotnikov

Subject: [PATCH v4 01/10] timekeeper: introduce extended clocksource reading callback

The callback provides extended information about the clocksource
value that has just been read.

It's going to be used in cases when detailed system information is
needed for further time-related value calculation, e.g. in KVM
masterclock calculation.
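
For illustration only (the real implementations are added by later patches
for tsc and kvmclock; the function name here is hypothetical), a clocksource
whose read() already returns raw hardware cycles could satisfy the new
contract like this:

    static bool example_read_with_stamp(struct clocksource *cs,
                                        u64 *cycles, u64 *cycles_stamp)
    {
            u64 counter = cs->read(cs);     /* value the timekeeper consumes */

            *cycles = counter;              /* same as what read() returns */
            *cycles_stamp = counter;        /* raw counter behind that value */
            return true;                    /* the stamp holds real cycles */
    }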

Signed-off-by: Denis Plotnikov <[email protected]>
---
include/linux/clocksource.h | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index a78cb18..786a522 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -48,7 +48,14 @@ struct module;
* 400-499: Perfect
* The ideal clocksource. A must-use where
* available.
- * @read: returns a cycle value, passes clocksource as argument
+ * @read: returns a cycle value (which might not be actual
+ * cycles: see pvclock); passes clocksource as argument
+ * @read_with_stamp: saves the cycle value (as returned to the timekeeper)
+ * and the cycle stamp (the hardware counter value the
+ * cycle value was derived from) to the corresponding
+ * output pointers; returns true if the cycle stamp
+ * holds real cycles and false if it holds some
+ * time-derived value
* @enable: optional function to enable the clocksource
* @disable: optional function to disable the clocksource
* @mask: bitmask for two's complement
@@ -78,6 +85,8 @@ struct module;
*/
struct clocksource {
u64 (*read)(struct clocksource *cs);
+ bool (*read_with_stamp)(struct clocksource *cs,
+ u64 *cycles, u64 *cycles_stamp);
u64 mask;
u32 mult;
u32 shift;
--
2.7.4

2017-08-02 14:58:20

by Denis Plotnikov

Subject: [PATCH v4 09/10] pvclock: add clocksource change notification on changing of tsc stable bit

It's needed to notify the KVM guest about critical changes in pvclock
and make it update its masterclock.

This is a part of the work aiming to make kvmclock a clocksource
providing valid cycle values for the KVM masterclock, in other words
to make it possible to use the KVM masterclock over the kvmclock clocksource.
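
The notification is meant to be sent exactly once per stability transition;
a minimal sketch of that pattern, using the same atomic_t state as the diff
below (the helper name is hypothetical):

    static atomic_t clocksource_stable = ATOMIC_INIT(0);

    static void note_tsc_stability(bool stable_now)
    {
            bool stable_last = (bool)atomic_read(&clocksource_stable);

            /* only the CPU that wins the cmpxchg sends the notification */
            if (stable_now != stable_last &&
                atomic_cmpxchg(&clocksource_stable,
                               stable_last, stable_now) == stable_last)
                    clocksource_changes_notify();
    }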

Signed-off-by: Denis Plotnikov <[email protected]>
---
arch/x86/kernel/pvclock.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index bece384..5898f20 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -22,6 +22,7 @@
#include <linux/gfp.h>
#include <linux/bootmem.h>
#include <linux/nmi.h>
+#include <linux/cs_notifier.h>

#include <asm/fixmap.h>
#include <asm/pvclock.h>
@@ -73,6 +74,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
return flags & valid_flags;
}

+static atomic_t clocksource_stable = ATOMIC_INIT(0);
+
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src,
u64 *cycles_stamp, u8 *flags_stamp)
{
@@ -102,10 +105,20 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src,
pvclock_touch_watchdogs();
}

- if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
- (flags & PVCLOCK_TSC_STABLE_BIT))
- return ret;
+ if (likely(valid_flags & PVCLOCK_TSC_STABLE_BIT)) {
+ bool stable_now = !!(flags & PVCLOCK_TSC_STABLE_BIT);
+ bool stable_last = (bool) atomic_read(&clocksource_stable);
+
+ if (unlikely(stable_now != stable_last)) {
+ /* send notification once */
+ if (stable_last == atomic_cmpxchg(
+ &clocksource_stable, stable_last, stable_now))
+ clocksource_changes_notify();
+ }

+ if (stable_now)
+ return ret;
+ }
/*
* Assumption here is that last_value, a global accumulator, always goes
* forward. If we are less than that, we should not be much smaller.
--
2.7.4

2017-08-02 14:58:30

by Denis Plotnikov

Subject: [PATCH v4 02/10] timekeeper: introduce boot field in system_time_snapshot

The field keeps the monotonic time since boot.

The value of boot will be used in KVM for the masterclock.

This is a part of the work aiming to move to a simpler scheme
of masterclock-related value calculation in KVM.
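
In terms of existing timekeeper fields the new value is assembled the same
way as the other snapshot times (see the diff below), roughly:

    boot = (tk->tkr_mono.base + offs_boot) + nsec_real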

Signed-off-by: Denis Plotnikov <[email protected]>
---
include/linux/timekeeping.h | 2 ++
kernel/time/timekeeping.c | 4 ++++
2 files changed, 6 insertions(+)

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index ddc229f..5008e3e 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -283,6 +283,7 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw,
* @cycles: Clocksource counter value to produce the system times
* @real: Realtime system time
* @raw: Monotonic raw system time
+ * @boot: Monotonic time since boot
* @clock_was_set_seq: The sequence number of clock was set events
* @cs_was_changed_seq: The sequence number of clocksource change events
*/
@@ -290,6 +291,7 @@ struct system_time_snapshot {
u64 cycles;
ktime_t real;
ktime_t raw;
+ ktime_t boot;
unsigned int clock_was_set_seq;
u8 cs_was_changed_seq;
};
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cedafa0..66f7701 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -953,6 +953,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
unsigned long seq;
ktime_t base_raw;
ktime_t base_real;
+ ktime_t base_boot;
u64 nsec_raw;
u64 nsec_real;
u64 now;
@@ -967,6 +968,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
+ base_boot = ktime_add(tk->tkr_mono.base,
+ tk_core.timekeeper.offs_boot);
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -974,6 +977,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
systime_snapshot->cycles = now;
systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
+ systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);

--
2.7.4

2017-08-02 14:58:39

by Denis Plotnikov

Subject: [PATCH v4 03/10] timekeeper: use the extended reading function on snapshot acquiring

Make use of the extended reading function when getting a time snapshot:
get the raw cycle value if the extended function is defined.
The raw cycle value is then used for the KVM masterclock.

This is a part of the work aiming to move to a simpler scheme of
masterclock-related value calculation in KVM.

Signed-off-by: Denis Plotnikov <[email protected]>
---
include/linux/timekeeping.h | 3 +++
kernel/time/timekeeping.c | 13 +++++++++++--
2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 5008e3e..528d088 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -286,6 +286,8 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw,
* @boot: Monotonic time since boot
* @clock_was_set_seq: The sequence number of clock was set events
* @cs_was_changed_seq: The sequence number of clocksource change events
+ * @cycles_valid: The flag is true when @cycles contains an actual
+ * number of cycles instead of some cycle-derived value
*/
struct system_time_snapshot {
u64 cycles;
@@ -294,6 +296,7 @@ struct system_time_snapshot {
ktime_t boot;
unsigned int clock_was_set_seq;
u8 cs_was_changed_seq;
+ bool cycles_valid;
};

/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 66f7701..d1aa575 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -957,12 +957,22 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
u64 nsec_raw;
u64 nsec_real;
u64 now;
+ struct clocksource *clock;

WARN_ON_ONCE(timekeeping_suspended);

do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_clock_read(&tk->tkr_mono);
+ clock = READ_ONCE(tk->tkr_mono.clock);
+ if (clock->read_with_stamp)
+ systime_snapshot->cycles_valid =
+ clock->read_with_stamp(
+ clock, &now, &systime_snapshot->cycles);
+ else {
+ systime_snapshot->cycles_valid = false;
+ now = clock->read(clock);
+ systime_snapshot->cycles = now;
+ }
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
@@ -974,7 +984,6 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
} while (read_seqcount_retry(&tk_core.seq, seq));

- systime_snapshot->cycles = now;
systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real);
--
2.7.4

2017-08-02 14:58:58

by Denis Plotnikov

Subject: [PATCH v4 05/10] KVM: x86: switch to masterclock update using timekeeper functionality

It is reasonable to switch KVM to a simpler, more effective
and conceptually correct scheme of dealing with the data needed
for KVM masterclock value calculation.

With the current scheme KVM needs to keep an up-to-date copy of
some timekeeper data to provide a guest using kvmclock with the
necessary information.

This is not:
- simple
KVM has to carry a lot of code to do that; instead, KVM could use
a timekeeper function to get all the data it needs
- effective
the copy of the data used for time calculation is updated
every time it changes, although this is not necessary since
the updated piece of time data is needed only at certain moments
(e.g. masterclock updates); instead, KVM can request this data
directly from the timekeeper at the moments when it's really needed
- conceptually correct
doing the work (calculating the time data) which another part
of the system (the timekeeper) has been designed for and is able to do
is not the proper way; instead, delegate the work to the proper part

This patch switches KVM to using the improved timekeeper function for
the KVM masterclock time data.

Removing the leftovers of the old scheme is left to the next patches.

Signed-off-by: Denis Plotnikov <[email protected]>
---
arch/x86/kvm/x86.c | 26 ++++++++++++++++++--------
1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c97c82..d8ec2ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1643,22 +1643,32 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now)
/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
{
- /* checked again under seqlock below */
- if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
- return false;
+ struct system_time_snapshot systime_snapshot;
+
+ ktime_get_snapshot(&systime_snapshot);
+
+ if (systime_snapshot.cycles_valid) {
+ *kernel_ns = ktime_to_ns(systime_snapshot.boot);
+ *cycle_now = systime_snapshot.cycles;
+ }

- return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+ return systime_snapshot.cycles_valid;
}

/* returns true if host is using tsc clocksource */
static bool kvm_get_walltime_and_clockread(struct timespec *ts,
u64 *cycle_now)
{
- /* checked again under seqlock below */
- if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
- return false;
+ struct system_time_snapshot systime_snapshot;
+
+ ktime_get_snapshot(&systime_snapshot);
+
+ if (systime_snapshot.cycles_valid) {
+ *ts = ktime_to_timespec(systime_snapshot.real);
+ *cycle_now = systime_snapshot.cycles;
+ }

- return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+ return systime_snapshot.cycles_valid;
}
#endif

--
2.7.4

2017-08-02 14:59:06

by Denis Plotnikov

Subject: [PATCH v4 08/10] pvclock: add parameters to store stamp data in pvclock reading function

The parameters are pointers used to store the cycle and flag stamp
values used for time calculation when reading the pvclock.

This is a part of the work aiming to make it possible to use the KVM
masterclock in L2 guests (those running over the kvmclock clocksource).
Signed-off-by: Denis Plotnikov <[email protected]>
---
arch/x86/include/asm/pvclock.h | 3 ++-
arch/x86/kernel/kvmclock.c | 2 +-
arch/x86/kernel/pvclock.c | 18 +++++++++++++++---
arch/x86/xen/time.c | 2 +-
4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 448cfe1..f4ff2c0 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -14,7 +14,8 @@ static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
#endif

/* some helper functions for xen and kvm pv clock sources */
-u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src,
+ u64 *cycles_stamp, u8 *flags_stamp);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d889676..f692579 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -91,7 +91,7 @@ static u64 kvm_clock_read(void)
preempt_disable_notrace();
cpu = smp_processor_id();
src = &hv_clock[cpu].pvti;
- ret = pvclock_clocksource_read(src);
+ ret = pvclock_clocksource_read(src, NULL, NULL);
preempt_enable_notrace();
return ret;
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5c3f6d6..bece384 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -73,21 +73,32 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
return flags & valid_flags;
}

-u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src,
+ u64 *cycles_stamp, u8 *flags_stamp)
{
unsigned version;
u64 ret;
u64 last;
u8 flags;
+ u64 tsc;

do {
version = pvclock_read_begin(src);
- ret = __pvclock_read_cycles(src, rdtsc_ordered());
+ tsc = rdtsc_ordered();
+ ret = __pvclock_read_cycles(src, tsc);
flags = src->flags;
} while (pvclock_read_retry(src, version));

+ if (cycles_stamp)
+ *cycles_stamp = tsc;
+
+ if (flags_stamp)
+ *flags_stamp = flags;
+
if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ if (flags_stamp)
+ *flags_stamp &= ~PVCLOCK_GUEST_STOPPED;
pvclock_touch_watchdogs();
}

@@ -136,7 +147,8 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));

- delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ /* time since system boot */
+ delta = pvclock_clocksource_read(vcpu_time, NULL, NULL);
delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

now.tv_nsec = do_div(delta, NSEC_PER_SEC);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1ecb05d..3292cbc 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -46,7 +46,7 @@ u64 xen_clocksource_read(void)

preempt_disable_notrace();
src = &__this_cpu_read(xen_vcpu)->time;
- ret = pvclock_clocksource_read(src);
+ ret = pvclock_clocksource_read(src, NULL, NULL);
preempt_enable_notrace();
return ret;
}
--
2.7.4

2017-08-02 14:59:15

by Denis Plotnikov

Subject: [PATCH v4 06/10] timekeeper: add clocksource change notifier

This notifier will fire when the clocksource is changed or when any
critical property of the clocksource changes, e.g. clocksource
stability.

It will be used to update the KVM masterclock with the help of the
timekeeper, because that is the very moment to notify KVM about
critical changes in the underlying timekeeper.

This is the final timekeeper-related patch of the work aiming
to move to a simpler scheme of masterclock-related value
calculation in KVM.
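
A minimal, illustrative listener (the names below are hypothetical; KVM's
real consumer is added by a later patch in this series):

    #include <linux/cs_notifier.h>

    static int my_cs_changed(struct notifier_block *nb,
                             unsigned long unused, void *data)
    {
            /* react to the change, e.g. request a clock update */
            return 0;
    }

    static struct notifier_block my_cs_nb = {
            .notifier_call = my_cs_changed,
    };

    /* during init:   clocksource_changes_register_notifier(&my_cs_nb); */
    /* on teardown:   clocksource_changes_unregister_notifier(&my_cs_nb); */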

Signed-off-by: Denis Plotnikov <[email protected]>
---
include/linux/cs_notifier.h | 17 +++++++++++++++
kernel/time/timekeeping.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 68 insertions(+)
create mode 100644 include/linux/cs_notifier.h

diff --git a/include/linux/cs_notifier.h b/include/linux/cs_notifier.h
new file mode 100644
index 0000000..2b1b4e6
--- /dev/null
+++ b/include/linux/cs_notifier.h
@@ -0,0 +1,17 @@
+#ifndef _CS_CHANGES_H
+#define _CS_CHANGES_H
+
+#include <linux/notifier.h>
+
+/*
+ * The clocksource changes notifier is called when the system
+ * clocksource is changed or some properties of the current
+ * system clocksource are changed in a way that can affect other parts of the system,
+ * for example KVM guests
+ */
+
+extern void clocksource_changes_notify(void);
+extern int clocksource_changes_register_notifier(struct notifier_block *nb);
+extern int clocksource_changes_unregister_notifier(struct notifier_block *nb);
+
+#endif /* _CS_CHANGES_H */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d1aa575..c9dad8a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -596,6 +596,55 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

+/* notification chain when there is some changes in the clocksource */
+static RAW_NOTIFIER_HEAD(clocksource_changes_chain);
+
+/**
+ * clocksource_changes_notify - notify all the listeners about changes
+ * that happened to the clocksource: changing the clocksource, or changing
+ * sensitive parameters of the clocksource, e.g. the stability flag for kvmclock
+ */
+void clocksource_changes_notify(void)
+{
+ raw_notifier_call_chain(&clocksource_changes_chain, 0L, NULL);
+}
+EXPORT_SYMBOL_GPL(clocksource_changes_notify);
+
+/**
+ * clocksource_changes_register_notifier - register
+ * a clocksource changes listener
+ */
+int clocksource_changes_register_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ ret = raw_notifier_chain_register(&clocksource_changes_chain, nb);
+ clocksource_changes_notify();
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clocksource_changes_register_notifier);
+
+/**
+ * clocksource_changes_unregister_notifier - unregister
+ * a clocksource changes listener
+ */
+int clocksource_changes_unregister_notifier(struct notifier_block *nb)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ ret = raw_notifier_chain_unregister(&clocksource_changes_chain, nb);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clocksource_changes_unregister_notifier);
+
/*
* tk_update_leap_state - helper to update the next_leap_ktime
*/
@@ -1363,6 +1412,7 @@ static int change_clocksource(void *data)
}
}
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+ clocksource_changes_notify();

write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1540,6 +1590,7 @@ void __init timekeeping_init(void)
tk_set_wall_to_mono(tk, tmp);

timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
+ clocksource_changes_notify();

write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
--
2.7.4

2017-08-02 14:59:25

by Denis Plotnikov

Subject: [PATCH v4 07/10] KVM: x86: remove not used pvclock_gtod_copy

Since KVM has been switched by the previous patches to getting
masterclock-related data directly from the timekeeper, we are now able
to remove all the parts related to the old scheme of getting
masterclock data.

This patch removes those parts.

Signed-off-by: Denis Plotnikov <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/trace.h | 31 ++----
arch/x86/kvm/x86.c | 216 ++++++----------------------------------
3 files changed, 42 insertions(+), 207 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 87ac4fb..91465db 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -791,7 +791,7 @@ struct kvm_arch {
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;

- spinlock_t pvclock_gtod_sync_lock;
+ spinlock_t masterclock_lock;
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc67..923ab31 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -807,45 +807,39 @@ TRACE_EVENT(kvm_write_tsc_offset,

#ifdef CONFIG_X86_64

-#define host_clocks \
- {VCLOCK_NONE, "none"}, \
- {VCLOCK_TSC, "tsc"} \
-
TRACE_EVENT(kvm_update_master_clock,
- TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
- TP_ARGS(use_master_clock, host_clock, offset_matched),
+ TP_PROTO(bool use_master_clock, bool host_clock_stable,
+ bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock_stable, offset_matched),

TP_STRUCT__entry(
__field( bool, use_master_clock )
- __field( unsigned int, host_clock )
+ __field( bool, host_clock_stable )
__field( bool, offset_matched )
),

TP_fast_assign(
__entry->use_master_clock = use_master_clock;
- __entry->host_clock = host_clock;
+ __entry->host_clock_stable = host_clock_stable;
__entry->offset_matched = offset_matched;
),

- TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ TP_printk("masterclock %d hostclock stable %u offsetmatched %u",
__entry->use_master_clock,
- __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->host_clock_stable,
__entry->offset_matched)
);

TRACE_EVENT(kvm_track_tsc,
TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
- unsigned int online_vcpus, bool use_master_clock,
- unsigned int host_clock),
- TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
- host_clock),
+ unsigned int online_vcpus, bool use_master_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock),

TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned int, nr_vcpus_matched_tsc )
__field( unsigned int, online_vcpus )
__field( bool, use_master_clock )
- __field( unsigned int, host_clock )
),

TP_fast_assign(
@@ -853,14 +847,11 @@ TRACE_EVENT(kvm_track_tsc,
__entry->nr_vcpus_matched_tsc = nr_matched;
__entry->online_vcpus = online_vcpus;
__entry->use_master_clock = use_master_clock;
- __entry->host_clock = host_clock;
),

- TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
- " hostclock %s",
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u",
__entry->vcpu_id, __entry->use_master_clock,
- __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
- __print_symbolic(__entry->host_clock, host_clocks))
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus)
);

#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d8ec2ca..53754fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -50,7 +50,7 @@
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
-#include <linux/pvclock_gtod.h>
+#include <linux/cs_notifier.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
@@ -1134,50 +1134,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
return kvm_set_msr(vcpu, &msr);
}

-#ifdef CONFIG_X86_64
-struct pvclock_gtod_data {
- seqcount_t seq;
-
- struct { /* extract of a clocksource struct */
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
- } clock;
-
- u64 boot_ns;
- u64 nsec_base;
- u64 wall_time_sec;
-};
-
-static struct pvclock_gtod_data pvclock_gtod_data;
-
-static void update_pvclock_gtod(struct timekeeper *tk)
-{
- struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
- u64 boot_ns;
-
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
-
- write_seqcount_begin(&vdata->seq);
-
- /* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
- vdata->clock.mask = tk->tkr_mono.mask;
- vdata->clock.mult = tk->tkr_mono.mult;
- vdata->clock.shift = tk->tkr_mono.shift;
-
- vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
-
- vdata->wall_time_sec = tk->xtime_sec;
-
- write_seqcount_end(&vdata->seq);
-}
-#endif
-
void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
{
/*
@@ -1269,10 +1225,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
__func__, base_hz, scaled_hz, shift, *pmultiplier);
}

-#ifdef CONFIG_X86_64
-static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
-#endif
-
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static unsigned long max_tsc_khz;

@@ -1366,7 +1318,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
bool vcpus_matched;
struct kvm_arch *ka = &vcpu->kvm->arch;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;

vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&vcpu->kvm->online_vcpus));
@@ -1379,13 +1330,12 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
* and the vcpus need to have matched TSCs. When that happens,
* perform request to enable masterclock.
*/
- if (ka->use_master_clock ||
- (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
+ if (ka->use_master_clock || vcpus_matched)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
- atomic_read(&vcpu->kvm->online_vcpus),
- ka->use_master_clock, gtod->clock.vclock_mode);
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock);
#endif
}

@@ -1538,7 +1488,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);

- spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_lock(&kvm->arch.masterclock_lock);
if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (!already_matched) {
@@ -1546,9 +1496,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
}

kvm_track_tsc_matching(vcpu);
- spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_unlock(&kvm->arch.masterclock_lock);
}
-
EXPORT_SYMBOL_GPL(kvm_write_tsc);

static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -1567,79 +1516,6 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)

#ifdef CONFIG_X86_64

-static u64 read_tsc(void)
-{
- u64 ret = (u64)rdtsc_ordered();
- u64 last = pvclock_gtod_data.clock.cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- /*
- * GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a function of time and the likely is
- * very likely) and there's a data dependence, so force GCC
- * to generate a branch instead. I don't barrier() because
- * we don't actually need a barrier, and if this function
- * ever gets inlined it will generate worse code.
- */
- asm volatile ("");
- return last;
-}
-
-static inline u64 vgettsc(u64 *cycle_now)
-{
- long v;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-
- *cycle_now = read_tsc();
-
- v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
- return v * gtod->clock.mult;
-}
-
-static int do_monotonic_boot(s64 *t, u64 *cycle_now)
-{
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
- unsigned long seq;
- int mode;
- u64 ns;
-
- do {
- seq = read_seqcount_begin(&gtod->seq);
- mode = gtod->clock.vclock_mode;
- ns = gtod->nsec_base;
- ns += vgettsc(cycle_now);
- ns >>= gtod->clock.shift;
- ns += gtod->boot_ns;
- } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
- *t = ns;
-
- return mode;
-}
-
-static int do_realtime(struct timespec *ts, u64 *cycle_now)
-{
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
- unsigned long seq;
- int mode;
- u64 ns;
-
- do {
- seq = read_seqcount_begin(&gtod->seq);
- mode = gtod->clock.vclock_mode;
- ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->nsec_base;
- ns += vgettsc(cycle_now);
- ns >>= gtod->clock.shift;
- } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
-
- return mode;
-}
-
/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
{
@@ -1713,34 +1589,28 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts,
*
*/

-static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+static void update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
struct kvm_arch *ka = &kvm->arch;
- int vclock_mode;
- bool host_tsc_clocksource, vcpus_matched;
+ bool host_clocksource_stable, vcpus_matched;

vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&kvm->online_vcpus));

/*
- * If the host uses TSC clock, then passthrough TSC as stable
- * to the guest.
+ * kvm_get_time_and_clockread returns true if clocksource is stable
*/
- host_tsc_clocksource = kvm_get_time_and_clockread(
+ host_clocksource_stable = kvm_get_time_and_clockread(
&ka->master_kernel_ns,
&ka->master_cycle_now);

- ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ ka->use_master_clock = host_clocksource_stable && vcpus_matched
&& !ka->backwards_tsc_observed
&& !ka->boot_vcpu_runs_old_kvmclock;

- if (ka->use_master_clock)
- atomic_set(&kvm_guest_has_master_clock, 1);
-
- vclock_mode = pvclock_gtod_data.clock.vclock_mode;
- trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
- vcpus_matched);
+ trace_kvm_update_master_clock(ka->use_master_clock,
+ host_clocksource_stable, vcpus_matched);
#endif
}

@@ -1756,10 +1626,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;

- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock(&ka->masterclock_lock);
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
- pvclock_update_vm_gtod_copy(kvm);
+ update_masterclock(kvm);

kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1768,7 +1638,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);

- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock(&ka->masterclock_lock);
#endif
}

@@ -1778,15 +1648,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
struct pvclock_vcpu_time_info hv_clock;
u64 ret;

- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock(&ka->masterclock_lock);
if (!ka->use_master_clock) {
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock(&ka->masterclock_lock);
return ktime_get_boot_ns() + ka->kvmclock_offset;
}

hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock(&ka->masterclock_lock);

/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
@@ -1872,13 +1742,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock(&ka->masterclock_lock);
use_master_clock = ka->use_master_clock;
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
}
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock(&ka->masterclock_lock);

/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -4208,11 +4078,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;

r = 0;
- /*
- * TODO: userspace has to take care of races with VCPU_RUN, so
- * kvm_gen_update_masterclock() can be cut down to locked
- * pvclock_update_vm_gtod_copy().
- */
+
kvm_gen_update_masterclock(kvm);
now_ns = get_kvmclock_ns(kvm);
kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
@@ -6041,7 +5907,8 @@ static void kvm_set_mmio_spte_mask(void)
}

#ifdef CONFIG_X86_64
-static void pvclock_gtod_update_fn(struct work_struct *work)
+static int process_clocksource_change(struct notifier_block *nb,
+ unsigned long unused0, void *unused1)
{
struct kvm *kvm;

@@ -6052,35 +5919,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
- atomic_set(&kvm_guest_has_master_clock, 0);
spin_unlock(&kvm_lock);
-}
-
-static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
-
-/*
- * Notification about pvclock gtod data update.
- */
-static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
- void *priv)
-{
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
- struct timekeeper *tk = priv;
-
- update_pvclock_gtod(tk);
-
- /* disable master clock if host does not trust, or does not
- * use, TSC clocksource
- */
- if (gtod->clock.vclock_mode != VCLOCK_TSC &&
- atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
return 0;
}

-static struct notifier_block pvclock_gtod_notifier = {
- .notifier_call = pvclock_gtod_notify,
+static struct notifier_block clocksource_change_notifier = {
+ .notifier_call = process_clocksource_change,
};
#endif

@@ -6133,7 +5977,7 @@ int kvm_arch_init(void *opaque)

kvm_lapic_init();
#ifdef CONFIG_X86_64
- pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+ clocksource_changes_register_notifier(&clocksource_change_notifier);
#endif

return 0;
@@ -6154,7 +5998,7 @@ void kvm_arch_exit(void)
CPUFREQ_TRANSITION_NOTIFIER);
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
- pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ clocksource_changes_unregister_notifier(&clocksource_change_notifier);
#endif
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
@@ -8056,10 +7900,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
mutex_init(&kvm->arch.hyperv.hv_lock);
- spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_lock_init(&kvm->arch.masterclock_lock);

kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
- pvclock_update_vm_gtod_copy(kvm);
+ update_masterclock(kvm);

INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
--
2.7.4

2017-08-02 16:10:09

by Paolo Bonzini

Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On 02/08/2017 16:38, Denis Plotnikov wrote:
> V4:
> * removed "is stable" function with vague definition of stability
> there is the only function which does time with cycle stamp getting
> * some variables renamed
> * some patches split into smaller once
> * atomic64_t usage is replaced with atomic_t

Thanks. This looks good from the KVM point of view. Let's see what the
timekeeping guys think.

Paolo

> V3:
> Changing the timekeeper interface for clocksource reading looks like
> an overkill to achive the goal of getting cycles stamp for KVM.
> Instead extend the timekeeping interface and add functions which provide
> necessary data: read clocksource with cycles stamp, check whether the
> clock source is stable.
>
> Use those functions and improve existing timekeeper functionality to
> replace pvclock_gtod_copy scheme in masterclock data calculation.
>
> V2:
> The main goal is to make L2 kvm-clock be stable when it's running over L1
> with stable kvm-clock.
>
> The patch series is for x86 architecture only. If the series is approved
> I'll do changes for other architectures but I don't have an ability to
> compile and check for every single on (help needed)
>
> The patch series do the following:
>
> * change timekeeper interface to get cycles stamp value from
> the timekeeper
> * get rid of pvclock copy in KVM by using the changed timekeeper
> interface: get time and cycles right from the timekeeper
> * make KVM recognize a stable kvm-clock as stable clocksource
> and use the KVM masterclock in this case, which means making
> L2 stable when running over stable L1 kvm-clock
>
> Denis Plotnikov (10):
> timekeeper: introduce extended clocksource reading callback
> timekeeper: introduce boot field in system_time_snapshot
> timekeeper: use the extended reading function on snapshot acquiring
> tsc: implement the extended tsc reading function
> KVM: x86: switch to masterclock update using timekeeper functionality
> timekeeper: add clocksource change notifier
> KVM: x86: remove not used pvclock_gtod_copy
> pvclock: add parameters to store stamp data in pvclock reading
> function
> pvclock: add clocksource change notification on changing of tsc stable
> bit
> kvmclock: implement the extended reading function
>
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/include/asm/pvclock.h | 3 +-
> arch/x86/kernel/kvmclock.c | 19 +++-
> arch/x86/kernel/pvclock.c | 37 +++++-
> arch/x86/kernel/tsc.c | 10 ++
> arch/x86/kvm/trace.h | 31 ++---
> arch/x86/kvm/x86.c | 242 ++++++++--------------------------------
> arch/x86/xen/time.c | 2 +-
> include/linux/clocksource.h | 11 +-
> include/linux/cs_notifier.h | 17 +++
> include/linux/timekeeping.h | 5 +
> kernel/time/timekeeping.c | 68 ++++++++++-
> 12 files changed, 219 insertions(+), 228 deletions(-)
> create mode 100644 include/linux/cs_notifier.h
>

2017-08-02 16:49:51

by John Stultz

Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On Wed, Aug 2, 2017 at 7:38 AM, Denis Plotnikov
<[email protected]> wrote:
> V4:
> * removed "is stable" function with vague definition of stability
> there is the only function which does time with cycle stamp getting
> * some variables renamed
> * some patches split into smaller once
> * atomic64_t usage is replaced with atomic_t
>
> V3:
> Changing the timekeeper interface for clocksource reading looks like
> an overkill to achive the goal of getting cycles stamp for KVM.
> Instead extend the timekeeping interface and add functions which provide
> necessary data: read clocksource with cycles stamp, check whether the
> clock source is stable.
>
> Use those functions and improve existing timekeeper functionality to
> replace pvclock_gtod_copy scheme in masterclock data calculation.
>
> V2:
> The main goal is to make L2 kvm-clock be stable when it's running over L1
> with stable kvm-clock.
>
> The patch series is for x86 architecture only. If the series is approved
> I'll do changes for other architectures but I don't have an ability to
> compile and check for every single on (help needed)
>
> The patch series do the following:
>
> * change timekeeper interface to get cycles stamp value from
> the timekeeper
> * get rid of pvclock copy in KVM by using the changed timekeeper
> interface: get time and cycles right from the timekeeper
> * make KVM recognize a stable kvm-clock as stable clocksource
> and use the KVM masterclock in this case, which means making
> L2 stable when running over stable L1 kvm-clock

So, from a brief skim, I'm not a big fan of this patchset. Though this
is likely in part because I haven't seen anything about *why*
these changes are needed.

Can you briefly explain the issue you're trying to solve, and why you
think this approach is the way to go?
(It's usually a good idea to have such a rationale included in the patchset)

thanks
-john

2017-08-02 17:10:17

by John Stultz

Subject: Re: [PATCH v4 01/10] timekeeper: introduce extended clocksource reading callback

On Wed, Aug 2, 2017 at 7:38 AM, Denis Plotnikov
<[email protected]> wrote:
> The callback provides extended information about just read
> clocksource value.
>
> It's going to be used in cases when detailed system information
> needed for further time related values calculation, e.g in KVM
> masterclock settings calculation.
>
> Signed-off-by: Denis Plotnikov <[email protected]>
> ---
> include/linux/clocksource.h | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
> index a78cb18..786a522 100644
> --- a/include/linux/clocksource.h
> +++ b/include/linux/clocksource.h
> @@ -48,7 +48,14 @@ struct module;
> * 400-499: Perfect
> * The ideal clocksource. A must-use where
> * available.
> - * @read: returns a cycle value, passes clocksource as argument
> + * @read: returns a cycle value (might be not quite cycles:
> + * see pvclock) passes clocksource as argument
> + * @read_with_stamp: saves cycles value (got from timekeeper) and cycles
> + * stamp (got from hardware counter value and used by
> + * timekeeper to calculate the cycles value) to
> + * corresponding input pointers return true if cycles
> + * stamp holds real cycles and false if it holds some
> + * time derivative value
> * @enable: optional function to enable the clocksource
> * @disable: optional function to disable the clocksource
> * @mask: bitmask for two's complement
> @@ -78,6 +85,8 @@ struct module;
> */
> struct clocksource {
> u64 (*read)(struct clocksource *cs);
> + bool (*read_with_stamp)(struct clocksource *cs,
> + u64 *cycles, u64 *cycles_stamp);
> u64 mask;

I'm not really a fan of an interface that leaks magic data to users that
know enough.

And its not clear from this if the magic data is standardized or
different clocksources export different data?

What exactly are the attributes you're trying to pull from the
lower-level hardware that you can't get otherwise (without using the
update_pvclock_gtod() since, if I'm understanding that apparently
gives you too much detail to deal with)?

thanks
-john

2017-08-02 17:11:49

by Paolo Bonzini

Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On 02/08/2017 18:49, John Stultz wrote:
> On Wed, Aug 2, 2017 at 7:38 AM, Denis Plotnikov
> <[email protected]> wrote:
>> V4:
>> * removed "is stable" function with vague definition of stability
>> there is the only function which does time with cycle stamp getting
>> * some variables renamed
>> * some patches split into smaller once
>> * atomic64_t usage is replaced with atomic_t
>>
>> V3:
>> Changing the timekeeper interface for clocksource reading looks like
>> an overkill to achive the goal of getting cycles stamp for KVM.
>> Instead extend the timekeeping interface and add functions which provide
>> necessary data: read clocksource with cycles stamp, check whether the
>> clock source is stable.
>>
>> Use those functions and improve existing timekeeper functionality to
>> replace pvclock_gtod_copy scheme in masterclock data calculation.
>>
>> V2:
>> The main goal is to make L2 kvm-clock be stable when it's running over L1
>> with stable kvm-clock.
>>
>> The patch series is for x86 architecture only. If the series is approved
>> I'll do changes for other architectures but I don't have an ability to
>> compile and check for every single on (help needed)
>>
>> The patch series do the following:
>>
>> * change timekeeper interface to get cycles stamp value from
>> the timekeeper
>> * get rid of pvclock copy in KVM by using the changed timekeeper
>> interface: get time and cycles right from the timekeeper
>> * make KVM recognize a stable kvm-clock as stable clocksource
>> and use the KVM masterclock in this case, which means making
>> L2 stable when running over stable L1 kvm-clock
>
> So, from a brief skim, I'm not a big fan of this patchset. Though this
> is likely in part due to that I haven't seen anything about *why*
> these changes are needed.

From my selfish KVM maintainer point of view, one advantage is that it
drops knowledge of internal timekeeping functioning from KVM, using
ktime_get_snapshot instead. These are patches 1-5. Structuring the
series like this was my idea so I take the blame.

As to patches 6-10, KVM is currently only able to provide vsyscalls if
the host is using the TSC. However, when using nested virtualization
you have

L0: bare-metal hypervisor (uses TSC)
L1: nested hypervisor (uses kvmclock, can use vsyscall)
L2: nested guest

and L2 cannot use vsyscall because it is not using the TSC. This series
lets you use the vsyscall in L2 as long as L1 can.

There is one point where I couldn't help Denis as much as I wanted.
That's a definition of what's a "good" clocksource that can be used by
KVM to provide the vsyscall. I know why the patch is correct, but I
couldn't really define the concept.

In ktime_get_snapshot and struct system_counterval_t's users, they seem
to use "cycles" to map from TSC to ART; this is not unlike kvmclock's
use of "cycles" to map from TSC to nanoseconds at an origin point.
However, it's not clear to me whether "cycles" may be used by
adjust_historical_crosststamp even for non-TSC clocksources (or
non-kvmclock after this series). It doesn't help that
adjust_historical_crosststamp is essentially dead code, since
get_device_system_crosststamp is always called with a NULL history argument.

I'm also CCing Marcelo who wrote the KVM vsyscall code.

Paolo

> Can you briefly explain the issue you're trying to solve, and why you
> think this approach is the way to go?
> (Its usually a good idea to have such rational included in the patchset)

2017-08-02 17:21:25

by Paolo Bonzini

Subject: Re: [PATCH v4 01/10] timekeeper: introduce extended clocksource reading callback

On 02/08/2017 19:08, John Stultz wrote:
>> + bool (*read_with_stamp)(struct clocksource *cs,
>> + u64 *cycles, u64 *cycles_stamp);
>> u64 mask;
> I'm not really fan of an interface that leaks magic data to users that
> know enough.
>
> And its not clear from this if the magic data is standardized or
> different clocksources export different data?
>
> What exactly are the attributes you're trying to pull from the
> lower-level hardware that you can't get otherwise (without using the
> update_pvclock_gtod() since, if I'm understanding that apparently
> gives you too much detail to deal with)?

We need the exact TSC value that was used to compute the ktime. This is
different between TSC and kvmclock because TSC's read() callback
returns cycles (of course), while kvmclock's read() callback returns
nanoseconds.

In turn, kvmclock's read() callback returns nanoseconds because it has
to check the read against the host-provided seqlock, so this cannot be
changed.

Paolo

2017-08-02 23:22:12

by Marcelo Tosatti

Subject: Re: [PATCH v4 07/10] KVM: x86: remove not used pvclock_gtod_copy

Hi Denis,

I'm all for this as well; the original submission suggested something
similar, but someone said "use a scheme similar to vsyscalls",
hence the internal copy of the fields.

More comments below.


On Wed, Aug 02, 2017 at 05:38:07PM +0300, Denis Plotnikov wrote:
> Since, KVM has been switched to getting masterclock related data
> right from the timekeeper by the previous patches, now we are able
> to remove all the parts related to the old scheme of getting
> masterclock data.
>
> This patch removes those parts.
>
> Signed-off-by: Denis Plotnikov <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/kvm/trace.h | 31 ++----
> arch/x86/kvm/x86.c | 216 ++++++----------------------------------
> 3 files changed, 42 insertions(+), 207 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 87ac4fb..91465db 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -791,7 +791,7 @@ struct kvm_arch {
> u64 cur_tsc_generation;
> int nr_vcpus_matched_tsc;
>
> - spinlock_t pvclock_gtod_sync_lock;
> + spinlock_t masterclock_lock;
> bool use_master_clock;
> u64 master_kernel_ns;
> u64 master_cycle_now;
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 0a6cc67..923ab31 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -807,45 +807,39 @@ TRACE_EVENT(kvm_write_tsc_offset,
>
> #ifdef CONFIG_X86_64
>
> -#define host_clocks \
> - {VCLOCK_NONE, "none"}, \
> - {VCLOCK_TSC, "tsc"} \
> -
> TRACE_EVENT(kvm_update_master_clock,
> - TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
> - TP_ARGS(use_master_clock, host_clock, offset_matched),
> + TP_PROTO(bool use_master_clock, bool host_clock_stable,
> + bool offset_matched),
> + TP_ARGS(use_master_clock, host_clock_stable, offset_matched),
>
> TP_STRUCT__entry(
> __field( bool, use_master_clock )
> - __field( unsigned int, host_clock )
> + __field( bool, host_clock_stable )
> __field( bool, offset_matched )
> ),
>
> TP_fast_assign(
> __entry->use_master_clock = use_master_clock;
> - __entry->host_clock = host_clock;
> + __entry->host_clock_stable = host_clock_stable;
> __entry->offset_matched = offset_matched;
> ),
>
> - TP_printk("masterclock %d hostclock %s offsetmatched %u",
> + TP_printk("masterclock %d hostclock stable %u offsetmatched %u",
> __entry->use_master_clock,
> - __print_symbolic(__entry->host_clock, host_clocks),
> + __entry->host_clock_stable,
> __entry->offset_matched)
> );
>
> TRACE_EVENT(kvm_track_tsc,
> TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
> - unsigned int online_vcpus, bool use_master_clock,
> - unsigned int host_clock),
> - TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
> - host_clock),
> + unsigned int online_vcpus, bool use_master_clock),
> + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock),
>
> TP_STRUCT__entry(
> __field( unsigned int, vcpu_id )
> __field( unsigned int, nr_vcpus_matched_tsc )
> __field( unsigned int, online_vcpus )
> __field( bool, use_master_clock )
> - __field( unsigned int, host_clock )
> ),
>
> TP_fast_assign(
> @@ -853,14 +847,11 @@ TRACE_EVENT(kvm_track_tsc,
> __entry->nr_vcpus_matched_tsc = nr_matched;
> __entry->online_vcpus = online_vcpus;
> __entry->use_master_clock = use_master_clock;
> - __entry->host_clock = host_clock;
> ),
>
> - TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
> - " hostclock %s",
> + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u",
> __entry->vcpu_id, __entry->use_master_clock,
> - __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
> - __print_symbolic(__entry->host_clock, host_clocks))
> + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus)
> );
>
> #endif /* CONFIG_X86_64 */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d8ec2ca..53754fa 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -50,7 +50,7 @@
> #include <linux/hash.h>
> #include <linux/pci.h>
> #include <linux/timekeeper_internal.h>
> -#include <linux/pvclock_gtod.h>
> +#include <linux/cs_notifier.h>
> #include <linux/kvm_irqfd.h>
> #include <linux/irqbypass.h>
> #include <linux/sched/stat.h>
> @@ -1134,50 +1134,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
> return kvm_set_msr(vcpu, &msr);
> }
>
> -#ifdef CONFIG_X86_64
> -struct pvclock_gtod_data {
> - seqcount_t seq;
> -
> - struct { /* extract of a clocksource struct */
> - int vclock_mode;
> - u64 cycle_last;
> - u64 mask;
> - u32 mult;
> - u32 shift;
> - } clock;
> -
> - u64 boot_ns;
> - u64 nsec_base;
> - u64 wall_time_sec;
> -};
> -
> -static struct pvclock_gtod_data pvclock_gtod_data;
> -
> -static void update_pvclock_gtod(struct timekeeper *tk)
> -{
> - struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
> - u64 boot_ns;
> -
> - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
> -
> - write_seqcount_begin(&vdata->seq);
> -
> - /* copy pvclock gtod data */
> - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
> - vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
> - vdata->clock.mask = tk->tkr_mono.mask;
> - vdata->clock.mult = tk->tkr_mono.mult;
> - vdata->clock.shift = tk->tkr_mono.shift;
> -
> - vdata->boot_ns = boot_ns;
> - vdata->nsec_base = tk->tkr_mono.xtime_nsec;
> -
> - vdata->wall_time_sec = tk->xtime_sec;
> -
> - write_seqcount_end(&vdata->seq);
> -}
> -#endif
> -
> void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
> {
> /*
> @@ -1269,10 +1225,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
> __func__, base_hz, scaled_hz, shift, *pmultiplier);
> }
>
> -#ifdef CONFIG_X86_64
> -static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
> -#endif
> -
> static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
> static unsigned long max_tsc_khz;
>
> @@ -1366,7 +1318,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
> #ifdef CONFIG_X86_64
> bool vcpus_matched;
> struct kvm_arch *ka = &vcpu->kvm->arch;
> - struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
>
> vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
> atomic_read(&vcpu->kvm->online_vcpus));
> @@ -1379,13 +1330,12 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
> * and the vcpus need to have matched TSCs. When that happens,
> * perform request to enable masterclock.
> */
> - if (ka->use_master_clock ||
> - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
> + if (ka->use_master_clock || vcpus_matched)
> kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

Don't drop this. The masterclock scheme requires TSC for proper functioning
(or an analysis of why it's supposed to work with a different clocksource, HPET+TSC for example).

>
> trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
> - atomic_read(&vcpu->kvm->online_vcpus),
> - ka->use_master_clock, gtod->clock.vclock_mode);
> + atomic_read(&vcpu->kvm->online_vcpus),
> + ka->use_master_clock);
> #endif
> }
>
> @@ -1538,7 +1488,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
> kvm_vcpu_write_tsc_offset(vcpu, offset);
> raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>
> - spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
> + spin_lock(&kvm->arch.masterclock_lock);
> if (!matched) {
> kvm->arch.nr_vcpus_matched_tsc = 0;
> } else if (!already_matched) {
> @@ -1546,9 +1496,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
> }
>
> kvm_track_tsc_matching(vcpu);
> - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
> + spin_unlock(&kvm->arch.masterclock_lock);
> }
> -
> EXPORT_SYMBOL_GPL(kvm_write_tsc);
>
> static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
> @@ -1567,79 +1516,6 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
>
> #ifdef CONFIG_X86_64
>
> -static u64 read_tsc(void)
> -{
> - u64 ret = (u64)rdtsc_ordered();
> - u64 last = pvclock_gtod_data.clock.cycle_last;
> -
> - if (likely(ret >= last))
> - return ret;
> -
> - /*
> - * GCC likes to generate cmov here, but this branch is extremely
> - * predictable (it's just a function of time and the likely is
> - * very likely) and there's a data dependence, so force GCC
> - * to generate a branch instead. I don't barrier() because
> - * we don't actually need a barrier, and if this function
> - * ever gets inlined it will generate worse code.
> - */
> - asm volatile ("");
> - return last;
> -}
> -
> -static inline u64 vgettsc(u64 *cycle_now)
> -{
> - long v;
> - struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> -
> - *cycle_now = read_tsc();
> -
> - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
> - return v * gtod->clock.mult;
> -}
> -
> -static int do_monotonic_boot(s64 *t, u64 *cycle_now)
> -{
> - struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> - unsigned long seq;
> - int mode;
> - u64 ns;
> -
> - do {
> - seq = read_seqcount_begin(&gtod->seq);
> - mode = gtod->clock.vclock_mode;
> - ns = gtod->nsec_base;
> - ns += vgettsc(cycle_now);
> - ns >>= gtod->clock.shift;
> - ns += gtod->boot_ns;
> - } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
> - *t = ns;
> -
> - return mode;
> -}
> -
> -static int do_realtime(struct timespec *ts, u64 *cycle_now)
> -{
> - struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> - unsigned long seq;
> - int mode;
> - u64 ns;
> -
> - do {
> - seq = read_seqcount_begin(&gtod->seq);
> - mode = gtod->clock.vclock_mode;
> - ts->tv_sec = gtod->wall_time_sec;
> - ns = gtod->nsec_base;
> - ns += vgettsc(cycle_now);
> - ns >>= gtod->clock.shift;
> - } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
> -
> - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
> - ts->tv_nsec = ns;
> -
> - return mode;
> -}
> -
> /* returns true if host is using tsc clocksource */
> static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
> {
> @@ -1713,34 +1589,28 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts,
> *
> */
>
> -static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
> +static void update_masterclock(struct kvm *kvm)
> {
> #ifdef CONFIG_X86_64
> struct kvm_arch *ka = &kvm->arch;
> - int vclock_mode;
> - bool host_tsc_clocksource, vcpus_matched;
> + bool host_clocksource_stable, vcpus_matched;
>
> vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
> atomic_read(&kvm->online_vcpus));
>
> /*
> - * If the host uses TSC clock, then passthrough TSC as stable
> - * to the guest.
> + * kvm_get_time_and_clockread returns true if clocksource is stable
> */
> - host_tsc_clocksource = kvm_get_time_and_clockread(
> + host_clocksource_stable = kvm_get_time_and_clockread(
> &ka->master_kernel_ns,
> &ka->master_cycle_now);
>
> - ka->use_master_clock = host_tsc_clocksource && vcpus_matched
> + ka->use_master_clock = host_clocksource_stable && vcpus_matched
> && !ka->backwards_tsc_observed
> && !ka->boot_vcpu_runs_old_kvmclock;
>
> - if (ka->use_master_clock)
> - atomic_set(&kvm_guest_has_master_clock, 1);
> -
> - vclock_mode = pvclock_gtod_data.clock.vclock_mode;
> - trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
> - vcpus_matched);
> + trace_kvm_update_master_clock(ka->use_master_clock,
> + host_clocksource_stable, vcpus_matched);
> #endif
> }
>
> @@ -1756,10 +1626,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> struct kvm_vcpu *vcpu;
> struct kvm_arch *ka = &kvm->arch;
>
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock(&ka->masterclock_lock);
> kvm_make_mclock_inprogress_request(kvm);
> /* no guest entries from this point */
> - pvclock_update_vm_gtod_copy(kvm);
> + update_masterclock(kvm);
>
> kvm_for_each_vcpu(i, vcpu, kvm)
> kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> @@ -1768,7 +1638,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> kvm_for_each_vcpu(i, vcpu, kvm)
> kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
>
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock(&ka->masterclock_lock);
> #endif
> }
>
> @@ -1778,15 +1648,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
> struct pvclock_vcpu_time_info hv_clock;
> u64 ret;
>
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock(&ka->masterclock_lock);
> if (!ka->use_master_clock) {
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock(&ka->masterclock_lock);
> return ktime_get_boot_ns() + ka->kvmclock_offset;
> }
>
> hv_clock.tsc_timestamp = ka->master_cycle_now;
> hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock(&ka->masterclock_lock);
>
> /* both __this_cpu_read() and rdtsc() should be on the same cpu */
> get_cpu();
> @@ -1872,13 +1742,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> * If the host uses TSC clock, then passthrough TSC as stable
> * to the guest.
> */
> - spin_lock(&ka->pvclock_gtod_sync_lock);
> + spin_lock(&ka->masterclock_lock);
> use_master_clock = ka->use_master_clock;
> if (use_master_clock) {
> host_tsc = ka->master_cycle_now;
> kernel_ns = ka->master_kernel_ns;
> }
> - spin_unlock(&ka->pvclock_gtod_sync_lock);
> + spin_unlock(&ka->masterclock_lock);
>
> /* Keep irq disabled to prevent changes to the clock */
> local_irq_save(flags);
> @@ -4208,11 +4078,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
> goto out;
>
> r = 0;
> - /*
> - * TODO: userspace has to take care of races with VCPU_RUN, so
> - * kvm_gen_update_masterclock() can be cut down to locked
> - * pvclock_update_vm_gtod_copy().
> - */

I have no idea what race this is... do you know?

> +
> kvm_gen_update_masterclock(kvm);
> now_ns = get_kvmclock_ns(kvm);
> kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
> @@ -6041,7 +5907,8 @@ static void kvm_set_mmio_spte_mask(void)
> }
>
> #ifdef CONFIG_X86_64
> -static void pvclock_gtod_update_fn(struct work_struct *work)
> +static int process_clocksource_change(struct notifier_block *nb,
> + unsigned long unused0, void *unused1)
> {
> struct kvm *kvm;
>
> @@ -6052,35 +5919,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
> list_for_each_entry(kvm, &vm_list, vm_list)
> kvm_for_each_vcpu(i, vcpu, kvm)
> kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> - atomic_set(&kvm_guest_has_master_clock, 0);
> spin_unlock(&kvm_lock);
> -}
> -
> -static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
> -
> -/*
> - * Notification about pvclock gtod data update.
> - */
> -static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
> - void *priv)
> -{
> - struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> - struct timekeeper *tk = priv;
> -
> - update_pvclock_gtod(tk);
> -
> - /* disable master clock if host does not trust, or does not
> - * use, TSC clocksource
> - */
> - if (gtod->clock.vclock_mode != VCLOCK_TSC &&
> - atomic_read(&kvm_guest_has_master_clock) != 0)
> - queue_work(system_long_wq, &pvclock_gtod_work);

Don't drop this: TSC is required, and switching to another
clock must disable the masterclock scheme.

> -
> return 0;
> }
>
> -static struct notifier_block pvclock_gtod_notifier = {
> - .notifier_call = pvclock_gtod_notify,
> +static struct notifier_block clocksource_change_notifier = {
> + .notifier_call = process_clocksource_change,
> };
> #endif
>
> @@ -6133,7 +5977,7 @@ int kvm_arch_init(void *opaque)
>
> kvm_lapic_init();
> #ifdef CONFIG_X86_64
> - pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
> + clocksource_changes_register_notifier(&clocksource_change_notifier);
> #endif
>
> return 0;
> @@ -6154,7 +5998,7 @@ void kvm_arch_exit(void)
> CPUFREQ_TRANSITION_NOTIFIER);
> cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
> #ifdef CONFIG_X86_64
> - pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
> + clocksource_changes_unregister_notifier(&clocksource_change_notifier);
> #endif
> kvm_x86_ops = NULL;
> kvm_mmu_module_exit();
> @@ -8056,10 +7900,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> raw_spin_lock_init(&kvm->arch.tsc_write_lock);
> mutex_init(&kvm->arch.apic_map_lock);
> mutex_init(&kvm->arch.hyperv.hv_lock);
> - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
> + spin_lock_init(&kvm->arch.masterclock_lock);
>
> kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
> - pvclock_update_vm_gtod_copy(kvm);
> + update_masterclock(kvm);
>
> INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
> INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
> --
> 2.7.4

2017-08-02 23:37:02

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v4 09/10] pvclock: add clocksource change notification on changing of tsc stable bit

On Wed, Aug 02, 2017 at 05:38:09PM +0300, Denis Plotnikov wrote:
> It's needed to notify the KVM guest about critical changes in pvclock
> and make it update its masterclock.
>
> This is a part of the work aiming to make kvmclock a clocksource
> providing valid cycle values for the KVM masterclock; in other words,
> making it possible to use the KVM masterclock over the kvmclock clocksource.
>
> Signed-off-by: Denis Plotnikov <[email protected]>
> ---
> arch/x86/kernel/pvclock.c | 19 ++++++++++++++++---
> 1 file changed, 16 insertions(+), 3 deletions(-)

Please do an analysis similar to the comment which starts at

"* Assuming a stable TSC across physical CPUS, and a stable TSC
* across virtual CPUs, the following condition is possible.
* Each numbered line represents an event visible to both
* CPUs at the next numbered event."

Describing why it's safe to use kvmclock as the source for the masterclock
(honestly I haven't gone through the details, but someone should
before this patch is merged).

For one thing, it's only safe to use a kvmclock-based masterclock if
the TSCs are synchronized in the host (so that you can read the offset
on vcpu-0 using a TSC that has been initialized on vcpu-1).
So a masterclock in the L1 guest is necessary. Do you enforce that?

Also L2 guest TSCs must be synchronized. Where is that enforced?

Also, why it's safe to use a non-TSC clocksource (clock_read, tsc_read) plus
TSC offsets from that point.



>
> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
> index bece384..5898f20 100644
> --- a/arch/x86/kernel/pvclock.c
> +++ b/arch/x86/kernel/pvclock.c
> @@ -22,6 +22,7 @@
> #include <linux/gfp.h>
> #include <linux/bootmem.h>
> #include <linux/nmi.h>
> +#include <linux/cs_notifier.h>
>
> #include <asm/fixmap.h>
> #include <asm/pvclock.h>
> @@ -73,6 +74,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
> return flags & valid_flags;
> }
>
> +static atomic_t clocksource_stable = ATOMIC_INIT(0);
> +
> u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src,
> u64 *cycles_stamp, u8 *flags_stamp)
> {
> @@ -102,10 +105,20 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src,
> pvclock_touch_watchdogs();
> }
>
> - if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
> - (flags & PVCLOCK_TSC_STABLE_BIT))
> - return ret;
> + if (likely(valid_flags & PVCLOCK_TSC_STABLE_BIT)) {
> + bool stable_now = !!(flags & PVCLOCK_TSC_STABLE_BIT);
> + bool stable_last = (bool) atomic_read(&clocksource_stable);
> +
> + if (unlikely(stable_now != stable_last)) {
> + /* send notification once */
> + if (stable_last == atomic_cmpxchg(
> + &clocksource_stable, stable_last, stable_now))
> + clocksource_changes_notify();
> + }
>
> + if (stable_now)
> + return ret;
> + }
> /*
> * Assumption here is that last_value, a global accumulator, always goes
> * forward. If we are less than that, we should not be much smaller.
> --
> 2.7.4

2017-08-03 12:35:27

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v4 07/10] KVM: x86: remove not used pvclock_gtod_copy

On 03/08/2017 01:21, Marcelo Tosatti wrote:
>> - if (ka->use_master_clock ||
>> - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
>> + if (ka->use_master_clock || vcpus_matched)
>> kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> Don't drop this. The masterclock scheme requires TSC for proper functioning
> (or an analysis of why it's supposed to work with a different clocksource, HPET+TSC for example).

I think testing gtod->clock.vclock_mode is just an optimization?
kvm_get_time_and_clockread would return false anyway and masterclock
would not be enabled.
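
For reference, the pre-series helper is roughly the following (paraphrased
from the current x86.c, so treat it as a sketch rather than a verbatim copy);
the early vclock_mode test only short-circuits what the final comparison
would reject anyway:

/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
{
	/* checked again under the seqlock in do_monotonic_boot() */
	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
		return false;

	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
}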

Paolo

2017-08-11 22:59:39

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v4 07/10] KVM: x86: remove not used pvclock_gtod_copy

On Thu, Aug 03, 2017 at 02:35:19PM +0200, Paolo Bonzini wrote:
> On 03/08/2017 01:21, Marcelo Tosatti wrote:
> >> - if (ka->use_master_clock ||
> >> - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
> >> + if (ka->use_master_clock || vcpus_matched)
> >> kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> > Don't drop this. The masterclock scheme requires TSC for proper functioning
> > (or an analysis of why it's supposed to work with a different clocksource, HPET+TSC for example).
>
> I think testing gtod->clock.vclock_mode is just an optimization?
> kvm_get_time_and_clockread would return false anyway and masterclock
> would not be enabled.
>
> Paolo

Right.


2017-08-21 08:40:43

by Denis Plotnikov

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

ping!

On 02.08.2017 20:11, Paolo Bonzini wrote:
> On 02/08/2017 18:49, John Stultz wrote:
>> On Wed, Aug 2, 2017 at 7:38 AM, Denis Plotnikov
>> <[email protected]> wrote:
>>> V4:
>>> * removed "is stable" function with vague definition of stability
>>> there is the only function which does time with cycle stamp getting
>>> * some variables renamed
>>> * some patches split into smaller once
>>> * atomic64_t usage is replaced with atomic_t
>>>
>>> V3:
>>> Changing the timekeeper interface for clocksource reading looks like
>>> an overkill to achive the goal of getting cycles stamp for KVM.
>>> Instead extend the timekeeping interface and add functions which provide
>>> necessary data: read clocksource with cycles stamp, check whether the
>>> clock source is stable.
>>>
>>> Use those functions and improve existing timekeeper functionality to
>>> replace pvclock_gtod_copy scheme in masterclock data calculation.
>>>
>>> V2:
>>> The main goal is to make L2 kvm-clock be stable when it's running over L1
>>> with stable kvm-clock.
>>>
>>> The patch series is for x86 architecture only. If the series is approved
>>> I'll do changes for other architectures but I don't have an ability to
>>> compile and check for every single on (help needed)
>>>
>>> The patch series do the following:
>>>
>>> * change timekeeper interface to get cycles stamp value from
>>> the timekeeper
>>> * get rid of pvclock copy in KVM by using the changed timekeeper
>>> interface: get time and cycles right from the timekeeper
>>> * make KVM recognize a stable kvm-clock as stable clocksource
>>> and use the KVM masterclock in this case, which means making
>>> L2 stable when running over stable L1 kvm-clock
>>
>> So, from a brief skim, I'm not a big fan of this patchset. Though this
>> is likely in part because I haven't seen anything about *why*
>> these changes are needed.
>
> From my selfish KVM maintainer point of view, one advantage is that it
> drops knowledge of internal timekeeping functioning from KVM, using
> ktime_get_snapshot instead. These are patches 1-5. Structuring the
> series like this was my idea so I take the blame.
>
> As to patches 6-10, KVM is currently only able to provide vsyscalls if
> the host is using the TSC. However, when using nested virtualization
> you have
>
> L0: bare-metal hypervisor (uses TSC)
> L1: nested hypervisor (uses kvmclock, can use vsyscall)
> L2: nested guest
>
> and L2 cannot use vsyscall because it is not using the TSC. This series
> lets you use the vsyscall in L2 as long as L1 can.
>
> There is one point where I couldn't help Denis as much as I wanted.
> That's a definition of what's a "good" clocksource that can be used by
> KVM to provide the vsyscall. I know why the patch is correct, but I
> couldn't really define the concept.
>
> In ktime_get_snapshot and struct system_counterval_t's users, they seem
> to use "cycles" to map from TSC to ART; this is not unlike kvmclock's
> use of "cycles" to map from TSC to nanoseconds at an origin point.
> However, it's not clear to me whether "cycles" may be used by
> adjust_historical_crosststamp even for non-TSC clocksources (or
> non-kvmclock after this series). It doesn't help that
> adjust_historical_crosststamp is essentially dead code, since
> get_device_system_crosststamp is always called with a NULL history argument.
>
> I'm also CCing Marcelo who wrote the KVM vsyscall code.
>
> Paolo
>
>> Can you briefly explain the issue you're trying to solve, and why you
>> think this approach is the way to go?
>> (It's usually a good idea to have such rationale included in the patchset)
>

--
Best,
Denis

2017-08-22 19:50:11

by John Stultz

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On Mon, Aug 21, 2017 at 1:40 AM, Denis Plotnikov
<[email protected]> wrote:
> ping!
>

I still don't feel my questions have been well answered. It's really
not clear to me why, in order to allow the level-2 guest to use a vdso,
the answer is to export more data through the entire stack rather
than to make the kvmclock usable from the vsyscall.

So far for a problem statement, all I've got is:
"However, when using nested virtualization you have

L0: bare-metal hypervisor (uses TSC)
L1: nested hypervisor (uses kvmclock, can use vsyscall)
L2: nested guest

and L2 cannot use vsyscall because it is not using the TSC."

Which is a start but doesn't really make it clear why the proposed
solution is best/necessary.

thanks
-john

2017-08-22 21:00:57

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM


> I still don't feel my questions have been well answered. It's really
> not clear to me why, in order to allow the level-2 guest to use a vdso,
> the answer is to export more data through the entire stack rather
> than to make the kvmclock usable from the vsyscall.

Thanks, this helps.

A stable kvmclock is already usable from the vsyscall. It is however not
yet usable _in the hypervisor_ as a way to provide another stable kvmclock
to the nested guest; right now the only clocksource that a hypervisor can
use to provide a stable kvmclock is the TSC.

So, regarding the "why is it necessary" part. Even on a modern host with
invariant TSC, kvmclock mediates between TSC and the guest and provides for
example support for live migration, where the TSC frequency may be
different between source and destination. If the L1 hypervisor could
use the TSC to provide a stable kvmclock, there would be no need for kvmclock
in the first place. The paravirtualized clock may well disappear in a few
years since Skylake provides TSC scaling. However, I'm not that optimistic
because people are complaining that I removed support for 2007 processors
and it seems that I'll have to put it back. So, as more people use nested
virtualization (and we have nested virt migration in the works, too), nested
kvmclock becomes more important too.

Regarding the "why is it best" part. Right now, the hypervisor makes a
copy of the timekeeper information in order to prepare the stable kvmclock.
This code is very much tied to the TSC. However, a snapshot of the timekeeper
information is almost entirely the same thing that ktime_get_snapshot returns,
so my suggestion to "untie" the hypervisor code from the TSC was to use
ktime_get_snapshot instead. This way, the clocksource itself tells KVM
whether it can be the base for a vsyscall-happy kvmclock (which means, it
must be the TSC or a linear transformation of it).

While I am very happy with how the KVM code comes out, it might certainly
be not the best solution---I definitely need help from the clocksource
maintainers here, not just approval! In particular, it doesn't help that
a lot of code surrounding ktime_get_snapshot is unused, so that may have
sent me off track.

In particular, the return value of the new callback can be defined as "is
it the TSC or a linear transformation of it". But that's as good a definition
as "is it good for KVM" (i.e., not very good) without some documentation on
the meaning of "cycles" in the struct returned by ktime_get_snapshot. Once I
understand that, I hope I can provide a better explanation for the return
value of the callback.
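
For context, the snapshot structure whose "cycles" field needs that
documentation currently looks roughly like this (include/linux/timekeeping.h,
before the 'boot' field this series adds; the field comments are my reading,
not kernel-doc):

struct system_time_snapshot {
	u64		cycles;			/* clocksource cycle value at snapshot time */
	ktime_t		real;			/* CLOCK_REALTIME at snapshot time */
	ktime_t		raw;			/* CLOCK_MONOTONIC_RAW at snapshot time */
	unsigned int	clock_was_set_seq;	/* detects clock-was-set events */
	u8		cs_was_changed_seq;	/* detects clocksource changes */
};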

Paolo

> So far for a problem statement, all I've got is:
> "However, when using nested virtualization you have
>
> L0: bare-metal hypervisor (uses TSC)
> L1: nested hypervisor (uses kvmclock, can use vsyscall)
> L2: nested guest
>
> and L2 cannot use vsyscall because it is not using the TSC."
>
> Which is a start but doesn't really make it clear why the proposed
> solution is best/necessary.
>
> thanks
> -john
>

2017-08-23 12:45:54

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On Tue, 22 Aug 2017, Paolo Bonzini wrote:
> Regarding the "why is it best" part. Right now, the hypervisor makes a
> copy of the timekeeper information in order to prepare the stable kvmclock.
> This code is very much tied to the TSC. However, a snapshot of the timekeeper
> information is almost entirely the same thing that ktime_get_snapshot returns,
> so my suggestion to "untie" the hypervisor code from the TSC was to use
> ktime_get_snapshot instead. This way, the clocksource itself tells KVM
> whether it can be the base for a vsyscall-happy kvmclock (which means, it
> must be the TSC or a linear transformation of it).
>
> While I am very happy with how the KVM code comes out, it might certainly
> be not the best solution---I definitely need help from the clocksource
> maintainers here, not just approval! In particular, it doesn't help that
> a lot of code surrounding ktime_get_snapshot is unused, so that may have
> sent me off track.
>
> In particular, the return value of the new callback can be defined as "is
> it the TSC or a linear transformation of it". But that's as good a
> definition as "is it good for KVM" (i.e., not very good) without some
> documentation on the meaning of "cycles" in the struct returned by
> ktime_get_snapshot. Once I understand that, I hope I can provide a better
> explanation for the return value of the callback.

This all looks wrong to begin with and you are just bolting and duct taping
stuff together as you see fit.

I understand the idea of providing a clocksource to the core code which
provides direct nanosecond resolution and doing the host -> guest conversion
in the kvmclock implementation. But that's the root cause of all evils.

The reason why you do that is to support live migration of VMs to hosts
with a different TSC frequency. And for exactly that particular corner case
the whole conversion magic is implemented and now you need even more duct
tape to make it work with nested VMs.

That's just wrong. We need to sit back and look at that whole kvm clock
mechanism and redesign it proper.

Let's look at the goals:

1) Provide the information of host frequency to the guest

2) Propagate host frequency changes to the guest

That's solely used for migration purposes.

It's not there for dealing with non-constant-frequency TSCs, which
would be beyond insane.

Nor is it there to propagate host clocksource adjustments (via NTP &
friends) to a guest at runtime. If you use it that way today, then this needs to
be fixed, simply because that's the wrong approach. We have proper
correction mechanisms (NTP, PPS, PTP ...) which can be utilized to do
so. Aside from that, it'd be interesting to see the interaction between
the host frequency scaling change and NTP in the guest trying to
adjust as well.

Now lets look at a simple ktime_get() with the current implementation:

ktime_get()
do {
seq = seqcount_begin(tk);
data = tk->data;
now = tk->clock->read(tk->clock);
kvmclock_read()
do {
seqk = seqcount_begin(kvmclock);
nowk = kvmclock_read();
datak = kvmclock_data;
} while (seqcount_retry(seqk, kvmclock));
nsec = convert(nowk, datak);
return nsec;
} while (seqcount_retry(seq, tk));

nsec = convert(now, data);
return nsec;

So you need two sequence counters and two conversions for reading the
clock. Sorry, but that's just crap.

Let's look at the normal usecase (no migration) first:

The TSC frequency can be retrieved at guest boot time and does not ever
change. For this case the above is bloat and completely useless.

All you need to do is to register the kvm clocksource with the proper
frequency and the readout is just the plain TSC read. Nothing
else.
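
A minimal sketch of that simple case, assuming the frequency is read once at
boot and never changes (kvm_get_tsc_khz() here stands for whatever helper the
guest uses to obtain it; this is an illustration, not the proposed patch):

static u64 kvm_clock_plain_read(struct clocksource *cs)
{
	/* readout is just the plain TSC read */
	return rdtsc_ordered();
}

static struct clocksource kvm_clock_plain = {
	.name	= "kvm-clock",
	.rating	= 400,
	.read	= kvm_clock_plain_read,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init kvm_clock_plain_init(void)
{
	/* register with the frequency retrieved from the host at boot */
	return clocksource_register_khz(&kvm_clock_plain, kvm_get_tsc_khz());
}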

So now the special case of live migration. You need to make sure that the
change of the TSC frequency is properly propagated and any reader which is
in the middle of a time getter function will retry with the new parameters.

That's not any different from the case where the timekeeper is adjusted by
any of the regular mechanisms: update_wall_time(), ntp, pps, ptp ....

The only thing you need to ensure is that the timekeeping core is properly
informed about the change and code which executes time getter functions
will retry. The core has _ALL_ mechanisms available for that.

So the real question is how to ensure that:

1) None of the update functions is in progress

2) The update is propagated via the existing mechanisms

The whole live migration magic is orchestrated by qemu and the kernel. So
it's reasonably simple to ensure that.

Something like the below should work for this:

Host Guest
1: prevent_nmi_delivery_to_guest()
2: inject_kvmclock_irq()
3: handle_kvmclock_irq()
4: timekeeping_freeze()
5: exit_vm()
6: stop_and_migrate_guest()
7: update_guest_data()
8: resume_guest()
9: timekeeping_reconfigure();
10: timekeeping_unfreeze();
11: resume_nmi_delivery_to_guest()

#1 Ensures that no NMI is delivered to the guest so that the timekeeper
core does not have to worry about NMI context calling any of the NMI
safe time getters.

If that's not possible then it's simple enough to do something about
the NMI safe time getters in the time keeping core code, so you don't
have to worry much about it.

#2 Inject a special interrupt vector which initiates the timekeeping
freeze mechanism in the guest

#3 kvm clock interrupt handler gets invoked

#4 Ensures that:

- All potential timekeeping update mechanisms have left the critical
region

- All potential time getters are blocked

The mechanism for that is simple enough:

timekeeping_freeze()
{
raw_spin_lock(&timekeeper_lock);
write_seqcount_begin(&tk_core.seq);
/* Ensure a consistent state */
timekeeping_forward_now(tk);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
}

#5 Exit the VM with some magic exit reason so the host side knows that
timekeeping is frozen

#6 Stop the VM, migrate it

#9 Retrieve the new conversion factor and adjust the timekeeper data
accordingly

#10 Unfreeze the time keeper with the new configuration

timekeeping_unfreeze()
{
/* Ensure a consistent state */
timekeeping_forward_now(tk);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

write_seqcount_end(&tk_core.seq);
raw_spin_unlock(&timekeeper_lock);
}

#11 Resume NMI delivery

As I said in #1 this can be completely handled in the timekeeper core,
but if we can avoid that then stuff becomes simpler. And simpler is
preferred ....


So now for the nested KVM case. If you follow the above scheme then this
becomes really simple:

1) The TSC frequency is merely propagated to the L2 guest. It's the
same as the L1 guest TSC frequency. No magic voodoo required.

2) Migration of a L2 guest to a different L1 guest follows the
same scheme as above

3) Migration of a L2 guest to a physical host follows the same scheme as
above - no idea whether that's supported at all

4) Migration of a L1 guest with an embedded L2 guest is not rocket science
either. The above needs some extra code which propagates the time
keeper freeze to L2 and then when L1 resumes, the updated frequency
data is propagated to L2 and L2 resumed along with it.

You don't need any timestamp snapshot magic and voodoo callbacks. All you
need is a proper mechanism to update the timekeeper.

I might be missing something really important as usual. If so, I'm happy to
be educated.

Thanks,

tglx

2017-08-23 16:02:25

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On 23/08/2017 14:45, Thomas Gleixner wrote:
> So the real question is how to ensure that:
>
> 1) None of the update functions is in progress
>
> 2) The update is propagated via the existing mechanisms
>
> The whole live migration magic is orchestrated by qemu and the kernel. So
> it's reasonably simple to ensure that.

There is no orchestration whatsoever between QEMU and the host kernel,
much less between anything and the guest. QEMU just sends ioctls. The
complex part is above QEMU, just because you have to make sure that the
destination sets up networking and everything else just like the source,
but as far as QEMU is concerned live migration is as simple as

stop_guest()
get_guest_state()
write_data_to_socket()

read_data_from_socket()
set_guest_state()
resume_guest()

and as far as KVM is concerned, it's as simple as

get a signal, KVM_RUN exits
KVM_GET_REGS and a bunch more ioctls

KVM_SET_REGS and a bunch more ioctls
KVM_SET_KVMCLOCK triggers kvmclock update
ioctl(KVM_RUN)
process KVM_REQ_CLOCK_UPDATE
enter the guest
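
In code, the kvmclock part of that sequence is no more than the two clock
ioctls (userspace sketch, error handling omitted; src_vm_fd/dst_vm_fd are the
KVM VM file descriptors on the two sides):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static void migrate_kvmclock(int src_vm_fd, int dst_vm_fd)
{
	struct kvm_clock_data clock;

	/* source: after the vcpus are stopped */
	ioctl(src_vm_fd, KVM_GET_CLOCK, &clock);

	/* destination: before resuming; KVM raises KVM_REQ_CLOCK_UPDATE */
	ioctl(dst_vm_fd, KVM_SET_CLOCK, &clock);
}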

Adding interrupts and all that is much, much more complicated than just
reusing code that runs all the time (albeit only on hosts with impaired
TSC) and needs no special case at all.

> The reason why you do that is to support live migration of VMs to hosts
> with a different TSC frequency. And for exactly that particular corner case
> the whole conversion magic is implemented and now you need even more duct
> tape to make it work with nested VMs.

The point of the first part of this series is to _remove_ the duct tape
and actually make KVM use generic timekeeper services, namely
ktime_get_snapshot. If you look at patches 1-2-3-4-5-7 the delta is
-120 lines of code, without any nested virtualization stuff.

More duct tape would have been just:

- if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+ mode = READ_ONCE(pvclock_gtod_data.clock.vclock_mode);
+ if (mode != VCLOCK_TSC &&
+ (mode != VCLOCK_PVCLOCK || !pvclock_nested_virt_magic()))
return false;

- return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+ switch (mode) {
+ case VCLOCK_TSC:
+ return do_realtime_tsc(ts, cycle_now);
+ case VCLOCK_PVCLOCK:
+ return do_realtime_pvclock(ts, cycle_now);
+ }

Nested virtualization does need a clocksource change notifier on top,
but we can cross that bridge later. Maybe Denis can post just those
patches to begin with.

Paolo

> So now for the nested KVM case. If you follow the above scheme then this
> becomes really simple:
>
> 1) The TSC frequency is merely propagated to the L2 guest. It's the
> same as the L1 guest TSC frequency. No magic voodoo required.
>
> 2) Migration of a L2 guest to a different L1 guest follows the
> same scheme as above
>
> 3) Migration of a L2 guest to a physical host follows the same scheme as
> above - no idea whether that's supported at all
>
> 4) Migration of a L1 guest with an embedded L2 guest is not rocket science
> either. The above needs some extra code which propagates the time
> keeper freeze to L2 and then when L1 resumes, the updated frequency
> data is propagated to L2 and L2 resumed along with it.
>
> You don't need any timestamp snapshot magic and voodoo callbacks. All you
> need is a proper mechanism to update the timekeeper.
>
> I might be missing something really important as usual. If so, I'm happy to
> be educated.

2017-08-24 08:00:51

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM

On 23/08/2017 18:02, Paolo Bonzini wrote:
>
> More duct tape would have been just:
>
> - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
> + mode = READ_ONCE(pvclock_gtod_data.clock.vclock_mode);
> + if (mode != VCLOCK_TSC &&
> + (mode != VCLOCK_PVCLOCK || !pvclock_nested_virt_magic()))
> return false;
>
> - return do_realtime(ts, cycle_now) == VCLOCK_TSC;
> + switch (mode) {
> + case VCLOCK_TSC:
> + return do_realtime_tsc(ts, cycle_now);
> + case VCLOCK_PVCLOCK:
> + return do_realtime_pvclock(ts, cycle_now);
> + }
>
> Nested virtualization does need a clocksource change notifier on top,
> but we can cross that bridge later. Maybe Denis can post just those
> patches to begin with.

For what it's worth, this is all that's needed (with patches 1-2-3-4-5-7)
to support kvmclock on top of Hyper-V clock. It's trivial.

Even if we could add paravirtualization magic to KVM live migration, we
certainly couldn't do that for other hypervisors.

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 5b882cc0c0e9..3bab935b021a 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -46,10 +46,24 @@ static u64 read_hv_clock_tsc(struct clocksource *arg)
return current_tick;
}

+static bool read_hv_clock_tsc_with_stamp(struct clocksource *arg,
+ u64 *cycles, u64 *cycles_stamp)
+{
+ *cycles = __hv_read_tsc_page(tsc_pg, cycles_stamp);
+
+ if (*cycles == U64_MAX) {
+ rdmsrl(HV_X64_MSR_TIME_REF_COUNT, *cycles);
+ return false;
+ }
+
+ return true;
+}
+
static struct clocksource hyperv_cs_tsc = {
.name = "hyperv_clocksource_tsc_page",
.rating = 400,
.read = read_hv_clock_tsc,
+ .read_with_stamp = read_hv_clock_tsc_with_stamp,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 2b58c8c1eeaa..5aff66e9fff7 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -176,9 +176,9 @@ void hyperv_cleanup(void);
#endif
#ifdef CONFIG_HYPERV_TSCPAGE
struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
-static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+static inline u64 __hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
{
- u64 scale, offset, cur_tsc;
+ u64 scale, offset;
u32 sequence;

/*
@@ -209,7 +209,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)

scale = READ_ONCE(tsc_pg->tsc_scale);
offset = READ_ONCE(tsc_pg->tsc_offset);
- cur_tsc = rdtsc_ordered();
+ *cur_tsc = rdtsc_ordered();

/*
* Make sure we read sequence after we read all other values
@@ -219,9 +219,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)

} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);

- return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
+ return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
}

+static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+{
+ u64 cur_tsc;
+ return __hv_read_tsc_page(tsc_pg, &cur_tsc);
+}
#else
static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
{


Denis, could you try redoing patch 7 to use the pvclock_gtod_notifier
instead of the new one you're adding, and only send that first part? I
think it's a worthwhile cleanup anyway, so let's start with that.

Paolo

2017-08-28 07:29:13

by Denis Plotnikov

[permalink] [raw]
Subject: Re: [PATCH v4 00/10] make L2's kvm-clock stable, get rid of pvclock_gtod_copy in KVM



On 24.08.2017 11:00, Paolo Bonzini wrote:
> On 23/08/2017 18:02, Paolo Bonzini wrote:
>>
>> More duct tape would have been just:
>>
>> - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
>> + mode = READ_ONCE(pvclock_gtod_data.clock.vclock_mode);
>> + if (mode != VCLOCK_TSC &&
>> + (mode != VCLOCK_PVCLOCK || !pvclock_nested_virt_magic()))
>> return false;
>>
>> - return do_realtime(ts, cycle_now) == VCLOCK_TSC;
>> + switch (mode) {
>> + case VCLOCK_TSC:
>> + return do_realtime_tsc(ts, cycle_now);
>> + case VCLOCK_PVCLOCK:
>> + return do_realtime_pvclock(ts, cycle_now);
>> + }
>>
>> Nested virtualization does need a clocksource change notifier on top,
>> but we can cross that bridge later. Maybe Denis can post just those
>> patches to begin with.
>
> For what it's worth, this is all that's needed (with patches 1-2-3-4-5-7)
> to support kvmclock on top of Hyper-V clock. It's trivial.
>
> Even if we could add paravirtualization magic to KVM live migration, we
> certainly couldn't do that for other hypervisors.
>
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 5b882cc0c0e9..3bab935b021a 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -46,10 +46,24 @@ static u64 read_hv_clock_tsc(struct clocksource *arg)
> return current_tick;
> }
>
> +static bool read_hv_clock_tsc_with_stamp(struct clocksource *arg,
> + u64 *cycles, u64 *cycles_stamp)
> +{
> + *cycles = __hv_read_tsc_page(tsc_pg, cycles_stamp);
> +
> + if (*cycles == U64_MAX) {
> + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, *cycles);
> + return false;
> + }
> +
> + return true;
> +}
> +
> static struct clocksource hyperv_cs_tsc = {
> .name = "hyperv_clocksource_tsc_page",
> .rating = 400,
> .read = read_hv_clock_tsc,
> + .read_with_stamp = read_hv_clock_tsc_with_stamp,
> .mask = CLOCKSOURCE_MASK(64),
> .flags = CLOCK_SOURCE_IS_CONTINUOUS,
> };
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index 2b58c8c1eeaa..5aff66e9fff7 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -176,9 +176,9 @@ void hyperv_cleanup(void);
> #endif
> #ifdef CONFIG_HYPERV_TSCPAGE
> struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
> -static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
> +static inline u64 __hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
> {
> - u64 scale, offset, cur_tsc;
> + u64 scale, offset;
> u32 sequence;
>
> /*
> @@ -209,7 +209,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
>
> scale = READ_ONCE(tsc_pg->tsc_scale);
> offset = READ_ONCE(tsc_pg->tsc_offset);
> - cur_tsc = rdtsc_ordered();
> + *cur_tsc = rdtsc_ordered();
>
> /*
> * Make sure we read sequence after we read all other values
> @@ -219,9 +219,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
>
> } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
>
> - return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
> + return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
> }
>
> +static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
> +{
> + u64 cur_tsc;
> + return __hv_read_tsc_page(tsc_pg, &cur_tsc);
> +}
> #else
> static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
> {
>
>
> Denis, could you try redoing patch 7 to use the pvclock_gtod_notifier
> instead of the new one you're adding, and only send that first part? I
> think it's a worthwhile cleanup anyway, so let's start with that.
>
> Paolo
>
Ok, I'll do that

Denis