LinuxLists.cc - [PATCH v3 0/6] Steal time for KVM

[permalink] [raw]

Subject: [PATCH v3 6/6] Describe KVM_MSR_STEAL_TIME

This patch adds documentation about usage of the newly
introduced KVM_MSR_STEAL_TIME.

Signed-off-by: Glauber Costa <[email protected]>
---
Documentation/kvm/msr.txt | 33 +++++++++++++++++++++++++++++++++
1 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
index d079aed..79c12a7 100644
--- a/Documentation/kvm/msr.txt
+++ b/Documentation/kvm/msr.txt
@@ -185,3 +185,36 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02

Currently type 2 APF will be always delivered on the same vcpu as
type 1 was, but guest should not rely on that.
+
+MSR_KVM_STEAL_TIME: 0x4b564d03
+
+ data: 64-byte alignment physical address of a memory area which must be
+ in guest RAM, plus an enable bit in bit 0. This memory is expected to
+ hold a copy of the following structure:
+
+ struct kvm_steal_time {
+ __u64 steal;
+ __u32 version;
+ __u32 flags;
+ __u32 pad[6];
+ }
+
+ whose data will be filled in by the hypervisor periodically. Only one
+ write, or registration, is needed for each VCPU. The interval between
+ updates of this structure is arbitrary and implementation-dependent.
+ The hypervisor may update this structure at any time it sees fit until
+ anything with bit0 == 0 is written to it. Guest is required to make sure
+ this structure is initialized to zero.
+
+ Fields have the following meanings:
+
+ version: guest has to check version before and after grabbing
+ time information and check that they are both equal and even.
+ An odd version indicates an in-progress update.
+
+ flags: At this point, always zero. May be used to indicate
+ changes in this structure in the future.
+
+ steal: the amount of time in which this vCPU did not run, in
+ nanoseconds.
+
--
1.7.2.3

2011-02-11 18:21:28

[permalink] [raw]

Subject: [PATCH v3 4/6] KVM-GST: KVM Steal time registration

Register steal time within KVM. Everytime we sample the steal time
information, we update a local variable that tells what was the
last time read. We then account the difference.

Signed-off-by: Glauber Costa <[email protected]>
CC: Rik van Riel <[email protected]>
CC: Jeremy Fitzhardinge <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Avi Kivity <[email protected]>
---
arch/x86/kernel/kvm.c | 43 +++++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/kvmclock.c | 2 ++
2 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0..ea49217 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -58,6 +58,7 @@ struct kvm_para_state {

static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_steal_time, steal_time);

static struct kvm_para_state *kvm_para_state(void)
{
@@ -483,24 +484,46 @@ static struct notifier_block kvm_pv_reboot_nb = {
.notifier_call = kvm_pv_reboot_notify,
};

+static int kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ int low, high, ret;
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!hypervisor_steal_time)
+ return 0;
+
+ memset(st, 0, sizeof(*st));
+
+ low = (int)__pa(st) | 1;
+ high = ((u64)__pa(st) >> 32);
+ ret = wrmsr_safe(MSR_KVM_STEAL_TIME, low, high);
+ printk(KERN_INFO "kvm-stealtime: cpu %d, msr %x:%x\n",
+ cpu, high, low);
+ return ret;
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
#ifdef CONFIG_KVM_CLOCK
WARN_ON(kvm_register_clock("primary cpu clock"));
#endif
+ WARN_ON(kvm_register_steal_time());
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
}

static void __cpuinit kvm_guest_cpu_online(void *dummy)
{
+ WARN_ON(kvm_register_steal_time());
kvm_guest_cpu_init();
}

static void kvm_guest_cpu_offline(void *dummy)
{
kvm_pv_disable_apf(NULL);
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
apf_task_wake_all();
}

@@ -534,6 +557,23 @@ static void __init kvm_apf_trap_init(void)
set_intr_gate(14, &async_page_fault);
}

+static u64 kvm_account_steal_time(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ rmb();
+ steal = src->steal;
+ rmb();
+ } while ((src->version & 1) || (version != src->version));
+
+ return steal;
+}
+
void __init kvm_guest_init(void)
{
int i;
@@ -548,6 +588,9 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;

+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+ hypervisor_steal_time = kvm_account_steal_time;
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3ea..dcb6a67 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -164,6 +164,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
native_machine_crash_shutdown(regs);
}
#endif
@@ -171,6 +172,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
static void kvm_shutdown(void)
{
native_write_msr(msr_kvm_system_time, 0, 0);
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
native_machine_shutdown();
}

--
1.7.2.3

2011-02-11 18:21:42

[permalink] [raw]

Subject: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

This patch accounts steal time time in kernel/sched.
I kept it from last proposal, because I still see advantages
in it: Doing it here will give us easier access from scheduler
variables such as the cpu rq. The next patch shows an example of
usage for it.

Since functions like account_idle_time() can be called from
multiple places, not only account_process_tick(), steal time
grabbing is repeated in each account function separatedely.

Signed-off-by: Glauber Costa <[email protected]>
CC: Rik van Riel <[email protected]>
CC: Jeremy Fitzhardinge <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Avi Kivity <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f94..5dbf509 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -302,6 +302,7 @@ long io_schedule_timeout(long timeout);
extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
+extern u64 (*hypervisor_steal_time)(int cpu);
extern void scheduler_tick(void);

extern void sched_show_task(struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..60b0cf8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -524,6 +524,8 @@ struct rq {
u64 prev_irq_time;
#endif

+ u64 prev_steal_ticks;
+
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
@@ -1780,6 +1782,16 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dec_nr_running(rq);
}

+u64 (*hypervisor_steal_time)(int cpu) = NULL;
+
+static u64 steal_time_clock(int cpu)
+{
+ if (!hypervisor_steal_time)
+ return 0;
+
+ return hypervisor_steal_time(cpu);
+}
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
@@ -3509,6 +3521,33 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
}

/*
+ * We have to at flush steal time information every time something else
+ * is accounted. Since the accounting functions are all visible to the rest
+ * of the kernel, it gets tricky to do them in one place. This helper function
+ * helps us.
+ *
+ * When the system is idle, the concept of steal time does not apply. We just
+ * tell the underlying hypervisor that we grabbed the data, but skip steal time
+ * accounting
+ */
+static int touch_steal_time(int is_idle)
+{
+ u64 steal, st;
+
+ steal = steal_time_clock(smp_processor_id());
+
+ st = steal / TICK_NSEC - this_rq()->prev_steal_ticks;
+
+ this_rq()->prev_steal_ticks += st;
+
+ if (!st || is_idle)
+ return 0;
+
+ account_steal_time(st);
+ return 1;
+}
+
+/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @cputime: the cpu time spent in user space since the last update
@@ -3520,6 +3559,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t tmp;

+ if (touch_steal_time(0))
+ return;
+
/* Add user time to process. */
p->utime = cputime_add(p->utime, cputime);
p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
@@ -3580,6 +3622,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t tmp;

+ if (touch_steal_time(0))
+ return;
+
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
return;
@@ -3627,6 +3672,8 @@ void account_idle_time(cputime_t cputime)
cputime64_t cputime64 = cputime_to_cputime64(cputime);
struct rq *rq = this_rq();

+ touch_steal_time(1);
+
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
else
--
1.7.2.3

2011-02-11 18:21:44

[permalink] [raw]

Subject: [PATCH v3 1/6] KVM-HDR: KVM Steal time implementation

2011-02-11 18:21:45

[permalink] [raw]

Subject: [PATCH v3 2/6] KVM-HV: KVM Steal time implementation

To implement steal time, we need the hypervisor to pass the guest information
about how much time was spent running other processes outside the VM.
This is per-vcpu, and using the kvmclock structure for that is an abuse
we decided not to make.

In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
holds the memory area address containing information about steal time

This patch contains the hypervisor part for it. I am keeping it separate from
the headers to facilitate backports to people who wants to backport the kernel
part but not the hypervisor, or the other way around.

Signed-off-by: Glauber Costa <[email protected]>
CC: Rik van Riel <[email protected]>
CC: Jeremy Fitzhardinge <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Avi Kivity <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 +++++
arch/x86/kvm/x86.c | 39 +++++++++++++++++++++++++++++++++++++--
2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ffd7f8d..be6e0e2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -377,6 +377,11 @@ struct kvm_vcpu_arch {
unsigned int hw_tsc_khz;
unsigned int time_offset;
struct page *time_page;
+
+ gpa_t stime;
+ struct kvm_steal_time steal;
+ u64 this_time_out;
+
u64 last_host_tsc;
u64 last_guest_tsc;
u64 last_kernel_ns;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a326cb..7f14900 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -787,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/

-#define KVM_SAVE_MSRS_BEGIN 8
+#define KVM_SAVE_MSRS_BEGIN 9
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1546,6 +1546,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
+ case MSR_KVM_STEAL_TIME:
+
+ if (!(data & 1)) {
+ vcpu->arch.stime = 0;
+ break;
+ }
+
+ vcpu->arch.stime = data & ~1;
+ break;
+
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1831,6 +1841,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_ASYNC_PF_EN:
data = vcpu->arch.apf.msr_val;
break;
+ case MSR_KVM_STEAL_TIME:
+ data = vcpu->arch.stime;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1993,6 +2006,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_STEAL_TIME:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2108,6 +2122,9 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_steal_time *st;
+ st = (struct kvm_steal_time *)vcpu->arch.stime;
+
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
if (kvm_x86_ops->has_wbinvd_exit())
@@ -2133,6 +2150,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
}
+
+ if (vcpu->arch.this_time_out) {
+ u64 to = (get_kernel_ns() - vcpu->arch.this_time_out);
+
+ kvm_read_guest(vcpu->kvm, (gpa_t)st, &vcpu->arch.steal,
+ sizeof(*st));
+
+ vcpu->arch.steal.steal += to;
+ vcpu->arch.steal.version += 2;
+
+ kvm_write_guest(vcpu->kvm, (gpa_t)st, &vcpu->arch.steal,
+ sizeof(*st));
+ /* is it possible to have 2 loads in sequence? */
+ vcpu->arch.this_time_out = 0;
+ }
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2140,6 +2172,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = native_read_tsc();
+ vcpu->arch.this_time_out = get_kernel_ns();
}

static int is_efer_nx(void)
@@ -5882,6 +5915,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)

kvmclock_reset(vcpu);

+ vcpu->arch.stime = 0;
+
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
--
1.7.2.3

2011-02-11 18:21:40

[permalink] [raw]

Subject: [PATCH v3 5/6] KVM-GST: adjust scheduler cpu power

This is a first proposal for using steal time information
to influence the scheduler. There are a lot of optimizations
and fine grained adjustments to be done, but it is working reasonably
so far for me (mostly)

With this patch (and some host pinnings to demonstrate the situation),
two vcpus with very different steal time (Say 80 % vs 1 %) will not get
an even distribution of processes. This is a situation that can naturally
arise, specially in overcommited scenarios. Previosly, the guest scheduler
would wrongly think that all cpus have the same ability to run processes,
lowering the overall throughput.

Signed-off-by: Glauber Costa <[email protected]>
CC: Rik van Riel <[email protected]>
CC: Jeremy Fitzhardinge <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Avi Kivity <[email protected]>
---
arch/x86/Kconfig | 12 ++++++++++++
kernel/sched.c | 30 ++++++++++++++++++++----------
kernel/sched_features.h | 4 ++--
3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d5ed94d..24d07e1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -515,6 +515,18 @@ menuconfig PARAVIRT_GUEST

if PARAVIRT_GUEST

+config PARAVIRT_TIME_ACCOUNTING
+ bool "Paravirtual steal time accounting"
+ select PARAVIRT
+ default n
+ ---help---
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
source "arch/x86/xen/Kconfig"

config KVM_CLOCK
diff --git a/kernel/sched.c b/kernel/sched.c
index 60b0cf8..80fc47c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -525,6 +525,9 @@ struct rq {
#endif

u64 prev_steal_ticks;
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time;
+#endif

/* calc_load related fields */
unsigned long calc_load_update;
@@ -1900,10 +1903,13 @@ void account_system_vtime(struct task_struct *curr)
}
EXPORT_SYMBOL_GPL(account_system_vtime);

+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- s64 irq_delta;
+ s64 irq_delta = 0, steal = 0;

+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

/*
@@ -1926,20 +1932,24 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)

rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- rq->clock_task += delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING

- if (irq_delta && sched_feat(NONIRQ_POWER))
- sched_rt_avg_update(rq, irq_delta);
-}
+ steal = steal_time_clock(cpu_of(rq)) - rq->prev_steal_time;

-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+ if (steal > delta)
+ steal = delta;
+
+ rq->prev_steal_time += steal;
+
+ delta -= steal;
+#endif

-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
rq->clock_task += delta;
-}

-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+ if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ sched_rt_avg_update(rq, irq_delta + steal);
+}

#include "sched_idletask.c"
#include "sched_fair.c"
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69ac..194fc6d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,6 +61,6 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(OWNER_SPIN, 1)

/*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
*/
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
--
1.7.2.3

2011-02-11 19:04:42

[permalink] [raw]

Subject: Re: [PATCH v3 5/6] KVM-GST: adjust scheduler cpu power

On Fri, 2011-02-11 at 13:19 -0500, Glauber Costa wrote:

> static void update_rq_clock_task(struct rq *rq, s64 delta)
> {
> + s64 irq_delta = 0, steal = 0;
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
>
> /*
> @@ -1926,20 +1932,24 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
> rq->prev_irq_time += irq_delta;
> delta -= irq_delta;
> +#endif
> +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
>
> + steal = steal_time_clock(cpu_of(rq)) - rq->prev_steal_time;
>
> + if (steal > delta)
> + steal = delta;
> +
> + rq->prev_steal_time += steal;
> +
> + delta -= steal;
> +#endif
>
> rq->clock_task += delta;
>
> + if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
> + sched_rt_avg_update(rq, irq_delta + steal);
> +}

I think we should make both these conditional, like:

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
if (sched_clock_irqtime) {
/* ... magic ... */
}
#endif

#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (sched_paravirt_time) {
/* ... magic ... */
}
#endif

Once the jump-label stuff gets a bit better we could use the if
(static_branch()) magic to avoid pretty much all cost.

2011-02-11 19:04:45

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On Fri, 2011-02-11 at 13:19 -0500, Glauber Costa wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d747f94..5dbf509 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -302,6 +302,7 @@ long io_schedule_timeout(long timeout);
> extern void cpu_init (void);
> extern void trap_init(void);
> extern void update_process_times(int user);
> +extern u64 (*hypervisor_steal_time)(int cpu);
> extern void scheduler_tick(void);

That's quite terrible..

> extern void sched_show_task(struct task_struct *p);
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 18d38e4..60b0cf8 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -524,6 +524,8 @@ struct rq {
> u64 prev_irq_time;
> #endif
>
> + u64 prev_steal_ticks;
> +
> /* calc_load related fields */
> unsigned long calc_load_update;
> long calc_load_active;
> @@ -1780,6 +1782,16 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
> dec_nr_running(rq);
> }
>
> +u64 (*hypervisor_steal_time)(int cpu) = NULL;

I don't think exposing functions pointers like that is very nice at all.

> +static u64 steal_time_clock(int cpu)
> +{
> + if (!hypervisor_steal_time)
> + return 0;
> +
> + return hypervisor_steal_time(cpu);
> +}

This really wants to be under some PARAVIRT config thing, preferably the
same as the other bits (PARAVIRT_TIME_ACCOUNTING).

Also, it would be nice to avoid the function call and conditional on
native hardware, this is on all scheduler hot paths, best it to make
sure you don't even get so far as to call this function on native
hardware.

> #ifdef CONFIG_IRQ_TIME_ACCOUNTING
>
> /*
> @@ -3509,6 +3521,33 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
> }
>
> /*
> + * We have to at flush steal time information every time something else
> + * is accounted. Since the accounting functions are all visible to the rest
> + * of the kernel, it gets tricky to do them in one place. This helper function
> + * helps us.
> + *
> + * When the system is idle, the concept of steal time does not apply. We just
> + * tell the underlying hypervisor that we grabbed the data, but skip steal time
> + * accounting
> + */
> +static int touch_steal_time(int is_idle)
> +{
> + u64 steal, st;
> +
> + steal = steal_time_clock(smp_processor_id());
> +
> + st = steal / TICK_NSEC - this_rq()->prev_steal_ticks;

(this won't compile on 32bits)

> + this_rq()->prev_steal_ticks += st;

This doesn't seem right..

struct rq *rq = this_rq();
u64 steal, delta;
int ticks = 0;

steal = steal_time_clock(smp_processor_id());
delta = rq->prev_steal_ticks;
while (delta >= TICK_NSEC) {
ticks++;
rq->prev_steal_time += TICK_NSEC;
}

if (!ticks)
return 0;

account_steal_time(ticks);
return 1;

would be much more accurate.

> + if (!st || is_idle)
> + return 0;
> +
> + account_steal_time(st);
> + return 1;
> +}

This also wants to be much much cheaper for native hardware even if you
were silly enough to enable the paravirt config ;-) ie. bail out of this
function at the start instead of going through the motions when we know
the clock doesn't count.

2011-02-12 23:47:15

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On Fri, 2011-02-11 at 20:05 +0100, Peter Zijlstra wrote:
> On Fri, 2011-02-11 at 13:19 -0500, Glauber Costa wrote:
>
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d747f94..5dbf509 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -302,6 +302,7 @@ long io_schedule_timeout(long timeout);
> > extern void cpu_init (void);
> > extern void trap_init(void);
> > extern void update_process_times(int user);
> > +extern u64 (*hypervisor_steal_time)(int cpu);
> > extern void scheduler_tick(void);
>
> That's quite terrible..
>
> > extern void sched_show_task(struct task_struct *p);
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index 18d38e4..60b0cf8 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -524,6 +524,8 @@ struct rq {
> > u64 prev_irq_time;
> > #endif
> >
> > + u64 prev_steal_ticks;
> > +
> > /* calc_load related fields */
> > unsigned long calc_load_update;
> > long calc_load_active;
> > @@ -1780,6 +1782,16 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
> > dec_nr_running(rq);
> > }
> >
> > +u64 (*hypervisor_steal_time)(int cpu) = NULL;
>
> I don't think exposing functions pointers like that is very nice at all.
>
> > +static u64 steal_time_clock(int cpu)
> > +{
> > + if (!hypervisor_steal_time)
> > + return 0;
> > +
> > + return hypervisor_steal_time(cpu);
> > +}
>
> This really wants to be under some PARAVIRT config thing, preferably the
> same as the other bits (PARAVIRT_TIME_ACCOUNTING).

Agree on the function pointer, but (at first) disagree on the
conditional. The mere accounting of steal time is in principle
independent on its use to adjust CPU power. So we potentially may want
the first, but avoid the later.

I can wrap the accounting on a CONFIG switch, but I don't honestly see a
reason for it.

> Also, it would be nice to avoid the function call and conditional on
> native hardware, this is on all scheduler hot paths, best it to make
> sure you don't even get so far as to call this function on native
> hardware.

Fair.

> > #ifdef CONFIG_IRQ_TIME_ACCOUNTING
> >
> > /*
> > @@ -3509,6 +3521,33 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
> > }
> >
> > /*
> > + * We have to at flush steal time information every time something else
> > + * is accounted. Since the accounting functions are all visible to the rest
> > + * of the kernel, it gets tricky to do them in one place. This helper function
> > + * helps us.
> > + *
> > + * When the system is idle, the concept of steal time does not apply. We just
> > + * tell the underlying hypervisor that we grabbed the data, but skip steal time
> > + * accounting
> > + */
> > +static int touch_steal_time(int is_idle)
> > +{
> > + u64 steal, st;
> > +
> > + steal = steal_time_clock(smp_processor_id());
> > +
> > + st = steal / TICK_NSEC - this_rq()->prev_steal_ticks;
>
> (this won't compile on 32bits)
>
> > + this_rq()->prev_steal_ticks += st;
>
> This doesn't seem right..
>
> struct rq *rq = this_rq();
> u64 steal, delta;
> int ticks = 0;
>
> steal = steal_time_clock(smp_processor_id());
> delta = rq->prev_steal_ticks;
> while (delta >= TICK_NSEC) {
> ticks++;
> rq->prev_steal_time += TICK_NSEC;
> }
>
> if (!ticks)
> return 0;
>
> account_steal_time(ticks);
> return 1;
>
> would be much more accurate.
>
> > + if (!st || is_idle)
> > + return 0;
> > +
> > + account_steal_time(st);
> > + return 1;
> > +}
>
> This also wants to be much much cheaper for native hardware even if you
> were silly enough to enable the paravirt config ;-) ie. bail out of this
> function at the start instead of going through the motions when we know
> the clock doesn't count.

Okay, I will update it.

>
>

2011-02-15 14:41:32

[permalink] [raw]

Subject: Re: [PATCH v3 4/6] KVM-GST: KVM Steal time registration

On 02/11/2011 08:19 PM, Glauber Costa wrote:
> Register steal time within KVM. Everytime we sample the steal time
> information, we update a local variable that tells what was the
> last time read. We then account the difference.
>
>
>
> +static int kvm_register_steal_time(void)
> +{
> + int cpu = smp_processor_id();
> + int low, high, ret;
> + struct kvm_steal_time *st =&per_cpu(steal_time, cpu);
> +
> + if (!hypervisor_steal_time)
> + return 0;

You're using hypervisor_steal_time to communicate the existence of the
feature. Yuck.

> +
> + memset(st, 0, sizeof(*st));
> +
> + low = (int)__pa(st) | 1;

Named constant.

> + high = ((u64)__pa(st)>> 32);
> + ret = wrmsr_safe(MSR_KVM_STEAL_TIME, low, high);

No need for wrmsr_safe() since you're checking the cpuid bit. The other
APIs are nicer (no need to break into two words).

> + printk(KERN_INFO "kvm-stealtime: cpu %d, msr %x:%x\n",
> + cpu, high, low);
> + return ret;
> +}
> +
> #ifdef CONFIG_SMP
> static void __init kvm_smp_prepare_boot_cpu(void)
> {
> #ifdef CONFIG_KVM_CLOCK
> WARN_ON(kvm_register_clock("primary cpu clock"));
> #endif
> + WARN_ON(kvm_register_steal_time());
> kvm_guest_cpu_init();
> native_smp_prepare_boot_cpu();
> }
>
> static void __cpuinit kvm_guest_cpu_online(void *dummy)
> {
> + WARN_ON(kvm_register_steal_time());
> kvm_guest_cpu_init();
> }
>
> static void kvm_guest_cpu_offline(void *dummy)
> {
> kvm_pv_disable_apf(NULL);
> + wrmsr(MSR_KVM_STEAL_TIME, 0, 0);

This will trap if running on a hypervisor without this MSR.

> apf_task_wake_all();
> }
>
> @@ -534,6 +557,23 @@ static void __init kvm_apf_trap_init(void)
> set_intr_gate(14,&async_page_fault);
> }
>
> +static u64 kvm_account_steal_time(int cpu)
> +{
> + u64 steal;
> + struct kvm_steal_time *src;
> + int version;
> +
> + src =&per_cpu(steal_time, cpu);
> + do {
> + version = src->version;
> + rmb();
> + steal = src->steal;
> + rmb();
> + } while ((src->version& 1) || (version != src->version));

Check version & 1 instead of src->version & 1, slightly cheaper.

> +
> + return steal;
> +}
> +
>
> index f98d3ea..dcb6a67 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -164,6 +164,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
> static void kvm_crash_shutdown(struct pt_regs *regs)
> {
> native_write_msr(msr_kvm_system_time, 0, 0);
> + wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
> native_machine_crash_shutdown(regs);
> }

Make conditional. Needed for the previous wrmsr as well.

> #endif
> @@ -171,6 +172,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
> static void kvm_shutdown(void)
> {
> native_write_msr(msr_kvm_system_time, 0, 0);
> + wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
> native_machine_shutdown();
> }
>

Ditto. Is it me, or is the code duplicated?

--
error compiling committee.c: too many arguments to function

2011-02-15 14:38:33

[permalink] [raw]

Subject: Re: [PATCH v3 2/6] KVM-HV: KVM Steal time implementation

On 02/11/2011 08:19 PM, Glauber Costa wrote:
> To implement steal time, we need the hypervisor to pass the guest information
> about how much time was spent running other processes outside the VM.
> This is per-vcpu, and using the kvmclock structure for that is an abuse
> we decided not to make.
>
> In this patchset, I am introducing a new msr, KVM_MSR_STEAL_TIME, that
> holds the memory area address containing information about steal time
>
> This patch contains the hypervisor part for it. I am keeping it separate from
> the headers to facilitate backports to people who wants to backport the kernel
> part but not the hypervisor, or the other way around.
>
>
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ffd7f8d..be6e0e2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -377,6 +377,11 @@ struct kvm_vcpu_arch {
> unsigned int hw_tsc_khz;
> unsigned int time_offset;
> struct page *time_page;
> +
> + gpa_t stime;
> + struct kvm_steal_time steal;
> + u64 this_time_out;
> +

Please put in a small sub-structure (or rename stime to something
meaningful).

> @@ -1546,6 +1546,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> if (kvm_pv_enable_async_pf(vcpu, data))
> return 1;
> break;
> + case MSR_KVM_STEAL_TIME:
> +
> + if (!(data& 1)) {

Named constant.

> + vcpu->arch.stime = 0;
> + break;
> + }
> +
> + vcpu->arch.stime = data& ~1;

I asked for 64-byte aligned structure, yes? Need to fault if bits 1-5
are set (to make sure the guest doesn't use an unadvertised feature and
break itself in the future).

We might also want to fault if the cpuid bit isn't present. We haven't
done so in the past but it makes some sense.

> + break;
> +
> case MSR_IA32_MCG_CTL:
> case MSR_IA32_MCG_STATUS:
> case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
> @@ -1831,6 +1841,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
> case MSR_KVM_ASYNC_PF_EN:
> data = vcpu->arch.apf.msr_val;
> break;
> + case MSR_KVM_STEAL_TIME:
> + data = vcpu->arch.stime;
> + break;

You are returning something other than the guest has written.

> case MSR_IA32_P5_MC_ADDR:
> case MSR_IA32_P5_MC_TYPE:
> case MSR_IA32_MCG_CAP:
>

> @@ -2108,6 +2122,9 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
>
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> + struct kvm_steal_time *st;
> + st = (struct kvm_steal_time *)vcpu->arch.stime;

You are converting a guest physical address into a host virtual
address? Truncating it in the process.

> +
> /* Address WBINVD may be executed by guest */
> if (need_emulate_wbinvd(vcpu)) {
> if (kvm_x86_ops->has_wbinvd_exit())
> @@ -2133,6 +2150,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> kvm_migrate_timers(vcpu);
> vcpu->cpu = cpu;
> }
> +
> + if (vcpu->arch.this_time_out) {
> + u64 to = (get_kernel_ns() - vcpu->arch.this_time_out);
> +
> + kvm_read_guest(vcpu->kvm, (gpa_t)st,&vcpu->arch.steal,
> + sizeof(*st));

Now you are converting it back. Also, you aren't checking the error
return from kvm_read_guest().

> +
> + vcpu->arch.steal.steal += to;
> + vcpu->arch.steal.version += 2;
> +
> + kvm_write_guest(vcpu->kvm, (gpa_t)st,&vcpu->arch.steal,
> + sizeof(*st));

Error check.

> + /* is it possible to have 2 loads in sequence? */

No.

> + vcpu->arch.this_time_out = 0;
> + }
> }

kvm_arch_vcpu_put() is also executed when we return to userspace. Do we
want to account that as steal time?

Do we want to execute this code even if steal time isn't enabled?

Please put this into a separate function.

--
error compiling committee.c: too many arguments to function

2011-02-15 14:46:04

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On Tue, 2011-02-15 at 16:35 +0200, Avi Kivity wrote:
> On 02/11/2011 08:19 PM, Glauber Costa wrote:
> > This patch accounts steal time time in kernel/sched.
> > I kept it from last proposal, because I still see advantages
> > in it: Doing it here will give us easier access from scheduler
> > variables such as the cpu rq. The next patch shows an example of
> > usage for it.
> >
> > Since functions like account_idle_time() can be called from
> > multiple places, not only account_process_tick(), steal time
> > grabbing is repeated in each account function separatedely.
> >
>
> I still don't see how we export this to userspace for top(1) and friends.
>

The existing steal time stuff is:

kernel/sched.c:account_steal_time()

cpustat->steal = cputime64_add(cpustat->steal, cputime64);

and

fs/proc/stat.c:show_stat()

steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);

seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
"%llu\n",
(unsigned long long)cputime64_to_clock_t(user),
(unsigned long long)cputime64_to_clock_t(nice),
(unsigned long long)cputime64_to_clock_t(system),
(unsigned long long)cputime64_to_clock_t(idle),
(unsigned long long)cputime64_to_clock_t(iowait),
(unsigned long long)cputime64_to_clock_t(irq),
(unsigned long long)cputime64_to_clock_t(softirq),
(unsigned long long)cputime64_to_clock_t(steal),
(unsigned long long)cputime64_to_clock_t(guest),
(unsigned long long)cputime64_to_clock_t(guest_nice));

etc..

Glauber's patch is making sure we call account_steal_time(), which is
currently only called from arch code like:

# git grep account_steal_
arch/ia64/xen/time.c: account_steal_ticks(stolen);
arch/powerpc/kernel/time.c: account_steal_time(stolen);
arch/s390/kernel/vtime.c: account_steal_time(steal);
arch/x86/xen/time.c: account_steal_ticks(ticks);
include/linux/kernel_stat.h:extern void account_steal_time(cputime_t);
include/linux/kernel_stat.h:extern void account_steal_ticks(unsigned long ticks);
kernel/sched.c:void account_steal_time(cputime_t cputime)
kernel/sched.c:void account_steal_ticks(unsigned long ticks)
kernel/sched.c: account_steal_time(jiffies_to_cputime(ticks));

2011-02-15 14:28:21

[permalink] [raw]

Subject: Re: [PATCH v3 1/6] KVM-HDR: KVM Steal time implementation

2011-02-15 15:02:05

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On 02/11/2011 08:19 PM, Glauber Costa wrote:
> This patch accounts steal time time in kernel/sched.
> I kept it from last proposal, because I still see advantages
> in it: Doing it here will give us easier access from scheduler
> variables such as the cpu rq. The next patch shows an example of
> usage for it.
>
> Since functions like account_idle_time() can be called from
> multiple places, not only account_process_tick(), steal time
> grabbing is repeated in each account function separatedely.
>

I still don't see how we export this to userspace for top(1) and friends.

--
error compiling committee.c: too many arguments to function

2011-02-15 15:17:26

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On 02/15/2011 04:45 PM, Peter Zijlstra wrote:
> On Tue, 2011-02-15 at 16:35 +0200, Avi Kivity wrote:
>> On 02/11/2011 08:19 PM, Glauber Costa wrote:
>>> This patch accounts steal time time in kernel/sched.
>>> I kept it from last proposal, because I still see advantages
>>> in it: Doing it here will give us easier access from scheduler
>>> variables such as the cpu rq. The next patch shows an example of
>>> usage for it.
>>>
>>> Since functions like account_idle_time() can be called from
>>> multiple places, not only account_process_tick(), steal time
>>> grabbing is repeated in each account function separatedely.
>>>
>> I still don't see how we export this to userspace for top(1) and friends.
>>
> The existing steal time stuff is:
>
> kernel/sched.c:account_steal_time()
>
> cpustat->steal = cputime64_add(cpustat->steal, cputime64);
>
> and
>
> fs/proc/stat.c:show_stat()
>
>
> steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
>
> seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
> "%llu\n",
> (unsigned long long)cputime64_to_clock_t(user),
> (unsigned long long)cputime64_to_clock_t(nice),
> (unsigned long long)cputime64_to_clock_t(system),
> (unsigned long long)cputime64_to_clock_t(idle),
> (unsigned long long)cputime64_to_clock_t(iowait),
> (unsigned long long)cputime64_to_clock_t(irq),
> (unsigned long long)cputime64_to_clock_t(softirq),
> (unsigned long long)cputime64_to_clock_t(steal),
> (unsigned long long)cputime64_to_clock_t(guest),
> (unsigned long long)cputime64_to_clock_t(guest_nice));
>
>

Ah, so we're all set. Do you know if any user tools process this
information?

2011-02-15 15:22:51

by Rik van Riel

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On 02/15/2011 10:17 AM, Avi Kivity wrote:

> Ah, so we're all set. Do you know if any user tools process this
> information?

Top and vmstat have been displaying steal time for
maybe 4 or 5 years now.

2011-02-15 15:26:27

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On 02/15/2011 05:24 PM, Rik van Riel wrote:
> On 02/15/2011 10:17 AM, Avi Kivity wrote:
>
>> Ah, so we're all set. Do you know if any user tools process this
>> information?
>
> Top and vmstat have been displaying steal time for
> maybe 4 or 5 years now.

Excellent, thanks.

2011-02-15 15:27:17

[permalink] [raw]

Subject: Re: [PATCH v3 3/6] KVM-GST: KVM Steal time accounting

On Tue, 2011-02-15 at 17:17 +0200, Avi Kivity wrote:
>
> Ah, so we're all set. Do you know if any user tools process this
> information?

I suppose there are, I bet Jeremy knows, Xen after all supports this
stuff ;-)

2011-02-15 15:48:55