Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h 2007-09-10 14:56:52.000000000 +0200
+++ linux-2.6/drivers/kvm/kvm.h 2007-09-10 15:08:42.000000000 +0200
@@ -625,6 +625,19 @@ void kvm_mmu_unload(struct kvm_vcpu *vcp
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+#ifndef PF_VCPU
+#define PF_VCPU 0
+#endif
+
+static inline void kvm_guest_enter(void)
+{
+ current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+}
+
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c 2007-09-10 14:56:52.000000000 +0200
+++ linux-2.6/drivers/kvm/svm.c 2007-09-10 15:08:42.000000000 +0200
@@ -1494,6 +1494,7 @@ again:
clgi();
vcpu->guest_mode = 1;
+ kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
svm_flush_tlb(vcpu);
@@ -1629,6 +1630,7 @@ again:
#endif
: "cc", "memory" );
+ kvm_guest_exit();
vcpu->guest_mode = 0;
if (vcpu->fpu_active) {
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c 2007-09-10 14:56:52.000000000 +0200
+++ linux-2.6/drivers/kvm/vmx.c 2007-09-10 15:08:42.000000000 +0200
@@ -2018,6 +2018,7 @@ again:
local_irq_disable();
vcpu->guest_mode = 1;
+ kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
vmx_flush_tlb(vcpu);
@@ -2138,6 +2139,7 @@ again:
[cr2]"i"(offsetof(struct kvm_vcpu, cr2))
: "cc", "memory" );
+ kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
* Laurent Vivier <[email protected]> wrote:
> [PATCH 4/4] Modify KVM to update guest time accounting.
FYI, KVM abstracted its guest-mode entry code recently so this did not
apply - find below the reworked patch.
Ingo
------------------->
Subject: sched: guest CPU accounting: maintain guest state in KVM
From: Laurent Vivier <[email protected]>
Modify KVM to update guest time accounting.
[ [email protected]: ported to 2.6.24 KVM. ]
Signed-off-by: Laurent Vivier <[email protected]>
Acked-by: Avi Kivity <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
drivers/kvm/kvm.h | 16 ++++++++++++++++
drivers/kvm/kvm_main.c | 2 ++
2 files changed, 18 insertions(+)
Index: linux/drivers/kvm/kvm.h
===================================================================
--- linux.orig/drivers/kvm/kvm.h
+++ linux/drivers/kvm/kvm.h
@@ -624,6 +624,22 @@ void kvm_mmu_unload(struct kvm_vcpu *vcp
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+/*
+ * Compatibility define - PF_VCPU is only available in v2.6.24+ kernels:
+ */
+#ifndef PF_VCPU
+# define PF_VCPU 0
+#endif
+
+static inline void kvm_guest_enter(void)
+{
+ current->flags |= PF_VCPU;
+}
+
+static inline void kvm_guest_exit(void)
+{
+}
+
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
Index: linux/drivers/kvm/kvm_main.c
===================================================================
--- linux.orig/drivers/kvm/kvm_main.c
+++ linux/drivers/kvm/kvm_main.c
@@ -2046,6 +2046,7 @@ again:
kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
vcpu->guest_mode = 1;
+ kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
@@ -2053,6 +2054,7 @@ again:
kvm_x86_ops->run(vcpu, kvm_run);
+ kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
Ingo Molnar wrote:
> int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
>
> +/*
> + * Compatibility define - PF_VCPU is only available in v2.6.24+ kernels:
> + */
> +#ifndef PF_VCPU
> +# define PF_VCPU 0
> +#endif
> +
>
This bit can go; for the external module I can add it back in
external-module-compat.h. No need to pollute mainline with backward
compatibility stuff.
--
error compiling committee.c: too many arguments to function
* Avi Kivity <[email protected]> wrote:
> Ingo Molnar wrote:
> > int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
> >
> >+/*
> >+ * Compatibility define - PF_VCPU is only available in v2.6.24+ kernels:
> >+ */
> >+#ifndef PF_VCPU
> >+# define PF_VCPU 0
> >+#endif
> >+
> >
>
> This bit can go; for the external module I can add it back in
> external-module-compat.h. No need to pollute mainline with backward
> compatibility stuff.
thx - zapped.
Ingo
* Avi Kivity <[email protected]> wrote:
> This bit can go; for the external module I can add it back in
> external-module-compat.h. No need to pollute mainline with backward
> compatibility stuff.
hm:
static inline void kvm_guest_enter(void)
{
current->flags |= PF_VCPU;
}
static inline void kvm_guest_exit(void)
{
}
shouldnt PF_VCPU be cleared in kvm_guest_exit()?
Ingo
Ingo Molnar wrote:
> * Avi Kivity <[email protected]> wrote:
>
>
>> This bit can go; for the external module I can add it back in
>> external-module-compat.h. No need to pollute mainline with backward
>> compatibility stuff.
>>
>
> hm:
>
> static inline void kvm_guest_enter(void)
> {
> current->flags |= PF_VCPU;
> }
>
> static inline void kvm_guest_exit(void)
> {
> }
>
> shouldnt PF_VCPU be cleared in kvm_guest_exit()?
>
IIRC the accounting code clears it, but yes, it may not have been called
at all, so clearing it here is needed.
--
error compiling committee.c: too many arguments to function
Avi Kivity wrote:
> Ingo Molnar wrote:
>> * Avi Kivity <[email protected]> wrote:
>>
>>
>>> This bit can go; for the external module I can add it back in
>>> external-module-compat.h. No need to pollute mainline with backward
>>> compatibility stuff.
>>>
>>
>> hm:
>>
>> static inline void kvm_guest_enter(void)
>> {
>> current->flags |= PF_VCPU;
>> }
>>
>> static inline void kvm_guest_exit(void)
>> {
>> }
>>
>> shouldnt PF_VCPU be cleared in kvm_guest_exit()?
>>
>
> IIRC the accounting code clears it, but yes, it may not have been called
> at all, so clearing it here is needed.
>
No, It must not be cleared here because we can't enter in the accounting code
between kvm_guest_enter(void) and kvm_guest_exit(void).
This is why the accounting code clears it.
I put a kvm_guest_exit() only for symmetry.
Laurent
--
---------------- [email protected] -----------------
"Given enough eyeballs, all bugs are shallow" E. S. Raymond
Laurent Vivier wrote:
> Avi Kivity wrote:
>
>> Ingo Molnar wrote:
>>
>>> * Avi Kivity <[email protected]> wrote:
>>>
>>>
>>>
>>>> This bit can go; for the external module I can add it back in
>>>> external-module-compat.h. No need to pollute mainline with backward
>>>> compatibility stuff.
>>>>
>>>>
>>> hm:
>>>
>>> static inline void kvm_guest_enter(void)
>>> {
>>> current->flags |= PF_VCPU;
>>> }
>>>
>>> static inline void kvm_guest_exit(void)
>>> {
>>> }
>>>
>>> shouldnt PF_VCPU be cleared in kvm_guest_exit()?
>>>
>>>
>> IIRC the accounting code clears it, but yes, it may not have been called
>> at all, so clearing it here is needed.
>>
>>
>
> No, It must not be cleared here because we can't enter in the accounting code
> between kvm_guest_enter(void) and kvm_guest_exit(void).
>
>
Right.
> This is why the accounting code clears it.
>
But if we didn't get an interrupt in that time?
We can clear it a bit later, after local_irq_enable() in __vcpu_run().
However we need a nop instruction first because "sti" keeps interrupts
disabled for one more instruction.
--
error compiling committee.c: too many arguments to function
forgot to CC everybody besides Laurent:
Am Montag, 15. Oktober 2007 schrieben Sie:
> No, It must not be cleared here because we can't enter in the accounting code
> between kvm_guest_enter(void) and kvm_guest_exit(void).
>
> This is why the accounting code clears it.
>
> I put a kvm_guest_exit() only for symmetry.
Why cant we enter the accounting code?
Dont know about x86, but on s390 we can get host interrupts and reschedules
even when we run a guest (if preemption is on).
If the timer tick happens ?while the guest is running, we will run the
accounting code on x86 as well, no?
Christian
--
IBM Deutschland Entwicklung GmbH
Vorsitzender des Aufsichtsrats: Martin Jetter
Gesch?ftsf?hrung: Herbert Kircher
Sitz der Gesellschaft: B?blingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Am Montag, 15. Oktober 2007 schrieb Avi Kivity:
> We can clear it a bit later, after local_irq_enable() in __vcpu_run().
> However we need a nop instruction first because "sti" keeps interrupts
> disabled for one more instruction.
Ah, I see. The host interrupt behaves different and instead of running the
interrupt handler, it exits the vmrun on x86? The interrupt handler will be
called some cycles after the sti?
That is different to s390. We can run the guest code for a long time and the
host instruction pointer stays on the sie instruction. That means, we can
interrupt sie and continue by simply setting the instrution pointer (PSW)
back to the sie instruction.
Any idea how to make this proper on all architectures? I will have a look.
Christian
Avi Kivity wrote:
> Laurent Vivier wrote:
>> Avi Kivity wrote:
>>
>>> Ingo Molnar wrote:
>>>
>>>> * Avi Kivity <[email protected]> wrote:
>>>>
>>>>
>>>>
>>>>> This bit can go; for the external module I can add it back in
>>>>> external-module-compat.h. No need to pollute mainline with backward
>>>>> compatibility stuff.
>>>>>
>>>> hm:
>>>>
>>>> static inline void kvm_guest_enter(void)
>>>> {
>>>> current->flags |= PF_VCPU;
>>>> }
>>>>
>>>> static inline void kvm_guest_exit(void)
>>>> {
>>>> }
>>>>
>>>> shouldnt PF_VCPU be cleared in kvm_guest_exit()?
>>>>
>>> IIRC the accounting code clears it, but yes, it may not have been called
>>> at all, so clearing it here is needed.
>>>
>>>
>>
>> No, It must not be cleared here because we can't enter in the
>> accounting code
>> between kvm_guest_enter(void) and kvm_guest_exit(void).
>>
>>
>
> Right.
>
>> This is why the accounting code clears it.
>>
>
> But if we didn't get an interrupt in that time?
>
> We can clear it a bit later, after local_irq_enable() in __vcpu_run().
> However we need a nop instruction first because "sti" keeps interrupts
> disabled for one more instruction.
IMHO, I think it is better to let kvm_guest_exit() empty (you can remove it, if
you want):
1st case:
- unset PF_VCPU in kvm_guest_exit(), all the tick is always for system time.
Guest time is always 0.
1st case and half:
- like 1st case but we move kvm_guest_exit() as you propose and the reason of
the interrupt is the tick interrupt. The tick is for guest time only. I think
the probability is very low.
2nd case:
- don't unset PF_VCPU in kvm_guest_exit(), all the tick is for guest time.
I proposed a patch allowing to be more accurate, but it introduces more
complexity and system and user time accounting are not very accurate too (the
tick if for system if it appears whereas we are in system, for user if it
appears whereas we are in user).
Laurent
--
---------------- [email protected] -----------------
"Given enough eyeballs, all bugs are shallow" E. S. Raymond
Christian Borntraeger wrote:
> Am Montag, 15. Oktober 2007 schrieb Avi Kivity:
>> We can clear it a bit later, after local_irq_enable() in __vcpu_run().
>> However we need a nop instruction first because "sti" keeps interrupts
>> disabled for one more instruction.
>
> Ah, I see. The host interrupt behaves different and instead of running the
> interrupt handler, it exits the vmrun on x86? The interrupt handler will be
> called some cycles after the sti?
> That is different to s390. We can run the guest code for a long time and the
> host instruction pointer stays on the sie instruction. That means, we can
> interrupt sie and continue by simply setting the instrution pointer (PSW)
> back to the sie instruction.
>
> Any idea how to make this proper on all architectures? I will have a look.
I think the solution is to have an arch dependent kvm_guest_exit(): empty for
x86, clearing the bit for s390.
Regards,
Laurent
--
---------------- [email protected] -----------------
"Given enough eyeballs, all bugs are shallow" E. S. Raymond
Laurent Vivier wrote:
>> But if we didn't get an interrupt in that time?
>>
>> We can clear it a bit later, after local_irq_enable() in __vcpu_run().
>> However we need a nop instruction first because "sti" keeps interrupts
>> disabled for one more instruction.
>>
>
> IMHO, I think it is better to let kvm_guest_exit() empty (you can remove it, if
> you want):
>
> 1st case:
> - unset PF_VCPU in kvm_guest_exit(), all the tick is always for system time.
> Guest time is always 0.
>
> 1st case and half:
>
> - like 1st case but we move kvm_guest_exit() as you propose and the reason of
> the interrupt is the tick interrupt. The tick is for guest time only. I think
> the probability is very low.
>
If the guest is executing for 10% of the time, the probability is
exactly 10%, no?
> 2nd case:
> - don't unset PF_VCPU in kvm_guest_exit(), all the tick is for guest time.
>
But then even execution in ->handle_exit() is accounted as guest time,
which is wrong.
--
error compiling committee.c: too many arguments to function
Avi Kivity wrote:
> Laurent Vivier wrote:
>
>
>
>>> But if we didn't get an interrupt in that time?
>>>
>>> We can clear it a bit later, after local_irq_enable() in
>>> __vcpu_run(). However we need a nop instruction first because "sti"
>>> keeps interrupts
>>> disabled for one more instruction.
>>>
>>
>> IMHO, I think it is better to let kvm_guest_exit() empty (you can
>> remove it, if
>> you want):
>>
>> 1st case:
>> - unset PF_VCPU in kvm_guest_exit(), all the tick is always for system
>> time.
>> Guest time is always 0.
>>
>> 1st case and half:
>>
>> - like 1st case but we move kvm_guest_exit() as you propose and the
>> reason of
>> the interrupt is the tick interrupt. The tick is for guest time only.
>> I think
>> the probability is very low.
>>
>
> If the guest is executing for 10% of the time, the probability is
> exactly 10%, no?
I think you know that better than me.
But is there homogeneity in probability ?
I mean, if the guest has a lot I/O, it is interrupted by them and the
probability to be interrupted by a tick is lower than the time passed in the VCPU ?
>> 2nd case:
>> - don't unset PF_VCPU in kvm_guest_exit(), all the tick is for guest
>> time.
>>
>
> But then even execution in ->handle_exit() is accounted as guest time,
> which is wrong.
System time and User time are wrong too as the tick is accounted to the side
where it appears, even if CPU has executed code from the other side in a
sub-part of the tick. It's not a good argument.
Laurent
--
---------------- [email protected] -----------------
"Given enough eyeballs, all bugs are shallow" E. S. Raymond
Am Montag, 15. Oktober 2007 schrieb Laurent Vivier:
> > Any idea how to make this proper on all architectures? I will have a look.
>
> I think the solution is to have an arch dependent kvm_guest_exit(): empty for
> x86, clearing the bit for s390.
I think we can merge your patches, as the userspace interface seems fine. To
make the accounting work for virtual cpu accounting found on ppc and s390 we
can later add an additional patch that also deals with interruptible guest
contexts.
So something like this should work:
Signed-off-by: Christian Borntraeger <[email protected]>
---
drivers/kvm/kvm.h | 8 ++++++++
kernel/sched.c | 2 ++
2 files changed, 10 insertions(+)
Index: kvm/drivers/kvm/kvm.h
===================================================================
--- kvm.orig/drivers/kvm/kvm.h
+++ kvm/drivers/kvm/kvm.h
@@ -18,6 +18,7 @@
#include <linux/kvm.h>
#include <linux/kvm_para.h>
+#include <asm/system.h>
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
@@ -675,11 +676,18 @@ __init void kvm_arch_init(void);
static inline void kvm_guest_enter(void)
{
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+ account_system_vtime(current);
+#endif
current->flags |= PF_VCPU;
}
static inline void kvm_guest_exit(void)
{
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+ account_system_vtime(current);
+ current->flags &= ~PF_VCPU;
+#endif
}
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
Index: kvm/kernel/sched.c
===================================================================
--- kvm.orig/kernel/sched.c
+++ kvm/kernel/sched.c
@@ -3312,7 +3312,9 @@ void account_system_time(struct task_str
if (p->flags & PF_VCPU) {
account_guest_time(p, cputime);
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
p->flags &= ~PF_VCPU;
+#endif
return;
}
Christian Borntraeger wrote:
> Am Montag, 15. Oktober 2007 schrieb Laurent Vivier:
>>> Any idea how to make this proper on all architectures? I will have a look.
>> I think the solution is to have an arch dependent kvm_guest_exit(): empty for
>> x86, clearing the bit for s390.
>
> I think we can merge your patches, as the userspace interface seems fine. To
> make the accounting work for virtual cpu accounting found on ppc and s390 we
> can later add an additional patch that also deals with interruptible guest
> contexts.
> So something like this should work:
>
> Signed-off-by: Christian Borntraeger <[email protected]>
>
It's fine for me.
Laurent
--
---------------- [email protected] -----------------
"Given enough eyeballs, all bugs are shallow" E. S. Raymond
Laurent Vivier wrote:
> Avi Kivity wrote:
>
>> Laurent Vivier wrote:
>>
>>
>>
>>
>>>> But if we didn't get an interrupt in that time?
>>>>
>>>> We can clear it a bit later, after local_irq_enable() in
>>>> __vcpu_run(). However we need a nop instruction first because "sti"
>>>> keeps interrupts
>>>> disabled for one more instruction.
>>>>
>>>>
>>> IMHO, I think it is better to let kvm_guest_exit() empty (you can
>>> remove it, if
>>> you want):
>>>
>>> 1st case:
>>> - unset PF_VCPU in kvm_guest_exit(), all the tick is always for system
>>> time.
>>> Guest time is always 0.
>>>
>>> 1st case and half:
>>>
>>> - like 1st case but we move kvm_guest_exit() as you propose and the
>>> reason of
>>> the interrupt is the tick interrupt. The tick is for guest time only.
>>> I think
>>> the probability is very low.
>>>
>>>
>> If the guest is executing for 10% of the time, the probability is
>> exactly 10%, no?
>>
>
> I think you know that better than me.
>
> But is there homogeneity in probability ?
>
It's exactly the same issue as with systime and usertime. The interrupt
samples the program counter at various points at a fairly low frequency
(milliseconds) while syscalls last a few dozens of microseconds.
Probability makes it average out correctly in the end.
[Ingo, what about dyntick? suppose you have just one process that calls
read() from /dev/zero repeatedly. There'd be very few (or no)
interrupts -- what happens to accounting accuracy?]
> I mean, if the guest has a lot I/O, it is interrupted by them and the
> probability to be interrupted by a tick is lower than the time passed in the VCPU ?
>
Suppose the time to service the I/O is exactly equal to the amount
running in guest mode. Then the probability of the interrupt happening
in guest mode is equal to it happening outside guest mode and you'd get
50% guest, 50% system/user, which is what you want.
>
>>> 2nd case:
>>> - don't unset PF_VCPU in kvm_guest_exit(), all the tick is for guest
>>> time.
>>>
>>>
>> But then even execution in ->handle_exit() is accounted as guest time,
>> which is wrong.
>>
>
> System time and User time are wrong too as the tick is accounted to the side
> where it appears, even if CPU has executed code from the other side in a
> sub-part of the tick. It's not a good argument.
>
It's at least consistent... the same errors for everyone, so it averages
out in the end.
--
error compiling committee.c: too many arguments to function
No more comments: I agree.
We can move the "&= ~PF_VCPU" to kvm_guest_exit() and remove it from
account_system_time(). Moreover it will simplify the code for s390...
Regards,
Laurent
Avi Kivity wrote:
> Laurent Vivier wrote:
>> Avi Kivity wrote:
>>
>>> Laurent Vivier wrote:
>>>
>>>
>>>
>>>
>>>>> But if we didn't get an interrupt in that time?
>>>>>
>>>>> We can clear it a bit later, after local_irq_enable() in
>>>>> __vcpu_run(). However we need a nop instruction first because "sti"
>>>>> keeps interrupts
>>>>> disabled for one more instruction.
>>>>>
>>>> IMHO, I think it is better to let kvm_guest_exit() empty (you can
>>>> remove it, if
>>>> you want):
>>>>
>>>> 1st case:
>>>> - unset PF_VCPU in kvm_guest_exit(), all the tick is always for system
>>>> time.
>>>> Guest time is always 0.
>>>>
>>>> 1st case and half:
>>>>
>>>> - like 1st case but we move kvm_guest_exit() as you propose and the
>>>> reason of
>>>> the interrupt is the tick interrupt. The tick is for guest time only.
>>>> I think
>>>> the probability is very low.
>>>>
>>> If the guest is executing for 10% of the time, the probability is
>>> exactly 10%, no?
>>>
>>
>> I think you know that better than me.
>>
>> But is there homogeneity in probability ?
>>
>
> It's exactly the same issue as with systime and usertime. The interrupt
> samples the program counter at various points at a fairly low frequency
> (milliseconds) while syscalls last a few dozens of microseconds.
> Probability makes it average out correctly in the end.
>
> [Ingo, what about dyntick? suppose you have just one process that calls
> read() from /dev/zero repeatedly. There'd be very few (or no)
> interrupts -- what happens to accounting accuracy?]
>
>> I mean, if the guest has a lot I/O, it is interrupted by them and the
>> probability to be interrupted by a tick is lower than the time passed
>> in the VCPU ?
>>
>
> Suppose the time to service the I/O is exactly equal to the amount
> running in guest mode. Then the probability of the interrupt happening
> in guest mode is equal to it happening outside guest mode and you'd get
> 50% guest, 50% system/user, which is what you want.
>
>
>>
>>>> 2nd case:
>>>> - don't unset PF_VCPU in kvm_guest_exit(), all the tick is for guest
>>>> time.
>>>>
>>> But then even execution in ->handle_exit() is accounted as guest time,
>>> which is wrong.
>>>
>>
>> System time and User time are wrong too as the tick is accounted to
>> the side
>> where it appears, even if CPU has executed code from the other side in a
>> sub-part of the tick. It's not a good argument.
>>
>
> It's at least consistent... the same errors for everyone, so it averages
> out in the end.
>
clear PF_VCPU in kvm_guest_exit() and move kvm_guest_exit() after
local_irq_enable().
According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if we move
it after local_irq_enable().
http://lkml.org/lkml/2007/10/15/114
To simplify s390 port, we don't clear it in account_system_time().
http://lkml.org/lkml/2007/10/15/183
Signed-off-by: Laurent Vivier <[email protected]>
---
drivers/kvm/kvm.h | 1 +
drivers/kvm/kvm_main.c | 3 ++-
kernel/sched.c | 1 -
3 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index e9dbf67..e8b4902 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -677,6 +677,7 @@ static inline void kvm_guest_enter(void)
static inline void kvm_guest_exit(void)
{
+ current->flags &= ~PF_VCPU;
}
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 87275be..a9db477 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2194,12 +2194,13 @@ again:
kvm_x86_ops->run(vcpu, kvm_run);
- kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
++vcpu->stat.exits;
+ kvm_guest_exit();
+
preempt_enable();
/*
diff --git a/kernel/sched.c b/kernel/sched.c
index b27ab3e..57fac22 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3315,7 +3315,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
#ifdef CONFIG_GUEST_ACCOUNTING
if (p->flags & PF_VCPU) {
account_guest_time(p, cputime);
- p->flags &= ~PF_VCPU;
return;
}
#endif
--
1.5.2.4
Am Mittwoch, 17. Oktober 2007 schrieb Laurent Vivier:
> clear PF_VCPU in kvm_guest_exit() and move kvm_guest_exit() after
> local_irq_enable().
Looks good. In the next days I will sent a patch for precise guest time
accounting with CONFIG_VIRT_CPU_ACCOUNTING on top of this patch.
Acked-By: Christian Borntraeger <[email protected]>
Laurent Vivier wrote:
> clear PF_VCPU in kvm_guest_exit() and move kvm_guest_exit() after
> local_irq_enable().
>
> According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if we move
> it after local_irq_enable().
>
> http://lkml.org/lkml/2007/10/15/114
>
> To simplify s390 port, we don't clear it in account_system_time().
>
> http://lkml.org/lkml/2007/10/15/183
>
> Signed-off-by: Laurent Vivier <[email protected]>
> ---
> drivers/kvm/kvm.h | 1 +
> drivers/kvm/kvm_main.c | 3 ++-
> kernel/sched.c | 1 -
> 3 files changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
> index e9dbf67..e8b4902 100644
> --- a/drivers/kvm/kvm.h
> +++ b/drivers/kvm/kvm.h
> @@ -677,6 +677,7 @@ static inline void kvm_guest_enter(void)
>
> static inline void kvm_guest_exit(void)
> {
> + current->flags &= ~PF_VCPU;
> }
>
Ingo already added this.
>
> static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
> diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
> index 87275be..a9db477 100644
> --- a/drivers/kvm/kvm_main.c
> +++ b/drivers/kvm/kvm_main.c
> @@ -2194,12 +2194,13 @@ again:
>
> kvm_x86_ops->run(vcpu, kvm_run);
>
> - kvm_guest_exit();
> vcpu->guest_mode = 0;
> local_irq_enable();
>
> ++vcpu->stat.exits;
>
>
Need a barrier() here. Otherwise the compiler may reorder
"kvm_guest_exit();" and "++vcpu->stats.exits;", making kvm_guest_exit()
right after local_irq_enable(). If it inlines kvm_guest_exit(), and
further encodes it in one instruction (both likely) we get:
sti
andl $something, (something_else)
The andl executes in interrupt shadow (that is, interrupts are still
disabled) and the timer tick won't see PF_VCPU.
> + kvm_guest_exit();
> +
> preempt_enable();
>
> /*
>
Please copy kvm-devel on kvm patches.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
Avi Kivity wrote:
> Laurent Vivier wrote:
>> clear PF_VCPU in kvm_guest_exit() and move kvm_guest_exit() after
>> local_irq_enable().
>>
>> According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if we move
>> it after local_irq_enable().
>>
>> http://lkml.org/lkml/2007/10/15/114
>>
>> To simplify s390 port, we don't clear it in account_system_time().
>>
>> http://lkml.org/lkml/2007/10/15/183
>>
>> Signed-off-by: Laurent Vivier <[email protected]>
>> ---
>> drivers/kvm/kvm.h | 1 +
>> drivers/kvm/kvm_main.c | 3 ++-
>> kernel/sched.c | 1 -
>> 3 files changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
>> index e9dbf67..e8b4902 100644
>> --- a/drivers/kvm/kvm.h
>> +++ b/drivers/kvm/kvm.h
>> @@ -677,6 +677,7 @@ static inline void kvm_guest_enter(void)
>>
>> static inline void kvm_guest_exit(void)
>> {
>> + current->flags &= ~PF_VCPU;
>> }
>>
>
>
> Ingo already added this.
>
>>
>> static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>> diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
>> index 87275be..a9db477 100644
>> --- a/drivers/kvm/kvm_main.c
>> +++ b/drivers/kvm/kvm_main.c
>> @@ -2194,12 +2194,13 @@ again:
>>
>> kvm_x86_ops->run(vcpu, kvm_run);
>>
>> - kvm_guest_exit();
>> vcpu->guest_mode = 0;
>> local_irq_enable();
>>
>> ++vcpu->stat.exits;
>>
>>
>
> Need a barrier() here. Otherwise the compiler may reorder
> "kvm_guest_exit();" and "++vcpu->stats.exits;", making kvm_guest_exit()
> right after local_irq_enable(). If it inlines kvm_guest_exit(), and
> further encodes it in one instruction (both likely) we get:
>
> sti
> andl $something, (something_else)
For the moment it is:
5676: fb sti
5677: 48 8b 85 f8 fe ff ff mov 0xfffffffffffffef8(%rbp),%rax
567e: 8b 80 10 0c 00 00 mov 0xc10(%rax),%eax
5684: 8d 50 01 lea 0x1(%rax),%edx
5687: 48 8b 85 f8 fe ff ff mov 0xfffffffffffffef8(%rbp),%rax
568e: 89 90 10 0c 00 00 mov %edx,0xc10(%rax)
5694: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax
569b: 00 00
569d: 48 89 45 a8 mov %rax,0xffffffffffffffa8(%rbp)
56a1: 48 8b 45 a8 mov 0xffffffffffffffa8(%rbp),%rax
56a5: 48 89 45 b0 mov %rax,0xffffffffffffffb0(%rbp)
56a9: 48 8b 45 b0 mov 0xffffffffffffffb0(%rbp),%rax
56ad: 48 89 c2 mov %rax,%rdx
56b0: 8b 42 14 mov 0x14(%rdx),%eax
56b3: 83 e0 ef and $0xffffffffffffffef,%eax
56b6: 89 42 14 mov %eax,0x14(%rdx)
So I think it is out of the interrupt shadow... but you're right I will resend
the patch adding barrier() (because in futur gcc can behave differently),
removing the part already added by Ingo and copying kvm-devel.
> The andl executes in interrupt shadow (that is, interrupts are still
> disabled) and the timer tick won't see PF_VCPU.
>
>> + kvm_guest_exit();
>> +
>> preempt_enable();
>>
>> /*
>>
>
>
> Please copy kvm-devel on kvm patches.
>
Laurent
--
---------------- [email protected] -----------------
"Given enough eyeballs, all bugs are shallow" E. S. Raymond
Avi,
ppc and s390 offer the possibility to track process times precisely
by looking at cpu timer on every context switch, irq, softirq etc.
We can use that infrastructure as well for guest time accounting.
We need to account the used time before we change the state.
This patch adds a call to account_system_vtime to kvm_guest_enter
and kvm_guest exit. If CONFIG_VIRT_CPU_ACCOUNTING is not set,
account_system_vtime is defined in hardirq.h as an empty function,
which means this patch does not change the behaviour on other
platforms.
I compile tested this patch on x86 and function tested the patch on
s390.
Avi, please apply.
Signed-off-by: Christian Borntraeger <[email protected]>
---
drivers/kvm/kvm.h | 3 +++
1 file changed, 3 insertions(+)
Index: kvm/drivers/kvm/kvm.h
===================================================================
--- kvm.orig/drivers/kvm/kvm.h
+++ kvm/drivers/kvm/kvm.h
@@ -7,6 +7,7 @@
*/
#include <linux/types.h>
+#include <linux/hardirq.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
@@ -669,11 +670,13 @@ __init void kvm_arch_init(void);
static inline void kvm_guest_enter(void)
{
+ account_system_vtime(current);
current->flags |= PF_VCPU;
}
static inline void kvm_guest_exit(void)
{
+ account_system_vtime(current);
current->flags &= ~PF_VCPU;
}
Christian Borntraeger wrote:
> ppc and s390 offer the possibility to track process times precisely
> by looking at cpu timer on every context switch, irq, softirq etc.
> We can use that infrastructure as well for guest time accounting.
> We need to account the used time before we change the state.
> This patch adds a call to account_system_vtime to kvm_guest_enter
> and kvm_guest exit. If CONFIG_VIRT_CPU_ACCOUNTING is not set,
> account_system_vtime is defined in hardirq.h as an empty function,
> which means this patch does not change the behaviour on other
> platforms.
>
> I compile tested this patch on x86 and function tested the patch on
> s390.
>
> Avi, please apply.
>
>
Done. Thanks!
--
error compiling committee.c: too many arguments to function
According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if we move
it after local_irq_enable().
http://lkml.org/lkml/2007/10/15/114
To simplify s390 port, we don't clear it in account_system_time().
http://lkml.org/lkml/2007/10/15/183
---
drivers/kvm/kvm_main.c | 5 ++++-
kernel/sched.c | 1 -
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 87275be..b9cd1f0 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -2194,12 +2194,15 @@ again:
kvm_x86_ops->run(vcpu, kvm_run);
- kvm_guest_exit();
vcpu->guest_mode = 0;
local_irq_enable();
++vcpu->stat.exits;
+ barrier();
+
+ kvm_guest_exit();
+
preempt_enable();
/*
diff --git a/kernel/sched.c b/kernel/sched.c
index b27ab3e..57fac22 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3315,7 +3315,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
#ifdef CONFIG_GUEST_ACCOUNTING
if (p->flags & PF_VCPU) {
account_guest_time(p, cputime);
- p->flags &= ~PF_VCPU;
return;
}
#endif
--
1.5.2.4
Laurent Vivier wrote:
> According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if we move
> it after local_irq_enable().
>
> http://lkml.org/lkml/2007/10/15/114
>
> To simplify s390 port, we don't clear it in account_system_time().
>
> http://lkml.org/lkml/2007/10/15/183
>
Applied (the kvm part), and added a fat comment on the barrier. Can you
send a signed-off-by: line?
--
error compiling committee.c: too many arguments to function
Avi Kivity a ?crit :
> Laurent Vivier wrote:
>> According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if we move
>> it after local_irq_enable().
>>
>> http://lkml.org/lkml/2007/10/15/114
>>
>> To simplify s390 port, we don't clear it in account_system_time().
>>
>> http://lkml.org/lkml/2007/10/15/183
>>
>
> Applied (the kvm part), and added a fat comment on the barrier. Can you
> send a signed-off-by: line?
>
Sorry, I missed it...
Signed-off-by: Laurent Vivier <[email protected]>
On Thu, 2007-10-18 at 14:39 +0200, Christian Borntraeger wrote:
> Avi,
>
> ppc and s390 offer the possibility to track process times precisely
> by looking at cpu timer on every context switch, irq, softirq etc.
> We can use that infrastructure as well for guest time accounting.
> We need to account the used time before we change the state.
> This patch adds a call to account_system_vtime to kvm_guest_enter
> and kvm_guest exit. If CONFIG_VIRT_CPU_ACCOUNTING is not set,
> account_system_vtime is defined in hardirq.h as an empty function,
> which means this patch does not change the behaviour on other
> platforms.
>
> I compile tested this patch on x86 and function tested the patch on
> s390.
>
> Avi, please apply.
>
>
> Signed-off-by: Christian Borntraeger <[email protected]>
>
> ---
> drivers/kvm/kvm.h | 3 +++
> 1 file changed, 3 insertions(+)
>
> Index: kvm/drivers/kvm/kvm.h
> ===================================================================
> --- kvm.orig/drivers/kvm/kvm.h
> +++ kvm/drivers/kvm/kvm.h
> @@ -7,6 +7,7 @@
> */
>
> #include <linux/types.h>
> +#include <linux/hardirq.h>
> #include <linux/list.h>
> #include <linux/mutex.h>
> #include <linux/spinlock.h>
> @@ -669,11 +670,13 @@ __init void kvm_arch_init(void);
>
> static inline void kvm_guest_enter(void)
> {
> + account_system_vtime(current);
> current->flags |= PF_VCPU;
> }
>
> static inline void kvm_guest_exit(void)
> {
> + account_system_vtime(current);
> current->flags &= ~PF_VCPU;
> }
I don't understand. Should kvm_guest_exit() be calling
account_user_vtime() (instead of account_system_vtime())?
--
Hollis Blanchard
IBM Linux Technology Center
On Fri, 2007-10-19 at 11:57 -0500, Hollis Blanchard wrote:
> > @@ -669,11 +670,13 @@ __init void kvm_arch_init(void);
> >
> > static inline void kvm_guest_enter(void)
> > {
> > + account_system_vtime(current);
> > current->flags |= PF_VCPU;
> > }
> >
> > static inline void kvm_guest_exit(void)
> > {
> > + account_system_vtime(current);
> > current->flags &= ~PF_VCPU;
> > }
>
> I don't understand. Should kvm_guest_exit() be calling
> account_user_vtime() (instead of account_system_vtime())?
Never mind; the tree I was looking at didn't have the
account_guest_time() stuff in account_system_time().
--
Hollis Blanchard
IBM Linux Technology Center
* Laurent Vivier <[email protected]> wrote:
> According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if
> we move it after local_irq_enable().
>
> http://lkml.org/lkml/2007/10/15/114
>
> To simplify s390 port, we don't clear it in account_system_time().
>
> http://lkml.org/lkml/2007/10/15/183
thanks - i have picked your patch up.
Ingo
Ingo Molnar wrote:
> * Laurent Vivier <[email protected]> wrote:
>
>
>> According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if
>> we move it after local_irq_enable().
>>
>> http://lkml.org/lkml/2007/10/15/114
>>
>> To simplify s390 port, we don't clear it in account_system_time().
>>
>> http://lkml.org/lkml/2007/10/15/183
>>
>
> thanks - i have picked your patch up.
>
>
Note the kvm part is already in my 2.6.24 queue.
--
error compiling committee.c: too many arguments to function
* Avi Kivity <[email protected]> wrote:
> Ingo Molnar wrote:
> >* Laurent Vivier <[email protected]> wrote:
> >
> >
> >>According comments from Avi, we can clear PF_VCPU in kvm_guest_exit if
> >>we move it after local_irq_enable().
> >>
> >>http://lkml.org/lkml/2007/10/15/114
> >>
> >>To simplify s390 port, we don't clear it in account_system_time().
> >>
> >>http://lkml.org/lkml/2007/10/15/183
> >>
> >
> >thanks - i have picked your patch up.
> >
> >
>
> Note the kvm part is already in my 2.6.24 queue.
ok, then please push the whole patch to Linus, the scheduler bits are:
Acked-by: Ingo Molnar <[email protected]>
Ingo
Ingo Molnar wrote:
>>
>> Note the kvm part is already in my 2.6.24 queue.
>>
>
> ok, then please push the whole patch to Linus, the scheduler bits are:
>
> Acked-by: Ingo Molnar <[email protected]>
>
Sure, thanks.
--
error compiling committee.c: too many arguments to function