2007-02-06 03:52:48

by Zachary Amsden

[permalink] [raw]
Subject: [PATCH 2/11] Sched clock paravirt op

diff -r 3e746c0ebcdf arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c Fri Feb 02 13:54:53 2007 -0800
+++ b/arch/i386/kernel/paravirt.c Fri Feb 02 15:27:50 2007 -0800
@@ -32,6 +32,7 @@
#include <asm/fixmap.h>
#include <asm/apic.h>
#include <asm/tlbflush.h>
+#include <asm/timer.h>

/* nop stub */
static void native_nop(void)
@@ -523,6 +524,8 @@ struct paravirt_ops paravirt_ops = {
.write_msr = native_write_msr,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
+ .get_scheduled_cycles = native_read_tsc,
+ .get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
diff -r 3e746c0ebcdf arch/i386/kernel/tsc.c
--- a/arch/i386/kernel/tsc.c Fri Feb 02 13:54:53 2007 -0800
+++ b/arch/i386/kernel/tsc.c Fri Feb 02 13:54:53 2007 -0800
@@ -14,6 +14,7 @@
#include <asm/delay.h>
#include <asm/tsc.h>
#include <asm/io.h>
+#include <asm/timer.h>

#include "mach_timer.h"

@@ -102,9 +103,6 @@ unsigned long long sched_clock(void)
{
unsigned long long this_offset;

- if (unlikely(custom_sched_clock))
- return (*custom_sched_clock)();
-
/*
* Fall back to jiffies if there's no TSC available:
*/
@@ -113,13 +111,13 @@ unsigned long long sched_clock(void)
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);

/* read the Time Stamp Counter: */
- rdtscll(this_offset);
+ get_scheduled_cycles(this_offset);

/* return the value in ns */
return cycles_2_ns(this_offset);
}

-static unsigned long calculate_cpu_khz(void)
+unsigned long native_calculate_cpu_khz(void)
{
unsigned long long start, end;
unsigned long count;
diff -r 3e746c0ebcdf arch/i386/kernel/vmi.c
--- a/arch/i386/kernel/vmi.c Fri Feb 02 13:54:53 2007 -0800
+++ b/arch/i386/kernel/vmi.c Fri Feb 02 15:32:20 2007 -0800
@@ -880,7 +880,7 @@ static int __init activate_vmi(void)
paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
#endif
- custom_sched_clock = vmi_sched_clock;
+ paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles;
}

/*
diff -r 3e746c0ebcdf include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h Fri Feb 02 13:54:53 2007 -0800
+++ b/include/asm-i386/paravirt.h Fri Feb 02 15:27:50 2007 -0800
@@ -94,6 +94,8 @@ struct paravirt_ops

u64 (fastcall *read_tsc)(void);
u64 (fastcall *read_pmc)(void);
+ u64 (*get_scheduled_cycles)(void);
+ unsigned long (*get_cpu_khz)(void);

void (fastcall *load_tr_desc)(void);
void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
@@ -273,6 +275,9 @@ static inline void halt(void)

#define rdtscll(val) (val = paravirt_ops.read_tsc())

+#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles())
+#define calculate_cpu_khz() (paravirt_ops.get_cpu_khz())
+
#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

#define rdpmc(counter,low,high) do { \
diff -r 3e746c0ebcdf include/asm-i386/time.h
--- a/include/asm-i386/time.h Fri Feb 02 13:54:53 2007 -0800
+++ b/include/asm-i386/time.h Fri Feb 02 15:27:50 2007 -0800
@@ -30,7 +30,6 @@ static inline int native_set_wallclock(u

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
-extern unsigned long long native_sched_clock(void);
#else /* !CONFIG_PARAVIRT */

#define get_wallclock() native_get_wallclock()
diff -r 3e746c0ebcdf include/asm-i386/timer.h
--- a/include/asm-i386/timer.h Fri Feb 02 13:54:53 2007 -0800
+++ b/include/asm-i386/timer.h Fri Feb 02 13:54:53 2007 -0800
@@ -4,13 +4,21 @@
#include <linux/pm.h>

#define TICK_SIZE (tick_nsec / 1000)
+
void setup_pit_timer(void);
+unsigned long long native_sched_clock(void);
+unsigned long native_calculate_cpu_khz(void);
+
/* Modifiers for buggy PIT handling */
extern int pit_latch_buggy;
extern int timer_ack;
extern int no_timer_check;
-extern unsigned long long (*custom_sched_clock)(void);
extern int no_sync_cmos_clock;
extern int recalibrate_cpu_khz(void);

+#ifndef CONFIG_PARAVIRT
+#define get_scheduled_cycles(val) rdtscll(val)
+#define calculate_cpu_khz() native_calculate_cpu_khz()
#endif
+
+#endif
diff -r 3e746c0ebcdf arch/i386/kernel/vmitime.c
--- a/arch/i386/kernel/vmitime.c Fri Feb 02 13:54:53 2007 -0800
+++ b/arch/i386/kernel/vmitime.c Fri Feb 02 15:31:35 2007 -0800
@@ -170,7 +170,7 @@ int vmi_set_wallclock(unsigned long now)
return -1;
}

-unsigned long long vmi_sched_clock(void)
+unsigned long long vmi_get_sched_cycles(void)
{
return read_available_cycles();
}
diff -r 3e746c0ebcdf include/asm-i386/vmi_time.h
--- a/include/asm-i386/vmi_time.h Fri Feb 02 13:54:53 2007 -0800
+++ b/include/asm-i386/vmi_time.h Fri Feb 02 15:31:53 2007 -0800
@@ -49,7 +49,7 @@ extern void __init vmi_time_init(void);
extern void __init vmi_time_init(void);
extern unsigned long vmi_get_wallclock(void);
extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
+extern unsigned long long vmi_get_sched_cycles(void);

#ifdef CONFIG_X86_LOCAL_APIC
extern void __init vmi_timer_setup_boot_alarm(void);


2007-02-06 04:00:49

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op

Zachary Amsden wrote:
>
> #include "mach_timer.h"
>
> @@ -102,9 +103,6 @@ unsigned long long sched_clock(void)
> {
> unsigned long long this_offset;
>
> - if (unlikely(custom_sched_clock))
> - return (*custom_sched_clock)();
> -
> /*
> * Fall back to jiffies if there's no TSC available:
> */
> @@ -113,13 +111,13 @@ unsigned long long sched_clock(void)
> return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
>
> /* read the Time Stamp Counter: */
> - rdtscll(this_offset);
> + get_scheduled_cycles(this_offset);
>
> /* return the value in ns */
> return cycles_2_ns(this_offset);
> }
>

I missed a title / signed-off on this guy.


Internally, sched_clock runs in units of nanoseconds, not CPU cycles.
This was wrong in my previous patch. Fix it so everyone can use the
same cycles_2_ns code in tsc.c.

Signed-off-by: Zachary Amsden <[email protected]>

2007-02-06 12:32:58

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op


> .write_msr = native_write_msr,
> .read_tsc = native_read_tsc,
> .read_pmc = native_read_pmc,
> + .get_scheduled_cycles = native_read_tsc,
> + .get_cpu_khz = native_calculate_cpu_khz,
> .load_tr_desc = native_load_tr_desc,
Description missing?

Please write at least two paragraphs or more on each new hook
you want to add.

My feeling is that rdtsc should work fine here. If not please explain.

-Andi

2007-02-06 22:47:09

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op

Andi Kleen wrote:
>> .write_msr = native_write_msr,
>> .read_tsc = native_read_tsc,
>> .read_pmc = native_read_pmc,
>> + .get_scheduled_cycles = native_read_tsc,
>> + .get_cpu_khz = native_calculate_cpu_khz,
>> .load_tr_desc = native_load_tr_desc,
>>
> Description missing?
>

I missed a title / signed-off on this guy.


Internally, sched_clock runs in units of nanoseconds, not CPU cycles.
This was wrong in my previous patch. Fix it so everyone can use the
same cycles_2_ns code in tsc.c.

Signed-off-by: Zachary Amsden <[email protected]>

> Please write at least two paragraphs or more on each new hook
> you want to add.
>

Not a new hook; I just changed the name.

> My feeling is that rdtsc should work fine here. If not please explain.
>

It depends. Scheduled clock must be in units of available time - stolen
time is not always evenly distributed. If you make rdtsc just be
scheduled clock, that almost works. But most places that use rdtsc
expect it to be in cycles of approximate real time, and to leap forward
if something like SMM comes along and steals time.

Not that this is pretty. Arguably, the TSC should just run at a fixed
rate, not progress during stolen time. This idealized TSC assumption is
not however how Linux is making use of the TSC today. TSC is more like
real time, only in a VM, it can't quite keep up with real time, so it
gets simulated.

Scheduled (or available) time and real time are good notions. Stolen
time is debatable. But TSC is basically just always wrong. That's why
I don't want to overload the rdtsc operation.

Zach

2007-02-06 23:23:11

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op

Zachary Amsden wrote:
> Scheduled (or available) time and real time are good notions. Stolen
> time is debatable. But TSC is basically just always wrong. That's
> why I don't want to overload the rdtsc operation.
Well, in the Xen case it is actually guaranteed to be correct and useful
as real time, but that's definitely not something we can expect in
general. But you're talking specifically about schedulable vcpu time
here, right?

J

2007-02-06 23:42:20

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op

Jeremy Fitzhardinge wrote:
> Zachary Amsden wrote:
>
>> Scheduled (or available) time and real time are good notions. Stolen
>> time is debatable. But TSC is basically just always wrong. That's
>> why I don't want to overload the rdtsc operation.
>>
> Well, in the Xen case it is actually guaranteed to be correct and useful
> as real time, but that's definitely not something we can expect in
> general. But you're talking specifically about schedulable vcpu time
> here, right?
>

Not schedulable time, scheduled time: (schedulable - scheduled) = stolen

Zach

2007-02-06 23:48:12

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op

Zachary Amsden wrote:
> Jeremy Fitzhardinge wrote:
>> Zachary Amsden wrote:
>>
>>> Scheduled (or available) time and real time are good notions. Stolen
>>> time is debatable. But TSC is basically just always wrong. That's
>>> why I don't want to overload the rdtsc operation.
>> Well, in the Xen case it is actually guaranteed to be correct and useful
>> as real time, but that's definitely not something we can expect in
>> general. But you're talking specifically about schedulable vcpu time
>> here, right?
>>
>
> Not schedulable time, scheduled time (schedulable - scheduled) = stolen

I meant "schedulable" (perhaps "usable" would be better) from the
guest's perspective: total amount of real cpu time each vcpu gets. ie:
(real - schedulable) = stolen. So I think we're talking about the same
thing.

J

2007-02-06 23:51:06

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 2/11] Sched clock paravirt op

Jeremy Fitzhardinge wrote:
> Zachary Amsden wrote:
>
>> Jeremy Fitzhardinge wrote:
>>
>>> Zachary Amsden wrote:
>>>
>>>
>>>> Scheduled (or available) time and real time are good notions. Stolen
>>>> time is debatable. But TSC is basically just always wrong. That's
>>>> why I don't want to overload the rdtsc operation.
>>>>
>>> Well, in the Xen case it is actually guaranteed to be correct and useful
>>> as real time, but that's definitely not something we can expect in
>>> general. But you're talking specifically about schedulable vcpu time
>>> here, right?
>>>
>>>
>> Not schedulable time, scheduled time (schedulable - scheduled) = stolen
>>
>
> I meant "schedulable" (perhaps "usable" would be better) from the
> guest's perspective: total amount of real cpu time each vcpu gets. ie:
> (real - schedulable) = stolen. So I think we're talking about the same
> thing.
>

Yes, I think so. The point, though, is that for Xen, TSC is real time.
So it won't do for the scheduler, which must be schedulable time.

Zach