Fairly straightforward code motion of MSR / TSC / PMC accessors
to the sub-arch level. Note that rdmsr/wrmsr_safe functions are
not moved; Linux relies on the fault behavior here in the event
that certain MSRs are not supported on hardware, and combining
this with a VMI wrapper is overly complicated. The instructions
are virtualizable with trap and emulate, not on critical code
paths, and only used as part of the MSR /proc device, which is
highly sketchy to use inside a virtual machine, but must be
allowed as part of the compile, since it is useful on native.
Signed-off-by: Zachary Amsden <[email protected]>
Index: linux-2.6.16-rc5/include/asm-i386/msr.h
===================================================================
--- linux-2.6.16-rc5.orig/include/asm-i386/msr.h 2006-03-08 10:31:10.000000000 -0800
+++ linux-2.6.16-rc5/include/asm-i386/msr.h 2006-03-08 10:32:07.000000000 -0800
@@ -1,22 +1,14 @@
#ifndef __ASM_MSR_H
#define __ASM_MSR_H
+#include <mach_msr.h>
+
/*
* Access to machine-specific registers (available on 586 and better only)
* Note: the rd* operations modify the parameters directly (without using
* pointer indirection), this allows gcc to optimize better
*/
-#define rdmsr(msr,val1,val2) \
- __asm__ __volatile__("rdmsr" \
- : "=a" (val1), "=d" (val2) \
- : "c" (msr))
-
-#define wrmsr(msr,val1,val2) \
- __asm__ __volatile__("wrmsr" \
- : /* no outputs */ \
- : "c" (msr), "a" (val1), "d" (val2))
-
#define rdmsrl(msr,val) do { \
unsigned long l__,h__; \
rdmsr (msr, l__, h__); \
@@ -62,22 +54,6 @@ static inline void wrmsrl (unsigned long
: "c" (msr), "i" (-EFAULT));\
ret__; })
-#define rdtsc(low,high) \
- __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
-
-#define rdtscl(low) \
- __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx")
-
-#define rdtscll(val) \
- __asm__ __volatile__("rdtsc" : "=A" (val))
-
-#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
-
-#define rdpmc(counter,low,high) \
- __asm__ __volatile__("rdpmc" \
- : "=a" (low), "=d" (high) \
- : "c" (counter))
-
/* symbolic names for some interesting MSRs */
/* Intel defined MSRs. */
#define MSR_IA32_P5_MC_ADDR 0
Index: linux-2.6.16-rc5/include/asm-i386/mach-vmi/mach_msr.h
===================================================================
--- linux-2.6.16-rc5.orig/include/asm-i386/mach-vmi/mach_msr.h 2006-03-08 10:32:07.000000000 -0800
+++ linux-2.6.16-rc5/include/asm-i386/mach-vmi/mach_msr.h 2006-03-08 10:32:30.000000000 -0800
@@ -0,0 +1,79 @@
+#ifndef MACH_MSR_H
+#define MACH_MSR_H
+
+#include <vmi.h>
+
+static inline u64 vmi_rdmsr(const u32 msr)
+{
+ u64 ret;
+ vmi_wrap_call(
+ RDMSR, "rdmsr",
+ VMI_OREG64 (ret),
+ 1, "c" (msr),
+ VMI_CLOBBER(TWO_RETURNS));
+ return ret;
+}
+
+#define rdmsr(msr,val1,val2) \
+do { \
+ u64 _val = vmi_rdmsr(msr); \
+ val1 = (u32)_val; \
+ val2 = (u32)(_val >> 32); \
+} while (0)
+
+static inline void wrmsr(const u32 msr, const u32 valLo, const u32 valHi)
+{
+ vmi_wrap_call(
+ WRMSR, "wrmsr",
+ VMI_NO_OUTPUT,
+ 3, XCONC("a"(valLo), "d"(valHi), "c"(msr)),
+ VMI_CLOBBER_EXTENDED(ZERO_RETURNS, "memory"));
+}
+
+static inline u64 vmi_rdtsc(void)
+{
+ u64 ret;
+ vmi_wrap_call(
+ RDTSC, "rdtsc",
+ VMI_OREG64 (ret),
+ 0, VMI_NO_INPUT,
+ VMI_CLOBBER(TWO_RETURNS));
+ return ret;
+}
+
+#define rdtsc(low,high) \
+do { \
+ u64 _val = vmi_rdtsc(); \
+ low = (u32)_val; \
+ high = (u32)(_val >> 32); \
+} while (0)
+
+#define rdtscl(low) \
+do { \
+ u64 _val = vmi_rdtsc(); \
+ low = (u32)_val; \
+} while (0)
+
+#define rdtscll(val) do { val = vmi_rdtsc(); } while (0)
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+static inline u64 vmi_rdpmc(const u32 counter)
+{
+ u64 ret;
+ vmi_wrap_call(
+ RDPMC, "rdpmc",
+ VMI_OREG64 (ret),
+ 1, "c" (counter),
+ VMI_CLOBBER(TWO_RETURNS));
+ return ret;
+}
+
+#define rdpmc(counter,val1,val2) \
+do { \
+ u64 _val = vmi_rdpmc(counter); \
+ val1 = (u32)_val; \
+ val2 = (u32)(_val >> 32); \
+} while (0)
+
+#endif
Index: linux-2.6.16-rc5/include/asm-i386/mach-default/mach_msr.h
===================================================================
--- linux-2.6.16-rc5.orig/include/asm-i386/mach-default/mach_msr.h 2006-03-08 10:32:07.000000000 -0800
+++ linux-2.6.16-rc5/include/asm-i386/mach-default/mach_msr.h 2006-03-08 10:32:07.000000000 -0800
@@ -0,0 +1,30 @@
+#ifndef MACH_MSR_H
+#define MACH_MSR_H
+
+#define rdmsr(msr,val1,val2) \
+ __asm__ __volatile__("rdmsr" \
+ : "=a" (val1), "=d" (val2) \
+ : "c" (msr))
+
+#define wrmsr(msr,val1,val2) \
+ __asm__ __volatile__("wrmsr" \
+ : /* no outputs */ \
+ : "c" (msr), "a" (val1), "d" (val2))
+
+#define rdtsc(low,high) \
+ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
+
+#define rdtscl(low) \
+ __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx")
+
+#define rdtscll(val) \
+ __asm__ __volatile__("rdtsc" : "=A" (val))
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) \
+ __asm__ __volatile__("rdpmc" \
+ : "=a" (low), "=d" (high) \
+ : "c" (counter))
+
+#endif
On Monday 13 March 2006 19:12, Zachary Amsden wrote:
> Fairly straightforward code motion of MSR / TSC / PMC accessors
> to the sub-arch level. Note that rdmsr/wrmsr_safe functions are
> not moved; Linux relies on the fault behavior here in the event
> that certain MSRs are not supported on hardware, and combining
> this with a VMI wrapper is overly complicated. The instructions
> are virtualizable with trap and emulate, not on critical code
> paths, and only used as part of the MSR /proc device, which is
> highly sketchy to use inside a virtual machine, but must be
> allowed as part of the compile, since it is useful on native.
I'm not aware of any MSR access being on a critical code
path on a 32bit kernel.
And I don't think it's a good idea to virtualize the TSC
without CPU support.
Why would you want to do any of this?
-Andi
Andi Kleen wrote:
> On Monday 13 March 2006 19:12, Zachary Amsden wrote:
>
>> Fairly straightforward code motion of MSR / TSC / PMC accessors
>> to the sub-arch level. Note that rdmsr/wrmsr_safe functions are
>> not moved; Linux relies on the fault behavior here in the event
>> that certain MSRs are not supported on hardware, and combining
>> this with a VMI wrapper is overly complicated. The instructions
>> are virtualizable with trap and emulate, not on critical code
>> paths, and only used as part of the MSR /proc device, which is
>> highly sketchy to use inside a virtual machine, but must be
>> allowed as part of the compile, since it is useful on native.
>>
>
> I'm not aware of any MSR access being on a critical code
> path on a 32bit kernel.
>
There aren't really any. There are some unexpected ones - such as
setting SYSENTER_CS during a context switch, but only if leaving v8086
mode, which isn't a common case. But most importantly, the MSR
functions were challenging to get correct, because they combine two
novel elements - 64 bit values, as well as non-C calling conventions.
They were actually some of the first functions I inlined, because I knew
there would be problems, and the solutions would yield more powerful
inlining macros.
> And I don't think it's a good idea to virtualize the TSC
> without CPU support.
>
We currently don't support configurations without a TSC. But we're not
trying to virtualize the TSC without CPU support. It is possible. But
I have no idea _why_ you would want to do such a thing.
Zach
On Tuesday 14 March 2006 17:32, Zachary Amsden wrote:
> There aren't really any. There are some unexpected ones - such as
> setting SYSENTER_CS during a context switch, but only if leaving v8086
> mode, which isn't a common case. But most importantly, the MSR
> functions were challenging to get correct, because they combine two
> novel elements - 64 bit values, as well as non-C calling conventions.
> They were actually some of the first functions I inlined, because I knew
> there would be problems, and the solutions would yield more powerful
> inlining macros.
That doesn't seem like a good rationale to add it to the kernel.
>
> > And I don't think it's a good idea to virtualize the TSC
> > without CPU support.
>
> We currently don't support configurations without a TSC. But we're not
> trying to virtualize the TSC without CPU support. It is possible. But
> I have no idea _why_ you would want to do such a thing.
Don't change it then?
BTW I think it will be pretty tough to find enough competent reviewers
for your patchkit.
And is the spec still in flux or are you trying to implement an interface
for an specification that is already put into stone?
-Andi
Andi Kleen wrote:
>>> And I don't think it's a good idea to virtualize the TSC
>>> without CPU support.
>>>
>> We currently don't support configurations without a TSC. But we're not
>> trying to virtualize the TSC without CPU support. It is possible. But
>> I have no idea _why_ you would want to do such a thing.
>>
>
> Don't change it then?
>
I misunderstood you. I thought when you said, "I don't think it's a
good idea virtualize the TSC without CPU support" that meant on CPUs
without a hardware TSC. If you really meant on CPUs without
virtualization hardware, well, that is something we do, and it is not
only possible, it is necessary for many non-paravirtualized operating
systems. As I mention in the interface documentation, TSC and the
performance counters in general are very problematic in a virtual
machine - they can be visible to userspace, and they are hard to keep
accurate. Long term, dropping kernel usage of the TSC is a good thing
to do for VMs.
The primary motivation for the change here was to get rid of the call to
the VMI ROM for TSC support. I am in the process of removing the ROM
call sites so that they can instead be patched into the kernel
directly. I am actually unsure if there are any uses of the TSC left on
critical paths with the new VMI timer support, but I inlined the code
here for consistency.
I agree that long term, it doesn't need to be done, and these
instructions can all go back to trap and emulate. Perhaps they are even
worth dropping from the list of VMI calls, since they really are
problematic and/or not useful in a virtual machine. This again, is a
good point for debate. We're open to suggestions, but keep in mind that
the fact that other operating systems and hypervisors might find
something useful in virtualizing them. Maybe not.
> BTW I think it will be pretty tough to find enough competent reviewers
> for your patchkit.
>
> And is the spec still in flux or are you trying to implement an interface
> for an specification that is already put into stone?
>
Everything is still very much open to change, and nothing is cast in
stone - this is about finding the best interface for Linux, and it is
clear that it needs some iteration before that is found. Which is why
we are looking for feedback, exactly like this.
Thanks,
Zach
> -#define rdtsc(low,high) \
> - __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
> -
> -#define rdtscl(low) \
> - __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx")
> -
> -#define rdtscll(val) \
> - __asm__ __volatile__("rdtsc" : "=A" (val))
> -
> -#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
> -
> -#define rdpmc(counter,low,high) \
> - __asm__ __volatile__("rdpmc" \
> - : "=a" (low), "=d" (high) \
> - : "c" (counter))
The kernel doesn't use rdpmc. And moving rdtsc is useless
as I wrote earlier.
But mostly it is used from user space and you will break everything there.
-Andi