The various x86 syscall-related MSRs (MSR_LSTAR and friends, EFER when SCE
needs to be updated) are fairly expensive to read or write. Since different
operating systems can set different values for these MSRs, KVM needs to reload
them when switching to a different guest or to the host.
Switching on every guest entry/exit is too expensive, so KVM reloads on
guest preemption, which is a lot rarer. Even so, preemption-based reload
is suboptimal:
- if we're switching to a kernel thread and back, there's no need to reload
the MSRs. Examples of kernel threads we're likely to switch to are:
- the idle task
- a threaded interrupt handler
- a kernel-mode virtio server (vhost-net)
- if we're switching to a guest running the same OS, the MSRs will have the
same values and there's no need to reload them
- if the guest and host run the same OS, again the MSRs need not be reloaded.
This patchset implements just-in-time reloads to defer them to the last
possible instant. When we do reload, we check whether the values have in
fact changed and reload conditionally.
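
The check-before-write idea boils down to something like the following minimal
sketch (the conditional wrmsrl mirrors kvm_set_shared_msr() introduced later in
this series; the struct and function names here are made up for illustration,
and wrmsrl() comes from <asm/msr.h>):

struct cached_msr {
	u32 msr;	/* MSR index, e.g. MSR_LSTAR */
	u64 value;	/* value currently loaded in hardware */
};

static void write_msr_if_changed(struct cached_msr *c, u64 value)
{
	if (value == c->value)
		return;		/* already loaded, skip the expensive wrmsr */
	c->value = value;
	wrmsrl(c->msr, value);
}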
For the just-in-time reloads the first patch implements "user return
notifiers", a callback invoked just before return to userspace. This has
been written so that there is no code impact if KVM is not configured, and
no runtime impact if KVM is not running.
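
As a rough usage sketch of that interface (user_return_notifier_register(),
user_return_notifier_unregister() and the on_user_return hook are the API added
by patch 1; the example_* names and the single static notifier are made up for
illustration, whereas the real series keeps a per-cpu instance and registers it
with preemption disabled):

#include <linux/user-return-notifier.h>

static void example_on_user_return(struct user_return_notifier *urn)
{
	/* runs on the current cpu just before it returns to userspace;
	 * restore host MSR values here, then disarm the notifier */
	user_return_notifier_unregister(urn);
}

static struct user_return_notifier example_urn = {
	.on_user_return = example_on_user_return,
};

static void example_arm_notifier(void)
{
	user_return_notifier_register(&example_urn);
}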
The patchset improves guest/idle/guest switches by about 2000 cycles.
Avi Kivity (4):
core, x86: Add user return notifiers
KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area
KVM: x86 shared msr infrastructure
KVM: VMX: Use shared msr infrastructure
arch/Kconfig | 10 +++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/kvm_host.h | 3 +
arch/x86/include/asm/thread_info.h | 7 +-
arch/x86/kernel/process.c | 2 +
arch/x86/kernel/signal.c | 3 +
arch/x86/kvm/Kconfig | 1 +
arch/x86/kvm/vmx.c | 142 +++++++++++++++--------------------
arch/x86/kvm/x86.c | 79 ++++++++++++++++++++
kernel/Makefile | 1 +
10 files changed, 166 insertions(+), 83 deletions(-)
Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace. The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.
Signed-off-by: Avi Kivity <[email protected]>
---
arch/Kconfig | 10 ++++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/thread_info.h | 7 +++++--
arch/x86/kernel/process.c | 2 ++
arch/x86/kernel/signal.c | 3 +++
kernel/Makefile | 1 +
6 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 99193b1..2275b46 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -70,6 +70,13 @@ config KRETPROBES
def_bool y
depends on KPROBES && HAVE_KRETPROBES
+config USER_RETURN_NOTIFIER
+ bool
+ depends on HAVE_USER_RETURN_NOTIFIER
+ help
+ Provide a kernel-internal notification when a cpu is about to
+ switch to user mode.
+
config HAVE_IOREMAP_PROT
bool
@@ -113,4 +120,7 @@ config HAVE_DMA_API_DEBUG
config HAVE_DEFAULT_NO_SPIN_MUTEXES
bool
+config HAVE_USER_RETURN_NOTIFIER
+ bool
+
source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5d..5fc4fae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
select HAVE_ARCH_KMEMCHECK
+ select HAVE_USER_RETURN_NOTIFIER
config OUTPUT_FORMAT
string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40..a8f46de 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
@@ -141,13 +143,14 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
+ _TIF_USER_RETURN_NOTIFY)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a..7ea6972 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
+#include <linux/user-return-notifier.h>
#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
@@ -227,6 +228,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+ propagate_user_return_notify(prev_p, next_p);
}
int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c57875..bc0db10 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -870,6 +871,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
}
+ if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a69..11c5de5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
--
1.6.4.1
Currently MSR_KERNEL_GS_BASE is saved and restored as part of the
guest/host msr reloading. Since we wish to lazy-restore all the other
msrs, save and reload MSR_KERNEL_GS_BASE explicitly instead of using
the common code.
Signed-off-by: Avi Kivity <[email protected]>
---
arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++-------------
1 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d3213ac..547881a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -80,7 +80,8 @@ struct vcpu_vmx {
int save_nmsrs;
int msr_offset_efer;
#ifdef CONFIG_X86_64
- int msr_offset_kernel_gs_base;
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
#endif
struct vmcs *vmcs;
struct {
@@ -183,7 +184,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_K6_STAR,
};
@@ -649,10 +650,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- save_msrs(vmx->host_msrs +
- vmx->msr_offset_kernel_gs_base, 1);
-
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ }
#endif
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_transition_efer(vmx);
@@ -686,6 +687,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_msrs(vmx->host_msrs, vmx->save_nmsrs);
reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ }
+#endif
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -915,9 +922,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_CSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
@@ -929,10 +933,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
#endif
vmx->save_nmsrs = save_nmsrs;
-#ifdef CONFIG_X86_64
- vmx->msr_offset_kernel_gs_base =
- __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
if (cpu_has_vmx_msr_bitmap()) {
@@ -990,6 +990,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_GS_BASE:
data = vmcs_readl(GUEST_GS_BASE);
break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(to_vmx(vcpu));
+ data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ break;
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
#endif
@@ -1043,6 +1047,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_GS_BASE:
vmcs_writel(GUEST_GS_BASE, data);
break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_load_host_state(vmx);
+ vmx->msr_guest_kernel_gs_base = data;
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
vmcs_write32(GUEST_SYSENTER_CS, data);
--
1.6.4.1
The various syscall-related MSRs are fairly expensive to switch. Currently
we switch them on every vcpu preemption, which is far too often:
- if we're switching to a kernel thread (idle task, threaded interrupt,
kernel-mode virtio server (vhost-net), for example) and back, then
there's no need to switch those MSRs since kernel threads won't
be exiting to userspace.
- if we're switching to another guest running an identical OS, most likely
those MSRs will have the same value, so there's little point in reloading
them.
- if we're running the same OS on the guest and host, the MSRs will have
identical values and reloading is unnecessary.
This patch uses the new user return notifiers to implement last-minute
switching, and checks the MSR values to avoid unnecessary reloading.
Signed-off-by: Avi Kivity <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 3 +
arch/x86/kvm/Kconfig | 1 +
arch/x86/kvm/x86.c | 79 +++++++++++++++++++++++++++++++++++++++
3 files changed, 83 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 45226f0..863bde8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -799,4 +799,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_define_shared_msr(unsigned index, u32 msr);
+void kvm_set_shared_msr(unsigned index, u64 val);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571..4cd4983 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
+ select USER_RETURN_NOTIFIER
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d454d9..0e7c40c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,6 +37,7 @@
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
#include <trace/events/kvm.h>
#undef TRACE_INCLUDE_FILE
#define CREATE_TRACE_POINTS
@@ -87,6 +88,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
int ignore_msrs = 0;
module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+ int nr;
+ struct kvm_shared_msr {
+ u32 msr;
+ u64 value;
+ } msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ u64 current_value[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +143,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_shared_msr *global;
+ struct kvm_shared_msrs *locals
+ = container_of(urn, struct kvm_shared_msrs, urn);
+
+ for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+ global = &shared_msrs_global.msrs[slot];
+ if (global->value != locals->current_value[slot]) {
+ wrmsrl(global->msr, global->value);
+ locals->current_value[slot] = global->value;
+ }
+ }
+ locals->registered = false;
+ user_return_notifier_unregister(urn);
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+ int cpu;
+ u64 value;
+
+ if (slot >= shared_msrs_global.nr)
+ shared_msrs_global.nr = slot + 1;
+ shared_msrs_global.msrs[slot].msr = msr;
+ rdmsrl_safe(msr, &value);
+ shared_msrs_global.msrs[slot].value = value;
+ for_each_online_cpu(cpu)
+ per_cpu(shared_msrs, cpu).current_value[slot] = value;
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+ unsigned i;
+ struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
+
+ for (i = 0; i < shared_msrs_global.nr; ++i)
+ locals->current_value[i] = shared_msrs_global.msrs[i].value;
+}
+
+void kvm_set_shared_msr(unsigned slot, u64 value)
+{
+ struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+ if (value == smsr->current_value[slot])
+ return;
+ smsr->current_value[slot] = value;
+ wrmsrl(shared_msrs_global.msrs[slot].msr, value);
+ if (!smsr->registered) {
+ smsr->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&smsr->urn);
+ smsr->registered = true;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
@@ -4696,6 +4774,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
void kvm_arch_hardware_enable(void *garbage)
{
+ kvm_shared_msr_cpu_online();
kvm_x86_ops->hardware_enable(garbage);
}
--
1.6.4.1
Instead of reloading syscall MSRs on every preemption, use the new shared
MSR infrastructure to reload them at the last possible minute (just before
exit to userspace).
Improves vcpu/idle/vcpu switches by about 2000 cycles (when EFER needs to be
reloaded as well).
Signed-off-by: Avi Kivity <[email protected]>
---
arch/x86/kvm/vmx.c | 108 +++++++++++++++++++--------------------------------
1 files changed, 40 insertions(+), 68 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 547881a..8dae1dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -67,6 +67,11 @@ struct vmcs {
char data[0];
};
+struct shared_msr_entry {
+ unsigned index;
+ u64 data;
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
struct list_head local_vcpus_link;
@@ -74,8 +79,7 @@ struct vcpu_vmx {
int launched;
u8 fail;
u32 idt_vectoring_info;
- struct kvm_msr_entry *guest_msrs;
- struct kvm_msr_entry *host_msrs;
+ struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
int msr_offset_efer;
@@ -89,7 +93,6 @@ struct vcpu_vmx {
u16 fs_sel, gs_sel, ldt_sel;
int gs_ldt_reload_needed;
int fs_reload_needed;
- int guest_efer_loaded;
} host_state;
struct {
int vm86_active;
@@ -176,6 +179,8 @@ static struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
+static u64 host_efer;
+
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
@@ -190,22 +195,6 @@ static const u32 vmx_msr_index[] = {
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
- int i;
-
- for (i = 0; i < n; ++i)
- rdmsrl(e[i].index, e[i].data);
-}
-
static inline int is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -348,7 +337,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
int i;
for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx->guest_msrs[i].index == msr)
+ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
return i;
return -1;
}
@@ -379,7 +368,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
: : "a" (&operand), "c" (ext) : "cc", "memory");
}
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -570,17 +559,15 @@ static void reload_tss(void)
load_TR_desc();
}
-static void load_transition_efer(struct vcpu_vmx *vmx)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
{
int efer_offset = vmx->msr_offset_efer;
- u64 host_efer;
u64 guest_efer;
u64 ignore_bits;
if (efer_offset < 0)
- return;
- host_efer = vmx->host_msrs[efer_offset].data;
- guest_efer = vmx->guest_msrs[efer_offset].data;
+ return false;
+ guest_efer = vmx->vcpu.arch.shadow_efer;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -594,26 +581,18 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
ignore_bits &= ~(u64)EFER_SCE;
#endif
if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
- return;
+ return false;
- vmx->host_state.guest_efer_loaded = 1;
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
- wrmsrl(MSR_EFER, guest_efer);
- vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
- if (vmx->host_state.guest_efer_loaded) {
- vmx->host_state.guest_efer_loaded = 0;
- load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
- }
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ return true;
}
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int i;
if (vmx->host_state.loaded)
return;
@@ -655,8 +634,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
}
#endif
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_transition_efer(vmx);
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(i, vmx->guest_msrs[i].data);
}
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +663,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
local_irq_restore(flags);
}
reload_tss();
- save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
- load_msrs(vmx->host_msrs, vmx->save_nmsrs);
- reload_host_efer(vmx);
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
@@ -886,14 +862,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
#ifdef CONFIG_X86_64
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
- struct kvm_msr_entry tmp;
+ struct shared_msr_entry tmp;
tmp = vmx->guest_msrs[to];
vmx->guest_msrs[to] = vmx->guest_msrs[from];
vmx->guest_msrs[from] = tmp;
- tmp = vmx->host_msrs[to];
- vmx->host_msrs[to] = vmx->host_msrs[from];
- vmx->host_msrs[from] = tmp;
}
#endif
@@ -904,15 +877,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- int save_nmsrs;
+ int save_nmsrs, index;
unsigned long *msr_bitmap;
vmx_load_host_state(vmx);
save_nmsrs = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
- int index;
-
index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
@@ -931,9 +902,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
- vmx->save_nmsrs = save_nmsrs;
+ vmx->msr_offset_efer = index = __find_msr_index(vmx, MSR_EFER);
+ if (index >= 0 && update_transition_efer(vmx))
+ move_msr_up(vmx, index, save_nmsrs++);
- vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+ vmx->save_nmsrs = save_nmsrs;
if (cpu_has_vmx_msr_bitmap()) {
if (is_long_mode(&vmx->vcpu))
@@ -975,7 +948,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
u64 data;
- struct kvm_msr_entry *msr;
+ struct shared_msr_entry *msr;
if (!pdata) {
printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -994,9 +967,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
vmx_load_host_state(to_vmx(vcpu));
data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
break;
+#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
case MSR_IA32_TSC:
data = guest_read_tsc();
break;
@@ -1010,6 +983,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
default:
+ vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
vmx_load_host_state(to_vmx(vcpu));
@@ -1031,7 +1005,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr;
+ struct shared_msr_entry *msr;
u64 host_tsc;
int ret = 0;
@@ -1543,8 +1517,10 @@ continue_rmode:
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ if (!msr)
+ return;
vcpu->arch.shadow_efer = efer;
if (!msr)
return;
@@ -2383,10 +2359,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (wrmsr_safe(index, data_low, data_high) < 0)
continue;
data = data_low | ((u64)data_high << 32);
- vmx->host_msrs[j].index = index;
- vmx->host_msrs[j].reserved = 0;
- vmx->host_msrs[j].data = data;
- vmx->guest_msrs[j] = vmx->host_msrs[j];
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
++vmx->nmsrs;
}
@@ -3774,7 +3748,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
vmx_free_vmcs(vcpu);
- kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3801,10 +3774,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto uninit_vcpu;
}
- vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!vmx->host_msrs)
- goto free_guest_msrs;
-
vmx->vmcs = alloc_vmcs();
if (!vmx->vmcs)
goto free_msrs;
@@ -3835,8 +3804,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
free_vmcs:
free_vmcs(vmx->vmcs);
free_msrs:
- kfree(vmx->host_msrs);
-free_guest_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
kvm_vcpu_uninit(&vmx->vcpu);
@@ -3986,7 +3953,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
static int __init vmx_init(void)
{
- int r;
+ int r, i;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < NR_VMX_MSR; ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_io_bitmap_a)
--
1.6.4.1
On Wed, Sep 16, 2009 at 03:45:33PM +0300, Avi Kivity wrote:
> The various syscall-related MSRs are fairly expensive to switch. Currently
> we switch them on every vcpu preemption, which is far too often:
>
> - if we're switching to a kernel thread (idle task, threaded interrupt,
> kernel-mode virtio server (vhost-net), for example) and back, then
> there's no need to switch those MSRs since kernel threads won't
> be exiting to userspace.
>
> - if we're switching to another guest running an identical OS, most likely
> those MSRs will have the same value, so there's little point in reloading
> them.
>
> - if we're running the same OS on the guest and host, the MSRs will have
> identical values and reloading is unnecessary.
>
> This patch uses the new user return notifiers to implement last-minute
> switching, and checks the msr values to avoid unnecessary reloading.
>
> Signed-off-by: Avi Kivity <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 3 +
> arch/x86/kvm/Kconfig | 1 +
> arch/x86/kvm/x86.c | 79 +++++++++++++++++++++++++++++++++++++++
> 3 files changed, 83 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 45226f0..863bde8 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -799,4 +799,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
> int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
>
> +void kvm_define_shared_msr(unsigned index, u32 msr);
> +void kvm_set_shared_msr(unsigned index, u64 val);
> +
> #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index b84e571..4cd4983 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -28,6 +28,7 @@ config KVM
> select HAVE_KVM_IRQCHIP
> select HAVE_KVM_EVENTFD
> select KVM_APIC_ARCHITECTURE
> + select USER_RETURN_NOTIFIER
> ---help---
> Support hosting fully virtualized guest machines using hardware
> virtualization extensions. You will need a fairly recent
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1d454d9..0e7c40c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -37,6 +37,7 @@
> #include <linux/iommu.h>
> #include <linux/intel-iommu.h>
> #include <linux/cpufreq.h>
> +#include <linux/user-return-notifier.h>
> #include <trace/events/kvm.h>
> #undef TRACE_INCLUDE_FILE
> #define CREATE_TRACE_POINTS
> @@ -87,6 +88,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
> int ignore_msrs = 0;
> module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
>
> +#define KVM_NR_SHARED_MSRS 16
> +
> +struct kvm_shared_msrs_global {
> + int nr;
> + struct kvm_shared_msr {
> + u32 msr;
> + u64 value;
> + } msrs[KVM_NR_SHARED_MSRS];
> +};
> +
> +struct kvm_shared_msrs {
> + struct user_return_notifier urn;
> + bool registered;
> + u64 current_value[KVM_NR_SHARED_MSRS];
> +};
> +
> +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
Does this assume the MSRs in question are consistent across CPUs?
I guess that is not true with arch_prctl(ARCH_SET_GS/ARCH_GET_GS) ?
On 09/17/2009 12:21 AM, Marcelo Tosatti wrote:
>> +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
>>
> Does this assume the MSRs in question are consistent across CPUs?
>
Yes. And they are.
> I guess that is not true with arch_prctl(ARCH_SET_GS/ARCH_GET_GS) ?
>
These aren't shared MSRs and are saved and restored independently.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On 09/16/2009 05:45 AM, Avi Kivity wrote:
> Add a general per-cpu notifier that is called whenever the kernel is
> about to return to userspace. The notifier uses a thread_info flag
> and existing checks, so there is no impact on user return or context
> switch fast paths.
>
> Signed-off-by: Avi Kivity <[email protected]>
The concept of this patch looks entirely sane, as does the code,
*except* for:
+#include <linux/user-return-notifier.h>
... missing header file, and ...
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
... missing source file.
Could you fix the patch, please?
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace. The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.
Signed-off-by: Avi Kivity <[email protected]>
---
v2: include new files in patch
arch/Kconfig | 10 +++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/thread_info.h | 7 +++-
arch/x86/kernel/process.c | 2 +
arch/x86/kernel/signal.c | 3 ++
include/linux/user-return-notifier.h | 42 +++++++++++++++++++++++++++++++
kernel/Makefile | 1 +
kernel/user-return-notifier.c | 46 ++++++++++++++++++++++++++++++++++
8 files changed, 110 insertions(+), 2 deletions(-)
create mode 100644 include/linux/user-return-notifier.h
create mode 100644 kernel/user-return-notifier.c
diff --git a/arch/Kconfig b/arch/Kconfig
index beea3cc..b1d0757 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -82,6 +82,13 @@ config KRETPROBES
def_bool y
depends on KPROBES && HAVE_KRETPROBES
+config USER_RETURN_NOTIFIER
+ bool
+ depends on HAVE_USER_RETURN_NOTIFIER
+ help
+ Provide a kernel-internal notification when a cpu is about to
+ switch to user mode.
+
config HAVE_IOREMAP_PROT
bool
@@ -125,4 +132,7 @@ config HAVE_DMA_API_DEBUG
config HAVE_DEFAULT_NO_SPIN_MUTEXES
bool
+config HAVE_USER_RETURN_NOTIFIER
+ bool
+
source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc20fdc..ed21d6a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
select HAVE_ARCH_KMEMCHECK
+ select HAVE_USER_RETURN_NOTIFIER
config OUTPUT_FORMAT
string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2..375c917 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
+ _TIF_USER_RETURN_NOTIFY)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a..7ea6972 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
+#include <linux/user-return-notifier.h>
#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
@@ -227,6 +228,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+ propagate_user_return_notify(prev_p, next_p);
}
int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 81e5823..13aa99c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (current->replacement_session_keyring)
key_replace_session_keyring();
}
+ if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 0000000..ef04e2e
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+ void (*on_user_return)(struct user_return_notifier *urn);
+ struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+ struct task_struct *next)
+{
+ if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+ clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+ set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+ }
+}
+
+void fire_user_return_notifiers(void);
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+ struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+#endif
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 961379c..f6abe84 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 0000000..530ccb8
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace. Must be
+ * called in atomic context. The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+ set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+ hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier. Must be called from atomic
+ * context, and from the same cpu registration occured in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+ hlist_del(&urn->link);
+ if (hlist_empty(&URN_LIST_HEAD))
+ clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+ struct user_return_notifier *urn;
+ struct hlist_node *tmp1, *tmp2;
+ struct hlist_head *head;
+
+ head = &get_cpu_var(return_notifier_list);
+ hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+ urn->on_user_return(urn);
+ put_cpu_var();
+}
--
1.6.4.1
On 09/18/2009 09:14 PM, H. Peter Anvin wrote:
> On 09/16/2009 05:45 AM, Avi Kivity wrote:
>
>> Add a general per-cpu notifier that is called whenever the kernel is
>> about to return to userspace. The notifier uses a thread_info flag
>> and existing checks, so there is no impact on user return or context
>> switch fast paths.
>>
>> Signed-off-by: Avi Kivity<[email protected]>
>>
> The concept of this patch looks entirely sane, as does the code,
> *except* for:
>
> +#include<linux/user-return-notifier.h>
>
> ... missing header file, and ...
>
> +obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
>
> ... missing source file.
>
> Could you fix the patch, please?
>
>
Doh. Sent out fixed patch.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On 09/19/2009 09:40 AM, Avi Kivity wrote:
> Add a general per-cpu notifier that is called whenever the kernel is
> about to return to userspace. The notifier uses a thread_info flag
> and existing checks, so there is no impact on user return or context
> switch fast paths.
>
Ingo/Peter?
--
error compiling committee.c: too many arguments to function
On Tue, 22 Sep 2009 12:25:33 +0300
Avi Kivity <[email protected]> wrote:
> On 09/19/2009 09:40 AM, Avi Kivity wrote:
> > Add a general per-cpu notifier that is called whenever the kernel is
> > about to return to userspace. The notifier uses a thread_info flag
> > and existing checks, so there is no impact on user return or context
> > switch fast paths.
> >
>
> Ingo/Peter?
isn't this like really expensive when used?
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On 09/22/2009 12:37 PM, Arjan van de Ven wrote:
> On Tue, 22 Sep 2009 12:25:33 +0300
> Avi Kivity<[email protected]> wrote:
>
>
>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>
>>> Add a general per-cpu notifier that is called whenever the kernel is
>>> about to return to userspace. The notifier uses a thread_info flag
>>> and existing checks, so there is no impact on user return or context
>>> switch fast paths.
>>>
>>>
>> Ingo/Peter?
>>
> isn't this like really expensive when used ?
>
No, why? It triggers a call to do_notify_resume() and walks a list of
length 1.
--
error compiling committee.c: too many arguments to function
* Avi Kivity <[email protected]> wrote:
> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>> Add a general per-cpu notifier that is called whenever the kernel is
>> about to return to userspace. The notifier uses a thread_info flag
>> and existing checks, so there is no impact on user return or context
>> switch fast paths.
>>
>
> Ingo/Peter?
Would be nice to convert some existing open-coded return-to-user-space
logic to this facility. One such candidate would be lockdep_sys_exit?
Ingo
On 09/22/2009 05:32 PM, Ingo Molnar wrote:
> * Avi Kivity<[email protected]> wrote:
>
>
>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>
>>> Add a general per-cpu notifier that is called whenever the kernel is
>>> about to return to userspace. The notifier uses a thread_info flag
>>> and existing checks, so there is no impact on user return or context
>>> switch fast paths.
>>>
>>>
>> Ingo/Peter?
>>
> Would be nice to convert some existing open-coded return-to-user-space
> logic to this facility. One such candidate would be lockdep_sys_exit?
>
I only implemented this for x86, while lockdep is arch independent. If
arch support is added, it should be trivial.
I think perf counters could use preempt notifiers though, these are arch
independent.
--
error compiling committee.c: too many arguments to function
Ingo Molnar wrote:
> * Avi Kivity <[email protected]> wrote:
>
>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>> Add a general per-cpu notifier that is called whenever the kernel is
>>> about to return to userspace. The notifier uses a thread_info flag
>>> and existing checks, so there is no impact on user return or context
>>> switch fast paths.
>>>
>> Ingo/Peter?
>
> Would be nice to convert some existing open-coded return-to-user-space
> logic to this facility. One such candidate would be lockdep_sys_exit?
>
> Ingo
Sorry, limited bandwidth due to LinuxCon, but I like the concept, and
the previous (partial) patch was really clean. I agree with Ingo that
arch support so we can use this as a general facility would be nice, but
I don't consider that as a prerequisite for merging.
-hpa
On 09/22/2009 05:45 PM, Avi Kivity wrote:
>> Would be nice to convert some existing open-coded return-to-user-space
>> logic to this facility. One such candidate would be lockdep_sys_exit?
>
> I only implemented this for x86, while lockdep is arch independent.
> If arch support is added, it should be trivial.
>
The lockdep_sys_exit bit is actually x86/s390 only, and can easily be
adapted to use the new functionality on x86 only. I'll try it out.
--
error compiling committee.c: too many arguments to function
On Tue, 2009-09-22 at 16:32 +0200, Ingo Molnar wrote:
> * Avi Kivity <[email protected]> wrote:
>
> > On 09/19/2009 09:40 AM, Avi Kivity wrote:
> >> Add a general per-cpu notifier that is called whenever the kernel is
> >> about to return to userspace. The notifier uses a thread_info flag
> >> and existing checks, so there is no impact on user return or context
> >> switch fast paths.
> >>
> >
> > Ingo/Peter?
>
> Would be nice to convert some existing open-coded return-to-user-space
> logic to this facility. One such candidate would be lockdep_sys_exit?
And here I was thinking this was one of the hottest code paths in the
whole kernel...
On 09/22/2009 07:50 PM, Peter Zijlstra wrote:
>> Would be nice to convert some existing open-coded return-to-user-space
>> logic to this facility. One such candidate would be lockdep_sys_exit?
>>
> And here I was thinking this was one of the hottest code paths in the
> whole kernel...
>
If you're using lockdep, surely that's not your biggest worry?
--
error compiling committee.c: too many arguments to function
On Tue, 2009-09-22 at 19:52 +0300, Avi Kivity wrote:
> On 09/22/2009 07:50 PM, Peter Zijlstra wrote:
> >> Would be nice to convert some existing open-coded return-to-user-space
> >> logic to this facility. One such candidate would be lockdep_sys_exit?
> >>
> > And here I was thinking this was one of the hottest code paths in the
> > whole kernel...
> >
>
> If you're using lockdep, surely that's not your biggest worry?
No, but that's all under #ifdef and fully disappears when not enabled.
Generic return-to-user notifiers don't sound like they will, though.
On 09/22/2009 07:55 PM, Peter Zijlstra wrote:
>> If you're using lockdep, surely that's not your biggest worry?
>>
> No, but that's all under #ifdef and fully disappears when not enabled.
> Generic return-tu-user notifiers don't sound like they will though.
>
They will if not selected. If selected and not armed, they will have
zero runtime impact since they piggyback on existing branches
(_TIF_DO_NOTIFY_MASK and near relatives). If selected and armed they'll
cause __switch_to_xtra() on every context switch and do_notify_resume()
on syscall exit until disarmed, but then you've asked for it.
--
error compiling committee.c: too many arguments to function
On 09/22/2009 06:50 PM, Avi Kivity wrote:
> On 09/22/2009 05:45 PM, Avi Kivity wrote:
>>> Would be nice to convert some existing open-coded return-to-user-space
>>> logic to this facility. One such candidate would be lockdep_sys_exit?
>>
>> I only implemented this for x86, while lockdep is arch independent.
>> If arch support is added, it should be trivial.
>>
>
> The lockdep_sys_exit bit is actually x86/s390 only, and can easily be
> adapted to use the new functionality on x86 only. I'll try it out.
Unfortunately it doesn't work out well. The notifier is called until
explicitly unregistered (since it relies on a bit in _TIF_DO_NOTIFY_MASK),
so we have to disarm it on the first return to userspace or it spins
forever. We could re-arm it on the next kernel entry, but we don't have
a kernel entry notifier so we'll just be moving hooks from one point to
another.
--
error compiling committee.c: too many arguments to function
On Tue, 2009-09-22 at 16:32 +0200, Ingo Molnar wrote:
> Would be nice to convert some existing open-coded return-to-user-space
> logic to this facility. One such candidate would be lockdep_sys_exit?
I don't really like lockdep_sys_exit() in such a call;
lockdep_sys_exit() is currently placed such that it will execute as the
last C code before returning to user-space.
Put it any earlier and you've got a window where people can leak a lock
to userspace undetected.
On 09/22/2009 06:19 PM, H. Peter Anvin wrote:
> Ingo Molnar wrote:
>> * Avi Kivity <[email protected]> wrote:
>>
>>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>>> Add a general per-cpu notifier that is called whenever the kernel is
>>>> about to return to userspace. The notifier uses a thread_info flag
>>>> and existing checks, so there is no impact on user return or context
>>>> switch fast paths.
>>> Ingo/Peter?
>>
>> Would be nice to convert some existing open-coded
>> return-to-user-space logic to this facility. One such candidate would
>> be lockdep_sys_exit?
>>
>> Ingo
>
> Sorry, limited bandwidth due to LinuxCon, but I like the concept, and
> the previous (partial) patch was really clean. I agree with Ingo that
> arch support so we can use this as a general facility would be nice,
> but I don't consider that as a prerequisite for merging.
Re-ping?
If accepted, please merge just the core patch and I will carry all of
them in parallel. Once tip is merged I'll drop my copy of the first patch.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On 10/01/2009 08:21 AM, Avi Kivity wrote:
>
> Re-ping?
>
> If accepted, please merge just the core patch and I will carry all of
> them in parallel. Once tip is merged I'll drop my copy of the first patch.
>
I was just talking to Ingo about this... would you consider this .32 or
.33 material at this time? It came in awfully late for .32...
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
On 10/01/2009 05:25 PM, H. Peter Anvin wrote:
> On 10/01/2009 08:21 AM, Avi Kivity wrote:
>
>> Re-ping?
>>
>> If accepted, please merge just the core patch and I will carry all of
>> them in parallel. Once tip is merged I'll drop my copy of the first patch.
>>
>>
> I was just talking to Ingo about this... would you consider this .32 or
> .33 material at his time? It came in awfully late for .32...
>
While I'd prefer it to be in .32, as there's much churn in both trees, I
guess we should follow the process and schedule it for .33.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On 10/01/2009 08:30 AM, Avi Kivity wrote:
>
> While I'd prefer it to be in .32, as there's much churn in both trees, I
> guess we should follow the process and schedule it for .33.
>
OK, sounds like a plan. I'll pull it into -tip.
-hpa
On 09/16/2009 03:45 PM, Avi Kivity wrote:
> The various x86 syscall related MSRs (MSR_LSTAR and friends, EFER when SCE
> needs to be updated) are fairly expensive to read or write. Since different
> operating systems can set different values for these MSRs, KVM needs to reload
> them when switching to a different guest or to the host.
>
> Switching on every guest entry/exit is too expensive, so KVM reloads on
> guest preemption, which is a lot rarer. Even so, preemption based reload
> is suboptimal:
>
> - if we're switching to a kernel thread and back, there's no need to reload
> the MSRs. Examples of kernel threads we're likely to switch to are:
>
> - the idle task
> - a threaded interrupt handler
> - a kernel-mode virtio server (vhost-net)
>
> - if we're switching to a guest running the same OS, the MSRs will have the
> same values and there's no need to reload them
>
> - if the guest and host run the same OS, again the MSRs need not be reloaded.
>
> This patchset implements just-in-time reloads to defer them to the last
> possible instant. When we do reload, we check whether the values have in
> fact changed and reload conditionally.
>
> For the just-in-time reloads the first patch implements "user return
> notifiers", a callback invoked just before return to userspace. This has
> been written so that there is no code impact if KVM is not configured, and
> no runtime impact if KVM is not running.
>
> The patchset improves guest/idle/guest switches by about 2000 cycles.
>
>
I've applied this to kvm.git master.
--
error compiling committee.c: too many arguments to function
Commit-ID: 7c68af6e32c73992bad24107311f3433c89016e2
Gitweb: http://git.kernel.org/tip/7c68af6e32c73992bad24107311f3433c89016e2
Author: Avi Kivity <[email protected]>
AuthorDate: Sat, 19 Sep 2009 09:40:22 +0300
Committer: H. Peter Anvin <[email protected]>
CommitDate: Thu, 1 Oct 2009 12:12:18 -0700
core, x86: Add user return notifiers
Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace. The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.
This will be used initially to speed up KVM task switching by lazily
updating MSRs.
Signed-off-by: Avi Kivity <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/Kconfig | 10 +++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/thread_info.h | 7 +++-
arch/x86/kernel/process.c | 2 +
arch/x86/kernel/signal.c | 3 ++
include/linux/user-return-notifier.h | 42 +++++++++++++++++++++++++++++++
kernel/user-return-notifier.c | 46 ++++++++++++++++++++++++++++++++++
7 files changed, 109 insertions(+), 2 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 7f418bb..4e312ff 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -83,6 +83,13 @@ config KRETPROBES
def_bool y
depends on KPROBES && HAVE_KRETPROBES
+config USER_RETURN_NOTIFIER
+ bool
+ depends on HAVE_USER_RETURN_NOTIFIER
+ help
+ Provide a kernel-internal notification when a cpu is about to
+ switch to user mode.
+
config HAVE_IOREMAP_PROT
bool
@@ -126,4 +133,7 @@ config HAVE_DMA_API_DEBUG
config HAVE_DEFAULT_NO_SPIN_MUTEXES
bool
+config HAVE_USER_RETURN_NOTIFIER
+ bool
+
source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8da9374..1df175d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
select HAVE_ARCH_KMEMCHECK
+ select HAVE_USER_RETURN_NOTIFIER
config OUTPUT_FORMAT
string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2..375c917 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_FORK (1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+ (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
+ _TIF_USER_RETURN_NOTIFY)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
#define PREEMPT_ACTIVE 0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2..e51b056 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
+#include <linux/user-return-notifier.h>
#include <trace/events/power.h>
#include <asm/system.h>
#include <asm/apic.h>
@@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+ propagate_user_return_notify(prev_p, next_p);
}
int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76..c49f90f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (current->replacement_session_keyring)
key_replace_session_keyring();
}
+ if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
#ifdef CONFIG_X86_32
clear_thread_flag(TIF_IRET);
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 0000000..b6ac056
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+ void (*on_user_return)(struct user_return_notifier *urn);
+ struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+ struct task_struct *next)
+{
+ if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+ clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+ set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+ }
+}
+
+void fire_user_return_notifiers(void);
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+ struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+#endif
+
+#endif
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 0000000..530ccb8
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace. Must be
+ * called in atomic context. The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+ set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+ hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier. Must be called from atomic
+ * context, and from the same cpu registration occured in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+ hlist_del(&urn->link);
+ if (hlist_empty(&URN_LIST_HEAD))
+ clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+ struct user_return_notifier *urn;
+ struct hlist_node *tmp1, *tmp2;
+ struct hlist_head *head;
+
+ head = &get_cpu_var(return_notifier_list);
+ hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+ urn->on_user_return(urn);
+ put_cpu_var();
+}