Currently, kdump just makes all the logical processors leave VMX operation by
executing the VMXOFF instruction, so any VMCSs active on the logical processors
may be corrupted. But sometimes we need the VMCSs to debug guest images contained
in the host vmcore. To prevent the corruption, we should VMCLEAR the VMCSs before
executing the VMXOFF instruction.
This patch set provides a way to VMCLEAR the VMCSs related to guests on all cpus
before executing VMXOFF when doing kdump. This ensures that the VMCSs in the
vmcore are up to date and not corrupted.
Changelog from v5 to v6:
1. KEXEC: the atomic notifier list renamed:
crash_notifier_list --> vmclear_notifier_list
2. KVM-INTEL: provide empty functions if CONFIG_KEXEC is
not defined and remove unnecessary #ifdef's.
Changelog from v4 to v5:
1. use an atomic notifier instead of a function call, so
that all the vmclear code is in vmx.c.
Changelog from v3 to v4:
1. add a new percpu variable vmclear_skipped to skip
vmclear in kdump in some conditions.
Changelog from v2 to v3:
1. remove unnecessary conditions in function
cpu_emergency_clear_loaded_vmcss as Marcelo suggested.
Changelog from v1 to v2:
1. remove the sysctl and clear VMCSs unconditionally.
Zhang Yanfei (2):
x86/kexec: VMCLEAR vmcss on all cpus if necessary
KVM-INTEL: add a notifier and a bitmap to support VMCLEAR in kdump
arch/x86/include/asm/kexec.h | 2 +
arch/x86/kernel/crash.c | 25 +++++++++++++
arch/x86/kvm/vmx.c | 77 +++++++++++++++++++++++++++++++++++++++++-
3 files changed, 103 insertions(+), 1 deletions(-)
This patch adds an atomic notifier list named vmclear_notifier_list.
When the kvm-intel module is loaded, a notifier is registered in
the list so that the VMCSs loaded on all cpus can be VMCLEAR'd if
needed.
Signed-off-by: Zhang Yanfei <[email protected]>
---
arch/x86/include/asm/kexec.h | 2 ++
arch/x86/kernel/crash.c | 25 +++++++++++++++++++++++++
2 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff17..43e0db3 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,8 @@ struct kimage_arch {
};
#endif
+extern struct atomic_notifier_head vmclear_notifier_list;
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad899..c953d50 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,8 @@
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -30,6 +32,19 @@
int in_crash_kexec;
+/*
+ * The list is used to VMCLEAR vmcss loaded on all
+ * cpus. And when loading kvm_intel module, the
+ * vmclear function will be registered in the list.
+ */
+ATOMIC_NOTIFIER_HEAD(vmclear_notifier_list);
+EXPORT_SYMBOL_GPL(vmclear_notifier_list);
+
+static inline void cpu_emergency_vmclear_loaded_vmcss(void)
+{
+ atomic_notifier_call_chain(&vmclear_notifier_list, 0, NULL);
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +61,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
#endif
crash_save_cpu(regs, cpu);
+ /*
+ * VMCLEAR vmcss loaded on all cpus if needed.
+ */
+ cpu_emergency_vmclear_loaded_vmcss();
+
/* Disable VMX or SVM if needed.
*
* We need to disable virtualization on all CPUs.
@@ -88,6 +108,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
kdump_nmi_shootdown_cpus();
+ /*
+ * VMCLEAR vmcss loaded on this cpu if needed.
+ */
+ cpu_emergency_vmclear_loaded_vmcss();
+
/* Booting kdump kernel with VMX or SVM enabled won't work,
* because (among other limitations) we can't disable paging
* with the virt flags.
--
1.7.1
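The notifier is registered in vmclear_notifier_list when the kvm-intel
module is loaded. The bitmap indicates, per cpu, whether the VMCLEAR
operation should be done in kdump. A cpu's bit is set in hardware_enable()
once its loaded_vmcss_on_cpu list has been initialized, cleared while that
list is being modified (list_add/list_del) and set again afterwards, and
cleared in hardware_disable() before the cpu leaves VMX operation.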
Signed-off-by: Zhang Yanfei <[email protected]>
---
arch/x86/kvm/vmx.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 76 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ff0ab9..eea55b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -41,6 +41,7 @@
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
+#include <asm/kexec.h>
#include "trace.h"
@@ -963,6 +964,49 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+ cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+ cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+ return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void vmclear_local_loaded_vmcss(void);
+static int crash_vmclear_local_loaded_vmcss(struct notifier_block *this,
+ unsigned long val, void *ptr)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (crash_local_vmclear_enabled(cpu))
+ vmclear_local_loaded_vmcss();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block crash_vmclear_notifier = {
+ .notifier_call = crash_vmclear_local_loaded_vmcss,
+};
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
static void __loaded_vmcs_clear(void *arg)
{
struct loaded_vmcs *loaded_vmcs = arg;
@@ -972,8 +1016,10 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
+ crash_disable_local_vmclear(cpu);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
loaded_vmcs_init(loaded_vmcs);
+ crash_enable_local_vmclear(cpu);
}
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -1491,8 +1537,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
+ crash_disable_local_vmclear(cpu);
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
+ crash_enable_local_vmclear(cpu);
local_irq_enable();
/*
@@ -2302,6 +2350,18 @@ static int hardware_enable(void *garbage)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ /*
+ * Now we can enable the vmclear operation in kdump
+ * since the loaded_vmcss_on_cpu list on this cpu
+ * has been initialized.
+ *
+ * Though the cpu is not in VMX operation now, there
+ * is no problem to enable the vmclear operation
+ * for the loaded_vmcss_on_cpu list is empty!
+ */
+ crash_enable_local_vmclear(cpu);
+
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -2335,7 +2395,6 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-
/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
* tricks.
*/
@@ -2348,6 +2407,12 @@ static void hardware_disable(void *garbage)
{
if (vmm_exclusive) {
vmclear_local_loaded_vmcss();
+ /*
+ * vmclear operation in kdump should be disabled here
+ * because the cpu is going to exit VMX operation
+ * and the loaded_vmcss_on_cpu list may not be empty!
+ */
+ crash_disable_local_vmclear(raw_smp_processor_id());
kvm_cpu_vmxoff();
}
write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -7230,6 +7295,11 @@ static int __init vmx_init(void)
if (r)
goto out3;
+#ifdef CONFIG_KEXEC
+ atomic_notifier_chain_register(&vmclear_notifier_list,
+ &crash_vmclear_notifier);
+#endif
+
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7265,6 +7335,11 @@ static void __exit vmx_exit(void)
free_page((unsigned long)vmx_io_bitmap_b);
free_page((unsigned long)vmx_io_bitmap_a);
+#ifdef CONFIG_KEXEC
+ atomic_notifier_chain_unregister(&vmclear_notifier_list,
+ &crash_vmclear_notifier);
+#endif
+
kvm_exit();
}
--
1.7.1
On Wed, Nov 21, 2012 at 11:27:19PM +0800, Zhang Yanfei wrote:
> The notifier will be registered in vmclear_notifier_list when loading
> kvm-intel module. And the bitmap indicates whether we should do
> VMCLEAR operation in kdump. The bits in the bitmap are set/unset
> according to different conditions.
>
> Signed-off-by: Zhang Yanfei <[email protected]>
> ---
> arch/x86/kvm/vmx.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 76 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 4ff0ab9..eea55b3 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -41,6 +41,7 @@
> #include <asm/i387.h>
> #include <asm/xcr.h>
> #include <asm/perf_event.h>
> +#include <asm/kexec.h>
>
> #include "trace.h"
>
> @@ -963,6 +964,49 @@ static void vmcs_load(struct vmcs *vmcs)
> vmcs, phys_addr);
> }
>
> +#ifdef CONFIG_KEXEC
> +/*
> + * This bitmap is used to indicate whether the vmclear
> + * operation is enabled on all cpus. All disabled by
> + * default.
> + */
> +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
> +
> +static inline void crash_enable_local_vmclear(int cpu)
> +{
> + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
> +}
> +
> +static inline void crash_disable_local_vmclear(int cpu)
> +{
> + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
> +}
> +
> +static inline int crash_local_vmclear_enabled(int cpu)
> +{
> + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
> +}
> +
> +static void vmclear_local_loaded_vmcss(void);
> +static int crash_vmclear_local_loaded_vmcss(struct notifier_block *this,
> + unsigned long val, void *ptr)
> +{
> + int cpu = raw_smp_processor_id();
> +
> + if (crash_local_vmclear_enabled(cpu))
> + vmclear_local_loaded_vmcss();
> +
> + return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block crash_vmclear_notifier = {
> + .notifier_call = crash_vmclear_local_loaded_vmcss,
> +};
> +#else
> +static inline void crash_enable_local_vmclear(int cpu) { }
> +static inline void crash_disable_local_vmclear(int cpu) { }
> +#endif /* CONFIG_KEXEC */
> +
> static void __loaded_vmcs_clear(void *arg)
> {
> struct loaded_vmcs *loaded_vmcs = arg;
> @@ -972,8 +1016,10 @@ static void __loaded_vmcs_clear(void *arg)
> return; /* vcpu migration can race with cpu offline */
> if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
> per_cpu(current_vmcs, cpu) = NULL;
> + crash_disable_local_vmclear(cpu);
> list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
> loaded_vmcs_init(loaded_vmcs);
> + crash_enable_local_vmclear(cpu);
> }
>
> static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
> @@ -1491,8 +1537,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>
> kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> local_irq_disable();
> + crash_disable_local_vmclear(cpu);
> list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
> &per_cpu(loaded_vmcss_on_cpu, cpu));
> + crash_enable_local_vmclear(cpu);
> local_irq_enable();
>
> /*
> @@ -2302,6 +2350,18 @@ static int hardware_enable(void *garbage)
> return -EBUSY;
>
> INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
> +
> + /*
> + * Now we can enable the vmclear operation in kdump
> + * since the loaded_vmcss_on_cpu list on this cpu
> + * has been initialized.
> + *
> + * Though the cpu is not in VMX operation now, there
> + * is no problem to enable the vmclear operation
> + * for the loaded_vmcss_on_cpu list is empty!
> + */
> + crash_enable_local_vmclear(cpu);
> +
> rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
>
> test_bits = FEATURE_CONTROL_LOCKED;
> @@ -2335,7 +2395,6 @@ static void vmclear_local_loaded_vmcss(void)
> __loaded_vmcs_clear(v);
> }
>
> -
> /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
> * tricks.
> */
> @@ -2348,6 +2407,12 @@ static void hardware_disable(void *garbage)
> {
> if (vmm_exclusive) {
> vmclear_local_loaded_vmcss();
> + /*
> + * vmclear operation in kdump should be disabled here
> + * because the cpu is going to exit VMX operation
> + * and the loaded_vmcss_on_cpu list may not be empty!
> + */
> + crash_disable_local_vmclear(raw_smp_processor_id());
> kvm_cpu_vmxoff();
How come it's not empty? vmclear_local_loaded_vmcss cleared it, didn't it?