From: Lai Jiangshan <[email protected]>
All are just cleanups, independent of each other except for the last 3
patches.
Lai Jiangshan (15):
KVM: VMX: Use x86 core API to access to fs_base and inactive gs_base
KVM: VMX: Avoid rdmsrl(MSR_IA32_SYSENTER_ESP)
KVM: VMX: Update msr value after kvm_set_user_return_msr() succeeds
KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()
KVM: VMX: Add document to state that write to uret msr should always
be intercepted
KVM: VMX: Use kvm_set_msr_common() for MSR_IA32_TSC_ADJUST in the
default way
KVM: VMX: Change comments about vmx_get_msr()
KVM: SVM: Rename get_max_npt_level() to get_npt_level()
KVM: SVM: Allocate sd->save_area with __GFP_ZERO
KVM: X86: Skip allocating pae_root for vcpu->arch.guest_mmu when
!tdp_enabled
KVM: X86: Fix comment in __kvm_mmu_create()
KVM: X86: Remove unused declaration of __kvm_mmu_free_some_pages()
KVM: X86: Remove useless code to set role.gpte_is_8_bytes when
role.direct
KVM: X86: Calculate quadrant when !role.gpte_is_8_bytes
KVM: X86: Always set gpte_is_8_bytes when direct map
Documentation/virt/kvm/mmu.rst | 2 +-
arch/x86/include/asm/kvm_host.h | 11 ------
arch/x86/kernel/process_64.c | 2 ++
arch/x86/kvm/mmu/mmu.c | 12 ++++---
arch/x86/kvm/svm/svm.c | 12 +++----
arch/x86/kvm/vmx/nested.c | 8 +----
arch/x86/kvm/vmx/vmx.c | 64 +++++++++++++++++++--------------
7 files changed, 53 insertions(+), 58 deletions(-)
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
And they use FSGSBASE instructions when enabled.
Cc: [email protected]
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 10 ----------
arch/x86/kernel/process_64.c | 2 ++
arch/x86/kvm/vmx/vmx.c | 14 +++++++-------
3 files changed, 9 insertions(+), 17 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1fcb345bc107..4cbb402f5636 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1808,16 +1808,6 @@ static inline void kvm_load_ldt(u16 sel)
asm("lldt %0" : : "rm"(sel));
}
-#ifdef CONFIG_X86_64
-static inline unsigned long read_msr(unsigned long msr)
-{
- u64 value;
-
- rdmsrl(msr, value);
- return value;
-}
-#endif
-
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3402edec236c..296bd5c2e38b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -443,6 +443,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void)
return gsbase;
}
+EXPORT_SYMBOL_GPL(x86_gsbase_read_cpu_inactive);
void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
@@ -456,6 +457,7 @@ void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
}
}
+EXPORT_SYMBOL_GPL(x86_gsbase_write_cpu_inactive);
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3127c66a1651..48a34d1a2989 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1156,11 +1156,11 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
- fs_base = read_msr(MSR_FS_BASE);
- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ fs_base = x86_fsbase_read_cpu();
+ vmx->msr_host_kernel_gs_base = x86_gsbase_read_cpu_inactive();
}
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ x86_gsbase_write_cpu_inactive(vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -1184,7 +1184,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
++vmx->vcpu.stat.host_state_reload;
#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ vmx->msr_guest_kernel_gs_base = x86_gsbase_read_cpu_inactive();
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -1204,7 +1204,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ x86_gsbase_write_cpu_inactive(vmx->msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
vmx->guest_state_loaded = false;
@@ -1216,7 +1216,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
preempt_disable();
if (vmx->guest_state_loaded)
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ vmx->msr_guest_kernel_gs_base = x86_gsbase_read_cpu_inactive();
preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
@@ -1225,7 +1225,7 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
preempt_disable();
if (vmx->guest_state_loaded)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
+ x86_gsbase_write_cpu_inactive(data);
preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
The value of host MSR_IA32_SYSENTER_ESP is known to be constant for
each CPU: (cpu_entry_stack(cpu) + 1) when 32-bit syscall is enabled, or
NULL when 32-bit syscall is not enabled.
So rdmsrl() can be avoided for the first case and both rdmsrl() and
vmcs_writel() can be avoided for the second case.
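For reference, this holds because on x86_64 the host programs the MSR once
per CPU in syscall_init(); a simplified sketch of the relevant part (the
exact code may differ across kernel versions):

#ifdef CONFIG_IA32_EMULATION
	/* 32-bit syscall (SYSENTER from compat mode) is possible. */
	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
		    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
#else
	/* SYSENTER is never used, the MSR stays NULL. */
	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
#endif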
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 48a34d1a2989..4e4a33226edb 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1271,7 +1271,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
/*
* Flush all EPTP/VPID contexts, the new pCPU may have stale
@@ -1287,8 +1286,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
@@ -4021,6 +4023,8 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+ rdmsrl(MSR_IA32_SYSENTER_ESP, tmpl);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, tmpl);
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Avoid modifying msr->data before kvm_set_user_return_msr() succeeds, so
no rollback of the old value is needed on failure.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4e4a33226edb..fc7aa7f30ad5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -602,15 +602,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
- u64 old_msr_data = msr->data;
- msr->data = data;
if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
- if (ret)
- msr->data = old_msr_data;
}
+ if (!ret)
+ msr->data = data;
return ret;
}
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
The host CR3 in the vcpu thread can only be changed when scheduling,
so moving the code into vmx_prepare_switch_to_guest() makes the code
simpler.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/vmx/nested.c | 8 +-------
arch/x86/kvm/vmx/vmx.c | 17 ++++++++++-------
2 files changed, 11 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 2d9565b37fe0..d8d0dbc4fc18 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3028,7 +3028,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
bool vm_fail;
if (!nested_early_check)
@@ -3051,12 +3051,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fc7aa7f30ad5..e8a41fdc3c4d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1103,6 +1103,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
#endif
+ unsigned long cr3;
unsigned long fs_base, gs_base;
u16 fs_sel, gs_sel;
int i;
@@ -1167,6 +1168,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
#endif
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+
+ /* Host CR3 including its PCID is stable when guest state is loaded. */
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != host_state->cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ host_state->cr3 = cr3;
+ }
+
vmx->guest_state_loaded = true;
}
@@ -6590,7 +6599,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6634,12 +6643,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
And add a corresponding sanity check.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e8a41fdc3c4d..cd081219b668 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3703,13 +3703,21 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
+ /*
+ * Write to uret msr should always be intercepted due to the mechanism
+ * must know the current value. Sanity check to avoid any inadvertent
+ * mistake in coding.
+ */
+ if (WARN_ON_ONCE(vmx_find_uret_msr(vmx, msr) && (type & MSR_TYPE_W)))
+ return;
+
if (static_branch_unlikely(&enable_evmcs))
evmcs_touch_msr_bitmap();
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filters change.
- */
+ */
if (is_valid_passthrough_msr(msr)) {
int idx = possible_passthrough_msr_slot(msr);
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
MSR_IA32_TSC_ADJUST can be left to the default path, which also uses
kvm_set_msr_common().
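Concretely, the default branch of vmx_set_msr() already falls back to
kvm_set_msr_common() for MSRs that are not uret MSRs; roughly (simplified
sketch):

	default:
		msr = vmx_find_uret_msr(vmx, msr_index);
		if (msr)
			ret = vmx_set_guest_uret_msr(vmx, msr, data);
		else
			ret = kvm_set_msr_common(vcpu, msr_info);

MSR_IA32_TSC_ADJUST is not a uret MSR, so removing its explicit case does
not change behavior.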
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cd081219b668..a0efc1e74311 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2104,9 +2104,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
The parameter names have changed in the code; update the comment to match.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/vmx/vmx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a0efc1e74311..c6d9c50ea5d4 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1757,7 +1757,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
It returns the only proper NPT level, so the "max" in the name
is not appropriate.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/svm/svm.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 943da8a7d850..33b434fd5d9b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -265,7 +265,7 @@ u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static int get_max_npt_level(void)
+static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
@@ -1029,9 +1029,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
- /* Force VM NPT level equal to the host's max NPT level */
- kvm_configure_mmu(npt_enabled, get_max_npt_level(),
- get_max_npt_level(), PG_LEVEL_1G);
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
And remove clear_page() on it.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/svm/svm.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 33b434fd5d9b..d855ba664fc2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -585,12 +585,10 @@ static int svm_cpu_init(int cpu)
if (!sd)
return ret;
sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
+ sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!sd->save_area)
goto free_cpu_data;
- clear_page(page_address(sd->save_area));
-
ret = sev_cpu_init(sd);
if (ret)
goto free_save_area;
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
vcpu->arch.guest_mmu is never used when !tdp_enabled, so the pae_root
allocation can be skipped for it.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8f0035517450..e1fe8af874ef 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5501,6 +5501,10 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
The allocation of special roots was moved to mmu_alloc_special_roots();
update the comment to match.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e1fe8af874ef..64599fa333c4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5514,7 +5514,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* generally doesn't use PAE paging and can skip allocating the PDP
* table. The main exception, handled here, is SVM's 32-bit NPT. The
* other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
- * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
The body of __kvm_mmu_free_some_pages() has been removed, so remove the
now-unused declaration.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4cbb402f5636..eb6ef0209ee6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1760,7 +1760,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu);
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
role.gpte_is_8_bytes is unused when role.direct is set, so this code is
useless so far.
And role.gpte_is_8_bytes is going to be used to indicate whether the gpte
is 4 bytes even when role.direct is true under shadow paging, so this code
has to be removed in preparation.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 64599fa333c4..db8b64971f25 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2082,8 +2082,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu->mmu_role.base;
role.level = level;
role.direct = direct;
- if (role.direct)
- role.gpte_is_8_bytes = true;
role.access = access;
if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
role.quadrant is only valid when the gpte size is 4 bytes and should only
be calculated in that case.
Although "vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL" also means the
gpte size is 4 bytes, using "!role.gpte_is_8_bytes" is clearer.
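As a concrete example (numbers for 32-bit non-PAE gptes): a level-1 guest
page table holds 1024 4-byte gptes and covers 4 MiB, while a shadow page
holds 512 8-byte sptes and covers 2 MiB, so each guest table needs two
shadow pages and role.quadrant (one bit) selects which half; at level 2 the
ratio is 4, hence two quadrant bits.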
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index db8b64971f25..6948f2d696c3 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2083,7 +2083,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.level = level;
role.direct = direct;
role.access = access;
- if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (!direct_mmu && !role.gpte_is_8_bytes) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
When direct map, gpte_is_8_bytes has no meaning, but it is true for all
other cases except direct map when non-paging.
Setting gpte_is_8_bytes to true when non-paging can ensure that
!gpte_is_8_bytes means 32-bit gptes for shadow paging.
Signed-off-by: Lai Jiangshan <[email protected]>
---
Documentation/virt/kvm/mmu.rst | 2 +-
arch/x86/kvm/mmu/mmu.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst
index f60f5488e121..5d1086602759 100644
--- a/Documentation/virt/kvm/mmu.rst
+++ b/Documentation/virt/kvm/mmu.rst
@@ -179,7 +179,7 @@ Shadow pages contain the following information:
unpinned it will be destroyed.
role.gpte_is_8_bytes:
Reflects the size of the guest PTE for which the page is valid, i.e. '1'
- if 64-bit gptes are in use, '0' if 32-bit gptes are in use.
+ if direct map or 64-bit gptes are in use, '0' if 32-bit gptes are in use.
role.efer_nx:
Contains the value of efer.nx for which the page is valid.
role.cr0_wp:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6948f2d696c3..0c92cbc07320 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2083,7 +2083,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role.level = level;
role.direct = direct;
role.access = access;
- if (!direct_mmu && !role.gpte_is_8_bytes) {
+ if (!role.gpte_is_8_bytes) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
@@ -4777,7 +4777,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
- role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
+ role.base.gpte_is_8_bytes = !____is_cr0_pg(regs) || ____is_cr4_pae(regs);
return role;
}
--
2.19.1.6.gb485710b
On 11/18/21 12:08, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> When direct map, gpte_is_8_bytes has no meaning, but it is true for all
> other cases except direct map when non-paging.
>
> Setting gpte_is_8_bytes to true when non-paging can ensure that
> !gpte_is_8_bytes means 32-bit gptes for shadow paging.
Then the right thing to do would be to rename it to has_4_byte_gptes and
invert the direction. But as things stand, it's a bit more confusing to
make gpte_is_8_bytes=1 if there are no guest PTEs at all.
Paolo
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> Documentation/virt/kvm/mmu.rst | 2 +-
> arch/x86/kvm/mmu/mmu.c | 4 ++--
> 2 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst
> index f60f5488e121..5d1086602759 100644
> --- a/Documentation/virt/kvm/mmu.rst
> +++ b/Documentation/virt/kvm/mmu.rst
> @@ -179,7 +179,7 @@ Shadow pages contain the following information:
> unpinned it will be destroyed.
> role.gpte_is_8_bytes:
> Reflects the size of the guest PTE for which the page is valid, i.e. '1'
> - if 64-bit gptes are in use, '0' if 32-bit gptes are in use.
> + if direct map or 64-bit gptes are in use, '0' if 32-bit gptes are in use.
> role.efer_nx:
> Contains the value of efer.nx for which the page is valid.
> role.cr0_wp:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6948f2d696c3..0c92cbc07320 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2083,7 +2083,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> role.level = level;
> role.direct = direct;
> role.access = access;
> - if (!direct_mmu && !role.gpte_is_8_bytes) {
> + if (!role.gpte_is_8_bytes) {
> quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> role.quadrant = quadrant;
> @@ -4777,7 +4777,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
>
> role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
> role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
> - role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
> + role.base.gpte_is_8_bytes = !____is_cr0_pg(regs) || ____is_cr4_pae(regs);
>
> return role;
> }
>
On 11/18/21 12:08, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> The value of host MSR_IA32_SYSENTER_ESP is known to be constant for
> each CPU: (cpu_entry_stack(cpu) + 1) when 32-bit syscall is enabled, or
> NULL when 32-bit syscall is not enabled.
>
> So rdmsrl() can be avoided for the first case and both rdmsrl() and
> vmcs_writel() can be avoided for the second case.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
Then it's not constant host state, is it? The hunk in
vmx_set_constant_host_state seems wrong.
Paolo
> ---
> arch/x86/kvm/vmx/vmx.c | 10 +++++++---
> 1 file changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 48a34d1a2989..4e4a33226edb 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1271,7 +1271,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
>
> if (!already_loaded) {
> void *gdt = get_current_gdt_ro();
> - unsigned long sysenter_esp;
>
> /*
> * Flush all EPTP/VPID contexts, the new pCPU may have stale
> @@ -1287,8 +1286,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
> (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
> vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
>
> - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
> - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
> + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
> + /* 22.2.3 */
> + vmcs_writel(HOST_IA32_SYSENTER_ESP,
> + (unsigned long)(cpu_entry_stack(cpu) + 1));
> + }
>
> vmx->loaded_vmcs->cpu = cpu;
> }
> @@ -4021,6 +4023,8 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
>
> rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
> vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
> + rdmsrl(MSR_IA32_SYSENTER_ESP, tmpl);
> + vmcs_writel(HOST_IA32_SYSENTER_ESP, tmpl);
> rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
> vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
On 2021/11/18 19:18, Paolo Bonzini wrote:
> On 11/18/21 12:08, Lai Jiangshan wrote:
>> From: Lai Jiangshan <[email protected]>
>>
>> The value of host MSR_IA32_SYSENTER_ESP is known to be constant for
>> each CPU: (cpu_entry_stack(cpu) + 1) when 32-bit syscall is enabled, or
>> NULL when 32-bit syscall is not enabled.
>>
>> So rdmsrl() can be avoided for the first case and both rdmsrl() and
>> vmcs_writel() can be avoided for the second case.
>>
>> Signed-off-by: Lai Jiangshan <[email protected]>
>
> Then it's not constant host state, is it? The hunk in vmx_set_constant_host_state seems wrong.
>
The change in vmx_vcpu_load_vmcs() handles only the percpu constant case:
(cpu_entry_stack(cpu) + 1), it doesn't handle the case where
MSR_IA32_SYSENTER_ESP is NULL.
The change in vmx_set_constant_host_state() handles the case where
MSR_IA32_SYSENTER_ESP is NULL; it is indeed constant host state in that case.
If it is not the case, the added code in vmx_vcpu_load_vmcs() will override
it safely.
If an else branch with "vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);" is added to
vmx_vcpu_load_vmcs(), we will not need to change vmx_set_constant_host_state().
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ } else {
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+ }
I'm paranoid about the case where vmcs.HOST_IA32_SYSENTER_ESP is not reset to
NULL when the vcpu is reset. So the hunk in vmx_set_constant_host_state() or
the else branch in vmx_vcpu_load_vmcs() is required. I just chose to change
vmx_set_constant_host_state() so that we can avoid a vmcs_writel() in the
case where CONFIG_IA32_EMULATION is off on x86_64.
> Paolo
>
>> ---
>> arch/x86/kvm/vmx/vmx.c | 10 +++++++---
>> 1 file changed, 7 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 48a34d1a2989..4e4a33226edb 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -1271,7 +1271,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
>> if (!already_loaded) {
>> void *gdt = get_current_gdt_ro();
>> - unsigned long sysenter_esp;
>> /*
>> * Flush all EPTP/VPID contexts, the new pCPU may have stale
>> @@ -1287,8 +1286,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
>> (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
>> vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
>> - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
>> - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
>> + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
>> + /* 22.2.3 */
>> + vmcs_writel(HOST_IA32_SYSENTER_ESP,
>> + (unsigned long)(cpu_entry_stack(cpu) + 1));
>> + }
>> vmx->loaded_vmcs->cpu = cpu;
>> }
>> @@ -4021,6 +4023,8 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
>> rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
>> vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
>> + rdmsrl(MSR_IA32_SYSENTER_ESP, tmpl);
>> + vmcs_writel(HOST_IA32_SYSENTER_ESP, tmpl);
>> rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
>> vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
>
On 2021/11/18 19:12, Paolo Bonzini wrote:
> On 11/18/21 12:08, Lai Jiangshan wrote:
>> From: Lai Jiangshan <[email protected]>
>>
>> When direct map, gpte_is_8_bytes has no meaning, but it is true for all
> other cases except direct map when non-paging.
>>
> Setting gpte_is_8_bytes to true when non-paging can ensure that
>> !gpte_is_8_bytes means 32-bit gptes for shadow paging.
>
> Then the right thing to do would be to rename it to has_4_byte_gptes and invert the direction. But as things stand,
> it's a bit more confusing to make gpte_is_8_bytes=1 if there are no guest PTEs at all.
>
I will make the last 3 patches a separate patchset and will do the rename.
On 11/18/21 15:17, Lai Jiangshan wrote:
>
> The change in vmx_vcpu_load_vmcs() handles only the percpu constant case:
> (cpu_entry_stack(cpu) + 1), it doesn't handle the case where
> MSR_IA32_SYSENTER_ESP is NULL.
>
> The change in vmx_set_constant_host_state() handles the case where
> MSR_IA32_SYSENTER_ESP is NULL; it is indeed constant host state in that case.
> If it is not the case, the added code in vmx_vcpu_load_vmcs() will override
> it safely.
>
> If an else branch with "vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);" is
> added to
> vmx_vcpu_load_vmcs(), we will not need to change
> vmx_set_constant_host_state().
We can change vmx_set_constant_host_state to write 0.
Paolo
On 11/18/21 15:34, Lai Jiangshan wrote:
>
>
> On 2021/11/18 19:12, Paolo Bonzini wrote:
>> On 11/18/21 12:08, Lai Jiangshan wrote:
>>> From: Lai Jiangshan <[email protected]>
>>>
>>> When direct map, gpte_is_8_bytes has no meaning, but it is true for all
>> other cases except direct map when non-paging.
>>>
>> Setting gpte_is_8_bytes to true when non-paging can ensure that
>>> !gpte_is_8_bytes means 32-bit gptes for shadow paging.
>>
>> Then the right thing to do would be to rename it to has_4_byte_gptes
>> and invert the direction. But as things stand, it's a bit more
>> confusing to make gpte_is_8_bytes=1 if there are no guest PTEs at all.
>>
>
> I will make the last 3 patches a separate patchset and will do the
> rename.
Patches 13 and 14 are fine actually.
Paolo
On 11/18/21 12:08, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> And adds a corresponding sanity check code.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> arch/x86/kvm/vmx/vmx.c | 10 +++++++++-
> 1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index e8a41fdc3c4d..cd081219b668 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3703,13 +3703,21 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
> if (!cpu_has_vmx_msr_bitmap())
> return;
>
> + /*
> + * Write to uret msr should always be intercepted due to the mechanism
> > + * must know the current value. Sanity check to avoid any inadvertent
> + * mistake in coding.
> + */
> + if (WARN_ON_ONCE(vmx_find_uret_msr(vmx, msr) && (type & MSR_TYPE_W)))
> + return;
> +
I'm not sure about this one, it's relatively expensive to call
vmx_find_uret_msr.
User-return MSRs and disable-intercept MSRs are almost the opposite:
uret is for MSRs that the host (not even the processor) never uses,
disable-intercept is for MSRs that the guest reads/writes often. As
such it seems almost impossible that they overlap.
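For context, vmx_find_uret_msr() boils down to a linear scan over the
user-return MSR list; roughly (simplified sketch):

	int kvm_find_user_return_msr(u32 msr)
	{
		int i;

		/* Linear scan, done here on every intercept toggle. */
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (kvm_uret_msrs_list[i] == msr)
				return i;
		}
		return -1;
	}

	struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
	{
		int i = kvm_find_user_return_msr(msr);

		return i >= 0 ? &vmx->guest_uret_msrs[i] : NULL;
	}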
Paolo
On 11/18/21 12:08, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> And they use FSGSBASE instructions when enabled.
>
> Cc: [email protected]
> Signed-off-by: Lai Jiangshan <[email protected]>
This needs ACK from x86 maintainers.
I queued 2-4 and 6-14.
Paolo
> ---
> arch/x86/include/asm/kvm_host.h | 10 ----------
> arch/x86/kernel/process_64.c | 2 ++
> arch/x86/kvm/vmx/vmx.c | 14 +++++++-------
> 3 files changed, 9 insertions(+), 17 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 1fcb345bc107..4cbb402f5636 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1808,16 +1808,6 @@ static inline void kvm_load_ldt(u16 sel)
> asm("lldt %0" : : "rm"(sel));
> }
>
> -#ifdef CONFIG_X86_64
> -static inline unsigned long read_msr(unsigned long msr)
> -{
> - u64 value;
> -
> - rdmsrl(msr, value);
> - return value;
> -}
> -#endif
> -
> static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
> {
> kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 3402edec236c..296bd5c2e38b 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -443,6 +443,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void)
>
> return gsbase;
> }
> +EXPORT_SYMBOL_GPL(x86_gsbase_read_cpu_inactive);
>
> void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
> {
> @@ -456,6 +457,7 @@ void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
> wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
> }
> }
> +EXPORT_SYMBOL_GPL(x86_gsbase_write_cpu_inactive);
>
> unsigned long x86_fsbase_read_task(struct task_struct *task)
> {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 3127c66a1651..48a34d1a2989 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1156,11 +1156,11 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> } else {
> savesegment(fs, fs_sel);
> savesegment(gs, gs_sel);
> - fs_base = read_msr(MSR_FS_BASE);
> - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
> + fs_base = x86_fsbase_read_cpu();
> + vmx->msr_host_kernel_gs_base = x86_gsbase_read_cpu_inactive();
> }
>
> - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
> + x86_gsbase_write_cpu_inactive(vmx->msr_guest_kernel_gs_base);
> #else
> savesegment(fs, fs_sel);
> savesegment(gs, gs_sel);
> @@ -1184,7 +1184,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
> ++vmx->vcpu.stat.host_state_reload;
>
> #ifdef CONFIG_X86_64
> - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
> + vmx->msr_guest_kernel_gs_base = x86_gsbase_read_cpu_inactive();
> #endif
> if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
> kvm_load_ldt(host_state->ldt_sel);
> @@ -1204,7 +1204,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
> #endif
> invalidate_tss_limit();
> #ifdef CONFIG_X86_64
> - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
> + x86_gsbase_write_cpu_inactive(vmx->msr_host_kernel_gs_base);
> #endif
> load_fixmap_gdt(raw_smp_processor_id());
> vmx->guest_state_loaded = false;
> @@ -1216,7 +1216,7 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
> {
> preempt_disable();
> if (vmx->guest_state_loaded)
> - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
> + vmx->msr_guest_kernel_gs_base = x86_gsbase_read_cpu_inactive();
> preempt_enable();
> return vmx->msr_guest_kernel_gs_base;
> }
> @@ -1225,7 +1225,7 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
> {
> preempt_disable();
> if (vmx->guest_state_loaded)
> - wrmsrl(MSR_KERNEL_GS_BASE, data);
> + x86_gsbase_write_cpu_inactive(data);
> preempt_enable();
> vmx->msr_guest_kernel_gs_base = data;
> }
>
On 2021/11/18 23:01, Paolo Bonzini wrote:
> On 11/18/21 15:34, Lai Jiangshan wrote:
>>
>>
>> On 2021/11/18 19:12, Paolo Bonzini wrote:
>>> On 11/18/21 12:08, Lai Jiangshan wrote:
>>>> From: Lai Jiangshan <[email protected]>
>>>>
>>>> When direct map, gpte_is_8_bytes has no meaning, but it is true for all
>>>> other cases except direct map when non-paging.
>>>>
>>>> Setting gpte_is_8_bytes to true when non-paging can ensure that
>>>> !gpte_is_8_bytes means 32-bit gptes for shadow paging.
>>>
>>> Then the right thing to do would be to rename it to has_4_byte_gptes and invert the direction. But as things stand,
>>> it's a bit more confusing to make gpte_is_8_bytes=1 if there are no guest PTEs at all.
>>>
>>
>> I will make the last 3 patches a separate patchset and will do the rename.
>
> Patches 13 and 14 are fine actually.
>
Hello
Since 13 and 14 are queued, could you also queue this one? I will do the rename
separately in the next patchset. I found that the intent of this patch gets
hidden in the lengthier squashed patch (this patch plus the renaming patch).
Thanks
Lai
On 11/19/21 11:30, Lai Jiangshan wrote:
>>
>
> Hello
>
> Since 13, and 14 is queued, could you also queue this one and I will
> do the rename separately in the next patchset. I found that the
> intent of this patch is hidden in the lengthened squashed patch (of
> this patch and the renaming patch).
Then you can do the renaming first?
Paolo
On 2021/11/19 18:34, Paolo Bonzini wrote:
> On 11/19/21 11:30, Lai Jiangshan wrote:
>>>
>>
>> Hello
>>
>> Since 13, and 14 is queued, could you also queue this one and I will
>> do the rename separately in the next patchset. I found that the
>> intent of this patch is hidden in the lengthened squashed patch (of
>> this patch and the renaming patch).
>
> Then you can do the renaming first?
>
Then I would prefer to make the code match the name for the first time
when the name is introduced, which means it would be a squashed patch.
I will not fuss about it and will send the squashed patch.
Lai,
On Thu, Nov 18 2021 at 19:08, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> And they use FSGSBASE instructions when enabled.
That's really not a proper explanation for adding yet more exports.
Thanks,
tglx
On 2021/11/21 23:17, Thomas Gleixner wrote:
> Lai,
>
> On Thu, Nov 18 2021 at 19:08, Lai Jiangshan wrote:
>> From: Lai Jiangshan <[email protected]>
>>
>> And they use FSGSBASE instructions when enabled.
>
> That's really not a proper explanation for adding yet more exports.
>
Hello
---
When a vCPU thread is rescheduled, 1 rdmsr and 2 wrmsr are issued for
MSR_KERNEL_GS_BASE.
In the scheduler, the core kernel uses x86_gsbase_[read|write]_cpu_inactive()
to accelerate access to the inactive GSBASE, but when the scheduler calls
into KVM's preemption notifier, {rd|wr}msr(MSR_KERNEL_GS_BASE) is used.
To make the way KVM accesses the inactive GSBASE consistent with the
scheduler, KVM is changed to use the x86 core API to access fs_base and the
inactive gs_base. These helpers use FSGSBASE instructions when enabled.
It would add 2 more exports, but it doesn't export any extra software or
hardware resources, since the same state can already be accessed via
{rd|wr}msr.
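For reference, the core helper roughly does the following when FSGSBASE is
available (simplified from arch/x86/kernel/process_64.c):

	unsigned long x86_gsbase_read_cpu_inactive(void)
	{
		unsigned long gsbase;

		if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
			unsigned long flags;

			/* SWAPGS + RDGSBASE + SWAPGS, with IRQs off. */
			local_irq_save(flags);
			gsbase = __rdgsbase_inactive();
			local_irq_restore(flags);
		} else {
			rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
		}

		return gsbase;
	}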
---
Not so persuasive. If it needs to be accelerated in the preemption notifier,
there are some other more aggressive ways.
Thanks
Lai
On Thu, Nov 18, 2021, Paolo Bonzini wrote:
> On 11/18/21 12:08, Lai Jiangshan wrote:
> > From: Lai Jiangshan <[email protected]>
> >
> > And adds a corresponding sanity check code.
> >
> > Signed-off-by: Lai Jiangshan <[email protected]>
> > ---
> > arch/x86/kvm/vmx/vmx.c | 10 +++++++++-
> > 1 file changed, 9 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index e8a41fdc3c4d..cd081219b668 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -3703,13 +3703,21 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
> > if (!cpu_has_vmx_msr_bitmap())
> > return;
> > + /*
> > + * Write to uret msr should always be intercepted due to the mechanism
> > + * must know the current value. Sanity check to avoid any inadvertent
> > + * mistake in coding.
> > + */
> > + if (WARN_ON_ONCE(vmx_find_uret_msr(vmx, msr) && (type & MSR_TYPE_W)))
> > + return;
> > +
>
> I'm not sure about this one, it's relatively expensive to call
> vmx_find_uret_msr.
>
> User-return MSRs and disable-intercept MSRs are almost the opposite: uret is
> for MSRs that the host (not even the processor) never uses,
> disable-intercept is for MSRs that the guest reads/writes often. As such it
> seems almost impossible that they overlap.
And they aren't fundamentally mutually exclusive, e.g. KVM could pass-through an
MSR and then do RDMSR in vmx_prepare_switch_to_host() to refresh the uret data
with the current (guest) value. It'd be silly, but it would work.
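A purely hypothetical sketch of that idea (vmx_msr_is_passthrough() is an
invented helper; current KVM does not do this):

	/* Hypothetical: if writes to a uret MSR were ever pass-through, the
	 * cached value would have to be refreshed from hardware before the
	 * user-return machinery relies on it. */
	if (msr->load_into_hardware && vmx_msr_is_passthrough(vmx, msr_index))
		rdmsrl(msr_index, msr->data);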