From: Isaku Yamahata <[email protected]>
Hello. I've updated the patch series following Michael's suggestion.
Here are the discussion points.
- 4/6 KVM: x86: Introduce fault type to indicate kvm page fault is private
The moot point is where to put the logic to determine the fault is private.
- The caller site of kvm_mmu_do_page_fault() (or kvm_mmu_page_fault()).
The v1 patch approach.
- npf_interception()
- handle_ept_violation(), handle_ept_misconfig() and TDX counter part.
- Open code in kvm_get_fault_type()
- 5/6 KVM: Add flags to struct kvm_gfn_range
I think this patch doesn't interfere with the SNP case because it passes more
info to the callback.
- 6/6 KVM: x86: Add is_vm_type_supported callback
Let's introduce KVM_X86_TDX_VM for TDX. And revise KVM_X86_PROTECTED_VM.
I won't address the name in this patch series.
Thanks,
Hello. This is an RFC patch series based on KVM gmem [1] and [2]
for the common use of TDX and SEV-SNP. I'd like to discuss three items.
* Flags for kvm_gfn_range:
Roth, By adding flags to kvm_gfn_range, kvm_arch_gmem_invalidate() won't be
needed. Maybe x86 gmem_invalidate callback is still needed, though.
* Page fault error code or how to figure out the private page fault
There is an issue to set kvm_page_fault.is_private. I came up with two new
error codes. Is this a way or any other way?
* VM type: Now we have KVM_X86_PROTECTED_VM. How do we proceed?
- Keep KVM_X86_PROTECTED_VM for its use. Introduce KVM_X86_TDX_VM
- Use KVM_X86_PROTECTED_VM for TDX. (If necessary, introduce another type in
the future)
- any other way?
[1] KVM gmem patches
https://github.com/sean-jc/linux/tree/x86/kvm_gmem_solo
[2] Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support
https://lore.kernel.org/lkml/[email protected]/
Isaku Yamahata (6):
KVM: selftests: Fix test_add_overlapping_private_memory_regions()
KVM: selftests: Fix guest_memfd()
KVM: x86/mmu: Pass round full 64-bit error code for the KVM page fault
KVM: x86: Introduce fault type to indicate kvm page fault is private
KVM: Add flags to struct kvm_gfn_range
KVM: x86: Add is_vm_type_supported callback
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 7 ++++
arch/x86/include/uapi/asm/kvm.h | 1 +
arch/x86/kvm/mmu/mmu.c | 31 +++++++++++-----
arch/x86/kvm/mmu/mmu_internal.h | 37 +++++++++++++++++--
arch/x86/kvm/mmu/mmutrace.h | 2 +-
arch/x86/kvm/mmu/paging_tmpl.h | 4 +-
arch/x86/kvm/svm/svm.c | 7 ++++
arch/x86/kvm/vmx/vmx.c | 6 +++
arch/x86/kvm/x86.c | 10 ++++-
arch/x86/kvm/x86.h | 2 +
include/linux/kvm_host.h | 10 ++++-
.../testing/selftests/kvm/guest_memfd_test.c | 4 +-
.../selftests/kvm/set_memory_region_test.c | 16 +++++++-
virt/kvm/guest_mem.c | 9 +++--
virt/kvm/kvm_main.c | 4 +-
16 files changed, 125 insertions(+), 26 deletions(-)
base-commit: be8abcec83c87d4e15ae04816b685fe260c4bcfd
--
2.25.1
From: Isaku Yamahata <[email protected]>
For TDX, allow the backend to override the supported VM type. Add
KVM_X86_TDX_VM to reserve the bit.
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v1 -> v2
- no change
---
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/uapi/asm/kvm.h | 1 +
arch/x86/kvm/svm/svm.c | 7 +++++++
arch/x86/kvm/vmx/vmx.c | 6 ++++++
arch/x86/kvm/x86.c | 10 +++++++++-
arch/x86/kvm/x86.h | 2 ++
7 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 13bc212cd4bc..c0143906fe6d 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -20,6 +20,7 @@ KVM_X86_OP(hardware_disable)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
+KVM_X86_OP(is_vm_type_supported)
KVM_X86_OP(vm_init)
KVM_X86_OP_OPTIONAL(vm_destroy)
KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5afeefc7a516..6adcc6be6466 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1549,6 +1549,7 @@ struct kvm_x86_ops {
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
+ bool (*is_vm_type_supported)(unsigned long vm_type);
unsigned int vm_size;
int (*vm_init)(struct kvm *kvm);
void (*vm_destroy)(struct kvm *kvm);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 6afbfbb32d56..53d382b3b423 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -561,5 +561,6 @@ struct kvm_pmu_event_filter {
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_PROTECTED_VM 1
+#define KVM_X86_TDX_VM 2
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index eb308c9994f9..e9ed8729f63b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4756,6 +4756,12 @@ static void svm_vm_destroy(struct kvm *kvm)
sev_vm_destroy(kvm);
}
+static bool svm_is_vm_type_supported(unsigned long type)
+{
+ /* FIXME: Check if CPU is capable of SEV. */
+ return __kvm_is_vm_type_supported(type);
+}
+
static int svm_vm_init(struct kvm *kvm)
{
if (!pause_filter_count || !pause_filter_thresh)
@@ -4784,6 +4790,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_free = svm_vcpu_free,
.vcpu_reset = svm_vcpu_reset,
+ .is_vm_type_supported = svm_is_vm_type_supported,
.vm_size = sizeof(struct kvm_svm),
.vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 44fb619803b8..b5394ba8cb9c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7469,6 +7469,11 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
return err;
}
+static bool vmx_is_vm_type_supported(unsigned long type)
+{
+ return __kvm_is_vm_type_supported(type);
+}
+
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
@@ -8138,6 +8143,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hardware_disable = vmx_hardware_disable,
.has_emulated_msr = vmx_has_emulated_msr,
+ .is_vm_type_supported = vmx_is_vm_type_supported,
.vm_size = sizeof(struct kvm_vmx),
.vm_init = vmx_vm_init,
.vm_destroy = vmx_vm_destroy,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c9e1c9369be2..b5f865f39a00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4418,12 +4418,18 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
return 0;
}
-static bool kvm_is_vm_type_supported(unsigned long type)
+bool __kvm_is_vm_type_supported(unsigned long type)
{
return type == KVM_X86_DEFAULT_VM ||
(type == KVM_X86_PROTECTED_VM &&
IS_ENABLED(CONFIG_KVM_PROTECTED_VM) && tdp_enabled);
}
+EXPORT_SYMBOL_GPL(__kvm_is_vm_type_supported);
+
+static bool kvm_is_vm_type_supported(unsigned long type)
+{
+ return static_call(kvm_x86_is_vm_type_supported)(type);
+}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
@@ -4618,6 +4624,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = BIT(KVM_X86_DEFAULT_VM);
if (kvm_is_vm_type_supported(KVM_X86_PROTECTED_VM))
r |= BIT(KVM_X86_PROTECTED_VM);
+ if (kvm_is_vm_type_supported(KVM_X86_TDX_VM))
+ r |= BIT(KVM_X86_TDX_VM);
break;
default:
break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c544602d07a3..7d5aa8f0571a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -9,6 +9,8 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+bool __kvm_is_vm_type_supported(unsigned long type);
+
struct kvm_caps {
/* control of guest tsc rate supported? */
bool has_tsc_control;
--
2.25.1
From: Isaku Yamahata <[email protected]>
Some test cases should succeed. Check !ret instead of ret.
Fixes: 36eedd5b91e3 ("KVM: selftests: Add basic selftest for guest_memfd()")
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v1 -> v2:
- no change
---
tools/testing/selftests/kvm/guest_memfd_test.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index 3b6532b833b2..f3b99c1e5464 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -72,11 +72,11 @@ static void test_fallocate(int fd, size_t page_size, size_t total_size)
ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
total_size, page_size);
- TEST_ASSERT(ret, "fallocate(PUNCH_HOLE) at total_size should be fine (no-op)");
+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) at total_size should be fine (no-op)");
ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
total_size + page_size, page_size);
- TEST_ASSERT(ret, "fallocate(PUNCH_HOLE) after total_size should be fine (no-op)");
+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) after total_size should be fine (no-op)");
ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
page_size, page_size - 1);
--
2.25.1
From: Isaku Yamahata <[email protected]>
Introduce kvm fault type to indicate how to handle kvm page fault.
It is unfortunate and inflexible for kvm_mmu_do_page_fault() to call
kvm_mem_is_private(), eventually looking up memory attributes. Later
__kvm_faultin_pfn() looks up memory attributes again. There is a race
condition that other threads can change memory attributes due to not
gaining the mmu lock. SEV-SNP and TDX define their own way to indicate that
the page fault is private.
Add KVM fault type, add mmu_private_fault_mask to struct kvm_arch for SNP
to determine the fault is private, add gfn_shared_mask to struct kvm_arch
for TDX to determine the fault is private. KVM_FAULT_SHARED_ALWAYS is added
for the conventional guest to avoid the overhead of looking up memory attributes.
Suggested-by: Michael Roth <[email protected]>
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v1 -> v2:
- Introduced fault type and replaced is_private with fault_type.
- Add kvm_get_fault_type() to encapsulate the difference.
---
arch/x86/include/asm/kvm_host.h | 6 ++++++
arch/x86/kvm/mmu/mmu.c | 26 ++++++++++++++++++++------
arch/x86/kvm/mmu/mmu_internal.h | 33 +++++++++++++++++++++++++++++++--
3 files changed, 57 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8ae131dc645d..5afeefc7a516 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1445,6 +1445,12 @@ struct kvm_arch {
*/
#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
struct kvm_mmu_memory_cache split_desc_cache;
+
+#ifdef CONFIG_KVM_PROTECTED_VM
+ /* To make the patch compile. */
+ u64 mmu_private_fault_mask;
+ gfn_t gfn_shared_mask;
+#endif
};
struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b8ba7f11c3cb..feec75515f39 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3174,10 +3174,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot,
- gfn_t gfn, int max_level, bool is_private)
+ gfn_t gfn, int max_level,
+ enum kvm_fault_type fault_type)
{
struct kvm_lpage_info *linfo;
int host_level;
+ bool is_private = fault_type == KVM_FAULT_PRIVATE;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -3228,7 +3230,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
*/
fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
fault->gfn, fault->max_level,
- fault->is_private);
+ fault->fault_type);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@@ -4328,7 +4330,7 @@ static int kvm_do_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
- if (fault->is_private)
+ if (fault->fault_type == KVM_FAULT_PRIVATE)
vcpu->run->memory.flags = KVM_MEMORY_EXIT_FLAG_PRIVATE;
else
vcpu->run->memory.flags = 0;
@@ -4386,10 +4388,22 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_EMULATE;
}
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
- return kvm_do_memory_fault_exit(vcpu, fault);
+ if (fault->fault_type == KVM_FAULT_SHARED_ALWAYS) {
+ /*
+ * The conventional case. Don't lookup memory attributes to
+ * avoid overhead
+ */
+ fault->fault_type = KVM_FAULT_SHARED;
+ } else if (fault->fault_type == KVM_FAULT_MEM_ATTR) {
+ fault->fault_type = kvm_mem_is_private(vcpu->kvm, fault->gfn) ?
+ KVM_FAULT_PRIVATE : KVM_FAULT_SHARED;
+ } else {
+ if ((fault->fault_type == KVM_FAULT_PRIVATE) !=
+ kvm_mem_is_private(vcpu->kvm, fault->gfn))
+ return kvm_do_memory_fault_exit(vcpu, fault);
+ }
- if (fault->is_private)
+ if (fault->fault_type == KVM_FAULT_PRIVATE)
return kvm_faultin_pfn_private(vcpu, fault);
async = false;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 7f9ec1e5b136..0ec0b927a391 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -188,6 +188,13 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
}
+enum kvm_fault_type {
+ KVM_FAULT_MEM_ATTR,
+ KVM_FAULT_SHARED,
+ KVM_FAULT_SHARED_ALWAYS,
+ KVM_FAULT_PRIVATE,
+};
+
struct kvm_page_fault {
/* arguments to kvm_mmu_do_page_fault. */
const gpa_t addr;
@@ -203,9 +210,10 @@ struct kvm_page_fault {
/* Derived from mmu and global state. */
const bool is_tdp;
- const bool is_private;
const bool nx_huge_page_workaround_enabled;
+ enum kvm_fault_type fault_type;
+
/*
* Whether a >4KB mapping can be created or is forbidden due to NX
* hugepages.
@@ -282,6 +290,27 @@ enum {
RET_PF_SPURIOUS,
};
+static inline enum kvm_fault_type kvm_get_fault_type(struct kvm *kvm,
+ gpa_t gpa, u64 err)
+{
+
+#ifdef CONFIG_KVM_PROTECTED_VM
+ /* SEV-SNP handling */
+ if (kvm->arch.mmu_private_fault_mask)
+ return (err & kvm->arch.mmu_private_fault_mask) ?
+ KVM_FAULT_PRIVATE : KVM_FAULT_SHARED;
+
+ /* TDX handling */
+ if (kvm->arch.gfn_shared_mask)
+ return (gpa_to_gfn(gpa) & kvm->arch.gfn_shared_mask) ?
+ KVM_FAULT_SHARED : KVM_FAULT_PRIVATE;
+#endif
+ if (kvm->arch.vm_type == KVM_X86_PROTECTED_VM)
+ return KVM_FAULT_MEM_ATTR;
+ /* Don't query memory attributes. */
+ return KVM_FAULT_SHARED_ALWAYS;
+}
+
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
u64 err, bool prefetch, int *emulation_type)
{
@@ -301,7 +330,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
- .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
+ .fault_type = kvm_get_fault_type(vcpu->kvm, cr2_or_gpa, err),
};
int r;
--
2.25.1
From: Isaku Yamahata <[email protected]>
Because the full 64-bit error code, or more info about the fault, for the
KVM page fault will be needed for protected VM, TDX and SEV-SNP, update
kvm_mmu_do_page_fault() to accept the 64-bit value so it can pass it to the
callbacks.
The upper 32 bits of error code are discarded at kvm_mmu_page_fault()
by lower_32_bits(). Now it's passed down as full 64 bits.
Because only FNAME(page_fault) depends on it, move lower_32_bits() into
FNAME(page_fault).
The accesses of fault->error_code are as follows
- FNAME(page_fault): change to explicitly use lower_32_bits()
- kvm_mmu_page_fault(): explicit mask with PFERR_RSVD_MASK,
PFERR_NESTED_GUEST_PAGE
- mmutrace: changed u32 -> u64
- pgprintk(): change %x -> %llx
No functional change is intended. This is a preparation to pass on more
info with page fault error code.
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v1 -> v2:
- no change
---
arch/x86/kvm/mmu/mmu.c | 5 ++---
arch/x86/kvm/mmu/mmu_internal.h | 4 ++--
arch/x86/kvm/mmu/mmutrace.h | 2 +-
arch/x86/kvm/mmu/paging_tmpl.h | 4 ++--
4 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index dc2b9a2f717c..b8ba7f11c3cb 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4510,7 +4510,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
- pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
+ pgprintk("%s: gva %llx error %llx\n", __func__, fault->addr, fault->error_code);
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
fault->max_level = PG_LEVEL_2M;
@@ -5820,8 +5820,7 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
}
if (r == RET_PF_INVALID) {
- r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false,
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
&emulation_type);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index f1786698ae00..7f9ec1e5b136 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -191,7 +191,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
struct kvm_page_fault {
/* arguments to kvm_mmu_do_page_fault. */
const gpa_t addr;
- const u32 error_code;
+ const u64 error_code;
const bool prefetch;
/* Derived from error_code. */
@@ -283,7 +283,7 @@ enum {
};
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch, int *emulation_type)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2d7555381955..2e77883c92f6 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -261,7 +261,7 @@ TRACE_EVENT(
TP_STRUCT__entry(
__field(int, vcpu_id)
__field(gpa_t, cr2_or_gpa)
- __field(u32, error_code)
+ __field(u64, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 0662e0278e70..ee4b881c5b39 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -758,7 +758,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
struct guest_walker walker;
int r;
- pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
+ pgprintk("%s: addr %llx err %llx\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
/*
@@ -767,7 +767,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* The bit needs to be cleared before walking guest page tables.
*/
r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
- fault->error_code & ~PFERR_RSVD_MASK);
+ lower_32_bits(fault->error_code) & ~PFERR_RSVD_MASK);
/*
* The page is not mapped by the guest. Let the guest handle it.
--
2.25.1
From: Isaku Yamahata <[email protected]>
The last test in test_add_overlapping_private_memory_regions() doesn't use
overlapping regions, resulting in a test failure. When the region overlaps
with the existing ones, the error code is EEXIST instead of EINVAL. Pass
the overlapping region, and check that the errno is EEXIST.
Fixes: bdb645960cb5 ("KVM: selftests: Expand set_memory_region_test to validate guest_memfd()")
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v1 -> v2:
- no change
---
.../selftests/kvm/set_memory_region_test.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index f46841843300..ea7da324c4d6 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -432,6 +432,7 @@ static void test_add_overlapping_private_memory_regions(void)
{
struct kvm_vm *vm;
int memfd;
+ int r;
pr_info("Testing ADD of overlapping KVM_MEM_PRIVATE memory regions\n");
@@ -453,8 +454,19 @@ static void test_add_overlapping_private_memory_regions(void)
vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_PRIVATE,
MEM_REGION_GPA, 0, NULL, -1, 0);
- test_invalid_guest_memfd(vm, memfd, MEM_REGION_SIZE,
- "Overlapping guest_memfd() bindings should fail");
+ r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_PRIVATE,
+ MEM_REGION_GPA * 2 - MEM_REGION_SIZE,
+ MEM_REGION_SIZE * 2,
+ 0, memfd, 0);
+ TEST_ASSERT(r == -1 && errno == EEXIST, "%s",
+ "Overlapping guest_memfd() bindings should fail");
+
+ r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_PRIVATE,
+ MEM_REGION_GPA * 2 + MEM_REGION_SIZE,
+ MEM_REGION_SIZE * 2,
+ 0, memfd, 0);
+ TEST_ASSERT(r == -1 && errno == EEXIST, "%s",
+ "Overlapping guest_memfd() bindings should fail");
close(memfd);
kvm_vm_free(vm);
--
2.25.1
From: Isaku Yamahata <[email protected]>
Add flags to struct kvm_gfn_range to indicate who triggered the callback
and the new memory attributes.
TDX needs to know the reason for a callback by kvm_unmap_gfn_range(): the mmu
notifier, the set-memory-attributes ioctl, or the KVM gmem callback. With TDX,
zapping a private page from the encrypted page table and adding the page
back to the same private GPA results in zeroing the page, and the guest has
to accept the page again. On the change of memory attribute from private
to shared, zapping the GPA range irrespective of private-or-shared and
expecting the fault doesn't work for TDX. Instead, zap shared pages only
and keep the private pages. Concretely
- If it's from mmu notifier, zap shared pages.
- If it's from KVM gmem, zap private pages.
- If setting memory attributes to private, zap shared pages.
- If setting memory attributes to shared, zap private pages.
Signed-off-by: Isaku Yamahata <[email protected]>
---
Changes v1 -> v2:
- consolidate KVM_GFN_RANGE_FLAGS_GMEM_{PUNCH_HOLE, RELEASE} into
KVM_GFN_RANGE_FLAGS_GMEM.
- Update the commit message to describe TDX more. Drop SEV_SNP.
---
include/linux/kvm_host.h | 10 +++++++++-
virt/kvm/guest_mem.c | 9 ++++++---
virt/kvm/kvm_main.c | 4 +++-
3 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1a47cedae8a1..1fe0516fcddf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,12 +256,20 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
+
+#define KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR BIT(0)
+#define KVM_GFN_RANGE_FLAGS_GMEM BIT(1)
+
struct kvm_gfn_range {
struct kvm_memory_slot *slot;
gfn_t start;
gfn_t end;
- pte_t pte;
+ union {
+ pte_t pte;
+ u64 attrs;
+ };
bool may_block;
+ unsigned int flags;
};
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index cdf2d84683c8..387226136960 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -99,7 +99,8 @@ static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
}
static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
- pgoff_t start, pgoff_t end)
+ pgoff_t start, pgoff_t end,
+ unsigned int flags)
{
struct kvm_memory_slot *slot;
unsigned long index;
@@ -118,6 +119,7 @@ static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
.slot = slot,
.pte = __pte(0),
.may_block = true,
+ .flags = flags,
};
kvm_mmu_invalidate_range_add(kvm, gfn_range.start, gfn_range.end);
@@ -156,7 +158,8 @@ static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
*/
filemap_invalidate_lock(file->f_mapping);
- kvm_gmem_invalidate_begin(kvm, gmem, start, end);
+ kvm_gmem_invalidate_begin(kvm, gmem, start, end,
+ KVM_GFN_RANGE_FLAGS_GMEM);
truncate_inode_pages_range(file->f_mapping, offset, offset + len - 1);
@@ -263,7 +266,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
* Free the backing memory, and more importantly, zap all SPTEs that
* pointed at this file.
*/
- kvm_gmem_invalidate_begin(kvm, gmem, 0, -1ul);
+ kvm_gmem_invalidate_begin(kvm, gmem, 0, -1ul, KVM_GFN_RANGE_FLAGS_GMEM);
truncate_inode_pages_final(file->f_mapping);
kvm_gmem_invalidate_end(kvm, gmem, 0, -1ul);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 422d49634c56..9cdfa2fb675f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -613,6 +613,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
gfn_range.slot = slot;
+ gfn_range.flags = 0;
if (!locked) {
locked = true;
@@ -2391,8 +2392,9 @@ static void kvm_mem_attrs_changed(struct kvm *kvm, unsigned long attrs,
bool flush = false;
int i;
- gfn_range.pte = __pte(0);
+ gfn_range.attrs = attrs;
gfn_range.may_block = true;
+ gfn_range.flags = KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR;
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
--
2.25.1
On Thu, Jun 22, 2023, [email protected] wrote:
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 7f9ec1e5b136..0ec0b927a391 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -188,6 +188,13 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
> return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
> }
>
> +enum kvm_fault_type {
> + KVM_FAULT_MEM_ATTR,
> + KVM_FAULT_SHARED,
> + KVM_FAULT_SHARED_ALWAYS,
> + KVM_FAULT_PRIVATE,
This is silly. Just use AMD's error code bit, i.e. PFERR_GUEST_ENC_MASK as per
the SNP series.
Bit 34 (ENC): Set to 1 if the guest’s effective C-bit was 1, 0 otherwise.
Just because Intel's ucode is too crusty to support error codes larger than 16
bits doesn't mean KVM can't utilize the bits :-) KVM already translates to AMD's
error codes for other things, e.g.
error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
For TDX, handle_ept_violation() can do something like:
if (is_tdx(vcpu->kvm))
error_code |= (gpa & shared) ? 0 : PFERR_GUEST_ENC_MASK;
else if (kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(gpa)))
error_code |= PFERR_GUEST_ENC_MASK;
And that's not even taking into account that TDX might have a separate entry point,
i.e. the "is_tdx()" check can probably be avoided.
As for optimizing kvm_mem_is_private() to avoid unnecessary xarray lookups, that
can and should be done separately, e.g.
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
kvm_guest_has_private_mem(kvm) &&
kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
}
where x86's implementation of kvm_guest_has_private_mem() can be
#define kvm_guest_has_private_mem(kvm) (!!(kvm)->vm_type)
On Fri, Jun 23, 2023 at 01:04:03PM -0700, Sean Christopherson wrote:
> On Thu, Jun 22, 2023, [email protected] wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 7f9ec1e5b136..0ec0b927a391 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -188,6 +188,13 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
> > return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
> > }
> >
> > +enum kvm_fault_type {
> > + KVM_FAULT_MEM_ATTR,
> > + KVM_FAULT_SHARED,
> > + KVM_FAULT_SHARED_ALWAYS,
> > + KVM_FAULT_PRIVATE,
>
> This is silly. Just use AMD's error code bit, i.e. PFERR_GUEST_ENC_MASK as per
> the SNP series.
>
> Bit 34 (ENC): Set to 1 if the guest’s effective C-bit was 1, 0 otherwise.
>
> Just because Intel's ucode is too crusty to support error codes larger than 16
> bits doesn't mean KVM can't utilize the bits :-) KVM already translates to AMD's
> error codes for other things, e.g.
>
> error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
> PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
>
> For TDX, handle_ept_violation() can do something like:
>
> if (is_tdx(vcpu->kvm))
> error_code |= (gpa & shared) ? 0 : PFERR_GUEST_ENC_MASK;
> else if (kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(gpa)))
> error_code |= PFERR_GUEST_ENC_MASK;
Maybe this is what you're getting at below, but seems awkward to have this
being handling in TDX code since that would suggest that SVM module would
also need to duplicate that logic and synthesize this PFERR_GUEST_ENC_MASK
bit for non-SNP VMs (e.g. gmem self-tests).
So maybe SNP/TDX can rely on passing this via error_code, and then some
common code, like kvm_mem_is_private(), can handle this for non-TDX/SNP
guest types. But the current gmem series does this via a new .is_private
in kvm_page_fault:
.is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
This seems at odds with the idea of storing this 'fault->is_private'
logic into the error_code. Isaku and I were discussing[1] that we
should do one or the other:
a) store everything in error_code
b) use the .is_private field that seems to have been introduced to
track exactly this information
So I think this series is attempting to do b). If you're suggesting we
should do a), then what purpose is fault->is_private field meant to
serve? Are you planning to get rid of it? Otherwise it seems redundant.
But I think approach b) is useful for another reason:
>
> And that's not even taking into account that TDX might have a separate entry point,
> i.e. the "is_tdx()" check can probably be avoided.
>
> As for optimizing kvm_mem_is_private() to avoid unnecessary xarray lookups, that
> can and should be done separately, e.g.
>
> static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> {
> return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> kvm_guest_has_private_mem(kvm) &&
> kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> }
>
> where x86's implementation of kvm_guest_has_private_mem() can be
>
> #define kvm_guest_has_private_mem(kvm) (!!(kvm)->vm_type)
It's just about not checking xarray for non-protected case. The
optimization here is that neither TDX nor SNP care about the xarray as
far as determining whether the *fault* was a private or not. We only
care later, in part of the KVM MMU code that determines whether the
fault type is at odds with the xarray state and whether to generate a
KVM_EXIT_MEMORY_FAULT as a result.
In that code, both TDX/SNP, as well as gmem self-tests, will all end up
calling kvm_mem_is_private().
In the gmem self-test case, in current gmem base series, and I think with
what you've proposed here, we'd check the xarray via kvm_mem_is_privae(),
both in kvm_mmu_do_page_fault(), as well as later kvm_faultin_pfn() where
KVM_EXIT_MEMORY_FAULT case is handled. That seems off because:
1) Checking in kvm_mem_is_private() via kvm_mmu_do_page_fault() means
that we will grab the value prior to when the KVM MMU records the
mmu_invalidate_seq, which means there's a window between
kvm_mmu_do_page_fault() and kvm_faultin_pfn() where an updated
xarray won't be noticed, and the #NPF retry logic will not get
triggered.
2) For gmem self-test, kvm_mem_is_private() is the authority on
whether the fault is private or not. There's no need to distinguish
between what was set via KVM_SET_MEMORY_ATTRIBUTES, vs. what was
indicated via fault flags/GPA like TDX/SNP do.
So it makes sense, in the gmem case (and TDX/SNP), to defer the
kvm_mem_is_private() till later in kvm_faultin_pfn(). It avoid a
duplicate lookup, and a race. But .is_private only conveys
encrypted/unencrypted fault in TDX/SNP case, it doesn't have a way to
cleanly convey this case "just use whatever kvm_mem_is_private() reports
later, because it will always be what the VMM set, and it's too early
to check kvm_mem_is_private() right now".
So that's where this enum type came from. Although in the discussion I
linked to above I suggested just:
enum kvm_fault_type {
KVM_FAULT_VMM_DEFINED,
KVM_FAULT_SHARED,
KVM_FAULT_PRIVATE,
Which I think would work just as well, since VMM_DEFINED is basically what
MEM_ATTR case handles in this patch, and if you also add the
kvm_guest_has_private_mem() check into kvm_mem_is_private() as you suggested,
it would naturally cover the SHARED_ALWAYS case as well.
I'm fine with other approaches, but think there are a couple good reasons
(#1 and #2 above) for what Isaku is proposing, and any alternative
approach should account for them as well.
[1] https://lore.kernel.org/kvm/[email protected]/T/#mdb24fe4998aa3ff3e568e8a0cba4d12b159087c7
-Mike
On Sun, Jun 25, 2023, Michael Roth wrote:
> On Fri, Jun 23, 2023 at 01:04:03PM -0700, Sean Christopherson wrote:
> > On Thu, Jun 22, 2023, [email protected] wrote:
> > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > > index 7f9ec1e5b136..0ec0b927a391 100644
> > > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > > @@ -188,6 +188,13 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
> > > return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
> > > }
> > >
> > > +enum kvm_fault_type {
> > > + KVM_FAULT_MEM_ATTR,
> > > + KVM_FAULT_SHARED,
> > > + KVM_FAULT_SHARED_ALWAYS,
> > > + KVM_FAULT_PRIVATE,
> >
> > This is silly. Just use AMD's error code bit, i.e. PFERR_GUEST_ENC_MASK as per
> > the SNP series.
> >
> > Bit 34 (ENC): Set to 1 if the guest’s effective C-bit was 1, 0 otherwise.
> >
> > Just because Intel's ucode is too crusty to support error codes larger than 16
> > bits doesn't mean KVM can't utilize the bits :-) KVM already translates to AMD's
> > error codes for other things, e.g.
> >
> > error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
> > PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
> >
> > For TDX, handle_ept_violation() can do something like:
> >
> > if (is_tdx(vcpu->kvm))
> > error_code |= (gpa & shared) ? 0 : PFERR_GUEST_ENC_MASK;
> > else if (kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(gpa)))
> > error_code |= PFERR_GUEST_ENC_MASK;
>
> Maybe this is what you're getting at below, but seems awkward to have this
> being handling in TDX code since that would suggest that SVM module would
> also need to duplicate that logic and synthesize this PFERR_GUEST_ENC_MASK
> bit for non-SNP VMs (e.g. gmem self-tests).
Ah, right, forgot about that angle. The non-TDX synthesizing can be done in
kvm_mmu_page_fault(), e.g.
if (vcpu->kvm->vm_type == KVM_X86_PROTECTED_VM &&
kvm_mem_is_private(...))
error_code |= PFERR_GUEST_ENC_MASK;
> So maybe SNP/TDX can rely on passing this via error_code, and then some
> common code, like kvm_mem_is_private(), can handle this for non-TDX/SNP
> guest types. But the current gmem series does this via a new .is_private
> in kvm_page_fault:
>
> .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
>
> This seems at odds with the idea of storing this 'fault->is_private'
> logic into the error_code. Isaku and I were discussing[1] that we
> should do one or the other:
>
> a) store everything in error_code
> b) use the .is_private field that seems to have been introduced to
> track exactly this information
>
> So I think this series is attempting to do b). If you're suggesting we
> should do a), then what purpose is fault->is_private field meant to
> serve? Are you planning to get rid of it? Otherwise it seems redundant.
>
> But I think approach b) is useful for another reason:
"is_private" would serve the same purpose as all the other bits that are derived
from the error code, e.g. improve readability, reduce line lengths, etc. Though
looking at the name, just "private" is probably the right name.
/* Derived from error_code. */
const bool exec;
const bool write;
const bool present;
const bool rsvd;
const bool user;
> > And that's not even taking into account that TDX might have a separate entry point,
> > i.e. the "is_tdx()" check can probably be avoided.
> >
> > As for optimizing kvm_mem_is_private() to avoid unnecessary xarray lookups, that
> > can and should be done separately, e.g.
> >
> > static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > {
> > return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> > kvm_guest_has_private_mem(kvm) &&
> > kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > }
> >
> > where x86's implementation of kvm_guest_has_private_mem() can be
> >
> > #define kvm_guest_has_private_mem(kvm) (!!(kvm)->vm_type)
>
> It's just about not checking xarray for non-protected case. The
> optimization here is that neither TDX nor SNP care about the xarray as
> far as determining whether the *fault* was a private or not.
Yes, and what I am suggesting doesn't use kvm_mem_is_private() to synthesize that
flag for TDX (or SNP).
> We only care later, in part of the KVM MMU code that determines whether the
> fault type is at odds with the xarray state and whether to generate a
> KVM_EXIT_MEMORY_FAULT as a result.
>
> In that code, both TDX/SNP, as well as gmem self-tests, will all end up
> calling kvm_mem_is_private().
>
> In the gmem self-test case, in current gmem base series, and I think with
> what you've proposed here, we'd check the xarray via kvm_mem_is_private(),
> both in kvm_mmu_do_page_fault(), as well as later kvm_faultin_pfn() where
> KVM_EXIT_MEMORY_FAULT case is handled. That seems off because:
>
> 1) Checking in kvm_mem_is_private() via kvm_mmu_do_page_fault() means
> that we will grab the value prior to when the KVM MMU records the
> mmu_invalidate_seq, which means there's a window between
> kvm_mmu_do_page_fault() and kvm_faultin_pfn() where an updated
> xarray won't be noticed, and the #NPF retry logic will not get
> triggered.
That's ok-ish, for some definitions of ok. There's no fatal bug, but userspace
will see a spurious, arguably nonsensical exit. If the race occurs before mmu_seq
is snapshot, this code will detect the change and exit to userspace.
if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
return kvm_do_memory_fault_exit(vcpu, fault);
> 2) For gmem self-test, kvm_mem_is_private() is the authority on
> whether the fault is private or not. There's no need to distinguish
> between what was set via KVM_SET_MEMORY_ATTRIBUTES, vs. what was
> indicated via fault flags/GPA like TDX/SNP do.
>
> So it makes sense, in the gmem case (and TDX/SNP), to defer the
> kvm_mem_is_private() till later in kvm_faultin_pfn(). It avoids a
> duplicate lookup, and a race. But .is_private only conveys
> encrypted/unencrypted fault in TDX/SNP case, it doesn't have a way to
> cleanly convey this case "just use whatever kvm_mem_is_private() reports
> later, because it will always be what the VMM set, and it's too early
> to check kvm_mem_is_private() right now".
Yeah, the duplicate lookup is unfortunate :-/ But I'd really like to be able to
make kvm_page_fault.private const, just like all the other booleans that are
derived from the error code. My concern with making it *not* const is that
there will be a discrepancy between "private" and "error_code", and we'll have
to be very careful to never touch "private" before kvm_faultin_pfn().
And I don't want to special case "VMM defined", because the primary reason the
"VMM defined" case exists at this time is to allow testing KVM's implementation
without TDX or SNP. E.g. I don't want to end up with code in fast_page_fault()
or so that does X for KVM_FAULT_VMM_DEFINED, but Y for KVM_FAULT_PRIVATE.
So I'm leaning toward the above being
if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
if (vcpu->kvm->vm_type == KVM_X86_PROTECTED_VM)
return RET_PF_RETRY;
else
return kvm_do_memory_fault_exit(vcpu, fault);
}
even though software-protected VMs would perform a technically-unnecessary
attributes lookup. *If* software-protected VMs ever get to the point where folks
care about the performance overhead of the extra lookup, then I'm all for
revisiting the implementation, but that is a ginormous "if" at this point. Though
even then I'd still prefer to figure out a way to make the flag const, but that's
a future problem.
> So that's where this enum type came from. Although in the discussion I
> linked to above I suggested just:
>
> enum kvm_fault_type {
> KVM_FAULT_VMM_DEFINED,
> KVM_FAULT_SHARED,
> KVM_FAULT_PRIVATE,
>
> Which I think would work just as well,
I want to use vm_type for tracking "VMM_DEFINED". KVM's ABI easily allows for 64
VM types, I see no reason to reuse KVM_X86_PROTECTED_VM for TDX and/or SNP, though
the type probably should be KVM_X86_SW_PROTECTED_VM. With that out of the way,
there's no need for an enum to track shared vs. private.
On Mon, Jun 26, 2023 at 11:21:45AM -0700, Sean Christopherson wrote:
> On Sun, Jun 25, 2023, Michael Roth wrote:
> > On Fri, Jun 23, 2023 at 01:04:03PM -0700, Sean Christopherson wrote:
> > > On Thu, Jun 22, 2023, [email protected] wrote:
> > > > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > > > index 7f9ec1e5b136..0ec0b927a391 100644
> > > > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > > > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > > > @@ -188,6 +188,13 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
> > > > return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
> > > > }
> > > >
> > > > +enum kvm_fault_type {
> > > > + KVM_FAULT_MEM_ATTR,
> > > > + KVM_FAULT_SHARED,
> > > > + KVM_FAULT_SHARED_ALWAYS,
> > > > + KVM_FAULT_PRIVATE,
> > >
> > > This is silly. Just use AMD's error code bit, i.e. PFERR_GUEST_ENC_MASK as per
> > > the SNP series.
> > >
> > > Bit 34 (ENC): Set to 1 if the guest’s effective C-bit was 1, 0 otherwise.
> > >
> > > Just because Intel's ucode is too crusty to support error codes larger than 16
> > > bits doesn't mean KVM can't utilize the bits :-) KVM already translates to AMD's
> > > error codes for other things, e.g.
> > >
> > > error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
> > > PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
> > >
> > > For TDX, handle_ept_violation() can do something like:
> > >
> > > if (is_tdx(vcpu->kvm))
> > > error_code |= (gpa & shared) ? 0 : PFERR_GUEST_ENC_MASK;
> > > else if (kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(gpa)))
> > > error_code |= PFERR_GUEST_ENC_MASK;
> >
> > Maybe this is what you're getting at below, but seems awkward to have this
> > being handling in TDX code since that would suggest that SVM module would
> > also need to duplicate that logic and synthesize this PFERR_GUEST_ENC_MASK
> > bit for non-SNP VMs (e.g. gmem self-tests).
>
> Ah, right, forgot about that angle. The non-TDX synthesizing can be done in
> kvm_mmu_page_fault(), e.g.
>
> if (vcpu->kvm->vm_type == KVM_X86_PROTECTED_VM &&
> kvm_mem_is_private(...))
> error_code |= PFERR_GUEST_ENC_MASK;
>
> > So maybe SNP/TDX can rely on passing this via error_code, and then some
> > common code, like kvm_mem_is_private(), can handle this for non-TDX/SNP
> > guest types. But the current gmem series does this via a new .is_private
> > in kvm_page_fault:
> >
> > .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
> >
> > This seems at odds with the idea of storing this 'fault->is_private'
> > logic into the error_code. Isaku and I were discussing[1] that we
> > should do one or the other:
> >
> > a) store everything in error_code
> > b) use the .is_private field that seems to have been introduced to
> > track exactly this information
> >
> > So I think this series is attempting to do b). If you're suggesting we
> > should do a), then what purpose is fault->is_private field meant to
> > serve? Are you planning to get rid of it? Otherwise it seems redundant.
> >
> > But I think approach b) is useful for another reason:
>
> "is_private" would serve the same purpose as all the other bits that are derived
> from the error code, e.g. improve readability, reduce line lengths, etc. Though
> looking at the name, just "private" is probably the right name.
>
> /* Derived from error_code. */
> const bool exec;
> const bool write;
> const bool present;
> const bool rsvd;
> const bool user;
If we go out of our way to pull bits out of error_code so they can be
accessed more consistently/sensibly, why introduce additional bits into
error_code in the first place? It just seems like an unnecessary
intermediate step, and introduces error_code bits that TDX/selftests
don't actually ever need, which raises the specter of "what if hardware
starts using this synthetic bit for something else?"
Is it mainly to avoid introducing additional parameters to
kvm_mmu_page_fault() and instead piggy-backing off of error_code param?
Or does recording the values into error_code have a use outside of that?
>
> > > And that's not even taking into account that TDX might have a separate entry point,
> > > i.e. the "is_tdx()" check can probably be avoided.
> > >
> > > As for optimizing kvm_mem_is_private() to avoid unnecessary xarray lookups, that
> > > can and should be done separately, e.g.
> > >
> > > static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > > {
> > > return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> > > kvm_guest_has_private_mem(kvm) &&
> > > kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > > }
> > >
> > > where x86's implementation of kvm_guest_has_private_mem() can be
> > >
> > > #define kvm_guest_has_private_mem(kvm) (!!(kvm)->vm_type)
> >
> > It's just about not checking xarray for non-protected case. The
> > optimization here is that neither TDX nor SNP care about the xarray as
> > far as determining whether the *fault* was a private or not.
>
> Yes, and what I am suggesting doesn't use kvm_mem_is_private() to synthesize that
> flag for TDX (or SNP).
>
> > We only care later, in part of the KVM MMU code that determines whether the
> > fault type is at odds with the xarray state and whether to generate a
> > KVM_EXIT_MEMORY_FAULT as a result.
> >
> > In that code, both TDX/SNP, as well as gmem self-tests, will all end up
> > calling kvm_mem_is_private().
> >
> > In the gmem self-test case, in current gmem base series, and I think with
> > what you've proposed here, we'd check the xarray via kvm_mem_is_private(),
> > both in kvm_mmu_do_page_fault(), as well as later kvm_faultin_pfn() where
> > KVM_EXIT_MEMORY_FAULT case is handled. That seems off because:
> >
> > 1) Checking in kvm_mem_is_private() via kvm_mmu_do_page_fault() means
> > that we will grab the value prior to when the KVM MMU records the
> > mmu_invalidate_seq, which means there's a window between
> > kvm_mmu_do_page_fault() and kvm_faultin_pfn() where an updated
> > xarray won't be noticed, and the #NPF retry logic will not get
> > triggered.
>
> That's ok-ish, for some definitions of ok. There's no fatal bug, but userspace
> will see a spurious, arguably nonsensical exit. If the race occurs before mmu_seq
> is snapshot, this code will detect the change and exit to userspace.
>
> if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
> return kvm_do_memory_fault_exit(vcpu, fault);
Yah, that's not so bad, but it does make it more tricky to write tighter
tests since you always have to allow/handle spurious KVM_EXIT_MEMORY_FAULT
exits instead of being able to individually flag those non-sensical
cases. Not a huge deal I guess.
>
> > 2) For gmem self-test, kvm_mem_is_private() is the authority on
> > whether the fault is private or not. There's no need to distinguish
> > between what was set via KVM_SET_MEMORY_ATTRIBUTES, vs. what was
> > indicated via fault flags/GPA like TDX/SNP do.
> >
> > So it makes sense, in the gmem case (and TDX/SNP), to defer the
> > kvm_mem_is_private() till later in kvm_faultin_pfn(). It avoids a
> > duplicate lookup, and a race. But .is_private only conveys
> > encrypted/unencrypted fault in TDX/SNP case, it doesn't have a way to
> > cleanly convey this case "just use whatever kvm_mem_is_private() reports
> > later, because it will always be what the VMM set, and it's too early
> > to check kvm_mem_is_private() right now".
>
> Yeah, the duplicate lookup is unfortunate :-/ But I'd really like to be able to
> make kvm_page_fault.private const, just like all the other booleans that are
> derived from the error code. My concern with making it *not* const is that
> there will be a discrepancy between "private" and "error_code", and we'll have
> to be very careful to never touch "private" before kvm_faultin_pfn().
It can actually be const using the KVM_FAULT_{VMM_DEFINED,SHARED,PRIVATE}
abstraction. fault->is_private doesn't really need to be overwritten for
VMM_DEFINED case later, it can just be treated as "use whatever
kvm_mem_is_private() reports". But I guess that's what you mean by
"special casing" below.
>
> And I don't want to special case "VMM defined", because the primary reason the
> "VMM defined" case exists at this time is to allow testing KVM's implementation
> without TDX or SNP. E.g. I don't want to end up with code in fast_page_fault()
Are you suggesting VMM_DEFINED would eventually go away once SNP/TDX
have bigger uptake, or maybe in favor of tests built around some new VM type
(e.g. software-protected VMs) that use guest hypercalls to set
guest-expected memory state rather than always relying on what the VMM sets
up?
I guess in that case VMM_DEFINED handling could just be dropped at
that point, and KVM_FAULT_{SHARED,PRIVATE} would still be relevant (or they
could get switched back to bool at that point), and software-protected VMs
would set that value based on whatever state tracks the hypercalls.
But if that's the end-game maybe it's a good reason for keeping
fault->is_private bool and avoiding enums. But still don't really see the
worth in also setting the bit in error_code.
> without TDX or SNP. E.g. I don't want to end up with code in fast_page_fault()
> or so that does X for KVM_FAULT_VMM_DEFINED, but Y for KVM_FAULT_PRIVATE.
Hadn't really considered fast_page_fault() for SNP... it seems like
for explicit page-state changes, the vCPU issuing the conversions
would just block until the GHCB request it issued was completed. So by
the time it accesses the GPA, KVM_SET_MEMORY_ATTRIBUTES would have
already zapped the old entry, so the fast path would get bypassed at
that point.
For implicit page-state changes, I guess there's a risk you can
get stuck looping on fast_page_fault() since it's up to KVM MMU
to generate the KVM_EXIT_MEMORY_FAULT so KVM_SET_MEMORY_ATTRIBUTES
gets called eventually. Sort of surprised I haven't encountered
that case though... but anyway...
If you rely on similar checks to what slow path does:
if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
return kvm_do_memory_fault_exit(vcpu, fault);
kvm_mem_is_private() could be stale due to unnoticed invalidation,
but eventually it would reach steady-state and the
KVM_EXIT_MEMORY_FAULT would fire. Is that sort of what you have in
mind there?
For SNP it seems more efficient to check for RMP bit and then head
straight to the slow-path, but this isn't a hot path so probably
doesn't matter much.
>
> So I'm leaning toward the above be
>
> if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
> if (vcpu->kvm->vm_type == KVM_X86_PROTECTED_VM)
> return RET_PF_RETRY;
> else
> return kvm_do_memory_fault_exit(vcpu, fault);
> }
>
> even though software-protected VMs would perform a technically-unnecessary
> attributes lookup. *If* software-protected VMs ever get to the point where folks
> care about the performance overhead of the extra lookup, then I'm all for
> revisiting the implementation, but that is a ginormous "if" at this point. Though
> even then I'd still prefer to figure out a way to make the flag const, but that's
> a future problem.
>
> > So that's where this enum type came from. Although in the discussion I
> > linked to above I suggested just:
> >
> > enum kvm_fault_type {
> > KVM_FAULT_VMM_DEFINED,
> > KVM_FAULT_SHARED,
> > KVM_FAULT_PRIVATE,
> >
> > Which I think would work just as well,
>
> I want to use vm_type for tracking "VMM_DEFINED". KVM's ABI easily allows for 64
> VM types, I see no reason to reuse KVM_X86_PROTECTED_VM for TDX and/or SNP, though
> the type probably should be KVM_X86_SW_PROTECTED_VM. With that out of the way,
> there's no need for an enum to track shared vs. private.
>
Introducing TDX/SNP vm types seems to buy us some flexibility so it
sounds useful.
Rather than synthesizing bits in error_code, maybe we could also use it
to help out there as well? (assuming parameter-passing was the main
use-case there)
static bool kvm_fault_is_private(kvm, gpa, error_code)
{
if (kvm->vm_type == KVM_X86_TDX_VM)
return gpa & TDX_SHARED_GPA_MASK;
if (kvm->vm_type == KVM_X86_SNP_VM)
return error_code & PFERR_GUEST_ENC_MASK;
return kvm_mem_is_private(kvm, gpa);
}
static inline int kvm_mmu_do_page_fault(vcpu, gpa, err, ...)
{
struct kvm_page_fault = {
...
.is_private = kvm_fault_is_private(kvm, gpa, err)
};
...
}
-Mike
On Thu, Jun 22, 2023 at 04:16:29PM -0700, [email protected] wrote:
> From: Isaku Yamahata <[email protected]>
>
> Add flags to struct kvm_gfn_range to indicate who triggered the callback
> and new memory attributes.
>
> TDX needs to know the reason for a callback by kvm_unmap_gfn_range(). mmu
> notifier, set memory attributes ioctl or KVM gmem callback. With TDX,
> zapping a private page from the encrypted page table and adding the page
> back to the same private GPA results in zeroing the page, and the guest has
> to accept the page again. On the change of memory attribute from private
Is this part meant to explain why, on MMU notifier, only shared pages should
be zapped?
> to shared, zapping the GPA range irrespective to private-or-shared and
> expecting the fault doesn't work for TDX. Instead, zap shared pages only
> and keep the private pages. Concretely
Do you mean:
On the change of memory attribute, zapping the GPA range irrespective to
private-or-shared and expecting that the EPT mapping for attribute converts
to doesn't exist at the time of changing the attribute, zap the "from"
attribute range only and ignore the "to" attribute.
> - If it's from mmu notifier, zap shared pages.
> - If it's from KVM gmem, zap private pages.
> - If setting memory attributes to private, zap shared pages.
> - If setting memory attributes to shared, zap private pages.
>
> Signed-off-by: Isaku Yamahata <[email protected]>
>
> ---
> Changes v1 -> v2:
> - consolidate KVM_GFN_RANGE_FLAGS_GMEM_{PUNCH_HOLE, RELEASE} into
> KVM_GFN_RANGE_FLAGS_GMEM.
> - Update the commit message to describe TDX more. Drop SEV_SNP.
> ---
> include/linux/kvm_host.h | 10 +++++++++-
> virt/kvm/guest_mem.c | 9 ++++++---
> virt/kvm/kvm_main.c | 4 +++-
> 3 files changed, 18 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1a47cedae8a1..1fe0516fcddf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -256,12 +256,20 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> #endif
>
> #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
> +
> +#define KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR BIT(0)
> +#define KVM_GFN_RANGE_FLAGS_GMEM BIT(1)
> +
> struct kvm_gfn_range {
> struct kvm_memory_slot *slot;
> gfn_t start;
> gfn_t end;
> - pte_t pte;
> + union {
> + pte_t pte;
> + u64 attrs;
> + };
> bool may_block;
> + unsigned int flags;
> };
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> index cdf2d84683c8..387226136960 100644
> --- a/virt/kvm/guest_mem.c
> +++ b/virt/kvm/guest_mem.c
> @@ -99,7 +99,8 @@ static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> }
>
> static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
> - pgoff_t start, pgoff_t end)
> + pgoff_t start, pgoff_t end,
> + unsigned int flags)
> {
> struct kvm_memory_slot *slot;
> unsigned long index;
> @@ -118,6 +119,7 @@ static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
> .slot = slot,
> .pte = __pte(0),
> .may_block = true,
> + .flags = flags,
> };
>
> kvm_mmu_invalidate_range_add(kvm, gfn_range.start, gfn_range.end);
> @@ -156,7 +158,8 @@ static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
> */
> filemap_invalidate_lock(file->f_mapping);
>
> - kvm_gmem_invalidate_begin(kvm, gmem, start, end);
> + kvm_gmem_invalidate_begin(kvm, gmem, start, end,
> + KVM_GFN_RANGE_FLAGS_GMEM);
>
> truncate_inode_pages_range(file->f_mapping, offset, offset + len - 1);
>
> @@ -263,7 +266,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
> * Free the backing memory, and more importantly, zap all SPTEs that
> * pointed at this file.
> */
> - kvm_gmem_invalidate_begin(kvm, gmem, 0, -1ul);
> + kvm_gmem_invalidate_begin(kvm, gmem, 0, -1ul, KVM_GFN_RANGE_FLAGS_GMEM);
> truncate_inode_pages_final(file->f_mapping);
> kvm_gmem_invalidate_end(kvm, gmem, 0, -1ul);
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 422d49634c56..9cdfa2fb675f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -613,6 +613,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
> gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
> gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
> gfn_range.slot = slot;
> + gfn_range.flags = 0;
>
> if (!locked) {
> locked = true;
> @@ -2391,8 +2392,9 @@ static void kvm_mem_attrs_changed(struct kvm *kvm, unsigned long attrs,
> bool flush = false;
> int i;
>
> - gfn_range.pte = __pte(0);
> + gfn_range.attrs = attrs;
> gfn_range.may_block = true;
> + gfn_range.flags = KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR;
>
> for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
> slots = __kvm_memslots(kvm, i);
> --
> 2.25.1
>
On Thu, Jun 22, 2023 at 04:16:29PM -0700, [email protected] wrote:
> From: Isaku Yamahata <[email protected]>
>
> Add flags to struct kvm_gfn_range to indicate who triggered the callback
> and new memory attributes.
>
> TDX needs to know the reason for a callback by kvm_unmap_gfn_range(). mmu
> notifier, set memory attributes ioctl or KVM gmem callback. With TDX,
> zapping a private page from the encrypted page table and adding the page
> back to the same private GPA results in zeroing the page, and the guest has
> to accept the page again. On the change of memory attribute from private
> to shared, zapping the GPA range irrespective to private-or-shared and
> expecting the fault doesn't work for TDX. Instead, zap shared pages only
> and keep the private pages. Concretely
> - If it's from mmu notifier, zap shared pages.
> - If it's from KVM gmem, zap private pages.
> - If setting memory attributes to private, zap shared pages.
> - If setting memory attributes to shared, zap private pages.
>
> Signed-off-by: Isaku Yamahata <[email protected]>
>
> ---
> Changes v1 -> v2:
> - consolidate KVM_GFN_RANGE_FLAGS_GMEM_{PUNCH_HOLE, RELEASE} into
> KVM_GFN_RANGE_FLAGS_GMEM.
> - Update the commit message to describe TDX more. Drop SEV_SNP.
> ---
> include/linux/kvm_host.h | 10 +++++++++-
> virt/kvm/guest_mem.c | 9 ++++++---
> virt/kvm/kvm_main.c | 4 +++-
> 3 files changed, 18 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 1a47cedae8a1..1fe0516fcddf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -256,12 +256,20 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> #endif
>
> #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
> +
> +#define KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR BIT(0)
> +#define KVM_GFN_RANGE_FLAGS_GMEM BIT(1)
> +
> struct kvm_gfn_range {
> struct kvm_memory_slot *slot;
> gfn_t start;
> gfn_t end;
> - pte_t pte;
> + union {
> + pte_t pte;
> + u64 attrs;
> + };
> bool may_block;
> + unsigned int flags;
> };
> bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> index cdf2d84683c8..387226136960 100644
> --- a/virt/kvm/guest_mem.c
> +++ b/virt/kvm/guest_mem.c
> @@ -99,7 +99,8 @@ static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> }
>
> static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
> - pgoff_t start, pgoff_t end)
> + pgoff_t start, pgoff_t end,
> + unsigned int flags)
> {
> struct kvm_memory_slot *slot;
> unsigned long index;
> @@ -118,6 +119,7 @@ static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
> .slot = slot,
> .pte = __pte(0),
> .may_block = true,
> + .flags = flags,
> };
>
> kvm_mmu_invalidate_range_add(kvm, gfn_range.start, gfn_range.end);
> @@ -156,7 +158,8 @@ static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
> */
> filemap_invalidate_lock(file->f_mapping);
>
> - kvm_gmem_invalidate_begin(kvm, gmem, start, end);
> + kvm_gmem_invalidate_begin(kvm, gmem, start, end,
> + KVM_GFN_RANGE_FLAGS_GMEM);
Do you anticipate ever needing to pass a different flag via
kvm_gmem_invalidate_begin()? If not it might make sense to just
hard-code it rather than passing as a parameter.
-Mike
On Tue, Jun 27, 2023, Michael Roth wrote:
> On Mon, Jun 26, 2023 at 11:21:45AM -0700, Sean Christopherson wrote:
> > "is_private" would serve the same purpose as all the other bits that are derived
> > from the error code, e.g. improve readability, reduce line lengths, etc. Though
> > looking at the name, just "private" is probably the right name.
> >
> > /* Derived from error_code. */
> > const bool exec;
> > const bool write;
> > const bool present;
> > const bool rsvd;
> > const bool user;
>
> If we go out of our way to pull bits out of error_code so they can be
> accessed more consistently/sensibly, why introduce additional bits into
> error_code in the first place? It just seems like an unnecessary
> intermediate step, and introduces error_code bits that TDX/selftests
> don't actually ever need, which raises the specter of "what if hardware
> starts using this synthetic bit for something else?"
It's not a KVM-defined bit though, it's an AMD-defined bit. That's a very big
difference as AMD and Intel both go out of their way to not step on each other's
toes.
And *architecturally*, VMX doesn't support #PF error codes larger than 16 bits,
not to mention that all of this needs to be dependent on TDP. And for EPT
violations, the error code is *fully* synthetic anyways because the error code
passed into kvm_mmu_page_fault is a translation of vmcs.EXIT_QUALIFICATION into
the "legacy" error code format.
So there's really no danger here, whereas a truly synthetic, KVM-defined bit like
PFERR_IMPLICIT_ACCESS *does* carry some risk because AMD in particular might use
bit 48 for new hardware-defined flags.
> Is it mainly to avoid introducing additional parameters to
> kvm_mmu_page_fault() and instead piggy-backing off of error_code param?
> Or does recording the values into error_code have a use outside of that?
It provides canonical behavior within common KVM code. Stashing individual flags
in const bools in kvm_page_fault is purely for ease-of-use, they are NOT the canonical
representation of state for the entirety of page fault handling. E.g. in the unlikely
scenario that kvm_mmu_page_fault() needs to care about shared vs. private, there's
no kvm_page_fault structure to query.
> > > So it makes sense, in the gmem case (and TDX/SNP), to defer the
> > > kvm_mem_is_private() till later in kvm_faultin_pfn(). It avoids a
> > > duplicate lookup, and a race. But .is_private only conveys
> > > encrypted/unencrypted fault in TDX/SNP case, it doesn't have a way to
> > > cleanly convey this case "just use whatever kvm_mem_is_private() reports
> > > later, because it will always be what the VMM set, and it's too early
> > > to check kvm_mem_is_private() right now".
> >
> > Yeah, the duplicate lookup is unfortunate :-/ But I'd really like to be able to
> > make kvm_page_fault.private const, just like all the other booleans that are
> > derived from the error code. My concern with making it *not* const is that
> > there will be a discrepancy between "private" and "error_code", and we'll have
> > to be very careful to never touch "private" before kvm_faultin_pfn().
>
> It can actually be const using the KVM_FAULT_{VMM_DEFINED,SHARED,PRIVATE}
> abstraction. fault->is_private doesn't really need to be overwritten for
> VMM_DEFINED case later, it can just be treated as "use whatever
> kvm_mem_is_private() reports". But I guess that's what you mean by
> "special casing" below.
That's not constifying the state (private vs. shared), that's constifying behavior.
VMM_DEFINED is like Schrodinger's cat; the fault is neither private nor shared
until KVM actually looks at it.
> > And I don't want to special case "VMM defined", because the primary reason the
> > "VMM defined" case exists at this time is to allow testing KVM's implementation
> > without TDX or SNP. E.g. I don't want to end up with code in fast_page_fault()
>
> Are you suggesting VMM_DEFINED would eventually go away once SNP/TDX
> have bigger uptake,
I'm saying I don't want VMM_DEFINED, period. :-)
> or maybe in favor of tests built around some new VM type (e.g.
> software-protected VMs) that use guest hypercalls to set guest-expected
> memory state rather than always relying on what the VMM sets up?
>
> I guess in that case VMM_DEFINED handling could just be dropped at
> that point, and KVM_FAULT_{SHARED,PRIVATE} would still be relevant (or they
> could get switched back to bool at that point), and software-protected VMs
> would set that value based on whatever state tracks the hypercalls.
>
> But if that's the end-game maybe it's a good reason for keeping
> fault->is_private bool and avoiding enums. But still don't really see the
> worth in also setting the bit in error_code.
>
> > without TDX or SNP. E.g. I don't want to end up with code in fast_page_fault()
> > or so that does X for KVM_FAULT_VMM_DEFINED, but Y for KVM_FAULT_PRIVATE.
>
> Hadn't really considered fast_page_fault() for SNP... it seems like
> for explicit page-state changes, the vCPU issuing the conversions
> would just block until the GHCB request it issued was completed. So by
> the time it accesses the GPA, KVM_SET_MEMORY_ATTRIBUTES would have
> already zapped the old entry, so the fast path would get bypassed at
> that point.
>
> For implicit page-state changes, I guess there's a risk you can
> get stuck looping on fast_page_fault() since it's up to KVM MMU
> to generate the KVM_EXIT_MEMORY_FAULT so KVM_SET_MEMORY_ATTRIBUTES
> gets called eventually. Sort of surprised I haven't encountered
> that case though... but anyway...
>
> If you rely on similar checks to what slow path does:
>
> if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
> return kvm_do_memory_fault_exit(vcpu, fault);
>
> kvm_mem_is_private() could be stale due to unnoticed invalidation,
> but eventually it would reach steady-state and the
> KVM_EXIT_MEMORY_FAULT would fire. Is that sort of what you have in
> mind there?
>
> For SNP it seems more efficient to check for RMP bit and then head
> straight to the slow-path, but this isn't a hot path so probably
> doesn't matter much.
I'm not concerned about any specific path, i.e. don't read too much into my
arbitrary fast_page_fault() example.
What I am trying to point out is that I don't want to end up in a world where
a page fault is known to be private or shared for TDX/SNP, but not for SW-protected
VMs. Because if that happens, then we'll end up in one of three situations:
1. KVM can't act on private versus shared until after __kvm_faultin_pfn()
2. KVM is buggy because it is consuming undefined state, i.e. querying if memory
is private versus shared before looking inside the box
3. KVM has different flows for TDX/SNP versus SW-protected VMs
> > So I'm leaning toward the above being
> >
> > if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
> > if (vcpu->kvm->vm_type == KVM_X86_PROTECTED_VM)
> > return RET_PF_RETRY;
> > else
> > return kvm_do_memory_fault_exit(vcpu, fault);
> > }
> >
> > even though software-protected VMs would perform a technically-unnecessary
> > attributes lookup. *If* software-protected VMs ever get to the point where folks
> > care about the performance overhead of the extra lookup, then I'm all for
> > revisiting the implementation, but that is a ginormous "if" at this point. Though
> > even then I'd still prefer to figure out a way to make the flag const, but that's
> > a future problem.
> >
> > > So that's where this enum type came from. Although in the discussion I
> > > linked to above I suggested just:
> > >
> > > enum kvm_fault_type {
> > > KVM_FAULT_VMM_DEFINED,
> > > KVM_FAULT_SHARED,
> > > KVM_FAULT_PRIVATE,
> > >
> > > Which I think would work just as well,
> >
> > I want to use vm_type for tracking "VMM_DEFINED". KVM's ABI easily allows for 64
> > VM types, I see no reason to reuse KVM_X86_PROTECTED_VM for TDX and/or SNP, though
> > the type probably should be KVM_X86_SW_PROTECTED_VM. With that out of the way,
> > there's no need for an enum to track shared vs. private.
> >
>
> Introducing TDX/SNP vm types seems to buy us some flexibility so it
> sounds useful.
>
> Rather than synthesizing bits in error_code, maybe we could also use it
> to help out there as well? (assuming parameter-passing was the main
> use-case there)
>
> static bool kvm_fault_is_private(kvm, gpa, error_code)
> {
> if (kvm->vm_type == KVM_X86_TDX_VM)
> return gpa & TDX_SHARED_GPA_MASK;
>
> if (kvm->vm_type == KVM_X86_SNP_VM)
> return error_code & PFERR_GUEST_ENC_MASK;
>
> return kvm_mem_is_private(kvm, gpa);
> }
That's also an option, but (a) it's a bit kludgy as it requires three pieces of data,
(b) it's suboptimal for KVM_X86_SW_PROTECTED_VM if there are multiple lookups, and
(c) it will result in inconsistent behavior if there are multiple lookups.
E.g. these two things wouldn't be functionally equivalent, which is just asking
for bugs.
if (!kvm_fault_is_private(...))
A();
B();
if (kvm_fault_is_private(...))
C();
versus
bool private = kvm_fault_is_private();
if (!private)
A();
B();
if (private)
C();
On Wed, Jun 28, 2023 at 02:41:56PM +0800,
Yuan Yao <[email protected]> wrote:
> On Thu, Jun 22, 2023 at 04:16:29PM -0700, [email protected] wrote:
> > From: Isaku Yamahata <[email protected]>
> >
> > Add flags to struct kvm_gfn_range to indicate who triggered the callback
> > and new memory attributes.
> >
> > TDX needs to know the reason for a callback by kvm_unmap_gfn_range(). mmu
> > notifier, set memory attributes ioctl or KVM gmem callback. With TDX,
> > zapping a private page from the encrypted page table and adding the page
> > back to the same private GPA results in zeroing the page, and the guest has
> > to accept the page again. On the change of memory attribute from private
>
> Is this part used to explains why on MMU notifier only shared pages should
> be zapped ?
Right for TDX.
> > to shared, zapping the GPA range irrespective of private-or-shared and
> > expecting the fault doesn't work for TDX. Instead, zap shared pages only
> > and keep the private pages. Concretely
>
> Do you mean:
>
> On the change of memory attribute, zapping the GPA range irrespective to
> private-or-shared and expecting that the EPT mapping for attribute converts
> to doesn't exist at the time of changing the attribute, zap the "from"
> attribute range only and ignore the "to" attribute.
That's what I meant. The requirement seems specific to TDX.
I'll update this patch following the suggestion by Sean. [1]
[1] https://lore.kernel.org/all/[email protected]/
--
Isaku Yamahata <[email protected]>
On Wed, Jun 28, 2023 at 10:21:32AM -0500,
Michael Roth <[email protected]> wrote:
> On Thu, Jun 22, 2023 at 04:16:29PM -0700, [email protected] wrote:
> > From: Isaku Yamahata <[email protected]>
> >
> > Add flags to struct kvm_gfn_range to indicate who triggered the callback
> > and new memory attributes.
> >
> > TDX needs to know the reason for a callback by kvm_unmap_gfn_range(). mmu
> > notifier, set memory attributes ioctl or KVM gmem callback. With TDX,
> > zapping a private page from the encrypted page table and adding the page
> > back to the same private GPA results in zeroing the page, and the guest has
> > to accept the page again. On the change of memory attribute from private
> > to shared, zapping the GPA range irrespective of private-or-shared and
> > expecting the fault doesn't work for TDX. Instead, zap shared pages only
> > and keep the private pages. Concretely
> > - If it's from mmu notifier, zap shared pages.
> > - If it's from KVM gmem, zap private pages.
> > - If setting memory attributes to private, zap shared pages.
> > - If setting memory attributes to shared, zap private pages.
> >
> > Signed-off-by: Isaku Yamahata <[email protected]>
> >
> > ---
> > Changes v1 -> v2:
> > - consolidate KVM_GFN_RANGE_FLAGS_GMEM_{PUNCH_HOLE, RELEASE} into
> > KVM_GFN_RANGE_FLAGS_GMEM.
> > - Update the commit message to describe TDX more. Drop SEV_SNP.
> > ---
> > include/linux/kvm_host.h | 10 +++++++++-
> > virt/kvm/guest_mem.c | 9 ++++++---
> > virt/kvm/kvm_main.c | 4 +++-
> > 3 files changed, 18 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 1a47cedae8a1..1fe0516fcddf 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -256,12 +256,20 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
> > #endif
> >
> > #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
> > +
> > +#define KVM_GFN_RANGE_FLAGS_SET_MEM_ATTR BIT(0)
> > +#define KVM_GFN_RANGE_FLAGS_GMEM BIT(1)
> > +
> > struct kvm_gfn_range {
> > struct kvm_memory_slot *slot;
> > gfn_t start;
> > gfn_t end;
> > - pte_t pte;
> > + union {
> > + pte_t pte;
> > + u64 attrs;
> > + };
> > bool may_block;
> > + unsigned int flags;
> > };
> > bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> > index cdf2d84683c8..387226136960 100644
> > --- a/virt/kvm/guest_mem.c
> > +++ b/virt/kvm/guest_mem.c
> > @@ -99,7 +99,8 @@ static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> > }
> >
> > static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
> > - pgoff_t start, pgoff_t end)
> > + pgoff_t start, pgoff_t end,
> > + unsigned int flags)
> > {
> > struct kvm_memory_slot *slot;
> > unsigned long index;
> > @@ -118,6 +119,7 @@ static void kvm_gmem_invalidate_begin(struct kvm *kvm, struct kvm_gmem *gmem,
> > .slot = slot,
> > .pte = __pte(0),
> > .may_block = true,
> > + .flags = flags,
> > };
> >
> > kvm_mmu_invalidate_range_add(kvm, gfn_range.start, gfn_range.end);
> > @@ -156,7 +158,8 @@ static long kvm_gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
> > */
> > filemap_invalidate_lock(file->f_mapping);
> >
> > - kvm_gmem_invalidate_begin(kvm, gmem, start, end);
> > + kvm_gmem_invalidate_begin(kvm, gmem, start, end,
> > + KVM_GFN_RANGE_FLAGS_GMEM);
>
> Do you anticipate ever needing to pass a different flag via
> kvm_gmem_invalidate_begin()? If not it might make sense to just
> hard-code it rather than passing as a parameter.
I'll update this patch following the suggestion by Sean. [1]
I'll drop the flag. the struct kvm_gfn_range will be
struct kvm_gfn_range {
struct kvm_memory_slot *slot;
gfn_t start;
gfn_t end;
union {
struct test_clear_young_metadata *metadata;
unsigned long attributes;
pte_t pte;
unsigned long callback_arg; /* needs a better name */
};
bool only_private;
bool only_shared;
bool may_block;
};
[1] https://lore.kernel.org/all/[email protected]/
--
Isaku Yamahata <[email protected]>