Date: 2021-08-20 16:06:37
From: Brijesh Singh
Subject: [PATCH Part2 v5 39/45] KVM: SVM: Introduce ops for the post gfn map and unmap

When SEV-SNP is enabled in the guest VM, the guest memory pages can be
either private or shared. A write from the hypervisor goes through RMP
checks; if the hardware sees that the hypervisor is attempting to write
to a guest private page, it triggers an RMP violation #PF.

To avoid the RMP violation, add post_{map,unmap}_gfn() ops that can be
used to verify that it is safe to map a given guest page. Use SRCU to
protect pages that are already mapped against a concurrent page state
change.
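
The expected interplay between a host access and a page state change is
roughly the following (illustrative sketch only; the concrete hooks are
the kvm_x86_ops additions in the diff below):

  /* Host access path (sev_post_map_gfn / sev_post_unmap_gfn): */
  token = srcu_read_lock(&sev->psc_srcu);
  if (snp_lookup_rmpentry(pfn, &level) == 1) {    /* page is private */
          srcu_read_unlock(&sev->psc_srcu, token);
          return -EBUSY;                          /* refuse the map */
  }
  /* ... host reads/writes the mapped page ... */
  srcu_read_unlock(&sev->psc_srcu, token);

  /* Page state change path: drain existing mappings first. */
  synchronize_srcu_expedited(&sev->psc_srcu);
  /* ... now safe to update the RMP entry ... */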

Signed-off-by: Brijesh Singh <[email protected]>
---
arch/x86/include/asm/kvm-x86-ops.h | 2 +
arch/x86/include/asm/kvm_host.h | 4 ++
arch/x86/kvm/svm/sev.c | 69 +++++++++++++++++++++-----
arch/x86/kvm/svm/svm.c | 4 ++
arch/x86/kvm/svm/svm.h | 8 +++
arch/x86/kvm/x86.c | 78 +++++++++++++++++++++++++++---
6 files changed, 146 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 371756c7f8f4..c09bd40e0160 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -124,6 +124,8 @@ KVM_X86_OP(msr_filter_changed)
KVM_X86_OP_NULL(complete_emulated_msr)
KVM_X86_OP(alloc_apic_backing_page)
KVM_X86_OP_NULL(rmp_page_level_adjust)
+KVM_X86_OP(post_map_gfn)
+KVM_X86_OP(post_unmap_gfn)

#undef KVM_X86_OP
#undef KVM_X86_OP_NULL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a6e764458f3e..5ac1ff097e8c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1463,7 +1463,11 @@ struct kvm_x86_ops {
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);

void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+
void (*rmp_page_level_adjust)(struct kvm *kvm, kvm_pfn_t pfn, int *level);
+
+ int (*post_map_gfn)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int *token);
+ void (*post_unmap_gfn)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int token);
};

struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0de85ed63e9b..65b578463271 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -336,6 +336,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (ret)
goto e_free;

+ init_srcu_struct(&sev->psc_srcu);
ret = sev_snp_init(&argp->error);
} else {
ret = sev_platform_init(&argp->error);
@@ -2293,6 +2294,7 @@ void sev_vm_destroy(struct kvm *kvm)
WARN_ONCE(1, "Failed to free SNP guest context, leaking asid!\n");
return;
}
+ cleanup_srcu_struct(&sev->psc_srcu);
} else {
sev_unbind_asid(kvm, sev->handle);
}
@@ -2494,23 +2496,32 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
kfree(svm->ghcb_sa);
}

-static inline int svm_map_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map)
+static inline int svm_map_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map, int *token)
{
struct vmcb_control_area *control = &svm->vmcb->control;
u64 gfn = gpa_to_gfn(control->ghcb_gpa);
+ struct kvm_vcpu *vcpu = &svm->vcpu;

- if (kvm_vcpu_map(&svm->vcpu, gfn, map)) {
+ if (kvm_vcpu_map(vcpu, gfn, map)) {
/* Unable to map GHCB from guest */
pr_err("error mapping GHCB GFN [%#llx] from guest\n", gfn);
return -EFAULT;
}

+ if (sev_post_map_gfn(vcpu->kvm, map->gfn, map->pfn, token)) {
+ kvm_vcpu_unmap(vcpu, map, false);
+ return -EBUSY;
+ }
+
return 0;
}

-static inline void svm_unmap_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map)
+static inline void svm_unmap_ghcb(struct vcpu_svm *svm, struct kvm_host_map *map, int token)
{
- kvm_vcpu_unmap(&svm->vcpu, map, true);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ kvm_vcpu_unmap(vcpu, map, true);
+ sev_post_unmap_gfn(vcpu->kvm, map->gfn, map->pfn, token);
}

static void dump_ghcb(struct vcpu_svm *svm)
@@ -2518,8 +2529,9 @@ static void dump_ghcb(struct vcpu_svm *svm)
struct kvm_host_map map;
unsigned int nbits;
struct ghcb *ghcb;
+ int token;

- if (svm_map_ghcb(svm, &map))
+ if (svm_map_ghcb(svm, &map, &token))
return;

ghcb = map.hva;
@@ -2544,7 +2556,7 @@ static void dump_ghcb(struct vcpu_svm *svm)
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap);

e_unmap:
- svm_unmap_ghcb(svm, &map);
+ svm_unmap_ghcb(svm, &map, token);
}

static bool sev_es_sync_to_ghcb(struct vcpu_svm *svm)
@@ -2552,8 +2564,9 @@ static bool sev_es_sync_to_ghcb(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_host_map map;
struct ghcb *ghcb;
+ int token;

- if (svm_map_ghcb(svm, &map))
+ if (svm_map_ghcb(svm, &map, &token))
return false;

ghcb = map.hva;
@@ -2579,7 +2592,7 @@ static bool sev_es_sync_to_ghcb(struct vcpu_svm *svm)

trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, ghcb);

- svm_unmap_ghcb(svm, &map);
+ svm_unmap_ghcb(svm, &map, token);

return true;
}
@@ -2636,8 +2649,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code)
struct kvm_vcpu *vcpu = &svm->vcpu;
struct kvm_host_map map;
struct ghcb *ghcb;
+ int token;

- if (svm_map_ghcb(svm, &map))
+ if (svm_map_ghcb(svm, &map, &token))
return -EFAULT;

ghcb = map.hva;
@@ -2739,7 +2753,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code)

sev_es_sync_from_ghcb(svm, ghcb);

- svm_unmap_ghcb(svm, &map);
+ svm_unmap_ghcb(svm, &map, token);
return 0;

vmgexit_err:
@@ -2760,7 +2774,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm, u64 *exit_code)
vcpu->run->internal.data[0] = *exit_code;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;

- svm_unmap_ghcb(svm, &map);
+ svm_unmap_ghcb(svm, &map, token);
return -EINVAL;
}

@@ -3036,6 +3050,9 @@ static int __snp_handle_page_state_change(struct kvm_vcpu *vcpu, enum psc_op op,
return PSC_UNDEF_ERR;
}

+ /* Wait for all existing mapped gfns to be unmapped */
+ synchronize_srcu_expedited(&sev->psc_srcu);
+
write_lock(&kvm->mmu_lock);

rc = kvm_mmu_get_tdp_walk(vcpu, gpa, &pfn, &npt_level);
@@ -3604,3 +3621,33 @@ void sev_rmp_page_level_adjust(struct kvm *kvm, kvm_pfn_t pfn, int *level)
/* Adjust the level to keep the NPT and RMP in sync */
*level = min_t(size_t, *level, rmp_level);
}
+
+int sev_post_map_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int *token)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int level;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ *token = srcu_read_lock(&sev->psc_srcu);
+
+ /* Fail if the pfn is mapped as a private page in the RMP table */
+ if (snp_lookup_rmpentry(pfn, &level) == 1) {
+ srcu_read_unlock(&sev->psc_srcu, *token);
+ pr_err_ratelimited("failed to map private gfn 0x%llx pfn 0x%llx\n", gfn, pfn);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+void sev_post_unmap_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int token)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ if (!sev_snp_guest(kvm))
+ return;
+
+ srcu_read_unlock(&sev->psc_srcu, token);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5f73f21a37a1..3784d389247b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4679,7 +4679,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,

.alloc_apic_backing_page = svm_alloc_apic_backing_page,
+
.rmp_page_level_adjust = sev_rmp_page_level_adjust,
+
+ .post_map_gfn = sev_post_map_gfn,
+ .post_unmap_gfn = sev_post_unmap_gfn,
};

static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index d10f7166b39d..ff91184f9b4a 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -76,16 +76,22 @@ struct kvm_sev_info {
bool active; /* SEV enabled guest */
bool es_active; /* SEV-ES enabled guest */
bool snp_active; /* SEV-SNP enabled guest */
+
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
+
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
+
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+
struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
+
u64 snp_init_flags;
void *snp_context; /* SNP guest context page */
+ struct srcu_struct psc_srcu;
};

struct kvm_svm {
@@ -618,6 +624,8 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
void sev_rmp_page_level_adjust(struct kvm *kvm, kvm_pfn_t pfn, int *level);
+int sev_post_map_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int *token);
+void sev_post_unmap_gfn(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int token);

/* vmenter.S */

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index afcdc75a99f2..bf4389ffc88f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3095,6 +3095,65 @@ static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
return (vcpu->arch.apf.msr_en_val & mask) == mask;
}

+static int kvm_map_gfn_protected(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+ struct gfn_to_pfn_cache *cache, bool atomic, int *token)
+{
+ int ret;
+
+ ret = kvm_map_gfn(vcpu, gfn, map, cache, atomic);
+ if (ret)
+ return ret;
+
+ if (kvm_x86_ops.post_map_gfn) {
+ ret = static_call(kvm_x86_post_map_gfn)(vcpu->kvm, map->gfn, map->pfn, token);
+ if (ret)
+ kvm_unmap_gfn(vcpu, map, cache, false, atomic);
+ }
+
+ return ret;
+}
+
+static int kvm_unmap_gfn_protected(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+ struct gfn_to_pfn_cache *cache, bool dirty,
+ bool atomic, int token)
+{
+ int ret;
+
+ ret = kvm_unmap_gfn(vcpu, map, cache, dirty, atomic);
+
+ if (kvm_x86_ops.post_unmap_gfn)
+ static_call(kvm_x86_post_unmap_gfn)(vcpu->kvm, map->gfn, map->pfn, token);
+
+ return ret;
+}
+
+static int kvm_vcpu_map_protected(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map,
+ int *token)
+{
+ int ret;
+
+ ret = kvm_vcpu_map(vcpu, gpa, map);
+ if (ret)
+ return ret;
+
+ if (kvm_x86_ops.post_map_gfn) {
+ ret = static_call(kvm_x86_post_map_gfn)(vcpu->kvm, map->gfn, map->pfn, token);
+ if (ret)
+ kvm_vcpu_unmap(vcpu, map, false);
+ }
+
+ return ret;
+}
+
+static void kvm_vcpu_unmap_protected(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+ bool dirty, int token)
+{
+ kvm_vcpu_unmap(vcpu, map, dirty);
+
+ if (kvm_x86_ops.post_unmap_gfn)
+ static_call(kvm_x86_post_unmap_gfn)(vcpu->kvm, map->gfn, map->pfn, token);
+}
+
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
@@ -3185,6 +3244,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
+ int token;

if (kvm_xen_msr_enabled(vcpu->kvm)) {
kvm_xen_runstate_set_running(vcpu);
@@ -3195,8 +3255,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
return;

/* -EAGAIN is returned in atomic context so we can just return. */
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
- &map, &vcpu->arch.st.cache, false))
+ if (kvm_map_gfn_protected(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false, &token))
return;

st = map.hva +
@@ -3234,7 +3294,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)

st->version += 1;

- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ kvm_unmap_gfn_protected(vcpu, &map, &vcpu->arch.st.cache, true, false, token);
}

int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4271,6 +4331,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
+ int token;

if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4278,8 +4339,8 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;

- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
- &vcpu->arch.st.cache, true))
+ if (kvm_map_gfn_protected(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, true, &token))
return;

st = map.hva +
@@ -4287,7 +4348,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)

st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;

- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ kvm_unmap_gfn_protected(vcpu, &map, &vcpu->arch.st.cache, true, true, token);
}

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -6816,6 +6877,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
gpa_t gpa;
char *kaddr;
bool exchanged;
+ int token;

/* guests cmpxchg8b have to be emulated atomically */
if (bytes > 8 || (bytes & (bytes - 1)))
@@ -6839,7 +6901,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;

- if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
+ if (kvm_vcpu_map_protected(vcpu, gpa_to_gfn(gpa), &map, &token))
goto emul_write;

kaddr = map.hva + offset_in_page(gpa);
@@ -6861,7 +6923,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
BUG();
}

- kvm_vcpu_unmap(vcpu, &map, true);
+ kvm_vcpu_unmap_protected(vcpu, &map, true, token);

if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
--
2.17.1


Date: 2021-10-13 22:12:41
From: Sean Christopherson
Subject: Re: [PATCH Part2 v5 39/45] KVM: SVM: Introduce ops for the post gfn map and unmap

On Wed, Oct 13, 2021, Brijesh Singh wrote:
>
> On 10/13/21 1:10 PM, Sean Christopherson wrote:
> > Side topic, what do you think about s/assigned/private for the "public" APIs, as
> > suggested/implied above? I actually like the terminology when talking specifically
> > about the RMP, but it doesn't fit the abstractions that tend to be used when talking
> > about these things in other contexts, e.g. in KVM.
>
> I can float the idea to see if the docs folks are okay with the changes, but
> generally speaking we have all been referring to assigned == private in the
> Linux SNP support patches.

Oh, I wasn't suggesting anything remotely close to an "official" change; it was
purely a suggestion for the host-side patches. It might even be unique to this
one helper.

Date: 2022-09-19 18:10:25
From: Michael Roth
Subject: Re: [PATCH Part2 v5 39/45] KVM: SVM: Introduce ops for the post gfn map and unmap

On Wed, Sep 14, 2022 at 08:05:49AM +0000, Sean Christopherson wrote:
> On Thu, Sep 08, 2022, Michael Roth wrote:
> > On Fri, Oct 15, 2021 at 05:16:28PM +0000, Sean Christopherson wrote:
> > So in the context of this interim solution, we're looking for a
> > solution that's simple enough that it can be used reliably, without
> > introducing too much additional complexity into KVM. There is one
> > approach that seems to fit that bill, which Brijesh attempted in an
> > earlier version of this series (I'm not sure exactly what the
> > catalyst for changing the approach was, as I wasn't really in the loop
> > at the time, but AIUI there weren't any showstoppers; please correct
> > me if I'm missing anything):
> >
> > - if the host is writing to a page that it thinks is supposed to be
> > shared, and the guest switches it to private, we get an RMP fault
> > (actually, we will get a !PRESENT fault, since as of v5 we now
> > remove the mapping from the directmap as part of conversion)
> > - in the host #PF handler, if we see that the page is marked private
> > in the RMP table, simply switch it back to shared
> > - if this was a bug on the part of the host, then the guest will see

Hi Sean,

Thanks for the input here and at KVM Forum.

>
> As discussed off-list, attempting to fix up RMP violations in the host #PF handler
> is not a viable approach. There was also extensive discussion on-list a while back:
>
> https://lore.kernel.org/all/[email protected]

I think that was likely the only hope for a non-UPM approach, as
anything else would require a good bit of infrastructure in KVM and
elsewhere to avoid that situation occurring to begin with, and it
probably would not be worth the effort outside the context of a
general/platform-independent solution like UPM. I was hoping it would be
possible to work through Andy's concerns, but the concerns you and Paolo
raised about potential surprise #PFs in other parts of the kernel make
me less optimistic, so I agree UPM is probably the right place to focus
efforts.
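
(For reference, the fixup idea quoted above was roughly along these lines;
this is purely a hypothetical sketch, not code from any posted series, and
rmp_make_shared() is an assumed helper name:)

  /* Called from the host #PF handler on an RMP/!PRESENT fault. */
  static bool try_fixup_rmp_fault(u64 pfn)
  {
          int level;

          /* Only attempt a fixup if the RMP says the page is private. */
          if (snp_lookup_rmpentry(pfn, &level) != 1)
                  return false;

          /*
           * The host thought the page was shared but the guest converted
           * it to private; flip it back to shared and retry the access.
           * (This is the part that turned out not to be viable.)
           */
          if (rmp_make_shared(pfn, PG_LEVEL_4K))
                  return false;

          return true;
  }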

>
> > AIUI, this is still sort of an open question, but you noted how nuking
> > the directmap without any formalized interface for letting the kernel
> > know about it could be problematic down the road, which also sounds
> > like the sort of thing better suited to being addressed by UPM at a
> > more general level, since there are similar requirements for TDX as
> > well.
> >
> > AIUI there are 2 main arguments against splitting the directmap:
> > a) we can't easily rebuild it atm
> > b) things like KSM might still try to access private pages
> >
> > But unmapping also suffers from a), since we still end up splitting the
> > directmap unless we remove pages in blocks of 2M.
>
> But for UPM, it's easy to rebuild the direct map since there will be an explicit,
> kernel-controlled point where the "inaccessible" memfd releases the private page.

I was thinking it would be possible to do something similar by doing the page
splitting/restore in bulk as part of MEM_ENCRYPT_{REG,UNREG}_REGION, but yes,
UPM also provides a convenient point in time to split/unsplit.
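
Something along those lines is what I had in mind (rough sketch only;
snp_reclaim_page() is an assumed helper name, and I'm assuming the existing
set_direct_map_default_noflush() primitive):

  /*
   * Restore the direct map when a private backing page is released,
   * e.g. at the UPM release point or in bulk during UNREG_REGION.
   */
  static void snp_release_private_page(struct page *page)
  {
          /* Transition the page back to hypervisor/shared state first. */
          if (snp_reclaim_page(page))
                  return;

          /*
           * Re-insert the page into the kernel direct map; its mapping
           * was invalidated when the page was handed to the guest.
           */
          if (set_direct_map_default_noflush(page))
                  pr_warn_ratelimited("failed to restore direct map for pfn 0x%lx\n",
                                      page_to_pfn(page));
  }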

>
> > But nothing prevents a guest from switching a single 4K page to private, in
> > which case we are forced to split. That would be normal behavior on the part
> > of the guest for setting up GHCB pages/etc, so we still end up splitting the
> > directmap over time.
>
> The host actually isn't _forced_ to split with UPM. One option would be to refuse
> to split the direct map and instead force userspace to eat the 2mb allocation even
> though it only wants to map a single 4kb chunk into the guest. I don't know that
> that's a _good_ option, but it is an option.

That does seem like a reasonable option. Maybe it also opens up a path
for hugetlbfs support of sorts. In practice I wouldn't expect too many
of those pages to be wasted; worst case would be 2MB per shared page in
the guest... I suppose that could add up for GHCB pages and whatnot if
there are lots of vCPUs, but at that point you're likely dealing with
large guests with enough memory to spare. It could be another pain
point when calculating appropriate memory limits for userspace, though.

Thanks!

-Mike