The current handling of the MVPG instruction when executed in a nested
guest is wrong, and can lead to the nested guest hanging.
This patchset fixes the behaviour to be more architecturally correct,
and fixes the hangs observed.
Claudio Imbrenda (4):
s390/kvm: VSIE: stop leaking host addresses
s390/kvm: extend guest_translate for MVPG interpretation
s390/kvm: add kvm_s390_vsie_mvpg_check needed for VSIE MVPG
s390/kvm: VSIE: correctly handle MVPG when in VSIE
arch/s390/kvm/gaccess.c | 88 ++++++++++++++++++++++++++++++++++++++---
arch/s390/kvm/gaccess.h | 3 ++
arch/s390/kvm/vsie.c | 78 +++++++++++++++++++++++++++++++++---
3 files changed, 159 insertions(+), 10 deletions(-)
--
2.26.2
Correctly handle the MVPG instruction when issued by a VSIE guest.
Fixes: a3508fbe9dc6d ("KVM: s390: vsie: initial support for nested virtualization")
Cc: [email protected]
Signed-off-by: Claudio Imbrenda <[email protected]>
---
arch/s390/kvm/vsie.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 73 insertions(+)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index ada49583e530..6c3069868acd 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -977,6 +977,75 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return 0;
}
+static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg)
+{
+ reg &= 0xf;
+ switch (reg) {
+ case 15:
+ return vsie_page->scb_s.gg15;
+ case 14:
+ return vsie_page->scb_s.gg14;
+ default:
+ return vcpu->run->s.regs.gprs[reg];
+ }
+}
+
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ unsigned long r1, r2, mask = PAGE_MASK;
+ int rc;
+
+ if (psw_bits(scb_s->gpsw).eaba == PSW_BITS_AMODE_24BIT)
+ mask = 0xfff000;
+ else if (psw_bits(scb_s->gpsw).eaba == PSW_BITS_AMODE_31BIT)
+ mask = 0x7ffff000;
+
+ r1 = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask;
+ r2 = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
+ rc = kvm_s390_vsie_mvpg_check(vcpu, r1, r2, &vsie_page->scb_o->mcic);
+
+ /*
+ * Guest translation was not successful. The host needs to forward
+ * the intercept to the guest and let the guest fix its page tables.
+ * The guest then needs to retry the instruction.
+ */
+ if (rc == -ENOENT)
+ return 1;
+
+ retry_vsie_icpt(vsie_page);
+
+ /*
+ * Guest translation was not successful. The page tables of the guest
+ * are broken. Try again and let the hardware deliver the fault.
+ */
+ if (rc == -EFAULT)
+ return 0;
+
+ /*
+ * Guest translation was successful. The host needs to fix up its
+ * page tables and retry the instruction in the nested guest.
+ * In case of failure, the instruction will intercept again, and
+ * a different path will be taken.
+ */
+ if (!rc) {
+ kvm_s390_shadow_fault(vcpu, vsie_page->gmap, r2);
+ kvm_s390_shadow_fault(vcpu, vsie_page->gmap, r1);
+ return 0;
+ }
+
+ /*
+ * An exception happened during guest translation; it needs to be
+ * delivered to the guest. This can happen if the host has EDAT1
+ * enabled and the guest has not, or for other causes. The guest
+ * needs to process the exception and return to the nested guest.
+ */
+ if (rc > 0)
+ return kvm_s390_inject_prog_cond(vcpu, rc);
+
+ return 1;
+}
+
/*
* Run the vsie on a shadow scb and a shadow gmap, without any further
* sanity checks, handling SIE faults.
@@ -1063,6 +1132,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if ((scb_s->ipa & 0xf000) != 0xf000)
scb_s->ipa += 0x1000;
break;
+ case ICPT_PARTEXEC:
+ if (scb_s->ipa == 0xb254)
+ rc = vsie_handle_mvpg(vcpu, vsie_page);
+ break;
}
return rc;
}
--
2.26.2
Add kvm_s390_vsie_mvpg_check to perform the necessary checks in case an
MVPG instruction intercepts in a VSIE guest.
Cc: [email protected]
Signed-off-by: Claudio Imbrenda <[email protected]>
---
arch/s390/kvm/gaccess.c | 55 +++++++++++++++++++++++++++++++++++++++++
arch/s390/kvm/gaccess.h | 3 +++
2 files changed, 58 insertions(+)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8e256a233583..90e9baff6eac 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1228,3 +1228,58 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
mmap_read_unlock(sg->mm);
return rc;
}
+
+static int kvm_s390_mvpg_check_one(struct kvm_vcpu *vcpu, unsigned long *addr,
+ const int edat, const union asce asce,
+ const enum gacc_mode mode, unsigned long *pteptr)
+{
+ enum prot_type prot;
+ int rc;
+
+ rc = guest_translate(vcpu, *addr, addr, asce, mode, &prot, pteptr);
+ if (rc <= 0)
+ return rc;
+
+ switch (rc) {
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_SEGMENT_TRANSLATION:
+ if (!edat)
+ return trans_exc(vcpu, rc, *addr, 0, mode, prot);
+ *pteptr |= 4;
+ fallthrough;
+ case PGM_PAGE_TRANSLATION:
+ return -ENOENT;
+ default:
+ return rc;
+ }
+}
+
+int kvm_s390_vsie_mvpg_check(struct kvm_vcpu *vcpu, unsigned long r1,
+ unsigned long r2, void *gpei)
+{
+ unsigned long pei[2] = {0};
+ union ctlreg0 cr0;
+ union asce cr1;
+ int edat, rc1, rc2;
+
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ cr1.val = vcpu->arch.sie_block->gcr[1];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+
+ rc1 = kvm_s390_mvpg_check_one(vcpu, &r1, edat, cr1, GACC_FETCH, pei);
+ rc2 = kvm_s390_mvpg_check_one(vcpu, &r2, edat, cr1, GACC_STORE, pei + 1);
+
+ if (rc1 == -ENOENT || rc2 == -ENOENT) {
+ memcpy(gpei, pei, sizeof(pei));
+ return -ENOENT;
+ }
+
+ if (rc2 < 0)
+ return rc2;
+ if (rc1 < 0)
+ return rc1;
+
+ return 0;
+}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index f4c51756c462..2c53cee3b29f 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -166,6 +166,9 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len, enum gacc_mode mode);
+int kvm_s390_vsie_mvpg_check(struct kvm_vcpu *vcpu, unsigned long r1,
+ unsigned long r2, void *gpei);
+
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
--
2.26.2
Extend guest_translate to optionally return the address of the guest
DAT table entry which caused the exception, and change the return value to int.
Also return the appropriate values in the low order bits of the address
indicating protection or EDAT.
Cc: [email protected]
Signed-off-by: Claudio Imbrenda <[email protected]>
---
arch/s390/kvm/gaccess.c | 33 ++++++++++++++++++++++++++++-----
1 file changed, 28 insertions(+), 5 deletions(-)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 6d6b57059493..8e256a233583 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -598,6 +598,10 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
* @asce: effective asce
* @mode: indicates the access mode to be used
* @prot: returns the type for protection exceptions
+ * @entryptr: returns the physical address of the last DAT table entry
+ * processed, additionally setting a few flags in the lower bits
+ * to indicate whether a translation exception or a protection
+ * exception was encountered during the address translation.
*
* Translate a guest virtual address into a guest absolute address by means
* of dynamic address translation as specified by the architecture.
@@ -611,9 +615,10 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
* the returned value is the program interruption code as defined
* by the architecture
*/
-static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
- unsigned long *gpa, const union asce asce,
- enum gacc_mode mode, enum prot_type *prot)
+static int guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
+ unsigned long *gpa, const union asce asce,
+ enum gacc_mode mode, enum prot_type *prot,
+ unsigned long *entryptr)
{
union vaddress vaddr = {.addr = gva};
union raddress raddr = {.addr = gva};
@@ -628,6 +633,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130);
+ if (entryptr)
+ *entryptr = 0;
if (asce.r)
goto real_address;
ptr = asce.origin * PAGE_SIZE;
@@ -667,6 +674,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rfte.val))
return -EFAULT;
+ if (entryptr)
+ *entryptr = ptr;
if (rfte.i)
return PGM_REGION_FIRST_TRANS;
if (rfte.tt != TABLE_TYPE_REGION1)
@@ -685,6 +694,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rste.val))
return -EFAULT;
+ if (entryptr)
+ *entryptr = ptr;
if (rste.i)
return PGM_REGION_SECOND_TRANS;
if (rste.tt != TABLE_TYPE_REGION2)
@@ -703,6 +714,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rtte.val))
return -EFAULT;
+ if (entryptr)
+ *entryptr = ptr;
if (rtte.i)
return PGM_REGION_THIRD_TRANS;
if (rtte.tt != TABLE_TYPE_REGION3)
@@ -713,6 +726,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= rtte.fc1.p;
iep_protection = rtte.fc1.iep;
raddr.rfaa = rtte.fc1.rfaa;
+ if (entryptr)
+ *entryptr |= dat_protection ? 6 : 4;
goto absolute_address;
}
if (vaddr.sx01 < rtte.fc0.tf)
@@ -731,6 +746,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &ste.val))
return -EFAULT;
+ if (entryptr)
+ *entryptr = ptr;
if (ste.i)
return PGM_SEGMENT_TRANSLATION;
if (ste.tt != TABLE_TYPE_SEGMENT)
@@ -741,6 +758,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
dat_protection |= ste.fc1.p;
iep_protection = ste.fc1.iep;
raddr.sfaa = ste.fc1.sfaa;
+ if (entryptr)
+ *entryptr |= dat_protection ? 6 : 4;
goto absolute_address;
}
dat_protection |= ste.fc0.p;
@@ -751,10 +770,14 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &pte.val))
return -EFAULT;
+ if (entryptr)
+ *entryptr = ptr;
if (pte.i)
return PGM_PAGE_TRANSLATION;
if (pte.z)
return PGM_TRANSLATION_SPEC;
+ if (entryptr && dat_protection)
+ *entryptr |= 2;
dat_protection |= pte.p;
iep_protection = pte.iep;
raddr.pfra = pte.pfra;
@@ -810,7 +833,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
PROT_TYPE_LA);
ga &= PAGE_MASK;
if (psw_bits(*psw).dat) {
- rc = guest_translate(vcpu, ga, pages, asce, mode, &prot);
+ rc = guest_translate(vcpu, ga, pages, asce, mode, &prot, NULL);
if (rc < 0)
return rc;
} else {
@@ -920,7 +943,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
}
if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */
- rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot);
+ rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot, NULL);
if (rc > 0)
return trans_exc(vcpu, rc, gva, 0, mode, prot);
} else {
--
2.26.2
The addresses in the SIE control block of the host should not be
forwarded to the guest. They are only meaningful to the host, and
moreover it would be a clear security issue.
Subsequent patches will actually put the right values in the guest SIE
control block.
Fixes: a3508fbe9dc6d ("KVM: s390: vsie: initial support for nested virtualization")
Cc: [email protected]
Signed-off-by: Claudio Imbrenda <[email protected]>
---
arch/s390/kvm/vsie.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 4f3cbf6003a9..ada49583e530 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -416,11 +416,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
memcpy((void *)((u64)scb_o + 0xc0),
(void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
break;
- case ICPT_PARTEXEC:
- /* MVPG only */
- memcpy((void *)((u64)scb_o + 0xc0),
- (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
- break;
}
if (scb_s->ihcpu != 0xffffU)
--
2.26.2
On 18.12.20 15:18, Claudio Imbrenda wrote:
> The current handling of the MVPG instruction when executed in a nested
> guest is wrong, and can lead to the nested guest hanging.
Hi,
thanks for spotting and debugging! Is this related to nested guests
hanging while migrating (mentioned by Janosch at some point)?
Or can this not be reproduced with actual Linux guests?
Thanks!
>
> This patchset fixes the behaviour to be more architecturally correct,
> and fixes the hangs observed.
>
> Claudio Imbrenda (4):
> s390/kvm: VSIE: stop leaking host addresses
> s390/kvm: extend guest_translate for MVPG interpretation
> s390/kvm: add kvm_s390_vsie_mvpg_check needed for VSIE MVPG
> s390/kvm: VSIE: correctly handle MVPG when in VSIE
>
> arch/s390/kvm/gaccess.c | 88 ++++++++++++++++++++++++++++++++++++++---
> arch/s390/kvm/gaccess.h | 3 ++
> arch/s390/kvm/vsie.c | 78 +++++++++++++++++++++++++++++++++---
> 3 files changed, 159 insertions(+), 10 deletions(-)
>
--
Thanks,
David / dhildenb
On 18.12.20 15:18, Claudio Imbrenda wrote:
> The addresses in the SIE control block of the host should not be
> forwarded to the guest. They are only meaningful to the host, and
> moreover it would be a clear security issue.
It's really almost impossible for someone without access to
documentation to understand what we leak. I assume we're leaking the g1
address of a page table (entry), used for translation of g2->g3 to g1.
Can you try making that clearer?
In that case, it's pretty much a random number (of a random page used as
a leaf page table) and does not let g1 identify locations of symbols
etc. If so, I don't think this is a "clear security issue" and suggest
squashing this into the actual fix (#p4 I assume).
@Christian, @Janosch? Am I missing something?
>
> Subsequent patches will actually put the right values in the guest SIE
> control block.
>
> Fixes: a3508fbe9dc6d ("KVM: s390: vsie: initial support for nested virtualization")
> Cc: [email protected]
> Signed-off-by: Claudio Imbrenda <[email protected]>
> ---
> arch/s390/kvm/vsie.c | 5 -----
> 1 file changed, 5 deletions(-)
>
> diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
> index 4f3cbf6003a9..ada49583e530 100644
> --- a/arch/s390/kvm/vsie.c
> +++ b/arch/s390/kvm/vsie.c
> @@ -416,11 +416,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
> memcpy((void *)((u64)scb_o + 0xc0),
> (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
> break;
> - case ICPT_PARTEXEC:
> - /* MVPG only */
> - memcpy((void *)((u64)scb_o + 0xc0),
> - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
> - break;
> }
>
> if (scb_s->ihcpu != 0xffffU)
>
--
Thanks,
David / dhildenb
On 18.12.20 15:18, Claudio Imbrenda wrote:
> Correctly handle the MVPG instruction when issued by a VSIE guest.
>
I remember that MVPG SIE documentation was completely crazy and full of
corner cases. :)
Looking at arch/s390/kvm/intercept.c:handle_mvpg_pei(), I can spot that
1. "This interception can only happen for guests with DAT disabled ..."
2. KVM does not make use of any mvpg state inside the SCB.
Can this be observed with Linux guests?
Can I get some information on what information is stored at [0xc0, 0xd0)
inside the SCB? I assume it's:
0xc0: guest physical address of source PTE
0xc8: guest physical address of target PTE
Also, which conditions have to be met such that we get an ICPT_PARTEXEC:
a) State of guest DAT (I assume off?)
b) State of PTEs: What happens if there is no PTE (I assume we need two
PTEs, otherwise no such intercept)? I assume we get an intercept if one
of the two PTEs is not present or the destination PTE is protected. Correct?
So, when we (g1) get an intercept for g3, can we be sure 0xc0 and 0xc8
in the scb are both valid g1 addresses pointing at our PTE, and what do
we know about these PTEs (one not present or destination protected)?
[...]
> /*
> * Run the vsie on a shadow scb and a shadow gmap, without any further
> * sanity checks, handling SIE faults.
> @@ -1063,6 +1132,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
> if ((scb_s->ipa & 0xf000) != 0xf000)
> scb_s->ipa += 0x1000;
> break;
> + case ICPT_PARTEXEC:
> + if (scb_s->ipa == 0xb254)
Old code had "/* MVPG only */" - why is this condition now necessary?
> + rc = vsie_handle_mvpg(vcpu, vsie_page);
> + break;
> }
> return rc;
> }
>
--
Thanks,
David / dhildenb
On Sun, 20 Dec 2020 10:44:56 +0100
David Hildenbrand <[email protected]> wrote:
> On 18.12.20 15:18, Claudio Imbrenda wrote:
> > The addresses in the SIE control block of the host should not be
> > forwarded to the guest. They are only meaningful to the host, and
> > moreover it would be a clear security issue.
>
> It's really almost impossible for someone without access to
> documentation to understand what we leak. I assume we're leaking the
> g1 address of a page table (entry), used for translation of g2->g3 to
> g1. Can you try making that clearer?
this is correct.
I guess I can improve the text of the commit
> In that case, it's pretty much a random number (of a random page used
> as a leaf page table) and does not let g1 identify locations of
> symbols etc. If so, I don't think this is a "clear security issue"
> and suggest squashing this into the actual fix (#p4 I assume).
yeah __maybe__ I overstated the importance ;)
But I would still like to keep it as a separate patch, looks more
straightforward to me
> @Christian, @Janosch? Am I missing something?
>
> >
> > Subsequent patches will actually put the right values in the guest
> > SIE control block.
> >
> > Fixes: a3508fbe9dc6d ("KVM: s390: vsie: initial support for nested virtualization")
> > Cc: [email protected]
> > Signed-off-by: Claudio Imbrenda <[email protected]>
> > ---
> > arch/s390/kvm/vsie.c | 5 -----
> > 1 file changed, 5 deletions(-)
> >
> > diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
> > index 4f3cbf6003a9..ada49583e530 100644
> > --- a/arch/s390/kvm/vsie.c
> > +++ b/arch/s390/kvm/vsie.c
> > @@ -416,11 +416,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
> > memcpy((void *)((u64)scb_o + 0xc0),
> > (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
> > break;
> > - case ICPT_PARTEXEC:
> > - /* MVPG only */
> > - memcpy((void *)((u64)scb_o + 0xc0),
> > - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
> > - break;
> > }
> >
> > if (scb_s->ihcpu != 0xffffU)
> >
>
>
On Sun, 20 Dec 2020 10:40:27 +0100
David Hildenbrand <[email protected]> wrote:
> On 18.12.20 15:18, Claudio Imbrenda wrote:
> > The current handling of the MVPG instruction when executed in a
> > nested guest is wrong, and can lead to the nested guest hanging.
>
> Hi,
>
> thanks for spotting and debugging! Is this related to nested guests
> hanging while migrating (mentioned by Janosch at some point)?
no, it was found by running legacy tests in VSIE (I have written
kvm-unit-tests for this now, I'll post them Soon™)
> Or can this not be reproduced with actual Linux guests?
Linux doesn't use MVPG, and gcc in general seems to avoid it, so we
never really see this in the wild. Moreover Linux does not normally run
with DAT disabled.
> Thanks!
>
> >
> > This patchset fixes the behaviour to be more architecturally
> > correct, and fixes the hangs observed.
> >
> > Claudio Imbrenda (4):
> > s390/kvm: VSIE: stop leaking host addresses
> > s390/kvm: extend guest_translate for MVPG interpretation
> > s390/kvm: add kvm_s390_vsie_mvpg_check needed for VSIE MVPG
> > s390/kvm: VSIE: correctly handle MVPG when in VSIE
> >
> > arch/s390/kvm/gaccess.c | 88 ++++++++++++++++++++++++++++++++++++++---
> > arch/s390/kvm/gaccess.h | 3 ++
> > arch/s390/kvm/vsie.c | 78 +++++++++++++++++++++++++++++++++---
> > 3 files changed, 159 insertions(+), 10 deletions(-)
>
>
On Sun, 20 Dec 2020 11:13:57 +0100
David Hildenbrand <[email protected]> wrote:
> On 18.12.20 15:18, Claudio Imbrenda wrote:
> > Correctly handle the MVPG instruction when issued by a VSIE guest.
> >
>
> I remember that MVPG SIE documentation was completely crazy and full
> of corner cases. :)
you remember correctly
> Looking at arch/s390/kvm/intercept.c:handle_mvpg_pei(), I can spot
> that
>
> 1. "This interception can only happen for guests with DAT disabled
> ..." 2. KVM does not make use of any mvpg state inside the SCB.
>
> Can this be observed with Linux guests?
a Linux guest will typically not run with DAT disabled
> Can I get some information on what information is stored at [0xc0,
> 0xd0) inside the SCB? I assume it's:
>
> 0xc0: guest physical address of source PTE
> 0xc8: guest physical address of target PTE
yes (plus 3 flags in the lower bits of each)
>
> Also, which conditions have to be met such that we get an
> ICPT_PARTEXEC:
>
> a) State of guest DAT (I assume off?)
> b) State of PTEs: What happens if there is no PTE (I assume we need
> two PTEs, otherwise no such intercept)? I assume we get an intercept
> if one of the two PTEs is not present or the destination PTE is
> protected. Correct?
we get the intercept if the guest has DAT off, and at least one of the
host PTEs is invalid (and I think if the destination is valid but
protected)
> So, when we (g1) get an intercept for g3, can we be sure 0xc0 and 0xc8
> in the scb are both valid g1 addresses pointing at our PTE, and what
> do we know about these PTEs (one not present or destination
> protected)?
we only know that at least one of the following holds true:
* source invalid
* destination invalid
* destination protected
there is a bit that tells us if the destination was protected (bit 62),
but that does not exclude an invalid source
> [...]
> > /*
> > * Run the vsie on a shadow scb and a shadow gmap, without any
> > further
> > * sanity checks, handling SIE faults.
> > @@ -1063,6 +1132,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
> > if ((scb_s->ipa & 0xf000) != 0xf000)
> > scb_s->ipa += 0x1000;
> > break;
> > + case ICPT_PARTEXEC:
> > + if (scb_s->ipa == 0xb254)
>
> Old code had "/* MVPG only */" - why is this condition now necessary?
old code was wrong ;)
> > + rc = vsie_handle_mvpg(vcpu, vsie_page);
> > + break;
> > }
> > return rc;
> > }
> >
>
>
>> In that case, it's pretty much a random number (of a random page used
>> as a leaf page table) and does not let g1 identify locations of
>> symbols etc. If so, I don't think this is a "clear security issue"
>> and suggest squashing this into the actual fix (#p4 I assume).
>
> yeah __maybe__ I overstated the importance ;)
>
> But I would still like to keep it as a separate patch, looks more
> straightforward to me
>
I don't see a need to split this up. Just fix it right away.
--
Thanks,
David / dhildenb
On 04.01.21 16:22, Claudio Imbrenda wrote:
> On Sun, 20 Dec 2020 11:13:57 +0100
> David Hildenbrand <[email protected]> wrote:
>
>> On 18.12.20 15:18, Claudio Imbrenda wrote:
>>> Correctly handle the MVPG instruction when issued by a VSIE guest.
>>>
>>
>> I remember that MVPG SIE documentation was completely crazy and full
>> of corner cases. :)
>
> you remember correctly
>
>> Looking at arch/s390/kvm/intercept.c:handle_mvpg_pei(), I can spot
>> that
>>
>> 1. "This interception can only happen for guests with DAT disabled
>> ..." 2. KVM does not make use of any mvpg state inside the SCB.
>>
>> Can this be observed with Linux guests?
>
> a Linux guest will typically not run with DAT disabled
>
>> Can I get some information on what information is stored at [0xc0,
>> 0xd0) inside the SCB? I assume it's:
>>
>> 0xc0: guest physical address of source PTE
>> 0xc8: guest physical address of target PTE
>
> yes (plus 3 flags in the lower bits of each)
Thanks! Do the flags tell us what the deal with the PTE was? If yes,
what's the meaning of the separate flags?
I assume something like "invalid, protected, ??"
I'm asking because I think we can handle this a little easier.
>
>> [...]
>>> /*
>>> * Run the vsie on a shadow scb and a shadow gmap, without any
>>> further
>>> * sanity checks, handling SIE faults.
>>> @@ -1063,6 +1132,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
>>> if ((scb_s->ipa & 0xf000) != 0xf000)
>>> scb_s->ipa += 0x1000;
>>> break;
>>> + case ICPT_PARTEXEC:
>>> + if (scb_s->ipa == 0xb254)
>>
>> Old code had "/* MVPG only */" - why is this condition now necessary?
>
> old code was wrong ;)
In arch/s390/kvm/intercept.c:handle_partial_execution() we only seem to handle
1. MVPG
2. SIGP PEI
The latter is only relevant for external calls. IIRC, this is only active
with sigp interpretation - which is never active under vsie (ECA_SIGPI).
--
Thanks,
David / dhildenb
On Mon, 4 Jan 2021 17:08:15 +0100
David Hildenbrand <[email protected]> wrote:
> On 04.01.21 16:22, Claudio Imbrenda wrote:
> > On Sun, 20 Dec 2020 11:13:57 +0100
> > David Hildenbrand <[email protected]> wrote:
> >
> >> On 18.12.20 15:18, Claudio Imbrenda wrote:
> >>> Correctly handle the MVPG instruction when issued by a VSIE guest.
> >>>
> >>
> >> I remember that MVPG SIE documentation was completely crazy and
> >> full of corner cases. :)
> >
> > you remember correctly
> >
> >> Looking at arch/s390/kvm/intercept.c:handle_mvpg_pei(), I can spot
> >> that
> >>
> >> 1. "This interception can only happen for guests with DAT disabled ..."
> >> 2. KVM does not make use of any mvpg state inside the SCB.
> >>
> >> Can this be observed with Linux guests?
> >
> > a Linux guest will typically not run with DAT disabled
> >
> >> Can I get some information on what information is stored at [0xc0,
> >> 0xd0) inside the SCB? I assume it's:
> >>
> >> 0xc0: guest physical address of source PTE
> >> 0xc8: guest physical address of target PTE
> >
> > yes (plus 3 flags in the lower bits of each)
>
> Thanks! Do the flags tell us what the deal with the PTE was? If yes,
> what's the meaning of the separate flags?
>
> I assume something like "invalid, protected, ??"
bit 61 indicates that the address is a region or segment table entry,
when EDAT applies
bit 62 is "protected" when the protected bit is set in the segment
table entry (or region, if EDAT applies)
bit 63 is set when the operand was translated with a real-space ASCE
but you can check if the PTE is valid just by dereferencing the
pointers...
> I'm asking because I think we can handle this a little easier.
what is your idea?
> >
> >> [...]
> >>> /*
> >>> * Run the vsie on a shadow scb and a shadow gmap, without any
> >>> further
> >>> * sanity checks, handling SIE faults.
> >>> @@ -1063,6 +1132,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
> >>> if ((scb_s->ipa & 0xf000) != 0xf000)
> >>> scb_s->ipa += 0x1000;
> >>> break;
> >>> + case ICPT_PARTEXEC:
> >>> + if (scb_s->ipa == 0xb254)
> >>
> >> Old code had "/* MVPG only */" - why is this condition now
> >> necessary?
> >
> > old code was wrong ;)
>
>
> In arch/s390/kvm/intercept.c:handle_partial_execution() we only seem to
> handle
>
> 1. MVPG
> 2. SIGP PEI
>
> The latter is only relevant for external calls. IIRC, this is only
> active with sigp interpretation - which is never active under vsie
> (ECA_SIGPI).
I think putting an explicit check is better than just a jump in the
dark.
On 04.01.21 17:36, Claudio Imbrenda wrote:
> On Mon, 4 Jan 2021 17:08:15 +0100
> David Hildenbrand <[email protected]> wrote:
>
>> On 04.01.21 16:22, Claudio Imbrenda wrote:
>>> On Sun, 20 Dec 2020 11:13:57 +0100
>>> David Hildenbrand <[email protected]> wrote:
>>>
>>>> On 18.12.20 15:18, Claudio Imbrenda wrote:
>>>>> Correctly handle the MVPG instruction when issued by a VSIE guest.
>>>>>
>>>>
>>>> I remember that MVPG SIE documentation was completely crazy and
>>>> full of corner cases. :)
>>>
>>> you remember correctly
>>>
>>>> Looking at arch/s390/kvm/intercept.c:handle_mvpg_pei(), I can spot
>>>> that
>>>>
>>>> 1. "This interception can only happen for guests with DAT disabled ..."
>>>> 2. KVM does not make use of any mvpg state inside the SCB.
>>>>
>>>> Can this be observed with Linux guests?
>>>
>>> a Linux guest will typically not run with DAT disabled
>>>
>>>> Can I get some information on what information is stored at [0xc0,
>>>> 0xd0) inside the SCB? I assume it's:
>>>>
>>>> 0xc0: guest physical address of source PTE
>>>> 0xc8: guest physical address of target PTE
>>>
>>> yes (plus 3 flags in the lower bits of each)
>>
>> Thanks! Do the flags tell us what the deal with the PTE was? If yes,
>> what's the meaning of the separate flags?
>>
>> I assume something like "invalid, protected, ??"
>
> bit 61 indicates that the address is a region or segment table entry,
> when EDAT applies
> bit 62 is "protected" when the protected bit is set in the segment
> table entry (or region, if EDAT applies)
> bit 63 is set when the operand was translated with a real-space ASCE
Thanks!
> but you can check if the PTE is valid just by dereferencing the
> pointers...
The pgtable might already have been unshadowed and repurposed, I think.
So for vSIE the PTE content is a little unreliable.
We could, of course, try using them to make a guess.
"Likely valid"
"Likely invalid"
A rerun of the vSIE will fix up any wrong guess.
>
>> I'm asking because I think we can handle this a little easier.
>
> what is your idea?
I was wondering if we can
1. avoid essentially two translations per PTE, obtaining the information
we need while trying to shadow. kvm_s390_shadow_fault() on steroids that
a) gives us the last guest pte address (tricky for segment/region tables
I think ... will have to think about this)
b) the final protection
2. avoid faulting/shadowing in case we know an entry is not problematic.
E.g., no need to shadow/fault the source in case the PTE is there and
not invalid. "likely valid" case above.
The idea would be to call the new kvm_s390_shadow_fault() two times (or
only once due to our guesses) and either rerun the vsie, inject an
interrupt, or create the partial intercept.
Essentially avoiding kvm_s390_vsie_mvpg_check(). Will have to think
about this.
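Rough sketch of the signature I am thinking of (hypothetical, details to
be figured out):

	/*
	 * Like the current kvm_s390_shadow_fault(), but additionally hands
	 * back the g1 address of the last DAT table entry used for the
	 * translation, with the protection state encoded in the low bits.
	 */
	int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
				  unsigned long saddr, unsigned long *datptr);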
[...]
>>
>> In arch/s390/kvm/intercept.c:handle_partial_execution() we only seem to
>> handle
>>
>> 1. MVPG
>> 2. SIGP PEI
>>
>> The latter is only relevant for external calls. IIRC, this is only
>> active with sigp interpretation - which is never active under vsie
>> (ECA_SIGPI).
>
> I think putting an explicit check is better than just a jump in the
> dark.
Agreed, but then it should be called out somewhere why the change was
made (e.g., in a separate cleanup patch).
--
Thanks,
David / dhildenb
On 18.12.20 15:18, Claudio Imbrenda wrote:
> Add kvm_s390_vsie_mvpg_check to perform the necessary checks in case an
> MVPG instruction intercepts in a VSIE guest.
>
> Cc: [email protected]
> Signed-off-by: Claudio Imbrenda <[email protected]>
> ---
> arch/s390/kvm/gaccess.c | 55 +++++++++++++++++++++++++++++++++++++++++
> arch/s390/kvm/gaccess.h | 3 +++
> 2 files changed, 58 insertions(+)
>
> diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
> index 8e256a233583..90e9baff6eac 100644
> --- a/arch/s390/kvm/gaccess.c
> +++ b/arch/s390/kvm/gaccess.c
> @@ -1228,3 +1228,58 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
> mmap_read_unlock(sg->mm);
> return rc;
> }
> +
> +static int kvm_s390_mvpg_check_one(struct kvm_vcpu *vcpu, unsigned long *addr,
> + const int edat, const union asce asce,
> + const enum gacc_mode mode, unsigned long *pteptr)
> +{
> + enum prot_type prot;
> + int rc;
> +
> + rc = guest_translate(vcpu, *addr, addr, asce, mode, &prot, pteptr);
> + if (rc <= 0)
> + return rc;
> +
> + switch (rc) {
> + case PGM_REGION_FIRST_TRANS:
> + case PGM_REGION_SECOND_TRANS:
> + case PGM_REGION_THIRD_TRANS:
> + case PGM_SEGMENT_TRANSLATION:
> + if (!edat)
> + return trans_exc(vcpu, rc, *addr, 0, mode, prot);
> + *pteptr |= 4;
Hmmm, I wonder why that is necessary. Can't we set that in all relevant
cases in guest_translate() just as you do via
*entryptr |= dat_protection ? 6 : 4;
Can you enlighten me? :)
> + fallthrough;
> + case PGM_PAGE_TRANSLATION:
> + return -ENOENT;
> + default:
> + return rc;
> + }
> +}
> +
> +int kvm_s390_vsie_mvpg_check(struct kvm_vcpu *vcpu, unsigned long r1,
> + unsigned long r2, void *gpei)
> +{
> + unsigned long pei[2] = {0};
> + union ctlreg0 cr0;
> + union asce cr1;
> + int edat, rc1, rc2;
> +
> + cr0.val = vcpu->arch.sie_block->gcr[0];
> + cr1.val = vcpu->arch.sie_block->gcr[1];
> + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
> +
> + rc1 = kvm_s390_mvpg_check_one(vcpu, &r1, edat, cr1, GACC_FETCH, pei);
> + rc2 = kvm_s390_mvpg_check_one(vcpu, &r2, edat, cr1, GACC_STORE, pei + 1);
> +
> + if (rc1 == -ENOENT || rc2 == -ENOENT) {
> + memcpy(gpei, pei, sizeof(pei));
I'd really prefer just passing two unsigned long pointers to
kvm_s390_vsie_mvpg_check() and eventually directly forwarding them to
kvm_s390_mvpg_check_one().
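I.e. something like this (sketch only):

	int kvm_s390_vsie_mvpg_check(struct kvm_vcpu *vcpu, unsigned long r1,
				     unsigned long r2, unsigned long *pei1,
				     unsigned long *pei2);

Then the caller decides where the two PEI values end up, and the void
pointer plus the memcpy() go away.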
> + return -ENOENT;
> + }
> +
> + if (rc2 < 0)
> + return rc2;
> + if (rc1 < 0)
> + return rc1;
> +
> + return 0;
> +}
> diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
> index f4c51756c462..2c53cee3b29f 100644
> --- a/arch/s390/kvm/gaccess.h
> +++ b/arch/s390/kvm/gaccess.h
> @@ -166,6 +166,9 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
> int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
> unsigned long len, enum gacc_mode mode);
>
> +int kvm_s390_vsie_mvpg_check(struct kvm_vcpu *vcpu, unsigned long r1,
> + unsigned long r2, void *gpei);
> +
> int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
> void *data, unsigned long len, enum gacc_mode mode);
>
>
--
Thanks,
David / dhildenb
On 12/18/20 3:18 PM, Claudio Imbrenda wrote:
> The addresses in the SIE control block of the host should not be
> forwarded to the guest. They are only meaningful to the host, and
> moreover it would be a clear security issue.
>
> Subsequent patches will actually put the right values in the guest SIE
> control block.
>
> Fixes: a3508fbe9dc6d ("KVM: s390: vsie: initial support for nested virtualization")
> Cc: [email protected]
> Signed-off-by: Claudio Imbrenda <[email protected]>
Looks reasonable.
Give me some time to have a deeper look; it's a lot of architecture to read.
> ---
> arch/s390/kvm/vsie.c | 5 -----
> 1 file changed, 5 deletions(-)
>
> diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
> index 4f3cbf6003a9..ada49583e530 100644
> --- a/arch/s390/kvm/vsie.c
> +++ b/arch/s390/kvm/vsie.c
> @@ -416,11 +416,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
> memcpy((void *)((u64)scb_o + 0xc0),
> (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
> break;
> - case ICPT_PARTEXEC:
> - /* MVPG only */
> - memcpy((void *)((u64)scb_o + 0xc0),
> - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
> - break;
> }
>
> if (scb_s->ihcpu != 0xffffU)
>
On 12/18/20 3:18 PM, Claudio Imbrenda wrote:
> Extend guest_translate to optionally return the address of the guest
> DAT table entry which caused the exception, and change the return value to int.
>
> Also return the appropriate values in the low order bits of the address
> indicating protection or EDAT.
>
> Cc: [email protected]
> Signed-off-by: Claudio Imbrenda <[email protected]>
> ---
> arch/s390/kvm/gaccess.c | 33 ++++++++++++++++++++++++++++-----
> 1 file changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
> index 6d6b57059493..8e256a233583 100644
> --- a/arch/s390/kvm/gaccess.c
> +++ b/arch/s390/kvm/gaccess.c
> @@ -598,6 +598,10 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
> * @asce: effective asce
> * @mode: indicates the access mode to be used
> * @prot: returns the type for protection exceptions
> + * @entryptr: returns the physical address of the last DAT table entry
> + * processed, additionally setting a few flags in the lower bits
> + * to indicate whether a translation exception or a protection
> + * exception was encountered during the address translation.
I'd much rather have another argument pointer than fusing the address
and the status bits. Or we could make prot a struct and add your status
bits in.
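Something like this, for example (just a sketch, names made up):

	/* bundle translation results instead of encoding them in the address */
	struct guest_trans_info {
		enum prot_type prot;	/* existing protection type */
		unsigned long entry;	/* address of last DAT table entry */
		bool dat_protection;	/* entry was protected */
		bool not_pte;		/* region/segment entry (EDAT) */
	};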
> *
> * Translate a guest virtual address into a guest absolute address by means
> * of dynamic address translation as specified by the architecture.
> @@ -611,9 +615,10 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
> * the returned value is the program interruption code as defined
> * by the architecture
> */
> -static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> - unsigned long *gpa, const union asce asce,
> - enum gacc_mode mode, enum prot_type *prot)
> +static int guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> + unsigned long *gpa, const union asce asce,
> + enum gacc_mode mode, enum prot_type *prot,
> + unsigned long *entryptr)
> {
> union vaddress vaddr = {.addr = gva};
> union raddress raddr = {.addr = gva};
> @@ -628,6 +633,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> edat1 = ctlreg0.edat && test_kvm_facility(vcpu->kvm, 8);
> edat2 = edat1 && test_kvm_facility(vcpu->kvm, 78);
> iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130);
> + if (entryptr)
> + *entryptr = 0;
> if (asce.r)
> goto real_address;
> ptr = asce.origin * PAGE_SIZE;
> @@ -667,6 +674,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> return PGM_ADDRESSING;
> if (deref_table(vcpu->kvm, ptr, &rfte.val))
> return -EFAULT;
> + if (entryptr)
> + *entryptr = ptr;
> if (rfte.i)
> return PGM_REGION_FIRST_TRANS;
> if (rfte.tt != TABLE_TYPE_REGION1)
> @@ -685,6 +694,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> return PGM_ADDRESSING;
> if (deref_table(vcpu->kvm, ptr, &rste.val))
> return -EFAULT;
> + if (entryptr)
> + *entryptr = ptr;
> if (rste.i)
> return PGM_REGION_SECOND_TRANS;
> if (rste.tt != TABLE_TYPE_REGION2)
> @@ -703,6 +714,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> return PGM_ADDRESSING;
> if (deref_table(vcpu->kvm, ptr, &rtte.val))
> return -EFAULT;
> + if (entryptr)
> + *entryptr = ptr;
> if (rtte.i)
> return PGM_REGION_THIRD_TRANS;
> if (rtte.tt != TABLE_TYPE_REGION3)
> @@ -713,6 +726,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> dat_protection |= rtte.fc1.p;
> iep_protection = rtte.fc1.iep;
> raddr.rfaa = rtte.fc1.rfaa;
> + if (entryptr)
> + *entryptr |= dat_protection ? 6 : 4;
Magic constants are magic
> goto absolute_address;
> }
> if (vaddr.sx01 < rtte.fc0.tf)
> @@ -731,6 +746,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> return PGM_ADDRESSING;
> if (deref_table(vcpu->kvm, ptr, &ste.val))
> return -EFAULT;
> + if (entryptr)
> + *entryptr = ptr;
> if (ste.i)
> return PGM_SEGMENT_TRANSLATION;
> if (ste.tt != TABLE_TYPE_SEGMENT)
> @@ -741,6 +758,8 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> dat_protection |= ste.fc1.p;
> iep_protection = ste.fc1.iep;
> raddr.sfaa = ste.fc1.sfaa;
> + if (entryptr)
> + *entryptr |= dat_protection ? 6 : 4;
> goto absolute_address;
> }
> dat_protection |= ste.fc0.p;
> @@ -751,10 +770,14 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
> return PGM_ADDRESSING;
> if (deref_table(vcpu->kvm, ptr, &pte.val))
> return -EFAULT;
> + if (entryptr)
> + *entryptr = ptr;
> if (pte.i)
> return PGM_PAGE_TRANSLATION;
> if (pte.z)
> return PGM_TRANSLATION_SPEC;
> + if (entryptr && dat_protection)
> + *entryptr |= 2;
> dat_protection |= pte.p;
> iep_protection = pte.iep;
> raddr.pfra = pte.pfra;
> @@ -810,7 +833,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
> PROT_TYPE_LA);
> ga &= PAGE_MASK;
> if (psw_bits(*psw).dat) {
> - rc = guest_translate(vcpu, ga, pages, asce, mode, &prot);
> + rc = guest_translate(vcpu, ga, pages, asce, mode, &prot, NULL);
> if (rc < 0)
> return rc;
> } else {
> @@ -920,7 +943,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
> }
>
> if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */
> - rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot);
> + rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot, NULL);
> if (rc > 0)
> return trans_exc(vcpu, rc, gva, 0, mode, prot);
> } else {
>