2022-01-14 23:07:20

by Matthew Rosato

[permalink] [raw]
Subject: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI translations

Add a routine that will perform a shadow operation between a guest
and host IOAT. A subsequent patch will invoke this in response to
an 04 RPCIT instruction intercept.

Signed-off-by: Matthew Rosato <[email protected]>
---
arch/s390/include/asm/kvm_pci.h | 1 +
arch/s390/include/asm/pci_dma.h | 1 +
arch/s390/kvm/pci.c | 208 +++++++++++++++++++++++++++++++-
arch/s390/kvm/pci.h | 8 +-
4 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/kvm_pci.h b/arch/s390/include/asm/kvm_pci.h
index 770849f13a70..fa90729a35cf 100644
--- a/arch/s390/include/asm/kvm_pci.h
+++ b/arch/s390/include/asm/kvm_pci.h
@@ -30,6 +30,7 @@ struct kvm_zdev_ioat {
struct kvm_zdev {
struct zpci_dev *zdev;
struct kvm *kvm;
+ u64 rpcit_count;
struct kvm_zdev_ioat ioat;
struct zpci_fib fib;
};
diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
index 69e616d0712c..38004e0a4383 100644
--- a/arch/s390/include/asm/pci_dma.h
+++ b/arch/s390/include/asm/pci_dma.h
@@ -52,6 +52,7 @@ enum zpci_ioat_dtype {
#define ZPCI_TABLE_ENTRIES (ZPCI_TABLE_SIZE / ZPCI_TABLE_ENTRY_SIZE)
#define ZPCI_TABLE_PAGES (ZPCI_TABLE_SIZE >> PAGE_SHIFT)
#define ZPCI_TABLE_ENTRIES_PAGES (ZPCI_TABLE_ENTRIES * ZPCI_TABLE_PAGES)
+#define ZPCI_TABLE_ENTRIES_PER_PAGE (ZPCI_TABLE_ENTRIES / ZPCI_TABLE_PAGES)

#define ZPCI_TABLE_BITS 11
#define ZPCI_PT_BITS 8
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index 39c13c25a700..38d2b77ec565 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -149,6 +149,208 @@ int kvm_s390_pci_aen_init(u8 nisc)
return rc;
}

+static int dma_shadow_cpu_trans(struct kvm_vcpu *vcpu, unsigned long *entry,
+ unsigned long *gentry)
+{
+ phys_addr_t gaddr = 0;
+ unsigned long idx;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gpa_t addr;
+ int rc = 0;
+
+ if (pt_entry_isvalid(*gentry)) {
+ /* pin and validate */
+ addr = *gentry & ZPCI_PTE_ADDR_MASK;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ if (is_error_page(page))
+ return -EIO;
+ gaddr = page_to_phys(page) + (addr & ~PAGE_MASK);
+ }
+
+ if (pt_entry_isvalid(*entry)) {
+ /* Either we are invalidating, replacing or no-op */
+ if (gaddr != 0) {
+ if ((*entry & ZPCI_PTE_ADDR_MASK) == gaddr) {
+ /* Duplicate */
+ kvm_release_pfn_dirty(*entry >> PAGE_SHIFT);
+ } else {
+ /* Replace */
+ pfn = (*entry >> PAGE_SHIFT);
+ invalidate_pt_entry(entry);
+ set_pt_pfaa(entry, gaddr);
+ validate_pt_entry(entry);
+ kvm_release_pfn_dirty(pfn);
+ rc = 1;
+ }
+ } else {
+ /* Invalidate */
+ pfn = (*entry >> PAGE_SHIFT);
+ invalidate_pt_entry(entry);
+ kvm_release_pfn_dirty(pfn);
+ rc = 1;
+ }
+ } else if (gaddr != 0) {
+ /* New Entry */
+ set_pt_pfaa(entry, gaddr);
+ validate_pt_entry(entry);
+ }
+
+ return rc;
+}
+
+static unsigned long *dma_walk_guest_cpu_trans(struct kvm_vcpu *vcpu,
+ struct kvm_zdev_ioat *ioat,
+ dma_addr_t dma_addr)
+{
+ unsigned long *rto, *sto, *pto;
+ unsigned int rtx, rts, sx, px, idx;
+ struct page *page;
+ gpa_t addr;
+ int i;
+
+ /* Pin guest segment table if needed */
+ rtx = calc_rtx(dma_addr);
+ rto = ioat->head[(rtx / ZPCI_TABLE_ENTRIES_PER_PAGE)];
+ rts = rtx * ZPCI_TABLE_PAGES;
+ if (!ioat->seg[rts]) {
+ if (!reg_entry_isvalid(rto[rtx % ZPCI_TABLE_ENTRIES_PER_PAGE]))
+ return NULL;
+ sto = get_rt_sto(rto[rtx % ZPCI_TABLE_ENTRIES_PER_PAGE]);
+ addr = ((u64)sto & ZPCI_RTE_ADDR_MASK);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ for (i = 0; i < ZPCI_TABLE_PAGES; i++) {
+ page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
+ if (is_error_page(page)) {
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ return NULL;
+ }
+ ioat->seg[rts + i] = page_to_virt(page) +
+ (addr & ~PAGE_MASK);
+ addr += PAGE_SIZE;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ /* Allocate pin pointers for another segment table if needed */
+ if (!ioat->pt[rtx]) {
+ ioat->pt[rtx] = kcalloc(ZPCI_TABLE_ENTRIES,
+ (sizeof(unsigned long *)), GFP_KERNEL);
+ if (!ioat->pt[rtx])
+ return NULL;
+ }
+ /* Pin guest page table if needed */
+ sx = calc_sx(dma_addr);
+ sto = ioat->seg[(rts + (sx / ZPCI_TABLE_ENTRIES_PER_PAGE))];
+ if (!ioat->pt[rtx][sx]) {
+ if (!reg_entry_isvalid(sto[sx % ZPCI_TABLE_ENTRIES_PER_PAGE]))
+ return NULL;
+ pto = get_st_pto(sto[sx % ZPCI_TABLE_ENTRIES_PER_PAGE]);
+ if (!pto)
+ return NULL;
+ addr = ((u64)pto & ZPCI_STE_ADDR_MASK);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ if (is_error_page(page))
+ return NULL;
+ ioat->pt[rtx][sx] = page_to_virt(page) + (addr & ~PAGE_MASK);
+ }
+ pto = ioat->pt[rtx][sx];
+
+ /* Return guest PTE */
+ px = calc_px(dma_addr);
+ return &pto[px];
+}
+
+
+static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev *zdev,
+ dma_addr_t dma_addr, size_t size)
+{
+ unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct kvm_zdev *kzdev = zdev->kzdev;
+ unsigned long *entry, *gentry;
+ int i, rc = 0, rc2;
+
+ if (!nr_pages || !kzdev)
+ return -EINVAL;
+
+ mutex_lock(&kzdev->ioat.lock);
+ if (!zdev->dma_table || !kzdev->ioat.head[0]) {
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat, dma_addr);
+ if (!gentry)
+ continue;
+ entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
+
+ if (!entry) {
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+
+ rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
+ if (rc2 < 0) {
+ rc = -EIO;
+ goto out_unlock;
+ }
+ dma_addr += PAGE_SIZE;
+ rc += rc2;
+ }
+
+out_unlock:
+ mutex_unlock(&kzdev->ioat.lock);
+ return rc;
+}
+
+int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
+ unsigned long start, unsigned long size,
+ u8 *status)
+{
+ struct zpci_dev *zdev;
+ u32 fh = req >> 32;
+ int rc;
+
+ /* Make sure this is a valid device associated with this guest */
+ zdev = get_zdev_by_fh(fh);
+ if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
+ *status = 0;
+ return -EINVAL;
+ }
+
+ /* Only proceed if the device is using the assist */
+ if (zdev->kzdev->ioat.head[0] == 0)
+ return -EOPNOTSUPP;
+
+ rc = dma_table_shadow(vcpu, zdev, start, size);
+ if (rc < 0) {
+ /*
+ * If errors encountered during shadow operations, we must
+ * fabricate status to present to the guest
+ */
+ switch (rc) {
+ case -ENOMEM:
+ *status = KVM_S390_RPCIT_INS_RES;
+ break;
+ default:
+ *status = KVM_S390_RPCIT_ERR;
+ break;
+ }
+ } else if (rc > 0) {
+ /* Host RPCIT must be issued */
+ rc = zpci_refresh_trans((u64) zdev->fh << 32, start, size,
+ status);
+ }
+ zdev->kzdev->rpcit_count++;
+
+ return rc;
+}
+
/* Modify PCI: Register floating adapter interruption forwarding */
static int kvm_zpci_set_airq(struct zpci_dev *zdev)
{
@@ -620,6 +822,8 @@ EXPORT_SYMBOL_GPL(kvm_s390_pci_attach_kvm);

int kvm_s390_pci_init(void)
{
+ int rc;
+
aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
if (!aift)
return -ENOMEM;
@@ -627,5 +831,7 @@ int kvm_s390_pci_init(void)
spin_lock_init(&aift->gait_lock);
mutex_init(&aift->lock);

- return 0;
+ rc = zpci_get_mdd(&aift->mdd);
+
+ return rc;
}
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
index 54355634df82..bb2be7fc3934 100644
--- a/arch/s390/kvm/pci.h
+++ b/arch/s390/kvm/pci.h
@@ -18,6 +18,9 @@

#define KVM_S390_PCI_DTSM_MASK 0x40

+#define KVM_S390_RPCIT_INS_RES 0x10
+#define KVM_S390_RPCIT_ERR 0x28
+
struct zpci_gaite {
u32 gisa;
u8 gisc;
@@ -33,6 +36,7 @@ struct zpci_aift {
struct kvm_zdev **kzdev;
spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
struct mutex lock; /* Protects the other structures in aift */
+ u32 mdd;
};

extern struct zpci_aift *aift;
@@ -47,7 +51,9 @@ static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,

int kvm_s390_pci_aen_init(u8 nisc);
void kvm_s390_pci_aen_exit(void);
-
+int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
+ unsigned long start, unsigned long end,
+ u8 *status);
int kvm_s390_pci_init(void);

#endif /* __KVM_S390_PCI_H */
--
2.27.0


2022-01-21 19:05:08

by Pierre Morel

[permalink] [raw]
Subject: Re: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI translations



On 1/14/22 21:31, Matthew Rosato wrote:
> Add a routine that will perform a shadow operation between a guest
> and host IOAT. A subsequent patch will invoke this in response to
> an 04 RPCIT instruction intercept.
>
> Signed-off-by: Matthew Rosato <[email protected]>
> ---
> arch/s390/include/asm/kvm_pci.h | 1 +
> arch/s390/include/asm/pci_dma.h | 1 +
> arch/s390/kvm/pci.c | 208 +++++++++++++++++++++++++++++++-
> arch/s390/kvm/pci.h | 8 +-
> 4 files changed, 216 insertions(+), 2 deletions(-)
>
> diff --git a/arch/s390/include/asm/kvm_pci.h b/arch/s390/include/asm/kvm_pci.h
> index 770849f13a70..fa90729a35cf 100644
> --- a/arch/s390/include/asm/kvm_pci.h
> +++ b/arch/s390/include/asm/kvm_pci.h
> @@ -30,6 +30,7 @@ struct kvm_zdev_ioat {
> struct kvm_zdev {
> struct zpci_dev *zdev;
> struct kvm *kvm;
> + u64 rpcit_count;
> struct kvm_zdev_ioat ioat;
> struct zpci_fib fib;
> };
> diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
> index 69e616d0712c..38004e0a4383 100644
> --- a/arch/s390/include/asm/pci_dma.h
> +++ b/arch/s390/include/asm/pci_dma.h
> @@ -52,6 +52,7 @@ enum zpci_ioat_dtype {
> #define ZPCI_TABLE_ENTRIES (ZPCI_TABLE_SIZE / ZPCI_TABLE_ENTRY_SIZE)
> #define ZPCI_TABLE_PAGES (ZPCI_TABLE_SIZE >> PAGE_SHIFT)
> #define ZPCI_TABLE_ENTRIES_PAGES (ZPCI_TABLE_ENTRIES * ZPCI_TABLE_PAGES)
> +#define ZPCI_TABLE_ENTRIES_PER_PAGE (ZPCI_TABLE_ENTRIES / ZPCI_TABLE_PAGES)
>
> #define ZPCI_TABLE_BITS 11
> #define ZPCI_PT_BITS 8
> diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
> index 39c13c25a700..38d2b77ec565 100644
> --- a/arch/s390/kvm/pci.c
> +++ b/arch/s390/kvm/pci.c
> @@ -149,6 +149,208 @@ int kvm_s390_pci_aen_init(u8 nisc)
> return rc;
> }
>
> +static int dma_shadow_cpu_trans(struct kvm_vcpu *vcpu, unsigned long *entry,
> + unsigned long *gentry)
> +{
> + phys_addr_t gaddr = 0;
> + unsigned long idx;
> + struct page *page;
> + kvm_pfn_t pfn;
> + gpa_t addr;
> + int rc = 0;
> +
> + if (pt_entry_isvalid(*gentry)) {
> + /* pin and validate */
> + addr = *gentry & ZPCI_PTE_ADDR_MASK;
> + idx = srcu_read_lock(&vcpu->kvm->srcu);
> + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + if (is_error_page(page))
> + return -EIO;
> + gaddr = page_to_phys(page) + (addr & ~PAGE_MASK);
> + }
> +
> + if (pt_entry_isvalid(*entry)) {
> + /* Either we are invalidating, replacing or no-op */
> + if (gaddr != 0) {
> + if ((*entry & ZPCI_PTE_ADDR_MASK) == gaddr) {
> + /* Duplicate */
> + kvm_release_pfn_dirty(*entry >> PAGE_SHIFT);
> + } else {
> + /* Replace */
> + pfn = (*entry >> PAGE_SHIFT);
> + invalidate_pt_entry(entry);
> + set_pt_pfaa(entry, gaddr);
> + validate_pt_entry(entry);
> + kvm_release_pfn_dirty(pfn);
> + rc = 1;
> + }
> + } else {
> + /* Invalidate */
> + pfn = (*entry >> PAGE_SHIFT);
> + invalidate_pt_entry(entry);
> + kvm_release_pfn_dirty(pfn);
> + rc = 1;
> + }
> + } else if (gaddr != 0) {
> + /* New Entry */
> + set_pt_pfaa(entry, gaddr);
> + validate_pt_entry(entry);
> + }
> +
> + return rc;
> +}
> +
> +static unsigned long *dma_walk_guest_cpu_trans(struct kvm_vcpu *vcpu,
> + struct kvm_zdev_ioat *ioat,
> + dma_addr_t dma_addr)
> +{
> + unsigned long *rto, *sto, *pto;
> + unsigned int rtx, rts, sx, px, idx;
> + struct page *page;
> + gpa_t addr;
> + int i;
> +
> + /* Pin guest segment table if needed */
> + rtx = calc_rtx(dma_addr);
> + rto = ioat->head[(rtx / ZPCI_TABLE_ENTRIES_PER_PAGE)];
> + rts = rtx * ZPCI_TABLE_PAGES;
> + if (!ioat->seg[rts]) {
> + if (!reg_entry_isvalid(rto[rtx % ZPCI_TABLE_ENTRIES_PER_PAGE]))
> + return NULL;
> + sto = get_rt_sto(rto[rtx % ZPCI_TABLE_ENTRIES_PER_PAGE]);
> + addr = ((u64)sto & ZPCI_RTE_ADDR_MASK);
> + idx = srcu_read_lock(&vcpu->kvm->srcu);
> + for (i = 0; i < ZPCI_TABLE_PAGES; i++) {
> + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
> + if (is_error_page(page)) {
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + return NULL;
> + }
> + ioat->seg[rts + i] = page_to_virt(page) +
> + (addr & ~PAGE_MASK);
> + addr += PAGE_SIZE;
> + }
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + }
> +
> + /* Allocate pin pointers for another segment table if needed */
> + if (!ioat->pt[rtx]) {
> + ioat->pt[rtx] = kcalloc(ZPCI_TABLE_ENTRIES,
> + (sizeof(unsigned long *)), GFP_KERNEL);
> + if (!ioat->pt[rtx])
> + return NULL;
> + }
> + /* Pin guest page table if needed */
> + sx = calc_sx(dma_addr);
> + sto = ioat->seg[(rts + (sx / ZPCI_TABLE_ENTRIES_PER_PAGE))];
> + if (!ioat->pt[rtx][sx]) {
> + if (!reg_entry_isvalid(sto[sx % ZPCI_TABLE_ENTRIES_PER_PAGE]))
> + return NULL;
> + pto = get_st_pto(sto[sx % ZPCI_TABLE_ENTRIES_PER_PAGE]);
> + if (!pto)
> + return NULL;
> + addr = ((u64)pto & ZPCI_STE_ADDR_MASK);
> + idx = srcu_read_lock(&vcpu->kvm->srcu);
> + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + if (is_error_page(page))
> + return NULL;
> + ioat->pt[rtx][sx] = page_to_virt(page) + (addr & ~PAGE_MASK);
> + }
> + pto = ioat->pt[rtx][sx];
> +
> + /* Return guest PTE */
> + px = calc_px(dma_addr);
> + return &pto[px];
> +}
> +
> +
> +static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev *zdev,
> + dma_addr_t dma_addr, size_t size)
> +{
> + unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
> + struct kvm_zdev *kzdev = zdev->kzdev;
> + unsigned long *entry, *gentry;
> + int i, rc = 0, rc2;
> +
> + if (!nr_pages || !kzdev)
> + return -EINVAL;
> +
> + mutex_lock(&kzdev->ioat.lock);
> + if (!zdev->dma_table || !kzdev->ioat.head[0]) {
> + rc = -EINVAL;
> + goto out_unlock;
> + }
> +
> + for (i = 0; i < nr_pages; i++) {
> + gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat, dma_addr);
> + if (!gentry)
> + continue;
> + entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
> +
> + if (!entry) {
> + rc = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
> + if (rc2 < 0) {
> + rc = -EIO;
> + goto out_unlock;
> + }
> + dma_addr += PAGE_SIZE;
> + rc += rc2;
> + }
> +

In case of error, shouldn't we invalidate the shadow table entries we
validated before the error occurred?

> +out_unlock:
> + mutex_unlock(&kzdev->ioat.lock);
> + return rc;
> +}
> +
> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
> + unsigned long start, unsigned long size,
> + u8 *status)
> +{
> + struct zpci_dev *zdev;
> + u32 fh = req >> 32;
> + int rc;
> +
> + /* Make sure this is a valid device associated with this guest */
> + zdev = get_zdev_by_fh(fh);
> + if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
> + *status = 0;

Wouldn't it be interesting to add some debug information here?
When would this error appear?

Also, if we hit this error it looks like we have a VM problem;
shouldn't we treat this in QEMU and return -EOPNOTSUPP?

> + return -EINVAL;
> + }
> +
> + /* Only proceed if the device is using the assist */
> + if (zdev->kzdev->ioat.head[0] == 0)
> + return -EOPNOTSUPP;
> +
> + rc = dma_table_shadow(vcpu, zdev, start, size);
> + if (rc < 0) {
> + /*
> + * If errors encountered during shadow operations, we must
> + * fabricate status to present to the guest
> + */
> + switch (rc) {
> + case -ENOMEM:
> + *status = KVM_S390_RPCIT_INS_RES;
> + break;
> + default:
> + *status = KVM_S390_RPCIT_ERR;
> + break;
> + }
> + } else if (rc > 0) {
> + /* Host RPCIT must be issued */
> + rc = zpci_refresh_trans((u64) zdev->fh << 32, start, size,
> + status);
> + }
> + zdev->kzdev->rpcit_count++;
> +
> + return rc;
> +}
> +
> /* Modify PCI: Register floating adapter interruption forwarding */
> static int kvm_zpci_set_airq(struct zpci_dev *zdev)
> {
> @@ -620,6 +822,8 @@ EXPORT_SYMBOL_GPL(kvm_s390_pci_attach_kvm);
>
> int kvm_s390_pci_init(void)
> {
> + int rc;
> +
> aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
> if (!aift)
> return -ENOMEM;
> @@ -627,5 +831,7 @@ int kvm_s390_pci_init(void)
> spin_lock_init(&aift->gait_lock);
> mutex_init(&aift->lock);
>
> - return 0;
> + rc = zpci_get_mdd(&aift->mdd);
> +
> + return rc;
> }
> diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
> index 54355634df82..bb2be7fc3934 100644
> --- a/arch/s390/kvm/pci.h
> +++ b/arch/s390/kvm/pci.h
> @@ -18,6 +18,9 @@
>
> #define KVM_S390_PCI_DTSM_MASK 0x40
>
> +#define KVM_S390_RPCIT_INS_RES 0x10
> +#define KVM_S390_RPCIT_ERR 0x28
> +
> struct zpci_gaite {
> u32 gisa;
> u8 gisc;
> @@ -33,6 +36,7 @@ struct zpci_aift {
> struct kvm_zdev **kzdev;
> spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
> struct mutex lock; /* Protects the other structures in aift */
> + u32 mdd;
> };
>
> extern struct zpci_aift *aift;
> @@ -47,7 +51,9 @@ static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
>
> int kvm_s390_pci_aen_init(u8 nisc);
> void kvm_s390_pci_aen_exit(void);
> -
> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
> + unsigned long start, unsigned long end,
> + u8 *status);
> int kvm_s390_pci_init(void);
>
> #endif /* __KVM_S390_PCI_H */
>

--
Pierre Morel
IBM Lab Boeblingen

2022-01-21 19:50:05

by Matthew Rosato

[permalink] [raw]
Subject: Re: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI translations

On 1/19/22 4:29 AM, Pierre Morel wrote:
>
>
> On 1/14/22 21:31, Matthew Rosato wrote:
...
>> +static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev
>> *zdev,
>> +                dma_addr_t dma_addr, size_t size)
>> +{
>> +    unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>> +    struct kvm_zdev *kzdev = zdev->kzdev;
>> +    unsigned long *entry, *gentry;
>> +    int i, rc = 0, rc2;
>> +
>> +    if (!nr_pages || !kzdev)
>> +        return -EINVAL;
>> +
>> +    mutex_lock(&kzdev->ioat.lock);
>> +    if (!zdev->dma_table || !kzdev->ioat.head[0]) {
>> +        rc = -EINVAL;
>> +        goto out_unlock;
>> +    }
>> +
>> +    for (i = 0; i < nr_pages; i++) {
>> +        gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat, dma_addr);
>> +        if (!gentry)
>> +            continue;
>> +        entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
>> +
>> +        if (!entry) {
>> +            rc = -ENOMEM;
>> +            goto out_unlock;
>> +        }
>> +
>> +        rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
>> +        if (rc2 < 0) {
>> +            rc = -EIO;
>> +            goto out_unlock;
>> +        }
>> +        dma_addr += PAGE_SIZE;
>> +        rc += rc2;
>> +    }
>> +
>
> In case of error, shouldn't we invalidate the shadow tables entries we
> did validate until the error?

Hmm, I don't think this is strictly necessary - the status returned
should indicate the specified DMA range is now in an indeterminate state
(putting the onus on the guest to take corrective action via a global
refresh).

In fact I think I screwed that up below in kvm_s390_pci_refresh_trans,
the fabricated status should always be KVM_S390_RPCIT_INS_RES.

>
>> +out_unlock:
>> +    mutex_unlock(&kzdev->ioat.lock);
>> +    return rc;
>> +}
>> +
>> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
>> +                   unsigned long start, unsigned long size,
>> +                   u8 *status)
>> +{
>> +    struct zpci_dev *zdev;
>> +    u32 fh = req >> 32;
>> +    int rc;
>> +
>> +    /* Make sure this is a valid device associated with this guest */
>> +    zdev = get_zdev_by_fh(fh);
>> +    if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
>> +        *status = 0;
>
> Wouldn't it be interesting to add some debug information here.
> When would this appear?

Yes, I agree -- One of the follow-ons I'd like to add after this series
is s390dbf entries; this seems like a good spot for one.

As to when this could happen; it should not under normal circumstances,
but consider something like arbitrary function handles coming from the
intercepted guest instruction. We need to ensure that the specified
function 1) exists and 2) is associated with the guest issuing the refresh.

>
> Also if we have this error this looks like we have a VM problem,
> shouldn't we treat this in QEMU and return -EOPNOTSUPP ?
>

Well, I'm not sure if we can really tell where the problem is (it could
for example indicate a misbehaving guest, or a bug in our KVM tracking
of hostdevs).

The guest chose the function handle, and if we got here then that means
it doesn't indicate that it's an emulated device, which means either we
are using the assist and KVM should handle the intercept or we are not
and userspace should handle it. But in both of those cases, there
should be a host device and it should be associated with the guest.

I think if we decide to throw this to userspace in this event, QEMU
needs some extra code to handle it (basically, if QEMU receives the
intercept and the device is neither emulated nor using intercept mode
then we must treat as an invalid handle as this intercept should have
been handled by KVM)


>> +        return -EINVAL;
>> +    }
>> +
>> +    /* Only proceed if the device is using the assist */
>> +    if (zdev->kzdev->ioat.head[0] == 0)
>> +        return -EOPNOTSUPP;
>> +
>> +    rc = dma_table_shadow(vcpu, zdev, start, size);
>> +    if (rc < 0) {
>> +        /*
>> +         * If errors encountered during shadow operations, we must
>> +         * fabricate status to present to the guest
>> +         */
>> +        switch (rc) {
>> +        case -ENOMEM:
>> +            *status = KVM_S390_RPCIT_INS_RES;
>> +            break;
>> +        default:
>> +            *status = KVM_S390_RPCIT_ERR;
>> +            break;

As mentioned above I think this switch statement should go away and
instead always set KVM_S390_RPCIT_INS_RES when rc < 0.

>> +        }
>> +    } else if (rc > 0) {
>> +        /* Host RPCIT must be issued */
>> +        rc = zpci_refresh_trans((u64) zdev->fh << 32, start, size,
>> +                    status);
>> +    }
>> +    zdev->kzdev->rpcit_count++;
>> +
>> +    return rc;
>> +}
>> +
>>   /* Modify PCI: Register floating adapter interruption forwarding */
>>   static int kvm_zpci_set_airq(struct zpci_dev *zdev)
>>   {
>> @@ -620,6 +822,8 @@ EXPORT_SYMBOL_GPL(kvm_s390_pci_attach_kvm);
>>   int kvm_s390_pci_init(void)
>>   {
>> +    int rc;
>> +
>>       aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
>>       if (!aift)
>>           return -ENOMEM;
>> @@ -627,5 +831,7 @@ int kvm_s390_pci_init(void)
>>       spin_lock_init(&aift->gait_lock);
>>       mutex_init(&aift->lock);
>> -    return 0;
>> +    rc = zpci_get_mdd(&aift->mdd);
>> +
>> +    return rc;
>>   }
>> diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
>> index 54355634df82..bb2be7fc3934 100644
>> --- a/arch/s390/kvm/pci.h
>> +++ b/arch/s390/kvm/pci.h
>> @@ -18,6 +18,9 @@
>>   #define KVM_S390_PCI_DTSM_MASK 0x40
>> +#define KVM_S390_RPCIT_INS_RES 0x10
>> +#define KVM_S390_RPCIT_ERR 0x28
>> +
>>   struct zpci_gaite {
>>       u32 gisa;
>>       u8 gisc;
>> @@ -33,6 +36,7 @@ struct zpci_aift {
>>       struct kvm_zdev **kzdev;
>>       spinlock_t gait_lock; /* Protects the gait, used during AEN
>> forward */
>>       struct mutex lock; /* Protects the other structures in aift */
>> +    u32 mdd;
>>   };
>>   extern struct zpci_aift *aift;
>> @@ -47,7 +51,9 @@ static inline struct kvm
>> *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
>>   int kvm_s390_pci_aen_init(u8 nisc);
>>   void kvm_s390_pci_aen_exit(void);
>> -
>> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
>> +                   unsigned long start, unsigned long end,
>> +                   u8 *status);
>>   int kvm_s390_pci_init(void);
>>   #endif /* __KVM_S390_PCI_H */
>>
>

2022-01-21 19:55:36

by Pierre Morel

[permalink] [raw]
Subject: Re: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI translations



On 1/19/22 17:39, Matthew Rosato wrote:
> On 1/19/22 4:29 AM, Pierre Morel wrote:
>>
>>
>> On 1/14/22 21:31, Matthew Rosato wrote:
> ...
>>> +static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev
>>> *zdev,
>>> +                dma_addr_t dma_addr, size_t size)
>>> +{
>>> +    unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>>> +    struct kvm_zdev *kzdev = zdev->kzdev;
>>> +    unsigned long *entry, *gentry;
>>> +    int i, rc = 0, rc2;
>>> +
>>> +    if (!nr_pages || !kzdev)
>>> +        return -EINVAL;
>>> +
>>> +    mutex_lock(&kzdev->ioat.lock);
>>> +    if (!zdev->dma_table || !kzdev->ioat.head[0]) {
>>> +        rc = -EINVAL;
>>> +        goto out_unlock;
>>> +    }
>>> +
>>> +    for (i = 0; i < nr_pages; i++) {
>>> +        gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat,
>>> dma_addr);
>>> +        if (!gentry)
>>> +            continue;
>>> +        entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
>>> +
>>> +        if (!entry) {
>>> +            rc = -ENOMEM;
>>> +            goto out_unlock;
>>> +        }
>>> +
>>> +        rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
>>> +        if (rc2 < 0) {
>>> +            rc = -EIO;
>>> +            goto out_unlock;
>>> +        }
>>> +        dma_addr += PAGE_SIZE;
>>> +        rc += rc2;
>>> +    }
>>> +
>>
>> In case of error, shouldn't we invalidate the shadow tables entries we
>> did validate until the error?
>
> Hmm, I don't think this is strictly necessary - the status returned
> should indicate the specified DMA range is now in an indeterminate state
> (putting the onus on the guest to take corrective action via a global
> refresh).
>
> In fact I think I screwed that up below in kvm_s390_pci_refresh_trans,
> the fabricated status should always be KVM_S390_RPCIT_INS_RES.

OK

>
>>
>>> +out_unlock:
>>> +    mutex_unlock(&kzdev->ioat.lock);
>>> +    return rc;
>>> +}
>>> +
>>> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long
>>> req,
>>> +                   unsigned long start, unsigned long size,
>>> +                   u8 *status)
>>> +{
>>> +    struct zpci_dev *zdev;
>>> +    u32 fh = req >> 32;
>>> +    int rc;
>>> +
>>> +    /* Make sure this is a valid device associated with this guest */
>>> +    zdev = get_zdev_by_fh(fh);
>>> +    if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
>>> +        *status = 0;
>>
>> Wouldn't it be interesting to add some debug information here.
>> When would this appear?
>
> Yes, I agree -- One of the follow-ons I'd like to add after this series
> is s390dbf entries; this seems like a good spot for one.
>
> As to when this could happen; it should not under normal circumstances,
> but consider something like arbitrary function handles coming from the
> intercepted guest instruction.  We need to ensure that the specified
> function 1) exists and 2) is associated with the guest issuing the refresh.
>
>>
>> Also if we have this error this looks like we have a VM problem,
>> shouldn't we treat this in QEMU and return -EOPNOTSUPP ?
>>
>
> Well, I'm not sure if we can really tell where the problem is (it could
> for example indicate a misbehaving guest, or a bug in our KVM tracking
> of hostdevs).
>
> The guest chose the function handle, and if we got here then that means
> it doesn't indicate that it's an emulated device, which means either we
> are using the assist and KVM should handle the intercept or we are not
> and userspace should handle it.  But in both of those cases, there
> should be a host device and it should be associated with the guest.

That is right if we cannot find an associated zdev = F(fh),
but the two other errors are KVM or QEMU errors AFAIU.

>
> I think if we decide to throw this to userspace in this event, QEMU
> needs some extra code to handle it (basically, if QEMU receives the
> intercept and the device is neither emulated nor using intercept mode
> then we must treat as an invalid handle as this intercept should have
> been handled by KVM)

I do not want to start a discussion on this; I think we can leave it like
this at first and come back to it when we have a good idea of how to
handle it.
Maybe just add a /* TODO */


>
>
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    /* Only proceed if the device is using the assist */
>>> +    if (zdev->kzdev->ioat.head[0] == 0)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    rc = dma_table_shadow(vcpu, zdev, start, size);
>>> +    if (rc < 0) {
>>> +        /*
>>> +         * If errors encountered during shadow operations, we must
>>> +         * fabricate status to present to the guest
>>> +         */
>>> +        switch (rc) {
>>> +        case -ENOMEM:
>>> +            *status = KVM_S390_RPCIT_INS_RES;
>>> +            break;
>>> +        default:
>>> +            *status = KVM_S390_RPCIT_ERR;
>>> +            break;
>
> As mentioned above I think this switch statement should go away and
> instead always set KVM_S390_RPCIT_INS_RES when rc < 0.
>
>>> +        }
>>> +    } else if (rc > 0) {
>>> +        /* Host RPCIT must be issued */
>>> +        rc = zpci_refresh_trans((u64) zdev->fh << 32, start, size,
>>> +                    status);
>>> +    }
>>> +    zdev->kzdev->rpcit_count++;
>>> +
>>> +    return rc;
>>> +}
>>> +
>>>   /* Modify PCI: Register floating adapter interruption forwarding */
>>>   static int kvm_zpci_set_airq(struct zpci_dev *zdev)
>>>   {
>>> @@ -620,6 +822,8 @@ EXPORT_SYMBOL_GPL(kvm_s390_pci_attach_kvm);
>>>   int kvm_s390_pci_init(void)
>>>   {
>>> +    int rc;
>>> +
>>>       aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
>>>       if (!aift)
>>>           return -ENOMEM;
>>> @@ -627,5 +831,7 @@ int kvm_s390_pci_init(void)
>>>       spin_lock_init(&aift->gait_lock);
>>>       mutex_init(&aift->lock);
>>> -    return 0;
>>> +    rc = zpci_get_mdd(&aift->mdd);
>>> +
>>> +    return rc;
>>>   }
>>> diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
>>> index 54355634df82..bb2be7fc3934 100644
>>> --- a/arch/s390/kvm/pci.h
>>> +++ b/arch/s390/kvm/pci.h
>>> @@ -18,6 +18,9 @@
>>>   #define KVM_S390_PCI_DTSM_MASK 0x40
>>> +#define KVM_S390_RPCIT_INS_RES 0x10
>>> +#define KVM_S390_RPCIT_ERR 0x28
>>> +
>>>   struct zpci_gaite {
>>>       u32 gisa;
>>>       u8 gisc;
>>> @@ -33,6 +36,7 @@ struct zpci_aift {
>>>       struct kvm_zdev **kzdev;
>>>       spinlock_t gait_lock; /* Protects the gait, used during AEN
>>> forward */
>>>       struct mutex lock; /* Protects the other structures in aift */
>>> +    u32 mdd;
>>>   };
>>>   extern struct zpci_aift *aift;
>>> @@ -47,7 +51,9 @@ static inline struct kvm
>>> *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
>>>   int kvm_s390_pci_aen_init(u8 nisc);
>>>   void kvm_s390_pci_aen_exit(void);
>>> -
>>> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long
>>> req,
>>> +                   unsigned long start, unsigned long end,
>>> +                   u8 *status);
>>>   int kvm_s390_pci_init(void);
>>>   #endif /* __KVM_S390_PCI_H */
>>>
>>
>

--
Pierre Morel
IBM Lab Boeblingen

2022-01-21 20:01:04

by Matthew Rosato

[permalink] [raw]
Subject: Re: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI translations

On 1/19/22 1:25 PM, Pierre Morel wrote:
>
>
> On 1/19/22 17:39, Matthew Rosato wrote:
>> On 1/19/22 4:29 AM, Pierre Morel wrote:
>>>
>>>
>>> On 1/14/22 21:31, Matthew Rosato wrote:
>> ...
>>>> +static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev
>>>> *zdev,
>>>> +                dma_addr_t dma_addr, size_t size)
>>>> +{
>>>> +    unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>>>> +    struct kvm_zdev *kzdev = zdev->kzdev;
>>>> +    unsigned long *entry, *gentry;
>>>> +    int i, rc = 0, rc2;
>>>> +
>>>> +    if (!nr_pages || !kzdev)
>>>> +        return -EINVAL;
>>>> +
>>>> +    mutex_lock(&kzdev->ioat.lock);
>>>> +    if (!zdev->dma_table || !kzdev->ioat.head[0]) {
>>>> +        rc = -EINVAL;
>>>> +        goto out_unlock;
>>>> +    }
>>>> +
>>>> +    for (i = 0; i < nr_pages; i++) {
>>>> +        gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat,
>>>> dma_addr);
>>>> +        if (!gentry)
>>>> +            continue;
>>>> +        entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
>>>> +
>>>> +        if (!entry) {
>>>> +            rc = -ENOMEM;
>>>> +            goto out_unlock;
>>>> +        }
>>>> +
>>>> +        rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
>>>> +        if (rc2 < 0) {
>>>> +            rc = -EIO;
>>>> +            goto out_unlock;
>>>> +        }
>>>> +        dma_addr += PAGE_SIZE;
>>>> +        rc += rc2;
>>>> +    }
>>>> +
>>>
>>> In case of error, shouldn't we invalidate the shadow tables entries
>>> we did validate until the error?
>>
>> Hmm, I don't think this is strictly necessary - the status returned
>> should indicate the specified DMA range is now in an indeterminate
>> state (putting the onus on the guest to take corrective action via a
>> global refresh).
>>
>> In fact I think I screwed that up below in kvm_s390_pci_refresh_trans,
>> the fabricated status should always be KVM_S390_RPCIT_INS_RES.
>
> OK
>
>>
>>>
>>>> +out_unlock:
>>>> +    mutex_unlock(&kzdev->ioat.lock);
>>>> +    return rc;
>>>> +}
>>>> +
>>>> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long
>>>> req,
>>>> +                   unsigned long start, unsigned long size,
>>>> +                   u8 *status)
>>>> +{
>>>> +    struct zpci_dev *zdev;
>>>> +    u32 fh = req >> 32;
>>>> +    int rc;
>>>> +
>>>> +    /* Make sure this is a valid device associated with this guest */
>>>> +    zdev = get_zdev_by_fh(fh);
>>>> +    if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
>>>> +        *status = 0;
>>>
>>> Wouldn't it be interesting to add some debug information here.
>>> When would this appear?
>>
>> Yes, I agree -- One of the follow-ons I'd like to add after this
>> series is s390dbf entries; this seems like a good spot for one.
>>
>> As to when this could happen; it should not under normal
>> circumstances, but consider something like arbitrary function handles
>> coming from the intercepted guest instruction.  We need to ensure that
>> the specified function 1) exists and 2) is associated with the guest
>> issuing the refresh.
>>
>>>
>>> Also if we have this error this looks like we have a VM problem,
>>> shouldn't we treat this in QEMU and return -EOPNOTSUPP ?
>>>
>>
>> Well, I'm not sure if we can really tell where the problem is (it
>> could for example indicate a misbehaving guest, or a bug in our KVM
>> tracking of hostdevs).
>>
>> The guest chose the function handle, and if we got here then that
>> means it doesn't indicate that it's an emulated device, which means
>> either we are using the assist and KVM should handle the intercept or
>> we are not and userspace should handle it.  But in both of those
>> cases, there should be a host device and it should be associated with
>> the guest.
>
> That is right if we can not find an associated zdev = F(fh)
> but the two other errors are KVM or QEMU errors AFAIU.

I don't think we know for sure for any of the cases... For a
well-behaved guest I agree with your assessment. However, the guest
decides what fh to put into its refresh instruction and so a misbehaving
guest could just pick arbitrary numbers for fh and circumstantially
match some other host device. What if the guest just decided to try
every single possible fh number in a loop with a refresh instruction?
That's neither KVM nor QEMU's fault but can trip each of these cases.

Consider the different cases:

!zdev - Either the guest provided a bogus fh, KVM provided a bad fh via
the VFIO ioctl which then QEMU fed into CLP or KVM provided the right fh
via ioctl but QEMU clobbered it when providing it to the guest via CLP.

!zdev->kzdev - Either the guest provided a bogus fh that just so
happened to match a host fh that has no KVM association, or KVM or QEMU
screwed up somewhere (as above or because we failed to make the KVM
association somehow)

kzdev->kvm != vcpu->kvm - Pretty much the same as above, but the
matching device is actually in use by some other guest. Again it's
possible that a misbehaving guest 'got lucky' with an arbitrary fh that
happened to match a host fh with an existing KVM association -- or more
likely that KVM or QEMU screwed up somewhere.

>
>>
>> I think if we decide to throw this to userspace in this event, QEMU
>> needs some extra code to handle it (basically, if QEMU receives the
>> intercept and the device is neither emulated nor using intercept mode
>> then we must treat as an invalid handle as this intercept should have
>> been handled by KVM)
>
> I do not want to start a discussion on this, I think we can leave it like
> this at first and come back to it when we have a good idea on how to
> handle this.
> May be just add a /* TODO */

OK, sure. In any of the above cases, we are certainly done in KVM
anyway. Whether there's value in passing it onto userspace vs
immediately giving an error, let's think about it.

2022-01-21 21:14:59

by Pierre Morel

[permalink] [raw]
Subject: Re: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI translations



On 1/19/22 21:02, Matthew Rosato wrote:
> On 1/19/22 1:25 PM, Pierre Morel wrote:
>>
>>
>> On 1/19/22 17:39, Matthew Rosato wrote:
>>> On 1/19/22 4:29 AM, Pierre Morel wrote:
>>>>
>>>>
>>>> On 1/14/22 21:31, Matthew Rosato wrote:
>>> ...
>>>>> +static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev
>>>>> *zdev,
>>>>> +                dma_addr_t dma_addr, size_t size)
>>>>> +{
>>>>> +    unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
>>>>> +    struct kvm_zdev *kzdev = zdev->kzdev;
>>>>> +    unsigned long *entry, *gentry;
>>>>> +    int i, rc = 0, rc2;
>>>>> +
>>>>> +    if (!nr_pages || !kzdev)
>>>>> +        return -EINVAL;
>>>>> +
>>>>> +    mutex_lock(&kzdev->ioat.lock);
>>>>> +    if (!zdev->dma_table || !kzdev->ioat.head[0]) {
>>>>> +        rc = -EINVAL;
>>>>> +        goto out_unlock;
>>>>> +    }
>>>>> +
>>>>> +    for (i = 0; i < nr_pages; i++) {
>>>>> +        gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat,
>>>>> dma_addr);
>>>>> +        if (!gentry)
>>>>> +            continue;
>>>>> +        entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
>>>>> +
>>>>> +        if (!entry) {
>>>>> +            rc = -ENOMEM;
>>>>> +            goto out_unlock;
>>>>> +        }
>>>>> +
>>>>> +        rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
>>>>> +        if (rc2 < 0) {
>>>>> +            rc = -EIO;
>>>>> +            goto out_unlock;
>>>>> +        }
>>>>> +        dma_addr += PAGE_SIZE;
>>>>> +        rc += rc2;
>>>>> +    }
>>>>> +
>>>>
>>>> In case of error, shouldn't we invalidate the shadow tables entries
>>>> we did validate until the error?
>>>
>>> Hmm, I don't think this is strictly necessary - the status returned
>>> should indicate the specified DMA range is now in an indeterminate
>>> state (putting the onus on the guest to take corrective action via a
>>> global refresh).
>>>
>>> In fact I think I screwed that up below in
>>> kvm_s390_pci_refresh_trans, the fabricated status should always be
>>> KVM_S390_RPCIT_INS_RES.
>>
>> OK
>>
>>>
>>>>
>>>>> +out_unlock:
>>>>> +    mutex_unlock(&kzdev->ioat.lock);
>>>>> +    return rc;
>>>>> +}
>>>>> +
>>>>> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned
>>>>> long req,
>>>>> +                   unsigned long start, unsigned long size,
>>>>> +                   u8 *status)
>>>>> +{
>>>>> +    struct zpci_dev *zdev;
>>>>> +    u32 fh = req >> 32;
>>>>> +    int rc;
>>>>> +
>>>>> +    /* Make sure this is a valid device associated with this guest */
>>>>> +    zdev = get_zdev_by_fh(fh);
>>>>> +    if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
>>>>> +        *status = 0;
>>>>
>>>> Wouldn't it be interesting to add some debug information here.
>>>> When would this appear?
>>>
>>> Yes, I agree -- One of the follow-ons I'd like to add after this
>>> series is s390dbf entries; this seems like a good spot for one.
>>>
>>> As to when this could happen; it should not under normal
>>> circumstances, but consider something like arbitrary function handles
>>> coming from the intercepted guest instruction.  We need to ensure
>>> that the specified function 1) exists and 2) is associated with the
>>> guest issuing the refresh.
>>>
>>>>
>>>> Also if we have this error this looks like we have a VM problem,
>>>> shouldn't we treat this in QEMU and return -EOPNOTSUPP ?
>>>>
>>>
>>> Well, I'm not sure if we can really tell where the problem is (it
>>> could for example indicate a misbehaving guest, or a bug in our KVM
>>> tracking of hostdevs).
>>>
>>> The guest chose the function handle, and if we got here then that
>>> means it doesn't indicate that it's an emulated device, which means
>>> either we are using the assist and KVM should handle the intercept or
>>> we are not and userspace should handle it.  But in both of those
>>> cases, there should be a host device and it should be associated with
>>> the guest.
>>
>> That is right if we can not find an associated zdev = F(fh)
>> but the two other errors are KVM or QEMU errors AFAIU.
>
> I don't think we know for sure for any of the cases...  For a
> well-behaved guest I agree with your assessment.  However, the guest
> decides what fh to put into its refresh instruction and so a misbehaving
> guest could just pick arbitrary numbers for fh and circumstantially
> match some other host device.  What if the guest just decided to try
> every single possible fh number in a loop with a refresh instruction?
> That's neither KVM nor QEMU's fault but can trip each of these cases.
>
> Consider the different cases:
>
> !zdev - Either the guest provided a bogus fh, KVM provided a bad fh via
> the VFIO ioctl which then QEMU fed into CLP or KVM provided the right fh
> via ioctl but QEMU clobbered it when providing it to the guest via CLP.
>
> !zdev->kzdev - Either the guest provided a bogus fh that just so
> happened to match a host fh that has no KVM association, or KVM or QEMU
> screwed up somewhere (as above or because we failed to make the KVM
> association somehow)
>
> kzdev->kvm != vcpu->kvm - Pretty much the same as above, but the
> matching device is actually in use by some other guest.  Again it's
> possible that a misbehaving guest 'got lucky' with an arbitrary fh that
> happened to match a host fh with an existing KVM association -- or more
> likely that KVM or QEMU screwed up somewhere.

OK, I understand and you are right, my error was to consider that
get_zdev_by_fh() returns a zdev associated with a valid FH for the guest
while it returns a zdev associated with a valid FH for the host.

If the comment had been placed after the get_zdev_by_fh() call and before
the test, I might not have made this mistake.

>
>>
>>>
>>> I think if we decide to throw this to userspace in this event, QEMU
>>> needs some extra code to handle it (basically, if QEMU receives the
>>> intercept and the device is neither emulated nor using intercept mode
>>> then we must treat as an invalid handle as this intercept should have
>>> been handled by KVM)
>>
>> I do not want to start a discussion on this, I think we can leave it
>> like this at first and come back to it when we have a good idea on how
>> to handle this.
>> May be just add a /* TODO */
>
> OK, sure.  In any of the above cases, we are certainly done in KVM
> anyway.  Whether there's value in passing it onto userspace vs
> immediately giving an error, let's think about it.

No, I do not think we should anymore.
Sorry for this wrong idea.

--
Pierre Morel
IBM Lab Boeblingen