While s390x makes sure to never have PMD-mapped THP in processes that use
KVM -- by remapping them using PTEs in
thp_split_walk_pmd_entry()->split_huge_pmd() -- there is still the
possibility of having PTE-mapped THPs (large folios) mapped into guest
memory.
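
For context, the split done when a process enables SIE is essentially a
page-table walk whose PMD callback remaps each PMD-mapped THP using
PTEs. A rough sketch of that mechanism (paraphrased, not the verbatim
arch/s390/mm/gmap.c code; the thp_split_vma() wrapper is purely
illustrative and details differ between kernel versions):

#include <linux/mm.h>
#include <linux/pagewalk.h>
#include <linux/huge_mm.h>

/* Remap a PMD-mapped THP using PTEs; the underlying folio stays large. */
static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
				    unsigned long next, struct mm_walk *walk)
{
	split_huge_pmd(walk->vma, pmd, addr);
	return 0;
}

static const struct mm_walk_ops thp_split_walk_ops = {
	.pmd_entry	= thp_split_walk_pmd_entry,
};

/* In the kernel, s390_enable_sie() drives this walk for every VMA. */
static void thp_split_vma(struct vm_area_struct *vma)
{
	walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
			&thp_split_walk_ops, NULL);
}

Note that this only splits the page-table mapping; the large folio
itself stays intact, which is why gmap_make_secure() can still run into
PTE-mapped large folios.
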
This would happen if user space allocates memory before calling
KVM_CREATE_VM (which would call s390_enable_sie()). With upstream QEMU,
this currently doesn't happen, because guest memory is set up and
conditionally preallocated after KVM_CREATE_VM.

Could it happen with shmem/file-backed memory when another process
allocated memory in the pagecache? Likely, although currently not a
common setup.

Trying to split any PTE-mapped large folios sounds like the right and
future-proof thing to do here. So let's call split_folio() and handle the
return values accordingly.

Signed-off-by: David Hildenbrand <[email protected]>
---
arch/s390/kernel/uv.c | 31 +++++++++++++++++++++++++------
1 file changed, 25 insertions(+), 6 deletions(-)
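
For readability, here is the new -E2BIG handling from the diff below in
condensed form. The handle_large_folio() wrapper is hypothetical and
only used to show the flow without diff markers; in the patch the logic
sits directly in gmap_make_secure(), where drain_lru(),
drain_lru_called and the "again" label are provided by the surrounding
code, and "retry" corresponds to "goto again":

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/huge_mm.h>

/* Returns 0 if the caller should retry the conversion, an error otherwise. */
static int handle_large_folio(struct folio *folio, bool *drain_lru_called)
{
	int rc;

	/* The caller took a temporary reference before dropping the PTL. */
	folio_lock(folio);
	rc = split_folio(folio);
	folio_unlock(folio);
	folio_put(folio);

	switch (rc) {
	case 0:
		/* Splitting succeeded, retry immediately. */
		return 0;
	case -EAGAIN:
		/* Additional folio references, e.g. from the LRU caches. */
		if (drain_lru(drain_lru_called))
			return 0;
		return -EAGAIN;
	case -EBUSY:
		/* Unexpected race. */
		return -EAGAIN;
	}
	WARN_ON_ONCE(1);
	return -ENXIO;
}

Both split_folio() failures are reported as -EAGAIN, so callers of
gmap_make_secure() can keep their existing retry handling.
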
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 25fe28d189df..3c6d86e3e828 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -338,11 +338,10 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
goto out;
if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
folio = page_folio(pte_page(*ptep));
- rc = -EINVAL;
- if (folio_test_large(folio))
- goto unlock;
rc = -EAGAIN;
- if (folio_trylock(folio)) {
+ if (folio_test_large(folio)) {
+ rc = -E2BIG;
+ } else if (folio_trylock(folio)) {
if (should_export_before_import(uvcb, gmap->mm))
uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
rc = make_folio_secure(folio, uvcb);
@@ -353,15 +352,35 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
* Once we drop the PTL, the folio may get unmapped and
* freed immediately. We need a temporary reference.
*/
- if (rc == -EAGAIN)
+ if (rc == -EAGAIN || rc == -E2BIG)
folio_get(folio);
}
-unlock:
pte_unmap_unlock(ptep, ptelock);
out:
mmap_read_unlock(gmap->mm);

switch (rc) {
+ case -E2BIG:
+ folio_lock(folio);
+ rc = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ switch (rc) {
+ case 0:
+ /* Splitting succeeded, try again immediately. */
+ goto again;
+ case -EAGAIN:
+ /* Additional folio references. */
+ if (drain_lru(&drain_lru_called))
+ goto again;
+ return -EAGAIN;
+ case -EBUSY:
+ /* Unexpected race. */
+ return -EAGAIN;
+ }
+ WARN_ON_ONCE(1);
+ return -ENXIO;
case -EAGAIN:
/*
* If we are here because the UVC returned busy or partial
--
2.44.0
On Fri, 12 Apr 2024 16:21:13 +0200
David Hildenbrand <[email protected]> wrote:
> This would happen if user space allocates memory before calling
> KVM_CREATE_VM (which would call s390_enable_sie()). With upstream QEMU,
> this currently doesn't happen, because guest memory is setup and
> condiitonally preallocated after KVM_CREATE_VM.
*conditionally
Reviewed-by: Claudio Imbrenda <[email protected]>