2023-10-27 18:23:56

by Sean Christopherson

[permalink] [raw]
Subject: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
information can be supplied without setting userspace up to fail. The
padding in the new kvm_userspace_memory_region2 structure will be used to
pass a file descriptor in addition to the userspace_addr, i.e. allow
userspace to point at a file descriptor and map memory into a guest that
is NOT mapped into host userspace.

Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
makes detection of bad flags a bit more robust, e.g. if the new fd field
is guarded only by a flag and not a new ioctl(), then a userspace bug
(setting a "bad" flag) would generate out-of-bounds access instead of an
-EINVAL error.

Cc: Jarkko Sakkinen <[email protected]>
Reviewed-by: Paolo Bonzini <[email protected]>
Reviewed-by: Xiaoyao Li <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 4 ++--
include/uapi/linux/kvm.h | 13 ++++++++++++
virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++-------
5 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 21a7578142a1..ace984acc125 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
interface. No error will be returned, but the resulting offset will not be
applied.

+4.139 KVM_SET_USER_MEMORY_REGION2
+---------------------------------
+
+:Capability: KVM_CAP_USER_MEMORY2
+:Architectures: all
+:Type: vm ioctl
+:Parameters: struct kvm_userspace_memory_region2 (in)
+:Returns: 0 on success, -1 on error
+
+::
+
+ struct kvm_userspace_memory_region2 {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size; /* bytes */
+ __u64 userspace_addr; /* start of the userspace allocated memory */
+ };
+
+See KVM_SET_USER_MEMORY_REGION.
+
5. The kvm_run structure
========================

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41cce5031126..6409914428ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
}

for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- struct kvm_userspace_memory_region m;
+ struct kvm_userspace_memory_region2 m;

m.slot = id | (i << 16);
m.flags = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5faba69403ac..4e741ff27af3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1146,9 +1146,9 @@ enum kvm_mr_change {
};

int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem);
+ const struct kvm_userspace_memory_region2 *mem);
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 13065dd96132..bd1abe067f28 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
};

+/* for KVM_SET_USER_MEMORY_REGION2 */
+struct kvm_userspace_memory_region2 {
+ __u32 slot;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 userspace_addr;
+ __u64 pad[16];
+};
+
/*
* The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
* userspace, other bits are reserved for kvm internal use which are defined
@@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_COUNTER_OFFSET 227
#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
+#define KVM_CAP_USER_MEMORY2 230

#ifdef KVM_CAP_IRQ_ROUTING

@@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {
struct kvm_userspace_memory_region)
#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
+ struct kvm_userspace_memory_region2)

/* enable ucontrol for s390 */
struct kvm_s390_ucas_mapping {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6e708017064d..3f5b7c2c5327 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1578,7 +1578,7 @@ static void kvm_replace_memslot(struct kvm *kvm,
}
}

-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

@@ -1980,7 +1980,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
* Must be called holding kvm->slots_lock for write.
*/
int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem)
+ const struct kvm_userspace_memory_region2 *mem)
{
struct kvm_memory_slot *old, *new;
struct kvm_memslots *slots;
@@ -2084,7 +2084,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem)
+ const struct kvm_userspace_memory_region2 *mem)
{
int r;

@@ -2096,7 +2096,7 @@ int kvm_set_memory_region(struct kvm *kvm,
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+ struct kvm_userspace_memory_region2 *mem)
{
if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
return -EINVAL;
@@ -4566,6 +4566,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
switch (arg) {
case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_USER_MEMORY2:
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
case KVM_CAP_INTERNAL_ERROR_DATA:
@@ -4821,6 +4822,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
return fd;
}

+#define SANITY_CHECK_MEM_REGION_FIELD(field) \
+do { \
+ BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
+ offsetof(struct kvm_userspace_memory_region2, field)); \
+ BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
+ sizeof_field(struct kvm_userspace_memory_region2, field)); \
+} while (0)
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4843,15 +4852,28 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
break;
}
+ case KVM_SET_USER_MEMORY_REGION2:
case KVM_SET_USER_MEMORY_REGION: {
- struct kvm_userspace_memory_region kvm_userspace_mem;
+ struct kvm_userspace_memory_region2 mem;
+ unsigned long size;
+
+ if (ioctl == KVM_SET_USER_MEMORY_REGION)
+ size = sizeof(struct kvm_userspace_memory_region);
+ else
+ size = sizeof(struct kvm_userspace_memory_region2);
+
+ /* Ensure the common parts of the two structs are identical. */
+ SANITY_CHECK_MEM_REGION_FIELD(slot);
+ SANITY_CHECK_MEM_REGION_FIELD(flags);
+ SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+ SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+ SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);

r = -EFAULT;
- if (copy_from_user(&kvm_userspace_mem, argp,
- sizeof(kvm_userspace_mem)))
+ if (copy_from_user(&mem, argp, size))
goto out;

- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+ r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
break;
}
case KVM_GET_DIRTY_LOG: {
--
2.42.0.820.g83a721a137-goog


2023-10-30 16:42:28

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On 10/27/23 20:21, Sean Christopherson wrote:
>
> + if (ioctl == KVM_SET_USER_MEMORY_REGION)
> + size = sizeof(struct kvm_userspace_memory_region);

This also needs a memset(&mem, 0, sizeof(mem)), otherwise the
out-of-bounds access of the commit message becomes a kernel stack read.

Probably worth adding a check on valid flags here.

Paolo

> + else
> + size = sizeof(struct kvm_userspace_memory_region2);
> +
> + /* Ensure the common parts of the two structs are identical. */
> + SANITY_CHECK_MEM_REGION_FIELD(slot);
> + SANITY_CHECK_MEM_REGION_FIELD(flags);
> + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
> + SANITY_CHECK_MEM_REGION_FIELD(memory_size);
> + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
>


2023-10-30 20:26:41

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On Mon, Oct 30, 2023, Paolo Bonzini wrote:
> On 10/27/23 20:21, Sean Christopherson wrote:
> >
> > + if (ioctl == KVM_SET_USER_MEMORY_REGION)
> > + size = sizeof(struct kvm_userspace_memory_region);
>
> This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds
> access of the commit message becomes a kernel stack read.

Ouch. There's some irony. Might be worth doing memset(&mem, -1, sizeof(mem))
though as '0' is a valid file descriptor and a valid file offset.

> Probably worth adding a check on valid flags here.

Definitely needed. There's a very real bug here. But rather than duplicate flags
checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(),
what if we:

1. Acquire/release slots_lock in __kvm_set_memory_region()
2. Call kvm_set_memory_region() from x86 code for the internal memslots
3. Disallow *any* flags for internal memslots
4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()
5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE
only for KVM_SET_USER_MEMORY_REGION2

E.g. this over ~5 patches

---
arch/x86/kvm/x86.c | 2 +-
include/linux/kvm_host.h | 4 +--
virt/kvm/kvm_main.c | 65 +++++++++++++++++-----------------------
3 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3eb608b6692..dd3e2017366c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12478,7 +12478,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
m.guest_phys_addr = gpa;
m.userspace_addr = hva;
m.memory_size = size;
- r = __kvm_set_memory_region(kvm, &m);
+ r = kvm_set_memory_region(kvm, &m);
if (r < 0)
return ERR_PTR_USR(r);
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 687589ce9f63..fbb98efe8200 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1170,7 +1170,7 @@ static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_
* -- just change its flags
*
* Since flags can be changed by some of these operations, the following
- * differentiation is the best we can do for __kvm_set_memory_region():
+ * differentiation is the best we can do for __kvm_set_memory_region().
*/
enum kvm_mr_change {
KVM_MR_CREATE,
@@ -1181,8 +1181,6 @@ enum kvm_mr_change {

int kvm_set_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region2 *mem);
-int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region2 *mem);
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 23633984142f..39ceee2f67f2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1608,28 +1608,6 @@ static void kvm_replace_memslot(struct kvm *kvm,
}
}

-static int check_memory_region_flags(struct kvm *kvm,
- const struct kvm_userspace_memory_region2 *mem)
-{
- u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
-
- if (kvm_arch_has_private_mem(kvm))
- valid_flags |= KVM_MEM_PRIVATE;
-
- /* Dirty logging private memory is not currently supported. */
- if (mem->flags & KVM_MEM_PRIVATE)
- valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
-#ifdef __KVM_HAVE_READONLY_MEM
- valid_flags |= KVM_MEM_READONLY;
-#endif
-
- if (mem->flags & ~valid_flags)
- return -EINVAL;
-
- return 0;
-}
-
static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
{
struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
@@ -2014,11 +1992,9 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
* space.
*
* Discontiguous memory is allowed, mostly for framebuffers.
- *
- * Must be called holding kvm->slots_lock for write.
*/
-int __kvm_set_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region2 *mem)
+static int __kvm_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region2 *mem)
{
struct kvm_memory_slot *old, *new;
struct kvm_memslots *slots;
@@ -2028,9 +2004,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
int as_id, id;
int r;

- r = check_memory_region_flags(kvm, mem);
- if (r)
- return r;
+ guard(mutex)(&kvm->slots_lock);

as_id = mem->slot >> 16;
id = (u16)mem->slot;
@@ -2139,27 +2113,42 @@ int __kvm_set_memory_region(struct kvm *kvm,
kfree(new);
return r;
}
-EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region2 *mem)
{
- int r;
+ /* Flags aren't supported for KVM-internal memslots. */
+ if (WARN_ON_ONCE(mem->flags))
+ return -EINVAL;

- mutex_lock(&kvm->slots_lock);
- r = __kvm_set_memory_region(kvm, mem);
- mutex_unlock(&kvm->slots_lock);
- return r;
+ return __kvm_set_memory_region(kvm, mem);
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

-static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, unsigned int ioctl,
struct kvm_userspace_memory_region2 *mem)
{
+ u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+ if (ioctl == KVM_SET_USER_MEMORY_REGION2 &&
+ kvm_arch_has_private_mem(kvm))
+ valid_flags |= KVM_MEM_PRIVATE;
+
+ /* Dirty logging private memory is not currently supported. */
+ if (mem->flags & KVM_MEM_PRIVATE)
+ valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef __KVM_HAVE_READONLY_MEM
+ valid_flags |= KVM_MEM_READONLY;
+#endif
+
+ if (mem->flags & ~valid_flags)
+ return -EINVAL;
+
if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
return -EINVAL;

- return kvm_set_memory_region(kvm, mem);
+ return __kvm_set_memory_region(kvm, mem);
}

#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
@@ -5145,7 +5134,7 @@ static long kvm_vm_ioctl(struct file *filp,
if (copy_from_user(&mem, argp, size))
goto out;

- r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
+ r = kvm_vm_ioctl_set_memory_region(kvm, ioctl, &mem);
break;
}
case KVM_GET_DIRTY_LOG: {

base-commit: 881375a408c0f4ea451ff14545b59216d2923881
--

2023-10-30 22:12:30

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On Mon, Oct 30, 2023, Sean Christopherson wrote:
> On Mon, Oct 30, 2023, Paolo Bonzini wrote:
> > On 10/27/23 20:21, Sean Christopherson wrote:
> > >
> > > + if (ioctl == KVM_SET_USER_MEMORY_REGION)
> > > + size = sizeof(struct kvm_userspace_memory_region);
> >
> > This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds
> > access of the commit message becomes a kernel stack read.
>
> Ouch. There's some irony. Might be worth doing memset(&mem, -1, sizeof(mem))
> though as '0' is a valid file descriptor and a valid file offset.
>
> > Probably worth adding a check on valid flags here.
>
> Definitely needed. There's a very real bug here. But rather than duplicate flags
> checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
> have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(),
> what if we:
>
> 1. Acquire/release slots_lock in __kvm_set_memory_region()

Gah, this won't work with x86's usage, which acquire slots_lock quite far away
from __kvm_set_memory_region() in several places. The rest of the idea still
works, kvm_vm_ioctl_set_memory_region() will just need to take slots_lock manually.

> 2. Call kvm_set_memory_region() from x86 code for the internal memslots
> 3. Disallow *any* flags for internal memslots
> 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()
> 5. Pass @ioctl to kvm_vm_ioctl_set_memory_region() and allow KVM_MEM_PRIVATE
> only for KVM_SET_USER_MEMORY_REGION2

2023-10-30 23:23:15

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On 10/30/23 21:25, Sean Christopherson wrote:
> On Mon, Oct 30, 2023, Paolo Bonzini wrote:
>> On 10/27/23 20:21, Sean Christopherson wrote:
>>>
>>> + if (ioctl == KVM_SET_USER_MEMORY_REGION)
>>> + size = sizeof(struct kvm_userspace_memory_region);
>>
>> This also needs a memset(&mem, 0, sizeof(mem)), otherwise the out-of-bounds
>> access of the commit message becomes a kernel stack read.
>
> Ouch. There's some irony. Might be worth doing memset(&mem, -1, sizeof(mem))
> though as '0' is a valid file descriptor and a valid file offset.

Either is okay, because unless the flags check is screwed up it should
not matter. The memset is actually unnecessary, though it may be a good
idea anyway to keep it, aka belt-and-suspenders.

>> Probably worth adding a check on valid flags here.
>
> Definitely needed. There's a very real bug here. But rather than duplicate flags
> checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
> have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(),
> what if we:
>
> 1. Acquire/release slots_lock in __kvm_set_memory_region()
> 2. Call kvm_set_memory_region() from x86 code for the internal memslots
> 3. Disallow *any* flags for internal memslots
> 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()

I dislike this step, there is a clear point where all paths meet
(ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region().
I think that's the place where flags should be checked. (I don't mind
the restriction on internal memslots; it's just that to me it's not a
particularly natural way to structure the checks).

On the other hand, the place where to protect from out-of-bounds
accesses, is the place where you stop caring about struct
kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and
your code gets it right, by dropping "ioctl" as soon as possible).

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 87f45aa91ced..fe5a2af14fff 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm *kvm)
return true;
}

+/*
+ * Flags that do not access any of the extra space of struct
+ * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_FLAGS
+ * only allows these.
+ */
+#define KVM_SET_USER_MEMORY_REGION_FLAGS \
+ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
+
static int check_memory_region_flags(struct kvm *kvm,
const struct kvm_userspace_memory_region2 *mem)
{
@@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp,
struct kvm_userspace_memory_region2 mem;
unsigned long size;

- if (ioctl == KVM_SET_USER_MEMORY_REGION)
+ if (ioctl == KVM_SET_USER_MEMORY_REGION) {
+ /*
+ * Fields beyond struct kvm_userspace_memory_region shouldn't be
+ * accessed, but avoid leaking kernel memory in case of a bug.
+ */
+ memset(&mem, 0, sizeof(mem));
size = sizeof(struct kvm_userspace_memory_region);
- else
+ } else {
size = sizeof(struct kvm_userspace_memory_region2);
+ }

/* Ensure the common parts of the two structs are identical. */
SANITY_CHECK_MEM_REGION_FIELD(slot);
@@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp,
if (copy_from_user(&mem, argp, size))
goto out;

+ r = -EINVAL;
+ if (ioctl == KVM_SET_USER_MEMORY_REGION &&
+ (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS))
+ goto out;
+
r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
break;
}


That's a kind of patch that you can't really get wrong (though I have
the brown paper bag ready).

Maintainance-wise it's fine, since flags are being added at a pace of
roughly one every five years, and anyway it's also future proof: I placed
the #define near check_memory_region_flags so that in five years we remember
to keep it up to date. But worst case, the new flags will only be allowed
by KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues
waiting to bite us.

In sum, this is exactly the only kind of fix that should be in the v13->v14
delta.

Paolo

2023-10-31 00:18:35

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On Tue, Oct 31, 2023, Paolo Bonzini wrote:
> On 10/30/23 21:25, Sean Christopherson wrote:
> > > Probably worth adding a check on valid flags here.
> >
> > Definitely needed. There's a very real bug here. But rather than duplicate flags
> > checking or plumb @ioctl all the way to __kvm_set_memory_region(), now that we
> > have the fancy guard(mutex) and there are no internal calls to kvm_set_memory_region(),
> > what if we:
> >
> > 1. Acquire/release slots_lock in __kvm_set_memory_region()
> > 2. Call kvm_set_memory_region() from x86 code for the internal memslots
> > 3. Disallow *any* flags for internal memslots
> > 4. Open code check_memory_region_flags in kvm_vm_ioctl_set_memory_region()
>
> I dislike this step, there is a clear point where all paths meet
> (ioctl/internal, locked/unlocked) and that's __kvm_set_memory_region().
> I think that's the place where flags should be checked. (I don't mind
> the restriction on internal memslots; it's just that to me it's not a
> particularly natural way to structure the checks).

Yeah, I just don't like the discrepancy it causes where some flags are explicitly
checked and allowed, allowed and then later disallowed.

> On the other hand, the place where to protect from out-of-bounds
> accesses, is the place where you stop caring about struct
> kvm_userspace_memory_region vs kvm_userspace_memory_region2 (and
> your code gets it right, by dropping "ioctl" as soon as possible).
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 87f45aa91ced..fe5a2af14fff 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1635,6 +1635,14 @@ bool __weak kvm_arch_dirty_log_supported(struct kvm *kvm)
> return true;
> }
> +/*
> + * Flags that do not access any of the extra space of struct
> + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_FLAGS
> + * only allows these.
> + */
> +#define KVM_SET_USER_MEMORY_REGION_FLAGS \

Can we name this KVM_SET_USER_MEMORY_REGION_LEGACY_FLAGS, or something equally
horrific? As is, this sounds way too much like a generic "allowed flags for any
memory region".

Or maybe invert the macro? I.e. something to make it more obvious that it's
effectively a versioning check, not a generic "what's supported?" check.

#define KVM_SET_USER_MEMORY_FLAGS_V2_ONLY \
(~(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY))


> + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
> +
> static int check_memory_region_flags(struct kvm *kvm,
> const struct kvm_userspace_memory_region2 *mem)
> {
> @@ -5149,10 +5149,16 @@ static long kvm_vm_ioctl(struct file *filp,
> struct kvm_userspace_memory_region2 mem;
> unsigned long size;
> - if (ioctl == KVM_SET_USER_MEMORY_REGION)
> + if (ioctl == KVM_SET_USER_MEMORY_REGION) {
> + /*
> + * Fields beyond struct kvm_userspace_memory_region shouldn't be
> + * accessed, but avoid leaking kernel memory in case of a bug.
> + */
> + memset(&mem, 0, sizeof(mem));
> size = sizeof(struct kvm_userspace_memory_region);
> - else
> + } else {
> size = sizeof(struct kvm_userspace_memory_region2);
> + }
> /* Ensure the common parts of the two structs are identical. */
> SANITY_CHECK_MEM_REGION_FIELD(slot);
> @@ -5165,6 +5167,11 @@ static long kvm_vm_ioctl(struct file *filp,
> if (copy_from_user(&mem, argp, size))
> goto out;
> + r = -EINVAL;
> + if (ioctl == KVM_SET_USER_MEMORY_REGION &&
> + (mem->flags & ~KVM_SET_USER_MEMORY_REGION_FLAGS))
> + goto out;
> +
> r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> break;
> }
>
>
> That's a kind of patch that you can't really get wrong (though I have
> the brown paper bag ready).
>
> Maintainance-wise it's fine, since flags are being added at a pace of
> roughly one every five years,

Heh, true.

> and anyway it's also future proof: I placed the #define near
> check_memory_region_flags so that in five years we remember to keep it up to
> date. But worst case, the new flags will only be allowed by
> KVM_SET_USER_MEMORY_REGION2 unnecessarily; there are no security issues
> waiting to bite us.
>
> In sum, this is exactly the only kind of fix that should be in the v13->v14
> delta.

Boiling the ocean can be fun too ;-)

2023-10-31 02:26:46

by Xiaoyao Li

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On 10/28/2023 2:21 AM, Sean Christopherson wrote:
> Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
> information can be supplied without setting userspace up to fail. The
> padding in the new kvm_userspace_memory_region2 structure will be used to
> pass a file descriptor in addition to the userspace_addr, i.e. allow
> userspace to point at a file descriptor and map memory into a guest that
> is NOT mapped into host userspace.
>
> Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
> without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
> makes detection of bad flags a bit more robust, e.g. if the new fd field
> is guarded only by a flag and not a new ioctl(), then a userspace bug
> (setting a "bad" flag) would generate out-of-bounds access instead of an
> -EINVAL error.
>
> Cc: Jarkko Sakkinen <[email protected]>
> Reviewed-by: Paolo Bonzini <[email protected]>
> Reviewed-by: Xiaoyao Li <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++
> arch/x86/kvm/x86.c | 2 +-
> include/linux/kvm_host.h | 4 ++--
> include/uapi/linux/kvm.h | 13 ++++++++++++
> virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++-------
> 5 files changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 21a7578142a1..ace984acc125 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
> interface. No error will be returned, but the resulting offset will not be
> applied.
>
> +4.139 KVM_SET_USER_MEMORY_REGION2
> +---------------------------------
> +
> +:Capability: KVM_CAP_USER_MEMORY2
> +:Architectures: all
> +:Type: vm ioctl
> +:Parameters: struct kvm_userspace_memory_region2 (in)
> +:Returns: 0 on success, -1 on error
> +
> +::
> +
> + struct kvm_userspace_memory_region2 {
> + __u32 slot;
> + __u32 flags;
> + __u64 guest_phys_addr;
> + __u64 memory_size; /* bytes */
> + __u64 userspace_addr; /* start of the userspace allocated memory */

missing

__u64 pad[16];

> + };
> +
> +See KVM_SET_USER_MEMORY_REGION.
> +
> 5. The kvm_run structure
> ========================
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 41cce5031126..6409914428ca 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
> }
>
> for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> - struct kvm_userspace_memory_region m;
> + struct kvm_userspace_memory_region2 m;
>
> m.slot = id | (i << 16);
> m.flags = 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5faba69403ac..4e741ff27af3 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1146,9 +1146,9 @@ enum kvm_mr_change {
> };
>
> int kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem);
> + const struct kvm_userspace_memory_region2 *mem);
> int __kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem);
> + const struct kvm_userspace_memory_region2 *mem);
> void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
> void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
> int kvm_arch_prepare_memory_region(struct kvm *kvm,
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 13065dd96132..bd1abe067f28 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
> __u64 userspace_addr; /* start of the userspace allocated memory */
> };
>
> +/* for KVM_SET_USER_MEMORY_REGION2 */
> +struct kvm_userspace_memory_region2 {
> + __u32 slot;
> + __u32 flags;
> + __u64 guest_phys_addr;
> + __u64 memory_size;
> + __u64 userspace_addr;
> + __u64 pad[16];
> +};
> +
> /*
> * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
> * userspace, other bits are reserved for kvm internal use which are defined
> @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_COUNTER_OFFSET 227
> #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
> #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
> +#define KVM_CAP_USER_MEMORY2 230
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {
> struct kvm_userspace_memory_region)
> #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
> #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
> +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
> + struct kvm_userspace_memory_region2)
>
> /* enable ucontrol for s390 */
> struct kvm_s390_ucas_mapping {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 6e708017064d..3f5b7c2c5327 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1578,7 +1578,7 @@ static void kvm_replace_memslot(struct kvm *kvm,
> }
> }
>
> -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
> +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
> {
> u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>
> @@ -1980,7 +1980,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> * Must be called holding kvm->slots_lock for write.
> */
> int __kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem)
> + const struct kvm_userspace_memory_region2 *mem)
> {
> struct kvm_memory_slot *old, *new;
> struct kvm_memslots *slots;
> @@ -2084,7 +2084,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
> EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
>
> int kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem)
> + const struct kvm_userspace_memory_region2 *mem)
> {
> int r;
>
> @@ -2096,7 +2096,7 @@ int kvm_set_memory_region(struct kvm *kvm,
> EXPORT_SYMBOL_GPL(kvm_set_memory_region);
>
> static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
> - struct kvm_userspace_memory_region *mem)
> + struct kvm_userspace_memory_region2 *mem)
> {
> if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
> return -EINVAL;
> @@ -4566,6 +4566,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
> {
> switch (arg) {
> case KVM_CAP_USER_MEMORY:
> + case KVM_CAP_USER_MEMORY2:
> case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
> case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
> case KVM_CAP_INTERNAL_ERROR_DATA:
> @@ -4821,6 +4822,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
> return fd;
> }
>
> +#define SANITY_CHECK_MEM_REGION_FIELD(field) \
> +do { \
> + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
> + offsetof(struct kvm_userspace_memory_region2, field)); \
> + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
> + sizeof_field(struct kvm_userspace_memory_region2, field)); \
> +} while (0)
> +
> static long kvm_vm_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg)
> {
> @@ -4843,15 +4852,28 @@ static long kvm_vm_ioctl(struct file *filp,
> r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
> break;
> }
> + case KVM_SET_USER_MEMORY_REGION2:
> case KVM_SET_USER_MEMORY_REGION: {
> - struct kvm_userspace_memory_region kvm_userspace_mem;
> + struct kvm_userspace_memory_region2 mem;
> + unsigned long size;
> +
> + if (ioctl == KVM_SET_USER_MEMORY_REGION)
> + size = sizeof(struct kvm_userspace_memory_region);
> + else
> + size = sizeof(struct kvm_userspace_memory_region2);
> +
> + /* Ensure the common parts of the two structs are identical. */
> + SANITY_CHECK_MEM_REGION_FIELD(slot);
> + SANITY_CHECK_MEM_REGION_FIELD(flags);
> + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
> + SANITY_CHECK_MEM_REGION_FIELD(memory_size);
> + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
>
> r = -EFAULT;
> - if (copy_from_user(&kvm_userspace_mem, argp,
> - sizeof(kvm_userspace_mem)))
> + if (copy_from_user(&mem, argp, size))
> goto out;
>
> - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
> + r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> break;
> }
> case KVM_GET_DIRTY_LOG: {

2023-10-31 14:06:04

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On Tue, Oct 31, 2023, Xiaoyao Li wrote:
> On 10/28/2023 2:21 AM, Sean Christopherson wrote:
> > Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
> > information can be supplied without setting userspace up to fail. The
> > padding in the new kvm_userspace_memory_region2 structure will be used to
> > pass a file descriptor in addition to the userspace_addr, i.e. allow
> > userspace to point at a file descriptor and map memory into a guest that
> > is NOT mapped into host userspace.
> >
> > Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
> > without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
> > makes detection of bad flags a bit more robust, e.g. if the new fd field
> > is guarded only by a flag and not a new ioctl(), then a userspace bug
> > (setting a "bad" flag) would generate out-of-bounds access instead of an
> > -EINVAL error.
> >
> > Cc: Jarkko Sakkinen <[email protected]>
> > Reviewed-by: Paolo Bonzini <[email protected]>
> > Reviewed-by: Xiaoyao Li <[email protected]>
> > Signed-off-by: Sean Christopherson <[email protected]>
> > ---
> > Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++
> > arch/x86/kvm/x86.c | 2 +-
> > include/linux/kvm_host.h | 4 ++--
> > include/uapi/linux/kvm.h | 13 ++++++++++++
> > virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++-------
> > 5 files changed, 67 insertions(+), 11 deletions(-)
> >
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index 21a7578142a1..ace984acc125 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
> > interface. No error will be returned, but the resulting offset will not be
> > applied.
> > +4.139 KVM_SET_USER_MEMORY_REGION2
> > +---------------------------------
> > +
> > +:Capability: KVM_CAP_USER_MEMORY2
> > +:Architectures: all
> > +:Type: vm ioctl
> > +:Parameters: struct kvm_userspace_memory_region2 (in)
> > +:Returns: 0 on success, -1 on error
> > +
> > +::
> > +
> > + struct kvm_userspace_memory_region2 {
> > + __u32 slot;
> > + __u32 flags;
> > + __u64 guest_phys_addr;
> > + __u64 memory_size; /* bytes */
> > + __u64 userspace_addr; /* start of the userspace allocated memory */
>
> missing
>
> __u64 pad[16];

I can't even copy+paste correctly :-(

2023-11-01 14:20:52

by Fuad Tabba

[permalink] [raw]
Subject: Re: [PATCH v13 08/35] KVM: Introduce KVM_SET_USER_MEMORY_REGION2

On Fri, Oct 27, 2023 at 7:22 PM Sean Christopherson <[email protected]> wrote:
>
> Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional
> information can be supplied without setting userspace up to fail. The
> padding in the new kvm_userspace_memory_region2 structure will be used to
> pass a file descriptor in addition to the userspace_addr, i.e. allow
> userspace to point at a file descriptor and map memory into a guest that
> is NOT mapped into host userspace.
>
> Alternatively, KVM could simply add "struct kvm_userspace_memory_region2"
> without a new ioctl(), but as Paolo pointed out, adding a new ioctl()
> makes detection of bad flags a bit more robust, e.g. if the new fd field
> is guarded only by a flag and not a new ioctl(), then a userspace bug
> (setting a "bad" flag) would generate out-of-bounds access instead of an
> -EINVAL error.
>
> Cc: Jarkko Sakkinen <[email protected]>
> Reviewed-by: Paolo Bonzini <[email protected]>
> Reviewed-by: Xiaoyao Li <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---

With the missing pad in api.rst fixed:
Reviewed-by: Fuad Tabba <[email protected]>
Tested-by: Fuad Tabba <[email protected]>

Cheers,
/fuad

> Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++
> arch/x86/kvm/x86.c | 2 +-
> include/linux/kvm_host.h | 4 ++--
> include/uapi/linux/kvm.h | 13 ++++++++++++
> virt/kvm/kvm_main.c | 38 +++++++++++++++++++++++++++-------
> 5 files changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 21a7578142a1..ace984acc125 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6070,6 +6070,27 @@ writes to the CNTVCT_EL0 and CNTPCT_EL0 registers using the SET_ONE_REG
> interface. No error will be returned, but the resulting offset will not be
> applied.
>
> +4.139 KVM_SET_USER_MEMORY_REGION2
> +---------------------------------
> +
> +:Capability: KVM_CAP_USER_MEMORY2
> +:Architectures: all
> +:Type: vm ioctl
> +:Parameters: struct kvm_userspace_memory_region2 (in)
> +:Returns: 0 on success, -1 on error
> +
> +::
> +
> + struct kvm_userspace_memory_region2 {
> + __u32 slot;
> + __u32 flags;
> + __u64 guest_phys_addr;
> + __u64 memory_size; /* bytes */
> + __u64 userspace_addr; /* start of the userspace allocated memory */
> + };
> +
> +See KVM_SET_USER_MEMORY_REGION.
> +
> 5. The kvm_run structure
> ========================
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 41cce5031126..6409914428ca 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12455,7 +12455,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
> }
>
> for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> - struct kvm_userspace_memory_region m;
> + struct kvm_userspace_memory_region2 m;
>
> m.slot = id | (i << 16);
> m.flags = 0;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5faba69403ac..4e741ff27af3 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1146,9 +1146,9 @@ enum kvm_mr_change {
> };
>
> int kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem);
> + const struct kvm_userspace_memory_region2 *mem);
> int __kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem);
> + const struct kvm_userspace_memory_region2 *mem);
> void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
> void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
> int kvm_arch_prepare_memory_region(struct kvm *kvm,
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 13065dd96132..bd1abe067f28 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region {
> __u64 userspace_addr; /* start of the userspace allocated memory */
> };
>
> +/* for KVM_SET_USER_MEMORY_REGION2 */
> +struct kvm_userspace_memory_region2 {
> + __u32 slot;
> + __u32 flags;
> + __u64 guest_phys_addr;
> + __u64 memory_size;
> + __u64 userspace_addr;
> + __u64 pad[16];
> +};
> +
> /*
> * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for
> * userspace, other bits are reserved for kvm internal use which are defined
> @@ -1192,6 +1202,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_COUNTER_OFFSET 227
> #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
> #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
> +#define KVM_CAP_USER_MEMORY2 230
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1473,6 +1484,8 @@ struct kvm_vfio_spapr_tce {
> struct kvm_userspace_memory_region)
> #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
> #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
> +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \
> + struct kvm_userspace_memory_region2)
>
> /* enable ucontrol for s390 */
> struct kvm_s390_ucas_mapping {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 6e708017064d..3f5b7c2c5327 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1578,7 +1578,7 @@ static void kvm_replace_memslot(struct kvm *kvm,
> }
> }
>
> -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
> +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
> {
> u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>
> @@ -1980,7 +1980,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> * Must be called holding kvm->slots_lock for write.
> */
> int __kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem)
> + const struct kvm_userspace_memory_region2 *mem)
> {
> struct kvm_memory_slot *old, *new;
> struct kvm_memslots *slots;
> @@ -2084,7 +2084,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
> EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
>
> int kvm_set_memory_region(struct kvm *kvm,
> - const struct kvm_userspace_memory_region *mem)
> + const struct kvm_userspace_memory_region2 *mem)
> {
> int r;
>
> @@ -2096,7 +2096,7 @@ int kvm_set_memory_region(struct kvm *kvm,
> EXPORT_SYMBOL_GPL(kvm_set_memory_region);
>
> static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
> - struct kvm_userspace_memory_region *mem)
> + struct kvm_userspace_memory_region2 *mem)
> {
> if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
> return -EINVAL;
> @@ -4566,6 +4566,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
> {
> switch (arg) {
> case KVM_CAP_USER_MEMORY:
> + case KVM_CAP_USER_MEMORY2:
> case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
> case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
> case KVM_CAP_INTERNAL_ERROR_DATA:
> @@ -4821,6 +4822,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
> return fd;
> }
>
> +#define SANITY_CHECK_MEM_REGION_FIELD(field) \
> +do { \
> + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
> + offsetof(struct kvm_userspace_memory_region2, field)); \
> + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
> + sizeof_field(struct kvm_userspace_memory_region2, field)); \
> +} while (0)
> +
> static long kvm_vm_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg)
> {
> @@ -4843,15 +4852,28 @@ static long kvm_vm_ioctl(struct file *filp,
> r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
> break;
> }
> + case KVM_SET_USER_MEMORY_REGION2:
> case KVM_SET_USER_MEMORY_REGION: {
> - struct kvm_userspace_memory_region kvm_userspace_mem;
> + struct kvm_userspace_memory_region2 mem;
> + unsigned long size;
> +
> + if (ioctl == KVM_SET_USER_MEMORY_REGION)
> + size = sizeof(struct kvm_userspace_memory_region);
> + else
> + size = sizeof(struct kvm_userspace_memory_region2);
> +
> + /* Ensure the common parts of the two structs are identical. */
> + SANITY_CHECK_MEM_REGION_FIELD(slot);
> + SANITY_CHECK_MEM_REGION_FIELD(flags);
> + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
> + SANITY_CHECK_MEM_REGION_FIELD(memory_size);
> + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
>
> r = -EFAULT;
> - if (copy_from_user(&kvm_userspace_mem, argp,
> - sizeof(kvm_userspace_mem)))
> + if (copy_from_user(&mem, argp, size))
> goto out;
>
> - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
> + r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> break;
> }
> case KVM_GET_DIRTY_LOG: {
> --
> 2.42.0.820.g83a721a137-goog
>