A large page with mixed private/shared subpages can't be mapped as a
large page since its private/shared subpages come from different memory
backends and may also be treated differently by the architecture. When
private and shared memory are mixed in a large page, the current
lpage_info is not sufficient to decide whether the page can be mapped
as a large page, and additional private/shared 'mixed' information is
needed.

Tracking this 'mixed' information with the current count-like
disallow_lpage is a bit challenging, so reserve a bit in
'disallow_lpage' to indicate a large page has mixed private/shared
subpages, and update this 'mixed' bit whenever the memory attribute is
changed between private and shared.
Signed-off-by: Chao Peng <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 8 ++
arch/x86/kvm/mmu/mmu.c | 134 +++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 2 +
include/linux/kvm_host.h | 19 +++++
virt/kvm/kvm_main.c | 9 ++-
5 files changed, 169 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 283cbb83d6ae..7772ab37ac89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,6 +38,7 @@
#include <asm/hyperv-tlfs.h>
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
+#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
#define KVM_MAX_VCPUS 1024
@@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
#endif
};
+/*
+ * Use a bit in disallow_lpage to indicate private/shared pages mixed at the
+ * level. The remaining bits are used as a reference count.
+ */
+#define KVM_LPAGE_PRIVATE_SHARED_MIXED (1U << 31)
+#define KVM_LPAGE_COUNT_MAX ((1U << 31) - 1)
+
struct kvm_lpage_info {
int disallow_lpage;
};
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e2c70b5afa3e..2190fd8c95c0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
{
struct kvm_lpage_info *linfo;
int i;
+ int disallow_count;
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
+
+ disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
+ WARN_ON(disallow_count + count < 0 ||
+ disallow_count > KVM_LPAGE_COUNT_MAX - count);
+
linfo->disallow_lpage += count;
- WARN_ON(linfo->disallow_lpage < 0);
}
}
@@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
if (kvm->arch.nx_huge_page_recovery_thread)
kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
+
+static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
+{
+ return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
+}
+
+static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
+ int level, bool mixed)
+{
+ struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
+
+ if (mixed)
+ linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
+ else
+ linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
+}
+
+static bool is_expected_attr_entry(void *entry, unsigned long expected_attrs)
+{
+ bool expect_private = expected_attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+ if (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) {
+ if (!expect_private)
+ return false;
+ } else if (expect_private)
+ return false;
+
+ return true;
+}
+
+static bool mem_attrs_mixed_2m(struct kvm *kvm, unsigned long attrs,
+ gfn_t start, gfn_t end)
+{
+ XA_STATE(xas, &kvm->mem_attr_array, start);
+ gfn_t gfn = start;
+ void *entry;
+ bool mixed = false;
+
+ rcu_read_lock();
+ entry = xas_load(&xas);
+ while (gfn < end) {
+ if (xas_retry(&xas, entry))
+ continue;
+
+ KVM_BUG_ON(gfn != xas.xa_index, kvm);
+
+ if (!is_expected_attr_entry(entry, attrs)) {
+ mixed = true;
+ break;
+ }
+
+ entry = xas_next(&xas);
+ gfn++;
+ }
+
+ rcu_read_unlock();
+ return mixed;
+}
+
+static bool mem_attrs_mixed(struct kvm *kvm, struct kvm_memory_slot *slot,
+ int level, unsigned long attrs,
+ gfn_t start, gfn_t end)
+{
+ unsigned long gfn;
+
+ if (level == PG_LEVEL_2M)
+ return mem_attrs_mixed_2m(kvm, attrs, start, end);
+
+ for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1))
+ if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
+ !is_expected_attr_entry(xa_load(&kvm->mem_attr_array, gfn),
+ attrs))
+ return true;
+ return false;
+}
+
+static void kvm_update_lpage_private_shared_mixed(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long attrs,
+ gfn_t start, gfn_t end)
+{
+ unsigned long pages, mask;
+ gfn_t gfn, gfn_end, first, last;
+ int level;
+ bool mixed;
+
+ /*
+ * The sequence matters here: we set the higher level basing on the
+ * lower level's scanning result.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ pages = KVM_PAGES_PER_HPAGE(level);
+ mask = ~(pages - 1);
+ first = start & mask;
+ last = (end - 1) & mask;
+
+ /*
+ * We only need to scan the head and tail page, for middle pages
+ * we know they will not be mixed.
+ */
+ gfn = max(first, slot->base_gfn);
+ gfn_end = min(first + pages, slot->base_gfn + slot->npages);
+ mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
+ linfo_set_mixed(gfn, slot, level, mixed);
+
+ if (first == last)
+ return;
+
+ for (gfn = first + pages; gfn < last; gfn += pages)
+ linfo_set_mixed(gfn, slot, level, false);
+
+ gfn = last;
+ gfn_end = min(last + pages, slot->base_gfn + slot->npages);
+ mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
+ linfo_set_mixed(gfn, slot, level, mixed);
+ }
+}
+
+void kvm_arch_set_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long attrs,
+ gfn_t start, gfn_t end)
+{
+ if (kvm_slot_can_be_private(slot))
+ kvm_update_lpage_private_shared_mixed(kvm, slot, attrs,
+ start, end);
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a07380f8d3c..5aefcff614d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
linfo[lpages - 1].disallow_lpage = 1;
ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ if (kvm_slot_can_be_private(slot))
+ ugfn |= slot->restricted_offset >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
* other, disable large page support for this slot.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3331c0c92838..25099c94e770 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -592,6 +592,11 @@ struct kvm_memory_slot {
struct restrictedmem_notifier notifier;
};
+static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
+{
+ return slot && (slot->flags & KVM_MEM_PRIVATE);
+}
+
static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
{
return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -2316,4 +2321,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
/* Max number of entries allowed for each kvm dirty ring */
#define KVM_DIRTY_RING_MAX_ENTRIES 65536
+#ifdef __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
+void kvm_arch_set_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long attrs,
+ gfn_t start, gfn_t end);
+#else
+static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long attrs,
+ gfn_t start, gfn_t end)
+{
+}
+#endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4e1e1e113bf0..e107afea32f0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2354,7 +2354,8 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
return 0;
}
-static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
+static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long attrs)
{
struct kvm_gfn_range gfn_range;
struct kvm_memory_slot *slot;
@@ -2378,6 +2379,10 @@ static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
gfn_range.slot = slot;
r |= kvm_unmap_gfn_range(kvm, &gfn_range);
+
+ kvm_arch_set_memory_attributes(kvm, slot, attrs,
+ gfn_range.start,
+ gfn_range.end);
}
}
@@ -2427,7 +2432,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
idx = srcu_read_lock(&kvm->srcu);
KVM_MMU_LOCK(kvm);
if (i > start)
- kvm_unmap_mem_range(kvm, start, i);
+ kvm_unmap_mem_range(kvm, start, i, attrs->attributes);
kvm_mmu_invalidate_end(kvm);
KVM_MMU_UNLOCK(kvm);
srcu_read_unlock(&kvm->srcu, idx);
--
2.25.1
On Fri, Dec 02, 2022 at 02:13:45PM +0800,
Chao Peng <[email protected]> wrote:
> A large page with mixed private/shared subpages can't be mapped as a
> large page since its private/shared subpages come from different memory
> backends and may also be treated differently by the architecture. When
> private and shared memory are mixed in a large page, the current
> lpage_info is not sufficient to decide whether the page can be mapped
> as a large page, and additional private/shared 'mixed' information is
> needed.
>
> Tracking this 'mixed' information with the current count-like
> disallow_lpage is a bit challenging, so reserve a bit in
> 'disallow_lpage' to indicate a large page has mixed private/shared
> subpages, and update this 'mixed' bit whenever the memory attribute is
> changed between private and shared.
>
> Signed-off-by: Chao Peng <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 8 ++
> arch/x86/kvm/mmu/mmu.c | 134 +++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 2 +
> include/linux/kvm_host.h | 19 +++++
> virt/kvm/kvm_main.c | 9 ++-
> 5 files changed, 169 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 283cbb83d6ae..7772ab37ac89 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -38,6 +38,7 @@
> #include <asm/hyperv-tlfs.h>
>
> #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> +#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
>
> #define KVM_MAX_VCPUS 1024
>
> @@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
> #endif
> };
>
> +/*
> + * Use a bit in disallow_lpage to indicate private/shared pages mixed at the
> + * level. The remaining bits are used as a reference count.
> + */
> +#define KVM_LPAGE_PRIVATE_SHARED_MIXED (1U << 31)
> +#define KVM_LPAGE_COUNT_MAX ((1U << 31) - 1)
> +
> struct kvm_lpage_info {
> int disallow_lpage;
> };
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e2c70b5afa3e..2190fd8c95c0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
> {
> struct kvm_lpage_info *linfo;
> int i;
> + int disallow_count;
>
> for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
> linfo = lpage_info_slot(gfn, slot, i);
> +
> + disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
> + WARN_ON(disallow_count + count < 0 ||
> + disallow_count > KVM_LPAGE_COUNT_MAX - count);
> +
> linfo->disallow_lpage += count;
> - WARN_ON(linfo->disallow_lpage < 0);
> }
> }
>
> @@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> if (kvm->arch.nx_huge_page_recovery_thread)
> kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> }
> +
> +static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
> +{
> + return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
> +}
> +
> +static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
> + int level, bool mixed)
> +{
> + struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
> +
> + if (mixed)
> + linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
> + else
> + linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
> +}
> +
> +static bool is_expected_attr_entry(void *entry, unsigned long expected_attrs)
> +{
> + bool expect_private = expected_attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +
> + if (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) {
> + if (!expect_private)
> + return false;
> + } else if (expect_private)
> + return false;
> +
> + return true;
> +}
> +
> +static bool mem_attrs_mixed_2m(struct kvm *kvm, unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + XA_STATE(xas, &kvm->mem_attr_array, start);
> + gfn_t gfn = start;
> + void *entry;
> + bool mixed = false;
> +
> + rcu_read_lock();
> + entry = xas_load(&xas);
> + while (gfn < end) {
> + if (xas_retry(&xas, entry))
> + continue;
> +
> + KVM_BUG_ON(gfn != xas.xa_index, kvm);
> +
> + if (!is_expected_attr_entry(entry, attrs)) {
> + mixed = true;
> + break;
> + }
> +
> + entry = xas_next(&xas);
> + gfn++;
> + }
> +
> + rcu_read_unlock();
> + return mixed;
> +}
> +
> +static bool mem_attrs_mixed(struct kvm *kvm, struct kvm_memory_slot *slot,
> + int level, unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + unsigned long gfn;
> +
> + if (level == PG_LEVEL_2M)
> + return mem_attrs_mixed_2m(kvm, attrs, start, end);
> +
> + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1))
> + if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
> + !is_expected_attr_entry(xa_load(&kvm->mem_attr_array, gfn),
> + attrs))
> + return true;
> + return false;
> +}
> +
> +static void kvm_update_lpage_private_shared_mixed(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + unsigned long pages, mask;
> + gfn_t gfn, gfn_end, first, last;
> + int level;
> + bool mixed;
> +
> + /*
> + * The sequence matters here: we set the higher level basing on the
> + * lower level's scanning result.
> + */
> + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
> + pages = KVM_PAGES_PER_HPAGE(level);
> + mask = ~(pages - 1);
> + first = start & mask;
> + last = (end - 1) & mask;
> +
> + /*
> + * We only need to scan the head and tail page, for middle pages
> + * we know they will not be mixed.
> + */
> + gfn = max(first, slot->base_gfn);
> + gfn_end = min(first + pages, slot->base_gfn + slot->npages);
> + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> + linfo_set_mixed(gfn, slot, level, mixed);
> +
> + if (first == last)
> + return;
continue.
> +
> + for (gfn = first + pages; gfn < last; gfn += pages)
> + linfo_set_mixed(gfn, slot, level, false);
> +
> + gfn = last;
> + gfn_end = min(last + pages, slot->base_gfn + slot->npages);
if (gfn == gfn_end) continue.
> + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> + linfo_set_mixed(gfn, slot, level, mixed);
> + }
> +}
> +
> +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + if (kvm_slot_can_be_private(slot))
> + kvm_update_lpage_private_shared_mixed(kvm, slot, attrs,
> + start, end);
> +}
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a07380f8d3c..5aefcff614d2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> linfo[lpages - 1].disallow_lpage = 1;
> ugfn = slot->userspace_addr >> PAGE_SHIFT;
> + if (kvm_slot_can_be_private(slot))
> + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
Is there any alignment restriction? If not, it should be +=.
In practice, alignment will hold though.
Thanks,
> /*
> * If the gfn and userspace address are not aligned wrt each
> * other, disable large page support for this slot.
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3331c0c92838..25099c94e770 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -592,6 +592,11 @@ struct kvm_memory_slot {
> struct restrictedmem_notifier notifier;
> };
>
> +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> +{
> + return slot && (slot->flags & KVM_MEM_PRIVATE);
> +}
> +
> static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> {
> return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> @@ -2316,4 +2321,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
> /* Max number of entries allowed for each kvm dirty ring */
> #define KVM_DIRTY_RING_MAX_ENTRIES 65536
>
> +#ifdef __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end);
> +#else
> +static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> +}
> +#endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */
> +
> #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 4e1e1e113bf0..e107afea32f0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2354,7 +2354,8 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> return 0;
> }
>
> -static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end,
> + unsigned long attrs)
> {
> struct kvm_gfn_range gfn_range;
> struct kvm_memory_slot *slot;
> @@ -2378,6 +2379,10 @@ static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> gfn_range.slot = slot;
>
> r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> +
> + kvm_arch_set_memory_attributes(kvm, slot, attrs,
> + gfn_range.start,
> + gfn_range.end);
> }
> }
>
> @@ -2427,7 +2432,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> idx = srcu_read_lock(&kvm->srcu);
> KVM_MMU_LOCK(kvm);
> if (i > start)
> - kvm_unmap_mem_range(kvm, start, i);
> + kvm_unmap_mem_range(kvm, start, i, attrs->attributes);
> kvm_mmu_invalidate_end(kvm);
> KVM_MMU_UNLOCK(kvm);
> srcu_read_unlock(&kvm->srcu, idx);
> --
> 2.25.1
>
--
Isaku Yamahata <[email protected]>
On Mon, Dec 05, 2022 at 02:49:59PM -0800, Isaku Yamahata wrote:
> On Fri, Dec 02, 2022 at 02:13:45PM +0800,
> Chao Peng <[email protected]> wrote:
>
> > A large page with mixed private/shared subpages can't be mapped as a
> > large page since its private/shared subpages come from different memory
> > backends and may also be treated differently by the architecture. When
> > private and shared memory are mixed in a large page, the current
> > lpage_info is not sufficient to decide whether the page can be mapped
> > as a large page, and additional private/shared 'mixed' information is
> > needed.
> >
> > Tracking this 'mixed' information with the current count-like
> > disallow_lpage is a bit challenging, so reserve a bit in
> > 'disallow_lpage' to indicate a large page has mixed private/shared
> > subpages, and update this 'mixed' bit whenever the memory attribute is
> > changed between private and shared.
> >
> > Signed-off-by: Chao Peng <[email protected]>
> > ---
> > arch/x86/include/asm/kvm_host.h | 8 ++
> > arch/x86/kvm/mmu/mmu.c | 134 +++++++++++++++++++++++++++++++-
> > arch/x86/kvm/x86.c | 2 +
> > include/linux/kvm_host.h | 19 +++++
> > virt/kvm/kvm_main.c | 9 ++-
> > 5 files changed, 169 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 283cbb83d6ae..7772ab37ac89 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -38,6 +38,7 @@
> > #include <asm/hyperv-tlfs.h>
> >
> > #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> > +#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> >
> > #define KVM_MAX_VCPUS 1024
> >
> > @@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
> > #endif
> > };
> >
> > +/*
> > + * Use a bit in disallow_lpage to indicate private/shared pages mixed at the
> > + * level. The remaining bits are used as a reference count.
> > + */
> > +#define KVM_LPAGE_PRIVATE_SHARED_MIXED (1U << 31)
> > +#define KVM_LPAGE_COUNT_MAX ((1U << 31) - 1)
> > +
> > struct kvm_lpage_info {
> > int disallow_lpage;
> > };
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index e2c70b5afa3e..2190fd8c95c0 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
> > {
> > struct kvm_lpage_info *linfo;
> > int i;
> > + int disallow_count;
> >
> > for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
> > linfo = lpage_info_slot(gfn, slot, i);
> > +
> > + disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
> > + WARN_ON(disallow_count + count < 0 ||
> > + disallow_count > KVM_LPAGE_COUNT_MAX - count);
> > +
> > linfo->disallow_lpage += count;
> > - WARN_ON(linfo->disallow_lpage < 0);
> > }
> > }
> >
> > @@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> > if (kvm->arch.nx_huge_page_recovery_thread)
> > kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> > }
> > +
> > +static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
> > +{
> > + return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > +}
> > +
> > +static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
> > + int level, bool mixed)
> > +{
> > + struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
> > +
> > + if (mixed)
> > + linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > + else
> > + linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > +}
> > +
> > +static bool is_expected_attr_entry(void *entry, unsigned long expected_attrs)
> > +{
> > + bool expect_private = expected_attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > +
> > + if (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) {
> > + if (!expect_private)
> > + return false;
> > + } else if (expect_private)
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > +static bool mem_attrs_mixed_2m(struct kvm *kvm, unsigned long attrs,
> > + gfn_t start, gfn_t end)
> > +{
> > + XA_STATE(xas, &kvm->mem_attr_array, start);
> > + gfn_t gfn = start;
> > + void *entry;
> > + bool mixed = false;
> > +
> > + rcu_read_lock();
> > + entry = xas_load(&xas);
> > + while (gfn < end) {
> > + if (xas_retry(&xas, entry))
> > + continue;
> > +
> > + KVM_BUG_ON(gfn != xas.xa_index, kvm);
> > +
> > + if (!is_expected_attr_entry(entry, attrs)) {
> > + mixed = true;
> > + break;
> > + }
> > +
> > + entry = xas_next(&xas);
> > + gfn++;
> > + }
> > +
> > + rcu_read_unlock();
> > + return mixed;
> > +}
> > +
> > +static bool mem_attrs_mixed(struct kvm *kvm, struct kvm_memory_slot *slot,
> > + int level, unsigned long attrs,
> > + gfn_t start, gfn_t end)
> > +{
> > + unsigned long gfn;
> > +
> > + if (level == PG_LEVEL_2M)
> > + return mem_attrs_mixed_2m(kvm, attrs, start, end);
> > +
> > + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1))
> > + if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
> > + !is_expected_attr_entry(xa_load(&kvm->mem_attr_array, gfn),
> > + attrs))
> > + return true;
> > + return false;
> > +}
> > +
> > +static void kvm_update_lpage_private_shared_mixed(struct kvm *kvm,
> > + struct kvm_memory_slot *slot,
> > + unsigned long attrs,
> > + gfn_t start, gfn_t end)
> > +{
> > + unsigned long pages, mask;
> > + gfn_t gfn, gfn_end, first, last;
> > + int level;
> > + bool mixed;
> > +
> > + /*
> > + * The sequence matters here: we set the higher level basing on the
> > + * lower level's scanning result.
> > + */
> > + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
> > + pages = KVM_PAGES_PER_HPAGE(level);
> > + mask = ~(pages - 1);
> > + first = start & mask;
> > + last = (end - 1) & mask;
> > +
> > + /*
> > + * We only need to scan the head and tail page, for middle pages
> > + * we know they will not be mixed.
> > + */
> > + gfn = max(first, slot->base_gfn);
> > + gfn_end = min(first + pages, slot->base_gfn + slot->npages);
> > + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> > + linfo_set_mixed(gfn, slot, level, mixed);
> > +
> > + if (first == last)
> > + return;
>
>
> continue.
Ya!
>
> > +
> > + for (gfn = first + pages; gfn < last; gfn += pages)
> > + linfo_set_mixed(gfn, slot, level, false);
> > +
> > + gfn = last;
> > + gfn_end = min(last + pages, slot->base_gfn + slot->npages);
>
> if (gfn == gfn_end) continue.
Do you see a case where gfn can equal gfn_end? Though it does not
hurt to add a check.
>
>
> > + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> > + linfo_set_mixed(gfn, slot, level, mixed);
> > + }
> > +}
> > +
> > +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> > + struct kvm_memory_slot *slot,
> > + unsigned long attrs,
> > + gfn_t start, gfn_t end)
> > +{
> > + if (kvm_slot_can_be_private(slot))
> > + kvm_update_lpage_private_shared_mixed(kvm, slot, attrs,
> > + start, end);
> > +}
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 9a07380f8d3c..5aefcff614d2 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> > if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> > linfo[lpages - 1].disallow_lpage = 1;
> > ugfn = slot->userspace_addr >> PAGE_SHIFT;
> > + if (kvm_slot_can_be_private(slot))
> > + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
>
> Is there any alignment restriction? If not, it should be +=.
> In practice, alignment will hold though.
All we need here is to check whether both userspace_addr and
restricted_offset are aligned to HPAGE_SIZE. '+=' can actually yield a
wrong value in cases where userspace_addr + restricted_offset is
aligned to HPAGE_SIZE but the two are not individually aligned.
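A made-up numeric example (the values are purely illustrative, not
from the patch): take HPAGE_SIZE = 2M, userspace_addr = 1M and
restricted_offset = 1M, so each component is only 1M-aligned:

	/* '+' loses the misalignment: 1M + 1M == 2M, which is
	 * 2M-aligned, so the check would wrongly allow large pages. */
	(0x100000 + 0x100000) & (0x200000 - 1)	/* == 0, "aligned" */

	/* '|' keeps the misaligned low bits of either component, so
	 * the check correctly disables large pages for this slot. */
	(0x100000 | 0x100000) & (0x200000 - 1)	/* == 0x100000 */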
Thanks,
Chao
>
> Thanks,
>
> > /*
> > * If the gfn and userspace address are not aligned wrt each
> > * other, disable large page support for this slot.
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 3331c0c92838..25099c94e770 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -592,6 +592,11 @@ struct kvm_memory_slot {
> > struct restrictedmem_notifier notifier;
> > };
> >
> > +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> > +{
> > + return slot && (slot->flags & KVM_MEM_PRIVATE);
> > +}
> > +
> > static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> > {
> > return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -2316,4 +2321,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
> > /* Max number of entries allowed for each kvm dirty ring */
> > #define KVM_DIRTY_RING_MAX_ENTRIES 65536
> >
> > +#ifdef __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> > +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> > + struct kvm_memory_slot *slot,
> > + unsigned long attrs,
> > + gfn_t start, gfn_t end);
> > +#else
> > +static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
> > + struct kvm_memory_slot *slot,
> > + unsigned long attrs,
> > + gfn_t start, gfn_t end)
> > +{
> > +}
> > +#endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */
> > +
> > #endif
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 4e1e1e113bf0..e107afea32f0 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2354,7 +2354,8 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> > return 0;
> > }
> >
> > -static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end,
> > + unsigned long attrs)
> > {
> > struct kvm_gfn_range gfn_range;
> > struct kvm_memory_slot *slot;
> > @@ -2378,6 +2379,10 @@ static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> > gfn_range.slot = slot;
> >
> > r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> > +
> > + kvm_arch_set_memory_attributes(kvm, slot, attrs,
> > + gfn_range.start,
> > + gfn_range.end);
> > }
> > }
> >
> > @@ -2427,7 +2432,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > idx = srcu_read_lock(&kvm->srcu);
> > KVM_MMU_LOCK(kvm);
> > if (i > start)
> > - kvm_unmap_mem_range(kvm, start, i);
> > + kvm_unmap_mem_range(kvm, start, i, attrs->attributes);
> > kvm_mmu_invalidate_end(kvm);
> > KVM_MMU_UNLOCK(kvm);
> > srcu_read_unlock(&kvm->srcu, idx);
> > --
> > 2.25.1
> >
>
> --
> Isaku Yamahata <[email protected]>
On Tue, Dec 06, 2022 at 08:02:24PM +0800,
Chao Peng <[email protected]> wrote:
> On Mon, Dec 05, 2022 at 02:49:59PM -0800, Isaku Yamahata wrote:
> > On Fri, Dec 02, 2022 at 02:13:45PM +0800,
> > Chao Peng <[email protected]> wrote:
> >
> > > A large page with mixed private/shared subpages can't be mapped as a
> > > large page since its private/shared subpages come from different memory
> > > backends and may also be treated differently by the architecture. When
> > > private and shared memory are mixed in a large page, the current
> > > lpage_info is not sufficient to decide whether the page can be mapped
> > > as a large page, and additional private/shared 'mixed' information is
> > > needed.
> > >
> > > Tracking this 'mixed' information with the current count-like
> > > disallow_lpage is a bit challenging, so reserve a bit in
> > > 'disallow_lpage' to indicate a large page has mixed private/shared
> > > subpages, and update this 'mixed' bit whenever the memory attribute is
> > > changed between private and shared.
> > >
> > > Signed-off-by: Chao Peng <[email protected]>
> > > ---
> > > arch/x86/include/asm/kvm_host.h | 8 ++
> > > arch/x86/kvm/mmu/mmu.c | 134 +++++++++++++++++++++++++++++++-
> > > arch/x86/kvm/x86.c | 2 +
> > > include/linux/kvm_host.h | 19 +++++
> > > virt/kvm/kvm_main.c | 9 ++-
> > > 5 files changed, 169 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 283cbb83d6ae..7772ab37ac89 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -38,6 +38,7 @@
> > > #include <asm/hyperv-tlfs.h>
> > >
> > > #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> > > +#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> > >
> > > #define KVM_MAX_VCPUS 1024
> > >
> > > @@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
> > > #endif
> > > };
> > >
> > > +/*
> > > + * Use a bit in disallow_lpage to indicate private/shared pages mixed at the
> > > + * level. The remaining bits are used as a reference count.
> > > + */
> > > +#define KVM_LPAGE_PRIVATE_SHARED_MIXED (1U << 31)
> > > +#define KVM_LPAGE_COUNT_MAX ((1U << 31) - 1)
> > > +
> > > struct kvm_lpage_info {
> > > int disallow_lpage;
> > > };
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index e2c70b5afa3e..2190fd8c95c0 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
> > > {
> > > struct kvm_lpage_info *linfo;
> > > int i;
> > > + int disallow_count;
> > >
> > > for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
> > > linfo = lpage_info_slot(gfn, slot, i);
> > > +
> > > + disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
> > > + WARN_ON(disallow_count + count < 0 ||
> > > + disallow_count > KVM_LPAGE_COUNT_MAX - count);
> > > +
> > > linfo->disallow_lpage += count;
> > > - WARN_ON(linfo->disallow_lpage < 0);
> > > }
> > > }
> > >
> > > @@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> > > if (kvm->arch.nx_huge_page_recovery_thread)
> > > kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> > > }
> > > +
> > > +static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
> > > +{
> > > + return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > +}
> > > +
> > > +static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
> > > + int level, bool mixed)
> > > +{
> > > + struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
> > > +
> > > + if (mixed)
> > > + linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > + else
> > > + linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > +}
> > > +
> > > +static bool is_expected_attr_entry(void *entry, unsigned long expected_attrs)
> > > +{
> > > + bool expect_private = expected_attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > > +
> > > + if (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) {
> > > + if (!expect_private)
> > > + return false;
> > > + } else if (expect_private)
> > > + return false;
> > > +
> > > + return true;
> > > +}
> > > +
> > > +static bool mem_attrs_mixed_2m(struct kvm *kvm, unsigned long attrs,
> > > + gfn_t start, gfn_t end)
> > > +{
> > > + XA_STATE(xas, &kvm->mem_attr_array, start);
> > > + gfn_t gfn = start;
> > > + void *entry;
> > > + bool mixed = false;
> > > +
> > > + rcu_read_lock();
> > > + entry = xas_load(&xas);
> > > + while (gfn < end) {
> > > + if (xas_retry(&xas, entry))
> > > + continue;
> > > +
> > > + KVM_BUG_ON(gfn != xas.xa_index, kvm);
> > > +
> > > + if (!is_expected_attr_entry(entry, attrs)) {
> > > + mixed = true;
> > > + break;
> > > + }
> > > +
> > > + entry = xas_next(&xas);
> > > + gfn++;
> > > + }
> > > +
> > > + rcu_read_unlock();
> > > + return mixed;
> > > +}
> > > +
> > > +static bool mem_attrs_mixed(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > + int level, unsigned long attrs,
> > > + gfn_t start, gfn_t end)
> > > +{
> > > + unsigned long gfn;
> > > +
> > > + if (level == PG_LEVEL_2M)
> > > + return mem_attrs_mixed_2m(kvm, attrs, start, end);
> > > +
> > > + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1))
> > > + if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
> > > + !is_expected_attr_entry(xa_load(&kvm->mem_attr_array, gfn),
> > > + attrs))
> > > + return true;
> > > + return false;
> > > +}
> > > +
> > > +static void kvm_update_lpage_private_shared_mixed(struct kvm *kvm,
> > > + struct kvm_memory_slot *slot,
> > > + unsigned long attrs,
> > > + gfn_t start, gfn_t end)
> > > +{
> > > + unsigned long pages, mask;
> > > + gfn_t gfn, gfn_end, first, last;
> > > + int level;
> > > + bool mixed;
> > > +
> > > + /*
> > > + * The sequence matters here: we set the higher level basing on the
> > > + * lower level's scanning result.
> > > + */
> > > + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
> > > + pages = KVM_PAGES_PER_HPAGE(level);
> > > + mask = ~(pages - 1);
> > > + first = start & mask;
> > > + last = (end - 1) & mask;
> > > +
> > > + /*
> > > + * We only need to scan the head and tail page, for middle pages
> > > + * we know they will not be mixed.
> > > + */
> > > + gfn = max(first, slot->base_gfn);
> > > + gfn_end = min(first + pages, slot->base_gfn + slot->npages);
> > > + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> > > + linfo_set_mixed(gfn, slot, level, mixed);
> > > +
> > > + if (first == last)
> > > + return;
> >
> >
> > continue.
>
> Ya!
>
> >
> > > +
> > > + for (gfn = first + pages; gfn < last; gfn += pages)
> > > + linfo_set_mixed(gfn, slot, level, false);
> > > +
> > > + gfn = last;
> > > + gfn_end = min(last + pages, slot->base_gfn + slot->npages);
> >
> > if (gfn == gfn_end) continue.
>
> > Do you see a case where gfn can equal gfn_end? Though it does not
> > hurt to add a check.
If last == base_gfn + npages, gfn == gfn_end can occur.
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 9a07380f8d3c..5aefcff614d2 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> > > if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> > > linfo[lpages - 1].disallow_lpage = 1;
> > > ugfn = slot->userspace_addr >> PAGE_SHIFT;
> > > + if (kvm_slot_can_be_private(slot))
> > > + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
> >
> > Is there any alignment restriction? If not, it should be +=.
> > In practice, alignment will hold though.
>
> All we need here is to check whether both userspace_addr and
> restricted_offset are aligned to HPAGE_SIZE. '+=' can actually yield a
> wrong value in cases where userspace_addr + restricted_offset is
> aligned to HPAGE_SIZE but the two are not individually aligned.
Ah, got it. The below comment explains it.
> Thanks,
> Chao
> >
> > Thanks,
> >
> > > /*
> > > * If the gfn and userspace address are not aligned wrt each
> > > * other, disable large page support for this slot.
--
Isaku Yamahata <[email protected]>
On Tue, Dec 06, 2022 at 10:42:24PM -0800, Isaku Yamahata wrote:
> On Tue, Dec 06, 2022 at 08:02:24PM +0800,
> Chao Peng <[email protected]> wrote:
>
> > On Mon, Dec 05, 2022 at 02:49:59PM -0800, Isaku Yamahata wrote:
> > > On Fri, Dec 02, 2022 at 02:13:45PM +0800,
> > > Chao Peng <[email protected]> wrote:
> > >
> > > > A large page with mixed private/shared subpages can't be mapped as a
> > > > large page since its private/shared subpages come from different memory
> > > > backends and may also be treated differently by the architecture. When
> > > > private and shared memory are mixed in a large page, the current
> > > > lpage_info is not sufficient to decide whether the page can be mapped
> > > > as a large page, and additional private/shared 'mixed' information is
> > > > needed.
> > > >
> > > > Tracking this 'mixed' information with the current count-like
> > > > disallow_lpage is a bit challenging, so reserve a bit in
> > > > 'disallow_lpage' to indicate a large page has mixed private/shared
> > > > subpages, and update this 'mixed' bit whenever the memory attribute is
> > > > changed between private and shared.
> > > >
> > > > Signed-off-by: Chao Peng <[email protected]>
> > > > ---
> > > > arch/x86/include/asm/kvm_host.h | 8 ++
> > > > arch/x86/kvm/mmu/mmu.c | 134 +++++++++++++++++++++++++++++++-
> > > > arch/x86/kvm/x86.c | 2 +
> > > > include/linux/kvm_host.h | 19 +++++
> > > > virt/kvm/kvm_main.c | 9 ++-
> > > > 5 files changed, 169 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > > index 283cbb83d6ae..7772ab37ac89 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -38,6 +38,7 @@
> > > > #include <asm/hyperv-tlfs.h>
> > > >
> > > > #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> > > > +#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> > > >
> > > > #define KVM_MAX_VCPUS 1024
> > > >
> > > > @@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
> > > > #endif
> > > > };
> > > >
> > > > +/*
> > > > + * Use a bit in disallow_lpage to indicate private/shared pages mixed at the
> > > > + * level. The remaining bits are used as a reference count.
> > > > + */
> > > > +#define KVM_LPAGE_PRIVATE_SHARED_MIXED (1U << 31)
> > > > +#define KVM_LPAGE_COUNT_MAX ((1U << 31) - 1)
> > > > +
> > > > struct kvm_lpage_info {
> > > > int disallow_lpage;
> > > > };
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index e2c70b5afa3e..2190fd8c95c0 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > @@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
> > > > {
> > > > struct kvm_lpage_info *linfo;
> > > > int i;
> > > > + int disallow_count;
> > > >
> > > > for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
> > > > linfo = lpage_info_slot(gfn, slot, i);
> > > > +
> > > > + disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
> > > > + WARN_ON(disallow_count + count < 0 ||
> > > > + disallow_count > KVM_LPAGE_COUNT_MAX - count);
> > > > +
> > > > linfo->disallow_lpage += count;
> > > > - WARN_ON(linfo->disallow_lpage < 0);
> > > > }
> > > > }
> > > >
> > > > @@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> > > > if (kvm->arch.nx_huge_page_recovery_thread)
> > > > kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> > > > }
> > > > +
> > > > +static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
> > > > +{
> > > > + return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > > +}
> > > > +
> > > > +static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
> > > > + int level, bool mixed)
> > > > +{
> > > > + struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
> > > > +
> > > > + if (mixed)
> > > > + linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > > + else
> > > > + linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > > +}
> > > > +
> > > > +static bool is_expected_attr_entry(void *entry, unsigned long expected_attrs)
> > > > +{
> > > > + bool expect_private = expected_attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > > > +
> > > > + if (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) {
> > > > + if (!expect_private)
> > > > + return false;
> > > > + } else if (expect_private)
> > > > + return false;
> > > > +
> > > > + return true;
> > > > +}
> > > > +
> > > > +static bool mem_attrs_mixed_2m(struct kvm *kvm, unsigned long attrs,
> > > > + gfn_t start, gfn_t end)
> > > > +{
> > > > + XA_STATE(xas, &kvm->mem_attr_array, start);
> > > > + gfn_t gfn = start;
> > > > + void *entry;
> > > > + bool mixed = false;
> > > > +
> > > > + rcu_read_lock();
> > > > + entry = xas_load(&xas);
> > > > + while (gfn < end) {
> > > > + if (xas_retry(&xas, entry))
> > > > + continue;
> > > > +
> > > > + KVM_BUG_ON(gfn != xas.xa_index, kvm);
> > > > +
> > > > + if (!is_expected_attr_entry(entry, attrs)) {
> > > > + mixed = true;
> > > > + break;
> > > > + }
> > > > +
> > > > + entry = xas_next(&xas);
> > > > + gfn++;
> > > > + }
> > > > +
> > > > + rcu_read_unlock();
> > > > + return mixed;
> > > > +}
> > > > +
> > > > +static bool mem_attrs_mixed(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > + int level, unsigned long attrs,
> > > > + gfn_t start, gfn_t end)
> > > > +{
> > > > + unsigned long gfn;
> > > > +
> > > > + if (level == PG_LEVEL_2M)
> > > > + return mem_attrs_mixed_2m(kvm, attrs, start, end);
> > > > +
> > > > + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1))
> > > > + if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
> > > > + !is_expected_attr_entry(xa_load(&kvm->mem_attr_array, gfn),
> > > > + attrs))
> > > > + return true;
> > > > + return false;
> > > > +}
> > > > +
> > > > +static void kvm_update_lpage_private_shared_mixed(struct kvm *kvm,
> > > > + struct kvm_memory_slot *slot,
> > > > + unsigned long attrs,
> > > > + gfn_t start, gfn_t end)
> > > > +{
> > > > + unsigned long pages, mask;
> > > > + gfn_t gfn, gfn_end, first, last;
> > > > + int level;
> > > > + bool mixed;
> > > > +
> > > > + /*
> > > > + * The sequence matters here: we set the higher level basing on the
> > > > + * lower level's scanning result.
> > > > + */
> > > > + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
> > > > + pages = KVM_PAGES_PER_HPAGE(level);
> > > > + mask = ~(pages - 1);
> > > > + first = start & mask;
> > > > + last = (end - 1) & mask;
> > > > +
> > > > + /*
> > > > + * We only need to scan the head and tail page, for middle pages
> > > > + * we know they will not be mixed.
> > > > + */
> > > > + gfn = max(first, slot->base_gfn);
> > > > + gfn_end = min(first + pages, slot->base_gfn + slot->npages);
> > > > + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> > > > + linfo_set_mixed(gfn, slot, level, mixed);
> > > > +
> > > > + if (first == last)
> > > > + return;
> > >
> > >
> > > continue.
> >
> > Ya!
> >
> > >
> > > > +
> > > > + for (gfn = first + pages; gfn < last; gfn += pages)
> > > > + linfo_set_mixed(gfn, slot, level, false);
> > > > +
> > > > + gfn = last;
> > > > + gfn_end = min(last + pages, slot->base_gfn + slot->npages);
> > >
> > > if (gfn == gfn_end) continue.
> >
> > Do you see a case where gfn can equal gfn_end? Though it does not
> > hurt to add a check.
>
> If last == base_gfn + npages, gfn == gfn_end can occur.
'end' is guaranteed <= base_gfn + npages in kvm_unmap_mem_range():

  gfn_range.end = min(end, slot->base_gfn + slot->npages);

And 'last' is defined as:

  last = (end - 1) & mask;

Then the math is:

  last = (end - 1) & mask
      <= end - 1
      <= base_gfn + npages - 1
       < base_gfn + npages
Thanks,
Chao
>
>
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index 9a07380f8d3c..5aefcff614d2 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> > > > if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> > > > linfo[lpages - 1].disallow_lpage = 1;
> > > > ugfn = slot->userspace_addr >> PAGE_SHIFT;
> > > > + if (kvm_slot_can_be_private(slot))
> > > > + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
> > >
> > > Is there any alignment restriction? If not, it should be +=.
> > > In practice, alignment will hold though.
> >
> > All we need here is to check whether both userspace_addr and
> > restricted_offset are aligned to HPAGE_SIZE. '+=' can actually yield a
> > wrong value in cases where userspace_addr + restricted_offset is
> > aligned to HPAGE_SIZE but the two are not individually aligned.
>
> Ah, got it. The below comment explains it.
>
> > Thanks,
> > Chao
> > >
> > > Thanks,
> > >
> > > > /*
> > > > * If the gfn and userspace address are not aligned wrt each
> > > > * other, disable large page support for this slot.
> --
> Isaku Yamahata <[email protected]>
On Fri, Dec 02, 2022, Chao Peng wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a07380f8d3c..5aefcff614d2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> linfo[lpages - 1].disallow_lpage = 1;
> ugfn = slot->userspace_addr >> PAGE_SHIFT;
> + if (kvm_slot_can_be_private(slot))
> + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
> /*
> * If the gfn and userspace address are not aligned wrt each
> * other, disable large page support for this slot.
Forgot to talk about the bug. This code needs to handle the scenario where a
memslot is created with existing, non-uniform attributes. It might be a bit ugly
(I didn't even try to write the code), but it's definitely possible, and since
memslot updates are already slow I think it's best to handle things here.
In the meantime, I added this so we don't forget to fix it before merging.
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
pr_crit_once("FIXME: Walk the memory attributes of the slot and set the mixed status appropriately");
#endif
On Fri, Dec 02, 2022, Chao Peng wrote:
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 283cbb83d6ae..7772ab37ac89 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -38,6 +38,7 @@
> #include <asm/hyperv-tlfs.h>
>
> #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> +#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
No need for this, I think we should just make it mandatory to implement the
arch hook when CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES=y. If another arch gains
support for mem attributes and doesn't need the hook, then we can simply add a
weak helper (or maybe add a #define then if we feel that's the way to go).
> #define KVM_MAX_VCPUS 1024
>
> @@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
> #endif
> };
>
> +/*
> + * Use a bit in disallow_lpage to indicate private/shared pages mixed at the
> + * level. The remaining bits are used as a reference count.
> + */
> +#define KVM_LPAGE_PRIVATE_SHARED_MIXED (1U << 31)
Similar to the need to unmap, I think we should just say "mixed" and ignore the
private vs. shared, i.e. make this a flag for all memory attributes.
> +#define KVM_LPAGE_COUNT_MAX ((1U << 31) - 1)
"MAX" is technically correct, but it's more of a mask. I think we can make it a
moot point though. There's no need to mask the count, we just want to assert that
adjusting the count doesn't change the flag.
I would also say throw these defines into mmu.c, at least pending the bug fix
for kvm_alloc_memslot_metadata() (more on that below).
> struct kvm_lpage_info {
> int disallow_lpage;
> };
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e2c70b5afa3e..2190fd8c95c0 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
> {
> struct kvm_lpage_info *linfo;
> int i;
> + int disallow_count;
>
> for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
> linfo = lpage_info_slot(gfn, slot, i);
> +
> + disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
> + WARN_ON(disallow_count + count < 0 ||
> + disallow_count > KVM_LPAGE_COUNT_MAX - count);
> +
> linfo->disallow_lpage += count;
> - WARN_ON(linfo->disallow_lpage < 0);
It's been a long week so don't trust my math, but I believe this can simply be:
old = linfo->disallow_lpage;
linfo->disallow_lpage += count;
WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
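The count lives in bits 0..30, so an update can only change bit 31 by
over- or underflowing into it, and old ^ new isolates exactly the bits
that changed. A standalone sketch of the failure mode this catches
(using the hypothetical KVM_LPAGE_MIXED_FLAG name suggested above):

	unsigned int old = 0x7fffffff;	/* count saturated, flag clear */
	unsigned int new = old + 1;	/* increment overflows into bit 31 */

	/* (old ^ new) has bit 31 set, so the assertion fires. */
	WARN_ON_ONCE((old ^ new) & KVM_LPAGE_MIXED_FLAG);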
> }
> }
>
> @@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> if (kvm->arch.nx_huge_page_recovery_thread)
> kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> }
> +
> +static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
> +{
> + return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
> +}
> +
> +static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
> + int level, bool mixed)
> +{
> + struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
> +
> + if (mixed)
> + linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
> + else
> + linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
> +}
> +
> +static bool is_expected_attr_entry(void *entry, unsigned long expected_attrs)
> +{
> + bool expect_private = expected_attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +
> + if (xa_to_value(entry) & KVM_MEMORY_ATTRIBUTE_PRIVATE) {
> + if (!expect_private)
> + return false;
> + } else if (expect_private)
> + return false;
This is messy. If we drop the private vs. shared specificity, this can go away if
we add a helper to get attributes
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
}
and then we can do
if (KVM_BUG_ON(gfn != xas.xa_index, kvm) ||
attrs != kvm_get_memory_attributes(kvm, gfn)) {
mixed = true;
break;
}
and
if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
attrs != kvm_get_memory_attributes(kvm, gfn))
return true;
> +
> + return true;
> +}
> +
> +static bool mem_attrs_mixed_2m(struct kvm *kvm, unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + XA_STATE(xas, &kvm->mem_attr_array, start);
> + gfn_t gfn = start;
> + void *entry;
> + bool mixed = false;
> +
> + rcu_read_lock();
> + entry = xas_load(&xas);
> + while (gfn < end) {
> + if (xas_retry(&xas, entry))
> + continue;
> +
> + KVM_BUG_ON(gfn != xas.xa_index, kvm);
As above, I think it's worth bailing immediately if there's a mismatch.
> +
> + if (!is_expected_attr_entry(entry, attrs)) {
> + mixed = true;
> + break;
> + }
> +
> + entry = xas_next(&xas);
> + gfn++;
> + }
> +
> + rcu_read_unlock();
> + return mixed;
> +}
> +
> +static bool mem_attrs_mixed(struct kvm *kvm, struct kvm_memory_slot *slot,
s/mem_attrs_mixed/has_mixed_attrs to make it clear this is querying, not setting.
And has_mixed_attrs_2m() above.
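i.e., something like this (a sketch; these are also the names that show
up in the fix-up diff further down the thread):

	static bool has_mixed_attrs_2m(struct kvm *kvm, unsigned long attrs,
				       gfn_t start, gfn_t end);
	static bool has_mixed_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
				    int level, unsigned long attrs,
				    gfn_t start, gfn_t end);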
> + int level, unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + unsigned long gfn;
> +
> + if (level == PG_LEVEL_2M)
> + return mem_attrs_mixed_2m(kvm, attrs, start, end);
> +
> + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1))
Curly braces needed on the for-loop.
> + if (linfo_is_mixed(lpage_info_slot(gfn, slot, level - 1)) ||
> + !is_expected_attr_entry(xa_load(&kvm->mem_attr_array, gfn),
> + attrs))
> + return true;
> + return false;
> +}
> +
> +static void kvm_update_lpage_private_shared_mixed(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + unsigned long pages, mask;
> + gfn_t gfn, gfn_end, first, last;
> + int level;
> + bool mixed;
> +
> + /*
> + * The sequence matters here: we set the higher level basing on the
> + * lower level's scanning result.
> + */
> + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
> + pages = KVM_PAGES_PER_HPAGE(level);
> + mask = ~(pages - 1);
> + first = start & mask;
> + last = (end - 1) & mask;
> +
> + /*
> + * We only need to scan the head and tail page, for middle pages
> + * we know they will not be mixed.
> + */
> + gfn = max(first, slot->base_gfn);
> + gfn_end = min(first + pages, slot->base_gfn + slot->npages);
> + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> + linfo_set_mixed(gfn, slot, level, mixed);
> +
> + if (first == last)
> + return;
> +
> + for (gfn = first + pages; gfn < last; gfn += pages)
> + linfo_set_mixed(gfn, slot, level, false);
> +
> + gfn = last;
> + gfn_end = min(last + pages, slot->base_gfn + slot->npages);
> + mixed = mem_attrs_mixed(kvm, slot, level, attrs, gfn, gfn_end);
> + linfo_set_mixed(gfn, slot, level, mixed);
> + }
> +}
> +
> +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> + if (kvm_slot_can_be_private(slot))
Make this an early return optimization, with a comment explaining that KVM x86
doesn't yet support other attributes.
/*
* KVM x86 currently only supports KVM_MEMORY_ATTRIBUTE_PRIVATE, skip
* the slot if the slot will never consume the PRIVATE attribute.
*/
if (!kvm_slot_can_be_private(slot))
return;
> + kvm_update_lpage_private_shared_mixed(kvm, slot, attrs,
> + start, end);
> +}
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9a07380f8d3c..5aefcff614d2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> linfo[lpages - 1].disallow_lpage = 1;
> ugfn = slot->userspace_addr >> PAGE_SHIFT;
> + if (kvm_slot_can_be_private(slot))
> + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
I would rather reject memslot if the gfn has lesser alignment than the offset.
I'm totally ok with this approach _if_ there's a use case. Until such a use case
presents itself, I would rather be conservative from a uAPI perspective.
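One possible shape for the stricter check, purely as a sketch (the
placement and exact condition are guesses, not settled uAPI): fail the
memslot at creation time when the userspace address and the restricted
offset disagree in their sub-hugepage bits, mirroring the existing ugfn
check but rejecting instead of silently disabling large pages:

	/* Hypothetical validation, somewhere in memslot setup: */
	if (kvm_slot_can_be_private(slot) &&
	    ((slot->userspace_addr ^ slot->restricted_offset) &
	     (KVM_HPAGE_SIZE(KVM_MAX_HUGEPAGE_LEVEL) - 1)))
		return -EINVAL;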
> /*
> * If the gfn and userspace address are not aligned wrt each
> * other, disable large page support for this slot.
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 3331c0c92838..25099c94e770 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -592,6 +592,11 @@ struct kvm_memory_slot {
> struct restrictedmem_notifier notifier;
> };
>
> +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> +{
> + return slot && (slot->flags & KVM_MEM_PRIVATE);
KVM_MEM_PRIVATE should really be defined only when private memory is exposed to
userspace. For this patch, even though it means we have untestable code, I think
it makes sense to "return false".
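i.e., keep the helper but have it compile to "return false" until the
flag is actually exposed to userspace, something like (sketch only):

	static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
	{
	#ifdef KVM_MEM_PRIVATE
		return slot && (slot->flags & KVM_MEM_PRIVATE);
	#else
		return false;
	#endif
	}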
> +}
> +
> static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> {
> return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> @@ -2316,4 +2321,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
> /* Max number of entries allowed for each kvm dirty ring */
> #define KVM_DIRTY_RING_MAX_ENTRIES 65536
>
> +#ifdef __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end);
> +#else
> +static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end)
> +{
> +}
> +#endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */
As above, no stub is necessary.
> #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 4e1e1e113bf0..e107afea32f0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2354,7 +2354,8 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> return 0;
> }
>
> -static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
Feedback for an earlier patch (to avoid churn): this should be kvm_mem_attrs_changed()
or so now that this does more than just unmap.
> +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end,
> + unsigned long attrs)
Weird nit. I think we should keep the prototypes for kvm_mem_attrs_changed()
and kvm_arch_set_memory_attributes() somewhat similar, i.e. squeeze in @attrs
before @start.
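i.e. (a sketch, folding in the rename suggested above):

	static void kvm_mem_attrs_changed(struct kvm *kvm, unsigned long attrs,
					  gfn_t start, gfn_t end);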
> {
> struct kvm_gfn_range gfn_range;
> struct kvm_memory_slot *slot;
> @@ -2378,6 +2379,10 @@ static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> gfn_range.slot = slot;
>
> r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> +
> + kvm_arch_set_memory_attributes(kvm, slot, attrs,
> + gfn_range.start,
> + gfn_range.end);
> }
> }
>
> @@ -2427,7 +2432,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> idx = srcu_read_lock(&kvm->srcu);
> KVM_MMU_LOCK(kvm);
> if (i > start)
> - kvm_unmap_mem_range(kvm, start, i);
> + kvm_unmap_mem_range(kvm, start, i, attrs->attributes);
> kvm_mmu_invalidate_end(kvm);
> KVM_MMU_UNLOCK(kvm);
> srcu_read_unlock(&kvm->srcu, idx);
> --
> 2.25.1
>
On Fri, Jan 13, 2023 at 11:16:27PM +0000, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 9a07380f8d3c..5aefcff614d2 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> > if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
> > linfo[lpages - 1].disallow_lpage = 1;
> > ugfn = slot->userspace_addr >> PAGE_SHIFT;
> > + if (kvm_slot_can_be_private(slot))
> > + ugfn |= slot->restricted_offset >> PAGE_SHIFT;
> > /*
> > * If the gfn and userspace address are not aligned wrt each
> > * other, disable large page support for this slot.
>
> Forgot to talk about the bug. This code needs to handle the scenario where a
> memslot is created with existing, non-uniform attributes. It might be a bit ugly
> (I didn't even try to write the code), but it's definitely possible, and since
> memslot updates are already slow I think it's best to handle things here.
>
> In the meantime, I added this so we don't forget to fix it before merging.
>
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> pr_crit_once("FIXME: Walk the memory attributes of the slot and set the mixed status appropriately");
> #endif
Here is the code to fix it (based on your latest github repo).
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e552374f2357..609ff1cba9c5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2195,4 +2195,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+void kvm_memory_attributes_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+#endif
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index eda615f3951c..8833d7201e41 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7201,10 +7201,11 @@ static bool has_mixed_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
return false;
}
-void kvm_arch_set_memory_attributes(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long attrs,
- gfn_t start, gfn_t end)
+static void kvm_update_lpage_mixed_flag(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ bool set_attrs,
+ unsigned long attrs,
+ gfn_t start, gfn_t end)
{
unsigned long pages, mask;
gfn_t gfn, gfn_end, first, last;
@@ -7231,25 +7232,53 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
first = start & mask;
last = (end - 1) & mask;
- /*
- * We only need to scan the head and tail page, for middle pages
- * we know they will not be mixed.
- */
+ /* head page */
gfn = max(first, slot->base_gfn);
gfn_end = min(first + pages, slot->base_gfn + slot->npages);
+ if (!set_attrs)
+ attrs = kvm_get_memory_attributes(kvm, gfn);
mixed = has_mixed_attrs(kvm, slot, level, attrs, gfn, gfn_end);
linfo_update_mixed(gfn, slot, level, mixed);
if (first == last)
return;
- for (gfn = first + pages; gfn < last; gfn += pages)
- linfo_update_mixed(gfn, slot, level, false);
+ /* middle pages */
+ for (gfn = first + pages; gfn < last; gfn += pages) {
+ if (set_attrs) {
+ mixed = false;
+ } else {
+ gfn_end = gfn + pages;
+ attrs = kvm_get_memory_attributes(kvm, gfn);
+ mixed = has_mixed_attrs(kvm, slot, level, attrs,
+ gfn, gfn_end);
+ }
+ linfo_update_mixed(gfn, slot, level, mixed);
+ }
+ /* tail page */
gfn = last;
gfn_end = min(last + pages, slot->base_gfn + slot->npages);
+ if (!set_attrs)
+ attrs = kvm_get_memory_attributes(kvm, gfn);
mixed = has_mixed_attrs(kvm, slot, level, attrs, gfn, gfn_end);
linfo_update_mixed(gfn, slot, level, mixed);
}
}
+
+void kvm_arch_set_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long attrs,
+ gfn_t start, gfn_t end)
+{
+ kvm_update_lpage_mixed_flag(kvm, slot, true, attrs, start, end);
+}
+
+void kvm_memory_attributes_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_update_lpage_mixed_flag(kvm, slot, false, 0, slot->base_gfn,
+ slot->base_gfn + slot->npages);
+}
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 268c3d16894d..c1074aecf2d0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12443,7 +12443,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
- pr_crit_once("FIXME: Walk the memory attributes of the slot and set the mixed status appropriately");
+ kvm_memory_attributes_create_memslot(kvm, slot);
#endif
if (kvm_page_track_create_memslot(kvm, slot, npages))