2009-03-31 00:02:33

by Izik Eidus

Subject: [PATCH 0/3] kvm support for ksm

Apply it against Avi's git tree.

Izik Eidus (3):
kvm: dont hold pagecount reference for mapped sptes pages.
kvm: add SPTE_HOST_WRITEABLE flag to the shadow ptes.
kvm: add support for change_pte mmu notifiers

arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 89 ++++++++++++++++++++++++++++++++-------
arch/x86/kvm/paging_tmpl.h | 16 ++++++-
virt/kvm/kvm_main.c | 14 ++++++
4 files changed, 101 insertions(+), 19 deletions(-)


2009-03-31 00:02:52

by Izik Eidus

Subject: [PATCH 1/3] kvm: dont hold pagecount reference for mapped sptes pages.

When using mmu notifiers, we are allowed to drop the page count
reference taken by get_user_pages on a page that is mapped inside
the shadow page tables.

This is needed so that the pagecount can be balanced against the
mapcount check.

(Right now kvm increases the pagecount but not the mapcount when it
maps a page into a shadow page table entry, so comparing pagecount
against mapcount gives no reliable result.)
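
For illustration only (not part of this patch): a minimal sketch of the
kind of pagecount-vs-mapcount comparison a KSM-style scanner relies on.
The helper name is hypothetical and the real check also has to account
for swap references; the point is that an extra get_user_pages()
reference held by kvm for every mapped spte inflates page_count() and
makes a comparison like this fail.

	/*
	 * Sketch: a page looks unpinned if every reference is explained by
	 * a mapping, plus the single reference the caller itself holds.
	 */
	static bool page_looks_unpinned(struct page *page)
	{
		return page_mapcount(page) + 1 == page_count(page);
	}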

Signed-off-by: Izik Eidus <[email protected]>
---
arch/x86/kvm/mmu.c | 7 ++-----
1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b625ed4..df8fbaf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -567,9 +567,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
if (*spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (is_writeble_pte(*spte))
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
+ kvm_set_pfn_dirty(pfn);
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -1812,8 +1810,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
rmap_add(vcpu, shadow_pte, gfn, largepage);
- if (!is_rmap_pte(*shadow_pte))
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(pfn);
} else {
if (was_writeble)
kvm_release_pfn_dirty(pfn);
--
1.5.6.5

2009-03-31 00:03:21

by Izik Eidus

Subject: [PATCH 2/3] kvm: add SPTE_HOST_WRITEABLE flag to the shadow ptes.

This flag records that the host physical page the spte points to was
mapped writable. When the flag is clear, the host page is write
protected, and we can't make the spte writable unless we run
get_user_pages(write = 1).

(This is needed for change_pte support in kvm.)
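
For illustration only: a minimal sketch of how the new bit is meant to
be consulted when a spte is rebuilt (the sync_page() hunk below does
exactly this); the snippet is a paraphrase, not an additional hunk.

	/*
	 * Sketch: never grant the guest write access through a spte whose
	 * backing host pte was not writable when we mapped it.
	 */
	if (!(sp->spt[i] & SPTE_HOST_WRITEABLE))
		pte_access &= ~PT_WRITABLE_MASK;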

Signed-off-by: Izik Eidus <[email protected]>
---
arch/x86/kvm/mmu.c | 14 ++++++++++----
arch/x86/kvm/paging_tmpl.h | 16 +++++++++++++---
2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index df8fbaf..6b4d795 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -138,6 +138,8 @@ module_param(oos_shadow, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)

+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

struct kvm_rmap_desc {
@@ -1676,7 +1678,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
int global, gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync)
+ bool can_unsync, bool reset_host_protection)
{
u64 spte;
int ret = 0;
@@ -1719,6 +1721,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
kvm_x86_ops->get_mt_mask_shift();
spte |= mt_mask;
}
+ if (reset_host_protection)
+ spte |= SPTE_HOST_WRITEABLE;

spte |= (u64)pfn << PAGE_SHIFT;

@@ -1764,7 +1768,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
int *ptwrite, int largepage, int global,
- gfn_t gfn, pfn_t pfn, bool speculative)
+ gfn_t gfn, pfn_t pfn, bool speculative,
+ bool reset_host_protection)
{
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1793,7 +1798,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
was_rmapped = 1;
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, global, gfn, pfn, speculative, true)) {
+ dirty, largepage, global, gfn, pfn, speculative, true,
+ reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1840,7 +1846,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
|| (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
- largepage, 0, gfn, pfn, false);
+ largepage, 0, gfn, pfn, false, true);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index eae9499..9fdacd0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -259,10 +259,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
return;
kvm_get_pfn(pfn);
+ /*
+ * we call mmu_set_spte() with reset_host_protection = true because
+ * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+ */
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage,
gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
- pfn, true);
+ pfn, true, true);
}

/*
@@ -297,7 +301,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
ptwrite, largepage,
gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
- gw->gfn, pfn, false);
+ gw->gfn, pfn, false, true);
break;
}

@@ -547,6 +551,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
+ bool reset_host_protection = 1;

offset = nr_present = 0;

@@ -584,9 +589,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)

nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+ pte_access &= ~PT_WRITABLE_MASK;
+ reset_host_protection = 0;
+ }
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
- spte_to_pfn(sp->spt[i]), true, false);
+ spte_to_pfn(sp->spt[i]), true, false,
+ reset_host_protection);
}

return !nr_present;
--
1.5.6.5

2009-03-31 00:03:41

by Izik Eidus

Subject: [PATCH 3/3] kvm: add support for change_pte mmu notifiers

This is needed by kvm if it wants ksm to map pages directly into its
shadow page tables.
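
For illustration only (mm side, outside this patch): a sketch of how a
caller such as ksm is expected to fire this callback when it replaces a
pte in place, assuming a set_pte_at_notify() helper that pairs
set_pte_at() with the new ->change_pte() notification.

	/*
	 * Sketch: install the new pte (e.g. one pointing at the shared ksm
	 * page) and notify secondary MMUs such as kvm, which can then update
	 * the spte in place instead of just dropping it.
	 */
	set_pte_at_notify(mm, addr, ptep, newpte);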

Signed-off-by: Izik Eidus <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 68 +++++++++++++++++++++++++++++++++++----
virt/kvm/kvm_main.c | 14 ++++++++
3 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8351c4d..9062729 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -791,5 +791,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);

#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6b4d795..f8816dd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -257,6 +257,11 @@ static pfn_t spte_to_pfn(u64 pte)
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}

+static pte_t ptep_val(pte_t *ptep)
+{
+ return *ptep;
+}
+
static gfn_t pse36_gfn_delta(u32 gpte)
{
int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -678,7 +683,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
}

-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int need_tlb_flush = 0;
@@ -693,8 +699,48 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
return need_tlb_flush;
}

+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ int need_flush = 0;
+ u64 *spte, new_spte;
+ pte_t *ptep = (pte_t *)data;
+ pfn_t new_pfn;
+
+ new_pfn = pte_pfn(ptep_val(ptep));
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!is_shadow_present_pte(*spte));
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+ need_flush = 1;
+ if (pte_write(ptep_val(ptep))) {
+ rmap_remove(kvm, spte);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ spte = rmap_next(kvm, rmapp, NULL);
+ } else {
+ new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+ new_spte |= new_pfn << PAGE_SHIFT;
+
+ if (!pte_write(ptep_val(ptep))) {
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~SPTE_HOST_WRITEABLE;
+ if (is_writeble_pte(*spte))
+ kvm_set_pfn_dirty(spte_to_pfn(*spte));
+ }
+ set_shadow_pte(spte, new_spte);
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ }
+ if (need_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data))
{
int i;
int retval = 0;
@@ -715,11 +761,13 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+ retval |= handler(kvm, &memslot->rmap[gfn_offset],
+ data);
retval |= handler(kvm,
&memslot->lpage_info[
gfn_offset /
- KVM_PAGES_PER_HPAGE].rmap_pde);
+ KVM_PAGES_PER_HPAGE].rmap_pde,
+ data);
}
}

@@ -728,10 +776,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int young = 0;
@@ -757,7 +811,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}

#ifdef MMU_DEBUG
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8aa3b95..3d340e9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -846,6 +846,19 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,

}

+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+ spin_lock(&kvm->mmu_lock);
+ kvm->mmu_notifier_seq++;
+ kvm_set_spte_hva(kvm, address, pte);
+ spin_unlock(&kvm->mmu_lock);
+}
+
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
@@ -925,6 +938,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+ .change_pte = kvm_mmu_notifier_change_pte,
.release = kvm_mmu_notifier_release,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
--
1.5.6.5

2022-10-18 03:58:10

by ewandevelop

Subject: Re: [PATCH 0/3] kvm support for ksm

Hi, I'm learning the kvm-mmu code. While reading this patch, I couldn't
understand why we need special handling for a writable pte.

> +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
> +                 unsigned long data)
> +{
> +    int need_flush = 0;
> +    u64 *spte, new_spte;
> +    pte_t *ptep = (pte_t *)data;
> +    pfn_t new_pfn;
> +
> +    new_pfn = pte_pfn(ptep_val(ptep));
> +    spte = rmap_next(kvm, rmapp, NULL);
> +    while (spte) {
> +        BUG_ON(!is_shadow_present_pte(*spte));
> +        rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
> +        need_flush = 1;
> +        if (pte_write(ptep_val(ptep))) {
> +            rmap_remove(kvm, spte);
> +            set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +            spte = rmap_next(kvm, rmapp, NULL);
> +        } else {
> +            new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
> +            new_spte |= new_pfn << PAGE_SHIFT;
> +
> +            if (!pte_write(ptep_val(ptep))) {
> +                new_spte &= ~PT_WRITABLE_MASK;
> +                new_spte &= ~SPTE_HOST_WRITEABLE;
> +                if (is_writeble_pte(*spte))
> +                    kvm_set_pfn_dirty(spte_to_pfn(*spte));
> +            }
> +            set_shadow_pte(spte, new_spte);
> +            spte = rmap_next(kvm, rmapp, spte);
> +        }
> +    }
> +    if (need_flush)
> +        kvm_flush_remote_tlbs(kvm);
> +
> +    return 0;
> +}
> +

In my opinion, we could just treat a writable pte the same as a
readable/executable one: all the corresponding sptes would be set
write-protected, and when the guest accesses them an EPT violation
occurs and we handle the #PF in kvm.

Does anyone have a hint?

On 2009/3/31 08:00, Izik Eidus wrote:
> apply it against Avi git tree.
>
> Izik Eidus (3):
> kvm: dont hold pagecount reference for mapped sptes pages.
> kvm: add SPTE_HOST_WRITEABLE flag to the shadow ptes.
> kvm: add support for change_pte mmu notifiers
>
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/mmu.c | 89 ++++++++++++++++++++++++++++++++-------
> arch/x86/kvm/paging_tmpl.h | 16 ++++++-
> virt/kvm/kvm_main.c | 14 ++++++
> 4 files changed, 101 insertions(+), 19 deletions(-)
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: [email protected]
>
>
