This patch series adds support for dirty logging based on HW DBM.
It works well in several migration test cases, including VMs backed by
4K pages or 2M THP. I checked the SHA256 digest of all guest memory on
the source and destination VMs and they match, which means no dirty
pages are missed under hardware DBM.
Some key points:
1. Hardware updates of dirty status are supported only for PTEs; PMDs
and PUDs are not involved for now. (A sketch of the resulting dirty
test follows.)
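As an illustration of what "hardware updates of dirty status" means
here (a minimal sketch, not code from this series; the helper name is
made up): with DBM set, hardware records a write by restoring the
stage-2 write permission on the PTE, so a PTE counts as hw-dirty when
it has DBM and is currently writable.

/*
 * Sketch only: under DBM, hardware marks a write by making the PTE
 * writable again, so "hw dirty" means DBM set + currently writable.
 */
static inline bool hypothetical_s2pte_hw_dirty(pte_t pte)
{
	return (pte_val(pte) & PTE_DBM) &&
	       ((pte_val(pte) & PTE_S2_RDWR) == PTE_S2_RDWR);
}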
2. About *performance*: In the RFC patch, I mentioned that for every
64GB of memory, KVM takes about 40ms to scan all PTEs to collect the
dirty log. This series solves that problem in two ways: a HW/SW
dynamic switch and multi-core offload.
HW/SW dynamic switch: Give userspace the ability to enable/disable
the hw dirty log. This adds a new KVM cap named
KVM_CAP_ARM_HW_DIRTY_LOG. We achieve this by changing the
kvm->arch.vtcr value and kicking vCPUs out so they reload it into
VTCR_EL2. Userspace can then enable the hw dirty log at the beginning
of migration and disable it when few dirty pages remain and the VM is
about to be stopped, so VM downtime is not affected. A sketch of the
switch follows.
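A minimal sketch of the switch (illustration only, not the series'
actual code; the function name is made up, and VTCR_EL2_HD is just
the architectural VTCR_EL2.HD enable bit, bit 22, defined locally in
case the header does not already provide it):

/* VTCR_EL2.HD: enable hardware management of the dirty state */
#define VTCR_EL2_HD	(UL(1) << 22)

static void hypothetical_set_hw_dirty_log(struct kvm *kvm, bool enable)
{
	struct kvm_vcpu *vcpu;
	int i;

	if (enable)
		kvm->arch.vtcr |= VTCR_EL2_HD;
	else
		kvm->arch.vtcr &= ~VTCR_EL2_HD;

	/*
	 * vCPUs load VTCR_EL2 from kvm->arch.vtcr on guest entry, so
	 * kicking them out of the guest makes the change take effect.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);
}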
Multi-core offload: Offloading the PT scanning workload to multiple
cores greatly reduces the scanning time. To guarantee timely
completion, I use smp_call_function to implement this policy, which
uses IPIs to dispatch the workload to other CPUs. On a 128-CPU
Kunpeng 920 platform, it takes only about 5ms to scan the PTs of
256GB of RAM (using mempress, with almost all PTs established). We
dispatch the workload iteratively (each CPU scans the PTs of only
512M of RAM per iteration), so the physical CPUs are not seriously
disturbed. A sketch of the dispatch follows.
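A minimal sketch of the dispatch (illustration only; all names are
made up, and hypothetical_scan_ptes stands in for the actual PTE
walker):

/* Placeholder for the real PTE walker over [gfn, gfn + nr). */
static void hypothetical_scan_ptes(struct kvm *kvm, gfn_t gfn, gfn_t nr);

struct scan_work {
	struct kvm *kvm;
	gfn_t base_gfn;		/* first gfn of this round's window */
	atomic_t next;		/* next 512M slice to claim */
};

static void hypothetical_scan_slice(void *info)
{
	struct scan_work *work = info;
	gfn_t nr = SZ_512M >> PAGE_SHIFT;
	int idx = atomic_fetch_inc(&work->next);

	/* Each CPU scans the PTEs covering one 512M slice. */
	hypothetical_scan_ptes(work->kvm, work->base_gfn + idx * nr, nr);
}

static void hypothetical_scan_round(struct kvm *kvm, gfn_t base_gfn)
{
	struct scan_work work = {
		.kvm = kvm,
		.base_gfn = base_gfn,
		.next = ATOMIC_INIT(0),
	};

	/*
	 * smp_call_function() IPIs every other online CPU and, with
	 * wait == 1, returns once all callbacks complete; the calling
	 * CPU handles its own slice directly.
	 */
	smp_call_function(hypothetical_scan_slice, &work, 1);
	hypothetical_scan_slice(&work);
}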
3. About correctness: The DBM bit is added only when a PTE is already
writable, so read-only PTEs still exist and mechanisms that rely on
read-only PTs are not broken. A sketch follows.
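A minimal sketch (illustration only; the helper name is made up,
while PTE_S2_RDWR and PTE_DBM are the existing arm64 definitions):

/*
 * Sketch only: grant DBM just for PTEs that are already writable;
 * genuinely read-only PTEs keep faulting on writes, so mechanisms
 * built on write protection are unaffected.
 */
static inline pte_t hypothetical_s2pte_mkdbm(pte_t pte)
{
	if ((pte_val(pte) & PTE_S2_RDWR) == PTE_S2_RDWR)
		pte_val(pte) |= PTE_DBM;
	return pte;
}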
4. About PT modification races: There are two kinds of PT
modification. The first is adding or clearing a specific bit, such as
AF or RW. All these operations have been converted to be atomic, to
avoid covering the dirty status set by hardware.
The second is replacement, such as PTE unmapping or modification. All
these operations eventually invoke kvm_set_pte, which has been
converted to be atomic; if the dirty status would be covered, we save
it to the underlying bitmap. A sketch of such an atomic replacement
follows.
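A minimal sketch of such an atomic replacement (illustration only;
the function name is made up, and it reuses the hw-dirty test
sketched under point 1):

static void hypothetical_set_pte(struct kvm *kvm, pte_t *ptep,
				 pte_t new_pte, gfn_t gfn)
{
	/* Atomically swap in the new PTE; no hardware update is lost. */
	pte_t old_pte = __pte(xchg(&pte_val(*ptep), pte_val(new_pte)));

	/* If hardware dirtied the old PTE, save that to the bitmap. */
	if (hypothetical_s2pte_hw_dirty(old_pte))
		mark_page_dirty(kvm, gfn);
}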
Change log:
v2:
- Address Steven's comments.
- Add support of parallel dirty log sync.
- Simplify and merge patches of v1.
v1:
- Address Catalin's comments.
Keqian Zhu (8):
KVM: arm64: Set DBM bit for writable PTEs
KVM: arm64: Scan PTEs to sync dirty log
KVM: arm64: Modify stage2 young mechanism to support hw DBM
KVM: arm64: Save stage2 PTE dirty status if it is covered
KVM: arm64: Steply write protect page table by mask bit
KVM: arm64: Add KVM_CAP_ARM_HW_DIRTY_LOG capability
KVM: arm64: Sync dirty log parallel
KVM: Omit dirty log sync in log clear if initially all set
arch/arm64/include/asm/kvm_host.h | 5 +
arch/arm64/include/asm/kvm_mmu.h | 43 ++++-
arch/arm64/kvm/arm.c | 45 ++++-
arch/arm64/kvm/mmu.c | 307 ++++++++++++++++++++++++++++--
arch/arm64/kvm/reset.c | 5 +
include/uapi/linux/kvm.h | 1 +
tools/include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 3 +-
8 files changed, 389 insertions(+), 21 deletions(-)
--
2.19.1
KVM: arm64: Modify stage2 young mechanism to support hw DBM

Marking PTs young (setting the AF bit) should be atomic, to avoid
covering the dirty status set by hardware.
Signed-off-by: Keqian Zhu <[email protected]>
Signed-off-by: Peng Liang <[email protected]>
---
arch/arm64/include/asm/kvm_mmu.h | 31 ++++++++++++++++++++++---------
arch/arm64/kvm/mmu.c | 15 ++++++++-------
2 files changed, 30 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 4c12b7ad8ae8..a1b6131d980c 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -219,6 +219,18 @@ static inline void kvm_set_s2pte_readonly(pte_t *ptep)
} while (pteval != old_pteval);
}
+static inline void kvm_set_s2pte_young(pte_t *ptep)
+{
+ pteval_t old_pteval, pteval;
+
+ pteval = READ_ONCE(pte_val(*ptep));
+ do {
+ old_pteval = pteval;
+ pteval |= PTE_AF;
+ pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
+ } while (pteval != old_pteval);
+}
+
static inline bool kvm_s2pte_readonly(pte_t *ptep)
{
return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
@@ -234,6 +246,11 @@ static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
kvm_set_s2pte_readonly((pte_t *)pmdp);
}
+static inline void kvm_set_s2pmd_young(pmd_t *pmdp)
+{
+ kvm_set_s2pte_young((pte_t *)pmdp);
+}
+
static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
{
return kvm_s2pte_readonly((pte_t *)pmdp);
@@ -249,6 +266,11 @@ static inline void kvm_set_s2pud_readonly(pud_t *pudp)
kvm_set_s2pte_readonly((pte_t *)pudp);
}
+static inline void kvm_set_s2pud_young(pud_t *pudp)
+{
+ kvm_set_s2pte_young((pte_t *)pudp);
+}
+
static inline bool kvm_s2pud_readonly(pud_t *pudp)
{
return kvm_s2pte_readonly((pte_t *)pudp);
@@ -259,15 +281,6 @@ static inline bool kvm_s2pud_exec(pud_t *pudp)
return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
}
-static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
-{
- return pud_mkyoung(pud);
-}
-
-static inline bool kvm_s2pud_young(pud_t pud)
-{
- return pud_young(pud);
-}
static inline bool arm_mmu_hw_dbm_supported(void)
{
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index b3cb8b6da4c2..ab8a6ceecbd8 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2008,8 +2008,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Resolve the access fault by making the page young again.
* Note that because the faulting entry is guaranteed not to be
* cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
+ *
+ * Note: Both DBM and HW AF updates are supported for Stage2, so
+ * young operations should be atomic.
*/
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
@@ -2027,15 +2028,15 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
goto out;
if (pud) { /* HugeTLB */
- *pud = kvm_s2pud_mkyoung(*pud);
+ kvm_set_s2pud_young(pud);
pfn = kvm_pud_pfn(*pud);
pfn_valid = true;
} else if (pmd) { /* THP, HugeTLB */
- *pmd = pmd_mkyoung(*pmd);
+ kvm_set_s2pmd_young(pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
- } else {
- *pte = pte_mkyoung(*pte); /* Just a page... */
+ } else { /* Just a page... */
+ kvm_set_s2pte_young(pte);
pfn = pte_pfn(*pte);
pfn_valid = true;
}
@@ -2280,7 +2281,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *
return 0;
if (pud)
- return kvm_s2pud_young(*pud);
+ return pud_young(*pud);
else if (pmd)
return pmd_young(*pmd);
else
--
2.19.1