Hi,
This patchset adds support for PUD hugepages at stage 2. This feature
is useful on cores that have support for large sized TLB mappings
(e.g., 1GB for 4K granule).
The patchset is based on v4.15-rc7 and depends on a fix sent out
earlier[0]. The patchset will conflict with Suzuki's work to
dynamically set IPA size. I'll work with Suzuki to resolve this
depending on the order the features are merged.
The patches have been functionally tested on an A57 based system. To
quantify the benefit, the patches need to be evaluated on cores
supporting larger sized TLB mappings.
I'm sending the patchset as RFC to get feedback on the code as well as
allow evaluation on real systems.
Thanks,
Punit
[0] https://patchwork.kernel.org/patch/10145339/
Punit Agrawal (4):
arm64: Correct type for PUD macros
KVM: arm64: Support dirty page tracking for PUD hugepages
KVM: arm/arm64: Refactor Stage2 PMD hugepages support
KVM: arm64: Add support for PUD hugepages at stage 2
arch/arm/include/asm/kvm_mmu.h | 19 +++++++
arch/arm/include/asm/pgtable-3level.h | 2 +
arch/arm64/include/asm/kvm_mmu.h | 29 +++++++++++
arch/arm64/include/asm/pgtable-hwdef.h | 8 +--
arch/arm64/include/asm/pgtable.h | 4 ++
virt/kvm/arm/mmu.c | 91 +++++++++++++++++++++++++++++-----
6 files changed, 137 insertions(+), 16 deletions(-)
--
2.15.1
The PUD macros (PUD_TABLE_BIT, PUD_TYPE_MASK, PUD_TYPE_SECT) use the
pgdval_t even when pudval_t is available. Even though the underlying
type for both (u64) is the same it is confusing and may lead to issues
in the future.
Fix this by using pudval_t to define the PUD_* macros.
Fixes: 084bd29810a56 ("ARM64: mm: HugeTLB support.")
Fixes: 206a2a73a62d3 ("arm64: mm: Create gigabyte kernel logical mappings where possible")
Signed-off-by: Punit Agrawal <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Will Deacon <[email protected]>
---
arch/arm64/include/asm/pgtable-hwdef.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index eb0c2bd90de9..40a998cdd399 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -116,9 +116,9 @@
* Level 1 descriptor (PUD).
*/
#define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0)
-#define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1)
-#define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0)
-#define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0)
+#define PUD_TABLE_BIT (_AT(pudval_t, 1) << 1)
+#define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0)
+#define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0)
/*
* Level 2 descriptor (PMD).
--
2.15.1
In preparation for creating PUD hugepages at stage 2, add support for
write protecting PUD hugepages when they are encountered. Write
protecting guest tables is used to track dirty pages when migrating VMs.
Also, provide trivial implementations of required kvm_s2pud_* helpers to
allow code to compile on arm32.
Signed-off-by: Punit Agrawal <[email protected]>
Cc: Christoffer Dall <[email protected]>
Cc: Marc Zyngier <[email protected]>
---
arch/arm/include/asm/kvm_mmu.h | 9 +++++++++
arch/arm64/include/asm/kvm_mmu.h | 10 ++++++++++
virt/kvm/arm/mmu.c | 9 ++++++---
3 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index fa6f2174276b..3fbe919b9181 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -103,6 +103,15 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
}
+static inline void kvm_set_s2pud_readonly(pud_t *pud)
+{
+}
+
+static inline bool kvm_s2pud_readonly(pud_t *pud)
+{
+ return true;
+}
+
static inline bool kvm_page_empty(void *ptr)
{
struct page *ptr_page = virt_to_page(ptr);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 672c8684d5c2..dbfd18e08cfb 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -201,6 +201,16 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
return kvm_s2pte_readonly((pte_t *)pmd);
}
+static inline void kvm_set_s2pud_readonly(pud_t *pud)
+{
+ kvm_set_s2pte_readonly((pte_t *)pud);
+}
+
+static inline bool kvm_s2pud_readonly(pud_t *pud)
+{
+ return kvm_s2pte_readonly((pte_t *)pud);
+}
+
static inline bool kvm_page_empty(void *ptr)
{
struct page *ptr_page = virt_to_page(ptr);
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 9dea96380339..02eefda5d71e 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1155,9 +1155,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
do {
next = stage2_pud_addr_end(addr, end);
if (!stage2_pud_none(*pud)) {
- /* TODO:PUD not supported, revisit later if supported */
- BUG_ON(stage2_pud_huge(*pud));
- stage2_wp_pmds(pud, addr, next);
+ if (stage2_pud_huge(*pud)) {
+ if (!kvm_s2pud_readonly(pud))
+ kvm_set_s2pud_readonly(pud);
+ } else {
+ stage2_wp_pmds(pud, addr, next);
+ }
}
} while (pud++, addr = next, addr != end);
}
--
2.15.1
Refactor the stage2 PMD hugepages support to split out constructing the
PMD into a separate function. A similar pattern of code will be followed
when introducing PUD hugepages at stage 2 where we need to split support
between architecture specific and common code.
There is no functional change with this patch.
Signed-off-by: Punit Agrawal <[email protected]>
Cc: Christoffer Dall <[email protected]>
Cc: Marc Zyngier <[email protected]>
---
virt/kvm/arm/mmu.c | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 02eefda5d71e..f02219a91b19 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1282,6 +1282,17 @@ static void kvm_send_hwpoison_signal(unsigned long address,
send_sig_info(SIGBUS, &info, current);
}
+static pmd_t stage2_build_pmd(kvm_pfn_t pfn, pgprot_t mem_type, bool writable)
+{
+ pmd_t pmd = pfn_pmd(pfn, mem_type);
+
+ pmd = pmd_mkhuge(pmd);
+ if (writable)
+ pmd = kvm_s2pmd_mkwrite(pmd);
+
+ return pmd;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
@@ -1386,12 +1397,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
if (hugetlb) {
- pmd_t new_pmd = pfn_pmd(pfn, mem_type);
- new_pmd = pmd_mkhuge(new_pmd);
- if (writable) {
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
+ pmd_t new_pmd = stage2_build_pmd(pfn, mem_type, writable);
+
+ if (writable)
kvm_set_pfn_dirty(pfn);
- }
+
coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
--
2.15.1
KVM only supports PMD hugepages at stage 2. Extend the stage 2 fault
handling to add support for PUD hugepages.
Addition of PUD hugepage support enables additional hugepage sizes (1G
with 4K granule and 4TB with 64k granule) which can be useful on cores
that have support for mapping larger block sizes in the TLB entries.
Signed-off-by: Punit Agrawal <[email protected]>
Cc: Marc Zyngier <[email protected]>
Cc: Christoffer Dall <[email protected]>
Cc: Catalin Marinas <[email protected]>
---
arch/arm/include/asm/kvm_mmu.h | 10 +++++
arch/arm/include/asm/pgtable-3level.h | 2 +
arch/arm64/include/asm/kvm_mmu.h | 19 +++++++++
arch/arm64/include/asm/pgtable-hwdef.h | 2 +
arch/arm64/include/asm/pgtable.h | 4 ++
virt/kvm/arm/mmu.c | 72 +++++++++++++++++++++++++++++-----
6 files changed, 99 insertions(+), 10 deletions(-)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 3fbe919b9181..6e2e34348cb3 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -59,6 +59,10 @@ phys_addr_t kvm_get_idmap_vector(void);
int kvm_mmu_init(void);
void kvm_clear_hyp_idmap(void);
+static inline void kvm_set_pud(pud_t *pud, pud_t new_pud)
+{
+}
+
static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
{
*pmd = new_pmd;
@@ -230,6 +234,12 @@ static inline unsigned int kvm_get_vmid_bits(void)
return 8;
}
+static inline pud_t stage2_build_pud(kvm_pfn_t pfn, pgprot_t mem_type,
+ bool writable)
+{
+ return __pud(0);
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 1a7a17b2a1ba..97e04fdbfa85 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -249,6 +249,8 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
+#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
+
/* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index dbfd18e08cfb..89eac3dbe123 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -160,6 +160,7 @@ void kvm_clear_hyp_idmap(void);
#define kvm_set_pte(ptep, pte) set_pte(ptep, pte)
#define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd)
+#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
{
@@ -173,6 +174,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
return pmd;
}
+static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
+{
+ pud_val(pud) |= PUD_S2_RDWR;
+ return pud;
+}
+
static inline void kvm_set_s2pte_readonly(pte_t *pte)
{
pteval_t old_pteval, pteval;
@@ -319,5 +326,17 @@ static inline unsigned int kvm_get_vmid_bits(void)
return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
}
+static inline pud_t stage2_build_pud(kvm_pfn_t pfn, pgprot_t mem_type,
+ bool writable)
+{
+ pud_t pud = pfn_pud(pfn, mem_type);
+
+ pud = pud_mkhuge(pud);
+ if (writable)
+ pud = kvm_s2pud_mkwrite(pud);
+
+ return pud;
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 40a998cdd399..a091a6192eee 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -181,6 +181,8 @@
#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
+#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
+
/*
* Memory Attribute override for Stage-2 (MemAttr[3:0])
*/
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index bdcc7f1c9d06..d5ffff4369d2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -362,7 +362,11 @@ static inline int pmd_protnone(pmd_t pmd)
#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
#define pud_write(pud) pte_write(pud_pte(pud))
+
+#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT))
+
#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
+#define pfn_pud(pfn, prot) (__pud(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index f02219a91b19..5362de098768 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -872,6 +872,32 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
return stage2_pud_offset(pgd, addr);
}
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+ *cache, phys_addr_t addr, const pud_t *new_pud)
+{
+ pud_t *pud, old_pud;
+
+ pud = stage2_get_pud(kvm, cache, addr);
+ VM_BUG_ON(!pud);
+
+ /*
+ * Mapping in huge pages should only happen through a fault.
+ */
+ VM_BUG_ON(stage2_pud_present(*pud) &&
+ pud_pfn(*pud) != pud_pfn(*new_pud));
+
+ old_pud = *pud;
+ if (stage2_pud_present(old_pud)) {
+ stage2_pud_clear(pud);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pud));
+ }
+
+ kvm_set_pud(pud, *new_pud);
+ return 0;
+}
+
static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr)
{
@@ -1307,6 +1333,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
+ unsigned long vma_pagesize;
unsigned long flags = 0;
write_fault = kvm_is_write_fault(vcpu);
@@ -1324,9 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
+ vma_pagesize = vma_kernel_pagesize(vma);
+ if ((vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) &&
+ !logging_active) {
+ struct hstate *h = hstate_vma(vma);
+
hugetlb = true;
- gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+ gfn = (fault_ipa & huge_page_mask(h)) >> PAGE_SHIFT;
} else {
/*
* Pages belonging to memslots that don't have the same
@@ -1393,17 +1424,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
- if (!hugetlb && !force_pte)
+ if (!hugetlb && !force_pte) {
+ /*
+ * We only support PMD_SIZE transparent
+ * hugepages. This code will need updates if we enable
+ * other page sizes for THP.
+ */
hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+ vma_pagesize = PMD_SIZE;
+ }
if (hugetlb) {
- pmd_t new_pmd = stage2_build_pmd(pfn, mem_type, writable);
-
- if (writable)
- kvm_set_pfn_dirty(pfn);
-
- coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
- ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+ if (vma_pagesize == PUD_SIZE) {
+ pud_t new_pud;
+
+ new_pud = stage2_build_pud(pfn, mem_type, writable);
+ if (writable)
+ kvm_set_pfn_dirty(pfn);
+
+ coherent_cache_guest_page(vcpu, pfn, PUD_SIZE);
+ ret = stage2_set_pud_huge(kvm, memcache,
+ fault_ipa, &new_pud);
+ } else {
+ pmd_t new_pmd;
+
+ new_pmd = stage2_build_pmd(pfn, mem_type, writable);
+ if (writable)
+ kvm_set_pfn_dirty(pfn);
+
+ coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
+ ret = stage2_set_pmd_huge(kvm, memcache,
+ fault_ipa, &new_pmd);
+ }
} else {
pte_t new_pte = pfn_pte(pfn, mem_type);
--
2.15.1
On Wed, Jan 10, 2018 at 07:07:26PM +0000, Punit Agrawal wrote:
> The PUD macros (PUD_TABLE_BIT, PUD_TYPE_MASK, PUD_TYPE_SECT) use the
> pgdval_t even when pudval_t is available. Even though the underlying
> type for both (u64) is the same it is confusing and may lead to issues
> in the future.
>
> Fix this by using pudval_t to define the PUD_* macros.
>
> Fixes: 084bd29810a56 ("ARM64: mm: HugeTLB support.")
> Fixes: 206a2a73a62d3 ("arm64: mm: Create gigabyte kernel logical mappings where possible")
> Signed-off-by: Punit Agrawal <[email protected]>
> Cc: Catalin Marinas <[email protected]>
> Cc: Will Deacon <[email protected]>
I queued this patch. I'll leave the KVM bits to Marc/Christoffer.
--
Catalin
Catalin Marinas <[email protected]> writes:
> On Wed, Jan 10, 2018 at 07:07:26PM +0000, Punit Agrawal wrote:
>> The PUD macros (PUD_TABLE_BIT, PUD_TYPE_MASK, PUD_TYPE_SECT) use the
>> pgdval_t even when pudval_t is available. Even though the underlying
>> type for both (u64) is the same it is confusing and may lead to issues
>> in the future.
>>
>> Fix this by using pudval_t to define the PUD_* macros.
>>
>> Fixes: 084bd29810a56 ("ARM64: mm: HugeTLB support.")
>> Fixes: 206a2a73a62d3 ("arm64: mm: Create gigabyte kernel logical mappings where possible")
>> Signed-off-by: Punit Agrawal <[email protected]>
>> Cc: Catalin Marinas <[email protected]>
>> Cc: Will Deacon <[email protected]>
>
> I queued this patch. I'll leave the KVM bits to Marc/Christoffer.
Thanks for picking up the fix.
Punit
On Wed, Jan 10, 2018 at 07:07:29PM +0000, Punit Agrawal wrote:
> KVM only supports PMD hugepages at stage 2. Extend the stage 2 fault
> handling to add support for PUD hugepages.
>
> Addition of PUD hugpage support enables additional hugepage sizes (1G
*hugepage
> with 4K granule and 4TB with 64k granule) which can be useful on cores
> that have support for mapping larger block sizes in the TLB entries.
>
> Signed-off-by: Punit Agrawal <[email protected]>
> Cc: Marc Zyngier <[email protected]>
> Cc: Christoffer Dall <[email protected]>
> Cc: Catalin Marinas <[email protected]>
> ---
> arch/arm/include/asm/kvm_mmu.h | 10 +++++
> arch/arm/include/asm/pgtable-3level.h | 2 +
> arch/arm64/include/asm/kvm_mmu.h | 19 +++++++++
> arch/arm64/include/asm/pgtable-hwdef.h | 2 +
> arch/arm64/include/asm/pgtable.h | 4 ++
> virt/kvm/arm/mmu.c | 72 +++++++++++++++++++++++++++++-----
> 6 files changed, 99 insertions(+), 10 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index 3fbe919b9181..6e2e34348cb3 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -59,6 +59,10 @@ phys_addr_t kvm_get_idmap_vector(void);
> int kvm_mmu_init(void);
> void kvm_clear_hyp_idmap(void);
>
> +static inline void kvm_set_pud(pud_t *pud, pud_t new_pud)
> +{
> +}
> +
> static inline void kvm_set_pmd(pmd_t *pmd, pmd_t new_pmd)
> {
> *pmd = new_pmd;
> @@ -230,6 +234,12 @@ static inline unsigned int kvm_get_vmid_bits(void)
> return 8;
> }
>
> +static inline pud_t stage2_build_pud(kvm_pfn_t pfn, pgprot_t mem_type,
> + bool writable)
> +{
> + return __pud(0);
> +}
> +
> #endif /* !__ASSEMBLY__ */
>
> #endif /* __ARM_KVM_MMU_H__ */
> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
> index 1a7a17b2a1ba..97e04fdbfa85 100644
> --- a/arch/arm/include/asm/pgtable-3level.h
> +++ b/arch/arm/include/asm/pgtable-3level.h
> @@ -249,6 +249,8 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
> #define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
> #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
>
> +#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
> +
does this make sense on 32-bit arm? Is this ever going to get called
and return something meaningful in that case?
> /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */
> static inline pmd_t pmd_mknotpresent(pmd_t pmd)
> {
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index dbfd18e08cfb..89eac3dbe123 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -160,6 +160,7 @@ void kvm_clear_hyp_idmap(void);
>
> #define kvm_set_pte(ptep, pte) set_pte(ptep, pte)
> #define kvm_set_pmd(pmdp, pmd) set_pmd(pmdp, pmd)
> +#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
>
> static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
> {
> @@ -173,6 +174,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
> return pmd;
> }
>
> +static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
> +{
> + pud_val(pud) |= PUD_S2_RDWR;
> + return pud;
> +}
> +
> static inline void kvm_set_s2pte_readonly(pte_t *pte)
> {
> pteval_t old_pteval, pteval;
> @@ -319,5 +326,17 @@ static inline unsigned int kvm_get_vmid_bits(void)
> return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
> }
>
> +static inline pud_t stage2_build_pud(kvm_pfn_t pfn, pgprot_t mem_type,
> + bool writable)
> +{
> + pud_t pud = pfn_pud(pfn, mem_type);
> +
> + pud = pud_mkhuge(pud);
> + if (writable)
> + pud = kvm_s2pud_mkwrite(pud);
> +
> + return pud;
> +}
> +
> #endif /* __ASSEMBLY__ */
> #endif /* __ARM64_KVM_MMU_H__ */
> diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
> index 40a998cdd399..a091a6192eee 100644
> --- a/arch/arm64/include/asm/pgtable-hwdef.h
> +++ b/arch/arm64/include/asm/pgtable-hwdef.h
> @@ -181,6 +181,8 @@
> #define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */
> #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */
>
> +#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */
> +
> /*
> * Memory Attribute override for Stage-2 (MemAttr[3:0])
> */
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index bdcc7f1c9d06..d5ffff4369d2 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -362,7 +362,11 @@ static inline int pmd_protnone(pmd_t pmd)
> #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
>
> #define pud_write(pud) pte_write(pud_pte(pud))
> +
> +#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT))
> +
> #define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
> +#define pfn_pud(pfn, prot) (__pud(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
>
> #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
>
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index f02219a91b19..5362de098768 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -872,6 +872,32 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
> return stage2_pud_offset(pgd, addr);
> }
>
> +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
> + *cache, phys_addr_t addr, const pud_t *new_pud)
> +{
> + pud_t *pud, old_pud;
> +
> + pud = stage2_get_pud(kvm, cache, addr);
> + VM_BUG_ON(!pud);
> +
> + /*
> + * Mapping in huge pages should only happen through a fault.
> + */
> + VM_BUG_ON(stage2_pud_present(*pud) &&
> + pud_pfn(*pud) != pud_pfn(*new_pud));
> +
> + old_pud = *pud;
> + if (stage2_pud_present(old_pud)) {
> + stage2_pud_clear(pud);
> + kvm_tlb_flush_vmid_ipa(kvm, addr);
> + } else {
> + get_page(virt_to_page(pud));
> + }
> +
> + kvm_set_pud(pud, *new_pud);
> + return 0;
> +}
> +
> static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
> phys_addr_t addr)
> {
> @@ -1307,6 +1333,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> kvm_pfn_t pfn;
> pgprot_t mem_type = PAGE_S2;
> bool logging_active = memslot_is_logging(memslot);
> + unsigned long vma_pagesize;
> unsigned long flags = 0;
>
> write_fault = kvm_is_write_fault(vcpu);
> @@ -1324,9 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> return -EFAULT;
> }
>
> - if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
> + vma_pagesize = vma_kernel_pagesize(vma);
> + if ((vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) &&
> + !logging_active) {
> + struct hstate *h = hstate_vma(vma);
> +
> hugetlb = true;
> - gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
> + gfn = (fault_ipa & huge_page_mask(h)) >> PAGE_SHIFT;
> } else {
> /*
> * Pages belonging to memslots that don't have the same
> @@ -1393,17 +1424,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> if (mmu_notifier_retry(kvm, mmu_seq))
> goto out_unlock;
>
> - if (!hugetlb && !force_pte)
> + if (!hugetlb && !force_pte) {
> + /*
> + * We only support PMD_SIZE transparent
> + * hugepages. This code will need updates if we enable
> + * other page sizes for THP.
> + */
> hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
> + vma_pagesize = PMD_SIZE;
> + }
>
> if (hugetlb) {
> - pmd_t new_pmd = stage2_build_pmd(pfn, mem_type, writable);
> -
> - if (writable)
> - kvm_set_pfn_dirty(pfn);
> -
> - coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
> - ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
> + if (vma_pagesize == PUD_SIZE) {
> + pud_t new_pud;
> +
> + new_pud = stage2_build_pud(pfn, mem_type, writable);
> + if (writable)
> + kvm_set_pfn_dirty(pfn);
> +
> + coherent_cache_guest_page(vcpu, pfn, PUD_SIZE);
> + ret = stage2_set_pud_huge(kvm, memcache,
> + fault_ipa, &new_pud);
> + } else {
> + pmd_t new_pmd;
> +
> + new_pmd = stage2_build_pmd(pfn, mem_type, writable);
> + if (writable)
> + kvm_set_pfn_dirty(pfn);
> +
> + coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
> + ret = stage2_set_pmd_huge(kvm, memcache,
> + fault_ipa, &new_pmd);
> + }
This stuff needs rebasing onto v4.16-rc1 when we get there, and it will
clash with Marc's icache optimizations.
But, you should be able to move kvm_set_pfn_dirty() out of the
size-conditional section and also call the cache maintenance functions
using vma_pagesize as parameter.
> } else {
> pte_t new_pte = pfn_pte(pfn, mem_type);
>
> --
> 2.15.1
>
Thanks,
-Christoffer
On Wed, Jan 10, 2018 at 07:07:27PM +0000, Punit Agrawal wrote:
> In preparation for creating PUD hugepages at stage 2, add support for
> write protecting PUD hugepages when they are encountered. Write
> protecting guest tables is used to track dirty pages when migrating VMs.
>
> Also, provide trivial implementations of required kvm_s2pud_* helpers to
> allow code to compile on arm32.
>
> Signed-off-by: Punit Agrawal <[email protected]>
> Cc: Christoffer Dall <[email protected]>
> Cc: Marc Zyngier <[email protected]>
> ---
> arch/arm/include/asm/kvm_mmu.h | 9 +++++++++
> arch/arm64/include/asm/kvm_mmu.h | 10 ++++++++++
> virt/kvm/arm/mmu.c | 9 ++++++---
> 3 files changed, 25 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
> index fa6f2174276b..3fbe919b9181 100644
> --- a/arch/arm/include/asm/kvm_mmu.h
> +++ b/arch/arm/include/asm/kvm_mmu.h
> @@ -103,6 +103,15 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
> return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
> }
>
> +static inline void kvm_set_s2pud_readonly(pud_t *pud)
> +{
> +}
> +
> +static inline bool kvm_s2pud_readonly(pud_t *pud)
> +{
> + return true;
why true? Shouldn't this return the pgd's readonly value, strictly
speaking, or if we rely on this never being called, have VM_BUG_ON() ?
In any case, a comment explaining why we unconditionally return true
would be nice.
> +}
> +
> static inline bool kvm_page_empty(void *ptr)
> {
> struct page *ptr_page = virt_to_page(ptr);
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 672c8684d5c2..dbfd18e08cfb 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -201,6 +201,16 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
> return kvm_s2pte_readonly((pte_t *)pmd);
> }
>
> +static inline void kvm_set_s2pud_readonly(pud_t *pud)
> +{
> + kvm_set_s2pte_readonly((pte_t *)pud);
> +}
> +
> +static inline bool kvm_s2pud_readonly(pud_t *pud)
> +{
> + return kvm_s2pte_readonly((pte_t *)pud);
> +}
> +
> static inline bool kvm_page_empty(void *ptr)
> {
> struct page *ptr_page = virt_to_page(ptr);
> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
> index 9dea96380339..02eefda5d71e 100644
> --- a/virt/kvm/arm/mmu.c
> +++ b/virt/kvm/arm/mmu.c
> @@ -1155,9 +1155,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
> do {
> next = stage2_pud_addr_end(addr, end);
> if (!stage2_pud_none(*pud)) {
> - /* TODO:PUD not supported, revisit later if supported */
> - BUG_ON(stage2_pud_huge(*pud));
> - stage2_wp_pmds(pud, addr, next);
> + if (stage2_pud_huge(*pud)) {
> + if (!kvm_s2pud_readonly(pud))
> + kvm_set_s2pud_readonly(pud);
> + } else {
> + stage2_wp_pmds(pud, addr, next);
> + }
> }
> } while (pud++, addr = next, addr != end);
> }
> --
> 2.15.1
>
Otherwise:
Reviewed-by: Christoffer Dall <[email protected]>
Christoffer Dall <[email protected]> writes:
> On Wed, Jan 10, 2018 at 07:07:29PM +0000, Punit Agrawal wrote:
>> KVM only supports PMD hugepages at stage 2. Extend the stage 2 fault
>> handling to add support for PUD hugepages.
>>
>> Addition of PUD hugpage support enables additional hugepage sizes (1G
>
> *hugepage
>
>> with 4K granule and 4TB with 64k granule) which can be useful on cores
>> that have support for mapping larger block sizes in the TLB entries.
>>
>> Signed-off-by: Punit Agrawal <[email protected]>
>> Cc: Marc Zyngier <[email protected]>
>> Cc: Christoffer Dall <[email protected]>
>> Cc: Catalin Marinas <[email protected]>
>> ---
>> arch/arm/include/asm/kvm_mmu.h | 10 +++++
>> arch/arm/include/asm/pgtable-3level.h | 2 +
>> arch/arm64/include/asm/kvm_mmu.h | 19 +++++++++
>> arch/arm64/include/asm/pgtable-hwdef.h | 2 +
>> arch/arm64/include/asm/pgtable.h | 4 ++
>> virt/kvm/arm/mmu.c | 72 +++++++++++++++++++++++++++++-----
>> 6 files changed, 99 insertions(+), 10 deletions(-)
>>
[...]
>> diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
>> index 1a7a17b2a1ba..97e04fdbfa85 100644
>> --- a/arch/arm/include/asm/pgtable-3level.h
>> +++ b/arch/arm/include/asm/pgtable-3level.h
>> @@ -249,6 +249,8 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
>> #define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
>> #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
>>
>> +#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
>> +
>
> does this make sense on 32-bit arm? Is this ever going to get called
> and return something meaningful in that case?
This macro should never get called as there are no PUD_SIZE hugepages on
arm.
Ideally we want to fold the pud to fallback to pgd like in the rest of
the code. I'll have another go at this.
>
>> /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */
>> static inline pmd_t pmd_mknotpresent(pmd_t pmd)
>> {
[...]
>> @@ -1393,17 +1424,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>> if (mmu_notifier_retry(kvm, mmu_seq))
>> goto out_unlock;
>>
>> - if (!hugetlb && !force_pte)
>> + if (!hugetlb && !force_pte) {
>> + /*
>> + * We only support PMD_SIZE transparent
>> + * hugepages. This code will need updates if we enable
>> + * other page sizes for THP.
>> + */
>> hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
>> + vma_pagesize = PMD_SIZE;
>> + }
>>
>> if (hugetlb) {
>> - pmd_t new_pmd = stage2_build_pmd(pfn, mem_type, writable);
>> -
>> - if (writable)
>> - kvm_set_pfn_dirty(pfn);
>> -
>> - coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
>> - ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
>> + if (vma_pagesize == PUD_SIZE) {
>> + pud_t new_pud;
>> +
>> + new_pud = stage2_build_pud(pfn, mem_type, writable);
>> + if (writable)
>> + kvm_set_pfn_dirty(pfn);
>> +
>> + coherent_cache_guest_page(vcpu, pfn, PUD_SIZE);
>> + ret = stage2_set_pud_huge(kvm, memcache,
>> + fault_ipa, &new_pud);
>> + } else {
>> + pmd_t new_pmd;
>> +
>> + new_pmd = stage2_build_pmd(pfn, mem_type, writable);
>> + if (writable)
>> + kvm_set_pfn_dirty(pfn);
>> +
>> + coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
>> + ret = stage2_set_pmd_huge(kvm, memcache,
>> + fault_ipa, &new_pmd);
>> + }
>
> This stuff needs rebasing onto v4.16-rc1 when we get there, and it will
> clash with Marc's icache optimizations.
Thanks for the heads up.
>
> But, you should be able to move kvm_set_pfn_dirty() out of the
> size-conditional section and also call the cache maintenance functions
> using vma_pagesize as parameter.
Agreed - I'll roll these suggestions into the next version.
Thanks a lot for the review.
Punit
Christoffer Dall <[email protected]> writes:
> On Wed, Jan 10, 2018 at 07:07:27PM +0000, Punit Agrawal wrote:
>> In preparation for creating PUD hugepages at stage 2, add support for
>> write protecting PUD hugepages when they are encountered. Write
>> protecting guest tables is used to track dirty pages when migrating VMs.
>>
>> Also, provide trivial implementations of required kvm_s2pud_* helpers to
>> allow code to compile on arm32.
>>
>> Signed-off-by: Punit Agrawal <[email protected]>
>> Cc: Christoffer Dall <[email protected]>
>> Cc: Marc Zyngier <[email protected]>
>> ---
>> arch/arm/include/asm/kvm_mmu.h | 9 +++++++++
>> arch/arm64/include/asm/kvm_mmu.h | 10 ++++++++++
>> virt/kvm/arm/mmu.c | 9 ++++++---
>> 3 files changed, 25 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index fa6f2174276b..3fbe919b9181 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -103,6 +103,15 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
>> return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
>> }
>>
>> +static inline void kvm_set_s2pud_readonly(pud_t *pud)
>> +{
>> +}
>> +
>> +static inline bool kvm_s2pud_readonly(pud_t *pud)
>> +{
>> + return true;
>
> why true? Shouldn't this return the pgd's readonly value, strictly
> speaking, or if we rely on this never being called, have VM_BUG_ON() ?
It returns true as it prevents a call to kvm_set_s2pud_readonly() but
both of the above functions should never be called on ARM due to
stage2_pud_huge() returning 0.
I'll add a VM_BUG_ON(pud) to indicate that these functions should never
be called and...
>
> In any case, a comment explaining why we unconditionally return true
> would be nice.
... add a comment to explain what's going on.
>
>> +}
>> +
>> static inline bool kvm_page_empty(void *ptr)
>> {
>> struct page *ptr_page = virt_to_page(ptr);
>> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
>> index 672c8684d5c2..dbfd18e08cfb 100644
>> --- a/arch/arm64/include/asm/kvm_mmu.h
>> +++ b/arch/arm64/include/asm/kvm_mmu.h
>> @@ -201,6 +201,16 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
>> return kvm_s2pte_readonly((pte_t *)pmd);
>> }
>>
>> +static inline void kvm_set_s2pud_readonly(pud_t *pud)
>> +{
>> + kvm_set_s2pte_readonly((pte_t *)pud);
>> +}
>> +
>> +static inline bool kvm_s2pud_readonly(pud_t *pud)
>> +{
>> + return kvm_s2pte_readonly((pte_t *)pud);
>> +}
>> +
>> static inline bool kvm_page_empty(void *ptr)
>> {
>> struct page *ptr_page = virt_to_page(ptr);
>> diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
>> index 9dea96380339..02eefda5d71e 100644
>> --- a/virt/kvm/arm/mmu.c
>> +++ b/virt/kvm/arm/mmu.c
>> @@ -1155,9 +1155,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
>> do {
>> next = stage2_pud_addr_end(addr, end);
>> if (!stage2_pud_none(*pud)) {
>> - /* TODO:PUD not supported, revisit later if supported */
>> - BUG_ON(stage2_pud_huge(*pud));
>> - stage2_wp_pmds(pud, addr, next);
>> + if (stage2_pud_huge(*pud)) {
>> + if (!kvm_s2pud_readonly(pud))
>> + kvm_set_s2pud_readonly(pud);
>> + } else {
>> + stage2_wp_pmds(pud, addr, next);
>> + }
>> }
>> } while (pud++, addr = next, addr != end);
>> }
>> --
>> 2.15.1
>>
>
> Otherwise:
>
> Reviewed-by: Christoffer Dall <[email protected]>
Thanks for taking a look.
Punit