2019-08-22 09:52:05

by Anup Patel

Subject: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming

This patch implements all required functions for programming
the stage2 page table for each Guest/VM.

At a high level, the flow of the stage2 related functions is similar
to the KVM ARM/ARM64 implementation, but the stage2 page table
format is quite different for KVM RISC-V.

Signed-off-by: Anup Patel <[email protected]>
Acked-by: Paolo Bonzini <[email protected]>
Reviewed-by: Paolo Bonzini <[email protected]>
---
arch/riscv/include/asm/kvm_host.h | 10 +
arch/riscv/include/asm/pgtable-bits.h | 1 +
arch/riscv/kvm/mmu.c | 637 +++++++++++++++++++++++++-
3 files changed, 638 insertions(+), 10 deletions(-)

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 3b09158f80f2..a37775c92586 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -72,6 +72,13 @@ struct kvm_mmio_decode {
int shift;
};

+#define KVM_MMU_PAGE_CACHE_NR_OBJS 32
+
+struct kvm_mmu_page_cache {
+ int nobjs;
+ void *objects[KVM_MMU_PAGE_CACHE_NR_OBJS];
+};
+
struct kvm_cpu_context {
unsigned long zero;
unsigned long ra;
@@ -163,6 +170,9 @@ struct kvm_vcpu_arch {
/* MMIO instruction details */
struct kvm_mmio_decode mmio_decode;

+ /* Cache pages needed to program page tables with spinlock held */
+ struct kvm_mmu_page_cache mmu_page_cache;
+
/* VCPU power-off state */
bool power_off;

diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index bbaeb5d35842..be49d62fcc2b 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -26,6 +26,7 @@

#define _PAGE_SPECIAL _PAGE_SOFT
#define _PAGE_TABLE _PAGE_PRESENT
+#define _PAGE_LEAF (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)

/*
* _PAGE_PROT_NONE is set on not-present pages (and ignored by the hardware) to
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 2b965f9aac07..9e95ab6769f6 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -18,6 +18,432 @@
#include <asm/page.h>
#include <asm/pgtable.h>

+#ifdef CONFIG_64BIT
+#define stage2_have_pmd true
+#define stage2_gpa_size ((phys_addr_t)(1ULL << 39))
+#define stage2_cache_min_pages 2
+#else
+#define pmd_index(x) 0
+#define pfn_pmd(x, y) ({ pmd_t __x = { 0 }; __x; })
+#define stage2_have_pmd false
+#define stage2_gpa_size ((phys_addr_t)(1ULL << 32))
+#define stage2_cache_min_pages 1
+#endif
+
+static int stage2_cache_topup(struct kvm_mmu_page_cache *pcache,
+ int min, int max)
+{
+ void *page;
+
+ BUG_ON(max > KVM_MMU_PAGE_CACHE_NR_OBJS);
+ if (pcache->nobjs >= min)
+ return 0;
+ while (pcache->nobjs < max) {
+ page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+ pcache->objects[pcache->nobjs++] = page;
+ }
+
+ return 0;
+}
+
+static void stage2_cache_flush(struct kvm_mmu_page_cache *pcache)
+{
+ while (pcache && pcache->nobjs)
+ free_page((unsigned long)pcache->objects[--pcache->nobjs]);
+}
+
+static void *stage2_cache_alloc(struct kvm_mmu_page_cache *pcache)
+{
+ void *p;
+
+ if (!pcache)
+ return NULL;
+
+ BUG_ON(!pcache->nobjs);
+ p = pcache->objects[--pcache->nobjs];
+
+ return p;
+}
+
+struct local_guest_tlb_info {
+ struct kvm_vmid *vmid;
+ gpa_t addr;
+};
+
+static void local_guest_tlb_flush_vmid_gpa(void *info)
+{
+ struct local_guest_tlb_info *infop = info;
+
+ __kvm_riscv_hfence_gvma_vmid_gpa(READ_ONCE(infop->vmid->vmid_version),
+ infop->addr);
+}
+
+static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
+{
+ struct local_guest_tlb_info info;
+ struct kvm_vmid *vmid = &kvm->arch.vmid;
+
+ /* TODO: This should be SBI call */
+ info.vmid = vmid;
+ info.addr = addr;
+ preempt_disable();
+ smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
+ &info, true);
+ preempt_enable();
+}
+
+static int stage2_set_pgd(struct kvm *kvm, gpa_t addr, const pgd_t *new_pgd)
+{
+ pgd_t *pgdp = &kvm->arch.pgd[pgd_index(addr)];
+
+ *pgdp = *new_pgd;
+ if (pgd_val(*pgdp) & _PAGE_LEAF)
+ stage2_remote_tlb_flush(kvm, addr);
+
+ return 0;
+}
+
+static int stage2_set_pmd(struct kvm *kvm, struct kvm_mmu_page_cache *pcache,
+ gpa_t addr, const pmd_t *new_pmd)
+{
+ int rc;
+ pmd_t *pmdp;
+ pgd_t new_pgd;
+ pgd_t *pgdp = &kvm->arch.pgd[pgd_index(addr)];
+
+ if (!pgd_val(*pgdp)) {
+ pmdp = stage2_cache_alloc(pcache);
+ if (!pmdp)
+ return -ENOMEM;
+ new_pgd = pfn_pgd(PFN_DOWN(__pa(pmdp)), __pgprot(_PAGE_TABLE));
+ rc = stage2_set_pgd(kvm, addr, &new_pgd);
+ if (rc)
+ return rc;
+ }
+
+ if (pgd_val(*pgdp) & _PAGE_LEAF)
+ return -EEXIST;
+
+ pmdp = (void *)pgd_page_vaddr(*pgdp);
+ pmdp = &pmdp[pmd_index(addr)];
+
+ *pmdp = *new_pmd;
+ if (pmd_val(*pmdp) & _PAGE_LEAF)
+ stage2_remote_tlb_flush(kvm, addr);
+
+ return 0;
+}
+
+static int stage2_set_pte(struct kvm *kvm,
+ struct kvm_mmu_page_cache *pcache,
+ gpa_t addr, const pte_t *new_pte)
+{
+ int rc;
+ pte_t *ptep;
+ pmd_t new_pmd;
+ pmd_t *pmdp;
+ pgd_t new_pgd;
+ pgd_t *pgdp = &kvm->arch.pgd[pgd_index(addr)];
+
+ if (!pgd_val(*pgdp)) {
+ pmdp = stage2_cache_alloc(pcache);
+ if (!pmdp)
+ return -ENOMEM;
+ new_pgd = pfn_pgd(PFN_DOWN(__pa(pmdp)), __pgprot(_PAGE_TABLE));
+ rc = stage2_set_pgd(kvm, addr, &new_pgd);
+ if (rc)
+ return rc;
+ }
+
+ if (pgd_val(*pgdp) & _PAGE_LEAF)
+ return -EEXIST;
+
+ if (stage2_have_pmd) {
+ pmdp = (void *)pgd_page_vaddr(*pgdp);
+ pmdp = &pmdp[pmd_index(addr)];
+ if (!pmd_present(*pmdp)) {
+ ptep = stage2_cache_alloc(pcache);
+ if (!ptep)
+ return -ENOMEM;
+ new_pmd = pfn_pmd(PFN_DOWN(__pa(ptep)),
+ __pgprot(_PAGE_TABLE));
+ rc = stage2_set_pmd(kvm, pcache, addr, &new_pmd);
+ if (rc)
+ return rc;
+ }
+
+ if (pmd_val(*pmdp) & _PAGE_LEAF)
+ return -EEXIST;
+
+ ptep = (void *)pmd_page_vaddr(*pmdp);
+ } else {
+ ptep = (void *)pgd_page_vaddr(*pgdp);
+ }
+
+ ptep = &ptep[pte_index(addr)];
+
+ *ptep = *new_pte;
+ if (pte_val(*ptep) & _PAGE_LEAF)
+ stage2_remote_tlb_flush(kvm, addr);
+
+ return 0;
+}
+
+static int stage2_map_page(struct kvm *kvm,
+ struct kvm_mmu_page_cache *pcache,
+ gpa_t gpa, phys_addr_t hpa,
+ unsigned long page_size, pgprot_t prot)
+{
+ pte_t new_pte;
+ pmd_t new_pmd;
+ pgd_t new_pgd;
+
+ if (page_size == PAGE_SIZE) {
+ new_pte = pfn_pte(PFN_DOWN(hpa), prot);
+ return stage2_set_pte(kvm, pcache, gpa, &new_pte);
+ }
+
+ if (stage2_have_pmd && page_size == PMD_SIZE) {
+ new_pmd = pfn_pmd(PFN_DOWN(hpa), prot);
+ return stage2_set_pmd(kvm, pcache, gpa, &new_pmd);
+ }
+
+ if (page_size == PGDIR_SIZE) {
+ new_pgd = pfn_pgd(PFN_DOWN(hpa), prot);
+ return stage2_set_pgd(kvm, gpa, &new_pgd);
+ }
+
+ return -EINVAL;
+}
+
+enum stage2_op {
+ STAGE2_OP_NOP = 0, /* Nothing */
+ STAGE2_OP_CLEAR, /* Clear/Unmap */
+ STAGE2_OP_WP, /* Write-protect */
+};
+
+static void stage2_op_pte(struct kvm *kvm, gpa_t addr, pte_t *ptep,
+ enum stage2_op op)
+{
+ BUG_ON(addr & (PAGE_SIZE - 1));
+
+ if (!pte_present(*ptep))
+ return;
+
+ if (op == STAGE2_OP_CLEAR)
+ set_pte(ptep, __pte(0));
+ else if (op == STAGE2_OP_WP)
+ set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE));
+ stage2_remote_tlb_flush(kvm, addr);
+}
+
+static void stage2_op_pmd(struct kvm *kvm, gpa_t addr, pmd_t *pmdp,
+ enum stage2_op op)
+{
+ int i;
+ pte_t *ptep;
+
+ BUG_ON(addr & (PMD_SIZE - 1));
+
+ if (!pmd_present(*pmdp))
+ return;
+
+ if (pmd_val(*pmdp) & _PAGE_LEAF)
+ ptep = NULL;
+ else
+ ptep = (pte_t *)pmd_page_vaddr(*pmdp);
+
+ if (op == STAGE2_OP_CLEAR)
+ set_pmd(pmdp, __pmd(0));
+
+ if (ptep) {
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ stage2_op_pte(kvm, addr + i * PAGE_SIZE, &ptep[i], op);
+ if (op == STAGE2_OP_CLEAR)
+ put_page(virt_to_page(ptep));
+ } else {
+ if (op == STAGE2_OP_WP)
+ set_pmd(pmdp, __pmd(pmd_val(*pmdp) & ~_PAGE_WRITE));
+ stage2_remote_tlb_flush(kvm, addr);
+ }
+}
+
+static void stage2_op_pgd(struct kvm *kvm, gpa_t addr, pgd_t *pgdp,
+ enum stage2_op op)
+{
+ int i;
+ pte_t *ptep;
+ pmd_t *pmdp;
+
+ BUG_ON(addr & (PGDIR_SIZE - 1));
+
+ if (!pgd_val(*pgdp))
+ return;
+
+ ptep = NULL;
+ pmdp = NULL;
+ if (!(pgd_val(*pgdp) & _PAGE_LEAF)) {
+ if (stage2_have_pmd)
+ pmdp = (pmd_t *)pgd_page_vaddr(*pgdp);
+ else
+ ptep = (pte_t *)pgd_page_vaddr(*pgdp);
+ }
+
+ if (op == STAGE2_OP_CLEAR)
+ set_pgd(pgdp, __pgd(0));
+
+ if (pmdp) {
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ stage2_op_pmd(kvm, addr + i * PMD_SIZE, &pmdp[i], op);
+ if (op == STAGE2_OP_CLEAR)
+ put_page(virt_to_page(pmdp));
+ } else if (ptep) {
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ stage2_op_pte(kvm, addr + i * PAGE_SIZE, &ptep[i], op);
+ if (op == STAGE2_OP_CLEAR)
+ put_page(virt_to_page(ptep));
+ } else {
+ if (op == STAGE2_OP_WP)
+ set_pgd(pgdp, __pgd(pgd_val(*pgdp) & ~_PAGE_WRITE));
+ stage2_remote_tlb_flush(kvm, addr);
+ }
+}
+
+static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size)
+{
+ pmd_t *pmdp;
+ pte_t *ptep;
+ pgd_t *pgdp;
+ gpa_t addr = start, end = start + size;
+
+ while (addr < end) {
+ pgdp = &kvm->arch.pgd[pgd_index(addr)];
+ if (!pgd_val(*pgdp)) {
+ addr += PGDIR_SIZE;
+ continue;
+ } else if (!(addr & (PGDIR_SIZE - 1)) &&
+ ((end - addr) >= PGDIR_SIZE)) {
+ stage2_op_pgd(kvm, addr, pgdp, STAGE2_OP_CLEAR);
+ addr += PGDIR_SIZE;
+ continue;
+ }
+
+ if (stage2_have_pmd) {
+ pmdp = (pmd_t *)pgd_page_vaddr(*pgdp);
+ if (!pmd_present(*pmdp)) {
+ addr += PMD_SIZE;
+ continue;
+ } else if (!(addr & (PMD_SIZE - 1)) &&
+ ((end - addr) >= PMD_SIZE)) {
+ stage2_op_pmd(kvm, addr, pmdp,
+ STAGE2_OP_CLEAR);
+ addr += PMD_SIZE;
+ continue;
+ }
+ ptep = (pte_t *)pmd_page_vaddr(*pmdp);
+ } else {
+ ptep = (pte_t *)pgd_page_vaddr(*pgdp);
+ }
+
+ stage2_op_pte(kvm, addr, ptep, STAGE2_OP_CLEAR);
+ addr += PAGE_SIZE;
+ }
+}
+
+static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end)
+{
+ pmd_t *pmdp;
+ pte_t *ptep;
+ pgd_t *pgdp;
+ gpa_t addr = start;
+
+ while (addr < end) {
+ pgdp = &kvm->arch.pgd[pgd_index(addr)];
+ if (!pgd_val(*pgdp)) {
+ addr += PGDIR_SIZE;
+ continue;
+ } else if (!(addr & (PGDIR_SIZE - 1)) &&
+ ((end - addr) >= PGDIR_SIZE)) {
+ stage2_op_pgd(kvm, addr, pgdp, STAGE2_OP_WP);
+ addr += PGDIR_SIZE;
+ continue;
+ }
+
+ if (stage2_have_pmd) {
+ pmdp = (pmd_t *)pgd_page_vaddr(*pgdp);
+ if (!pmd_present(*pmdp)) {
+ addr += PMD_SIZE;
+ continue;
+ } else if (!(addr & (PMD_SIZE - 1)) &&
+ ((end - addr) >= PMD_SIZE)) {
+ stage2_op_pmd(kvm, addr, pmdp, STAGE2_OP_WP);
+ addr += PMD_SIZE;
+ continue;
+ }
+ ptep = (pte_t *)pmd_page_vaddr(*pmdp);
+ } else {
+ ptep = (pte_t *)pgd_page_vaddr(*pgdp);
+ }
+
+ stage2_op_pte(kvm, addr, ptep, STAGE2_OP_WP);
+ addr += PAGE_SIZE;
+ }
+}
+
+void stage2_wp_memory_region(struct kvm *kvm, int slot)
+{
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+ phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
+ phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ stage2_wp_range(kvm, start, end);
+ spin_unlock(&kvm->mmu_lock);
+ kvm_flush_remote_tlbs(kvm);
+}
+
+int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
+ unsigned long size, bool writable)
+{
+ pte_t pte;
+ int ret = 0;
+ unsigned long pfn;
+ phys_addr_t addr, end;
+ struct kvm_mmu_page_cache pcache = { 0, };
+
+ end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
+ pfn = __phys_to_pfn(hpa);
+
+ for (addr = gpa; addr < end; addr += PAGE_SIZE) {
+ pte = pfn_pte(pfn, PAGE_KERNEL);
+
+ if (!writable)
+ pte = pte_wrprotect(pte);
+
+ ret = stage2_cache_topup(&pcache,
+ stage2_cache_min_pages,
+ KVM_MMU_PAGE_CACHE_NR_OBJS);
+ if (ret)
+ goto out;
+
+ spin_lock(&kvm->mmu_lock);
+ ret = stage2_set_pte(kvm, &pcache, addr, &pte);
+ spin_unlock(&kvm->mmu_lock);
+ if (ret)
+ goto out;
+
+ pfn++;
+ }
+
+out:
+ stage2_cache_flush(&pcache);
+ return ret;
+
+}
+
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
@@ -35,7 +461,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)

void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- /* TODO: */
+ kvm_riscv_stage2_free_pgd(kvm);
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -49,7 +475,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- /* TODO: */
+ /*
+ * At this point memslot has been committed and there is an
+ * allocated dirty_bitmap[], dirty pages will be tracked while the
+ * memory slot is write protected.
+ */
+ if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
+ stage2_wp_memory_region(kvm, mem->slot);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -57,34 +489,219 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
- /* TODO: */
- return 0;
+ hva_t hva = mem->userspace_addr;
+ hva_t reg_end = hva + mem->memory_size;
+ bool writable = !(mem->flags & KVM_MEM_READONLY);
+ int ret = 0;
+
+ if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+ change != KVM_MR_FLAGS_ONLY)
+ return 0;
+
+ /*
+ * Prevent userspace from creating a memory region outside of the GPA
+ * space addressable by the KVM guest GPA space.
+ */
+ if ((memslot->base_gfn + memslot->npages) >=
+ (stage2_gpa_size >> PAGE_SHIFT))
+ return -EFAULT;
+
+ down_read(&current->mm->mmap_sem);
+
+ /*
+ * A memory region could potentially cover multiple VMAs, and
+ * any holes between them, so iterate over all of them to find
+ * out if we can map any of them right now.
+ *
+ * +--------------------------------------------+
+ * +---------------+----------------+ +----------------+
+ * | : VMA 1 | VMA 2 | | VMA 3 : |
+ * +---------------+----------------+ +----------------+
+ * | memory region |
+ * +--------------------------------------------+
+ */
+ do {
+ struct vm_area_struct *vma = find_vma(current->mm, hva);
+ hva_t vm_start, vm_end;
+
+ if (!vma || vma->vm_start >= reg_end)
+ break;
+
+ /*
+ * Mapping a read-only VMA is only allowed if the
+ * memory region is configured as read-only.
+ */
+ if (writable && !(vma->vm_flags & VM_WRITE)) {
+ ret = -EPERM;
+ break;
+ }
+
+ /* Take the intersection of this VMA with the memory region */
+ vm_start = max(hva, vma->vm_start);
+ vm_end = min(reg_end, vma->vm_end);
+
+ if (vma->vm_flags & VM_PFNMAP) {
+ gpa_t gpa = mem->guest_phys_addr +
+ (vm_start - mem->userspace_addr);
+ phys_addr_t pa;
+
+ pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
+ pa += vm_start - vma->vm_start;
+
+ /* IO region dirty page logging not allowed */
+ if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = stage2_ioremap(kvm, gpa, pa,
+ vm_end - vm_start, writable);
+ if (ret)
+ break;
+ }
+ hva = vm_end;
+ } while (hva < reg_end);
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ goto out;
+
+ spin_lock(&kvm->mmu_lock);
+ if (ret)
+ stage2_unmap_range(kvm, mem->guest_phys_addr,
+ mem->memory_size);
+ spin_unlock(&kvm->mmu_lock);
+
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
}

int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
bool is_write)
{
- /* TODO: */
- return 0;
+ int ret;
+ short lsb;
+ kvm_pfn_t hfn;
+ bool writeable;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache;
+ unsigned long vma_pagesize;
+
+ down_read(&current->mm->mmap_sem);
+
+ vma = find_vma_intersection(current->mm, hva, hva + 1);
+ if (unlikely(!vma)) {
+ kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+ up_read(&current->mm->mmap_sem);
+ return -EFAULT;
+ }
+
+ vma_pagesize = vma_kernel_pagesize(vma);
+
+ up_read(&current->mm->mmap_sem);
+
+ if (vma_pagesize != PGDIR_SIZE &&
+ vma_pagesize != PMD_SIZE &&
+ vma_pagesize != PAGE_SIZE) {
+ kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
+ return -EFAULT;
+ }
+
+ /* We need minimum second+third level pages */
+ ret = stage2_cache_topup(pcache, stage2_cache_min_pages,
+ KVM_MMU_PAGE_CACHE_NR_OBJS);
+ if (ret) {
+ kvm_err("Failed to topup stage2 cache\n");
+ return ret;
+ }
+
+ hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable);
+ if (hfn == KVM_PFN_ERR_HWPOISON) {
+ if (is_vm_hugetlb_page(vma))
+ lsb = huge_page_shift(hstate_vma(vma));
+ else
+ lsb = PAGE_SHIFT;
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
+ lsb, current);
+ return 0;
+ }
+ if (is_error_noslot_pfn(hfn))
+ return -EFAULT;
+ if (!writeable && is_write)
+ return -EPERM;
+
+ spin_lock(&kvm->mmu_lock);
+
+ if (writeable) {
+ kvm_set_pfn_dirty(hfn);
+ ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, PAGE_WRITE_EXEC);
+ } else {
+ ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
+ vma_pagesize, PAGE_READ_EXEC);
+ }
+
+ if (ret)
+ kvm_err("Failed to map in stage2\n");
+
+ spin_unlock(&kvm->mmu_lock);
+ kvm_set_pfn_accessed(hfn);
+ kvm_release_pfn_clean(hfn);
+ return ret;
}

void kvm_riscv_stage2_flush_cache(struct kvm_vcpu *vcpu)
{
- /* TODO: */
+ stage2_cache_flush(&vcpu->arch.mmu_page_cache);
}

int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm)
{
- /* TODO: */
+ if (kvm->arch.pgd != NULL) {
+ kvm_err("kvm_arch already initialized?\n");
+ return -EINVAL;
+ }
+
+ kvm->arch.pgd = alloc_pages_exact(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ if (!kvm->arch.pgd)
+ return -ENOMEM;
+ kvm->arch.pgd_phys = virt_to_phys(kvm->arch.pgd);
+
return 0;
}

void kvm_riscv_stage2_free_pgd(struct kvm *kvm)
{
- /* TODO: */
+ void *pgd = NULL;
+
+ spin_lock(&kvm->mmu_lock);
+ if (kvm->arch.pgd) {
+ stage2_unmap_range(kvm, 0UL, stage2_gpa_size);
+ pgd = READ_ONCE(kvm->arch.pgd);
+ kvm->arch.pgd = NULL;
+ kvm->arch.pgd_phys = 0;
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ /* Free the HW pgd, one page at a time */
+ if (pgd)
+ free_pages_exact(pgd, PAGE_SIZE);
}

void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu)
{
- /* TODO: */
+ unsigned long hgatp = HGATP_MODE;
+ struct kvm_arch *k = &vcpu->kvm->arch;
+
+ hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) &
+ HGATP_VMID_MASK;
+ hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
+
+ csr_write(CSR_HGATP, hgatp);
+
+ if (!kvm_riscv_stage2_vmid_bits())
+ __kvm_riscv_hfence_gvma_all();
}
--
2.17.1


2019-08-22 15:26:28

by Anup Patel

Subject: Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming

On Thu, Aug 22, 2019 at 6:57 PM Alexander Graf <[email protected]> wrote:
>
>
>
> On 22.08.19 14:38, Anup Patel wrote:
> > On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <[email protected]> wrote:
> >>
> >> On 22.08.19 10:45, Anup Patel wrote:
> >>> [...]
> >>> +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
> >>> +{
> >>> + struct local_guest_tlb_info info;
> >>> + struct kvm_vmid *vmid = &kvm->arch.vmid;
> >>> +
> >>> + /* TODO: This should be SBI call */
> >>> + info.vmid = vmid;
> >>> + info.addr = addr;
> >>> + preempt_disable();
> >>> + smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
> >>> + &info, true);
> >>
> >> This is all nice and dandy on the toy 4 core systems we have today, but
> >> it will become a bottleneck further down the road.
> >>
> >> How many VMIDs do you have? Could you just allocate a new one every time
> >> you switch host CPUs? Then you know exactly which CPUs to flush by
> >> looking at all your vcpu structs and a local field that tells you which
> >> pCPU they're on at this moment.
> >>
> >> Either way, it's nothing that should block inclusion. For today, we're fine.
> >
> > We are not happy about this either.
> >
> > Other two options, we have are:
> > 1. Have SBI calls for remote HFENCEs
> > 2. Propose RISC-V ISA extension for remote FENCEs
> >
> > Option1 is mostly extending SBI spec and implementing it in runtime
> > firmware.
> >
> > Option2 is ideal solution but requires consensus among wider audience
> > in RISC-V foundation.
> >
> > At this point, we are fine with a simple solution.
>
> It's fine to explicitly IPI other CPUs to flush their TLBs. What is not
> fine is to IPI *all* CPUs to flush their TLBs.

Ahh, this should have been cpu_online_mask instead of cpu_all_mask

I will update this in next revision.
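
That is, keeping everything else the same, the call would just become:

  preempt_disable();
  smp_call_function_many(cpu_online_mask, local_guest_tlb_flush_vmid_gpa,
                         &info, true);
  preempt_enable();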

Regards,
Anup

>
>
> Alex

2019-08-22 17:57:54

by Alexander Graf

Subject: Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming

On 22.08.19 10:45, Anup Patel wrote:
> [...]
> +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
> +{
> + struct local_guest_tlb_info info;
> + struct kvm_vmid *vmid = &kvm->arch.vmid;
> +
> + /* TODO: This should be SBI call */
> + info.vmid = vmid;
> + info.addr = addr;
> + preempt_disable();
> + smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
> + &info, true);

This is all nice and dandy on the toy 4 core systems we have today, but
it will become a bottleneck further down the road.

How many VMIDs do you have? Could you just allocate a new one every time
you switch host CPUs? Then you know exactly which CPUs to flush by
looking at all your vcpu structs and a local field that tells you which
pCPU they're on at this moment.

Either way, it's nothing that should block inclusion. For today, we're fine.
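
Roughly what I have in mind, as a sketch only (it assumes vcpu->cpu is
kept at -1 whenever a vcpu is not loaded on a host CPU, which would need
explicit bookkeeping in the vcpu load/put path):

static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
{
	struct local_guest_tlb_info info;
	struct kvm_vcpu *vcpu;
	cpumask_t mask;		/* on-stack only for the sketch */
	int i;

	info.vmid = &kvm->arch.vmid;
	info.addr = addr;

	/* Only IPI the host CPUs that currently run a vcpu of this VM. */
	cpumask_clear(&mask);
	kvm_for_each_vcpu(i, vcpu, kvm) {
		int cpu = READ_ONCE(vcpu->cpu);

		if (cpu >= 0)
			cpumask_set_cpu(cpu, &mask);
	}

	preempt_disable();
	smp_call_function_many(&mask, local_guest_tlb_flush_vmid_gpa,
			       &info, true);
	preempt_enable();
}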


Alex

2019-08-22 17:59:40

by Anup Patel

Subject: Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming

On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <[email protected]> wrote:
>
> On 22.08.19 10:45, Anup Patel wrote:
> > [...]
> > +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
> > +{
> > + struct local_guest_tlb_info info;
> > + struct kvm_vmid *vmid = &kvm->arch.vmid;
> > +
> > + /* TODO: This should be SBI call */
> > + info.vmid = vmid;
> > + info.addr = addr;
> > + preempt_disable();
> > + smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
> > + &info, true);
>
> This is all nice and dandy on the toy 4 core systems we have today, but
> it will become a bottleneck further down the road.
>
> How many VMIDs do you have? Could you just allocate a new one every time
> you switch host CPUs? Then you know exactly which CPUs to flush by
> looking at all your vcpu structs and a local field that tells you which
> pCPU they're on at this moment.
>
> Either way, it's nothing that should block inclusion. For today, we're fine.

We are not happy about this either.

The other two options we have are:
1. Have SBI calls for remote HFENCEs
2. Propose a RISC-V ISA extension for remote FENCEs

Option 1 is mostly extending the SBI spec and implementing it in the
runtime firmware.

Option 2 is the ideal solution but requires consensus among a wider
audience in the RISC-V foundation.

At this point, we are fine with a simple solution.
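
For reference, the host side of option 1 would be little more than a thin
ecall wrapper. A rough sketch, assuming an SBI v0.2-style calling
convention (extension id in a7, function id in a6); the IDs and argument
layout below are placeholders, nothing is specified yet:

/* Placeholder numbers for a yet-to-be-defined remote HFENCE extension. */
#define SBI_EXT_RFENCE_PLACEHOLDER		0x52464E43
#define SBI_RFENCE_HFENCE_GVMA_VMID_PLACEHOLDER	3

static long sbi_remote_hfence_gvma_vmid(unsigned long hart_mask,
					unsigned long gpa, unsigned long size,
					unsigned long vmid)
{
	register unsigned long a0 asm("a0") = hart_mask;
	register unsigned long a1 asm("a1") = gpa;
	register unsigned long a2 asm("a2") = size;
	register unsigned long a3 asm("a3") = vmid;
	register unsigned long a6 asm("a6") = SBI_RFENCE_HFENCE_GVMA_VMID_PLACEHOLDER;
	register unsigned long a7 asm("a7") = SBI_EXT_RFENCE_PLACEHOLDER;

	asm volatile("ecall"
		     : "+r" (a0), "+r" (a1)
		     : "r" (a2), "r" (a3), "r" (a6), "r" (a7)
		     : "memory");

	return a0;	/* SBI error code (placeholder convention) */
}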

Regards,
Anup

>
>
> Alex

2019-08-22 18:44:44

by Alexander Graf

Subject: Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming



On 22.08.19 14:38, Anup Patel wrote:
> On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <[email protected]> wrote:
>>
>> On 22.08.19 10:45, Anup Patel wrote:
>>> [...]
>>> +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
>>> +{
>>> + struct local_guest_tlb_info info;
>>> + struct kvm_vmid *vmid = &kvm->arch.vmid;
>>> +
>>> + /* TODO: This should be SBI call */
>>> + info.vmid = vmid;
>>> + info.addr = addr;
>>> + preempt_disable();
>>> + smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
>>> + &info, true);
>>
>> This is all nice and dandy on the toy 4 core systems we have today, but
>> it will become a bottleneck further down the road.
>>
>> How many VMIDs do you have? Could you just allocate a new one every time
>> you switch host CPUs? Then you know exactly which CPUs to flush by
>> looking at all your vcpu structs and a local field that tells you which
>> pCPU they're on at this moment.
>>
>> Either way, it's nothing that should block inclusion. For today, we're fine.
>
> We are not happy about this either.
>
> Other two options, we have are:
> 1. Have SBI calls for remote HFENCEs
> 2. Propose RISC-V ISA extension for remote FENCEs
>
> Option1 is mostly extending SBI spec and implementing it in runtime
> firmware.
>
> Option2 is ideal solution but requires consensus among wider audience
> in RISC-V foundation.
>
> At this point, we are fine with a simple solution.

It's fine to explicitly IPI other CPUs to flush their TLBs. What is not
fine is to IPI *all* CPUs to flush their TLBs.


Alex

2019-08-22 19:18:44

by Alexander Graf

Subject: Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming



On 22.08.19 15:58, Anup Patel wrote:
> On Thu, Aug 22, 2019 at 6:57 PM Alexander Graf <[email protected]> wrote:
>>
>>
>>
>> On 22.08.19 14:38, Anup Patel wrote:
>>> On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <[email protected]> wrote:
>>>>
>>>> On 22.08.19 10:45, Anup Patel wrote:
>>>>> [...]
>>>>> +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
>>>>> +{
>>>>> + struct local_guest_tlb_info info;
>>>>> + struct kvm_vmid *vmid = &kvm->arch.vmid;
>>>>> +
>>>>> + /* TODO: This should be SBI call */
>>>>> + info.vmid = vmid;
>>>>> + info.addr = addr;
>>>>> + preempt_disable();
>>>>> + smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
>>>>> + &info, true);
>>>>
>>>> This is all nice and dandy on the toy 4 core systems we have today, but
>>>> it will become a bottleneck further down the road.
>>>>
>>>> How many VMIDs do you have? Could you just allocate a new one every time
>>>> you switch host CPUs? Then you know exactly which CPUs to flush by
>>>> looking at all your vcpu structs and a local field that tells you which
>>>> pCPU they're on at this moment.
>>>>
>>>> Either way, it's nothing that should block inclusion. For today, we're fine.
>>>
>>> We are not happy about this either.
>>>
>>> Other two options, we have are:
>>> 1. Have SBI calls for remote HFENCEs
>>> 2. Propose RISC-V ISA extension for remote FENCEs
>>>
>>> Option1 is mostly extending SBI spec and implementing it in runtime
>>> firmware.
>>>
>>> Option2 is ideal solution but requires consensus among wider audience
>>> in RISC-V foundation.
>>>
>>> At this point, we are fine with a simple solution.
>>
>> It's fine to explicitly IPI other CPUs to flush their TLBs. What is not
>> fine is to IPI *all* CPUs to flush their TLBs.
>
> Ahh, this should have been cpu_online_mask instead of cpu_all_mask
>
> I will update this in next revision.

What I was trying to say is that you only want to flush currently
running other vcpus and add a hint for all the others saying "please
flush the next time you come up".

I think we had a mechanism for that somewhere in the EVENT magic.

But as I said, this is a performance optimization - that's something I'm
happy to delay. Security and user space ABI are the bits I'm worried
about at this stage.
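
To spell out the hint part, just as a sketch (using a request bit such as
KVM_REQ_TLB_FLUSH; the check side would still have to be wired into the
RISC-V vcpu run loop, which this series does not have yet):

/* Sketch: vcpus not currently loaded just get a "flush later" hint. */
static void stage2_mark_deferred_flush(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm)
		if (READ_ONCE(vcpu->cpu) < 0)	/* not running right now */
			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}

/* ...and in the vcpu run loop, before entering the guest: */
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
	__kvm_riscv_hfence_gvma_all();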


Alex

2019-08-23 23:06:34

by Anup Patel

[permalink] [raw]
Subject: Re: [PATCH v5 13/20] RISC-V: KVM: Implement stage2 page table programming

On Thu, Aug 22, 2019 at 7:39 PM Alexander Graf <[email protected]> wrote:
>
>
>
> On 22.08.19 15:58, Anup Patel wrote:
> > On Thu, Aug 22, 2019 at 6:57 PM Alexander Graf <[email protected]> wrote:
> >>
> >>
> >>
> >> On 22.08.19 14:38, Anup Patel wrote:
> >>> On Thu, Aug 22, 2019 at 5:58 PM Alexander Graf <[email protected]> wrote:
> >>>>
> >>>> On 22.08.19 10:45, Anup Patel wrote:
> >>>>> [...]
> >>>>> +static void stage2_remote_tlb_flush(struct kvm *kvm, gpa_t addr)
> >>>>> +{
> >>>>> + struct local_guest_tlb_info info;
> >>>>> + struct kvm_vmid *vmid = &kvm->arch.vmid;
> >>>>> +
> >>>>> + /* TODO: This should be SBI call */
> >>>>> + info.vmid = vmid;
> >>>>> + info.addr = addr;
> >>>>> + preempt_disable();
> >>>>> + smp_call_function_many(cpu_all_mask, local_guest_tlb_flush_vmid_gpa,
> >>>>> + &info, true);
> >>>>
> >>>> This is all nice and dandy on the toy 4 core systems we have today, but
> >>>> it will become a bottleneck further down the road.
> >>>>
> >>>> How many VMIDs do you have? Could you just allocate a new one every time
> >>>> you switch host CPUs? Then you know exactly which CPUs to flush by
> >>>> looking at all your vcpu structs and a local field that tells you which
> >>>> pCPU they're on at this moment.
> >>>>
> >>>> Either way, it's nothing that should block inclusion. For today, we're fine.
> >>>
> >>> We are not happy about this either.
> >>>
> >>> Other two options, we have are:
> >>> 1. Have SBI calls for remote HFENCEs
> >>> 2. Propose RISC-V ISA extension for remote FENCEs
> >>>
> >>> Option1 is mostly extending SBI spec and implementing it in runtime
> >>> firmware.
> >>>
> >>> Option2 is ideal solution but requires consensus among wider audience
> >>> in RISC-V foundation.
> >>>
> >>> At this point, we are fine with a simple solution.
> >>
> >> It's fine to explicitly IPI other CPUs to flush their TLBs. What is not
> >> fine is to IPI *all* CPUs to flush their TLBs.
> >
> > Ahh, this should have been cpu_online_mask instead of cpu_all_mask
> >
> > I will update this in next revision.
>
> What I was trying to say is that you only want to flush currently
> running other vcpus and add a hint for all the others saying "please
> flush the next time you come up".
>
> I think we had a mechanism for that somewhere in the EVENT magic.
>
> But as I said, this is a performance optimization - that's something I'm
> happy to delay. Security and user space ABI are the bits I'm worried
> about at this stage.

I had thought about this previously. This will certainly be a good
optimization. Let me add a TODO comment here so that we don't
forget it.

Regards,
Anup

>
>
> Alex