This change provides initial framework support for KVM on tilegx.
Basic virtual disk and networking are supported.
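
The host-side interface is the generic KVM ioctl ABI; the qemu_* hooks
referenced by HCALL_DEFS in the new <uapi/asm/kvm.h> are expected to be
supplied by the userspace VMM (e.g. a QEMU port), which is not part of
this series.  Purely as a rough, hypothetical illustration -- memory
size, addresses, and the absence of error handling are placeholders --
a minimal VMM would follow the usual architecture-neutral sequence:

  #include <fcntl.h>
  #include <linux/kvm.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>

  int main(void)
  {
          struct kvm_userspace_memory_region mr;
          int kvm, vm, vcpu;
          void *mem;

          kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
          vm = ioctl(kvm, KVM_CREATE_VM, 0);

          /* Back 16MB of guest "physical" memory with anonymous host memory. */
          mem = mmap(NULL, 0x1000000, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
          memset(&mr, 0, sizeof(mr));
          mr.slot = 0;
          mr.guest_phys_addr = 0;
          mr.memory_size = 0x1000000;
          mr.userspace_addr = (uintptr_t)mem;
          ioctl(vm, KVM_SET_USER_MEMORY_REGION, &mr);

          /* Load a guest image and set initial registers here, then run. */
          vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
          ioctl(vcpu, KVM_RUN, 0);
          return 0;
  }

Guest-side paravirtualized I/O goes through hcall_virtio(), as used by
the early-console path in this patch and by kvm_virtio.c.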
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/Kconfig | 19 +-
arch/tile/Makefile | 1 +
arch/tile/include/asm/io.h | 2 +
arch/tile/include/asm/kvm.h | 29 +
arch/tile/include/asm/kvm_host.h | 119 +++
arch/tile/include/asm/kvm_para.h | 20 +
arch/tile/include/asm/kvm_virtio.h | 26 +
arch/tile/include/asm/module.h | 9 +-
arch/tile/include/asm/page.h | 56 +-
arch/tile/include/asm/pgtable_32.h | 2 +-
arch/tile/include/asm/pgtable_64.h | 3 +-
arch/tile/include/asm/processor.h | 6 +-
arch/tile/include/asm/ptrace.h | 2 +-
arch/tile/include/asm/switch_to.h | 25 +-
arch/tile/include/asm/thread_info.h | 17 +-
arch/tile/include/asm/timex.h | 8 +
arch/tile/include/hv/hypervisor.h | 183 +++-
arch/tile/include/uapi/arch/sim.h | 19 +
arch/tile/include/uapi/arch/sim_def.h | 8 +
arch/tile/include/uapi/arch/spr_def_32.h | 15 +
arch/tile/include/uapi/arch/spr_def_64.h | 25 +
arch/tile/include/uapi/asm/Kbuild | 2 +
arch/tile/include/uapi/asm/kvm.h | 249 +++++
arch/tile/include/uapi/asm/kvm_virtio.h | 60 ++
arch/tile/kernel/Makefile | 1 +
arch/tile/kernel/asm-offsets.c | 7 +
arch/tile/kernel/early_printk.c | 17 +
arch/tile/kernel/head_32.S | 4 +-
arch/tile/kernel/head_64.S | 6 +-
arch/tile/kernel/hvglue.S | 8 +-
arch/tile/kernel/hvglue_trace.c | 14 +
arch/tile/kernel/intvec_32.S | 18 +-
arch/tile/kernel/intvec_64.S | 226 +++--
arch/tile/kernel/kvm_virtio.c | 430 ++++++++
arch/tile/kernel/process.c | 40 +-
arch/tile/kernel/relocate_kernel_64.S | 9 +-
arch/tile/kernel/setup.c | 21 +-
arch/tile/kernel/smp.c | 28 +-
arch/tile/kernel/stack.c | 2 +-
arch/tile/kernel/sysfs.c | 4 +
arch/tile/kernel/time.c | 14 +-
arch/tile/kernel/traps.c | 2 +-
arch/tile/kernel/vmlinux.lds.S | 10 +-
arch/tile/kvm/Kconfig | 3 -
arch/tile/kvm/Makefile | 12 +
arch/tile/kvm/entry.S | 91 ++
arch/tile/kvm/kvm-tile.c | 1585 ++++++++++++++++++++++++++++++
arch/tile/lib/exports.c | 20 +-
arch/tile/mm/elf.c | 2 +
arch/tile/mm/fault.c | 4 +-
arch/tile/mm/init.c | 8 +-
arch/tile/mm/pgtable.c | 35 +-
include/uapi/linux/kvm.h | 3 +
virt/kvm/kvm_main.c | 7 +-
54 files changed, 3338 insertions(+), 198 deletions(-)
create mode 100644 arch/tile/include/asm/kvm.h
create mode 100644 arch/tile/include/asm/kvm_host.h
create mode 100644 arch/tile/include/asm/kvm_para.h
create mode 100644 arch/tile/include/asm/kvm_virtio.h
create mode 100644 arch/tile/include/uapi/asm/kvm.h
create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
create mode 100644 arch/tile/kernel/kvm_virtio.c
create mode 100644 arch/tile/kvm/Makefile
create mode 100644 arch/tile/kvm/entry.S
create mode 100644 arch/tile/kvm/kvm-tile.c
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index ecff467..bbb6d51 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -5,7 +5,6 @@ config TILE
def_bool y
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
- select HAVE_KVM if !TILEGX
select GENERIC_FIND_FIRST_BIT
select SYSCTL_EXCEPTION_TRACE
select USE_GENERIC_SMP_HELPERS
@@ -113,6 +112,7 @@ config SMP
def_bool y
config HVC_TILE
+ depends on !KVM_GUEST
depends on TTY
select HVC_DRIVER
select HVC_IRQ if TILEGX
@@ -127,6 +127,7 @@ config TILEGX
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KPROBES
select HAVE_KRETPROBES
+ select HAVE_KVM if !KVM_GUEST
config TILEPRO
def_bool !TILEGX
@@ -366,11 +367,23 @@ config HARDWALL
bool "Hardwall support to allow access to user dynamic network"
default y
+config KVM_GUEST
+ bool "Build kernel as guest for KVM"
+ default n
+ depends on TILEGX
+ select VIRTIO
+ select VIRTIO_RING
+ select VIRTIO_CONSOLE
+ ---help---
+ This will build a kernel that runs at a lower protection level
+ than the default kernel and is suitable to run under KVM.
+
+# TILEPro kernels run at PL1; TILE-Gx runs at PL2 unless it's a KVM guest.
config KERNEL_PL
int "Processor protection level for kernel"
range 1 2
- default 2 if TILEGX
- default 1 if !TILEGX
+ default 2 if TILEGX && !KVM_GUEST
+ default 1 if !TILEGX || KVM_GUEST
---help---
Since MDE 4.2, the Tilera hypervisor runs the kernel
at PL2 by default. If running under an older hypervisor,
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364..8e7f852 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH)
# See arch/tile/Kbuild for content of core part of the kernel
core-y += arch/tile/
+core-$(CONFIG_KVM) += arch/tile/kvm/
core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 9fe4349..023659b 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -43,6 +43,8 @@
* long before casting it to a pointer to avoid compiler warnings.
*/
#if CHIP_HAS_MMIO()
+extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot);
extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
pgprot_t pgprot);
diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
new file mode 100644
index 0000000..2ea6c41
--- /dev/null
+++ b/arch/tile/include/asm/kvm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_H
+#define _ASM_TILE_KVM_H
+
+#include <hv/hypervisor.h>
+#include <uapi/asm/kvm.h>
+
+#ifndef __ASSEMBLER__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
+#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
+#endif
+#endif /* _ASM_TILE_KVM_H */
diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
new file mode 100644
index 0000000..8241f50
--- /dev/null
+++ b/arch/tile/include/asm/kvm_host.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _ASM_TILE_KVM_HOST_H
+#define _ASM_TILE_KVM_HOST_H
+
+#define KVM_MAX_VCPUS 64
+#define KVM_USER_MEM_SLOTS 32
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+/* For now, claim we have no huge pages. */
+#define KVM_HPAGE_GFN_SHIFT(x) 0
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) 1
+
+/* Max number of message tags for hv_send/receive_message() */
+#define MAX_MSG_TAG (sizeof(unsigned long) * 8)
+
+/* Bits in pending_downcalls */
+#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kvm_vcpu_stat {
+ /* None yet. */
+};
+
+struct kvm_vcpu_arch {
+ struct pt_regs regs;
+ unsigned long host_sp; /* Host "real" sp during vmresume. */
+ HV_Context guest_context;
+ unsigned long pending_msgs; /* Pending guest messages */
+ unsigned long ipi_events; /* Pending guest ipi events. */
+ unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
+ pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
+ unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */
+ int suspended; /* true for cores not yet started by host */
+ unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */
+ unsigned long vmexit_cycles; /* cycle count of last vmexit */
+
+#define FOR_EACH_GUEST_SPR(f) \
+ f(INTERRUPT_MASK_1); \
+ f(INTERRUPT_VECTOR_BASE_1); \
+ f(EX_CONTEXT_1_0); \
+ f(EX_CONTEXT_1_1); \
+ f(SYSTEM_SAVE_1_0); \
+ f(SYSTEM_SAVE_1_1); \
+ f(SYSTEM_SAVE_1_2); \
+ f(SYSTEM_SAVE_1_3); \
+ f(INTCTRL_1_STATUS); \
+ f(IPI_MASK_1); \
+ f(IPI_EVENT_1); \
+ f(SINGLE_STEP_CONTROL_1); \
+ f(SINGLE_STEP_EN_1_1); \
+
+#define DECLARE_SPR(f) unsigned long f
+ FOR_EACH_GUEST_SPR(DECLARE_SPR)
+#undef DECLARE_SPR
+};
+
+struct kvm_vm_stat {
+ /*
+ * FIXME - does this make sense for us? It's used in common KVM
+ * code.
+ */
+ u32 remote_tlb_flush;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+struct kvm_arch {
+ pgd_t *vpgd;
+ unsigned long resv_gpa_start; /* For special purpose. */
+ struct completion smp_start;
+};
+
+struct kvm_vcpu;
+
+extern void kvm_vmresume(struct pt_regs *guest,
+ unsigned long *host_sp_ptr);
+extern void kvm_vmexit(unsigned long host_sp);
+extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
+extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
+extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long, unsigned long);
+extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
+
+extern void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
+
+#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
+
+#define gpmd_offset(kvm, pud, address) \
+ ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
+
+#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
+
+#define gpte_offset_kernel(kvm, pmd, address) \
+ ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
+
+#endif /* __ASSEMBLY__*/
+
+#endif /* _ASM_TILE_KVM_HOST_H */
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 0000000..c8c31d5
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_PARA_H
+#define _ASM_TILE_KVM_PARA_H
+
+#include <uapi/asm/kvm_para.h>
+
+int hcall_virtio(unsigned long instrument, unsigned long mem);
+#endif /* _ASM_TILE_KVM_PARA_H */
diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
new file mode 100644
index 0000000..8faa959
--- /dev/null
+++ b/arch/tile/include/asm/kvm_virtio.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_VIRTIO_H
+#define _ASM_TILE_KVM_VIRTIO_H
+
+#include <uapi/asm/kvm_virtio.h>
+
+
+struct kvm_device {
+ struct virtio_device vdev;
+ struct kvm_device_desc *desc;
+ unsigned long desc_pa;
+};
+
+#endif /* _ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
index 44ed07c..927c97f 100644
--- a/arch/tile/include/asm/module.h
+++ b/arch/tile/include/asm/module.h
@@ -28,6 +28,13 @@
# define MODULE_PGSZ ""
#endif
+/* Tag guest Linux, since it uses different SPRs, etc. */
+#if CONFIG_KERNEL_PL == 2
+#define MODULE_PL ""
+#else
+#define MODULE_PL " guest"
+#endif
+
/* We don't really support no-SMP so tag if someone tries. */
#ifdef CONFIG_SMP
#define MODULE_NOSMP ""
@@ -35,6 +42,6 @@
#define MODULE_NOSMP " nosmp"
#endif
-#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP
#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index b4f96c0..65ee752 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif
+#ifdef CONFIG_KVM_GUEST
+/* Paravirtualized guests get half the VA, and thus half the PA. */
+#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1)
+#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1)
+#else
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+#endif
+
/* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
#define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
#define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
#define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
* We reserve the lower half of memory for user-space programs, and the
* upper half for system code. We re-map all of physical memory in the
* upper half, which takes a quarter of our VA space. Then we have
- * the vmalloc regions. The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions. The supervisor code lives at the highest address,
* with the hypervisor above that.
*
* Loadable kernel modules are placed immediately after the static
@@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size)
* Similarly, for now we don't play any struct page mapping games.
*/
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
# error Too much PA to map with the VA available!
#endif
-#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
-#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */
-#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */
-#define PAGE_OFFSET MEM_HIGH_START
-#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */
+#ifdef CONFIG_KVM_GUEST
+#define PAGE_OFFSET (_AC(1, UL) << (MAX_VA_WIDTH - 1))
+#define KERNEL_HIGH_VADDR (_AC(1, UL) << MAX_VA_WIDTH)
+#else
+#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */
+#endif
+
+#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
#define _VMALLOC_START FIXADDR_TOP
-#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT MEM_SV_START
-#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */
#define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR MEM_SV_START
#else /* !__tilegx__ */
@@ -213,8 +221,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
* values, and after that, we show "typical" values, since the actual
* addresses depend on kernel #defines.
*
- * MEM_HV_INTRPT 0xfe000000
- * MEM_SV_INTRPT (kernel code) 0xfd000000
+ * MEM_HV_START 0xfe000000
+ * MEM_SV_START (kernel code) 0xfd000000
* MEM_USER_INTRPT (user vector) 0xfc000000
* FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR)
* PKMAP_BASE 0xf7000000 (via LAST_PKMAP)
@@ -224,14 +232,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
*/
#define MEM_USER_INTRPT _AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT _AC(0xfd000000, UL)
-#define MEM_HV_INTRPT _AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT _AC(0xfd000000, UL)
-#define MEM_SV_INTRPT _AC(0xfe000000, UL)
-#define MEM_HV_INTRPT _AC(0xff000000, UL)
-#endif
+#define MEM_SV_START _AC(0xfd000000, UL)
+#define MEM_HV_START _AC(0xfe000000, UL)
#define INTRPT_SIZE 0x4000
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index e5bdc0e..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; }
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_INTRPT;
+ return addr >= MEM_HV_START;
}
/*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 7cb8d35..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_START ||
- (addr > MEM_LOW_END && addr < MEM_HIGH_START);
+ return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
}
/*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 230b830..5aa5431 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
#ifndef _ASM_TILE_PROCESSOR_H
#define _ASM_TILE_PROCESSOR_H
+#include <arch/chip.h>
+
#ifndef __ASSEMBLY__
/*
@@ -25,7 +27,6 @@
#include <asm/ptrace.h>
#include <asm/percpu.h>
-#include <arch/chip.h>
#include <arch/spr_def.h>
struct task_struct;
@@ -167,7 +168,7 @@ struct thread_struct {
#ifndef __ASSEMBLY__
#ifdef __tilegx__
-#define TASK_SIZE_MAX (MEM_LOW_END + 1)
+#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1))
#else
#define TASK_SIZE_MAX PAGE_OFFSET
#endif
@@ -347,7 +348,6 @@ extern int kdata_huge;
/*
* Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
*/
#define USER_PL 0
#if CONFIG_KERNEL_PL == 2
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 0d25c21..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
#define user_stack_pointer(regs) ((regs)->sp)
/* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
/* Fill in a struct pt_regs with the current kernel registers. */
struct pt_regs *get_pt_regs(struct pt_regs *);
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index b8f888c..8e9150f 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev,
extern unsigned long get_switch_to_pc(void);
/*
+ * Normally we notify the simulator whenever we change from one pid
+ * to another, so it can track symbol files appropriately on the fly.
+ * For now, we don't do this for the guest Linux, since we don't
+ * have a way to tell the simulator that we are entering a separate
+ * pid space when we are in the guest.
+ */
+#ifdef CONFIG_KVM_GUEST
+#define notify_sim_task_change(prev) do { } while (0)
+#else
+#define notify_sim_task_change(prev) do { \
+ if (unlikely((prev)->state == TASK_DEAD)) \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
+ ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
+ (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+} while (0)
+#endif
+
+/*
* Kernel threads can check to see if they need to migrate their
* stack whenever they return from a context switch; for user
* threads, we defer until they are returning to user-space.
*/
#define finish_arch_switch(prev) do { \
- if (unlikely((prev)->state == TASK_DEAD)) \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
- ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
- (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ notify_sim_task_change(prev); \
if (current->mm == NULL && !kstack_hash && \
current_thread_info()->homecache_cpu != smp_processor_id()) \
homecache_migrate_kthread(); \
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index b8aa6df..1c26cdf 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -18,7 +18,9 @@
#include <asm/processor.h>
#include <asm/page.h>
+
#ifndef __ASSEMBLY__
+struct kvm_vcpu;
/*
* Low level task data that assembly code needs immediate access to.
@@ -44,6 +46,9 @@ struct thread_info {
unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */
void __user *unalign_jit_base; /* unalign fixup JIT base */
#endif
+#ifdef CONFIG_KVM
+ struct kvm_vcpu *vcpu; /* vcpu during vmresume */
+#endif
};
/*
@@ -117,8 +122,8 @@ extern void _cpu_idle(void);
/*
* Thread information flags that various assembly files may need to access.
- * Keep flags accessed frequently in low bits, particular since it makes
- * it easier to build constants in assembly.
+ * Keep flags accessed frequently in low bits, since it makes it
+ * easier to build constants in assembly.
*/
#define TIF_SIGPENDING 0 /* signal pending */
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
@@ -131,6 +136,7 @@ extern void _cpu_idle(void);
#define TIF_MEMDIE 7 /* OOM killer at work */
#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
+#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
@@ -142,11 +148,12 @@ extern void _cpu_idle(void);
#define _TIF_MEMDIE (1<<TIF_MEMDIE)
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VIRT_EXIT (1<<TIF_VIRT_EXIT)
/* Work to do on any return to user space. */
-#define _TIF_ALLWORK_MASK \
- (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
- _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
+#define _TIF_ALLWORK_MASK \
+ (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP| \
+ _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)
/* Work to do at syscall entry. */
#define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
index edbd7e4..0417617 100644
--- a/arch/tile/include/asm/timex.h
+++ b/arch/tile/include/asm/timex.h
@@ -27,6 +27,14 @@
typedef unsigned long long cycles_t;
+#ifdef CONFIG_KVM_GUEST
+#define INT_LINUX_TIMER INT_AUX_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_AUX_TILE_TIMER_CONTROL
+#else
+#define INT_LINUX_TIMER INT_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_TILE_TIMER_CONTROL
+#endif
+
#if CHIP_HAS_SPLIT_CYCLE()
cycles_t get_cycles(void);
#define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f71b08e..71abe38 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -321,6 +321,18 @@
/** hv_set_speed */
#define HV_DISPATCH_SET_SPEED 58
+/** hv_install_virt_context */
+#define HV_DISPATCH_INSTALL_VIRT_CONTEXT 59
+
+/** hv_inquire_virt_context */
+#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT 60
+
+/** hv_install_guest_context */
+#define HV_DISPATCH_INSTALL_GUEST_CONTEXT 61
+
+/** hv_inquire_guest_context */
+#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT 62
+
/** hv_console_set_ipi */
#define HV_DISPATCH_CONSOLE_SET_IPI 63
@@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
* new page table does not need to contain any mapping for the
* hv_install_context address itself.
*
- * At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ * At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
* if multiple flags are specified, HV_EINVAL is returned.
* Specifying none of the flags results in using the default page size.
* All cores participating in a given client must request the same
* page size, or the results are undefined.
*
+ * To disable an installed page table, install HV_CTX_NONE. The access
+ * and asid fields are ignored.
+ *
* @param page_table Root of the page table.
* @param access PTE providing info on how to read the page table. This
* value must be consistent between multiple tiles sharing a page table,
@@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
#endif /* !__ASSEMBLER__ */
+#define HV_CTX_NONE ((HV_PhysAddr)-1) /**< Disable page table. */
+
#define HV_CTX_DIRECTIO 0x1 /**< Direct I/O requests are accepted from
PL0. */
+#define HV_CTX_GUEST_CACHE 0x4 /**< Let guest control caching flags (only
+ usable with hv_install_virt_context.) */
+
#define HV_CTX_PG_SM_4K 0x10 /**< Use 4K small pages, if available. */
#define HV_CTX_PG_SM_16K 0x20 /**< Use 16K small pages, if available. */
#define HV_CTX_PG_SM_64K 0x40 /**< Use 64K small pages, if available. */
#define HV_CTX_PG_SM_MASK 0xf0 /**< Mask of all possible small pages. */
+
#ifndef __ASSEMBLER__
+/** Install a virtualization context.
+ *
+ * When a virtualization context is installed, all faults from PL0 or
+ * PL1 are handled via a "guest context" and then post-processed by
+ * the "virtualization context"; faults at PL2 are still handled by
+ * the normal context. For guest faults, the "guest PAs" produced by
+ * the guest page table are passed through the virtualization page
+ * table as pseudo-VAs, generating the true CPA as a result. See the
+ * individual HV_PTE_xxx bits for the effect the bits have when
+ * present in the virtualization page table. The ASID is currently
+ * ignored in this syscall, but it might be used later, so the API
+ * includes it. The HV_CTX_GUEST_CACHE flag indicates that all
+ * cache-related flags should be taken from the primary page table,
+ * not the virtualization page table.
+ *
+ * Once the virtualization context is installed, a guest context
+ * should also be installed; otherwise a VA-equals-PA context will be
+ * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
+ * the virtualization context to generate CPAs.
+ *
+ * When entering client PL after being at guest or user PL, the
+ * client is expected to call hv_flush_all() to clear any TLB mappings
+ * that might otherwise conflict. Similarly, hv_flush_all() should
+ * be called before returning to guest or user PL with a virtualization
+ * context installed, so that any TLB mappings are cleared. Future
+ * work may include adding a "vpid" or similar namespace so that
+ * the TLBs may be managed independently.
+ *
+ * Subsequent guest page table installations will have their root PA
+ * and PTE cached after translating through the virtualization
+ * context, so if entries in the virtualization page table are
+ * modified or removed, the guest context should be re-installed.
+ * This, in conjunction with flushing the TLB on return to the guest,
+ * will ensure that the new virtualization entries are honored.
+ *
+ * @param page_table Root of the page table.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for (currently ignored).
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current virtualization context (see below).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
+
+
+/** Install a guest context.
+ *
+ * The guest context is only consulted when a virtualization context
+ * is also installed, and for faults that occur below the client's PL.
+ * If no guest context is installed when such a fault occurs, a
+ * VA=PA context is used instead.
+ *
+ * The access PTE will only be honored if the virtualization table was
+ * installed with HV_CTX_GUEST_CACHE.
+ *
+ * A virtualization context must already be installed prior to
+ * installing the guest context.
+ *
+ * @param page_table Root of the page table; the value is the guest's
+ * physical address (GPA), not a CPA.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for.
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current context (HV_CTX_xxx).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
/** Set the number of pages ganged together by HV_PTE_SUPER at a
* particular level of the page table.
@@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
* "super" page size must be less than the span of the next level in
* the page table. The largest size that can be requested is 64GB.
*
- * The shift value is initially "0" for all page table levels,
+ * The shift value is initially 0 for all page table levels,
* indicating that the HV_PTE_SUPER bit is effectively ignored.
*
* If you change the count from one non-zero value to another, the
@@ -854,11 +954,26 @@ typedef struct
} HV_Context;
/** Retrieve information about the currently installed context.
- * @return The data passed to the last successful hv_install_context call.
+ * @return The data passed to the last successful call to
+ * hv_install_context().
*/
HV_Context hv_inquire_context(void);
+/** Retrieve information about the currently installed virtualization context.
+ * @return The data passed to the last successful call to
+ * hv_install_virt_context().
+ */
+HV_Context hv_inquire_virt_context(void);
+
+
+/** Retrieve information about the currently installed guest context.
+ * @return The data passed to the last successful call to
+ * hv_install_guest_context().
+ */
+HV_Context hv_inquire_guest_context(void);
+
+
/** Flushes all translations associated with the named address space
* identifier from the TLB and any other hypervisor data structures.
* Translations installed with the "global" bit are not flushed.
@@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
/** Flushes all non-global translations (if preserve_global is true),
* or absolutely all translations (if preserve_global is false).
*
- * @param preserve_global Non-zero if we want to preserve "global" mappings.
+ * @param preserve_global Non-zero if we want to preserve global mappings.
* @return Zero on success, or a hypervisor error code on failure.
*/
int hv_flush_all(int preserve_global);
@@ -991,7 +1106,11 @@ typedef enum {
HV_INQ_TILES_HFH_CACHE = 2,
/** The set of tiles that can be legally used as a LOTAR for a PTE. */
- HV_INQ_TILES_LOTAR = 3
+ HV_INQ_TILES_LOTAR = 3,
+
+ /** The set of "shared" driver tiles that the hypervisor may
+ * periodically interrupt. */
+ HV_INQ_TILES_SHARED = 4
} HV_InqTileSet;
/** Returns specific information about various sets of tiles within the
@@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
*/
/** Message receive downcall interrupt vector */
#define INT_MESSAGE_RCV_DWNCL INT_BOOT_ACCESS
+/** Device interrupt downcall interrupt vector */
+#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+#ifdef __tilegx__
+/** Virtualization page table miss downcall interrupt vector */
+#define INT_VPGTABLE_MISS_DWNCL INT_I_ASID
+/** Virtualization guest illegal page table */
+#define INT_VGUEST_FATAL_DWNCL INT_D_ASID
+#else
/** DMA TLB miss downcall interrupt vector */
#define INT_DMATLB_MISS_DWNCL INT_DMA_ASID
-/** Static nework processor instruction TLB miss interrupt vector */
-#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
/** DMA TLB access violation downcall interrupt vector */
#define INT_DMATLB_ACCESS_DWNCL INT_DMA_CPL
-/** Device interrupt downcall interrupt vector */
-#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+/** Static network processor instruction TLB miss interrupt vector */
+#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
+#endif
#ifndef __ASSEMBLER__
@@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
#define HV_PTE_PTFN_BITS 29 /**< Number of bits in a PTFN */
/*
- * Legal values for the PTE's mode field
+ * Legal values for the PTE's mode field.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
+ * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
+ * to access MMIO resources via pseudo PAs that map to MMIO in the
+ * virtualization page table.
*/
+
/** Data is not resident in any caches; loads and stores access memory
* directly.
*/
@@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the primary page table if a virtualization
+ * page table is installed.
*/
#define HV_PTE_GLOBAL (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
@@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the virtualization page table.
*/
#define HV_PTE_USER (__HV_PTE_ONE << HV_PTE_INDEX_USER)
@@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_ACCESSED (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
@@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_DIRTY (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
@@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NC (__HV_PTE_ONE << HV_PTE_INDEX_NC)
@@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit
* determines how the level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L1 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
@@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L2 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
@@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* the page map directly to memory.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_CACHED_PRIORITY (__HV_PTE_ONE << \
HV_PTE_INDEX_CACHED_PRIORITY)
@@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* It is illegal for this bit to be clear if the Writable bit is set.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Readable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_READABLE (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
@@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* PTE.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Writable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_WRITABLE (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
@@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* than one.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Executable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_EXECUTABLE (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
index e54b7b0..36fb24c 100644
--- a/arch/tile/include/uapi/arch/sim.h
+++ b/arch/tile/include/uapi/arch/sim.h
@@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
__insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
}
+/**
+ * Set vCPU number for a given task.
+ * @param vcpu Virtual cpu to set.
+ */
+static __inline void
+sim_set_vcpu(int vcpu)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+/** Clear vCPU status for a given task. */
+static __inline void
+sim_clear_vcpu(void)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (-1 << _SIM_CONTROL_OPERATOR_BITS));
+}
+
/*
* Event support.
diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
index 4b44a2b..b9aad66 100644
--- a/arch/tile/include/uapi/arch/sim_def.h
+++ b/arch/tile/include/uapi/arch/sim_def.h
@@ -221,6 +221,14 @@
*/
#define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
+/**
+ * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
+ * number shifted by 8, will tag any identification of the cpu that
+ * task is running on with the given virtual cpu number. If the
+ * virtual cpu number is -1, the tag is removed.
+ */
+#define SIM_CONTROL_VCPU 37
+
/*
* Syscall numbers for use with "sim_syscall()".
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446..4644c8d 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -121,6 +121,9 @@
#define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
#define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
#define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_GPV_SET_0 0x0600
+#define SPR_MPL_GPV_SET_1 0x0601
+#define SPR_MPL_GPV_SET_2 0x0602
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -142,6 +145,9 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x3400
#define SPR_MPL_IDN_TIMER_SET_1 0x3401
#define SPR_MPL_IDN_TIMER_SET_2 0x3402
+#define SPR_MPL_ILL_SET_0 0x0400
+#define SPR_MPL_ILL_SET_1 0x0401
+#define SPR_MPL_ILL_SET_2 0x0402
#define SPR_MPL_INTCTRL_0_SET_0 0x4a00
#define SPR_MPL_INTCTRL_0_SET_1 0x4a01
#define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -166,6 +172,12 @@
#define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
#define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
#define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
+#define SPR_MPL_SWINT_0_SET_0 0x1c00
+#define SPR_MPL_SWINT_0_SET_1 0x1c01
+#define SPR_MPL_SWINT_0_SET_2 0x1c02
+#define SPR_MPL_SWINT_1_SET_0 0x1a00
+#define SPR_MPL_SWINT_1_SET_1 0x1a01
+#define SPR_MPL_SWINT_1_SET_2 0x1a02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
@@ -187,6 +199,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x3600
#define SPR_MPL_UDN_TIMER_SET_1 0x3601
#define SPR_MPL_UDN_TIMER_SET_2 0x3602
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
#define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
#define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
#define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
index 67a6c17..727cda7 100644
--- a/arch/tile/include/uapi/arch/spr_def_64.h
+++ b/arch/tile/include/uapi/arch/spr_def_64.h
@@ -21,6 +21,10 @@
#define SPR_AUX_PERF_COUNT_1 0x2106
#define SPR_AUX_PERF_COUNT_CTL 0x2107
#define SPR_AUX_PERF_COUNT_STS 0x2108
+#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
+#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK 0xffffffff
+#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
+#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
#define SPR_CMPEXCH_VALUE 0x2780
#define SPR_CYCLE 0x2781
#define SPR_DONE 0x2705
@@ -101,6 +105,9 @@
#define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
#define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
#define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_GPV_SET_0 0x0900
+#define SPR_MPL_GPV_SET_1 0x0901
+#define SPR_MPL_GPV_SET_2 0x0902
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -116,6 +123,12 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x1800
#define SPR_MPL_IDN_TIMER_SET_1 0x1801
#define SPR_MPL_IDN_TIMER_SET_2 0x1802
+#define SPR_MPL_ILL_SET_0 0x0800
+#define SPR_MPL_ILL_SET_1 0x0801
+#define SPR_MPL_ILL_SET_2 0x0802
+#define SPR_MPL_ILL_TRANS_SET_0 0x1000
+#define SPR_MPL_ILL_TRANS_SET_1 0x1001
+#define SPR_MPL_ILL_TRANS_SET_2 0x1002
#define SPR_MPL_INTCTRL_0_SET_0 0x2500
#define SPR_MPL_INTCTRL_0_SET_1 0x2501
#define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -140,6 +153,15 @@
#define SPR_MPL_PERF_COUNT_SET_0 0x2000
#define SPR_MPL_PERF_COUNT_SET_1 0x2001
#define SPR_MPL_PERF_COUNT_SET_2 0x2002
+#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
+#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
+#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
+#define SPR_MPL_SWINT_0_SET_0 0x0f00
+#define SPR_MPL_SWINT_0_SET_1 0x0f01
+#define SPR_MPL_SWINT_0_SET_2 0x0f02
+#define SPR_MPL_SWINT_1_SET_0 0x0e00
+#define SPR_MPL_SWINT_1_SET_1 0x0e01
+#define SPR_MPL_SWINT_1_SET_2 0x0e02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -155,6 +177,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x1900
#define SPR_MPL_UDN_TIMER_SET_1 0x1901
#define SPR_MPL_UDN_TIMER_SET_2 0x1902
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
#define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
#define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
#define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index c20db8e..f07cc24 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -6,7 +6,9 @@ header-y += bitsperlong.h
header-y += byteorder.h
header-y += cachectl.h
header-y += hardwall.h
+header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_virtio.h
header-y += mman.h
header-y += ptrace.h
header-y += setup.h
diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
new file mode 100644
index 0000000..25ca8ce
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_H
+#define _UAPI_ASM_TILE_KVM_H
+
+#ifndef __ASSEMBLER__
+#include <linux/ptrace.h>
+#endif
+
+#include <arch/abi.h>
+
+/*
+ * For Hypervisor syscalls. Note this comes from the hypervisor's syscall.h,
+ * with one small modification: HV_SYS_fence_incoherent is removed.
+ */
+/* Syscall allowed from guest PL bit mask. */
+#define HV_SYS_GUEST_SHIFT 12
+#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT)
+/* downcall_dispatch; this syscall number must be zero */
+#define HV_SYS_downcall_dispatch 0
+/* install_context */
+#define HV_SYS_install_context 1
+/* sysconf */
+#define HV_SYS_sysconf 2
+/* get_rtc */
+#define HV_SYS_get_rtc 3
+/* set_rtc */
+#define HV_SYS_set_rtc 4
+/* flush_asid */
+#define HV_SYS_flush_asid 5
+/* flush_page */
+#define HV_SYS_flush_page 6
+/* flush_pages */
+#define HV_SYS_flush_pages 7
+/* restart */
+#define HV_SYS_restart 8
+/* halt */
+#define HV_SYS_halt 9
+/* power_off */
+#define HV_SYS_power_off 10
+/* inquire_physical */
+#define HV_SYS_inquire_physical 11
+/* inquire_memory_controller */
+#define HV_SYS_inquire_memory_controller 12
+/* inquire_virtual */
+#define HV_SYS_inquire_virtual 13
+/* inquire_asid */
+#define HV_SYS_inquire_asid 14
+/* console_read_if_ready */
+#define HV_SYS_console_read_if_ready 15
+/* console_write */
+#define HV_SYS_console_write 16
+/* init */
+#define HV_SYS_init 17
+/* inquire_topology */
+#define HV_SYS_inquire_topology 18
+/* fs_findfile */
+#define HV_SYS_fs_findfile 19
+/* fs_fstat */
+#define HV_SYS_fs_fstat 20
+/* fs_pread */
+#define HV_SYS_fs_pread 21
+/* physaddr_read64 */
+#define HV_SYS_physaddr_read64 22
+/* physaddr_write64 */
+#define HV_SYS_physaddr_write64 23
+/* get_command_line */
+#define HV_SYS_get_command_line 24
+/* set_caching */
+#define HV_SYS_set_caching 25
+/* bzero_page */
+#define HV_SYS_bzero_page 26
+/* register_message_state */
+#define HV_SYS_register_message_state 27
+/* send_message */
+#define HV_SYS_send_message 28
+/* receive_message */
+#define HV_SYS_receive_message 29
+/* inquire_context */
+#define HV_SYS_inquire_context 30
+/* start_all_tiles */
+#define HV_SYS_start_all_tiles 31
+/* dev_open */
+#define HV_SYS_dev_open 32
+/* dev_close */
+#define HV_SYS_dev_close 33
+/* dev_pread */
+#define HV_SYS_dev_pread 34
+/* dev_pwrite */
+#define HV_SYS_dev_pwrite 35
+/* dev_poll */
+#define HV_SYS_dev_poll 36
+/* dev_poll_cancel */
+#define HV_SYS_dev_poll_cancel 37
+/* dev_preada */
+#define HV_SYS_dev_preada 38
+/* dev_pwritea */
+#define HV_SYS_dev_pwritea 39
+/* flush_remote */
+#define HV_SYS_flush_remote 40
+/* console_putc */
+#define HV_SYS_console_putc 41
+/* inquire_tiles */
+#define HV_SYS_inquire_tiles 42
+/* confstr */
+#define HV_SYS_confstr 43
+/* reexec */
+#define HV_SYS_reexec 44
+/* set_command_line */
+#define HV_SYS_set_command_line 45
+
+/* store_mapping */
+#define HV_SYS_store_mapping 52
+/* inquire_realpa */
+#define HV_SYS_inquire_realpa 53
+/* flush_all */
+#define HV_SYS_flush_all 54
+/* get_ipi_pte */
+#define HV_SYS_get_ipi_pte 55
+/* set_pte_super_shift */
+#define HV_SYS_set_pte_super_shift 56
+/* set_speed */
+#define HV_SYS_set_speed 57
+/* install_virt_context */
+#define HV_SYS_install_virt_context 58
+/* inquire_virt_context */
+#define HV_SYS_inquire_virt_context 59
+/* install_guest_context */
+#define HV_SYS_install_guest_context 60
+/* inquire_guest_context */
+#define HV_SYS_inquire_guest_context 61
+
+/*
+ * Number of hypercall (from guest os to host os) other than hv_*().
+ * We leave the previous 128 entries to the usual hv_*() calls
+ * as defined in hypervisor.h.
+ */
+#define KVM_OTHER_HCALL 128
+
+/* Hypercall index for virtio. */
+#define KVM_HCALL_virtio 128
+
+/* One greater than the maximum hypercall number. */
+#define KVM_NUM_HCALLS 256
+
+#ifndef __ASSEMBLER__
+
+struct kvm_regs {
+ struct pt_regs regs;
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#ifndef __KERNEL__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
+#endif
+
+#define HCALL_DEFS \
+ /* For hv_*() */ \
+ KVM_EMULATE(init) \
+ NO_EMULATE(install_context) \
+ KVM_EMULATE(sysconf) \
+ KVM_EMULATE(get_rtc) \
+ KVM_EMULATE(set_rtc) \
+ NO_EMULATE(flush_asid) \
+ NO_EMULATE(flush_page) \
+ NO_EMULATE(flush_pages) \
+ USER_EMULATE(restart) \
+ USER_EMULATE(halt) \
+ USER_EMULATE(power_off) \
+ USER_EMULATE(inquire_physical) \
+ USER_EMULATE(inquire_memory_controller) \
+ KVM_EMULATE(inquire_virtual) \
+ KVM_EMULATE(inquire_asid) \
+ NO_EMULATE(console_read_if_ready) \
+ NO_EMULATE(console_write) \
+ NO_EMULATE(downcall_dispatch) \
+ KVM_EMULATE(inquire_topology) \
+ USER_EMULATE(fs_findfile) \
+ USER_EMULATE(fs_fstat) \
+ USER_EMULATE(fs_pread) \
+ KVM_EMULATE(physaddr_read64) \
+ KVM_EMULATE(physaddr_write64) \
+ USER_EMULATE(get_command_line) \
+ USER_EMULATE(set_caching) \
+ NO_EMULATE(bzero_page) \
+ KVM_EMULATE(register_message_state) \
+ KVM_EMULATE(send_message) \
+ KVM_EMULATE(receive_message) \
+ KVM_EMULATE(inquire_context) \
+ KVM_EMULATE(start_all_tiles) \
+ USER_EMULATE(dev_open) \
+ USER_EMULATE(dev_close) \
+ USER_EMULATE(dev_pread) \
+ USER_EMULATE(dev_pwrite) \
+ USER_EMULATE(dev_poll) \
+ USER_EMULATE(dev_poll_cancel) \
+ USER_EMULATE(dev_preada) \
+ USER_EMULATE(dev_pwritea) \
+ USER_EMULATE(flush_remote) \
+ NO_EMULATE(console_putc) \
+ KVM_EMULATE(inquire_tiles) \
+ KVM_EMULATE(confstr) \
+ USER_EMULATE(reexec) \
+ USER_EMULATE(set_command_line) \
+ USER_EMULATE(store_mapping) \
+ NO_EMULATE(inquire_realpa) \
+ NO_EMULATE(flush_all) \
+ KVM_EMULATE(get_ipi_pte) \
+ KVM_EMULATE(set_pte_super_shift) \
+ KVM_EMULATE(set_speed) \
+ /* For others */ \
+ USER_HCALL(virtio)
+
+#endif
+
+#endif /* _UAPI_ASM_TILE_KVM_H */
diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
new file mode 100644
index 0000000..d94f535
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm_virtio.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
+#define _UAPI_ASM_TILE_KVM_VIRTIO_H
+
+#include <linux/types.h>
+
+#define KVM_VIRTIO_UNKNOWN 0
+#define KVM_VIRTIO_NOTIFY 1
+#define KVM_VIRTIO_RESET 2
+#define KVM_VIRTIO_SET_STATUS 3
+
+struct kvm_device_desc {
+ /* The device type: console, network, disk etc. Type 0 terminates. */
+ __u8 type;
+ /* The number of virtqueues (first in config array) */
+ __u8 num_vq;
+ /*
+ * The number of bytes of feature bits. Multiply by 2: one for host
+ * features and one for Guest acknowledgements.
+ */
+ __u8 feature_len;
+ /* The number of bytes of the config array after virtqueues. */
+ __u8 config_len;
+ /* A status byte, written by the Guest. */
+ __u8 status;
+ __u64 config[0];
+};
+
+struct kvm_vqinfo {
+ /* Pointer to the information contained in the device config. */
+ struct kvm_vqconfig *config;
+ /* The address where we mapped the virtio ring, so we can unmap it. */
+ void *pages;
+};
+
+struct kvm_vqconfig {
+ /* The physical address of the virtio ring */
+ __u64 pa;
+ /* The number of entries in the virtio_ring */
+ __u64 num;
+ /* The interrupt we get when something happens. Set by the guest. */
+ __u32 irq;
+
+};
+
+
+#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b7c8b5e..b638d3e 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o
obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o
obj-y += vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 97ea6ac..0a04a16 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -20,6 +20,9 @@
#include <linux/hardirq.h>
#include <linux/ptrace.h>
#include <hv/hypervisor.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
/* Check for compatible compiler early in the build. */
#ifdef CONFIG_TILEGX
@@ -68,6 +71,10 @@ void foo(void)
DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
offsetof(struct thread_info, unalign_jit_tmp));
#endif
+#ifdef CONFIG_KVM
+ DEFINE(THREAD_INFO_VCPU_OFFSET,
+ offsetof(struct thread_info, vcpu));
+#endif
DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
offsetof(struct task_struct, thread.ksp));
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00..0393689 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -18,11 +18,27 @@
#include <linux/string.h>
#include <linux/irqflags.h>
#include <linux/printk.h>
+#ifdef CONFIG_KVM_GUEST
+#include <linux/virtio_console.h>
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+#endif
#include <asm/setup.h>
#include <hv/hypervisor.h>
+
static void early_hv_write(struct console *con, const char *s, unsigned n)
{
+#ifdef CONFIG_KVM_GUEST
+ char buf[512];
+
+ if (n > sizeof(buf) - 1)
+ n = sizeof(buf) - 1;
+ memcpy(buf, s, n);
+ buf[n] = '\0';
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
+#else
tile_console_write(s, n);
/*
@@ -32,6 +48,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n)
*/
if (n && s[n-1] == '\n')
tile_console_write("\r", 1);
+#endif
}
static struct console early_hv_console = {
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index f3f17b0..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
.set addr, addr + PGDIR_SIZE
.endr
- /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
- PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+ /* The true text VAs are mapped as VA = PA + MEM_SV_START */
+ PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
(1 << (HV_PTE_INDEX_EXECUTABLE - 32))
.org swapper_pg_dir + PGDIR_SIZE
END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 652b814..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -135,9 +135,9 @@ ENTRY(_start)
1:
/* Install the interrupt base. */
- moveli r0, hw2_last(MEM_SV_START)
- shl16insli r0, r0, hw1(MEM_SV_START)
- shl16insli r0, r0, hw0(MEM_SV_START)
+ moveli r0, hw2_last(intrpt_start)
+ shl16insli r0, r0, hw1(intrpt_start)
+ shl16insli r0, r0, hw0(intrpt_start)
mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
/* Get our processor number and save it away in SAVE_K_0. */
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
index 16576c6..2914a9e 100644
--- a/arch/tile/kernel/hvglue.S
+++ b/arch/tile/kernel/hvglue.S
@@ -71,5 +71,11 @@ gensym hv_flush_all, 0x6e0, 32
gensym hv_get_ipi_pte, 0x700, 32
gensym hv_set_pte_super_shift, 0x720, 32
gensym hv_set_speed, 0x740, 32
+gensym hv_install_virt_context, 0x760, 32
+gensym hv_inquire_virt_context, 0x780, 32
+gensym hv_install_guest_context, 0x7a0, 32
+gensym hv_inquire_guest_context, 0x7c0, 32
gensym hv_console_set_ipi, 0x7e0, 32
-gensym hv_glue_internals, 0x800, 30720
+gensym hv_glue_internals, 0x800, 2048
+gensym hcall_virtio, 0x1000, 32
+gensym hv_hcall_internals, 0x1020, 28640
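(Note that the hypercall-area arithmetic still works out: 0x800 + 2048 = 0x1000,
where the new hcall_virtio slot begins, and 0x1020 + 28640 = 0x8000, the same
end address as the old 0x800 + 30720 layout.)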
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
index 16ef6c1..3b15c76 100644
--- a/arch/tile/kernel/hvglue_trace.c
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -75,6 +75,10 @@
#define hv_get_ipi_pte _hv_get_ipi_pte
#define hv_set_pte_super_shift _hv_set_pte_super_shift
#define hv_set_speed _hv_set_speed
+#define hv_install_virt_context _hv_install_virt_context
+#define hv_inquire_virt_context _hv_inquire_virt_context
+#define hv_install_guest_context _hv_install_guest_context
+#define hv_inquire_guest_context _hv_inquire_guest_context
#define hv_console_set_ipi _hv_console_set_ipi
#include <hv/hypervisor.h>
#undef hv_init
@@ -135,6 +139,10 @@
#undef hv_get_ipi_pte
#undef hv_set_pte_super_shift
#undef hv_set_speed
+#undef hv_install_virt_context
+#undef hv_inquire_virt_context
+#undef hv_install_guest_context
+#undef hv_inquire_guest_context
#undef hv_console_set_ipi
/*
@@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
unsigned long, flags)
HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP0(HV_Context, hv_inquire_virt_context)
+HV_WRAP0(HV_Context, hv_inquire_guest_context)
HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f3d26f4..2ce69a5 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -353,7 +353,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -806,7 +806,7 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnz r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnz r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
lw r29, r29
@@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
seq r27, r27, r28
}
{
- bbns r27, .Lrestore_all
+ bbns r27, restore_all
addi r28, r28, 8
}
sw r29, r28
- j .Lrestore_all
+ j restore_all
.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
auli r1, r1, ha16(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- bzt r1, .Lrestore_all
+ bzt r1, restore_all
/*
* Make sure we have all the registers saved for signal
@@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
* profile interrupt will actually disable interrupts in both SPRs
* before returning, which is OK.)
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
lw r0, r0
@@ -1890,8 +1892,8 @@ int_unalign:
push_extra_callee_saves r0
j do_trap
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 30d2d02..54ae76b 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -29,11 +29,25 @@
#include <arch/abi.h>
#include <arch/interrupts.h>
#include <arch/spr_def.h>
+#include <arch/opcode.h>
+#ifdef CONFIG_KVM
+#include <asm/kvm_host.h>
+#endif
#define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
#define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set). Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
.macro push_reg reg, ptr=sp, delta=-8
{
@@ -302,7 +316,7 @@ intvec_\vecname:
mtspr SPR_SYSTEM_SAVE_K_1, r0
mfspr r0, SPR_EX_CONTEXT_K_1
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r0, r0)
.ifc \vecnum, INT_DOUBLE_FAULT
/*
@@ -340,10 +354,6 @@ intvec_\vecname:
*
* Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
* any path that turns into a downcall to one of our TLB handlers.
- *
- * FIXME: if we end up never using this path, perhaps we should
- * prevent the hypervisor from generating downcalls in this case.
- * The advantage of getting a downcall is we can panic in Linux.
*/
mfspr r0, SPR_SYSTEM_SAVE_K_2
{
@@ -483,6 +493,10 @@ intvec_\vecname:
mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
.else
+ .ifc \c_routine, kvm_vpgtable_miss
+ mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
+ mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
+ .else
.ifc \vecnum, INT_ILL_TRANS
mfspr r2, ILL_VA_PC
.else
@@ -505,6 +519,7 @@ intvec_\vecname:
.endif
.endif
.endif
+ .endif
/* Put function pointer in r0 */
moveli r0, hw2_last(\c_routine)
shl16insli r0, r0, hw1(\c_routine)
@@ -518,7 +533,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -634,24 +649,25 @@ intvec_\vecname:
/*
* If we will be returning to the kernel, we will need to
* reset the interrupt masks to the state they had before.
- * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+ * Set DISABLE_IRQ in flags iff we came from kernel pl with
+ * irqs disabled.
*/
- mfspr r32, SPR_EX_CONTEXT_K_1
+ mfspr r22, SPR_EX_CONTEXT_K_1
{
- andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r22, r22)
PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
}
- beqzt r32, 1f /* zero if from user space */
- IRQS_DISABLED(r32) /* zero if irqs enabled */
+ beqzt r22, 1f /* zero if from user space */
+ IRQS_DISABLED(r22) /* zero if irqs enabled */
#if PT_FLAGS_DISABLE_IRQ != 1
# error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
#endif
1:
.ifnc \function,handle_syscall
/* Record the fact that we saved the caller-save registers above. */
- ori r32, r32, PT_FLAGS_CALLER_SAVES
+ ori r22, r22, PT_FLAGS_CALLER_SAVES
.endif
- st r21, r32
+ st r21, r22
/*
* we've captured enough state to the stack (including in
@@ -691,12 +707,29 @@ intvec_\vecname:
move tp, zero
#endif
+ /*
+ * Prepare the first 256 stack bytes to be rapidly accessible
+ * without having to fetch the background data.
+ */
+ addi r52, sp, -64
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ wh64 r52
+
#ifdef __COLLECT_LINKER_FEEDBACK__
/*
* Notify the feedback routines that we were in the
- * appropriate fixed interrupt vector area. Note that we
- * still have ICS set at this point, so we can't invoke any
- * atomic operations or we will panic. The feedback
+ * appropriate fixed interrupt vector area. The feedback
* routines internally preserve r0..r10 and r30 up.
*/
.ifnc \function,handle_syscall
@@ -715,23 +748,15 @@ intvec_\vecname:
#endif
/*
- * Prepare the first 256 stack bytes to be rapidly accessible
- * without having to fetch the background data.
+	 * Stash any interrupt state in r30..r33 for now.
+	 * This makes it easier to call C code in the paths that follow.
+	 * We don't need to do this on the syscall path since the
+	 * registers are reloaded from the stack instead.
*/
- addi r52, sp, -64
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- wh64 r52
+ .ifnc \function,handle_syscall
+ { move r30, r0; move r31, r1 }
+ { move r32, r2; move r33, r3 }
+ .endif
#ifdef CONFIG_TRACE_IRQFLAGS
.ifnc \function,handle_nmi
@@ -742,17 +767,8 @@ intvec_\vecname:
* For syscalls, we already have the register state saved away
* on the stack, so we don't bother to do any register saves here,
* and later we pop the registers back off the kernel stack.
- * For interrupt handlers, save r0-r3 in callee-saved registers.
*/
- .ifnc \function,handle_syscall
- { move r30, r0; move r31, r1 }
- { move r32, r2; move r33, r3 }
- .endif
TRACE_IRQS_OFF
- .ifnc \function,handle_syscall
- { move r0, r30; move r1, r31 }
- { move r2, r32; move r3, r33 }
- .endif
.endif
#endif
@@ -801,11 +817,11 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnez r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnez r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
ld r29, r29
- andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r29, r29)
{
beqzt r29, .Lresume_userspace
move r29, sp
@@ -817,14 +833,25 @@ STD_ENTRY(interrupt_return)
addli r28, r29, THREAD_INFO_FLAGS_OFFSET
{
ld r28, r28
- addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+ addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
}
{
- andi r28, r28, _TIF_NEED_RESCHED
- ld4s r29, r29
+ andi r27, r28, _TIF_NEED_RESCHED
+ ld4s r26, r26
}
- beqzt r28, 1f
- bnez r29, 1f
+ beqzt r27, 1f
+ bnez r26, 1f
+#ifdef CONFIG_KVM
+ addli r27, r29, THREAD_INFO_VCPU_OFFSET
+ ld r27, r27
+ {
+ beqzt r27, 0f
+ movei r1, KVM_EXIT_AGAIN
+ }
+ push_extra_callee_saves r0
+ j kvm_trigger_vmexit
+0:
+#endif
jal preempt_schedule_irq
FEEDBACK_REENTER(interrupt_return)
1:
@@ -846,11 +873,11 @@ STD_ENTRY(interrupt_return)
cmpeq r27, r27, r28
}
{
- blbc r27, .Lrestore_all
+ blbc r27, restore_all
addi r28, r28, 8
}
st r29, r28
- j .Lrestore_all
+ j restore_all
.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -890,7 +917,7 @@ STD_ENTRY(interrupt_return)
shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- beqzt r1, .Lrestore_all
+ beqzt r1, restore_all
/*
* Make sure we have all the registers saved for signal
@@ -922,14 +949,16 @@ STD_ENTRY(interrupt_return)
* ICS can only be used in very tight chunks of code to avoid
* tripping over various assertions that it is off.
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
ld r0, r0
PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
}
{
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+ IS_KERNEL_EX1(r0, r0)
ld r32, r32
}
bnez r0, 1f
@@ -1000,7 +1029,7 @@ STD_ENTRY(interrupt_return)
pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
{
mtspr SPR_EX_CONTEXT_K_1, lr
- andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(lr, lr)
}
{
mtspr SPR_EX_CONTEXT_K_0, r21
@@ -1450,6 +1479,26 @@ int_unalign:
j do_unaligned
ENDPROC(hand_unalign_slow)
+#ifdef CONFIG_KVM
+/*
+ * Any call path that may lead to a vmexit needs to save the full
+ * callee-save register state, since if we vmexit we don't unwind
+ * the callee-saves from the C function stack frames, and instead
+ * just save away the register state from the interrupt handler as-is
+ * and later reload it directly and call back into the guest.
+ */
+ .macro save_callee_saves_and_tailcall func
+kvm_\func:
+ push_extra_callee_saves r0
+ j kvm_do_\func
+	ENDPROC(kvm_\func)
+ .endm
+
+ save_callee_saves_and_tailcall hypervisor_call
+ save_callee_saves_and_tailcall vpgtable_miss
+ save_callee_saves_and_tailcall vguest_fatal
+#endif
+
/* Fill the return address stack with nonzero entries. */
STD_ENTRY(fill_ra_stack)
{
@@ -1462,13 +1511,57 @@ STD_ENTRY(fill_ra_stack)
4: jrp r0
STD_ENDPROC(fill_ra_stack)
+#ifdef CONFIG_KVM
+/*
+ * Handle the downcall dispatch service. On entry, the client's
+ * system save register 3 holds the original contents of
+ * REG_SYSCALL_NR_NAME, which we need to restore before we iret to
+ * the correct interrupt vector.
+ * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
+ * here, since this is the only interrupt handled this way on GX.
+ */
+handle_downcall_dispatch:
+ /*
+ * If we were called from PL0, jump back to slow path.
+ * We check just the low bit to make sure it's set, since we
+ * can only be called from PL0 or PL1.
+ */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
+ blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0
+
+ /* Set the PC to the downcall interrupt vector, and PL to guest. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
+ addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
+ INT_MESSAGE_RCV_DWNCL << 8
+ {
+ mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
+ movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
+ }
+ mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
+
+ /* Restore REG_SYSCALL_NR_NAME and return to the new vector. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
+ iret
+
+ .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \
+ processing=handle_interrupt
+ .org (\vecnum << 8)
+ /* Need special code for downcall dispatch syscall. */
+ beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
+ __int_hand \vecnum, \vecname, \c_routine, \processing
+ .endm
+
+#endif /* CONFIG_KVM */
+
.macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt
.org (\vecnum << 8)
__int_hand \vecnum, \vecname, \c_routine, \processing
.endm
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
+ .global intrpt_start
+intrpt_start:
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
@@ -1477,6 +1570,11 @@ STD_ENTRY(fill_ra_stack)
#define do_hardwall_trap bad_intr
#endif
+#ifndef CONFIG_KVM
+#define kvm_vpgtable_miss bad_intr
+#define kvm_vguest_fatal bad_intr
+#endif
+
int_hand INT_MEM_ERROR, MEM_ERROR, do_trap
int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
#if CONFIG_KERNEL_PL == 2
@@ -1497,14 +1595,24 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_SWINT_3, SWINT_3, do_trap
int_hand INT_SWINT_2, SWINT_2, do_trap
int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
+#ifdef CONFIG_KVM
+ int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
+#else
int_hand INT_SWINT_0, SWINT_0, do_trap
+#endif
int_hand INT_ILL_TRANS, ILL_TRANS, do_trap
int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault
int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
+#ifndef CONFIG_KVM_GUEST
int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr
+#else
+ int_hand INT_TILE_TIMER, TILE_TIMER, bad_intr
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt
+#endif
int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr
int_hand INT_UDN_TIMER, UDN_TIMER, bad_intr
int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr
@@ -1534,8 +1642,10 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
hv_message_intr
int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
- int_hand INT_I_ASID, I_ASID, bad_intr
- int_hand INT_D_ASID, D_ASID, bad_intr
+ int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
+ kvm_vpgtable_miss
+ int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
+ kvm_vguest_fatal
int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
/* Synthetic interrupt delivered only by the simulator */
diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c
new file mode 100644
index 0000000..c6b6c6a
--- /dev/null
+++ b/arch/tile/kernel/kvm_virtio.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+/* Based on the lguest and s390 virtio implementations. */
+/*
+ * kvm_virtio.c - virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): Christian Borntraeger <[email protected]>
+ */
+
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+
+static void *kvm_devices;
+
+/*
+ * TODO: We do not actually use PCI virtio here; we use this value
+ * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
+ * Maybe we should change both qemu and Linux to a generic definition.
+ * We should also check later whether the alignment value (4096, i.e. the
+ * default x86 page size) affects performance.
+ */
+#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN
+#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
+
+/*
+ * memory layout: (Total: PAGE_SIZE)
+ * <device 0>
+ * - kvm device descriptor
+ * struct kvm_device_desc
+ * - vqueue configuration (totally desc->num_vq)
+ * struct kvm_vqconfig
+ * ......
+ * struct kvm_vqconfig
+ * - feature bits (size: desc->feature_len * 2)
+ * - config space (size: desc->config_len)
+ * <device 1>
+ * ......
+ */
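+/*
+ * For example, a device with num_vq = 2, feature_len = 4 and config_len = 8
+ * occupies sizeof(struct kvm_device_desc) + 2 * sizeof(struct kvm_vqconfig)
+ * + 4 * 2 + 8 bytes of the page; desc_size() below computes exactly this.
+ */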
+static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
+{
+ return (struct kvm_vqconfig *)(desc + 1);
+}
+
+static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
+{
+ return (u8 *)(kvm_vq_config(desc) + desc->num_vq);
+}
+
+static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
+{
+ return kvm_vq_features(desc) + desc->feature_len * 2;
+}
+
+/*
+ * The total size of the config page used by this device (incl. desc)
+ */
+static unsigned desc_size(const struct kvm_device_desc *desc)
+{
+ return sizeof(*desc)
+ + desc->num_vq * sizeof(struct kvm_vqconfig)
+ + desc->feature_len * 2
+ + desc->config_len;
+}
+
+/* This gets the device's feature bits. */
+static u32 kvm_get_features(struct virtio_device *vdev)
+{
+ unsigned int i;
+ u32 features = 0;
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+ u8 *in_features = kvm_vq_features(desc);
+
+ for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+ if (in_features[i / 8] & (1 << (i % 8)))
+ features |= (1 << i);
+ return features;
+}
+
+static void kvm_finalize_features(struct virtio_device *vdev)
+{
+ unsigned int i, bits;
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+ /* Second half of bitmap is features we accept. */
+ u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ memset(out_features, 0, desc->feature_len);
+ bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+ for (i = 0; i < bits; i++) {
+ if (test_bit(i, vdev->features))
+ out_features[i / 8] |= (1 << (i % 8));
+ }
+}
+
+/*
+ * Reading and writing elements in config space
+ */
+static void kvm_get(struct virtio_device *vdev, unsigned int offset,
+ void *buf, unsigned len)
+{
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+ BUG_ON(offset + len > desc->config_len);
+ memcpy(buf, kvm_vq_configspace(desc) + offset, len);
+}
+
+static void kvm_set(struct virtio_device *vdev, unsigned int offset,
+ const void *buf, unsigned len)
+{
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+ BUG_ON(offset + len > desc->config_len);
+ memcpy(kvm_vq_configspace(desc) + offset, buf, len);
+}
+
+/*
+ * The operations to get and set the status word just access
+ * the status field of the device descriptor. set_status will also
+ * make a hypercall to the host to tell it about status changes.
+ */
+static u8 kvm_get_status(struct virtio_device *vdev)
+{
+ return to_kvmdev(vdev)->desc->status;
+}
+
+static void kvm_set_status(struct virtio_device *vdev, u8 status)
+{
+ BUG_ON(!status);
+ to_kvmdev(vdev)->desc->status = status;
+ hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
+ * descriptor address. The Host will zero the status and all the
+ * features.
+ */
+static void kvm_reset(struct virtio_device *vdev)
+{
+ hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * When the virtio_ring code wants to notify the Host, it calls us here and we
+ * make a hypercall. We hand the address of the virtqueue so the Host
+ * knows which virtqueue we're talking about.
+ */
+static void kvm_notify(struct virtqueue *vq)
+{
+ struct kvm_vqinfo *vqi = vq->priv;
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa);
+}
+
+/*
+ * Must set some caching mode to keep set_pte() happy.
+ * It doesn't matter what we choose, because the PFN
+ * is illegal, so we're going to take a page fault anyway.
+ */
+static inline pgprot_t io_prot(void)
+{
+ return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
+}
+
+/*
+ * This routine finds the first virtqueue described in the configuration of
+ * this device and sets it up.
+ */
+static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
+ unsigned index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ struct kvm_vqinfo *vqi;
+ struct kvm_vqconfig *config;
+ struct virtqueue *vq;
+ long irq;
+ int err = -EINVAL;
+
+ if (index >= kdev->desc->num_vq)
+ return ERR_PTR(-ENOENT);
+
+ vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
+ if (!vqi)
+ return ERR_PTR(-ENOMEM);
+
+ config = kvm_vq_config(kdev->desc)+index;
+
+ vqi->config = config;
+ vqi->pages = generic_remap_prot(config->pa,
+ vring_size(config->num,
+ KVM_TILE_VIRTIO_RING_ALIGN),
+ 0, io_prot());
+ if (!vqi->pages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
+ vdev, 0, vqi->pages,
+ kvm_notify, callback, name);
+ if (!vq) {
+ err = -ENOMEM;
+ goto unmap;
+ }
+
+ /*
+	 * Trigger the IPI interrupt in software.
+	 * TODO: we do not really need one irq per vq; this is a bit wasteful.
+ */
+ irq = create_irq();
+ if (irq < 0) {
+ err = -ENXIO;
+ goto del_virtqueue;
+ }
+
+ tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
+
+ if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
+ err = -ENXIO;
+ destroy_irq(irq);
+ goto del_virtqueue;
+ }
+
+ config->irq = irq;
+
+ vq->priv = vqi;
+ return vq;
+
+del_virtqueue:
+ vring_del_virtqueue(vq);
+unmap:
+ vunmap(vqi->pages);
+out:
+ return ERR_PTR(err);
+}
+
+static void kvm_del_vq(struct virtqueue *vq)
+{
+ struct kvm_vqinfo *vqi = vq->priv;
+
+ vring_del_virtqueue(vq);
+ vunmap(vqi->pages);
+ kfree(vqi);
+}
+
+static void kvm_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ kvm_del_vq(vq);
+}
+
+static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[])
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ int i;
+
+ /* We must have this many virtqueues. */
+ if (nvqs > kdev->desc->num_vq)
+ return -ENOENT;
+
+ for (i = 0; i < nvqs; ++i) {
+ vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
+ if (IS_ERR(vqs[i]))
+ goto error;
+ }
+ return 0;
+
+error:
+ kvm_del_vqs(vdev);
+ return PTR_ERR(vqs[i]);
+}
+
+/*
+ * The config ops structure as defined by virtio config
+ */
+static struct virtio_config_ops kvm_vq_config_ops = {
+ .get_features = kvm_get_features,
+ .finalize_features = kvm_finalize_features,
+ .get = kvm_get,
+ .set = kvm_set,
+ .get_status = kvm_get_status,
+ .set_status = kvm_set_status,
+ .reset = kvm_reset,
+ .find_vqs = kvm_find_vqs,
+ .del_vqs = kvm_del_vqs,
+};
+
+/*
+ * The root device for the kvm virtio devices.
+ * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
+ */
+static struct device *kvm_root;
+
+/*
+ * Add a new device and register it with virtio;
+ * the appropriate driver is then loaded by the device model.
+ */
+static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
+{
+ struct kvm_device *kdev;
+
+ kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+ if (!kdev) {
+ pr_emerg("Cannot allocate kvm dev %u type %u\n",
+ offset, d->type);
+ return;
+ }
+
+ kdev->vdev.dev.parent = kvm_root;
+ kdev->vdev.id.device = d->type;
+ kdev->vdev.config = &kvm_vq_config_ops;
+ kdev->desc = d;
+ kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
+
+ if (register_virtio_device(&kdev->vdev) != 0) {
+ pr_err("Failed to register kvm device %u type %u\n",
+ offset, d->type);
+ kfree(kdev);
+ }
+}
+
+/*
+ * scan_devices() simply iterates through the device page.
+ * Type 0 is reserved to mean "end of devices".
+ */
+static void scan_devices(void)
+{
+ unsigned int i;
+ struct kvm_device_desc *d;
+
+ for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
+ d = kvm_devices + i;
+
+ if (d->type == 0)
+ break;
+
+ add_kvm_device(d, i);
+ }
+}
+
+/*
+ * Init function for virtio.
+ * Devices are in a single page above the top of "normal" memory.
+ */
+static int __init kvm_devices_init(void)
+{
+ int rc = -ENOMEM;
+
+ kvm_root = root_device_register("kvm_tile");
+ if (IS_ERR(kvm_root)) {
+ rc = PTR_ERR(kvm_root);
+ pr_err("Could not register kvm_tile root device");
+ return rc;
+ }
+
+ kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
+ 0, io_prot());
+ if (!kvm_devices) {
+ kvm_devices = NULL;
+ root_device_unregister(kvm_root);
+ return rc;
+ }
+
+ scan_devices();
+ return 0;
+}
+
+/* code for early console output with virtio_console */
+static __init int early_put_chars(u32 vtermno, const char *buf, int len)
+{
+ char scratch[512];
+
+ if (len > sizeof(scratch) - 1)
+ len = sizeof(scratch) - 1;
+ scratch[len] = '\0';
+ memcpy(scratch, buf, len);
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
+
+ return len;
+}
+
+static int __init tile_virtio_console_init(void)
+{
+ return virtio_cons_early_init(early_put_chars);
+}
+console_initcall(tile_virtio_console_init);
+
+/*
+ * We do this after core stuff, but before the drivers.
+ */
+postcore_initcall(kvm_devices_init);
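As a cross-reference for the desc_pa values passed with the SET_STATUS and
RESET hypercalls above, here is a hypothetical VMM-side helper (not part of
this patch) showing the intended translation back to the VMM's own mapping
of the device page; the names and the mapping scheme are assumptions for
illustration only.

	#include <stdint.h>
	#include <asm/kvm_virtio.h>	/* struct kvm_device_desc */

	/*
	 * "page_base_pa" is the guest-physical address of the device page
	 * (PFN_PHYS(max_pfn) from the guest's point of view) and
	 * "page_base_va" is where the VMM has that same page mapped.
	 */
	static struct kvm_device_desc *desc_from_pa(void *page_base_va,
						    uint64_t page_base_pa,
						    uint64_t desc_pa)
	{
		return (struct kvm_device_desc *)
			((char *)page_base_va + (desc_pa - page_base_pa));
	}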
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 44cdc4a..2629ff1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -27,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/tracehook.h>
#include <linux/signal.h>
+#include <linux/kvm_host.h>
#include <asm/stack.h>
#include <asm/switch_to.h>
#include <asm/homecache.h>
@@ -247,11 +248,13 @@ struct task_struct *validate_current(void)
/* Take and return the pointer to the previous task, for schedule_tail(). */
struct task_struct *sim_notify_fork(struct task_struct *prev)
{
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
struct task_struct *tsk = current;
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT |
(tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS));
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK |
(tsk->pid << _SIM_CONTROL_OPERATOR_BITS));
+#endif
return prev;
}
@@ -450,6 +453,11 @@ void _prepare_arch_switch(struct task_struct *next)
struct task_struct *__sched _switch_to(struct task_struct *prev,
struct task_struct *next)
{
+#ifdef CONFIG_KVM
+ /* vmexit is needed before context switch. */
+ BUG_ON(task_thread_info(prev)->vcpu);
+#endif
+
/* DMA state is already saved; save off other arch state. */
save_arch_state(&prev->thread);
@@ -519,6 +527,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
/* Enable interrupts; they are disabled again on return to caller. */
local_irq_enable();
+#ifdef CONFIG_KVM
+ /*
+ * Some work requires us to exit the VM first. Typically this
+ * allows the process running the VM to respond to the work
+ * (e.g. a signal), or allows the VM mechanism to latch
+ * modified host state (e.g. a "hypervisor" message sent to a
+ * different vcpu). It also means that if we are considering
+ * calling schedule(), we exit the VM first, so we never have
+ * to worry about context-switching into a VM.
+ */
+ if (current_thread_info()->vcpu) {
+ u32 do_exit = thread_info_flags &
+ (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
+
+ if (thread_info_flags & _TIF_VIRT_EXIT)
+ clear_thread_flag(TIF_VIRT_EXIT);
+ if (do_exit) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
+ /*NORETURN*/
+ }
+ }
+#endif
+
if (thread_info_flags & _TIF_NEED_RESCHED) {
schedule();
return 1;
@@ -538,11 +569,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
tracehook_notify_resume(regs);
return 1;
}
- if (thread_info_flags & _TIF_SINGLESTEP) {
+
+ /* Handle a few flags here that stay set. */
+ if (thread_info_flags & _TIF_SINGLESTEP)
single_step_once(regs);
- return 0;
- }
- panic("work_pending: bad flags %#x\n", thread_info_flags);
+
+ return 0;
}
unsigned long get_wchan(struct task_struct *p)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..02bc446 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
addi sp, sp, -8
/* we now have a stack (whether we need one or not) */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r40, hw2_last(hv_console_putc)
shl16insli r40, r40, hw1(hv_console_putc)
shl16insli r40, r40, hw0(hv_console_putc)
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, 'r'
jalr r40
@@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
/* we should not get here */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, '?'
jalr r40
moveli r0, '\n'
jalr r40
+#endif
j .Lhalt
@@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
j .Lloop
-.Lerr: moveli r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+ moveli r0, 'e'
jalr r40
moveli r0, 'r'
jalr r40
@@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
jalr r40
moveli r0, '\n'
jalr r40
+#endif
.Lhalt:
moveli r41, hw2_last(hv_halt)
shl16insli r41, r41, hw1(hv_halt)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 774e819..2352a81 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
/*
* Determine for each controller where its lowmem is mapped and how much of
* it is mapped there. On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
* start our data mappings higher up, but for now we don't bother, to avoid
* additional confusion.
*
@@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
* SPRs, as well as the interrupt mask.
*/
__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
+
+#ifdef CONFIG_KVM
+ /*
+ * If we launch a guest kernel, it will need some interrupts
+ * that otherwise are not used by the host or by userspace.
+ * Set them to MPL 1 now and leave them alone going forward;
+ * they are masked in the host so will never fire there anyway,
+ * and we mask them at PL1 as we exit the guest.
+ */
__insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
+ __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
+#endif
/* Initialize IRQ support for this cpu. */
setup_irq_regs();
@@ -1242,7 +1255,7 @@ static void __init validate_va(void)
#ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
/*
* Similarly, make sure we're only using allowed VAs.
- * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+ * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
* and 0 .. KERNEL_HIGH_VADDR.
* In addition, make sure we CAN'T use the end of memory, since
* we use the last chunk of each pgd for the pgd_list.
@@ -1257,7 +1270,7 @@ static void __init validate_va(void)
if (range.size == 0)
break;
if (range.start <= MEM_USER_INTRPT &&
- range.start + range.size >= MEM_HV_INTRPT)
+ range.start + range.size >= MEM_HV_START)
user_kernel_ok = 1;
if (range.start == 0)
max_va = range.size;
@@ -1693,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
static int __init request_standard_resources(void)
{
int i;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if defined(CONFIG_PCI) && !defined(__tilegx__)
insert_non_bus_resource();
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 0ae1c59..62b3ba9 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -223,30 +223,34 @@ void __init ipi_init(void)
#if CHIP_HAS_IPI()
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- WARN_ON(cpu_is_offline(cpu));
-
/*
* We just want to do an MMIO store. The traditional writeq()
* functions aren't really correct here, since they're always
* directed at the PCI shim. For now, just do a raw store,
- * casting away the __iomem attribute.
+	 * casting away the __iomem attribute. We do the store as a
+	 * single asm() instruction so that in the KVM case (when vcpus
+	 * are not bound to physical cpus) we can force a step over it,
+	 * rather than requiring that the store be valid to issue directly.
*/
- ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
+ unsigned long *addr =
+ &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
+ asm volatile("st %0, zero" :: "r" (addr));
}
#else
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- HV_Coord coord;
-
- WARN_ON(cpu_is_offline(cpu));
-
- coord.y = cpu_y(cpu);
- coord.x = cpu_x(cpu);
+ HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
hv_trigger_ipi(coord, IRQ_RESCHEDULE);
}
#endif /* CHIP_HAS_IPI() */
+
+void smp_send_reschedule(int cpu)
+{
+ WARN_ON(cpu_is_offline(cpu));
+ __smp_send_reschedule(cpu);
+}
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 24fd223..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
p->sp >= sp) {
if (kbt->verbose)
pr_err(" <%s while in kernel mode>\n", fault);
- } else if (EX1_PL(p->ex1) == USER_PL &&
+ } else if (user_mode(p) &&
p->sp < PAGE_OFFSET && p->sp != 0) {
if (kbt->verbose)
pr_err(" <%s while in user mode>\n", fault);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a8..024b978 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
+#ifdef CONFIG_KVM_GUEST
+ return sprintf(page, "KVM\n");
+#else
return sprintf(page, "tilera\n");
+#endif
}
static DEVICE_ATTR(type, 0444, type_show, NULL);
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 3c2dc87..b0b7264 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -117,9 +117,9 @@ void __init time_init(void)
/*
* Define the tile timer clock event device. The timer is driven by
- * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
+ * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down
* counter, plus bit 31, which signifies that the counter has wrapped
- * from zero to (2**31) - 1. The INT_TILE_TIMER interrupt will be
+ * from zero to (2**31) - 1. The INT_[AUX_]TILE_TIMER interrupt will be
* raised as long as bit 31 is set.
*/
@@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks,
struct clock_event_device *evt)
{
BUG_ON(ticks > MAX_TICK);
- __insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
- arch_local_irq_unmask_now(INT_TILE_TIMER);
+ __insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
+ arch_local_irq_unmask_now(INT_LINUX_TIMER);
return 0;
}
@@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
static void tile_timer_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
}
static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
@@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void)
evt->cpumask = cpumask_of(smp_processor_id());
/* Start out with timer not firing. */
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
/*
* Register tile timer. Set min_delta to 1 microsecond, since
@@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
* Mask the timer interrupt here, since we are a oneshot timer
* and there are now by definition no events pending.
*/
- arch_local_irq_mask(INT_TILE_TIMER);
+ arch_local_irq_mask(INT_LINUX_TIMER);
/* Track time spent here in an interrupt context */
irq_enter();
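The SPR_LINUX_TIMER_CONTROL and INT_LINUX_TIMER names used here are
presumably aliases, defined in the timex.h change (not shown in this
excerpt), that pick the AUX tile timer when the kernel is built as a KVM
guest and the normal tile timer otherwise; roughly:

	#ifdef CONFIG_KVM_GUEST
	#define SPR_LINUX_TIMER_CONTROL	SPR_AUX_TILE_TIMER_CONTROL
	#define INT_LINUX_TIMER		INT_AUX_TILE_TIMER
	#else
	#define SPR_LINUX_TIMER_CONTROL	SPR_TILE_TIMER_CONTROL
	#define INT_LINUX_TIMER		INT_TILE_TIMER
	#endif

This matches the intvec_64.S change above, where the guest routes
INT_AUX_TILE_TIMER (rather than INT_TILE_TIMER) to do_timer_interrupt.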
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index f110785..19d465c 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -30,7 +30,7 @@
void __init trap_init(void)
{
- /* Nothing needed here since we link code at .intrpt1 */
+ /* Nothing needed here since we link code at .intrpt */
}
int unaligned_fixup = 1;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index c7ae53d..8b20163 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
#include <hv/hypervisor.h>
/* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
OUTPUT_ARCH(tile)
ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
PHDRS
{
- intrpt1 PT_LOAD ;
+ intrpt PT_LOAD ;
text PT_LOAD ;
data PT_LOAD ;
}
@@ -24,11 +24,11 @@ SECTIONS
#define LOAD_OFFSET TEXT_OFFSET
/* Interrupt vectors */
- .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
+ .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
{
_text = .;
- *(.intrpt1)
- } :intrpt1 =0
+ *(.intrpt)
+ } :intrpt =0
/* Hypervisor call vectors */
. = ALIGN(0x10000);
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 2298cb1..65f7f9d 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -27,9 +27,6 @@ config KVM
This module provides access to the hardware capabilities through
a character device node named /dev/kvm.
- To compile this as a module, choose M here: the module
- will be called kvm.
-
If unsure, say N.
source drivers/vhost/Kconfig
diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
new file mode 100644
index 0000000..2c3d206
--- /dev/null
+++ b/arch/tile/kvm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
+
+kvm-y += kvm-tile.o
+kvm-y += entry.o
+
+obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
new file mode 100644
index 0000000..07aa3a6
--- /dev/null
+++ b/arch/tile/kvm/entry.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/switch_to.h>
+#include <asm/processor.h>
+#include <arch/spr_def.h>
+#include <arch/abi.h>
+
+#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8)
+#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
+#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
+#define FOR_EACH_CALLEE_SAVED_REG(f) \
+ f(r30); f(r31); \
+ f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \
+ f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
+ f(r48); f(r49); f(r50); f(r51); f(r52);
+
+/*
+ * Called with interrupts disabled from kvm_tile_run() and is responsible
+ * just for saving the callee-save registers and the stack pointer, then
+ * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
+ * It uses restore_all in intvec_64.S to jump back into the guest.
+ * The kvm_vmexit function below undoes the stack manipulation.
+ */
+STD_ENTRY(kvm_vmresume)
+ /* Do function prolog and save callee-saves on stack. */
+ {
+ move r10, sp
+ st sp, lr
+ }
+ {
+ addli r11, sp, -FRAME_SIZE + 8
+ addli sp, sp, -FRAME_SIZE
+ }
+ {
+ st r11, r10
+ addi r12, sp, 16
+ }
+ FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
+ SAVE_REG(tp)
+ SAVE_REG(lr)
+
+ /* Save frame pointer in thread_info so we can get it back later. */
+ st r1, sp
+
+ /* Set the ksp0 for this core to be below this frame. */
+ mfspr r10, SPR_SYSTEM_SAVE_K_0
+ bfins r10, sp, 0, CPU_SHIFT-1
+ mtspr SPR_SYSTEM_SAVE_K_0, r10
+
+ /* sp points to ABI save area below pt_regs for restore_all. */
+ addli sp, r0, -C_ABI_SAVE_AREA_SIZE
+
+ /* Execute an "interrupt return" to the guest. */
+ {
+ movei r30, 0
+ j restore_all
+ }
+ STD_ENDPROC(kvm_vmresume)
+
+/*
+ * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
+ * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
+ * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller
+ * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
+ */
+STD_ENTRY(kvm_vmexit)
+ {
+ move sp, r0
+ addi r12, r0, 16
+ }
+ FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
+ LOAD_REG(tp)
+ LOAD_REG(lr)
+ {
+ addli sp, sp, FRAME_SIZE
+ jrp lr
+ }
+ STD_ENDPROC(kvm_vmexit)
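At the C level the pairing of these two entry points behaves much like
setjmp/longjmp. The following sketch is purely illustrative; the real
caller is kvm_tile_run() in kvm-tile.c, and the prototype below is
inferred from the register usage above, not copied from this patch.

	/*
	 * Illustrative only; the prototype is a guess from the register
	 * usage above (r0 = guest pt_regs, r1 = where to save the frame
	 * pointer), not a declaration taken from this patch.
	 */
	extern void kvm_vmresume(struct pt_regs *guest_regs,
				 unsigned long *host_sp_save);

	static void example_enter_guest(struct pt_regs *guest_regs,
					unsigned long *host_sp_save)
	{
		/* Interrupts must already be disabled here. */
		kvm_vmresume(guest_regs, host_sp_save);
		/*
		 * We get back here only when some interrupt path calls
		 * kvm_trigger_vmexit(), which unwinds via kvm_vmexit()
		 * using the frame pointer stored at *host_sp_save.
		 */
	}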
diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
new file mode 100644
index 0000000..29b601a
--- /dev/null
+++ b/arch/tile/kvm/kvm-tile.c
@@ -0,0 +1,1585 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <asm/traps.h>
+#include <asm/pgalloc.h>
+#include <hv/hypervisor.h>
+#include <linux/rtc.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+#include <arch/spr_def.h>
+#include <arch/sim.h>
+#include <generated/utsrelease.h>
+
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { NULL }
+};
+
+static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
+{
+ struct mm_struct *mm = kvm->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (kvm->arch.vpgd == NULL)
+ kvm->arch.vpgd = pgd_alloc(kvm->mm);
+ pgd = kvm->arch.vpgd + pgd_index(address);
+ pud = pud_alloc(mm, pgd, address);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ return NULL;
+ return pte_alloc_kernel(pmd, address);
+}
+
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ return 0;
+}
+
+/* FIXME: support huge pages. */
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+{
+ unsigned long gpa, i;
+
+ gpa = mem->guest_phys_addr;
+ for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE)
+ if (get_vpgd_pte(kvm, gpa) == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ enum kvm_mr_change change)
+{
+ unsigned long gpa, address, pfn, i;
+ struct page *page[1];
+ pte_t *ptep, *vptep;
+
+ gpa = mem->guest_phys_addr;
+ address = mem->userspace_addr;
+ for (i = 0; i < mem->memory_size;
+ i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
+ vptep = get_vpgd_pte(kvm, gpa);
+ BUG_ON(vptep == NULL);
+ get_user_pages_fast(address, 1, 1, page);
+ pfn = page_to_pfn(page[0]);
+ ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
+ *vptep = *ptep;
+ }
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_arch_flush_shadow_all(kvm);
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ return 0;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, long irq)
+{
+ if (irq < 0)
+ return -EINVAL;
+
+ set_bit(irq, &vcpu->arch.ipi_events);
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = 0;
+
+ switch (ioctl) {
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof(irq)))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_TILE_RESET_SPR: {
+ /* Initialize guest SPR values */
+ vcpu->arch.timer_control =
+ 1UL << SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT;
+ vcpu->arch.vmexit_cycles = get_cycles();
+ vcpu->arch.INTERRUPT_MASK_1 = -1UL;
+ vcpu->arch.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
+ vcpu->arch.IPI_MASK_1 = -1UL;
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+
+out:
+ return r;
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+ return 0;
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ return 0;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ long r = -EINVAL;
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long page_size;
+ unsigned long gva = tr->linear_address;
+ unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
+ pud_t gpud;
+ pmd_t gpmd;
+ pte_t gpte;
+
+ /* Get guest pgd (aka pud for three-level tables). */
+ gpgd_gpa = vcpu->arch.guest_context.page_table +
+ (sizeof(pgd_t) * pgd_index(gva));
+ if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
+ goto fail;
+ if (!pud_present(gpud))
+ goto fail;
+
+ /* Get guest pmd. */
+ if (pud_huge_page(gpud)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpud))
+ goto fail;
+ gpte = *(pte_t *)&gpud;
+ page_size = PGDIR_SIZE;
+ goto ok;
+ }
+ gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pmd_t) * pmd_index(gva));
+ if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
+ goto fail;
+ if (!pmd_present(gpmd))
+ goto fail;
+
+ /* Get guest pte. */
+ if (pmd_huge_page(gpmd)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpmd))
+ goto fail;
+ gpte = *(pte_t *)&gpmd;
+ page_size = PMD_SIZE;
+ goto ok;
+ }
+ gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pte_t) * pte_index(gva));
+ if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
+ goto fail;
+ if (!pte_present(gpte))
+ goto fail;
+
+ page_size = PAGE_SIZE;
+
+ok:
+ tr->physical_address =
+ PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1));
+ tr->valid = 1;
+ tr->writeable = pte_write(gpte);
+ tr->usermode = pte_user(gpte);
+
+ return 0;
+
+fail:
+ tr->valid = 0;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ regs->regs = vcpu->arch.regs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.regs = regs->regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ return 0;
+}
+
+/*
+ * panic_hv() dumps stack info for both the guest OS and the host OS, and
+ * sets a suitable exit reason so that qemu can terminate the guest process.
+ *
+ * FIXME: should this be KVM_EXIT_EXCEPTION? With KVM_EXIT_EXCEPTION the
+ * current qemu process "hangs" (killable, but Ctrl+C does not work),
+ * so use KVM_EXIT_SHUTDOWN here temporarily.
+ */
+static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
+{
+ char panic_buf[256];
+ struct pt_regs *regs;
+ va_list ap;
+ int i;
+
+ va_start(ap, fmt);
+ vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
+ va_end(ap);
+ pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
+
+ /* Show guest os info */
+ regs = &vcpu->arch.regs;
+ for (i = 0; i < 17; i++)
+ pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
+ i, regs->regs[i], i+18, regs->regs[i+18],
+ i+36, regs->regs[i+36]);
+	pr_err(" r17: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+	       regs->regs[17], regs->regs[35], regs->tp);
+ pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
+ pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n",
+ regs->pc, regs->ex1, regs->faultnum);
+
+ /* Show host os info */
+ pr_err("\nKVM stack in the host:\n");
+ dump_stack();
+
+ /* Shut down the guest os */
+ pr_err("Shutting down guest.\n");
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+/* Copied from virt/kvm/kvm_main.c */
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+ else
+ return len;
+}
+
+static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ void *data, unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ const void *data, unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+/*
+ * The following functions emulate the various hypervisor system
+ * calls (i.e. hv_*()). Return value:
+ * 1 if the host OS can emulate the call completely.
+ * < 0 if an error occurs; qemu will then handle it.
+ * 0 if qemu emulation is needed.
+ * In both the < 0 and the == 0 cases, the exit reason should
+ * be set for qemu handling.
+ */
+
+/* Generic handler for a hypercall that needs user space (QEMU) to handle it. */
+static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ return 0;
+}
+
+/* handler for illegal hypercall */
+static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
+{
+ return panic_hv(vcpu, "Illegal kvm hypercall: %ld",
+ (unsigned long)vcpu->arch.regs.regs[10]);
+}
+
+static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
+{
+ int version = vcpu->arch.regs.regs[0];
+ int chip_num = vcpu->arch.regs.regs[1];
+ int chip_rev_num = vcpu->arch.regs.regs[2];
+ int client_pl = vcpu->arch.regs.regs[3];
+
+ if (client_pl != 1)
+ return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
+ " guests must request PL 1.\n"
+ "Reconfigure your guest with KVM_GUEST set.\n",
+ client_pl);
+
+ if (version != HV_VERSION)
+ return panic_hv(vcpu, "Client built for hv version %d, but"
+ " this hv is version %d\n",
+ version, HV_VERSION);
+
+ if (chip_num != TILE_CHIP)
+ return panic_hv(vcpu, "Client built for chip %d, but this"
+ " hardware is chip %d\n",
+ chip_num, TILE_CHIP);
+
+ if (chip_rev_num != TILE_CHIP_REV)
+ return panic_hv(vcpu, "Client built for chip rev %d, but this"
+ " hardware is chip rev %d\n",
+ chip_rev_num, TILE_CHIP_REV);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+ long rc;
+
+ switch (query) {
+ case HV_SYSCONF_PAGE_SIZE_SMALL:
+ rc = PAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_LARGE:
+ rc = HPAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_VALID_PAGE_SIZES:
+#if PAGE_SHIFT == 16
+ rc = HV_CTX_PG_SM_64K;
+#elif PAGE_SHIFT == 14
+ rc = HV_CTX_PG_SM_16K;
+#else
+# error Fix hv_sysconf emulation for new page size
+#endif
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_JUMBO:
+ rc = 0; /* FIXME add super page support */
+ break;
+
+ case HV_SYSCONF_CPU_SPEED:
+ case HV_SYSCONF_CPU_TEMP:
+ case HV_SYSCONF_BOARD_TEMP:
+ rc = hv_sysconf(query);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.regs.regs[0] = rc;
+ return 1;
+}
+
+static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+ long buflen = vcpu->arch.regs.regs[2];
+ char hvbuf[256];
+ const char *p;
+ long rc;
+
+ switch (query) {
+
+ /* For hardware attributes, just pass to the hypervisor. */
+ case HV_CONFSTR_BOARD_PART_NUM:
+ case HV_CONFSTR_BOARD_SERIAL_NUM:
+ case HV_CONFSTR_CHIP_SERIAL_NUM:
+ case HV_CONFSTR_BOARD_REV:
+ case HV_CONFSTR_CHIP_MODEL:
+ case HV_CONFSTR_BOARD_DESC:
+ case HV_CONFSTR_MEZZ_PART_NUM:
+ case HV_CONFSTR_MEZZ_SERIAL_NUM:
+ case HV_CONFSTR_MEZZ_REV:
+ case HV_CONFSTR_MEZZ_DESC:
+ case HV_CONFSTR_SWITCH_CONTROL:
+ case HV_CONFSTR_CHIP_REV:
+ case HV_CONFSTR_CPUMOD_PART_NUM:
+ case HV_CONFSTR_CPUMOD_SERIAL_NUM:
+ case HV_CONFSTR_CPUMOD_REV:
+ case HV_CONFSTR_CPUMOD_DESC:
+ rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
+ if (rc > sizeof(hvbuf)) {
+ /* Not the best answer, but very unlikely anyway. */
+ rc = sizeof(hvbuf);
+ hvbuf[sizeof(hvbuf)-1] = '\0';
+ }
+ p = hvbuf;
+ break;
+
+ /* For hypervisor version info, just report the kernel version. */
+ case HV_CONFSTR_HV_SW_VER:
+ p = UTS_RELEASE;
+ break;
+ case HV_CONFSTR_HV_CONFIG:
+ case HV_CONFSTR_HV_CONFIG_VER:
+ p = "";
+ break;
+
+ default:
+ rc = HV_EINVAL;
+ goto done;
+ }
+
+ rc = strlen(p) + 1; /* include NUL */
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
+ p, min(rc, buflen)))
+ rc = HV_EFAULT;
+
+done:
+ vcpu->arch.regs.regs[0] = rc;
+ return 1;
+}
+
+static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
+{
+ HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0];
+ struct rtc_time tm;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ rtc_time_to_tm(tv.tv_sec, &tm);
+ hvtm->tm_sec = tm.tm_sec;
+ hvtm->tm_min = tm.tm_min;
+ hvtm->tm_hour = tm.tm_hour;
+ hvtm->tm_mday = tm.tm_mday;
+ hvtm->tm_mon = tm.tm_mon;
+ hvtm->tm_year = tm.tm_year;
+ hvtm->flags = 0;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
+{
+ /* Do nothing here. */
+ pr_warn("hv_set_rtc() will not work in kvm guest\n");
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0];
+ HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
+
+ switch (idx) {
+ case 0:
+ var->start = 0UL;
+ var->size = 0x20000000000UL;
+ break;
+ case 1:
+ var->start = 0xFFFFFFFF80000000UL;
+ var->size = 0x80000000UL;
+ break;
+ default:
+ var->start = 0UL;
+ var->size = 0UL;
+ break;
+ }
+
+ return 1;
+}
+
+/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */
+static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0];
+ HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
+
+ if (idx == 0) {
+ var->start = min_asid;
+ var->size = max_asid - min_asid + 1;
+ } else {
+ var->start = 0;
+ var->size = 0;
+ }
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
+{
+ HV_Topology *tp;
+ int cpus;
+
+ /* Depends on the definition of struct HV_Topology */
+ tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
+
+ cpus = atomic_read(&vcpu->kvm->online_vcpus);
+ tp->coord.x = vcpu->vcpu_id;
+ tp->coord.y = 0;
+ tp->width = cpus;
+ tp->height = 1;
+
+ return 1;
+}
+
+static int xy_to_vcpu(struct kvm *kvm, int x, int y)
+{
+ if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus))
+ return -1;
+ return x;
+}
+
+/*
+ * The primary vcpu is the one that initially runs while the others
+ * all block.  It is the only one allowed to call hv_start_all_tiles().
+ * The other vcpus are secondary.
+ */
+static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id != 0;
+}
+
+static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
+{
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (is_secondary_vcpu(vcpu) || completion_done(c))
+ return panic_hv(vcpu, "start_all_tiles() called again");
+ complete_all(c);
+ return 1;
+}
+
+static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa = vcpu->arch.regs.regs[0];
+ HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ gfn = gpa_to_gfn(gpa);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn))
+ return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+ gpa);
+ hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+ vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa = vcpu->arch.regs.regs[0];
+ HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+ uint64_t val = vcpu->arch.regs.regs[2];
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ gfn = gpa_to_gfn(gpa);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn))
+ return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+ gpa);
+ hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+ hv_physaddr_write64(hpa, *access, val);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
+{
+ /* Do we care about the argument msgstate? */
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+/*
+ * NOTE: we may coalesce multiple messages with the same tag to the
+ * same recipient.  Currently the only messages used by Linux are
+ * start/stop cpu (where coalescing is OK), and the smp_call_function()
+ * IPI message tag. In the latter case we rely on the generic
+ * smp_call_function code to properly handle this, and since it only
+ * uses the IPI as a way to wake up the generic list-walking code,
+ * it's OK if we coalesce several IPI deliveries before the recipient
+ * core takes action.
+ */
+static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *vcpui;
+ HV_Recipient recip[NR_CPUS];
+ HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
+ int nrecip = vcpu->arch.regs.regs[1];
+ int buflen = vcpu->arch.regs.regs[3];
+ int sent, vcpu_id, tag;
+
+ /* NOTE: we only support the Linux usage of buflen == sizeof(int). */
+ if (unlikely(buflen != sizeof(int) ||
+ nrecip >= atomic_read(&kvm->online_vcpus))) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ /* Get the buf info */
+ if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &tag, sizeof(tag))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ /* Range-check the tag value. */
+ if (tag < 0 || tag >= MAX_MSG_TAG) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ /* Get all the recipients */
+ if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
+ nrecip * sizeof(HV_Recipient))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ for (sent = 0; sent < nrecip; sent++) {
+ if (recip[sent].state != HV_TO_BE_SENT)
+ continue;
+ vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
+ if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
+ recip[sent].state = HV_BAD_RECIP;
+ continue;
+ }
+ vcpui = kvm_get_vcpu(kvm, vcpu_id);
+ set_bit(tag, &vcpui->arch.pending_msgs);
+ kvm_vcpu_kick(vcpui);
+ recip[sent].state = HV_SENT;
+ }
+
+ if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
+ nrecip * sizeof(HV_Recipient))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = sent;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
+{
+ HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
+ int buflen = vcpu->arch.regs.regs[3];
+ int tag;
+
+ /* Currently we only support messages from other tiles. */
+ rmi->source = HV_MSG_TILE;
+
+ if (buflen < sizeof(int)) {
+ rmi->msglen = HV_E2BIG;
+ return 1;
+ }
+
+ tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
+ if (tag >= MAX_MSG_TAG) {
+ /* No more messages */
+ rmi->msglen = 0;
+ return 1;
+ }
+
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &tag, sizeof(int))) {
+ rmi->msglen = HV_EFAULT;
+ return 1;
+ }
+
+ /*
+ * This clear_bit could race with a set_bit as another core
+ * delivers a new smp_call_function() IPI to this core.  However,
+ * the smp_call_function() code will have set up the additional
+ * call data on the kernel's list prior to
+ * raising the interrupt, so even if we lose the new
+ * interrupt due to the race, we still haven't dispatched
+ * to the original interrupt handler, and when we do, it
+ * will find both pending calls waiting for it, so the
+ * race is harmless. This is consistent with the fact that
+ * the generic code is trying to support pretty much
+ * arbitrary architecture-dependent IPI semantics, so it
+ * is very conservative about what it assumes.
+ *
+ * Also note that we only clear_bit on the core that owns
+ * the mask, so there's no race condition caused by the
+ * find_first_bit above and the clear_bit here, since once
+ * a bit is found it will stay set until this point.
+ */
+ clear_bit(tag, &vcpu->arch.pending_msgs);
+ rmi->msglen = sizeof(int);
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0];
+
+ *ctx = hv_inquire_guest_context();
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ HV_InqTileSet set = vcpu->arch.regs.regs[0];
+ unsigned long gva = vcpu->arch.regs.regs[1];
+ int length = vcpu->arch.regs.regs[2];
+ struct cpumask mask = CPU_MASK_NONE;
+ int cpus, i, retval, bytes2copy, bytes2zero;
+
+ switch (set) {
+ case HV_INQ_TILES_AVAIL:
+ case HV_INQ_TILES_HFH_CACHE:
+ case HV_INQ_TILES_LOTAR:
+ cpus = atomic_read(&kvm->online_vcpus);
+ for (i = 0; i < cpus; ++i)
+ cpumask_set_cpu(i, &mask);
+ break;
+ case HV_INQ_TILES_SHARED:
+ break;
+ default:
+ retval = HV_EINVAL;
+ goto done;
+ }
+
+ bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
+ bytes2zero = length - bytes2copy;
+
+ if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
+ retval = HV_EFAULT;
+ goto done;
+ }
+
+ if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
+ retval = HV_EFAULT;
+ goto done;
+ }
+
+ retval = HV_OK;
+done:
+ vcpu->arch.regs.regs[0] = retval;
+ return 1;
+}
+
+static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
+{
+ HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0];
+ int pl = (int) vcpu->arch.regs.regs[1];
+ struct kvm_vcpu *target_vcpu;
+ int vcpu_id;
+
+ vcpu_id = vtarget.x;
+ if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
+ vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
+{
+ struct kvm_vcpu *vcpui;
+ unsigned long idx;
+
+ kvm_for_each_vcpu(idx, vcpui, kvm)
+ if (vcpui->arch.ipi_gpa == gpa)
+ return vcpui;
+
+ return NULL;
+}
+
+/*
+ * Most page faults are downcalled from the hypervisor and handled
+ * directly by either the guest OS or the host OS.  This function
+ * handles the remaining cases.
+ */
+static int handle_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_translation tr;
+ struct kvm_vcpu *ipi_vcpu;
+
+ tr.linear_address = (__u64) vcpu->arch.fault_addr;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return 0;
+
+ /* ipi PTE for rescheduling interrupt? */
+ ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
+ if (!ipi_vcpu)
+ return 0;
+
+ set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
+ kvm_vcpu_kick(ipi_vcpu);
+
+ /* Juke the PC past the store instruction. */
+ vcpu->arch.regs.pc += 8;
+ return 1;
+}
+
+static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We do not expect this call from the guest.  The guest OS should
+ * simply inherit the host's setting rather than *set* its own.
+ * Besides, hv_set_pte_super_shift() is never called by the guest OS
+ * with the current guest configuration.
+ */
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
+{
+ HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0];
+
+ hvss->new_speed = HV_EPERM;
+ hvss->end_cycle = 0;
+ hvss->delta_ns = 0;
+
+ return 1;
+}
+
+static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
+ HCALL_DEFS
+};
+
+static int kvm_handle_exit(struct kvm_vcpu *vcpu)
+{
+ unsigned long hcall_idx;
+
+ switch (vcpu->run->exit_reason) {
+ case KVM_EXIT_HYPERCALL:
+ hcall_idx = vcpu->arch.regs.regs[10];
+ if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
+ hcall_handlers[hcall_idx] == NULL))
+ return kvm_emulate_illegal(vcpu);
+
+ /* Juke us past the swint0 when we return. */
+ vcpu->arch.regs.pc += 8;
+
+ return hcall_handlers[hcall_idx](vcpu);
+
+ case KVM_EXIT_MMIO:
+ if (handle_mmio(vcpu))
+ return 1;
+ return panic_hv(vcpu, "Out-of-bounds client memory access");
+
+ case KVM_EXIT_AGAIN:
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+static void kvm_kick_func(void *info)
+{
+ struct kvm_vcpu *vcpu = info;
+
+ /* If this is not the thread that we expect, just return. */
+ if (unlikely(vcpu->pid != get_task_pid(current, PIDTYPE_PID)))
+ return;
+
+ /* Setting this flag will cause a vmexit instead of a vmresume. */
+ set_thread_flag(TIF_VIRT_EXIT);
+}
+
+/* Note that this function is a standard kvm interface in recent Linux. */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int me, cpu;
+
+ /* If it is waiting in kvm_vcpu_block(), wake it up. */
+ if (waitqueue_active(&vcpu->wq))
+ wake_up_interruptible(&vcpu->wq);
+
+ /* If we are kicking our own vcpu, make sure we vmexit. */
+ if (vcpu == current_thread_info()->vcpu) {
+ set_thread_flag(TIF_VIRT_EXIT);
+ return;
+ }
+
+ /*
+ * If the vcpu is running the guest, interrupt its cpu,
+ * causing it to vmexit by setting TIF_VIRT_EXIT. Note we can
+ * race with a guest already doing a vmexit, but that is benign.
+ */
+ cpu = vcpu->cpu;
+ me = get_cpu();
+ if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ smp_call_function_single(cpu, kvm_kick_func, vcpu, 0);
+ put_cpu();
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+
+/*
+ * Any interrupt that would normally be handled by the host at PL2
+ * needs to be reassigned to the guest at PL1 as we enter.
+ *
+ * The TLB interrupts remain handled by the hypervisor and are downcalled
+ * to the appropriate host or guest as necessary.
+ *
+ * FIXME: We don't pass the UDN interrupts to the guest for now; at
+ * some point we plan to allow an option to pin the vcpus and report
+ * the true geometry to the guest, at which point passing the UDN
+ * access would make sense.
+ *
+ * FIXME: For now we don't pass the profiling interrupts to the guest,
+ * and instead require profiling be run in the host; we should be able
+ * to support guest-level profiling pretty easily, but we need to
+ * think about whether there are vcpu migration issues there.
+ */
+static void kvm_grant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_1, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
+}
+
+static void kvm_ungrant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_2, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
+}
+
+/*
+ * There is a lot of state that is (in the non-virtualized case) held
+ * permanently in SPRs, or that is in any case not context-switched.
+ * The next two routines switch in and out all the SPR state.
+ *
+ * We also adjust the timer on restore so that it fires at the correct
+ * wall-clock time even if we have been scheduled out for a little
+ * while.  This may mean we end up firing it immediately on return,
+ * and the guest suffers a timer delay.
+ */
+static void kvm_save_sprs(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
+ vcpu->arch.vmexit_cycles = get_cycles();
+
+#define SAVE_SPR(x) vcpu->arch.x = __insn_mfspr(SPR_ ## x)
+ FOR_EACH_GUEST_SPR(SAVE_SPR);
+#undef SAVE_SPR
+}
+
+static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
+{
+ unsigned long count = vcpu->arch.timer_control;
+ unsigned long underflow =
+ (count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
+ unsigned long disabled =
+ (count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1;
+
+ if (!disabled) {
+ unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles;
+ count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+ underflow |= delta > count;
+ count -= delta;
+ count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+ count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
+ }
+ __insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count);
+
+#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.x)
+ FOR_EACH_GUEST_SPR(RESTORE_SPR);
+#undef RESTORE_SPR
+}
+
+/*
+ * When entering the guest, we need to eliminate any PL0 translations
+ * that were in use by qemu, since the guest's PL0 translations will
+ * be different. We also flush PL1 translations in case there have
+ * been changes to the virtualization page table, etc.
+ *
+ * FIXME: Add a way to just flush PL0/PL1, or just flush below
+ * the host PAGE_OFFSET, or add vpid support, etc.
+ */
+static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx;
+ pgd_t *vpgdir;
+ pte_t *ptep;
+ int rc;
+
+ /* Install virtualization context */
+ vpgdir = vcpu->kvm->arch.vpgd;
+ BUG_ON(vpgdir == NULL);
+ ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
+ rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Install guest context */
+ ctx = &vcpu->arch.guest_context;
+ rc = hv_install_guest_context(ctx->page_table, ctx->access,
+ ctx->asid, ctx->flags);
+ WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
+ ctx->page_table, ctx->access.val,
+ ctx->asid, ctx->flags, rc);
+
+ hv_flush_all(0);
+}
+
+/*
+ * De-install the virtualization context so we take faults below the
+ * host Linux PL in the normal manner going forward.
+ *
+ * We flush all the TLB mappings as we exit the guest, since the
+ * guest has been using the ASIDs as it pleases, and may have installed
+ * incompatible mappings for qemu's process as well. Note that we don't
+ * worry about host-PL interrupts that occur while the guest is running,
+ * on the assumption that such interrupts can't touch userspace
+ * addresses legally anyway.
+ *
+ * NOTE: we may want to add a hypervisor call to just flush mappings
+ * below PL2 and use that here instead.
+ */
+static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
+{
+ int rc;
+
+ /* Remember guest context */
+ vcpu->arch.guest_context = hv_inquire_guest_context();
+
+ /* Disable virtualization context */
+ rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Flush everything in the TLB. */
+ hv_flush_all(0);
+}
+
+static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Capture current set of ipi_events. We might race with
+ * another thread adding an event, but if so we'll just miss
+ * it on this go-around and see it next time.
+ */
+ vcpu->arch.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
+
+ /*
+ * Note: We could set PC and EX1 for the guest os to jump
+ * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
+ * is unmasked and the guest is not at PL1 with ICS set.
+ * But in fact it's about as fast to just set INTCTRL_1_STATUS
+ * here and then run the short INTCTRL_1 handler in the guest.
+ */
+ vcpu->arch.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
+}
+
+static void kvm_tile_run(struct kvm_vcpu *vcpu)
+{
+ struct thread_info *ti = current_thread_info();
+ unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
+
+ /*
+ * Disable interrupts while we set up the guest state.
+ * This way, if we race with another core trying to tell us
+ * to fix up our guest state, we will see the kick only when
+ * we actually try to enter the guest, at which point we will
+ * immediately vmexit and end up retrying.
+ */
+ local_irq_disable();
+ kvm_guest_context_enter(vcpu);
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ ti->vcpu = vcpu;
+ vcpu->cpu = get_cpu();
+ kvm_inject_interrupts(vcpu);
+ kvm_grant_mpls();
+ kvm_restore_sprs(vcpu);
+
+ /* Calling this function irets into the guest. */
+ kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
+
+ /* We resume here due to a call to kvm_vmexit. */
+ __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
+
+ vcpu->cpu = -1;
+ put_cpu();
+ ti->vcpu = NULL;
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
+ vcpu->run->ready_for_interrupt_injection = 1;
+ kvm_ungrant_mpls();
+ kvm_save_sprs(vcpu);
+ __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
+ kvm_guest_context_exit(vcpu);
+ local_irq_enable();
+}
+
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r = 1;
+
+ while (r > 0) {
+ kvm_guest_enter();
+ kvm_tile_run(vcpu);
+ kvm_guest_exit();
+
+ r = kvm_handle_exit(vcpu);
+ /*
+ * <0: error for userspace.
+ * =0: QEMU to handle.
+ * >0: host os can handle it fully.
+ */
+ if (r <= 0)
+ break;
+
+ if (signal_pending(current)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = -EINTR;
+ break;
+ }
+
+#ifdef CONFIG_HOMECACHE
+ if (current_thread_info()->homecache_cpu !=
+ smp_processor_id()) {
+ /* Do homecache migration when returning to qemu. */
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = -EINTR;
+ break;
+ }
+#endif
+
+ kvm_resched(vcpu);
+ }
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+ sigset_t sigsaved;
+
+ /* Secondary cpus must wait until they are told they can start. */
+ if (vcpu->arch.suspended) {
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (wait_for_completion_interruptible(c))
+ return -EINTR;
+ vcpu->arch.suspended = 0;
+ }
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ r = __vcpu_run(vcpu, kvm_run);
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return r;
+}
+
+int kvm_arch_init(void *opaque)
+{
+ return 0;
+}
+
+void kvm_arch_exit(void)
+{
+}
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ unsigned long resv_gfn_start;
+ struct kvm_memory_slot *s;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.resv_gpa_start) {
+ resv_gfn_start = 0;
+
+ for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
+ s = &kvm->memslots->memslots[i];
+
+ if (!s->npages)
+ continue;
+
+ if ((s->base_gfn + s->npages) > resv_gfn_start)
+ resv_gfn_start = s->base_gfn + s->npages;
+ }
+
+ kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start);
+ }
+
+ /* Initialize to enter fake PA=VA mode in hypervisor. */
+ vcpu->arch.guest_context.page_table = HV_CTX_NONE;
+
+ vcpu->arch.ipi_gpa =
+ kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE);
+ vcpu->arch.ipi_gpte =
+ pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL);
+
+ /* Mark the core suspended if it is not the boot cpu. */
+ vcpu->arch.suspended = is_secondary_vcpu(vcpu);
+
+ return 0;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Notify simulator that this task handles this vcpu. */
+ sim_set_vcpu(vcpu->vcpu_id);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ sim_clear_vcpu();
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+ /* FIXME: some archs set up a cache for these structs? */
+ struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+ int rc;
+
+ if (!vcpu)
+ return ERR_PTR(-ENOMEM);
+
+ rc = kvm_vcpu_init(vcpu, kvm, id);
+ if (rc) {
+ kfree(vcpu);
+ return ERR_PTR(rc);
+ }
+
+ return vcpu;
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ memset(&vcpu->arch.regs, 0, sizeof(struct pt_regs));
+ return 0;
+}
+
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_vcpu_uninit(vcpu);
+ kfree(vcpu);
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_destroy(vcpu);
+}
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+ return 0;
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ if (type)
+ return -EINVAL;
+
+ init_completion(&kvm->arch.smp_start);
+ return 0;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ /* Seems to be unnecessary? */
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+
+ /* FIXME: release all the pmds and ptes as well! */
+ if (kvm->arch.vpgd)
+ pgd_free(kvm->mm, kvm->arch.vpgd);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+/* Called from guest hv glue via swint0 traps. */
+void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
+{
+ /* Hypercalls are only valid from PL1. */
+ if (EX1_PL(regs->ex1) != 0) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
+ /*NORETURN*/
+ }
+ do_trap(regs, fault_num, 0);
+}
+
+void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long fault_addr, unsigned long write)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ BUG_ON(vcpu == NULL);
+ vcpu->arch.fault_addr = fault_addr;
+ kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
+ /*NORETURN*/
+}
+
+void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
+{
+ kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
+ /*NORETURN*/
+}
+
+void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->arch.regs = *regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ kvm_vmexit(vcpu->arch.host_sp);
+ /*NORETURN*/
+}
+
+static int __init kvm_tile_init(void)
+{
+ return kvm_init(NULL, sizeof(struct kvm_vcpu),
+ __alignof__(struct kvm_vcpu), THIS_MODULE);
+}
+
+static void __exit kvm_tile_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(kvm_tile_init);
+module_exit(kvm_tile_exit);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 82733c8..1590282 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);
/* hypervisor glue */
#include <hv/hypervisor.h>
+EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_dev_close);
EXPORT_SYMBOL(hv_dev_open);
+EXPORT_SYMBOL(hv_dev_poll);
+EXPORT_SYMBOL(hv_dev_poll_cancel);
EXPORT_SYMBOL(hv_dev_pread);
-EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_preada);
+EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_pwritea);
-EXPORT_SYMBOL(hv_dev_poll);
-EXPORT_SYMBOL(hv_dev_poll_cancel);
-EXPORT_SYMBOL(hv_dev_close);
-EXPORT_SYMBOL(hv_sysconf);
-EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_flush_all);
EXPORT_SYMBOL(hv_get_rtc);
+#ifdef __tilegx__
+EXPORT_SYMBOL(hv_inquire_guest_context);
+EXPORT_SYMBOL(hv_install_guest_context);
+EXPORT_SYMBOL(hv_install_virt_context);
+#endif
+EXPORT_SYMBOL(hv_physaddr_read64);
+EXPORT_SYMBOL(hv_physaddr_write64);
EXPORT_SYMBOL(hv_set_rtc);
+EXPORT_SYMBOL(hv_sysconf);
/* libgcc.a */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 23f044e..86cff48 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm)
char *buf, *path;
struct vm_area_struct *vma;
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
if (!sim_is_simulator())
+#endif
return 1;
if (mm->exe_file == NULL)
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 64eec3f..39c48cb 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
- is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+ is_kernel_mode = !user_mode(regs);
tsk = validate_current();
@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
}
#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
- if (EX1_PL(regs->ex1) != USER_PL) {
+ if (!user_mode(regs)) {
struct async_tlb *async;
switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
{
int cpu;
unsigned long page;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if CHIP_HAS_CBOX_HOME_MAP()
/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
}
- address = MEM_SV_INTRPT;
+ address = MEM_SV_START;
pmd = get_pmd(pgtables, address);
pfn = 0; /* code starts at PA 0 */
if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
void free_initmem(void)
{
- const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+ const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
/*
* Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)
/*
* Free the pages mapped from 0xc0000000 that correspond to code
- * pages from MEM_SV_INTRPT that we won't use again after init.
+ * pages from MEM_SV_START that we won't use again after init.
*/
free_init_pages("unused kernel text",
(unsigned long)_sinittext - text_delta,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3004433..d6948d4 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
#if CHIP_HAS_MMIO()
-/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
-void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
- pgprot_t home)
+void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot)
{
void *addr;
struct vm_struct *area;
unsigned long offset, last_addr;
- pgprot_t pgprot;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
- /* Create a read/write, MMIO VA mapping homed at the requested shim. */
- pgprot = PAGE_KERNEL;
- pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
- pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
-
/*
* Mappings have to be page-aligned
*/
@@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
+ area = get_vm_area(size, flags);
if (!area)
return NULL;
area->phys_addr = phys_addr;
addr = area->addr;
if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
+ phys_addr, prot)) {
free_vm_area(area);
return NULL;
}
- return (__force void __iomem *) (offset + (char *)addr);
+ return (void *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(generic_remap_prot);
+
+/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ pgprot_t home)
+{
+ pgprot_t pgprot;
+ unsigned long flags;
+
+ /* Create a read/write, MMIO VA mapping homed at the requested shim. */
+ pgprot = PAGE_KERNEL;
+ pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
+ pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
+ flags = VM_IOREMAP; /* | other flags? */
+
+ return (__force void __iomem *) generic_remap_prot(phys_addr,
+ size, flags, pgprot);
}
EXPORT_SYMBOL(ioremap_prot);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08..d3879c5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -171,6 +171,7 @@ struct kvm_pit_config {
#define KVM_EXIT_WATCHDOG 21
#define KVM_EXIT_S390_TSCH 22
#define KVM_EXIT_EPR 23
+#define KVM_EXIT_AGAIN 24
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -1012,6 +1013,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad)
#define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init)
#define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Reset some SPR registers for tilegx */
+#define KVM_TILE_RESET_SPR _IO(KVMIO, 0xa8)
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..1b8a1f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1691,7 +1691,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
}
-#ifndef CONFIG_S390
+#if !defined(CONFIG_S390) && !defined(CONFIG_TILE)
/*
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
*/
@@ -1714,7 +1714,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
-#endif /* !CONFIG_S390 */
+#endif
void kvm_resched(struct kvm_vcpu *vcpu)
{
@@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu->kvm->mm != current->mm)
return -EIO;
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
+ defined(CONFIG_TILEGX)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
* so vcpu_load() would break it.
--
1.8.3.1
On 2013-08-12 17:24, Chris Metcalf wrote:
> This change provides the initial framework support for KVM on tilegx.
> Basic virtual disk and networking is supported.
>
> Signed-off-by: Chris Metcalf <[email protected]>
> ---
...
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index acccd08..d3879c5 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -171,6 +171,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_WATCHDOG 21
> #define KVM_EXIT_S390_TSCH 22
> #define KVM_EXIT_EPR 23
> +#define KVM_EXIT_AGAIN 24
>
> /* For KVM_EXIT_INTERNAL_ERROR */
> /* Emulate instruction failed. */
> @@ -1012,6 +1013,8 @@ struct kvm_s390_ucas_mapping {
> #define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad)
> #define KVM_ARM_VCPU_INIT _IOW(KVMIO, 0xae, struct kvm_vcpu_init)
> #define KVM_GET_REG_LIST _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
> +/* Reset some SPR registers for tilegx */
> +#define KVM_TILE_RESET_SPR _IO(KVMIO, 0xa8)
The KVM way of doing this is typically GET/SET_SPR, i.e. a way to
read/write the registers from userspace. This will be required for
migration support anyway, I bet.
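For example, the generic ONE_REG interface already provides a
per-register read/write path from userspace; a rough sketch only (the
tile register id encoding is made up here and would have to be defined
in the arch uapi header):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Write one guest register value through KVM_SET_ONE_REG. */
	static int set_guest_reg(int vcpu_fd, uint64_t reg_id, uint64_t value)
	{
		struct kvm_one_reg reg = {
			.id = reg_id,              /* arch-defined encoding */
			.addr = (uintptr_t)&value, /* userspace pointer to value */
		};
		return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
	}
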
Jan
--
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux
This change provides the initial framework support for KVM on tilegx.
Basic virtual disk and networking is supported.
Signed-off-by: Chris Metcalf <[email protected]>
---
v2: remove KVM_TILE_RESET_SPR based on feedback from Jan Kiszka.
qemu ends up modified to just use KVM_SET_SREGS instead.
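    For reference (illustration only, not part of the patch): with the
    tile-specific ioctl gone, resetting the guest special registers from
    qemu can go through the existing KVM_GET_SREGS/KVM_SET_SREGS pair.
    The sketch below assumes an already-open vcpu file descriptor and
    whatever struct kvm_sregs layout the tile uapi header ends up
    defining; a real reset would fill in architected defaults rather
    than zeroes.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int reset_guest_sprs(int vcpu_fd)
	{
		struct kvm_sregs sregs;

		/* Fetch the current special-register state. */
		if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
			return -1;

		/* Reset it (all zeroes here just for brevity). */
		memset(&sregs, 0, sizeof(sregs));

		/* Push the new state back into the vcpu. */
		return ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);
	}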
arch/tile/Kconfig | 19 +-
arch/tile/Makefile | 1 +
arch/tile/include/asm/io.h | 2 +
arch/tile/include/asm/kvm.h | 29 +
arch/tile/include/asm/kvm_host.h | 101 ++
arch/tile/include/asm/kvm_para.h | 20 +
arch/tile/include/asm/kvm_virtio.h | 26 +
arch/tile/include/asm/module.h | 9 +-
arch/tile/include/asm/page.h | 56 +-
arch/tile/include/asm/pgtable_32.h | 2 +-
arch/tile/include/asm/pgtable_64.h | 3 +-
arch/tile/include/asm/processor.h | 6 +-
arch/tile/include/asm/ptrace.h | 2 +-
arch/tile/include/asm/switch_to.h | 25 +-
arch/tile/include/asm/thread_info.h | 17 +-
arch/tile/include/asm/timex.h | 8 +
arch/tile/include/hv/hypervisor.h | 183 +++-
arch/tile/include/uapi/arch/sim.h | 19 +
arch/tile/include/uapi/arch/sim_def.h | 8 +
arch/tile/include/uapi/arch/spr_def_32.h | 15 +
arch/tile/include/uapi/arch/spr_def_64.h | 25 +
arch/tile/include/uapi/asm/Kbuild | 2 +
arch/tile/include/uapi/asm/kvm.h | 267 +++++
arch/tile/include/uapi/asm/kvm_virtio.h | 60 ++
arch/tile/kernel/Makefile | 1 +
arch/tile/kernel/asm-offsets.c | 7 +
arch/tile/kernel/early_printk.c | 16 +
arch/tile/kernel/head_32.S | 4 +-
arch/tile/kernel/head_64.S | 6 +-
arch/tile/kernel/hvglue.S | 8 +-
arch/tile/kernel/hvglue_trace.c | 14 +
arch/tile/kernel/intvec_32.S | 18 +-
arch/tile/kernel/intvec_64.S | 226 +++--
arch/tile/kernel/kvm_virtio.c | 430 ++++++++
arch/tile/kernel/process.c | 40 +-
arch/tile/kernel/relocate_kernel_64.S | 9 +-
arch/tile/kernel/setup.c | 21 +-
arch/tile/kernel/smp.c | 28 +-
arch/tile/kernel/stack.c | 2 +-
arch/tile/kernel/sysfs.c | 4 +
arch/tile/kernel/time.c | 14 +-
arch/tile/kernel/traps.c | 2 +-
arch/tile/kernel/vmlinux.lds.S | 10 +-
arch/tile/kvm/Kconfig | 3 -
arch/tile/kvm/Makefile | 12 +
arch/tile/kvm/entry.S | 91 ++
arch/tile/kvm/kvm-tile.c | 1581 ++++++++++++++++++++++++++++++
arch/tile/lib/exports.c | 20 +-
arch/tile/mm/elf.c | 2 +
arch/tile/mm/fault.c | 4 +-
arch/tile/mm/init.c | 8 +-
arch/tile/mm/pgtable.c | 35 +-
include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 7 +-
54 files changed, 3331 insertions(+), 198 deletions(-)
create mode 100644 arch/tile/include/asm/kvm.h
create mode 100644 arch/tile/include/asm/kvm_host.h
create mode 100644 arch/tile/include/asm/kvm_para.h
create mode 100644 arch/tile/include/asm/kvm_virtio.h
create mode 100644 arch/tile/include/uapi/asm/kvm.h
create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
create mode 100644 arch/tile/kernel/kvm_virtio.c
create mode 100644 arch/tile/kvm/Makefile
create mode 100644 arch/tile/kvm/entry.S
create mode 100644 arch/tile/kvm/kvm-tile.c
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index ecff467..bbb6d51 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -5,7 +5,6 @@ config TILE
def_bool y
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
- select HAVE_KVM if !TILEGX
select GENERIC_FIND_FIRST_BIT
select SYSCTL_EXCEPTION_TRACE
select USE_GENERIC_SMP_HELPERS
@@ -113,6 +112,7 @@ config SMP
def_bool y
config HVC_TILE
+ depends on !KVM_GUEST
depends on TTY
select HVC_DRIVER
select HVC_IRQ if TILEGX
@@ -127,6 +127,7 @@ config TILEGX
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KPROBES
select HAVE_KRETPROBES
+ select HAVE_KVM if !KVM_GUEST
config TILEPRO
def_bool !TILEGX
@@ -366,11 +367,23 @@ config HARDWALL
bool "Hardwall support to allow access to user dynamic network"
default y
+config KVM_GUEST
+ bool "Build kernel as guest for KVM"
+ default n
+ depends on TILEGX
+ select VIRTIO
+ select VIRTIO_RING
+ select VIRTIO_CONSOLE
+ ---help---
+ This will build a kernel that runs at a lower protection level
+ than the default kernel and is suitable to run under KVM.
+
+# TILEPro kernels run at PL1; TILE-Gx runs at PL2 unless it's a KVM guest.
config KERNEL_PL
int "Processor protection level for kernel"
range 1 2
- default 2 if TILEGX
- default 1 if !TILEGX
+ default 2 if TILEGX && !KVM_GUEST
+ default 1 if !TILEGX || KVM_GUEST
---help---
Since MDE 4.2, the Tilera hypervisor runs the kernel
at PL2 by default. If running under an older hypervisor,
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364..8e7f852 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH)
# See arch/tile/Kbuild for content of core part of the kernel
core-y += arch/tile/
+core-$(CONFIG_KVM) += arch/tile/kvm/
core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 9fe4349..023659b 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -43,6 +43,8 @@
* long before casting it to a pointer to avoid compiler warnings.
*/
#if CHIP_HAS_MMIO()
+extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot);
extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
pgprot_t pgprot);
diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
new file mode 100644
index 0000000..2ea6c41
--- /dev/null
+++ b/arch/tile/include/asm/kvm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_H
+#define _ASM_TILE_KVM_H
+
+#include <hv/hypervisor.h>
+#include <uapi/asm/kvm.h>
+
+#ifndef __ASSEMBLER__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
+#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
+#endif
+#endif /* _ASM_TILE_KVM_H */
diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
new file mode 100644
index 0000000..58b6bf3
--- /dev/null
+++ b/arch/tile/include/asm/kvm_host.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _ASM_TILE_KVM_HOST_H
+#define _ASM_TILE_KVM_HOST_H
+
+#define KVM_MAX_VCPUS 64
+#define KVM_USER_MEM_SLOTS 32
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+/* For now, claim we have no huge pages. */
+#define KVM_HPAGE_GFN_SHIFT(x) 0
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) 1
+
+/* Max number of message tags for hv_send/receive_message() */
+#define MAX_MSG_TAG (sizeof(unsigned long) * 8)
+
+/* Bits in pending_downcalls */
+#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kvm_vcpu_stat {
+ /* None yet. */
+};
+
+struct kvm_vcpu_arch {
+ struct pt_regs regs;
+ struct kvm_sregs sregs;
+ unsigned long host_sp; /* Host "real" sp during vmresume. */
+ HV_Context guest_context;
+ unsigned long pending_msgs; /* Pending guest messages */
+ unsigned long ipi_events; /* Pending guest ipi events. */
+ unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
+ pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
+ unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */
+ int suspended; /* true for cores not yet started by host */
+ unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */
+ unsigned long vmexit_cycles; /* cycle count of last vmexit */
+};
+
+struct kvm_vm_stat {
+ /*
+ * FIXME - does this make sense for us? It's used in common KVM
+ * code.
+ */
+ u32 remote_tlb_flush;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+struct kvm_arch {
+ pgd_t *vpgd;
+ unsigned long resv_gpa_start; /* For special purpose. */
+ struct completion smp_start;
+};
+
+struct kvm_vcpu;
+
+extern void kvm_vmresume(struct pt_regs *guest,
+ unsigned long *host_sp_ptr);
+extern void kvm_vmexit(unsigned long host_sp);
+extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
+extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
+extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long, unsigned long);
+extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
+
+extern void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
+
+#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
+
+#define gpmd_offset(kvm, pud, address) \
+ ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
+
+#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
+
+#define gpte_offset_kernel(kvm, pmd, address) \
+ ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
+
+#endif /* __ASSEMBLY__*/
+
+#endif /* _ASM_TILE_KVM_HOST_H */
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 0000000..c8c31d5
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_PARA_H
+#define _ASM_TILE_KVM_PARA_H
+
+#include <uapi/asm/kvm_para.h>
+
+int hcall_virtio(unsigned long instrument, unsigned long mem);
+#endif /* _ASM_TILE_KVM_PARA_H */
diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
new file mode 100644
index 0000000..8faa959
--- /dev/null
+++ b/arch/tile/include/asm/kvm_virtio.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_VIRTIO_H
+#define _ASM_TILE_KVM_VIRTIO_H
+
+#include <uapi/asm/kvm_virtio.h>
+
+
+struct kvm_device {
+ struct virtio_device vdev;
+ struct kvm_device_desc *desc;
+ unsigned long desc_pa;
+};
+
+#endif /* _ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
index 44ed07c..927c97f 100644
--- a/arch/tile/include/asm/module.h
+++ b/arch/tile/include/asm/module.h
@@ -28,6 +28,13 @@
# define MODULE_PGSZ ""
#endif
+/* Tag guest Linux, since it uses different SPRs, etc. */
+#if CONFIG_KERNEL_PL == 2
+#define MODULE_PL ""
+#else
+#define MODULE_PL " guest"
+#endif
+
/* We don't really support no-SMP so tag if someone tries. */
#ifdef CONFIG_SMP
#define MODULE_NOSMP ""
@@ -35,6 +42,6 @@
#define MODULE_NOSMP " nosmp"
#endif
-#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP
#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index b4f96c0..65ee752 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif
+#ifdef CONFIG_KVM_GUEST
+/* Paravirtualized guests get half the VA, and thus half the PA. */
+#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1)
+#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1)
+#else
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+#endif
+
/* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
#define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
#define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
#define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
* We reserve the lower half of memory for user-space programs, and the
* upper half for system code. We re-map all of physical memory in the
* upper half, which takes a quarter of our VA space. Then we have
- * the vmalloc regions. The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions. The supervisor code lives at the highest address,
* with the hypervisor above that.
*
* Loadable kernel modules are placed immediately after the static
@@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size)
* Similarly, for now we don't play any struct page mapping games.
*/
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
# error Too much PA to map with the VA available!
#endif
-#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
-#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */
-#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */
-#define PAGE_OFFSET MEM_HIGH_START
-#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */
+#ifdef CONFIG_KVM_GUEST
+#define PAGE_OFFSET (_AC(1, UL) << (MAX_VA_WIDTH - 1))
+#define KERNEL_HIGH_VADDR (_AC(1, UL) << MAX_VA_WIDTH)
+#else
+#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */
+#endif
+
+#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
#define _VMALLOC_START FIXADDR_TOP
-#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT MEM_SV_START
-#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */
#define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR MEM_SV_START
#else /* !__tilegx__ */
@@ -213,8 +221,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
* values, and after that, we show "typical" values, since the actual
* addresses depend on kernel #defines.
*
- * MEM_HV_INTRPT 0xfe000000
- * MEM_SV_INTRPT (kernel code) 0xfd000000
+ * MEM_HV_START 0xfe000000
+ * MEM_SV_START (kernel code) 0xfd000000
* MEM_USER_INTRPT (user vector) 0xfc000000
* FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR)
* PKMAP_BASE 0xf7000000 (via LAST_PKMAP)
@@ -224,14 +232,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
*/
#define MEM_USER_INTRPT _AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT _AC(0xfd000000, UL)
-#define MEM_HV_INTRPT _AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT _AC(0xfd000000, UL)
-#define MEM_SV_INTRPT _AC(0xfe000000, UL)
-#define MEM_HV_INTRPT _AC(0xff000000, UL)
-#endif
+#define MEM_SV_START _AC(0xfd000000, UL)
+#define MEM_HV_START _AC(0xfe000000, UL)
#define INTRPT_SIZE 0x4000
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index e5bdc0e..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; }
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_INTRPT;
+ return addr >= MEM_HV_START;
}
/*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 7cb8d35..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_START ||
- (addr > MEM_LOW_END && addr < MEM_HIGH_START);
+ return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
}
/*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 230b830..5aa5431 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
#ifndef _ASM_TILE_PROCESSOR_H
#define _ASM_TILE_PROCESSOR_H
+#include <arch/chip.h>
+
#ifndef __ASSEMBLY__
/*
@@ -25,7 +27,6 @@
#include <asm/ptrace.h>
#include <asm/percpu.h>
-#include <arch/chip.h>
#include <arch/spr_def.h>
struct task_struct;
@@ -167,7 +168,7 @@ struct thread_struct {
#ifndef __ASSEMBLY__
#ifdef __tilegx__
-#define TASK_SIZE_MAX (MEM_LOW_END + 1)
+#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1))
#else
#define TASK_SIZE_MAX PAGE_OFFSET
#endif
@@ -347,7 +348,6 @@ extern int kdata_huge;
/*
* Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
*/
#define USER_PL 0
#if CONFIG_KERNEL_PL == 2
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 0d25c21..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
#define user_stack_pointer(regs) ((regs)->sp)
/* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
/* Fill in a struct pt_regs with the current kernel registers. */
struct pt_regs *get_pt_regs(struct pt_regs *);
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index b8f888c..8e9150f 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev,
extern unsigned long get_switch_to_pc(void);
/*
+ * Normally we notify the simulator whenever we change from one pid
+ * to another, so it can track symbol files appropriately on the fly.
+ * For now, we don't do this for the guest Linux, since we don't
+ * have a way to tell the simulator that we are entering a separate
+ * pid space when we are in the guest.
+ */
+#ifdef CONFIG_KVM_GUEST
+#define notify_sim_task_change(prev) do { } while (0)
+#else
+#define notify_sim_task_change(prev) do { \
+ if (unlikely((prev)->state == TASK_DEAD)) \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
+ ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
+ (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+} while (0)
+#endif
+
+/*
* Kernel threads can check to see if they need to migrate their
* stack whenever they return from a context switch; for user
* threads, we defer until they are returning to user-space.
*/
#define finish_arch_switch(prev) do { \
- if (unlikely((prev)->state == TASK_DEAD)) \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
- ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
- (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ notify_sim_task_change(prev); \
if (current->mm == NULL && !kstack_hash && \
current_thread_info()->homecache_cpu != smp_processor_id()) \
homecache_migrate_kthread(); \
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index b8aa6df..1c26cdf 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -18,7 +18,9 @@
#include <asm/processor.h>
#include <asm/page.h>
+
#ifndef __ASSEMBLY__
+struct kvm_vcpu;
/*
* Low level task data that assembly code needs immediate access to.
@@ -44,6 +46,9 @@ struct thread_info {
unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */
void __user *unalign_jit_base; /* unalign fixup JIT base */
#endif
+#ifdef CONFIG_KVM
+ struct kvm_vcpu *vcpu; /* vcpu during vmresume */
+#endif
};
/*
@@ -117,8 +122,8 @@ extern void _cpu_idle(void);
/*
* Thread information flags that various assembly files may need to access.
- * Keep flags accessed frequently in low bits, particular since it makes
- * it easier to build constants in assembly.
+ * Keep flags accessed frequently in low bits, since it makes it
+ * easier to build constants in assembly.
*/
#define TIF_SIGPENDING 0 /* signal pending */
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
@@ -131,6 +136,7 @@ extern void _cpu_idle(void);
#define TIF_MEMDIE 7 /* OOM killer at work */
#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
+#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
@@ -142,11 +148,12 @@ extern void _cpu_idle(void);
#define _TIF_MEMDIE (1<<TIF_MEMDIE)
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VIRT_EXIT (1<<TIF_VIRT_EXIT)
/* Work to do on any return to user space. */
-#define _TIF_ALLWORK_MASK \
- (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
- _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
+#define _TIF_ALLWORK_MASK \
+ (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP| \
+ _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)
/* Work to do at syscall entry. */
#define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
index edbd7e4..0417617 100644
--- a/arch/tile/include/asm/timex.h
+++ b/arch/tile/include/asm/timex.h
@@ -27,6 +27,14 @@
typedef unsigned long long cycles_t;
+#ifdef CONFIG_KVM_GUEST
+#define INT_LINUX_TIMER INT_AUX_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_AUX_TILE_TIMER_CONTROL
+#else
+#define INT_LINUX_TIMER INT_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_TILE_TIMER_CONTROL
+#endif
+
#if CHIP_HAS_SPLIT_CYCLE()
cycles_t get_cycles(void);
#define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
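
The intent is that timer code can be written once against the INT_LINUX_TIMER / SPR_LINUX_TIMER_CONTROL aliases and work both on the host (tile timer) and in a KVM guest (aux tile timer); the time.c changes elsewhere in this patch presumably use the aliases that way. A minimal sketch of the idea, with an invented helper name:

	/* Illustrative only: arm the per-cpu Linux timer for "ticks" cycles. */
	static void arm_linux_timer(unsigned long ticks)
	{
		/*
		 * Writing a count starts the downcounter; underflow raises
		 * INT_LINUX_TIMER, which is routed to do_timer_interrupt()
		 * in intvec_64.S for both host and guest configurations.
		 */
		__insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
	}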
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f71b08e..71abe38 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -321,6 +321,18 @@
/** hv_set_speed */
#define HV_DISPATCH_SET_SPEED 58
+/** hv_install_virt_context */
+#define HV_DISPATCH_INSTALL_VIRT_CONTEXT 59
+
+/** hv_inquire_virt_context */
+#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT 60
+
+/** hv_install_guest_context */
+#define HV_DISPATCH_INSTALL_GUEST_CONTEXT 61
+
+/** hv_inquire_guest_context */
+#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT 62
+
/** hv_console_set_ipi */
#define HV_DISPATCH_CONSOLE_SET_IPI 63
@@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
* new page table does not need to contain any mapping for the
* hv_install_context address itself.
*
- * At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ * At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
* if multiple flags are specified, HV_EINVAL is returned.
* Specifying none of the flags results in using the default page size.
* All cores participating in a given client must request the same
* page size, or the results are undefined.
*
+ * To disable an installed page table, install HV_CTX_NONE. The access
+ * and asid fields are ignored.
+ *
* @param page_table Root of the page table.
* @param access PTE providing info on how to read the page table. This
* value must be consistent between multiple tiles sharing a page table,
@@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
#endif /* !__ASSEMBLER__ */
+#define HV_CTX_NONE ((HV_PhysAddr)-1) /**< Disable page table. */
+
#define HV_CTX_DIRECTIO 0x1 /**< Direct I/O requests are accepted from
PL0. */
+#define HV_CTX_GUEST_CACHE 0x4 /**< Let guest control caching flags (only
+ usable with hv_install_virt_context.) */
+
#define HV_CTX_PG_SM_4K 0x10 /**< Use 4K small pages, if available. */
#define HV_CTX_PG_SM_16K 0x20 /**< Use 16K small pages, if available. */
#define HV_CTX_PG_SM_64K 0x40 /**< Use 64K small pages, if available. */
#define HV_CTX_PG_SM_MASK 0xf0 /**< Mask of all possible small pages. */
+
#ifndef __ASSEMBLER__
+/** Install a virtualization context.
+ *
+ * When a virtualization context is installed, all faults from PL0 or
+ * PL1 are handled via a "guest context" and then post-processed by
+ * the "virtualization context"; faults at PL2 are still handled by
+ * the normal context. For guest faults, the "guest PAs" produced by
+ * the guest page table are passed through the virtualization page
+ * table as pseudo-VAs, generating the true CPA as a result. See the
+ * individual HV_PTE_xxx bits for the effect the bits have when
+ * present in the virtualization page table. The ASID is currently
+ * ignored in this syscall, but it might be used later, so the API
+ * includes it. The HV_CTX_GUEST_CACHE flag indicates that all
+ * cache-related flags should be taken from the primary page table,
+ * not the virtualization page table.
+ *
+ * Once the virtualization context is installed, a guest context
+ * should also be installed; otherwise a VA-equals-PA context will be
+ * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
+ * the virtualization context to generate CPAs.
+ *
+ * When entering client PL after being at guest or user PL, the
+ * client is expected to call hv_flush_all() to clear any TLB mappings
+ * that might otherwise conflict. Similarly, hv_flush_all() should
+ * be called before returning to guest or user PL with a virtualization
+ * context installed, so that any TLB mappings are cleared. Future
+ * work may include adding a "vpid" or similar namespace so that
+ * the TLBs may be managed independently.
+ *
+ * Subsequent guest page table installations will have their root PA
+ * and PTE cached after translating through the virtualization
+ * context, so if entries in the virtualization page table are
+ * modified or removed, the guest context should be re-installed.
+ * This, in conjunction with flushing the TLB on return to the guest,
+ * will ensure that the new virtualization entries are honored.
+ *
+ * @param page_table Root of the page table.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for (currently ignored).
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current virtualization context (see below).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
+
+
+/** Install a guest context.
+ *
+ * The guest context is only consulted when a virtualization context
+ * is also installed, and for faults that occur below the client's PL.
+ * If no guest context is installed, a VA=PA context
+ * is used instead.
+ *
+ * The access PTE will only be honored if the virtualization table was
+ * installed with HV_CTX_GUEST_CACHE.
+ *
+ * A virtualization context must already be installed prior to
+ * installing the guest context.
+ *
+ * @param page_table Root of the page table; the value is the guest's
+ * physical address (GPA), not a CPA.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for.
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current context (HV_CTX_xxx).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
/** Set the number of pages ganged together by HV_PTE_SUPER at a
* particular level of the page table.
@@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
* "super" page size must be less than the span of the next level in
* the page table. The largest size that can be requested is 64GB.
*
- * The shift value is initially "0" for all page table levels,
+ * The shift value is initially 0 for all page table levels,
* indicating that the HV_PTE_SUPER bit is effectively ignored.
*
* If you change the count from one non-zero value to another, the
@@ -854,11 +954,26 @@ typedef struct
} HV_Context;
/** Retrieve information about the currently installed context.
- * @return The data passed to the last successful hv_install_context call.
+ * @return The data passed to the last successful call to
+ * hv_install_context().
*/
HV_Context hv_inquire_context(void);
+/** Retrieve information about the currently installed virtualization context.
+ * @return The data passed to the last successful call to
+ * hv_install_virt_context().
+ */
+HV_Context hv_inquire_virt_context(void);
+
+
+/** Retrieve information about the currently installed guest context.
+ * @return The data passed to the last successful call to
+ * hv_install_guest_context().
+ */
+HV_Context hv_inquire_guest_context(void);
+
+
/** Flushes all translations associated with the named address space
* identifier from the TLB and any other hypervisor data structures.
* Translations installed with the "global" bit are not flushed.
@@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
/** Flushes all non-global translations (if preserve_global is true),
* or absolutely all translations (if preserve_global is false).
*
- * @param preserve_global Non-zero if we want to preserve "global" mappings.
+ * @param preserve_global Non-zero if we want to preserve global mappings.
* @return Zero on success, or a hypervisor error code on failure.
*/
int hv_flush_all(int preserve_global);
@@ -991,7 +1106,11 @@ typedef enum {
HV_INQ_TILES_HFH_CACHE = 2,
/** The set of tiles that can be legally used as a LOTAR for a PTE. */
- HV_INQ_TILES_LOTAR = 3
+ HV_INQ_TILES_LOTAR = 3,
+
+ /** The set of "shared" driver tiles that the hypervisor may
+ * periodically interrupt. */
+ HV_INQ_TILES_SHARED = 4
} HV_InqTileSet;
/** Returns specific information about various sets of tiles within the
@@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
*/
/** Message receive downcall interrupt vector */
#define INT_MESSAGE_RCV_DWNCL INT_BOOT_ACCESS
+/** Device interrupt downcall interrupt vector */
+#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+#ifdef __tilegx__
+/** Virtualization page table miss downcall interrupt vector */
+#define INT_VPGTABLE_MISS_DWNCL INT_I_ASID
+/** Virtualization guest illegal page table */
+#define INT_VGUEST_FATAL_DWNCL INT_D_ASID
+#else
/** DMA TLB miss downcall interrupt vector */
#define INT_DMATLB_MISS_DWNCL INT_DMA_ASID
-/** Static nework processor instruction TLB miss interrupt vector */
-#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
/** DMA TLB access violation downcall interrupt vector */
#define INT_DMATLB_ACCESS_DWNCL INT_DMA_CPL
-/** Device interrupt downcall interrupt vector */
-#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+/** Static network processor instruction TLB miss interrupt vector */
+#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
+#endif
#ifndef __ASSEMBLER__
@@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
#define HV_PTE_PTFN_BITS 29 /**< Number of bits in a PTFN */
/*
- * Legal values for the PTE's mode field
+ * Legal values for the PTE's mode field.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
+ * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
+ * to access MMIO resources via pseudo PAs that map to MMIO in the
+ * virtualization page table.
*/
+
/** Data is not resident in any caches; loads and stores access memory
* directly.
*/
@@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the primary page table if a virtualization
+ * page table is installed.
*/
#define HV_PTE_GLOBAL (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
@@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the virtualization page table.
*/
#define HV_PTE_USER (__HV_PTE_ONE << HV_PTE_INDEX_USER)
@@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_ACCESSED (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
@@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_DIRTY (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
@@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NC (__HV_PTE_ONE << HV_PTE_INDEX_NC)
@@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit
* determines how the level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L1 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
@@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L2 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
@@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* the page map directly to memory.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_CACHED_PRIORITY (__HV_PTE_ONE << \
HV_PTE_INDEX_CACHED_PRIORITY)
@@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* It is illegal for this bit to be clear if the Writable bit is set.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Readable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_READABLE (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
@@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* PTE.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Writable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_WRITABLE (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
@@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* than one.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Executable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_EXECUTABLE (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
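
Taken together, these calls give the host a two-stage translation: guest PAs from the guest page table are re-translated through the virtualization page table to produce CPAs. A hedged sketch of the ordering the documentation above implies, using only the calls declared in this header (variable names are illustrative and error handling is minimal):

	/* Illustrative only: set up two-stage translation before entering a guest. */
	static int setup_guest_translation(HV_PhysAddr virt_pgd_cpa,
					   HV_PhysAddr guest_pgd_gpa,
					   HV_PTE access, HV_ASID asid)
	{
		int rc;

		/* Install the virtualization (guest-PA -> CPA) table first. */
		rc = hv_install_virt_context(virt_pgd_cpa, access, asid,
					     HV_CTX_GUEST_CACHE);
		if (rc != 0)
			return rc;

		/* Then the guest table; note its root is a guest PA, not a CPA. */
		rc = hv_install_guest_context(guest_pgd_gpa, access, asid, 0);
		if (rc != 0)
			return rc;

		/* Flush stale TLB entries before dropping back to guest PL. */
		return hv_flush_all(0);
	}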
diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
index e54b7b0..36fb24c 100644
--- a/arch/tile/include/uapi/arch/sim.h
+++ b/arch/tile/include/uapi/arch/sim.h
@@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
__insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
}
+/**
+ * Set vCPU number for a given task.
+ * @param vcpu Virtual cpu to set.
+ */
+static __inline void
+sim_set_vcpu(int vcpu)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+/** Clear vCPU status for a given task. */
+static __inline void
+sim_clear_vcpu(void)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (-1 << _SIM_CONTROL_OPERATOR_BITS));
+}
+
/*
* Event support.
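
These helpers are presumably meant to bracket the window in which a host thread is actually running a given virtual cpu, so the simulator attributes activity to the right vCPU. A hedged usage sketch (the surrounding function is invented for illustration):

	/* Illustrative only: tag this host thread while it runs a vcpu. */
	static void run_vcpu_tagged(int vcpu_id)
	{
		sim_set_vcpu(vcpu_id);	/* simulator now reports this task as vcpu_id */
		/* ... enter and run the guest here ... */
		sim_clear_vcpu();	/* drop the tag once back in plain host context */
	}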
diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
index 4b44a2b..b9aad66 100644
--- a/arch/tile/include/uapi/arch/sim_def.h
+++ b/arch/tile/include/uapi/arch/sim_def.h
@@ -221,6 +221,14 @@
*/
#define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
+/**
+ * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
+ * number shifted by 8, will tag any identification of the cpu the
+ * task is running on with the given virtual cpu number. If the
+ * virtual cpu number is -1, the tag is removed.
+ */
+#define SIM_CONTROL_VCPU 37
+
/*
* Syscall numbers for use with "sim_syscall()".
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446..4644c8d 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -121,6 +121,9 @@
#define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
#define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
#define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_GPV_SET_0 0x0600
+#define SPR_MPL_GPV_SET_1 0x0601
+#define SPR_MPL_GPV_SET_2 0x0602
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -142,6 +145,9 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x3400
#define SPR_MPL_IDN_TIMER_SET_1 0x3401
#define SPR_MPL_IDN_TIMER_SET_2 0x3402
+#define SPR_MPL_ILL_SET_0 0x0400
+#define SPR_MPL_ILL_SET_1 0x0401
+#define SPR_MPL_ILL_SET_2 0x0402
#define SPR_MPL_INTCTRL_0_SET_0 0x4a00
#define SPR_MPL_INTCTRL_0_SET_1 0x4a01
#define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -166,6 +172,12 @@
#define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
#define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
#define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
+#define SPR_MPL_SWINT_0_SET_0 0x1c00
+#define SPR_MPL_SWINT_0_SET_1 0x1c01
+#define SPR_MPL_SWINT_0_SET_2 0x1c02
+#define SPR_MPL_SWINT_1_SET_0 0x1a00
+#define SPR_MPL_SWINT_1_SET_1 0x1a01
+#define SPR_MPL_SWINT_1_SET_2 0x1a02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
@@ -187,6 +199,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x3600
#define SPR_MPL_UDN_TIMER_SET_1 0x3601
#define SPR_MPL_UDN_TIMER_SET_2 0x3602
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
#define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
#define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
#define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
index 67a6c17..727cda7 100644
--- a/arch/tile/include/uapi/arch/spr_def_64.h
+++ b/arch/tile/include/uapi/arch/spr_def_64.h
@@ -21,6 +21,10 @@
#define SPR_AUX_PERF_COUNT_1 0x2106
#define SPR_AUX_PERF_COUNT_CTL 0x2107
#define SPR_AUX_PERF_COUNT_STS 0x2108
+#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
+#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK 0xffffffff
+#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
+#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
#define SPR_CMPEXCH_VALUE 0x2780
#define SPR_CYCLE 0x2781
#define SPR_DONE 0x2705
@@ -101,6 +105,9 @@
#define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
#define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
#define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_GPV_SET_0 0x0900
+#define SPR_MPL_GPV_SET_1 0x0901
+#define SPR_MPL_GPV_SET_2 0x0902
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -116,6 +123,12 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x1800
#define SPR_MPL_IDN_TIMER_SET_1 0x1801
#define SPR_MPL_IDN_TIMER_SET_2 0x1802
+#define SPR_MPL_ILL_SET_0 0x0800
+#define SPR_MPL_ILL_SET_1 0x0801
+#define SPR_MPL_ILL_SET_2 0x0802
+#define SPR_MPL_ILL_TRANS_SET_0 0x1000
+#define SPR_MPL_ILL_TRANS_SET_1 0x1001
+#define SPR_MPL_ILL_TRANS_SET_2 0x1002
#define SPR_MPL_INTCTRL_0_SET_0 0x2500
#define SPR_MPL_INTCTRL_0_SET_1 0x2501
#define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -140,6 +153,15 @@
#define SPR_MPL_PERF_COUNT_SET_0 0x2000
#define SPR_MPL_PERF_COUNT_SET_1 0x2001
#define SPR_MPL_PERF_COUNT_SET_2 0x2002
+#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
+#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
+#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
+#define SPR_MPL_SWINT_0_SET_0 0x0f00
+#define SPR_MPL_SWINT_0_SET_1 0x0f01
+#define SPR_MPL_SWINT_0_SET_2 0x0f02
+#define SPR_MPL_SWINT_1_SET_0 0x0e00
+#define SPR_MPL_SWINT_1_SET_1 0x0e01
+#define SPR_MPL_SWINT_1_SET_2 0x0e02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -155,6 +177,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x1900
#define SPR_MPL_UDN_TIMER_SET_1 0x1901
#define SPR_MPL_UDN_TIMER_SET_2 0x1902
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
#define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
#define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
#define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index c20db8e..f07cc24 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -6,7 +6,9 @@ header-y += bitsperlong.h
header-y += byteorder.h
header-y += cachectl.h
header-y += hardwall.h
+header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_virtio.h
header-y += mman.h
header-y += ptrace.h
header-y += setup.h
diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
new file mode 100644
index 0000000..4346520
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_H
+#define _UAPI_ASM_TILE_KVM_H
+
+#ifndef __ASSEMBLER__
+#include <linux/ptrace.h>
+#endif
+
+#include <arch/abi.h>
+
+/*
+ * Hypervisor syscall numbers. These come from the hypervisor's syscall.h,
+ * with one small modification: HV_SYS_fence_incoherent is removed.
+ */
+/* Syscall allowed from guest PL bit mask. */
+#define HV_SYS_GUEST_SHIFT 12
+#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT)
+/* downcall_dispatch; this syscall number must be zero */
+#define HV_SYS_downcall_dispatch 0
+/* install_context */
+#define HV_SYS_install_context 1
+/* sysconf */
+#define HV_SYS_sysconf 2
+/* get_rtc */
+#define HV_SYS_get_rtc 3
+/* set_rtc */
+#define HV_SYS_set_rtc 4
+/* flush_asid */
+#define HV_SYS_flush_asid 5
+/* flush_page */
+#define HV_SYS_flush_page 6
+/* flush_pages */
+#define HV_SYS_flush_pages 7
+/* restart */
+#define HV_SYS_restart 8
+/* halt */
+#define HV_SYS_halt 9
+/* power_off */
+#define HV_SYS_power_off 10
+/* inquire_physical */
+#define HV_SYS_inquire_physical 11
+/* inquire_memory_controller */
+#define HV_SYS_inquire_memory_controller 12
+/* inquire_virtual */
+#define HV_SYS_inquire_virtual 13
+/* inquire_asid */
+#define HV_SYS_inquire_asid 14
+/* console_read_if_ready */
+#define HV_SYS_console_read_if_ready 15
+/* console_write */
+#define HV_SYS_console_write 16
+/* init */
+#define HV_SYS_init 17
+/* inquire_topology */
+#define HV_SYS_inquire_topology 18
+/* fs_findfile */
+#define HV_SYS_fs_findfile 19
+/* fs_fstat */
+#define HV_SYS_fs_fstat 20
+/* fs_pread */
+#define HV_SYS_fs_pread 21
+/* physaddr_read64 */
+#define HV_SYS_physaddr_read64 22
+/* physaddr_write64 */
+#define HV_SYS_physaddr_write64 23
+/* get_command_line */
+#define HV_SYS_get_command_line 24
+/* set_caching */
+#define HV_SYS_set_caching 25
+/* bzero_page */
+#define HV_SYS_bzero_page 26
+/* register_message_state */
+#define HV_SYS_register_message_state 27
+/* send_message */
+#define HV_SYS_send_message 28
+/* receive_message */
+#define HV_SYS_receive_message 29
+/* inquire_context */
+#define HV_SYS_inquire_context 30
+/* start_all_tiles */
+#define HV_SYS_start_all_tiles 31
+/* dev_open */
+#define HV_SYS_dev_open 32
+/* dev_close */
+#define HV_SYS_dev_close 33
+/* dev_pread */
+#define HV_SYS_dev_pread 34
+/* dev_pwrite */
+#define HV_SYS_dev_pwrite 35
+/* dev_poll */
+#define HV_SYS_dev_poll 36
+/* dev_poll_cancel */
+#define HV_SYS_dev_poll_cancel 37
+/* dev_preada */
+#define HV_SYS_dev_preada 38
+/* dev_pwritea */
+#define HV_SYS_dev_pwritea 39
+/* flush_remote */
+#define HV_SYS_flush_remote 40
+/* console_putc */
+#define HV_SYS_console_putc 41
+/* inquire_tiles */
+#define HV_SYS_inquire_tiles 42
+/* confstr */
+#define HV_SYS_confstr 43
+/* reexec */
+#define HV_SYS_reexec 44
+/* set_command_line */
+#define HV_SYS_set_command_line 45
+
+/* store_mapping */
+#define HV_SYS_store_mapping 52
+/* inquire_realpa */
+#define HV_SYS_inquire_realpa 53
+/* flush_all */
+#define HV_SYS_flush_all 54
+/* get_ipi_pte */
+#define HV_SYS_get_ipi_pte 55
+/* set_pte_super_shift */
+#define HV_SYS_set_pte_super_shift 56
+/* set_speed */
+#define HV_SYS_set_speed 57
+/* install_virt_context */
+#define HV_SYS_install_virt_context 58
+/* inquire_virt_context */
+#define HV_SYS_inquire_virt_context 59
+/* install_guest_context */
+#define HV_SYS_install_guest_context 60
+/* inquire_guest_context */
+#define HV_SYS_inquire_guest_context 61
+
+/*
+ * Base number for hypercalls (from guest OS to host OS) other than hv_*().
+ * We leave the first 128 entries for the usual hv_*() calls
+ * as defined in hypervisor.h.
+ */
+#define KVM_OTHER_HCALL 128
+
+/* Hypercall index for virtio. */
+#define KVM_HCALL_virtio 128
+
+/* One greater than the maximum hypercall number. */
+#define KVM_NUM_HCALLS 256
+
+#ifndef __ASSEMBLER__
+
+struct kvm_regs {
+ struct pt_regs regs;
+};
+
+#define FOR_EACH_GUEST_SPR(f) \
+ f(INTERRUPT_MASK_1); \
+ f(INTERRUPT_VECTOR_BASE_1); \
+ f(EX_CONTEXT_1_0); \
+ f(EX_CONTEXT_1_1); \
+ f(SYSTEM_SAVE_1_0); \
+ f(SYSTEM_SAVE_1_1); \
+ f(SYSTEM_SAVE_1_2); \
+ f(SYSTEM_SAVE_1_3); \
+ f(INTCTRL_1_STATUS); \
+ f(IPI_MASK_1); \
+ f(IPI_EVENT_1); \
+ f(SINGLE_STEP_CONTROL_1); \
+ f(SINGLE_STEP_EN_1_1); \
+
+struct kvm_sregs {
+#define DECLARE_SPR(f) unsigned long f
+ FOR_EACH_GUEST_SPR(DECLARE_SPR)
+#undef DECLARE_SPR
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#ifndef __KERNEL__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
+#endif
+
+#define HCALL_DEFS \
+ /* For hv_*() */ \
+ KVM_EMULATE(init) \
+ NO_EMULATE(install_context) \
+ KVM_EMULATE(sysconf) \
+ KVM_EMULATE(get_rtc) \
+ KVM_EMULATE(set_rtc) \
+ NO_EMULATE(flush_asid) \
+ NO_EMULATE(flush_page) \
+ NO_EMULATE(flush_pages) \
+ USER_EMULATE(restart) \
+ USER_EMULATE(halt) \
+ USER_EMULATE(power_off) \
+ USER_EMULATE(inquire_physical) \
+ USER_EMULATE(inquire_memory_controller) \
+ KVM_EMULATE(inquire_virtual) \
+ KVM_EMULATE(inquire_asid) \
+ NO_EMULATE(console_read_if_ready) \
+ NO_EMULATE(console_write) \
+ NO_EMULATE(downcall_dispatch) \
+ KVM_EMULATE(inquire_topology) \
+ USER_EMULATE(fs_findfile) \
+ USER_EMULATE(fs_fstat) \
+ USER_EMULATE(fs_pread) \
+ KVM_EMULATE(physaddr_read64) \
+ KVM_EMULATE(physaddr_write64) \
+ USER_EMULATE(get_command_line) \
+ USER_EMULATE(set_caching) \
+ NO_EMULATE(bzero_page) \
+ KVM_EMULATE(register_message_state) \
+ KVM_EMULATE(send_message) \
+ KVM_EMULATE(receive_message) \
+ KVM_EMULATE(inquire_context) \
+ KVM_EMULATE(start_all_tiles) \
+ USER_EMULATE(dev_open) \
+ USER_EMULATE(dev_close) \
+ USER_EMULATE(dev_pread) \
+ USER_EMULATE(dev_pwrite) \
+ USER_EMULATE(dev_poll) \
+ USER_EMULATE(dev_poll_cancel) \
+ USER_EMULATE(dev_preada) \
+ USER_EMULATE(dev_pwritea) \
+ USER_EMULATE(flush_remote) \
+ NO_EMULATE(console_putc) \
+ KVM_EMULATE(inquire_tiles) \
+ KVM_EMULATE(confstr) \
+ USER_EMULATE(reexec) \
+ USER_EMULATE(set_command_line) \
+ USER_EMULATE(store_mapping) \
+ NO_EMULATE(inquire_realpa) \
+ NO_EMULATE(flush_all) \
+ KVM_EMULATE(get_ipi_pte) \
+ KVM_EMULATE(set_pte_super_shift) \
+ KVM_EMULATE(set_speed) \
+ /* For others */ \
+ USER_HCALL(virtio)
+
+#endif
+
+#endif /* _UAPI_ASM_TILE_KVM_H */
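
The #ifndef __KERNEL__ block suggests HCALL_DEFS is meant to be expanded by a userspace emulator into a dispatch table indexed by hypercall number. Purely as a hedged illustration of that X-macro pattern (the handler typedef, its signature, and the qemu_emulate_*/qemu_handle_* functions are assumptions supplied by whatever emulator includes this header):

	/* Illustrative sketch only; the emulator must declare the handlers. */
	typedef void (*hcall_handler_t)(void *cpu);	/* assumed common signature */

	static hcall_handler_t hcall_table[KVM_NUM_HCALLS] = {
		HCALL_DEFS	/* expands to "[HV_SYS_xxx] = handler," entries */
	};

	static void dispatch_hcall(unsigned int num, void *cpu)
	{
		if (num < KVM_NUM_HCALLS && hcall_table[num])
			hcall_table[num](cpu);
	}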
diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
new file mode 100644
index 0000000..d94f535
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm_virtio.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
+#define _UAPI_ASM_TILE_KVM_VIRTIO_H
+
+#include <linux/types.h>
+
+#define KVM_VIRTIO_UNKNOWN 0
+#define KVM_VIRTIO_NOTIFY 1
+#define KVM_VIRTIO_RESET 2
+#define KVM_VIRTIO_SET_STATUS 3
+
+struct kvm_device_desc {
+ /* The device type: console, network, disk etc. Type 0 terminates. */
+ __u8 type;
+ /* The number of virtqueues (first in config array) */
+ __u8 num_vq;
+ /*
+ * The number of bytes of feature bits. Multiply by 2: one for host
+ * features and one for Guest acknowledgements.
+ */
+ __u8 feature_len;
+ /* The number of bytes of the config array after virtqueues. */
+ __u8 config_len;
+ /* A status byte, written by the Guest. */
+ __u8 status;
+ __u64 config[0];
+};
+
+struct kvm_vqinfo {
+ /* Pointer to the information contained in the device config. */
+ struct kvm_vqconfig *config;
+ /* The address where we mapped the virtio ring, so we can unmap it. */
+ void *pages;
+};
+
+struct kvm_vqconfig {
+ /* The physical address of the virtio ring */
+ __u64 pa;
+ /* The number of entries in the virtio_ring */
+ __u64 num;
+ /* The interrupt we get when something happens. Set by the guest. */
+ __u32 irq;
+
+};
+
+
+#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
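
Each device in the shared config page is described by a kvm_device_desc followed by its kvm_vqconfig entries, two sets of feature bits, and the config space, with a type of 0 terminating the list; kvm_virtio.c below computes the same layout in desc_size(). A hedged sketch of how a consumer might walk the page (helper names are illustrative):

	/* Illustrative only: step to the next device descriptor in the page. */
	static struct kvm_device_desc *next_desc(struct kvm_device_desc *d)
	{
		unsigned int len = sizeof(*d)
			+ d->num_vq * sizeof(struct kvm_vqconfig)
			+ d->feature_len * 2	/* host features + guest acks */
			+ d->config_len;
		return (struct kvm_device_desc *)((__u8 *)d + len);
	}

	/* for (d = page; d->type != 0; d = next_desc(d)) probe_device(d); */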
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b7c8b5e..b638d3e 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o
obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o
obj-y += vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 97ea6ac..0a04a16 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -20,6 +20,9 @@
#include <linux/hardirq.h>
#include <linux/ptrace.h>
#include <hv/hypervisor.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
/* Check for compatible compiler early in the build. */
#ifdef CONFIG_TILEGX
@@ -68,6 +71,10 @@ void foo(void)
DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
offsetof(struct thread_info, unalign_jit_tmp));
#endif
+#ifdef CONFIG_KVM
+ DEFINE(THREAD_INFO_VCPU_OFFSET,
+ offsetof(struct thread_info, vcpu));
+#endif
DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
offsetof(struct task_struct, thread.ksp));
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00..53f2be4 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -18,11 +18,26 @@
#include <linux/string.h>
#include <linux/irqflags.h>
#include <linux/printk.h>
+#ifdef CONFIG_KVM_GUEST
+#include <linux/virtio_console.h>
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+#endif
#include <asm/setup.h>
#include <hv/hypervisor.h>
static void early_hv_write(struct console *con, const char *s, unsigned n)
{
+#ifdef CONFIG_KVM_GUEST
+ char buf[512];
+
+ if (n > sizeof(buf) - 1)
+ n = sizeof(buf) - 1;
+ memcpy(buf, s, n);
+ buf[n] = '\0';
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
+#else
tile_console_write(s, n);
/*
@@ -32,6 +47,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n)
*/
if (n && s[n-1] == '\n')
tile_console_write("\r", 1);
+#endif
}
static struct console early_hv_console = {
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index f3f17b0..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
.set addr, addr + PGDIR_SIZE
.endr
- /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
- PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+ /* The true text VAs are mapped as VA = PA + MEM_SV_START */
+ PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
(1 << (HV_PTE_INDEX_EXECUTABLE - 32))
.org swapper_pg_dir + PGDIR_SIZE
END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 652b814..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -135,9 +135,9 @@ ENTRY(_start)
1:
/* Install the interrupt base. */
- moveli r0, hw2_last(MEM_SV_START)
- shl16insli r0, r0, hw1(MEM_SV_START)
- shl16insli r0, r0, hw0(MEM_SV_START)
+ moveli r0, hw2_last(intrpt_start)
+ shl16insli r0, r0, hw1(intrpt_start)
+ shl16insli r0, r0, hw0(intrpt_start)
mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
/* Get our processor number and save it away in SAVE_K_0. */
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
index 16576c6..2914a9e 100644
--- a/arch/tile/kernel/hvglue.S
+++ b/arch/tile/kernel/hvglue.S
@@ -71,5 +71,11 @@ gensym hv_flush_all, 0x6e0, 32
gensym hv_get_ipi_pte, 0x700, 32
gensym hv_set_pte_super_shift, 0x720, 32
gensym hv_set_speed, 0x740, 32
+gensym hv_install_virt_context, 0x760, 32
+gensym hv_inquire_virt_context, 0x780, 32
+gensym hv_install_guest_context, 0x7a0, 32
+gensym hv_inquire_guest_context, 0x7c0, 32
gensym hv_console_set_ipi, 0x7e0, 32
-gensym hv_glue_internals, 0x800, 30720
+gensym hv_glue_internals, 0x800, 2048
+gensym hcall_virtio, 0x1000, 32
+gensym hv_hcall_internals, 0x1020, 28640
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
index 16ef6c1..3b15c76 100644
--- a/arch/tile/kernel/hvglue_trace.c
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -75,6 +75,10 @@
#define hv_get_ipi_pte _hv_get_ipi_pte
#define hv_set_pte_super_shift _hv_set_pte_super_shift
#define hv_set_speed _hv_set_speed
+#define hv_install_virt_context _hv_install_virt_context
+#define hv_inquire_virt_context _hv_inquire_virt_context
+#define hv_install_guest_context _hv_install_guest_context
+#define hv_inquire_guest_context _hv_inquire_guest_context
#define hv_console_set_ipi _hv_console_set_ipi
#include <hv/hypervisor.h>
#undef hv_init
@@ -135,6 +139,10 @@
#undef hv_get_ipi_pte
#undef hv_set_pte_super_shift
#undef hv_set_speed
+#undef hv_install_virt_context
+#undef hv_inquire_virt_context
+#undef hv_install_guest_context
+#undef hv_inquire_guest_context
#undef hv_console_set_ipi
/*
@@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
unsigned long, flags)
HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP0(HV_Context, hv_inquire_virt_context)
+HV_WRAP0(HV_Context, hv_inquire_guest_context)
HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f3d26f4..2ce69a5 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -353,7 +353,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -806,7 +806,7 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnz r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnz r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
lw r29, r29
@@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
seq r27, r27, r28
}
{
- bbns r27, .Lrestore_all
+ bbns r27, restore_all
addi r28, r28, 8
}
sw r29, r28
- j .Lrestore_all
+ j restore_all
.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
auli r1, r1, ha16(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- bzt r1, .Lrestore_all
+ bzt r1, restore_all
/*
* Make sure we have all the registers saved for signal
@@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
* profile interrupt will actually disable interrupts in both SPRs
* before returning, which is OK.)
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
lw r0, r0
@@ -1890,8 +1892,8 @@ int_unalign:
push_extra_callee_saves r0
j do_trap
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 18b2dcc..2c5cbe0 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -29,11 +29,25 @@
#include <arch/abi.h>
#include <arch/interrupts.h>
#include <arch/spr_def.h>
+#include <arch/opcode.h>
+#ifdef CONFIG_KVM
+#include <asm/kvm_host.h>
+#endif
#define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
#define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set). Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
.macro push_reg reg, ptr=sp, delta=-8
{
@@ -308,7 +322,7 @@ intvec_\vecname:
*/
{
blbs sp, 2f
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r0, r0)
}
.ifc \vecnum, INT_DOUBLE_FAULT
@@ -347,10 +361,6 @@ intvec_\vecname:
*
* Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
* any path that turns into a downcall to one of our TLB handlers.
- *
- * FIXME: if we end up never using this path, perhaps we should
- * prevent the hypervisor from generating downcalls in this case.
- * The advantage of getting a downcall is we can panic in Linux.
*/
mfspr r0, SPR_SYSTEM_SAVE_K_2
{
@@ -490,6 +500,10 @@ intvec_\vecname:
mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
.else
+ .ifc \c_routine, kvm_vpgtable_miss
+ mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
+ mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
+ .else
.ifc \vecnum, INT_ILL_TRANS
mfspr r2, ILL_VA_PC
.else
@@ -512,6 +526,7 @@ intvec_\vecname:
.endif
.endif
.endif
+ .endif
/* Put function pointer in r0 */
moveli r0, hw2_last(\c_routine)
shl16insli r0, r0, hw1(\c_routine)
@@ -525,7 +540,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -641,24 +656,25 @@ intvec_\vecname:
/*
* If we will be returning to the kernel, we will need to
* reset the interrupt masks to the state they had before.
- * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+ * Set DISABLE_IRQ in flags iff we came from kernel pl with
+ * irqs disabled.
*/
- mfspr r32, SPR_EX_CONTEXT_K_1
+ mfspr r22, SPR_EX_CONTEXT_K_1
{
- andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r22, r22)
PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
}
- beqzt r32, 1f /* zero if from user space */
- IRQS_DISABLED(r32) /* zero if irqs enabled */
+ beqzt r22, 1f /* zero if from user space */
+ IRQS_DISABLED(r22) /* zero if irqs enabled */
#if PT_FLAGS_DISABLE_IRQ != 1
# error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
#endif
1:
.ifnc \function,handle_syscall
/* Record the fact that we saved the caller-save registers above. */
- ori r32, r32, PT_FLAGS_CALLER_SAVES
+ ori r22, r22, PT_FLAGS_CALLER_SAVES
.endif
- st r21, r32
+ st r21, r22
/*
* we've captured enough state to the stack (including in
@@ -698,12 +714,29 @@ intvec_\vecname:
move tp, zero
#endif
+ /*
+ * Prepare the first 256 stack bytes to be rapidly accessible
+ * without having to fetch the background data.
+ */
+ addi r52, sp, -64
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ wh64 r52
+
#ifdef __COLLECT_LINKER_FEEDBACK__
/*
* Notify the feedback routines that we were in the
- * appropriate fixed interrupt vector area. Note that we
- * still have ICS set at this point, so we can't invoke any
- * atomic operations or we will panic. The feedback
+ * appropriate fixed interrupt vector area. The feedback
* routines internally preserve r0..r10 and r30 up.
*/
.ifnc \function,handle_syscall
@@ -722,23 +755,15 @@ intvec_\vecname:
#endif
/*
- * Prepare the first 256 stack bytes to be rapidly accessible
- * without having to fetch the background data.
+ * Stash any interrupt state in r30..r33 for now.
+ * This makes it easier to call C code in the code that follows.
+ * We don't need to on the syscall path since we reload
+ * them from the stack instead.
*/
- addi r52, sp, -64
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- wh64 r52
+ .ifnc \function,handle_syscall
+ { move r30, r0; move r31, r1 }
+ { move r32, r2; move r33, r3 }
+ .endif
#ifdef CONFIG_TRACE_IRQFLAGS
.ifnc \function,handle_nmi
@@ -749,17 +774,8 @@ intvec_\vecname:
* For syscalls, we already have the register state saved away
* on the stack, so we don't bother to do any register saves here,
* and later we pop the registers back off the kernel stack.
- * For interrupt handlers, save r0-r3 in callee-saved registers.
*/
- .ifnc \function,handle_syscall
- { move r30, r0; move r31, r1 }
- { move r32, r2; move r33, r3 }
- .endif
TRACE_IRQS_OFF
- .ifnc \function,handle_syscall
- { move r0, r30; move r1, r31 }
- { move r2, r32; move r3, r33 }
- .endif
.endif
#endif
@@ -808,11 +824,11 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnez r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnez r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
ld r29, r29
- andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r29, r29)
{
beqzt r29, .Lresume_userspace
move r29, sp
@@ -824,14 +840,25 @@ STD_ENTRY(interrupt_return)
addli r28, r29, THREAD_INFO_FLAGS_OFFSET
{
ld r28, r28
- addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+ addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
}
{
- andi r28, r28, _TIF_NEED_RESCHED
- ld4s r29, r29
+ andi r27, r28, _TIF_NEED_RESCHED
+ ld4s r26, r26
}
- beqzt r28, 1f
- bnez r29, 1f
+ beqzt r27, 1f
+ bnez r26, 1f
+#ifdef CONFIG_KVM
+ addli r27, r29, THREAD_INFO_VCPU_OFFSET
+ ld r27, r27
+ {
+ beqzt r27, 0f
+ movei r1, KVM_EXIT_AGAIN
+ }
+ push_extra_callee_saves r0
+ j kvm_trigger_vmexit
+0:
+#endif
jal preempt_schedule_irq
FEEDBACK_REENTER(interrupt_return)
1:
@@ -853,11 +880,11 @@ STD_ENTRY(interrupt_return)
cmpeq r27, r27, r28
}
{
- blbc r27, .Lrestore_all
+ blbc r27, restore_all
addi r28, r28, 8
}
st r29, r28
- j .Lrestore_all
+ j restore_all
.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -897,7 +924,7 @@ STD_ENTRY(interrupt_return)
shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- beqzt r1, .Lrestore_all
+ beqzt r1, restore_all
/*
* Make sure we have all the registers saved for signal
@@ -929,14 +956,16 @@ STD_ENTRY(interrupt_return)
* ICS can only be used in very tight chunks of code to avoid
* tripping over various assertions that it is off.
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
ld r0, r0
PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
}
{
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+ IS_KERNEL_EX1(r0, r0)
ld r32, r32
}
bnez r0, 1f
@@ -1007,7 +1036,7 @@ STD_ENTRY(interrupt_return)
pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
{
mtspr SPR_EX_CONTEXT_K_1, lr
- andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(lr, lr)
}
{
mtspr SPR_EX_CONTEXT_K_0, r21
@@ -1457,6 +1486,26 @@ int_unalign:
j do_unaligned
ENDPROC(hand_unalign_slow)
+#ifdef CONFIG_KVM
+/*
+ * Any call path that may lead to a vmexit needs to save the full
+ * callee-save register state, since if we vmexit we don't unwind
+ * the callee-saves from the C function stack frames, and instead
+ * just save away the register state from the interrupt handler as-is
+ * and later reload it directly and call back into the guest.
+ */
+ .macro save_callee_saves_and_tailcall func
+kvm_\func:
+ push_extra_callee_saves r0
+ j kvm_do_\func
+	ENDPROC(kvm_\func)
+ .endm
+
+ save_callee_saves_and_tailcall hypervisor_call
+ save_callee_saves_and_tailcall vpgtable_miss
+ save_callee_saves_and_tailcall vguest_fatal
+#endif
+
/* Fill the return address stack with nonzero entries. */
STD_ENTRY(fill_ra_stack)
{
@@ -1469,13 +1518,57 @@ STD_ENTRY(fill_ra_stack)
4: jrp r0
STD_ENDPROC(fill_ra_stack)
+#ifdef CONFIG_KVM
+/*
+ * Handle the downcall dispatch service. On entry, the client's
+ * system save register 3 holds the original contents of
+ * REG_SYSCALL_NR_NAME, which we need to restore before we iret to
+ * the correct interrupt vector.
+ * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
+ * here, since this is the only interrupt handled this way on GX.
+ */
+handle_downcall_dispatch:
+ /*
+ * If we were called from PL0, jump back to slow path.
+ * We check just the low bit to make sure it's set, since we
+ * can only be called from PL0 or PL1.
+ */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
+ blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0
+
+ /* Set the PC to the downcall interrupt vector, and PL to guest. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
+ addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
+ INT_MESSAGE_RCV_DWNCL << 8
+ {
+ mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
+ movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
+ }
+ mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
+
+ /* Restore REG_SYSCALL_NR_NAME and return to the new vector. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
+ iret
+
+ .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \
+ processing=handle_interrupt
+ .org (\vecnum << 8)
+ /* Need special code for downcall dispatch syscall. */
+ beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
+ __int_hand \vecnum, \vecname, \c_routine, \processing
+ .endm
+
+#endif /* CONFIG_KVM */
+
.macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt
.org (\vecnum << 8)
__int_hand \vecnum, \vecname, \c_routine, \processing
.endm
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
+ .global intrpt_start
+intrpt_start:
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
@@ -1484,6 +1577,11 @@ STD_ENTRY(fill_ra_stack)
#define do_hardwall_trap bad_intr
#endif
+#ifndef CONFIG_KVM
+#define kvm_vpgtable_miss bad_intr
+#define kvm_vguest_fatal bad_intr
+#endif
+
int_hand INT_MEM_ERROR, MEM_ERROR, do_trap
int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
#if CONFIG_KERNEL_PL == 2
@@ -1504,14 +1602,24 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_SWINT_3, SWINT_3, do_trap
int_hand INT_SWINT_2, SWINT_2, do_trap
int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
+#ifdef CONFIG_KVM
+ int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
+#else
int_hand INT_SWINT_0, SWINT_0, do_trap
+#endif
int_hand INT_ILL_TRANS, ILL_TRANS, do_trap
int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault
int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
+#ifndef CONFIG_KVM_GUEST
int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr
+#else
+ int_hand INT_TILE_TIMER, TILE_TIMER, bad_intr
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt
+#endif
int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr
int_hand INT_UDN_TIMER, UDN_TIMER, bad_intr
int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr
@@ -1541,8 +1649,10 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
hv_message_intr
int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
- int_hand INT_I_ASID, I_ASID, bad_intr
- int_hand INT_D_ASID, D_ASID, bad_intr
+ int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
+ kvm_vpgtable_miss
+ int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
+ kvm_vguest_fatal
int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
/* Synthetic interrupt delivered only by the simulator */
diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c
new file mode 100644
index 0000000..c6b6c6a
--- /dev/null
+++ b/arch/tile/kernel/kvm_virtio.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+/* Based on the lguest and s390 implementations. */
+/*
+ * kvm_virtio.c - virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): Christian Borntraeger <[email protected]>
+ */
+
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+
+static void *kvm_devices;
+
+/*
+ * TODO: We do not actually use PCI virtio here; we use this value
+ * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
+ * Maybe we should switch both qemu and Linux to generic definitions.
+ * We should also check later whether the alignment value (4096, i.e.
+ * the default x86 page size) affects performance.
+ */
+#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN
+#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
+
+/*
+ * memory layout: (Total: PAGE_SIZE)
+ * <device 0>
+ * - kvm device descriptor
+ * struct kvm_device_desc
+ * - vqueue configuration (totally desc->num_vq)
+ * struct kvm_vqconfig
+ * ......
+ * struct kvm_vqconfig
+ * - feature bits (size: desc->feature_len * 2)
+ * - config space (size: desc->config_len)
+ * <device 1>
+ * ......
+ */
+static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
+{
+ return (struct kvm_vqconfig *)(desc + 1);
+}
+
+static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
+{
+ return (u8 *)(kvm_vq_config(desc) + desc->num_vq);
+}
+
+static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
+{
+ return kvm_vq_features(desc) + desc->feature_len * 2;
+}
+
+/*
+ * The total size of the config page used by this device (incl. desc)
+ */
+static unsigned desc_size(const struct kvm_device_desc *desc)
+{
+ return sizeof(*desc)
+ + desc->num_vq * sizeof(struct kvm_vqconfig)
+ + desc->feature_len * 2
+ + desc->config_len;
+}
+
+/* This gets the device's feature bits. */
+static u32 kvm_get_features(struct virtio_device *vdev)
+{
+ unsigned int i;
+ u32 features = 0;
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+ u8 *in_features = kvm_vq_features(desc);
+
+ for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+ if (in_features[i / 8] & (1 << (i % 8)))
+ features |= (1 << i);
+ return features;
+}
+
+static void kvm_finalize_features(struct virtio_device *vdev)
+{
+ unsigned int i, bits;
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+ /* Second half of bitmap is features we accept. */
+ u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ memset(out_features, 0, desc->feature_len);
+ bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+ for (i = 0; i < bits; i++) {
+ if (test_bit(i, vdev->features))
+ out_features[i / 8] |= (1 << (i % 8));
+ }
+}
+
+/*
+ * Reading and writing elements in config space
+ */
+static void kvm_get(struct virtio_device *vdev, unsigned int offset,
+ void *buf, unsigned len)
+{
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+ BUG_ON(offset + len > desc->config_len);
+ memcpy(buf, kvm_vq_configspace(desc) + offset, len);
+}
+
+static void kvm_set(struct virtio_device *vdev, unsigned int offset,
+ const void *buf, unsigned len)
+{
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+ BUG_ON(offset + len > desc->config_len);
+ memcpy(kvm_vq_configspace(desc) + offset, buf, len);
+}
+
+/*
+ * The operations to get and set the status word just access
+ * the status field of the device descriptor. set_status will also
+ * make a hypercall to the host to notify it of status changes.
+ */
+static u8 kvm_get_status(struct virtio_device *vdev)
+{
+ return to_kvmdev(vdev)->desc->status;
+}
+
+static void kvm_set_status(struct virtio_device *vdev, u8 status)
+{
+ BUG_ON(!status);
+ to_kvmdev(vdev)->desc->status = status;
+ hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
+ * descriptor address. The Host will zero the status and all the
+ * features.
+ */
+static void kvm_reset(struct virtio_device *vdev)
+{
+ hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * When the virtio_ring code wants to notify the Host, it calls us here and we
+ * make a hypercall. We hand the address of the virtqueue so the Host
+ * knows which virtqueue we're talking about.
+ */
+static void kvm_notify(struct virtqueue *vq)
+{
+ struct kvm_vqinfo *vqi = vq->priv;
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa);
+}
+
+/*
+ * Must set some caching mode to keep set_pte() happy.
+ * It doesn't matter what we choose, because the PFN
+ * is illegal, so we're going to take a page fault anyway.
+ */
+static inline pgprot_t io_prot(void)
+{
+ return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
+}
+
+/*
+ * This routine finds the first virtqueue described in the configuration of
+ * this device and sets it up.
+ */
+static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
+ unsigned index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ struct kvm_vqinfo *vqi;
+ struct kvm_vqconfig *config;
+ struct virtqueue *vq;
+ long irq;
+ int err = -EINVAL;
+
+ if (index >= kdev->desc->num_vq)
+ return ERR_PTR(-ENOENT);
+
+ vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
+ if (!vqi)
+ return ERR_PTR(-ENOMEM);
+
+ config = kvm_vq_config(kdev->desc)+index;
+
+ vqi->config = config;
+ vqi->pages = generic_remap_prot(config->pa,
+ vring_size(config->num,
+ KVM_TILE_VIRTIO_RING_ALIGN),
+ 0, io_prot());
+ if (!vqi->pages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
+ vdev, 0, vqi->pages,
+ kvm_notify, callback, name);
+ if (!vq) {
+ err = -ENOMEM;
+ goto unmap;
+ }
+
+ /*
+ * Trigger the IPI interrupt in software.
+ * TODO: We don't really need to create one irq per vq; it's a bit wasteful.
+ */
+ irq = create_irq();
+ if (irq < 0) {
+ err = -ENXIO;
+ goto del_virtqueue;
+ }
+
+ tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
+
+ if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
+ err = -ENXIO;
+ destroy_irq(irq);
+ goto del_virtqueue;
+ }
+
+ config->irq = irq;
+
+ vq->priv = vqi;
+ return vq;
+
+del_virtqueue:
+ vring_del_virtqueue(vq);
+unmap:
+ vunmap(vqi->pages);
+out:
+ return ERR_PTR(err);
+}
+
+static void kvm_del_vq(struct virtqueue *vq)
+{
+ struct kvm_vqinfo *vqi = vq->priv;
+
+ vring_del_virtqueue(vq);
+ vunmap(vqi->pages);
+ kfree(vqi);
+}
+
+static void kvm_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ kvm_del_vq(vq);
+}
+
+static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[])
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ int i;
+
+ /* The device must supply at least this many virtqueues. */
+ if (nvqs > kdev->desc->num_vq)
+ return -ENOENT;
+
+ for (i = 0; i < nvqs; ++i) {
+ vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
+ if (IS_ERR(vqs[i]))
+ goto error;
+ }
+ return 0;
+
+error:
+ kvm_del_vqs(vdev);
+ return PTR_ERR(vqs[i]);
+}
+
+/*
+ * The config ops structure as defined by virtio config
+ */
+static struct virtio_config_ops kvm_vq_config_ops = {
+ .get_features = kvm_get_features,
+ .finalize_features = kvm_finalize_features,
+ .get = kvm_get,
+ .set = kvm_set,
+ .get_status = kvm_get_status,
+ .set_status = kvm_set_status,
+ .reset = kvm_reset,
+ .find_vqs = kvm_find_vqs,
+ .del_vqs = kvm_del_vqs,
+};
+
+/*
+ * The root device for the kvm virtio devices.
+ * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
+ */
+static struct device *kvm_root;
+
+/*
+ * Add a new device and register it with virtio;
+ * the appropriate driver is loaded by the device model.
+ */
+static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
+{
+ struct kvm_device *kdev;
+
+ kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+ if (!kdev) {
+ pr_emerg("Cannot allocate kvm dev %u type %u\n",
+ offset, d->type);
+ return;
+ }
+
+ kdev->vdev.dev.parent = kvm_root;
+ kdev->vdev.id.device = d->type;
+ kdev->vdev.config = &kvm_vq_config_ops;
+ kdev->desc = d;
+ kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
+
+ if (register_virtio_device(&kdev->vdev) != 0) {
+ pr_err("Failed to register kvm device %u type %u\n",
+ offset, d->type);
+ kfree(kdev);
+ }
+}
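+
+/*
+ * Note that desc_pa is the guest-physical address of this descriptor
+ * within the device page that sits just above guest memory at
+ * PFN_PHYS(max_pfn) (see kvm_devices_init() below); the host uses it to
+ * identify the device on the SET_STATUS and RESET hypercalls.
+ */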
+
+/*
+ * scan_devices() simply iterates through the device page.
+ * Type 0 is reserved to mean "end of devices".
+ */
+static void scan_devices(void)
+{
+ unsigned int i;
+ struct kvm_device_desc *d;
+
+ for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
+ d = kvm_devices + i;
+
+ if (d->type == 0)
+ break;
+
+ add_kvm_device(d, i);
+ }
+}
+
+/*
+ * Init function for virtio.
+ * Devices are described in a single page just above the top of "normal" memory.
+ */
+static int __init kvm_devices_init(void)
+{
+ int rc = -ENOMEM;
+
+ kvm_root = root_device_register("kvm_tile");
+ if (IS_ERR(kvm_root)) {
+ rc = PTR_ERR(kvm_root);
+ pr_err("Could not register kvm_tile root device");
+ return rc;
+ }
+
+ kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
+ 0, io_prot());
+ if (!kvm_devices) {
+ kvm_devices = NULL;
+ root_device_unregister(kvm_root);
+ return rc;
+ }
+
+ scan_devices();
+ return 0;
+}
+
+/* code for early console output with virtio_console */
+static __init int early_put_chars(u32 vtermno, const char *buf, int len)
+{
+ char scratch[512];
+
+ if (len > sizeof(scratch) - 1)
+ len = sizeof(scratch) - 1;
+ scratch[len] = '\0';
+ memcpy(scratch, buf, len);
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
+
+ return len;
+}
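+
+/*
+ * Note: the buffer is passed by physical address and NUL-terminated, so
+ * the host side can presumably treat a NOTIFY on a non-virtqueue address
+ * as an early-console string; this depends on the host/qemu implementation.
+ */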
+
+static int __init tile_virtio_console_init(void)
+{
+ return virtio_cons_early_init(early_put_chars);
+}
+console_initcall(tile_virtio_console_init);
+
+/*
+ * We do this after core stuff, but before the drivers.
+ */
+postcore_initcall(kvm_devices_init);
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 44cdc4a..2629ff1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -27,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/tracehook.h>
#include <linux/signal.h>
+#include <linux/kvm_host.h>
#include <asm/stack.h>
#include <asm/switch_to.h>
#include <asm/homecache.h>
@@ -247,11 +248,13 @@ struct task_struct *validate_current(void)
/* Take and return the pointer to the previous task, for schedule_tail(). */
struct task_struct *sim_notify_fork(struct task_struct *prev)
{
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
struct task_struct *tsk = current;
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT |
(tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS));
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK |
(tsk->pid << _SIM_CONTROL_OPERATOR_BITS));
+#endif
return prev;
}
@@ -450,6 +453,11 @@ void _prepare_arch_switch(struct task_struct *next)
struct task_struct *__sched _switch_to(struct task_struct *prev,
struct task_struct *next)
{
+#ifdef CONFIG_KVM
+ /* vmexit is needed before context switch. */
+ BUG_ON(task_thread_info(prev)->vcpu);
+#endif
+
/* DMA state is already saved; save off other arch state. */
save_arch_state(&prev->thread);
@@ -519,6 +527,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
/* Enable interrupts; they are disabled again on return to caller. */
local_irq_enable();
+#ifdef CONFIG_KVM
+ /*
+ * Some work requires us to exit the VM first. Typically this
+ * allows the process running the VM to respond to the work
+ * (e.g. a signal), or allows the VM mechanism to latch
+ * modified host state (e.g. a "hypervisor" message sent to a
+ * different vcpu). It also means that if we are considering
+ * calling schedule(), we exit the VM first, so we never have
+ * to worry about context-switching into a VM.
+ */
+ if (current_thread_info()->vcpu) {
+ u32 do_exit = thread_info_flags &
+ (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
+
+ if (thread_info_flags & _TIF_VIRT_EXIT)
+ clear_thread_flag(TIF_VIRT_EXIT);
+ if (do_exit) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
+ /*NORETURN*/
+ }
+ }
+#endif
+
if (thread_info_flags & _TIF_NEED_RESCHED) {
schedule();
return 1;
@@ -538,11 +569,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
tracehook_notify_resume(regs);
return 1;
}
- if (thread_info_flags & _TIF_SINGLESTEP) {
+
+ /* Handle a few flags here that stay set. */
+ if (thread_info_flags & _TIF_SINGLESTEP)
single_step_once(regs);
- return 0;
- }
- panic("work_pending: bad flags %#x\n", thread_info_flags);
+
+ return 0;
}
unsigned long get_wchan(struct task_struct *p)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..02bc446 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
addi sp, sp, -8
/* we now have a stack (whether we need one or not) */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r40, hw2_last(hv_console_putc)
shl16insli r40, r40, hw1(hv_console_putc)
shl16insli r40, r40, hw0(hv_console_putc)
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, 'r'
jalr r40
@@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
/* we should not get here */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, '?'
jalr r40
moveli r0, '\n'
jalr r40
+#endif
j .Lhalt
@@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
j .Lloop
-.Lerr: moveli r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+ moveli r0, 'e'
jalr r40
moveli r0, 'r'
jalr r40
@@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
jalr r40
moveli r0, '\n'
jalr r40
+#endif
.Lhalt:
moveli r41, hw2_last(hv_halt)
shl16insli r41, r41, hw1(hv_halt)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 774e819..2352a81 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
/*
* Determine for each controller where its lowmem is mapped and how much of
* it is mapped there. On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
* start our data mappings higher up, but for now we don't bother, to avoid
* additional confusion.
*
@@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
* SPRs, as well as the interrupt mask.
*/
__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
+
+#ifdef CONFIG_KVM
+ /*
+ * If we launch a guest kernel, it will need some interrupts
+ * that otherwise are not used by the host or by userspace.
+ * Set them to MPL 1 now and leave them alone going forward;
+ * they are masked in the host so will never fire there anyway,
+ * and we mask them at PL1 as we exit the guest.
+ */
__insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
+ __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
+#endif
/* Initialize IRQ support for this cpu. */
setup_irq_regs();
@@ -1242,7 +1255,7 @@ static void __init validate_va(void)
#ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
/*
* Similarly, make sure we're only using allowed VAs.
- * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+ * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
* and 0 .. KERNEL_HIGH_VADDR.
* In addition, make sure we CAN'T use the end of memory, since
* we use the last chunk of each pgd for the pgd_list.
@@ -1257,7 +1270,7 @@ static void __init validate_va(void)
if (range.size == 0)
break;
if (range.start <= MEM_USER_INTRPT &&
- range.start + range.size >= MEM_HV_INTRPT)
+ range.start + range.size >= MEM_HV_START)
user_kernel_ok = 1;
if (range.start == 0)
max_va = range.size;
@@ -1693,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
static int __init request_standard_resources(void)
{
int i;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if defined(CONFIG_PCI) && !defined(__tilegx__)
insert_non_bus_resource();
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 0ae1c59..62b3ba9 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -223,30 +223,34 @@ void __init ipi_init(void)
#if CHIP_HAS_IPI()
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- WARN_ON(cpu_is_offline(cpu));
-
/*
* We just want to do an MMIO store. The traditional writeq()
* functions aren't really correct here, since they're always
* directed at the PCI shim. For now, just do a raw store,
- * casting away the __iomem attribute.
+ * casting away the __iomem attribute. We do the store as a
+ * single asm() instruction so that, in the KVM case where vcpus
+ * are not bound to physical cpus, the host can simply step the
+ * guest past it rather than requiring that the store be issuable.
*/
- ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
+ unsigned long *addr =
+ &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
+ asm volatile("st %0, zero" :: "r" (addr));
}
#else
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- HV_Coord coord;
-
- WARN_ON(cpu_is_offline(cpu));
-
- coord.y = cpu_y(cpu);
- coord.x = cpu_x(cpu);
+ HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
hv_trigger_ipi(coord, IRQ_RESCHEDULE);
}
#endif /* CHIP_HAS_IPI() */
+
+void smp_send_reschedule(int cpu)
+{
+ WARN_ON(cpu_is_offline(cpu));
+ __smp_send_reschedule(cpu);
+}
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 24fd223..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
p->sp >= sp) {
if (kbt->verbose)
pr_err(" <%s while in kernel mode>\n", fault);
- } else if (EX1_PL(p->ex1) == USER_PL &&
+ } else if (user_mode(p) &&
p->sp < PAGE_OFFSET && p->sp != 0) {
if (kbt->verbose)
pr_err(" <%s while in user mode>\n", fault);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a8..024b978 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
+#ifdef CONFIG_KVM_GUEST
+ return sprintf(page, "KVM\n");
+#else
return sprintf(page, "tilera\n");
+#endif
}
static DEVICE_ATTR(type, 0444, type_show, NULL);
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 3c2dc87..b0b7264 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -117,9 +117,9 @@ void __init time_init(void)
/*
* Define the tile timer clock event device. The timer is driven by
- * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
+ * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down
* counter, plus bit 31, which signifies that the counter has wrapped
- * from zero to (2**31) - 1. The INT_TILE_TIMER interrupt will be
+ * from zero to (2**31) - 1. The INT_[AUX_]TILE_TIMER interrupt will be
* raised as long as bit 31 is set.
*/
@@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks,
struct clock_event_device *evt)
{
BUG_ON(ticks > MAX_TICK);
- __insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
- arch_local_irq_unmask_now(INT_TILE_TIMER);
+ __insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
+ arch_local_irq_unmask_now(INT_LINUX_TIMER);
return 0;
}
@@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
static void tile_timer_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
}
static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
@@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void)
evt->cpumask = cpumask_of(smp_processor_id());
/* Start out with timer not firing. */
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
/*
* Register tile timer. Set min_delta to 1 microsecond, since
@@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
* Mask the timer interrupt here, since we are a oneshot timer
* and there are now by definition no events pending.
*/
- arch_local_irq_mask(INT_TILE_TIMER);
+ arch_local_irq_mask(INT_LINUX_TIMER);
/* Track time spent here in an interrupt context */
irq_enter();
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index f110785..19d465c 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -30,7 +30,7 @@
void __init trap_init(void)
{
- /* Nothing needed here since we link code at .intrpt1 */
+ /* Nothing needed here since we link code at .intrpt */
}
int unaligned_fixup = 1;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index c7ae53d..8b20163 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
#include <hv/hypervisor.h>
/* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
OUTPUT_ARCH(tile)
ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
PHDRS
{
- intrpt1 PT_LOAD ;
+ intrpt PT_LOAD ;
text PT_LOAD ;
data PT_LOAD ;
}
@@ -24,11 +24,11 @@ SECTIONS
#define LOAD_OFFSET TEXT_OFFSET
/* Interrupt vectors */
- .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
+ .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
{
_text = .;
- *(.intrpt1)
- } :intrpt1 =0
+ *(.intrpt)
+ } :intrpt =0
/* Hypervisor call vectors */
. = ALIGN(0x10000);
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 2298cb1..65f7f9d 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -27,9 +27,6 @@ config KVM
This module provides access to the hardware capabilities through
a character device node named /dev/kvm.
- To compile this as a module, choose M here: the module
- will be called kvm.
-
If unsure, say N.
source drivers/vhost/Kconfig
diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
new file mode 100644
index 0000000..2c3d206
--- /dev/null
+++ b/arch/tile/kvm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
+
+kvm-y += kvm-tile.o
+kvm-y += entry.o
+
+obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
new file mode 100644
index 0000000..07aa3a6
--- /dev/null
+++ b/arch/tile/kvm/entry.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/switch_to.h>
+#include <asm/processor.h>
+#include <arch/spr_def.h>
+#include <arch/abi.h>
+
+#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8)
+#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
+#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
+#define FOR_EACH_CALLEE_SAVED_REG(f) \
+ f(r30); f(r31); \
+ f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \
+ f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
+ f(r48); f(r49); f(r50); f(r51); f(r52);
+
+/*
+ * Called with interrupts disabled from kvm_tile_run(). It is responsible
+ * only for saving the callee-saved registers and the stack pointer, then
+ * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
+ * It uses restore_all in intvec_64.S to jump back into the guest.
+ * The kvm_vmexit function below undoes the stack manipulation.
+ */
+STD_ENTRY(kvm_vmresume)
+ /* Do function prolog and save callee-saves on stack. */
+ {
+ move r10, sp
+ st sp, lr
+ }
+ {
+ addli r11, sp, -FRAME_SIZE + 8
+ addli sp, sp, -FRAME_SIZE
+ }
+ {
+ st r11, r10
+ addi r12, sp, 16
+ }
+ FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
+ SAVE_REG(tp)
+ SAVE_REG(lr)
+
+ /* Save frame pointer in thread_info so we can get it back later. */
+ st r1, sp
+
+ /* Set the ksp0 for this core to be below this frame. */
+ mfspr r10, SPR_SYSTEM_SAVE_K_0
+ bfins r10, sp, 0, CPU_SHIFT-1
+ mtspr SPR_SYSTEM_SAVE_K_0, r10
+
+ /* sp points to ABI save area below pt_regs for restore_all. */
+ addli sp, r0, -C_ABI_SAVE_AREA_SIZE
+
+ /* Execute an "interrupt return" to the guest. */
+ {
+ movei r30, 0
+ j restore_all
+ }
+ STD_ENDPROC(kvm_vmresume)
+
+/*
+ * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
+ * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
+ * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller
+ * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
+ */
+STD_ENTRY(kvm_vmexit)
+ {
+ move sp, r0
+ addi r12, r0, 16
+ }
+ FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
+ LOAD_REG(tp)
+ LOAD_REG(lr)
+ {
+ addli sp, sp, FRAME_SIZE
+ jrp lr
+ }
+ STD_ENDPROC(kvm_vmexit)
diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
new file mode 100644
index 0000000..4c33991
--- /dev/null
+++ b/arch/tile/kvm/kvm-tile.c
@@ -0,0 +1,1581 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <asm/traps.h>
+#include <asm/pgalloc.h>
+#include <hv/hypervisor.h>
+#include <linux/rtc.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+#include <arch/spr_def.h>
+#include <arch/sim.h>
+#include <generated/utsrelease.h>
+
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { NULL }
+};
+
+static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
+{
+ struct mm_struct *mm = kvm->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (kvm->arch.vpgd == NULL) {
+ kvm->arch.vpgd = pgd_alloc(kvm->mm);
+ /* Bail out (callers handle NULL) rather than walk a NULL pgd. */
+ if (kvm->arch.vpgd == NULL)
+ return NULL;
+ }
+ pgd = kvm->arch.vpgd + pgd_index(address);
+ pud = pud_alloc(mm, pgd, address);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ return NULL;
+ return pte_alloc_kernel(pmd, address);
+}
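+
+/*
+ * The vpgd above is the "virtualization" page table: it maps guest
+ * physical addresses to host physical pages. It is populated in
+ * kvm_arch_commit_memory_region() and installed with
+ * hv_install_virt_context() each time we enter the guest
+ * (see kvm_guest_context_enter()).
+ */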
+
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ return 0;
+}
+
+/* FIXME: support huge pages. */
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+{
+ unsigned long gpa, i;
+
+ gpa = mem->guest_phys_addr;
+ for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE)
+ if (get_vpgd_pte(kvm, gpa) == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ enum kvm_mr_change change)
+{
+ unsigned long gpa, address, pfn, i;
+ struct page *page[1];
+ pte_t *ptep, *vptep;
+
+ gpa = mem->guest_phys_addr;
+ address = mem->userspace_addr;
+ for (i = 0; i < mem->memory_size;
+ i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
+ vptep = get_vpgd_pte(kvm, gpa);
+ BUG_ON(vptep == NULL);
+ get_user_pages_fast(address, 1, 1, page);
+ pfn = page_to_pfn(page[0]);
+ ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
+ *vptep = *ptep;
+ }
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_arch_flush_shadow_all(kvm);
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ return 0;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq)
+{
+ /* ipi_events is a single word of event bits, so bound-check the irq. */
+ if (irq >= BITS_PER_LONG)
+ return -EINVAL;
+
+ set_bit(irq, &vcpu->arch.ipi_events);
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = 0;
+
+ switch (ioctl) {
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof(irq)))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+
+out:
+ return r;
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+ return 0;
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ return 0;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ long r = -EINVAL;
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long page_size;
+ unsigned long gva = tr->linear_address;
+ unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
+ pud_t gpud;
+ pmd_t gpmd;
+ pte_t gpte;
+
+ /* Get guest pgd (aka pud for three-level tables). */
+ gpgd_gpa = vcpu->arch.guest_context.page_table +
+ (sizeof(pgd_t) * pgd_index(gva));
+ if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
+ goto fail;
+ if (!pud_present(gpud))
+ goto fail;
+
+ /* Get guest pmd. */
+ if (pud_huge_page(gpud)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpud))
+ goto fail;
+ gpte = *(pte_t *)&gpud;
+ page_size = PGDIR_SIZE;
+ goto ok;
+ }
+ gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pmd_t) * pmd_index(gva));
+ if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
+ goto fail;
+ if (!pmd_present(gpmd))
+ goto fail;
+
+ /* Get guest pte. */
+ if (pmd_huge_page(gpmd)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpmd))
+ goto fail;
+ gpte = *(pte_t *)&gpmd;
+ page_size = PMD_SIZE;
+ goto ok;
+ }
+ gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pte_t) * pte_index(gva));
+ if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
+ goto fail;
+ if (!pte_present(gpte))
+ goto fail;
+
+ page_size = PAGE_SIZE;
+
+ok:
+ tr->physical_address =
+ PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1));
+ tr->valid = 1;
+ tr->writeable = pte_write(gpte);
+ tr->usermode = pte_user(gpte);
+
+ return 0;
+
+fail:
+ tr->valid = 0;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ regs->regs = vcpu->arch.regs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.regs = regs->regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ *sregs = vcpu->arch.sregs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu->arch.sregs = *sregs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ return 0;
+}
+
+/*
+ * panic_hv() dumps the guest register state and the host stack, and sets
+ * a suitable exit reason so that qemu can terminate the guest process.
+ *
+ * FIXME: this should probably be KVM_EXIT_EXCEPTION, but with that exit
+ * reason the current qemu process "hangs" (killable, though Ctrl+C does
+ * not work), so use KVM_EXIT_SHUTDOWN here for now.
+ */
+static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
+{
+ char panic_buf[256];
+ struct pt_regs *regs;
+ va_list ap;
+ int i;
+
+ va_start(ap, fmt);
+ vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
+ va_end(ap);
+ pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
+
+ /* Show guest os info */
+ regs = &vcpu->arch.regs;
+ for (i = 0; i < 17; i++)
+ pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
+ i, regs->regs[i], i+18, regs->regs[i+18],
+ i+36, regs->regs[i+36]);
+ pr_err(" r18: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+ regs->regs[18], regs->regs[35], regs->tp);
+ pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
+ pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n",
+ regs->pc, regs->ex1, regs->faultnum);
+
+ /* Show host os info */
+ pr_err("\nKVM stack in the host:\n");
+ dump_stack();
+
+ /* Shut down the guest os */
+ pr_err("Shutting down guest.\n");
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+/* Copied from virt/kvm/kvm_main.c */
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+ else
+ return len;
+}
+
+static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ void *data, unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ const void *data, unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ gva += seg;
+ }
+ return 0;
+}
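+
+/*
+ * The three *_va helpers above translate a guest virtual address one page
+ * at a time via the guest's own page tables (kvm_arch_vcpu_ioctl_translate),
+ * so callers may pass buffers that cross guest page boundaries.
+ */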
+
+/*
+ * The following functions are emulation functions for various
+ * hypervisor system calls (i.e. hv_*()). Return value:
+ * 1 if the host os can emulate it completely.
+ * < 0 if errors occur and then qemu will handle them.
+ * 0 if qemu emulation is needed.
+ * In both the < 0 and the == 0 cases, exit reason should
+ * be set for qemu handling.
+ */
+
+/* generic handler for hypercall which needs user (QEMU) to handle. */
+static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ return 0;
+}
+
+/* handler for illegal hypercall */
+static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
+{
+ return panic_hv(vcpu, "Illegal kvm hypercall: %ld",
+ (unsigned long)vcpu->arch.regs.regs[10]);
+}
+
+static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
+{
+ int version = vcpu->arch.regs.regs[0];
+ int chip_num = vcpu->arch.regs.regs[1];
+ int chip_rev_num = vcpu->arch.regs.regs[2];
+ int client_pl = vcpu->arch.regs.regs[3];
+
+ if (client_pl != 1)
+ return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
+ " guests must request PL 1.\n"
+ "Reconfigure your guest with KVM_GUEST set.\n",
+ client_pl);
+
+ if (version != HV_VERSION)
+ return panic_hv(vcpu, "Client built for hv version %d, but"
+ " this hv is version %d\n",
+ version, HV_VERSION);
+
+ if (chip_num != TILE_CHIP)
+ return panic_hv(vcpu, "Client built for chip %d, but this"
+ " hardware is chip %d\n",
+ chip_num, TILE_CHIP);
+
+ if (chip_rev_num != TILE_CHIP_REV)
+ return panic_hv(vcpu, "Client built for chip rev %d, but this"
+ " hardware is chip rev %d\n",
+ chip_rev_num, TILE_CHIP_REV);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+ long rc;
+
+ switch (query) {
+ case HV_SYSCONF_PAGE_SIZE_SMALL:
+ rc = PAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_LARGE:
+ rc = HPAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_VALID_PAGE_SIZES:
+#if PAGE_SHIFT == 16
+ rc = HV_CTX_PG_SM_64K;
+#elif PAGE_SHIFT == 14
+ rc = HV_CTX_PG_SM_16K;
+#else
+# error Fix hv_sysconf emulation for new page size
+#endif
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_JUMBO:
+ rc = 0; /* FIXME add super page support */
+ break;
+
+ case HV_SYSCONF_CPU_SPEED:
+ case HV_SYSCONF_CPU_TEMP:
+ case HV_SYSCONF_BOARD_TEMP:
+ rc = hv_sysconf(query);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.regs.regs[0] = rc;
+ return 1;
+}
+
+static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+ long buflen = vcpu->arch.regs.regs[2];
+ char hvbuf[256];
+ const char *p;
+ long rc;
+
+ switch (query) {
+
+ /* For hardware attributes, just pass to the hypervisor. */
+ case HV_CONFSTR_BOARD_PART_NUM:
+ case HV_CONFSTR_BOARD_SERIAL_NUM:
+ case HV_CONFSTR_CHIP_SERIAL_NUM:
+ case HV_CONFSTR_BOARD_REV:
+ case HV_CONFSTR_CHIP_MODEL:
+ case HV_CONFSTR_BOARD_DESC:
+ case HV_CONFSTR_MEZZ_PART_NUM:
+ case HV_CONFSTR_MEZZ_SERIAL_NUM:
+ case HV_CONFSTR_MEZZ_REV:
+ case HV_CONFSTR_MEZZ_DESC:
+ case HV_CONFSTR_SWITCH_CONTROL:
+ case HV_CONFSTR_CHIP_REV:
+ case HV_CONFSTR_CPUMOD_PART_NUM:
+ case HV_CONFSTR_CPUMOD_SERIAL_NUM:
+ case HV_CONFSTR_CPUMOD_REV:
+ case HV_CONFSTR_CPUMOD_DESC:
+ rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
+ if (rc > sizeof(hvbuf)) {
+ /* Not the best answer, but very unlikely anyway. */
+ rc = sizeof(hvbuf);
+ hvbuf[sizeof(hvbuf)-1] = '\0';
+ }
+ p = hvbuf;
+ break;
+
+ /* For hypervisor version info, just report the kernel version. */
+ case HV_CONFSTR_HV_SW_VER:
+ p = UTS_RELEASE;
+ break;
+ case HV_CONFSTR_HV_CONFIG:
+ case HV_CONFSTR_HV_CONFIG_VER:
+ p = "";
+ break;
+
+ default:
+ rc = HV_EINVAL;
+ goto done;
+ }
+
+ rc = strlen(p) + 1; /* include NUL */
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
+ p, min(rc, buflen)))
+ rc = HV_EFAULT;
+
+done:
+ vcpu->arch.regs.regs[0] = rc;
+ return 1;
+}
+
+static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
+{
+ HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0];
+ struct rtc_time tm;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ rtc_time_to_tm(tv.tv_sec, &tm);
+ hvtm->tm_sec = tm.tm_sec;
+ hvtm->tm_min = tm.tm_min;
+ hvtm->tm_hour = tm.tm_hour;
+ hvtm->tm_mday = tm.tm_mday;
+ hvtm->tm_mon = tm.tm_mon;
+ hvtm->tm_year = tm.tm_year;
+ hvtm->flags = 0;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
+{
+ /* Do nothing here. */
+ pr_warn("hv_set_rtc() will not work in kvm guest\n");
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0];
+ HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
+
+ switch (idx) {
+ case 0:
+ var->start = 0UL;
+ var->size = 0x20000000000UL;
+ break;
+ case 1:
+ var->start = 0xFFFFFFFF80000000UL;
+ var->size = 0x80000000UL;
+ break;
+ default:
+ var->start = 0UL;
+ var->size = 0UL;
+ break;
+ }
+
+ return 1;
+}
+
+/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */
+static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0];
+ HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
+
+ if (idx == 0) {
+ var->start = min_asid;
+ var->size = max_asid - min_asid + 1;
+ } else {
+ var->start = 0;
+ var->size = 0;
+ }
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
+{
+ HV_Topology *tp;
+ int cpus;
+
+ /* Depends on the definition of struct HV_Topology */
+ tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
+
+ cpus = atomic_read(&vcpu->kvm->online_vcpus);
+ tp->coord.x = vcpu->vcpu_id;
+ tp->coord.y = 0;
+ tp->width = cpus;
+ tp->height = 1;
+
+ return 1;
+}
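+
+/*
+ * The emulated topology is thus a 1 x online_vcpus grid with x equal to
+ * the vcpu id, which is what xy_to_vcpu() below relies on to map a
+ * coordinate back to a vcpu.
+ */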
+
+static int xy_to_vcpu(struct kvm *kvm, int x, int y)
+{
+ if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus))
+ return -1;
+ return x;
+}
+
+/*
+ * The primary vcpu is the one that initially runs while the others
+ * all block. It is the only one allowed to call hv_start_all_tiles().
+ * The other cpus are secondary.
+ */
+static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id != 0;
+}
+
+static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
+{
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (is_secondary_vcpu(vcpu) || completion_done(c))
+ return panic_hv(vcpu, "start_all_tiles() called again");
+ complete_all(c);
+ return 1;
+}
+
+static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa = vcpu->arch.regs.regs[0];
+ HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ gfn = gpa_to_gfn(gpa);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn))
+ return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+ gpa);
+ hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+ vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa = vcpu->arch.regs.regs[0];
+ HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1];
+ uint64_t val = vcpu->arch.regs.regs[2];
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ gfn = gpa_to_gfn(gpa);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn))
+ return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+ gpa);
+ hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+ hv_physaddr_write64(hpa, *access, val);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
+{
+ /* Do we care about the argument msgstate? */
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+/*
+ * NOTE: we may coalesce multiple messages with the same tag to the
+ * same recipient. Currently the only messages used by Linux are
+ * start/stop cpu (where coalescing is OK), and the smp_call_function()
+ * IPI message tag. In the latter case we rely on the generic
+ * smp_call_function code to properly handle this, and since it only
+ * uses the IPI as a way to wake up the generic list-walking code,
+ * it's OK if we coalesce several IPI deliveries before the recipient
+ * core takes action.
+ */
+static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *vcpui;
+ HV_Recipient recip[NR_CPUS];
+ HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
+ int nrecip = vcpu->arch.regs.regs[1];
+ int buflen = vcpu->arch.regs.regs[3];
+ int sent, vcpu_id, tag;
+
+ /* NOTE: we only support the Linux usage of buflen == sizeof(int). */
+ if (unlikely(buflen != sizeof(int) ||
+ nrecip >= atomic_read(&kvm->online_vcpus))) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ /* Get the buf info */
+ if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &tag, sizeof(tag))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ /* Range-check the tag value. */
+ if (tag < 0 || tag >= MAX_MSG_TAG) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ /* Get all the recipients */
+ if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
+ nrecip * sizeof(HV_Recipient))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ for (sent = 0; sent < nrecip; sent++) {
+ if (recip[sent].state != HV_TO_BE_SENT)
+ continue;
+ vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
+ if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
+ recip[sent].state = HV_BAD_RECIP;
+ continue;
+ }
+ vcpui = kvm_get_vcpu(kvm, vcpu_id);
+ set_bit(tag, &vcpui->arch.pending_msgs);
+ kvm_vcpu_kick(vcpui);
+ recip[sent].state = HV_SENT;
+ }
+
+ if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
+ nrecip * sizeof(HV_Recipient))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = sent;
+
+ return 1;
+}
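+
+/*
+ * Delivery above is just a matter of setting the tag bit in the target
+ * vcpu's pending_msgs and kicking it; the pending bit is surfaced to the
+ * guest as INTCTRL_1_STATUS in kvm_inject_interrupts() and consumed by
+ * kvm_emulate_hv_receive_message() below.
+ */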
+
+static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
+{
+ HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
+ int buflen = vcpu->arch.regs.regs[3];
+ int tag;
+
+ /* Currently we only support messages from other tiles. */
+ rmi->source = HV_MSG_TILE;
+
+ if (buflen <= sizeof(int)) {
+ rmi->msglen = HV_E2BIG;
+ return 1;
+ }
+
+ tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
+ if (tag >= MAX_MSG_TAG) {
+ /* No more messages */
+ rmi->msglen = 0;
+ return 1;
+ }
+
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &tag, sizeof(int))) {
+ rmi->msglen = HV_EFAULT;
+ return 1;
+ }
+
+ /*
+ * This clear_bit could race with a set_bit as another core
+ * delivers a new smp_call_function() IPI to this core. However,
+ * the smp_call_function() code will have added the new
+ * call data to the kernel's list prior to
+ * raising the interrupt, so even if we lose the new
+ * interrupt due to the race, we still haven't dispatched
+ * to the original interrupt handler, and when we do, it
+ * will find both pending calls waiting for it, so the
+ * race is harmless. This is consistent with the fact that
+ * the generic code is trying to support pretty much
+ * arbitrary architecture-dependent IPI semantics, so it
+ * is very conservative about what it assumes.
+ *
+ * Also note that we only clear_bit on the core that owns
+ * the mask, so there's no race condition caused by the
+ * find_first_bit above and the clear_bit here, since once
+ * a bit is found it will stay set until this point.
+ */
+ clear_bit(tag, &vcpu->arch.pending_msgs);
+ rmi->msglen = sizeof(int);
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0];
+
+ *ctx = hv_inquire_guest_context();
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ HV_InqTileSet set = vcpu->arch.regs.regs[0];
+ unsigned long gva = vcpu->arch.regs.regs[1];
+ int length = vcpu->arch.regs.regs[2];
+ struct cpumask mask = CPU_MASK_NONE;
+ int cpus, i, retval, bytes2copy, bytes2zero;
+
+ switch (set) {
+ case HV_INQ_TILES_AVAIL:
+ case HV_INQ_TILES_HFH_CACHE:
+ case HV_INQ_TILES_LOTAR:
+ cpus = atomic_read(&kvm->online_vcpus);
+ for (i = 0; i < cpus; ++i)
+ cpumask_set_cpu(i, &mask);
+ break;
+ case HV_INQ_TILES_SHARED:
+ break;
+ default:
+ retval = HV_EINVAL;
+ goto done;
+ }
+
+ bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
+ bytes2zero = length - bytes2copy;
+
+ if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
+ retval = HV_EFAULT;
+ goto done;
+ }
+
+ if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
+ retval = HV_EFAULT;
+ goto done;
+ }
+
+ retval = HV_OK;
+done:
+ vcpu->arch.regs.regs[0] = retval;
+ return 1;
+}
+
+static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
+{
+ HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0];
+ int pl = (int) vcpu->arch.regs.regs[1];
+ struct kvm_vcpu *target_vcpu;
+ int vcpu_id;
+
+ vcpu_id = vtarget.x;
+ if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
+ vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
+{
+ struct kvm_vcpu *vcpui;
+ unsigned long idx;
+
+ kvm_for_each_vcpu(idx, vcpui, kvm)
+ if (vcpui->arch.ipi_gpa == gpa)
+ return vcpui;
+
+ return NULL;
+}
+
+/*
+ * Most page faults are downcalled from the hypervisor and handled directly
+ * by either the guest OS or the host OS. This function handles the
+ * remaining cases.
+ */
+static int handle_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_translation tr;
+ struct kvm_vcpu *ipi_vcpu;
+
+ tr.linear_address = (__u64) vcpu->arch.fault_addr;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return 0;
+
+ /* ipi PTE for rescheduling interrupt? */
+ ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
+ if (!ipi_vcpu)
+ return 0;
+
+ set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
+ kvm_vcpu_kick(ipi_vcpu);
+
+ /* Juke the PC past the store instruction. */
+ vcpu->arch.regs.pc += 8;
+ return 1;
+}
+
+static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We do not expect this call from the guest so far. The guest OS
+ * should follow the host's setting rather than changing it, and
+ * with the current guest configuration hv_set_pte_super_shift()
+ * is never called anyway.
+ */
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
+{
+ HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0];
+
+ hvss->new_speed = HV_EPERM;
+ hvss->end_cycle = 0;
+ hvss->delta_ns = 0;
+
+ return 1;
+}
+
+static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
+ HCALL_DEFS
+};
+
+static int kvm_handle_exit(struct kvm_vcpu *vcpu)
+{
+ unsigned long hcall_idx;
+
+ switch (vcpu->run->exit_reason) {
+ case KVM_EXIT_HYPERCALL:
+ hcall_idx = vcpu->arch.regs.regs[10];
+ if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
+ hcall_handlers[hcall_idx] == NULL))
+ return kvm_emulate_illegal(vcpu);
+
+ /* Juke us past the swint0 when we return. */
+ vcpu->arch.regs.pc += 8;
+
+ return hcall_handlers[hcall_idx](vcpu);
+
+ case KVM_EXIT_MMIO:
+ if (handle_mmio(vcpu))
+ return 1;
+ return panic_hv(vcpu, "Out-of-bounds client memory access");
+
+ case KVM_EXIT_AGAIN:
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+static void kvm_kick_func(void *info)
+{
+ struct kvm_vcpu *vcpu = info;
+
+ /* If this is not the thread that we expect, just return. */
+ if (unlikely(vcpu->pid != get_task_pid(current, PIDTYPE_PID)))
+ return;
+
+ /* Setting this flag will cause a vmexit instead of a vmresume. */
+ set_thread_flag(TIF_VIRT_EXIT);
+}
+
+/* Note: this function is a standard kvm interface in more recent Linux versions. */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+ int me, cpu;
+
+ /* If it is waiting in kvm_vcpu_block(), wake it up. */
+ if (waitqueue_active(&vcpu->wq))
+ wake_up_interruptible(&vcpu->wq);
+
+ /* If we are kicking our own vcpu, make sure we vmexit. */
+ if (vcpu == current_thread_info()->vcpu) {
+ set_thread_flag(TIF_VIRT_EXIT);
+ return;
+ }
+
+ /*
+ * If the vcpu is running the guest, interrupt its cpu,
+ * causing it to vmexit by setting TIF_VIRT_EXIT. Note we can
+ * race with a guest already doing a vmexit, but that is benign.
+ */
+ cpu = vcpu->cpu;
+ me = get_cpu();
+ if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
+ if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ smp_call_function_single(cpu, kvm_kick_func, vcpu, 0);
+ put_cpu();
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+
+/*
+ * Any interrupt that would normally be handled by the host at PL2
+ * needs to be reassigned to the guest at PL1 as we enter.
+ *
+ * The TLB interrupts remain handled by the hypervisor and are downcalled
+ * to the appropriate host or guest as necessary.
+ *
+ * FIXME: We don't give the guest the UDN interrupts for now; at some point we
+ * plan to allow an option to pin the vcpus and report the true
+ * geometry to the guest, at which point passing the UDN access would
+ * make sense.
+ *
+ * FIXME: For now we don't pass the profiling interrupts to the guest,
+ * and instead require profiling be run in the host; we should be able
+ * to support guest-level profiling pretty easily, but we need to
+ * think about whether there are vcpu migration issues there.
+ */
+static void kvm_grant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_1, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
+}
+
+static void kvm_ungrant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_2, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
+}
+
+/*
+ * There is a lot of state that is (for the non-virtualized case) held
+ * permanently in SPRs, or that is in any case not context-switched.
+ * The next two routines switch in and out all the SPR state.
+ *
+ * When we restart, we adjust the timer value so that it fires at the
+ * correct wall-clock time even if we have been scheduled out for a
+ * little while. This may also
+ * mean we end up firing it immediately on return, and suffer a
+ * timer delay in the guest.
+ */
+static void kvm_save_sprs(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
+ vcpu->arch.vmexit_cycles = get_cycles();
+
+#define SAVE_SPR(x) vcpu->arch.sregs.x = __insn_mfspr(SPR_ ## x)
+ FOR_EACH_GUEST_SPR(SAVE_SPR);
+#undef SAVE_SPR
+}
+
+static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
+{
+ unsigned long count = vcpu->arch.timer_control;
+ unsigned long underflow =
+ (count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
+ unsigned long disabled =
+ (count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1;
+
+ if (!disabled) {
+ unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles;
+ count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+ underflow |= delta > count;
+ count -= delta;
+ count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+ count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
+ }
+ __insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count);
+
+#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.sregs.x)
+ FOR_EACH_GUEST_SPR(RESTORE_SPR);
+#undef RESTORE_SPR
+}
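+
+/*
+ * For example (illustrative numbers only): if the guest armed the timer
+ * with 1000 cycles remaining and the vcpu was descheduled for 1500 cycles,
+ * delta > count above, so the underflow bit is set in the restored control
+ * value and the guest's timer fires immediately on resume rather than
+ * being lost.
+ */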
+
+/*
+ * When entering the guest, we need to eliminate any PL0 translations
+ * that were in use by qemu, since the guest's PL0 translations will
+ * be different. We also flush PL1 translations in case there have
+ * been changes to the virtualization page table, etc.
+ *
+ * FIXME: Add a way to just flush PL0/PL1, or just flush below
+ * the host PAGE_OFFSET, or add vpid support, etc.
+ */
+static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx;
+ pgd_t *vpgdir;
+ pte_t *ptep;
+ int rc;
+
+ /* Install virtualization context */
+ vpgdir = vcpu->kvm->arch.vpgd;
+ BUG_ON(vpgdir == NULL);
+ ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
+ rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Install guest context */
+ ctx = &vcpu->arch.guest_context;
+ rc = hv_install_guest_context(ctx->page_table, ctx->access,
+ ctx->asid, ctx->flags);
+ WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
+ ctx->page_table, ctx->access.val,
+ ctx->asid, ctx->flags, rc);
+
+ hv_flush_all(0);
+}
+
+/*
+ * De-install the virtualization context so we take faults below the
+ * host Linux PL in the normal manner going forward.
+ *
+ * We flush all the TLB mappings as we exit the guest, since the
+ * guest has been using the ASIDs as it pleases, and may have installed
+ * incompatible mappings for qemu's process as well. Note that we don't
+ * worry about host-PL interrupts that occur while the guest is running,
+ * on the assumption that such interrupts can't touch userspace
+ * addresses legally anyway.
+ *
+ * NOTE: we may want to add a hypervisor call to just flush mappings
+ * below PL2 and use that here instead.
+ */
+static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
+{
+ int rc;
+
+ /* Remember guest context */
+ vcpu->arch.guest_context = hv_inquire_guest_context();
+
+ /* Disable virtualization context */
+ rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Flush everything in the TLB. */
+ hv_flush_all(0);
+}
+
+static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Capture current set of ipi_events. We might race with
+ * another thread adding an event, but if so we'll just miss
+ * it on this go-around and see it next time.
+ */
+ vcpu->arch.sregs.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
+
+ /*
+ * Note: We could set PC and EX1 for the guest os to jump
+ * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
+ * is unmasked and the guest is not at PL1 with ICS set.
+ * But in fact it's about as fast to just set INTCTRL_1_STATUS
+ * here and then run the short INTCTRL_1 handler in the guest.
+ */
+ vcpu->arch.sregs.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
+}
+
+static void kvm_tile_run(struct kvm_vcpu *vcpu)
+{
+ struct thread_info *ti = current_thread_info();
+ unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
+
+ /*
+ * Disable interrupts while we set up the guest state.
+ * This way, if we race with another core trying to tell us
+ * to fix up our guest state, we will take the kick only as
+ * we actually try to enter the guest, and instead we will
+ * vmexit and end up retrying.
+ */
+ local_irq_disable();
+ kvm_guest_context_enter(vcpu);
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ ti->vcpu = vcpu;
+ vcpu->cpu = get_cpu();
+ kvm_inject_interrupts(vcpu);
+ kvm_grant_mpls();
+ kvm_restore_sprs(vcpu);
+
+ /* Calling this function irets into the guest. */
+ kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
+
+ /* We resume here due to a call to kvm_vmexit. */
+ __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
+
+ vcpu->cpu = -1;
+ put_cpu();
+ ti->vcpu = NULL;
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
+ vcpu->run->ready_for_interrupt_injection = 1;
+ kvm_ungrant_mpls();
+ kvm_save_sprs(vcpu);
+ __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
+ kvm_guest_context_exit(vcpu);
+ local_irq_enable();
+}
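+
+/*
+ * To summarize the bracketing above: with interrupts disabled we install
+ * the virtualization and guest contexts, grant the PL1 MPLs, restore the
+ * guest SPRs and then iret into the guest via kvm_vmresume(); when
+ * kvm_vmexit() returns control here we undo those steps before
+ * re-enabling interrupts.
+ */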
+
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r = 1;
+
+ while (r > 0) {
+ kvm_guest_enter();
+ kvm_tile_run(vcpu);
+ kvm_guest_exit();
+
+ r = kvm_handle_exit(vcpu);
+ /*
+ * <0: error for userspace.
+ * =0: QEMU to handle.
+ * >0: host os can handle it fully.
+ */
+ if (r <= 0)
+ break;
+
+ if (signal_pending(current)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = -EINTR;
+ break;
+ }
+
+#ifdef CONFIG_HOMECACHE
+ if (current_thread_info()->homecache_cpu !=
+ smp_processor_id()) {
+ /* Do homecache migration when returning to qemu. */
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = -EINTR;
+ break;
+ }
+#endif
+
+ kvm_resched(vcpu);
+ }
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+ sigset_t sigsaved;
+
+ /* Secondary cpus must wait until they are told they can start. */
+ if (vcpu->arch.suspended) {
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (wait_for_completion_interruptible(c))
+ return -EINTR;
+ vcpu->arch.suspended = 0;
+ }
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ r = __vcpu_run(vcpu, kvm_run);
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return r;
+}
+
+int kvm_arch_init(void *opaque)
+{
+ return 0;
+}
+
+void kvm_arch_exit(void)
+{
+}
+
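+/*
+ * If not already done, reserve the guest physical space just above the
+ * highest memslot for special uses such as the per-vcpu IPI pages,
+ * then assign this vcpu its IPI page and mark it suspended if it is
+ * not the boot cpu.
+ */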
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ unsigned long resv_gfn_start;
+ struct kvm_memory_slot *s;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.resv_gpa_start) {
+ resv_gfn_start = 0;
+
+ for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
+ s = &kvm->memslots->memslots[i];
+
+ if (!s->npages)
+ continue;
+
+ if ((s->base_gfn + s->npages) > resv_gfn_start)
+ resv_gfn_start = s->base_gfn + s->npages;
+ }
+
+ kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start);
+ }
+
+ /* Initialize to enter fake PA=VA mode in hypervisor. */
+ vcpu->arch.guest_context.page_table = HV_CTX_NONE;
+
+ vcpu->arch.ipi_gpa =
+ kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE);
+ vcpu->arch.ipi_gpte =
+ pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL);
+
+ /* Mark the core suspended if it is not the boot cpu. */
+ vcpu->arch.suspended = is_secondary_vcpu(vcpu);
+
+ return 0;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Notify simulator that this task handles this vcpu. */
+ sim_set_vcpu(vcpu->vcpu_id);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ sim_clear_vcpu();
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+ /* FIXME: some archs set up a cache for these structs? */
+ struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+ int rc;
+
+ if (!vcpu)
+ return ERR_PTR(-ENOMEM);
+
+ rc = kvm_vcpu_init(vcpu, kvm, id);
+ if (rc) {
+ kfree(vcpu);
+ return ERR_PTR(rc);
+ }
+
+ return vcpu;
+}
+
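+/*
+ * Reset the vcpu to a clean boot state: zero its registers, mask all
+ * PL1 interrupts and IPIs, and point the PL1 interrupt vector base at
+ * its default address.
+ */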
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	memset(&vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+	memset(&vcpu->arch.sregs, 0, sizeof(vcpu->arch.sregs));
+ vcpu->arch.sregs.IPI_MASK_1 = -1UL;
+ vcpu->arch.sregs.INTERRUPT_MASK_1 = -1UL;
+ vcpu->arch.sregs.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
+ return 0;
+}
+
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_vcpu_uninit(vcpu);
+ kfree(vcpu);
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_destroy(vcpu);
+}
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+ return 0;
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ if (type)
+ return -EINVAL;
+
+ init_completion(&kvm->arch.smp_start);
+ return 0;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+	/* Probably redundant with the kvm_arch_vcpu_free() loop above,
+	 * but clear the vcpu table defensively. */
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+
+ /* FIXME: release all the pmds and ptes as well! */
+ if (kvm->arch.vpgd)
+ pgd_free(kvm->mm, kvm->arch.vpgd);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+/* Called from guest hv glue via swint0 traps. */
+void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
+{
+ /* Hypercalls are only valid from PL1. */
+ if (EX1_PL(regs->ex1) != 0) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
+ /*NORETURN*/
+ }
+ do_trap(regs, fault_num, 0);
+}
+
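+/*
+ * A virtualization page table miss: record the faulting guest address
+ * and report it to userspace as MMIO.
+ */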
+void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long fault_addr, unsigned long write)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ BUG_ON(vcpu == NULL);
+ vcpu->arch.fault_addr = fault_addr;
+ kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
+ /*NORETURN*/
+}
+
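+/*
+ * The guest did something unrecoverable (e.g. a bad guest page table);
+ * shut it down.
+ */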
+void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
+{
+ kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
+ /*NORETURN*/
+}
+
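+/*
+ * Capture the current register state into the vcpu and unwind to the
+ * host side of kvm_vmresume() with the given exit reason; this call
+ * does not return to the caller.
+ */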
+void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->arch.regs = *regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ kvm_vmexit(vcpu->arch.host_sp);
+ /*NORETURN*/
+}
+
+static int __init kvm_tile_init(void)
+{
+ return kvm_init(NULL, sizeof(struct kvm_vcpu),
+ __alignof__(struct kvm_vcpu), THIS_MODULE);
+}
+
+static void __exit kvm_tile_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(kvm_tile_init);
+module_exit(kvm_tile_exit);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 82733c8..1590282 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);
/* hypervisor glue */
#include <hv/hypervisor.h>
+EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_dev_close);
EXPORT_SYMBOL(hv_dev_open);
+EXPORT_SYMBOL(hv_dev_poll);
+EXPORT_SYMBOL(hv_dev_poll_cancel);
EXPORT_SYMBOL(hv_dev_pread);
-EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_preada);
+EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_pwritea);
-EXPORT_SYMBOL(hv_dev_poll);
-EXPORT_SYMBOL(hv_dev_poll_cancel);
-EXPORT_SYMBOL(hv_dev_close);
-EXPORT_SYMBOL(hv_sysconf);
-EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_flush_all);
EXPORT_SYMBOL(hv_get_rtc);
+#ifdef __tilegx__
+EXPORT_SYMBOL(hv_inquire_guest_context);
+EXPORT_SYMBOL(hv_install_guest_context);
+EXPORT_SYMBOL(hv_install_virt_context);
+#endif
+EXPORT_SYMBOL(hv_physaddr_read64);
+EXPORT_SYMBOL(hv_physaddr_write64);
EXPORT_SYMBOL(hv_set_rtc);
+EXPORT_SYMBOL(hv_sysconf);
/* libgcc.a */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 23f044e..86cff48 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm)
char *buf, *path;
struct vm_area_struct *vma;
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
if (!sim_is_simulator())
+#endif
return 1;
if (mm->exe_file == NULL)
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 64eec3f..39c48cb 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
- is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+ is_kernel_mode = !user_mode(regs);
tsk = validate_current();
@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
}
#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
- if (EX1_PL(regs->ex1) != USER_PL) {
+ if (!user_mode(regs)) {
struct async_tlb *async;
switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
{
int cpu;
unsigned long page;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if CHIP_HAS_CBOX_HOME_MAP()
/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
}
- address = MEM_SV_INTRPT;
+ address = MEM_SV_START;
pmd = get_pmd(pgtables, address);
pfn = 0; /* code starts at PA 0 */
if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
void free_initmem(void)
{
- const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+ const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
/*
* Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)
/*
* Free the pages mapped from 0xc0000000 that correspond to code
- * pages from MEM_SV_INTRPT that we won't use again after init.
+ * pages from MEM_SV_START that we won't use again after init.
*/
free_init_pages("unused kernel text",
(unsigned long)_sinittext - text_delta,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3004433..d6948d4 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
#if CHIP_HAS_MMIO()
-/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
-void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
- pgprot_t home)
+void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot)
{
void *addr;
struct vm_struct *area;
unsigned long offset, last_addr;
- pgprot_t pgprot;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
- /* Create a read/write, MMIO VA mapping homed at the requested shim. */
- pgprot = PAGE_KERNEL;
- pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
- pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
-
/*
* Mappings have to be page-aligned
*/
@@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
+ area = get_vm_area(size, flags);
if (!area)
return NULL;
area->phys_addr = phys_addr;
addr = area->addr;
if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
+ phys_addr, prot)) {
free_vm_area(area);
return NULL;
}
- return (__force void __iomem *) (offset + (char *)addr);
+ return (void *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(generic_remap_prot);
+
+/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ pgprot_t home)
+{
+ pgprot_t pgprot;
+ unsigned long flags;
+
+ /* Create a read/write, MMIO VA mapping homed at the requested shim. */
+ pgprot = PAGE_KERNEL;
+ pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
+ pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
+ flags = VM_IOREMAP; /* | other flags? */
+
+ return (__force void __iomem *) generic_remap_prot(phys_addr,
+ size, flags, pgprot);
}
EXPORT_SYMBOL(ioremap_prot);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08..b622337 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -171,6 +171,7 @@ struct kvm_pit_config {
#define KVM_EXIT_WATCHDOG 21
#define KVM_EXIT_S390_TSCH 22
#define KVM_EXIT_EPR 23
+#define KVM_EXIT_AGAIN 24
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..1b8a1f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1691,7 +1691,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
finish_wait(&vcpu->wq, &wait);
}
-#ifndef CONFIG_S390
+#if !defined(CONFIG_S390) && !defined(CONFIG_TILE)
/*
* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
*/
@@ -1714,7 +1714,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
-#endif /* !CONFIG_S390 */
+#endif
void kvm_resched(struct kvm_vcpu *vcpu)
{
@@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu->kvm->mm != current->mm)
return -EIO;
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
+ defined(CONFIG_TILEGX)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
* so vcpu_load() would break it.
--
1.8.3.1
On Mon, Aug 12, 2013 at 04:24:11PM -0400, Chris Metcalf wrote:
> This change provides the initial framework support for KVM on tilegx.
> Basic virtual disk and networking is supported.
>
This needs to be broken down into more reviewable patches. Also, can you
describe the implementation a little bit? Does the tile arch have a
virtualization extension that this implementation uses, or is it a
trap-and-emulate approach? If the latter, does it run unmodified guest
kernels? What userspace are you using with this implementation?
> Signed-off-by: Chris Metcalf <[email protected]>
> ---
> v2: remove KVM_TILE_RESET_SPR based on feedback from Jan Kiszka.
> qemu ends up modified to just use KVM_SET_SREGS instead.
>
> arch/tile/Kconfig | 19 +-
> arch/tile/Makefile | 1 +
> arch/tile/include/asm/io.h | 2 +
> arch/tile/include/asm/kvm.h | 29 +
> arch/tile/include/asm/kvm_host.h | 101 ++
> arch/tile/include/asm/kvm_para.h | 20 +
> arch/tile/include/asm/kvm_virtio.h | 26 +
> arch/tile/include/asm/module.h | 9 +-
> arch/tile/include/asm/page.h | 56 +-
> arch/tile/include/asm/pgtable_32.h | 2 +-
> arch/tile/include/asm/pgtable_64.h | 3 +-
> arch/tile/include/asm/processor.h | 6 +-
> arch/tile/include/asm/ptrace.h | 2 +-
> arch/tile/include/asm/switch_to.h | 25 +-
> arch/tile/include/asm/thread_info.h | 17 +-
> arch/tile/include/asm/timex.h | 8 +
> arch/tile/include/hv/hypervisor.h | 183 +++-
> arch/tile/include/uapi/arch/sim.h | 19 +
> arch/tile/include/uapi/arch/sim_def.h | 8 +
> arch/tile/include/uapi/arch/spr_def_32.h | 15 +
> arch/tile/include/uapi/arch/spr_def_64.h | 25 +
> arch/tile/include/uapi/asm/Kbuild | 2 +
> arch/tile/include/uapi/asm/kvm.h | 267 +++++
> arch/tile/include/uapi/asm/kvm_virtio.h | 60 ++
> arch/tile/kernel/Makefile | 1 +
> arch/tile/kernel/asm-offsets.c | 7 +
> arch/tile/kernel/early_printk.c | 16 +
> arch/tile/kernel/head_32.S | 4 +-
> arch/tile/kernel/head_64.S | 6 +-
> arch/tile/kernel/hvglue.S | 8 +-
> arch/tile/kernel/hvglue_trace.c | 14 +
> arch/tile/kernel/intvec_32.S | 18 +-
> arch/tile/kernel/intvec_64.S | 226 +++--
> arch/tile/kernel/kvm_virtio.c | 430 ++++++++
> arch/tile/kernel/process.c | 40 +-
> arch/tile/kernel/relocate_kernel_64.S | 9 +-
> arch/tile/kernel/setup.c | 21 +-
> arch/tile/kernel/smp.c | 28 +-
> arch/tile/kernel/stack.c | 2 +-
> arch/tile/kernel/sysfs.c | 4 +
> arch/tile/kernel/time.c | 14 +-
> arch/tile/kernel/traps.c | 2 +-
> arch/tile/kernel/vmlinux.lds.S | 10 +-
> arch/tile/kvm/Kconfig | 3 -
> arch/tile/kvm/Makefile | 12 +
> arch/tile/kvm/entry.S | 91 ++
> arch/tile/kvm/kvm-tile.c | 1581 ++++++++++++++++++++++++++++++
> arch/tile/lib/exports.c | 20 +-
> arch/tile/mm/elf.c | 2 +
> arch/tile/mm/fault.c | 4 +-
> arch/tile/mm/init.c | 8 +-
> arch/tile/mm/pgtable.c | 35 +-
> include/uapi/linux/kvm.h | 1 +
> virt/kvm/kvm_main.c | 7 +-
> 54 files changed, 3331 insertions(+), 198 deletions(-)
> create mode 100644 arch/tile/include/asm/kvm.h
> create mode 100644 arch/tile/include/asm/kvm_host.h
> create mode 100644 arch/tile/include/asm/kvm_para.h
> create mode 100644 arch/tile/include/asm/kvm_virtio.h
> create mode 100644 arch/tile/include/uapi/asm/kvm.h
> create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
> create mode 100644 arch/tile/kernel/kvm_virtio.c
> create mode 100644 arch/tile/kvm/Makefile
> create mode 100644 arch/tile/kvm/entry.S
> create mode 100644 arch/tile/kvm/kvm-tile.c
>
> diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
> index ecff467..bbb6d51 100644
> --- a/arch/tile/Kconfig
> +++ b/arch/tile/Kconfig
> @@ -5,7 +5,6 @@ config TILE
> def_bool y
> select HAVE_DMA_ATTRS
> select HAVE_DMA_API_DEBUG
> - select HAVE_KVM if !TILEGX
> select GENERIC_FIND_FIRST_BIT
> select SYSCTL_EXCEPTION_TRACE
> select USE_GENERIC_SMP_HELPERS
> @@ -113,6 +112,7 @@ config SMP
> def_bool y
>
> config HVC_TILE
> + depends on !KVM_GUEST
> depends on TTY
> select HVC_DRIVER
> select HVC_IRQ if TILEGX
> @@ -127,6 +127,7 @@ config TILEGX
> select HAVE_FTRACE_MCOUNT_RECORD
> select HAVE_KPROBES
> select HAVE_KRETPROBES
> + select HAVE_KVM if !KVM_GUEST
>
> config TILEPRO
> def_bool !TILEGX
> @@ -366,11 +367,23 @@ config HARDWALL
> bool "Hardwall support to allow access to user dynamic network"
> default y
>
> +config KVM_GUEST
> + bool "Build kernel as guest for KVM"
> + default n
> + depends on TILEGX
> + select VIRTIO
> + select VIRTIO_RING
> + select VIRTIO_CONSOLE
> + ---help---
> + This will build a kernel that runs at a lower protection level
> + than the default kernel and is suitable to run under KVM.
> +
> +# TILEPro kernels run at PL1; TILE-Gx runs at PL2 unless it's a KVM guest.
> config KERNEL_PL
> int "Processor protection level for kernel"
> range 1 2
> - default 2 if TILEGX
> - default 1 if !TILEGX
> + default 2 if TILEGX && !KVM_GUEST
> + default 1 if !TILEGX || KVM_GUEST
> ---help---
> Since MDE 4.2, the Tilera hypervisor runs the kernel
> at PL2 by default. If running under an older hypervisor,
> diff --git a/arch/tile/Makefile b/arch/tile/Makefile
> index 3d15364..8e7f852 100644
> --- a/arch/tile/Makefile
> +++ b/arch/tile/Makefile
> @@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH)
>
> # See arch/tile/Kbuild for content of core part of the kernel
> core-y += arch/tile/
> +core-$(CONFIG_KVM) += arch/tile/kvm/
>
> core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
>
> diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
> index 9fe4349..023659b 100644
> --- a/arch/tile/include/asm/io.h
> +++ b/arch/tile/include/asm/io.h
> @@ -43,6 +43,8 @@
> * long before casting it to a pointer to avoid compiler warnings.
> */
> #if CHIP_HAS_MMIO()
> +extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
> + unsigned long flags, pgprot_t prot);
> extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
> extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
> pgprot_t pgprot);
> diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
> new file mode 100644
> index 0000000..2ea6c41
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm.h
> @@ -0,0 +1,29 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _ASM_TILE_KVM_H
> +#define _ASM_TILE_KVM_H
> +
> +#include <hv/hypervisor.h>
> +#include <uapi/asm/kvm.h>
> +
> +#ifndef __ASSEMBLER__
> +/* For hv_*() */
> +#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
> +#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
> +#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
> +#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
> +/* For others */
> +#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
> +#endif
> +#endif /* _ASM_TILE_KVM_H */
> diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
> new file mode 100644
> index 0000000..58b6bf3
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm_host.h
> @@ -0,0 +1,101 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _ASM_TILE_KVM_HOST_H
> +#define _ASM_TILE_KVM_HOST_H
> +
> +#define KVM_MAX_VCPUS 64
> +#define KVM_USER_MEM_SLOTS 32
> +#define KVM_PRIVATE_MEM_SLOTS 4
> +
> +/* For now, claim we have no huge pages. */
> +#define KVM_HPAGE_GFN_SHIFT(x) 0
> +#define KVM_NR_PAGE_SIZES 1
> +#define KVM_PAGES_PER_HPAGE(x) 1
> +
> +/* Max number of message tags for hv_send/receive_message() */
> +#define MAX_MSG_TAG (sizeof(unsigned long) * 8)
> +
> +/* Bits in pending_downcalls */
> +#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <linux/types.h>
> +#include <linux/ptrace.h>
> +
> +struct kvm_vcpu_stat {
> + /* None yet. */
> +};
> +
> +struct kvm_vcpu_arch {
> + struct pt_regs regs;
> + struct kvm_sregs sregs;
> + unsigned long host_sp; /* Host "real" sp during vmresume. */
> + HV_Context guest_context;
> + unsigned long pending_msgs; /* Pending guest messages */
> + unsigned long ipi_events; /* Pending guest ipi events. */
> + unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
> + pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
> + unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */
> + int suspended; /* true for cores not yet started by host */
> + unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */
> + unsigned long vmexit_cycles; /* cycle count of last vmexit */
> +};
> +
> +struct kvm_vm_stat {
> + /*
> + * FIXME - does this make sense for us? It's used in common KVM
> + * code.
> + */
> + u32 remote_tlb_flush;
> +};
> +
> +struct kvm_arch_memory_slot {
> +};
> +
> +struct kvm_arch {
> + pgd_t *vpgd;
> + unsigned long resv_gpa_start; /* For special purpose. */
> + struct completion smp_start;
> +};
> +
> +struct kvm_vcpu;
> +
> +extern void kvm_vmresume(struct pt_regs *guest,
> + unsigned long *host_sp_ptr);
> +extern void kvm_vmexit(unsigned long host_sp);
> +extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
> +extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
> +extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
> + unsigned long, unsigned long);
> +extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
> +
> +extern void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
> +
> +#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
> +
> +#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
> +
> +#define gpmd_offset(kvm, pud, address) \
> + ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
> +
> +#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
> +
> +#define gpte_offset_kernel(kvm, pmd, address) \
> + ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
> +
> +#endif /* __ASSEMBLY__*/
> +
> +#endif /* _ASM_TILE_KVM_HOST_H */
> diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
> new file mode 100644
> index 0000000..c8c31d5
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm_para.h
> @@ -0,0 +1,20 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _ASM_TILE_KVM_PARA_H
> +#define _ASM_TILE_KVM_PARA_H
> +
> +#include <uapi/asm/kvm_para.h>
> +
> +int hcall_virtio(unsigned long instrument, unsigned long mem);
> +#endif /* _ASM_TILE_KVM_PARA_H */
> diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
> new file mode 100644
> index 0000000..8faa959
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm_virtio.h
> @@ -0,0 +1,26 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _ASM_TILE_KVM_VIRTIO_H
> +#define _ASM_TILE_KVM_VIRTIO_H
> +
> +#include <uapi/asm/kvm_virtio.h>
> +
> +
> +struct kvm_device {
> + struct virtio_device vdev;
> + struct kvm_device_desc *desc;
> + unsigned long desc_pa;
> +};
> +
> +#endif /* _ASM_TILE_KVM_VIRTIO_H */
> diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
> index 44ed07c..927c97f 100644
> --- a/arch/tile/include/asm/module.h
> +++ b/arch/tile/include/asm/module.h
> @@ -28,6 +28,13 @@
> # define MODULE_PGSZ ""
> #endif
>
> +/* Tag guest Linux, since it uses different SPRs, etc. */
> +#if CONFIG_KERNEL_PL == 2
> +#define MODULE_PL ""
> +#else
> +#define MODULE_PL " guest"
> +#endif
> +
> /* We don't really support no-SMP so tag if someone tries. */
> #ifdef CONFIG_SMP
> #define MODULE_NOSMP ""
> @@ -35,6 +42,6 @@
> #define MODULE_NOSMP " nosmp"
> #endif
>
> -#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
> +#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP
>
> #endif /* _ASM_TILE_MODULE_H */
> diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
> index b4f96c0..65ee752 100644
> --- a/arch/tile/include/asm/page.h
> +++ b/arch/tile/include/asm/page.h
> @@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
> #endif
>
> +#ifdef CONFIG_KVM_GUEST
> +/* Paravirtualized guests get half the VA, and thus half the PA. */
> +#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1)
> +#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1)
> +#else
> +#define MAX_PA_WIDTH CHIP_PA_WIDTH()
> +#define MAX_VA_WIDTH CHIP_VA_WIDTH()
> +#endif
> +
> /* Each memory controller has PAs distinct in their high bits. */
> -#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
> +#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
> #define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
> #define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
> #define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
> @@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
> * We reserve the lower half of memory for user-space programs, and the
> * upper half for system code. We re-map all of physical memory in the
> * upper half, which takes a quarter of our VA space. Then we have
> - * the vmalloc regions. The supervisor code lives at 0xfffffff700000000,
> + * the vmalloc regions. The supervisor code lives at the highest address,
> * with the hypervisor above that.
> *
> * Loadable kernel modules are placed immediately after the static
> @@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size)
> * Similarly, for now we don't play any struct page mapping games.
> */
>
> -#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
> +#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
> # error Too much PA to map with the VA available!
> #endif
> -#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
>
> -#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */
> -#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */
> -#define PAGE_OFFSET MEM_HIGH_START
> -#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */
> -#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */
> +#ifdef CONFIG_KVM_GUEST
> +#define PAGE_OFFSET (_AC(1, UL) << (MAX_VA_WIDTH - 1))
> +#define KERNEL_HIGH_VADDR (_AC(1, UL) << MAX_VA_WIDTH)
> +#else
> +#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
> +#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */
> +#endif
> +
> +#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
> +#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
> #define _VMALLOC_START FIXADDR_TOP
> -#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */
> -#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */
> -#define MEM_SV_INTRPT MEM_SV_START
> -#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */
> +#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
> +#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
> +#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */
> #define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024))
> -#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */
> -
> -/* Highest DTLB address we will use */
> -#define KERNEL_HIGH_VADDR MEM_SV_START
>
> #else /* !__tilegx__ */
>
> @@ -213,8 +221,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
> * values, and after that, we show "typical" values, since the actual
> * addresses depend on kernel #defines.
> *
> - * MEM_HV_INTRPT 0xfe000000
> - * MEM_SV_INTRPT (kernel code) 0xfd000000
> + * MEM_HV_START 0xfe000000
> + * MEM_SV_START (kernel code) 0xfd000000
> * MEM_USER_INTRPT (user vector) 0xfc000000
> * FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR)
> * PKMAP_BASE 0xf7000000 (via LAST_PKMAP)
> @@ -224,14 +232,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
> */
>
> #define MEM_USER_INTRPT _AC(0xfc000000, UL)
> -#if CONFIG_KERNEL_PL == 1
> -#define MEM_SV_INTRPT _AC(0xfd000000, UL)
> -#define MEM_HV_INTRPT _AC(0xfe000000, UL)
> -#else
> -#define MEM_GUEST_INTRPT _AC(0xfd000000, UL)
> -#define MEM_SV_INTRPT _AC(0xfe000000, UL)
> -#define MEM_HV_INTRPT _AC(0xff000000, UL)
> -#endif
> +#define MEM_SV_START _AC(0xfd000000, UL)
> +#define MEM_HV_START _AC(0xfe000000, UL)
>
> #define INTRPT_SIZE 0x4000
>
> diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
> index e5bdc0e..63142ab 100644
> --- a/arch/tile/include/asm/pgtable_32.h
> +++ b/arch/tile/include/asm/pgtable_32.h
> @@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; }
> /* We don't define any pgds for these addresses. */
> static inline int pgd_addr_invalid(unsigned long addr)
> {
> - return addr >= MEM_HV_INTRPT;
> + return addr >= MEM_HV_START;
> }
>
> /*
> diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
> index 7cb8d35..3421177 100644
> --- a/arch/tile/include/asm/pgtable_64.h
> +++ b/arch/tile/include/asm/pgtable_64.h
> @@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
> /* We don't define any pgds for these addresses. */
> static inline int pgd_addr_invalid(unsigned long addr)
> {
> - return addr >= MEM_HV_START ||
> - (addr > MEM_LOW_END && addr < MEM_HIGH_START);
> + return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
> }
>
> /*
> diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
> index 230b830..5aa5431 100644
> --- a/arch/tile/include/asm/processor.h
> +++ b/arch/tile/include/asm/processor.h
> @@ -15,6 +15,8 @@
> #ifndef _ASM_TILE_PROCESSOR_H
> #define _ASM_TILE_PROCESSOR_H
>
> +#include <arch/chip.h>
> +
> #ifndef __ASSEMBLY__
>
> /*
> @@ -25,7 +27,6 @@
> #include <asm/ptrace.h>
> #include <asm/percpu.h>
>
> -#include <arch/chip.h>
> #include <arch/spr_def.h>
>
> struct task_struct;
> @@ -167,7 +168,7 @@ struct thread_struct {
> #ifndef __ASSEMBLY__
>
> #ifdef __tilegx__
> -#define TASK_SIZE_MAX (MEM_LOW_END + 1)
> +#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1))
> #else
> #define TASK_SIZE_MAX PAGE_OFFSET
> #endif
> @@ -347,7 +348,6 @@ extern int kdata_huge;
>
> /*
> * Provide symbolic constants for PLs.
> - * Note that assembly code assumes that USER_PL is zero.
> */
> #define USER_PL 0
> #if CONFIG_KERNEL_PL == 2
> diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
> index 0d25c21..b9620c0 100644
> --- a/arch/tile/include/asm/ptrace.h
> +++ b/arch/tile/include/asm/ptrace.h
> @@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
> #define user_stack_pointer(regs) ((regs)->sp)
>
> /* Does the process account for user or for system time? */
> -#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
> +#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
>
> /* Fill in a struct pt_regs with the current kernel registers. */
> struct pt_regs *get_pt_regs(struct pt_regs *);
> diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
> index b8f888c..8e9150f 100644
> --- a/arch/tile/include/asm/switch_to.h
> +++ b/arch/tile/include/asm/switch_to.h
> @@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev,
> extern unsigned long get_switch_to_pc(void);
>
> /*
> + * Normally we notify the simulator whenever we change from one pid
> + * to another, so it can track symbol files appropriately on the fly.
> + * For now, we don't do this for the guest Linux, since we don't
> + * have a way to tell the simulator that we are entering a separate
> + * pid space when we are in the guest.
> + */
> +#ifdef CONFIG_KVM_GUEST
> +#define notify_sim_task_change(prev) do { } while (0)
> +#else
> +#define notify_sim_task_change(prev) do { \
> + if (unlikely((prev)->state == TASK_DEAD)) \
> + __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
> + ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
> + __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
> + (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
> +} while (0)
> +#endif
> +
> +/*
> * Kernel threads can check to see if they need to migrate their
> * stack whenever they return from a context switch; for user
> * threads, we defer until they are returning to user-space.
> */
> #define finish_arch_switch(prev) do { \
> - if (unlikely((prev)->state == TASK_DEAD)) \
> - __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
> - ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
> - __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
> - (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
> + notify_sim_task_change(prev); \
> if (current->mm == NULL && !kstack_hash && \
> current_thread_info()->homecache_cpu != smp_processor_id()) \
> homecache_migrate_kthread(); \
> diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
> index b8aa6df..1c26cdf 100644
> --- a/arch/tile/include/asm/thread_info.h
> +++ b/arch/tile/include/asm/thread_info.h
> @@ -18,7 +18,9 @@
>
> #include <asm/processor.h>
> #include <asm/page.h>
> +
> #ifndef __ASSEMBLY__
> +struct kvm_vcpu;
>
> /*
> * Low level task data that assembly code needs immediate access to.
> @@ -44,6 +46,9 @@ struct thread_info {
> unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */
> void __user *unalign_jit_base; /* unalign fixup JIT base */
> #endif
> +#ifdef CONFIG_KVM
> + struct kvm_vcpu *vcpu; /* vcpu during vmresume */
> +#endif
> };
>
> /*
> @@ -117,8 +122,8 @@ extern void _cpu_idle(void);
>
> /*
> * Thread information flags that various assembly files may need to access.
> - * Keep flags accessed frequently in low bits, particular since it makes
> - * it easier to build constants in assembly.
> + * Keep flags accessed frequently in low bits, since it makes it
> + * easier to build constants in assembly.
> */
> #define TIF_SIGPENDING 0 /* signal pending */
> #define TIF_NEED_RESCHED 1 /* rescheduling necessary */
> @@ -131,6 +136,7 @@ extern void _cpu_idle(void);
> #define TIF_MEMDIE 7 /* OOM killer at work */
> #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
> #define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
> +#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */
>
> #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
> #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
> @@ -142,11 +148,12 @@ extern void _cpu_idle(void);
> #define _TIF_MEMDIE (1<<TIF_MEMDIE)
> #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
> #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
> +#define _TIF_VIRT_EXIT (1<<TIF_VIRT_EXIT)
>
> /* Work to do on any return to user space. */
> -#define _TIF_ALLWORK_MASK \
> - (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
> - _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
> +#define _TIF_ALLWORK_MASK \
> + (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP| \
> + _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)
>
> /* Work to do at syscall entry. */
> #define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
> diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
> index edbd7e4..0417617 100644
> --- a/arch/tile/include/asm/timex.h
> +++ b/arch/tile/include/asm/timex.h
> @@ -27,6 +27,14 @@
>
> typedef unsigned long long cycles_t;
>
> +#ifdef CONFIG_KVM_GUEST
> +#define INT_LINUX_TIMER INT_AUX_TILE_TIMER
> +#define SPR_LINUX_TIMER_CONTROL SPR_AUX_TILE_TIMER_CONTROL
> +#else
> +#define INT_LINUX_TIMER INT_TILE_TIMER
> +#define SPR_LINUX_TIMER_CONTROL SPR_TILE_TIMER_CONTROL
> +#endif
> +
> #if CHIP_HAS_SPLIT_CYCLE()
> cycles_t get_cycles(void);
> #define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
> diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
> index f71b08e..71abe38 100644
> --- a/arch/tile/include/hv/hypervisor.h
> +++ b/arch/tile/include/hv/hypervisor.h
> @@ -321,6 +321,18 @@
> /** hv_set_speed */
> #define HV_DISPATCH_SET_SPEED 58
>
> +/** hv_install_virt_context */
> +#define HV_DISPATCH_INSTALL_VIRT_CONTEXT 59
> +
> +/** hv_inquire_virt_context */
> +#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT 60
> +
> +/** hv_install_guest_context */
> +#define HV_DISPATCH_INSTALL_GUEST_CONTEXT 61
> +
> +/** hv_inquire_guest_context */
> +#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT 62
> +
> /** hv_console_set_ipi */
> #define HV_DISPATCH_CONSOLE_SET_IPI 63
>
> @@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
> * new page table does not need to contain any mapping for the
> * hv_install_context address itself.
> *
> - * At most one HV_CTX_PG_SM_* flag may be specified in "flags";
> + * At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
> * if multiple flags are specified, HV_EINVAL is returned.
> * Specifying none of the flags results in using the default page size.
> * All cores participating in a given client must request the same
> * page size, or the results are undefined.
> *
> + * To disable an installed page table, install HV_CTX_NONE. The access
> + * and asid fields are ignored.
> + *
> * @param page_table Root of the page table.
> * @param access PTE providing info on how to read the page table. This
> * value must be consistent between multiple tiles sharing a page table,
> @@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
>
> #endif /* !__ASSEMBLER__ */
>
> +#define HV_CTX_NONE ((HV_PhysAddr)-1) /**< Disable page table. */
> +
> #define HV_CTX_DIRECTIO 0x1 /**< Direct I/O requests are accepted from
> PL0. */
>
> +#define HV_CTX_GUEST_CACHE 0x4 /**< Let guest control caching flags (only
> + usable with hv_install_virt_context.) */
> +
> #define HV_CTX_PG_SM_4K 0x10 /**< Use 4K small pages, if available. */
> #define HV_CTX_PG_SM_16K 0x20 /**< Use 16K small pages, if available. */
> #define HV_CTX_PG_SM_64K 0x40 /**< Use 64K small pages, if available. */
> #define HV_CTX_PG_SM_MASK 0xf0 /**< Mask of all possible small pages. */
>
> +
> #ifndef __ASSEMBLER__
>
> +/** Install a virtualization context.
> + *
> + * When a virtualization context is installed, all faults from PL0 or
> + * PL1 are handled via a "guest context" and then post-processed by
> + * the "virtualization context"; faults at PL2 are still handled by
> + * the normal context. For guest faults, the "guest PAs" produced by
> + * the guest page table are passed through the virtualization page
> + * table as pseudo-VAs, generating the true CPA as a result. See the
> + * individual HV_PTE_xxx bits for the effect the bits have when
> + * present in the virtualization page table. The ASID is currently
> + * ignored in this syscall, but it might be used later, so the API
> + * includes it. The HV_CTX_GUEST_CACHE flag indicates that all
> + * cache-related flags should be taken from the primary page table,
> + * not the virtualization page table.
> + *
> + * Once the virtualization context is installed, a guest context
> + * should also be installed; otherwise a VA-equals-PA context will be
> + * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
> + * the virtualization context to generate CPAs.
> + *
> + * When entering client PL after being at guest or user PL, the
> + * client is expected to call hv_flush_all() to clear any TLB mappings
> + * that might otherwise conflict. Similarly, hv_flush_all() should
> + * be called before returning to guest or user PL with a virtualization
> + * context installed, so that any TLB mappings are cleared. Future
> + * work may include adding a "vpid" or similar namespace so that
> + * the TLBs may be managed independently.
> + *
> + * Subsequent guest page table installations will have their root PA
> + * and PTE cached after translating through the virtualization
> + * context, so if entries in the virtualization page table are
> + * modified or removed, the guest context should be re-installed.
> + * This, in conjunction with flushing the TLB on return to the guest,
> + * will ensure that the new virtualization entries are honored.
> + *
> + * @param page_table Root of the page table.
> + * @param access PTE providing info on how to read the page table. This
> + * value must be consistent between multiple tiles sharing a page table,
> + * and must also be consistent with any virtual mappings the client
> + * may be using to access the page table.
> + * @param asid HV_ASID the page table is to be used for (currently ignored).
> + * @param flags Context flags, denoting attributes or privileges of the
> + * current virtualization context (see below).
> + * @return Zero on success, or a hypervisor error code on failure.
> + */
> +
> +int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
> + HV_ASID asid, __hv32 flags);
> +
> +
> +
> +/** Install a guest context.
> + *
> + * The guest context is only consulted when a virtualization context
> + * is also installed, and for faults that occur below the client's PL.
> + * If no guest context is installed, in such a case, a VA=PA context
> + * is used instead.
> + *
> + * The access PTE will only be honored if the virtualization table was
> + * installed with HV_CTX_GUEST_CACHE.
> + *
> + * A virtualization context must already be installed prior to
> + * installing the guest context.
> + *
> + * @param page_table Root of the page table; the value is the guest's
> + * physical address (GPA), not a CPA.
> + * @param access PTE providing info on how to read the page table. This
> + * value must be consistent between multiple tiles sharing a page table,
> + * and must also be consistent with any virtual mappings the client
> + * may be using to access the page table.
> + * @param asid HV_ASID the page table is to be used for.
> + * @param flags Context flags, denoting attributes or privileges of the
> + * current context (HV_CTX_xxx).
> + * @return Zero on success, or a hypervisor error code on failure.
> + */
> +
> +int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
> + HV_ASID asid, __hv32 flags);
> +
>
> /** Set the number of pages ganged together by HV_PTE_SUPER at a
> * particular level of the page table.
> @@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
> * "super" page size must be less than the span of the next level in
> * the page table. The largest size that can be requested is 64GB.
> *
> - * The shift value is initially "0" for all page table levels,
> + * The shift value is initially 0 for all page table levels,
> * indicating that the HV_PTE_SUPER bit is effectively ignored.
> *
> * If you change the count from one non-zero value to another, the
> @@ -854,11 +954,26 @@ typedef struct
> } HV_Context;
>
> /** Retrieve information about the currently installed context.
> - * @return The data passed to the last successful hv_install_context call.
> + * @return The data passed to the last successful call to
> + * hv_install_context().
> */
> HV_Context hv_inquire_context(void);
>
>
> +/** Retrieve information about the currently installed virtualization context.
> + * @return The data passed to the last successful call to
> + * hv_install_virt_context().
> + */
> +HV_Context hv_inquire_virt_context(void);
> +
> +
> +/** Retrieve information about the currently installed guest context.
> + * @return The data passed to the last successful call to
> + * hv_install_guest_context().
> + */
> +HV_Context hv_inquire_guest_context(void);
> +
> +
> /** Flushes all translations associated with the named address space
> * identifier from the TLB and any other hypervisor data structures.
> * Translations installed with the "global" bit are not flushed.
> @@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
> /** Flushes all non-global translations (if preserve_global is true),
> * or absolutely all translations (if preserve_global is false).
> *
> - * @param preserve_global Non-zero if we want to preserve "global" mappings.
> + * @param preserve_global Non-zero if we want to preserve global mappings.
> * @return Zero on success, or a hypervisor error code on failure.
> */
> int hv_flush_all(int preserve_global);
> @@ -991,7 +1106,11 @@ typedef enum {
> HV_INQ_TILES_HFH_CACHE = 2,
>
> /** The set of tiles that can be legally used as a LOTAR for a PTE. */
> - HV_INQ_TILES_LOTAR = 3
> + HV_INQ_TILES_LOTAR = 3,
> +
> + /** The set of "shared" driver tiles that the hypervisor may
> + * periodically interrupt. */
> + HV_INQ_TILES_SHARED = 4
> } HV_InqTileSet;
>
> /** Returns specific information about various sets of tiles within the
> @@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
> */
> /** Message receive downcall interrupt vector */
> #define INT_MESSAGE_RCV_DWNCL INT_BOOT_ACCESS
> +/** Device interrupt downcall interrupt vector */
> +#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
> +#ifdef __tilegx__
> +/** Virtualization page table miss downcall interrupt vector */
> +#define INT_VPGTABLE_MISS_DWNCL INT_I_ASID
> +/** Virtualization guest illegal page table */
> +#define INT_VGUEST_FATAL_DWNCL INT_D_ASID
> +#else
> /** DMA TLB miss downcall interrupt vector */
> #define INT_DMATLB_MISS_DWNCL INT_DMA_ASID
> -/** Static nework processor instruction TLB miss interrupt vector */
> -#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
> /** DMA TLB access violation downcall interrupt vector */
> #define INT_DMATLB_ACCESS_DWNCL INT_DMA_CPL
> -/** Device interrupt downcall interrupt vector */
> -#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
> +/** Static nework processor instruction TLB miss interrupt vector */
> +#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
> +#endif
>
> #ifndef __ASSEMBLER__
>
> @@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> #define HV_PTE_PTFN_BITS 29 /**< Number of bits in a PTFN */
>
> /*
> - * Legal values for the PTE's mode field
> + * Legal values for the PTE's mode field.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> + * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
> + * to access MMIO resources via pseudo PAs that map to MMIO in the
> + * virtualization page table.
> */
> +
> /** Data is not resident in any caches; loads and stores access memory
> * directly.
> */
> @@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in the primary page table if a virtualization
> + * page table is installed.
> */
> #define HV_PTE_GLOBAL (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
>
> @@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in the virtualization page table.
> */
> #define HV_PTE_USER (__HV_PTE_ONE << HV_PTE_INDEX_USER)
>
> @@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * has been cleared, subsequent references are not guaranteed to set
> * it again until the translation has been flushed from the TLB.
> *
> - * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
> */
> #define HV_PTE_ACCESSED (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
>
> @@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * has been cleared, subsequent references are not guaranteed to set
> * it again until the translation has been flushed from the TLB.
> *
> - * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
> */
> #define HV_PTE_DIRTY (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
>
> @@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> *
> * In level-1 PTEs, if the Page bit is clear, this bit determines how the
> * level-2 page table is accessed.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_NC (__HV_PTE_ONE << HV_PTE_INDEX_NC)
>
> @@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> *
> * In level-1 PTEs, if the Page bit is clear, this bit
> * determines how the level-2 page table is accessed.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_NO_ALLOC_L1 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
>
> @@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> *
> * In level-1 PTEs, if the Page bit is clear, this bit determines how the
> * level-2 page table is accessed.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_NO_ALLOC_L2 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
>
> @@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * the page map directly to memory.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_CACHED_PRIORITY (__HV_PTE_ONE << \
> HV_PTE_INDEX_CACHED_PRIORITY)
> @@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * It is illegal for this bit to be clear if the Writable bit is set.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * If a virtualization page table is present, the final Readable status
> + * is the logical "and" of this bit in both page tables.
> */
> #define HV_PTE_READABLE (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
>
> @@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * PTE.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * If a virtualization page table is present, the final Writable status
> + * is the logical "and" of this bit in both page tables.
> */
> #define HV_PTE_WRITABLE (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
>
> @@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * than one.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * If a virtualization page table is present, the final Executable status
> + * is the logical "and" of this bit in both page tables.
> */
> #define HV_PTE_EXECUTABLE (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
>
> diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
> index e54b7b0..36fb24c 100644
> --- a/arch/tile/include/uapi/arch/sim.h
> +++ b/arch/tile/include/uapi/arch/sim.h
> @@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
> __insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
> }
>
> +/**
> + * Set vCPU number for a given task.
> + * @param vcpu Virtual cpu to set.
> + */
> +static __inline void
> +sim_set_vcpu(int vcpu)
> +{
> + __insn_mtspr(SPR_SIM_CONTROL,
> + SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
> +}
> +
> +/** Clear vCPU status for a given task. */
> +static __inline void
> +sim_clear_vcpu(void)
> +{
> + __insn_mtspr(SPR_SIM_CONTROL,
> + SIM_CONTROL_VCPU | (-1 << _SIM_CONTROL_OPERATOR_BITS));
> +}
> +
>
> /*
> * Event support.
> diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
> index 4b44a2b..b9aad66 100644
> --- a/arch/tile/include/uapi/arch/sim_def.h
> +++ b/arch/tile/include/uapi/arch/sim_def.h
> @@ -221,6 +221,14 @@
> */
> #define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
>
> +/**
> + * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
> + * number shifted by 8, will tag any identification of the cpu that
> + * task is running on with the given virtual cpu number. If the
> + * virtual cpu number is -1, the tag is removed.
> + */
> +#define SIM_CONTROL_VCPU 37
> +
>
> /*
> * Syscall numbers for use with "sim_syscall()".
> diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
> index c689446..4644c8d 100644
> --- a/arch/tile/include/uapi/arch/spr_def_32.h
> +++ b/arch/tile/include/uapi/arch/spr_def_32.h
> @@ -121,6 +121,9 @@
> #define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
> #define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
> #define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
> +#define SPR_MPL_GPV_SET_0 0x0600
> +#define SPR_MPL_GPV_SET_1 0x0601
> +#define SPR_MPL_GPV_SET_2 0x0602
> #define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
> #define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
> #define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
> @@ -142,6 +145,9 @@
> #define SPR_MPL_IDN_TIMER_SET_0 0x3400
> #define SPR_MPL_IDN_TIMER_SET_1 0x3401
> #define SPR_MPL_IDN_TIMER_SET_2 0x3402
> +#define SPR_MPL_ILL_SET_0 0x0400
> +#define SPR_MPL_ILL_SET_1 0x0401
> +#define SPR_MPL_ILL_SET_2 0x0402
> #define SPR_MPL_INTCTRL_0_SET_0 0x4a00
> #define SPR_MPL_INTCTRL_0_SET_1 0x4a01
> #define SPR_MPL_INTCTRL_0_SET_2 0x4a02
> @@ -166,6 +172,12 @@
> #define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
> #define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
> #define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
> +#define SPR_MPL_SWINT_0_SET_0 0x1c00
> +#define SPR_MPL_SWINT_0_SET_1 0x1c01
> +#define SPR_MPL_SWINT_0_SET_2 0x1c02
> +#define SPR_MPL_SWINT_1_SET_0 0x1a00
> +#define SPR_MPL_SWINT_1_SET_1 0x1a01
> +#define SPR_MPL_SWINT_1_SET_2 0x1a02
> #define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
> #define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
> #define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
> @@ -187,6 +199,9 @@
> #define SPR_MPL_UDN_TIMER_SET_0 0x3600
> #define SPR_MPL_UDN_TIMER_SET_1 0x3601
> #define SPR_MPL_UDN_TIMER_SET_2 0x3602
> +#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
> +#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
> +#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
> #define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
> #define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
> #define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
> diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
> index 67a6c17..727cda7 100644
> --- a/arch/tile/include/uapi/arch/spr_def_64.h
> +++ b/arch/tile/include/uapi/arch/spr_def_64.h
> @@ -21,6 +21,10 @@
> #define SPR_AUX_PERF_COUNT_1 0x2106
> #define SPR_AUX_PERF_COUNT_CTL 0x2107
> #define SPR_AUX_PERF_COUNT_STS 0x2108
> +#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
> +#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK 0xffffffff
> +#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
> +#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
> #define SPR_CMPEXCH_VALUE 0x2780
> #define SPR_CYCLE 0x2781
> #define SPR_DONE 0x2705
> @@ -101,6 +105,9 @@
> #define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
> #define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
> #define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
> +#define SPR_MPL_GPV_SET_0 0x0900
> +#define SPR_MPL_GPV_SET_1 0x0901
> +#define SPR_MPL_GPV_SET_2 0x0902
> #define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
> #define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
> #define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
> @@ -116,6 +123,12 @@
> #define SPR_MPL_IDN_TIMER_SET_0 0x1800
> #define SPR_MPL_IDN_TIMER_SET_1 0x1801
> #define SPR_MPL_IDN_TIMER_SET_2 0x1802
> +#define SPR_MPL_ILL_SET_0 0x0800
> +#define SPR_MPL_ILL_SET_1 0x0801
> +#define SPR_MPL_ILL_SET_2 0x0802
> +#define SPR_MPL_ILL_TRANS_SET_0 0x1000
> +#define SPR_MPL_ILL_TRANS_SET_1 0x1001
> +#define SPR_MPL_ILL_TRANS_SET_2 0x1002
> #define SPR_MPL_INTCTRL_0_SET_0 0x2500
> #define SPR_MPL_INTCTRL_0_SET_1 0x2501
> #define SPR_MPL_INTCTRL_0_SET_2 0x2502
> @@ -140,6 +153,15 @@
> #define SPR_MPL_PERF_COUNT_SET_0 0x2000
> #define SPR_MPL_PERF_COUNT_SET_1 0x2001
> #define SPR_MPL_PERF_COUNT_SET_2 0x2002
> +#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
> +#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
> +#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
> +#define SPR_MPL_SWINT_0_SET_0 0x0f00
> +#define SPR_MPL_SWINT_0_SET_1 0x0f01
> +#define SPR_MPL_SWINT_0_SET_2 0x0f02
> +#define SPR_MPL_SWINT_1_SET_0 0x0e00
> +#define SPR_MPL_SWINT_1_SET_1 0x0e01
> +#define SPR_MPL_SWINT_1_SET_2 0x0e02
> #define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
> #define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
> #define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
> @@ -155,6 +177,9 @@
> #define SPR_MPL_UDN_TIMER_SET_0 0x1900
> #define SPR_MPL_UDN_TIMER_SET_1 0x1901
> #define SPR_MPL_UDN_TIMER_SET_2 0x1902
> +#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
> +#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
> +#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
> #define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
> #define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
> #define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
> diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
> index c20db8e..f07cc24 100644
> --- a/arch/tile/include/uapi/asm/Kbuild
> +++ b/arch/tile/include/uapi/asm/Kbuild
> @@ -6,7 +6,9 @@ header-y += bitsperlong.h
> header-y += byteorder.h
> header-y += cachectl.h
> header-y += hardwall.h
> +header-y += kvm.h
> header-y += kvm_para.h
> +header-y += kvm_virtio.h
> header-y += mman.h
> header-y += ptrace.h
> header-y += setup.h
> diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
> new file mode 100644
> index 0000000..4346520
> --- /dev/null
> +++ b/arch/tile/include/uapi/asm/kvm.h
> @@ -0,0 +1,267 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _UAPI_ASM_TILE_KVM_H
> +#define _UAPI_ASM_TILE_KVM_H
> +
> +#ifndef __ASSEMBLER__
> +#include <linux/ptrace.h>
> +#endif
> +
> +#include <arch/abi.h>
> +
> +/*
> + * For Hypervisor syscalls. Note this comes from the hv: syscall.h,
> + * with small modifications: Remove HV_SYS_fence_incoherent.
> + */
> +/* Syscall allowed from guest PL bit mask. */
> +#define HV_SYS_GUEST_SHIFT 12
> +#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT)
> +/* downcall_dispatch; this syscall number must be zero */
> +#define HV_SYS_downcall_dispatch 0
> +/* install_context */
> +#define HV_SYS_install_context 1
> +/* sysconf */
> +#define HV_SYS_sysconf 2
> +/* get_rtc */
> +#define HV_SYS_get_rtc 3
> +/* set_rtc */
> +#define HV_SYS_set_rtc 4
> +/* flush_asid */
> +#define HV_SYS_flush_asid 5
> +/* flush_page */
> +#define HV_SYS_flush_page 6
> +/* flush_pages */
> +#define HV_SYS_flush_pages 7
> +/* restart */
> +#define HV_SYS_restart 8
> +/* halt */
> +#define HV_SYS_halt 9
> +/* power_off */
> +#define HV_SYS_power_off 10
> +/* inquire_physical */
> +#define HV_SYS_inquire_physical 11
> +/* inquire_memory_controller */
> +#define HV_SYS_inquire_memory_controller 12
> +/* inquire_virtual */
> +#define HV_SYS_inquire_virtual 13
> +/* inquire_asid */
> +#define HV_SYS_inquire_asid 14
> +/* console_read_if_ready */
> +#define HV_SYS_console_read_if_ready 15
> +/* console_write */
> +#define HV_SYS_console_write 16
> +/* init */
> +#define HV_SYS_init 17
> +/* inquire_topology */
> +#define HV_SYS_inquire_topology 18
> +/* fs_findfile */
> +#define HV_SYS_fs_findfile 19
> +/* fs_fstat */
> +#define HV_SYS_fs_fstat 20
> +/* fs_pread */
> +#define HV_SYS_fs_pread 21
> +/* physaddr_read64 */
> +#define HV_SYS_physaddr_read64 22
> +/* physaddr_write64 */
> +#define HV_SYS_physaddr_write64 23
> +/* get_command_line */
> +#define HV_SYS_get_command_line 24
> +/* set_caching */
> +#define HV_SYS_set_caching 25
> +/* bzero_page */
> +#define HV_SYS_bzero_page 26
> +/* register_message_state */
> +#define HV_SYS_register_message_state 27
> +/* send_message */
> +#define HV_SYS_send_message 28
> +/* receive_message */
> +#define HV_SYS_receive_message 29
> +/* inquire_context */
> +#define HV_SYS_inquire_context 30
> +/* start_all_tiles */
> +#define HV_SYS_start_all_tiles 31
> +/* dev_open */
> +#define HV_SYS_dev_open 32
> +/* dev_close */
> +#define HV_SYS_dev_close 33
> +/* dev_pread */
> +#define HV_SYS_dev_pread 34
> +/* dev_pwrite */
> +#define HV_SYS_dev_pwrite 35
> +/* dev_poll */
> +#define HV_SYS_dev_poll 36
> +/* dev_poll_cancel */
> +#define HV_SYS_dev_poll_cancel 37
> +/* dev_preada */
> +#define HV_SYS_dev_preada 38
> +/* dev_pwritea */
> +#define HV_SYS_dev_pwritea 39
> +/* flush_remote */
> +#define HV_SYS_flush_remote 40
> +/* console_putc */
> +#define HV_SYS_console_putc 41
> +/* inquire_tiles */
> +#define HV_SYS_inquire_tiles 42
> +/* confstr */
> +#define HV_SYS_confstr 43
> +/* reexec */
> +#define HV_SYS_reexec 44
> +/* set_command_line */
> +#define HV_SYS_set_command_line 45
> +
> +/* store_mapping */
> +#define HV_SYS_store_mapping 52
> +/* inquire_realpa */
> +#define HV_SYS_inquire_realpa 53
> +/* flush_all */
> +#define HV_SYS_flush_all 54
> +/* get_ipi_pte */
> +#define HV_SYS_get_ipi_pte 55
> +/* set_pte_super_shift */
> +#define HV_SYS_set_pte_super_shift 56
> +/* set_speed */
> +#define HV_SYS_set_speed 57
> +/* install_virt_context */
> +#define HV_SYS_install_virt_context 58
> +/* inquire_virt_context */
> +#define HV_SYS_inquire_virt_context 59
> +/* install_guest_context */
> +#define HV_SYS_install_guest_context 60
> +/* inquire_guest_context */
> +#define HV_SYS_inquire_guest_context 61
> +
> +/*
> + * First hypercall number (from guest OS to host OS) not used for the
> + * hv_*() calls. We reserve the first 128 entries for the usual hv_*()
> + * calls as defined in hypervisor.h.
> + */
> +#define KVM_OTHER_HCALL 128
> +
> +/* Hypercall index for virtio. */
> +#define KVM_HCALL_virtio 128
> +
> +/* One greater than the maximum hypercall number. */
> +#define KVM_NUM_HCALLS 256
> +
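So hypercall numbers 0..127 correspond to the hv_*() services listed above, and numbers from KVM_OTHER_HCALL up to KVM_NUM_HCALLS-1 are the KVM-specific calls such as KVM_HCALL_virtio. A sketch of the split, for illustration only (the real dispatch lives in kvm-tile.c, not quoted here):

static bool tile_hcall_is_hv_service(unsigned long num)
{
	/* 0 .. KVM_OTHER_HCALL-1 map onto the hv_*() services. */
	return num < KVM_OTHER_HCALL;
}

static bool tile_hcall_in_range(unsigned long num)
{
	/* Anything at or above KVM_NUM_HCALLS is rejected. */
	return num < KVM_NUM_HCALLS;
}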
> +#ifndef __ASSEMBLER__
> +
> +struct kvm_regs {
> + struct pt_regs regs;
> +};
> +
> +#define FOR_EACH_GUEST_SPR(f) \
> + f(INTERRUPT_MASK_1); \
> + f(INTERRUPT_VECTOR_BASE_1); \
> + f(EX_CONTEXT_1_0); \
> + f(EX_CONTEXT_1_1); \
> + f(SYSTEM_SAVE_1_0); \
> + f(SYSTEM_SAVE_1_1); \
> + f(SYSTEM_SAVE_1_2); \
> + f(SYSTEM_SAVE_1_3); \
> + f(INTCTRL_1_STATUS); \
> + f(IPI_MASK_1); \
> + f(IPI_EVENT_1); \
> + f(SINGLE_STEP_CONTROL_1); \
> + f(SINGLE_STEP_EN_1_1); \
> +
> +struct kvm_sregs {
> +#define DECLARE_SPR(f) unsigned long f
> + FOR_EACH_GUEST_SPR(DECLARE_SPR)
> +#undef DECLARE_SPR
> +};
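The same X-macro can be reused wherever the guest SPR set has to be walked. For example, a host-side snapshot of the guest SPRs into a struct kvm_sregs could look roughly like the sketch below; the real KVM_GET_SREGS path is in kvm-tile.c, which is not quoted here:

static void tile_save_guest_sprs(struct kvm_sregs *sregs)
{
#define SAVE_SPR(f) sregs->f = __insn_mfspr(SPR_##f)
	FOR_EACH_GUEST_SPR(SAVE_SPR)
#undef SAVE_SPR
}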
> +
> +struct kvm_fpu {
> +};
> +
> +struct kvm_debug_exit_arch {
> +};
> +
> +struct kvm_guest_debug_arch {
> +};
> +
> +/* definition of registers in kvm_run */
> +struct kvm_sync_regs {
> +};
> +
> +#ifndef __KERNEL__
> +/* For hv_*() */
> +#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
> +#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
> +#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
> +#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
> +/* For others */
> +#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
> +#endif
> +
> +#define HCALL_DEFS \
> + /* For hv_*() */ \
> + KVM_EMULATE(init) \
> + NO_EMULATE(install_context) \
> + KVM_EMULATE(sysconf) \
> + KVM_EMULATE(get_rtc) \
> + KVM_EMULATE(set_rtc) \
> + NO_EMULATE(flush_asid) \
> + NO_EMULATE(flush_page) \
> + NO_EMULATE(flush_pages) \
> + USER_EMULATE(restart) \
> + USER_EMULATE(halt) \
> + USER_EMULATE(power_off) \
> + USER_EMULATE(inquire_physical) \
> + USER_EMULATE(inquire_memory_controller) \
> + KVM_EMULATE(inquire_virtual) \
> + KVM_EMULATE(inquire_asid) \
> + NO_EMULATE(console_read_if_ready) \
> + NO_EMULATE(console_write) \
> + NO_EMULATE(downcall_dispatch) \
> + KVM_EMULATE(inquire_topology) \
> + USER_EMULATE(fs_findfile) \
> + USER_EMULATE(fs_fstat) \
> + USER_EMULATE(fs_pread) \
> + KVM_EMULATE(physaddr_read64) \
> + KVM_EMULATE(physaddr_write64) \
> + USER_EMULATE(get_command_line) \
> + USER_EMULATE(set_caching) \
> + NO_EMULATE(bzero_page) \
> + KVM_EMULATE(register_message_state) \
> + KVM_EMULATE(send_message) \
> + KVM_EMULATE(receive_message) \
> + KVM_EMULATE(inquire_context) \
> + KVM_EMULATE(start_all_tiles) \
> + USER_EMULATE(dev_open) \
> + USER_EMULATE(dev_close) \
> + USER_EMULATE(dev_pread) \
> + USER_EMULATE(dev_pwrite) \
> + USER_EMULATE(dev_poll) \
> + USER_EMULATE(dev_poll_cancel) \
> + USER_EMULATE(dev_preada) \
> + USER_EMULATE(dev_pwritea) \
> + USER_EMULATE(flush_remote) \
> + NO_EMULATE(console_putc) \
> + KVM_EMULATE(inquire_tiles) \
> + KVM_EMULATE(confstr) \
> + USER_EMULATE(reexec) \
> + USER_EMULATE(set_command_line) \
> + USER_EMULATE(store_mapping) \
> + NO_EMULATE(inquire_realpa) \
> + NO_EMULATE(flush_all) \
> + KVM_EMULATE(get_ipi_pte) \
> + KVM_EMULATE(set_pte_super_shift) \
> + KVM_EMULATE(set_speed) \
> + /* For others */ \
> + USER_HCALL(virtio)
> +
> +#endif
> +
> +#endif /* _UAPI_ASM_TILE_KVM_H */
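With the !__KERNEL__ macros above, HCALL_DEFS is evidently meant to expand into designated initializers for a userspace dispatch table, e.g. USER_EMULATE(restart) becomes "[HV_SYS_restart] = qemu_emulate_hv_restart,". The qemu side is not part of this patch, but the intended shape is presumably something like the following, where hcall_handler_t and the qemu_* handler names are assumptions:

static const hcall_handler_t hcall_table[KVM_NUM_HCALLS] = {
	HCALL_DEFS	/* fills in both the hv_*() and the virtio entries */
};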
> diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
> new file mode 100644
> index 0000000..d94f535
> --- /dev/null
> +++ b/arch/tile/include/uapi/asm/kvm_virtio.h
> @@ -0,0 +1,60 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
> +#define _UAPI_ASM_TILE_KVM_VIRTIO_H
> +
> +#include <linux/types.h>
> +
> +#define KVM_VIRTIO_UNKNOWN 0
> +#define KVM_VIRTIO_NOTIFY 1
> +#define KVM_VIRTIO_RESET 2
> +#define KVM_VIRTIO_SET_STATUS 3
> +
> +struct kvm_device_desc {
> + /* The device type: console, network, disk etc. Type 0 terminates. */
> + __u8 type;
> + /* The number of virtqueues (first in config array) */
> + __u8 num_vq;
> + /*
> + * The number of bytes of feature bits. Multiply by 2: one for host
> + * features and one for Guest acknowledgements.
> + */
> + __u8 feature_len;
> + /* The number of bytes of the config array after virtqueues. */
> + __u8 config_len;
> + /* A status byte, written by the Guest. */
> + __u8 status;
> + __u64 config[0];
> +};
> +
> +struct kvm_vqinfo {
> + /* Pointer to the information contained in the device config. */
> + struct kvm_vqconfig *config;
> + /* The address where we mapped the virtio ring, so we can unmap it. */
> + void *pages;
> +};
> +
> +struct kvm_vqconfig {
> + /* The physical address of the virtio ring */
> + __u64 pa;
> + /* The number of entries in the virtio_ring */
> + __u64 num;
> + /* The interrupt we get when something happens. Set by the guest. */
> + __u32 irq;
> +
> +};
> +
> +
> +#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
> diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
> index b7c8b5e..b638d3e 100644
> --- a/arch/tile/kernel/Makefile
> +++ b/arch/tile/kernel/Makefile
> @@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o
> obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o
> obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o
> obj-$(CONFIG_KPROBES) += kprobes.o
> +obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o
>
> obj-y += vdso/
> diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
> index 97ea6ac..0a04a16 100644
> --- a/arch/tile/kernel/asm-offsets.c
> +++ b/arch/tile/kernel/asm-offsets.c
> @@ -20,6 +20,9 @@
> #include <linux/hardirq.h>
> #include <linux/ptrace.h>
> #include <hv/hypervisor.h>
> +#ifdef CONFIG_KVM
> +#include <linux/kvm_host.h>
> +#endif
>
> /* Check for compatible compiler early in the build. */
> #ifdef CONFIG_TILEGX
> @@ -68,6 +71,10 @@ void foo(void)
> DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
> offsetof(struct thread_info, unalign_jit_tmp));
> #endif
> +#ifdef CONFIG_KVM
> + DEFINE(THREAD_INFO_VCPU_OFFSET,
> + offsetof(struct thread_info, vcpu));
> +#endif
>
> DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
> offsetof(struct task_struct, thread.ksp));
> diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
> index b608e00..53f2be4 100644
> --- a/arch/tile/kernel/early_printk.c
> +++ b/arch/tile/kernel/early_printk.c
> @@ -18,11 +18,26 @@
> #include <linux/string.h>
> #include <linux/irqflags.h>
> #include <linux/printk.h>
> +#ifdef CONFIG_KVM_GUEST
> +#include <linux/virtio_console.h>
> +#include <linux/kvm_para.h>
> +#include <asm/kvm_virtio.h>
> +#endif
> #include <asm/setup.h>
> #include <hv/hypervisor.h>
>
> static void early_hv_write(struct console *con, const char *s, unsigned n)
> {
> +#ifdef CONFIG_KVM_GUEST
> + char buf[512];
> +
> + if (n > sizeof(buf) - 1)
> + n = sizeof(buf) - 1;
> + memcpy(buf, s, n);
> + buf[n] = '\0';
> +
> + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
> +#else
> tile_console_write(s, n);
>
> /*
> @@ -32,6 +47,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n)
> */
> if (n && s[n-1] == '\n')
> tile_console_write("\r", 1);
> +#endif
> }
>
> static struct console early_hv_console = {
> diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
> index f3f17b0..8d5b40f 100644
> --- a/arch/tile/kernel/head_32.S
> +++ b/arch/tile/kernel/head_32.S
> @@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
> .set addr, addr + PGDIR_SIZE
> .endr
>
> - /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
> - PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
> + /* The true text VAs are mapped as VA = PA + MEM_SV_START */
> + PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
> (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
> .org swapper_pg_dir + PGDIR_SIZE
> END(swapper_pg_dir)
> diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
> index 652b814..bd0e12f 100644
> --- a/arch/tile/kernel/head_64.S
> +++ b/arch/tile/kernel/head_64.S
> @@ -135,9 +135,9 @@ ENTRY(_start)
> 1:
>
> /* Install the interrupt base. */
> - moveli r0, hw2_last(MEM_SV_START)
> - shl16insli r0, r0, hw1(MEM_SV_START)
> - shl16insli r0, r0, hw0(MEM_SV_START)
> + moveli r0, hw2_last(intrpt_start)
> + shl16insli r0, r0, hw1(intrpt_start)
> + shl16insli r0, r0, hw0(intrpt_start)
> mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
>
> /* Get our processor number and save it away in SAVE_K_0. */
> diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
> index 16576c6..2914a9e 100644
> --- a/arch/tile/kernel/hvglue.S
> +++ b/arch/tile/kernel/hvglue.S
> @@ -71,5 +71,11 @@ gensym hv_flush_all, 0x6e0, 32
> gensym hv_get_ipi_pte, 0x700, 32
> gensym hv_set_pte_super_shift, 0x720, 32
> gensym hv_set_speed, 0x740, 32
> +gensym hv_install_virt_context, 0x760, 32
> +gensym hv_inquire_virt_context, 0x780, 32
> +gensym hv_install_guest_context, 0x7a0, 32
> +gensym hv_inquire_guest_context, 0x7c0, 32
> gensym hv_console_set_ipi, 0x7e0, 32
> -gensym hv_glue_internals, 0x800, 30720
> +gensym hv_glue_internals, 0x800, 2048
> +gensym hcall_virtio, 0x1000, 32
> +gensym hv_hcall_internals, 0x1020, 28640
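hcall_virtio gets a fixed glue slot at offset 0x1000 and is what the guest-side virtio code later in this patch calls, e.g. hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf)). Its C prototype is presumably declared in <asm/kvm_para.h> (present in the diffstat but not quoted here), roughly:

/* Assumed prototype; the exact return type is not visible in this excerpt. */
int hcall_virtio(unsigned long subcall, unsigned long paddr);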
> diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
> index 16ef6c1..3b15c76 100644
> --- a/arch/tile/kernel/hvglue_trace.c
> +++ b/arch/tile/kernel/hvglue_trace.c
> @@ -75,6 +75,10 @@
> #define hv_get_ipi_pte _hv_get_ipi_pte
> #define hv_set_pte_super_shift _hv_set_pte_super_shift
> #define hv_set_speed _hv_set_speed
> +#define hv_install_virt_context _hv_install_virt_context
> +#define hv_inquire_virt_context _hv_inquire_virt_context
> +#define hv_install_guest_context _hv_install_guest_context
> +#define hv_inquire_guest_context _hv_inquire_guest_context
> #define hv_console_set_ipi _hv_console_set_ipi
> #include <hv/hypervisor.h>
> #undef hv_init
> @@ -135,6 +139,10 @@
> #undef hv_get_ipi_pte
> #undef hv_set_pte_super_shift
> #undef hv_set_speed
> +#undef hv_install_virt_context
> +#undef hv_inquire_virt_context
> +#undef hv_install_guest_context
> +#undef hv_inquire_guest_context
> #undef hv_console_set_ipi
>
> /*
> @@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
> unsigned long, flags)
> HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
> HV_ASID, asid, __hv32, flags)
> +HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
> + HV_ASID, asid, __hv32, flags)
> +HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
> + HV_ASID, asid, __hv32, flags)
> HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
> HV_WRAP0(HV_Context, hv_inquire_context)
> +HV_WRAP0(HV_Context, hv_inquire_virt_context)
> +HV_WRAP0(HV_Context, hv_inquire_guest_context)
> HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
> HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
> HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
> diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
> index f3d26f4..2ce69a5 100644
> --- a/arch/tile/kernel/intvec_32.S
> +++ b/arch/tile/kernel/intvec_32.S
> @@ -353,7 +353,7 @@ intvec_\vecname:
> #ifdef __COLLECT_LINKER_FEEDBACK__
> .pushsection .text.intvec_feedback,"ax"
> .org (\vecnum << 5)
> - FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
> + FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
> jrp lr
> .popsection
> #endif
> @@ -806,7 +806,7 @@ handle_interrupt:
> STD_ENTRY(interrupt_return)
> /* If we're resuming to kernel space, don't check thread flags. */
> {
> - bnz r30, .Lrestore_all /* NMIs don't special-case user-space */
> + bnz r30, restore_all /* NMIs don't special-case user-space */
> PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
> }
> lw r29, r29
> @@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
> seq r27, r27, r28
> }
> {
> - bbns r27, .Lrestore_all
> + bbns r27, restore_all
> addi r28, r28, 8
> }
> sw r29, r28
> - j .Lrestore_all
> + j restore_all
>
> .Lresume_userspace:
> FEEDBACK_REENTER(interrupt_return)
> @@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
> auli r1, r1, ha16(_TIF_ALLWORK_MASK)
> }
> and r1, r29, r1
> - bzt r1, .Lrestore_all
> + bzt r1, restore_all
>
> /*
> * Make sure we have all the registers saved for signal
> @@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
> * profile interrupt will actually disable interrupts in both SPRs
> * before returning, which is OK.)
> */
> -.Lrestore_all:
> + .global restore_all
> + .type restore_all, @function
> +restore_all:
> PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
> {
> lw r0, r0
> @@ -1890,8 +1892,8 @@ int_unalign:
> push_extra_callee_saves r0
> j do_trap
>
> -/* Include .intrpt1 array of interrupt vectors */
> - .section ".intrpt1", "ax"
> +/* Include .intrpt array of interrupt vectors */
> + .section ".intrpt", "ax"
>
> #define op_handle_perf_interrupt bad_intr
> #define op_handle_aux_perf_interrupt bad_intr
> diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
> index 18b2dcc..2c5cbe0 100644
> --- a/arch/tile/kernel/intvec_64.S
> +++ b/arch/tile/kernel/intvec_64.S
> @@ -29,11 +29,25 @@
> #include <arch/abi.h>
> #include <arch/interrupts.h>
> #include <arch/spr_def.h>
> +#include <arch/opcode.h>
> +#ifdef CONFIG_KVM
> +#include <asm/kvm_host.h>
> +#endif
>
> #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
>
> #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
>
> +#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
> +/*
> + * Set "result" non-zero if ex1 holds the PL of the kernel
> + * (with or without ICS being set). Note this works only
> + * because we never find the PL at level 3.
> + */
> +# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
> +#else
> +# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
> +#endif
>
> .macro push_reg reg, ptr=sp, delta=-8
> {
> @@ -308,7 +322,7 @@ intvec_\vecname:
> */
> {
> blbs sp, 2f
> - andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
> + IS_KERNEL_EX1(r0, r0)
> }
>
> .ifc \vecnum, INT_DOUBLE_FAULT
> @@ -347,10 +361,6 @@ intvec_\vecname:
> *
> * Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
> * any path that turns into a downcall to one of our TLB handlers.
> - *
> - * FIXME: if we end up never using this path, perhaps we should
> - * prevent the hypervisor from generating downcalls in this case.
> - * The advantage of getting a downcall is we can panic in Linux.
> */
> mfspr r0, SPR_SYSTEM_SAVE_K_2
> {
> @@ -490,6 +500,10 @@ intvec_\vecname:
> mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
> mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
> .else
> + .ifc \c_routine, kvm_vpgtable_miss
> + mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
> + mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
> + .else
> .ifc \vecnum, INT_ILL_TRANS
> mfspr r2, ILL_VA_PC
> .else
> @@ -512,6 +526,7 @@ intvec_\vecname:
> .endif
> .endif
> .endif
> + .endif
> /* Put function pointer in r0 */
> moveli r0, hw2_last(\c_routine)
> shl16insli r0, r0, hw1(\c_routine)
> @@ -525,7 +540,7 @@ intvec_\vecname:
> #ifdef __COLLECT_LINKER_FEEDBACK__
> .pushsection .text.intvec_feedback,"ax"
> .org (\vecnum << 5)
> - FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
> + FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
> jrp lr
> .popsection
> #endif
> @@ -641,24 +656,25 @@ intvec_\vecname:
> /*
> * If we will be returning to the kernel, we will need to
> * reset the interrupt masks to the state they had before.
> - * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
> + * Set DISABLE_IRQ in flags iff we came from kernel pl with
> + * irqs disabled.
> */
> - mfspr r32, SPR_EX_CONTEXT_K_1
> + mfspr r22, SPR_EX_CONTEXT_K_1
> {
> - andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
> + IS_KERNEL_EX1(r22, r22)
> PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
> }
> - beqzt r32, 1f /* zero if from user space */
> - IRQS_DISABLED(r32) /* zero if irqs enabled */
> + beqzt r22, 1f /* zero if from user space */
> + IRQS_DISABLED(r22) /* zero if irqs enabled */
> #if PT_FLAGS_DISABLE_IRQ != 1
> # error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
> #endif
> 1:
> .ifnc \function,handle_syscall
> /* Record the fact that we saved the caller-save registers above. */
> - ori r32, r32, PT_FLAGS_CALLER_SAVES
> + ori r22, r22, PT_FLAGS_CALLER_SAVES
> .endif
> - st r21, r32
> + st r21, r22
>
> /*
> * we've captured enough state to the stack (including in
> @@ -698,12 +714,29 @@ intvec_\vecname:
> move tp, zero
> #endif
>
> + /*
> + * Prepare the first 256 stack bytes to be rapidly accessible
> + * without having to fetch the background data.
> + */
> + addi r52, sp, -64
> + {
> + wh64 r52
> + addi r52, r52, -64
> + }
> + {
> + wh64 r52
> + addi r52, r52, -64
> + }
> + {
> + wh64 r52
> + addi r52, r52, -64
> + }
> + wh64 r52
> +
> #ifdef __COLLECT_LINKER_FEEDBACK__
> /*
> * Notify the feedback routines that we were in the
> - * appropriate fixed interrupt vector area. Note that we
> - * still have ICS set at this point, so we can't invoke any
> - * atomic operations or we will panic. The feedback
> + * appropriate fixed interrupt vector area. The feedback
> * routines internally preserve r0..r10 and r30 up.
> */
> .ifnc \function,handle_syscall
> @@ -722,23 +755,15 @@ intvec_\vecname:
> #endif
>
> /*
> - * Prepare the first 256 stack bytes to be rapidly accessible
> - * without having to fetch the background data.
> + * Stash any interrupt state in r30..r33 for now.
> + * This makes it easier to call C code in the code that follows.
> + * We don't need to on the syscall path since we reload
> + * them from the stack instead.
> */
> - addi r52, sp, -64
> - {
> - wh64 r52
> - addi r52, r52, -64
> - }
> - {
> - wh64 r52
> - addi r52, r52, -64
> - }
> - {
> - wh64 r52
> - addi r52, r52, -64
> - }
> - wh64 r52
> + .ifnc \function,handle_syscall
> + { move r30, r0; move r31, r1 }
> + { move r32, r2; move r33, r3 }
> + .endif
>
> #ifdef CONFIG_TRACE_IRQFLAGS
> .ifnc \function,handle_nmi
> @@ -749,17 +774,8 @@ intvec_\vecname:
> * For syscalls, we already have the register state saved away
> * on the stack, so we don't bother to do any register saves here,
> * and later we pop the registers back off the kernel stack.
> - * For interrupt handlers, save r0-r3 in callee-saved registers.
> */
> - .ifnc \function,handle_syscall
> - { move r30, r0; move r31, r1 }
> - { move r32, r2; move r33, r3 }
> - .endif
> TRACE_IRQS_OFF
> - .ifnc \function,handle_syscall
> - { move r0, r30; move r1, r31 }
> - { move r2, r32; move r3, r33 }
> - .endif
> .endif
> #endif
>
> @@ -808,11 +824,11 @@ handle_interrupt:
> STD_ENTRY(interrupt_return)
> /* If we're resuming to kernel space, don't check thread flags. */
> {
> - bnez r30, .Lrestore_all /* NMIs don't special-case user-space */
> + bnez r30, restore_all /* NMIs don't special-case user-space */
> PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
> }
> ld r29, r29
> - andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
> + IS_KERNEL_EX1(r29, r29)
> {
> beqzt r29, .Lresume_userspace
> move r29, sp
> @@ -824,14 +840,25 @@ STD_ENTRY(interrupt_return)
> addli r28, r29, THREAD_INFO_FLAGS_OFFSET
> {
> ld r28, r28
> - addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
> + addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
> }
> {
> - andi r28, r28, _TIF_NEED_RESCHED
> - ld4s r29, r29
> + andi r27, r28, _TIF_NEED_RESCHED
> + ld4s r26, r26
> }
> - beqzt r28, 1f
> - bnez r29, 1f
> + beqzt r27, 1f
> + bnez r26, 1f
> +#ifdef CONFIG_KVM
> + addli r27, r29, THREAD_INFO_VCPU_OFFSET
> + ld r27, r27
> + {
> + beqzt r27, 0f
> + movei r1, KVM_EXIT_AGAIN
> + }
> + push_extra_callee_saves r0
> + j kvm_trigger_vmexit
> +0:
> +#endif
> jal preempt_schedule_irq
> FEEDBACK_REENTER(interrupt_return)
> 1:
> @@ -853,11 +880,11 @@ STD_ENTRY(interrupt_return)
> cmpeq r27, r27, r28
> }
> {
> - blbc r27, .Lrestore_all
> + blbc r27, restore_all
> addi r28, r28, 8
> }
> st r29, r28
> - j .Lrestore_all
> + j restore_all
>
> .Lresume_userspace:
> FEEDBACK_REENTER(interrupt_return)
> @@ -897,7 +924,7 @@ STD_ENTRY(interrupt_return)
> shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
> }
> and r1, r29, r1
> - beqzt r1, .Lrestore_all
> + beqzt r1, restore_all
>
> /*
> * Make sure we have all the registers saved for signal
> @@ -929,14 +956,16 @@ STD_ENTRY(interrupt_return)
> * ICS can only be used in very tight chunks of code to avoid
> * tripping over various assertions that it is off.
> */
> -.Lrestore_all:
> + .global restore_all
> + .type restore_all, @function
> +restore_all:
> PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
> {
> ld r0, r0
> PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
> }
> {
> - andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
> + IS_KERNEL_EX1(r0, r0)
> ld r32, r32
> }
> bnez r0, 1f
> @@ -1007,7 +1036,7 @@ STD_ENTRY(interrupt_return)
> pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
> {
> mtspr SPR_EX_CONTEXT_K_1, lr
> - andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
> + IS_KERNEL_EX1(lr, lr)
> }
> {
> mtspr SPR_EX_CONTEXT_K_0, r21
> @@ -1457,6 +1486,26 @@ int_unalign:
> j do_unaligned
> ENDPROC(hand_unalign_slow)
>
> +#ifdef CONFIG_KVM
> +/*
> + * Any call path that may lead to a vmexit needs to save the full
> + * callee-save register state, since if we vmexit we don't unwind
> + * the callee-saves from the C function stack frames, and instead
> + * just save away the register state from the interrupt handler as-is
> + * and later reload it directly and call back into the guest.
> + */
> + .macro save_callee_saves_and_tailcall func
> +kvm_\func:
> + push_extra_callee_saves r0
> + j kvm_do_\func
> + ENDPROC(\func)
> + .endm
> +
> + save_callee_saves_and_tailcall hypervisor_call
> + save_callee_saves_and_tailcall vpgtable_miss
> + save_callee_saves_and_tailcall vguest_fatal
> +#endif
> +
> /* Fill the return address stack with nonzero entries. */
> STD_ENTRY(fill_ra_stack)
> {
> @@ -1469,13 +1518,57 @@ STD_ENTRY(fill_ra_stack)
> 4: jrp r0
> STD_ENDPROC(fill_ra_stack)
>
> +#ifdef CONFIG_KVM
> +/*
> + * Handle the downcall dispatch service. On entry, the client's
> + * system save register 3 holds the original contents of
> + * REG_SYSCALL_NR_NAME, which we need to restore before we iret to
> + * the correct interrupt vector.
> + * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
> + * here, since this is the only interrupt handled this way on GX.
> + */
> +handle_downcall_dispatch:
> + /*
> + * If we were called from PL0, jump back to slow path.
> + * We check just the low bit to make sure it's set, since we
> + * can only be called from PL0 or PL1.
> + */
> + mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
> + blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0
> +
> + /* Set the PC to the downcall interrupt vector, and PL to guest. */
> + mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
> + addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
> + INT_MESSAGE_RCV_DWNCL << 8
> + {
> + mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
> + movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
> + }
> + mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
> +
> + /* Restore REG_SYSCALL_NR_NAME and return to the new vector. */
> + mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
> + iret
> +
> + .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \
> + processing=handle_interrupt
> + .org (\vecnum << 8)
> + /* Need special code for downcall dispatch syscall. */
> + beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
> + __int_hand \vecnum, \vecname, \c_routine, \processing
> + .endm
> +
> +#endif /* CONFIG_KVM */
> +
> .macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt
> .org (\vecnum << 8)
> __int_hand \vecnum, \vecname, \c_routine, \processing
> .endm
>
> -/* Include .intrpt1 array of interrupt vectors */
> - .section ".intrpt1", "ax"
> +/* Include .intrpt array of interrupt vectors */
> + .section ".intrpt", "ax"
> + .global intrpt_start
> +intrpt_start:
>
> #define op_handle_perf_interrupt bad_intr
> #define op_handle_aux_perf_interrupt bad_intr
> @@ -1484,6 +1577,11 @@ STD_ENTRY(fill_ra_stack)
> #define do_hardwall_trap bad_intr
> #endif
>
> +#ifndef CONFIG_KVM
> +#define kvm_vpgtable_miss bad_intr
> +#define kvm_vguest_fatal bad_intr
> +#endif
> +
> int_hand INT_MEM_ERROR, MEM_ERROR, do_trap
> int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
> #if CONFIG_KERNEL_PL == 2
> @@ -1504,14 +1602,24 @@ STD_ENTRY(fill_ra_stack)
> int_hand INT_SWINT_3, SWINT_3, do_trap
> int_hand INT_SWINT_2, SWINT_2, do_trap
> int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
> +#ifdef CONFIG_KVM
> + int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
> +#else
> int_hand INT_SWINT_0, SWINT_0, do_trap
> +#endif
> int_hand INT_ILL_TRANS, ILL_TRANS, do_trap
> int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
> int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault
> int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
> int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
> int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
> +#ifndef CONFIG_KVM_GUEST
> int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
> + int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr
> +#else
> + int_hand INT_TILE_TIMER, TILE_TIMER, bad_intr
> + int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt
> +#endif
> int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr
> int_hand INT_UDN_TIMER, UDN_TIMER, bad_intr
> int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr
> @@ -1541,8 +1649,10 @@ STD_ENTRY(fill_ra_stack)
> int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
> hv_message_intr
> int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
> - int_hand INT_I_ASID, I_ASID, bad_intr
> - int_hand INT_D_ASID, D_ASID, bad_intr
> + int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
> + kvm_vpgtable_miss
> + int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
> + kvm_vguest_fatal
> int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
>
> /* Synthetic interrupt delivered only by the simulator */
> diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c
> new file mode 100644
> index 0000000..c6b6c6a
> --- /dev/null
> +++ b/arch/tile/kernel/kvm_virtio.c
> @@ -0,0 +1,430 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +/* Based on the lguest and s390 implementations. */
> +/*
> + * kvm_virtio.c - virtio for kvm on s390
> + *
> + * Copyright IBM Corp. 2008
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License (version 2 only)
> + * as published by the Free Software Foundation.
> + *
> + * Author(s): Christian Borntraeger <[email protected]>
> + */
> +
> +#include <linux/bootmem.h>
> +#include <linux/io.h>
> +#include <linux/vmalloc.h>
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/export.h>
> +#include <linux/virtio.h>
> +#include <linux/virtio_config.h>
> +#include <linux/virtio_console.h>
> +#include <linux/virtio_ring.h>
> +#include <linux/virtio_pci.h>
> +
> +#include <linux/kvm_para.h>
> +#include <asm/kvm_virtio.h>
> +
> +static void *kvm_devices;
> +
> +/*
> + * TODO: We do not actually use PCI virtio here; we use this value
> + * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
> + * Maybe we should switch both qemu and Linux to generic definitions.
> + * We should also check later whether the alignment value (4096, i.e.
> + * the default x86 page size) affects performance.
> + */
> +#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN
> +#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
> +
> +/*
> + * memory layout: (Total: PAGE_SIZE)
> + * <device 0>
> + * - kvm device descriptor
> + * struct kvm_device_desc
> + * - virtqueue configuration (desc->num_vq entries)
> + * struct kvm_vqconfig
> + * ......
> + * struct kvm_vqconfig
> + * - feature bits (size: desc->feature_len * 2)
> + * - config space (size: desc->config_len)
> + * <device 1>
> + * ......
> + */
> +static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
> +{
> + return (struct kvm_vqconfig *)(desc + 1);
> +}
> +
> +static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
> +{
> + return (u8 *)(kvm_vq_config(desc) + desc->num_vq);
> +}
> +
> +static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
> +{
> + return kvm_vq_features(desc) + desc->feature_len * 2;
> +}
> +
> +/*
> + * The total size of the config page used by this device (incl. desc)
> + */
> +static unsigned desc_size(const struct kvm_device_desc *desc)
> +{
> + return sizeof(*desc)
> + + desc->num_vq * sizeof(struct kvm_vqconfig)
> + + desc->feature_len * 2
> + + desc->config_len;
> +}
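desc_size() is what lets the device page be treated as a packed sequence of descriptors: stepping to the next device is plain pointer arithmetic, which is exactly what scan_devices() further down does. As an illustrative helper (not part of the patch):

static struct kvm_device_desc *next_desc(struct kvm_device_desc *d)
{
	/* The next descriptor starts right after this one's config space. */
	return (void *)d + desc_size(d);
}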
> +
> +/* This gets the device's feature bits. */
> +static u32 kvm_get_features(struct virtio_device *vdev)
> +{
> + unsigned int i;
> + u32 features = 0;
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> + u8 *in_features = kvm_vq_features(desc);
> +
> + for (i = 0; i < min(desc->feature_len * 8, 32); i++)
> + if (in_features[i / 8] & (1 << (i % 8)))
> + features |= (1 << i);
> + return features;
> +}
> +
> +static void kvm_finalize_features(struct virtio_device *vdev)
> +{
> + unsigned int i, bits;
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> + /* Second half of bitmap is features we accept. */
> + u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
> +
> + /* Give virtio_ring a chance to accept features. */
> + vring_transport_features(vdev);
> +
> + memset(out_features, 0, desc->feature_len);
> + bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
> + for (i = 0; i < bits; i++) {
> + if (test_bit(i, vdev->features))
> + out_features[i / 8] |= (1 << (i % 8));
> + }
> +}
> +
> +/*
> + * Reading and writing elements in config space
> + */
> +static void kvm_get(struct virtio_device *vdev, unsigned int offset,
> + void *buf, unsigned len)
> +{
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> +
> + BUG_ON(offset + len > desc->config_len);
> + memcpy(buf, kvm_vq_configspace(desc) + offset, len);
> +}
> +
> +static void kvm_set(struct virtio_device *vdev, unsigned int offset,
> + const void *buf, unsigned len)
> +{
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> +
> + BUG_ON(offset + len > desc->config_len);
> + memcpy(kvm_vq_configspace(desc) + offset, buf, len);
> +}
> +
> +/*
> + * The operations to get and set the status word just access
> + * the status field of the device descriptor. set_status also
> + * makes a hypercall to the host to report the status change.
> + */
> +static u8 kvm_get_status(struct virtio_device *vdev)
> +{
> + return to_kvmdev(vdev)->desc->status;
> +}
> +
> +static void kvm_set_status(struct virtio_device *vdev, u8 status)
> +{
> + BUG_ON(!status);
> + to_kvmdev(vdev)->desc->status = status;
> + hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa);
> +}
> +
> +/*
> + * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
> + * descriptor address. The Host will zero the status and all the
> + * features.
> + */
> +static void kvm_reset(struct virtio_device *vdev)
> +{
> + hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa);
> +}
> +
> +/*
> + * When the virtio_ring code wants to notify the Host, it calls us here and we
> + * make a hypercall. We hand the address of the virtqueue so the Host
> + * knows which virtqueue we're talking about.
> + */
> +static void kvm_notify(struct virtqueue *vq)
> +{
> + struct kvm_vqinfo *vqi = vq->priv;
> +
> + hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa);
> +}
> +
> +/*
> + * Must set some caching mode to keep set_pte() happy.
> + * It doesn't matter what we choose, because the PFN
> + * is illegal, so we're going to take a page fault anyway.
> + */
> +static inline pgprot_t io_prot(void)
> +{
> + return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
> +}
> +
> +/*
> + * This routine finds the requested virtqueue described in the configuration of
> + * this device and sets it up.
> + */
> +static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
> + unsigned index,
> + void (*callback)(struct virtqueue *vq),
> + const char *name)
> +{
> + struct kvm_device *kdev = to_kvmdev(vdev);
> + struct kvm_vqinfo *vqi;
> + struct kvm_vqconfig *config;
> + struct virtqueue *vq;
> + long irq;
> + int err = -EINVAL;
> +
> + if (index >= kdev->desc->num_vq)
> + return ERR_PTR(-ENOENT);
> +
> + vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
> + if (!vqi)
> + return ERR_PTR(-ENOMEM);
> +
> + config = kvm_vq_config(kdev->desc)+index;
> +
> + vqi->config = config;
> + vqi->pages = generic_remap_prot(config->pa,
> + vring_size(config->num,
> + KVM_TILE_VIRTIO_RING_ALIGN),
> + 0, io_prot());
> + if (!vqi->pages) {
> + err = -ENOMEM;
> + goto out;
> + }
> +
> + vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
> + vdev, 0, vqi->pages,
> + kvm_notify, callback, name);
> + if (!vq) {
> + err = -ENOMEM;
> + goto unmap;
> + }
> +
> + /*
> + * Trigger the IPI interrupt via software.
> + * TODO: We do not really need to create one irq per vq; a bit wasteful.
> + */
> + irq = create_irq();
> + if (irq < 0) {
> + err = -ENXIO;
> + goto del_virtqueue;
> + }
> +
> + tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
> +
> + if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
> + err = -ENXIO;
> + destroy_irq(irq);
> + goto del_virtqueue;
> + }
> +
> + config->irq = irq;
> +
> + vq->priv = vqi;
> + return vq;
> +
> +del_virtqueue:
> + vring_del_virtqueue(vq);
> +unmap:
> + vunmap(vqi->pages);
> +out:
> + return ERR_PTR(err);
> +}
> +
> +static void kvm_del_vq(struct virtqueue *vq)
> +{
> + struct kvm_vqinfo *vqi = vq->priv;
> +
> + vring_del_virtqueue(vq);
> + vunmap(vqi->pages);
> + kfree(vqi);
> +}
> +
> +static void kvm_del_vqs(struct virtio_device *vdev)
> +{
> + struct virtqueue *vq, *n;
> +
> + list_for_each_entry_safe(vq, n, &vdev->vqs, list)
> + kvm_del_vq(vq);
> +}
> +
> +static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
> + struct virtqueue *vqs[],
> + vq_callback_t *callbacks[],
> + const char *names[])
> +{
> + struct kvm_device *kdev = to_kvmdev(vdev);
> + int i;
> +
> + /* We must have this many virtqueues. */
> + if (nvqs > kdev->desc->num_vq)
> + return -ENOENT;
> +
> + for (i = 0; i < nvqs; ++i) {
> + vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
> + if (IS_ERR(vqs[i]))
> + goto error;
> + }
> + return 0;
> +
> +error:
> + kvm_del_vqs(vdev);
> + return PTR_ERR(vqs[i]);
> +}
> +
> +/*
> + * The config ops structure as defined by virtio config
> + */
> +static struct virtio_config_ops kvm_vq_config_ops = {
> + .get_features = kvm_get_features,
> + .finalize_features = kvm_finalize_features,
> + .get = kvm_get,
> + .set = kvm_set,
> + .get_status = kvm_get_status,
> + .set_status = kvm_set_status,
> + .reset = kvm_reset,
> + .find_vqs = kvm_find_vqs,
> + .del_vqs = kvm_del_vqs,
> +};
> +
> +/*
> + * The root device for the kvm virtio devices.
> + * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
> + */
> +static struct device *kvm_root;
> +
> +/*
> + * Add a new device and register it with virtio.
> + * The appropriate drivers are loaded by the device model.
> + */
> +static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
> +{
> + struct kvm_device *kdev;
> +
> + kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
> + if (!kdev) {
> + pr_emerg("Cannot allocate kvm dev %u type %u\n",
> + offset, d->type);
> + return;
> + }
> +
> + kdev->vdev.dev.parent = kvm_root;
> + kdev->vdev.id.device = d->type;
> + kdev->vdev.config = &kvm_vq_config_ops;
> + kdev->desc = d;
> + kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
> +
> + if (register_virtio_device(&kdev->vdev) != 0) {
> + pr_err("Failed to register kvm device %u type %u\n",
> + offset, d->type);
> + kfree(kdev);
> + }
> +}
> +
> +/*
> + * scan_devices() simply iterates through the device page.
> + * The type 0 is reserved to mean "end of devices".
> + */
> +static void scan_devices(void)
> +{
> + unsigned int i;
> + struct kvm_device_desc *d;
> +
> + for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
> + d = kvm_devices + i;
> +
> + if (d->type == 0)
> + break;
> +
> + add_kvm_device(d, i);
> + }
> +}
> +
> +/*
> + * Init function for virtio.
> + * Devices live in a single page just above the top of "normal" memory.
> + */
> +static int __init kvm_devices_init(void)
> +{
> + int rc = -ENOMEM;
> +
> + kvm_root = root_device_register("kvm_tile");
> + if (IS_ERR(kvm_root)) {
> + rc = PTR_ERR(kvm_root);
> + pr_err("Could not register kvm_tile root device\n");
> + return rc;
> + }
> +
> + kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
> + 0, io_prot());
> + if (!kvm_devices) {
> + kvm_devices = NULL;
> + root_device_unregister(kvm_root);
> + return rc;
> + }
> +
> + scan_devices();
> + return 0;
> +}
> +
> +/* code for early console output with virtio_console */
> +static __init int early_put_chars(u32 vtermno, const char *buf, int len)
> +{
> + char scratch[512];
> +
> + if (len > sizeof(scratch) - 1)
> + len = sizeof(scratch) - 1;
> + memcpy(scratch, buf, len);
> + scratch[len] = '\0';
> + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
> +
> + return len;
> +}
> +
> +static int __init tile_virtio_console_init(void)
> +{
> + return virtio_cons_early_init(early_put_chars);
> +}
> +console_initcall(tile_virtio_console_init);
> +
> +/*
> + * We do this after core stuff, but before the drivers.
> + */
> +postcore_initcall(kvm_devices_init);
> diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
> index 44cdc4a..2629ff1 100644
> --- a/arch/tile/kernel/process.c
> +++ b/arch/tile/kernel/process.c
> @@ -27,6 +27,7 @@
> #include <linux/kernel.h>
> #include <linux/tracehook.h>
> #include <linux/signal.h>
> +#include <linux/kvm_host.h>
> #include <asm/stack.h>
> #include <asm/switch_to.h>
> #include <asm/homecache.h>
> @@ -247,11 +248,13 @@ struct task_struct *validate_current(void)
> /* Take and return the pointer to the previous task, for schedule_tail(). */
> struct task_struct *sim_notify_fork(struct task_struct *prev)
> {
> +#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
> struct task_struct *tsk = current;
> __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT |
> (tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS));
> __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK |
> (tsk->pid << _SIM_CONTROL_OPERATOR_BITS));
> +#endif
> return prev;
> }
>
> @@ -450,6 +453,11 @@ void _prepare_arch_switch(struct task_struct *next)
> struct task_struct *__sched _switch_to(struct task_struct *prev,
> struct task_struct *next)
> {
> +#ifdef CONFIG_KVM
> + /* vmexit is needed before context switch. */
> + BUG_ON(task_thread_info(prev)->vcpu);
> +#endif
> +
> /* DMA state is already saved; save off other arch state. */
> save_arch_state(&prev->thread);
>
> @@ -519,6 +527,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
> /* Enable interrupts; they are disabled again on return to caller. */
> local_irq_enable();
>
> +#ifdef CONFIG_KVM
> + /*
> + * Some work requires us to exit the VM first. Typically this
> + * allows the process running the VM to respond to the work
> + * (e.g. a signal), or allows the VM mechanism to latch
> + * modified host state (e.g. a "hypervisor" message sent to a
> + * different vcpu). It also means that if we are considering
> + * calling schedule(), we exit the VM first, so we never have
> + * to worry about context-switching into a VM.
> + */
> + if (current_thread_info()->vcpu) {
> + u32 do_exit = thread_info_flags &
> + (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
> +
> + if (thread_info_flags & _TIF_VIRT_EXIT)
> + clear_thread_flag(TIF_VIRT_EXIT);
> + if (do_exit) {
> + kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
> + /*NORETURN*/
> + }
> + }
> +#endif
> +
> if (thread_info_flags & _TIF_NEED_RESCHED) {
> schedule();
> return 1;
> @@ -538,11 +569,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
> tracehook_notify_resume(regs);
> return 1;
> }
> - if (thread_info_flags & _TIF_SINGLESTEP) {
> +
> + /* Handle a few flags here that stay set. */
> + if (thread_info_flags & _TIF_SINGLESTEP)
> single_step_once(regs);
> - return 0;
> - }
> - panic("work_pending: bad flags %#x\n", thread_info_flags);
> +
> + return 0;
> }
>
> unsigned long get_wchan(struct task_struct *p)
> diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
> index 1c09a4f..02bc446 100644
> --- a/arch/tile/kernel/relocate_kernel_64.S
> +++ b/arch/tile/kernel/relocate_kernel_64.S
> @@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
> addi sp, sp, -8
> /* we now have a stack (whether we need one or not) */
>
> +#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> moveli r40, hw2_last(hv_console_putc)
> shl16insli r40, r40, hw1(hv_console_putc)
> shl16insli r40, r40, hw0(hv_console_putc)
>
> -#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> moveli r0, 'r'
> jalr r40
>
> @@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
>
> /* we should not get here */
>
> +#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> moveli r0, '?'
> jalr r40
> moveli r0, '\n'
> jalr r40
> +#endif
>
> j .Lhalt
>
> @@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
> j .Lloop
>
>
> -.Lerr: moveli r0, 'e'
> +.Lerr:
> +#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> + moveli r0, 'e'
> jalr r40
> moveli r0, 'r'
> jalr r40
> @@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
> jalr r40
> moveli r0, '\n'
> jalr r40
> +#endif
> .Lhalt:
> moveli r41, hw2_last(hv_halt)
> shl16insli r41, r41, hw1(hv_halt)
> diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
> index 774e819..2352a81 100644
> --- a/arch/tile/kernel/setup.c
> +++ b/arch/tile/kernel/setup.c
> @@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
> /*
> * Determine for each controller where its lowmem is mapped and how much of
> * it is mapped there. On controller zero, the first few megabytes are
> - * already mapped in as code at MEM_SV_INTRPT, so in principle we could
> + * already mapped in as code at MEM_SV_START, so in principle we could
> * start our data mappings higher up, but for now we don't bother, to avoid
> * additional confusion.
> *
> @@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
> * SPRs, as well as the interrupt mask.
> */
> __insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
> +
> +#ifdef CONFIG_KVM
> + /*
> + * If we launch a guest kernel, it will need some interrupts
> + * that otherwise are not used by the host or by userspace.
> + * Set them to MPL 1 now and leave them alone going forward;
> + * they are masked in the host so will never fire there anyway,
> + * and we mask them at PL1 as we exit the guest.
> + */
> __insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
> + __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
> + __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
> + __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
> +#endif
>
> /* Initialize IRQ support for this cpu. */
> setup_irq_regs();
> @@ -1242,7 +1255,7 @@ static void __init validate_va(void)
> #ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
> /*
> * Similarly, make sure we're only using allowed VAs.
> - * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
> + * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
> * and 0 .. KERNEL_HIGH_VADDR.
> * In addition, make sure we CAN'T use the end of memory, since
> * we use the last chunk of each pgd for the pgd_list.
> @@ -1257,7 +1270,7 @@ static void __init validate_va(void)
> if (range.size == 0)
> break;
> if (range.start <= MEM_USER_INTRPT &&
> - range.start + range.size >= MEM_HV_INTRPT)
> + range.start + range.size >= MEM_HV_START)
> user_kernel_ok = 1;
> if (range.start == 0)
> max_va = range.size;
> @@ -1693,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
> static int __init request_standard_resources(void)
> {
> int i;
> - enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
> + enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
>
> #if defined(CONFIG_PCI) && !defined(__tilegx__)
> insert_non_bus_resource();
> diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
> index 0ae1c59..62b3ba9 100644
> --- a/arch/tile/kernel/smp.c
> +++ b/arch/tile/kernel/smp.c
> @@ -223,30 +223,34 @@ void __init ipi_init(void)
>
> #if CHIP_HAS_IPI()
>
> -void smp_send_reschedule(int cpu)
> +static void __smp_send_reschedule(int cpu)
> {
> - WARN_ON(cpu_is_offline(cpu));
> -
> /*
> * We just want to do an MMIO store. The traditional writeq()
> * functions aren't really correct here, since they're always
> * directed at the PCI shim. For now, just do a raw store,
> - * casting away the __iomem attribute.
> + * casting away the __iomem attribute. We do the store as a
> + * single asm() instruction so that in the KVM case, when vcpus
> + * are not bound to physical cpus, the host can force a single
> + * step over it rather than requiring that it be issued validly.
> */
> - ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
> + unsigned long *addr =
> + &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
> + asm volatile("st %0, zero" :: "r" (addr));
> }
>
> #else
>
> -void smp_send_reschedule(int cpu)
> +static void __smp_send_reschedule(int cpu)
> {
> - HV_Coord coord;
> -
> - WARN_ON(cpu_is_offline(cpu));
> -
> - coord.y = cpu_y(cpu);
> - coord.x = cpu_x(cpu);
> + HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
> hv_trigger_ipi(coord, IRQ_RESCHEDULE);
> }
>
> #endif /* CHIP_HAS_IPI() */
> +
> +void smp_send_reschedule(int cpu)
> +{
> + WARN_ON(cpu_is_offline(cpu));
> + __smp_send_reschedule(cpu);
> +}
> diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
> index 24fd223..362284a 100644
> --- a/arch/tile/kernel/stack.c
> +++ b/arch/tile/kernel/stack.c
> @@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
> p->sp >= sp) {
> if (kbt->verbose)
> pr_err(" <%s while in kernel mode>\n", fault);
> - } else if (EX1_PL(p->ex1) == USER_PL &&
> + } else if (user_mode(p) &&
> p->sp < PAGE_OFFSET && p->sp != 0) {
> if (kbt->verbose)
> pr_err(" <%s while in user mode>\n", fault);
> diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
> index e25b0a8..024b978 100644
> --- a/arch/tile/kernel/sysfs.c
> +++ b/arch/tile/kernel/sysfs.c
> @@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev,
> struct device_attribute *attr,
> char *page)
> {
> +#ifdef CONFIG_KVM_GUEST
> + return sprintf(page, "KVM\n");
> +#else
> return sprintf(page, "tilera\n");
> +#endif
> }
> static DEVICE_ATTR(type, 0444, type_show, NULL);
>
> diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
> index 3c2dc87..b0b7264 100644
> --- a/arch/tile/kernel/time.c
> +++ b/arch/tile/kernel/time.c
> @@ -117,9 +117,9 @@ void __init time_init(void)
>
> /*
> * Define the tile timer clock event device. The timer is driven by
> - * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
> + * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down
> * counter, plus bit 31, which signifies that the counter has wrapped
> - * from zero to (2**31) - 1. The INT_TILE_TIMER interrupt will be
> + * from zero to (2**31) - 1. The INT_[AUX_]TILE_TIMER interrupt will be
> * raised as long as bit 31 is set.
> */
>
> @@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks,
> struct clock_event_device *evt)
> {
> BUG_ON(ticks > MAX_TICK);
> - __insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
> - arch_local_irq_unmask_now(INT_TILE_TIMER);
> + __insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
> + arch_local_irq_unmask_now(INT_LINUX_TIMER);
> return 0;
> }
>
> @@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
> static void tile_timer_set_mode(enum clock_event_mode mode,
> struct clock_event_device *evt)
> {
> - arch_local_irq_mask_now(INT_TILE_TIMER);
> + arch_local_irq_mask_now(INT_LINUX_TIMER);
> }
>
> static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
> @@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void)
> evt->cpumask = cpumask_of(smp_processor_id());
>
> /* Start out with timer not firing. */
> - arch_local_irq_mask_now(INT_TILE_TIMER);
> + arch_local_irq_mask_now(INT_LINUX_TIMER);
>
> /*
> * Register tile timer. Set min_delta to 1 microsecond, since
> @@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
> * Mask the timer interrupt here, since we are a oneshot timer
> * and there are now by definition no events pending.
> */
> - arch_local_irq_mask(INT_TILE_TIMER);
> + arch_local_irq_mask(INT_LINUX_TIMER);
>
> /* Track time spent here in an interrupt context */
> irq_enter();
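SPR_LINUX_TIMER_CONTROL and INT_LINUX_TIMER are new aliases, presumably defined in <asm/timex.h> (touched by this patch but not quoted here) so that a KVM guest uses the AUX tile timer while the host keeps the regular one, along these lines:

#ifdef CONFIG_KVM_GUEST
#define SPR_LINUX_TIMER_CONTROL	SPR_AUX_TILE_TIMER_CONTROL
#define INT_LINUX_TIMER		INT_AUX_TILE_TIMER
#else
#define SPR_LINUX_TIMER_CONTROL	SPR_TILE_TIMER_CONTROL
#define INT_LINUX_TIMER		INT_TILE_TIMER
#endif

This matches the intvec_64.S change above, which routes INT_AUX_TILE_TIMER to do_timer_interrupt only when CONFIG_KVM_GUEST is set.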
> diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
> index f110785..19d465c 100644
> --- a/arch/tile/kernel/traps.c
> +++ b/arch/tile/kernel/traps.c
> @@ -30,7 +30,7 @@
>
> void __init trap_init(void)
> {
> - /* Nothing needed here since we link code at .intrpt1 */
> + /* Nothing needed here since we link code at .intrpt */
> }
>
> int unaligned_fixup = 1;
> diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
> index c7ae53d..8b20163 100644
> --- a/arch/tile/kernel/vmlinux.lds.S
> +++ b/arch/tile/kernel/vmlinux.lds.S
> @@ -5,7 +5,7 @@
> #include <hv/hypervisor.h>
>
> /* Text loads starting from the supervisor interrupt vector address. */
> -#define TEXT_OFFSET MEM_SV_INTRPT
> +#define TEXT_OFFSET MEM_SV_START
>
> OUTPUT_ARCH(tile)
> ENTRY(_start)
> @@ -13,7 +13,7 @@ jiffies = jiffies_64;
>
> PHDRS
> {
> - intrpt1 PT_LOAD ;
> + intrpt PT_LOAD ;
> text PT_LOAD ;
> data PT_LOAD ;
> }
> @@ -24,11 +24,11 @@ SECTIONS
> #define LOAD_OFFSET TEXT_OFFSET
>
> /* Interrupt vectors */
> - .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
> + .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
> {
> _text = .;
> - *(.intrpt1)
> - } :intrpt1 =0
> + *(.intrpt)
> + } :intrpt =0
>
> /* Hypervisor call vectors */
> . = ALIGN(0x10000);
> diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
> index 2298cb1..65f7f9d 100644
> --- a/arch/tile/kvm/Kconfig
> +++ b/arch/tile/kvm/Kconfig
> @@ -27,9 +27,6 @@ config KVM
> This module provides access to the hardware capabilities through
> a character device node named /dev/kvm.
>
> - To compile this as a module, choose M here: the module
> - will be called kvm.
> -
> If unsure, say N.
>
> source drivers/vhost/Kconfig
> diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
> new file mode 100644
> index 0000000..2c3d206
> --- /dev/null
> +++ b/arch/tile/kvm/Makefile
> @@ -0,0 +1,12 @@
> +#
> +# Makefile for Kernel-based Virtual Machine module
> +#
> +
> +ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
> +
> +kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
> +
> +kvm-y += kvm-tile.o
> +kvm-y += entry.o
> +
> +obj-$(CONFIG_KVM) += kvm.o
> diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
> new file mode 100644
> index 0000000..07aa3a6
> --- /dev/null
> +++ b/arch/tile/kvm/entry.S
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/switch_to.h>
> +#include <asm/processor.h>
> +#include <arch/spr_def.h>
> +#include <arch/abi.h>
> +
> +#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8)
> +#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
> +#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
> +#define FOR_EACH_CALLEE_SAVED_REG(f) \
> + f(r30); f(r31); \
> + f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \
> + f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
> + f(r48); f(r49); f(r50); f(r51); f(r52);
> +
> +/*
> + * Called with interrupts disabled from kvm_tile_run() and is responsible
> + * just for saving the callee-save registers and the stack pointer, then
> + * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
> + * It uses restore_all in intvec_64.S to jump back into the guest.
> + * The kvm_vmexit function below undoes the stack manipulation.
> + */
> +STD_ENTRY(kvm_vmresume)
> + /* Do function prolog and save callee-saves on stack. */
> + {
> + move r10, sp
> + st sp, lr
> + }
> + {
> + addli r11, sp, -FRAME_SIZE + 8
> + addli sp, sp, -FRAME_SIZE
> + }
> + {
> + st r11, r10
> + addi r12, sp, 16
> + }
> + FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
> + SAVE_REG(tp)
> + SAVE_REG(lr)
> +
> + /* Save frame pointer in thread_info so we can get it back later. */
> + st r1, sp
> +
> + /* Set the ksp0 for this core to be below this frame. */
> + mfspr r10, SPR_SYSTEM_SAVE_K_0
> + bfins r10, sp, 0, CPU_SHIFT-1
> + mtspr SPR_SYSTEM_SAVE_K_0, r10
> +
> + /* sp points to ABI save area below pt_regs for restore_all. */
> + addli sp, r0, -C_ABI_SAVE_AREA_SIZE
> +
> + /* Execute an "interrupt return" to the guest. */
> + {
> + movei r30, 0
> + j restore_all
> + }
> + STD_ENDPROC(kvm_vmresume)
> +
> +/*
> + * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
> + * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
> + * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller
> + * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
> + */
> +STD_ENTRY(kvm_vmexit)
> + {
> + move sp, r0
> + addi r12, r0, 16
> + }
> + FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
> + LOAD_REG(tp)
> + LOAD_REG(lr)
> + {
> + addli sp, sp, FRAME_SIZE
> + jrp lr
> + }
> + STD_ENDPROC(kvm_vmexit)
> diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
> new file mode 100644
> index 0000000..4c33991
> --- /dev/null
> +++ b/arch/tile/kvm/kvm-tile.c
> @@ -0,0 +1,1581 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/err.h>
> +#include <linux/init.h>
> +#include <linux/fs.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/kvm_types.h>
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/ptrace.h>
> +#include <asm/traps.h>
> +#include <asm/pgalloc.h>
> +#include <hv/hypervisor.h>
> +#include <linux/rtc.h>
> +#include <asm/atomic.h>
> +#include <asm/tlbflush.h>
> +#include <arch/spr_def.h>
> +#include <arch/sim.h>
> +#include <generated/utsrelease.h>
> +
> +
> +struct kvm_stats_debugfs_item debugfs_entries[] = {
> + { NULL }
> +};
> +
> +static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
> +{
> + struct mm_struct *mm = kvm->mm;
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + if (kvm->arch.vpgd == NULL)
> + kvm->arch.vpgd = pgd_alloc(kvm->mm);
> + pgd = kvm->arch.vpgd + pgd_index(address);
> + pud = pud_alloc(mm, pgd, address);
> + if (!pud)
> + return NULL;
> + pmd = pmd_alloc(mm, pud, address);
> + if (!pmd)
> + return NULL;
> + return pte_alloc_kernel(pmd, address);
> +}
> +
> +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> +{
> + return VM_FAULT_SIGBUS;
> +}
> +
> +void kvm_arch_free_memslot(struct kvm_memory_slot *free,
> + struct kvm_memory_slot *dont)
> +{
> +}
> +
> +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
> +{
> + return 0;
> +}
> +
> +/* FIXME: support huge pages. */
> +int kvm_arch_prepare_memory_region(struct kvm *kvm,
> + struct kvm_memory_slot *memslot,
> + struct kvm_userspace_memory_region *mem,
> + enum kvm_mr_change change)
> +{
> + unsigned long gpa, i;
> +
> + gpa = mem->guest_phys_addr;
> + for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE)
> + if (get_vpgd_pte(kvm, gpa) == NULL)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +void kvm_arch_commit_memory_region(struct kvm *kvm,
> + struct kvm_userspace_memory_region *mem,
> + const struct kvm_memory_slot *old,
> + enum kvm_mr_change change)
> +{
> + unsigned long gpa, address, pfn, i;
> + struct page *page[1];
> + pte_t *ptep, *vptep;
> +
> + gpa = mem->guest_phys_addr;
> + address = mem->userspace_addr;
> + for (i = 0; i < mem->memory_size;
> + i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
> + vptep = get_vpgd_pte(kvm, gpa);
> + BUG_ON(vptep == NULL);
> + get_user_pages_fast(address, 1, 1, page);
> + pfn = page_to_pfn(page[0]);
> + ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
> + *vptep = *ptep;
> + }
> +}
> +
> +void kvm_arch_flush_shadow_all(struct kvm *kvm)
> +{
> +}
> +
> +void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> + struct kvm_memory_slot *slot)
> +{
> + kvm_arch_flush_shadow_all(kvm);
> +}
> +
> +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
> +{
> + return 0;
> +}
> +
> +long kvm_arch_dev_ioctl(struct file *filp,
> + unsigned int ioctl, unsigned long arg)
> +{
> + return 0;
> +}
> +
> +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq)
> +{
> + if (irq < 0)
> + return -EINVAL;
> +
> + set_bit(irq, &vcpu->arch.ipi_events);
> + kvm_vcpu_kick(vcpu);
> +
> + return 0;
> +}
> +
> +long kvm_arch_vcpu_ioctl(struct file *filp,
> + unsigned int ioctl, unsigned long arg)
> +{
> + struct kvm_vcpu *vcpu = filp->private_data;
> + void __user *argp = (void __user *)arg;
> + int r = 0;
> +
> + switch (ioctl) {
> + case KVM_INTERRUPT: {
> + struct kvm_interrupt irq;
> +
> + r = -EFAULT;
> + if (copy_from_user(&irq, argp, sizeof(irq)))
> + goto out;
> + r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq);
> + if (r)
> + goto out;
> + r = 0;
> + break;
> + }
> + default:
> + r = -EINVAL;
> + }
> +
> +out:
> + return r;
> +}
> +
> +int kvm_dev_ioctl_check_extension(long ext)
> +{
> + return 0;
> +}
> +
> +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
> + struct kvm_dirty_log *log)
> +{
> + return 0;
> +}
> +
> +long kvm_arch_vm_ioctl(struct file *filp,
> + unsigned int ioctl, unsigned long arg)
> +{
> + long r = -EINVAL;
> +
> + return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
> + struct kvm_translation *tr)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + unsigned long page_size;
> + unsigned long gva = tr->linear_address;
> + unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
> + pud_t gpud;
> + pmd_t gpmd;
> + pte_t gpte;
> +
> + /* Get guest pgd (aka pud for three-level tables). */
> + gpgd_gpa = vcpu->arch.guest_context.page_table +
> + (sizeof(pgd_t) * pgd_index(gva));
> + if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
> + goto fail;
> + if (!pud_present(gpud))
> + goto fail;
> +
> + /* Get guest pmd. */
> + if (pud_huge_page(gpud)) {
> + /* FIXME: no super huge page support yet. */
> + if (pte_super(*(pte_t *)&gpud))
> + goto fail;
> + gpte = *(pte_t *)&gpud;
> + page_size = PGDIR_SIZE;
> + goto ok;
> + }
> + gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
> + (sizeof(pmd_t) * pmd_index(gva));
> + if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
> + goto fail;
> + if (!pmd_present(gpmd))
> + goto fail;
> +
> + /* Get guest pte. */
> + if (pmd_huge_page(gpmd)) {
> + /* FIXME: no super huge page support yet. */
> + if (pte_super(*(pte_t *)&gpmd))
> + goto fail;
> + gpte = *(pte_t *)&gpmd;
> + page_size = PMD_SIZE;
> + goto ok;
> + }
> + gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
> + (sizeof(pte_t) * pte_index(gva));
> + if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
> + goto fail;
> + if (!pte_present(gpte))
> + goto fail;
> +
> + page_size = PAGE_SIZE;
> +
> +ok:
> + tr->physical_address =
> + PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1));
> + tr->valid = 1;
> + tr->writeable = pte_write(gpte);
> + tr->usermode = pte_user(gpte);
> +
> + return 0;
> +
> +fail:
> + tr->valid = 0;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> +{
> + regs->regs = vcpu->arch.regs;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> +{
> + vcpu->arch.regs = regs->regs;
> + vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> + struct kvm_sregs *sregs)
> +{
> + *sregs = vcpu->arch.sregs;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> + struct kvm_sregs *sregs)
> +{
> + vcpu->arch.sregs = *sregs;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
> + struct kvm_mp_state *mp_state)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
> + struct kvm_mp_state *mp_state)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
> + struct kvm_guest_debug *dbg)
> +{
> + return 0;
> +}
> +
> +/*
> + * panic_hv() dumps stack info for both the guest os and the host os, and
> + * sets a proper exit reason so that qemu can terminate the guest process.
> + *
> + * FIXME: This should probably be KVM_EXIT_EXCEPTION? With KVM_EXIT_EXCEPTION
> + * the current qemu process will "hang" (killable, but Ctrl+C does not work),
> + * so use KVM_EXIT_SHUTDOWN here temporarily.
> + */
> +static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
> +{
> + char panic_buf[256];
> + struct pt_regs *regs;
> + va_list ap;
> + int i;
> +
> + va_start(ap, fmt);
> + vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
> + va_end(ap);
> + pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
> +
> + /* Show guest os info */
> + regs = &vcpu->arch.regs;
> + for (i = 0; i < 17; i++)
> + pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
> + i, regs->regs[i], i+18, regs->regs[i+18],
> + i+36, regs->regs[i+36]);
> + pr_err(" r18: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
> + regs->regs[18], regs->regs[35], regs->tp);
> + pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
> + pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n",
> + regs->pc, regs->ex1, regs->faultnum);
> +
> + /* Show host os info */
> + pr_err("\nKVM stack in the host:\n");
> + dump_stack();
> +
> + /* Shut down the guest os */
> + pr_err("Shutting down guest.\n");
> + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
> + return 0;
> +}
> +
> +/* Copied from virt/kvm/kvm_main.c */
> +static int next_segment(unsigned long len, int offset)
> +{
> + if (len > PAGE_SIZE - offset)
> + return PAGE_SIZE - offset;
> + else
> + return len;
> +}
> +
> +static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
> + void *data, unsigned long len)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int seg;
> + int offset = offset_in_page(gva);
> + int ret;
> +
> + while ((seg = next_segment(len, offset)) != 0) {
> + struct kvm_translation tr;
> + tr.linear_address = gva;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return -EFAULT;
> + ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address),
> + data, offset, seg);
> + if (ret < 0)
> + return ret;
> + offset = 0;
> + len -= seg;
> + data += seg;
> + gva += seg;
> + }
> + return 0;
> +}
> +
> +static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
> + const void *data, unsigned long len)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int seg;
> + int offset = offset_in_page(gva);
> + int ret;
> +
> + while ((seg = next_segment(len, offset)) != 0) {
> + struct kvm_translation tr;
> + tr.linear_address = gva;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return -EFAULT;
> + ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address),
> + data, offset, seg);
> + if (ret < 0)
> + return ret;
> + offset = 0;
> + len -= seg;
> + data += seg;
> + gva += seg;
> + }
> + return 0;
> +}
> +
> +static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
> + unsigned long len)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int seg;
> + int offset = offset_in_page(gva);
> + int ret;
> +
> + while ((seg = next_segment(len, offset)) != 0) {
> + struct kvm_translation tr;
> + tr.linear_address = gva;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return -EFAULT;
> + ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address),
> + offset, seg);
> + if (ret < 0)
> + return ret;
> + offset = 0;
> + len -= seg;
> + gva += seg;
> + }
> + return 0;
> +}
> +
> +/*
> + * The following functions are emulation functions for various
> + * hypervisor system calls (i.e. hv_*()). Return value:
> + * 1 if the host os can emulate it completely.
> + * < 0 if an error occurs; qemu will then handle it.
> + * 0 if qemu emulation is needed.
> + * In both the < 0 and the == 0 cases, exit reason should
> + * be set for qemu handling.
> + */
> +
> +/* generic handler for hypercall which needs user (QEMU) to handle. */
> +static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
> +{
> + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
> + return 0;
> +}
> +
> +/* handler for illegal hypercall */
> +static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
> +{
> + return panic_hv(vcpu, "Illegal kvm hypercall: %ld",
> + (unsigned long)vcpu->arch.regs.regs[10]);
> +}
> +
> +static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
> +{
> + int version = vcpu->arch.regs.regs[0];
> + int chip_num = vcpu->arch.regs.regs[1];
> + int chip_rev_num = vcpu->arch.regs.regs[2];
> + int client_pl = vcpu->arch.regs.regs[3];
> +
> + if (client_pl != 1)
> + return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
> + " guests must request PL 1.\n"
> + "Reconfigure your guest with KVM_GUEST set.\n",
> + client_pl);
> +
> + if (version != HV_VERSION)
> + return panic_hv(vcpu, "Client built for hv version %d, but"
> + " this hv is version %d\n",
> + version, HV_VERSION);
> +
> + if (chip_num != TILE_CHIP)
> + return panic_hv(vcpu, "Client built for chip %d, but this"
> + " hardware is chip %d\n",
> + chip_num, TILE_CHIP);
> +
> + if (chip_rev_num != TILE_CHIP_REV)
> + return panic_hv(vcpu, "Client built for chip rev %d, but this"
> + " hardware is chip rev %d\n",
> + chip_rev_num, TILE_CHIP_REV);
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
> +{
> + HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
> + long rc;
> +
> + switch (query) {
> + case HV_SYSCONF_PAGE_SIZE_SMALL:
> + rc = PAGE_SIZE;
> + break;
> +
> + case HV_SYSCONF_PAGE_SIZE_LARGE:
> + rc = HPAGE_SIZE;
> + break;
> +
> + case HV_SYSCONF_VALID_PAGE_SIZES:
> +#if PAGE_SHIFT == 16
> + rc = HV_CTX_PG_SM_64K;
> +#elif PAGE_SHIFT == 14
> + rc = HV_CTX_PG_SM_16K;
> +#else
> +# error Fix hv_sysconf emulation for new page size
> +#endif
> + break;
> +
> + case HV_SYSCONF_PAGE_SIZE_JUMBO:
> + rc = 0; /* FIXME add super page support */
> + break;
> +
> + case HV_SYSCONF_CPU_SPEED:
> + case HV_SYSCONF_CPU_TEMP:
> + case HV_SYSCONF_BOARD_TEMP:
> + rc = hv_sysconf(query);
> + break;
> +
> + default:
> + rc = -EINVAL;
> + break;
> + }
> +
> + vcpu->arch.regs.regs[0] = rc;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
> +{
> + HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
> + long buflen = vcpu->arch.regs.regs[2];
> + char hvbuf[256];
> + const char *p;
> + long rc;
> +
> + switch (query) {
> +
> + /* For hardware attributes, just pass to the hypervisor. */
> + case HV_CONFSTR_BOARD_PART_NUM:
> + case HV_CONFSTR_BOARD_SERIAL_NUM:
> + case HV_CONFSTR_CHIP_SERIAL_NUM:
> + case HV_CONFSTR_BOARD_REV:
> + case HV_CONFSTR_CHIP_MODEL:
> + case HV_CONFSTR_BOARD_DESC:
> + case HV_CONFSTR_MEZZ_PART_NUM:
> + case HV_CONFSTR_MEZZ_SERIAL_NUM:
> + case HV_CONFSTR_MEZZ_REV:
> + case HV_CONFSTR_MEZZ_DESC:
> + case HV_CONFSTR_SWITCH_CONTROL:
> + case HV_CONFSTR_CHIP_REV:
> + case HV_CONFSTR_CPUMOD_PART_NUM:
> + case HV_CONFSTR_CPUMOD_SERIAL_NUM:
> + case HV_CONFSTR_CPUMOD_REV:
> + case HV_CONFSTR_CPUMOD_DESC:
> + rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
> + if (rc > sizeof(hvbuf)) {
> + /* Not the best answer, but very unlikely anyway. */
> + rc = sizeof(hvbuf);
> + hvbuf[sizeof(hvbuf)-1] = '\0';
> + }
> + p = hvbuf;
> + break;
> +
> + /* For hypervisor version info, just report the kernel version. */
> + case HV_CONFSTR_HV_SW_VER:
> + p = UTS_RELEASE;
> + break;
> + case HV_CONFSTR_HV_CONFIG:
> + case HV_CONFSTR_HV_CONFIG_VER:
> + p = "";
> + break;
> +
> + default:
> + rc = HV_EINVAL;
> + goto done;
> + }
> +
> + rc = strlen(p) + 1; /* include NUL */
> + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
> + p, min(rc, buflen)))
> + rc = HV_EFAULT;
> +
> +done:
> + vcpu->arch.regs.regs[0] = rc;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
> +{
> + HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0];
> + struct rtc_time tm;
> + struct timeval tv;
> +
> + do_gettimeofday(&tv);
> + rtc_time_to_tm(tv.tv_sec, &tm);
> + hvtm->tm_sec = tm.tm_sec;
> + hvtm->tm_min = tm.tm_min;
> + hvtm->tm_hour = tm.tm_hour;
> + hvtm->tm_mday = tm.tm_mday;
> + hvtm->tm_mon = tm.tm_mon;
> + hvtm->tm_year = tm.tm_year;
> + hvtm->flags = 0;
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
> +{
> + /* Do nothing here. */
> + pr_warn("hv_set_rtc() will not work in kvm guest\n");
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
> +{
> + int idx = vcpu->arch.regs.regs[0];
> + HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
> +
> + switch (idx) {
> + case 0:
> + var->start = 0UL;
> + var->size = 0x20000000000UL;
> + break;
> + case 1:
> + var->start = 0xFFFFFFFF80000000UL;
> + var->size = 0x80000000UL;
> + break;
> + default:
> + var->start = 0UL;
> + var->size = 0UL;
> + break;
> + }
> +
> + return 1;
> +}
> +
> +/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */
> +static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
> +{
> + int idx = vcpu->arch.regs.regs[0];
> + HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
> +
> + if (idx == 0) {
> + var->start = min_asid;
> + var->size = max_asid - min_asid + 1;
> + } else {
> + var->start = 0;
> + var->size = 0;
> + }
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
> +{
> + HV_Topology *tp;
> + int cpus;
> +
> + /* Depends on the definition of struct HV_Topology */
> + tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
> +
> + cpus = atomic_read(&vcpu->kvm->online_vcpus);
> + tp->coord.x = vcpu->vcpu_id;
> + tp->coord.y = 0;
> + tp->width = cpus;
> + tp->height = 1;
> +
> + return 1;
> +}
> +
> +static int xy_to_vcpu(struct kvm *kvm, int x, int y)
> +{
> + if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus))
> + return -1;
> + return x;
> +}
> +
> +/*
> + * The primary vcpu is the one that initially runs while the others
> + * all block. It is the only one allowed to call hv_start_all_tiles().
> + * The other cpus are secondary.
> + */
> +static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->vcpu_id != 0;
> +}
> +
> +static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
> +{
> + struct completion *c = &vcpu->kvm->arch.smp_start;
> + if (is_secondary_vcpu(vcpu) || completion_done(c))
> + return panic_hv(vcpu, "start_all_tiles() called again");
> + complete_all(c);
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
> +{
> + gpa_t gpa = vcpu->arch.regs.regs[0];
> + HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
> + gfn_t gfn;
> + pfn_t pfn;
> + hpa_t hpa;
> +
> + gfn = gpa_to_gfn(gpa);
> + pfn = gfn_to_pfn(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn))
> + return panic_hv(vcpu, "bogus PA %llx in physaddr_read64()",
> + gpa);
> + hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
> +
> + vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
> +{
> + gpa_t gpa = vcpu->arch.regs.regs[0];
> + HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1];
> + uint64_t val = vcpu->arch.regs.regs[2];
> + gfn_t gfn;
> + pfn_t pfn;
> + hpa_t hpa;
> +
> + gfn = gpa_to_gfn(gpa);
> + pfn = gfn_to_pfn(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn))
> + return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
> + gpa);
> + hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
> +
> + hv_physaddr_write64(hpa, *access, val);
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
> +{
> + /* Do we care about the argument msgstate? */
> + vcpu->arch.regs.regs[0] = HV_OK;
> +
> + return 1;
> +}
> +
> +/*
> + * NOTE: we may coalesce multiple messages with the same tag to the
> + * same recipient. Currently the only messages used by Linux are
> + * start/stop cpu (where coalescing is OK), and the smp_call_function()
> + * IPI message tag. In the latter case we rely on the generic
> + * smp_call_function code to properly handle this, and since it only
> + * uses the IPI as a way to wake up the generic list-walking code,
> + * it's OK if we coalesce several IPI deliveries before the recipient
> + * core takes action.
> + */
> +static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + struct kvm_vcpu *vcpui;
> + HV_Recipient recip[NR_CPUS];
> + HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
> + int nrecip = vcpu->arch.regs.regs[1];
> + int buflen = vcpu->arch.regs.regs[3];
> + int sent, vcpu_id, tag;
> +
> + /* NOTE: we only support the Linux usage of buflen == sizeof(int). */
> + if (unlikely(buflen != sizeof(int) ||
> + nrecip >= atomic_read(&kvm->online_vcpus))) {
> + vcpu->arch.regs.regs[0] = HV_EINVAL;
> + return 1;
> + }
> +
> + /* Get the buf info */
> + if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
> + &tag, sizeof(tag))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + /* Range-check the tag value. */
> + if (tag < 0 || tag >= MAX_MSG_TAG) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + /* Get all the recipients */
> + if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
> + nrecip * sizeof(HV_Recipient))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + for (sent = 0; sent < nrecip; sent++) {
> + if (recip[sent].state != HV_TO_BE_SENT)
> + continue;
> + vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
> + if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
> + recip[sent].state = HV_BAD_RECIP;
> + continue;
> + }
> + vcpui = kvm_get_vcpu(kvm, vcpu_id);
> + set_bit(tag, &vcpui->arch.pending_msgs);
> + kvm_vcpu_kick(vcpui);
> + recip[sent].state = HV_SENT;
> + }
> +
> + if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
> + nrecip * sizeof(HV_Recipient))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + vcpu->arch.regs.regs[0] = sent;
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
> +{
> + HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
> + int buflen = vcpu->arch.regs.regs[3];
> + int tag;
> +
> + /* Currently we only support messages from other tiles. */
> + rmi->source = HV_MSG_TILE;
> +
> + if (buflen <= sizeof(int)) {
> + rmi->msglen = HV_E2BIG;
> + return 1;
> + }
> +
> + tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
> + if (tag >= MAX_MSG_TAG) {
> + /* No more messages */
> + rmi->msglen = 0;
> + return 1;
> + }
> +
> + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
> + &tag, sizeof(int))) {
> + rmi->msglen = HV_EFAULT;
> + return 1;
> + }
> +
> + /*
> + * This clear_bit could race with a set_bit as another core
> + * delivers a new smp_function_call to this core. However,
> + * the smp_function_call code will have set up the additional
> + * smp_function_call data on the kernel's list prior to
> + * raising the interrupt, so even if we lose the new
> + * interrupt due to the race, we still haven't dispatched
> + * to the original interrupt handler, and when we do, it
> + * will find both smp_function_calls waiting for it, so the
> + * race is harmless. This is consistent with the fact that
> + * the generic code is trying to support pretty much
> + * arbitrary architecture-dependent IPI semantics, so it
> + * is very conservative about what it assumes.
> + *
> + * Also note that we only clear_bit on the core that owns
> + * the mask, so there's no race condition caused by the
> + * find_first_bit above and the clear_bit here, since once
> + * a bit is found it will stay set until this point.
> + */
> + clear_bit(tag, &vcpu->arch.pending_msgs);
> + rmi->msglen = sizeof(int);
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
> +{
> + HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0];
> +
> + *ctx = hv_inquire_guest_context();
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + HV_InqTileSet set = vcpu->arch.regs.regs[0];
> + unsigned long gva = vcpu->arch.regs.regs[1];
> + int length = vcpu->arch.regs.regs[2];
> + struct cpumask mask = CPU_MASK_NONE;
> + int cpus, i, retval, bytes2copy, bytes2zero;
> +
> + switch (set) {
> + case HV_INQ_TILES_AVAIL:
> + case HV_INQ_TILES_HFH_CACHE:
> + case HV_INQ_TILES_LOTAR:
> + cpus = atomic_read(&kvm->online_vcpus);
> + for (i = 0; i < cpus; ++i)
> + cpumask_set_cpu(i, &mask);
> + break;
> + case HV_INQ_TILES_SHARED:
> + break;
> + default:
> + retval = HV_EINVAL;
> + goto done;
> + }
> +
> + bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
> + bytes2zero = length - bytes2copy;
> +
> + if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
> + retval = HV_EFAULT;
> + goto done;
> + }
> +
> + if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
> + retval = HV_EFAULT;
> + goto done;
> + }
> +
> + retval = HV_OK;
> +done:
> + vcpu->arch.regs.regs[0] = retval;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
> +{
> + HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0];
> + int pl = (int) vcpu->arch.regs.regs[1];
> + struct kvm_vcpu *target_vcpu;
> + int vcpu_id;
> +
> + vcpu_id = vtarget.x;
> + if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
> + vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
> + vcpu->arch.regs.regs[0] = HV_EINVAL;
> + return 1;
> + }
> +
> + target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
> + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
> + &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + vcpu->arch.regs.regs[0] = HV_OK;
> +
> + return 1;
> +}
> +
> +struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
> +{
> + struct kvm_vcpu *vcpui;
> + unsigned long idx;
> +
> + kvm_for_each_vcpu(idx, vcpui, kvm)
> + if (vcpui->arch.ipi_gpa == gpa)
> + return vcpui;
> +
> + return NULL;
> +}
> +
> +/*
> + * Most page faults will be downcalled from the hv and handled directly
> + * by either the guest os or the host os. This function handles the
> + * remaining cases.
> + */
> +static int handle_mmio(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + struct kvm_translation tr;
> + struct kvm_vcpu *ipi_vcpu;
> +
> + tr.linear_address = (__u64) vcpu->arch.fault_addr;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return 0;
> +
> + /* ipi PTE for rescheduling interrupt? */
> + ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
> + if (!ipi_vcpu)
> + return 0;
> +
> + set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
> + kvm_vcpu_kick(ipi_vcpu);
> +
> + /* Juke the PC past the store instruction. */
> + vcpu->arch.regs.pc += 8;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
> +{
> + /*
> + * We do not expect this call from the guest so far; the guest os
> + * should just follow the host os setting rather than *set* it.
> + * Besides, hv_set_pte_super_shift() will not be called in the
> + * guest os with the current guest os configuration.
> + */
> + vcpu->arch.regs.regs[0] = HV_EINVAL;
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
> +{
> + HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0];
> +
> + hvss->new_speed = HV_EPERM;
> + hvss->end_cycle = 0;
> + hvss->delta_ns = 0;
> +
> + return 1;
> +}
> +
> +static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
> + HCALL_DEFS
> +};
> +
> +static int kvm_handle_exit(struct kvm_vcpu *vcpu)
> +{
> + unsigned long hcall_idx;
> +
> + switch (vcpu->run->exit_reason) {
> + case KVM_EXIT_HYPERCALL:
> + hcall_idx = vcpu->arch.regs.regs[10];
> + if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
> + hcall_handlers[hcall_idx] == NULL))
> + return kvm_emulate_illegal(vcpu);
> +
> + /* Juke us past the swint0 when we return. */
> + vcpu->arch.regs.pc += 8;
> +
> + return hcall_handlers[hcall_idx](vcpu);
> +
> + case KVM_EXIT_MMIO:
> + if (handle_mmio(vcpu))
> + return 1;
> + return panic_hv(vcpu, "Out-of-bounds client memory access");
> +
> + case KVM_EXIT_AGAIN:
> + return 1;
> +
> + default:
> + return 0;
> + }
> +}
> +
> +static void kvm_kick_func(void *info)
> +{
> + struct kvm_vcpu *vcpu = info;
> +
> + /* If this is not the thread that we expect, just return. */
> + if (unlikely(vcpu->pid != get_task_pid(current, PIDTYPE_PID)))
> + return;
> +
> + /* Setting this flag will cause a vmexit instead of a vmresume. */
> + set_thread_flag(TIF_VIRT_EXIT);
> +}
> +
> +/* Note this function is now a standard kvm interface in recent Linux. */
> +void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
> +{
> + int me, cpu;
> +
> + /* If it is waiting in kvm_vcpu_block(), wake it up. */
> + if (waitqueue_active(&vcpu->wq))
> + wake_up_interruptible(&vcpu->wq);
> +
> + /* If we are kicking our own vcpu, make sure we vmexit. */
> + if (vcpu == current_thread_info()->vcpu) {
> + set_thread_flag(TIF_VIRT_EXIT);
> + return;
> + }
> +
> + /*
> + * If the vcpu is running the guest, interrupt its cpu,
> + * causing it to vmexit by setting TIF_VIRT_EXIT. Note we can
> + * race with a guest already doing a vmexit, but that is benign.
> + */
> + cpu = vcpu->cpu;
> + me = get_cpu();
> + if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
> + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
> + smp_call_function_single(cpu, kvm_kick_func, vcpu, 0);
> + put_cpu();
> +}
> +EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
> +
> +/*
> + * Any interrupt that would normally be handled by the host at PL2
> + * needs to be reassigned to the guest at PL1 as we enter.
> + *
> + * The TLB interrupts remain handled by the hypervisor and are downcalled
> + * to the appropriate host or guest as necessary.
> + *
> + * FIXME: We don't give the UDN interrupts for now; at some point we
> + * plan to allow an option to pin the vcpus and report the true
> + * geometry to the guest, at which point passing the UDN access would
> + * make sense.
> + *
> + * FIXME: For now we don't pass the profiling interrupts to the guest,
> + * and instead require profiling be run in the host; we should be able
> + * to support guest-level profiling pretty easily, but we need to
> + * think about whether there are vcpu migration issues there.
> + */
> +static void kvm_grant_mpls(void)
> +{
> + __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
> + __insn_mtspr(SPR_MPL_ILL_SET_1, 1);
> + __insn_mtspr(SPR_MPL_GPV_SET_1, 1);
> + __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
> + __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
> +}
> +
> +static void kvm_ungrant_mpls(void)
> +{
> + __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
> + __insn_mtspr(SPR_MPL_ILL_SET_2, 1);
> + __insn_mtspr(SPR_MPL_GPV_SET_2, 1);
> + __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
> + __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
> +}
> +
> +/*
> + * There is lots of state that is (for the non-virtualized case) held
> + * permanently in SPRs, or that is in any case not context-switched.
> + * The next two routines switch in and out all the SPR state.
> + *
> + * When we restart, we try to fix up the timer value so that it will
> + * fire at the correct wall-clock time even if we have been scheduled
> + * out for a little bit. This may also
> + * mean we end up firing it immediately on return, and suffer a
> + * timer delay in the guest.
> + */
> +static void kvm_save_sprs(struct kvm_vcpu *vcpu)
> +{
> + vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
> + vcpu->arch.vmexit_cycles = get_cycles();
> +
> +#define SAVE_SPR(x) vcpu->arch.sregs.x = __insn_mfspr(SPR_ ## x)
> + FOR_EACH_GUEST_SPR(SAVE_SPR);
> +#undef SAVE_SPR
> +}
> +
> +static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
> +{
> + unsigned long count = vcpu->arch.timer_control;
> + unsigned long underflow =
> + (count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
> + unsigned long disabled =
> + (count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1;
> +
> + if (!disabled) {
> + unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles;
> + count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
> + underflow |= delta > count;
> + count -= delta;
> + count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
> + count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
> + }
> + __insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count);
> +
> +#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.sregs.x)
> + FOR_EACH_GUEST_SPR(RESTORE_SPR);
> +#undef RESTORE_SPR
> +}
> +
> +/*
> + * When entering the guest, we need to eliminate any PL0 translations
> + * that were in use by qemu, since the guest's PL0 translations will
> + * be different. We also flush PL1 translations in case there have
> + * been changes to the virtualization page table, etc.
> + *
> + * FIXME: Add a way to just flush PL0/PL1, or just flush below
> + * the host PAGE_OFFSET, or add vpid support, etc.
> + */
> +static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
> +{
> + HV_Context *ctx;
> + pgd_t *vpgdir;
> + pte_t *ptep;
> + int rc;
> +
> + /* Install virtualization context */
> + vpgdir = vcpu->kvm->arch.vpgd;
> + BUG_ON(vpgdir == NULL);
> + ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
> + rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
> + WARN_ON_ONCE(rc < 0);
> +
> + /* Install guest context */
> + ctx = &vcpu->arch.guest_context;
> + rc = hv_install_guest_context(ctx->page_table, ctx->access,
> + ctx->asid, ctx->flags);
> + WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
> + ctx->page_table, ctx->access.val,
> + ctx->asid, ctx->flags, rc);
> +
> + hv_flush_all(0);
> +}
> +
> +/*
> + * De-install the virtualization context so we take faults below the
> + * host Linux PL in the normal manner going forward.
> + *
> + * We flush all the TLB mappings as we exit the guest, since the
> + * guest has been using the ASIDs as it pleases, and may have installed
> + * incompatible mappings for qemu's process as well. Note that we don't
> + * worry about host-PL interrupts that occur while the guest is running,
> + * on the assumption that such interrupts can't touch userspace
> + * addresses legally anyway.
> + *
> + * NOTE: we may want to add a hypervisor call to just flush mappings
> + * below PL2 and use that here instead.
> + */
> +static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
> +{
> + int rc;
> +
> + /* Remember guest context */
> + vcpu->arch.guest_context = hv_inquire_guest_context();
> +
> + /* Disable virtualization context */
> + rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
> + WARN_ON_ONCE(rc < 0);
> +
> + /* Flush everything in the TLB. */
> + hv_flush_all(0);
> +}
> +
> +static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
> +{
> + /*
> + * Capture current set of ipi_events. We might race with
> + * another thread adding an event, but if so we'll just miss
> + * it on this go-around and see it next time.
> + */
> + vcpu->arch.sregs.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
> +
> + /*
> + * Note: We could set PC and EX1 for the guest os to jump
> + * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
> + * is unmasked and the guest is not at PL1 with ICS set.
> + * But in fact it's about as fast to just set INTCTRL_1_STATUS
> + * here and then run the short INTCTRL_1 handler in the guest.
> + */
> + vcpu->arch.sregs.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
> +}
> +
> +static void kvm_tile_run(struct kvm_vcpu *vcpu)
> +{
> + struct thread_info *ti = current_thread_info();
> + unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
> +
> + /*
> + * Disable interrupts while we set up the guest state.
> + * This way, if we race with another core trying to tell us
> + * to fix up our guest state, we will take the kick only as
> + * we actually try to enter the guest, and instead we will
> + * vmexit and end up retrying.
> + */
> + local_irq_disable();
> + kvm_guest_context_enter(vcpu);
> + clear_bit(KVM_REQ_KICK, &vcpu->requests);
> + ti->vcpu = vcpu;
> + vcpu->cpu = get_cpu();
> + kvm_inject_interrupts(vcpu);
> + kvm_grant_mpls();
> + kvm_restore_sprs(vcpu);
> +
> + /* Calling this function irets into the guest. */
> + kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
> +
> + /* We resume here due to a call to kvm_vmexit. */
> + __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
> +
> + vcpu->cpu = -1;
> + put_cpu();
> + ti->vcpu = NULL;
> + set_bit(KVM_REQ_KICK, &vcpu->requests);
> + vcpu->run->ready_for_interrupt_injection = 1;
> + kvm_ungrant_mpls();
> + kvm_save_sprs(vcpu);
> + __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
> + kvm_guest_context_exit(vcpu);
> + local_irq_enable();
> +}
> +
> +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +{
> + int r = 1;
> +
> + while (r > 0) {
> + kvm_guest_enter();
> + kvm_tile_run(vcpu);
> + kvm_guest_exit();
> +
> + r = kvm_handle_exit(vcpu);
> + /*
> + * <0: error for userspace.
> + * =0: QEMU to handle.
> + * >0: host os can handle it fully.
> + */
> + if (r <= 0)
> + break;
> +
> + if (signal_pending(current)) {
> + vcpu->run->exit_reason = KVM_EXIT_INTR;
> + r = -EINTR;
> + break;
> + }
> +
> +#ifdef CONFIG_HOMECACHE
> + if (current_thread_info()->homecache_cpu !=
> + smp_processor_id()) {
> + /* Do homecache migration when returning to qemu. */
> + vcpu->run->exit_reason = KVM_EXIT_INTR;
> + r = -EINTR;
> + break;
> + }
> +#endif
> +
> + kvm_resched(vcpu);
> + }
> +
> + return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +{
> + int r;
> + sigset_t sigsaved;
> +
> + /* Secondary cpus must wait until they are told they can start. */
> + if (vcpu->arch.suspended) {
> + struct completion *c = &vcpu->kvm->arch.smp_start;
> + if (wait_for_completion_interruptible(c))
> + return -EINTR;
> + vcpu->arch.suspended = 0;
> + }
> +
> + if (vcpu->sigset_active)
> + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
> +
> + r = __vcpu_run(vcpu, kvm_run);
> +
> + if (vcpu->sigset_active)
> + sigprocmask(SIG_SETMASK, &sigsaved, NULL);
> +
> + return r;
> +}
> +
> +int kvm_arch_init(void *opaque)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_exit(void)
> +{
> +}
> +
> +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> +{
> + int i;
> + unsigned long resv_gfn_start;
> + struct kvm_memory_slot *s;
> + struct kvm *kvm = vcpu->kvm;
> +
> + if (!kvm->arch.resv_gpa_start) {
> + resv_gfn_start = 0;
> +
> + for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
> + s = &kvm->memslots->memslots[i];
> +
> + if (!s->npages)
> + continue;
> +
> + if ((s->base_gfn + s->npages) > resv_gfn_start)
> + resv_gfn_start = s->base_gfn + s->npages;
> + }
> +
> + kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start);
> + }
> +
> + /* Initialize to enter fake PA=VA mode in hypervisor. */
> + vcpu->arch.guest_context.page_table = HV_CTX_NONE;
> +
> + vcpu->arch.ipi_gpa =
> + kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE);
> + vcpu->arch.ipi_gpte =
> + pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL);
> +
> + /* Mark the core suspended if it is not the boot cpu. */
> + vcpu->arch.suspended = is_secondary_vcpu(vcpu);
> +
> + return 0;
> +}
> +
> +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> + /* Notify simulator that this task handles this vcpu. */
> + sim_set_vcpu(vcpu->vcpu_id);
> +}
> +
> +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> +{
> + sim_clear_vcpu();
> +}
> +
> +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
> +{
> + /* FIXME: some archs set up a cache for these structs? */
> + struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
> + int rc;
> +
> + if (!vcpu)
> + return ERR_PTR(-ENOMEM);
> +
> + rc = kvm_vcpu_init(vcpu, kvm, id);
> + if (rc) {
> + kfree(vcpu);
> + return ERR_PTR(rc);
> + }
> +
> + return vcpu;
> +}
> +
> +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
> +{
> + memset(&vcpu->arch.regs, 0, sizeof(struct pt_regs));
> + memset(&vcpu->arch.sregs, 0, sizeof(struct pt_regs));
> + vcpu->arch.sregs.IPI_MASK_1 = -1UL;
> + vcpu->arch.sregs.INTERRUPT_MASK_1 = -1UL;
> + vcpu->arch.sregs.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
> +{
> + kvm_vcpu_uninit(vcpu);
> + kfree(vcpu);
> +}
> +
> +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> +{
> + return kvm_arch_vcpu_destroy(vcpu);
> +}
> +
> +int kvm_arch_hardware_enable(void *garbage)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_hardware_disable(void *garbage)
> +{
> +}
> +
> +int kvm_arch_hardware_setup(void)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_hardware_unsetup(void)
> +{
> +}
> +
> +void kvm_arch_check_processor_compat(void *rtn)
> +{
> +}
> +
> +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> +{
> + if (type)
> + return -EINVAL;
> +
> + init_completion(&kvm->arch.smp_start);
> + return 0;
> +}
> +
> +void kvm_arch_destroy_vm(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + kvm_arch_vcpu_free(vcpu);
> +
> + /* Seems to be unnecessary? */
> + mutex_lock(&kvm->lock);
> + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
> + kvm->vcpus[i] = NULL;
> +
> + atomic_set(&kvm->online_vcpus, 0);
> + mutex_unlock(&kvm->lock);
> +
> + /* FIXME: release all the pmds and ptes as well! */
> + if (kvm->arch.vpgd)
> + pgd_free(kvm->mm, kvm->arch.vpgd);
> +}
> +
> +void kvm_arch_sync_events(struct kvm *kvm)
> +{
> +}
> +
> +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> +/* Called from guest hv glue via swint0 traps. */
> +void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
> +{
> + /* Hypercalls are only valid from PL1. */
> + if (EX1_PL(regs->ex1) != 0) {
> + kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
> + /*NORETURN*/
> + }
> + do_trap(regs, fault_num, 0);
> +}
> +
> +void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
> + unsigned long fault_addr, unsigned long write)
> +{
> + struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
> + BUG_ON(vcpu == NULL);
> + vcpu->arch.fault_addr = fault_addr;
> + kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
> + /*NORETURN*/
> +}
> +
> +void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
> +{
> + kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
> + /*NORETURN*/
> +}
> +
> +void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
> +{
> + struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
> + vcpu->run->exit_reason = exit_reason;
> + vcpu->arch.regs = *regs;
> + vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
> + kvm_vmexit(vcpu->arch.host_sp);
> + /*NORETURN*/
> +}
> +
> +static int __init kvm_tile_init(void)
> +{
> + return kvm_init(NULL, sizeof(struct kvm_vcpu),
> + __alignof__(struct kvm_vcpu), THIS_MODULE);
> +}
> +
> +static void __exit kvm_tile_exit(void)
> +{
> + kvm_exit();
> +}
> +
> +module_init(kvm_tile_init);
> +module_exit(kvm_tile_exit);
> diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
> index 82733c8..1590282 100644
> --- a/arch/tile/lib/exports.c
> +++ b/arch/tile/lib/exports.c
> @@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);
>
> /* hypervisor glue */
> #include <hv/hypervisor.h>
> +EXPORT_SYMBOL(hv_confstr);
> +EXPORT_SYMBOL(hv_dev_close);
> EXPORT_SYMBOL(hv_dev_open);
> +EXPORT_SYMBOL(hv_dev_poll);
> +EXPORT_SYMBOL(hv_dev_poll_cancel);
> EXPORT_SYMBOL(hv_dev_pread);
> -EXPORT_SYMBOL(hv_dev_pwrite);
> EXPORT_SYMBOL(hv_dev_preada);
> +EXPORT_SYMBOL(hv_dev_pwrite);
> EXPORT_SYMBOL(hv_dev_pwritea);
> -EXPORT_SYMBOL(hv_dev_poll);
> -EXPORT_SYMBOL(hv_dev_poll_cancel);
> -EXPORT_SYMBOL(hv_dev_close);
> -EXPORT_SYMBOL(hv_sysconf);
> -EXPORT_SYMBOL(hv_confstr);
> +EXPORT_SYMBOL(hv_flush_all);
> EXPORT_SYMBOL(hv_get_rtc);
> +#ifdef __tilegx__
> +EXPORT_SYMBOL(hv_inquire_guest_context);
> +EXPORT_SYMBOL(hv_install_guest_context);
> +EXPORT_SYMBOL(hv_install_virt_context);
> +#endif
> +EXPORT_SYMBOL(hv_physaddr_read64);
> +EXPORT_SYMBOL(hv_physaddr_write64);
> EXPORT_SYMBOL(hv_set_rtc);
> +EXPORT_SYMBOL(hv_sysconf);
>
> /* libgcc.a */
> uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
> diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
> index 23f044e..86cff48 100644
> --- a/arch/tile/mm/elf.c
> +++ b/arch/tile/mm/elf.c
> @@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm)
> char *buf, *path;
> struct vm_area_struct *vma;
>
> +#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
> if (!sim_is_simulator())
> +#endif
> return 1;
>
> if (mm->exe_file == NULL)
> diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
> index 64eec3f..39c48cb 100644
> --- a/arch/tile/mm/fault.c
> +++ b/arch/tile/mm/fault.c
> @@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
> flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> (write ? FAULT_FLAG_WRITE : 0));
>
> - is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
> + is_kernel_mode = !user_mode(regs);
>
> tsk = validate_current();
>
> @@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
> }
>
> #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
> - if (EX1_PL(regs->ex1) != USER_PL) {
> + if (!user_mode(regs)) {
> struct async_tlb *async;
> switch (fault_num) {
> #if CHIP_HAS_TILE_DMA()
> diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
> index 3bfa127..c6d2160 100644
> --- a/arch/tile/mm/init.c
> +++ b/arch/tile/mm/init.c
> @@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
> {
> int cpu;
> unsigned long page;
> - enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
> + enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
>
> #if CHIP_HAS_CBOX_HOME_MAP()
> /* For kdata=huge, everything is just hash-for-home. */
> @@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
> }
> }
>
> - address = MEM_SV_INTRPT;
> + address = MEM_SV_START;
> pmd = get_pmd(pgtables, address);
> pfn = 0; /* code starts at PA 0 */
> if (ktext_small) {
> @@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
>
> void free_initmem(void)
> {
> - const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
> + const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
>
> /*
> * Evict the dirty initdata on the boot cpu, evict the w1data
> @@ -1040,7 +1040,7 @@ void free_initmem(void)
>
> /*
> * Free the pages mapped from 0xc0000000 that correspond to code
> - * pages from MEM_SV_INTRPT that we won't use again after init.
> + * pages from MEM_SV_START that we won't use again after init.
> */
> free_init_pages("unused kernel text",
> (unsigned long)_sinittext - text_delta,
> diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
> index 3004433..d6948d4 100644
> --- a/arch/tile/mm/pgtable.c
> +++ b/arch/tile/mm/pgtable.c
> @@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
>
> #if CHIP_HAS_MMIO()
>
> -/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
> -void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
> - pgprot_t home)
> +void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
> + unsigned long flags, pgprot_t prot)
> {
> void *addr;
> struct vm_struct *area;
> unsigned long offset, last_addr;
> - pgprot_t pgprot;
>
> /* Don't allow wraparound or zero size */
> last_addr = phys_addr + size - 1;
> if (!size || last_addr < phys_addr)
> return NULL;
>
> - /* Create a read/write, MMIO VA mapping homed at the requested shim. */
> - pgprot = PAGE_KERNEL;
> - pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
> - pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
> -
> /*
> * Mappings have to be page-aligned
> */
> @@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
> /*
> * Ok, go for it..
> */
> - area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
> + area = get_vm_area(size, flags);
> if (!area)
> return NULL;
> area->phys_addr = phys_addr;
> addr = area->addr;
> if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
> - phys_addr, pgprot)) {
> + phys_addr, prot)) {
> free_vm_area(area);
> return NULL;
> }
> - return (__force void __iomem *) (offset + (char *)addr);
> + return (void *) (offset + (char *)addr);
> +}
> +EXPORT_SYMBOL(generic_remap_prot);
> +
> +/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
> +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
> + pgprot_t home)
> +{
> + pgprot_t pgprot;
> + unsigned long flags;
> +
> + /* Create a read/write, MMIO VA mapping homed at the requested shim. */
> + pgprot = PAGE_KERNEL;
> + pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
> + pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
> + flags = VM_IOREMAP; /* | other flags? */
> +
> + return (__force void __iomem *) generic_remap_prot(phys_addr,
> + size, flags, pgprot);
> }
> EXPORT_SYMBOL(ioremap_prot);
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index acccd08..b622337 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -171,6 +171,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_WATCHDOG 21
> #define KVM_EXIT_S390_TSCH 22
> #define KVM_EXIT_EPR 23
> +#define KVM_EXIT_AGAIN 24
>
> /* For KVM_EXIT_INTERNAL_ERROR */
> /* Emulate instruction failed. */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1580dd4..1b8a1f1 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1691,7 +1691,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
> finish_wait(&vcpu->wq, &wait);
> }
>
> -#ifndef CONFIG_S390
> +#if !defined(CONFIG_S390) && !defined(CONFIG_TILE)
> /*
> * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
> */
> @@ -1714,7 +1714,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
> put_cpu();
> }
> EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
> -#endif /* !CONFIG_S390 */
> +#endif
>
> void kvm_resched(struct kvm_vcpu *vcpu)
> {
> @@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
> if (vcpu->kvm->mm != current->mm)
> return -EIO;
>
> -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
> +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
> + defined(CONFIG_TILEGX)
> /*
> * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
> * so vcpu_load() would break it.
> --
> 1.8.3.1
--
Gleb.
On 8/25/2013 7:39 AM, Gleb Natapov wrote:
> On Mon, Aug 12, 2013 at 04:24:11PM -0400, Chris Metcalf wrote:
>> This change provides the initial framework support for KVM on tilegx.
>> Basic virtual disk and networking is supported.
>>
> This needs to be broken down to more reviewable patches.
I already broke out one pre-requisite patch that wasn't strictly KVM-related:
https://lkml.org/lkml/2013/8/12/339
In addition, we've separately arranged to support booting our kernels in a way that is compatible with the Tilera booter running at the highest privilege level, which enables multiple kernel privilege levels:
https://lkml.org/lkml/2013/5/2/468
How would you recommend further breaking down this patch? It's pretty much just the basic support for minimal KVM. I suppose I could break out all the I/O related stuff into a separate patch, though it wouldn't amount to much; perhaps the console could also be broken out separately. Any other suggestions?
> Also can you
> describe the implementation a little bit? Does the tile arch have a
> virtualization extension that this implementation uses, or is it a trap and
> emulate approach? If the latter, does it run unmodified guest kernels? What userspace are you
> using with this implementation?
We could do full virtualization via trap and emulate, but we've elected to do a para-virtualized approach. Userspace runs at PL (privilege level) 0, the guest kernel runs at PL1, and the host runs at PL2. We have available per-PL resources for various things, and take advantage of having two on-chip timers (for example) to handle timing for the host and guest kernels. We run the same userspace with either the host or the guest.
--
Chris Metcalf, Tilera Corp.
http://www.tilera.com
On Sun, Aug 25, 2013 at 09:26:47PM -0400, Chris Metcalf wrote:
> On 8/25/2013 7:39 AM, Gleb Natapov wrote:
> > On Mon, Aug 12, 2013 at 04:24:11PM -0400, Chris Metcalf wrote:
> >> This change provides the initial framework support for KVM on tilegx.
> >> Basic virtual disk and networking is supported.
> >>
> > This needs to be broken down into more reviewable patches.
>
> I already broke out one pre-requisite patch that wasn't strictly KVM-related:
>
> https://lkml.org/lkml/2013/8/12/339
>
> In addition, we've separately arranged to support booting our kernels in a way that is compatible with the Tilera booter running at the highest privilege level, which enables multiple kernel privilege levels:
>
> https://lkml.org/lkml/2013/5/2/468
>
> How would you recommend further breaking down this patch? It's pretty much just the basic support for minimal KVM. I suppose I could break out all the I/O related stuff into a separate patch, though it wouldn't amount to much; perhaps the console could also be broken out separately. Any other suggestions?
>
First of all, please break out the host and guest bits. Also break out the
I/O related stuff, as you suggest (so that the guest PV bits are in a
separate patch), and the changes to common code (not much, as far as I can
see), with an explanation of why they are needed. (Why is kvm_vcpu_kick()
not needed, for instance?)
> > Also can you
> > describe the implementation a little bit? Does the tile arch have a
> > virtualization extension this implementation uses, or is it a trap and
> > emulate approach? If the latter, does it run unmodified guest kernels?
> > What userspace are you using with this implementation?
>
> We could do full virtualization via trap and emulate, but we've elected to do a para-virtualized approach. Userspace runs at PL (privilege level) 0, the guest kernel runs at PL1, and the host runs at PL2. We have available per-PL resources for various things, and take advantage of having two on-chip timers (for example) to handle timing for the host and guest kernels. We run the same userspace with either the host or the guest.
>
OK, thanks for the explanation. Why have you decided to do PV over trap and
emulate?
--
Gleb.
On 8/26/2013 8:04 AM, Gleb Natapov wrote:
> On Sun, Aug 25, 2013 at 09:26:47PM -0400, Chris Metcalf wrote:
>> On 8/25/2013 7:39 AM, Gleb Natapov wrote:
>>> On Mon, Aug 12, 2013 at 04:24:11PM -0400, Chris Metcalf wrote:
>>>> This change provides the initial framework support for KVM on tilegx.
>>>> Basic virtual disk and networking is supported.
>>>>
>>> This needs to be broken down into more reviewable patches.
>> I already broke out one pre-requisite patch that wasn't strictly KVM-related:
>>
>> https://lkml.org/lkml/2013/8/12/339
>>
>> In addition, we've separately arranged to support booting our kernels in a way that is compatible with the Tilera booter running at the highest privilege level, which enables multiple kernel privilege levels:
>>
>> https://lkml.org/lkml/2013/5/2/468
>>
>> How would you recommend further breaking down this patch? It's pretty much just the basic support for minimal KVM. I suppose I could break out all the I/O related stuff into a separate patch, though it wouldn't amount to much; perhaps the console could also be broken out separately. Any other suggestions?
>>
> First of all, please break out the host and guest bits. Also break out the
> I/O related stuff, as you suggest (so that the guest PV bits are in a
> separate patch), and the changes to common code (not much, as far as I can
> see), with an explanation of why they are needed. (Why is kvm_vcpu_kick()
> not needed, for instance?)
I broke it down into three pieces in the end: the basic host support, the basic guest PV support, and the virtio/console support. The first piece is still much the biggest. I found that the generic kvm_vcpu_kick() is fine, so I removed the custom version (which predated the generic version in our internal tree). Explanations are now in the git commit comments.
>>> Also can you
>>> describe the implementation a little bit? Does the tile arch have a
>>> virtualization extension this implementation uses, or is it a trap and
>>> emulate approach? If the latter, does it run unmodified guest kernels?
>>> What userspace are you using with this implementation?
>> We could do full virtualization via trap and emulate, but we've elected to do a para-virtualized approach. Userspace runs at PL (privilege level) 0, the guest kernel runs at PL1, and the host runs at PL2. We have available per-PL resources for various things, and take advantage of having two on-chip timers (for example) to handle timing for the host and guest kernels. We run the same userspace with either the host or the guest.
>>
> OK, thanks for the explanation. Why have you decided to do PV over trap and
> emulate?
Performance and simplicity; I added comments to the git commit to provide a rationale.
--
Chris Metcalf, Tilera Corp.
http://www.tilera.com
This commit enables the host side of KVM support for tilegx.
KVM support on tilegx presumes a client that runs at privilege level 1
(PL1), above normal user programs, which continue to run at PL0, but
below the normal (host) Linux, which runs at PL2. Omitting all "trap
and emulate" support both simplifies the host and allows a
paravirtualized guest to run with better overall performance; we may
elect to add emulation support in the future.
We don't support huge pages, or any of the tile-specific APIs that
require being locked to specific cores (e.g., UDN hardwall).
The Tilera booter/hypervisor (which runs at PL3) has been extended to
support a guest context (used for interrupts at PL1) as well as a
virtualization context that is applied to map guest PAs to real PAs.
Note that the eventual plan for the Tilera software stack is to
migrate away from having the Tilera booter provide PL3 functionality
like this and instead provide it all in the host kernel at PL2.
Meanwhile, the kvm host provides the Tilera hypervisor API to the
paravirtualized guest.
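As a rough sketch of how the two contexts fit together (illustrative only;
the values passed here are placeholders, not the ones the real code in
kvm-tile.c computes, and install_guest_translation() is a hypothetical
helper name):

	#include <hv/hypervisor.h>

	/*
	 * Sketch: arm the two translation stages before resuming a vcpu.
	 * The virtualization context (which maps guest PAs to real CPAs)
	 * must be installed first; the guest context root is then given
	 * as a guest PA, not a CPA.
	 */
	static int install_guest_translation(HV_PhysAddr vpgd_pa,
					     HV_PhysAddr guest_pgd_gpa,
					     HV_PTE access, HV_ASID asid)
	{
		int rc = hv_install_virt_context(vpgd_pa, access, asid, 0);
		if (rc != 0)
			return rc;
		return hv_install_guest_context(guest_pgd_gpa, access, asid, 0);
	}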
The commit adds a KVM_EXIT_xxx code, KVM_EXIT_AGAIN, which is used to
exit out to the host kernel, but not all the way out to qemu. This is
helpful if we are trying to handle resched, sigpending, etc., but don't
need to end up back in userspace first.
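For illustration only (a sketch, not code from this patch; the real loop
lives in kvm-tile.c and enter_guest() below is a hypothetical stand-in for
the actual vmresume path), a host-side run loop might consume
KVM_EXIT_AGAIN roughly like this:

	#include <linux/kvm_host.h>
	#include <linux/sched.h>

	/*
	 * Sketch: KVM_EXIT_AGAIN is absorbed inside the host kernel.
	 * Pending work (resched, signal delivery) is handled here and the
	 * guest is re-entered; any other exit reason is returned so that
	 * qemu sees it as usual.
	 */
	static int vcpu_run_loop(struct kvm_vcpu *vcpu)
	{
		int ret;

		for (;;) {
			ret = enter_guest(vcpu);	/* hypothetical helper */
			if (vcpu->run->exit_reason != KVM_EXIT_AGAIN)
				return ret;		/* pass exit up to qemu */
			if (signal_pending(current))
				return -EINTR;		/* deliver via userspace */
			if (need_resched())
				cond_resched();		/* stay in the kernel */
		}
	}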
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/Kconfig | 2 +-
arch/tile/Makefile | 1 +
arch/tile/include/asm/io.h | 2 +
arch/tile/include/asm/kvm.h | 29 +
arch/tile/include/asm/kvm_host.h | 95 ++
arch/tile/include/asm/processor.h | 4 +-
arch/tile/include/asm/ptrace.h | 2 +-
arch/tile/include/asm/thread_info.h | 17 +-
arch/tile/include/hv/hypervisor.h | 183 +++-
arch/tile/include/uapi/arch/sim.h | 19 +
arch/tile/include/uapi/arch/sim_def.h | 8 +
arch/tile/include/uapi/arch/spr_def_32.h | 15 +
arch/tile/include/uapi/arch/spr_def_64.h | 25 +
arch/tile/include/uapi/asm/Kbuild | 1 +
arch/tile/include/uapi/asm/kvm.h | 262 +++++
arch/tile/kernel/asm-offsets.c | 7 +
arch/tile/kernel/hvglue.S | 7 +-
arch/tile/kernel/hvglue_trace.c | 14 +
arch/tile/kernel/intvec_32.S | 12 +-
arch/tile/kernel/intvec_64.S | 192 +++-
arch/tile/kernel/process.c | 38 +-
arch/tile/kernel/setup.c | 13 +
arch/tile/kernel/smp.c | 28 +-
arch/tile/kernel/stack.c | 2 +-
arch/tile/kvm/Kconfig | 3 -
arch/tile/kvm/Makefile | 12 +
arch/tile/kvm/entry.S | 91 ++
arch/tile/kvm/kvm-tile.c | 1529 ++++++++++++++++++++++++++++++
arch/tile/lib/exports.c | 20 +-
arch/tile/mm/fault.c | 4 +-
arch/tile/mm/pgtable.c | 35 +-
include/uapi/linux/kvm.h | 1 +
virt/kvm/kvm_main.c | 3 +-
33 files changed, 2558 insertions(+), 118 deletions(-)
create mode 100644 arch/tile/include/asm/kvm.h
create mode 100644 arch/tile/include/asm/kvm_host.h
create mode 100644 arch/tile/include/uapi/asm/kvm.h
create mode 100644 arch/tile/kvm/Makefile
create mode 100644 arch/tile/kvm/entry.S
create mode 100644 arch/tile/kvm/kvm-tile.c
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index ecff467..3bc8fb7 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -5,7 +5,6 @@ config TILE
def_bool y
select HAVE_DMA_ATTRS
select HAVE_DMA_API_DEBUG
- select HAVE_KVM if !TILEGX
select GENERIC_FIND_FIRST_BIT
select SYSCTL_EXCEPTION_TRACE
select USE_GENERIC_SMP_HELPERS
@@ -127,6 +126,7 @@ config TILEGX
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KPROBES
select HAVE_KRETPROBES
+ select HAVE_KVM
config TILEPRO
def_bool !TILEGX
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364..8e7f852 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH)
# See arch/tile/Kbuild for content of core part of the kernel
core-y += arch/tile/
+core-$(CONFIG_KVM) += arch/tile/kvm/
core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 9fe4349..023659b 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -43,6 +43,8 @@
* long before casting it to a pointer to avoid compiler warnings.
*/
#if CHIP_HAS_MMIO()
+extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot);
extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
pgprot_t pgprot);
diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
new file mode 100644
index 0000000..2ea6c41
--- /dev/null
+++ b/arch/tile/include/asm/kvm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_H
+#define _ASM_TILE_KVM_H
+
+#include <hv/hypervisor.h>
+#include <uapi/asm/kvm.h>
+
+#ifndef __ASSEMBLER__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
+#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
+#endif
+#endif /* _ASM_TILE_KVM_H */
diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
new file mode 100644
index 0000000..1c9b6cd
--- /dev/null
+++ b/arch/tile/include/asm/kvm_host.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _ASM_TILE_KVM_HOST_H
+#define _ASM_TILE_KVM_HOST_H
+
+#define KVM_MAX_VCPUS 64
+#define KVM_USER_MEM_SLOTS 32
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+/* For now, claim we have no huge pages. */
+#define KVM_HPAGE_GFN_SHIFT(x) 0
+#define KVM_NR_PAGE_SIZES 1
+#define KVM_PAGES_PER_HPAGE(x) 1
+
+/* Max number of message tags for hv_send/receive_message() */
+#define MAX_MSG_TAG (sizeof(unsigned long) * 8)
+
+/* Bits in pending_downcalls */
+#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kvm_vcpu_stat {
+ u32 halt_wakeup;
+};
+
+struct kvm_vcpu_arch {
+ struct pt_regs regs;
+ struct kvm_sregs sregs;
+ unsigned long host_sp; /* Host "real" sp during vmresume. */
+ HV_Context guest_context;
+ unsigned long pending_msgs; /* Pending guest messages */
+ unsigned long ipi_events; /* Pending guest ipi events. */
+ unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
+ pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
+ unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */
+ int suspended; /* true for cores not yet started by host */
+ unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */
+ unsigned long vmexit_cycles; /* cycle count of last vmexit */
+};
+
+struct kvm_vm_stat {
+ u32 remote_tlb_flush;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+struct kvm_arch {
+ pgd_t *vpgd;
+ unsigned long resv_gpa_start; /* For special purpose. */
+ struct completion smp_start;
+};
+
+struct kvm_vcpu;
+
+extern void kvm_vmresume(struct pt_regs *guest,
+ unsigned long *host_sp_ptr);
+extern void kvm_vmexit(unsigned long host_sp);
+extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
+extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
+extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long, unsigned long);
+extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
+
+#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
+
+#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
+
+#define gpmd_offset(kvm, pud, address) \
+ ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
+
+#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
+
+#define gpte_offset_kernel(kvm, pmd, address) \
+ ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
+
+#endif /* __ASSEMBLY__*/
+
+#endif /* _ASM_TILE_KVM_HOST_H */
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 230b830..c72fcba 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
#ifndef _ASM_TILE_PROCESSOR_H
#define _ASM_TILE_PROCESSOR_H
+#include <arch/chip.h>
+
#ifndef __ASSEMBLY__
/*
@@ -25,7 +27,6 @@
#include <asm/ptrace.h>
#include <asm/percpu.h>
-#include <arch/chip.h>
#include <arch/spr_def.h>
struct task_struct;
@@ -347,7 +348,6 @@ extern int kdata_huge;
/*
* Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
*/
#define USER_PL 0
#if CONFIG_KERNEL_PL == 2
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 0d25c21..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
#define user_stack_pointer(regs) ((regs)->sp)
/* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
/* Fill in a struct pt_regs with the current kernel registers. */
struct pt_regs *get_pt_regs(struct pt_regs *);
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index b8aa6df..1c26cdf 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -18,7 +18,9 @@
#include <asm/processor.h>
#include <asm/page.h>
+
#ifndef __ASSEMBLY__
+struct kvm_vcpu;
/*
* Low level task data that assembly code needs immediate access to.
@@ -44,6 +46,9 @@ struct thread_info {
unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */
void __user *unalign_jit_base; /* unalign fixup JIT base */
#endif
+#ifdef CONFIG_KVM
+ struct kvm_vcpu *vcpu; /* vcpu during vmresume */
+#endif
};
/*
@@ -117,8 +122,8 @@ extern void _cpu_idle(void);
/*
* Thread information flags that various assembly files may need to access.
- * Keep flags accessed frequently in low bits, particular since it makes
- * it easier to build constants in assembly.
+ * Keep flags accessed frequently in low bits, since it makes it
+ * easier to build constants in assembly.
*/
#define TIF_SIGPENDING 0 /* signal pending */
#define TIF_NEED_RESCHED 1 /* rescheduling necessary */
@@ -131,6 +136,7 @@ extern void _cpu_idle(void);
#define TIF_MEMDIE 7 /* OOM killer at work */
#define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
#define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
+#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */
#define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
#define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
@@ -142,11 +148,12 @@ extern void _cpu_idle(void);
#define _TIF_MEMDIE (1<<TIF_MEMDIE)
#define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
#define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VIRT_EXIT (1<<TIF_VIRT_EXIT)
/* Work to do on any return to user space. */
-#define _TIF_ALLWORK_MASK \
- (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
- _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
+#define _TIF_ALLWORK_MASK \
+ (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP| \
+ _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)
/* Work to do at syscall entry. */
#define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f71b08e..71abe38 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -321,6 +321,18 @@
/** hv_set_speed */
#define HV_DISPATCH_SET_SPEED 58
+/** hv_install_virt_context */
+#define HV_DISPATCH_INSTALL_VIRT_CONTEXT 59
+
+/** hv_inquire_virt_context */
+#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT 60
+
+/** hv_install_guest_context */
+#define HV_DISPATCH_INSTALL_GUEST_CONTEXT 61
+
+/** hv_inquire_guest_context */
+#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT 62
+
/** hv_console_set_ipi */
#define HV_DISPATCH_CONSOLE_SET_IPI 63
@@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
* new page table does not need to contain any mapping for the
* hv_install_context address itself.
*
- * At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ * At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
* if multiple flags are specified, HV_EINVAL is returned.
* Specifying none of the flags results in using the default page size.
* All cores participating in a given client must request the same
* page size, or the results are undefined.
*
+ * To disable an installed page table, install HV_CTX_NONE. The access
+ * and asid fields are ignored.
+ *
* @param page_table Root of the page table.
* @param access PTE providing info on how to read the page table. This
* value must be consistent between multiple tiles sharing a page table,
@@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
#endif /* !__ASSEMBLER__ */
+#define HV_CTX_NONE ((HV_PhysAddr)-1) /**< Disable page table. */
+
#define HV_CTX_DIRECTIO 0x1 /**< Direct I/O requests are accepted from
PL0. */
+#define HV_CTX_GUEST_CACHE 0x4 /**< Let guest control caching flags (only
+ usable with hv_install_virt_context.) */
+
#define HV_CTX_PG_SM_4K 0x10 /**< Use 4K small pages, if available. */
#define HV_CTX_PG_SM_16K 0x20 /**< Use 16K small pages, if available. */
#define HV_CTX_PG_SM_64K 0x40 /**< Use 64K small pages, if available. */
#define HV_CTX_PG_SM_MASK 0xf0 /**< Mask of all possible small pages. */
+
#ifndef __ASSEMBLER__
+/** Install a virtualization context.
+ *
+ * When a virtualization context is installed, all faults from PL0 or
+ * PL1 are handled via a "guest context" and then post-processed by
+ * the "virtualization context"; faults at PL2 are still handled by
+ * the normal context. For guest faults, the "guest PAs" produced by
+ * the guest page table are passed through the virtualization page
+ * table as pseudo-VAs, generating the true CPA as a result. See the
+ * individual HV_PTE_xxx bits for the effect the bits have when
+ * present in the virtualization page table. The ASID is currently
+ * ignored in this syscall, but it might be used later, so the API
+ * includes it. The HV_CTX_GUEST_CACHE flag indicates that all
+ * cache-related flags should be taken from the primary page table,
+ * not the virtualization page table.
+ *
+ * Once the virtualization context is installed, a guest context
+ * should also be installed; otherwise a VA-equals-PA context will be
+ * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
+ * the virtualization context to generate CPAs.
+ *
+ * When entering client PL after being at guest or user PL, the
+ * client is expected to call hv_flush_all() to clear any TLB mappings
+ * that might otherwise conflict. Similarly, hv_flush_all() should
+ * be called before returning to guest or user PL with a virtualization
+ * context installed, so that any TLB mappings are cleared. Future
+ * work may include adding a "vpid" or similar namespace so that
+ * the TLBs may be managed independently.
+ *
+ * Subsequent guest page table installations will have their root PA
+ * and PTE cached after translating through the virtualization
+ * context, so if entries in the virtualization page table are
+ * modified or removed, the guest context should be re-installed.
+ * This, in conjunction with flushing the TLB on return to the guest,
+ * will ensure that the new virtualization entries are honored.
+ *
+ * @param page_table Root of the page table.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for (currently ignored).
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current virtualization context (see below).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
+
+
+/** Install a guest context.
+ *
+ * The guest context is only consulted when a virtualization context
+ * is also installed, and for faults that occur below the client's PL.
+ * If, in such a case, no guest context is installed, a VA=PA context
+ * is used instead.
+ *
+ * The access PTE will only be honored if the virtualization table was
+ * installed with HV_CTX_GUEST_CACHE.
+ *
+ * A virtualization context must already be installed prior to
+ * installing the guest context.
+ *
+ * @param page_table Root of the page table; the value is the guest's
+ * physical address (GPA), not a CPA.
+ * @param access PTE providing info on how to read the page table. This
+ * value must be consistent between multiple tiles sharing a page table,
+ * and must also be consistent with any virtual mappings the client
+ * may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for.
+ * @param flags Context flags, denoting attributes or privileges of the
+ * current context (HV_CTX_xxx).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
+ HV_ASID asid, __hv32 flags);
+
/** Set the number of pages ganged together by HV_PTE_SUPER at a
* particular level of the page table.
@@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
* "super" page size must be less than the span of the next level in
* the page table. The largest size that can be requested is 64GB.
*
- * The shift value is initially "0" for all page table levels,
+ * The shift value is initially 0 for all page table levels,
* indicating that the HV_PTE_SUPER bit is effectively ignored.
*
* If you change the count from one non-zero value to another, the
@@ -854,11 +954,26 @@ typedef struct
} HV_Context;
/** Retrieve information about the currently installed context.
- * @return The data passed to the last successful hv_install_context call.
+ * @return The data passed to the last successful call to
+ * hv_install_context().
*/
HV_Context hv_inquire_context(void);
+/** Retrieve information about the currently installed virtualization context.
+ * @return The data passed to the last successful call to
+ * hv_install_virt_context().
+ */
+HV_Context hv_inquire_virt_context(void);
+
+
+/** Retrieve information about the currently installed guest context.
+ * @return The data passed to the last successful call to
+ * hv_install_guest_context().
+ */
+HV_Context hv_inquire_guest_context(void);
+
+
/** Flushes all translations associated with the named address space
* identifier from the TLB and any other hypervisor data structures.
* Translations installed with the "global" bit are not flushed.
@@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
/** Flushes all non-global translations (if preserve_global is true),
* or absolutely all translations (if preserve_global is false).
*
- * @param preserve_global Non-zero if we want to preserve "global" mappings.
+ * @param preserve_global Non-zero if we want to preserve global mappings.
* @return Zero on success, or a hypervisor error code on failure.
*/
int hv_flush_all(int preserve_global);
@@ -991,7 +1106,11 @@ typedef enum {
HV_INQ_TILES_HFH_CACHE = 2,
/** The set of tiles that can be legally used as a LOTAR for a PTE. */
- HV_INQ_TILES_LOTAR = 3
+ HV_INQ_TILES_LOTAR = 3,
+
+ /** The set of "shared" driver tiles that the hypervisor may
+ * periodically interrupt. */
+ HV_INQ_TILES_SHARED = 4
} HV_InqTileSet;
/** Returns specific information about various sets of tiles within the
@@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
*/
/** Message receive downcall interrupt vector */
#define INT_MESSAGE_RCV_DWNCL INT_BOOT_ACCESS
+/** Device interrupt downcall interrupt vector */
+#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+#ifdef __tilegx__
+/** Virtualization page table miss downcall interrupt vector */
+#define INT_VPGTABLE_MISS_DWNCL INT_I_ASID
+/** Virtualization guest illegal page table */
+#define INT_VGUEST_FATAL_DWNCL INT_D_ASID
+#else
/** DMA TLB miss downcall interrupt vector */
#define INT_DMATLB_MISS_DWNCL INT_DMA_ASID
-/** Static nework processor instruction TLB miss interrupt vector */
-#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
/** DMA TLB access violation downcall interrupt vector */
#define INT_DMATLB_ACCESS_DWNCL INT_DMA_CPL
-/** Device interrupt downcall interrupt vector */
-#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
+/** Static network processor instruction TLB miss interrupt vector */
+#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
+#endif
#ifndef __ASSEMBLER__
@@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
#define HV_PTE_PTFN_BITS 29 /**< Number of bits in a PTFN */
/*
- * Legal values for the PTE's mode field
+ * Legal values for the PTE's mode field.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
+ * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
+ * to access MMIO resources via pseudo PAs that map to MMIO in the
+ * virtualization page table.
*/
+
/** Data is not resident in any caches; loads and stores access memory
* directly.
*/
@@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the primary page table if a virtualization
+ * page table is installed.
*/
#define HV_PTE_GLOBAL (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
@@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the virtualization page table.
*/
#define HV_PTE_USER (__HV_PTE_ONE << HV_PTE_INDEX_USER)
@@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_ACCESSED (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
@@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* has been cleared, subsequent references are not guaranteed to set
* it again until the translation has been flushed from the TLB.
*
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
*/
#define HV_PTE_DIRTY (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
@@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NC (__HV_PTE_ONE << HV_PTE_INDEX_NC)
@@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit
* determines how the level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L1 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
@@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
*
* In level-1 PTEs, if the Page bit is clear, this bit determines how the
* level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_NO_ALLOC_L2 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
@@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* the page map directly to memory.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
*/
#define HV_PTE_CACHED_PRIORITY (__HV_PTE_ONE << \
HV_PTE_INDEX_CACHED_PRIORITY)
@@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* It is illegal for this bit to be clear if the Writable bit is set.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Readable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_READABLE (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
@@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* PTE.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Writable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_WRITABLE (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
@@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
* than one.
*
* This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Executable status
+ * is the logical "and" of this bit in both page tables.
*/
#define HV_PTE_EXECUTABLE (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
index e54b7b0..36fb24c 100644
--- a/arch/tile/include/uapi/arch/sim.h
+++ b/arch/tile/include/uapi/arch/sim.h
@@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
__insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
}
+/**
+ * Set vCPU number for a given task.
+ * @param vcpu Virtual cpu to set.
+ */
+static __inline void
+sim_set_vcpu(int vcpu)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+/** Clear vCPU status for a given task. */
+static __inline void
+sim_clear_vcpu(void)
+{
+ __insn_mtspr(SPR_SIM_CONTROL,
+ SIM_CONTROL_VCPU | (-1 << _SIM_CONTROL_OPERATOR_BITS));
+}
+
/*
* Event support.
diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
index 4b44a2b..b9aad66 100644
--- a/arch/tile/include/uapi/arch/sim_def.h
+++ b/arch/tile/include/uapi/arch/sim_def.h
@@ -221,6 +221,14 @@
*/
#define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
+/**
+ * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
+ * number shifted by 8, will tag any identification of the cpu that
+ * task is running on with the given virtual cpu number. If the
+ * virtual cpu number is -1, the tag is removed.
+ */
+#define SIM_CONTROL_VCPU 37
+
/*
* Syscall numbers for use with "sim_syscall()".
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446..4644c8d 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -121,6 +121,9 @@
#define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
#define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
#define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_GPV_SET_0 0x0600
+#define SPR_MPL_GPV_SET_1 0x0601
+#define SPR_MPL_GPV_SET_2 0x0602
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -142,6 +145,9 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x3400
#define SPR_MPL_IDN_TIMER_SET_1 0x3401
#define SPR_MPL_IDN_TIMER_SET_2 0x3402
+#define SPR_MPL_ILL_SET_0 0x0400
+#define SPR_MPL_ILL_SET_1 0x0401
+#define SPR_MPL_ILL_SET_2 0x0402
#define SPR_MPL_INTCTRL_0_SET_0 0x4a00
#define SPR_MPL_INTCTRL_0_SET_1 0x4a01
#define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -166,6 +172,12 @@
#define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
#define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
#define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
+#define SPR_MPL_SWINT_0_SET_0 0x1c00
+#define SPR_MPL_SWINT_0_SET_1 0x1c01
+#define SPR_MPL_SWINT_0_SET_2 0x1c02
+#define SPR_MPL_SWINT_1_SET_0 0x1a00
+#define SPR_MPL_SWINT_1_SET_1 0x1a01
+#define SPR_MPL_SWINT_1_SET_2 0x1a02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
@@ -187,6 +199,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x3600
#define SPR_MPL_UDN_TIMER_SET_1 0x3601
#define SPR_MPL_UDN_TIMER_SET_2 0x3602
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
#define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
#define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
#define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
index 67a6c17..727cda7 100644
--- a/arch/tile/include/uapi/arch/spr_def_64.h
+++ b/arch/tile/include/uapi/arch/spr_def_64.h
@@ -21,6 +21,10 @@
#define SPR_AUX_PERF_COUNT_1 0x2106
#define SPR_AUX_PERF_COUNT_CTL 0x2107
#define SPR_AUX_PERF_COUNT_STS 0x2108
+#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
+#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK 0xffffffff
+#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
+#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
#define SPR_CMPEXCH_VALUE 0x2780
#define SPR_CYCLE 0x2781
#define SPR_DONE 0x2705
@@ -101,6 +105,9 @@
#define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
#define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
#define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_GPV_SET_0 0x0900
+#define SPR_MPL_GPV_SET_1 0x0901
+#define SPR_MPL_GPV_SET_2 0x0902
#define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
#define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
#define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -116,6 +123,12 @@
#define SPR_MPL_IDN_TIMER_SET_0 0x1800
#define SPR_MPL_IDN_TIMER_SET_1 0x1801
#define SPR_MPL_IDN_TIMER_SET_2 0x1802
+#define SPR_MPL_ILL_SET_0 0x0800
+#define SPR_MPL_ILL_SET_1 0x0801
+#define SPR_MPL_ILL_SET_2 0x0802
+#define SPR_MPL_ILL_TRANS_SET_0 0x1000
+#define SPR_MPL_ILL_TRANS_SET_1 0x1001
+#define SPR_MPL_ILL_TRANS_SET_2 0x1002
#define SPR_MPL_INTCTRL_0_SET_0 0x2500
#define SPR_MPL_INTCTRL_0_SET_1 0x2501
#define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -140,6 +153,15 @@
#define SPR_MPL_PERF_COUNT_SET_0 0x2000
#define SPR_MPL_PERF_COUNT_SET_1 0x2001
#define SPR_MPL_PERF_COUNT_SET_2 0x2002
+#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
+#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
+#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
+#define SPR_MPL_SWINT_0_SET_0 0x0f00
+#define SPR_MPL_SWINT_0_SET_1 0x0f01
+#define SPR_MPL_SWINT_0_SET_2 0x0f02
+#define SPR_MPL_SWINT_1_SET_0 0x0e00
+#define SPR_MPL_SWINT_1_SET_1 0x0e01
+#define SPR_MPL_SWINT_1_SET_2 0x0e02
#define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
#define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
#define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -155,6 +177,9 @@
#define SPR_MPL_UDN_TIMER_SET_0 0x1900
#define SPR_MPL_UDN_TIMER_SET_1 0x1901
#define SPR_MPL_UDN_TIMER_SET_2 0x1902
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
#define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
#define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
#define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index c20db8e..89022a5 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -6,6 +6,7 @@ header-y += bitsperlong.h
header-y += byteorder.h
header-y += cachectl.h
header-y += hardwall.h
+header-y += kvm.h
header-y += kvm_para.h
header-y += mman.h
header-y += ptrace.h
diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
new file mode 100644
index 0000000..aa7b97f
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_H
+#define _UAPI_ASM_TILE_KVM_H
+
+#ifndef __ASSEMBLER__
+#include <linux/ptrace.h>
+#endif
+
+#include <arch/abi.h>
+
+/*
+ * For Hypervisor syscalls. Note this comes from the hv: syscall.h,
+ * with small modifications: Remove HV_SYS_fence_incoherent.
+ */
+/* Syscall allowed from guest PL bit mask. */
+#define HV_SYS_GUEST_SHIFT 12
+#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT)
+/* downcall_dispatch; this syscall number must be zero */
+#define HV_SYS_downcall_dispatch 0
+/* install_context */
+#define HV_SYS_install_context 1
+/* sysconf */
+#define HV_SYS_sysconf 2
+/* get_rtc */
+#define HV_SYS_get_rtc 3
+/* set_rtc */
+#define HV_SYS_set_rtc 4
+/* flush_asid */
+#define HV_SYS_flush_asid 5
+/* flush_page */
+#define HV_SYS_flush_page 6
+/* flush_pages */
+#define HV_SYS_flush_pages 7
+/* restart */
+#define HV_SYS_restart 8
+/* halt */
+#define HV_SYS_halt 9
+/* power_off */
+#define HV_SYS_power_off 10
+/* inquire_physical */
+#define HV_SYS_inquire_physical 11
+/* inquire_memory_controller */
+#define HV_SYS_inquire_memory_controller 12
+/* inquire_virtual */
+#define HV_SYS_inquire_virtual 13
+/* inquire_asid */
+#define HV_SYS_inquire_asid 14
+/* console_read_if_ready */
+#define HV_SYS_console_read_if_ready 15
+/* console_write */
+#define HV_SYS_console_write 16
+/* init */
+#define HV_SYS_init 17
+/* inquire_topology */
+#define HV_SYS_inquire_topology 18
+/* fs_findfile */
+#define HV_SYS_fs_findfile 19
+/* fs_fstat */
+#define HV_SYS_fs_fstat 20
+/* fs_pread */
+#define HV_SYS_fs_pread 21
+/* physaddr_read64 */
+#define HV_SYS_physaddr_read64 22
+/* physaddr_write64 */
+#define HV_SYS_physaddr_write64 23
+/* get_command_line */
+#define HV_SYS_get_command_line 24
+/* set_caching */
+#define HV_SYS_set_caching 25
+/* bzero_page */
+#define HV_SYS_bzero_page 26
+/* register_message_state */
+#define HV_SYS_register_message_state 27
+/* send_message */
+#define HV_SYS_send_message 28
+/* receive_message */
+#define HV_SYS_receive_message 29
+/* inquire_context */
+#define HV_SYS_inquire_context 30
+/* start_all_tiles */
+#define HV_SYS_start_all_tiles 31
+/* dev_open */
+#define HV_SYS_dev_open 32
+/* dev_close */
+#define HV_SYS_dev_close 33
+/* dev_pread */
+#define HV_SYS_dev_pread 34
+/* dev_pwrite */
+#define HV_SYS_dev_pwrite 35
+/* dev_poll */
+#define HV_SYS_dev_poll 36
+/* dev_poll_cancel */
+#define HV_SYS_dev_poll_cancel 37
+/* dev_preada */
+#define HV_SYS_dev_preada 38
+/* dev_pwritea */
+#define HV_SYS_dev_pwritea 39
+/* flush_remote */
+#define HV_SYS_flush_remote 40
+/* console_putc */
+#define HV_SYS_console_putc 41
+/* inquire_tiles */
+#define HV_SYS_inquire_tiles 42
+/* confstr */
+#define HV_SYS_confstr 43
+/* reexec */
+#define HV_SYS_reexec 44
+/* set_command_line */
+#define HV_SYS_set_command_line 45
+
+/* store_mapping */
+#define HV_SYS_store_mapping 52
+/* inquire_realpa */
+#define HV_SYS_inquire_realpa 53
+/* flush_all */
+#define HV_SYS_flush_all 54
+/* get_ipi_pte */
+#define HV_SYS_get_ipi_pte 55
+/* set_pte_super_shift */
+#define HV_SYS_set_pte_super_shift 56
+/* set_speed */
+#define HV_SYS_set_speed 57
+/* install_virt_context */
+#define HV_SYS_install_virt_context 58
+/* inquire_virt_context */
+#define HV_SYS_inquire_virt_context 59
+/* install_guest_context */
+#define HV_SYS_install_guest_context 60
+/* inquire_guest_context */
+#define HV_SYS_inquire_guest_context 61
+
+/*
+ * Base number for hypercalls (from guest OS to host OS) other than hv_*().
+ * We leave the previous 128 entries to the usual hv_*() calls
+ * as defined in hypervisor.h.
+ */
+#define KVM_OTHER_HCALL 128
+
+/* One greater than the maximum hypercall number. */
+#define KVM_NUM_HCALLS 256
+
+#ifndef __ASSEMBLER__
+
+struct kvm_regs {
+ struct pt_regs regs;
+};
+
+#define FOR_EACH_GUEST_SPR(f) \
+ f(INTERRUPT_MASK_1); \
+ f(INTERRUPT_VECTOR_BASE_1); \
+ f(EX_CONTEXT_1_0); \
+ f(EX_CONTEXT_1_1); \
+ f(SYSTEM_SAVE_1_0); \
+ f(SYSTEM_SAVE_1_1); \
+ f(SYSTEM_SAVE_1_2); \
+ f(SYSTEM_SAVE_1_3); \
+ f(INTCTRL_1_STATUS); \
+ f(IPI_MASK_1); \
+ f(IPI_EVENT_1); \
+ f(SINGLE_STEP_CONTROL_1); \
+ f(SINGLE_STEP_EN_1_1); \
+
+struct kvm_sregs {
+#define DECLARE_SPR(f) unsigned long f
+ FOR_EACH_GUEST_SPR(DECLARE_SPR)
+#undef DECLARE_SPR
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#ifndef __KERNEL__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
+#endif
+
+#define HCALL_DEFS \
+ /* For hv_*() */ \
+ KVM_EMULATE(init) \
+ NO_EMULATE(install_context) \
+ KVM_EMULATE(sysconf) \
+ KVM_EMULATE(get_rtc) \
+ KVM_EMULATE(set_rtc) \
+ NO_EMULATE(flush_asid) \
+ NO_EMULATE(flush_page) \
+ NO_EMULATE(flush_pages) \
+ USER_EMULATE(restart) \
+ USER_EMULATE(halt) \
+ USER_EMULATE(power_off) \
+ USER_EMULATE(inquire_physical) \
+ USER_EMULATE(inquire_memory_controller) \
+ KVM_EMULATE(inquire_virtual) \
+ KVM_EMULATE(inquire_asid) \
+ NO_EMULATE(console_read_if_ready) \
+ NO_EMULATE(console_write) \
+ NO_EMULATE(downcall_dispatch) \
+ KVM_EMULATE(inquire_topology) \
+ USER_EMULATE(fs_findfile) \
+ USER_EMULATE(fs_fstat) \
+ USER_EMULATE(fs_pread) \
+ KVM_EMULATE(physaddr_read64) \
+ KVM_EMULATE(physaddr_write64) \
+ USER_EMULATE(get_command_line) \
+ USER_EMULATE(set_caching) \
+ NO_EMULATE(bzero_page) \
+ KVM_EMULATE(register_message_state) \
+ KVM_EMULATE(send_message) \
+ KVM_EMULATE(receive_message) \
+ KVM_EMULATE(inquire_context) \
+ KVM_EMULATE(start_all_tiles) \
+ USER_EMULATE(dev_open) \
+ USER_EMULATE(dev_close) \
+ USER_EMULATE(dev_pread) \
+ USER_EMULATE(dev_pwrite) \
+ USER_EMULATE(dev_poll) \
+ USER_EMULATE(dev_poll_cancel) \
+ USER_EMULATE(dev_preada) \
+ USER_EMULATE(dev_pwritea) \
+ USER_EMULATE(flush_remote) \
+ NO_EMULATE(console_putc) \
+ KVM_EMULATE(inquire_tiles) \
+ KVM_EMULATE(confstr) \
+ USER_EMULATE(reexec) \
+ USER_EMULATE(set_command_line) \
+ USER_EMULATE(store_mapping) \
+ NO_EMULATE(inquire_realpa) \
+ NO_EMULATE(flush_all) \
+ KVM_EMULATE(get_ipi_pte) \
+ KVM_EMULATE(set_pte_super_shift) \
+ KVM_EMULATE(set_speed) \
+
+#endif
+
+#endif /* _UAPI_ASM_TILE_KVM_H */
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 97ea6ac..0a04a16 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -20,6 +20,9 @@
#include <linux/hardirq.h>
#include <linux/ptrace.h>
#include <hv/hypervisor.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
/* Check for compatible compiler early in the build. */
#ifdef CONFIG_TILEGX
@@ -68,6 +71,10 @@ void foo(void)
DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
offsetof(struct thread_info, unalign_jit_tmp));
#endif
+#ifdef CONFIG_KVM
+ DEFINE(THREAD_INFO_VCPU_OFFSET,
+ offsetof(struct thread_info, vcpu));
+#endif
DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
offsetof(struct task_struct, thread.ksp));
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
index 16576c6..dc5b417 100644
--- a/arch/tile/kernel/hvglue.S
+++ b/arch/tile/kernel/hvglue.S
@@ -71,5 +71,10 @@ gensym hv_flush_all, 0x6e0, 32
gensym hv_get_ipi_pte, 0x700, 32
gensym hv_set_pte_super_shift, 0x720, 32
gensym hv_set_speed, 0x740, 32
+gensym hv_install_virt_context, 0x760, 32
+gensym hv_inquire_virt_context, 0x780, 32
+gensym hv_install_guest_context, 0x7a0, 32
+gensym hv_inquire_guest_context, 0x7c0, 32
gensym hv_console_set_ipi, 0x7e0, 32
-gensym hv_glue_internals, 0x800, 30720
+gensym hv_glue_internals, 0x800, 2048
+gensym hv_hcall_internals, 0x1020, 28640
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
index 16ef6c1..3b15c76 100644
--- a/arch/tile/kernel/hvglue_trace.c
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -75,6 +75,10 @@
#define hv_get_ipi_pte _hv_get_ipi_pte
#define hv_set_pte_super_shift _hv_set_pte_super_shift
#define hv_set_speed _hv_set_speed
+#define hv_install_virt_context _hv_install_virt_context
+#define hv_inquire_virt_context _hv_inquire_virt_context
+#define hv_install_guest_context _hv_install_guest_context
+#define hv_inquire_guest_context _hv_inquire_guest_context
#define hv_console_set_ipi _hv_console_set_ipi
#include <hv/hypervisor.h>
#undef hv_init
@@ -135,6 +139,10 @@
#undef hv_get_ipi_pte
#undef hv_set_pte_super_shift
#undef hv_set_speed
+#undef hv_install_virt_context
+#undef hv_inquire_virt_context
+#undef hv_install_guest_context
+#undef hv_inquire_guest_context
#undef hv_console_set_ipi
/*
@@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
unsigned long, flags)
HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
+ HV_ASID, asid, __hv32, flags)
HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP0(HV_Context, hv_inquire_virt_context)
+HV_WRAP0(HV_Context, hv_inquire_guest_context)
HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f3d26f4..8ac6072 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -806,7 +806,7 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnz r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnz r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
lw r29, r29
@@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
seq r27, r27, r28
}
{
- bbns r27, .Lrestore_all
+ bbns r27, restore_all
addi r28, r28, 8
}
sw r29, r28
- j .Lrestore_all
+ j restore_all
.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
auli r1, r1, ha16(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- bzt r1, .Lrestore_all
+ bzt r1, restore_all
/*
* Make sure we have all the registers saved for signal
@@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
* profile interrupt will actually disable interrupts in both SPRs
* before returning, which is OK.)
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
lw r0, r0
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 3b35bb4..45647a4 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -29,6 +29,10 @@
#include <arch/abi.h>
#include <arch/interrupts.h>
#include <arch/spr_def.h>
+#include <arch/opcode.h>
+#ifdef CONFIG_KVM
+#include <asm/kvm_host.h>
+#endif
#define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
@@ -347,10 +351,6 @@ intvec_\vecname:
*
* Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
* any path that turns into a downcall to one of our TLB handlers.
- *
- * FIXME: if we end up never using this path, perhaps we should
- * prevent the hypervisor from generating downcalls in this case.
- * The advantage of getting a downcall is we can panic in Linux.
*/
mfspr r0, SPR_SYSTEM_SAVE_K_2
{
@@ -490,6 +490,10 @@ intvec_\vecname:
mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
.else
+ .ifc \c_routine, kvm_vpgtable_miss
+ mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
+ mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
+ .else
.ifc \vecnum, INT_ILL_TRANS
mfspr r2, ILL_VA_PC
.else
@@ -512,6 +516,7 @@ intvec_\vecname:
.endif
.endif
.endif
+ .endif
/* Put function pointer in r0 */
moveli r0, hw2_last(\c_routine)
shl16insli r0, r0, hw1(\c_routine)
@@ -641,24 +646,25 @@ intvec_\vecname:
/*
* If we will be returning to the kernel, we will need to
* reset the interrupt masks to the state they had before.
- * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+ * Set DISABLE_IRQ in flags iff we came from kernel pl with
+ * irqs disabled.
*/
- mfspr r32, SPR_EX_CONTEXT_K_1
+ mfspr r22, SPR_EX_CONTEXT_K_1
{
andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
}
- beqzt r32, 1f /* zero if from user space */
- IRQS_DISABLED(r32) /* zero if irqs enabled */
+ beqzt r22, 1f /* zero if from user space */
+ IRQS_DISABLED(r22) /* zero if irqs enabled */
#if PT_FLAGS_DISABLE_IRQ != 1
# error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
#endif
1:
.ifnc \function,handle_syscall
/* Record the fact that we saved the caller-save registers above. */
- ori r32, r32, PT_FLAGS_CALLER_SAVES
+ ori r22, r22, PT_FLAGS_CALLER_SAVES
.endif
- st r21, r32
+ st r21, r22
/*
* we've captured enough state to the stack (including in
@@ -698,12 +704,29 @@ intvec_\vecname:
move tp, zero
#endif
+ /*
+ * Prepare the first 256 stack bytes to be rapidly accessible
+ * without having to fetch the background data.
+ */
+ addi r52, sp, -64
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ {
+ wh64 r52
+ addi r52, r52, -64
+ }
+ wh64 r52
+
#ifdef __COLLECT_LINKER_FEEDBACK__
/*
* Notify the feedback routines that we were in the
- * appropriate fixed interrupt vector area. Note that we
- * still have ICS set at this point, so we can't invoke any
- * atomic operations or we will panic. The feedback
+ * appropriate fixed interrupt vector area. The feedback
* routines internally preserve r0..r10 and r30 up.
*/
.ifnc \function,handle_syscall
@@ -722,23 +745,15 @@ intvec_\vecname:
#endif
/*
- * Prepare the first 256 stack bytes to be rapidly accessible
- * without having to fetch the background data.
+ * Stash any interrupt state in r30..r33 for now.
+ * This makes it easier to call C code in the code that follows.
+ * We don't need to on the syscall path since we reload
+ * them from the stack instead.
*/
- addi r52, sp, -64
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- {
- wh64 r52
- addi r52, r52, -64
- }
- wh64 r52
+ .ifnc \function,handle_syscall
+ { move r30, r0; move r31, r1 }
+ { move r32, r2; move r33, r3 }
+ .endif
#ifdef CONFIG_TRACE_IRQFLAGS
.ifnc \function,handle_nmi
@@ -749,17 +764,8 @@ intvec_\vecname:
* For syscalls, we already have the register state saved away
* on the stack, so we don't bother to do any register saves here,
* and later we pop the registers back off the kernel stack.
- * For interrupt handlers, save r0-r3 in callee-saved registers.
*/
- .ifnc \function,handle_syscall
- { move r30, r0; move r31, r1 }
- { move r32, r2; move r33, r3 }
- .endif
TRACE_IRQS_OFF
- .ifnc \function,handle_syscall
- { move r0, r30; move r1, r31 }
- { move r2, r32; move r3, r33 }
- .endif
.endif
#endif
@@ -808,7 +814,7 @@ handle_interrupt:
STD_ENTRY(interrupt_return)
/* If we're resuming to kernel space, don't check thread flags. */
{
- bnez r30, .Lrestore_all /* NMIs don't special-case user-space */
+ bnez r30, restore_all /* NMIs don't special-case user-space */
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
ld r29, r29
@@ -824,14 +830,25 @@ STD_ENTRY(interrupt_return)
addli r28, r29, THREAD_INFO_FLAGS_OFFSET
{
ld r28, r28
- addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+ addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
}
{
- andi r28, r28, _TIF_NEED_RESCHED
- ld4s r29, r29
+ andi r27, r28, _TIF_NEED_RESCHED
+ ld4s r26, r26
}
- beqzt r28, 1f
- bnez r29, 1f
+ beqzt r27, 1f
+ bnez r26, 1f
+#ifdef CONFIG_KVM
+ addli r27, r29, THREAD_INFO_VCPU_OFFSET
+ ld r27, r27
+ {
+ beqzt r27, 0f
+ movei r1, KVM_EXIT_AGAIN
+ }
+ push_extra_callee_saves r0
+ j kvm_trigger_vmexit
+0:
+#endif
jal preempt_schedule_irq
FEEDBACK_REENTER(interrupt_return)
1:
@@ -853,11 +870,11 @@ STD_ENTRY(interrupt_return)
cmpeq r27, r27, r28
}
{
- blbc r27, .Lrestore_all
+ blbc r27, restore_all
addi r28, r28, 8
}
st r29, r28
- j .Lrestore_all
+ j restore_all
.Lresume_userspace:
FEEDBACK_REENTER(interrupt_return)
@@ -897,7 +914,7 @@ STD_ENTRY(interrupt_return)
shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
}
and r1, r29, r1
- beqzt r1, .Lrestore_all
+ beqzt r1, restore_all
/*
* Make sure we have all the registers saved for signal
@@ -929,7 +946,9 @@ STD_ENTRY(interrupt_return)
* ICS can only be used in very tight chunks of code to avoid
* tripping over various assertions that it is off.
*/
-.Lrestore_all:
+ .global restore_all
+ .type restore_all, @function
+restore_all:
PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
{
ld r0, r0
@@ -1457,6 +1476,26 @@ int_unalign:
j do_unaligned
ENDPROC(hand_unalign_slow)
+#ifdef CONFIG_KVM
+/*
+ * Any call path that may lead to a vmexit needs to save the full
+ * callee-save register state, since if we vmexit we don't unwind
+ * the callee-saves from the C function stack frames, and instead
+ * just save away the register state from the interrupt handler as-is
+ * and later reload it directly and call back into the guest.
+ */
+ .macro save_callee_saves_and_tailcall func
+kvm_\func:
+ push_extra_callee_saves r0
+ j kvm_do_\func
+ ENDPROC(\func)
+ .endm
+
+ save_callee_saves_and_tailcall hypervisor_call
+ save_callee_saves_and_tailcall vpgtable_miss
+ save_callee_saves_and_tailcall vguest_fatal
+#endif
+
/* Fill the return address stack with nonzero entries. */
STD_ENTRY(fill_ra_stack)
{
@@ -1469,6 +1508,48 @@ STD_ENTRY(fill_ra_stack)
4: jrp r0
STD_ENDPROC(fill_ra_stack)
+#ifdef CONFIG_KVM
+/*
+ * Handle the downcall dispatch service. On entry, the client's
+ * system save register 3 holds the original contents of
+ * REG_SYSCALL_NR_NAME, which we need to restore before we iret to
+ * the correct interrupt vector.
+ * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
+ * here, since this is the only interrupt handled this way on GX.
+ */
+handle_downcall_dispatch:
+ /*
+ * If we were called from PL0, jump back to slow path.
+ * We check just the low bit to make sure it's set, since we
+ * can only be called from PL0 or PL1.
+ */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
+ blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0
+
+ /* Set the PC to the downcall interrupt vector, and PL to guest. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
+ addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
+ INT_MESSAGE_RCV_DWNCL << 8
+ {
+ mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
+ movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
+ }
+ mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
+
+ /* Restore REG_SYSCALL_NR_NAME and return to the new vector. */
+ mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
+ iret
+
+ .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \
+ processing=handle_interrupt
+ .org (\vecnum << 8)
+ /* Need special code for downcall dispatch syscall. */
+ beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
+ __int_hand \vecnum, \vecname, \c_routine, \processing
+ .endm
+
+#endif /* CONFIG_KVM */
+
.macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt
.org (\vecnum << 8)
__int_hand \vecnum, \vecname, \c_routine, \processing
@@ -1484,6 +1565,11 @@ STD_ENTRY(fill_ra_stack)
#define do_hardwall_trap bad_intr
#endif
+#ifndef CONFIG_KVM
+#define kvm_vpgtable_miss bad_intr
+#define kvm_vguest_fatal bad_intr
+#endif
+
int_hand INT_MEM_ERROR, MEM_ERROR, do_trap
int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
#if CONFIG_KERNEL_PL == 2
@@ -1504,7 +1590,11 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_SWINT_3, SWINT_3, do_trap
int_hand INT_SWINT_2, SWINT_2, do_trap
int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
+#ifdef CONFIG_KVM
+ int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
+#else
int_hand INT_SWINT_0, SWINT_0, do_trap
+#endif
int_hand INT_ILL_TRANS, ILL_TRANS, do_trap
int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault
@@ -1541,8 +1631,10 @@ STD_ENTRY(fill_ra_stack)
int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
hv_message_intr
int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
- int_hand INT_I_ASID, I_ASID, bad_intr
- int_hand INT_D_ASID, D_ASID, bad_intr
+ int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
+ kvm_vpgtable_miss
+ int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
+ kvm_vguest_fatal
int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
/* Synthetic interrupt delivered only by the simulator */
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 44cdc4a..7040490 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -27,6 +27,7 @@
#include <linux/kernel.h>
#include <linux/tracehook.h>
#include <linux/signal.h>
+#include <linux/kvm_host.h>
#include <asm/stack.h>
#include <asm/switch_to.h>
#include <asm/homecache.h>
@@ -450,6 +451,11 @@ void _prepare_arch_switch(struct task_struct *next)
struct task_struct *__sched _switch_to(struct task_struct *prev,
struct task_struct *next)
{
+#ifdef CONFIG_KVM
+ /* vmexit is needed before context switch. */
+ BUG_ON(task_thread_info(prev)->vcpu);
+#endif
+
/* DMA state is already saved; save off other arch state. */
save_arch_state(&prev->thread);
@@ -519,6 +525,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
/* Enable interrupts; they are disabled again on return to caller. */
local_irq_enable();
+#ifdef CONFIG_KVM
+ /*
+ * Some work requires us to exit the VM first. Typically this
+ * allows the process running the VM to respond to the work
+ * (e.g. a signal), or allows the VM mechanism to latch
+ * modified host state (e.g. a "hypervisor" message sent to a
+ * different vcpu). It also means that if we are considering
+ * calling schedule(), we exit the VM first, so we never have
+ * to worry about context-switching into a VM.
+ */
+ if (current_thread_info()->vcpu) {
+ u32 do_exit = thread_info_flags &
+ (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
+
+ if (thread_info_flags & _TIF_VIRT_EXIT)
+ clear_thread_flag(TIF_VIRT_EXIT);
+ if (do_exit) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
+ /*NORETURN*/
+ }
+ }
+#endif
+
if (thread_info_flags & _TIF_NEED_RESCHED) {
schedule();
return 1;
@@ -538,11 +567,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
tracehook_notify_resume(regs);
return 1;
}
- if (thread_info_flags & _TIF_SINGLESTEP) {
+
+ /* Handle a few flags here that stay set. */
+ if (thread_info_flags & _TIF_SINGLESTEP)
single_step_once(regs);
- return 0;
- }
- panic("work_pending: bad flags %#x\n", thread_info_flags);
+
+ return 0;
}
unsigned long get_wchan(struct task_struct *p)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 774e819..7918cf1 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
* SPRs, as well as the interrupt mask.
*/
__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
+
+#ifdef CONFIG_KVM
+ /*
+ * If we launch a guest kernel, it will need some interrupts
+ * that otherwise are not used by the host or by userspace.
+ * Set them to MPL 1 now and leave them alone going forward;
+ * they are masked in the host so will never fire there anyway,
+ * and we mask them at PL1 as we exit the guest.
+ */
__insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
+ __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
+#endif
/* Initialize IRQ support for this cpu. */
setup_irq_regs();
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 0ae1c59..62b3ba9 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -223,30 +223,34 @@ void __init ipi_init(void)
#if CHIP_HAS_IPI()
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- WARN_ON(cpu_is_offline(cpu));
-
/*
* We just want to do an MMIO store. The traditional writeq()
* functions aren't really correct here, since they're always
* directed at the PCI shim. For now, just do a raw store,
- * casting away the __iomem attribute.
+ * casting away the __iomem attribute. We do the store as a
+ * single asm() instruction to ensure that we can force a step
+ * over it in the KVM case, if we are not binding vcpus to cpus,
+ * rather than require it to be possible to issue validly.
*/
- ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
+ unsigned long *addr =
+ &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
+ asm volatile("st %0, zero" :: "r" (addr));
}
#else
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
{
- HV_Coord coord;
-
- WARN_ON(cpu_is_offline(cpu));
-
- coord.y = cpu_y(cpu);
- coord.x = cpu_x(cpu);
+ HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
hv_trigger_ipi(coord, IRQ_RESCHEDULE);
}
#endif /* CHIP_HAS_IPI() */
+
+void smp_send_reschedule(int cpu)
+{
+ WARN_ON(cpu_is_offline(cpu));
+ __smp_send_reschedule(cpu);
+}
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 24fd223..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
p->sp >= sp) {
if (kbt->verbose)
pr_err(" <%s while in kernel mode>\n", fault);
- } else if (EX1_PL(p->ex1) == USER_PL &&
+ } else if (user_mode(p) &&
p->sp < PAGE_OFFSET && p->sp != 0) {
if (kbt->verbose)
pr_err(" <%s while in user mode>\n", fault);
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 2298cb1..65f7f9d 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -27,9 +27,6 @@ config KVM
This module provides access to the hardware capabilities through
a character device node named /dev/kvm.
- To compile this as a module, choose M here: the module
- will be called kvm.
-
If unsure, say N.
source drivers/vhost/Kconfig
diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
new file mode 100644
index 0000000..2c3d206
--- /dev/null
+++ b/arch/tile/kvm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
+
+kvm-y += kvm-tile.o
+kvm-y += entry.o
+
+obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
new file mode 100644
index 0000000..07aa3a6
--- /dev/null
+++ b/arch/tile/kvm/entry.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/switch_to.h>
+#include <asm/processor.h>
+#include <arch/spr_def.h>
+#include <arch/abi.h>
+
+#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8)
+#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
+#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
+#define FOR_EACH_CALLEE_SAVED_REG(f) \
+ f(r30); f(r31); \
+ f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \
+ f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
+ f(r48); f(r49); f(r50); f(r51); f(r52);
+
+/*
+ * Called with interrupts disabled from kvm_tile_run() and is responsible
+ * just for saving the callee-save registers and the stack pointer, then
+ * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
+ * It uses restore_all in intvec_64.S to jump back into the guest.
+ * The kvm_vmexit function below undoes the stack manipulation.
+ */
+STD_ENTRY(kvm_vmresume)
+ /* Do function prolog and save callee-saves on stack. */
+ {
+ move r10, sp
+ st sp, lr
+ }
+ {
+ addli r11, sp, -FRAME_SIZE + 8
+ addli sp, sp, -FRAME_SIZE
+ }
+ {
+ st r11, r10
+ addi r12, sp, 16
+ }
+ FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
+ SAVE_REG(tp)
+ SAVE_REG(lr)
+
+ /* Save frame pointer in thread_info so we can get it back later. */
+ st r1, sp
+
+ /* Set the ksp0 for this core to be below this frame. */
+ mfspr r10, SPR_SYSTEM_SAVE_K_0
+ bfins r10, sp, 0, CPU_SHIFT-1
+ mtspr SPR_SYSTEM_SAVE_K_0, r10
+
+ /* sp points to ABI save area below pt_regs for restore_all. */
+ addli sp, r0, -C_ABI_SAVE_AREA_SIZE
+
+ /* Execute an "interrupt return" to the guest. */
+ {
+ movei r30, 0
+ j restore_all
+ }
+ STD_ENDPROC(kvm_vmresume)
+
+/*
+ * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
+ * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
+ * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller
+ * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
+ */
+STD_ENTRY(kvm_vmexit)
+ {
+ move sp, r0
+ addi r12, r0, 16
+ }
+ FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
+ LOAD_REG(tp)
+ LOAD_REG(lr)
+ {
+ addli sp, sp, FRAME_SIZE
+ jrp lr
+ }
+ STD_ENDPROC(kvm_vmexit)
diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
new file mode 100644
index 0000000..e22d4ad
--- /dev/null
+++ b/arch/tile/kvm/kvm-tile.c
@@ -0,0 +1,1529 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <asm/traps.h>
+#include <asm/pgalloc.h>
+#include <hv/hypervisor.h>
+#include <linux/rtc.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+#include <arch/spr_def.h>
+#include <arch/sim.h>
+#include <generated/utsrelease.h>
+
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { NULL }
+};
+
+static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
+{
+ struct mm_struct *mm = kvm->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (kvm->arch.vpgd == NULL)
+ kvm->arch.vpgd = pgd_alloc(kvm->mm);
+ pgd = kvm->arch.vpgd + pgd_index(address);
+ pud = pud_alloc(mm, pgd, address);
+ if (!pud)
+ return NULL;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ return NULL;
+ return pte_alloc_kernel(pmd, address);
+}
+
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ return 0;
+}
+
+/* FIXME: support huge pages. */
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+{
+ unsigned long gpa, i;
+
+ gpa = mem->guest_phys_addr;
+ for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE)
+ if (get_vpgd_pte(kvm, gpa) == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ enum kvm_mr_change change)
+{
+ unsigned long gpa, address, pfn, i;
+ struct page *page[1];
+ pte_t *ptep, *vptep;
+
+ gpa = mem->guest_phys_addr;
+ address = mem->userspace_addr;
+ for (i = 0; i < mem->memory_size;
+ i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
+ vptep = get_vpgd_pte(kvm, gpa);
+ BUG_ON(vptep == NULL);
+ get_user_pages_fast(address, 1, 1, page);
+ pfn = page_to_pfn(page[0]);
+ ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
+ *vptep = *ptep;
+ }
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_arch_flush_shadow_all(kvm);
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ return 0;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq)
+{
+ if (irq < 0)
+ return -EINVAL;
+
+ set_bit(irq, &vcpu->arch.ipi_events);
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = 0;
+
+ switch (ioctl) {
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof(irq)))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+
+out:
+ return r;
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+ return 0;
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+ struct kvm_dirty_log *log)
+{
+ return 0;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ long r = -EINVAL;
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long page_size;
+ unsigned long gva = tr->linear_address;
+ unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
+ pud_t gpud;
+ pmd_t gpmd;
+ pte_t gpte;
+
+ /* Get guest pgd (aka pud for three-level tables). */
+ gpgd_gpa = vcpu->arch.guest_context.page_table +
+ (sizeof(pgd_t) * pgd_index(gva));
+ if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
+ goto fail;
+ if (!pud_present(gpud))
+ goto fail;
+
+ /* Get guest pmd. */
+ if (pud_huge_page(gpud)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpud))
+ goto fail;
+ gpte = *(pte_t *)&gpud;
+ page_size = PGDIR_SIZE;
+ goto ok;
+ }
+ gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pmd_t) * pmd_index(gva));
+ if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
+ goto fail;
+ if (!pmd_present(gpmd))
+ goto fail;
+
+ /* Get guest pte. */
+ if (pmd_huge_page(gpmd)) {
+ /* FIXME: no super huge page support yet. */
+ if (pte_super(*(pte_t *)&gpmd))
+ goto fail;
+ gpte = *(pte_t *)&gpmd;
+ page_size = PMD_SIZE;
+ goto ok;
+ }
+ gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
+ (sizeof(pte_t) * pte_index(gva));
+ if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
+ goto fail;
+ if (!pte_present(gpte))
+ goto fail;
+
+ page_size = PAGE_SIZE;
+
+ok:
+ tr->physical_address =
+ PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1));
+ tr->valid = 1;
+ tr->writeable = pte_write(gpte);
+ tr->usermode = pte_user(gpte);
+
+ return 0;
+
+fail:
+ tr->valid = 0;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ regs->regs = vcpu->arch.regs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.regs = regs->regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ *sregs = vcpu->arch.sregs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu->arch.sregs = *sregs;
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ return 0;
+}
+
+/*
+ * panic_hv() will dump stack info of both guest os and host os, and set
+ * proper exit reason so that qemu can terminate the guest process.
+ */
+static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
+{
+ char panic_buf[256];
+ struct pt_regs *regs;
+ va_list ap;
+ int i;
+
+ va_start(ap, fmt);
+ vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
+ va_end(ap);
+ pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
+
+ /* Show guest os info */
+ regs = &vcpu->arch.regs;
+ for (i = 0; i < 17; i++)
+ pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
+ i, regs->regs[i], i+18, regs->regs[i+18],
+ i+36, regs->regs[i+36]);
+ pr_err(" r18: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+ regs->regs[18], regs->regs[35], regs->tp);
+ pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
+ pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n",
+ regs->pc, regs->ex1, regs->faultnum);
+
+ /* Show host os info */
+ pr_err("\nKVM stack in the host:\n");
+ dump_stack();
+
+ /* Shut down the guest os */
+ pr_err("Shutting down guest.\n");
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+/* Copied from virt/kvm/kvm_main.c */
+static int next_segment(unsigned long len, int offset)
+{
+ if (len > PAGE_SIZE - offset)
+ return PAGE_SIZE - offset;
+ else
+ return len;
+}
+
+static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ void *data, unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ const void *data, unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ data, offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ data += seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+ unsigned long len)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int seg;
+ int offset = offset_in_page(gva);
+ int ret;
+
+ while ((seg = next_segment(len, offset)) != 0) {
+ struct kvm_translation tr;
+ tr.linear_address = gva;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return -EFAULT;
+ ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address),
+ offset, seg);
+ if (ret < 0)
+ return ret;
+ offset = 0;
+ len -= seg;
+ gva += seg;
+ }
+ return 0;
+}
+
+/*
+ * The following functions are emulation functions for various
+ * hypervisor system calls (i.e. hv_*()). Return value:
+ * 1 if the host os can emulate it completely.
+ * < 0 if errors occur and then qemu will handle them.
+ * 0 if qemu emulation is needed.
+ * In both the < 0 and the == 0 cases, exit reason should
+ * be set for qemu handling.
+ */
+
+/* Generic handler for hypercalls that userspace (QEMU) must handle. */
+static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ return 0;
+}
+
+/* handler for illegal hypercall */
+static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
+{
+ return panic_hv(vcpu, "Illegal kvm hypercall: %ld",
+ (unsigned long)vcpu->arch.regs.regs[10]);
+}
+
+static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
+{
+ int version = vcpu->arch.regs.regs[0];
+ int chip_num = vcpu->arch.regs.regs[1];
+ int chip_rev_num = vcpu->arch.regs.regs[2];
+ int client_pl = vcpu->arch.regs.regs[3];
+
+ if (client_pl != 1)
+ return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
+ " guests must request PL 1.\n"
+ "Reconfigure your guest with KVM_GUEST set.\n",
+ client_pl);
+
+ if (version != HV_VERSION)
+ return panic_hv(vcpu, "Client built for hv version %d, but"
+ " this hv is version %d\n",
+ version, HV_VERSION);
+
+ if (chip_num != TILE_CHIP)
+ return panic_hv(vcpu, "Client built for chip %d, but this"
+ " hardware is chip %d\n",
+ chip_num, TILE_CHIP);
+
+ if (chip_rev_num != TILE_CHIP_REV)
+ return panic_hv(vcpu, "Client built for chip rev %d, but this"
+ " hardware is chip rev %d\n",
+ chip_rev_num, TILE_CHIP_REV);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+ long rc;
+
+ switch (query) {
+ case HV_SYSCONF_PAGE_SIZE_SMALL:
+ rc = PAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_LARGE:
+ rc = HPAGE_SIZE;
+ break;
+
+ case HV_SYSCONF_VALID_PAGE_SIZES:
+#if PAGE_SHIFT == 16
+ rc = HV_CTX_PG_SM_64K;
+#elif PAGE_SHIFT == 14
+ rc = HV_CTX_PG_SM_16K;
+#else
+# error Fix hv_sysconf emulation for new page size
+#endif
+ break;
+
+ case HV_SYSCONF_PAGE_SIZE_JUMBO:
+ rc = 0; /* FIXME add super page support */
+ break;
+
+ case HV_SYSCONF_CPU_SPEED:
+ case HV_SYSCONF_CPU_TEMP:
+ case HV_SYSCONF_BOARD_TEMP:
+ rc = hv_sysconf(query);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.regs.regs[0] = rc;
+ return 1;
+}
+
+static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
+{
+ HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+ long buflen = vcpu->arch.regs.regs[2];
+ char hvbuf[256];
+ const char *p;
+ long rc;
+
+ switch (query) {
+
+ /* For hardware attributes, just pass to the hypervisor. */
+ case HV_CONFSTR_BOARD_PART_NUM:
+ case HV_CONFSTR_BOARD_SERIAL_NUM:
+ case HV_CONFSTR_CHIP_SERIAL_NUM:
+ case HV_CONFSTR_BOARD_REV:
+ case HV_CONFSTR_CHIP_MODEL:
+ case HV_CONFSTR_BOARD_DESC:
+ case HV_CONFSTR_MEZZ_PART_NUM:
+ case HV_CONFSTR_MEZZ_SERIAL_NUM:
+ case HV_CONFSTR_MEZZ_REV:
+ case HV_CONFSTR_MEZZ_DESC:
+ case HV_CONFSTR_SWITCH_CONTROL:
+ case HV_CONFSTR_CHIP_REV:
+ case HV_CONFSTR_CPUMOD_PART_NUM:
+ case HV_CONFSTR_CPUMOD_SERIAL_NUM:
+ case HV_CONFSTR_CPUMOD_REV:
+ case HV_CONFSTR_CPUMOD_DESC:
+ rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
+ if (rc > sizeof(hvbuf)) {
+ /* Not the best answer, but very unlikely anyway. */
+ rc = sizeof(hvbuf);
+ hvbuf[sizeof(hvbuf)-1] = '\0';
+ }
+ p = hvbuf;
+ break;
+
+ /* For hypervisor version info, just report the kernel version. */
+ case HV_CONFSTR_HV_SW_VER:
+ p = UTS_RELEASE;
+ break;
+ case HV_CONFSTR_HV_CONFIG:
+ case HV_CONFSTR_HV_CONFIG_VER:
+ p = "";
+ break;
+
+ default:
+ rc = HV_EINVAL;
+ goto done;
+ }
+
+ rc = strlen(p) + 1; /* include NUL */
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
+ p, min(rc, buflen)))
+ rc = HV_EFAULT;
+
+done:
+ vcpu->arch.regs.regs[0] = rc;
+ return 1;
+}
+
+static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
+{
+ HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0];
+ struct rtc_time tm;
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ rtc_time_to_tm(tv.tv_sec, &tm);
+ hvtm->tm_sec = tm.tm_sec;
+ hvtm->tm_min = tm.tm_min;
+ hvtm->tm_hour = tm.tm_hour;
+ hvtm->tm_mday = tm.tm_mday;
+ hvtm->tm_mon = tm.tm_mon;
+ hvtm->tm_year = tm.tm_year;
+ hvtm->flags = 0;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
+{
+ /* Do nothing here. */
+ pr_warn("hv_set_rtc() will not work in kvm guest\n");
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0];
+ HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
+
+ switch (idx) {
+ case 0:
+ var->start = 0UL;
+ var->size = 0x20000000000UL;
+ break;
+ case 1:
+ var->start = 0xFFFFFFFF80000000UL;
+ var->size = 0x80000000UL;
+ break;
+ default:
+ var->start = 0UL;
+ var->size = 0UL;
+ break;
+ }
+
+ return 1;
+}
+
+/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */
+static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
+{
+ int idx = vcpu->arch.regs.regs[0];
+ HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
+
+ if (idx == 0) {
+ var->start = min_asid;
+ var->size = max_asid - min_asid + 1;
+ } else {
+ var->start = 0;
+ var->size = 0;
+ }
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
+{
+ HV_Topology *tp;
+ int cpus;
+
+ /* Depends on the definition of struct HV_Topology */
+ tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
+
+ cpus = atomic_read(&vcpu->kvm->online_vcpus);
+ tp->coord.x = vcpu->vcpu_id;
+ tp->coord.y = 0;
+ tp->width = cpus;
+ tp->height = 1;
+
+ return 1;
+}
+
+static int xy_to_vcpu(struct kvm *kvm, int x, int y)
+{
+ if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus))
+ return -1;
+ return x;
+}
+
+/*
+ * The primary vcpu is the one that initially runs while the others
+ * all block. It is the only one allowed to call hv_start_all_tiles().
+ * The other cpus are secondary.
+ */
+static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id != 0;
+}
+
+static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
+{
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (is_secondary_vcpu(vcpu) || completion_done(c))
+ return panic_hv(vcpu, "start_all_tiles() called again");
+ complete_all(c);
+ return 1;
+}
+
+static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa = vcpu->arch.regs.regs[0];
+ HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ gfn = gpa_to_gfn(gpa);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn))
+ return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+ gpa);
+ hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+ vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa = vcpu->arch.regs.regs[0];
+ HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1];
+ uint64_t val = vcpu->arch.regs.regs[2];
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ gfn = gpa_to_gfn(gpa);
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn))
+ return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+ gpa);
+ hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+ hv_physaddr_write64(hpa, *access, val);
+
+ return 1;
+}
+
+static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
+{
+ /* Do we care about the argument msgstate? */
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+/*
+ * NOTE: we may coalesce multiple messages with the same tag to the
+ * same recepient. Currently the only messages used by Linux are
+ * start/stop cpu (where coalescing is OK), and the smp_call_function()
+ * IPI message tag. In the latter case we rely on the generic
+ * smp_call_function code to properly handle this, and since it only
+ * uses the IPI as a way to wake up the generic list-walking code,
+ * it's OK if we coalesce several IPI deliveries before the recipient
+ * core takes action.
+ */
+static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *vcpui;
+ HV_Recipient recip[NR_CPUS];
+ HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
+ int nrecip = vcpu->arch.regs.regs[1];
+ int buflen = vcpu->arch.regs.regs[3];
+ int sent, vcpu_id, tag;
+
+ /* NOTE: we only support the Linux usage of buflen == sizeof(int). */
+ if (unlikely(buflen != sizeof(int) ||
+ nrecip >= atomic_read(&kvm->online_vcpus))) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ /* Get the buf info */
+ if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &tag, sizeof(tag))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ /* Range-check the tag value. */
+ if (tag < 0 || tag >= MAX_MSG_TAG) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ /* Get all the recipients */
+ if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
+ nrecip * sizeof(HV_Recipient))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ for (sent = 0; sent < nrecip; sent++) {
+ if (recip[sent].state != HV_TO_BE_SENT)
+ continue;
+ vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
+ if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
+ recip[sent].state = HV_BAD_RECIP;
+ continue;
+ }
+ vcpui = kvm_get_vcpu(kvm, vcpu_id);
+ set_bit(tag, &vcpui->arch.pending_msgs);
+ kvm_vcpu_kick(vcpui);
+ recip[sent].state = HV_SENT;
+ }
+
+ if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
+ nrecip * sizeof(HV_Recipient))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = sent;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
+{
+ HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
+ int buflen = vcpu->arch.regs.regs[3];
+ int tag;
+
+ /* Currently we only support messages from other tiles. */
+ rmi->source = HV_MSG_TILE;
+
+ if (buflen <= sizeof(int)) {
+ rmi->msglen = HV_E2BIG;
+ return 1;
+ }
+
+ tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
+ if (tag >= MAX_MSG_TAG) {
+ /* No more messages */
+ rmi->msglen = 0;
+ return 1;
+ }
+
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &tag, sizeof(int))) {
+ rmi->msglen = HV_EFAULT;
+ return 1;
+ }
+
+ /*
+ * This clear_bit could race with a set_bit as another core
+ * delivers a new smp_function_call to this core. However,
+ * the smp_function_call code will have set up the additional
+ * smp_function_call data on the kernel's list prior to
+ * raising the interrupt, so even if we lose the new
+ * interrupt due to the race, we still haven't dispatched
+ * to the original interrupt handler, and when we do, it
+ * will find both smp_function_calls waiting for it, so the
+ * race is harmless. This is consistent with the fact that
+ * the generic code is trying to support pretty much
+ * arbitrary architecture-dependent IPI semantics, so it
+ * is very conservative about what it assumes.
+ *
+ * Also note that we only clear_bit on the core that owns
+ * the mask, so there's no race condition caused by the
+ * find_first_bit above and the clear_bit here, since once
+ * a bit is found it will stay set until this point.
+ */
+ clear_bit(tag, &vcpu->arch.pending_msgs);
+ rmi->msglen = sizeof(int);
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0];
+
+ *ctx = hv_inquire_guest_context();
+
+ return 1;
+}
+
+static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ HV_InqTileSet set = vcpu->arch.regs.regs[0];
+ unsigned long gva = vcpu->arch.regs.regs[1];
+ int length = vcpu->arch.regs.regs[2];
+ struct cpumask mask = CPU_MASK_NONE;
+ int cpus, i, retval, bytes2copy, bytes2zero;
+
+ switch (set) {
+ case HV_INQ_TILES_AVAIL:
+ case HV_INQ_TILES_HFH_CACHE:
+ case HV_INQ_TILES_LOTAR:
+ cpus = atomic_read(&kvm->online_vcpus);
+ for (i = 0; i < cpus; ++i)
+ cpumask_set_cpu(i, &mask);
+ break;
+ case HV_INQ_TILES_SHARED:
+ break;
+ default:
+ retval = HV_EINVAL;
+ goto done;
+ }
+
+ bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
+ bytes2zero = length - bytes2copy;
+
+ if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
+ retval = HV_EFAULT;
+ goto done;
+ }
+
+ if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
+ retval = HV_EFAULT;
+ goto done;
+ }
+
+ retval = HV_OK;
+done:
+ vcpu->arch.regs.regs[0] = retval;
+ return 1;
+}
+
+static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
+{
+ HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0];
+ int pl = (int) vcpu->arch.regs.regs[1];
+ struct kvm_vcpu *target_vcpu;
+ int vcpu_id;
+
+ vcpu_id = vtarget.x;
+ if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
+ vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+ return 1;
+ }
+
+ target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+ if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+ &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) {
+ vcpu->arch.regs.regs[0] = HV_EFAULT;
+ return 1;
+ }
+
+ vcpu->arch.regs.regs[0] = HV_OK;
+
+ return 1;
+}
+
+struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
+{
+ struct kvm_vcpu *vcpui;
+ unsigned long idx;
+
+ kvm_for_each_vcpu(idx, vcpui, kvm)
+ if (vcpui->arch.ipi_gpa == gpa)
+ return vcpui;
+
+ return NULL;
+}
+
+/*
+ * Most page faults are delivered as downcalls from the hypervisor and
+ * handled directly by either the guest OS or the host OS. This
+ * function handles the remaining cases.
+ */
+static int handle_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_translation tr;
+ struct kvm_vcpu *ipi_vcpu;
+
+ tr.linear_address = (__u64) vcpu->arch.fault_addr;
+ kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ if (!tr.valid)
+ return 0;
+
+ /* ipi PTE for rescheduling interrupt? */
+ ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
+ if (!ipi_vcpu)
+ return 0;
+
+ set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
+ kvm_vcpu_kick(ipi_vcpu);
+
+ /* Juke the PC past the store instruction. */
+ vcpu->arch.regs.pc += 8;
+ return 1;
+}
+
+static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We do not expect this call from the guest; the guest OS should
+ * follow the host's page-size settings rather than setting its own.
+ * Besides, with the current guest configuration,
+ * hv_set_pte_super_shift() is never called in the guest anyway.
+ */
+ vcpu->arch.regs.regs[0] = HV_EINVAL;
+
+ return 1;
+}
+
+static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
+{
+ HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0];
+
+ hvss->new_speed = HV_EPERM;
+ hvss->end_cycle = 0;
+ hvss->delta_ns = 0;
+
+ return 1;
+}
+
+static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
+ HCALL_DEFS
+};
+
+static int kvm_handle_exit(struct kvm_vcpu *vcpu)
+{
+ unsigned long hcall_idx;
+
+ switch (vcpu->run->exit_reason) {
+ case KVM_EXIT_HYPERCALL:
+ hcall_idx = vcpu->arch.regs.regs[10];
+ if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
+ hcall_handlers[hcall_idx] == NULL))
+ return kvm_emulate_illegal(vcpu);
+
+ /* Juke us past the swint0 when we return. */
+ vcpu->arch.regs.pc += 8;
+
+ return hcall_handlers[hcall_idx](vcpu);
+
+ case KVM_EXIT_MMIO:
+ if (handle_mmio(vcpu))
+ return 1;
+ return panic_hv(vcpu, "Out-of-bounds client memory access");
+
+ case KVM_EXIT_AGAIN:
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return !test_and_set_bit(KVM_REQ_KICK, &vcpu->requests);
+}
+
+/*
+ * Any interrupt that would normally be handled by the host at PL2
+ * needs to be reassigned to the guest at PL1 as we enter.
+ *
+ * The TLB interrupts remain handled by the hypervisor and are downcalled
+ * to the appropriate host or guest as necessary.
+ *
+ * FIXME: We don't give the UDN interrupts for now; at some point we
+ * plan to allow an option to pin the vcpus and report the true
+ * geometry to the guest, at which point passing the UDN access would
+ * make sense.
+ *
+ * FIXME: For now we don't pass the profiling interrupts to the guest,
+ * and instead require profiling be run in the host; we should be able
+ * to support guest-level profiling pretty easily, but we need to
+ * think about whether there are vcpu migration issues there.
+ */
+static void kvm_grant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_1, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_1, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
+}
+
+static void kvm_ungrant_mpls(void)
+{
+ __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_SET_2, 1);
+ __insn_mtspr(SPR_MPL_GPV_SET_2, 1);
+ __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
+ __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
+}
+
+/*
+ * There is lots of state that is (for the non-virtualized case) held
+ * permanently in SPRs, or that is in any case not context-switched.
+ * The next two routines switch in and out all the SPR state.
+ *
+ * We try to fix the timer so that when we restart, we fix up the
+ * timer value so that it will fire at the correct wall-clock time even
+ * if we have been scheduled out for a little bit. This may also
+ * mean we end up firing it immediately on return, and suffer a
+ * timer delay in the guest.
+ */
+static void kvm_save_sprs(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
+ vcpu->arch.vmexit_cycles = get_cycles();
+
+#define SAVE_SPR(x) vcpu->arch.sregs.x = __insn_mfspr(SPR_ ## x)
+ FOR_EACH_GUEST_SPR(SAVE_SPR);
+#undef SAVE_SPR
+}
+
+static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
+{
+ unsigned long count = vcpu->arch.timer_control;
+ unsigned long underflow =
+ (count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
+ unsigned long disabled =
+ (count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1;
+
+ if (!disabled) {
+ unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles;
+ count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+ underflow |= delta > count;
+ count -= delta;
+ count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+ count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
+ }
+ __insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count);
+
+#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.sregs.x)
+ FOR_EACH_GUEST_SPR(RESTORE_SPR);
+#undef RESTORE_SPR
+}
+
+/*
+ * When entering the guest, we need to eliminate any PL0 translations
+ * that were in use by qemu, since the guest's PL0 translations will
+ * be different. We also flush PL1 translations in case there have
+ * been changes to the virtualization page table, etc.
+ *
+ * FIXME: Add a way to just flush PL0/PL1, or just flush below
+ * the host PAGE_OFFSET, or add vpid support, etc.
+ */
+static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
+{
+ HV_Context *ctx;
+ pgd_t *vpgdir;
+ pte_t *ptep;
+ int rc;
+
+ /* Install virtualization context */
+ vpgdir = vcpu->kvm->arch.vpgd;
+ BUG_ON(vpgdir == NULL);
+ ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
+ rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Install guest context */
+ ctx = &vcpu->arch.guest_context;
+ rc = hv_install_guest_context(ctx->page_table, ctx->access,
+ ctx->asid, ctx->flags);
+ WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
+ ctx->page_table, ctx->access.val,
+ ctx->asid, ctx->flags, rc);
+
+ hv_flush_all(0);
+}
+
+/*
+ * De-install the virtualization context so we take faults below the
+ * host Linux PL in the normal manner going forward.
+ *
+ * We flush all the TLB mappings as we exit the guest, since the
+ * guest has been using the ASIDs as it pleases, and may have installed
+ * incompatible mappings for qemu's process as well. Note that we don't
+ * worry about host-PL interrupts that occur while the guest is running,
+ * on the assumption that such interrupts can't touch userspace
+ * addresses legally anyway.
+ *
+ * NOTE: we may want to add a hypervisor call to just flush mappings
+ * below PL2 and use that here instead.
+ */
+static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
+{
+ int rc;
+
+ /* Remember guest context */
+ vcpu->arch.guest_context = hv_inquire_guest_context();
+
+ /* Disable virtualization context */
+ rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
+ WARN_ON_ONCE(rc < 0);
+
+ /* Flush everything in the TLB. */
+ hv_flush_all(0);
+}
+
+static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Capture current set of ipi_events. We might race with
+ * another thread adding an event, but if so we'll just miss
+ * it on this go-around and see it next time.
+ */
+ vcpu->arch.sregs.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
+
+ /*
+ * Note: We could set PC and EX1 for the guest os to jump
+ * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
+ * is unmasked and the guest is not at PL1 with ICS set.
+ * But in fact it's about as fast to just set INTCTRL_1_STATUS
+ * here and then run the short INTCTRL_1 handler in the guest.
+ */
+ vcpu->arch.sregs.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
+}
+
+static void kvm_tile_run(struct kvm_vcpu *vcpu)
+{
+ struct thread_info *ti = current_thread_info();
+ unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
+
+ /*
+ * Disable interrupts while we set up the guest state.
+ * This way, if we race with another core trying to tell us
+ * to fix up our guest state, we will take the kick only as
+ * we actually try to enter the guest, and instead we will
+ * vmexit and end up retrying.
+ */
+ local_irq_disable();
+ kvm_guest_context_enter(vcpu);
+ clear_bit(KVM_REQ_KICK, &vcpu->requests);
+ ti->vcpu = vcpu;
+ vcpu->cpu = get_cpu();
+ kvm_inject_interrupts(vcpu);
+ kvm_grant_mpls();
+ kvm_restore_sprs(vcpu);
+
+ /* Calling this function irets into the guest. */
+ kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
+
+ /* We resume here due to a call to kvm_vmexit. */
+ __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
+
+ vcpu->cpu = -1;
+ put_cpu();
+ ti->vcpu = NULL;
+ set_bit(KVM_REQ_KICK, &vcpu->requests);
+ vcpu->run->ready_for_interrupt_injection = 1;
+ kvm_ungrant_mpls();
+ kvm_save_sprs(vcpu);
+ __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
+ kvm_guest_context_exit(vcpu);
+ local_irq_enable();
+}
+
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r = 1;
+
+ while (r > 0) {
+ kvm_guest_enter();
+ kvm_tile_run(vcpu);
+ kvm_guest_exit();
+
+ r = kvm_handle_exit(vcpu);
+ /*
+ * <0: error for userspace.
+ * =0: QEMU to handle.
+ * >0: host os can handle it fully.
+ */
+ if (r <= 0)
+ break;
+
+ if (signal_pending(current)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = -EINTR;
+ break;
+ }
+
+ kvm_resched(vcpu);
+ }
+
+ return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+ sigset_t sigsaved;
+
+ /* Secondary cpus must wait until they are told they can start. */
+ if (vcpu->arch.suspended) {
+ struct completion *c = &vcpu->kvm->arch.smp_start;
+ if (wait_for_completion_interruptible(c))
+ return -EINTR;
+ vcpu->arch.suspended = 0;
+ }
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ r = __vcpu_run(vcpu, kvm_run);
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return r;
+}
+
+int kvm_arch_init(void *opaque)
+{
+ return 0;
+}
+
+void kvm_arch_exit(void)
+{
+}
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ unsigned long resv_gfn_start;
+ struct kvm_memory_slot *s;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.resv_gpa_start) {
+ resv_gfn_start = 0;
+
+ for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
+ s = &kvm->memslots->memslots[i];
+
+ if (!s->npages)
+ continue;
+
+ if ((s->base_gfn + s->npages) > resv_gfn_start)
+ resv_gfn_start = s->base_gfn + s->npages;
+ }
+
+ kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start);
+ }
+
+ /* Initialize to enter fake PA=VA mode in hypervisor. */
+ vcpu->arch.guest_context.page_table = HV_CTX_NONE;
+
+ vcpu->arch.ipi_gpa =
+ kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE);
+ vcpu->arch.ipi_gpte =
+ pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL);
+
+ /* Mark the core suspended if it is not the boot cpu. */
+ vcpu->arch.suspended = is_secondary_vcpu(vcpu);
+
+ return 0;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Notify simulator that this task handles this vcpu. */
+ sim_set_vcpu(vcpu->vcpu_id);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ sim_clear_vcpu();
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+ struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+ int rc;
+
+ if (!vcpu)
+ return ERR_PTR(-ENOMEM);
+
+ rc = kvm_vcpu_init(vcpu, kvm, id);
+ if (rc) {
+ kfree(vcpu);
+ return ERR_PTR(rc);
+ }
+
+ return vcpu;
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ memset(&vcpu->arch.regs, 0, sizeof(struct pt_regs));
+ memset(&vcpu->arch.sregs, 0, sizeof(struct pt_regs));
+ vcpu->arch.sregs.IPI_MASK_1 = -1UL;
+ vcpu->arch.sregs.INTERRUPT_MASK_1 = -1UL;
+ vcpu->arch.sregs.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
+ return 0;
+}
+
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_vcpu_uninit(vcpu);
+ kfree(vcpu);
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_destroy(vcpu);
+}
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+ return 0;
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ if (type)
+ return -EINVAL;
+
+ init_completion(&kvm->arch.smp_start);
+ return 0;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ /* Seems to be unnecessary? */
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+
+ if (kvm->arch.vpgd)
+ pgd_free(kvm->mm, kvm->arch.vpgd);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+/* Called from guest hv glue via swint0 traps. */
+void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
+{
+ /* Hypercalls are only valid from PL1. */
+ if (EX1_PL(regs->ex1) != 0) {
+ kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
+ /*NORETURN*/
+ }
+ do_trap(regs, fault_num, 0);
+}
+
+void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+ unsigned long fault_addr, unsigned long write)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ BUG_ON(vcpu == NULL);
+ vcpu->arch.fault_addr = fault_addr;
+ kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
+ /*NORETURN*/
+}
+
+void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
+{
+ kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
+ /*NORETURN*/
+}
+
+void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
+{
+ struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->arch.regs = *regs;
+ vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+ kvm_vmexit(vcpu->arch.host_sp);
+ /*NORETURN*/
+}
+
+static int __init kvm_tile_init(void)
+{
+ return kvm_init(NULL, sizeof(struct kvm_vcpu),
+ __alignof__(struct kvm_vcpu), THIS_MODULE);
+}
+
+static void __exit kvm_tile_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(kvm_tile_init);
+module_exit(kvm_tile_exit);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 82733c8..1590282 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);
/* hypervisor glue */
#include <hv/hypervisor.h>
+EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_dev_close);
EXPORT_SYMBOL(hv_dev_open);
+EXPORT_SYMBOL(hv_dev_poll);
+EXPORT_SYMBOL(hv_dev_poll_cancel);
EXPORT_SYMBOL(hv_dev_pread);
-EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_preada);
+EXPORT_SYMBOL(hv_dev_pwrite);
EXPORT_SYMBOL(hv_dev_pwritea);
-EXPORT_SYMBOL(hv_dev_poll);
-EXPORT_SYMBOL(hv_dev_poll_cancel);
-EXPORT_SYMBOL(hv_dev_close);
-EXPORT_SYMBOL(hv_sysconf);
-EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_flush_all);
EXPORT_SYMBOL(hv_get_rtc);
+#ifdef __tilegx__
+EXPORT_SYMBOL(hv_inquire_guest_context);
+EXPORT_SYMBOL(hv_install_guest_context);
+EXPORT_SYMBOL(hv_install_virt_context);
+#endif
+EXPORT_SYMBOL(hv_physaddr_read64);
+EXPORT_SYMBOL(hv_physaddr_write64);
EXPORT_SYMBOL(hv_set_rtc);
+EXPORT_SYMBOL(hv_sysconf);
/* libgcc.a */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 64eec3f..39c48cb 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
- is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+ is_kernel_mode = !user_mode(regs);
tsk = validate_current();
@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
}
#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
- if (EX1_PL(regs->ex1) != USER_PL) {
+ if (!user_mode(regs)) {
struct async_tlb *async;
switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3004433..d6948d4 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
#if CHIP_HAS_MMIO()
-/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
-void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
- pgprot_t home)
+void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags, pgprot_t prot)
{
void *addr;
struct vm_struct *area;
unsigned long offset, last_addr;
- pgprot_t pgprot;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
- /* Create a read/write, MMIO VA mapping homed at the requested shim. */
- pgprot = PAGE_KERNEL;
- pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
- pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
-
/*
* Mappings have to be page-aligned
*/
@@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
+ area = get_vm_area(size, flags);
if (!area)
return NULL;
area->phys_addr = phys_addr;
addr = area->addr;
if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
+ phys_addr, prot)) {
free_vm_area(area);
return NULL;
}
- return (__force void __iomem *) (offset + (char *)addr);
+ return (void *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(generic_remap_prot);
+
+/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ pgprot_t home)
+{
+ pgprot_t pgprot;
+ unsigned long flags;
+
+ /* Create a read/write, MMIO VA mapping homed at the requested shim. */
+ pgprot = PAGE_KERNEL;
+ pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
+ pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
+ flags = VM_IOREMAP; /* | other flags? */
+
+ return (__force void __iomem *) generic_remap_prot(phys_addr,
+ size, flags, pgprot);
}
EXPORT_SYMBOL(ioremap_prot);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08..b622337 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -171,6 +171,7 @@ struct kvm_pit_config {
#define KVM_EXIT_WATCHDOG 21
#define KVM_EXIT_S390_TSCH 22
#define KVM_EXIT_EPR 23
+#define KVM_EXIT_AGAIN 24
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..2c4fd23 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
if (vcpu->kvm->mm != current->mm)
return -EIO;
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
+ defined(CONFIG_TILEGX)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
* so vcpu_load() would break it.
--
1.8.3.1
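For reference, the wall-clock fixup that kvm_restore_sprs() applies to the
guest's down-counting timer above (subtract the cycles spent descheduled,
latching the underflow bit if the deadline has already passed) boils down
to the arithmetic in this standalone sketch; the bit positions here are
made up for illustration and are not the real SPR_AUX_TILE_TIMER_CONTROL
layout.

#include <stdio.h>
#include <stdint.h>

/* Illustrative field layout; the real SPR layout differs. */
#define TIMER_COUNT_MASK	((1ULL << 31) - 1)
#define TIMER_UNDERFLOW_SHIFT	62
#define TIMER_DISABLE_SHIFT	63

/*
 * Re-arm a down-counting timer after being descheduled for "delta"
 * cycles, keeping the original wall-clock deadline: subtract the time
 * spent away and latch the underflow bit if the deadline has passed.
 */
static uint64_t adjust_timer(uint64_t control, uint64_t delta)
{
	uint64_t underflow = (control >> TIMER_UNDERFLOW_SHIFT) & 1;
	uint64_t disabled = (control >> TIMER_DISABLE_SHIFT) & 1;
	uint64_t count = control & TIMER_COUNT_MASK;

	if (disabled)
		return control;

	underflow |= delta > count;
	count = (count - delta) & TIMER_COUNT_MASK;
	return count | (underflow << TIMER_UNDERFLOW_SHIFT);
}

int main(void)
{
	/* Deadline still in the future: the count simply shrinks. */
	printf("%#llx\n", (unsigned long long)adjust_timer(1000, 250));
	/* Deadline already passed: the underflow bit is latched. */
	printf("%#llx\n", (unsigned long long)adjust_timer(1000, 2000));
	return 0;
}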
This commit enables a special configure option to build the kernel to
run at PL1. In this mode, the client can run under a KVM host kernel;
it can also run under the older Tilera hypervisor that ran the
operating system at PL1 by default.
The PL1 kernel runs with half the virtual address space and half the
physical address space of the PL2 kernel, so <asm/page.h> is updated
to adjust those constants appropriately. We make some things a little
more generic ("intrpt1" references from the old PL1 kernel
nomenclature have now been normalized to "intrpt"), and simplify some
other nomenclature (using MEM_SV_START instead of MEM_SV_INTRPT to
reflect that the point of interest is where the supervisor starts, not
that it happens to start with the interrupt vectors).
The simulator support for reporting ELF binary data associated with
simulator backtrace information needs some additional extension. It
currently reports backtrace information for the
guest kernel, but not for processes running within the guest kernel;
additional work in the simulator is required to be able to provide the
necessary path information for that to work. For now we disable
simulator notifications within the guest kernel.
The timer interrupt for the guest uses the AUX_TILE_TIMER hardware,
leaving the regular TILE_TIMER for the host.
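On the guest side the split is simple: wherever the PL2 kernel would
program the regular tile timer SPRs, the PL1 kernel programs the
auxiliary ones. A sketch of the kind of aliasing involved is below; the
macro name is hypothetical and the actual definitions this patch adds to
<asm/timex.h> may differ.

/*
 * Sketch only: pick the auxiliary tile timer when built as a KVM guest,
 * leaving the regular tile timer for the PL2 host kernel.
 * LINUX_TIMER_CONTROL_SPR is a hypothetical alias, not necessarily the
 * patch's exact scheme.
 */
#ifdef CONFIG_KVM_GUEST
#define LINUX_TIMER_CONTROL_SPR	SPR_AUX_TILE_TIMER_CONTROL
#else
#define LINUX_TIMER_CONTROL_SPR	SPR_TILE_TIMER_CONTROL
#endif

static inline void arm_tile_timer(unsigned long cycles)
{
	__insn_mtspr(LINUX_TIMER_CONTROL_SPR, cycles);
}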
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/Kconfig | 14 +++++++--
arch/tile/include/asm/module.h | 10 ++++--
arch/tile/include/asm/page.h | 63 +++++++++++++++++++-------------------
arch/tile/include/asm/pgtable_32.h | 2 +-
arch/tile/include/asm/pgtable_64.h | 3 +-
arch/tile/include/asm/processor.h | 2 +-
arch/tile/include/asm/switch_to.h | 25 ++++++++++++---
arch/tile/include/asm/timex.h | 8 +++++
arch/tile/kernel/head_32.S | 4 +--
arch/tile/kernel/head_64.S | 6 ++--
arch/tile/kernel/intvec_32.S | 6 ++--
arch/tile/kernel/intvec_64.S | 34 +++++++++++++++-----
arch/tile/kernel/process.c | 2 ++
arch/tile/kernel/setup.c | 8 ++---
arch/tile/kernel/sysfs.c | 4 +++
arch/tile/kernel/time.c | 14 ++++-----
arch/tile/kernel/traps.c | 2 +-
arch/tile/kernel/vmlinux.lds.S | 10 +++---
arch/tile/mm/elf.c | 2 ++
arch/tile/mm/init.c | 8 ++---
20 files changed, 145 insertions(+), 82 deletions(-)
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 3bc8fb7..e89aae8 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -126,7 +126,7 @@ config TILEGX
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KPROBES
select HAVE_KRETPROBES
- select HAVE_KVM
+ select HAVE_KVM if !KVM_GUEST
config TILEPRO
def_bool !TILEGX
@@ -366,11 +366,19 @@ config HARDWALL
bool "Hardwall support to allow access to user dynamic network"
default y
+config KVM_GUEST
+ bool "Build kernel as guest for KVM"
+ default n
+ depends on TILEGX
+ ---help---
+ This will build a kernel that runs at a lower protection level
+ than the default kernel and is suitable to run under KVM.
+
config KERNEL_PL
int "Processor protection level for kernel"
range 1 2
- default 2 if TILEGX
- default 1 if !TILEGX
+ default 2 if TILEGX && !KVM_GUEST
+ default 1 if !TILEGX || KVM_GUEST
---help---
Since MDE 4.2, the Tilera hypervisor runs the kernel
at PL2 by default. If running under an older hypervisor,
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
index 44ed07c..a8b546b 100644
--- a/arch/tile/include/asm/module.h
+++ b/arch/tile/include/asm/module.h
@@ -16,7 +16,6 @@
#define _ASM_TILE_MODULE_H
#include <arch/chip.h>
-
#include <asm-generic/module.h>
/* We can't use modules built with different page sizes. */
@@ -28,6 +27,13 @@
# define MODULE_PGSZ ""
#endif
+/* Tag guest Linux, since it uses different SPRs, etc. */
+#if CONFIG_KERNEL_PL == 2
+#define MODULE_PL ""
+#else
+#define MODULE_PL " guest"
+#endif
+
/* We don't really support no-SMP so tag if someone tries. */
#ifdef CONFIG_SMP
#define MODULE_NOSMP ""
@@ -35,6 +41,6 @@
#define MODULE_NOSMP " nosmp"
#endif
-#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP
#endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index b4f96c0..2c991f2 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif
+#ifdef CONFIG_KVM_GUEST
+/* Paravirtualized guests get half the VA, and thus half the PA. */
+#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1)
+#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1)
+#else
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+#endif
+
/* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
#define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
#define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
#define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
* We reserve the lower half of memory for user-space programs, and the
* upper half for system code. We re-map all of physical memory in the
* upper half, which takes a quarter of our VA space. Then we have
- * the vmalloc regions. The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions. The supervisor code lives at the highest address,
* with the hypervisor above that.
*
* Loadable kernel modules are placed immediately after the static
@@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size)
* Similarly, for now we don't play any struct page mapping games.
*/
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
# error Too much PA to map with the VA available!
#endif
-#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
-#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */
-#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */
-#define PAGE_OFFSET MEM_HIGH_START
-#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */
+#ifdef CONFIG_KVM_GUEST
+#define PAGE_OFFSET (_AC(1, UL) << (MAX_VA_WIDTH - 1))
+#define KERNEL_HIGH_VADDR (_AC(1, UL) << MAX_VA_WIDTH)
+#else
+#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */
+#endif
+
+#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
#define _VMALLOC_START FIXADDR_TOP
-#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT MEM_SV_START
-#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */
#define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR MEM_SV_START
#else /* !__tilegx__ */
@@ -213,25 +221,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
* values, and after that, we show "typical" values, since the actual
* addresses depend on kernel #defines.
*
- * MEM_HV_INTRPT 0xfe000000
- * MEM_SV_INTRPT (kernel code) 0xfd000000
+ * MEM_HV_START 0xfe000000
+ * MEM_SV_START (kernel code) 0xfd000000
* MEM_USER_INTRPT (user vector) 0xfc000000
- * FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR)
- * PKMAP_BASE 0xf7000000 (via LAST_PKMAP)
- * HUGE_VMAP 0xf3000000 (via CONFIG_NR_HUGE_VMAPS)
- * VMALLOC_START 0xf0000000 (via __VMALLOC_RESERVE)
+ * FIX_KMAP_xxx 0xfa000000 (via NR_CPUS * KM_TYPE_NR)
+ * PKMAP_BASE 0xf9000000 (via LAST_PKMAP)
+ * VMALLOC_START 0xf7000000 (via VMALLOC_RESERVE)
* mapped LOWMEM 0xc0000000
*/
#define MEM_USER_INTRPT _AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT _AC(0xfd000000, UL)
-#define MEM_HV_INTRPT _AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT _AC(0xfd000000, UL)
-#define MEM_SV_INTRPT _AC(0xfe000000, UL)
-#define MEM_HV_INTRPT _AC(0xff000000, UL)
-#endif
+#define MEM_SV_START _AC(0xfd000000, UL)
+#define MEM_HV_START _AC(0xfe000000, UL)
#define INTRPT_SIZE 0x4000
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index e5bdc0e..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; }
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_INTRPT;
+ return addr >= MEM_HV_START;
}
/*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 7cb8d35..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_START ||
- (addr > MEM_LOW_END && addr < MEM_HIGH_START);
+ return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
}
/*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index c72fcba..5aa5431 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -168,7 +168,7 @@ struct thread_struct {
#ifndef __ASSEMBLY__
#ifdef __tilegx__
-#define TASK_SIZE_MAX (MEM_LOW_END + 1)
+#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1))
#else
#define TASK_SIZE_MAX PAGE_OFFSET
#endif
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index b8f888c..8e9150f 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev,
extern unsigned long get_switch_to_pc(void);
/*
+ * Normally we notify the simulator whenever we change from one pid
+ * to another, so it can track symbol files appropriately on the fly.
+ * For now, we don't do this for the guest Linux, since we don't
+ * have a way to tell the simulator that we are entering a separate
+ * pid space when we are in the guest.
+ */
+#ifdef CONFIG_KVM_GUEST
+#define notify_sim_task_change(prev) do { } while (0)
+#else
+#define notify_sim_task_change(prev) do { \
+ if (unlikely((prev)->state == TASK_DEAD)) \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
+ ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
+ (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+} while (0)
+#endif
+
+/*
* Kernel threads can check to see if they need to migrate their
* stack whenever they return from a context switch; for user
* threads, we defer until they are returning to user-space.
*/
#define finish_arch_switch(prev) do { \
- if (unlikely((prev)->state == TASK_DEAD)) \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
- ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
- (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+ notify_sim_task_change(prev); \
if (current->mm == NULL && !kstack_hash && \
current_thread_info()->homecache_cpu != smp_processor_id()) \
homecache_migrate_kthread(); \
diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
index edbd7e4..0417617 100644
--- a/arch/tile/include/asm/timex.h
+++ b/arch/tile/include/asm/timex.h
@@ -27,6 +27,14 @@
typedef unsigned long long cycles_t;
+#ifdef CONFIG_KVM_GUEST
+#define INT_LINUX_TIMER INT_AUX_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_AUX_TILE_TIMER_CONTROL
+#else
+#define INT_LINUX_TIMER INT_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_TILE_TIMER_CONTROL
+#endif
+
#if CHIP_HAS_SPLIT_CYCLE()
cycles_t get_cycles(void);
#define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index f3f17b0..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
.set addr, addr + PGDIR_SIZE
.endr
- /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
- PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+ /* The true text VAs are mapped as VA = PA + MEM_SV_START */
+ PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
(1 << (HV_PTE_INDEX_EXECUTABLE - 32))
.org swapper_pg_dir + PGDIR_SIZE
END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 652b814..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -135,9 +135,9 @@ ENTRY(_start)
1:
/* Install the interrupt base. */
- moveli r0, hw2_last(MEM_SV_START)
- shl16insli r0, r0, hw1(MEM_SV_START)
- shl16insli r0, r0, hw0(MEM_SV_START)
+ moveli r0, hw2_last(intrpt_start)
+ shl16insli r0, r0, hw1(intrpt_start)
+ shl16insli r0, r0, hw0(intrpt_start)
mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
/* Get our processor number and save it away in SAVE_K_0. */
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index 8ac6072..2ce69a5 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -353,7 +353,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -1892,8 +1892,8 @@ int_unalign:
push_extra_callee_saves r0
j do_trap
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 45647a4..ccb0e65 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -38,6 +38,16 @@
#define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set). Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
.macro push_reg reg, ptr=sp, delta=-8
{
@@ -312,7 +322,7 @@ intvec_\vecname:
*/
{
blbs sp, 2f
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r0, r0)
}
.ifc \vecnum, INT_DOUBLE_FAULT
@@ -530,7 +540,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -651,7 +661,7 @@ intvec_\vecname:
*/
mfspr r22, SPR_EX_CONTEXT_K_1
{
- andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r22, r22)
PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
}
beqzt r22, 1f /* zero if from user space */
@@ -818,7 +828,7 @@ STD_ENTRY(interrupt_return)
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
ld r29, r29
- andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r29, r29)
{
beqzt r29, .Lresume_userspace
move r29, sp
@@ -955,7 +965,7 @@ restore_all:
PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
}
{
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+ IS_KERNEL_EX1(r0, r0)
ld r32, r32
}
bnez r0, 1f
@@ -1026,7 +1036,7 @@ restore_all:
pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
{
mtspr SPR_EX_CONTEXT_K_1, lr
- andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(lr, lr)
}
{
mtspr SPR_EX_CONTEXT_K_0, r21
@@ -1555,8 +1565,10 @@ handle_downcall_dispatch:
__int_hand \vecnum, \vecname, \c_routine, \processing
.endm
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
+ .global intrpt_start
+intrpt_start:
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
@@ -1601,7 +1613,13 @@ handle_downcall_dispatch:
int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
+#ifndef CONFIG_KVM_GUEST
int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr
+#else
+ int_hand INT_TILE_TIMER, TILE_TIMER, bad_intr
+ int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt
+#endif
int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr
int_hand INT_UDN_TIMER, UDN_TIMER, bad_intr
int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 7040490..2629ff1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -248,11 +248,13 @@ struct task_struct *validate_current(void)
/* Take and return the pointer to the previous task, for schedule_tail(). */
struct task_struct *sim_notify_fork(struct task_struct *prev)
{
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
struct task_struct *tsk = current;
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT |
(tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS));
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK |
(tsk->pid << _SIM_CONTROL_OPERATOR_BITS));
+#endif
return prev;
}
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 7918cf1..2352a81 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
/*
* Determine for each controller where its lowmem is mapped and how much of
* it is mapped there. On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
* start our data mappings higher up, but for now we don't bother, to avoid
* additional confusion.
*
@@ -1255,7 +1255,7 @@ static void __init validate_va(void)
#ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
/*
* Similarly, make sure we're only using allowed VAs.
- * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+ * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
* and 0 .. KERNEL_HIGH_VADDR.
* In addition, make sure we CAN'T use the end of memory, since
* we use the last chunk of each pgd for the pgd_list.
@@ -1270,7 +1270,7 @@ static void __init validate_va(void)
if (range.size == 0)
break;
if (range.start <= MEM_USER_INTRPT &&
- range.start + range.size >= MEM_HV_INTRPT)
+ range.start + range.size >= MEM_HV_START)
user_kernel_ok = 1;
if (range.start == 0)
max_va = range.size;
@@ -1706,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
static int __init request_standard_resources(void)
{
int i;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if defined(CONFIG_PCI) && !defined(__tilegx__)
insert_non_bus_resource();
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a8..024b978 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
+#ifdef CONFIG_KVM_GUEST
+ return sprintf(page, "KVM\n");
+#else
return sprintf(page, "tilera\n");
+#endif
}
static DEVICE_ATTR(type, 0444, type_show, NULL);
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 3c2dc87..b0b7264 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -117,9 +117,9 @@ void __init time_init(void)
/*
* Define the tile timer clock event device. The timer is driven by
- * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
+ * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down
* counter, plus bit 31, which signifies that the counter has wrapped
- * from zero to (2**31) - 1. The INT_TILE_TIMER interrupt will be
+ * from zero to (2**31) - 1. The INT_[AUX_]TILE_TIMER interrupt will be
* raised as long as bit 31 is set.
*/
@@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks,
struct clock_event_device *evt)
{
BUG_ON(ticks > MAX_TICK);
- __insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
- arch_local_irq_unmask_now(INT_TILE_TIMER);
+ __insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
+ arch_local_irq_unmask_now(INT_LINUX_TIMER);
return 0;
}
@@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
static void tile_timer_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
}
static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
@@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void)
evt->cpumask = cpumask_of(smp_processor_id());
/* Start out with timer not firing. */
- arch_local_irq_mask_now(INT_TILE_TIMER);
+ arch_local_irq_mask_now(INT_LINUX_TIMER);
/*
* Register tile timer. Set min_delta to 1 microsecond, since
@@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
* Mask the timer interrupt here, since we are a oneshot timer
* and there are now by definition no events pending.
*/
- arch_local_irq_mask(INT_TILE_TIMER);
+ arch_local_irq_mask(INT_LINUX_TIMER);
/* Track time spent here in an interrupt context */
irq_enter();
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index f110785..19d465c 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -30,7 +30,7 @@
void __init trap_init(void)
{
- /* Nothing needed here since we link code at .intrpt1 */
+ /* Nothing needed here since we link code at .intrpt */
}
int unaligned_fixup = 1;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index c7ae53d..8b20163 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
#include <hv/hypervisor.h>
/* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
OUTPUT_ARCH(tile)
ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
PHDRS
{
- intrpt1 PT_LOAD ;
+ intrpt PT_LOAD ;
text PT_LOAD ;
data PT_LOAD ;
}
@@ -24,11 +24,11 @@ SECTIONS
#define LOAD_OFFSET TEXT_OFFSET
/* Interrupt vectors */
- .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
+ .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
{
_text = .;
- *(.intrpt1)
- } :intrpt1 =0
+ *(.intrpt)
+ } :intrpt =0
/* Hypervisor call vectors */
. = ALIGN(0x10000);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 23f044e..86cff48 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm)
char *buf, *path;
struct vm_area_struct *vma;
+#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */
if (!sim_is_simulator())
+#endif
return 1;
if (mm->exe_file == NULL)
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
{
int cpu;
unsigned long page;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if CHIP_HAS_CBOX_HOME_MAP()
/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
}
- address = MEM_SV_INTRPT;
+ address = MEM_SV_START;
pmd = get_pmd(pgtables, address);
pfn = 0; /* code starts at PA 0 */
if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
void free_initmem(void)
{
- const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+ const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
/*
* Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)
/*
* Free the pages mapped from 0xc0000000 that correspond to code
- * pages from MEM_SV_INTRPT that we won't use again after init.
+ * pages from MEM_SV_START that we won't use again after init.
*/
free_init_pages("unused kernel text",
(unsigned long)_sinittext - text_delta,
--
1.8.3.1
This change enables support for a virtio-based console, virtio network
devices, and virtio block devices.
We also remove some debug code in relocate_kernel_64.S that made raw
calls to the hv_console_putc Tilera hypervisor API, since everything
should now funnel through the early_hv_write() API.
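For example, until the virtio console is fully initialized, the guest's
early console path just hands the host the physical address of a
NUL-terminated buffer via the virtio hypercall; a condensed sketch of
the early_printk.c change below:

    static void early_hv_write(struct console *con, const char *s, unsigned n)
    {
            char buf[512];

            /* Truncate to the local buffer and NUL-terminate for the host. */
            if (n > sizeof(buf) - 1)
                    n = sizeof(buf) - 1;
            memcpy(buf, s, n);
            buf[n] = '\0';

            /* Ask the host to consume the buffer at this physical address. */
            hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
    }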
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/Kconfig | 3 +
arch/tile/include/asm/kvm_para.h | 20 ++
arch/tile/include/asm/kvm_virtio.h | 26 ++
arch/tile/include/uapi/asm/Kbuild | 1 +
arch/tile/include/uapi/asm/kvm.h | 5 +
arch/tile/include/uapi/asm/kvm_virtio.h | 60 +++++
arch/tile/kernel/Makefile | 1 +
arch/tile/kernel/early_printk.c | 16 ++
arch/tile/kernel/hvglue.S | 1 +
arch/tile/kernel/kvm_virtio.c | 430 ++++++++++++++++++++++++++++++++
arch/tile/kernel/relocate_kernel_64.S | 9 +-
11 files changed, 570 insertions(+), 2 deletions(-)
create mode 100644 arch/tile/include/asm/kvm_para.h
create mode 100644 arch/tile/include/asm/kvm_virtio.h
create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
create mode 100644 arch/tile/kernel/kvm_virtio.c
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index e89aae8..4e8524b 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -370,6 +370,9 @@ config KVM_GUEST
bool "Build kernel as guest for KVM"
default n
depends on TILEGX
+ select VIRTIO
+ select VIRTIO_RING
+ select VIRTIO_CONSOLE
---help---
This will build a kernel that runs at a lower protection level
than the default kernel and is suitable to run under KVM.
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 0000000..c8c31d5
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_PARA_H
+#define _ASM_TILE_KVM_PARA_H
+
+#include <uapi/asm/kvm_para.h>
+
+int hcall_virtio(unsigned long instrument, unsigned long mem);
+#endif /* _ASM_TILE_KVM_PARA_H */
diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
new file mode 100644
index 0000000..8faa959
--- /dev/null
+++ b/arch/tile/include/asm/kvm_virtio.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _ASM_TILE_KVM_VIRTIO_H
+#define _ASM_TILE_KVM_VIRTIO_H
+
+#include <uapi/asm/kvm_virtio.h>
+
+
+struct kvm_device {
+ struct virtio_device vdev;
+ struct kvm_device_desc *desc;
+ unsigned long desc_pa;
+};
+
+#endif /* _ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index 89022a5..f07cc24 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -8,6 +8,7 @@ header-y += cachectl.h
header-y += hardwall.h
header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_virtio.h
header-y += mman.h
header-y += ptrace.h
header-y += setup.h
diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
index aa7b97f..4346520 100644
--- a/arch/tile/include/uapi/asm/kvm.h
+++ b/arch/tile/include/uapi/asm/kvm.h
@@ -149,6 +149,9 @@
*/
#define KVM_OTHER_HCALL 128
+/* Hypercall index for virtio. */
+#define KVM_HCALL_virtio 128
+
/* One greater than the maximum hypercall number. */
#define KVM_NUM_HCALLS 256
@@ -256,6 +259,8 @@ struct kvm_sync_regs {
KVM_EMULATE(get_ipi_pte) \
KVM_EMULATE(set_pte_super_shift) \
KVM_EMULATE(set_speed) \
+ /* For others */ \
+ USER_HCALL(virtio)
#endif
diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
new file mode 100644
index 0000000..d94f535
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm_virtio.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
+#define _UAPI_ASM_TILE_KVM_VIRTIO_H
+
+#include <linux/types.h>
+
+#define KVM_VIRTIO_UNKNOWN 0
+#define KVM_VIRTIO_NOTIFY 1
+#define KVM_VIRTIO_RESET 2
+#define KVM_VIRTIO_SET_STATUS 3
+
+struct kvm_device_desc {
+ /* The device type: console, network, disk etc. Type 0 terminates. */
+ __u8 type;
+ /* The number of virtqueues (first in config array) */
+ __u8 num_vq;
+ /*
+ * The number of bytes of feature bits. Multiply by 2: one for host
+ * features and one for Guest acknowledgements.
+ */
+ __u8 feature_len;
+ /* The number of bytes of the config array after virtqueues. */
+ __u8 config_len;
+ /* A status byte, written by the Guest. */
+ __u8 status;
+ __u64 config[0];
+};
+
+struct kvm_vqinfo {
+ /* Pointer to the information contained in the device config. */
+ struct kvm_vqconfig *config;
+ /* The address where we mapped the virtio ring, so we can unmap it. */
+ void *pages;
+};
+
+struct kvm_vqconfig {
+ /* The physical address of the virtio ring */
+ __u64 pa;
+ /* The number of entries in the virtio_ring */
+ __u64 num;
+ /* The interrupt we get when something happens. Set by the guest. */
+ __u32 irq;
+
+};
+
+
+#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b7c8b5e..b638d3e 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o
obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o
obj-y += vdso/
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00..53f2be4 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -18,11 +18,26 @@
#include <linux/string.h>
#include <linux/irqflags.h>
#include <linux/printk.h>
+#ifdef CONFIG_KVM_GUEST
+#include <linux/virtio_console.h>
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+#endif
#include <asm/setup.h>
#include <hv/hypervisor.h>
static void early_hv_write(struct console *con, const char *s, unsigned n)
{
+#ifdef CONFIG_KVM_GUEST
+ char buf[512];
+
+ if (n > sizeof(buf) - 1)
+ n = sizeof(buf) - 1;
+ memcpy(buf, s, n);
+ buf[n] = '\0';
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
+#else
tile_console_write(s, n);
/*
@@ -32,6 +47,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n)
*/
if (n && s[n-1] == '\n')
tile_console_write("\r", 1);
+#endif
}
static struct console early_hv_console = {
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
index dc5b417..2914a9e 100644
--- a/arch/tile/kernel/hvglue.S
+++ b/arch/tile/kernel/hvglue.S
@@ -77,4 +77,5 @@ gensym hv_install_guest_context, 0x7a0, 32
gensym hv_inquire_guest_context, 0x7c0, 32
gensym hv_console_set_ipi, 0x7e0, 32
gensym hv_glue_internals, 0x800, 2048
+gensym hcall_virtio, 0x1000, 32
gensym hv_hcall_internals, 0x1020, 28640
diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c
new file mode 100644
index 0000000..c6b6c6a
--- /dev/null
+++ b/arch/tile/kernel/kvm_virtio.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+/* Based on the lguest and s390 implementations. */
+/*
+ * kvm_virtio.c - virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): Christian Borntraeger <[email protected]>
+ */
+
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+
+static void *kvm_devices;
+
+/*
+ * TODO: We do not actually use PCI virtio here; we use this value
+ * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
+ * Maybe we should change both qemu and Linux to generic definitions.
+ * We should also check later whether the alignment value (4096, the
+ * default x86 page size) affects performance.
+ */
+#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN
+#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
+
+/*
+ * memory layout: (Total: PAGE_SIZE)
+ * <device 0>
+ * - kvm device descriptor
+ * struct kvm_device_desc
+ * - vqueue configuration (totally desc->num_vq)
+ * struct kvm_vqconfig
+ * ......
+ * struct kvm_vqconfig
+ * - feature bits (size: desc->feature_len * 2)
+ * - config space (size: desc->config_len)
+ * <device 1>
+ * ......
+ */
+static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
+{
+ return (struct kvm_vqconfig *)(desc + 1);
+}
+
+static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
+{
+ return (u8 *)(kvm_vq_config(desc) + desc->num_vq);
+}
+
+static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
+{
+ return kvm_vq_features(desc) + desc->feature_len * 2;
+}
+
+/*
+ * The total size of the config page used by this device (incl. desc)
+ */
+static unsigned desc_size(const struct kvm_device_desc *desc)
+{
+ return sizeof(*desc)
+ + desc->num_vq * sizeof(struct kvm_vqconfig)
+ + desc->feature_len * 2
+ + desc->config_len;
+}
+
+/* This gets the device's feature bits. */
+static u32 kvm_get_features(struct virtio_device *vdev)
+{
+ unsigned int i;
+ u32 features = 0;
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+ u8 *in_features = kvm_vq_features(desc);
+
+ for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+ if (in_features[i / 8] & (1 << (i % 8)))
+ features |= (1 << i);
+ return features;
+}
+
+static void kvm_finalize_features(struct virtio_device *vdev)
+{
+ unsigned int i, bits;
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+ /* Second half of bitmap is features we accept. */
+ u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
+
+ /* Give virtio_ring a chance to accept features. */
+ vring_transport_features(vdev);
+
+ memset(out_features, 0, desc->feature_len);
+ bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+ for (i = 0; i < bits; i++) {
+ if (test_bit(i, vdev->features))
+ out_features[i / 8] |= (1 << (i % 8));
+ }
+}
+
+/*
+ * Reading and writing elements in config space
+ */
+static void kvm_get(struct virtio_device *vdev, unsigned int offset,
+ void *buf, unsigned len)
+{
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+ BUG_ON(offset + len > desc->config_len);
+ memcpy(buf, kvm_vq_configspace(desc) + offset, len);
+}
+
+static void kvm_set(struct virtio_device *vdev, unsigned int offset,
+ const void *buf, unsigned len)
+{
+ struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+ BUG_ON(offset + len > desc->config_len);
+ memcpy(kvm_vq_configspace(desc) + offset, buf, len);
+}
+
+/*
+ * The operations to get and set the status word just access
+ * the status field of the device descriptor. set_status will also
+ * make a hypercall to the host, to tell about status changes
+ */
+static u8 kvm_get_status(struct virtio_device *vdev)
+{
+ return to_kvmdev(vdev)->desc->status;
+}
+
+static void kvm_set_status(struct virtio_device *vdev, u8 status)
+{
+ BUG_ON(!status);
+ to_kvmdev(vdev)->desc->status = status;
+ hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
+ * descriptor address. The Host will zero the status and all the
+ * features.
+ */
+static void kvm_reset(struct virtio_device *vdev)
+{
+ hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * When the virtio_ring code wants to notify the Host, it calls us here and we
+ * make a hypercall. We hand the address of the virtqueue so the Host
+ * knows which virtqueue we're talking about.
+ */
+static void kvm_notify(struct virtqueue *vq)
+{
+ struct kvm_vqinfo *vqi = vq->priv;
+
+ hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa);
+}
+
+/*
+ * Must set some caching mode to keep set_pte() happy.
+ * It doesn't matter what we choose, because the PFN
+ * is illegal, so we're going to take a page fault anyway.
+ */
+static inline pgprot_t io_prot(void)
+{
+ return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
+}
+
+/*
+ * This routine finds the first virtqueue described in the configuration of
+ * this device and sets it up.
+ */
+static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
+ unsigned index,
+ void (*callback)(struct virtqueue *vq),
+ const char *name)
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ struct kvm_vqinfo *vqi;
+ struct kvm_vqconfig *config;
+ struct virtqueue *vq;
+ long irq;
+ int err = -EINVAL;
+
+ if (index >= kdev->desc->num_vq)
+ return ERR_PTR(-ENOENT);
+
+ vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
+ if (!vqi)
+ return ERR_PTR(-ENOMEM);
+
+ config = kvm_vq_config(kdev->desc)+index;
+
+ vqi->config = config;
+ vqi->pages = generic_remap_prot(config->pa,
+ vring_size(config->num,
+ KVM_TILE_VIRTIO_RING_ALIGN),
+ 0, io_prot());
+ if (!vqi->pages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
+ vdev, 0, vqi->pages,
+ kvm_notify, callback, name);
+ if (!vq) {
+ err = -ENOMEM;
+ goto unmap;
+ }
+
+ /*
+ * Trigger the IPI interrupt in SW way.
+ * TODO: We do not need to create one irq for each vq. A bit wasteful.
+ */
+ irq = create_irq();
+ if (irq < 0) {
+ err = -ENXIO;
+ goto del_virtqueue;
+ }
+
+ tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
+
+ if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
+ err = -ENXIO;
+ destroy_irq(irq);
+ goto del_virtqueue;
+ }
+
+ config->irq = irq;
+
+ vq->priv = vqi;
+ return vq;
+
+del_virtqueue:
+ vring_del_virtqueue(vq);
+unmap:
+ vunmap(vqi->pages);
+out:
+ return ERR_PTR(err);
+}
+
+static void kvm_del_vq(struct virtqueue *vq)
+{
+ struct kvm_vqinfo *vqi = vq->priv;
+
+ vring_del_virtqueue(vq);
+ vunmap(vqi->pages);
+ kfree(vqi);
+}
+
+static void kvm_del_vqs(struct virtio_device *vdev)
+{
+ struct virtqueue *vq, *n;
+
+ list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+ kvm_del_vq(vq);
+}
+
+static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+ struct virtqueue *vqs[],
+ vq_callback_t *callbacks[],
+ const char *names[])
+{
+ struct kvm_device *kdev = to_kvmdev(vdev);
+ int i;
+
+ /* We must have this many virtqueues. */
+ if (nvqs > kdev->desc->num_vq)
+ return -ENOENT;
+
+ for (i = 0; i < nvqs; ++i) {
+ vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
+ if (IS_ERR(vqs[i]))
+ goto error;
+ }
+ return 0;
+
+error:
+ kvm_del_vqs(vdev);
+ return PTR_ERR(vqs[i]);
+}
+
+/*
+ * The config ops structure as defined by virtio config
+ */
+static struct virtio_config_ops kvm_vq_config_ops = {
+ .get_features = kvm_get_features,
+ .finalize_features = kvm_finalize_features,
+ .get = kvm_get,
+ .set = kvm_set,
+ .get_status = kvm_get_status,
+ .set_status = kvm_set_status,
+ .reset = kvm_reset,
+ .find_vqs = kvm_find_vqs,
+ .del_vqs = kvm_del_vqs,
+};
+
+/*
+ * The root device for the kvm virtio devices.
+ * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
+ */
+static struct device *kvm_root;
+
+/*
+ * adds a new device and register it with virtio
+ * appropriate drivers are loaded by the device model
+ */
+static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
+{
+ struct kvm_device *kdev;
+
+ kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+ if (!kdev) {
+ pr_emerg("Cannot allocate kvm dev %u type %u\n",
+ offset, d->type);
+ return;
+ }
+
+ kdev->vdev.dev.parent = kvm_root;
+ kdev->vdev.id.device = d->type;
+ kdev->vdev.config = &kvm_vq_config_ops;
+ kdev->desc = d;
+ kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
+
+ if (register_virtio_device(&kdev->vdev) != 0) {
+ pr_err("Failed to register kvm device %u type %u\n",
+ offset, d->type);
+ kfree(kdev);
+ }
+}
+
+/*
+ * scan_devices() simply iterates through the device page.
+ * The type 0 is reserved to mean "end of devices".
+ */
+static void scan_devices(void)
+{
+ unsigned int i;
+ struct kvm_device_desc *d;
+
+ for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
+ d = kvm_devices + i;
+
+ if (d->type == 0)
+ break;
+
+ add_kvm_device(d, i);
+ }
+}
+
+/*
+ * Init function for virtio.
+ * devices are in a single page above the top of "normal" mem.
+ */
+static int __init kvm_devices_init(void)
+{
+ int rc = -ENOMEM;
+
+ kvm_root = root_device_register("kvm_tile");
+ if (IS_ERR(kvm_root)) {
+ rc = PTR_ERR(kvm_root);
+ pr_err("Could not register kvm_tile root device");
+ return rc;
+ }
+
+ kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
+ 0, io_prot());
+ if (!kvm_devices) {
+ kvm_devices = NULL;
+ root_device_unregister(kvm_root);
+ return rc;
+ }
+
+ scan_devices();
+ return 0;
+}
+
+/* code for early console output with virtio_console */
+static __init int early_put_chars(u32 vtermno, const char *buf, int len)
+{
+ char scratch[512];
+
+ if (len > sizeof(scratch) - 1)
+ len = sizeof(scratch) - 1;
+ scratch[len] = '\0';
+ memcpy(scratch, buf, len);
+ hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
+
+ return len;
+}
+
+static int __init tile_virtio_console_init(void)
+{
+ return virtio_cons_early_init(early_put_chars);
+}
+console_initcall(tile_virtio_console_init);
+
+/*
+ * We do this after core stuff, but before the drivers.
+ */
+postcore_initcall(kvm_devices_init);
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..02bc446 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
addi sp, sp, -8
/* we now have a stack (whether we need one or not) */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r40, hw2_last(hv_console_putc)
shl16insli r40, r40, hw1(hv_console_putc)
shl16insli r40, r40, hw0(hv_console_putc)
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, 'r'
jalr r40
@@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
/* we should not get here */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, '?'
jalr r40
moveli r0, '\n'
jalr r40
+#endif
j .Lhalt
@@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
j .Lloop
-.Lerr: moveli r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+ moveli r0, 'e'
jalr r40
moveli r0, 'r'
jalr r40
@@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
jalr r40
moveli r0, '\n'
jalr r40
+#endif
.Lhalt:
moveli r41, hw2_last(hv_halt)
shl16insli r41, r41, hw1(hv_halt)
--
1.8.3.1
On 8/26/2013 8:04 AM, Gleb Natapov wrote:
> On Sun, Aug 25, 2013 at 09:26:47PM -0400, Chris Metcalf wrote:
>> On 8/25/2013 7:39 AM, Gleb Natapov wrote:
>>> On Mon, Aug 12, 2013 at 04:24:11PM -0400, Chris Metcalf wrote:
>>>> This change provides the initial framework support for KVM on tilegx.
>>>> Basic virtual disk and networking is supported.
>>>>
>>> This needs to be broken down to more reviewable patches.
Ping! I'd like to ask Linus to pull the tile tree now that 3.12 is open.
Let me know if this would be problematic for you.
I think the tile KVM bits are in good enough shape that it makes sense
to pull them, and if there are follow-up concerns they can be addressed
later in the 3.12-rc series, or in future kernel releases.
In particular, it's probably worth noting the one generic change in the
series, which is adding a KVM_EXIT_AGAIN value. If this seems like it
might be problematic, I can pull it for now and leave it for later review;
it's basically just a performance optimization.
The v3 series is at:
https://lkml.org/lkml/2013/8/28/675
https://lkml.org/lkml/2013/8/28/672
https://lkml.org/lkml/2013/8/28/676
Thank you!
--
Chris Metcalf, Tilera Corp.
http://www.tilera.com
On Tue, Sep 03, 2013 at 01:32:22PM -0400, Chris Metcalf wrote:
> On 8/26/2013 8:04 AM, Gleb Natapov wrote:
> > On Sun, Aug 25, 2013 at 09:26:47PM -0400, Chris Metcalf wrote:
> >> On 8/25/2013 7:39 AM, Gleb Natapov wrote:
> >>> On Mon, Aug 12, 2013 at 04:24:11PM -0400, Chris Metcalf wrote:
> >>>> This change provides the initial framework support for KVM on tilegx.
> >>>> Basic virtual disk and networking is supported.
> >>>>
> >>> This needs to be broken down to more reviewable patches.
>
> Ping! I'd like to ask Linus to pull the tile tree now that 3.12 is open.
> Let me know if this would be problematic for you.
>
That was sent too close to the merge window, and Paolo and I had
vacations in August, so with the email backlog we had no time to review
it. Let's wait for 3.13 and I will give its review top priority after
the merge window.
> I think the tile KVM bits are in good enough shape that it makes sense
> to pull them, and if there are follow-up concerns they can be addressed
> later in the 3.12-rc series, or in future kernel releases.
>
> In particular, it's probably worth noting the one generic change in the
> series, which is adding a KVM_EXIT_AGAIN value. If this seems like it
> might be problematic, I can pull it for now and leave it for later review;
> it's basically just a performance optimization.
>
> The v3 series is at:
>
> https://lkml.org/lkml/2013/8/28/675
> https://lkml.org/lkml/2013/8/28/672
> https://lkml.org/lkml/2013/8/28/676
>
> Thank you!
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
--
Gleb.
On 9/3/2013 1:39 PM, Gleb Natapov wrote:
> On Tue, Sep 03, 2013 at 01:32:22PM -0400, Chris Metcalf wrote:
>> Ping! I'd like to ask Linus to pull the tile tree now that 3.12 is open.
>> Let me know if this would be problematic for you.
>>
> That was sent too close to the merge window, and Paolo and I had
> vacations in August, so with the email backlog we had no time to review
> it. Let's wait for 3.13 and I will give its review top priority after
> the merge window.
Thanks, I've pulled it from linux-tile for now. I'll put it back (for linux-next) after Linus pulls the remainder of the tree.
--
Chris Metcalf, Tilera Corp.
http://www.tilera.com
The KVM support for tile requires additional review that is out of scope
for 3.12, but some of the changes included with the KVM code are really
just prerequisite reorganization or re-parameterization of existing code.
Gleb Natapov had previously requested finer-grained KVM patches, and with
an eye toward simplifying eventual code review I realized there was
more that could be split out into separate patches.
Rather than making these changes part of the eventual KVM review, I've
split them out here into some smaller pieces that don't require review
by the KVM team, since they are essentially internal rework within the
tile architecture. This will also simplify ongoing three-way merges.
I will defer the pull request for linux-tile for a few days in case
anyone has comments on this batch of changes.
Chris Metcalf (3):
tile: clean up relocate_kernel_64 debug code
tile: don't assume user privilege is zero
tile: parameterize VA and PA space more cleanly
arch/tile/include/asm/page.h | 52 ++++++++++++++---------------------
arch/tile/include/asm/pgtable_32.h | 2 +-
arch/tile/include/asm/pgtable_64.h | 3 +-
arch/tile/include/asm/processor.h | 6 ++--
arch/tile/include/asm/ptrace.h | 2 +-
arch/tile/kernel/head_32.S | 4 +--
arch/tile/kernel/head_64.S | 6 ++--
arch/tile/kernel/intvec_32.S | 6 ++--
arch/tile/kernel/intvec_64.S | 31 +++++++++++++++------
arch/tile/kernel/relocate_kernel_64.S | 9 ++++--
arch/tile/kernel/setup.c | 8 +++---
arch/tile/kernel/stack.c | 2 +-
arch/tile/kernel/traps.c | 2 +-
arch/tile/kernel/vmlinux.lds.S | 10 +++----
arch/tile/mm/fault.c | 4 +--
arch/tile/mm/init.c | 8 +++---
16 files changed, 81 insertions(+), 74 deletions(-)
--
1.8.3.1
We remove some debug code in relocate_kernel_64.S that made raw
calls to the hv_console_putc Tilera hypervisor API, since everything
should funnel through the early_hv_write() API.
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/relocate_kernel_64.S | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..02bc446 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
addi sp, sp, -8
/* we now have a stack (whether we need one or not) */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r40, hw2_last(hv_console_putc)
shl16insli r40, r40, hw1(hv_console_putc)
shl16insli r40, r40, hw0(hv_console_putc)
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, 'r'
jalr r40
@@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
/* we should not get here */
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
moveli r0, '?'
jalr r40
moveli r0, '\n'
jalr r40
+#endif
j .Lhalt
@@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
j .Lloop
-.Lerr: moveli r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+ moveli r0, 'e'
jalr r40
moveli r0, 'r'
jalr r40
@@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
jalr r40
moveli r0, '\n'
jalr r40
+#endif
.Lhalt:
moveli r41, hw2_last(hv_halt)
shl16insli r41, r41, hw1(hv_halt)
--
1.8.3.1
Technically, user privilege is anything less than kernel privilege.
We modify the existing user_mode() macro to reflect this semantic (and
use it in a couple of places where it wasn't being used before), and
add a corresponding IS_KERNEL_EX1() macro for the assembly code as
well.
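As a quick illustration of the new semantics (not part of the patch
itself): with CONFIG_KERNEL_PL == 2, the C-side test and the
assembly-side IS_KERNEL_EX1() agree, and both rely on the fact that
PL 3 never occurs:

    /* C side, from <asm/ptrace.h> after this change: */
    #define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)

    /*
     * Worked example with CONFIG_KERNEL_PL == 2:
     *   EX1_PL == 0  ->  user_mode() true;  andi with 2 gives 0
     *   EX1_PL == 1  ->  user_mode() true;  andi with 2 gives 0
     *   EX1_PL == 2  ->  user_mode() false; andi with 2 gives 2 (non-zero)
     * The ICS bit sits above the PL field, so the andi ignores it.
     */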
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/processor.h | 4 ++--
arch/tile/include/asm/ptrace.h | 2 +-
arch/tile/kernel/intvec_64.S | 23 +++++++++++++++++------
arch/tile/kernel/stack.c | 2 +-
arch/tile/mm/fault.c | 4 ++--
5 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 230b830..c72fcba 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
#ifndef _ASM_TILE_PROCESSOR_H
#define _ASM_TILE_PROCESSOR_H
+#include <arch/chip.h>
+
#ifndef __ASSEMBLY__
/*
@@ -25,7 +27,6 @@
#include <asm/ptrace.h>
#include <asm/percpu.h>
-#include <arch/chip.h>
#include <arch/spr_def.h>
struct task_struct;
@@ -347,7 +348,6 @@ extern int kdata_huge;
/*
* Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
*/
#define USER_PL 0
#if CONFIG_KERNEL_PL == 2
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 0d25c21..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
#define user_stack_pointer(regs) ((regs)->sp)
/* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
/* Fill in a struct pt_regs with the current kernel registers. */
struct pt_regs *get_pt_regs(struct pt_regs *);
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 3b35bb4..f020f01 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -34,6 +34,16 @@
#define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set). Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
.macro push_reg reg, ptr=sp, delta=-8
{
@@ -308,7 +318,7 @@ intvec_\vecname:
*/
{
blbs sp, 2f
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r0, r0)
}
.ifc \vecnum, INT_DOUBLE_FAULT
@@ -641,11 +651,12 @@ intvec_\vecname:
/*
* If we will be returning to the kernel, we will need to
* reset the interrupt masks to the state they had before.
- * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+ * Set DISABLE_IRQ in flags iff we came from kernel pl with
+ * irqs disabled.
*/
mfspr r32, SPR_EX_CONTEXT_K_1
{
- andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r22, r22)
PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
}
beqzt r32, 1f /* zero if from user space */
@@ -812,7 +823,7 @@ STD_ENTRY(interrupt_return)
PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
}
ld r29, r29
- andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(r29, r29)
{
beqzt r29, .Lresume_userspace
move r29, sp
@@ -936,7 +947,7 @@ STD_ENTRY(interrupt_return)
PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
}
{
- andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+ IS_KERNEL_EX1(r0, r0)
ld r32, r32
}
bnez r0, 1f
@@ -1007,7 +1018,7 @@ STD_ENTRY(interrupt_return)
pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
{
mtspr SPR_EX_CONTEXT_K_1, lr
- andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
+ IS_KERNEL_EX1(lr, lr)
}
{
mtspr SPR_EX_CONTEXT_K_0, r21
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 24fd223..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
p->sp >= sp) {
if (kbt->verbose)
pr_err(" <%s while in kernel mode>\n", fault);
- } else if (EX1_PL(p->ex1) == USER_PL &&
+ } else if (user_mode(p) &&
p->sp < PAGE_OFFSET && p->sp != 0) {
if (kbt->verbose)
pr_err(" <%s while in user mode>\n", fault);
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 64eec3f..39c48cb 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
(write ? FAULT_FLAG_WRITE : 0));
- is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+ is_kernel_mode = !user_mode(regs);
tsk = validate_current();
@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
}
#if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
- if (EX1_PL(regs->ex1) != USER_PL) {
+ if (!user_mode(regs)) {
struct async_tlb *async;
switch (fault_num) {
#if CHIP_HAS_TILE_DMA()
--
1.8.3.1
The existing code relied on the hardware definition (<arch/chip.h>)
to specify how much VA and PA space was available. It's convenient
to allow customizing this for some configurations, so provide symbols
MAX_PA_WIDTH and MAX_VA_WIDTH in <asm/page.h> that can be modified
if desired.
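A minimal sketch of the intent (the definitions themselves are in the
<asm/page.h> hunk below): a configuration that wants a smaller address
space only needs to override the two width macros, and the derived
constants follow automatically; the KVM guest support later in this
series does exactly that by halving both widths.

    /* Default: use the full hardware widths; a config may override these. */
    #define MAX_PA_WIDTH CHIP_PA_WIDTH()
    #define MAX_VA_WIDTH CHIP_VA_WIDTH()

    /* Derived values are now phrased in terms of the overridable widths. */
    #define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
    #define PAGE_OFFSET         (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))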
Additionally, move away from the MEM_XX_INTRPT nomenclature for
defining the start of the various regions within the VA space. The
cleaner symbol is, for example, MEM_SV_START, indicating the start of
the area used for supervisor code; the exact address of the interrupt
vectors is less important and can be changed if desired. As part of
this change, convert from the "intrpt1" nomenclature (which baked in
the old privilege-level 1 model) to a simple "intrpt".
Also strip out some tilepro-specific code that allowed the kernel's PL
to be modified, since we only support running at different PLs on
tilegx, not tilepro.
Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/page.h | 52 +++++++++++++++-----------------------
arch/tile/include/asm/pgtable_32.h | 2 +-
arch/tile/include/asm/pgtable_64.h | 3 +--
arch/tile/include/asm/processor.h | 2 +-
arch/tile/kernel/head_32.S | 4 +--
arch/tile/kernel/head_64.S | 6 ++---
arch/tile/kernel/intvec_32.S | 6 ++---
arch/tile/kernel/intvec_64.S | 8 +++---
arch/tile/kernel/setup.c | 8 +++---
arch/tile/kernel/traps.c | 2 +-
arch/tile/kernel/vmlinux.lds.S | 10 ++++----
arch/tile/mm/init.c | 8 +++---
12 files changed, 51 insertions(+), 60 deletions(-)
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index b4f96c0..980843d 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -148,8 +148,12 @@ static inline __attribute_const__ int get_order(unsigned long size)
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif
+/* Allow overriding how much VA or PA the kernel will use. */
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+
/* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
#define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
#define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
#define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -160,7 +164,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
* We reserve the lower half of memory for user-space programs, and the
* upper half for system code. We re-map all of physical memory in the
* upper half, which takes a quarter of our VA space. Then we have
- * the vmalloc regions. The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions. The supervisor code lives at the highest address,
* with the hypervisor above that.
*
* Loadable kernel modules are placed immediately after the static
@@ -172,26 +176,19 @@ static inline __attribute_const__ int get_order(unsigned long size)
* Similarly, for now we don't play any struct page mapping games.
*/
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
# error Too much PA to map with the VA available!
#endif
-#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
-#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */
-#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */
-#define PAGE_OFFSET MEM_HIGH_START
-#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */
+#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */
+#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
#define _VMALLOC_START FIXADDR_TOP
-#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT MEM_SV_START
-#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */
#define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR MEM_SV_START
#else /* !__tilegx__ */
@@ -213,25 +210,18 @@ static inline __attribute_const__ int get_order(unsigned long size)
* values, and after that, we show "typical" values, since the actual
* addresses depend on kernel #defines.
*
- * MEM_HV_INTRPT 0xfe000000
- * MEM_SV_INTRPT (kernel code) 0xfd000000
+ * MEM_HV_START 0xfe000000
+ * MEM_SV_START (kernel code) 0xfd000000
* MEM_USER_INTRPT (user vector) 0xfc000000
- * FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR)
- * PKMAP_BASE 0xf7000000 (via LAST_PKMAP)
- * HUGE_VMAP 0xf3000000 (via CONFIG_NR_HUGE_VMAPS)
- * VMALLOC_START 0xf0000000 (via __VMALLOC_RESERVE)
+ * FIX_KMAP_xxx 0xfa000000 (via NR_CPUS * KM_TYPE_NR)
+ * PKMAP_BASE 0xf9000000 (via LAST_PKMAP)
+ * VMALLOC_START 0xf7000000 (via VMALLOC_RESERVE)
* mapped LOWMEM 0xc0000000
*/
#define MEM_USER_INTRPT _AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT _AC(0xfd000000, UL)
-#define MEM_HV_INTRPT _AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT _AC(0xfd000000, UL)
-#define MEM_SV_INTRPT _AC(0xfe000000, UL)
-#define MEM_HV_INTRPT _AC(0xff000000, UL)
-#endif
+#define MEM_SV_START _AC(0xfd000000, UL)
+#define MEM_HV_START _AC(0xfe000000, UL)
#define INTRPT_SIZE 0x4000
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index e5bdc0e..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; }
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_INTRPT;
+ return addr >= MEM_HV_START;
}
/*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 7cb8d35..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
/* We don't define any pgds for these addresses. */
static inline int pgd_addr_invalid(unsigned long addr)
{
- return addr >= MEM_HV_START ||
- (addr > MEM_LOW_END && addr < MEM_HIGH_START);
+ return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
}
/*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index c72fcba..5aa5431 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -168,7 +168,7 @@ struct thread_struct {
#ifndef __ASSEMBLY__
#ifdef __tilegx__
-#define TASK_SIZE_MAX (MEM_LOW_END + 1)
+#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1))
#else
#define TASK_SIZE_MAX PAGE_OFFSET
#endif
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index f3f17b0..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
.set addr, addr + PGDIR_SIZE
.endr
- /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
- PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+ /* The true text VAs are mapped as VA = PA + MEM_SV_START */
+ PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
(1 << (HV_PTE_INDEX_EXECUTABLE - 32))
.org swapper_pg_dir + PGDIR_SIZE
END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 652b814..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -135,9 +135,9 @@ ENTRY(_start)
1:
/* Install the interrupt base. */
- moveli r0, hw2_last(MEM_SV_START)
- shl16insli r0, r0, hw1(MEM_SV_START)
- shl16insli r0, r0, hw0(MEM_SV_START)
+ moveli r0, hw2_last(intrpt_start)
+ shl16insli r0, r0, hw1(intrpt_start)
+ shl16insli r0, r0, hw0(intrpt_start)
mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
/* Get our processor number and save it away in SAVE_K_0. */
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f3d26f4..f084f1c 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -353,7 +353,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -1890,8 +1890,8 @@ int_unalign:
push_extra_callee_saves r0
j do_trap
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index f020f01..c3a2335 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -535,7 +535,7 @@ intvec_\vecname:
#ifdef __COLLECT_LINKER_FEEDBACK__
.pushsection .text.intvec_feedback,"ax"
.org (\vecnum << 5)
- FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+ FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
jrp lr
.popsection
#endif
@@ -1485,8 +1485,10 @@ STD_ENTRY(fill_ra_stack)
__int_hand \vecnum, \vecname, \c_routine, \processing
.endm
-/* Include .intrpt1 array of interrupt vectors */
- .section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+ .section ".intrpt", "ax"
+ .global intrpt_start
+intrpt_start:
#define op_handle_perf_interrupt bad_intr
#define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 774e819..1021784 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
/*
* Determine for each controller where its lowmem is mapped and how much of
* it is mapped there. On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
* start our data mappings higher up, but for now we don't bother, to avoid
* additional confusion.
*
@@ -1242,7 +1242,7 @@ static void __init validate_va(void)
#ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */
/*
* Similarly, make sure we're only using allowed VAs.
- * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+ * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
* and 0 .. KERNEL_HIGH_VADDR.
* In addition, make sure we CAN'T use the end of memory, since
* we use the last chunk of each pgd for the pgd_list.
@@ -1257,7 +1257,7 @@ static void __init validate_va(void)
if (range.size == 0)
break;
if (range.start <= MEM_USER_INTRPT &&
- range.start + range.size >= MEM_HV_INTRPT)
+ range.start + range.size >= MEM_HV_START)
user_kernel_ok = 1;
if (range.start == 0)
max_va = range.size;
@@ -1693,7 +1693,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
static int __init request_standard_resources(void)
{
int i;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if defined(CONFIG_PCI) && !defined(__tilegx__)
insert_non_bus_resource();
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index cfff6f9..628661f 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -30,7 +30,7 @@
void __init trap_init(void)
{
- /* Nothing needed here since we link code at .intrpt1 */
+ /* Nothing needed here since we link code at .intrpt */
}
int unaligned_fixup = 1;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index c7ae53d..8b20163 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
#include <hv/hypervisor.h>
/* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
OUTPUT_ARCH(tile)
ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
PHDRS
{
- intrpt1 PT_LOAD ;
+ intrpt PT_LOAD ;
text PT_LOAD ;
data PT_LOAD ;
}
@@ -24,11 +24,11 @@ SECTIONS
#define LOAD_OFFSET TEXT_OFFSET
/* Interrupt vectors */
- .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
+ .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */
{
_text = .;
- *(.intrpt1)
- } :intrpt1 =0
+ *(.intrpt)
+ } :intrpt =0
/* Hypervisor call vectors */
. = ALIGN(0x10000);
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
{
int cpu;
unsigned long page;
- enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+ enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
#if CHIP_HAS_CBOX_HOME_MAP()
/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
}
}
- address = MEM_SV_INTRPT;
+ address = MEM_SV_START;
pmd = get_pmd(pgtables, address);
pfn = 0; /* code starts at PA 0 */
if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
void free_initmem(void)
{
- const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+ const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
/*
* Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)
/*
* Free the pages mapped from 0xc0000000 that correspond to code
- * pages from MEM_SV_INTRPT that we won't use again after init.
+ * pages from MEM_SV_START that we won't use again after init.
*/
free_init_pages("unused kernel text",
(unsigned long)_sinittext - text_delta,
--
1.8.3.1
On Wed, Aug 28, 2013 at 03:45:50PM -0400, Chris Metcalf wrote:
> This commit enables the host side of KVM support for tilegx.
>
> KVM support on tilegx presumes a client that runs at privilege level 1
> (PL1), above normal user programs, which continue to run at PL0, but
> below the normal (host) Linux, which runs at PL2. Omitting all "trap
> and emulate" support both simplifies the host as well as allowing a
> paravirtualized guest to run with better overall performance; we may
> in the future elect to add emulation support.
>
> We don't support huge pages, or any of the tile-specific APIs that
> require being locked to specific cores (e.g., UDN hardwall).
>
> The Tilera booter/hypervisor (which runs at PL3) has been extended to
> support a guest context (used for interrupts at PL1) as well as a
> virtualization context that is applied to map guest PAs to real PAs.
> Note that the eventual plan for the Tilera software stack is to
> migrate away from having the Tilera booter provide PL3 functionality
> like this and instead provide it all in the host kernel at PL2.
> Meanwhile, the kvm host provides the Tilera hypervisor API to the
> paravirtualized guest.
>
> The commit adds a KVM_EXIT_xxx code, KVM_EXIT_AGAIN, which is used to
> exit out to the host kernel, but not all the way out to qemu. This is
> helpful if we are trying to handle resched, sigpending, etc., but don't
> need to end up back in userspace first.
>
I think there is some confusion here about how things are supposed to work.
KVM_EXIT_xxx defines are only meant to be meaningful to userspace; they
are never used internally by KVM. So KVM_EXIT_AGAIN, as defined above,
does not make sense. Looking at the code I see that you've reused those
defines for vmexit codes too, and this is incorrect. On platforms with HW
virt support, vmexit codes are defined by the CPU architecture (and there
are many more vmexit codes than userspace exit codes); PV platforms define
their own interface.
See inline for more comments.
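To illustrate the split (all names below are hypothetical and not from this
patch): userspace-visible exit reasons stay in the UAPI header, while the
reasons for leaving the guest inside the host kernel belong in a separate,
arch-private enumeration, roughly:
	/* include/uapi/linux/kvm.h: only values reported to userspace
	 * via kvm_run->exit_reason, e.g.: */
	#define KVM_EXIT_MMIO 6
	/* arch-private header (hypothetical): codes used only inside the host. */
	enum tile_vmexit {
		TILE_VMEXIT_HCALL,	/* guest issued a hypercall */
		TILE_VMEXIT_AGAIN,	/* re-enter guest after resched/signal work */
	};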
> Signed-off-by: Chris Metcalf <[email protected]>
> ---
> arch/tile/Kconfig | 2 +-
> arch/tile/Makefile | 1 +
> arch/tile/include/asm/io.h | 2 +
> arch/tile/include/asm/kvm.h | 29 +
> arch/tile/include/asm/kvm_host.h | 95 ++
> arch/tile/include/asm/processor.h | 4 +-
> arch/tile/include/asm/ptrace.h | 2 +-
> arch/tile/include/asm/thread_info.h | 17 +-
> arch/tile/include/hv/hypervisor.h | 183 +++-
> arch/tile/include/uapi/arch/sim.h | 19 +
> arch/tile/include/uapi/arch/sim_def.h | 8 +
> arch/tile/include/uapi/arch/spr_def_32.h | 15 +
> arch/tile/include/uapi/arch/spr_def_64.h | 25 +
> arch/tile/include/uapi/asm/Kbuild | 1 +
> arch/tile/include/uapi/asm/kvm.h | 262 +++++
> arch/tile/kernel/asm-offsets.c | 7 +
> arch/tile/kernel/hvglue.S | 7 +-
> arch/tile/kernel/hvglue_trace.c | 14 +
> arch/tile/kernel/intvec_32.S | 12 +-
> arch/tile/kernel/intvec_64.S | 192 +++-
> arch/tile/kernel/process.c | 38 +-
> arch/tile/kernel/setup.c | 13 +
> arch/tile/kernel/smp.c | 28 +-
> arch/tile/kernel/stack.c | 2 +-
> arch/tile/kvm/Kconfig | 3 -
> arch/tile/kvm/Makefile | 12 +
> arch/tile/kvm/entry.S | 91 ++
> arch/tile/kvm/kvm-tile.c | 1529 ++++++++++++++++++++++++++++++
> arch/tile/lib/exports.c | 20 +-
> arch/tile/mm/fault.c | 4 +-
> arch/tile/mm/pgtable.c | 35 +-
> include/uapi/linux/kvm.h | 1 +
> virt/kvm/kvm_main.c | 3 +-
> 33 files changed, 2558 insertions(+), 118 deletions(-)
> create mode 100644 arch/tile/include/asm/kvm.h
> create mode 100644 arch/tile/include/asm/kvm_host.h
> create mode 100644 arch/tile/include/uapi/asm/kvm.h
> create mode 100644 arch/tile/kvm/Makefile
> create mode 100644 arch/tile/kvm/entry.S
> create mode 100644 arch/tile/kvm/kvm-tile.c
>
> diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
> index ecff467..3bc8fb7 100644
> --- a/arch/tile/Kconfig
> +++ b/arch/tile/Kconfig
> @@ -5,7 +5,6 @@ config TILE
> def_bool y
> select HAVE_DMA_ATTRS
> select HAVE_DMA_API_DEBUG
> - select HAVE_KVM if !TILEGX
> select GENERIC_FIND_FIRST_BIT
> select SYSCTL_EXCEPTION_TRACE
> select USE_GENERIC_SMP_HELPERS
> @@ -127,6 +126,7 @@ config TILEGX
> select HAVE_FTRACE_MCOUNT_RECORD
> select HAVE_KPROBES
> select HAVE_KRETPROBES
> + select HAVE_KVM
>
> config TILEPRO
> def_bool !TILEGX
> diff --git a/arch/tile/Makefile b/arch/tile/Makefile
> index 3d15364..8e7f852 100644
> --- a/arch/tile/Makefile
> +++ b/arch/tile/Makefile
> @@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH)
>
> # See arch/tile/Kbuild for content of core part of the kernel
> core-y += arch/tile/
> +core-$(CONFIG_KVM) += arch/tile/kvm/
>
> core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
>
> diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
> index 9fe4349..023659b 100644
> --- a/arch/tile/include/asm/io.h
> +++ b/arch/tile/include/asm/io.h
> @@ -43,6 +43,8 @@
> * long before casting it to a pointer to avoid compiler warnings.
> */
> #if CHIP_HAS_MMIO()
> +extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
> + unsigned long flags, pgprot_t prot);
> extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
> extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
> pgprot_t pgprot);
> diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
> new file mode 100644
> index 0000000..2ea6c41
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm.h
> @@ -0,0 +1,29 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _ASM_TILE_KVM_H
> +#define _ASM_TILE_KVM_H
> +
> +#include <hv/hypervisor.h>
> +#include <uapi/asm/kvm.h>
> +
> +#ifndef __ASSEMBLER__
> +/* For hv_*() */
> +#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
> +#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
> +#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
> +#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
> +/* For others */
> +#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
> +#endif
> +#endif /* _ASM_TILE_KVM_H */
> diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
> new file mode 100644
> index 0000000..1c9b6cd
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm_host.h
> @@ -0,0 +1,95 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _ASM_TILE_KVM_HOST_H
> +#define _ASM_TILE_KVM_HOST_H
> +
> +#define KVM_MAX_VCPUS 64
> +#define KVM_USER_MEM_SLOTS 32
> +#define KVM_PRIVATE_MEM_SLOTS 4
> +
> +/* For now, claim we have no huge pages. */
> +#define KVM_HPAGE_GFN_SHIFT(x) 0
> +#define KVM_NR_PAGE_SIZES 1
> +#define KVM_PAGES_PER_HPAGE(x) 1
> +
> +/* Max number of message tags for hv_send/receive_message() */
> +#define MAX_MSG_TAG (sizeof(unsigned long) * 8)
> +
> +/* Bits in pending_downcalls */
> +#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <linux/types.h>
> +#include <linux/ptrace.h>
> +
> +struct kvm_vcpu_stat {
> + u32 halt_wakeup;
> +};
> +
> +struct kvm_vcpu_arch {
> + struct pt_regs regs;
> + struct kvm_sregs sregs;
> + unsigned long host_sp; /* Host "real" sp during vmresume. */
> + HV_Context guest_context;
> + unsigned long pending_msgs; /* Pending guest messages */
> + unsigned long ipi_events; /* Pending guest ipi events. */
> + unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
> + pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
> + unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */
> + int suspended; /* true for cores not yet started by host */
> + unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */
> + unsigned long vmexit_cycles; /* cycle count of last vmexit */
> +};
> +
> +struct kvm_vm_stat {
> + u32 remote_tlb_flush;
> +};
> +
> +struct kvm_arch_memory_slot {
> +};
> +
> +struct kvm_arch {
> + pgd_t *vpgd;
> + unsigned long resv_gpa_start; /* For special purpose. */
> + struct completion smp_start;
> +};
> +
> +struct kvm_vcpu;
> +
> +extern void kvm_vmresume(struct pt_regs *guest,
> + unsigned long *host_sp_ptr);
> +extern void kvm_vmexit(unsigned long host_sp);
> +extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
> +extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
> +extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
> + unsigned long, unsigned long);
> +extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
> +
> +#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
> +
> +#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
> +
> +#define gpmd_offset(kvm, pud, address) \
> + ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
> +
> +#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
> +
> +#define gpte_offset_kernel(kvm, pmd, address) \
> + ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
> +
I can't find where those four defines are used, but in case they are, the
comment about gfn_to_pfn() below applies here too.
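The gfn_to_pfn() remark referenced there is not quoted in this excerpt; as
general context only (an assumption, not necessarily the reviewer's exact
point), any translation these macros perform should be validated before
use, along the lines of:
	/* Sketch, not from the patch: check the HVA returned by gfn_to_hva()
	 * before dereferencing it. */
	static pte_t *gpte_offset_checked(struct kvm *kvm, pmd_t pmd,
					  unsigned long address)
	{
		unsigned long hva = gfn_to_hva(kvm, pmd_pfn(pmd));
		if (kvm_is_error_hva(hva))
			return NULL;
		return (pte_t *)hva + pte_index(address);
	}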
> +#endif /* __ASSEMBLY__*/
> +
> +#endif /* _ASM_TILE_KVM_HOST_H */
> diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
> index 230b830..c72fcba 100644
> --- a/arch/tile/include/asm/processor.h
> +++ b/arch/tile/include/asm/processor.h
> @@ -15,6 +15,8 @@
> #ifndef _ASM_TILE_PROCESSOR_H
> #define _ASM_TILE_PROCESSOR_H
>
> +#include <arch/chip.h>
> +
> #ifndef __ASSEMBLY__
>
> /*
> @@ -25,7 +27,6 @@
> #include <asm/ptrace.h>
> #include <asm/percpu.h>
>
> -#include <arch/chip.h>
> #include <arch/spr_def.h>
>
> struct task_struct;
> @@ -347,7 +348,6 @@ extern int kdata_huge;
>
> /*
> * Provide symbolic constants for PLs.
> - * Note that assembly code assumes that USER_PL is zero.
> */
> #define USER_PL 0
> #if CONFIG_KERNEL_PL == 2
> diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
> index 0d25c21..b9620c0 100644
> --- a/arch/tile/include/asm/ptrace.h
> +++ b/arch/tile/include/asm/ptrace.h
> @@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
> #define user_stack_pointer(regs) ((regs)->sp)
>
> /* Does the process account for user or for system time? */
> -#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
> +#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
>
> /* Fill in a struct pt_regs with the current kernel registers. */
> struct pt_regs *get_pt_regs(struct pt_regs *);
> diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
> index b8aa6df..1c26cdf 100644
> --- a/arch/tile/include/asm/thread_info.h
> +++ b/arch/tile/include/asm/thread_info.h
> @@ -18,7 +18,9 @@
>
> #include <asm/processor.h>
> #include <asm/page.h>
> +
> #ifndef __ASSEMBLY__
> +struct kvm_vcpu;
>
> /*
> * Low level task data that assembly code needs immediate access to.
> @@ -44,6 +46,9 @@ struct thread_info {
> unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */
> void __user *unalign_jit_base; /* unalign fixup JIT base */
> #endif
> +#ifdef CONFIG_KVM
> + struct kvm_vcpu *vcpu; /* vcpu during vmresume */
> +#endif
> };
>
> /*
> @@ -117,8 +122,8 @@ extern void _cpu_idle(void);
>
> /*
> * Thread information flags that various assembly files may need to access.
> - * Keep flags accessed frequently in low bits, particular since it makes
> - * it easier to build constants in assembly.
> + * Keep flags accessed frequently in low bits, since it makes it
> + * easier to build constants in assembly.
> */
> #define TIF_SIGPENDING 0 /* signal pending */
> #define TIF_NEED_RESCHED 1 /* rescheduling necessary */
> @@ -131,6 +136,7 @@ extern void _cpu_idle(void);
> #define TIF_MEMDIE 7 /* OOM killer at work */
> #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */
> #define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */
> +#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */
>
> #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
> #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
> @@ -142,11 +148,12 @@ extern void _cpu_idle(void);
> #define _TIF_MEMDIE (1<<TIF_MEMDIE)
> #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
> #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
> +#define _TIF_VIRT_EXIT (1<<TIF_VIRT_EXIT)
>
> /* Work to do on any return to user space. */
> -#define _TIF_ALLWORK_MASK \
> - (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
> - _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
> +#define _TIF_ALLWORK_MASK \
> + (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP| \
> + _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)
>
> /* Work to do at syscall entry. */
> #define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
> diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
> index f71b08e..71abe38 100644
> --- a/arch/tile/include/hv/hypervisor.h
> +++ b/arch/tile/include/hv/hypervisor.h
> @@ -321,6 +321,18 @@
> /** hv_set_speed */
> #define HV_DISPATCH_SET_SPEED 58
>
> +/** hv_install_virt_context */
> +#define HV_DISPATCH_INSTALL_VIRT_CONTEXT 59
> +
> +/** hv_inquire_virt_context */
> +#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT 60
> +
> +/** hv_install_guest_context */
> +#define HV_DISPATCH_INSTALL_GUEST_CONTEXT 61
> +
> +/** hv_inquire_guest_context */
> +#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT 62
> +
> /** hv_console_set_ipi */
> #define HV_DISPATCH_CONSOLE_SET_IPI 63
>
> @@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
> * new page table does not need to contain any mapping for the
> * hv_install_context address itself.
> *
> - * At most one HV_CTX_PG_SM_* flag may be specified in "flags";
> + * At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
> * if multiple flags are specified, HV_EINVAL is returned.
> * Specifying none of the flags results in using the default page size.
> * All cores participating in a given client must request the same
> * page size, or the results are undefined.
> *
> + * To disable an installed page table, install HV_CTX_NONE. The access
> + * and asid fields are ignored.
> + *
> * @param page_table Root of the page table.
> * @param access PTE providing info on how to read the page table. This
> * value must be consistent between multiple tiles sharing a page table,
> @@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
>
> #endif /* !__ASSEMBLER__ */
>
> +#define HV_CTX_NONE ((HV_PhysAddr)-1) /**< Disable page table. */
> +
> #define HV_CTX_DIRECTIO 0x1 /**< Direct I/O requests are accepted from
> PL0. */
>
> +#define HV_CTX_GUEST_CACHE 0x4 /**< Let guest control caching flags (only
> + usable with hv_install_virt_context.) */
> +
> #define HV_CTX_PG_SM_4K 0x10 /**< Use 4K small pages, if available. */
> #define HV_CTX_PG_SM_16K 0x20 /**< Use 16K small pages, if available. */
> #define HV_CTX_PG_SM_64K 0x40 /**< Use 64K small pages, if available. */
> #define HV_CTX_PG_SM_MASK 0xf0 /**< Mask of all possible small pages. */
>
> +
> #ifndef __ASSEMBLER__
>
> +/** Install a virtualization context.
> + *
> + * When a virtualization context is installed, all faults from PL0 or
> + * PL1 are handled via a "guest context" and then post-processed by
> + * the "virtualization context"; faults at PL2 are still handled by
> + * the normal context. For guest faults, the "guest PAs" produced by
> + * the guest page table are passed through the virtualization page
> + * table as pseudo-VAs, generating the true CPA as a result. See the
> + * individual HV_PTE_xxx bits for the effect the bits have when
> + * present in the virtualization page table. The ASID is currently
> + * ignored in this syscall, but it might be used later, so the API
> + * includes it. The HV_CTX_GUEST_CACHE flag indicates that all
> + * cache-related flags should be taken from the primary page table,
> + * not the virtualization page table.
> + *
> + * Once the virtualization context is installed, a guest context
> + * should also be installed; otherwise a VA-equals-PA context will be
> + * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
> + * the virtualization context to generate CPAs.
> + *
> + * When entering client PL after being at guest or user PL, the
> + * client is expected to call hv_flush_all() to clear any TLB mappings
> + * that might otherwise conflict. Similarly, hv_flush_all() should
> + * be called before returning to guest or user PL with a virtualization
> + * context installed, so that any TLB mappings are cleared. Future
> + * work may include adding a "vpid" or similar namespace so that
> + * the TLBs may be managed independently.
> + *
> + * Subsequent guest page table installations will have their root PA
> + * and PTE cached after translating through the virtualization
> + * context, so if entries in the virtualization page table are
> + * modified or removed, the guest context should be re-installed.
> + * This, in conjunction with flushing the TLB on return to the guest,
> + * will ensure that the new virtualization entries are honored.
> + *
> + * @param page_table Root of the page table.
> + * @param access PTE providing info on how to read the page table. This
> + * value must be consistent between multiple tiles sharing a page table,
> + * and must also be consistent with any virtual mappings the client
> + * may be using to access the page table.
> + * @param asid HV_ASID the page table is to be used for (currently ignored).
> + * @param flags Context flags, denoting attributes or privileges of the
> + * current virtualization context (see below).
> + * @return Zero on success, or a hypervisor error code on failure.
> + */
> +
> +int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
> + HV_ASID asid, __hv32 flags);
> +
> +
> +
> +/** Install a guest context.
> + *
> + * The guest context is only consulted when a virtualization context
> + * is also installed, and for faults that occur below the client's PL.
> + * If no guest context is installed, in such a case, a VA=PA context
> + * is used instead.
> + *
> + * The access PTE will only be honored if the virtualization table was
> + * installed with HV_CTX_GUEST_CACHE.
> + *
> + * A virtualization context must already be installed prior to
> + * installing the guest context.
> + *
> + * @param page_table Root of the page table; the value is the guest's
> + * physical address (GPA), not a CPA.
> + * @param access PTE providing info on how to read the page table. This
> + * value must be consistent between multiple tiles sharing a page table,
> + * and must also be consistent with any virtual mappings the client
> + * may be using to access the page table.
> + * @param asid HV_ASID the page table is to be used for.
> + * @param flags Context flags, denoting attributes or privileges of the
> + * current context (HV_CTX_xxx).
> + * @return Zero on success, or a hypervisor error code on failure.
> + */
> +
> +int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
> + HV_ASID asid, __hv32 flags);
> +
>
> /** Set the number of pages ganged together by HV_PTE_SUPER at a
> * particular level of the page table.
> @@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
> * "super" page size must be less than the span of the next level in
> * the page table. The largest size that can be requested is 64GB.
> *
> - * The shift value is initially "0" for all page table levels,
> + * The shift value is initially 0 for all page table levels,
> * indicating that the HV_PTE_SUPER bit is effectively ignored.
> *
> * If you change the count from one non-zero value to another, the
> @@ -854,11 +954,26 @@ typedef struct
> } HV_Context;
>
> /** Retrieve information about the currently installed context.
> - * @return The data passed to the last successful hv_install_context call.
> + * @return The data passed to the last successful call to
> + * hv_install_context().
> */
> HV_Context hv_inquire_context(void);
>
>
> +/** Retrieve information about the currently installed virtualization context.
> + * @return The data passed to the last successful call to
> + * hv_install_virt_context().
> + */
> +HV_Context hv_inquire_virt_context(void);
> +
> +
> +/** Retrieve information about the currently installed guest context.
> + * @return The data passed to the last successful call to
> + * hv_install_guest_context().
> + */
> +HV_Context hv_inquire_guest_context(void);
> +
> +
> /** Flushes all translations associated with the named address space
> * identifier from the TLB and any other hypervisor data structures.
> * Translations installed with the "global" bit are not flushed.
> @@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
> /** Flushes all non-global translations (if preserve_global is true),
> * or absolutely all translations (if preserve_global is false).
> *
> - * @param preserve_global Non-zero if we want to preserve "global" mappings.
> + * @param preserve_global Non-zero if we want to preserve global mappings.
> * @return Zero on success, or a hypervisor error code on failure.
> */
> int hv_flush_all(int preserve_global);
> @@ -991,7 +1106,11 @@ typedef enum {
> HV_INQ_TILES_HFH_CACHE = 2,
>
> /** The set of tiles that can be legally used as a LOTAR for a PTE. */
> - HV_INQ_TILES_LOTAR = 3
> + HV_INQ_TILES_LOTAR = 3,
> +
> + /** The set of "shared" driver tiles that the hypervisor may
> + * periodically interrupt. */
> + HV_INQ_TILES_SHARED = 4
> } HV_InqTileSet;
>
> /** Returns specific information about various sets of tiles within the
> @@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
> */
> /** Message receive downcall interrupt vector */
> #define INT_MESSAGE_RCV_DWNCL INT_BOOT_ACCESS
> +/** Device interrupt downcall interrupt vector */
> +#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
> +#ifdef __tilegx__
> +/** Virtualization page table miss downcall interrupt vector */
> +#define INT_VPGTABLE_MISS_DWNCL INT_I_ASID
> +/** Virtualization guest illegal page table */
> +#define INT_VGUEST_FATAL_DWNCL INT_D_ASID
> +#else
> /** DMA TLB miss downcall interrupt vector */
> #define INT_DMATLB_MISS_DWNCL INT_DMA_ASID
> -/** Static nework processor instruction TLB miss interrupt vector */
> -#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
> /** DMA TLB access violation downcall interrupt vector */
> #define INT_DMATLB_ACCESS_DWNCL INT_DMA_CPL
> -/** Device interrupt downcall interrupt vector */
> -#define INT_DEV_INTR_DWNCL INT_WORLD_ACCESS
> +/** Static nework processor instruction TLB miss interrupt vector */
> +#define INT_SNITLB_MISS_DWNCL INT_SNI_ASID
> +#endif
>
> #ifndef __ASSEMBLER__
>
> @@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> #define HV_PTE_PTFN_BITS 29 /**< Number of bits in a PTFN */
>
> /*
> - * Legal values for the PTE's mode field
> + * Legal values for the PTE's mode field.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> + * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
> + * to access MMIO resources via pseudo PAs that map to MMIO in the
> + * virtualization page table.
> */
> +
> /** Data is not resident in any caches; loads and stores access memory
> * directly.
> */
> @@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in the primary page table if a virtualization
> + * page table is installed.
> */
> #define HV_PTE_GLOBAL (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
>
> @@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in the virtualization page table.
> */
> #define HV_PTE_USER (__HV_PTE_ONE << HV_PTE_INDEX_USER)
>
> @@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * has been cleared, subsequent references are not guaranteed to set
> * it again until the translation has been flushed from the TLB.
> *
> - * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
> */
> #define HV_PTE_ACCESSED (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
>
> @@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * has been cleared, subsequent references are not guaranteed to set
> * it again until the translation has been flushed from the TLB.
> *
> - * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
> */
> #define HV_PTE_DIRTY (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
>
> @@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> *
> * In level-1 PTEs, if the Page bit is clear, this bit determines how the
> * level-2 page table is accessed.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_NC (__HV_PTE_ONE << HV_PTE_INDEX_NC)
>
> @@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> *
> * In level-1 PTEs, if the Page bit is clear, this bit
> * determines how the level-2 page table is accessed.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_NO_ALLOC_L1 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
>
> @@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> *
> * In level-1 PTEs, if the Page bit is clear, this bit determines how the
> * level-2 page table is accessed.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_NO_ALLOC_L2 (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
>
> @@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * the page map directly to memory.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + *
> + * If a virtualization page table is installed, this field is only honored
> + * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
> + * table was installed, otherwise only in the virtualization page table.
> */
> #define HV_PTE_CACHED_PRIORITY (__HV_PTE_ONE << \
> HV_PTE_INDEX_CACHED_PRIORITY)
> @@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * It is illegal for this bit to be clear if the Writable bit is set.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * If a virtualization page table is present, the final Readable status
> + * is the logical "and" of this bit in both page tables.
> */
> #define HV_PTE_READABLE (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
>
> @@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * PTE.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * If a virtualization page table is present, the final Writable status
> + * is the logical "and" of this bit in both page tables.
> */
> #define HV_PTE_WRITABLE (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
>
> @@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
> * than one.
> *
> * This bit is ignored in level-1 PTEs unless the Page bit is set.
> + * If a virtualization page table is present, the final Executable status
> + * is the logical "and" of this bit in both page tables.
> */
> #define HV_PTE_EXECUTABLE (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
>
> diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
> index e54b7b0..36fb24c 100644
> --- a/arch/tile/include/uapi/arch/sim.h
> +++ b/arch/tile/include/uapi/arch/sim.h
> @@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
> __insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
> }
>
> +/**
> + * Set vCPU number for a given task.
> + * @param vcpu Virtual cpu to set.
> + */
> +static __inline void
> +sim_set_vcpu(int vcpu)
> +{
> + __insn_mtspr(SPR_SIM_CONTROL,
> + SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
> +}
> +
> +/** Clear vCPU status for a given task. */
> +static __inline void
> +sim_clear_vcpu(void)
> +{
> + __insn_mtspr(SPR_SIM_CONTROL,
> + SIM_CONTROL_VCPU | (-1 << _SIM_CONTROL_OPERATOR_BITS));
> +}
> +
>
> /*
> * Event support.
> diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
> index 4b44a2b..b9aad66 100644
> --- a/arch/tile/include/uapi/arch/sim_def.h
> +++ b/arch/tile/include/uapi/arch/sim_def.h
> @@ -221,6 +221,14 @@
> */
> #define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
>
> +/**
> + * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
> + * number shifted by 8, will tag any identification of the cpu that
> + * task is running on with the given virtual cpu number. If the
> + * virtual cpu number is -1, the tag is removed.
> + */
> +#define SIM_CONTROL_VCPU 37
> +
>
> /*
> * Syscall numbers for use with "sim_syscall()".
> diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
> index c689446..4644c8d 100644
> --- a/arch/tile/include/uapi/arch/spr_def_32.h
> +++ b/arch/tile/include/uapi/arch/spr_def_32.h
> @@ -121,6 +121,9 @@
> #define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
> #define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
> #define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
> +#define SPR_MPL_GPV_SET_0 0x0600
> +#define SPR_MPL_GPV_SET_1 0x0601
> +#define SPR_MPL_GPV_SET_2 0x0602
> #define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
> #define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
> #define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
> @@ -142,6 +145,9 @@
> #define SPR_MPL_IDN_TIMER_SET_0 0x3400
> #define SPR_MPL_IDN_TIMER_SET_1 0x3401
> #define SPR_MPL_IDN_TIMER_SET_2 0x3402
> +#define SPR_MPL_ILL_SET_0 0x0400
> +#define SPR_MPL_ILL_SET_1 0x0401
> +#define SPR_MPL_ILL_SET_2 0x0402
> #define SPR_MPL_INTCTRL_0_SET_0 0x4a00
> #define SPR_MPL_INTCTRL_0_SET_1 0x4a01
> #define SPR_MPL_INTCTRL_0_SET_2 0x4a02
> @@ -166,6 +172,12 @@
> #define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
> #define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
> #define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
> +#define SPR_MPL_SWINT_0_SET_0 0x1c00
> +#define SPR_MPL_SWINT_0_SET_1 0x1c01
> +#define SPR_MPL_SWINT_0_SET_2 0x1c02
> +#define SPR_MPL_SWINT_1_SET_0 0x1a00
> +#define SPR_MPL_SWINT_1_SET_1 0x1a01
> +#define SPR_MPL_SWINT_1_SET_2 0x1a02
> #define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
> #define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
> #define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
> @@ -187,6 +199,9 @@
> #define SPR_MPL_UDN_TIMER_SET_0 0x3600
> #define SPR_MPL_UDN_TIMER_SET_1 0x3601
> #define SPR_MPL_UDN_TIMER_SET_2 0x3602
> +#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
> +#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
> +#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
> #define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
> #define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
> #define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
> diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
> index 67a6c17..727cda7 100644
> --- a/arch/tile/include/uapi/arch/spr_def_64.h
> +++ b/arch/tile/include/uapi/arch/spr_def_64.h
> @@ -21,6 +21,10 @@
> #define SPR_AUX_PERF_COUNT_1 0x2106
> #define SPR_AUX_PERF_COUNT_CTL 0x2107
> #define SPR_AUX_PERF_COUNT_STS 0x2108
> +#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
> +#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK 0xffffffff
> +#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
> +#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
> #define SPR_CMPEXCH_VALUE 0x2780
> #define SPR_CYCLE 0x2781
> #define SPR_DONE 0x2705
> @@ -101,6 +105,9 @@
> #define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
> #define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
> #define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
> +#define SPR_MPL_GPV_SET_0 0x0900
> +#define SPR_MPL_GPV_SET_1 0x0901
> +#define SPR_MPL_GPV_SET_2 0x0902
> #define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
> #define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
> #define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
> @@ -116,6 +123,12 @@
> #define SPR_MPL_IDN_TIMER_SET_0 0x1800
> #define SPR_MPL_IDN_TIMER_SET_1 0x1801
> #define SPR_MPL_IDN_TIMER_SET_2 0x1802
> +#define SPR_MPL_ILL_SET_0 0x0800
> +#define SPR_MPL_ILL_SET_1 0x0801
> +#define SPR_MPL_ILL_SET_2 0x0802
> +#define SPR_MPL_ILL_TRANS_SET_0 0x1000
> +#define SPR_MPL_ILL_TRANS_SET_1 0x1001
> +#define SPR_MPL_ILL_TRANS_SET_2 0x1002
> #define SPR_MPL_INTCTRL_0_SET_0 0x2500
> #define SPR_MPL_INTCTRL_0_SET_1 0x2501
> #define SPR_MPL_INTCTRL_0_SET_2 0x2502
> @@ -140,6 +153,15 @@
> #define SPR_MPL_PERF_COUNT_SET_0 0x2000
> #define SPR_MPL_PERF_COUNT_SET_1 0x2001
> #define SPR_MPL_PERF_COUNT_SET_2 0x2002
> +#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
> +#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
> +#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
> +#define SPR_MPL_SWINT_0_SET_0 0x0f00
> +#define SPR_MPL_SWINT_0_SET_1 0x0f01
> +#define SPR_MPL_SWINT_0_SET_2 0x0f02
> +#define SPR_MPL_SWINT_1_SET_0 0x0e00
> +#define SPR_MPL_SWINT_1_SET_1 0x0e01
> +#define SPR_MPL_SWINT_1_SET_2 0x0e02
> #define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
> #define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
> #define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
> @@ -155,6 +177,9 @@
> #define SPR_MPL_UDN_TIMER_SET_0 0x1900
> #define SPR_MPL_UDN_TIMER_SET_1 0x1901
> #define SPR_MPL_UDN_TIMER_SET_2 0x1902
> +#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
> +#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
> +#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
> #define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
> #define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
> #define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
> diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
> index c20db8e..89022a5 100644
> --- a/arch/tile/include/uapi/asm/Kbuild
> +++ b/arch/tile/include/uapi/asm/Kbuild
> @@ -6,6 +6,7 @@ header-y += bitsperlong.h
> header-y += byteorder.h
> header-y += cachectl.h
> header-y += hardwall.h
> +header-y += kvm.h
> header-y += kvm_para.h
> header-y += mman.h
> header-y += ptrace.h
> diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
> new file mode 100644
> index 0000000..aa7b97f
> --- /dev/null
> +++ b/arch/tile/include/uapi/asm/kvm.h
> @@ -0,0 +1,262 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _UAPI_ASM_TILE_KVM_H
> +#define _UAPI_ASM_TILE_KVM_H
> +
> +#ifndef __ASSEMBLER__
> +#include <linux/ptrace.h>
> +#endif
> +
> +#include <arch/abi.h>
> +
> +/*
> + * For Hypervisor syscalls. Note this comes from the hv: syscall.h,
> + * with small modifications: Remove HV_SYS_fence_incoherent.
> + */
> +/* Syscall allowed from guest PL bit mask. */
> +#define HV_SYS_GUEST_SHIFT 12
> +#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT)
> +/* downcall_dispatch; this syscall number must be zero */
> +#define HV_SYS_downcall_dispatch 0
> +/* install_context */
> +#define HV_SYS_install_context 1
> +/* sysconf */
> +#define HV_SYS_sysconf 2
> +/* get_rtc */
> +#define HV_SYS_get_rtc 3
> +/* set_rtc */
> +#define HV_SYS_set_rtc 4
> +/* flush_asid */
> +#define HV_SYS_flush_asid 5
> +/* flush_page */
> +#define HV_SYS_flush_page 6
> +/* flush_pages */
> +#define HV_SYS_flush_pages 7
> +/* restart */
> +#define HV_SYS_restart 8
> +/* halt */
> +#define HV_SYS_halt 9
> +/* power_off */
> +#define HV_SYS_power_off 10
> +/* inquire_physical */
> +#define HV_SYS_inquire_physical 11
> +/* inquire_memory_controller */
> +#define HV_SYS_inquire_memory_controller 12
> +/* inquire_virtual */
> +#define HV_SYS_inquire_virtual 13
> +/* inquire_asid */
> +#define HV_SYS_inquire_asid 14
> +/* console_read_if_ready */
> +#define HV_SYS_console_read_if_ready 15
> +/* console_write */
> +#define HV_SYS_console_write 16
> +/* init */
> +#define HV_SYS_init 17
> +/* inquire_topology */
> +#define HV_SYS_inquire_topology 18
> +/* fs_findfile */
> +#define HV_SYS_fs_findfile 19
> +/* fs_fstat */
> +#define HV_SYS_fs_fstat 20
> +/* fs_pread */
> +#define HV_SYS_fs_pread 21
> +/* physaddr_read64 */
> +#define HV_SYS_physaddr_read64 22
> +/* physaddr_write64 */
> +#define HV_SYS_physaddr_write64 23
> +/* get_command_line */
> +#define HV_SYS_get_command_line 24
> +/* set_caching */
> +#define HV_SYS_set_caching 25
> +/* bzero_page */
> +#define HV_SYS_bzero_page 26
> +/* register_message_state */
> +#define HV_SYS_register_message_state 27
> +/* send_message */
> +#define HV_SYS_send_message 28
> +/* receive_message */
> +#define HV_SYS_receive_message 29
> +/* inquire_context */
> +#define HV_SYS_inquire_context 30
> +/* start_all_tiles */
> +#define HV_SYS_start_all_tiles 31
> +/* dev_open */
> +#define HV_SYS_dev_open 32
> +/* dev_close */
> +#define HV_SYS_dev_close 33
> +/* dev_pread */
> +#define HV_SYS_dev_pread 34
> +/* dev_pwrite */
> +#define HV_SYS_dev_pwrite 35
> +/* dev_poll */
> +#define HV_SYS_dev_poll 36
> +/* dev_poll_cancel */
> +#define HV_SYS_dev_poll_cancel 37
> +/* dev_preada */
> +#define HV_SYS_dev_preada 38
> +/* dev_pwritea */
> +#define HV_SYS_dev_pwritea 39
> +/* flush_remote */
> +#define HV_SYS_flush_remote 40
> +/* console_putc */
> +#define HV_SYS_console_putc 41
> +/* inquire_tiles */
> +#define HV_SYS_inquire_tiles 42
> +/* confstr */
> +#define HV_SYS_confstr 43
> +/* reexec */
> +#define HV_SYS_reexec 44
> +/* set_command_line */
> +#define HV_SYS_set_command_line 45
> +
> +/* store_mapping */
> +#define HV_SYS_store_mapping 52
> +/* inquire_realpa */
> +#define HV_SYS_inquire_realpa 53
> +/* flush_all */
> +#define HV_SYS_flush_all 54
> +/* get_ipi_pte */
> +#define HV_SYS_get_ipi_pte 55
> +/* set_pte_super_shift */
> +#define HV_SYS_set_pte_super_shift 56
> +/* set_speed */
> +#define HV_SYS_set_speed 57
> +/* install_virt_context */
> +#define HV_SYS_install_virt_context 58
> +/* inquire_virt_context */
> +#define HV_SYS_inquire_virt_context 59
> +/* inquire_guest_context */
> +#define HV_SYS_install_guest_context 60
> +/* inquire_guest_context */
> +#define HV_SYS_inquire_guest_context 61
> +
> +/*
> + * Number of hypercall (from guest os to host os) other than hv_*().
> + * We leave the previous 128 entries to the usual hv_*() calls
> + * as defined in hypervisor.h.
> + */
> +#define KVM_OTHER_HCALL 128
> +
> +/* One greater than the maximum hypercall number. */
> +#define KVM_NUM_HCALLS 256
> +
> +#ifndef __ASSEMBLER__
> +
> +struct kvm_regs {
> + struct pt_regs regs;
> +};
> +
> +#define FOR_EACH_GUEST_SPR(f) \
> + f(INTERRUPT_MASK_1); \
> + f(INTERRUPT_VECTOR_BASE_1); \
> + f(EX_CONTEXT_1_0); \
> + f(EX_CONTEXT_1_1); \
> + f(SYSTEM_SAVE_1_0); \
> + f(SYSTEM_SAVE_1_1); \
> + f(SYSTEM_SAVE_1_2); \
> + f(SYSTEM_SAVE_1_3); \
> + f(INTCTRL_1_STATUS); \
> + f(IPI_MASK_1); \
> + f(IPI_EVENT_1); \
> + f(SINGLE_STEP_CONTROL_1); \
> + f(SINGLE_STEP_EN_1_1); \
> +
> +struct kvm_sregs {
> +#define DECLARE_SPR(f) unsigned long f
> + FOR_EACH_GUEST_SPR(DECLARE_SPR)
> +#undef DECLARE_SPR
> +};
> +
> +struct kvm_fpu {
> +};
> +
> +struct kvm_debug_exit_arch {
> +};
> +
> +struct kvm_guest_debug_arch {
> +};
> +
> +/* definition of registers in kvm_run */
> +struct kvm_sync_regs {
> +};
> +
> +#ifndef __KERNEL__
> +/* For hv_*() */
> +#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
> +#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
> +#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
> +#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
> +/* For others */
> +#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
This does not belong in a kernel header. QEMU is not the only user of KVM
kernel APIs. Please drop that and change all the references in comments
from "qemu" to "userspace". If you add code that works around QEMU bugs it
is appropriate to mention QEMU by name; otherwise the interface to
userspace should not be QEMU-specific.
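If userspace still wants table-building helpers along these lines, they can
live in the userspace sources themselves; a sketch with placeholder
userspace_* names (not a defined interface):
	/* Userspace-side dispatch table entries (placeholder names): */
	#define KVM_EMULATE(name)  [HV_SYS_##name] = userspace_emulate_illegal,
	#define USER_EMULATE(name) [HV_SYS_##name] = userspace_emulate_hv_##name,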
> +#endif
> +
> +#define HCALL_DEFS \
> + /* For hv_*() */ \
> + KVM_EMULATE(init) \
> + NO_EMULATE(install_context) \
> + KVM_EMULATE(sysconf) \
> + KVM_EMULATE(get_rtc) \
> + KVM_EMULATE(set_rtc) \
> + NO_EMULATE(flush_asid) \
> + NO_EMULATE(flush_page) \
> + NO_EMULATE(flush_pages) \
> + USER_EMULATE(restart) \
> + USER_EMULATE(halt) \
> + USER_EMULATE(power_off) \
> + USER_EMULATE(inquire_physical) \
> + USER_EMULATE(inquire_memory_controller) \
> + KVM_EMULATE(inquire_virtual) \
> + KVM_EMULATE(inquire_asid) \
> + NO_EMULATE(console_read_if_ready) \
> + NO_EMULATE(console_write) \
> + NO_EMULATE(downcall_dispatch) \
> + KVM_EMULATE(inquire_topology) \
> + USER_EMULATE(fs_findfile) \
> + USER_EMULATE(fs_fstat) \
> + USER_EMULATE(fs_pread) \
> + KVM_EMULATE(physaddr_read64) \
> + KVM_EMULATE(physaddr_write64) \
> + USER_EMULATE(get_command_line) \
> + USER_EMULATE(set_caching) \
> + NO_EMULATE(bzero_page) \
> + KVM_EMULATE(register_message_state) \
> + KVM_EMULATE(send_message) \
> + KVM_EMULATE(receive_message) \
> + KVM_EMULATE(inquire_context) \
> + KVM_EMULATE(start_all_tiles) \
> + USER_EMULATE(dev_open) \
> + USER_EMULATE(dev_close) \
> + USER_EMULATE(dev_pread) \
> + USER_EMULATE(dev_pwrite) \
> + USER_EMULATE(dev_poll) \
> + USER_EMULATE(dev_poll_cancel) \
> + USER_EMULATE(dev_preada) \
> + USER_EMULATE(dev_pwritea) \
> + USER_EMULATE(flush_remote) \
> + NO_EMULATE(console_putc) \
> + KVM_EMULATE(inquire_tiles) \
> + KVM_EMULATE(confstr) \
> + USER_EMULATE(reexec) \
> + USER_EMULATE(set_command_line) \
> + USER_EMULATE(store_mapping) \
> + NO_EMULATE(inquire_realpa) \
> + NO_EMULATE(flush_all) \
> + KVM_EMULATE(get_ipi_pte) \
> + KVM_EMULATE(set_pte_super_shift) \
> + KVM_EMULATE(set_speed) \
> +
> +#endif
> +
> +#endif /* _UAPI_ASM_TILE_KVM_H */
> diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
> index 97ea6ac..0a04a16 100644
> --- a/arch/tile/kernel/asm-offsets.c
> +++ b/arch/tile/kernel/asm-offsets.c
> @@ -20,6 +20,9 @@
> #include <linux/hardirq.h>
> #include <linux/ptrace.h>
> #include <hv/hypervisor.h>
> +#ifdef CONFIG_KVM
> +#include <linux/kvm_host.h>
> +#endif
>
> /* Check for compatible compiler early in the build. */
> #ifdef CONFIG_TILEGX
> @@ -68,6 +71,10 @@ void foo(void)
> DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
> offsetof(struct thread_info, unalign_jit_tmp));
> #endif
> +#ifdef CONFIG_KVM
> + DEFINE(THREAD_INFO_VCPU_OFFSET,
> + offsetof(struct thread_info, vcpu));
> +#endif
>
> DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
> offsetof(struct task_struct, thread.ksp));
> diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
> index 16576c6..dc5b417 100644
> --- a/arch/tile/kernel/hvglue.S
> +++ b/arch/tile/kernel/hvglue.S
> @@ -71,5 +71,10 @@ gensym hv_flush_all, 0x6e0, 32
> gensym hv_get_ipi_pte, 0x700, 32
> gensym hv_set_pte_super_shift, 0x720, 32
> gensym hv_set_speed, 0x740, 32
> +gensym hv_install_virt_context, 0x760, 32
> +gensym hv_inquire_virt_context, 0x780, 32
> +gensym hv_install_guest_context, 0x7a0, 32
> +gensym hv_inquire_guest_context, 0x7c0, 32
> gensym hv_console_set_ipi, 0x7e0, 32
> -gensym hv_glue_internals, 0x800, 30720
> +gensym hv_glue_internals, 0x800, 2048
> +gensym hv_hcall_internals, 0x1020, 28640
> diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
> index 16ef6c1..3b15c76 100644
> --- a/arch/tile/kernel/hvglue_trace.c
> +++ b/arch/tile/kernel/hvglue_trace.c
> @@ -75,6 +75,10 @@
> #define hv_get_ipi_pte _hv_get_ipi_pte
> #define hv_set_pte_super_shift _hv_set_pte_super_shift
> #define hv_set_speed _hv_set_speed
> +#define hv_install_virt_context _hv_install_virt_context
> +#define hv_inquire_virt_context _hv_inquire_virt_context
> +#define hv_install_guest_context _hv_install_guest_context
> +#define hv_inquire_guest_context _hv_inquire_guest_context
> #define hv_console_set_ipi _hv_console_set_ipi
> #include <hv/hypervisor.h>
> #undef hv_init
> @@ -135,6 +139,10 @@
> #undef hv_get_ipi_pte
> #undef hv_set_pte_super_shift
> #undef hv_set_speed
> +#undef hv_install_virt_context
> +#undef hv_inquire_virt_context
> +#undef hv_install_guest_context
> +#undef hv_inquire_guest_context
> #undef hv_console_set_ipi
>
> /*
> @@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
> unsigned long, flags)
> HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
> HV_ASID, asid, __hv32, flags)
> +HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
> + HV_ASID, asid, __hv32, flags)
> +HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
> + HV_ASID, asid, __hv32, flags)
> HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
> HV_WRAP0(HV_Context, hv_inquire_context)
> +HV_WRAP0(HV_Context, hv_inquire_virt_context)
> +HV_WRAP0(HV_Context, hv_inquire_guest_context)
> HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
> HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
> HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
> diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
> index f3d26f4..8ac6072 100644
> --- a/arch/tile/kernel/intvec_32.S
> +++ b/arch/tile/kernel/intvec_32.S
> @@ -806,7 +806,7 @@ handle_interrupt:
> STD_ENTRY(interrupt_return)
> /* If we're resuming to kernel space, don't check thread flags. */
> {
> - bnz r30, .Lrestore_all /* NMIs don't special-case user-space */
> + bnz r30, restore_all /* NMIs don't special-case user-space */
> PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
> }
> lw r29, r29
> @@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
> seq r27, r27, r28
> }
> {
> - bbns r27, .Lrestore_all
> + bbns r27, restore_all
> addi r28, r28, 8
> }
> sw r29, r28
> - j .Lrestore_all
> + j restore_all
>
> .Lresume_userspace:
> FEEDBACK_REENTER(interrupt_return)
> @@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
> auli r1, r1, ha16(_TIF_ALLWORK_MASK)
> }
> and r1, r29, r1
> - bzt r1, .Lrestore_all
> + bzt r1, restore_all
>
> /*
> * Make sure we have all the registers saved for signal
> @@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
> * profile interrupt will actually disable interrupts in both SPRs
> * before returning, which is OK.)
> */
> -.Lrestore_all:
> + .global restore_all
> + .type restore_all, @function
> +restore_all:
> PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
> {
> lw r0, r0
> diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
> index 3b35bb4..45647a4 100644
> --- a/arch/tile/kernel/intvec_64.S
> +++ b/arch/tile/kernel/intvec_64.S
> @@ -29,6 +29,10 @@
> #include <arch/abi.h>
> #include <arch/interrupts.h>
> #include <arch/spr_def.h>
> +#include <arch/opcode.h>
> +#ifdef CONFIG_KVM
> +#include <asm/kvm_host.h>
> +#endif
>
> #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
>
> @@ -347,10 +351,6 @@ intvec_\vecname:
> *
> * Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
> * any path that turns into a downcall to one of our TLB handlers.
> - *
> - * FIXME: if we end up never using this path, perhaps we should
> - * prevent the hypervisor from generating downcalls in this case.
> - * The advantage of getting a downcall is we can panic in Linux.
> */
> mfspr r0, SPR_SYSTEM_SAVE_K_2
> {
> @@ -490,6 +490,10 @@ intvec_\vecname:
> mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
> mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
> .else
> + .ifc \c_routine, kvm_vpgtable_miss
> + mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */
> + mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */
> + .else
> .ifc \vecnum, INT_ILL_TRANS
> mfspr r2, ILL_VA_PC
> .else
> @@ -512,6 +516,7 @@ intvec_\vecname:
> .endif
> .endif
> .endif
> + .endif
> /* Put function pointer in r0 */
> moveli r0, hw2_last(\c_routine)
> shl16insli r0, r0, hw1(\c_routine)
> @@ -641,24 +646,25 @@ intvec_\vecname:
> /*
> * If we will be returning to the kernel, we will need to
> * reset the interrupt masks to the state they had before.
> - * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
> + * Set DISABLE_IRQ in flags iff we came from kernel pl with
> + * irqs disabled.
> */
> - mfspr r32, SPR_EX_CONTEXT_K_1
> + mfspr r22, SPR_EX_CONTEXT_K_1
> {
> andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */
> PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
> }
> - beqzt r32, 1f /* zero if from user space */
> - IRQS_DISABLED(r32) /* zero if irqs enabled */
> + beqzt r22, 1f /* zero if from user space */
> + IRQS_DISABLED(r22) /* zero if irqs enabled */
> #if PT_FLAGS_DISABLE_IRQ != 1
> # error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
> #endif
> 1:
> .ifnc \function,handle_syscall
> /* Record the fact that we saved the caller-save registers above. */
> - ori r32, r32, PT_FLAGS_CALLER_SAVES
> + ori r22, r22, PT_FLAGS_CALLER_SAVES
> .endif
> - st r21, r32
> + st r21, r22
>
> /*
> * we've captured enough state to the stack (including in
> @@ -698,12 +704,29 @@ intvec_\vecname:
> move tp, zero
> #endif
>
> + /*
> + * Prepare the first 256 stack bytes to be rapidly accessible
> + * without having to fetch the background data.
> + */
> + addi r52, sp, -64
> + {
> + wh64 r52
> + addi r52, r52, -64
> + }
> + {
> + wh64 r52
> + addi r52, r52, -64
> + }
> + {
> + wh64 r52
> + addi r52, r52, -64
> + }
> + wh64 r52
> +
> #ifdef __COLLECT_LINKER_FEEDBACK__
> /*
> * Notify the feedback routines that we were in the
> - * appropriate fixed interrupt vector area. Note that we
> - * still have ICS set at this point, so we can't invoke any
> - * atomic operations or we will panic. The feedback
> + * appropriate fixed interrupt vector area. The feedback
> * routines internally preserve r0..r10 and r30 up.
> */
> .ifnc \function,handle_syscall
> @@ -722,23 +745,15 @@ intvec_\vecname:
> #endif
>
> /*
> - * Prepare the first 256 stack bytes to be rapidly accessible
> - * without having to fetch the background data.
> + * Stash any interrupt state in r30..r33 for now.
> + * This makes it easier to call C code in the code that follows.
> + * We don't need to on the syscall path since we reload
> + * them from the stack instead.
> */
> - addi r52, sp, -64
> - {
> - wh64 r52
> - addi r52, r52, -64
> - }
> - {
> - wh64 r52
> - addi r52, r52, -64
> - }
> - {
> - wh64 r52
> - addi r52, r52, -64
> - }
> - wh64 r52
> + .ifnc \function,handle_syscall
> + { move r30, r0; move r31, r1 }
> + { move r32, r2; move r33, r3 }
> + .endif
>
> #ifdef CONFIG_TRACE_IRQFLAGS
> .ifnc \function,handle_nmi
> @@ -749,17 +764,8 @@ intvec_\vecname:
> * For syscalls, we already have the register state saved away
> * on the stack, so we don't bother to do any register saves here,
> * and later we pop the registers back off the kernel stack.
> - * For interrupt handlers, save r0-r3 in callee-saved registers.
> */
> - .ifnc \function,handle_syscall
> - { move r30, r0; move r31, r1 }
> - { move r32, r2; move r33, r3 }
> - .endif
> TRACE_IRQS_OFF
> - .ifnc \function,handle_syscall
> - { move r0, r30; move r1, r31 }
> - { move r2, r32; move r3, r33 }
> - .endif
> .endif
> #endif
>
> @@ -808,7 +814,7 @@ handle_interrupt:
> STD_ENTRY(interrupt_return)
> /* If we're resuming to kernel space, don't check thread flags. */
> {
> - bnez r30, .Lrestore_all /* NMIs don't special-case user-space */
> + bnez r30, restore_all /* NMIs don't special-case user-space */
> PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
> }
> ld r29, r29
> @@ -824,14 +830,25 @@ STD_ENTRY(interrupt_return)
> addli r28, r29, THREAD_INFO_FLAGS_OFFSET
> {
> ld r28, r28
> - addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
> + addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
> }
> {
> - andi r28, r28, _TIF_NEED_RESCHED
> - ld4s r29, r29
> + andi r27, r28, _TIF_NEED_RESCHED
> + ld4s r26, r26
> }
> - beqzt r28, 1f
> - bnez r29, 1f
> + beqzt r27, 1f
> + bnez r26, 1f
> +#ifdef CONFIG_KVM
> + addli r27, r29, THREAD_INFO_VCPU_OFFSET
> + ld r27, r27
> + {
> + beqzt r27, 0f
> + movei r1, KVM_EXIT_AGAIN
> + }
> + push_extra_callee_saves r0
> + j kvm_trigger_vmexit
> +0:
> +#endif
> jal preempt_schedule_irq
> FEEDBACK_REENTER(interrupt_return)
> 1:
> @@ -853,11 +870,11 @@ STD_ENTRY(interrupt_return)
> cmpeq r27, r27, r28
> }
> {
> - blbc r27, .Lrestore_all
> + blbc r27, restore_all
> addi r28, r28, 8
> }
> st r29, r28
> - j .Lrestore_all
> + j restore_all
>
> .Lresume_userspace:
> FEEDBACK_REENTER(interrupt_return)
> @@ -897,7 +914,7 @@ STD_ENTRY(interrupt_return)
> shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
> }
> and r1, r29, r1
> - beqzt r1, .Lrestore_all
> + beqzt r1, restore_all
>
> /*
> * Make sure we have all the registers saved for signal
> @@ -929,7 +946,9 @@ STD_ENTRY(interrupt_return)
> * ICS can only be used in very tight chunks of code to avoid
> * tripping over various assertions that it is off.
> */
> -.Lrestore_all:
> + .global restore_all
> + .type restore_all, @function
> +restore_all:
> PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
> {
> ld r0, r0
> @@ -1457,6 +1476,26 @@ int_unalign:
> j do_unaligned
> ENDPROC(hand_unalign_slow)
>
> +#ifdef CONFIG_KVM
> +/*
> + * Any call path that may lead to a vmexit needs to save the full
> + * callee-save register state, since if we vmexit we don't unwind
> + * the callee-saves from the C function stack frames, and instead
> + * just save away the register state from the interrupt handler as-is
> + * and later reload it directly and call back into the guest.
> + */
> + .macro save_callee_saves_and_tailcall func
> +kvm_\func:
> + push_extra_callee_saves r0
> + j kvm_do_\func
> + ENDPROC(\func)
> + .endm
> +
> + save_callee_saves_and_tailcall hypervisor_call
> + save_callee_saves_and_tailcall vpgtable_miss
> + save_callee_saves_and_tailcall vguest_fatal
> +#endif
> +
> /* Fill the return address stack with nonzero entries. */
> STD_ENTRY(fill_ra_stack)
> {
> @@ -1469,6 +1508,48 @@ STD_ENTRY(fill_ra_stack)
> 4: jrp r0
> STD_ENDPROC(fill_ra_stack)
>
> +#ifdef CONFIG_KVM
> +/*
> + * Handle the downcall dispatch service. On entry, the client's
> + * system save register 3 holds the original contents of
> + * REG_SYSCALL_NR_NAME, which we need to restore before we iret to
> + * the correct interrupt vector.
> + * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
> + * here, since this is the only interrupt handled this way on GX.
> + */
> +handle_downcall_dispatch:
> + /*
> + * If we were called from PL0, jump back to slow path.
> + * We check just the low bit to make sure it's set, since we
> + * can only be called from PL0 or PL1.
> + */
> + mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
> + blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0
> +
> + /* Set the PC to the downcall interrupt vector, and PL to guest. */
> + mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
> + addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
> + INT_MESSAGE_RCV_DWNCL << 8
> + {
> + mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
> + movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
> + }
> + mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
> +
> + /* Restore REG_SYSCALL_NR_NAME and return to the new vector. */
> + mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
> + iret
> +
> + .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \
> + processing=handle_interrupt
> + .org (\vecnum << 8)
> + /* Need special code for downcall dispatch syscall. */
> + beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
> + __int_hand \vecnum, \vecname, \c_routine, \processing
> + .endm
> +
> +#endif /* CONFIG_KVM */
> +
> .macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt
> .org (\vecnum << 8)
> __int_hand \vecnum, \vecname, \c_routine, \processing
> @@ -1484,6 +1565,11 @@ STD_ENTRY(fill_ra_stack)
> #define do_hardwall_trap bad_intr
> #endif
>
> +#ifndef CONFIG_KVM
> +#define kvm_vpgtable_miss bad_intr
> +#define kvm_vguest_fatal bad_intr
> +#endif
> +
> int_hand INT_MEM_ERROR, MEM_ERROR, do_trap
> int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
> #if CONFIG_KERNEL_PL == 2
> @@ -1504,7 +1590,11 @@ STD_ENTRY(fill_ra_stack)
> int_hand INT_SWINT_3, SWINT_3, do_trap
> int_hand INT_SWINT_2, SWINT_2, do_trap
> int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
> +#ifdef CONFIG_KVM
> + int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
> +#else
> int_hand INT_SWINT_0, SWINT_0, do_trap
> +#endif
> int_hand INT_ILL_TRANS, ILL_TRANS, do_trap
> int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
> int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault
> @@ -1541,8 +1631,10 @@ STD_ENTRY(fill_ra_stack)
> int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
> hv_message_intr
> int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
> - int_hand INT_I_ASID, I_ASID, bad_intr
> - int_hand INT_D_ASID, D_ASID, bad_intr
> + int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
> + kvm_vpgtable_miss
> + int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
> + kvm_vguest_fatal
> int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
>
> /* Synthetic interrupt delivered only by the simulator */
> diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
> index 44cdc4a..7040490 100644
> --- a/arch/tile/kernel/process.c
> +++ b/arch/tile/kernel/process.c
> @@ -27,6 +27,7 @@
> #include <linux/kernel.h>
> #include <linux/tracehook.h>
> #include <linux/signal.h>
> +#include <linux/kvm_host.h>
> #include <asm/stack.h>
> #include <asm/switch_to.h>
> #include <asm/homecache.h>
> @@ -450,6 +451,11 @@ void _prepare_arch_switch(struct task_struct *next)
> struct task_struct *__sched _switch_to(struct task_struct *prev,
> struct task_struct *next)
> {
> +#ifdef CONFIG_KVM
> + /* vmexit is needed before context switch. */
> + BUG_ON(task_thread_info(prev)->vcpu);
> +#endif
> +
> /* DMA state is already saved; save off other arch state. */
> save_arch_state(&prev->thread);
>
> @@ -519,6 +525,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
> /* Enable interrupts; they are disabled again on return to caller. */
> local_irq_enable();
>
> +#ifdef CONFIG_KVM
> + /*
> + * Some work requires us to exit the VM first. Typically this
> + * allows the process running the VM to respond to the work
> + * (e.g. a signal), or allows the VM mechanism to latch
> + * modified host state (e.g. a "hypervisor" message sent to a
> + * different vcpu). It also means that if we are considering
> + * calling schedule(), we exit the VM first, so we never have
> + * to worry about context-switching into a VM.
> + */
> + if (current_thread_info()->vcpu) {
> + u32 do_exit = thread_info_flags &
> + (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
> +
> + if (thread_info_flags & _TIF_VIRT_EXIT)
> + clear_thread_flag(TIF_VIRT_EXIT);
> + if (do_exit) {
> + kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
> + /*NORETURN*/
> + }
> + }
> +#endif
> +
> if (thread_info_flags & _TIF_NEED_RESCHED) {
> schedule();
> return 1;
> @@ -538,11 +567,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
> tracehook_notify_resume(regs);
> return 1;
> }
> - if (thread_info_flags & _TIF_SINGLESTEP) {
> +
> + /* Handle a few flags here that stay set. */
> + if (thread_info_flags & _TIF_SINGLESTEP)
> single_step_once(regs);
> - return 0;
> - }
> - panic("work_pending: bad flags %#x\n", thread_info_flags);
> +
> + return 0;
> }
>
> unsigned long get_wchan(struct task_struct *p)
> diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
> index 774e819..7918cf1 100644
> --- a/arch/tile/kernel/setup.c
> +++ b/arch/tile/kernel/setup.c
> @@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
> * SPRs, as well as the interrupt mask.
> */
> __insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
> +
> +#ifdef CONFIG_KVM
> + /*
> + * If we launch a guest kernel, it will need some interrupts
> + * that otherwise are not used by the host or by userspace.
> + * Set them to MPL 1 now and leave them alone going forward;
> + * they are masked in the host so will never fire there anyway,
> + * and we mask them at PL1 as we exit the guest.
> + */
> __insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
> + __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
> + __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
> + __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
> +#endif
>
> /* Initialize IRQ support for this cpu. */
> setup_irq_regs();
> diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
> index 0ae1c59..62b3ba9 100644
> --- a/arch/tile/kernel/smp.c
> +++ b/arch/tile/kernel/smp.c
> @@ -223,30 +223,34 @@ void __init ipi_init(void)
>
> #if CHIP_HAS_IPI()
>
> -void smp_send_reschedule(int cpu)
> +static void __smp_send_reschedule(int cpu)
> {
> - WARN_ON(cpu_is_offline(cpu));
> -
> /*
> * We just want to do an MMIO store. The traditional writeq()
> * functions aren't really correct here, since they're always
> * directed at the PCI shim. For now, just do a raw store,
> - * casting away the __iomem attribute.
> + * casting away the __iomem attribute. We do the store as a
> + * single asm() instruction to ensure that we can force a step
> + * over it in the KVM case, if we are not binding vcpus to cpus,
> + * rather than require it to be possible to issue validly.
> */
> - ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
> + unsigned long *addr =
> + &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
> + asm volatile("st %0, zero" :: "r" (addr));
> }
>
> #else
>
> -void smp_send_reschedule(int cpu)
> +static void __smp_send_reschedule(int cpu)
> {
> - HV_Coord coord;
> -
> - WARN_ON(cpu_is_offline(cpu));
> -
> - coord.y = cpu_y(cpu);
> - coord.x = cpu_x(cpu);
> + HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
> hv_trigger_ipi(coord, IRQ_RESCHEDULE);
> }
>
> #endif /* CHIP_HAS_IPI() */
> +
> +void smp_send_reschedule(int cpu)
> +{
> + WARN_ON(cpu_is_offline(cpu));
> + __smp_send_reschedule(cpu);
> +}
> diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
> index 24fd223..362284a 100644
> --- a/arch/tile/kernel/stack.c
> +++ b/arch/tile/kernel/stack.c
> @@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
> p->sp >= sp) {
> if (kbt->verbose)
> pr_err(" <%s while in kernel mode>\n", fault);
> - } else if (EX1_PL(p->ex1) == USER_PL &&
> + } else if (user_mode(p) &&
> p->sp < PAGE_OFFSET && p->sp != 0) {
> if (kbt->verbose)
> pr_err(" <%s while in user mode>\n", fault);
> diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
> index 2298cb1..65f7f9d 100644
> --- a/arch/tile/kvm/Kconfig
> +++ b/arch/tile/kvm/Kconfig
> @@ -27,9 +27,6 @@ config KVM
> This module provides access to the hardware capabilities through
> a character device node named /dev/kvm.
>
> - To compile this as a module, choose M here: the module
> - will be called kvm.
> -
> If unsure, say N.
>
> source drivers/vhost/Kconfig
> diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
> new file mode 100644
> index 0000000..2c3d206
> --- /dev/null
> +++ b/arch/tile/kvm/Makefile
> @@ -0,0 +1,12 @@
> +#
> +# Makefile for Kernel-based Virtual Machine module
> +#
> +
> +ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
> +
> +kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
> +
> +kvm-y += kvm-tile.o
> +kvm-y += entry.o
> +
> +obj-$(CONFIG_KVM) += kvm.o
> diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
> new file mode 100644
> index 0000000..07aa3a6
> --- /dev/null
> +++ b/arch/tile/kvm/entry.S
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/switch_to.h>
> +#include <asm/processor.h>
> +#include <arch/spr_def.h>
> +#include <arch/abi.h>
> +
> +#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8)
> +#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
> +#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
> +#define FOR_EACH_CALLEE_SAVED_REG(f) \
> + f(r30); f(r31); \
> + f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \
> + f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
> + f(r48); f(r49); f(r50); f(r51); f(r52);
> +
> +/*
> + * Called with interrupts disabled from kvm_tile_run() and is responsible
> + * just for saving the callee-save registers and the stack pointer, then
> + * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
> + * It uses restore_all in intvec_64.S to jump back into the guest.
> + * The kvm_vmexit function below undoes the stack manipulation.
> + */
> +STD_ENTRY(kvm_vmresume)
> + /* Do function prolog and save callee-saves on stack. */
> + {
> + move r10, sp
> + st sp, lr
> + }
> + {
> + addli r11, sp, -FRAME_SIZE + 8
> + addli sp, sp, -FRAME_SIZE
> + }
> + {
> + st r11, r10
> + addi r12, sp, 16
> + }
> + FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
> + SAVE_REG(tp)
> + SAVE_REG(lr)
> +
> + /* Save frame pointer in thread_info so we can get it back later. */
> + st r1, sp
> +
> + /* Set the ksp0 for this core to be below this frame. */
> + mfspr r10, SPR_SYSTEM_SAVE_K_0
> + bfins r10, sp, 0, CPU_SHIFT-1
> + mtspr SPR_SYSTEM_SAVE_K_0, r10
> +
> + /* sp points to ABI save area below pt_regs for restore_all. */
> + addli sp, r0, -C_ABI_SAVE_AREA_SIZE
> +
> + /* Execute an "interrupt return" to the guest. */
> + {
> + movei r30, 0
> + j restore_all
> + }
> + STD_ENDPROC(kvm_vmresume)
> +
> +/*
> + * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
> + * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
> + * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller
> + * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
> + */
> +STD_ENTRY(kvm_vmexit)
> + {
> + move sp, r0
> + addi r12, r0, 16
> + }
> + FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
> + LOAD_REG(tp)
> + LOAD_REG(lr)
> + {
> + addli sp, sp, FRAME_SIZE
> + jrp lr
> + }
> + STD_ENDPROC(kvm_vmexit)
> diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
> new file mode 100644
> index 0000000..e22d4ad
> --- /dev/null
> +++ b/arch/tile/kvm/kvm-tile.c
> @@ -0,0 +1,1529 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/err.h>
> +#include <linux/init.h>
> +#include <linux/fs.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/kvm_types.h>
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/ptrace.h>
> +#include <asm/traps.h>
> +#include <asm/pgalloc.h>
> +#include <hv/hypervisor.h>
> +#include <linux/rtc.h>
> +#include <asm/atomic.h>
> +#include <asm/tlbflush.h>
> +#include <arch/spr_def.h>
> +#include <arch/sim.h>
> +#include <generated/utsrelease.h>
> +
> +
> +struct kvm_stats_debugfs_item debugfs_entries[] = {
> + { NULL }
> +};
> +
> +static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
> +{
> + struct mm_struct *mm = kvm->mm;
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> +
> + if (kvm->arch.vpgd == NULL)
> + kvm->arch.vpgd = pgd_alloc(kvm->mm);
> + pgd = kvm->arch.vpgd + pgd_index(address);
> + pud = pud_alloc(mm, pgd, address);
> + if (!pud)
> + return NULL;
> + pmd = pmd_alloc(mm, pud, address);
> + if (!pmd)
> + return NULL;
> + return pte_alloc_kernel(pmd, address);
> +}
> +
> +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
> +{
> + return VM_FAULT_SIGBUS;
> +}
> +
> +void kvm_arch_free_memslot(struct kvm_memory_slot *free,
> + struct kvm_memory_slot *dont)
> +{
> +}
> +
> +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
> +{
> + return 0;
> +}
> +
> +/* FIXME: support huge pages. */
> +int kvm_arch_prepare_memory_region(struct kvm *kvm,
> + struct kvm_memory_slot *memslot,
> + struct kvm_userspace_memory_region *mem,
> + enum kvm_mr_change change)
> +{
> + unsigned long gpa, i;
> +
> + gpa = mem->guest_phys_addr;
> + for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE)
> + if (get_vpgd_pte(kvm, gpa) == NULL)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +void kvm_arch_commit_memory_region(struct kvm *kvm,
> + struct kvm_userspace_memory_region *mem,
> + const struct kvm_memory_slot *old,
> + enum kvm_mr_change change)
> +{
> + unsigned long gpa, address, pfn, i;
> + struct page *page[1];
> + pte_t *ptep, *vptep;
> +
> + gpa = mem->guest_phys_addr;
> + address = mem->userspace_addr;
> + for (i = 0; i < mem->memory_size;
> + i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
> + vptep = get_vpgd_pte(kvm, gpa);
> + BUG_ON(vptep == NULL);
> + get_user_pages_fast(address, 1, 1, page);
get_user_pages_fast() can fail and you do not handle the error. Do I
understand correctly that all guest memory is pinned here? Where is it
unpinned? I do not see put_page() anywhere.
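A minimal sketch of what I mean, assuming you keep track of the pinned
pages somewhere (the pinned_page variable below is hypothetical; the point
is just that each successful pin needs a matching put_page() on teardown):

	int ret = get_user_pages_fast(address, 1, 1, page);
	if (ret != 1) {
		/* nothing was pinned; do not touch page[0] */
		pr_err("kvm: cannot pin guest page at %#lx\n", address);
		break;
	}
	pfn = page_to_pfn(page[0]);

	/* later, when the memslot goes away, once per pinned page: */
	put_page(pinned_page);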
> + pfn = page_to_pfn(page[0]);
> + ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
> + *vptep = *ptep;
> + }
> +}
> +
> +void kvm_arch_flush_shadow_all(struct kvm *kvm)
> +{
> +}
> +
> +void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> + struct kvm_memory_slot *slot)
> +{
> + kvm_arch_flush_shadow_all(kvm);
> +}
> +
> +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
> +{
> + return 0;
> +}
> +
> +long kvm_arch_dev_ioctl(struct file *filp,
> + unsigned int ioctl, unsigned long arg)
> +{
> + return 0;
> +}
> +
> +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq)
> +{
> + if (irq < 0)
> + return -EINVAL;
> +
> + set_bit(irq, &vcpu->arch.ipi_events);
> + kvm_vcpu_kick(vcpu);
> +
> + return 0;
> +}
> +
> +long kvm_arch_vcpu_ioctl(struct file *filp,
> + unsigned int ioctl, unsigned long arg)
> +{
> + struct kvm_vcpu *vcpu = filp->private_data;
> + void __user *argp = (void __user *)arg;
> + int r = 0;
> +
> + switch (ioctl) {
> + case KVM_INTERRUPT: {
> + struct kvm_interrupt irq;
> +
> + r = -EFAULT;
> + if (copy_from_user(&irq, argp, sizeof(irq)))
> + goto out;
> + r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq);
> + if (r)
> + goto out;
> + r = 0;
> + break;
> + }
> + default:
> + r = -EINVAL;
> + }
> +
> +out:
> + return r;
> +}
> +
> +int kvm_dev_ioctl_check_extension(long ext)
> +{
> + return 0;
> +}
> +
> +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
> + struct kvm_dirty_log *log)
> +{
> + return 0;
> +}
> +
> +long kvm_arch_vm_ioctl(struct file *filp,
> + unsigned int ioctl, unsigned long arg)
> +{
> + long r = -EINVAL;
> +
> + return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
> + struct kvm_translation *tr)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + unsigned long page_size;
> + unsigned long gva = tr->linear_address;
> + unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
> + pud_t gpud;
> + pmd_t gpmd;
> + pte_t gpte;
> +
> + /* Get guest pgd (aka pud for three-level tables). */
> + gpgd_gpa = vcpu->arch.guest_context.page_table +
> + (sizeof(pgd_t) * pgd_index(gva));
> + if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
> + goto fail;
> + if (!pud_present(gpud))
> + goto fail;
> +
> + /* Get guest pmd. */
> + if (pud_huge_page(gpud)) {
> + /* FIXME: no super huge page support yet. */
> + if (pte_super(*(pte_t *)&gpud))
> + goto fail;
> + gpte = *(pte_t *)&gpud;
> + page_size = PGDIR_SIZE;
> + goto ok;
> + }
> + gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
> + (sizeof(pmd_t) * pmd_index(gva));
> + if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
> + goto fail;
> + if (!pmd_present(gpmd))
> + goto fail;
> +
> + /* Get guest pte. */
> + if (pmd_huge_page(gpmd)) {
> + /* FIXME: no super huge page support yet. */
> + if (pte_super(*(pte_t *)&gpmd))
> + goto fail;
> + gpte = *(pte_t *)&gpmd;
> + page_size = PMD_SIZE;
> + goto ok;
> + }
> + gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
> + (sizeof(pte_t) * pte_index(gva));
> + if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
All the kvm_read_guest() calls in this function need to be inside a
kvm->srcu read section. See the comment about gfn_to_pfn() below.
> + goto fail;
> + if (!pte_present(gpte))
> + goto fail;
> +
> + page_size = PAGE_SIZE;
> +
> +ok:
> + tr->physical_address =
> + PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1));
> + tr->valid = 1;
> + tr->writeable = pte_write(gpte);
> + tr->usermode = pte_user(gpte);
> +
> + return 0;
> +
> +fail:
> + tr->valid = 0;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> +{
> + regs->regs = vcpu->arch.regs;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> +{
> + vcpu->arch.regs = regs->regs;
> + vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> + struct kvm_sregs *sregs)
> +{
> + *sregs = vcpu->arch.sregs;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> + struct kvm_sregs *sregs)
> +{
> + vcpu->arch.sregs = *sregs;
> + return 0;
> +}
Most arches prefer the KVM_GET_ONE_REG/KVM_SET_ONE_REG interface for
getting/setting vcpu registers since it is more flexible, but the way you
are doing it is OK too.
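For reference, ONE_REG is just a pair of vcpu ioctls taking a struct
kvm_one_reg (an id plus a userspace address). A rough sketch of what a GPR
accessor could look like, ignoring the KVM_REG_* size/arch encoding that a
real implementation would put in the id (so treating the id as a plain
index here, which you would not do for real):

	static int get_one_reg(struct kvm_vcpu *vcpu,
			       const struct kvm_one_reg *reg)
	{
		u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;

		if (reg->id >= ARRAY_SIZE(vcpu->arch.regs.regs))
			return -EINVAL;
		return put_user(vcpu->arch.regs.regs[reg->id], uaddr) ?
			-EFAULT : 0;
	}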
> +
> +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
> + struct kvm_mp_state *mp_state)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
> + struct kvm_mp_state *mp_state)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
> + struct kvm_guest_debug *dbg)
> +{
> + return 0;
> +}
> +
> +/*
> + * panic_hv() will dump stack info of both guest os and host os, and set
> + * proper exit reason so that qemu can terminate the guest process.
> + */
> +static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
> +{
> + char panic_buf[256];
> + struct pt_regs *regs;
> + va_list ap;
> + int i;
> +
> + va_start(ap, fmt);
> + vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
> + va_end(ap);
> + pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
> +
> + /* Show guest os info */
> + regs = &vcpu->arch.regs;
> + for (i = 0; i < 17; i++)
> + pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
> + i, regs->regs[i], i+18, regs->regs[i+18],
> + i+36, regs->regs[i+36]);
> + pr_err(" r18: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
> + regs->regs[18], regs->regs[35], regs->tp);
> + pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
> + pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n",
> + regs->pc, regs->ex1, regs->faultnum);
> +
> + /* Show host os info */
> + pr_err("\nKVM stack in the host:\n");
> + dump_stack();
> +
> + /* Shut down the guest os */
> + pr_err("Shutting down guest.\n");
> + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
> + return 0;
> +}
> +
> +/* Copied from virt/kvm/kvm_main.c */
> +static int next_segment(unsigned long len, int offset)
> +{
> + if (len > PAGE_SIZE - offset)
> + return PAGE_SIZE - offset;
> + else
> + return len;
> +}
> +
> +static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
> + void *data, unsigned long len)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int seg;
> + int offset = offset_in_page(gva);
> + int ret;
> +
> + while ((seg = next_segment(len, offset)) != 0) {
> + struct kvm_translation tr;
> + tr.linear_address = gva;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return -EFAULT;
> + ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address),
> + data, offset, seg);
This needs a kvm->srcu read section. See the comment about gfn_to_pfn()
below.
> + if (ret < 0)
> + return ret;
> + offset = 0;
> + len -= seg;
> + data += seg;
> + gva += seg;
> + }
> + return 0;
> +}
> +
> +static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
> + const void *data, unsigned long len)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int seg;
> + int offset = offset_in_page(gva);
> + int ret;
> +
> + while ((seg = next_segment(len, offset)) != 0) {
> + struct kvm_translation tr;
> + tr.linear_address = gva;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return -EFAULT;
> + ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address),
> + data, offset, seg);
This needs a kvm->srcu read section. See the comment about gfn_to_pfn()
below.
> + if (ret < 0)
> + return ret;
> + offset = 0;
> + len -= seg;
> + data += seg;
> + gva += seg;
> + }
> + return 0;
> +}
> +
> +static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
> + unsigned long len)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + int seg;
> + int offset = offset_in_page(gva);
> + int ret;
> +
> + while ((seg = next_segment(len, offset)) != 0) {
> + struct kvm_translation tr;
> + tr.linear_address = gva;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return -EFAULT;
> + ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address),
> + offset, seg);
> + if (ret < 0)
> + return ret;
> + offset = 0;
> + len -= seg;
> + gva += seg;
> + }
> + return 0;
> +}
> +
> +/*
> + * The following functions are emulation functions for various
> + * hypervisor system calls (i.e. hv_*()). Return value:
> + * 1 if the host os can emulate it completely.
> + * < 0 if errors occur and then qemu will handle them.
> + * 0 if qemu emulation is needed.
> + * In both the < 0 and the == 0 cases, exit reason should
> + * be set for qemu handling.
> + */
> +
> +/* generic handler for hypercall which needs user (QEMU) to handle. */
> +static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
> +{
> + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
> + return 0;
> +}
> +
> +/* handler for illegal hypercall */
> +static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
> +{
> + return panic_hv(vcpu, "Illegal kvm hypercall: %ld",
> + (unsigned long)vcpu->arch.regs.regs[10]);
> +}
> +
> +static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
> +{
> + int version = vcpu->arch.regs.regs[0];
> + int chip_num = vcpu->arch.regs.regs[1];
> + int chip_rev_num = vcpu->arch.regs.regs[2];
> + int client_pl = vcpu->arch.regs.regs[3];
> +
> + if (client_pl != 1)
> + return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
> + " guests must request PL 1.\n"
> + "Reconfigure your guest with KVM_GUEST set.\n",
> + client_pl);
> +
> + if (version != HV_VERSION)
> + return panic_hv(vcpu, "Client built for hv version %d, but"
> + " this hv is version %d\n",
> + version, HV_VERSION);
> +
> + if (chip_num != TILE_CHIP)
> + return panic_hv(vcpu, "Client built for chip %d, but this"
> + " hardware is chip %d\n",
> + chip_num, TILE_CHIP);
> +
> + if (chip_rev_num != TILE_CHIP_REV)
> + return panic_hv(vcpu, "Client built for chip rev %d, but this"
> + " hardware is chip rev %d\n",
> + chip_rev_num, TILE_CHIP_REV);
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
> +{
> + HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
> + long rc;
> +
> + switch (query) {
> + case HV_SYSCONF_PAGE_SIZE_SMALL:
> + rc = PAGE_SIZE;
> + break;
> +
> + case HV_SYSCONF_PAGE_SIZE_LARGE:
> + rc = HPAGE_SIZE;
> + break;
> +
> + case HV_SYSCONF_VALID_PAGE_SIZES:
> +#if PAGE_SHIFT == 16
> + rc = HV_CTX_PG_SM_64K;
> +#elif PAGE_SHIFT == 14
> + rc = HV_CTX_PG_SM_16K;
> +#else
> +# error Fix hv_sysconf emulation for new page size
> +#endif
> + break;
> +
> + case HV_SYSCONF_PAGE_SIZE_JUMBO:
> + rc = 0; /* FIXME add super page support */
> + break;
> +
> + case HV_SYSCONF_CPU_SPEED:
> + case HV_SYSCONF_CPU_TEMP:
> + case HV_SYSCONF_BOARD_TEMP:
> + rc = hv_sysconf(query);
> + break;
> +
> + default:
> + rc = -EINVAL;
> + break;
> + }
> +
> + vcpu->arch.regs.regs[0] = rc;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
> +{
> + HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
> + long buflen = vcpu->arch.regs.regs[2];
> + char hvbuf[256];
> + const char *p;
> + long rc;
> +
> + switch (query) {
> +
> + /* For hardware attributes, just pass to the hypervisor. */
> + case HV_CONFSTR_BOARD_PART_NUM:
> + case HV_CONFSTR_BOARD_SERIAL_NUM:
> + case HV_CONFSTR_CHIP_SERIAL_NUM:
> + case HV_CONFSTR_BOARD_REV:
> + case HV_CONFSTR_CHIP_MODEL:
> + case HV_CONFSTR_BOARD_DESC:
> + case HV_CONFSTR_MEZZ_PART_NUM:
> + case HV_CONFSTR_MEZZ_SERIAL_NUM:
> + case HV_CONFSTR_MEZZ_REV:
> + case HV_CONFSTR_MEZZ_DESC:
> + case HV_CONFSTR_SWITCH_CONTROL:
> + case HV_CONFSTR_CHIP_REV:
> + case HV_CONFSTR_CPUMOD_PART_NUM:
> + case HV_CONFSTR_CPUMOD_SERIAL_NUM:
> + case HV_CONFSTR_CPUMOD_REV:
> + case HV_CONFSTR_CPUMOD_DESC:
> + rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
> + if (rc > sizeof(hvbuf)) {
> + /* Not the best answer, but very unlikely anyway. */
> + rc = sizeof(hvbuf);
> + hvbuf[sizeof(hvbuf)-1] = '\0';
> + }
> + p = hvbuf;
> + break;
> +
> + /* For hypervisor version info, just report the kernel version. */
> + case HV_CONFSTR_HV_SW_VER:
> + p = UTS_RELEASE;
> + break;
> + case HV_CONFSTR_HV_CONFIG:
> + case HV_CONFSTR_HV_CONFIG_VER:
> + p = "";
> + break;
> +
> + default:
> + rc = HV_EINVAL;
> + goto done;
> + }
> +
> + rc = strlen(p) + 1; /* include NUL */
> + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
> + p, min(rc, buflen)))
> + rc = HV_EFAULT;
> +
> +done:
> + vcpu->arch.regs.regs[0] = rc;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
> +{
> + HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0];
> + struct rtc_time tm;
> + struct timeval tv;
> +
> + do_gettimeofday(&tv);
> + rtc_time_to_tm(tv.tv_sec, &tm);
> + hvtm->tm_sec = tm.tm_sec;
> + hvtm->tm_min = tm.tm_min;
> + hvtm->tm_hour = tm.tm_hour;
> + hvtm->tm_mday = tm.tm_mday;
> + hvtm->tm_mon = tm.tm_mon;
> + hvtm->tm_year = tm.tm_year;
> + hvtm->flags = 0;
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
> +{
> + /* Do nothing here. */
> + pr_warn("hv_set_rtc() will not work in kvm guest\n");
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
> +{
> + int idx = vcpu->arch.regs.regs[0];
> + HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
> +
> + switch (idx) {
> + case 0:
> + var->start = 0UL;
> + var->size = 0x20000000000UL;
> + break;
> + case 1:
> + var->start = 0xFFFFFFFF80000000UL;
> + var->size = 0x80000000UL;
> + break;
> + default:
> + var->start = 0UL;
> + var->size = 0UL;
> + break;
> + }
> +
> + return 1;
> +}
> +
> +/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */
> +static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
> +{
> + int idx = vcpu->arch.regs.regs[0];
> + HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
> +
> + if (idx == 0) {
> + var->start = min_asid;
> + var->size = max_asid - min_asid + 1;
> + } else {
> + var->start = 0;
> + var->size = 0;
> + }
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
> +{
> + HV_Topology *tp;
> + int cpus;
> +
> + /* Depends on the definition of struct HV_Topology */
> + tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
> +
> + cpus = atomic_read(&vcpu->kvm->online_vcpus);
> + tp->coord.x = vcpu->vcpu_id;
> + tp->coord.y = 0;
> + tp->width = cpus;
> + tp->height = 1;
> +
> + return 1;
> +}
> +
> +static int xy_to_vcpu(struct kvm *kvm, int x, int y)
> +{
> + if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus))
> + return -1;
> + return x;
> +}
> +
> +/*
> + * The primary vcpu is the one that initially runs while the others
> + * all block. It is the only that is allowed to call hv_start_all_tiles().
> + * The other cpus are secondary.
> + */
> +static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->vcpu_id != 0;
> +}
> +
> +static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
> +{
> + struct completion *c = &vcpu->kvm->arch.smp_start;
> + if (is_secondary_vcpu(vcpu) || completion_done(c))
> + return panic_hv(vcpu, "start_all_tiles() called again");
> + complete_all(c);
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
> +{
> + gpa_t gpa = vcpu->arch.regs.regs[0];
> + HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
> + gfn_t gfn;
> + pfn_t pfn;
> + hpa_t hpa;
> +
> + gfn = gpa_to_gfn(gpa);
> + pfn = gfn_to_pfn(vcpu->kvm, gfn);
> + if (is_error_pfn(pfn))
> + return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
> + gpa);
> + hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
> +
> + vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
> +{
> + gpa_t gpa = vcpu->arch.regs.regs[0];
> + HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1];
> + uint64_t val = vcpu->arch.regs.regs[2];
> + gfn_t gfn;
> + pfn_t pfn;
> + hpa_t hpa;
> +
> + gfn = gpa_to_gfn(gpa);
> + pfn = gfn_to_pfn(vcpu->kvm, gfn);
Here and in the function above you use gfn_to_pfn(), which accesses
memslots. Memslots are srcu-protected, so you have to take the kvm->srcu
read lock around those calls. Here is the link to the patch that
documents that:
http://www.mail-archive.com/kvm@vger.kernel.org/msg95566.html
Another thing about these functions is that they are very similar to
kvm_read_guest()/kvm_write_guest(); the only difference I see is that they
use hv_physaddr_read64()/hv_physaddr_write64() instead of
__copy_from_user()/__copy_to_user(). What are those special functions and
why can't we use __copy_to_user()/__copy_from_user() here?
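The locking pattern itself is small, something like this (idx is just a
local):

	int idx = srcu_read_lock(&vcpu->kvm->srcu);
	pfn = gfn_to_pfn(vcpu->kvm, gfn);
	srcu_read_unlock(&vcpu->kvm->srcu, idx);

The same wrapping applies to the kvm_read_guest()/kvm_write_guest() call
sites flagged above.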
> + if (is_error_pfn(pfn))
> + return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
> + gpa);
> + hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
> +
> + hv_physaddr_write64(hpa, *access, val);
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
> +{
> + /* Do we care about the argument msgstate? */
> + vcpu->arch.regs.regs[0] = HV_OK;
> +
> + return 1;
> +}
> +
> +/*
> + * NOTE: we may coalesce multiple messages with the same tag to the
> + * same recepient. Currently the only messages used by Linux are
> + * start/stop cpu (where coalescing is OK), and the smp_call_function()
> + * IPI message tag. In the latter case we rely on the generic
> + * smp_call_function code to properly handle this, and since it only
> + * uses the IPI as a way to wake up the generic list-walking code,
> + * it's OK if we coalesce several IPI deliveries before the recipient
> + * core takes action.
> + */
> +static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + struct kvm_vcpu *vcpui;
> + HV_Recipient recip[NR_CPUS];
> + HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
> + int nrecip = vcpu->arch.regs.regs[1];
> + int buflen = vcpu->arch.regs.regs[3];
> + int sent, vcpu_id, tag;
> +
> + /* NOTE: we only support the Linux usage of buflen == sizeof(int). */
> + if (unlikely(buflen != sizeof(int) ||
> + nrecip >= atomic_read(&kvm->online_vcpus))) {
> + vcpu->arch.regs.regs[0] = HV_EINVAL;
> + return 1;
> + }
> +
> + /* Get the buf info */
> + if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
> + &tag, sizeof(tag))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + /* Range-check the tag value. */
> + if (tag < 0 || tag >= MAX_MSG_TAG) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + /* Get all the recipients */
> + if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
> + nrecip * sizeof(HV_Recipient))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + for (sent = 0; sent < nrecip; sent++) {
> + if (recip[sent].state != HV_TO_BE_SENT)
> + continue;
> + vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
> + if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
> + recip[sent].state = HV_BAD_RECIP;
> + continue;
> + }
> + vcpui = kvm_get_vcpu(kvm, vcpu_id);
> + set_bit(tag, &vcpui->arch.pending_msgs);
> + kvm_vcpu_kick(vcpui);
> + recip[sent].state = HV_SENT;
> + }
> +
> + if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
> + nrecip * sizeof(HV_Recipient))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + vcpu->arch.regs.regs[0] = sent;
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
> +{
> + HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
> + int buflen = vcpu->arch.regs.regs[3];
> + int tag;
> +
> + /* Currently we only support messages from other tiles. */
> + rmi->source = HV_MSG_TILE;
> +
> + if (buflen <= sizeof(int)) {
> + rmi->msglen = HV_E2BIG;
> + return 1;
> + }
> +
> + tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
> + if (tag >= MAX_MSG_TAG) {
> + /* No more messages */
> + rmi->msglen = 0;
> + return 1;
> + }
> +
> + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
> + &tag, sizeof(int))) {
> + rmi->msglen = HV_EFAULT;
> + return 1;
> + }
> +
> + /*
> + * This clear_bit could race with a set_bit as another core
> + * delivers a new smp_function_call to this core. However,
> + * the smp_function_call code will have set up the additional
> + * smp_function_call data on the kernel's list prior to
> + * raising the interrupt, so even if we lose the new
> + * interrupt due to the race, we still haven't dispatched
> + * to the original interrupt handler, and when we do, it
> + * will find both smp_function_calls waiting for it, so the
> + * race is harmless. This is consistent with the fact that
> + * the generic code is trying to support pretty much
> + * arbitrary architecture-dependent IPI semantics, so it
> + * is very conservative about what it assumes.
> + *
> + * Also note that we only clear_bit on the core that owns
> + * the mask, so there's no race condition caused by the
> + * find_first_bit above and the clear_bit here, since once
> + * a bit is found it will stay set until this point.
> + */
> + clear_bit(tag, &vcpu->arch.pending_msgs);
> + rmi->msglen = sizeof(int);
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
> +{
> + HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0];
> +
> + *ctx = hv_inquire_guest_context();
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + HV_InqTileSet set = vcpu->arch.regs.regs[0];
> + unsigned long gva = vcpu->arch.regs.regs[1];
> + int length = vcpu->arch.regs.regs[2];
> + struct cpumask mask = CPU_MASK_NONE;
> + int cpus, i, retval, bytes2copy, bytes2zero;
> +
> + switch (set) {
> + case HV_INQ_TILES_AVAIL:
> + case HV_INQ_TILES_HFH_CACHE:
> + case HV_INQ_TILES_LOTAR:
> + cpus = atomic_read(&kvm->online_vcpus);
> + for (i = 0; i < cpus; ++i)
> + cpumask_set_cpu(i, &mask);
> + break;
> + case HV_INQ_TILES_SHARED:
> + break;
> + default:
> + retval = HV_EINVAL;
> + goto done;
> + }
> +
> + bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
> + bytes2zero = length - bytes2copy;
> +
> + if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
> + retval = HV_EFAULT;
> + goto done;
> + }
> +
> + if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
> + retval = HV_EFAULT;
> + goto done;
> + }
> +
> + retval = HV_OK;
> +done:
> + vcpu->arch.regs.regs[0] = retval;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
> +{
> + HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0];
> + int pl = (int) vcpu->arch.regs.regs[1];
> + struct kvm_vcpu *target_vcpu;
> + int vcpu_id;
> +
> + vcpu_id = vtarget.x;
> + if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
> + vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
> + vcpu->arch.regs.regs[0] = HV_EINVAL;
> + return 1;
> + }
> +
> + target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
> + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
> + &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) {
> + vcpu->arch.regs.regs[0] = HV_EFAULT;
> + return 1;
> + }
> +
> + vcpu->arch.regs.regs[0] = HV_OK;
> +
> + return 1;
> +}
> +
> +struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
> +{
> + struct kvm_vcpu *vcpui;
> + unsigned long idx;
> +
> + kvm_for_each_vcpu(idx, vcpui, kvm)
> + if (vcpui->arch.ipi_gpa == gpa)
> + return vcpui;
> +
> + return NULL;
> +}
> +
> +/*
> + * Most page faults will be downcall-ed from hv to and be handled directly
> + * by either guest os or host os. This function is used to handle the
> + * rest cases.
> + */
> +static int handle_mmio(struct kvm_vcpu *vcpu)
> +{
> + struct kvm *kvm = vcpu->kvm;
> + struct kvm_translation tr;
> + struct kvm_vcpu *ipi_vcpu;
> +
> + tr.linear_address = (__u64) vcpu->arch.fault_addr;
> + kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
> + if (!tr.valid)
> + return 0;
> +
> + /* ipi PTE for rescheduling interrupt? */
> + ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
> + if (!ipi_vcpu)
> + return 0;
> +
> + set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
> + kvm_vcpu_kick(ipi_vcpu);
> +
> + /* Juke the PC past the store instruction. */
> + vcpu->arch.regs.pc += 8;
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
> +{
> + /*
> + * We do not expect this call in guest so far. At least guest os
> + * should just follow host os instead of *set*. Besides,
> + * hv_set_pte_super_shift() will not be called in guest os with
> + * current guest os setting.
> + */
> + vcpu->arch.regs.regs[0] = HV_EINVAL;
> +
> + return 1;
> +}
> +
> +static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
> +{
> + HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0];
> +
> + hvss->new_speed = HV_EPERM;
> + hvss->end_cycle = 0;
> + hvss->delta_ns = 0;
> +
> + return 1;
> +}
> +
> +static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
> + HCALL_DEFS
> +};
> +
> +static int kvm_handle_exit(struct kvm_vcpu *vcpu)
> +{
> + unsigned long hcall_idx;
> +
> + switch (vcpu->run->exit_reason) {
> + case KVM_EXIT_HYPERCALL:
> + hcall_idx = vcpu->arch.regs.regs[10];
> + if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
> + hcall_handlers[hcall_idx] == NULL))
> + return kvm_emulate_illegal(vcpu);
> +
> + /* Juke us past the swint0 when we return. */
> + vcpu->arch.regs.pc += 8;
> +
> + return hcall_handlers[hcall_idx](vcpu);
> +
> + case KVM_EXIT_MMIO:
> + if (handle_mmio(vcpu))
> + return 1;
> + return panic_hv(vcpu, "Out-of-bounds client memory access");
> +
> + case KVM_EXIT_AGAIN:
> + return 1;
> +
> + default:
> + return 0;
> + }
> +}
> +
> +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> +{
> + return !test_and_set_bit(KVM_REQ_KICK, &vcpu->requests);
> +}
Use of KVM_REQ_KICK was deprecated some time ago by commit
d94e1dc9af60e3431a586c3edfbe42d8a0d3932b. You probably copied this from
ia64, which is a bad example since the kvm support there is broken and
will be removed soon. Set vcpu->mode to IN_GUEST_MODE/OUTSIDE_GUEST_MODE
instead, and use helper functions such as kvm_vcpu_exiting_guest_mode()
here.
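Roughly what x86 does now (a sketch; the real thing also wants the proper
barriers around setting vcpu->mode on guest entry/exit):

	int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
	{
		return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
	}

and in kvm_tile_run(), instead of the KVM_REQ_KICK set_bit/clear_bit:

	vcpu->mode = IN_GUEST_MODE;
	/* ... enter the guest ... */
	vcpu->mode = OUTSIDE_GUEST_MODE;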
> +
> +/*
> + * Any interrupt that would normally be handled by the host at PL2
> + * needs to be reassigned to the guest at PL1 as we enter.
> + *
> + * The TLB interrupts remain handled by the hypervisor and are downcalled
> + * to the appropriate host or guest as necessary.
> + *
> + * FIXME: We don't give the UDN interrupts for now; at some point we
> + * plan to allow an option to pin the vcpus and report the true
> + * geometry to the guest, at which point passing the UDN access would
> + * make sense.
> + *
> + * FIXME: For now we don't pass the profiling interrupts to the guest,
> + * and instead require profiling be run in the host; we should be able
> + * to support guest-level profiling pretty easily, but we need to
> + * think about whether there are vcpu migration issues there.
> + */
> +static void kvm_grant_mpls(void)
> +{
> + __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
> + __insn_mtspr(SPR_MPL_ILL_SET_1, 1);
> + __insn_mtspr(SPR_MPL_GPV_SET_1, 1);
> + __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
> + __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
> +}
> +
> +static void kvm_ungrant_mpls(void)
> +{
> + __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
> + __insn_mtspr(SPR_MPL_ILL_SET_2, 1);
> + __insn_mtspr(SPR_MPL_GPV_SET_2, 1);
> + __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
> + __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
> +}
> +
> +/*
> + * There is lots of state that is (for the non-virtualized case) held
> + * permanently in SPRs, or that is in any case not context-switched.
> + * The next two routines switch in and out all the SPR state.
> + *
> + * We try to fix the timer so that when we restart, we fix up the
> + * timer value so that will fire at the correct wall-clock time even
> + * if we have been scheduled out for a little bit. This may also
> + * mean we end up firing it immediately on return, and suffer a
> + * timer delay in the guest.
> + */
> +static void kvm_save_sprs(struct kvm_vcpu *vcpu)
> +{
> + vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
> + vcpu->arch.vmexit_cycles = get_cycles();
> +
> +#define SAVE_SPR(x) vcpu->arch.sregs.x = __insn_mfspr(SPR_ ## x)
> + FOR_EACH_GUEST_SPR(SAVE_SPR);
> +#undef SAVE_SPR
> +}
> +
> +static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
> +{
> + unsigned long count = vcpu->arch.timer_control;
> + unsigned long underflow =
> + (count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
> + unsigned long disabled =
> + (count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1;
> +
> + if (!disabled) {
> + unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles;
> + count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
> + underflow |= delta > count;
> + count -= delta;
> + count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
> + count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
> + }
> + __insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count);
> +
> +#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.sregs.x)
> + FOR_EACH_GUEST_SPR(RESTORE_SPR);
> +#undef RESTORE_SPR
> +}
> +
> +/*
> + * When entering the guest, we need to eliminate any PL0 translations
> + * that were in use by qemu, since the guest's PL0 translations will
> + * be different. We also flush PL1 translations in case there have
> + * been changes to the virtualization page table, etc.
> + *
> + * FIXME: Add a way to just flush PL0/PL1, or just flush below
> + * the host PAGE_OFFSET, or add vpid support, etc.
> + */
> +static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
> +{
> + HV_Context *ctx;
> + pgd_t *vpgdir;
> + pte_t *ptep;
> + int rc;
> +
> + /* Install virtualization context */
> + vpgdir = vcpu->kvm->arch.vpgd;
> + BUG_ON(vpgdir == NULL);
> + ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
> + rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
> + WARN_ON_ONCE(rc < 0);
> +
> + /* Install guest context */
> + ctx = &vcpu->arch.guest_context;
> + rc = hv_install_guest_context(ctx->page_table, ctx->access,
> + ctx->asid, ctx->flags);
> + WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
> + ctx->page_table, ctx->access.val,
> + ctx->asid, ctx->flags, rc);
> +
> + hv_flush_all(0);
> +}
> +
> +/*
> + * De-install the virtualization context so we take faults below the
> + * host Linux PL in the normal manner going forward.
> + *
> + * We flush all the TLB mappings as we exit the guest, since the
> + * guest has been using the ASIDs as it pleases, and may have installed
> + * incompatible mappings for qemu's process as well. Note that we don't
> + * worry about host-PL interrupts that occur while the guest is running,
> + * on the assumption that such interrupts can't touch userspace
> + * addresses legally anyway.
> + *
> + * NOTE: we may want to add a hypervisor call to just flush mappings
> + * below PL2 and use that here instead.
> + */
> +static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
> +{
> + int rc;
> +
> + /* Remember guest context */
> + vcpu->arch.guest_context = hv_inquire_guest_context();
> +
> + /* Disable virtualization context */
> + rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
> + WARN_ON_ONCE(rc < 0);
> +
> + /* Flush everything in the TLB. */
> + hv_flush_all(0);
> +}
> +
> +static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
> +{
> + /*
> + * Capture current set of ipi_events. We might race with
> + * another thread adding an event, but if so we'll just miss
> + * it on this go-around and see it next time.
> + */
> + vcpu->arch.sregs.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
> +
> + /*
> + * Note: We could set PC and EX1 for the guest os to jump
> + * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
> + * is unmasked and the guest is not at PL1 with ICS set.
> + * But in fact it's about as fast to just set INTCTRL_1_STATUS
> + * here and then run the short INTCTRL_1 handler in the guest.
> + */
> + vcpu->arch.sregs.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
> +}
> +
> +static void kvm_tile_run(struct kvm_vcpu *vcpu)
> +{
> + struct thread_info *ti = current_thread_info();
> + unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
> +
> + /*
> + * Disable interrupts while we set up the guest state.
> + * This way, if we race with another core trying to tell us
> + * to fix up our guest state, we will take the kick only as
> + * we actually try to enter the guest, and instead we will
> + * vmexit and end up retrying.
> + */
> + local_irq_disable();
> + kvm_guest_context_enter(vcpu);
> + clear_bit(KVM_REQ_KICK, &vcpu->requests);
> + ti->vcpu = vcpu;
> + vcpu->cpu = get_cpu();
> + kvm_inject_interrupts(vcpu);
> + kvm_grant_mpls();
> + kvm_restore_sprs(vcpu);
> +
> + /* Calling this function irets into the guest. */
> + kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
> +
> + /* We resume here due to a call to kvm_vmexit. */
> + __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
> +
> + vcpu->cpu = -1;
> + put_cpu();
> + ti->vcpu = NULL;
> + set_bit(KVM_REQ_KICK, &vcpu->requests);
> + vcpu->run->ready_for_interrupt_injection = 1;
> + kvm_ungrant_mpls();
> + kvm_save_sprs(vcpu);
> + __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
> + kvm_guest_context_exit(vcpu);
> + local_irq_enable();
> +}
> +
> +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +{
> + int r = 1;
> +
> + while (r > 0) {
> + kvm_guest_enter();
> + kvm_tile_run(vcpu);
> + kvm_guest_exit();
> +
> + r = kvm_handle_exit(vcpu);
> + /*
> + * <0: error for userspace.
> + * =0: QEMU to handle.
> + * >0: host os can handle it fully.
> + */
> + if (r <= 0)
> + break;
> +
> + if (signal_pending(current)) {
> + vcpu->run->exit_reason = KVM_EXIT_INTR;
> + r = -EINTR;
> + break;
> + }
> +
> + kvm_resched(vcpu);
> + }
> +
> + return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +{
> + int r;
> + sigset_t sigsaved;
> +
> + /* Secondary cpus must wait until they are told they can start. */
> + if (vcpu->arch.suspended) {
> + struct completion *c = &vcpu->kvm->arch.smp_start;
> + if (wait_for_completion_interruptible(c))
> + return -EINTR;
> + vcpu->arch.suspended = 0;
> + }
> +
> + if (vcpu->sigset_active)
> + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
> +
> + r = __vcpu_run(vcpu, kvm_run);
> +
> + if (vcpu->sigset_active)
> + sigprocmask(SIG_SETMASK, &sigsaved, NULL);
> +
> + return r;
> +}
> +
> +int kvm_arch_init(void *opaque)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_exit(void)
> +{
> +}
> +
> +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> +{
> + int i;
> + unsigned long resv_gfn_start;
> + struct kvm_memory_slot *s;
> + struct kvm *kvm = vcpu->kvm;
> +
> + if (!kvm->arch.resv_gpa_start) {
> + resv_gfn_start = 0;
> +
> + for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
> + s = &kvm->memslots->memslots[i];
Slots can be added or removed after the vcpu is created. And of course
the kvm->srcu comment applies here too. A memslot can be KVM_MEMSLOT_INVALID
if it is in the process of being deleted, so you have to check for that as
well, but probably it is better for userspace to set resv_gpa_start instead
of the kernel trying to figure it out here.
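If you do keep the kernel-side scan, what I have in mind is roughly the
following (untested sketch; kvm_for_each_memslot() and KVM_MEMSLOT_INVALID
are the generic helpers from <linux/kvm_host.h>):

        int idx;
        struct kvm_memslots *slots;
        struct kvm_memory_slot *s;

        idx = srcu_read_lock(&kvm->srcu);
        slots = kvm_memslots(kvm);
        kvm_for_each_memslot(s, slots) {
                /* Skip empty slots and slots being deleted. */
                if (!s->npages || (s->flags & KVM_MEMSLOT_INVALID))
                        continue;
                if (s->base_gfn + s->npages > resv_gfn_start)
                        resv_gfn_start = s->base_gfn + s->npages;
        }
        srcu_read_unlock(&kvm->srcu, idx);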
> +
> + if (!s->npages)
> + continue;
> +
> + if ((s->base_gfn + s->npages) > resv_gfn_start)
> + resv_gfn_start = s->base_gfn + s->npages;
> + }
> +
> + kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start);
> + }
> +
> + /* Initialize to enter fake PA=VA mode in hypervisor. */
> + vcpu->arch.guest_context.page_table = HV_CTX_NONE;
> +
> + vcpu->arch.ipi_gpa =
> + kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE);
> + vcpu->arch.ipi_gpte =
> + pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL);
> +
> + /* Mark the core suspended if it is not the boot cpu. */
> + vcpu->arch.suspended = is_secondary_vcpu(vcpu);
> +
> + return 0;
> +}
> +
> +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> + /* Notify simulator that this task handles this vcpu. */
> + sim_set_vcpu(vcpu->vcpu_id);
> +}
> +
> +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> +{
> + sim_clear_vcpu();
> +}
> +
> +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
> +{
> + struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
> + int rc;
> +
> + if (!vcpu)
> + return ERR_PTR(-ENOMEM);
> +
> + rc = kvm_vcpu_init(vcpu, kvm, id);
> + if (rc) {
> + kfree(vcpu);
> + return ERR_PTR(rc);
> + }
> +
> + return vcpu;
> +}
> +
> +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
> +{
> + memset(&vcpu->arch.regs, 0, sizeof(struct pt_regs));
> + memset(&vcpu->arch.sregs, 0, sizeof(struct pt_regs));
> + vcpu->arch.sregs.IPI_MASK_1 = -1UL;
> + vcpu->arch.sregs.INTERRUPT_MASK_1 = -1UL;
> + vcpu->arch.sregs.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
> + return 0;
> +}
> +
> +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
> +{
> + kvm_vcpu_uninit(vcpu);
> + kfree(vcpu);
> +}
> +
> +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> +{
> + return kvm_arch_vcpu_destroy(vcpu);
> +}
> +
> +int kvm_arch_hardware_enable(void *garbage)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_hardware_disable(void *garbage)
> +{
> +}
> +
> +int kvm_arch_hardware_setup(void)
> +{
> + return 0;
> +}
> +
> +void kvm_arch_hardware_unsetup(void)
> +{
> +}
> +
> +void kvm_arch_check_processor_compat(void *rtn)
> +{
> +}
> +
> +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> +{
> + if (type)
> + return -EINVAL;
> +
> + init_completion(&kvm->arch.smp_start);
> + return 0;
> +}
> +
> +void kvm_arch_destroy_vm(struct kvm *kvm)
> +{
> + struct kvm_vcpu *vcpu;
> + int i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm)
> + kvm_arch_vcpu_free(vcpu);
> +
> + /* Seems to be unnecessary? */
> + mutex_lock(&kvm->lock);
> + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
> + kvm->vcpus[i] = NULL;
> +
> + atomic_set(&kvm->online_vcpus, 0);
> + mutex_unlock(&kvm->lock);
> +
> + if (kvm->arch.vpgd)
> + pgd_free(kvm->mm, kvm->arch.vpgd);
> +}
> +
> +void kvm_arch_sync_events(struct kvm *kvm)
> +{
> +}
> +
> +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> +/* Called from guest hv glue via swint0 traps. */
> +void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
> +{
> + /* Hypercalls are only valid from PL1. */
> + if (EX1_PL(regs->ex1) != 0) {
> + kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
> + /*NORETURN*/
> + }
> + do_trap(regs, fault_num, 0);
> +}
> +
> +void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
> + unsigned long fault_addr, unsigned long write)
> +{
> + struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
> + BUG_ON(vcpu == NULL);
> + vcpu->arch.fault_addr = fault_addr;
> + kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
> + /*NORETURN*/
> +}
> +
> +void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
> +{
> + kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
> + /*NORETURN*/
> +}
> +
> +void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
> +{
> + struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
> + vcpu->run->exit_reason = exit_reason;
> + vcpu->arch.regs = *regs;
> + vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
> + kvm_vmexit(vcpu->arch.host_sp);
> + /*NORETURN*/
> +}
> +
> +static int __init kvm_tile_init(void)
> +{
> + return kvm_init(NULL, sizeof(struct kvm_vcpu),
> + __alignof__(struct kvm_vcpu), THIS_MODULE);
> +}
> +
> +static void __exit kvm_tile_exit(void)
> +{
> + kvm_exit();
> +}
> +
> +module_init(kvm_tile_init);
> +module_exit(kvm_tile_exit);
> diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
> index 82733c8..1590282 100644
> --- a/arch/tile/lib/exports.c
> +++ b/arch/tile/lib/exports.c
> @@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);
>
> /* hypervisor glue */
> #include <hv/hypervisor.h>
> +EXPORT_SYMBOL(hv_confstr);
> +EXPORT_SYMBOL(hv_dev_close);
> EXPORT_SYMBOL(hv_dev_open);
> +EXPORT_SYMBOL(hv_dev_poll);
> +EXPORT_SYMBOL(hv_dev_poll_cancel);
> EXPORT_SYMBOL(hv_dev_pread);
> -EXPORT_SYMBOL(hv_dev_pwrite);
> EXPORT_SYMBOL(hv_dev_preada);
> +EXPORT_SYMBOL(hv_dev_pwrite);
> EXPORT_SYMBOL(hv_dev_pwritea);
> -EXPORT_SYMBOL(hv_dev_poll);
> -EXPORT_SYMBOL(hv_dev_poll_cancel);
> -EXPORT_SYMBOL(hv_dev_close);
> -EXPORT_SYMBOL(hv_sysconf);
> -EXPORT_SYMBOL(hv_confstr);
> +EXPORT_SYMBOL(hv_flush_all);
> EXPORT_SYMBOL(hv_get_rtc);
> +#ifdef __tilegx__
> +EXPORT_SYMBOL(hv_inquire_guest_context);
> +EXPORT_SYMBOL(hv_install_guest_context);
> +EXPORT_SYMBOL(hv_install_virt_context);
> +#endif
> +EXPORT_SYMBOL(hv_physaddr_read64);
> +EXPORT_SYMBOL(hv_physaddr_write64);
> EXPORT_SYMBOL(hv_set_rtc);
> +EXPORT_SYMBOL(hv_sysconf);
>
> /* libgcc.a */
> uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
> diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
> index 64eec3f..39c48cb 100644
> --- a/arch/tile/mm/fault.c
> +++ b/arch/tile/mm/fault.c
> @@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
> flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> (write ? FAULT_FLAG_WRITE : 0));
>
> - is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
> + is_kernel_mode = !user_mode(regs);
>
> tsk = validate_current();
>
> @@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
> }
>
> #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
> - if (EX1_PL(regs->ex1) != USER_PL) {
> + if (!user_mode(regs)) {
> struct async_tlb *async;
> switch (fault_num) {
> #if CHIP_HAS_TILE_DMA()
> diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
> index 3004433..d6948d4 100644
> --- a/arch/tile/mm/pgtable.c
> +++ b/arch/tile/mm/pgtable.c
> @@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
>
> #if CHIP_HAS_MMIO()
>
> -/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
> -void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
> - pgprot_t home)
> +void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
> + unsigned long flags, pgprot_t prot)
> {
> void *addr;
> struct vm_struct *area;
> unsigned long offset, last_addr;
> - pgprot_t pgprot;
>
> /* Don't allow wraparound or zero size */
> last_addr = phys_addr + size - 1;
> if (!size || last_addr < phys_addr)
> return NULL;
>
> - /* Create a read/write, MMIO VA mapping homed at the requested shim. */
> - pgprot = PAGE_KERNEL;
> - pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
> - pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
> -
> /*
> * Mappings have to be page-aligned
> */
> @@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
> /*
> * Ok, go for it..
> */
> - area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
> + area = get_vm_area(size, flags);
> if (!area)
> return NULL;
> area->phys_addr = phys_addr;
> addr = area->addr;
> if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
> - phys_addr, pgprot)) {
> + phys_addr, prot)) {
> free_vm_area(area);
> return NULL;
> }
> - return (__force void __iomem *) (offset + (char *)addr);
> + return (void *) (offset + (char *)addr);
> +}
> +EXPORT_SYMBOL(generic_remap_prot);
> +
> +/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
> +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
> + pgprot_t home)
> +{
> + pgprot_t pgprot;
> + unsigned long flags;
> +
> + /* Create a read/write, MMIO VA mapping homed at the requested shim. */
> + pgprot = PAGE_KERNEL;
> + pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
> + pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
> + flags = VM_IOREMAP; /* | other flags? */
> +
> + return (__force void __iomem *) generic_remap_prot(phys_addr,
> + size, flags, pgprot);
> }
> EXPORT_SYMBOL(ioremap_prot);
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index acccd08..b622337 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -171,6 +171,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_WATCHDOG 21
> #define KVM_EXIT_S390_TSCH 22
> #define KVM_EXIT_EPR 23
> +#define KVM_EXIT_AGAIN 24
>
> /* For KVM_EXIT_INTERNAL_ERROR */
> /* Emulate instruction failed. */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1580dd4..2c4fd23 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
> if (vcpu->kvm->mm != current->mm)
> return -EIO;
>
> -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
> +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
> + defined(CONFIG_TILEGX)
No need to do that. Use the KVM_IRQ_LINE ioctl, which is asynchronous
with respect to the vcpu. S390 and PPC are here for historical reasons,
and MIPS was a review mistake. BTW, is the interrupt controller emulated
in userspace or in the kernel?
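For reference, the userspace side then becomes a plain vm ioctl, something
like this (rough sketch; the guest irq number below is made up):

        struct kvm_irq_level irq_level = {
                .irq = GUEST_IPI_IRQ,   /* made-up guest irq number */
                .level = 1,
        };

        if (ioctl(vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
                perror("KVM_IRQ_LINE");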
> /*
> * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
> * so vcpu_load() would break it.
> --
> 1.8.3.1
--
Gleb.
On 10/09/2013 12:53, Gleb Natapov wrote:
>> > +#ifndef __KERNEL__
>> > +/* For hv_*() */
>> > +#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
>> > +#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
>> > +#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
>> > +#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
>> > +/* For others */
>> > +#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
> This does not belong in a kernel header. QEMU is not the only user of the
> KVM kernel APIs. Please drop that and change all the references in comments
> from "qemu" to "userspace". If you add code that works around QEMU bugs it
> is appropriate to mention QEMU by name; otherwise the interface to userspace
> should not be QEMU specific.
>
In general, I believe that HCALL_DEFS should not be part of the public
interface.
Otherwise, adding a new hypercall would break compilation of userspace.
Hypercalls (after the first commit) should always be associated with a
capability, so they shouldn't be generated unless userspace explicitly
requests them.
Rather, document the hypercalls under Documentation/virtual/kvm, noting
which are implemented in the kernel and which need to be handled in
userspace.
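The usual pattern is that userspace probes for a capability before using
the hypercall, e.g. (sketch; the capability name here is made up and would
be defined together with the hypercall it guards):

        /* kvm_fd is the /dev/kvm file descriptor. */
        if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_TILE_VIRTIO_HCALL) <= 0) {
                /* hypercall not available; fall back or fail device setup */
        }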
BTW, BOTH_EMULATE and USER_HCALL seem unused.
Paolo
On 28/08/2013 22:58, Chris Metcalf wrote:
> This change enables support for a virtio-based console,
> network support, and block driver support.
>
> We remove some debug code in relocate_kernel_64.S that made raw
> calls to the hv_console_putc Tilera hypervisor API, since everything
> now should funnel through the early_hv_write() API.
>
> Signed-off-by: Chris Metcalf <[email protected]>
Why couldn't this use the "regular" virtio-mmio interface?
> diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
> new file mode 100644
> index 0000000..8faa959
> --- /dev/null
> +++ b/arch/tile/include/asm/kvm_virtio.h
> @@ -0,0 +1,26 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +#ifndef _ASM_TILE_KVM_VIRTIO_H
> +#define _ASM_TILE_KVM_VIRTIO_H
> +
> +#include <uapi/asm/kvm_virtio.h>
> +
> +
> +struct kvm_device {
> + struct virtio_device vdev;
> + struct kvm_device_desc *desc;
> + unsigned long desc_pa;
> +};
> +
> +#endif /* _ASM_TILE_KVM_VIRTIO_H */
> diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
> index 89022a5..f07cc24 100644
> --- a/arch/tile/include/uapi/asm/Kbuild
> +++ b/arch/tile/include/uapi/asm/Kbuild
> @@ -8,6 +8,7 @@ header-y += cachectl.h
> header-y += hardwall.h
> header-y += kvm.h
> header-y += kvm_para.h
> +header-y += kvm_virtio.h
> header-y += mman.h
> header-y += ptrace.h
> header-y += setup.h
> diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
> index aa7b97f..4346520 100644
> --- a/arch/tile/include/uapi/asm/kvm.h
> +++ b/arch/tile/include/uapi/asm/kvm.h
> @@ -149,6 +149,9 @@
> */
> #define KVM_OTHER_HCALL 128
>
> +/* Hypercall index for virtio. */
> +#define KVM_HCALL_virtio 128
> +
> /* One greater than the maximum hypercall number. */
> #define KVM_NUM_HCALLS 256
>
> @@ -256,6 +259,8 @@ struct kvm_sync_regs {
> KVM_EMULATE(get_ipi_pte) \
> KVM_EMULATE(set_pte_super_shift) \
> KVM_EMULATE(set_speed) \
> + /* For others */ \
> + USER_HCALL(virtio)
Ah, here it is. :)
>
> #endif
>
> diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
> new file mode 100644
> index 0000000..d94f535
> --- /dev/null
> +++ b/arch/tile/include/uapi/asm/kvm_virtio.h
> @@ -0,0 +1,60 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
> +#define _UAPI_ASM_TILE_KVM_VIRTIO_H
> +
> +#include <linux/types.h>
> +
> +#define KVM_VIRTIO_UNKNOWN 0
> +#define KVM_VIRTIO_NOTIFY 1
> +#define KVM_VIRTIO_RESET 2
> +#define KVM_VIRTIO_SET_STATUS 3
> +
> +struct kvm_device_desc {
> + /* The device type: console, network, disk etc. Type 0 terminates. */
> + __u8 type;
> + /* The number of virtqueues (first in config array) */
> + __u8 num_vq;
> + /*
> + * The number of bytes of feature bits. Multiply by 2: one for host
> + * features and one for Guest acknowledgements.
> + */
> + __u8 feature_len;
> + /* The number of bytes of the config array after virtqueues. */
> + __u8 config_len;
> + /* A status byte, written by the Guest. */
> + __u8 status;
> + __u64 config[0];
> +};
> +
> +struct kvm_vqinfo {
> + /* Pointer to the information contained in the device config. */
> + struct kvm_vqconfig *config;
> + /* The address where we mapped the virtio ring, so we can unmap it. */
> + void *pages;
> +};
> +
> +struct kvm_vqconfig {
> + /* The physical address of the virtio ring */
> + __u64 pa;
> + /* The number of entries in the virtio_ring */
> + __u64 num;
> + /* The interrupt we get when something happens. Set by the guest. */
> + __u32 irq;
> +
> +};
> +
> +
> +#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
> diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
> index b7c8b5e..b638d3e 100644
> --- a/arch/tile/kernel/Makefile
> +++ b/arch/tile/kernel/Makefile
> @@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o
> obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o
> obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o
> obj-$(CONFIG_KPROBES) += kprobes.o
> +obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o
>
> obj-y += vdso/
> diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
> index b608e00..53f2be4 100644
> --- a/arch/tile/kernel/early_printk.c
> +++ b/arch/tile/kernel/early_printk.c
> @@ -18,11 +18,26 @@
> #include <linux/string.h>
> #include <linux/irqflags.h>
> #include <linux/printk.h>
> +#ifdef CONFIG_KVM_GUEST
> +#include <linux/virtio_console.h>
> +#include <linux/kvm_para.h>
> +#include <asm/kvm_virtio.h>
> +#endif
> #include <asm/setup.h>
> #include <hv/hypervisor.h>
>
> static void early_hv_write(struct console *con, const char *s, unsigned n)
> {
> +#ifdef CONFIG_KVM_GUEST
> + char buf[512];
> +
> + if (n > sizeof(buf) - 1)
> + n = sizeof(buf) - 1;
> + memcpy(buf, s, n);
> + buf[n] = '\0';
> +
> + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
How can userspace know the difference between KVM_VIRTIO_NOTIFY with a
string buffer, and KVM_VIRTIO_NOTIFY with a config space pointer?
In fact, this looks like a completely separate hypercall, why not keep
hv_console_putc?
> index 0000000..c6b6c6a
> --- /dev/null
> +++ b/arch/tile/kernel/kvm_virtio.c
> @@ -0,0 +1,430 @@
> +/*
> + * Copyright 2013 Tilera Corporation. All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation, version 2.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
> + * NON INFRINGEMENT. See the GNU General Public License for
> + * more details.
> + */
> +
> +/* Based on the lguest & s390 implementations. */
> +/*
> + * kvm_virtio.c - virtio for kvm on s390
> + *
> + * Copyright IBM Corp. 2008
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License (version 2 only)
> + * as published by the Free Software Foundation.
> + *
> + * Author(s): Christian Borntraeger <[email protected]>
> + */
This has the same problem as the old s390 implementation (there is a new
one that emulates the usual s390 I/O instead of using
paravirtualization); it doesn't raise an interrupt on config space writes.
Apart from this it looks good, but I'm not sure why it is necessary.
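For comparison, the old s390 code delivered a "config changed" external
interrupt and then kicked the driver; something similar would be needed here,
roughly (sketch, assuming a dedicated notification from the host for config
space changes):

        static void kvm_virtio_config_changed(struct kvm_device *kdev)
        {
                struct virtio_driver *drv =
                        container_of(kdev->vdev.dev.driver,
                                     struct virtio_driver, driver);

                /* Let the driver re-read its config space. */
                if (drv && drv->config_changed)
                        drv->config_changed(&kdev->vdev);
        }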
> +#include <linux/bootmem.h>
> +#include <linux/io.h>
> +#include <linux/vmalloc.h>
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/export.h>
> +#include <linux/virtio.h>
> +#include <linux/virtio_config.h>
> +#include <linux/virtio_console.h>
> +#include <linux/virtio_ring.h>
> +#include <linux/virtio_pci.h>
> +
> +#include <linux/kvm_para.h>
> +#include <asm/kvm_virtio.h>
> +
> +static void *kvm_devices;
> +
> +/*
> + * TODO: We do not actually use PCI virtio here. We use this value
> + * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
> + * Maybe we should change them to generic definitions in both qemu & Linux.
> + * Besides, let's check whether the alignment value (4096, i.e. the default
> + * x86 page size) affects performance later.
> + */
> +#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN
> +#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev)
> +
> +/*
> + * memory layout: (Total: PAGE_SIZE)
> + * <device 0>
> + * - kvm device descriptor
> + * struct kvm_device_desc
> + * - vqueue configuration (totally desc->num_vq)
> + * struct kvm_vqconfig
> + * ......
> + * struct kvm_vqconfig
> + * - feature bits (size: desc->feature_len * 2)
> + * - config space (size: desc->config_len)
> + * <device 1>
> + * ......
> + */
> +static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
> +{
> + return (struct kvm_vqconfig *)(desc + 1);
> +}
> +
> +static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
> +{
> + return (u8 *)(kvm_vq_config(desc) + desc->num_vq);
> +}
> +
> +static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
> +{
> + return kvm_vq_features(desc) + desc->feature_len * 2;
> +}
> +
> +/*
> + * The total size of the config page used by this device (incl. desc)
> + */
> +static unsigned desc_size(const struct kvm_device_desc *desc)
> +{
> + return sizeof(*desc)
> + + desc->num_vq * sizeof(struct kvm_vqconfig)
> + + desc->feature_len * 2
> + + desc->config_len;
> +}
> +
> +/* This gets the device's feature bits. */
> +static u32 kvm_get_features(struct virtio_device *vdev)
> +{
> + unsigned int i;
> + u32 features = 0;
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> + u8 *in_features = kvm_vq_features(desc);
> +
> + for (i = 0; i < min(desc->feature_len * 8, 32); i++)
> + if (in_features[i / 8] & (1 << (i % 8)))
> + features |= (1 << i);
> + return features;
> +}
> +
> +static void kvm_finalize_features(struct virtio_device *vdev)
> +{
> + unsigned int i, bits;
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> + /* Second half of bitmap is features we accept. */
> + u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
> +
> + /* Give virtio_ring a chance to accept features. */
> + vring_transport_features(vdev);
> +
> + memset(out_features, 0, desc->feature_len);
> + bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
> + for (i = 0; i < bits; i++) {
> + if (test_bit(i, vdev->features))
> + out_features[i / 8] |= (1 << (i % 8));
> + }
> +}
> +
> +/*
> + * Reading and writing elements in config space
> + */
> +static void kvm_get(struct virtio_device *vdev, unsigned int offset,
> + void *buf, unsigned len)
> +{
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> +
> + BUG_ON(offset + len > desc->config_len);
> + memcpy(buf, kvm_vq_configspace(desc) + offset, len);
> +}
> +
> +static void kvm_set(struct virtio_device *vdev, unsigned int offset,
> + const void *buf, unsigned len)
> +{
> + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
> +
> + BUG_ON(offset + len > desc->config_len);
> + memcpy(kvm_vq_configspace(desc) + offset, buf, len);
> +}
> +
> +/*
> + * The operations to get and set the status word just access
> + * the status field of the device descriptor. set_status will also
> + * make a hypercall to the host, to tell about status changes
> + */
> +static u8 kvm_get_status(struct virtio_device *vdev)
> +{
> + return to_kvmdev(vdev)->desc->status;
> +}
> +
> +static void kvm_set_status(struct virtio_device *vdev, u8 status)
> +{
> + BUG_ON(!status);
> + to_kvmdev(vdev)->desc->status = status;
> + hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa);
> +}
> +
> +/*
> + * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
> + * descriptor address. The Host will zero the status and all the
> + * features.
> + */
> +static void kvm_reset(struct virtio_device *vdev)
> +{
> + hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa);
> +}
> +
> +/*
> + * When the virtio_ring code wants to notify the Host, it calls us here and we
> + * make a hypercall. We hand the address of the virtqueue so the Host
> + * knows which virtqueue we're talking about.
> + */
> +static void kvm_notify(struct virtqueue *vq)
> +{
> + struct kvm_vqinfo *vqi = vq->priv;
> +
> + hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa);
> +}
> +
> +/*
> + * Must set some caching mode to keep set_pte() happy.
> + * It doesn't matter what we choose, because the PFN
> + * is illegal, so we're going to take a page fault anyway.
> + */
> +static inline pgprot_t io_prot(void)
> +{
> + return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
> +}
> +
> +/*
> + * This routine finds the first virtqueue described in the configuration of
> + * this device and sets it up.
> + */
> +static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
> + unsigned index,
> + void (*callback)(struct virtqueue *vq),
> + const char *name)
> +{
> + struct kvm_device *kdev = to_kvmdev(vdev);
> + struct kvm_vqinfo *vqi;
> + struct kvm_vqconfig *config;
> + struct virtqueue *vq;
> + long irq;
> + int err = -EINVAL;
> +
> + if (index >= kdev->desc->num_vq)
> + return ERR_PTR(-ENOENT);
> +
> + vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
> + if (!vqi)
> + return ERR_PTR(-ENOMEM);
> +
> + config = kvm_vq_config(kdev->desc)+index;
> +
> + vqi->config = config;
> + vqi->pages = generic_remap_prot(config->pa,
> + vring_size(config->num,
> + KVM_TILE_VIRTIO_RING_ALIGN),
> + 0, io_prot());
> + if (!vqi->pages) {
> + err = -ENOMEM;
> + goto out;
> + }
> +
> + vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
> + vdev, 0, vqi->pages,
> + kvm_notify, callback, name);
> + if (!vq) {
> + err = -ENOMEM;
> + goto unmap;
> + }
> +
> + /*
> + * Trigger the IPI interrupt in SW way.
> + * TODO: We do not need to create one irq for each vq. A bit wasteful.
> + */
> + irq = create_irq();
> + if (irq < 0) {
> + err = -ENXIO;
> + goto del_virtqueue;
> + }
> +
> + tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
> +
> + if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
> + err = -ENXIO;
> + destroy_irq(irq);
> + goto del_virtqueue;
> + }
> +
> + config->irq = irq;
> +
> + vq->priv = vqi;
> + return vq;
> +
> +del_virtqueue:
> + vring_del_virtqueue(vq);
> +unmap:
> + vunmap(vqi->pages);
> +out:
> + return ERR_PTR(err);
> +}
> +
> +static void kvm_del_vq(struct virtqueue *vq)
> +{
> + struct kvm_vqinfo *vqi = vq->priv;
> +
> + vring_del_virtqueue(vq);
> + vunmap(vqi->pages);
> + kfree(vqi);
> +}
> +
> +static void kvm_del_vqs(struct virtio_device *vdev)
> +{
> + struct virtqueue *vq, *n;
> +
> + list_for_each_entry_safe(vq, n, &vdev->vqs, list)
> + kvm_del_vq(vq);
> +}
> +
> +static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
> + struct virtqueue *vqs[],
> + vq_callback_t *callbacks[],
> + const char *names[])
> +{
> + struct kvm_device *kdev = to_kvmdev(vdev);
> + int i;
> +
> + /* We must have this many virtqueues. */
> + if (nvqs > kdev->desc->num_vq)
> + return -ENOENT;
> +
> + for (i = 0; i < nvqs; ++i) {
> + vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
> + if (IS_ERR(vqs[i]))
> + goto error;
> + }
> + return 0;
> +
> +error:
> + kvm_del_vqs(vdev);
> + return PTR_ERR(vqs[i]);
> +}
> +
> +/*
> + * The config ops structure as defined by virtio config
> + */
> +static struct virtio_config_ops kvm_vq_config_ops = {
> + .get_features = kvm_get_features,
> + .finalize_features = kvm_finalize_features,
> + .get = kvm_get,
> + .set = kvm_set,
> + .get_status = kvm_get_status,
> + .set_status = kvm_set_status,
> + .reset = kvm_reset,
> + .find_vqs = kvm_find_vqs,
> + .del_vqs = kvm_del_vqs,
> +};
> +
> +/*
> + * The root device for the kvm virtio devices.
> + * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
> + */
> +static struct device *kvm_root;
> +
> +/*
> + * adds a new device and register it with virtio
> + * appropriate drivers are loaded by the device model
> + */
> +static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
> +{
> + struct kvm_device *kdev;
> +
> + kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
> + if (!kdev) {
> + pr_emerg("Cannot allocate kvm dev %u type %u\n",
> + offset, d->type);
> + return;
> + }
> +
> + kdev->vdev.dev.parent = kvm_root;
> + kdev->vdev.id.device = d->type;
> + kdev->vdev.config = &kvm_vq_config_ops;
> + kdev->desc = d;
> + kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
> +
> + if (register_virtio_device(&kdev->vdev) != 0) {
> + pr_err("Failed to register kvm device %u type %u\n",
> + offset, d->type);
> + kfree(kdev);
> + }
> +}
> +
> +/*
> + * scan_devices() simply iterates through the device page.
> + * The type 0 is reserved to mean "end of devices".
> + */
> +static void scan_devices(void)
> +{
> + unsigned int i;
> + struct kvm_device_desc *d;
> +
> + for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
> + d = kvm_devices + i;
> +
> + if (d->type == 0)
> + break;
> +
> + add_kvm_device(d, i);
> + }
> +}
> +
> +/*
> + * Init function for virtio.
> + * devices are in a single page above the top of "normal" mem.
> + */
> +static int __init kvm_devices_init(void)
> +{
> + int rc = -ENOMEM;
> +
> + kvm_root = root_device_register("kvm_tile");
> + if (IS_ERR(kvm_root)) {
> + rc = PTR_ERR(kvm_root);
> + pr_err("Could not register kvm_tile root device");
> + return rc;
> + }
> +
> + kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
> + 0, io_prot());
> + if (!kvm_devices) {
> + kvm_devices = NULL;
> + root_device_unregister(kvm_root);
> + return rc;
> + }
> +
> + scan_devices();
> + return 0;
> +}
> +
> +/* code for early console output with virtio_console */
> +static __init int early_put_chars(u32 vtermno, const char *buf, int len)
> +{
> + char scratch[512];
> +
> + if (len > sizeof(scratch) - 1)
> + len = sizeof(scratch) - 1;
> + scratch[len] = '\0';
> + memcpy(scratch, buf, len);
> + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
> +
> + return len;
> +}
> +
> +static int __init tile_virtio_console_init(void)
> +{
> + return virtio_cons_early_init(early_put_chars);
> +}
> +console_initcall(tile_virtio_console_init);
> +
> +/*
> + * We do this after core stuff, but before the drivers.
> + */
> +postcore_initcall(kvm_devices_init);
> diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
> index 1c09a4f..02bc446 100644
> --- a/arch/tile/kernel/relocate_kernel_64.S
> +++ b/arch/tile/kernel/relocate_kernel_64.S
> @@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
> addi sp, sp, -8
> /* we now have a stack (whether we need one or not) */
>
> +#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> moveli r40, hw2_last(hv_console_putc)
> shl16insli r40, r40, hw1(hv_console_putc)
> shl16insli r40, r40, hw0(hv_console_putc)
>
> -#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> moveli r0, 'r'
> jalr r40
>
> @@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
>
> /* we should not get here */
>
> +#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> moveli r0, '?'
> jalr r40
> moveli r0, '\n'
> jalr r40
> +#endif
>
> j .Lhalt
>
> @@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
> j .Lloop
>
>
> -.Lerr: moveli r0, 'e'
> +.Lerr:
> +#ifdef RELOCATE_NEW_KERNEL_VERBOSE
> + moveli r0, 'e'
> jalr r40
> moveli r0, 'r'
> jalr r40
> @@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
> jalr r40
> moveli r0, '\n'
> jalr r40
> +#endif
> .Lhalt:
> moveli r41, hw2_last(hv_halt)
> shl16insli r41, r41, hw1(hv_halt)
>
First, sorry for the slow response to your thoughtful review comments.
Things have been busy for me with a new major release of the Tilera
software environment and the usual customer bug firefighting.
On 9/10/2013 6:53 AM, Gleb Natapov wrote:
> On Wed, Aug 28, 2013 at 03:45:50PM -0400, Chris Metcalf wrote:
>> This commit enables the host side of KVM support for tilegx.
>>
>> [...]
>>
>> The commit adds a KVM_EXIT_xxx code, KVM_EXIT_AGAIN, which is used to
>> exit out to the host kernel, but not all the way out to qemu. This is
>> helpful if we are trying to handle resched, sigpending, etc., but don't
>> need to end up back in userspace first.
>>
> I think there is some confusion here about how things are supposed to work.
> KVM_EXIT_xxx defines are only meant to be meaningful to userspace, they
> are never used internally by KVM. So KVM_EXIT_AGAIN, as defined above,
> does not make sense.
Fair enough; we can certainly arrange for the same semantics without
exposing a magic value in the user API.
> Looking at the code I see that you've reused those
> defines for vmexit codes too and this is incorrect. On platforms with HW
> virt support vmexit codes are defined by the CPU architecture (and there are
> many more vmexit codes than userspace exit codes); PV defines its own
> interface.
I'm not clear on what you're suggesting with this comment. We do have a
kvm_trigger_vmexit() function that takes a KVM_EXIT_xxx value and stuffs
it into kvm_run.exit_reason. But since we are PV and don't have separate
hardware-defined values, this seems like the right approach. We effectively
borrow the KVM_EXIT_xxx codes for our vmexit codes. Why not?
>> +#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
>> +
>> +#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
>> +
>> +#define gpmd_offset(kvm, pud, address) \
>> + ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
>> +
>> +#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
>> +
>> +#define gpte_offset_kernel(kvm, pmd, address) \
>> + ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
>> +
> I can't find where those four defines are used, but in case they are,
> the comment about gfn_to_pfn() below applies here too.
Good catch; they were for some code that we changed to something else,
so we don't need them any more.
>> +#ifndef __KERNEL__
>> +/* For hv_*() */
>> +#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
>> +#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
>> +#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
>> +#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
>> +/* For others */
>> +#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
> This does not belong to a kernel header. QEMU is not the only user of KVM
> kernel APIs. Please drop that and change all the references in comment
> from "qemu" to "userspace". If you add code that workarounds QEMU bugs it
> is appropriate to mention QEMU by name, otherwise interface to userspace
> should not be QEMU specific.
Thanks, makes sense.
>> +void kvm_arch_commit_memory_region(struct kvm *kvm,
>> + struct kvm_userspace_memory_region *mem,
>> + const struct kvm_memory_slot *old,
>> + enum kvm_mr_change change)
>> +{
>> + unsigned long gpa, address, pfn, i;
>> + struct page *page[1];
>> + pte_t *ptep, *vptep;
>> +
>> + gpa = mem->guest_phys_addr;
>> + address = mem->userspace_addr;
>> + for (i = 0; i < mem->memory_size;
>> + i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
>> + vptep = get_vpgd_pte(kvm, gpa);
>> + BUG_ON(vptep == NULL);
>> + get_user_pages_fast(address, 1, 1, page);
> get_user_pages_fast() can fail and you do not handle the error. Do I
> understand correctly that all guest memory is pinned here? Where is it
> unpinned? I do not see put_page anywhere.
Yes, we're pinning all of user memory here; this deserves more of
a comment than is in the code (i.e. none), but was done to simplify
our initial implementation of fast page faulting from the Tilera
hypervisor. I'll review this all more closely for the next version
of the patch set.
> +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> + struct kvm_sregs *sregs)
> +{
> + vcpu->arch.sregs = *sregs;
> + return 0;
> +}
> Most arches prefer to use the KVM_GET_ONE_REG/KVM_SET_ONE_REG interface
> to get/set all vcpu registers since the interface is more flexible, but
> the way you are doing it is OK too.
We can certainly provide both interfaces. For the time being,
the way we're using it from qemu works best with SET_SREGS since we
just set a bunch of SPRs at once. Or maybe we just don't care in
the kernel until we have a client that actually wants the ONE_REG APIs?
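(For reference, the ONE_REG flow from userspace is just per-register ioctls
like the one below; the register id encoding would be something tile-specific
that we would have to define, so the name here is hypothetical:)

        struct kvm_one_reg reg;
        __u64 val = 0xfd000000;

        reg.id = KVM_REG_TILE_SPR_INTERRUPT_VECTOR_BASE_1;  /* hypothetical id */
        reg.addr = (__u64)(unsigned long)&val;
        ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);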
> +static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
> +{
> + gpa_t gpa = vcpu->arch.regs.regs[0];
> + HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1];
> + uint64_t val = vcpu->arch.regs.regs[2];
> + gfn_t gfn;
> + pfn_t pfn;
> + hpa_t hpa;
> +
> + gfn = gpa_to_gfn(gpa);
> + pfn = gfn_to_pfn(vcpu->kvm, gfn);
> Here and in the function above you use gfn_to_pfn(), which accesses
> memslots. Memslots are srcu protected, so you have to take the kvm->srcu
> read lock around those calls. Here is a link to the patch that
> documents that: http://www.mail-archive.com/[email protected]/msg95566.html
Thanks, yeah, I missed that. I'll add the srcu stuff.
> Another thing about those functions is that they are very similar to
> kvm_read_guest/kvm_write_guest; the only difference I see is that they
> use hv_physaddr_write64/hv_physaddr_read64 instead of
> __copy_to_user/__copy_from_user. What are those special functions and
> why can't we use __copy_to_user/__copy_from_user here?
The hv_physaddr_xxx functions actually do direct physical-address reads
and writes by asking the Tilera hypervisor (basically like a BIOS) to do them.
However, you may be right that if we instead just convert to an hva
and use __copy_to_user we'd get the same results. I'll see if we can
convert these to just use kvm_read_guest etc.
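Roughly, the write side would turn into something like this (untested sketch,
ignoring the HV_PTE access argument for the moment):

        static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
        {
                gpa_t gpa = vcpu->arch.regs.regs[0];
                uint64_t val = vcpu->arch.regs.regs[2];
                int idx, ret;

                idx = srcu_read_lock(&vcpu->kvm->srcu);
                ret = kvm_write_guest(vcpu->kvm, gpa, &val, sizeof(val));
                srcu_read_unlock(&vcpu->kvm->srcu, idx);

                /* What to hand back to the guest on failure is still TBD. */
                return ret;
        }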
>> +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
>> +{
>> + return !test_and_set_bit(KVM_REQ_KICK, &vcpu->requests);
>> +}
> Use of KVM_REQ_KICK was deprecated some time ago by commit
> d94e1dc9af60e3431a586c3edfbe42d8a0d3932b. You probably copied this from
> ia64 which is a bad example since the kvm support there is broken and
> will be removed soon. Set vcpu->mode to IN_GUEST_MODE/OUTSIDE_GUEST_MODE
> instead, and use helper functions such as kvm_vcpu_exiting_guest_mode() here.
Thanks.
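So, following the x86 pattern, the kick test would become roughly (sketch):

        int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
        {
                return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
        }

with vcpu->mode set to IN_GUEST_MODE just before kvm_vmresume() and back to
OUTSIDE_GUEST_MODE after we return, plus the appropriate memory barriers.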
>> +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>> +{
>> + int i;
>> + unsigned long resv_gfn_start;
>> + struct kvm_memory_slot *s;
>> + struct kvm *kvm = vcpu->kvm;
>> +
>> + if (!kvm->arch.resv_gpa_start) {
>> + resv_gfn_start = 0;
>> +
>> + for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
>> + s = &kvm->memslots->memslots[i];
> Slots can be added or removed after vcpu is created. And of course
> kvm->srcu comment applies. Memslot can be KVM_MEMSLOT_INVALID if it is
> in the process of been deleted, so you have to check this too, but
> probably it is better for userspace to set resv_gpa_start instead of
> kernel trying to figure it out here.
We are really just trying to get an illegal PA here. We could probably
just use PA values starting at the highest legal value and work down
from there instead.
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
>> if (vcpu->kvm->mm != current->mm)
>> return -EIO;
>>
>> -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
>> +#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
>> + defined(CONFIG_TILEGX)
> No need to do that. Use the KVM_IRQ_LINE ioctl, which is asynchronous
> with respect to the vcpu. S390 and PPC are here for historical reasons,
> and MIPS was a review mistake. BTW, is the interrupt controller emulated
> in userspace or in the kernel?
We're using the s390 virtio model with a qemu hw/tile/tile-virtio-bus.c
component in userspace at the moment. We'll look at KVM_IRQ_LINE.
--
Chris Metcalf, Tilera Corp.
http://www.tilera.com
As I said to Gleb in the previous email - sorry for the delay in
replying to your thoughtful comments!
On 9/10/2013 8:47 AM, Paolo Bonzini wrote:
> On 28/08/2013 22:58, Chris Metcalf wrote:
>> This change enables support for a virtio-based console,
>> network support, and block driver support.
>>
>> We remove some debug code in relocate_kernel_64.S that made raw
>> calls to the hv_console_putc Tilera hypervisor API, since everything
>> now should funnel through the early_hv_write() API.
>>
>> Signed-off-by: Chris Metcalf <[email protected]>
> Why couldn't this use the "regular" virtio-mmio interface?
We probably should! We were working with a CentOS 6 style distribution,
which has an older version of qemu; we upgraded slightly to 0.13, thinking
that minimizing version skew would help distribution compatibility.
That version doesn't have the virtio-mmio stuff. But you're right, we probably
should return the virtio-mmio stuff to the community instead, even if we're
going to keep something like this patch in our local copy of KVM.
>> static void early_hv_write(struct console *con, const char *s, unsigned n)
>> {
>> +#ifdef CONFIG_KVM_GUEST
>> + char buf[512];
>> +
>> + if (n > sizeof(buf) - 1)
>> + n = sizeof(buf) - 1;
>> + memcpy(buf, s, n);
>> + buf[n] = '\0';
>> +
>> + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
> How can userspace know the difference between KVM_VIRTIO_NOTIFY with a
> string buffer, and KVM_VIRTIO_NOTIFY with a config space pointer?
>
> In fact, this looks like a completely separate hypercall, why not keep
> hv_console_putc?
Good point. Right now in qemu the virtio hypercall with a KVM_VIRTIO_NOTIFY
reason either does a virtio_queue_notify(), if the address is not in RAM,
or a print, if it is. It does seem we could just have separate calls;
the reason we grouped it in with the KVM_VIRTIO stuff instead of implementing
it with the hv_console_write() API is just that it uses the virtio_console
API to do the work. But we probably could do it the other way too, and
that might arguably make more sense. We'll think about it.
Thanks!
--
Chris Metcalf, Tilera Corp.
http://www.tilera.com
On 30/09/2013 22:11, Chris Metcalf wrote:
> As I said to Gleb in the previous email - sorry for the delay in
> replying to your thoughtful comments!
>
>
> On 9/10/2013 8:47 AM, Paolo Bonzini wrote:
>> On 28/08/2013 22:58, Chris Metcalf wrote:
>>> This change enables support for a virtio-based console,
>>> network support, and block driver support.
>>>
>>> We remove some debug code in relocate_kernel_64.S that made raw
>>> calls to the hv_console_putc Tilera hypervisor API, since everything
>>> now should funnel through the early_hv_write() API.
>>>
>>> Signed-off-by: Chris Metcalf <[email protected]>
>> Why couldn't this use the "regular" virtio-mmio interface?
>
> We probably should! We were working with a CentOS 6 style distribution,
> which has an older version of qemu; we upgraded slightly to 0.13, thinking
> that minimizing version skew would help distribution compatibility.
> That version doesn't have the virtio-mmio stuff. But you're right, we probably
> should return the virtio-mmio stuff to the community instead, even if we're
> going to keep something like this patch in our local copy of KVM.
Thanks, that looks like the right thing to do.
The difference between s390-virtio and virtio-mmio is that s390 has a
single device that supports multiple "back-ends", with hotplug and
hot-unplug support.
virtio-mmio supports a fixed number of devices, defined in the board by
creating a number of instances of the "naked" virtio-mmio front-ends.
On the other hand, s390-virtio was never fully specified and is not part
of the virtio standardization effort (because s390 has now switched to a
different mechanism).
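For example, with virtio-mmio each device instance is simply declared up
front, either by the board model or on the kernel command line, e.g.
(the size, base address and irq below are made up):

        virtio_mmio.device=0x200@0x10001000:42

so the guest side reduces to enabling CONFIG_VIRTIO_MMIO rather than carrying
a tile-specific transport.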
>>> static void early_hv_write(struct console *con, const char *s, unsigned n)
>>> {
>>> +#ifdef CONFIG_KVM_GUEST
>>> + char buf[512];
>>> +
>>> + if (n > sizeof(buf) - 1)
>>> + n = sizeof(buf) - 1;
>>> + memcpy(buf, s, n);
>>> + buf[n] = '\0';
>>> +
>>> + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
>> How can userspace know the difference between KVM_VIRTIO_NOTIFY with a
>> string buffer, and KVM_VIRTIO_NOTIFY with a config space pointer?
>>
>> In fact, this looks like a completely separate hypercall, why not keep
>> hv_console_putc?
>
> Good point. Right now in qemu the virtio hypercall with a KVM_VIRTIO_NOTIFY
> reason either does a virtio_queue_notify(), if the address is not in RAM,
> or a print, if it is. It does seem we could just have separate calls;
> the reason we grouped it in with the KVM_VIRTIO stuff instead of implementing
> it with the hv_console_write() API is just that it uses the virtio_console
> API to do the work. But we probably could do it the other way too, and
> that might arguably make more sense. We'll think about it.
Yeah, using virtio-console is just an implementation-dependent issue. I
think it's better to keep the previous guest code for early printk.
Paolo
On Mon, Sep 30, 2013 at 04:11:18PM -0400, Chris Metcalf wrote:
> On 9/10/2013 6:53 AM, Gleb Natapov wrote:
> > On Wed, Aug 28, 2013 at 03:45:50PM -0400, Chris Metcalf wrote:
> >> This commit enables the host side of KVM support for tilegx.
> >>
> >> [...]
> >>
> >> The commit adds a KVM_EXIT_xxx code, KVM_EXIT_AGAIN, which is used to
> >> exit out to the host kernel, but not all the way out to qemu. This is
> >> helpful if we are trying to handle resched, sigpending, etc., but don't
> >> need to end up back in userspace first.
> >>
> > I think there is some confusion here about how things are supposed to work.
> > KVM_EXIT_xxx defines are only meant to be meaningful to userspace, they
> > are never used internally by KVM. So KVM_EXIT_AGAIN, as defined above,
> > does not make sense.
>
> Fair enough; we can certainly arrange for the same semantics without
> exposing a magic value in the user API.
>
>
> > Looking at the code I see that you've reused those
> > defines for vmexit codes too and this is incorrect. On platforms with HW
> > virt support vmexit codes are defined by the CPU architecture (and there are
> > many more vmexit codes than userspace exit codes); PV defines its own
> > interface.
>
> I'm not clear on what you're suggesting with this comment. We do have a
> kvm_trigger_vmexit() function that takes a KVM_EXIT_xxx value and stuffs
> it into kvm_run.exit_reason. But since we are PV and don't have separate
> hardware-defined values, this seems like the right approach. We effectively
> borrow the KVM_EXIT_xxx codes for our vmexit codes. Why not?
>
Experience shows that there are many more reasons for vmexits than
reasons to exit to userspace. Even with your initial implementation here
you found that you need KVM_EXIT_AGAIN, but going forward you may find
that you need more and more vmexit reasons, and we will not define new
KVM_EXIT_xxx codes for that. The main loop logic usually looks like this:
while (r != exit_user) {
        enter_guest_mode();
        vmexit = get_vmexit_reason();
        switch (vmexit) {
        case A:
                break;
        case B:
                break;
        case C:
                if (need_userspace_attention()) {
                        kvm_run.exit_reason = KVM_EXIT_xxx;
                        r = exit_user;
                }
                break;
        }
}
> >> +void kvm_arch_commit_memory_region(struct kvm *kvm,
> >> + struct kvm_userspace_memory_region *mem,
> >> + const struct kvm_memory_slot *old,
> >> + enum kvm_mr_change change)
> >> +{
> >> + unsigned long gpa, address, pfn, i;
> >> + struct page *page[1];
> >> + pte_t *ptep, *vptep;
> >> +
> >> + gpa = mem->guest_phys_addr;
> >> + address = mem->userspace_addr;
> >> + for (i = 0; i < mem->memory_size;
> >> + i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
> >> + vptep = get_vpgd_pte(kvm, gpa);
> >> + BUG_ON(vptep == NULL);
> >> + get_user_pages_fast(address, 1, 1, page);
> > get_user_pages_fast() can fail and you do not handle the error. Do I
> > understand correctly that all guest memory is pinned here? Where is it
> > unpinned? I do not see put_page anywhere.
>
> Yes, we're pinning all of user memory here; this deserves more of
> a comment than is in the code (i.e. none), but was done to simplify
> our initial implementation of fast page faulting from the Tilera
> hypervisor. I'll review this all more closely for the next version
> of the patch set.
OK. Pinning pages is OK; what puzzled me is that I haven't found where
they are unpinned.
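If you keep the pinning, the memslot teardown path needs a matching loop,
roughly like this (sketch; it assumes the pinned page pointers are remembered
per slot somewhere, which the current patch does not do):

        static void kvm_tile_unpin_pages(struct page **pages, unsigned long npages)
        {
                unsigned long i;

                for (i = 0; i < npages; i++)
                        if (pages[i])
                                put_page(pages[i]);
        }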
>
> > +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> > + struct kvm_sregs *sregs)
> > +{
> > + vcpu->arch.sregs = *sregs;
> > + return 0;
> > +}
> > Most arches prefer to use the KVM_GET_ONE_REG/KVM_SET_ONE_REG interface
> > to get/set all vcpu registers since the interface is more flexible, but
> > the way you are doing it is OK too.
>
> We can certainly provide both interfaces. For the time being,
> the way we're using it from qemu works best with SET_SREGS since we
> just set a bunch of SPRs at once. Or maybe we just don't care in
> the kernel until we have a client that actually wants the ONE_REG APIs?
>
Fine with me, just pointed out the alternative.
> >> +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> >> +{
> >> + int i;
> >> + unsigned long resv_gfn_start;
> >> + struct kvm_memory_slot *s;
> >> + struct kvm *kvm = vcpu->kvm;
> >> +
> >> + if (!kvm->arch.resv_gpa_start) {
> >> + resv_gfn_start = 0;
> >> +
> >> + for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
> >> + s = &kvm->memslots->memslots[i];
> > Slots can be added or removed after the vcpu is created. And of course
> > the kvm->srcu comment applies here too. A memslot can be KVM_MEMSLOT_INVALID
> > if it is in the process of being deleted, so you have to check for that as
> > well, but probably it is better for userspace to set resv_gpa_start instead
> > of the kernel trying to figure it out here.
>
> We are really just trying to get an illegal PA here. We could probably
> just use PA values starting at the highest legal value and work down
> from there instead.
>
OK. But making userspace configure it may be even easier. After all,
userspace is the one that defines the memory layout. X86 has similar cases
where userspace configures a PA address for KVM to use.
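Purely as illustration, it could be as small as a vm ioctl along the lines
of x86's KVM_SET_TSS_ADDR, handled in kvm_arch_vm_ioctl() (the ioctl name
below is made up):

        case KVM_TILE_SET_RESV_GPA: {
                u64 gpa;

                r = -EFAULT;
                if (copy_from_user(&gpa, argp, sizeof(gpa)))
                        break;
                kvm->arch.resv_gpa_start = gpa;
                r = 0;
                break;
        }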
--
Gleb.