Message-ID: <201308122037.r7CKbo99011833@farm-0021.internal.tilera.com>
In-Reply-To: <52091682.8020004@siemens.com>
References: <52091682.8020004@siemens.com>
From: Chris Metcalf
Date: Mon, 12 Aug 2013 16:24:11 -0400
Subject: [PATCH v2] tile: support KVM for tilegx
To: Gleb Natapov, Paolo Bonzini, Jan Kiszka
X-Mailing-List: linux-kernel@vger.kernel.org

This change provides the initial framework support for KVM on tilegx.
Basic virtual disk and networking are supported.

Signed-off-by: Chris Metcalf
---
v2: remove KVM_TILE_RESET_SPR based on feedback from Jan Kiszka.
    qemu ends up modified to just use KVM_SET_SREGS instead.

 arch/tile/Kconfig                        |   19 +-
 arch/tile/Makefile                       |    1 +
 arch/tile/include/asm/io.h               |    2 +
 arch/tile/include/asm/kvm.h              |   29 +
 arch/tile/include/asm/kvm_host.h         |  101 ++
 arch/tile/include/asm/kvm_para.h         |   20 +
 arch/tile/include/asm/kvm_virtio.h       |   26 +
 arch/tile/include/asm/module.h           |    9 +-
 arch/tile/include/asm/page.h             |   56 +-
 arch/tile/include/asm/pgtable_32.h       |    2 +-
 arch/tile/include/asm/pgtable_64.h       |    3 +-
 arch/tile/include/asm/processor.h        |    6 +-
 arch/tile/include/asm/ptrace.h           |    2 +-
 arch/tile/include/asm/switch_to.h        |   25 +-
 arch/tile/include/asm/thread_info.h      |   17 +-
 arch/tile/include/asm/timex.h            |    8 +
 arch/tile/include/hv/hypervisor.h        |  183 +++-
 arch/tile/include/uapi/arch/sim.h        |   19 +
 arch/tile/include/uapi/arch/sim_def.h    |    8 +
 arch/tile/include/uapi/arch/spr_def_32.h |   15 +
 arch/tile/include/uapi/arch/spr_def_64.h |   25 +
 arch/tile/include/uapi/asm/Kbuild        |    2 +
 arch/tile/include/uapi/asm/kvm.h         |  267 +++++
 arch/tile/include/uapi/asm/kvm_virtio.h  |   60 ++
 arch/tile/kernel/Makefile                |    1 +
 arch/tile/kernel/asm-offsets.c           |    7 +
 arch/tile/kernel/early_printk.c          |   16 +
 arch/tile/kernel/head_32.S               |    4 +-
 arch/tile/kernel/head_64.S               |    6 +-
 arch/tile/kernel/hvglue.S                |    8 +-
 arch/tile/kernel/hvglue_trace.c          |   14 +
 arch/tile/kernel/intvec_32.S             |   18 +-
 arch/tile/kernel/intvec_64.S             |  226 +++-
 arch/tile/kernel/kvm_virtio.c            |  430 ++++++++
 arch/tile/kernel/process.c               |   40 +-
 arch/tile/kernel/relocate_kernel_64.S    |    9 +-
 arch/tile/kernel/setup.c                 |   21 +-
 arch/tile/kernel/smp.c                   |   28 +-
 arch/tile/kernel/stack.c                 |    2 +-
 arch/tile/kernel/sysfs.c                 |    4 +
 arch/tile/kernel/time.c                  |   14 +-
 arch/tile/kernel/traps.c                 |    2 +-
 arch/tile/kernel/vmlinux.lds.S           |   10 +-
 arch/tile/kvm/Kconfig                    |    3 -
 arch/tile/kvm/Makefile                   |   12 +
 arch/tile/kvm/entry.S                    |   91 ++
 arch/tile/kvm/kvm-tile.c                 | 1581 ++++++++++++++++++++++++++++++
 arch/tile/lib/exports.c                  |   20 +-
 arch/tile/mm/elf.c                       |    2 +
 arch/tile/mm/fault.c                     |    4 +-
 arch/tile/mm/init.c                      |    8 +-
 arch/tile/mm/pgtable.c                   |   35 +-
 include/uapi/linux/kvm.h                 |    1 +
 virt/kvm/kvm_main.c                      |    7 +-
 54 files changed, 3331 insertions(+), 198 deletions(-)
 create mode 100644 arch/tile/include/asm/kvm.h
 create mode 100644 arch/tile/include/asm/kvm_host.h
 create mode 100644 arch/tile/include/asm/kvm_para.h
 create mode 100644 arch/tile/include/asm/kvm_virtio.h
 create mode 100644 arch/tile/include/uapi/asm/kvm.h
 create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
 create mode 100644 arch/tile/kernel/kvm_virtio.c
 create mode 100644
arch/tile/kvm/Makefile create mode 100644 arch/tile/kvm/entry.S create mode 100644 arch/tile/kvm/kvm-tile.c diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index ecff467..bbb6d51 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -5,7 +5,6 @@ config TILE def_bool y select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG - select HAVE_KVM if !TILEGX select GENERIC_FIND_FIRST_BIT select SYSCTL_EXCEPTION_TRACE select USE_GENERIC_SMP_HELPERS @@ -113,6 +112,7 @@ config SMP def_bool y config HVC_TILE + depends on !KVM_GUEST depends on TTY select HVC_DRIVER select HVC_IRQ if TILEGX @@ -127,6 +127,7 @@ config TILEGX select HAVE_FTRACE_MCOUNT_RECORD select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_KVM if !KVM_GUEST config TILEPRO def_bool !TILEGX @@ -366,11 +367,23 @@ config HARDWALL bool "Hardwall support to allow access to user dynamic network" default y +config KVM_GUEST + bool "Build kernel as guest for KVM" + default n + depends on TILEGX + select VIRTIO + select VIRTIO_RING + select VIRTIO_CONSOLE + ---help--- + This will build a kernel that runs at a lower protection level + than the default kernel and is suitable to run under KVM. + +# TILEPro kernels run at PL1; TILE-Gx runs at PL2 unless it's a KVM guest. config KERNEL_PL int "Processor protection level for kernel" range 1 2 - default 2 if TILEGX - default 1 if !TILEGX + default 2 if TILEGX && !KVM_GUEST + default 1 if !TILEGX || KVM_GUEST ---help--- Since MDE 4.2, the Tilera hypervisor runs the kernel at PL2 by default. If running under an older hypervisor, diff --git a/arch/tile/Makefile b/arch/tile/Makefile index 3d15364..8e7f852 100644 --- a/arch/tile/Makefile +++ b/arch/tile/Makefile @@ -62,6 +62,7 @@ libs-y += $(LIBGCC_PATH) # See arch/tile/Kbuild for content of core part of the kernel core-y += arch/tile/ +core-$(CONFIG_KVM) += arch/tile/kvm/ core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/ diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h index 9fe4349..023659b 100644 --- a/arch/tile/include/asm/io.h +++ b/arch/tile/include/asm/io.h @@ -43,6 +43,8 @@ * long before casting it to a pointer to avoid compiler warnings. */ #if CHIP_HAS_MMIO() +extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long flags, pgprot_t prot); extern void __iomem *ioremap(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, pgprot_t pgprot); diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h new file mode 100644 index 0000000..2ea6c41 --- /dev/null +++ b/arch/tile/include/asm/kvm.h @@ -0,0 +1,29 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. 
+ */ +#ifndef _ASM_TILE_KVM_H +#define _ASM_TILE_KVM_H + +#include +#include + +#ifndef __ASSEMBLER__ +/* For hv_*() */ +#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name, +#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user, +#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal, +#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name, +/* For others */ +#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user, +#endif +#endif /* _ASM_TILE_KVM_H */ diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h new file mode 100644 index 0000000..58b6bf3 --- /dev/null +++ b/arch/tile/include/asm/kvm_host.h @@ -0,0 +1,101 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#ifndef _ASM_TILE_KVM_HOST_H +#define _ASM_TILE_KVM_HOST_H + +#define KVM_MAX_VCPUS 64 +#define KVM_USER_MEM_SLOTS 32 +#define KVM_PRIVATE_MEM_SLOTS 4 + +/* For now, claim we have no huge pages. */ +#define KVM_HPAGE_GFN_SHIFT(x) 0 +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) 1 + +/* Max number of message tags for hv_send/receive_message() */ +#define MAX_MSG_TAG (sizeof(unsigned long) * 8) + +/* Bits in pending_downcalls */ +#define DOWNCALL_MESSAGE_RCV 0x01 /**< Message receive */ + +#ifndef __ASSEMBLY__ + +#include +#include + +struct kvm_vcpu_stat { + /* None yet. */ +}; + +struct kvm_vcpu_arch { + struct pt_regs regs; + struct kvm_sregs sregs; + unsigned long host_sp; /* Host "real" sp during vmresume. */ + HV_Context guest_context; + unsigned long pending_msgs; /* Pending guest messages */ + unsigned long ipi_events; /* Pending guest ipi events. */ + unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */ + pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */ + unsigned long fault_addr; /* addr for VPGTABLE_MISS faults */ + int suspended; /* true for cores not yet started by host */ + unsigned long timer_control; /* AUX_TILE_TIMER_CONTROL value */ + unsigned long vmexit_cycles; /* cycle count of last vmexit */ +}; + +struct kvm_vm_stat { + /* + * FIXME - does this make sense for us? It's used in common KVM + * code. + */ + u32 remote_tlb_flush; +}; + +struct kvm_arch_memory_slot { +}; + +struct kvm_arch { + pgd_t *vpgd; + unsigned long resv_gpa_start; /* For special purpose. 
*/ + struct completion smp_start; +}; + +struct kvm_vcpu; + +extern void kvm_vmresume(struct pt_regs *guest, + unsigned long *host_sp_ptr); +extern void kvm_vmexit(unsigned long host_sp); +extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason); +extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num); +extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num, + unsigned long, unsigned long); +extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num); + +extern void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address) + +#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud)) + +#define gpmd_offset(kvm, pud, address) \ + ((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address)) + +#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd)) + +#define gpte_offset_kernel(kvm, pmd, address) \ + ((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address)) + +#endif /* __ASSEMBLY__*/ + +#endif /* _ASM_TILE_KVM_HOST_H */ diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h new file mode 100644 index 0000000..c8c31d5 --- /dev/null +++ b/arch/tile/include/asm/kvm_para.h @@ -0,0 +1,20 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ +#ifndef _ASM_TILE_KVM_PARA_H +#define _ASM_TILE_KVM_PARA_H + +#include + +int hcall_virtio(unsigned long instrument, unsigned long mem); +#endif /* _ASM_TILE_KVM_PARA_H */ diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h new file mode 100644 index 0000000..8faa959 --- /dev/null +++ b/arch/tile/include/asm/kvm_virtio.h @@ -0,0 +1,26 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ +#ifndef _ASM_TILE_KVM_VIRTIO_H +#define _ASM_TILE_KVM_VIRTIO_H + +#include + + +struct kvm_device { + struct virtio_device vdev; + struct kvm_device_desc *desc; + unsigned long desc_pa; +}; + +#endif /* _ASM_TILE_KVM_VIRTIO_H */ diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h index 44ed07c..927c97f 100644 --- a/arch/tile/include/asm/module.h +++ b/arch/tile/include/asm/module.h @@ -28,6 +28,13 @@ # define MODULE_PGSZ "" #endif +/* Tag guest Linux, since it uses different SPRs, etc. */ +#if CONFIG_KERNEL_PL == 2 +#define MODULE_PL "" +#else +#define MODULE_PL " guest" +#endif + /* We don't really support no-SMP so tag if someone tries. 
*/ #ifdef CONFIG_SMP #define MODULE_NOSMP "" @@ -35,6 +42,6 @@ #define MODULE_NOSMP " nosmp" #endif -#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP +#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP #endif /* _ASM_TILE_MODULE_H */ diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h index b4f96c0..65ee752 100644 --- a/arch/tile/include/asm/page.h +++ b/arch/tile/include/asm/page.h @@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif +#ifdef CONFIG_KVM_GUEST +/* Paravirtualized guests get half the VA, and thus half the PA. */ +#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1) +#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1) +#else +#define MAX_PA_WIDTH CHIP_PA_WIDTH() +#define MAX_VA_WIDTH CHIP_VA_WIDTH() +#endif + /* Each memory controller has PAs distinct in their high bits. */ -#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS()) +#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS()) #define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS()) #define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT) #define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT)) @@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size) * We reserve the lower half of memory for user-space programs, and the * upper half for system code. We re-map all of physical memory in the * upper half, which takes a quarter of our VA space. Then we have - * the vmalloc regions. The supervisor code lives at 0xfffffff700000000, + * the vmalloc regions. The supervisor code lives at the highest address, * with the hypervisor above that. * * Loadable kernel modules are placed immediately after the static @@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size) * Similarly, for now we don't play any struct page mapping games. */ -#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH() +#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH # error Too much PA to map with the VA available! 
#endif -#define HALF_VA_SPACE (_AC(1, UL) << (CHIP_VA_WIDTH() - 1)) -#define MEM_LOW_END (HALF_VA_SPACE - 1) /* low half */ -#define MEM_HIGH_START (-HALF_VA_SPACE) /* high half */ -#define PAGE_OFFSET MEM_HIGH_START -#define FIXADDR_BASE _AC(0xfffffff400000000, UL) /* 4 GB */ -#define FIXADDR_TOP _AC(0xfffffff500000000, UL) /* 4 GB */ +#ifdef CONFIG_KVM_GUEST +#define PAGE_OFFSET (_AC(1, UL) << (MAX_VA_WIDTH - 1)) +#define KERNEL_HIGH_VADDR (_AC(1, UL) << MAX_VA_WIDTH) +#else +#define PAGE_OFFSET (-(_AC(1, UL) << (MAX_VA_WIDTH - 1))) +#define KERNEL_HIGH_VADDR _AC(0xfffffff800000000, UL) /* high 32GB */ +#endif + +#define FIXADDR_BASE (KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */ +#define FIXADDR_TOP (KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */ #define _VMALLOC_START FIXADDR_TOP -#define HUGE_VMAP_BASE _AC(0xfffffff600000000, UL) /* 4 GB */ -#define MEM_SV_START _AC(0xfffffff700000000, UL) /* 256 MB */ -#define MEM_SV_INTRPT MEM_SV_START -#define MEM_MODULE_START _AC(0xfffffff710000000, UL) /* 256 MB */ +#define HUGE_VMAP_BASE (KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */ +#define MEM_SV_START (KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */ +#define MEM_MODULE_START (MEM_SV_START + (256*1024*1024)) /* 256 MB */ #define MEM_MODULE_END (MEM_MODULE_START + (256*1024*1024)) -#define MEM_HV_START _AC(0xfffffff800000000, UL) /* 32 GB */ - -/* Highest DTLB address we will use */ -#define KERNEL_HIGH_VADDR MEM_SV_START #else /* !__tilegx__ */ @@ -213,8 +221,8 @@ static inline __attribute_const__ int get_order(unsigned long size) * values, and after that, we show "typical" values, since the actual * addresses depend on kernel #defines. * - * MEM_HV_INTRPT 0xfe000000 - * MEM_SV_INTRPT (kernel code) 0xfd000000 + * MEM_HV_START 0xfe000000 + * MEM_SV_START (kernel code) 0xfd000000 * MEM_USER_INTRPT (user vector) 0xfc000000 * FIX_KMAP_xxx 0xf8000000 (via NR_CPUS * KM_TYPE_NR) * PKMAP_BASE 0xf7000000 (via LAST_PKMAP) @@ -224,14 +232,8 @@ static inline __attribute_const__ int get_order(unsigned long size) */ #define MEM_USER_INTRPT _AC(0xfc000000, UL) -#if CONFIG_KERNEL_PL == 1 -#define MEM_SV_INTRPT _AC(0xfd000000, UL) -#define MEM_HV_INTRPT _AC(0xfe000000, UL) -#else -#define MEM_GUEST_INTRPT _AC(0xfd000000, UL) -#define MEM_SV_INTRPT _AC(0xfe000000, UL) -#define MEM_HV_INTRPT _AC(0xff000000, UL) -#endif +#define MEM_SV_START _AC(0xfd000000, UL) +#define MEM_HV_START _AC(0xfe000000, UL) #define INTRPT_SIZE 0x4000 diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h index e5bdc0e..63142ab 100644 --- a/arch/tile/include/asm/pgtable_32.h +++ b/arch/tile/include/asm/pgtable_32.h @@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud) { return 0; } /* We don't define any pgds for these addresses. */ static inline int pgd_addr_invalid(unsigned long addr) { - return addr >= MEM_HV_INTRPT; + return addr >= MEM_HV_START; } /* diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h index 7cb8d35..3421177 100644 --- a/arch/tile/include/asm/pgtable_64.h +++ b/arch/tile/include/asm/pgtable_64.h @@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr) /* We don't define any pgds for these addresses. 
*/ static inline int pgd_addr_invalid(unsigned long addr) { - return addr >= MEM_HV_START || - (addr > MEM_LOW_END && addr < MEM_HIGH_START); + return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr); } /* diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index 230b830..5aa5431 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -15,6 +15,8 @@ #ifndef _ASM_TILE_PROCESSOR_H #define _ASM_TILE_PROCESSOR_H +#include + #ifndef __ASSEMBLY__ /* @@ -25,7 +27,6 @@ #include #include -#include #include struct task_struct; @@ -167,7 +168,7 @@ struct thread_struct { #ifndef __ASSEMBLY__ #ifdef __tilegx__ -#define TASK_SIZE_MAX (MEM_LOW_END + 1) +#define TASK_SIZE_MAX (_AC(1, UL) << (MAX_VA_WIDTH - 1)) #else #define TASK_SIZE_MAX PAGE_OFFSET #endif @@ -347,7 +348,6 @@ extern int kdata_huge; /* * Provide symbolic constants for PLs. - * Note that assembly code assumes that USER_PL is zero. */ #define USER_PL 0 #if CONFIG_KERNEL_PL == 2 diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h index 0d25c21..b9620c0 100644 --- a/arch/tile/include/asm/ptrace.h +++ b/arch/tile/include/asm/ptrace.h @@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t; #define user_stack_pointer(regs) ((regs)->sp) /* Does the process account for user or for system time? */ -#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL) +#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL) /* Fill in a struct pt_regs with the current kernel registers. */ struct pt_regs *get_pt_regs(struct pt_regs *); diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h index b8f888c..8e9150f 100644 --- a/arch/tile/include/asm/switch_to.h +++ b/arch/tile/include/asm/switch_to.h @@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev, extern unsigned long get_switch_to_pc(void); /* + * Normally we notify the simulator whenever we change from one pid + * to another, so it can track symbol files appropriately on the fly. + * For now, we don't do this for the guest Linux, since we don't + * have a way to tell the simulator that we are entering a separate + * pid space when we are in the guest. + */ +#ifdef CONFIG_KVM_GUEST +#define notify_sim_task_change(prev) do { } while (0) +#else +#define notify_sim_task_change(prev) do { \ + if (unlikely((prev)->state == TASK_DEAD)) \ + __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \ + ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \ + __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \ + (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \ +} while (0) +#endif + +/* * Kernel threads can check to see if they need to migrate their * stack whenever they return from a context switch; for user * threads, we defer until they are returning to user-space. 
*/ #define finish_arch_switch(prev) do { \ - if (unlikely((prev)->state == TASK_DEAD)) \ - __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \ - ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \ - __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \ - (current->pid << _SIM_CONTROL_OPERATOR_BITS)); \ + notify_sim_task_change(prev); \ if (current->mm == NULL && !kstack_hash && \ current_thread_info()->homecache_cpu != smp_processor_id()) \ homecache_migrate_kthread(); \ diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index b8aa6df..1c26cdf 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -18,7 +18,9 @@ #include #include + #ifndef __ASSEMBLY__ +struct kvm_vcpu; /* * Low level task data that assembly code needs immediate access to. @@ -44,6 +46,9 @@ struct thread_info { unsigned long unalign_jit_tmp[4]; /* temp r0..r3 storage */ void __user *unalign_jit_base; /* unalign fixup JIT base */ #endif +#ifdef CONFIG_KVM + struct kvm_vcpu *vcpu; /* vcpu during vmresume */ +#endif }; /* @@ -117,8 +122,8 @@ extern void _cpu_idle(void); /* * Thread information flags that various assembly files may need to access. - * Keep flags accessed frequently in low bits, particular since it makes - * it easier to build constants in assembly. + * Keep flags accessed frequently in low bits, since it makes it + * easier to build constants in assembly. */ #define TIF_SIGPENDING 0 /* signal pending */ #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ @@ -131,6 +136,7 @@ extern void _cpu_idle(void); #define TIF_MEMDIE 7 /* OOM killer at work */ #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ #define TIF_SYSCALL_TRACEPOINT 9 /* syscall tracepoint instrumentation */ +#define TIF_VIRT_EXIT 10 /* force exit of task in vmresume */ #define _TIF_SIGPENDING (1< +#endif + +#include + +/* + * For Hypervisor syscalls. Note this comes from the hv: syscall.h, + * with small modifications: Remove HV_SYS_fence_incoherent. + */ +/* Syscall allowed from guest PL bit mask. 
*/ +#define HV_SYS_GUEST_SHIFT 12 +#define HV_SYS_GUEST_MASK (1 << HV_SYS_GUEST_SHIFT) +/* downcall_dispatch; this syscall number must be zero */ +#define HV_SYS_downcall_dispatch 0 +/* install_context */ +#define HV_SYS_install_context 1 +/* sysconf */ +#define HV_SYS_sysconf 2 +/* get_rtc */ +#define HV_SYS_get_rtc 3 +/* set_rtc */ +#define HV_SYS_set_rtc 4 +/* flush_asid */ +#define HV_SYS_flush_asid 5 +/* flush_page */ +#define HV_SYS_flush_page 6 +/* flush_pages */ +#define HV_SYS_flush_pages 7 +/* restart */ +#define HV_SYS_restart 8 +/* halt */ +#define HV_SYS_halt 9 +/* power_off */ +#define HV_SYS_power_off 10 +/* inquire_physical */ +#define HV_SYS_inquire_physical 11 +/* inquire_memory_controller */ +#define HV_SYS_inquire_memory_controller 12 +/* inquire_virtual */ +#define HV_SYS_inquire_virtual 13 +/* inquire_asid */ +#define HV_SYS_inquire_asid 14 +/* console_read_if_ready */ +#define HV_SYS_console_read_if_ready 15 +/* console_write */ +#define HV_SYS_console_write 16 +/* init */ +#define HV_SYS_init 17 +/* inquire_topology */ +#define HV_SYS_inquire_topology 18 +/* fs_findfile */ +#define HV_SYS_fs_findfile 19 +/* fs_fstat */ +#define HV_SYS_fs_fstat 20 +/* fs_pread */ +#define HV_SYS_fs_pread 21 +/* physaddr_read64 */ +#define HV_SYS_physaddr_read64 22 +/* physaddr_write64 */ +#define HV_SYS_physaddr_write64 23 +/* get_command_line */ +#define HV_SYS_get_command_line 24 +/* set_caching */ +#define HV_SYS_set_caching 25 +/* bzero_page */ +#define HV_SYS_bzero_page 26 +/* register_message_state */ +#define HV_SYS_register_message_state 27 +/* send_message */ +#define HV_SYS_send_message 28 +/* receive_message */ +#define HV_SYS_receive_message 29 +/* inquire_context */ +#define HV_SYS_inquire_context 30 +/* start_all_tiles */ +#define HV_SYS_start_all_tiles 31 +/* dev_open */ +#define HV_SYS_dev_open 32 +/* dev_close */ +#define HV_SYS_dev_close 33 +/* dev_pread */ +#define HV_SYS_dev_pread 34 +/* dev_pwrite */ +#define HV_SYS_dev_pwrite 35 +/* dev_poll */ +#define HV_SYS_dev_poll 36 +/* dev_poll_cancel */ +#define HV_SYS_dev_poll_cancel 37 +/* dev_preada */ +#define HV_SYS_dev_preada 38 +/* dev_pwritea */ +#define HV_SYS_dev_pwritea 39 +/* flush_remote */ +#define HV_SYS_flush_remote 40 +/* console_putc */ +#define HV_SYS_console_putc 41 +/* inquire_tiles */ +#define HV_SYS_inquire_tiles 42 +/* confstr */ +#define HV_SYS_confstr 43 +/* reexec */ +#define HV_SYS_reexec 44 +/* set_command_line */ +#define HV_SYS_set_command_line 45 + +/* store_mapping */ +#define HV_SYS_store_mapping 52 +/* inquire_realpa */ +#define HV_SYS_inquire_realpa 53 +/* flush_all */ +#define HV_SYS_flush_all 54 +/* get_ipi_pte */ +#define HV_SYS_get_ipi_pte 55 +/* set_pte_super_shift */ +#define HV_SYS_set_pte_super_shift 56 +/* set_speed */ +#define HV_SYS_set_speed 57 +/* install_virt_context */ +#define HV_SYS_install_virt_context 58 +/* inquire_virt_context */ +#define HV_SYS_inquire_virt_context 59 +/* inquire_guest_context */ +#define HV_SYS_install_guest_context 60 +/* inquire_guest_context */ +#define HV_SYS_inquire_guest_context 61 + +/* + * Number of hypercall (from guest os to host os) other than hv_*(). + * We leave the previous 128 entries to the usual hv_*() calls + * as defined in hypervisor.h. + */ +#define KVM_OTHER_HCALL 128 + +/* Hypercall index for virtio. */ +#define KVM_HCALL_virtio 128 + +/* One greater than the maximum hypercall number. 
*/ +#define KVM_NUM_HCALLS 256 + +#ifndef __ASSEMBLER__ + +struct kvm_regs { + struct pt_regs regs; +}; + +#define FOR_EACH_GUEST_SPR(f) \ + f(INTERRUPT_MASK_1); \ + f(INTERRUPT_VECTOR_BASE_1); \ + f(EX_CONTEXT_1_0); \ + f(EX_CONTEXT_1_1); \ + f(SYSTEM_SAVE_1_0); \ + f(SYSTEM_SAVE_1_1); \ + f(SYSTEM_SAVE_1_2); \ + f(SYSTEM_SAVE_1_3); \ + f(INTCTRL_1_STATUS); \ + f(IPI_MASK_1); \ + f(IPI_EVENT_1); \ + f(SINGLE_STEP_CONTROL_1); \ + f(SINGLE_STEP_EN_1_1); \ + +struct kvm_sregs { +#define DECLARE_SPR(f) unsigned long f + FOR_EACH_GUEST_SPR(DECLARE_SPR) +#undef DECLARE_SPR +}; + +struct kvm_fpu { +}; + +struct kvm_debug_exit_arch { +}; + +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +#ifndef __KERNEL__ +/* For hv_*() */ +#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal, +#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name, +#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal, +#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name, +/* For others */ +#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name, +#endif + +#define HCALL_DEFS \ + /* For hv_*() */ \ + KVM_EMULATE(init) \ + NO_EMULATE(install_context) \ + KVM_EMULATE(sysconf) \ + KVM_EMULATE(get_rtc) \ + KVM_EMULATE(set_rtc) \ + NO_EMULATE(flush_asid) \ + NO_EMULATE(flush_page) \ + NO_EMULATE(flush_pages) \ + USER_EMULATE(restart) \ + USER_EMULATE(halt) \ + USER_EMULATE(power_off) \ + USER_EMULATE(inquire_physical) \ + USER_EMULATE(inquire_memory_controller) \ + KVM_EMULATE(inquire_virtual) \ + KVM_EMULATE(inquire_asid) \ + NO_EMULATE(console_read_if_ready) \ + NO_EMULATE(console_write) \ + NO_EMULATE(downcall_dispatch) \ + KVM_EMULATE(inquire_topology) \ + USER_EMULATE(fs_findfile) \ + USER_EMULATE(fs_fstat) \ + USER_EMULATE(fs_pread) \ + KVM_EMULATE(physaddr_read64) \ + KVM_EMULATE(physaddr_write64) \ + USER_EMULATE(get_command_line) \ + USER_EMULATE(set_caching) \ + NO_EMULATE(bzero_page) \ + KVM_EMULATE(register_message_state) \ + KVM_EMULATE(send_message) \ + KVM_EMULATE(receive_message) \ + KVM_EMULATE(inquire_context) \ + KVM_EMULATE(start_all_tiles) \ + USER_EMULATE(dev_open) \ + USER_EMULATE(dev_close) \ + USER_EMULATE(dev_pread) \ + USER_EMULATE(dev_pwrite) \ + USER_EMULATE(dev_poll) \ + USER_EMULATE(dev_poll_cancel) \ + USER_EMULATE(dev_preada) \ + USER_EMULATE(dev_pwritea) \ + USER_EMULATE(flush_remote) \ + NO_EMULATE(console_putc) \ + KVM_EMULATE(inquire_tiles) \ + KVM_EMULATE(confstr) \ + USER_EMULATE(reexec) \ + USER_EMULATE(set_command_line) \ + USER_EMULATE(store_mapping) \ + NO_EMULATE(inquire_realpa) \ + NO_EMULATE(flush_all) \ + KVM_EMULATE(get_ipi_pte) \ + KVM_EMULATE(set_pte_super_shift) \ + KVM_EMULATE(set_speed) \ + /* For others */ \ + USER_HCALL(virtio) + +#endif + +#endif /* _UAPI_ASM_TILE_KVM_H */ diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h new file mode 100644 index 0000000..d94f535 --- /dev/null +++ b/arch/tile/include/uapi/asm/kvm_virtio.h @@ -0,0 +1,60 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H +#define _UAPI_ASM_TILE_KVM_VIRTIO_H + +#include + +#define KVM_VIRTIO_UNKNOWN 0 +#define KVM_VIRTIO_NOTIFY 1 +#define KVM_VIRTIO_RESET 2 +#define KVM_VIRTIO_SET_STATUS 3 + +struct kvm_device_desc { + /* The device type: console, network, disk etc. Type 0 terminates. */ + __u8 type; + /* The number of virtqueues (first in config array) */ + __u8 num_vq; + /* + * The number of bytes of feature bits. Multiply by 2: one for host + * features and one for Guest acknowledgements. + */ + __u8 feature_len; + /* The number of bytes of the config array after virtqueues. */ + __u8 config_len; + /* A status byte, written by the Guest. */ + __u8 status; + __u64 config[0]; +}; + +struct kvm_vqinfo { + /* Pointer to the information contained in the device config. */ + struct kvm_vqconfig *config; + /* The address where we mapped the virtio ring, so we can unmap it. */ + void *pages; +}; + +struct kvm_vqconfig { + /* The physical address of the virtio ring */ + __u64 pa; + /* The number of entries in the virtio_ring */ + __u64 num; + /* The interrupt we get when something happens. Set by the guest. */ + __u32 irq; + +}; + + +#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */ diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile index b7c8b5e..b638d3e 100644 --- a/arch/tile/kernel/Makefile +++ b/arch/tile/kernel/Makefile @@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB) += usb.o obj-$(CONFIG_TILE_HVGLUE_TRACE) += hvglue_trace.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount_64.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KVM_GUEST) += kvm_virtio.o obj-y += vdso/ diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c index 97ea6ac..0a04a16 100644 --- a/arch/tile/kernel/asm-offsets.c +++ b/arch/tile/kernel/asm-offsets.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_KVM +#include +#endif /* Check for compatible compiler early in the build. 
*/ #ifdef CONFIG_TILEGX @@ -68,6 +71,10 @@ void foo(void) DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET, offsetof(struct thread_info, unalign_jit_tmp)); #endif +#ifdef CONFIG_KVM + DEFINE(THREAD_INFO_VCPU_OFFSET, + offsetof(struct thread_info, vcpu)); +#endif DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET, offsetof(struct task_struct, thread.ksp)); diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c index b608e00..53f2be4 100644 --- a/arch/tile/kernel/early_printk.c +++ b/arch/tile/kernel/early_printk.c @@ -18,11 +18,26 @@ #include #include #include +#ifdef CONFIG_KVM_GUEST +#include +#include +#include +#endif #include #include static void early_hv_write(struct console *con, const char *s, unsigned n) { +#ifdef CONFIG_KVM_GUEST + char buf[512]; + + if (n > sizeof(buf) - 1) + n = sizeof(buf) - 1; + memcpy(buf, s, n); + buf[n] = '\0'; + + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf)); +#else tile_console_write(s, n); /* @@ -32,6 +47,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n) */ if (n && s[n-1] == '\n') tile_console_write("\r", 1); +#endif } static struct console early_hv_console = { diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S index f3f17b0..8d5b40f 100644 --- a/arch/tile/kernel/head_32.S +++ b/arch/tile/kernel/head_32.S @@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir) .set addr, addr + PGDIR_SIZE .endr - /* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */ - PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \ + /* The true text VAs are mapped as VA = PA + MEM_SV_START */ + PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \ (1 << (HV_PTE_INDEX_EXECUTABLE - 32)) .org swapper_pg_dir + PGDIR_SIZE END(swapper_pg_dir) diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S index 652b814..bd0e12f 100644 --- a/arch/tile/kernel/head_64.S +++ b/arch/tile/kernel/head_64.S @@ -135,9 +135,9 @@ ENTRY(_start) 1: /* Install the interrupt base. */ - moveli r0, hw2_last(MEM_SV_START) - shl16insli r0, r0, hw1(MEM_SV_START) - shl16insli r0, r0, hw0(MEM_SV_START) + moveli r0, hw2_last(intrpt_start) + shl16insli r0, r0, hw1(intrpt_start) + shl16insli r0, r0, hw0(intrpt_start) mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0 /* Get our processor number and save it away in SAVE_K_0. 
*/ diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S index 16576c6..2914a9e 100644 --- a/arch/tile/kernel/hvglue.S +++ b/arch/tile/kernel/hvglue.S @@ -71,5 +71,11 @@ gensym hv_flush_all, 0x6e0, 32 gensym hv_get_ipi_pte, 0x700, 32 gensym hv_set_pte_super_shift, 0x720, 32 gensym hv_set_speed, 0x740, 32 +gensym hv_install_virt_context, 0x760, 32 +gensym hv_inquire_virt_context, 0x780, 32 +gensym hv_install_guest_context, 0x7a0, 32 +gensym hv_inquire_guest_context, 0x7c0, 32 gensym hv_console_set_ipi, 0x7e0, 32 -gensym hv_glue_internals, 0x800, 30720 +gensym hv_glue_internals, 0x800, 2048 +gensym hcall_virtio, 0x1000, 32 +gensym hv_hcall_internals, 0x1020, 28640 diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c index 16ef6c1..3b15c76 100644 --- a/arch/tile/kernel/hvglue_trace.c +++ b/arch/tile/kernel/hvglue_trace.c @@ -75,6 +75,10 @@ #define hv_get_ipi_pte _hv_get_ipi_pte #define hv_set_pte_super_shift _hv_set_pte_super_shift #define hv_set_speed _hv_set_speed +#define hv_install_virt_context _hv_install_virt_context +#define hv_inquire_virt_context _hv_inquire_virt_context +#define hv_install_guest_context _hv_install_guest_context +#define hv_inquire_guest_context _hv_inquire_guest_context #define hv_console_set_ipi _hv_console_set_ipi #include #undef hv_init @@ -135,6 +139,10 @@ #undef hv_get_ipi_pte #undef hv_set_pte_super_shift #undef hv_set_speed +#undef hv_install_virt_context +#undef hv_inquire_virt_context +#undef hv_install_guest_context +#undef hv_inquire_guest_context #undef hv_console_set_ipi /* @@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle, unsigned long, flags) HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access, HV_ASID, asid, __hv32, flags) +HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access, + HV_ASID, asid, __hv32, flags) +HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access, + HV_ASID, asid, __hv32, flags) HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count) HV_WRAP0(HV_Context, hv_inquire_context) +HV_WRAP0(HV_Context, hv_inquire_virt_context) +HV_WRAP0(HV_Context, hv_inquire_guest_context) HV_WRAP1(int, hv_flush_asid, HV_ASID, asid) HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size) HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size, diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S index f3d26f4..2ce69a5 100644 --- a/arch/tile/kernel/intvec_32.S +++ b/arch/tile/kernel/intvec_32.S @@ -353,7 +353,7 @@ intvec_\vecname: #ifdef __COLLECT_LINKER_FEEDBACK__ .pushsection .text.intvec_feedback,"ax" .org (\vecnum << 5) - FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8) + FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8) jrp lr .popsection #endif @@ -806,7 +806,7 @@ handle_interrupt: STD_ENTRY(interrupt_return) /* If we're resuming to kernel space, don't check thread flags. 
*/ { - bnz r30, .Lrestore_all /* NMIs don't special-case user-space */ + bnz r30, restore_all /* NMIs don't special-case user-space */ PTREGS_PTR(r29, PTREGS_OFFSET_EX1) } lw r29, r29 @@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return) seq r27, r27, r28 } { - bbns r27, .Lrestore_all + bbns r27, restore_all addi r28, r28, 8 } sw r29, r28 - j .Lrestore_all + j restore_all .Lresume_userspace: FEEDBACK_REENTER(interrupt_return) @@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return) auli r1, r1, ha16(_TIF_ALLWORK_MASK) } and r1, r29, r1 - bzt r1, .Lrestore_all + bzt r1, restore_all /* * Make sure we have all the registers saved for signal @@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return) * profile interrupt will actually disable interrupts in both SPRs * before returning, which is OK.) */ -.Lrestore_all: + .global restore_all + .type restore_all, @function +restore_all: PTREGS_PTR(r0, PTREGS_OFFSET_EX1) { lw r0, r0 @@ -1890,8 +1892,8 @@ int_unalign: push_extra_callee_saves r0 j do_trap -/* Include .intrpt1 array of interrupt vectors */ - .section ".intrpt1", "ax" +/* Include .intrpt array of interrupt vectors */ + .section ".intrpt", "ax" #define op_handle_perf_interrupt bad_intr #define op_handle_aux_perf_interrupt bad_intr diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S index 18b2dcc..2c5cbe0 100644 --- a/arch/tile/kernel/intvec_64.S +++ b/arch/tile/kernel/intvec_64.S @@ -29,11 +29,25 @@ #include #include #include +#include +#ifdef CONFIG_KVM +#include +#endif #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg) #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR) +#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2 +/* + * Set "result" non-zero if ex1 holds the PL of the kernel + * (with or without ICS being set). Note this works only + * because we never find the PL at level 3. + */ +# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL +#else +# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL +#endif .macro push_reg reg, ptr=sp, delta=-8 { @@ -308,7 +322,7 @@ intvec_\vecname: */ { blbs sp, 2f - andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(r0, r0) } .ifc \vecnum, INT_DOUBLE_FAULT @@ -347,10 +361,6 @@ intvec_\vecname: * * Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for * any path that turns into a downcall to one of our TLB handlers. - * - * FIXME: if we end up never using this path, perhaps we should - * prevent the hypervisor from generating downcalls in this case. - * The advantage of getting a downcall is we can panic in Linux. 
*/ mfspr r0, SPR_SYSTEM_SAVE_K_2 { @@ -490,6 +500,10 @@ intvec_\vecname: mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */ mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */ .else + .ifc \c_routine, kvm_vpgtable_miss + mfspr r2, SPR_SYSTEM_SAVE_K_3 /* address of page fault */ + mfspr r3, SPR_SYSTEM_SAVE_K_2 /* info about page fault */ + .else .ifc \vecnum, INT_ILL_TRANS mfspr r2, ILL_VA_PC .else @@ -512,6 +526,7 @@ intvec_\vecname: .endif .endif .endif + .endif /* Put function pointer in r0 */ moveli r0, hw2_last(\c_routine) shl16insli r0, r0, hw1(\c_routine) @@ -525,7 +540,7 @@ intvec_\vecname: #ifdef __COLLECT_LINKER_FEEDBACK__ .pushsection .text.intvec_feedback,"ax" .org (\vecnum << 5) - FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8) + FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8) jrp lr .popsection #endif @@ -641,24 +656,25 @@ intvec_\vecname: /* * If we will be returning to the kernel, we will need to * reset the interrupt masks to the state they had before. - * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled. + * Set DISABLE_IRQ in flags iff we came from kernel pl with + * irqs disabled. */ - mfspr r32, SPR_EX_CONTEXT_K_1 + mfspr r22, SPR_EX_CONTEXT_K_1 { - andi r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(r22, r22) PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS) } - beqzt r32, 1f /* zero if from user space */ - IRQS_DISABLED(r32) /* zero if irqs enabled */ + beqzt r22, 1f /* zero if from user space */ + IRQS_DISABLED(r22) /* zero if irqs enabled */ #if PT_FLAGS_DISABLE_IRQ != 1 # error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix #endif 1: .ifnc \function,handle_syscall /* Record the fact that we saved the caller-save registers above. */ - ori r32, r32, PT_FLAGS_CALLER_SAVES + ori r22, r22, PT_FLAGS_CALLER_SAVES .endif - st r21, r32 + st r21, r22 /* * we've captured enough state to the stack (including in @@ -698,12 +714,29 @@ intvec_\vecname: move tp, zero #endif + /* + * Prepare the first 256 stack bytes to be rapidly accessible + * without having to fetch the background data. + */ + addi r52, sp, -64 + { + wh64 r52 + addi r52, r52, -64 + } + { + wh64 r52 + addi r52, r52, -64 + } + { + wh64 r52 + addi r52, r52, -64 + } + wh64 r52 + #ifdef __COLLECT_LINKER_FEEDBACK__ /* * Notify the feedback routines that we were in the - * appropriate fixed interrupt vector area. Note that we - * still have ICS set at this point, so we can't invoke any - * atomic operations or we will panic. The feedback + * appropriate fixed interrupt vector area. The feedback * routines internally preserve r0..r10 and r30 up. */ .ifnc \function,handle_syscall @@ -722,23 +755,15 @@ intvec_\vecname: #endif /* - * Prepare the first 256 stack bytes to be rapidly accessible - * without having to fetch the background data. + * Stash any interrupt state in r30..r33 for now. + * This makes it easier to call C code in the code that follows. + * We don't need to on the syscall path since we reload + * them from the stack instead. 
*/ - addi r52, sp, -64 - { - wh64 r52 - addi r52, r52, -64 - } - { - wh64 r52 - addi r52, r52, -64 - } - { - wh64 r52 - addi r52, r52, -64 - } - wh64 r52 + .ifnc \function,handle_syscall + { move r30, r0; move r31, r1 } + { move r32, r2; move r33, r3 } + .endif #ifdef CONFIG_TRACE_IRQFLAGS .ifnc \function,handle_nmi @@ -749,17 +774,8 @@ intvec_\vecname: * For syscalls, we already have the register state saved away * on the stack, so we don't bother to do any register saves here, * and later we pop the registers back off the kernel stack. - * For interrupt handlers, save r0-r3 in callee-saved registers. */ - .ifnc \function,handle_syscall - { move r30, r0; move r31, r1 } - { move r32, r2; move r33, r3 } - .endif TRACE_IRQS_OFF - .ifnc \function,handle_syscall - { move r0, r30; move r1, r31 } - { move r2, r32; move r3, r33 } - .endif .endif #endif @@ -808,11 +824,11 @@ handle_interrupt: STD_ENTRY(interrupt_return) /* If we're resuming to kernel space, don't check thread flags. */ { - bnez r30, .Lrestore_all /* NMIs don't special-case user-space */ + bnez r30, restore_all /* NMIs don't special-case user-space */ PTREGS_PTR(r29, PTREGS_OFFSET_EX1) } ld r29, r29 - andi r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(r29, r29) { beqzt r29, .Lresume_userspace move r29, sp @@ -824,14 +840,25 @@ STD_ENTRY(interrupt_return) addli r28, r29, THREAD_INFO_FLAGS_OFFSET { ld r28, r28 - addli r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET + addli r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET } { - andi r28, r28, _TIF_NEED_RESCHED - ld4s r29, r29 + andi r27, r28, _TIF_NEED_RESCHED + ld4s r26, r26 } - beqzt r28, 1f - bnez r29, 1f + beqzt r27, 1f + bnez r26, 1f +#ifdef CONFIG_KVM + addli r27, r29, THREAD_INFO_VCPU_OFFSET + ld r27, r27 + { + beqzt r27, 0f + movei r1, KVM_EXIT_AGAIN + } + push_extra_callee_saves r0 + j kvm_trigger_vmexit +0: +#endif jal preempt_schedule_irq FEEDBACK_REENTER(interrupt_return) 1: @@ -853,11 +880,11 @@ STD_ENTRY(interrupt_return) cmpeq r27, r27, r28 } { - blbc r27, .Lrestore_all + blbc r27, restore_all addi r28, r28, 8 } st r29, r28 - j .Lrestore_all + j restore_all .Lresume_userspace: FEEDBACK_REENTER(interrupt_return) @@ -897,7 +924,7 @@ STD_ENTRY(interrupt_return) shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK) } and r1, r29, r1 - beqzt r1, .Lrestore_all + beqzt r1, restore_all /* * Make sure we have all the registers saved for signal @@ -929,14 +956,16 @@ STD_ENTRY(interrupt_return) * ICS can only be used in very tight chunks of code to avoid * tripping over various assertions that it is off. */ -.Lrestore_all: + .global restore_all + .type restore_all, @function +restore_all: PTREGS_PTR(r0, PTREGS_OFFSET_EX1) { ld r0, r0 PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS) } { - andi r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK + IS_KERNEL_EX1(r0, r0) ld r32, r32 } bnez r0, 1f @@ -1007,7 +1036,7 @@ STD_ENTRY(interrupt_return) pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC { mtspr SPR_EX_CONTEXT_K_1, lr - andi lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK /* mask off ICS */ + IS_KERNEL_EX1(lr, lr) } { mtspr SPR_EX_CONTEXT_K_0, r21 @@ -1457,6 +1486,26 @@ int_unalign: j do_unaligned ENDPROC(hand_unalign_slow) +#ifdef CONFIG_KVM +/* + * Any call path that may lead to a vmexit needs to save the full + * callee-save register state, since if we vmexit we don't unwind + * the callee-saves from the C function stack frames, and instead + * just save away the register state from the interrupt handler as-is + * and later reload it directly and call back into the guest. 
+ */ + .macro save_callee_saves_and_tailcall func +kvm_\func: + push_extra_callee_saves r0 + j kvm_do_\func + ENDPROC(\func) + .endm + + save_callee_saves_and_tailcall hypervisor_call + save_callee_saves_and_tailcall vpgtable_miss + save_callee_saves_and_tailcall vguest_fatal +#endif + /* Fill the return address stack with nonzero entries. */ STD_ENTRY(fill_ra_stack) { @@ -1469,13 +1518,57 @@ STD_ENTRY(fill_ra_stack) 4: jrp r0 STD_ENDPROC(fill_ra_stack) +#ifdef CONFIG_KVM +/* + * Handle the downcall dispatch service. On entry, the client's + * system save register 3 holds the original contents of + * REG_SYSCALL_NR_NAME, which we need to restore before we iret to + * the correct interrupt vector. + * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt + * here, since this is the only interrupt handled this way on GX. + */ +handle_downcall_dispatch: + /* + * If we were called from PL0, jump back to slow path. + * We check just the low bit to make sure it's set, since we + * can only be called from PL0 or PL1. + */ + mfspr TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1 + blbc TREG_SYSCALL_NR_NAME, intvec_SWINT_0 + + /* Set the PC to the downcall interrupt vector, and PL to guest. */ + mfspr TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1 + addli TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \ + INT_MESSAGE_RCV_DWNCL << 8 + { + mtspr SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME + movei TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK + } + mtspr SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME + + /* Restore REG_SYSCALL_NR_NAME and return to the new vector. */ + mfspr TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3 + iret + + .macro int_hand_kvm_hcall vecnum, vecname, c_routine, \ + processing=handle_interrupt + .org (\vecnum << 8) + /* Need special code for downcall dispatch syscall. 
*/ + beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch + __int_hand \vecnum, \vecname, \c_routine, \processing + .endm + +#endif /* CONFIG_KVM */ + .macro int_hand vecnum, vecname, c_routine, processing=handle_interrupt .org (\vecnum << 8) __int_hand \vecnum, \vecname, \c_routine, \processing .endm -/* Include .intrpt1 array of interrupt vectors */ - .section ".intrpt1", "ax" +/* Include .intrpt array of interrupt vectors */ + .section ".intrpt", "ax" + .global intrpt_start +intrpt_start: #define op_handle_perf_interrupt bad_intr #define op_handle_aux_perf_interrupt bad_intr @@ -1484,6 +1577,11 @@ STD_ENTRY(fill_ra_stack) #define do_hardwall_trap bad_intr #endif +#ifndef CONFIG_KVM +#define kvm_vpgtable_miss bad_intr +#define kvm_vguest_fatal bad_intr +#endif + int_hand INT_MEM_ERROR, MEM_ERROR, do_trap int_hand INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr #if CONFIG_KERNEL_PL == 2 @@ -1504,14 +1602,24 @@ STD_ENTRY(fill_ra_stack) int_hand INT_SWINT_3, SWINT_3, do_trap int_hand INT_SWINT_2, SWINT_2, do_trap int_hand INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall +#ifdef CONFIG_KVM + int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call +#else int_hand INT_SWINT_0, SWINT_0, do_trap +#endif int_hand INT_ILL_TRANS, ILL_TRANS, do_trap int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA int_hand INT_DTLB_MISS, DTLB_MISS, do_page_fault int_hand INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault int_hand INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap int_hand INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap +#ifndef CONFIG_KVM_GUEST int_hand INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt + int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr +#else + int_hand INT_TILE_TIMER, TILE_TIMER, bad_intr + int_hand INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt +#endif int_hand INT_IDN_TIMER, IDN_TIMER, bad_intr int_hand INT_UDN_TIMER, UDN_TIMER, bad_intr int_hand INT_IDN_AVAIL, IDN_AVAIL, bad_intr @@ -1541,8 +1649,10 @@ STD_ENTRY(fill_ra_stack) int_hand INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \ hv_message_intr int_hand INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr - int_hand INT_I_ASID, I_ASID, bad_intr - int_hand INT_D_ASID, D_ASID, bad_intr + int_hand INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \ + kvm_vpgtable_miss + int_hand INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \ + kvm_vguest_fatal int_hand INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap /* Synthetic interrupt delivered only by the simulator */ diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c new file mode 100644 index 0000000..c6b6c6a --- /dev/null +++ b/arch/tile/kernel/kvm_virtio.c @@ -0,0 +1,430 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + */ + +/* Referred lguest & s390 implemenation */ +/* + * kvm_virtio.c - virtio for kvm on s390 + * + * Copyright IBM Corp. 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. 
+ * + * Author(s): Christian Borntraeger + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static void *kvm_devices; + +/* + * TODO: We actually does not use PCI virtio here. We use this + * because qemu: virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN. + * Maybe we should change them to generic definitions in both qemu & Linux. + * Besides, Let's check whether the alignment value (4096, i.e. default + * x86 page size) affects performance later. + */ +#define KVM_TILE_VIRTIO_RING_ALIGN VIRTIO_PCI_VRING_ALIGN +#define to_kvmdev(vd) container_of(vd, struct kvm_device, vdev) + +/* + * memory layout: (Total: PAGE_SIZE) + * + * - kvm device descriptor + * struct kvm_device_desc + * - vqueue configuration (totally desc->num_vq) + * struct kvm_vqconfig + * ...... + * struct kvm_vqconfig + * - feature bits (size: desc->feature_len * 2) + * - config space (size: desc->config_len) + * + * ...... + */ +static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc) +{ + return (struct kvm_vqconfig *)(desc + 1); +} + +static u8 *kvm_vq_features(const struct kvm_device_desc *desc) +{ + return (u8 *)(kvm_vq_config(desc) + desc->num_vq); +} + +static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc) +{ + return kvm_vq_features(desc) + desc->feature_len * 2; +} + +/* + * The total size of the config page used by this device (incl. desc) + */ +static unsigned desc_size(const struct kvm_device_desc *desc) +{ + return sizeof(*desc) + + desc->num_vq * sizeof(struct kvm_vqconfig) + + desc->feature_len * 2 + + desc->config_len; +} + +/* This gets the device's feature bits. */ +static u32 kvm_get_features(struct virtio_device *vdev) +{ + unsigned int i; + u32 features = 0; + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; + u8 *in_features = kvm_vq_features(desc); + + for (i = 0; i < min(desc->feature_len * 8, 32); i++) + if (in_features[i / 8] & (1 << (i % 8))) + features |= (1 << i); + return features; +} + +static void kvm_finalize_features(struct virtio_device *vdev) +{ + unsigned int i, bits; + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; + /* Second half of bitmap is features we accept. */ + u8 *out_features = kvm_vq_features(desc) + desc->feature_len; + + /* Give virtio_ring a chance to accept features. */ + vring_transport_features(vdev); + + memset(out_features, 0, desc->feature_len); + bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; + for (i = 0; i < bits; i++) { + if (test_bit(i, vdev->features)) + out_features[i / 8] |= (1 << (i % 8)); + } +} + +/* + * Reading and writing elements in config space + */ +static void kvm_get(struct virtio_device *vdev, unsigned int offset, + void *buf, unsigned len) +{ + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; + + BUG_ON(offset + len > desc->config_len); + memcpy(buf, kvm_vq_configspace(desc) + offset, len); +} + +static void kvm_set(struct virtio_device *vdev, unsigned int offset, + const void *buf, unsigned len) +{ + struct kvm_device_desc *desc = to_kvmdev(vdev)->desc; + + BUG_ON(offset + len > desc->config_len); + memcpy(kvm_vq_configspace(desc) + offset, buf, len); +} + +/* + * The operations to get and set the status word just access + * the status field of the device descriptor. 
set_status will also + * make a hypercall to the host, to tell about status changes + */ +static u8 kvm_get_status(struct virtio_device *vdev) +{ + return to_kvmdev(vdev)->desc->status; +} + +static void kvm_set_status(struct virtio_device *vdev, u8 status) +{ + BUG_ON(!status); + to_kvmdev(vdev)->desc->status = status; + hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa); +} + +/* + * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the + * descriptor address. The Host will zero the status and all the + * features. + */ +static void kvm_reset(struct virtio_device *vdev) +{ + hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa); +} + +/* + * When the virtio_ring code wants to notify the Host, it calls us here and we + * make a hypercall. We hand the address of the virtqueue so the Host + * knows which virtqueue we're talking about. + */ +static void kvm_notify(struct virtqueue *vq) +{ + struct kvm_vqinfo *vqi = vq->priv; + + hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa); +} + +/* + * Must set some caching mode to keep set_pte() happy. + * It doesn't matter what we choose, because the PFN + * is illegal, so we're going to take a page fault anyway. + */ +static inline pgprot_t io_prot(void) +{ + return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED); +} + +/* + * This routine finds the first virtqueue described in the configuration of + * this device and sets it up. + */ +static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, + unsigned index, + void (*callback)(struct virtqueue *vq), + const char *name) +{ + struct kvm_device *kdev = to_kvmdev(vdev); + struct kvm_vqinfo *vqi; + struct kvm_vqconfig *config; + struct virtqueue *vq; + long irq; + int err = -EINVAL; + + if (index >= kdev->desc->num_vq) + return ERR_PTR(-ENOENT); + + vqi = kzalloc(sizeof(*vqi), GFP_KERNEL); + if (!vqi) + return ERR_PTR(-ENOMEM); + + config = kvm_vq_config(kdev->desc)+index; + + vqi->config = config; + vqi->pages = generic_remap_prot(config->pa, + vring_size(config->num, + KVM_TILE_VIRTIO_RING_ALIGN), + 0, io_prot()); + if (!vqi->pages) { + err = -ENOMEM; + goto out; + } + + vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN, + vdev, 0, vqi->pages, + kvm_notify, callback, name); + if (!vq) { + err = -ENOMEM; + goto unmap; + } + + /* + * Trigger the IPI interrupt in SW way. + * TODO: We do not need to create one irq for each vq. A bit wasteful. + */ + irq = create_irq(); + if (irq < 0) { + err = -ENXIO; + goto del_virtqueue; + } + + tile_irq_activate(irq, TILE_IRQ_SW_CLEAR); + + if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) { + err = -ENXIO; + destroy_irq(irq); + goto del_virtqueue; + } + + config->irq = irq; + + vq->priv = vqi; + return vq; + +del_virtqueue: + vring_del_virtqueue(vq); +unmap: + vunmap(vqi->pages); +out: + return ERR_PTR(err); +} + +static void kvm_del_vq(struct virtqueue *vq) +{ + struct kvm_vqinfo *vqi = vq->priv; + + vring_del_virtqueue(vq); + vunmap(vqi->pages); + kfree(vqi); +} + +static void kvm_del_vqs(struct virtio_device *vdev) +{ + struct virtqueue *vq, *n; + + list_for_each_entry_safe(vq, n, &vdev->vqs, list) + kvm_del_vq(vq); +} + +static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + struct kvm_device *kdev = to_kvmdev(vdev); + int i; + + /* We must have this many virtqueues. 
*/ + if (nvqs > kdev->desc->num_vq) + return -ENOENT; + + for (i = 0; i < nvqs; ++i) { + vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]); + if (IS_ERR(vqs[i])) + goto error; + } + return 0; + +error: + kvm_del_vqs(vdev); + return PTR_ERR(vqs[i]); +} + +/* + * The config ops structure as defined by virtio config + */ +static struct virtio_config_ops kvm_vq_config_ops = { + .get_features = kvm_get_features, + .finalize_features = kvm_finalize_features, + .get = kvm_get, + .set = kvm_set, + .get_status = kvm_get_status, + .set_status = kvm_set_status, + .reset = kvm_reset, + .find_vqs = kvm_find_vqs, + .del_vqs = kvm_del_vqs, +}; + +/* + * The root device for the kvm virtio devices. + * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2. + */ +static struct device *kvm_root; + +/* + * adds a new device and register it with virtio + * appropriate drivers are loaded by the device model + */ +static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset) +{ + struct kvm_device *kdev; + + kdev = kzalloc(sizeof(*kdev), GFP_KERNEL); + if (!kdev) { + pr_emerg("Cannot allocate kvm dev %u type %u\n", + offset, d->type); + return; + } + + kdev->vdev.dev.parent = kvm_root; + kdev->vdev.id.device = d->type; + kdev->vdev.config = &kvm_vq_config_ops; + kdev->desc = d; + kdev->desc_pa = PFN_PHYS(max_pfn) + offset; + + if (register_virtio_device(&kdev->vdev) != 0) { + pr_err("Failed to register kvm device %u type %u\n", + offset, d->type); + kfree(kdev); + } +} + +/* + * scan_devices() simply iterates through the device page. + * The type 0 is reserved to mean "end of devices". + */ +static void scan_devices(void) +{ + unsigned int i; + struct kvm_device_desc *d; + + for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { + d = kvm_devices + i; + + if (d->type == 0) + break; + + add_kvm_device(d, i); + } +} + +/* + * Init function for virtio. + * devices are in a single page above the top of "normal" mem. + */ +static int __init kvm_devices_init(void) +{ + int rc = -ENOMEM; + + kvm_root = root_device_register("kvm_tile"); + if (IS_ERR(kvm_root)) { + rc = PTR_ERR(kvm_root); + pr_err("Could not register kvm_tile root device"); + return rc; + } + + kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE, + 0, io_prot()); + if (!kvm_devices) { + kvm_devices = NULL; + root_device_unregister(kvm_root); + return rc; + } + + scan_devices(); + return 0; +} + +/* code for early console output with virtio_console */ +static __init int early_put_chars(u32 vtermno, const char *buf, int len) +{ + char scratch[512]; + + if (len > sizeof(scratch) - 1) + len = sizeof(scratch) - 1; + scratch[len] = '\0'; + memcpy(scratch, buf, len); + hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch)); + + return len; +} + +static int __init tile_virtio_console_init(void) +{ + return virtio_cons_early_init(early_put_chars); +} +console_initcall(tile_virtio_console_init); + +/* + * We do this after core stuff, but before the drivers. + */ +postcore_initcall(kvm_devices_init); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 44cdc4a..2629ff1 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -247,11 +248,13 @@ struct task_struct *validate_current(void) /* Take and return the pointer to the previous task, for schedule_tail(). 
*/ struct task_struct *sim_notify_fork(struct task_struct *prev) { +#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */ struct task_struct *tsk = current; __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT | (tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS)); __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK | (tsk->pid << _SIM_CONTROL_OPERATOR_BITS)); +#endif return prev; } @@ -450,6 +453,11 @@ void _prepare_arch_switch(struct task_struct *next) struct task_struct *__sched _switch_to(struct task_struct *prev, struct task_struct *next) { +#ifdef CONFIG_KVM + /* vmexit is needed before context switch. */ + BUG_ON(task_thread_info(prev)->vcpu); +#endif + /* DMA state is already saved; save off other arch state. */ save_arch_state(&prev->thread); @@ -519,6 +527,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) /* Enable interrupts; they are disabled again on return to caller. */ local_irq_enable(); +#ifdef CONFIG_KVM + /* + * Some work requires us to exit the VM first. Typically this + * allows the process running the VM to respond to the work + * (e.g. a signal), or allows the VM mechanism to latch + * modified host state (e.g. a "hypervisor" message sent to a + * different vcpu). It also means that if we are considering + * calling schedule(), we exit the VM first, so we never have + * to worry about context-switching into a VM. + */ + if (current_thread_info()->vcpu) { + u32 do_exit = thread_info_flags & + (_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT); + + if (thread_info_flags & _TIF_VIRT_EXIT) + clear_thread_flag(TIF_VIRT_EXIT); + if (do_exit) { + kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN); + /*NORETURN*/ + } + } +#endif + if (thread_info_flags & _TIF_NEED_RESCHED) { schedule(); return 1; @@ -538,11 +569,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) tracehook_notify_resume(regs); return 1; } - if (thread_info_flags & _TIF_SINGLESTEP) { + + /* Handle a few flags here that stay set. */ + if (thread_info_flags & _TIF_SINGLESTEP) single_step_once(regs); - return 0; - } - panic("work_pending: bad flags %#x\n", thread_info_flags); + + return 0; } unsigned long get_wchan(struct task_struct *p) diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S index 1c09a4f..02bc446 100644 --- a/arch/tile/kernel/relocate_kernel_64.S +++ b/arch/tile/kernel/relocate_kernel_64.S @@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel) addi sp, sp, -8 /* we now have a stack (whether we need one or not) */ +#ifdef RELOCATE_NEW_KERNEL_VERBOSE moveli r40, hw2_last(hv_console_putc) shl16insli r40, r40, hw1(hv_console_putc) shl16insli r40, r40, hw0(hv_console_putc) -#ifdef RELOCATE_NEW_KERNEL_VERBOSE moveli r0, 'r' jalr r40 @@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel) /* we should not get here */ +#ifdef RELOCATE_NEW_KERNEL_VERBOSE moveli r0, '?' 
jalr r40 moveli r0, '\n' jalr r40 +#endif j .Lhalt @@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel) j .Lloop -.Lerr: moveli r0, 'e' +.Lerr: +#ifdef RELOCATE_NEW_KERNEL_VERBOSE + moveli r0, 'e' jalr r40 moveli r0, 'r' jalr r40 @@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel) jalr r40 moveli r0, '\n' jalr r40 +#endif .Lhalt: moveli r41, hw2_last(hv_halt) shl16insli r41, r41, hw1(hv_halt) diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 774e819..2352a81 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc); /* * Determine for each controller where its lowmem is mapped and how much of * it is mapped there. On controller zero, the first few megabytes are - * already mapped in as code at MEM_SV_INTRPT, so in principle we could + * already mapped in as code at MEM_SV_START, so in principle we could * start our data mappings higher up, but for now we don't bother, to avoid * additional confusion. * @@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot) * SPRs, as well as the interrupt mask. */ __insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1); + +#ifdef CONFIG_KVM + /* + * If we launch a guest kernel, it will need some interrupts + * that otherwise are not used by the host or by userspace. + * Set them to MPL 1 now and leave them alone going forward; + * they are masked in the host so will never fire there anyway, + * and we mask them at PL1 as we exit the guest. + */ __insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1); + __insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1); + __insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1); + __insn_mtspr(SPR_MPL_IPI_1_SET_1, 1); +#endif /* Initialize IRQ support for this cpu. */ setup_irq_regs(); @@ -1242,7 +1255,7 @@ static void __init validate_va(void) #ifndef __tilegx__ /* FIXME: GX: probably some validation relevant here */ /* * Similarly, make sure we're only using allowed VAs. - * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT, + * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START, * and 0 .. KERNEL_HIGH_VADDR. * In addition, make sure we CAN'T use the end of memory, since * we use the last chunk of each pgd for the pgd_list. @@ -1257,7 +1270,7 @@ static void __init validate_va(void) if (range.size == 0) break; if (range.start <= MEM_USER_INTRPT && - range.start + range.size >= MEM_HV_INTRPT) + range.start + range.size >= MEM_HV_START) user_kernel_ok = 1; if (range.start == 0) max_va = range.size; @@ -1693,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved) static int __init request_standard_resources(void) { int i; - enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET }; + enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET }; #if defined(CONFIG_PCI) && !defined(__tilegx__) insert_non_bus_resource(); diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index 0ae1c59..62b3ba9 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -223,30 +223,34 @@ void __init ipi_init(void) #if CHIP_HAS_IPI() -void smp_send_reschedule(int cpu) +static void __smp_send_reschedule(int cpu) { - WARN_ON(cpu_is_offline(cpu)); - /* * We just want to do an MMIO store. The traditional writeq() * functions aren't really correct here, since they're always * directed at the PCI shim. For now, just do a raw store, - * casting away the __iomem attribute. + * casting away the __iomem attribute. 
We do the store as a + * single asm() instruction to ensure that we can force a step + * over it in the KVM case, if we are not binding vcpus to cpus, + * rather than require it to be possible to issue validly. */ - ((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0; + unsigned long *addr = + &((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE]; + asm volatile("st %0, zero" :: "r" (addr)); } #else -void smp_send_reschedule(int cpu) +static void __smp_send_reschedule(int cpu) { - HV_Coord coord; - - WARN_ON(cpu_is_offline(cpu)); - - coord.y = cpu_y(cpu); - coord.x = cpu_x(cpu); + HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) }; hv_trigger_ipi(coord, IRQ_RESCHEDULE); } #endif /* CHIP_HAS_IPI() */ + +void smp_send_reschedule(int cpu) +{ + WARN_ON(cpu_is_offline(cpu)); + __smp_send_reschedule(cpu); +} diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c index 24fd223..362284a 100644 --- a/arch/tile/kernel/stack.c +++ b/arch/tile/kernel/stack.c @@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt) p->sp >= sp) { if (kbt->verbose) pr_err(" <%s while in kernel mode>\n", fault); - } else if (EX1_PL(p->ex1) == USER_PL && + } else if (user_mode(p) && p->sp < PAGE_OFFSET && p->sp != 0) { if (kbt->verbose) pr_err(" <%s while in user mode>\n", fault); diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c index e25b0a8..024b978 100644 --- a/arch/tile/kernel/sysfs.c +++ b/arch/tile/kernel/sysfs.c @@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *page) { +#ifdef CONFIG_KVM_GUEST + return sprintf(page, "KVM\n"); +#else return sprintf(page, "tilera\n"); +#endif } static DEVICE_ATTR(type, 0444, type_show, NULL); diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c index 3c2dc87..b0b7264 100644 --- a/arch/tile/kernel/time.c +++ b/arch/tile/kernel/time.c @@ -117,9 +117,9 @@ void __init time_init(void) /* * Define the tile timer clock event device. The timer is driven by - * the TILE_TIMER_CONTROL register, which consists of a 31-bit down + * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down * counter, plus bit 31, which signifies that the counter has wrapped - * from zero to (2**31) - 1. The INT_TILE_TIMER interrupt will be + * from zero to (2**31) - 1. The INT_[AUX_]TILE_TIMER interrupt will be * raised as long as bit 31 is set. */ @@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks, struct clock_event_device *evt) { BUG_ON(ticks > MAX_TICK); - __insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks); - arch_local_irq_unmask_now(INT_TILE_TIMER); + __insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks); + arch_local_irq_unmask_now(INT_LINUX_TIMER); return 0; } @@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks, static void tile_timer_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) { - arch_local_irq_mask_now(INT_TILE_TIMER); + arch_local_irq_mask_now(INT_LINUX_TIMER); } static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = { @@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void) evt->cpumask = cpumask_of(smp_processor_id()); /* Start out with timer not firing. */ - arch_local_irq_mask_now(INT_TILE_TIMER); + arch_local_irq_mask_now(INT_LINUX_TIMER); /* * Register tile timer. 
Set min_delta to 1 microsecond, since @@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num) * Mask the timer interrupt here, since we are a oneshot timer * and there are now by definition no events pending. */ - arch_local_irq_mask(INT_TILE_TIMER); + arch_local_irq_mask(INT_LINUX_TIMER); /* Track time spent here in an interrupt context */ irq_enter(); diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index f110785..19d465c 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -30,7 +30,7 @@ void __init trap_init(void) { - /* Nothing needed here since we link code at .intrpt1 */ + /* Nothing needed here since we link code at .intrpt */ } int unaligned_fixup = 1; diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index c7ae53d..8b20163 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -5,7 +5,7 @@ #include /* Text loads starting from the supervisor interrupt vector address. */ -#define TEXT_OFFSET MEM_SV_INTRPT +#define TEXT_OFFSET MEM_SV_START OUTPUT_ARCH(tile) ENTRY(_start) @@ -13,7 +13,7 @@ jiffies = jiffies_64; PHDRS { - intrpt1 PT_LOAD ; + intrpt PT_LOAD ; text PT_LOAD ; data PT_LOAD ; } @@ -24,11 +24,11 @@ SECTIONS #define LOAD_OFFSET TEXT_OFFSET /* Interrupt vectors */ - .intrpt1 (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */ + .intrpt (LOAD_OFFSET) : AT ( 0 ) /* put at the start of physical memory */ { _text = .; - *(.intrpt1) - } :intrpt1 =0 + *(.intrpt) + } :intrpt =0 /* Hypervisor call vectors */ . = ALIGN(0x10000); diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig index 2298cb1..65f7f9d 100644 --- a/arch/tile/kvm/Kconfig +++ b/arch/tile/kvm/Kconfig @@ -27,9 +27,6 @@ config KVM This module provides access to the hardware capabilities through a character device node named /dev/kvm. - To compile this as a module, choose M here: the module - will be called kvm. - If unsure, say N. source drivers/vhost/Kconfig diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile new file mode 100644 index 0000000..2c3d206 --- /dev/null +++ b/arch/tile/kvm/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for Kernel-based Virtual Machine module +# + +ccflags-y := -Ivirt/kvm -Iarch/tile/kvm + +kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o) + +kvm-y += kvm-tile.o +kvm-y += entry.o + +obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S new file mode 100644 index 0000000..07aa3a6 --- /dev/null +++ b/arch/tile/kvm/entry.S @@ -0,0 +1,91 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include + +#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8) +#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 } +#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 } +#define FOR_EACH_CALLEE_SAVED_REG(f) \ + f(r30); f(r31); \ + f(r32); f(r33); f(r34); f(r35); f(r36); f(r37); f(r38); f(r39); \ + f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \ + f(r48); f(r49); f(r50); f(r51); f(r52); + +/* + * Called with interrupts disabled from kvm_tile_run() and is responsible + * just for saving the callee-save registers and the stack pointer, then + * resetting ksp0 so subsequent interrupts don't wipe the kernel stack. + * It uses restore_all in intvec_64.S to jump back into the guest. + * The kvm_vmexit function below undoes the stack manipulation. + */ +STD_ENTRY(kvm_vmresume) + /* Do function prolog and save callee-saves on stack. */ + { + move r10, sp + st sp, lr + } + { + addli r11, sp, -FRAME_SIZE + 8 + addli sp, sp, -FRAME_SIZE + } + { + st r11, r10 + addi r12, sp, 16 + } + FOR_EACH_CALLEE_SAVED_REG(SAVE_REG) + SAVE_REG(tp) + SAVE_REG(lr) + + /* Save frame pointer in thread_info so we can get it back later. */ + st r1, sp + + /* Set the ksp0 for this core to be below this frame. */ + mfspr r10, SPR_SYSTEM_SAVE_K_0 + bfins r10, sp, 0, CPU_SHIFT-1 + mtspr SPR_SYSTEM_SAVE_K_0, r10 + + /* sp points to ABI save area below pt_regs for restore_all. */ + addli sp, r0, -C_ABI_SAVE_AREA_SIZE + + /* Execute an "interrupt return" to the guest. */ + { + movei r30, 0 + j restore_all + } + STD_ENDPROC(kvm_vmresume) + +/* + * Called with interrupts disabled from kvm_trigger_vmexit(); returns with + * interrupts still disabled to kvm_vmresume()'s caller, discarding all the + * stack contents below the kvm_vmresume() frame. kvm_vmresume()'s caller + * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value. + */ +STD_ENTRY(kvm_vmexit) + { + move sp, r0 + addi r12, r0, 16 + } + FOR_EACH_CALLEE_SAVED_REG(LOAD_REG) + LOAD_REG(tp) + LOAD_REG(lr) + { + addli sp, sp, FRAME_SIZE + jrp lr + } + STD_ENDPROC(kvm_vmexit) diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c new file mode 100644 index 0000000..4c33991 --- /dev/null +++ b/arch/tile/kvm/kvm-tile.c @@ -0,0 +1,1581 @@ +/* + * Copyright 2013 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { NULL } +}; + +static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address) +{ + struct mm_struct *mm = kvm->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (kvm->arch.vpgd == NULL) + kvm->arch.vpgd = pgd_alloc(kvm->mm); + pgd = kvm->arch.vpgd + pgd_index(address); + pud = pud_alloc(mm, pgd, address); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + return NULL; + return pte_alloc_kernel(pmd, address); +} + +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + +/* FIXME: support huge pages. */ +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) +{ + unsigned long gpa, i; + + gpa = mem->guest_phys_addr; + for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE) + if (get_vpgd_pte(kvm, gpa) == NULL) + return -ENOMEM; + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + enum kvm_mr_change change) +{ + unsigned long gpa, address, pfn, i; + struct page *page[1]; + pte_t *ptep, *vptep; + + gpa = mem->guest_phys_addr; + address = mem->userspace_addr; + for (i = 0; i < mem->memory_size; + i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) { + vptep = get_vpgd_pte(kvm, gpa); + BUG_ON(vptep == NULL); + get_user_pages_fast(address, 1, 1, page); + pfn = page_to_pfn(page[0]); + ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn))); + *vptep = *ptep; + } +} + +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(kvm); +} + +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + return 0; +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq) +{ + if (irq < 0) + return -EINVAL; + + set_bit(irq, &vcpu->arch.ipi_events); + kvm_vcpu_kick(vcpu); + + return 0; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r = 0; + + switch (ioctl) { + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof(irq))) + goto out; + r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq); + if (r) + goto out; + r = 0; + break; + } + default: + r = -EINVAL; + } + +out: + return r; +} + +int kvm_dev_ioctl_check_extension(long ext) +{ + return 0; +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + return 0; +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + long r = -EINVAL; + + return r; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + 
return 0; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long page_size; + unsigned long gva = tr->linear_address; + unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa; + pud_t gpud; + pmd_t gpmd; + pte_t gpte; + + /* Get guest pgd (aka pud for three-level tables). */ + gpgd_gpa = vcpu->arch.guest_context.page_table + + (sizeof(pgd_t) * pgd_index(gva)); + if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0) + goto fail; + if (!pud_present(gpud)) + goto fail; + + /* Get guest pmd. */ + if (pud_huge_page(gpud)) { + /* FIXME: no super huge page support yet. */ + if (pte_super(*(pte_t *)&gpud)) + goto fail; + gpte = *(pte_t *)&gpud; + page_size = PGDIR_SIZE; + goto ok; + } + gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) + + (sizeof(pmd_t) * pmd_index(gva)); + if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0) + goto fail; + if (!pmd_present(gpmd)) + goto fail; + + /* Get guest pte. */ + if (pmd_huge_page(gpmd)) { + /* FIXME: no super huge page support yet. */ + if (pte_super(*(pte_t *)&gpmd)) + goto fail; + gpte = *(pte_t *)&gpmd; + page_size = PMD_SIZE; + goto ok; + } + gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) + + (sizeof(pte_t) * pte_index(gva)); + if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0) + goto fail; + if (!pte_present(gpte)) + goto fail; + + page_size = PAGE_SIZE; + +ok: + tr->physical_address = + PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1)); + tr->valid = 1; + tr->writeable = pte_write(gpte); + tr->usermode = pte_user(gpte); + + return 0; + +fail: + tr->valid = 0; + return 0; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + regs->regs = vcpu->arch.regs; + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu->arch.regs = regs->regs; + vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS; + return 0; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + *sregs = vcpu->arch.sregs; + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + vcpu->arch.sregs = *sregs; + return 0; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + return 0; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return 0; +} + +/* + * panic_hv() will dump stack info of both guest os and host os, and set + * proper exit reason so that qemu can terminate the guest process. + * + * FIXME: Probably KVM_EXIT_EXCEPTION? If using KVM_EXIT_EXCEPTION, + * current qemu process will "hang" (killable but Ctrl+C not working), + * so use KVM_EXIT_SHUTDOWN here temporarily. + */ +static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...) 
+{ + char panic_buf[256]; + struct pt_regs *regs; + va_list ap; + int i; + + va_start(ap, fmt); + vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap); + va_end(ap); + pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf); + + /* Show guest os info */ + regs = &vcpu->arch.regs; + for (i = 0; i < 17; i++) + pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n", + i, regs->regs[i], i+18, regs->regs[i+18], + i+36, regs->regs[i+36]); + pr_err(" r18: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n", + regs->regs[18], regs->regs[35], regs->tp); + pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr); + pr_err(" pc : "REGFMT" ex1: %ld faultnum: %ld\n", + regs->pc, regs->ex1, regs->faultnum); + + /* Show host os info */ + pr_err("\nKVM stack in the host:\n"); + dump_stack(); + + /* Shut down the guest os */ + pr_err("Shutting down guest.\n"); + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +/* Copied from virt/kvm/kvm_main.c */ +static int next_segment(unsigned long len, int offset) +{ + if (len > PAGE_SIZE - offset) + return PAGE_SIZE - offset; + else + return len; +} + +static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva, + void *data, unsigned long len) +{ + struct kvm *kvm = vcpu->kvm; + int seg; + int offset = offset_in_page(gva); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + struct kvm_translation tr; + tr.linear_address = gva; + kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (!tr.valid) + return -EFAULT; + ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address), + data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + gva += seg; + } + return 0; +} + +static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva, + const void *data, unsigned long len) +{ + struct kvm *kvm = vcpu->kvm; + int seg; + int offset = offset_in_page(gva); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + struct kvm_translation tr; + tr.linear_address = gva; + kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (!tr.valid) + return -EFAULT; + ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address), + data, offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + data += seg; + gva += seg; + } + return 0; +} + +static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long len) +{ + struct kvm *kvm = vcpu->kvm; + int seg; + int offset = offset_in_page(gva); + int ret; + + while ((seg = next_segment(len, offset)) != 0) { + struct kvm_translation tr; + tr.linear_address = gva; + kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (!tr.valid) + return -EFAULT; + ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address), + offset, seg); + if (ret < 0) + return ret; + offset = 0; + len -= seg; + gva += seg; + } + return 0; +} + +/* + * The following functions are emulation functions for various + * hypervisor system calls (i.e. hv_*()). Return value: + * 1 if the host os can emulate it completely. + * < 0 if errors occur and then qemu will handle them. + * 0 if qemu emulation is needed. + * In both the < 0 and the == 0 cases, exit reason should + * be set for qemu handling. + */ + +/* generic handler for hypercall which needs user (QEMU) to handle. 
*/ +static int kvm_deliver_to_user(struct kvm_vcpu *vcpu) +{ + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + return 0; +} + +/* handler for illegal hypercall */ +static int kvm_emulate_illegal(struct kvm_vcpu *vcpu) +{ + return panic_hv(vcpu, "Illegal kvm hypercall: %ld", + (unsigned long)vcpu->arch.regs.regs[10]); +} + +static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu) +{ + int version = vcpu->arch.regs.regs[0]; + int chip_num = vcpu->arch.regs.regs[1]; + int chip_rev_num = vcpu->arch.regs.regs[2]; + int client_pl = vcpu->arch.regs.regs[3]; + + if (client_pl != 1) + return panic_hv(vcpu, "Guest is requesting PL %d, but KVM" + " guests must request PL 1.\n" + "Reconfigure your guest with KVM_GUEST set.\n", + client_pl); + + if (version != HV_VERSION) + return panic_hv(vcpu, "Client built for hv version %d, but" + " this hv is version %d\n", + version, HV_VERSION); + + if (chip_num != TILE_CHIP) + return panic_hv(vcpu, "Client built for chip %d, but this" + " hardware is chip %d\n", + chip_num, TILE_CHIP); + + if (chip_rev_num != TILE_CHIP_REV) + return panic_hv(vcpu, "Client built for chip rev %d, but this" + " hardware is chip rev %d\n", + chip_rev_num, TILE_CHIP_REV); + + return 1; +} + +static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu) +{ + HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0]; + long rc; + + switch (query) { + case HV_SYSCONF_PAGE_SIZE_SMALL: + rc = PAGE_SIZE; + break; + + case HV_SYSCONF_PAGE_SIZE_LARGE: + rc = HPAGE_SIZE; + break; + + case HV_SYSCONF_VALID_PAGE_SIZES: +#if PAGE_SHIFT == 16 + rc = HV_CTX_PG_SM_64K; +#elif PAGE_SHIFT == 14 + rc = HV_CTX_PG_SM_16K; +#else +# error Fix hv_sysconf emulation for new page size +#endif + break; + + case HV_SYSCONF_PAGE_SIZE_JUMBO: + rc = 0; /* FIXME add super page support */ + break; + + case HV_SYSCONF_CPU_SPEED: + case HV_SYSCONF_CPU_TEMP: + case HV_SYSCONF_BOARD_TEMP: + rc = hv_sysconf(query); + break; + + default: + rc = -EINVAL; + break; + } + + vcpu->arch.regs.regs[0] = rc; + return 1; +} + +static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu) +{ + HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0]; + long buflen = vcpu->arch.regs.regs[2]; + char hvbuf[256]; + const char *p; + long rc; + + switch (query) { + + /* For hardware attributes, just pass to the hypervisor. */ + case HV_CONFSTR_BOARD_PART_NUM: + case HV_CONFSTR_BOARD_SERIAL_NUM: + case HV_CONFSTR_CHIP_SERIAL_NUM: + case HV_CONFSTR_BOARD_REV: + case HV_CONFSTR_CHIP_MODEL: + case HV_CONFSTR_BOARD_DESC: + case HV_CONFSTR_MEZZ_PART_NUM: + case HV_CONFSTR_MEZZ_SERIAL_NUM: + case HV_CONFSTR_MEZZ_REV: + case HV_CONFSTR_MEZZ_DESC: + case HV_CONFSTR_SWITCH_CONTROL: + case HV_CONFSTR_CHIP_REV: + case HV_CONFSTR_CPUMOD_PART_NUM: + case HV_CONFSTR_CPUMOD_SERIAL_NUM: + case HV_CONFSTR_CPUMOD_REV: + case HV_CONFSTR_CPUMOD_DESC: + rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf)); + if (rc > sizeof(hvbuf)) { + /* Not the best answer, but very unlikely anyway. */ + rc = sizeof(hvbuf); + hvbuf[sizeof(hvbuf)-1] = '\0'; + } + p = hvbuf; + break; + + /* For hypervisor version info, just report the kernel version. 
*/ + case HV_CONFSTR_HV_SW_VER: + p = UTS_RELEASE; + break; + case HV_CONFSTR_HV_CONFIG: + case HV_CONFSTR_HV_CONFIG_VER: + p = ""; + break; + + default: + rc = HV_EINVAL; + goto done; + } + + rc = strlen(p) + 1; /* include NUL */ + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1], + p, min(rc, buflen))) + rc = HV_EFAULT; + +done: + vcpu->arch.regs.regs[0] = rc; + return 1; +} + +static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu) +{ + HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0]; + struct rtc_time tm; + struct timeval tv; + + do_gettimeofday(&tv); + rtc_time_to_tm(tv.tv_sec, &tm); + hvtm->tm_sec = tm.tm_sec; + hvtm->tm_min = tm.tm_min; + hvtm->tm_hour = tm.tm_hour; + hvtm->tm_mday = tm.tm_mday; + hvtm->tm_mon = tm.tm_mon; + hvtm->tm_year = tm.tm_year; + hvtm->flags = 0; + + return 1; +} + +static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu) +{ + /* Do nothing here. */ + pr_warn("hv_set_rtc() will not work in kvm guest\n"); + return 1; +} + +static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu) +{ + int idx = vcpu->arch.regs.regs[0]; + HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0]; + + switch (idx) { + case 0: + var->start = 0UL; + var->size = 0x20000000000UL; + break; + case 1: + var->start = 0xFFFFFFFF80000000UL; + var->size = 0x80000000UL; + break; + default: + var->start = 0UL; + var->size = 0UL; + break; + } + + return 1; +} + +/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */ +static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu) +{ + int idx = vcpu->arch.regs.regs[0]; + HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0]; + + if (idx == 0) { + var->start = min_asid; + var->size = max_asid - min_asid + 1; + } else { + var->start = 0; + var->size = 0; + } + + return 1; +} + +static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu) +{ + HV_Topology *tp; + int cpus; + + /* Depends on the definition of struct HV_Topology */ + tp = (HV_Topology *)&vcpu->arch.regs.regs[0]; + + cpus = atomic_read(&vcpu->kvm->online_vcpus); + tp->coord.x = vcpu->vcpu_id; + tp->coord.y = 0; + tp->width = cpus; + tp->height = 1; + + return 1; +} + +static int xy_to_vcpu(struct kvm *kvm, int x, int y) +{ + if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus)) + return -1; + return x; +} + +/* + * The primary vcpu is the one that initially runs while the others + * all block. It is the only that is allowed to call hv_start_all_tiles(). + * The other cpus are secondary. 
+ */ +static bool is_secondary_vcpu(struct kvm_vcpu *vcpu) +{ + return vcpu->vcpu_id != 0; +} + +static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu) +{ + struct completion *c = &vcpu->kvm->arch.smp_start; + if (is_secondary_vcpu(vcpu) || completion_done(c)) + return panic_hv(vcpu, "start_all_tiles() called again"); + complete_all(c); + return 1; +} + +static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu) +{ + gpa_t gpa = vcpu->arch.regs.regs[0]; + HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1]; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + gfn = gpa_to_gfn(gpa); + pfn = gfn_to_pfn(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) + return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()", + gpa); + hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK); + + vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access); + + return 1; +} + +static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu) +{ + gpa_t gpa = vcpu->arch.regs.regs[0]; + HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1]; + uint64_t val = vcpu->arch.regs.regs[2]; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + gfn = gpa_to_gfn(gpa); + pfn = gfn_to_pfn(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) + return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()", + gpa); + hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK); + + hv_physaddr_write64(hpa, *access, val); + + return 1; +} + +static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu) +{ + /* Do we care about the argument msgstate? */ + vcpu->arch.regs.regs[0] = HV_OK; + + return 1; +} + +/* + * NOTE: we may coalesce multiple messages with the same tag to the + * same recepient. Currently the only messages used by Linux are + * start/stop cpu (where coalescing is OK), and the smp_call_function() + * IPI message tag. In the latter case we rely on the generic + * smp_call_function code to properly handle this, and since it only + * uses the IPI as a way to wake up the generic list-walking code, + * it's OK if we coalesce several IPI deliveries before the recipient + * core takes action. + */ +static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *vcpui; + HV_Recipient recip[NR_CPUS]; + HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0]; + int nrecip = vcpu->arch.regs.regs[1]; + int buflen = vcpu->arch.regs.regs[3]; + int sent, vcpu_id, tag; + + /* NOTE: we only support the Linux usage of buflen == sizeof(int). */ + if (unlikely(buflen != sizeof(int) || + nrecip >= atomic_read(&kvm->online_vcpus))) { + vcpu->arch.regs.regs[0] = HV_EINVAL; + return 1; + } + + /* Get the buf info */ + if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2], + &tag, sizeof(tag))) { + vcpu->arch.regs.regs[0] = HV_EFAULT; + return 1; + } + + /* Range-check the tag value. 
*/ + if (tag < 0 || tag >= MAX_MSG_TAG) { + vcpu->arch.regs.regs[0] = HV_EFAULT; + return 1; + } + + /* Get all the recipients */ + if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip, + nrecip * sizeof(HV_Recipient))) { + vcpu->arch.regs.regs[0] = HV_EFAULT; + return 1; + } + + for (sent = 0; sent < nrecip; sent++) { + if (recip[sent].state != HV_TO_BE_SENT) + continue; + vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y); + if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) { + recip[sent].state = HV_BAD_RECIP; + continue; + } + vcpui = kvm_get_vcpu(kvm, vcpu_id); + set_bit(tag, &vcpui->arch.pending_msgs); + kvm_vcpu_kick(vcpui); + recip[sent].state = HV_SENT; + } + + if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip, + nrecip * sizeof(HV_Recipient))) { + vcpu->arch.regs.regs[0] = HV_EFAULT; + return 1; + } + + vcpu->arch.regs.regs[0] = sent; + + return 1; +} + +static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu) +{ + HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0]; + int buflen = vcpu->arch.regs.regs[3]; + int tag; + + /* Currently we only support messages from other tiles. */ + rmi->source = HV_MSG_TILE; + + if (buflen <= sizeof(int)) { + rmi->msglen = HV_E2BIG; + return 1; + } + + tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG); + if (tag >= MAX_MSG_TAG) { + /* No more messages */ + rmi->msglen = 0; + return 1; + } + + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2], + &tag, sizeof(int))) { + rmi->msglen = HV_EFAULT; + return 1; + } + + /* + * This clear_bit could race with a set_bit as another core + * delivers a new smp_function_call to this core. However, + * the smp_function_call code will have set up the additional + * smp_function_call data on the kernel's list prior to + * raising the interrupt, so even if we lose the new + * interrupt due to the race, we still haven't dispatched + * to the original interrupt handler, and when we do, it + * will find both smp_function_calls waiting for it, so the + * race is harmless. This is consistent with the fact that + * the generic code is trying to support pretty much + * arbitrary architecture-dependent IPI semantics, so it + * is very conservative about what it assumes. + * + * Also note that we only clear_bit on the core that owns + * the mask, so there's no race condition caused by the + * find_first_bit above and the clear_bit here, since once + * a bit is found it will stay set until this point. + */ + clear_bit(tag, &vcpu->arch.pending_msgs); + rmi->msglen = sizeof(int); + return 1; +} + +static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu) +{ + HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0]; + + *ctx = hv_inquire_guest_context(); + + return 1; +} + +static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + HV_InqTileSet set = vcpu->arch.regs.regs[0]; + unsigned long gva = vcpu->arch.regs.regs[1]; + int length = vcpu->arch.regs.regs[2]; + struct cpumask mask = CPU_MASK_NONE; + int cpus, i, retval, bytes2copy, bytes2zero; + + switch (set) { + case HV_INQ_TILES_AVAIL: + case HV_INQ_TILES_HFH_CACHE: + case HV_INQ_TILES_LOTAR: + cpus = atomic_read(&kvm->online_vcpus); + for (i = 0; i < cpus; ++i) + cpumask_set_cpu(i, &mask); + break; + case HV_INQ_TILES_SHARED: + break; + default: + retval = HV_EINVAL; + goto done; + } + + bytes2copy = (length > sizeof(mask)) ? 
sizeof(mask) : length; + bytes2zero = length - bytes2copy; + + if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) { + retval = HV_EFAULT; + goto done; + } + + if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) { + retval = HV_EFAULT; + goto done; + } + + retval = HV_OK; +done: + vcpu->arch.regs.regs[0] = retval; + return 1; +} + +static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu) +{ + HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0]; + int pl = (int) vcpu->arch.regs.regs[1]; + struct kvm_vcpu *target_vcpu; + int vcpu_id; + + vcpu_id = vtarget.x; + if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 || + vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) { + vcpu->arch.regs.regs[0] = HV_EINVAL; + return 1; + } + + target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id); + if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2], + &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) { + vcpu->arch.regs.regs[0] = HV_EFAULT; + return 1; + } + + vcpu->arch.regs.regs[0] = HV_OK; + + return 1; +} + +struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa) +{ + struct kvm_vcpu *vcpui; + unsigned long idx; + + kvm_for_each_vcpu(idx, vcpui, kvm) + if (vcpui->arch.ipi_gpa == gpa) + return vcpui; + + return NULL; +} + +/* + * Most page faults will be downcall-ed from hv to and be handled directly + * by either guest os or host os. This function is used to handle the + * rest cases. + */ +static int handle_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_translation tr; + struct kvm_vcpu *ipi_vcpu; + + tr.linear_address = (__u64) vcpu->arch.fault_addr; + kvm_arch_vcpu_ioctl_translate(vcpu, &tr); + if (!tr.valid) + return 0; + + /* ipi PTE for rescheduling interrupt? */ + ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address); + if (!ipi_vcpu) + return 0; + + set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events); + kvm_vcpu_kick(ipi_vcpu); + + /* Juke the PC past the store instruction. */ + vcpu->arch.regs.pc += 8; + return 1; +} + +static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu) +{ + /* + * We do not expect this call in guest so far. At least guest os + * should just follow host os instead of *set*. Besides, + * hv_set_pte_super_shift() will not be called in guest os with + * current guest os setting. + */ + vcpu->arch.regs.regs[0] = HV_EINVAL; + + return 1; +} + +static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu) +{ + HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0]; + + hvss->new_speed = HV_EPERM; + hvss->end_cycle = 0; + hvss->delta_ns = 0; + + return 1; +} + +static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = { + HCALL_DEFS +}; + +static int kvm_handle_exit(struct kvm_vcpu *vcpu) +{ + unsigned long hcall_idx; + + switch (vcpu->run->exit_reason) { + case KVM_EXIT_HYPERCALL: + hcall_idx = vcpu->arch.regs.regs[10]; + if (unlikely(hcall_idx >= KVM_NUM_HCALLS || + hcall_handlers[hcall_idx] == NULL)) + return kvm_emulate_illegal(vcpu); + + /* Juke us past the swint0 when we return. */ + vcpu->arch.regs.pc += 8; + + return hcall_handlers[hcall_idx](vcpu); + + case KVM_EXIT_MMIO: + if (handle_mmio(vcpu)) + return 1; + return panic_hv(vcpu, "Out-of-bounds client memory access"); + + case KVM_EXIT_AGAIN: + return 1; + + default: + return 0; + } +} + +static void kvm_kick_func(void *info) +{ + struct kvm_vcpu *vcpu = info; + + /* If this is not the thread that we expect, just return. 
*/ + if (unlikely(vcpu->pid != get_task_pid(current, PIDTYPE_PID))) + return; + + /* Setting this flag will cause a vmexit instead of a vmresume. */ + set_thread_flag(TIF_VIRT_EXIT); +} + +/* Note this function has been a standard kvm interface in latest Linux. */ +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me, cpu; + + /* If it is waiting in kvm_vcpu_block(), wake it up. */ + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + + /* If we are kicking our own vcpu, make sure we vmexit. */ + if (vcpu == current_thread_info()->vcpu) { + set_thread_flag(TIF_VIRT_EXIT); + return; + } + + /* + * If the vcpu is running the guest, interrupt its cpu, + * causing it to vmexit by setting TIF_VIRT_EXIT. Note we can + * race with a guest already doing a vmexit, but that is benign. + */ + cpu = vcpu->cpu; + me = get_cpu(); + if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu)) + if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_call_function_single(cpu, kvm_kick_func, vcpu, 0); + put_cpu(); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_kick); + +/* + * Any interrupt that would normally be handled by the host at PL2 + * needs to be reassigned to the guest at PL1 as we enter. + * + * The TLB interrupts remain handled by the hypervisor and are downcalled + * to the appropriate host or guest as necessary. + * + * FIXME: We don't give the UDN interrupts for now; at some point we + * plan to allow an option to pin the vcpus and report the true + * geometry to the guest, at which point passing the UDN access would + * make sense. + * + * FIXME: For now we don't pass the profiling interrupts to the guest, + * and instead require profiling be run in the host; we should be able + * to support guest-level profiling pretty easily, but we need to + * think about whether there are vcpu migration issues there. + */ +static void kvm_grant_mpls(void) +{ + __insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1); + __insn_mtspr(SPR_MPL_ILL_SET_1, 1); + __insn_mtspr(SPR_MPL_GPV_SET_1, 1); + __insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1); + __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1); +} + +static void kvm_ungrant_mpls(void) +{ + __insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1); + __insn_mtspr(SPR_MPL_ILL_SET_2, 1); + __insn_mtspr(SPR_MPL_GPV_SET_2, 1); + __insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1); + __insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1); +} + +/* + * There is lots of state that is (for the non-virtualized case) held + * permanently in SPRs, or that is in any case not context-switched. + * The next two routines switch in and out all the SPR state. + * + * We try to fix the timer so that when we restart, we fix up the + * timer value so that will fire at the correct wall-clock time even + * if we have been scheduled out for a little bit. This may also + * mean we end up firing it immediately on return, and suffer a + * timer delay in the guest. 
+ */ +static void kvm_save_sprs(struct kvm_vcpu *vcpu) +{ + vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL); + vcpu->arch.vmexit_cycles = get_cycles(); + +#define SAVE_SPR(x) vcpu->arch.sregs.x = __insn_mfspr(SPR_ ## x) + FOR_EACH_GUEST_SPR(SAVE_SPR); +#undef SAVE_SPR +} + +static void kvm_restore_sprs(struct kvm_vcpu *vcpu) +{ + unsigned long count = vcpu->arch.timer_control; + unsigned long underflow = + (count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1; + unsigned long disabled = + (count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1; + + if (!disabled) { + unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles; + count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK; + underflow |= delta > count; + count -= delta; + count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK; + count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT); + } + __insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count); + +#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.sregs.x) + FOR_EACH_GUEST_SPR(RESTORE_SPR); +#undef RESTORE_SPR +} + +/* + * When entering the guest, we need to eliminate any PL0 translations + * that were in use by qemu, since the guest's PL0 translations will + * be different. We also flush PL1 translations in case there have + * been changes to the virtualization page table, etc. + * + * FIXME: Add a way to just flush PL0/PL1, or just flush below + * the host PAGE_OFFSET, or add vpid support, etc. + */ +static void kvm_guest_context_enter(struct kvm_vcpu *vcpu) +{ + HV_Context *ctx; + pgd_t *vpgdir; + pte_t *ptep; + int rc; + + /* Install virtualization context */ + vpgdir = vcpu->kvm->arch.vpgd; + BUG_ON(vpgdir == NULL); + ptep = virt_to_pte(NULL, (unsigned long)vpgdir); + rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0); + WARN_ON_ONCE(rc < 0); + + /* Install guest context */ + ctx = &vcpu->arch.guest_context; + rc = hv_install_guest_context(ctx->page_table, ctx->access, + ctx->asid, ctx->flags); + WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n", + ctx->page_table, ctx->access.val, + ctx->asid, ctx->flags, rc); + + hv_flush_all(0); +} + +/* + * De-install the virtualization context so we take faults below the + * host Linux PL in the normal manner going forward. + * + * We flush all the TLB mappings as we exit the guest, since the + * guest has been using the ASIDs as it pleases, and may have installed + * incompatible mappings for qemu's process as well. Note that we don't + * worry about host-PL interrupts that occur while the guest is running, + * on the assumption that such interrupts can't touch userspace + * addresses legally anyway. + * + * NOTE: we may want to add a hypervisor call to just flush mappings + * below PL2 and use that here instead. + */ +static void kvm_guest_context_exit(struct kvm_vcpu *vcpu) +{ + int rc; + + /* Remember guest context */ + vcpu->arch.guest_context = hv_inquire_guest_context(); + + /* Disable virtualization context */ + rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0); + WARN_ON_ONCE(rc < 0); + + /* Flush everything in the TLB. */ + hv_flush_all(0); +} + +static void kvm_inject_interrupts(struct kvm_vcpu *vcpu) +{ + /* + * Capture current set of ipi_events. We might race with + * another thread adding an event, but if so we'll just miss + * it on this go-around and see it next time. 
+ */ + vcpu->arch.sregs.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0); + + /* + * Note: We could set PC and EX1 for the guest os to jump + * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt + * is unmasked and the guest is not at PL1 with ICS set. + * But in fact it's about as fast to just set INTCTRL_1_STATUS + * here and then run the short INTCTRL_1 handler in the guest. + */ + vcpu->arch.sregs.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0); +} + +static void kvm_tile_run(struct kvm_vcpu *vcpu) +{ + struct thread_info *ti = current_thread_info(); + unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0); + + /* + * Disable interrupts while we set up the guest state. + * This way, if we race with another core trying to tell us + * to fix up our guest state, we will take the kick only as + * we actually try to enter the guest, and instead we will + * vmexit and end up retrying. + */ + local_irq_disable(); + kvm_guest_context_enter(vcpu); + clear_bit(KVM_REQ_KICK, &vcpu->requests); + ti->vcpu = vcpu; + vcpu->cpu = get_cpu(); + kvm_inject_interrupts(vcpu); + kvm_grant_mpls(); + kvm_restore_sprs(vcpu); + + /* Calling this function irets into the guest. */ + kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp); + + /* We resume here due to a call to kvm_vmexit. */ + __insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0); + + vcpu->cpu = -1; + put_cpu(); + ti->vcpu = NULL; + set_bit(KVM_REQ_KICK, &vcpu->requests); + vcpu->run->ready_for_interrupt_injection = 1; + kvm_ungrant_mpls(); + kvm_save_sprs(vcpu); + __insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL); + kvm_guest_context_exit(vcpu); + local_irq_enable(); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r = 1; + + while (r > 0) { + kvm_guest_enter(); + kvm_tile_run(vcpu); + kvm_guest_exit(); + + r = kvm_handle_exit(vcpu); + /* + * <0: error for userspace. + * =0: QEMU to handle. + * >0: host os can handle it fully. + */ + if (r <= 0) + break; + + if (signal_pending(current)) { + vcpu->run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + break; + } + +#ifdef CONFIG_HOMECACHE + if (current_thread_info()->homecache_cpu != + smp_processor_id()) { + /* Do homecache migration when returning to qemu. */ + vcpu->run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + break; + } +#endif + + kvm_resched(vcpu); + } + + return r; +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + sigset_t sigsaved; + + /* Secondary cpus must wait until they are told they can start. */ + if (vcpu->arch.suspended) { + struct completion *c = &vcpu->kvm->arch.smp_start; + if (wait_for_completion_interruptible(c)) + return -EINTR; + vcpu->arch.suspended = 0; + } + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + r = __vcpu_run(vcpu, kvm_run); + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return r; +} + +int kvm_arch_init(void *opaque) +{ + return 0; +} + +void kvm_arch_exit(void) +{ +} + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + unsigned long resv_gfn_start; + struct kvm_memory_slot *s; + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.resv_gpa_start) { + resv_gfn_start = 0; + + for (i = 0; i < KVM_USER_MEM_SLOTS; i++) { + s = &kvm->memslots->memslots[i]; + + if (!s->npages) + continue; + + if ((s->base_gfn + s->npages) > resv_gfn_start) + resv_gfn_start = s->base_gfn + s->npages; + } + + kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start); + } + + /* Initialize to enter fake PA=VA mode in hypervisor. 
*/ + vcpu->arch.guest_context.page_table = HV_CTX_NONE; + + vcpu->arch.ipi_gpa = + kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE); + vcpu->arch.ipi_gpte = + pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL); + + /* Mark the core suspended if it is not the boot cpu. */ + vcpu->arch.suspended = is_secondary_vcpu(vcpu); + + return 0; +} + +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + /* Notify simulator that this task handles this vcpu. */ + sim_set_vcpu(vcpu->vcpu_id); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + sim_clear_vcpu(); +} + +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +{ + /* FIXME: some archs set up a cache for these structs? */ + struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + int rc; + + if (!vcpu) + return ERR_PTR(-ENOMEM); + + rc = kvm_vcpu_init(vcpu, kvm, id); + if (rc) { + kfree(vcpu); + return ERR_PTR(rc); + } + + return vcpu; +} + +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + memset(&vcpu->arch.regs, 0, sizeof(struct pt_regs)); + memset(&vcpu->arch.sregs, 0, sizeof(struct pt_regs)); + vcpu->arch.sregs.IPI_MASK_1 = -1UL; + vcpu->arch.sregs.INTERRUPT_MASK_1 = -1UL; + vcpu->arch.sregs.INTERRUPT_VECTOR_BASE_1 = 0xfd000000; + return 0; +} + +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + return 0; +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_vcpu_uninit(vcpu); + kfree(vcpu); +} + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ + return kvm_arch_vcpu_destroy(vcpu); +} + +int kvm_arch_hardware_enable(void *garbage) +{ + return 0; +} + +void kvm_arch_hardware_disable(void *garbage) +{ +} + +int kvm_arch_hardware_setup(void) +{ + return 0; +} + +void kvm_arch_hardware_unsetup(void) +{ +} + +void kvm_arch_check_processor_compat(void *rtn) +{ +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return 0; +} + +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + if (type) + return -EINVAL; + + init_completion(&kvm->arch.smp_start); + return 0; +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + /* Seems to be unnecessary? */ + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); + + /* FIXME: release all the pmds and ptes as well! */ + if (kvm->arch.vpgd) + pgd_free(kvm->mm, kvm->arch.vpgd); +} + +void kvm_arch_sync_events(struct kvm *kvm) +{ +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return 0; +} + +/* Called from guest hv glue via swint0 traps. */ +void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num) +{ + /* Hypercalls are only valid from PL1. 
*/ + if (EX1_PL(regs->ex1) != 0) { + kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL); + /*NORETURN*/ + } + do_trap(regs, fault_num, 0); +} + +void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num, + unsigned long fault_addr, unsigned long write) +{ + struct kvm_vcpu *vcpu = current_thread_info()->vcpu; + BUG_ON(vcpu == NULL); + vcpu->arch.fault_addr = fault_addr; + kvm_trigger_vmexit(regs, KVM_EXIT_MMIO); + /*NORETURN*/ +} + +void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num) +{ + kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN); + /*NORETURN*/ +} + +void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason) +{ + struct kvm_vcpu *vcpu = current_thread_info()->vcpu; + vcpu->run->exit_reason = exit_reason; + vcpu->arch.regs = *regs; + vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS; + kvm_vmexit(vcpu->arch.host_sp); + /*NORETURN*/ +} + +static int __init kvm_tile_init(void) +{ + return kvm_init(NULL, sizeof(struct kvm_vcpu), + __alignof__(struct kvm_vcpu), THIS_MODULE); +} + +static void __exit kvm_tile_exit(void) +{ + kvm_exit(); +} + +module_init(kvm_tile_init); +module_exit(kvm_tile_exit); diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c index 82733c8..1590282 100644 --- a/arch/tile/lib/exports.c +++ b/arch/tile/lib/exports.c @@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic); /* hypervisor glue */ #include +EXPORT_SYMBOL(hv_confstr); +EXPORT_SYMBOL(hv_dev_close); EXPORT_SYMBOL(hv_dev_open); +EXPORT_SYMBOL(hv_dev_poll); +EXPORT_SYMBOL(hv_dev_poll_cancel); EXPORT_SYMBOL(hv_dev_pread); -EXPORT_SYMBOL(hv_dev_pwrite); EXPORT_SYMBOL(hv_dev_preada); +EXPORT_SYMBOL(hv_dev_pwrite); EXPORT_SYMBOL(hv_dev_pwritea); -EXPORT_SYMBOL(hv_dev_poll); -EXPORT_SYMBOL(hv_dev_poll_cancel); -EXPORT_SYMBOL(hv_dev_close); -EXPORT_SYMBOL(hv_sysconf); -EXPORT_SYMBOL(hv_confstr); +EXPORT_SYMBOL(hv_flush_all); EXPORT_SYMBOL(hv_get_rtc); +#ifdef __tilegx__ +EXPORT_SYMBOL(hv_inquire_guest_context); +EXPORT_SYMBOL(hv_install_guest_context); +EXPORT_SYMBOL(hv_install_virt_context); +#endif +EXPORT_SYMBOL(hv_physaddr_read64); +EXPORT_SYMBOL(hv_physaddr_write64); EXPORT_SYMBOL(hv_set_rtc); +EXPORT_SYMBOL(hv_sysconf); /* libgcc.a */ uint32_t __udivsi3(uint32_t dividend, uint32_t divisor); diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 23f044e..86cff48 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm) char *buf, *path; struct vm_area_struct *vma; +#ifndef CONFIG_KVM_GUEST /* see notify_sim_task_change() */ if (!sim_is_simulator()) +#endif return 1; if (mm->exe_file == NULL) diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 64eec3f..39c48cb 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs, flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | (write ? 
-	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+	is_kernel_mode = !user_mode(regs);
 
 	tsk = validate_current();
@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 	}
 
 #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	if (EX1_PL(regs->ex1) != USER_PL) {
+	if (!user_mode(regs)) {
 		struct async_tlb *async;
 		switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
 {
 	int cpu;
 	unsigned long page;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
 #if CHIP_HAS_CBOX_HOME_MAP()
 	/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	address = MEM_SV_INTRPT;
+	address = MEM_SV_START;
 	pmd = get_pmd(pgtables, address);
 	pfn = 0; /* code starts at PA 0 */
 	if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
 
 	/*
 	 * Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)
 
 	/*
 	 * Free the pages mapped from 0xc0000000 that correspond to code
-	 * pages from MEM_SV_INTRPT that we won't use again after init.
+	 * pages from MEM_SV_START that we won't use again after init.
 	 */
 	free_init_pages("unused kernel text",
 			(unsigned long)_sinittext - text_delta,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3004433..d6948d4 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
 
 #if CHIP_HAS_MMIO()
 
-/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
-void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
-			   pgprot_t home)
+void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+			 unsigned long flags, pgprot_t prot)
 {
 	void *addr;
 	struct vm_struct *area;
 	unsigned long offset, last_addr;
-	pgprot_t pgprot;
 
 	/* Don't allow wraparound or zero size */
 	last_addr = phys_addr + size - 1;
 	if (!size || last_addr < phys_addr)
 		return NULL;
 
-	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
-	pgprot = PAGE_KERNEL;
-	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
-	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
-
 	/*
 	 * Mappings have to be page-aligned
 	 */
@@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 	/*
 	 * Ok, go for it..
 	 */
-	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
+	area = get_vm_area(size, flags);
 	if (!area)
 		return NULL;
 	area->phys_addr = phys_addr;
 	addr = area->addr;
 	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, pgprot)) {
+			       phys_addr, prot)) {
 		free_vm_area(area);
 		return NULL;
 	}
-	return (__force void __iomem *) (offset + (char *)addr);
+	return (void *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(generic_remap_prot);
+
+/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+			   pgprot_t home)
+{
+	pgprot_t pgprot;
+	unsigned long flags;
+
+	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
+	pgprot = PAGE_KERNEL;
+	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
+	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
+	flags = VM_IOREMAP; /* | other flags? */
+
+	return (__force void __iomem *) generic_remap_prot(phys_addr,
+							   size, flags, pgprot);
 }
 EXPORT_SYMBOL(ioremap_prot);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08..b622337 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -171,6 +171,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_WATCHDOG         21
 #define KVM_EXIT_S390_TSCH        22
 #define KVM_EXIT_EPR              23
+#define KVM_EXIT_AGAIN            24
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..1b8a1f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1691,7 +1691,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 }
 
-#ifndef CONFIG_S390
+#if !defined(CONFIG_S390) && !defined(CONFIG_TILE)
 /*
  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
  */
@@ -1714,7 +1714,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	put_cpu();
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
-#endif /* !CONFIG_S390 */
+#endif
 
 void kvm_resched(struct kvm_vcpu *vcpu)
 {
@@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
 
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
+    defined(CONFIG_TILEGX)
 	/*
 	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
 	 * so vcpu_load() would break it.
-- 
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/