2016-04-24 20:23:22

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On Mon, Feb 01, 2016 at 10:38:48AM -0500, Boris Ostrovsky wrote:
> Start HVMlite guest at XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
> page, initialize boot_params, enable early page tables.
>
> Since this stub is executed before kernel entry point we cannot use
> variables in .bss which is cleared by kernel. We explicitly place
> variables that are initialized here into .data.
>
> Signed-off-by: Boris Ostrovsky <[email protected]>
> ---
> arch/x86/xen/Makefile | 1 +
> arch/x86/xen/enlighten.c | 86 +++++++++++++++++++++-
> arch/x86/xen/xen-hvmlite.S | 175 ++++++++++++++++++++++++++++++++++++++++++++
> include/xen/xen.h | 6 ++
> 4 files changed, 267 insertions(+), 1 deletions(-)
> create mode 100644 arch/x86/xen/xen-hvmlite.S
>
> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
> index e47e527..1d913d7 100644
> --- a/arch/x86/xen/Makefile
> +++ b/arch/x86/xen/Makefile
> @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
> obj-$(CONFIG_XEN_DOM0) += vga.o
> obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
> obj-$(CONFIG_XEN_EFI) += efi.o
> +obj-$(CONFIG_XEN_PVHVM) += xen-hvmlite.o
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 5774800..5f05fa2 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
> */
> DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
>
> -enum xen_domain_type xen_domain_type = XEN_NATIVE;
> +enum xen_domain_type xen_domain_type
> + __attribute__((section(".data"))) = XEN_NATIVE;
> EXPORT_SYMBOL_GPL(xen_domain_type);
>
> unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
> @@ -171,6 +172,17 @@ struct tls_descs {
> */
> static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
>
> +#ifdef CONFIG_XEN_PVHVM
> +/*
> + * HVMlite variables. These need to live in data segment since they are
> + * initialized before startup_{32|64}, which clear .bss, are invoked.

So this jumps into startup_32/64 and I don't think we have talked about
it yet, have we? I'm not aware of any threads about it. Are we fine with
it, are we not?

I think we need to agree on API where xen guests should jump into
arch/x86/ and adhere to it. Otherwise, we will break xen again if we change
stuff in x86 and we do like to change stuff in x86 all the time.

Adding tip guys and leaving in the rest for reference.

...



> + */
> +int xen_hvmlite __attribute__((section(".data"))) = 0;
> +struct hvm_start_info hvmlite_start_info __attribute__((section(".data")));
> +uint hvmlite_start_info_sz = sizeof(hvmlite_start_info);
> +struct boot_params xen_hvmlite_boot_params __attribute__((section(".data")));
> +#endif
> +
> static void clamp_max_cpus(void)
> {
> #ifdef CONFIG_SMP
> @@ -1731,6 +1743,78 @@ asmlinkage __visible void __init xen_start_kernel(void)
> #endif
> }
>
> +#ifdef CONFIG_XEN_PVHVM
> +static void __init hvmlite_bootparams(void)
> +{
> + struct xen_memory_map memmap;
> + int i;
> +
> + memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params));
> +
> + memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map);
> + set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map);
> + if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) {
> + xen_raw_console_write("XENMEM_memory_map failed\n");
> + BUG();
> + }
> +
> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr =
> + ISA_START_ADDRESS;
> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size =
> + ISA_END_ADDRESS - ISA_START_ADDRESS;
> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type =
> + E820_RESERVED;
> +
> + sanitize_e820_map(xen_hvmlite_boot_params.e820_map,
> + ARRAY_SIZE(xen_hvmlite_boot_params.e820_map),
> + &memmap.nr_entries);
> +
> + xen_hvmlite_boot_params.e820_entries = memmap.nr_entries;
> + for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++)
> + e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr,
> + xen_hvmlite_boot_params.e820_map[i].size,
> + xen_hvmlite_boot_params.e820_map[i].type);
> +
> + xen_hvmlite_boot_params.hdr.cmd_line_ptr =
> + hvmlite_start_info.cmdline_paddr;
> +
> + /* The first module is always ramdisk */
> + if (hvmlite_start_info.nr_modules) {
> + struct hvm_modlist_entry *modaddr =
> + __va(hvmlite_start_info.modlist_paddr);
> + xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr;
> + xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size;
> + }
> +
> + /*
> + * See Documentation/x86/boot.txt.
> + *
> + * Version 2.12 supports Xen entry point but we will use default x86/PC
> + * environment (i.e. hardware_subarch 0).
> + */
> + xen_hvmlite_boot_params.hdr.version = 0x212;
> + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
> +}
> +
> +/*
> + * This routine (and those that it might call) should not use
> + * anything that lives in .bss since that segment will be cleared later
> + */
> +void __init xen_prepare_hvmlite(void)
> +{
> + u32 eax, ecx, edx, msr;
> + u64 pfn;
> +
> + xen_hvmlite = 1;
> +
> + cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
> + pfn = __pa(hypercall_page);
> + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> + hvmlite_bootparams();
> +}
> +#endif
> +
> void __ref xen_hvm_init_shared_info(void)
> {
> int cpu;
> diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S
> new file mode 100644
> index 0000000..fc7c08c
> --- /dev/null
> +++ b/arch/x86/xen/xen-hvmlite.S
> @@ -0,0 +1,175 @@
> +/*
> + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> + .code32
> + .text
> +#define _pa(x) ((x) - __START_KERNEL_map)
> +
> +#include <linux/elfnote.h>
> +#include <linux/init.h>
> +#include <linux/linkage.h>
> +#include <asm/segment.h>
> +#include <asm/asm.h>
> +#include <asm/boot.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr.h>
> +#include <xen/interface/elfnote.h>
> +
> + __HEAD
> + .code32
> +
> +/* Entry point for HVMlite guests */
> +ENTRY(hvmlite_start_xen)
> + cli
> + cld
> +
> + mov $_pa(gdt), %eax
> + lgdt (%eax)
> +
> + movl $(__BOOT_DS),%eax
> + movl %eax,%ds
> + movl %eax,%es
> + movl %eax,%ss
> +
> + /* Stash hvm_start_info */
> + mov $_pa(hvmlite_start_info), %edi
> + mov %ebx, %esi
> + mov $_pa(hvmlite_start_info_sz), %ecx
> + mov (%ecx), %ecx
> + rep
> + movsb
> +
> + movl $_pa(early_stack_end), %eax
> + movl %eax, %esp
> +
> + /* Enable PAE mode */
> + movl %cr4, %eax
> + orl $X86_CR4_PAE, %eax
> + movl %eax, %cr4
> +
> +#ifdef CONFIG_X86_64
> + /* Enable Long mode */
> + movl $MSR_EFER, %ecx
> + rdmsr
> + btsl $_EFER_LME, %eax
> + wrmsr
> +
> + /* Enable pre-constructed page tables */
> + mov $_pa(init_level4_pgt), %eax
> + movl %eax, %cr3
> + movl $(X86_CR0_PG | X86_CR0_PE), %eax
> + movl %eax, %cr0
> +
> + /* Jump to 64-bit mode. */
> + pushl $__KERNEL_CS
> + leal _pa(1f), %eax
> + pushl %eax
> + lret
> +
> + /* 64-bit entry point */
> + .code64
> +1:
> + call xen_prepare_hvmlite
> +
> + /* startup_64 expects boot_params in %rsi */
> + mov $_pa(xen_hvmlite_boot_params), %rsi
> + movq $_pa(startup_64), %rax
> + jmp *%rax
> +
> +#else /* CONFIG_X86_64 */
> +
> + /* Clear boot page tables */
> + movl $_pa(early_pgtable), %edi
> + xorl %eax, %eax
> + movl $((PAGE_SIZE*5)/4), %ecx
> + rep stosl
> +
> + /* Level 3 */
> + movl $_pa(early_pgtable), %edi
> + leal (PAGE_SIZE +_PAGE_PRESENT)(%edi), %eax
> + movl $4, %ecx
> +1:
> + movl %eax, 0x00(%edi)
> + addl $8, %edi
> + decl %ecx
> + jnz 1b
> +
> + /* Level 2 (2M entries) */
> + movl $(_pa(early_pgtable) + PAGE_SIZE), %edi
> + movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax
> + movl $2048, %ecx
> +2:
> + movl %eax, 0(%edi)
> + addl $0x00200000, %eax
> + addl $8, %edi
> + decl %ecx
> + jnz 2b
> +
> + /* Enable the boot paging */
> + movl $_pa(early_pgtable), %eax
> + movl %eax, %cr3
> + movl %cr0, %eax
> + orl $(X86_CR0_PG | X86_CR0_PE), %eax
> + movl %eax, %cr0
> +
> + ljmp $__BOOT_CS,$3f
> +3:
> + call xen_prepare_hvmlite
> + mov $_pa(xen_hvmlite_boot_params), %esi
> +
> + /* startup_32 doesn't expect paging and PAE to be on */
> + ljmp $__BOOT_CS,$_pa(4f)
> +4:
> + movl %cr0, %eax
> + andl $~X86_CR0_PG, %eax
> + movl %eax, %cr0
> + movl %cr4, %eax
> + andl $~X86_CR4_PAE, %eax
> + movl %eax, %cr4
> +
> + ljmp $0x10, $_pa(startup_32)
> +#endif
> +
> + .data
> +gdt:
> + .word gdt_end - gdt
> + .long _pa(gdt)
> + .word 0
> + .quad 0x0000000000000000 /* NULL descriptor */
> +#ifdef CONFIG_X86_64
> + .quad 0x00af9a000000ffff /* __KERNEL_CS */
> +#else
> + .quad 0x00cf9a000000ffff /* __KERNEL_CS */
> +#endif
> + .quad 0x00cf92000000ffff /* __KERNEL_DS */
> +gdt_end:
> +
> + .bss
> + .balign 4
> +early_stack:
> + .fill 16, 1, 0
> +early_stack_end:
> +
> +#ifdef CONFIG_X86_32
> + .section ".pgtable","a",@nobits
> + .balign 4096
> +early_pgtable:
> + .fill 5*4096, 1, 0
> +#endif
> +
> + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
> + _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map))
> diff --git a/include/xen/xen.h b/include/xen/xen.h
> index 0c0e3ef..6a0d3f3 100644
> --- a/include/xen/xen.h
> +++ b/include/xen/xen.h
> @@ -29,6 +29,12 @@ extern enum xen_domain_type xen_domain_type;
> #define xen_initial_domain() (0)
> #endif /* CONFIG_XEN_DOM0 */
>
> +#ifdef CONFIG_XEN_PVHVM
> +extern int xen_hvmlite;
> +#else
> +#define xen_hvmlite (0)
> +#endif
> +
> #ifdef CONFIG_XEN_PVH
> /* This functionality exists only for x86. The XEN_PVHVM support exists
> * only in x86 world - hence on ARM it will be always disabled.
> --
> 1.7.1
>
>

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.


2016-04-25 13:23:24

by Boris Ostrovsky

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On 04/24/2016 04:23 PM, Borislav Petkov wrote:
> On Mon, Feb 01, 2016 at 10:38:48AM -0500, Boris Ostrovsky wrote:
>> Start HVMlite guest at XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
>> page, initialize boot_params, enable early page tables.
>>
>> Since this stub is executed before kernel entry point we cannot use
>> variables in .bss which is cleared by kernel. We explicitly place
>> variables that are initialized here into .data.
>>
>> Signed-off-by: Boris Ostrovsky <[email protected]>
>> ---
>> arch/x86/xen/Makefile | 1 +
>> arch/x86/xen/enlighten.c | 86 +++++++++++++++++++++-
>> arch/x86/xen/xen-hvmlite.S | 175 ++++++++++++++++++++++++++++++++++++++++++++
>> include/xen/xen.h | 6 ++
>> 4 files changed, 267 insertions(+), 1 deletions(-)
>> create mode 100644 arch/x86/xen/xen-hvmlite.S
>>
>> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
>> index e47e527..1d913d7 100644
>> --- a/arch/x86/xen/Makefile
>> +++ b/arch/x86/xen/Makefile
>> @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
>> obj-$(CONFIG_XEN_DOM0) += vga.o
>> obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
>> obj-$(CONFIG_XEN_EFI) += efi.o
>> +obj-$(CONFIG_XEN_PVHVM) += xen-hvmlite.o
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index 5774800..5f05fa2 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
>> */
>> DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
>>
>> -enum xen_domain_type xen_domain_type = XEN_NATIVE;
>> +enum xen_domain_type xen_domain_type
>> + __attribute__((section(".data"))) = XEN_NATIVE;
>> EXPORT_SYMBOL_GPL(xen_domain_type);
>>
>> unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
>> @@ -171,6 +172,17 @@ struct tls_descs {
>> */
>> static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
>>
>> +#ifdef CONFIG_XEN_PVHVM
>> +/*
>> + * HVMlite variables. These need to live in data segment since they are
>> + * initialized before startup_{32|64}, which clear .bss, are invoked.
> So this jumps into startup_32/64 and I don't think we have talked about
> it yet, have we? I'm not aware of any threads about it. Are we fine with
> it, are we not?
>
> I think we need to agree on API where xen guests should jump into
> arch/x86/ and adhere to it. Otherwise, we will break xen again if we change
> stuff in x86 and we do like to change stuff in x86 all the time.
>
> Adding tip guys and leaving in the rest for reference.

I was following Documentation/x86/boot.txt (plus comments in code
preceding those two routines) which I considered to be the API.

We are supposed to come to startup_32 with paging off and %esi pointing
to boot_params. For 64-bit paging is on, %rsi points to zero-page. Plus
certain requirements on segment registers and interrupts being disabled.
(I just noticed that for 32-bit, %ebp, %edi and %ebx are supposed to be
zero, so I'll need to do that.)

-boris

>
> ...
>
>
>
>> + */
>> +int xen_hvmlite __attribute__((section(".data"))) = 0;
>> +struct hvm_start_info hvmlite_start_info __attribute__((section(".data")));
>> +uint hvmlite_start_info_sz = sizeof(hvmlite_start_info);
>> +struct boot_params xen_hvmlite_boot_params __attribute__((section(".data")));
>> +#endif
>> +
>> static void clamp_max_cpus(void)
>> {
>> #ifdef CONFIG_SMP
>> @@ -1731,6 +1743,78 @@ asmlinkage __visible void __init xen_start_kernel(void)
>> #endif
>> }
>>
>> +#ifdef CONFIG_XEN_PVHVM
>> +static void __init hvmlite_bootparams(void)
>> +{
>> + struct xen_memory_map memmap;
>> + int i;
>> +
>> + memset(&xen_hvmlite_boot_params, 0, sizeof(xen_hvmlite_boot_params));
>> +
>> + memmap.nr_entries = ARRAY_SIZE(xen_hvmlite_boot_params.e820_map);
>> + set_xen_guest_handle(memmap.buffer, xen_hvmlite_boot_params.e820_map);
>> + if (HYPERVISOR_memory_op(XENMEM_memory_map, &memmap)) {
>> + xen_raw_console_write("XENMEM_memory_map failed\n");
>> + BUG();
>> + }
>> +
>> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].addr =
>> + ISA_START_ADDRESS;
>> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries].size =
>> + ISA_END_ADDRESS - ISA_START_ADDRESS;
>> + xen_hvmlite_boot_params.e820_map[memmap.nr_entries++].type =
>> + E820_RESERVED;
>> +
>> + sanitize_e820_map(xen_hvmlite_boot_params.e820_map,
>> + ARRAY_SIZE(xen_hvmlite_boot_params.e820_map),
>> + &memmap.nr_entries);
>> +
>> + xen_hvmlite_boot_params.e820_entries = memmap.nr_entries;
>> + for (i = 0; i < xen_hvmlite_boot_params.e820_entries; i++)
>> + e820_add_region(xen_hvmlite_boot_params.e820_map[i].addr,
>> + xen_hvmlite_boot_params.e820_map[i].size,
>> + xen_hvmlite_boot_params.e820_map[i].type);
>> +
>> + xen_hvmlite_boot_params.hdr.cmd_line_ptr =
>> + hvmlite_start_info.cmdline_paddr;
>> +
>> + /* The first module is always ramdisk */
>> + if (hvmlite_start_info.nr_modules) {
>> + struct hvm_modlist_entry *modaddr =
>> + __va(hvmlite_start_info.modlist_paddr);
>> + xen_hvmlite_boot_params.hdr.ramdisk_image = modaddr->paddr;
>> + xen_hvmlite_boot_params.hdr.ramdisk_size = modaddr->size;
>> + }
>> +
>> + /*
>> + * See Documentation/x86/boot.txt.
>> + *
>> + * Version 2.12 supports Xen entry point but we will use default x86/PC
>> + * environment (i.e. hardware_subarch 0).
>> + */
>> + xen_hvmlite_boot_params.hdr.version = 0x212;
>> + xen_hvmlite_boot_params.hdr.type_of_loader = 9; /* Xen loader */
>> +}
>> +
>> +/*
>> + * This routine (and those that it might call) should not use
>> + * anything that lives in .bss since that segment will be cleared later
>> + */
>> +void __init xen_prepare_hvmlite(void)
>> +{
>> + u32 eax, ecx, edx, msr;
>> + u64 pfn;
>> +
>> + xen_hvmlite = 1;
>> +
>> + cpuid(xen_cpuid_base() + 2, &eax, &msr, &ecx, &edx);
>> + pfn = __pa(hypercall_page);
>> + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
>> +
>> + hvmlite_bootparams();
>> +}
>> +#endif
>> +
>> void __ref xen_hvm_init_shared_info(void)
>> {
>> int cpu;
>> diff --git a/arch/x86/xen/xen-hvmlite.S b/arch/x86/xen/xen-hvmlite.S
>> new file mode 100644
>> index 0000000..fc7c08c
>> --- /dev/null
>> +++ b/arch/x86/xen/xen-hvmlite.S
>> @@ -0,0 +1,175 @@
>> +/*
>> + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> + .code32
>> + .text
>> +#define _pa(x) ((x) - __START_KERNEL_map)
>> +
>> +#include <linux/elfnote.h>
>> +#include <linux/init.h>
>> +#include <linux/linkage.h>
>> +#include <asm/segment.h>
>> +#include <asm/asm.h>
>> +#include <asm/boot.h>
>> +#include <asm/processor-flags.h>
>> +#include <asm/msr.h>
>> +#include <xen/interface/elfnote.h>
>> +
>> + __HEAD
>> + .code32
>> +
>> +/* Entry point for HVMlite guests */
>> +ENTRY(hvmlite_start_xen)
>> + cli
>> + cld
>> +
>> + mov $_pa(gdt), %eax
>> + lgdt (%eax)
>> +
>> + movl $(__BOOT_DS),%eax
>> + movl %eax,%ds
>> + movl %eax,%es
>> + movl %eax,%ss
>> +
>> + /* Stash hvm_start_info */
>> + mov $_pa(hvmlite_start_info), %edi
>> + mov %ebx, %esi
>> + mov $_pa(hvmlite_start_info_sz), %ecx
>> + mov (%ecx), %ecx
>> + rep
>> + movsb
>> +
>> + movl $_pa(early_stack_end), %eax
>> + movl %eax, %esp
>> +
>> + /* Enable PAE mode */
>> + movl %cr4, %eax
>> + orl $X86_CR4_PAE, %eax
>> + movl %eax, %cr4
>> +
>> +#ifdef CONFIG_X86_64
>> + /* Enable Long mode */
>> + movl $MSR_EFER, %ecx
>> + rdmsr
>> + btsl $_EFER_LME, %eax
>> + wrmsr
>> +
>> + /* Enable pre-constructed page tables */
>> + mov $_pa(init_level4_pgt), %eax
>> + movl %eax, %cr3
>> + movl $(X86_CR0_PG | X86_CR0_PE), %eax
>> + movl %eax, %cr0
>> +
>> + /* Jump to 64-bit mode. */
>> + pushl $__KERNEL_CS
>> + leal _pa(1f), %eax
>> + pushl %eax
>> + lret
>> +
>> + /* 64-bit entry point */
>> + .code64
>> +1:
>> + call xen_prepare_hvmlite
>> +
>> + /* startup_64 expects boot_params in %rsi */
>> + mov $_pa(xen_hvmlite_boot_params), %rsi
>> + movq $_pa(startup_64), %rax
>> + jmp *%rax
>> +
>> +#else /* CONFIG_X86_64 */
>> +
>> + /* Clear boot page tables */
>> + movl $_pa(early_pgtable), %edi
>> + xorl %eax, %eax
>> + movl $((PAGE_SIZE*5)/4), %ecx
>> + rep stosl
>> +
>> + /* Level 3 */
>> + movl $_pa(early_pgtable), %edi
>> + leal (PAGE_SIZE +_PAGE_PRESENT)(%edi), %eax
>> + movl $4, %ecx
>> +1:
>> + movl %eax, 0x00(%edi)
>> + addl $8, %edi
>> + decl %ecx
>> + jnz 1b
>> +
>> + /* Level 2 (2M entries) */
>> + movl $(_pa(early_pgtable) + PAGE_SIZE), %edi
>> + movl $(_PAGE_PSE | _PAGE_RW | _PAGE_PRESENT), %eax
>> + movl $2048, %ecx
>> +2:
>> + movl %eax, 0(%edi)
>> + addl $0x00200000, %eax
>> + addl $8, %edi
>> + decl %ecx
>> + jnz 2b
>> +
>> + /* Enable the boot paging */
>> + movl $_pa(early_pgtable), %eax
>> + movl %eax, %cr3
>> + movl %cr0, %eax
>> + orl $(X86_CR0_PG | X86_CR0_PE), %eax
>> + movl %eax, %cr0
>> +
>> + ljmp $__BOOT_CS,$3f
>> +3:
>> + call xen_prepare_hvmlite
>> + mov $_pa(xen_hvmlite_boot_params), %esi
>> +
>> + /* startup_32 doesn't expect paging and PAE to be on */
>> + ljmp $__BOOT_CS,$_pa(4f)
>> +4:
>> + movl %cr0, %eax
>> + andl $~X86_CR0_PG, %eax
>> + movl %eax, %cr0
>> + movl %cr4, %eax
>> + andl $~X86_CR4_PAE, %eax
>> + movl %eax, %cr4
>> +
>> + ljmp $0x10, $_pa(startup_32)
>> +#endif
>> +
>> + .data
>> +gdt:
>> + .word gdt_end - gdt
>> + .long _pa(gdt)
>> + .word 0
>> + .quad 0x0000000000000000 /* NULL descriptor */
>> +#ifdef CONFIG_X86_64
>> + .quad 0x00af9a000000ffff /* __KERNEL_CS */
>> +#else
>> + .quad 0x00cf9a000000ffff /* __KERNEL_CS */
>> +#endif
>> + .quad 0x00cf92000000ffff /* __KERNEL_DS */
>> +gdt_end:
>> +
>> + .bss
>> + .balign 4
>> +early_stack:
>> + .fill 16, 1, 0
>> +early_stack_end:
>> +
>> +#ifdef CONFIG_X86_32
>> + .section ".pgtable","a",@nobits
>> + .balign 4096
>> +early_pgtable:
>> + .fill 5*4096, 1, 0
>> +#endif
>> +
>> + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
>> + _ASM_PTR (hvmlite_start_xen - __START_KERNEL_map))
>> diff --git a/include/xen/xen.h b/include/xen/xen.h
>> index 0c0e3ef..6a0d3f3 100644
>> --- a/include/xen/xen.h
>> +++ b/include/xen/xen.h
>> @@ -29,6 +29,12 @@ extern enum xen_domain_type xen_domain_type;
>> #define xen_initial_domain() (0)
>> #endif /* CONFIG_XEN_DOM0 */
>>
>> +#ifdef CONFIG_XEN_PVHVM
>> +extern int xen_hvmlite;
>> +#else
>> +#define xen_hvmlite (0)
>> +#endif
>> +
>> #ifdef CONFIG_XEN_PVH
>> /* This functionality exists only for x86. The XEN_PVHVM support exists
>> * only in x86 world - hence on ARM it will be always disabled.
>> --
>> 1.7.1
>>
>>

2016-04-25 13:47:56

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On Mon, Apr 25, 2016 at 09:21:27AM -0400, Boris Ostrovsky wrote:
> I was following Documentation/x86/boot.txt (plus comments in code preceding
> those two routines) which I considered to be the API.
>
> We are supposed to come to startup_32 with paging off and %esi pointing to
> boot_params. For 64-bit paging is on, %rsi points to zero-page.

So the entry points which are ABI and the ones I believe you're talking
about are in arch/x86/boot/compressed/head_64.S. But you have all this
stuff laid out in arch/x86/xen/ and I don't see you using the entry
points in boot/compressed/. It looks more to me that you're using the
ones in arch/x86/kernel/head_{32,64}.S after decompression.

Or am I missing something?

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.

2016-04-25 13:55:16

by Boris Ostrovsky

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On 04/25/2016 09:47 AM, Borislav Petkov wrote:
> On Mon, Apr 25, 2016 at 09:21:27AM -0400, Boris Ostrovsky wrote:
>> I was following Documentation/x86/boot.txt (plus comments in code preceding
>> those two routines) which I considered to be the API.
>>
>> We are supposed to come to startup_32 with paging off and %esi pointing to
>> boot_params. For 64-bit paging is on, %rsi points to zero-page.
> So the entry points which are ABI and the ones I believe you're talking
> about are in arch/x86/boot/compressed/head_64.S. But you have all this
> stuff laid out in arch/x86/xen/ and I don't see you using the entry
> points in boot/compressed/. It looks more to me that you're using the
> ones in arch/x86/kernel/head_{32,64}.S after decompression.

Yes, those. We don't do anything in arch/x86/boot/compressed; the hypervisor
loads vmlinuX at the entry point specified by XEN_ELFNOTE_PHYS32_ENTRY
(which is hvmlite_start_xen).


-boris

2016-04-25 14:11:50

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On Mon, Apr 25, 2016 at 09:54:37AM -0400, Boris Ostrovsky wrote:
> Yes, those.

I don't think the ones in arch/x86/kernel/head_{32,64}.S are ABI.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.

2016-04-25 14:42:52

by Boris Ostrovsky

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On 04/25/2016 10:11 AM, Borislav Petkov wrote:
> On Mon, Apr 25, 2016 at 09:54:37AM -0400, Boris Ostrovsky wrote:
>> Yes, those.
> I don't think the ones in arch/x86/kernel/head_{32,64}.S are ABI.
>

Hmm... I thought that everything specified in boot.txt was ABI.

I don't think we can jump to compressed code from vmlinux.

-boris


2016-04-25 15:22:15

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On Mon, Apr 25, 2016 at 10:42:15AM -0400, Boris Ostrovsky wrote:
> Hmm... I thought that everything specified in boot.txt was ABI.

But those are not there.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.

2016-04-25 15:49:04

by Boris Ostrovsky

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On 04/25/2016 11:22 AM, Borislav Petkov wrote:
> On Mon, Apr 25, 2016 at 10:42:15AM -0400, Boris Ostrovsky wrote:
>> Hmm... I thought that everything specified in boot.txt was ABI.
> But those are not there.
>


https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/x86/boot.txt#n1060

and

https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/x86/boot.txt#n1096

is what I was referring to.

-boris

2016-04-25 17:24:09

by David Vrabel

[permalink] [raw]
Subject: Re: [Xen-devel] [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On 24/04/16 21:23, Borislav Petkov wrote:
> On Mon, Feb 01, 2016 at 10:38:48AM -0500, Boris Ostrovsky wrote:
>> Start HVMlite guest at XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
>> page, initialize boot_params, enable early page tables.
>>
>> Since this stub is executed before kernel entry point we cannot use
>> variables in .bss which is cleared by kernel. We explicitly place
>> variables that are initialized here into .data.
>>
>> Signed-off-by: Boris Ostrovsky <[email protected]>
>> ---
>> arch/x86/xen/Makefile | 1 +
>> arch/x86/xen/enlighten.c | 86 +++++++++++++++++++++-
>> arch/x86/xen/xen-hvmlite.S | 175 ++++++++++++++++++++++++++++++++++++++++++++
>> include/xen/xen.h | 6 ++
>> 4 files changed, 267 insertions(+), 1 deletions(-)
>> create mode 100644 arch/x86/xen/xen-hvmlite.S
>>
>> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
>> index e47e527..1d913d7 100644
>> --- a/arch/x86/xen/Makefile
>> +++ b/arch/x86/xen/Makefile
>> @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
>> obj-$(CONFIG_XEN_DOM0) += vga.o
>> obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
>> obj-$(CONFIG_XEN_EFI) += efi.o
>> +obj-$(CONFIG_XEN_PVHVM) += xen-hvmlite.o
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index 5774800..5f05fa2 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -118,7 +118,8 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
>> */
>> DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
>>
>> -enum xen_domain_type xen_domain_type = XEN_NATIVE;
>> +enum xen_domain_type xen_domain_type
>> + __attribute__((section(".data"))) = XEN_NATIVE;
>> EXPORT_SYMBOL_GPL(xen_domain_type);
>>
>> unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
>> @@ -171,6 +172,17 @@ struct tls_descs {
>> */
>> static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
>>
>> +#ifdef CONFIG_XEN_PVHVM
>> +/*
>> + * HVMlite variables. These need to live in data segment since they are
>> + * initialized before startup_{32|64}, which clear .bss, are invoked.
>
> So this jumps into startup_32/64 and I don't think we have talked about
> it yet, have we? I'm not aware of any threads about it. Are we fine with
> it, are we not?
>
> I think we need to agree on API where xen guests should jump into
> arch/x86/ and adhere to it. Otherwise, we will break xen again if we change
> stuff in x86 and we do like to change stuff in x86 all the time.
>
> Adding tip guys and leaving in the rest for reference.

It would be good if we could start in the decompressor, but we would
need to be able to:

a) identify that the image is PVH-capable.
b) call a PVH specific entry point that builds the expected struct
boot_params.

I don't see any scope in the existing boot protocol to allow this. Hence
we get Xen to decompress the image and look at ELF notes etc.

We want PVH to be a drop-in replacement for PV as much as possible so
this excludes using a bootloader or post-processing the bzImage into a
PVH-compatible ELF image.

I'm open to other suggestions, but what's proposed here seems the least
intrusive option, with minimal risk of future breakage. I don't think the
decompressor-to-kernel ABI changes often, does it?

David

2016-04-26 10:53:23

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 02/11] xen/hvmlite: Bootstrap HVMlite guest

On Mon, Apr 25, 2016 at 11:48:19AM -0400, Boris Ostrovsky wrote:
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/x86/boot.txt#n1096
>
> is what I was referring to.

Right, so reportedly those two weren't meant to be entry points
initially but stuff is using them (think of boot loaders and kexec, for
example) which makes them effectively such.

So I guess having one more user wouldn't change anything.

However, I'd like to document that fact and make them explicit, see
below.

Btw, that boot.txt file could use some serious scrubbing, but that's
for another day.

Btw 2, that "start address of loaded 64-bit kernel plus 0x200" is
simply wrong. The 0x200 offset is for the boot/compressed/ version of
startup_64:

arch/x86/boot/compressed/head_64.S:
...

.code64
.org 0x200
ENTRY(startup_64)

---
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 9da6f3512249..69ed95784085 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -1053,9 +1053,9 @@ described in zero-page.txt.
After setting up the struct boot_params, the boot loader can load the
32/64-bit kernel in the same way as that of 16-bit boot protocol.

-In 32-bit boot protocol, the kernel is started by jumping to the
-32-bit kernel entry point, which is the start address of loaded
-32/64-bit kernel.
+In 32-bit boot protocol, the kernel is started by jumping to the 32-bit
+kernel entry point (arch/x86/kernel/head_32.S::startup_32), which is the
+start address of loaded 32/64-bit kernel.

At entry, the CPU must be in 32-bit protected mode with paging
disabled; a GDT must be loaded with the descriptors for selectors
@@ -1089,9 +1089,9 @@ After setting up the struct boot_params, the boot loader can load
64-bit kernel in the same way as that of 16-bit boot protocol, but
kernel could be loaded above 4G.

-In 64-bit boot protocol, the kernel is started by jumping to the
-64-bit kernel entry point, which is the start address of loaded
-64-bit kernel plus 0x200.
+In 64-bit boot protocol, the kernel is started by jumping to the 64-bit
+kernel entry point (arch/x86/kernel/head_64.S::startup_64), which is the
+start address of loaded 64-bit kernel.

At entry, the CPU must be in 64-bit mode with paging enabled.
The range with setup_header.init_size from start address of loaded

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.