2007-02-09 09:12:21

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 0/10] lguest

This patch series is against 2.6.20; some things are in flux, so there
might be issues as other things flow into the latest -git tree.

From the documentation:

Lguest is designed to be a minimal hypervisor for the Linux kernel, for
Linux developers and users to experiment with virtualization with the
minimum of complexity. Nonetheless, it should have sufficient
features to make it useful for specific tasks, and, of course, you are
encouraged to fork and enhance it.

Features:

- Kernel module which runs in a normal kernel.
- Simple I/O model for communication.
- Simple program to create new guests.
- Logo contains cute puppies: http://lguest.ozlabs.org

Developer features:

- Fun to hack on.
- No ABI: being tied to a specific kernel anyway, you can change
anything.
- Many opportunities for improvement or feature implementation.

Cheers!
Rusty


2007-02-09 09:15:06

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 1/10] lguest: Don't rely on last-linked fallthru when no paravirt handler

The current code simply calls "start_kernel" directly if we're under a
hypervisor and no paravirt_ops backend wants us, because paravirt.c
registers that as a backend and it's linked last.

This was always a vain hope; start_kernel won't get far without setup.
It's also impossible for paravirt_ops backends which don't sit in the
arch/i386/kernel directory: they can't link before paravirt.o anyway.

This implements a real fallthrough if we pass all the registered
paravirt probes.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,8 +39,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
-
-# Make sure this is linked after any other paravirt_ops structs: see head.S
obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional
===================================================================
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -502,10 +502,11 @@ startup_paravirt:
pushl %ecx
pushl %eax

- /* paravirt.o is last in link, and that probe fn never returns */
pushl $__start_paravirtprobe
1:
movl 0(%esp), %eax
+ cmpl $__stop_paravirtprobe, %eax
+ je unhandled_paravirt
pushl (%eax)
movl 8(%esp), %eax
call *(%esp)
@@ -517,6 +518,12 @@ 1:

addl $4, (%esp)
jmp 1b
+
+unhandled_paravirt:
+ /* Nothing wanted us: try to die with dignity (impossible trap). */
+ movl $0x1F, %edx
+ pushl $0
+ jmp early_fault
#endif

/*
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -481,9 +481,6 @@ static int __init print_banner(void)
return 0;
}
core_initcall(print_banner);
-
-/* We simply declare start_kernel to be the paravirt probe of last resort. */
-paravirt_probe(start_kernel);

struct paravirt_ops paravirt_ops = {
.name = "bare hardware",


2007-02-09 09:16:14

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 2/10] lguest: Export symbols for lguest as a module

lguest does some fairly lowlevel things to support a host, which
normal modules don't need:

math_state_restore:
When the guest triggers a Device Not Available fault, we need
to be able to restore the FPU

tsc_khz:
Simplest way of telling the guest how to interpret the TSC
counter.

__put_task_struct:
We need to hold a reference to another task for inter-guest
I/O, and put_task_struct() is an inline function which calls
__put_task_struct.

access_process_vm:
We need to access another task for inter-guest I/O.

map_vm_area & __get_vm_area:
We need to map the switcher shim (i.e. monitor) at 0xFFC01000.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1054,6 +1054,7 @@ asmlinkage void math_state_restore(void)
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
}
+EXPORT_SYMBOL_GPL(math_state_restore);

#ifndef CONFIG_MATH_EMULATION

===================================================================
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -475,3 +475,4 @@ static int __init init_tsc_clocksource(v
}

module_init(init_tsc_clocksource);
+EXPORT_SYMBOL_GPL(tsc_khz);
===================================================================
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -126,6 +126,7 @@ void __put_task_struct(struct task_struc
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL_GPL(__put_task_struct);

void __init fork_init(unsigned long mempages)
{
===================================================================
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2692,3 +2692,4 @@ int access_process_vm(struct task_struct

return buf - old_buf;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
===================================================================
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -159,6 +159,7 @@ int map_vm_area(struct vm_struct *area,
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
+EXPORT_SYMBOL_GPL(map_vm_area);

static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
@@ -237,6 +238,7 @@ struct vm_struct *__get_vm_area(unsigned
{
return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL);
}
+EXPORT_SYMBOL_GPL(__get_vm_area);

/**
* get_vm_area - reserve a contingous kernel virtual area


2007-02-09 09:18:08

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 3/10] lguest: Expose get_futex_key, get_key_refs and drop_key_refs.

Name: Expose get_futex_key, get_key_refs and drop_key_refs.

lguest uses the convenient futex infrastructure for inter-domain I/O,
so expose get_futex_key, get_key_refs (renamed get_futex_key_refs) and
drop_key_refs (renamed drop_futex_key_refs). This also means we need to
expose the union that these use.

No code changes.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -100,6 +100,35 @@ extern int
extern int
handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);

+/*
+ * Futexes are matched on equal values of this key.
+ * The key type depends on whether it's a shared or private mapping.
+ * Don't rearrange members without looking at hash_futex().
+ *
+ * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
+ * We set bit 0 to indicate if it's an inode-based key.
+ */
+union futex_key {
+ struct {
+ unsigned long pgoff;
+ struct inode *inode;
+ int offset;
+ } shared;
+ struct {
+ unsigned long address;
+ struct mm_struct *mm;
+ int offset;
+ } private;
+ struct {
+ unsigned long word;
+ void *ptr;
+ int offset;
+ } both;
+};
+int get_futex_key(u32 __user *uaddr, union futex_key *key);
+void get_futex_key_refs(union futex_key *key);
+void drop_futex_key_refs(union futex_key *key);
+
#ifdef CONFIG_FUTEX
extern void exit_robust_list(struct task_struct *curr);
extern void exit_pi_state_list(struct task_struct *curr);
===================================================================
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,37 +48,12 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
+#include <linux/module.h>
#include <asm/futex.h>

#include "rtmutex_common.h"

#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
-
-/*
- * Futexes are matched on equal values of this key.
- * The key type depends on whether it's a shared or private mapping.
- * Don't rearrange members without looking at hash_futex().
- *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
-union futex_key {
- struct {
- unsigned long pgoff;
- struct inode *inode;
- int offset;
- } shared;
- struct {
- unsigned long address;
- struct mm_struct *mm;
- int offset;
- } private;
- struct {
- unsigned long word;
- void *ptr;
- int offset;
- } both;
-};

/*
* Priority Inheritance state:
@@ -175,7 +150,7 @@ static inline int match_futex(union fute
*
* Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uad
}
return err;
}
+EXPORT_SYMBOL_GPL(get_futex_key);

/*
* Take a reference to the resource addressed by a key.
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uad
* NOTE: mmap_sem MUST be held between get_futex_key() and calling this
* function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
-static inline void get_key_refs(union futex_key *key)
+inline void get_futex_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -263,12 +239,13 @@ static inline void get_key_refs(union fu
atomic_inc(&key->private.mm->mm_count);
}
}
+EXPORT_SYMBOL_GPL(get_futex_key_refs);

/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held.
*/
-static void drop_key_refs(union futex_key *key)
+void drop_futex_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_ke
mmdrop(key->private.mm);
}
}
+EXPORT_SYMBOL_GPL(drop_futex_key_refs);

static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
@@ -871,7 +849,7 @@ static int futex_requeue(u32 __user *uad
this->lock_ptr = &hb2->lock;
}
this->key = key2;
- get_key_refs(&key2);
+ get_futex_key_refs(&key2);
drop_count++;

if (ret - nr_wake >= nr_requeue)
@@ -884,9 +862,9 @@ out_unlock:
if (hb1 != hb2)
spin_unlock(&hb2->lock);

- /* drop_key_refs() must be called outside the spinlocks. */
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
- drop_key_refs(&key1);
+ drop_futex_key_refs(&key1);

out:
up_read(&current->mm->mmap_sem);
@@ -904,7 +882,7 @@ queue_lock(struct futex_q *q, int fd, st

init_waitqueue_head(&q->waiters);

- get_key_refs(&q->key);
+ get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;

@@ -923,7 +901,7 @@ queue_unlock(struct futex_q *q, struct f
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}

/*
@@ -978,7 +956,7 @@ static int unqueue_me(struct futex_q *q)
ret = 1;
}

- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
return ret;
}

@@ -997,7 +975,7 @@ static void unqueue_me_pi(struct futex_q

spin_unlock(&hb->lock);

- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}

static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)



2007-02-09 09:19:02

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 4/10] lguest: Initialize esp0 properly all the time

Whenever we schedule, __switch_to calls load_esp0 which does:

tss->esp0 = thread->esp0;

This is never initialized for the initial thread (ie "swapper"), so
when we're scheduling that, we end up setting esp0 to 0. This is
fine: the swapper never leaves ring 0, so this field is never used.

lguest, however, gets upset that we're trying to use an unmapped page
as our kernel stack. Rather than work around it there, let's
initialize it.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -421,6 +421,7 @@ struct thread_struct {
};

#define INIT_THREAD { \
+ .esp0 = sizeof(init_stack) + (long)&init_stack, \
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \


2007-02-09 09:20:12

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 5/10] Make hvc_console.c compile on non-PowerPC

There's a really nice console helper (esp. for virtual console
drivers) in drivers/char/hvc_console.c. It has only ever been used
for PowerPC, though, so it uses NO_IRQ which is only defined there.

Let's fix that so it's more widely useful. By, say, lguest.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -48,6 +48,10 @@
#define HVC_MINOR 0

#define TIMEOUT (10)
+
+#ifndef NO_IRQ
+#define NO_IRQ 0
+#endif

/*
* Wait this long per iteration while trying to push buffered data to the


2007-02-09 10:09:47

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2/10] lguest: Export symbols for lguest as a module

On Friday 09 February 2007 10:15, Rusty Russell wrote:

> tsc_khz:
> Simplest way of telling the guest how to interpret the TSC
> counter.


Are you sure this will work with varying TSC frequencies?

In general you should get this from cpufreq.

-Andi

2007-02-09 10:09:46

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/10] lguest: Don't rely on last-linked fallthru when no paravirt handler

On Friday 09 February 2007 10:14, Rusty Russell wrote:

> +unhandled_paravirt:
> + /* Nothing wanted us: try to die with dignity (impossible trap). */
> + movl $0x1F, %edx
> + pushl $0
> + jmp early_fault

Please print a real message with early_printk


-Andi

2007-02-09 10:56:19

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 6a/10] lguest: Config and headers

[ Part 6 was too big, so posting in four parts ]

Unfortunately, we don't have the build infrastructure for "private"
asm-offsets.h files, so there's a not-so-neat include in
arch/i386/kernel/asm-offsets.c.

The four headers are:
asm/lguest.h:
Things the guest needs to know (hypercall numbers, etc).
asm/lguest_device.h:
Things lguest devices need to know (lguest bus registration)
asm/lguest_user.h:
Things that the lguest userspace utility needs (/dev/lguest
and some devices)
arch/i386/lguest/lg.h:
Internal header for the lg module (which consists of 8 files).

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -226,6 +226,27 @@ config ES7000_CLUSTERED_APIC
depends on SMP && X86_ES7000 && MPENTIUMIII

source "arch/i386/Kconfig.cpu"
+
+config LGUEST
+ tristate "Linux hypervisor example code"
+ depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
+ select LGUEST_GUEST
+ select HVC_DRIVER
+ ---help---
+ This is a very simple module which allows you to run
+ multiple instances of the same Linux kernel, using the
+ "lguest" command found in the Documentation/lguest directory.
+ Note that "lguest" is pronounced to rhyme with "fell quest",
+ not "rustyvisor". See Documentation/lguest/lguest.txt.
+
+ If unsure, say N. If curious, say M. If masochistic, say Y.
+
+config LGUEST_GUEST
+ bool
+ help
+ The guest needs code built-in, even if the host has lguest
+ support as a module. The drivers are tiny, so we build them
+ in too.

config HPET_TIMER
bool "HPET Timer Support"
===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -16,6 +16,10 @@
#include <asm/thread_info.h>
#include <asm/elf.h>
#include <asm/pda.h>
+#ifdef CONFIG_LGUEST_GUEST
+#include <asm/lguest.h>
+#include "../lguest/lg.h"
+#endif

#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -111,4 +115,19 @@ void foo(void)
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
#endif
+
+#ifdef CONFIG_LGUEST_GUEST
+ BLANK();
+ OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+ OFFSET(LGUEST_STATE_host_stackptr, lguest_state, host.stackptr);
+ OFFSET(LGUEST_STATE_host_pgdir, lguest_state, host.pgdir);
+ OFFSET(LGUEST_STATE_host_gdt, lguest_state, host.gdt);
+ OFFSET(LGUEST_STATE_host_idt, lguest_state, host.idt);
+ OFFSET(LGUEST_STATE_regs, lguest_state, regs);
+ OFFSET(LGUEST_STATE_gdt, lguest_state, gdt);
+ OFFSET(LGUEST_STATE_idt, lguest_state, idt);
+ OFFSET(LGUEST_STATE_gdt_table, lguest_state, gdt_table);
+ OFFSET(LGUEST_STATE_trapnum, lguest_state, regs.trapnum);
+ OFFSET(LGUEST_STATE_errcode, lguest_state, regs.errcode);
+#endif
}
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest.h
@@ -0,0 +1,86 @@
+/* Things the lguest guest needs to know. */
+#ifndef _ASM_LGUEST_H
+#define _ASM_LGUEST_H
+
+#define LGUEST_MAGIC_EBP 0x4C687970
+#define LGUEST_MAGIC_EDI 0x652D4D65
+#define LGUEST_MAGIC_ESI 0xFFFFFFFF
+
+#define LHCALL_FLUSH_ASYNC 0
+#define LHCALL_LGUEST_INIT 1
+#define LHCALL_CRASH 2
+#define LHCALL_LOAD_GDT 3
+#define LHCALL_NEW_PGTABLE 4
+#define LHCALL_FLUSH_TLB 5
+#define LHCALL_LOAD_IDT_ENTRY 6
+#define LHCALL_SET_STACK 7
+#define LHCALL_TS 8
+#define LHCALL_TIMER_READ 9
+#define LHCALL_TIMER_START 10
+#define LHCALL_HALT 11
+#define LHCALL_GET_WALLCLOCK 12
+#define LHCALL_BIND_DMA 13
+#define LHCALL_SEND_DMA 14
+#define LHCALL_SET_PTE 15
+#define LHCALL_SET_UNKNOWN_PTE 16
+#define LHCALL_SET_PUD 17
+#define LHCALL_LOAD_TLS 18
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+static inline unsigned long
+hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+ : "=a"(call)
+ : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
+ : "memory");
+ return call;
+}
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+#define LGUEST_IRQS 32
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+ u32 eax, edx, ebx, ecx;
+};
+
+/* All the good stuff happens here: guest registers it with LGUEST_INIT */
+struct lguest_data
+{
+/* Fields which change during running: */
+ /* 512 == enabled (same as eflags) */
+ unsigned int irq_enabled;
+ /* Blocked interrupts. */
+ DECLARE_BITMAP(interrupts, LGUEST_IRQS);
+
+ /* Last (userspace) address we got a GPF & reloaded gs. */
+ unsigned int gs_gpf_eip;
+
+ /* Virtual address of page fault. */
+ unsigned long cr2;
+
+ /* Async hypercall ring. 0xFF == done, 0 == pending. */
+ u8 hcall_status[LHCALL_RING_SIZE];
+ struct hcall_ring hcalls[LHCALL_RING_SIZE];
+
+/* Fields initialized by the hypervisor at boot: */
+ /* Memory not to try to access */
+ unsigned long reserve_mem;
+ /* ID of this guest (used by network driver to set ethernet address) */
+ u16 guestid;
+ /* Multiplier for TSC clock. */
+ u32 clock_mult;
+
+/* Fields initialized by the guest at boot: */
+ /* Instruction range to suppress interrupts even if enabled */
+ unsigned long noirq_start, noirq_end;
+};
+extern struct lguest_data lguest_data;
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+#endif /* _ASM_LGUEST_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_device.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_LGUEST_DEVICE_H
+#define _ASM_LGUEST_DEVICE_H
+/* Everything you need to know about lguest devices. */
+#include <linux/device.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+
+struct lguest_device {
+ /* Unique busid, and index into lguest_page->devices[] */
+ /* By convention, each device can use irq index+1 if it wants to. */
+ unsigned int index;
+
+ struct device dev;
+
+ /* Driver can hang data off here. */
+ void *private;
+};
+
+struct lguest_driver {
+ const char *name;
+ struct module *owner;
+ u16 device_type;
+ int (*probe)(struct lguest_device *dev);
+ void (*remove)(struct lguest_device *dev);
+
+ struct device_driver drv;
+};
+
+extern int register_lguest_driver(struct lguest_driver *drv);
+extern void unregister_lguest_driver(struct lguest_driver *drv);
+#endif /* _ASM_LGUEST_DEVICE_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_user.h
@@ -0,0 +1,86 @@
+#ifndef _ASM_LGUEST_USER
+#define _ASM_LGUEST_USER
+/* Everything the "lguest" userspace program needs to know. */
+/* They can register up to 32 arrays of lguest_dma. */
+#define LGUEST_MAX_DMA 32
+/* At most we can dma 16 lguest_dma in one op. */
+#define LGUEST_MAX_DMA_SECTIONS 16
+
+/* How many devices? Assume each one wants up to two dma arrays per device. */
+#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
+
+struct lguest_dma
+{
+ /* 0 if free to be used, filled by hypervisor. */
+ u32 used_len;
+ u32 addr[LGUEST_MAX_DMA_SECTIONS];
+ u16 len[LGUEST_MAX_DMA_SECTIONS];
+};
+
+/* This is found at address 0. */
+struct lguest_boot_info
+{
+ u32 max_pfn;
+ u32 initrd_size;
+ char cmdline[256];
+};
+
+struct lguest_block_page
+{
+ /* 0 is a read, 1 is a write. */
+ int type;
+ u32 sector; /* Offset in device = sector * 512. */
+ u32 bytes; /* Length expected to be read/written in bytes */
+ /* 0 = pending, 1 = done, 2 = done, error */
+ int result;
+ u32 num_sectors; /* Disk length = num_sectors * 512 */
+};
+
+/* There is a shared page of these. */
+struct lguest_net
+{
+ union {
+ unsigned char mac[6];
+ struct {
+ u8 promisc;
+ u8 pad;
+ u16 guestid;
+ };
+ };
+};
+
+/* lguest_device_desc->type */
+#define LGUEST_DEVICE_T_CONSOLE 1
+#define LGUEST_DEVICE_T_NET 2
+#define LGUEST_DEVICE_T_BLOCK 3
+
+/* lguest_device_desc->status. 256 and above are device specific. */
+#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */
+#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */
+#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */
+#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */
+#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */
+#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */
+
+#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */
+#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */
+
+/* We have a page of these descriptors in the lguest_device page. */
+struct lguest_device_desc {
+ u16 type;
+ u16 features;
+ u16 status;
+ u16 num_pages;
+ u32 pfn;
+};
+
+/* Write command first word is a request. */
+enum lguest_req
+{
+ LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
+ LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
+ LHREQ_IRQ, /* + irq */
+};
+
+
+#endif /* _ASM_LGUEST_USER */
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lg.h
@@ -0,0 +1,253 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include <asm/desc.h>
+/* 64k ought to be enough for anybody! */
+#define HYPERVISOR_SIZE 65536
+#define HYPERVISOR_PAGES (HYPERVISOR_SIZE/PAGE_SIZE)
+
+#define GDT_ENTRY_LGUEST_CS 10
+#define GDT_ENTRY_LGUEST_DS 11
+#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <linux/binfmts.h>
+#include <linux/futex.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/semaphore.h>
+#include "irq_vectors.h"
+
+#define GUEST_DPL 1
+
+struct lguest_regs
+{
+ /* Manually saved part. */
+ u32 cr3;
+ u32 ebx, ecx, edx;
+ u32 esi, edi, ebp;
+ u32 gs;
+ u32 eax;
+ u32 fs, ds, es;
+ u32 trapnum, errcode;
+ /* Trap pushed part */
+ u32 eip;
+ u32 cs;
+ u32 eflags;
+ u32 esp;
+ u32 ss;
+};
+
+__exit void free_pagetables(void);
+__init int init_pagetables(struct page *hype_pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})
+
+/* Simplified version of IDT. */
+struct host_trap
+{
+ unsigned long addr;
+ int disable_interrupts;
+};
+
+struct lguest_dma_info
+{
+ struct list_head list;
+ union futex_key key;
+ unsigned long dmas;
+ u16 next_dma;
+ u16 num_dmas;
+ u16 guestid;
+ u8 interrupt; /* 0 when not registered */
+};
+
+struct pgdir
+{
+ u32 cr3;
+ u32 *pgdir;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+ struct lguest_state *state;
+ struct lguest_data __user *lguest_data;
+ struct task_struct *tsk;
+ struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */
+ u16 guestid;
+ u32 pfn_limit;
+ u32 page_offset;
+ u32 cr2;
+ int timer_on;
+ int halted;
+ int ts;
+ u32 gpf_eip;
+ u32 last_timer;
+ u32 next_hcall;
+ u16 tls_limits[GDT_ENTRY_TLS_ENTRIES];
+
+ /* We keep a small number of these. */
+ u32 pgdidx;
+ struct pgdir pgdirs[4];
+ void *trap_page;
+
+ /* Cached wakeup: we hold a reference to this task. */
+ struct task_struct *wake;
+
+ unsigned long noirq_start, noirq_end;
+ int dma_is_pending;
+ unsigned long pending_dma; /* struct lguest_dma */
+ unsigned long pending_addr; /* address they're sending to */
+
+ unsigned int stack_pages;
+
+ struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+ /* Dead? */
+ const char *dead;
+
+ /* We intercept page fault (demand shadow paging & cr2 saving)
+ protection fault (in/out emulation, TLS handling) and
+ device not available (TS handling). */
+ struct host_trap page_trap, gpf_trap, fpu_trap;
+
+ /* Virtual interrupts */
+ DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+ struct host_trap interrupt[LGUEST_IRQS];
+};
+
+extern struct page *hype_pages; /* Contiguous pages. */
+extern struct lguest lguests[];
+extern struct semaphore lguest_lock;
+
+/* core.c: */
+/* Entry points in hypervisor */
+const unsigned long *__lguest_default_idt_entries(void);
+struct lguest_state *__lguest_states(void);
+u32 lhread_u32(struct lguest *lg, u32 addr);
+void lhwrite_u32(struct lguest *lg, u32 val, u32 addr);
+void lhread(struct lguest *lg, void *buf, u32 addr, unsigned bytes);
+void lhwrite(struct lguest *lg, u32 addr, const void *buf, unsigned bytes);
+int lguest_address_ok(const struct lguest *lg, unsigned long addr);
+int run_guest(struct lguest *lg, char *__user user);
+int find_free_guest(void);
+
+/* interrupts_and_traps.c: */
+void maybe_do_interrupt(struct lguest *lg);
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err);
+void check_bug_kill(struct lguest *lg);
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+
+/* segments.c: */
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num);
+void guest_load_tls(struct lguest *lg,
+ const struct desc_struct __user *tls_array);
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable);
+void free_guest_pagetable(struct lguest *lg);
+void guest_new_pagetable(struct lguest *lg, u32 pgtable);
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_pagetable_clear_all(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest *lg);
+void guest_set_pte(struct lguest *lg, unsigned long cr3,
+ unsigned long vaddr, u32 val);
+void map_trap_page(struct lguest *info);
+int demand_page(struct lguest *info, u32 cr2, int write);
+void pin_stack_pages(struct lguest *lg);
+
+int lguest_device_init(void);
+void lguest_device_remove(void);
+void lguest_io_init(void);
+u32 bind_dma(struct lguest *lg,
+ unsigned long addr, unsigned long udma, u16 numdmas,u8 interrupt);
+int send_dma(struct lguest *info, unsigned long addr,
+ unsigned long udma);
+void release_all_dma(struct lguest *lg);
+unsigned long get_dma_buffer(struct lguest *lg, unsigned long addr,
+ unsigned long *interrupt);
+
+void set_wakeup_process(struct lguest *lg, struct task_struct *p);
+int do_async_hcalls(struct lguest *info);
+int hypercall(struct lguest *info, struct lguest_regs *regs);
+
+#define kill_guest(lg, fmt...) \
+do { \
+ if (!(lg)->dead) { \
+ (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \
+ if (!(lg)->dead) \
+ (lg)->dead = (void *)1; \
+ } \
+} while(0)
+
+static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+ return vaddr - lg->page_offset;
+}
+
+/* Hardware-defined TSS structure. */
+struct x86_tss
+{
+ unsigned short back_link,__blh;
+ unsigned long esp0;
+ unsigned short ss0,__ss0pad;
+ unsigned long esp1;
+ unsigned short ss1,__ss1pad;
+ unsigned long esp2;
+ unsigned short ss2,__ss2pad;
+ unsigned long cr3;
+ unsigned long eip;
+ unsigned long eflags;
+ unsigned long eax,ecx,edx,ebx;
+ unsigned long esp; /* We actually use this one to save esp. */
+ unsigned long ebp;
+ unsigned long esi;
+ unsigned long edi;
+ unsigned short es, __espad;
+ unsigned short cs, __cspad;
+ unsigned short ss, __sspad;
+ unsigned short ds, __dspad;
+ unsigned short fs, __fspad;
+ unsigned short gs, __gspad;
+ unsigned short ldt, __ldtpad;
+ unsigned short trace, io_bitmap_base;
+};
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+ struct lguest_regs *regs, struct x86_tss *tss);
+
+struct lguest_host_state
+{
+ struct Xgt_desc_struct gdt;
+ struct Xgt_desc_struct idt;
+ unsigned long pgdir;
+ unsigned long stackptr;
+};
+
+/* This sits in the high-mapped shim. */
+struct lguest_state
+{
+ /* Task struct. */
+ struct x86_tss tss;
+
+ /* Gate descriptor table. */
+ struct Xgt_desc_struct gdt;
+ struct desc_struct gdt_table[GDT_ENTRIES];
+
+ /* Interrupt descriptor table. */
+ struct Xgt_desc_struct idt;
+ struct desc_struct idt_table[IDT_ENTRIES];
+
+ /* Host state we store while the guest runs. */
+ struct lguest_host_state host;
+
+ /* This is the stack on which we push our regs. */
+ struct lguest_regs regs;
+};
+#endif /* __ASSEMBLY__ */
+#endif /* _LGUEST_H */


2007-02-09 10:57:33

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 6b/10] lguest: the host code (lg.ko)

This is the host module (lg.ko) which supports lguest:

arch/i386/lguest/hypervisor.S:
The actual guest <-> host switching code. This is compiled into
a C array, which is mapped to 0xFFC01000 in host and guests.

arch/i386/lguest/core.c:
The core of the hypervisor, which calls into the assembler
code which does this actual switch. Also contains helper
routines.

arch/i386/lguest/hypercalls.c:
The entry point for the 19 hypercalls.

arch/i386/lguest/interrupts_and_traps.c:
Handling of interrupts and traps, except page faults.

arch/i386/lguest/io.c:
I/O from guest to host, and between guests.

arch/i386/lguest/lguest_user.c:
/dev/lguest interface for lguest program to launch/control guests.

arch/i386/lguest/page_tables.c:
Shadow Page table handling: generally we build up the shadow
page tables by converting from guest page tables when a fault occurs.

arch/i386/lguest/segments.c:
Segmentation (GDT) handling: we have to ensure they're trimmed
to avoid guest access to the switching code.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- /dev/null
+++ b/arch/i386/lguest/core.c
@@ -0,0 +1,425 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future. Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/lguest.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+/* This is our hypervisor, compiled from hypervisor.S. */
+static char __initdata hypervisor_blob[] = {
+#include "hypervisor-blob.c"
+};
+
+/* Whatever of the mapped region is not occupied by the switcher blob is
+ * carved up into per-guest lguest_state structs. */
+#define MAX_LGUEST_GUESTS \
+ ((HYPERVISOR_SIZE-sizeof(hypervisor_blob))/sizeof(struct lguest_state))
+
+static struct vm_struct *hypervisor_vma;
+static int cpu_had_pge;
+/* Far-call target for run_guest_once()'s "lcall *lguest_entry":
+ * {offset, segment} in memory-operand form. */
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} lguest_entry;
+struct page *hype_pages; /* Contiguous pages. */
+struct lguest lguests[MAX_LGUEST_GUESTS];
+DECLARE_MUTEX(lguest_lock);
+
+/* Layout of the high-mapped region: [default IDT entry table]
+ * [switch_to_guest code] [array of struct lguest_state]. */
+
+/* IDT entries are at start of hypervisor. */
+const unsigned long *__lguest_default_idt_entries(void)
+{
+ return (void *)HYPE_ADDR;
+}
+
+/* Next is switch_to_guest */
+static void *__lguest_switch_to_guest(void)
+{
+ return (void *)HYPE_ADDR + HYPE_DATA_SIZE;
+}
+
+/* Then we use everything else to hold guest state. */
+struct lguest_state *__lguest_states(void)
+{
+ return (void *)HYPE_ADDR + sizeof(hypervisor_blob);
+}
+
+/* Allocate the switcher pages and map them at the fixed high address
+ * HYPE_ADDR (same address in host and every guest), then copy the
+ * compiled switcher blob in and set up the entry segments. */
+static __init int map_hypervisor(void)
+{
+ unsigned int i;
+ int err;
+ /* pagep exists because map_vm_area() advances the pointer it is
+ * given; 'pages' itself must stay pointing at the array start. */
+ struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
+
+ hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO,
+ get_order(HYPERVISOR_SIZE));
+ if (!hype_pages)
+ return -ENOMEM;
+
+ /* Reserve the virtual range at exactly HYPE_ADDR. */
+ hypervisor_vma = __get_vm_area(HYPERVISOR_SIZE, VM_ALLOC,
+ HYPE_ADDR, VMALLOC_END);
+ if (!hypervisor_vma) {
+ err = -ENOMEM;
+ printk("lguest: could not map hypervisor pages high\n");
+ goto free_pages;
+ }
+
+ for (i = 0; i < HYPERVISOR_PAGES; i++)
+ pages[i] = hype_pages + i;
+
+ err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep);
+ if (err) {
+ printk("lguest: map_vm_area failed: %i\n", err);
+ /* NOTE(review): this vunmap()s an area whose pages were
+ * never (fully) mapped — believed safe, but confirm. */
+ goto free_vma;
+ }
+ memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob));
+
+ /* Setup LGUEST segments on all cpus */
+ for_each_possible_cpu(i) {
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ }
+
+ /* Initialize entry point into hypervisor. */
+ lguest_entry.offset = (long)__lguest_switch_to_guest();
+ lguest_entry.segment = LGUEST_CS;
+
+ printk("lguest: mapped hypervisor at %p\n", hypervisor_vma->addr);
+ return 0;
+
+free_vma:
+ vunmap(hypervisor_vma->addr);
+free_pages:
+ __free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+ return err;
+}
+
+/* Undo map_hypervisor(): release the mapping and the backing pages. */
+static __exit void unmap_hypervisor(void)
+{
+ vunmap(hypervisor_vma->addr);
+ __free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing. */
+/* Decodes the instruction at the faulting eip; for OUT we simply skip
+ * it, for IN we fake an all-ones result (no device present). Returns 1
+ * if the instruction was handled and eip advanced, 0 otherwise. */
+static int emulate_insn(struct lguest *lg)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned long physaddr = guest_pa(lg, lg->state->regs.eip);
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->state->regs.eip < lg->page_offset)
+ return 0;
+ lhread(lg, &insn, physaddr, 1);
+
+ /* Operand size prefix means it's actually for ax. */
+ if (insn == 0x66) {
+ shift = 16;
+ insnlen = 1;
+ lhread(lg, &insn, physaddr + insnlen, 1);
+ }
+
+ /* Mask the direction bit: 0xE4/0xE5 etc. differ only in size. */
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (in) {
+ /* Lower bit tells us whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ lg->state->regs.eax = 0xFFFFFFFF;
+ else
+ lg->state->regs.eax |= (0xFFFF << shift);
+ }
+ /* Skip the emulated instruction. */
+ lg->state->regs.eip += insnlen;
+ return 1;
+}
+
+/* Find an unused slot in the lguests[] array, or -1 if all in use. */
+int find_free_guest(void)
+{
+ unsigned int i;
+ for (i = 0; i < MAX_LGUEST_GUESTS; i++)
+ if (!lguests[i].state)
+ return i;
+ return -1;
+}
+
+/* Is this guest-physical address within the guest's memory limit? */
+int lguest_address_ok(const struct lguest *lg, unsigned long addr)
+{
+ return addr / PAGE_SIZE < lg->pfn_limit;
+}
+
+/* Just like get_user, but don't let guest access lguest binary. */
+/* On a bad address the guest is killed and 0 is returned. */
+u32 lhread_u32(struct lguest *lg, u32 addr)
+{
+ u32 val = 0;
+
+ /* Don't let them access the lguest binary. */
+ if (!lguest_address_ok(lg, addr)
+ || get_user(val, (u32 __user *)addr) != 0)
+ kill_guest(lg, "bad read address %u", addr);
+ return val;
+}
+
+/* put_user with the same range check; kills the guest on failure. */
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val)
+{
+ if (!lguest_address_ok(lg, addr)
+ || put_user(val, (u32 __user *)addr) != 0)
+ kill_guest(lg, "bad write address %u", addr);
+}
+
+/* Bulk read from guest memory; the addr+bytes<addr test catches wrap. */
+void lhread(struct lguest *lg, void *b, u32 addr, unsigned bytes)
+{
+ if (addr + bytes < addr || !lguest_address_ok(lg, addr+bytes)
+ || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+ /* copy_from_user should do this, but as we rely on it... */
+ memset(b, 0, bytes);
+ kill_guest(lg, "bad read address %u len %u", addr, bytes);
+ }
+}
+
+/* Bulk write to guest memory, same overflow/range checks as lhread(). */
+void lhwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes)
+{
+ if (addr + bytes < addr
+ || !lguest_address_ok(lg, addr+bytes)
+ || copy_to_user((void __user *)addr, b, bytes) != 0)
+ kill_guest(lg, "bad write address %u len %u", addr, bytes);
+}
+
+/* Saves exporting idt_table from kernel */
+/* Reads the CPU's IDT register to find the host IDT base address. */
+static struct desc_struct *get_idt_table(void)
+{
+ struct Xgt_desc_struct idt;
+
+ asm("sidt %0":"=m" (idt));
+ return (void *)idt.address;
+}
+
+extern asmlinkage void math_state_restore(void);
+
+/* Was the guest running in "user" mode (RPL != kernel) when it trapped? */
+static int usermode(struct lguest_regs *regs)
+{
+ return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+}
+
+/* Trap page resets this when it reloads gs. */
+/* Returns 1 (and records eip) the first time we GPF at this eip, so the
+ * caller retries once; 0 if we already retried here (a real fault).
+ * NOTE(review): get_user/put_user results are ignored here. */
+static int new_gfp_eip(struct lguest *lg, struct lguest_regs *regs)
+{
+ u32 eip;
+ get_user(eip, &lg->lguest_data->gs_gpf_eip);
+ if (eip == regs->eip)
+ return 0;
+ put_user(regs->eip, &lg->lguest_data->gs_gpf_eip);
+ return 1;
+}
+
+/* Set CR0.TS (bit 3, value 8) if the guest wants FPU traps; only ever
+ * sets it, never clears — trap 7 handling deals with the other side. */
+static void set_ts(unsigned int guest_ts)
+{
+ u32 cr0;
+ if (guest_ts) {
+ asm("movl %%cr0,%0":"=r" (cr0));
+ if (!(cr0 & 8))
+ asm("movl %0,%%cr0": :"r" (cr0|8));
+ }
+}
+
+/* Far-call into the high-mapped switcher: %eax = guest state, %edx =
+ * host IDT (the switcher's decode path relies on exactly these two
+ * registers, which is why they are declared clobbered). */
+static void run_guest_once(struct lguest *lg)
+{
+ unsigned int clobber;
+
+ /* Put eflags on stack, lcall does rest. */
+ asm volatile("pushf; lcall *lguest_entry"
+ : "=a"(clobber), "=d"(clobber)
+ : "0"(lg->state), "1"(get_idt_table())
+ : "memory");
+}
+
+/* The main guest run loop, called from the /dev/lguest read path.
+ * Repeatedly switches into the guest and services whatever trap brought
+ * us back. Returns -EINTR on signal, -ENOENT when the guest dies, or
+ * (via pending_dma) the size of the DMA descriptor written to 'user'. */
+int run_guest(struct lguest *lg, char *__user user)
+{
+ struct lguest_regs *regs = &lg->state->regs;
+
+ while (!lg->dead) {
+ unsigned int cr2 = 0; /* Damn gcc */
+
+ /* Hypercalls first: we might have been out to userspace */
+ if (do_async_hcalls(lg))
+ goto pending_dma;
+
+ if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+ /* Only do hypercall once. */
+ regs->trapnum = 255;
+ if (hypercall(lg, regs))
+ goto pending_dma;
+ }
+
+ if (signal_pending(current))
+ return -EINTR;
+ maybe_do_interrupt(lg);
+
+ if (lg->dead)
+ break;
+
+ /* A halted guest just sleeps until an interrupt arrives. */
+ if (lg->halted) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ continue;
+ }
+
+ /* Restore limits on TLS segments if in user mode. */
+ /* (They were neutered on the way into the guest kernel.) */
+ if (usermode(regs)) {
+ unsigned int i;
+ for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++)
+ lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a
+ |= lg->tls_limits[i];
+ }
+
+ /* Interrupts stay off across the whole world switch. */
+ local_irq_disable();
+ map_trap_page(lg);
+
+ /* Host state to be restored after the guest returns. */
+ asm("sidt %0":"=m"(lg->state->host.idt));
+ lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr);
+
+ /* Even if *we* don't want FPU trap, guest might... */
+ set_ts(lg->ts);
+
+ run_guest_once(lg);
+
+ /* Save cr2 now if we page-faulted. */
+ /* (Before irqs are re-enabled: a host fault would clobber it.) */
+ if (regs->trapnum == 14)
+ asm("movl %%cr2,%0" :"=r" (cr2));
+ else if (regs->trapnum == 7)
+ math_state_restore();
+ local_irq_enable();
+
+ switch (regs->trapnum) {
+ case 13: /* We've intercepted a GPF. */
+ if (regs->errcode == 0) {
+ if (emulate_insn(lg))
+ continue;
+
+ /* FIXME: If it's reloading %gs in a loop? */
+ if (usermode(regs) && new_gfp_eip(lg,regs))
+ continue;
+ }
+
+ if (reflect_trap(lg, &lg->gpf_trap, 1))
+ continue;
+ break;
+ case 14: /* We've intercepted a page fault. */
+ if (demand_page(lg, cr2, regs->errcode & 2))
+ continue;
+
+ /* If lguest_data is NULL, this won't hurt. */
+ put_user(cr2, &lg->lguest_data->cr2);
+ if (reflect_trap(lg, &lg->page_trap, 1))
+ continue;
+ kill_guest(lg, "unhandled page fault at %#x"
+ " (eip=%#x, errcode=%#x)",
+ cr2, regs->eip, regs->errcode);
+ break;
+ case 7: /* We've intercepted a Device Not Available fault. */
+ /* If they don't want to know, just absorb it. */
+ if (!lg->ts)
+ continue;
+ if (reflect_trap(lg, &lg->fpu_trap, 0))
+ continue;
+ kill_guest(lg, "unhandled FPU fault at %#x",
+ regs->eip);
+ break;
+ case 32 ... 255: /* Real interrupt, fall thru */
+ cond_resched();
+ case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+ continue;
+ case 6: /* Invalid opcode before they installed handler */
+ check_bug_kill(lg);
+ }
+ /* Anything that falls out of the switch is fatal. */
+ kill_guest(lg,"unhandled trap %i at %#x (err=%i)",
+ regs->trapnum, regs->eip, regs->errcode);
+ }
+ return -ENOENT;
+
+pending_dma:
+ /* Hand the pending DMA (descriptor address + target) to userspace. */
+ put_user(lg->pending_dma, (unsigned long *)user);
+ put_user(lg->pending_addr, (unsigned long *)user+1);
+ return sizeof(unsigned long)*2;
+}
+
+#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem)
+
+/* Per-cpu helper: turn CR4.PGE on or off ('on' is a boolean-as-ptr). */
+static void adjust_pge(void *on)
+{
+ if (on)
+ write_cr4(read_cr4() | X86_CR4_PGE);
+ else
+ write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+
+/* Module load: map the switcher, set up page tables, I/O and the
+ * /dev/lguest device, then disable global pages (our "global" mappings
+ * must still be flushable per-guest). Unwinds in reverse on failure. */
+static int __init init(void)
+{
+ int err;
+
+ /* We can't be a host if we're already a paravirtualized guest. */
+ if (paravirt_enabled())
+ return -EPERM;
+
+ err = map_hypervisor();
+ if (err)
+ return err;
+
+ err = init_pagetables(hype_pages);
+ if (err) {
+ unmap_hypervisor();
+ return err;
+ }
+ lguest_io_init();
+
+ err = lguest_device_init();
+ if (err) {
+ free_pagetables();
+ unmap_hypervisor();
+ return err;
+ }
+ if (cpu_has_pge) { /* We have a broader idea of "global". */
+ cpu_had_pge = 1;
+ on_each_cpu(adjust_pge, 0, 0, 1);
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+ return 0;
+}
+
+/* Module unload: exact reverse of init(), restoring PGE if we took it. */
+static void __exit fini(void)
+{
+ lguest_device_remove();
+ free_pagetables();
+ unmap_hypervisor();
+ if (cpu_had_pge) {
+ set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ on_each_cpu(adjust_pge, (void *)1, 0, 1);
+ }
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <[email protected]>");
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypercalls.c
@@ -0,0 +1,199 @@
+/* Actual hypercalls, which allow guests to actually do something.
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/clocksource.h>
+#include <asm/lguest.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+/* LHCALL_SET_STACK: record the stack the hypervisor should switch to
+ * when delivering a trap that changes rings (tss.ss1/esp1), and how
+ * many pages of it to keep pinned.
+ * NOTE(review): kill_guest() appears not to return control here — the
+ * stores below still execute on the error paths; confirm that is safe. */
+static void guest_set_stack(struct lguest *lg,
+ u32 seg, u32 esp, unsigned int pages)
+{
+ /* You cannot have a stack segment with priv level 0. */
+ if ((seg & 0x3) != GUEST_DPL)
+ kill_guest(lg, "bad stack segment %i", seg);
+ if (pages > 2)
+ kill_guest(lg, "bad stack pages %u", pages);
+ lg->state->tss.ss1 = seg;
+ lg->state->tss.esp1 = esp;
+ lg->stack_pages = pages;
+ pin_stack_pages(lg);
+}
+
+/* Return true if DMA to host userspace now pending. */
+/* Dispatch a single hypercall: number in eax, arguments in edx/ebx/ecx,
+ * results (if any) returned in eax. */
+static int do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+ switch (regs->eax) {
+ case LHCALL_FLUSH_ASYNC:
+ /* No-op: issuing any hypercall already drains the ring. */
+ break;
+ case LHCALL_LGUEST_INIT:
+ /* Valid only as the first hypercall; see hypercall(). */
+ kill_guest(lg, "already have lguest_data");
+ break;
+ case LHCALL_CRASH: {
+ char msg[128];
+ lhread(lg, msg, regs->edx, sizeof(msg));
+ msg[sizeof(msg)-1] = '\0';
+ kill_guest(lg, "CRASH: %s", msg);
+ break;
+ }
+ case LHCALL_LOAD_GDT:
+ load_guest_gdt(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_NEW_PGTABLE:
+ guest_new_pagetable(lg, regs->edx);
+ break;
+ case LHCALL_FLUSH_TLB:
+ /* edx != 0 means flush kernel mappings too. */
+ if (regs->edx)
+ guest_pagetable_clear_all(lg);
+ else
+ guest_pagetable_flush_user(lg);
+ break;
+ case LHCALL_LOAD_IDT_ENTRY:
+ load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_SET_STACK:
+ guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_TS:
+ lg->ts = regs->edx;
+ break;
+ case LHCALL_TIMER_READ: {
+ /* Returns jiffies elapsed since the last TIMER_READ. */
+ u32 now = jiffies;
+ mb();
+ regs->eax = now - lg->last_timer;
+ lg->last_timer = now;
+ break;
+ }
+ case LHCALL_TIMER_START:
+ lg->timer_on = 1;
+ if (regs->edx != HZ)
+ kill_guest(lg, "Bad clock speed %i", regs->edx);
+ lg->last_timer = jiffies;
+ break;
+ case LHCALL_HALT:
+ /* run_guest() sleeps while halted is set. */
+ lg->halted = 1;
+ break;
+ case LHCALL_GET_WALLCLOCK: {
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ regs->eax = tv.tv_sec;
+ break;
+ }
+ case LHCALL_BIND_DMA:
+ /* ecx packs: number of dmas in high bits, interrupt in low. */
+ regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+ regs->ecx >> 8, regs->ecx & 0xFF);
+ break;
+ case LHCALL_SEND_DMA:
+ /* Only call that can return "DMA pending" to run_guest(). */
+ return send_dma(lg, regs->edx, regs->ebx);
+ case LHCALL_SET_PTE:
+ guest_set_pte(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_SET_UNKNOWN_PTE:
+ guest_pagetable_clear_all(lg);
+ break;
+ case LHCALL_SET_PUD:
+ guest_set_pud(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, (struct desc_struct __user*)regs->edx);
+ break;
+ default:
+ kill_guest(lg, "Bad hypercall %i\n", regs->eax);
+ }
+ return 0;
+}
+
+/* Debug aid: printf to the host's stdout (fd 1) from kernel context by
+ * temporarily widening the address limit. Not for production paths. */
+#define log(...) \
+ do { \
+ mm_segment_t oldfs = get_fs(); \
+ char buf[100]; \
+ sprintf(buf, "lguest:" __VA_ARGS__); \
+ set_fs(KERNEL_DS); \
+ sys_write(1, buf, strlen(buf)); \
+ set_fs(oldfs); \
+ } while(0)
+
+/* We always do queued calls before actual hypercall. */
+/* Drain the guest's async hypercall ring (hcall_status[n] != 0xFF means
+ * slot n is occupied). Returns 1 as soon as a call leaves DMA pending.
+ * NOTE(review): copy_from_user/get_user results are ignored here;
+ * presumably tolerable since the ring was validated at init — confirm. */
+int do_async_hcalls(struct lguest *lg)
+{
+ unsigned int i, pending;
+ u8 st[LHCALL_RING_SIZE];
+
+ if (!lg->lguest_data)
+ return 0;
+
+ copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st));
+ for (i = 0; i < ARRAY_SIZE(st); i++) {
+ struct lguest_regs regs;
+ unsigned int n = lg->next_hcall;
+
+ if (st[n] == 0xFF)
+ break;
+
+ /* Advance ring cursor (wraps) before executing the call. */
+ if (++lg->next_hcall == LHCALL_RING_SIZE)
+ lg->next_hcall = 0;
+
+ get_user(regs.eax, &lg->lguest_data->hcalls[n].eax);
+ get_user(regs.edx, &lg->lguest_data->hcalls[n].edx);
+ get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx);
+ get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx);
+ pending = do_hcall(lg, &regs);
+ /* Mark the slot free for the guest again. */
+ put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+ if (pending)
+ return 1;
+ }
+
+ set_wakeup_process(lg, NULL);
+ return 0;
+}
+
+/* Entry for a synchronous (trap-based) hypercall. The very first call
+ * must be LGUEST_INIT, which registers the shared lguest_data page and
+ * returns setup info to the guest. Returns nonzero if DMA is pending. */
+int hypercall(struct lguest *lg, struct lguest_regs *regs)
+{
+ int pending;
+
+ if (!lg->lguest_data) {
+ if (regs->eax != LHCALL_LGUEST_INIT) {
+ kill_guest(lg, "hypercall %i before LGUEST_INIT",
+ regs->eax);
+ return 0;
+ }
+
+ lg->lguest_data = (struct lguest_data __user *)regs->edx;
+ /* We check here so we can simply copy_to_user/from_user */
+ if (!lguest_address_ok(lg, (long)lg->lguest_data)
+ || !lguest_address_ok(lg, (long)(lg->lguest_data+1))){
+ kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ return 0;
+ }
+ get_user(lg->noirq_start, &lg->lguest_data->noirq_start);
+ get_user(lg->noirq_end, &lg->lguest_data->noirq_end);
+ /* We reserve the top pgd entry. */
+ put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
+ put_user(lg->guestid, &lg->lguest_data->guestid);
+ /* Tell the guest how to convert TSC ticks to nanoseconds. */
+ put_user(clocksource_khz2mult(tsc_khz, 22),
+ &lg->lguest_data->clock_mult);
+ return 0;
+ }
+ pending = do_hcall(lg, regs);
+ set_wakeup_process(lg, NULL);
+ return pending;
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypervisor.S
@@ -0,0 +1,170 @@
+/* This code sits at 0xFFFF1000 to do the low-level guest<->host switch.
+ Layout is: default_idt_entries (1k), then switch_to_guest entry point.
+ NOTE(review): core.c's changelog says the blob is mapped at 0xFFC01000;
+ the 0xFFFF1000 above looks stale — confirm against HYPE_ADDR in lg.h. */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+/* Push order is the reverse of struct lguest_regs field order, so the
+ stack ends up laid out exactly like that struct (see asm-offsets). */
+#define SAVE_REGS \
+ /* Save old guest/host state */ \
+ pushl %es; \
+ pushl %ds; \
+ pushl %fs; \
+ pushl %eax; \
+ pushl %gs; \
+ pushl %ebp; \
+ pushl %edi; \
+ pushl %esi; \
+ pushl %edx; \
+ pushl %ecx; \
+ pushl %ebx; \
+
+.text
+ENTRY(_start) /* ld complains unless _start is defined. */
+/* %eax contains ptr to target guest state, %edx contains host idt. */
+switch_to_guest:
+ pushl %ss
+ SAVE_REGS
+ /* Save old stack, switch to guest's stack. */
+ movl %esp, LGUEST_STATE_host_stackptr(%eax)
+ movl %eax, %esp
+ /* Guest registers will be at: %esp-$LGUEST_STATE_regs */
+ addl $LGUEST_STATE_regs, %esp
+ /* Switch to guest's GDT, IDT. */
+ lgdt LGUEST_STATE_gdt(%eax)
+ lidt LGUEST_STATE_idt(%eax)
+ /* Save page table top. */
+ movl %cr3, %ebx
+ movl %ebx, LGUEST_STATE_host_pgdir(%eax)
+ /* Set host's TSS to available (clear byte 5 bit 2). */
+ movl (LGUEST_STATE_host_gdt+2)(%eax), %ebx
+ andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx)
+ /* Switch to guest page tables */
+ /* (This pop is the guest %ebx slot reused to carry guest cr3.) */
+ popl %ebx
+ movl %ebx, %cr3
+ /* Switch to guest's TSS. */
+ movl $(GDT_ENTRY_TSS*8), %ebx
+ ltr %bx
+ /* Restore guest regs */
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %gs
+ /* Now we've loaded gs, neuter the TLS entries down to 1 byte/page */
+ /* (Stops the guest's userspace reaching hypervisor memory via TLS;
+ limits are restored by run_guest() when re-entering user mode.) */
+ addl $(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax
+ movw $0,(%eax)
+ movw $0,8(%eax)
+ movw $0,16(%eax)
+ popl %eax
+ popl %fs
+ popl %ds
+ popl %es
+ /* Skip error code and trap number */
+ addl $8, %esp
+ iret
+
+/* Inverse of switch_to_guest: save guest regs into lguest_state, restore
+ host page tables, GDT/IDT, TSS and stack, then pop the host registers
+ saved by SAVE_REGS. Leaves guest-state ptr in %eax for the IDT decode. */
+#define SWITCH_TO_HOST \
+ SAVE_REGS; \
+ /* Save old pgdir */ \
+ movl %cr3, %eax; \
+ pushl %eax; \
+ /* Load lguest ds segment for convenience. */ \
+ movl $(LGUEST_DS), %eax; \
+ movl %eax, %ds; \
+ /* Now figure out who we are */ \
+ /* (regs live at a fixed offset inside lguest_state, so subtracting \
+ that offset from %esp recovers the lguest_state base). */ \
+ movl %esp, %eax; \
+ subl $LGUEST_STATE_regs, %eax; \
+ /* Switch to host page tables (GDT, IDT and stack are in host \
+ mem, so need this first) */ \
+ movl LGUEST_STATE_host_pgdir(%eax), %ebx; \
+ movl %ebx, %cr3; \
+ /* Set guest's TSS to available (clear byte 5 bit 2). */ \
+ andb $0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);\
+ /* Switch to host's GDT & IDT. */ \
+ lgdt LGUEST_STATE_host_gdt(%eax); \
+ lidt LGUEST_STATE_host_idt(%eax); \
+ /* Switch to host's stack. */ \
+ movl LGUEST_STATE_host_stackptr(%eax), %esp; \
+ /* Switch to host's TSS */ \
+ movl $(GDT_ENTRY_TSS*8), %eax; \
+ ltr %ax; \
+ /* Restore host regs */ \
+ popl %ebx; \
+ popl %ecx; \
+ popl %edx; \
+ popl %esi; \
+ popl %edi; \
+ popl %ebp; \
+ popl %gs; \
+ popl %eax; \
+ popl %fs; \
+ popl %ds; \
+ popl %es; \
+ popl %ss
+
+/* Return to run_guest_once. */
+return_to_host:
+ SWITCH_TO_HOST
+ iret
+
+/* For real hardware interrupts: switch back, then vector straight into
+ the host's own handler so the interrupt is serviced normally. */
+deliver_to_host:
+ SWITCH_TO_HOST
+decode_idt_and_jmp:
+ /* Decode IDT and jump to host's irq handler. When that does iret, it
+ * will return to run_guest_once. This is a feature. */
+ /* We told gcc we'd clobber edx and eax... */
+ movl LGUEST_STATE_trapnum(%eax), %eax
+ leal (%edx,%eax,8), %eax
+ /* Reassemble the 32-bit handler address from the split gate fields. */
+ movzwl (%eax),%edx
+ movl 4(%eax), %eax
+ xorw %ax, %ax
+ orl %eax, %edx
+ jmp *%edx
+
+deliver_to_host_with_errcode:
+ SWITCH_TO_HOST
+ /* Host handler expects the error code on its stack. */
+ pushl LGUEST_STATE_errcode(%eax)
+ jmp decode_idt_and_jmp
+
+/* Real hardware interrupts are delivered straight to the host. Others
+ cause us to return to run_guest_once so it can decide what to do. Note
+ that some of these are overridden by the guest to deliver directly, and
+ never enter here (see load_guest_idt_entry). */
+/* Each stub records its own address in the .data table (consumed by
+ core.c as default_idt_entries), pushes a fake error code for the
+ vectors the CPU doesn't supply one for, then the vector number. */
+.macro IRQ_STUB N TARGET
+ .data; .long 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+ .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+ pushl $0
+ .endif
+ pushl $\N
+ jmp \TARGET
+ ALIGN
+.endm
+
+/* Emit IRQ_STUBs for every vector FIRST..LAST inclusive. */
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+ IRQ_STUB irq \TARGET
+ irq=irq+1
+ .endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host. Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+default_idt_entries:
+.text
+ IRQ_STUBS 0 1 return_to_host /* First two traps */
+ IRQ_STUB 2 deliver_to_host_with_errcode /* NMI */
+ IRQ_STUBS 3 31 return_to_host /* Rest of traps */
+ IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
+ IRQ_STUB 128 return_to_host /* System call (overridden) */
+ IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
+
+/* Everything after this is used for the lguest_state structs. */
+ALIGN
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/interrupts_and_traps.c
@@ -0,0 +1,221 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
+/* Push one 32-bit value onto the guest's stack (pre-decrement). */
+static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val)
+{
+ lhwrite_u32(lg, (u32)--(*gstack), val);
+}
+
+/* Build an x86-style trap frame on the guest's stack and redirect the
+ * guest to its registered handler. Returns 0 if no handler installed
+ * (caller decides what to do), 1 once the frame is set up. */
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err)
+{
+ u32 __user *gstack;
+ u32 eflags, ss, irq_enable;
+ struct lguest_regs *regs = &lg->state->regs;
+
+ if (!trap->addr)
+ return 0;
+
+ /* If they want a ring change, we use new stack and push old ss/esp */
+ if ((regs->ss&0x3) != GUEST_DPL) {
+ gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1);
+ ss = lg->state->tss.ss1;
+ push_guest_stack(lg, &gstack, regs->ss);
+ push_guest_stack(lg, &gstack, regs->esp);
+ } else {
+ gstack = (u32 __user *)guest_pa(lg, regs->esp);
+ ss = regs->ss;
+ }
+
+ /* We use IF bit in eflags to indicate whether irqs were disabled
+ (it's always 0, since irqs are enabled when guest is running). */
+ eflags = regs->eflags;
+ get_user(irq_enable, &lg->lguest_data->irq_enabled);
+ eflags |= (irq_enable & 512);
+
+ /* Standard trap frame: eflags, cs, eip (+ optional error code). */
+ push_guest_stack(lg, &gstack, eflags);
+ push_guest_stack(lg, &gstack, regs->cs);
+ push_guest_stack(lg, &gstack, regs->eip);
+
+ if (has_err)
+ push_guest_stack(lg, &gstack, regs->errcode);
+
+ /* Change the real stack so hypervisor returns to trap handler */
+ regs->ss = ss;
+ regs->esp = (u32)gstack + lg->page_offset;
+ regs->cs = (__KERNEL_CS|GUEST_DPL);
+ regs->eip = trap->addr;
+
+ /* GS will be neutered on way back to guest. */
+ put_user(0, &lg->lguest_data->gs_gpf_eip);
+
+ /* Disable interrupts for an interrupt gate. */
+ if (trap->disable_interrupts)
+ put_user(0, &lg->lguest_data->irq_enabled);
+ return 1;
+}
+
+/* Deliver the lowest-numbered pending, unblocked virtual interrupt to
+ * the guest, if it has interrupts enabled (or was halted). Called each
+ * iteration of run_guest() before switching into the guest. */
+void maybe_do_interrupt(struct lguest *lg)
+{
+ unsigned int irq;
+ DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+ if (!lg->lguest_data)
+ return;
+
+ /* If timer has changed, set timer interrupt. */
+ /* (Virtual irq 0 is the timer.) */
+ if (lg->timer_on && jiffies != lg->last_timer)
+ set_bit(0, lg->irqs_pending);
+
+ /* Mask out any interrupts they have blocked. */
+ copy_from_user(&irqs, lg->lguest_data->interrupts, sizeof(irqs));
+ bitmap_andnot(irqs, lg->irqs_pending, irqs, LGUEST_IRQS);
+
+ irq = find_first_bit(irqs, LGUEST_IRQS);
+ if (irq >= LGUEST_IRQS)
+ return;
+
+ /* If they're halted, we re-enable interrupts. */
+ if (lg->halted) {
+ /* Re-enable interrupts. */
+ put_user(512, &lg->lguest_data->irq_enabled);
+ lg->halted = 0;
+ } else {
+ /* Maybe they have interrupts disabled? */
+ u32 irq_enabled;
+ get_user(irq_enabled, &lg->lguest_data->irq_enabled);
+ if (!irq_enabled)
+ return;
+ }
+
+ /* Only consume the irq if the guest actually has a handler. */
+ if (lg->interrupt[irq].addr != 0) {
+ clear_bit(irq, lg->irqs_pending);
+ reflect_trap(lg, &lg->interrupt[irq], 0);
+ }
+}
+
+/* The guest hit an invalid opcode before installing its own handler:
+ * if it's the ud2 (0x0f 0x0b) used by BUG(), extract the file/line
+ * info that BUG() embeds after the instruction and report it. */
+void check_bug_kill(struct lguest *lg)
+{
+#ifdef CONFIG_BUG
+ u32 eip = lg->state->regs.eip - PAGE_OFFSET;
+ u16 insn;
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->state->regs.eip < PAGE_OFFSET)
+ return;
+ lhread(lg, &insn, eip, sizeof(insn));
+ /* 0x0b0f is ud2 (0x0f 0x0b) read little-endian as a u16. */
+ if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ u16 l;
+ u32 f;
+ char file[128];
+ /* BUG() layout: ud2, then u16 line, then u32 file ptr. */
+ lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+ lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+ lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+ file[sizeof(file)-1] = 0;
+ kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+ kill_guest(lg, "BUG() at %#x", eip);
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+ }
+#endif /* CONFIG_BUG */
+}
+
+static void copy_trap(struct lguest *lg,
+ struct host_trap *trap,
+ const struct desc_struct *desc)
+{
+ u8 type = ((desc->b >> 8) & 0xF);
+
+ /* Not present? */
+ if (!(desc->b & 0x8000)) {
+ trap->addr = 0;
+ return;
+ }
+ if (type != 0xE && type != 0xF)
+ kill_guest(lg, "bad IDT type %i", type);
+ trap->disable_interrupts = (type == 0xE);
+ trap->addr = ((desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000));
+}
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+static u8 tramp[]
+= { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
+ 0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+ /* movl 0, %ss:lguest_data.gs_gpf_eip */
+ 0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
+};
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+ u32 addr, off;
+
+ off = sizeof(tramp)*i;
+ memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+ /* 0 is to be placed in lguest_data.gs_gpf_eip. */
+ addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+ memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+ /* Address is relative to where end of jmp will be. */
+ addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+ memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+ return (-4*1024*1024) + off;
+}
+
+/* We bounce through the trap page, for two reasons: firstly, we need
+ the interrupt destination always mapped, to avoid double faults,
+ secondly we want to reload %gs to make it innocuous on entering kernel.
+ */
+/* Install a direct-delivery gate in the shadow IDT, pointing at the
+ * per-vector trampoline rather than the guest's handler. */
+static void setup_idt(struct lguest *lg,
+ unsigned int i,
+ const struct desc_struct *desc)
+{
+ u8 type = ((desc->b >> 8) & 0xF);
+ u32 taddr;
+
+ /* Not present? */
+ if (!(desc->b & 0x8000)) {
+ /* FIXME: When we need this, we'll know... */
+ if (lg->state->idt_table[i].a & 0x8000)
+ kill_guest(lg, "removing interrupts not supported");
+ return;
+ }
+
+ /* We could reflect and disable interrupts, but guest can do itself. */
+ if (type != 0xF)
+ kill_guest(lg, "bad direct IDT %i type %i", i, type);
+
+ taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+
+ /* Rebuild the gate: guest-kernel CS at GUEST_DPL, trampoline addr
+ * split into the descriptor's low/high words. */
+ lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16)
+ | (taddr & 0x0000FFFF));
+ lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000);
+}
+
+/* LHCALL_LOAD_IDT_ENTRY: classify vector i and either ignore it, record
+ * it for host-side reflection, or install it for direct delivery. */
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high)
+{
+ struct desc_struct d = { low, high };
+
+ /* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+ if (i == 2 || i == 8 || i == 15 || i == LGUEST_TRAP_ENTRY)
+ return;
+ /* FIXME: We should handle debug and int3 */
+ else if (i == 1 || i == 3)
+ return;
+ /* We intercept page fault, general protection fault and fpu missing */
+ else if (i == 13)
+ copy_trap(lg, &lg->gpf_trap, &d);
+ else if (i == 14)
+ copy_trap(lg, &lg->page_trap, &d);
+ else if (i == 7)
+ copy_trap(lg, &lg->fpu_trap, &d);
+ /* Other traps go straight to guest. */
+ else if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR)
+ setup_idt(lg, i, &d);
+ /* A virtual interrupt */
+ else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS)
+ copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d);
+}
+
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/io.c
@@ -0,0 +1,413 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static struct list_head dma_hash[64];
+
+/* FIXME: allow multi-page lengths. */
+/* Validate a guest-supplied lguest_dma scatterlist: each section must lie
+ * inside the guest and fit within one page.  A zero length terminates the
+ * list.  Returns 1 if OK, 0 (after killing the guest) on a bad entry. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (!dma->len[i])
+			return 1;
+		if (!lguest_address_ok(lg, dma->addr[i]))
+			goto kill;
+		if (dma->len[i] > PAGE_SIZE)
+			goto kill;
+		/* We could do over a page, but is it worth it? */
+		if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+			goto kill;
+	}
+	return 1;
+
+kill:
+	kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]);
+	return 0;
+}
+
+/* Map a futex key to a bucket index in the global dma_hash table. */
+static unsigned int hash(const union futex_key *key)
+{
+	return jhash2((u32*)&key->both.word,
+		      (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+		      key->both.offset)
+		% ARRAY_SIZE(dma_hash);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+/* Take a bound DMA set off its hash chain and drop the futex-key
+ * reference taken at bind time.  lguest_lock must be held (asserted
+ * via the down_trylock BUG_ON below). */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+	BUG_ON(down_trylock(&lguest_lock) == 0);
+	/* interrupt == 0 marks the slot as free for reuse. */
+	dmainfo->interrupt = 0;
+	list_del(&dmainfo->list);
+	drop_futex_key_refs(&dmainfo->key);
+}
+
+/* Futex keys are equal iff all three components match. */
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+	return (a->both.word == b->both.word
+		&& a->both.ptr == b->both.ptr
+		&& a->both.offset == b->both.offset);
+}
+
+/* Unregister the DMA set previously bound with this (key, dmas) pair.
+ * Returns 1 if found and unlinked, 0 if no match. */
+static u32 unbind_dma(struct lguest *lg,
+		      const union futex_key *key,
+		      unsigned long dmas)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+			unlink_dma(&lg->dma[i]);
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+/* Guest hypercall: register (interrupt != 0) or unregister (interrupt == 0)
+ * an array of "numdmas" lguest_dma descriptors at guest address "dmas",
+ * keyed by the futex key of "addr".  Returns 1 on success, 0 on failure
+ * (bad irq, no free slot, or no matching binding to remove). */
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+	unsigned int i;
+	u32 ret = 0;
+	union futex_key key;
+
+	if (interrupt >= LGUEST_IRQS)
+		return 0;
+
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad dma address %#lx", addr);
+		goto unlock;
+	}
+	get_futex_key_refs(&key);
+
+	if (interrupt == 0)
+		ret = unbind_dma(lg, &key, dmas);
+	else {
+		/* Find a free slot: interrupt == 0 means unused. */
+		for (i = 0; i < LGUEST_MAX_DMA; i++) {
+			if (lg->dma[i].interrupt == 0) {
+				lg->dma[i].dmas = dmas;
+				lg->dma[i].num_dmas = numdmas;
+				lg->dma[i].next_dma = 0;
+				lg->dma[i].key = key;
+				lg->dma[i].guestid = lg->guestid;
+				lg->dma[i].interrupt = interrupt;
+				list_add(&lg->dma[i].list,
+					 &dma_hash[hash(&key)]);
+				ret = 1;
+				/* The bound entry keeps the key reference;
+				 * unlink_dma() drops it later. */
+				goto unlock;
+			}
+		}
+	}
+	/* Unbind path or no free slot: drop the reference taken above. */
+	drop_futex_key_refs(&key);
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return ret;
+}
+
+/* lhread from another guest */
+/* Read "bytes" from address "addr" in another guest's memory (via its
+ * task) into buf.  On overflow or failure the buffer is zeroed, the
+ * target guest is killed, and 0 is returned; 1 on success. */
+static int lhread_other(struct lguest *lg,
+			void *buf, u32 addr, unsigned bytes)
+{
+	/* addr + bytes < addr catches wraparound. */
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
+		memset(buf, 0, bytes);
+		kill_guest(lg, "bad address in registered DMA struct");
+		return 0;
+	}
+	return 1;
+}
+
+/* lhwrite to another guest */
+/* Write "bytes" from buf into another guest's memory at "addr".
+ * Kills the target guest and returns 0 on a bad address; 1 on success. */
+static int lhwrite_other(struct lguest *lg, u32 addr,
+			 const void *buf, unsigned bytes)
+{
+	if (addr + bytes < addr
+	    || !lguest_address_ok(lg, addr+bytes)
+	    || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+		!= bytes)) {
+		kill_guest(lg, "bad address writing to registered DMA");
+		return 0;
+	}
+	return 1;
+}
+
+/* Copy from the current (sending) guest's user memory described by "src"
+ * into the destination guest's pinned pages[] matching "dst", walking
+ * both scatterlists in step.  Returns total bytes copied, or 0 if a
+ * copy_from_user() failed. */
+static u32 copy_data(const struct lguest_dma *src,
+		     const struct lguest_dma *dst,
+		     struct page *pages[])
+{
+	unsigned int totlen, si, di, srcoff, dstoff;
+	/* maddr is the kmap of the current destination page (or NULL). */
+	void *maddr = NULL;
+
+	totlen = 0;
+	si = di = 0;
+	srcoff = dstoff = 0;
+	/* Stop when either list runs out of sections or hits a 0 length. */
+	while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+	       && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+		/* Copy the largest chunk both current sections allow. */
+		u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+		if (!maddr)
+			maddr = kmap(pages[di]);
+
+		/* FIXME: This is not completely portable, since
+		   archs do different things for copy_to_user_page. */
+		if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+				   (void *__user)src->addr[si], len) != 0) {
+			totlen = 0;
+			break;
+		}
+
+		totlen += len;
+		srcoff += len;
+		dstoff += len;
+		/* Advance to the next source/destination section as each
+		 * one is exhausted. */
+		if (srcoff == src->len[si]) {
+			si++;
+			srcoff = 0;
+		}
+		if (dstoff == dst->len[di]) {
+			kunmap(pages[di]);
+			maddr = NULL;
+			di++;
+			dstoff = 0;
+		}
+	}
+
+	/* Unmap a partially-filled final destination page. */
+	if (maddr)
+		kunmap(pages[di]);
+
+	return totlen;
+}
+
+/* Src is us, ie. current. */
+/* Perform one DMA transfer: validate both scatterlists, pin the
+ * destination guest's pages for writing, then copy.  Returns bytes
+ * copied (0 on any failure). */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+		  struct lguest *dstlg, const struct lguest_dma *dst)
+{
+	int i;
+	u32 ret;
+	struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+	if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+		return 0;
+
+	/* First get the destination pages */
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (dst->len[i] == 0)
+			break;
+		/* write=1, force=1: we will write into these pages. */
+		if (get_user_pages(dstlg->tsk, dstlg->mm,
+				   dst->addr[i], 1, 1, 1, pages+i, NULL)
+		    != 1) {
+			ret = 0;
+			goto drop_pages;
+		}
+	}
+
+	/* Now copy until we run out of src or dst. */
+	ret = copy_data(src, dst, pages);
+
+drop_pages:
+	/* Release every page pinned so far (i is one past the last). */
+	while (--i >= 0)
+		put_page(pages[i]);
+	return ret;
+}
+
+/* We cache one process to wakeup: helps for batching & wakes outside locks. */
+/* Replace the cached wakeup target with p (may be NULL).  The previous
+ * target, if any, is woken and its task reference dropped; a reference
+ * is taken on the new one. */
+void set_wakeup_process(struct lguest *lg, struct task_struct *p)
+{
+	if (p == lg->wake)
+		return;
+
+	if (lg->wake) {
+		wake_up_process(lg->wake);
+		put_task_struct(lg->wake);
+	}
+	lg->wake = p;
+	if (lg->wake)
+		get_task_struct(lg->wake);
+}
+
+/* Transfer the DMA described at srclg guest address "udma" into the next
+ * unused receive buffer of the registered set "dst" (round-robin from
+ * dst->next_dma).  Runs under lguest_lock.  Returns 1 if every receive
+ * buffer was already in use (receiver "empty"), 0 otherwise. */
+static int dma_transfer(struct lguest *srclg,
+			unsigned long udma,
+			struct lguest_dma_info *dst)
+{
+	struct lguest_dma dst_dma, src_dma;
+	struct lguest *dstlg;
+	u32 i, dma = 0;
+
+	dstlg = &lguests[dst->guestid];
+	/* Get our dma list. */
+	lhread(srclg, &src_dma, udma, sizeof(src_dma));
+
+	/* We can't deadlock against them dmaing to us, because this
+	 * is all under the lguest_lock. */
+	down_read(&dstlg->mm->mmap_sem);
+
+	/* Scan for a receive buffer with used_len == 0 (i.e. free). */
+	for (i = 0; i < dst->num_dmas; i++) {
+		dma = (dst->next_dma + i) % dst->num_dmas;
+		if (!lhread_other(dstlg, &dst_dma,
+				  dst->dmas + dma * sizeof(struct lguest_dma),
+				  sizeof(dst_dma))) {
+			goto fail;
+		}
+		if (!dst_dma.used_len)
+			break;
+	}
+	if (i != dst->num_dmas) {
+		unsigned long used_lenp;
+		unsigned int ret;
+
+		ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+		/* Put used length in src. */
+		lhwrite_u32(srclg,
+			    udma+offsetof(struct lguest_dma, used_len), ret);
+		/* 0 bytes copied with a non-empty source means failure. */
+		if (ret == 0 && src_dma.len[0] != 0)
+			goto fail;
+
+		/* Make sure destination sees contents before length. */
+		mb();
+		used_lenp = dst->dmas
+			+ dma * sizeof(struct lguest_dma)
+			+ offsetof(struct lguest_dma, used_len);
+		lhwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+		dst->next_dma++;
+	}
+	up_read(&dstlg->mm->mmap_sem);
+
+	/* Do this last so dst doesn't simply sleep on lock. */
+	set_bit(dst->interrupt, dstlg->irqs_pending);
+	set_wakeup_process(srclg, dstlg->tsk);
+	return i == dst->num_dmas;
+
+fail:
+	up_read(&dstlg->mm->mmap_sem);
+	return 0;
+}
+
+/* Guest hypercall: send the DMA at guest address "udma" to whoever is
+ * bound to "addr".  For a shared mapping we look for another guest and
+ * transfer directly (yielding once if its buffers are full); for a
+ * private mapping we hand the request to our userspace.  Returns 1 if
+ * userspace must service a pending DMA, 0 otherwise. */
+int send_dma(struct lguest *lg, unsigned long addr, unsigned long udma)
+{
+	union futex_key key;
+	int pending = 0, empty = 0;
+
+again:
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad sending DMA address");
+		goto unlock;
+	}
+	/* Shared mapping? Look for other guests... */
+	if (key.shared.offset & 1) {
+		struct lguest_dma_info *i, *n;
+		list_for_each_entry_safe(i, n, &dma_hash[hash(&key)], list) {
+			if (i->guestid == lg->guestid)
+				continue;
+			if (!key_eq(&key, &i->key))
+				continue;
+
+			/* empty becomes 1 if the receiver had no free
+			 * buffer (see dma_transfer). */
+			empty += dma_transfer(lg, udma, i);
+			break;
+		}
+		if (empty == 1) {
+			/* Give any recipients one chance to restock. */
+			up_read(&current->mm->mmap_sem);
+			up(&lguest_lock);
+			yield();
+			/* empty == 2 on the retry stops us looping forever. */
+			empty++;
+			goto again;
+		}
+		pending = 0;
+	} else {
+		/* Private mapping: tell our userspace. */
+		lg->dma_is_pending = 1;
+		lg->pending_dma = udma;
+		lg->pending_addr = addr;
+		pending = 1;
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return pending;
+}
+
+/* Unbind every registered DMA set of this guest (used at guest teardown).
+ * Caller must hold lguest_lock (asserted via down_trylock BUG_ON). */
+void release_all_dma(struct lguest *lg)
+{
+	unsigned int i;
+
+	BUG_ON(down_trylock(&lguest_lock) == 0);
+
+	down_read(&lg->mm->mmap_sem);
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		/* interrupt != 0 means the slot is in use. */
+		if (lg->dma[i].interrupt)
+			unlink_dma(&lg->dma[i]);
+	}
+	up_read(&lg->mm->mmap_sem);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+/* Find the first free (used_len == 0) lguest_dma this guest registered
+ * for "addr"; store the bound irq in *interrupt.  Returns the guest
+ * address of the chosen lguest_dma, or 0 if none is registered. */
+unsigned long get_dma_buffer(struct lguest *lg,
+			     unsigned long addr, unsigned long *interrupt)
+{
+	unsigned long ret = 0;
+	union futex_key key;
+	struct lguest_dma_info *i;
+
+	down(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad registered DMA buffer");
+		goto unlock;
+	}
+	/* Only this guest's own bindings are considered here. */
+	list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+		if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+			unsigned int j;
+			for (j = 0; j < i->num_dmas; j++) {
+				struct lguest_dma dma;
+
+				ret = i->dmas + j * sizeof(struct lguest_dma);
+				lhread(lg, &dma, ret, sizeof(dma));
+				if (dma.used_len == 0)
+					break;
+			}
+			*interrupt = i->interrupt;
+			break;
+		}
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	up(&lguest_lock);
+	return ret;
+}
+
+/* One-time init: empty every bucket of the global DMA hash table. */
+void lguest_io_init(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
+		INIT_LIST_HEAD(&dma_hash[i]);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_user.c
@@ -0,0 +1,242 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include "lg.h"
+
+/* Initialize the per-guest state slot "num": GDT, IDT (from the default
+ * hypervisor entries), TSS, and the initial register frame that
+ * switch_to_guest will load.  pgdir is the shadow toplevel; start is the
+ * guest's entry point.  Returns the state, or NULL if the GDT is bad. */
+static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir,
+					      unsigned long start)
+{
+	struct lguest_state *guest = &__lguest_states()[num];
+	unsigned int i;
+	const long *def = __lguest_default_idt_entries();
+	struct lguest_regs *regs;
+
+	guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+	guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+	guest->gdt.size = GDT_ENTRIES*8-1;
+	guest->gdt.address = (unsigned long)&guest->gdt_table;
+
+	/* Other guest's IDTs are initialized from default. */
+	guest->idt.size = 8 * IDT_ENTRIES;
+	guest->idt.address = (long)guest->idt_table;
+	for (i = 0; i < IDT_ENTRIES; i++) {
+		/* 0x8e00: present 32-bit interrupt gate, DPL 0. */
+		u32 flags = 0x8e00;
+
+		/* They can't "int" into any of them except hypercall. */
+		if (i == LGUEST_TRAP_ENTRY)
+			flags |= (GUEST_DPL << 13);
+
+		guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF);
+		guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags;
+	}
+
+	memset(&guest->tss, 0, sizeof(guest->tss));
+	guest->tss.ss0 = LGUEST_DS;
+	/* Ring-0 stack starts just past this guest's state struct. */
+	guest->tss.esp0 = (unsigned long)(guest+1);
+	guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */
+
+	/* Write out stack in format lguest expects, so we can switch to it. */
+	regs = &guest->regs;
+	regs->cr3 = __pa(pgdir);
+	regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
+	/* Magic values let the guest recognize it is under lguest. */
+	regs->edi = LGUEST_MAGIC_EDI;
+	regs->ebp = LGUEST_MAGIC_EBP;
+	regs->esi = LGUEST_MAGIC_ESI;
+	regs->gs = regs->fs = 0;
+	regs->ds = regs->es = __KERNEL_DS|GUEST_DPL;
+	regs->trapnum = regs->errcode = 0;
+	regs->eip = start;
+	regs->cs = __KERNEL_CS|GUEST_DPL;
+	regs->eflags = 0x202; /* Interrupts enabled. */
+	regs->ss = __KERNEL_DS|GUEST_DPL;
+
+	if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table),
+			     &guest->regs, &guest->tss))
+		return NULL;
+
+	return guest;
+}
+
+/* + addr */
+/* LHREQ_GETDMA: return the guest address of the next free DMA buffer the
+ * guest registered for the address read from userspace; its bound irq is
+ * stashed in the buffer's used_len field.  -ENOENT if none. */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+	unsigned long addr, udma, irq;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	udma = get_dma_buffer(lg, addr, &irq);
+	if (!udma)
+		return -ENOENT;
+
+	/* We put irq number in udma->used_len. */
+	lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+	return udma;
+}
+
+/* + irq */
+/* LHREQ_IRQ: userspace injects a virtual interrupt into the guest. */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+	u32 irq;
+
+	if (get_user(irq, input) != 0)
+		return -EFAULT;
+	if (irq >= LGUEST_IRQS)
+		return -EINVAL;
+	set_bit(irq, lg->irqs_pending);
+	return 0;
+}
+
+/* /dev/lguest read(): runs the guest until it needs servicing.  If the
+ * guest is dead, returns its death message instead (-ENOMEM if the
+ * message itself could not be allocated, signalled by the -1 sentinel). */
+static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
+{
+	struct lguest *lg = file->private_data;
+
+	/* Guest must be initialized first (LHREQ_INITIALIZE). */
+	if (!lg)
+		return -EINVAL;
+
+	if (lg->dead) {
+		size_t len;
+
+		/* (void *)-1 means the death-message allocation failed. */
+		if (lg->dead == (void *)-1)
+			return -ENOMEM;
+
+		len = min(size, strlen(lg->dead)+1);
+		if (copy_to_user(user, lg->dead, len) != 0)
+			return -EFAULT;
+		return len;
+	}
+
+	/* A new read means userspace consumed the pending DMA. */
+	if (lg->dma_is_pending)
+		lg->dma_is_pending = 0;
+
+	return run_guest(lg, user);
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset. */
+/* LHREQ_INITIALIZE: create a guest from the four u32 arguments, set up
+ * its trap page, page tables and register state, and attach it to this
+ * fd.  Returns sizeof(args) on success, negative errno on failure; on
+ * failure everything allocated so far is unwound in reverse order. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+	struct lguest *lg;
+	int err, i;
+	u32 args[4];
+
+	/* Only one guest per fd. */
+	if (file->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(args, input, sizeof(args)) != 0)
+		return -EFAULT;
+
+	/* Toplevel pagetable must lie above the zero page. */
+	if (args[1] <= PAGE_SIZE)
+		return -EINVAL;
+
+	down(&lguest_lock);
+	i = find_free_guest();
+	if (i < 0) {
+		err = -ENOSPC;
+		goto unlock;
+	}
+	lg = &lguests[i];
+	lg->guestid = i;
+	lg->pfn_limit = args[0];
+	lg->page_offset = args[3];
+
+	lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL);
+	if (!lg->trap_page) {
+		err = -ENOMEM;
+		goto release_guest;
+	}
+
+	err = init_guest_pagetable(lg, args[1]);
+	if (err)
+		goto free_trap_page;
+
+	lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]);
+	if (!lg->state) {
+		err = -ENOEXEC;
+		goto release_pgtable;
+	}
+	up(&lguest_lock);
+
+	/* The launching process services this guest from now on. */
+	lg->tsk = current;
+	lg->mm = get_task_mm(current);
+	file->private_data = lg;
+	return sizeof(args);
+
+release_pgtable:
+	free_guest_pagetable(lg);
+free_trap_page:
+	free_page((long)lg->trap_page);
+release_guest:
+	/* Zeroing the slot marks it free for find_free_guest(). */
+	memset(lg, 0, sizeof(*lg));
+unlock:
+	up(&lguest_lock);
+	return err;
+}
+
+/* /dev/lguest write(): first u32 is the request code, the rest are its
+ * arguments.  Only LHREQ_INITIALIZE is valid before a guest exists. */
+static ssize_t write(struct file *file, const char __user *input,
+		     size_t size, loff_t *off)
+{
+	struct lguest *lg = file->private_data;
+	u32 req;
+
+	if (get_user(req, input) != 0)
+		return -EFAULT;
+	/* Arguments follow the request word. */
+	input += sizeof(req);
+
+	if (req != LHREQ_INITIALIZE && !lg)
+		return -EINVAL;
+	if (lg && lg->dead)
+		return -ENOENT;
+
+	switch (req) {
+	case LHREQ_INITIALIZE:
+		return initialize(file, (const u32 __user *)input);
+	case LHREQ_GETDMA:
+		return user_get_dma(lg, (const u32 __user *)input);
+	case LHREQ_IRQ:
+		return user_send_irq(lg, (const u32 __user *)input);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* /dev/lguest release(): tear down the guest attached to this fd --
+ * DMA bindings, trap page, page tables, mm reference, death message --
+ * and return its slot and state to the free pool. */
+static int close(struct inode *inode, struct file *file)
+{
+	struct lguest *lg = file->private_data;
+
+	/* Never initialized: nothing to tear down. */
+	if (!lg)
+		return 0;
+
+	down(&lguest_lock);
+	release_all_dma(lg);
+	free_page((long)lg->trap_page);
+	free_guest_pagetable(lg);
+	mmput(lg->mm);
+	/* NOTE(review): read() treats (void *)-1 as the "no memory for
+	 * message" sentinel, but this guard tests (void *)1 -- confirm
+	 * which sentinel kill_guest() actually sets, else this can
+	 * kfree((void *)-1). */
+	if (lg->dead != (void *)1)
+		kfree(lg->dead);
+	memset(lg->state, 0, sizeof(*lg->state));
+	memset(lg, 0, sizeof(*lg));
+	up(&lguest_lock);
+	return 0;
+}
+
+/* File operations for /dev/lguest: write sends requests, read runs the
+ * guest, release tears it down. */
+static struct file_operations lguest_fops = {
+	.owner	 = THIS_MODULE,
+	.release = close,
+	.write	 = write,
+	.read	 = read,
+};
+
+/* Registered as a dynamic-minor misc device named "lguest". */
+static struct miscdevice lguest_dev = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "lguest",
+	.fops	= &lguest_fops,
+};
+
+/* Module init/exit hooks for the control device. */
+int __init lguest_device_init(void)
+{
+	return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+	misc_deregister(&lguest_dev);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/page_tables.c
@@ -0,0 +1,374 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBM Corporation 2006.
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define HYPERVISOR_PGD_ENTRY (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(u32 *, hypervisor_pte_pages) = { NULL };
+#define hypervisor_pte_page(cpu) per_cpu(hypervisor_pte_pages, cpu)
+
+/* Index of the toplevel (pgd) entry covering a virtual address. */
+static unsigned vaddr_to_pgd(unsigned long vaddr)
+{
+	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+}
+
+/* These access the real versions. */
+/* Pointer into shadow pgdir "i" for vaddr; guests may not touch the
+ * hypervisor's reserved top entry (index clamped to 0 after killing). */
+static u32 *toplev(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+	unsigned int index = vaddr_to_pgd(vaddr);
+
+	if (index >= HYPERVISOR_PGD_ENTRY) {
+		kill_guest(lg, "attempt to access hypervisor pages");
+		index = 0;
+	}
+	return &lg->pgdirs[i].pgdir[index];
+}
+
+/* Pointer to the shadow pte for vaddr, given its present toplevel entry. */
+static u32 *pteof(struct lguest *lg, u32 top, unsigned long vaddr)
+{
+	u32 *page = __va(top&PAGE_MASK);
+	BUG_ON(!(top & _PAGE_PRESENT));
+	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+}
+
+/* These access the guest versions. */
+/* Guest-physical address of the guest's toplevel entry for vaddr. */
+static u32 gtoplev(struct lguest *lg, unsigned long vaddr)
+{
+	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(u32);
+}
+
+/* Guest-physical address of the guest pte for vaddr, given its present
+ * guest toplevel entry. */
+static u32 gpteof(struct lguest *lg, u32 gtop, unsigned long vaddr)
+{
+	u32 gpage = (gtop&PAGE_MASK);
+	BUG_ON(!(gtop & _PAGE_PRESENT));
+	return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(u32);
+}
+
+/* Drop the page reference a present shadow pte holds (see get_pte). */
+static void release_pte(u32 pte)
+{
+	if (pte & _PAGE_PRESENT)
+		put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+/* Do a virtual -> physical mapping on a user page. */
+/* Pin the launcher page backing guest pfn "virtpfn" (force=1) and return
+ * its host pfn, or -1UL on failure.  The reference is dropped later via
+ * release_pte(). */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	unsigned long ret = -1UL;
+
+	down_read(&current->mm->mmap_sem);
+	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+			   1, write, 1, &page, &vma) == 1)
+		ret = page_to_pfn(page);
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/* Reject guest entries with disallowed flags or frames past the guest's
+ * memory limit; strip _PAGE_GLOBAL so guest mappings never stick in the
+ * TLB across guests. */
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+	if ((entry & (_PAGE_PWT|_PAGE_PSE))
+	    || (entry >> PAGE_SHIFT) >= lg->pfn_limit)
+		kill_guest(lg, "bad page table entry");
+	return entry & ~_PAGE_GLOBAL;
+}
+
+/* Build a shadow pte: translate the guest frame to a pinned host frame,
+ * keeping the guest's low flag bits.  Returns 0 (and kills the guest)
+ * if the page can't be pinned. */
+static u32 get_pte(struct lguest *lg, u32 entry, int write)
+{
+	u32 pfn;
+
+	pfn = get_pfn(entry >> PAGE_SHIFT, write);
+	if (pfn == -1UL) {
+		kill_guest(lg, "failed to get page %u", entry>>PAGE_SHIFT);
+		return 0;
+	}
+	return ((pfn << PAGE_SHIFT) | (entry & (PAGE_SIZE-1)));
+}
+
+/* FIXME: We hold reference to pages, which prevents them from being
+   swapped. It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Return NULL or the pte page. */
+/* Walk the guest pagetable for vaddr, build the matching shadow entry
+ * (allocating a shadow pte page if needed), and mirror accessed/dirty
+ * flags back into the guest's pte.  Returns 1 if mapped, 0 if the guest
+ * entry was absent or the access is a forbidden write. */
+static int page_in(struct lguest *lg, u32 vaddr, unsigned flags)
+{
+	u32 gtop, gpte;
+	u32 *top, *pte, *ptepage;
+	u32 val;
+
+	gtop = gtoplev(lg, vaddr);
+	val = lhread_u32(lg, gtop);
+	if (!(val & _PAGE_PRESENT))
+		return 0;
+
+	top = toplev(lg, lg->pgdidx, vaddr);
+	if (!(*top & _PAGE_PRESENT)) {
+		/* Get a PTE page for them. */
+		ptepage = (void *)get_zeroed_page(GFP_KERNEL);
+		/* FIXME: Steal from self in this case? */
+		if (!ptepage) {
+			kill_guest(lg, "out of memory allocating pte page");
+			return 0;
+		}
+		/* Shadow toplevel gets the guest's flag bits. */
+		val = check_pgtable_entry(lg, val);
+		*top = (__pa(ptepage) | (val & (PAGE_SIZE-1)));
+	} else
+		ptepage = __va(*top & PAGE_MASK);
+
+	gpte = gpteof(lg, val, vaddr);
+	val = lhread_u32(lg, gpte);
+
+	/* No page, or write to readonly page? */
+	if (!(val&_PAGE_PRESENT) || ((flags&_PAGE_DIRTY) && !(val&_PAGE_RW)))
+		return 0;
+
+	pte = pteof(lg, *top, vaddr);
+	val = check_pgtable_entry(lg, val) | flags;
+
+	/* We're done with the old pte. */
+	release_pte(*pte);
+
+	/* We don't make it writable if this isn't a write: later
+	 * write will fault so we can set dirty bit in guest. */
+	if (val & _PAGE_DIRTY)
+		*pte = get_pte(lg, val, 1);
+	else
+		*pte = get_pte(lg, val & ~_PAGE_RW, 0);
+
+	/* Now we update dirty/accessed on guest. */
+	lhwrite_u32(lg, gpte, val);
+	return 1;
+}
+
+/* Handle a guest page fault at vaddr: fault the page into the shadow
+ * tables, setting accessed (and dirty on writes).  Returns 1 on success. */
+int demand_page(struct lguest *lg, u32 vaddr, int write)
+{
+	return page_in(lg, vaddr, (write ? _PAGE_DIRTY : 0)|_PAGE_ACCESSED);
+}
+
+/* Pre-fault the guest kernel stack pages (downwards from tss.esp1) so
+ * the switcher never faults on them; kill the guest if any is bad. */
+void pin_stack_pages(struct lguest *lg)
+{
+	unsigned int i;
+	u32 stack = lg->state->tss.esp1;
+
+	for (i = 0; i < lg->stack_pages; i++)
+		if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+			kill_guest(lg, "bad stack page %i@%#x", i, stack);
+}
+
+/* Find which shadow pgdir slot caches guest toplevel "pgtable"; returns
+ * ARRAY_SIZE(lg->pgdirs) if none does. */
+static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
+{
+	unsigned int i;
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].cr3 == pgtable)
+			break;
+	return i;
+}
+
+/* Free one shadow toplevel entry: drop every pte's page reference, free
+ * the pte page, and clear the entry. */
+static void release_pgd(struct lguest *lg, u32 *pgd)
+{
+	if (*pgd & _PAGE_PRESENT) {
+		unsigned int i;
+		u32 *ptepage = __va(*pgd & ~(PAGE_SIZE-1));
+		for (i = 0; i < PTES_PER_PAGE; i++)
+			release_pte(ptepage[i]);
+		free_page((long)ptepage);
+		*pgd = 0;
+	}
+}
+
+/* Drop every shadow mapping below the guest's page_offset (i.e. all
+ * userspace mappings) in pgdir slot "idx". */
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+	unsigned int i;
+	for (i = 0; i < vaddr_to_pgd(lg->page_offset); i++)
+		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+}
+
+/* Guest hypercall: flush userspace mappings of the current pagetable. */
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+	flush_user_mappings(lg, lg->pgdidx);
+}
+
+/* Pick a shadow pgdir slot (random eviction) for guest toplevel "cr3",
+ * allocating a zeroed page if the slot is empty.  On allocation failure
+ * the current slot is reused.  User mappings of the chosen slot are
+ * always flushed. */
+static unsigned int new_pgdir(struct lguest *lg, u32 cr3)
+{
+	unsigned int next;
+
+	next = (lg->pgdidx + random32()) % ARRAY_SIZE(lg->pgdirs);
+	if (!lg->pgdirs[next].pgdir) {
+		lg->pgdirs[next].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL);
+		if (!lg->pgdirs[next].pgdir)
+			next = lg->pgdidx;
+	}
+	lg->pgdirs[next].cr3 = cr3;
+	/* Release all the non-kernel mappings. */
+	flush_user_mappings(lg, next);
+
+	return next;
+}
+
+/* Guest hypercall: switch to pagetable "pgtable" (cr3 write).  Re-uses a
+ * cached shadow if we have one, and re-pins the new stack pages. */
+void guest_new_pagetable(struct lguest *lg, u32 pgtable)
+{
+	int newpgdir;
+
+	newpgdir = find_pgdir(lg, pgtable);
+	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
+		newpgdir = new_pgdir(lg, pgtable);
+	lg->pgdidx = newpgdir;
+	lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
+	pin_stack_pages(lg);
+}
+
+/* Drop every shadow mapping (user and kernel) in every cached pgdir,
+ * leaving the hypervisor's own top entry alone. */
+static void release_all_pagetables(struct lguest *lg)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		if (lg->pgdirs[i].pgdir)
+			for (j = 0; j < HYPERVISOR_PGD_ENTRY; j++)
+				release_pgd(lg, lg->pgdirs[i].pgdir + j);
+}
+
+/* Guest hypercall: full TLB flush -- empty all shadows, then re-pin the
+ * stack so the guest can keep running. */
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+	release_all_pagetables(lg);
+	pin_stack_pages(lg);
+}
+
+/* Update the shadow pte for vaddr in pgdir slot "idx" to mirror guest
+ * value "val".  Entries not yet accessed/dirty are left absent so the
+ * first touch faults and updates the guest's flags. */
+static void do_set_pte(struct lguest *lg, int idx,
+		       unsigned long vaddr, u32 val)
+{
+	u32 *top = toplev(lg, idx, vaddr);
+	if (*top & _PAGE_PRESENT) {
+		u32 *pte = pteof(lg, *top, vaddr);
+		release_pte(*pte);
+		if (val & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+			val = check_pgtable_entry(lg, val);
+			*pte = get_pte(lg, val, val & _PAGE_DIRTY);
+		} else
+			*pte = 0;
+	}
+}
+
+/* Guest hypercall: the guest wrote pte "val" for vaddr in pagetable
+ * "cr3"; propagate it into the relevant shadow(s). */
+void guest_set_pte(struct lguest *lg,
+		   unsigned long cr3, unsigned long vaddr, u32 val)
+{
+	/* Kernel mappings must be changed on all top levels. */
+	if (vaddr >= lg->page_offset) {
+		unsigned int i;
+		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+			if (lg->pgdirs[i].pgdir)
+				do_set_pte(lg, i, vaddr, val);
+	} else {
+		/* User mapping: only the matching cached pgdir (if any). */
+		int pgdir = find_pgdir(lg, cr3);
+		if (pgdir != ARRAY_SIZE(lg->pgdirs))
+			do_set_pte(lg, pgdir, vaddr, val);
+	}
+}
+
+/* Guest hypercall: the guest changed toplevel entry "idx" of pagetable
+ * "cr3"; drop the matching shadow entry so it is rebuilt on demand.
+ * Writes to the hypervisor's reserved entry are silently ignored. */
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+	int pgdir;
+
+	if (idx >= HYPERVISOR_PGD_ENTRY)
+		return;
+
+	pgdir = find_pgdir(lg, cr3);
+	if (pgdir < ARRAY_SIZE(lg->pgdirs))
+		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+}
+
+/* Set up the first shadow pgdir for a new guest whose toplevel lives at
+ * guest address "pgtable".  Returns 0 or -errno. */
+int init_guest_pagetable(struct lguest *lg, u32 pgtable)
+{
+	/* We assume this in flush_user_mappings, so check now */
+	if (vaddr_to_pgd(lg->page_offset) >= HYPERVISOR_PGD_ENTRY)
+		return -EINVAL;
+	lg->pgdidx = 0;
+	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
+	lg->pgdirs[lg->pgdidx].pgdir = (u32*)get_zeroed_page(GFP_KERNEL);
+	if (!lg->pgdirs[lg->pgdidx].pgdir)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Guest teardown: release all shadow mappings and the pgdir pages
+ * themselves (free_page(NULL/0) is a no-op for empty slots). */
+void free_guest_pagetable(struct lguest *lg)
+{
+	unsigned int i;
+
+	release_all_pagetables(lg);
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+		free_page((long)lg->pgdirs[i].pgdir);
+}
+
+/* Caller must be preempt-safe */
+/* Map this guest's trap page as entry 0 of this cpu's hypervisor pte
+ * page, and point the guest's top shadow pgd entry at that pte page. */
+void map_trap_page(struct lguest *lg)
+{
+	int cpu = smp_processor_id();
+
+	hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+	/* Since the hypervisor is less than 4MB, we simply mug the top pte
+	 * page. */
+	lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+		(__pa(hypervisor_pte_page(cpu))| _PAGE_KERNEL);
+}
+
+/* Free every per-cpu hypervisor pte page (free_page(0) is a no-op, so
+ * partially-allocated state is safe). */
+static void free_hypervisor_pte_pages(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		free_page((long)hypervisor_pte_page(i));
+}
+
+/* Allocate one zeroed hypervisor pte page per possible cpu; on any
+ * failure, free what was allocated and return -ENOMEM. */
+static __init int alloc_hypervisor_pte_pages(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		hypervisor_pte_page(i) = (u32 *)get_zeroed_page(GFP_KERNEL);
+		if (!hypervisor_pte_page(i)) {
+			free_hypervisor_pte_pages();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/* Fill a cpu's hypervisor pte page with executable kernel mappings of
+ * the hypervisor pages, starting at slot 1. */
+static __init void populate_hypervisor_pte_page(int cpu)
+{
+	int i;
+	u32 *pte = hypervisor_pte_page(cpu);
+
+	for (i = 0; i < HYPERVISOR_PAGES; i++) {
+		/* First entry set dynamically in map_trap_page */
+		pte[i+1] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT)
+			    | _PAGE_KERNEL_EXEC);
+	}
+}
+
+/* Module init: allocate and populate the per-cpu hypervisor pte pages.
+ * NOTE(review): the hype_pages parameter is unused here and appears to
+ * shadow a global that populate_hypervisor_pte_page() uses -- confirm. */
+__init int init_pagetables(struct page hype_pages[])
+{
+	int ret;
+	unsigned int i;
+
+	ret = alloc_hypervisor_pte_pages();
+	if (ret)
+		return ret;
+
+	for_each_possible_cpu(i)
+		populate_hypervisor_pte_page(i);
+	return 0;
+}
+
+/* Module exit: undo init_pagetables(). */
+__exit void free_pagetables(void)
+{
+	free_hypervisor_pte_pages();
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/segments.c
@@ -0,0 +1,171 @@
+#include "lg.h"
+
+/* Dealing with GDT entries is such a horror, I convert to sanity and back */
+/* Unpacked form of an x86 segment descriptor: linear base, limit, and
+ * the attribute bits either as named fields or as one raw 16-bit word. */
+struct decoded_gdt_entry
+{
+	u32 base, limit;
+	union {
+		struct {
+			unsigned type:4;
+			unsigned dtype:1;	/* 1 = code/data descriptor */
+			unsigned dpl:2;
+			unsigned present:1;
+			unsigned unused:4;	/* limit bits 16-19 hole */
+			unsigned avl:1;
+			unsigned mbz:1;		/* must be zero */
+			unsigned def:1;		/* D/B: default op size */
+			unsigned page_granularity:1;
+		};
+		u16 raw_attributes;
+	};
+};
+
+/* Unpack a raw two-word GDT descriptor into the sane form above. */
+static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en)
+{
+	struct decoded_gdt_entry de;
+	de.base = ((en->a >> 16) | ((en->b & 0xff) << 16)
+		   | (en->b & 0xFF000000));
+	de.limit = ((en->a & 0xFFFF) | (en->b & 0xF0000));
+	de.raw_attributes = (en->b >> 8);
+	return de;
+}
+
+/* Repack the sane form into a raw GDT descriptor (inverse of decode). */
+static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de)
+{
+	struct desc_struct en;
+	en.a = ((de->limit & 0xFFFF) | (de->base << 16));
+	en.b = (((de->base >> 16) & 0xFF)
+		| ((((u32)de->raw_attributes) & 0xF0FF) << 8)
+		| (de->limit & 0xF0000)
+		| (de->base & 0xFF000000));
+	return en;
+}
+
+/* Accept only code/data descriptors (dtype 1, mbz clear) that are not
+ * expand-down/conforming (type bit 2 clear). */
+static int check_desc(const struct decoded_gdt_entry *dec)
+{
+	return (dec->mbz == 0 && dec->dtype == 1 && (dec->type & 4) == 0);
+}
+
+/* Zero a saved segment register if its selector is out of range or its
+ * descriptor is not present (loading it would fault). */
+static void check_segment(const struct desc_struct *gdt, u32 *segreg)
+{
+	if (*segreg > 255 || !(gdt[*segreg >> 3].b & 0x8000))
+		*segreg = 0;
+}
+
+/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */
+static void check_live_segments(const struct desc_struct *gdt,
+				struct lguest_regs *regs)
+{
+	check_segment(gdt, &regs->es);
+	check_segment(gdt, &regs->ds);
+	check_segment(gdt, &regs->fs);
+	check_segment(gdt, &regs->gs);
+}
+
+/* Sanitize a guest-supplied GDT in place so no descriptor can reach
+ * hypervisor memory or run at ring 0, then install the hypervisor's own
+ * code/data segments and the TSS entry.  Returns 1 if OK, 0 if any
+ * descriptor is unacceptable. */
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss)
+{
+	unsigned int i;
+	struct decoded_gdt_entry dec;
+
+	for (i = 0; i < num; i++) {
+		unsigned long base, length;
+
+		/* We override these ones, so we don't care what they give. */
+		if (i == GDT_ENTRY_TSS
+		    || i == GDT_ENTRY_LGUEST_CS
+		    || i == GDT_ENTRY_LGUEST_DS
+		    || i == GDT_ENTRY_DOUBLEFAULT_TSS)
+			continue;
+
+		dec = decode_gdt_entry(&gdt[i]);
+		if (!dec.present)
+			continue;
+
+		if (!check_desc(&dec))
+			return 0;
+
+		base = dec.base;
+		length = dec.limit + 1;
+		if (dec.page_granularity) {
+			base *= PAGE_SIZE;
+			length *= PAGE_SIZE;
+		}
+
+		/* Unacceptable base? */
+		if (base >= HYPE_ADDR)
+			return 0;
+
+		/* Wrap around or segment overlaps hypervisor mem? */
+		if (!length
+		    || base + length < base
+		    || base + length > HYPE_ADDR) {
+			/* Trim to edge of hypervisor. */
+			length = HYPE_ADDR - base;
+			if (dec.page_granularity)
+				dec.limit = (length / PAGE_SIZE) - 1;
+			else
+				dec.limit = length - 1;
+		}
+		/* Demote ring-0 descriptors to the guest's privilege. */
+		if (dec.dpl == 0)
+			dec.dpl = GUEST_DPL;
+		gdt[i] = encode_gdt_entry(&dec);
+	}
+	/* Segment regs referencing now-invalid entries get cleared. */
+	check_live_segments(gdt, regs);
+
+	/* Now put in hypervisor data and code segments. */
+	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+	/* Finally, TSS entry */
+	dec.base = (unsigned long)tss;
+	dec.limit = sizeof(*tss)-1;
+	dec.type = 0x9;		/* available 32-bit TSS */
+	dec.dtype = 0;		/* system descriptor */
+	dec.def = 0;
+	dec.present = 1;
+	dec.mbz = 0;
+	dec.page_granularity = 0;
+	gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec);
+
+	return 1;
+}
+
+/* Guest hypercall: load a GDT of "num" entries from guest address
+ * "table", sanitized via fixup_gdt_table() so the guest cannot install
+ * descriptors reaching hypervisor memory. */
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
+{
+	/* Refuse oversized tables *before* copying: kill_guest() only
+	 * marks the guest dead, so without this return the lhread()
+	 * below would still copy "num" entries and overflow the
+	 * fixed-size gdt_table in host memory. */
+	if (num > GDT_ENTRIES) {
+		kill_guest(lg, "too many gdt entries %i", num);
+		return;
+	}
+
+	lhread(lg, lg->state->gdt_table, table,
+	       num * sizeof(lg->state->gdt_table[0]));
+	if (!fixup_gdt_table(lg->state->gdt_table, num,
+			     &lg->state->regs, &lg->state->tss))
+		kill_guest(lg, "bad gdt table");
+}
+
+/* We don't care about limit here, since we only let them use these in
+ * usermode (where lack of USER bit in pagetable protects hypervisor mem).
+ * However, we want to ensure it doesn't fault when loaded, since *we* are
+ * the ones who will load it in switch_to_guest.
+ */
+/* Guest hypercall: copy GDT_ENTRY_TLS_ENTRIES descriptors from guest
+ * address "gtls" into the TLS slots of the shadow GDT.  Real limits are
+ * remembered in lg->tls_limits; the installed copies are truncated to a
+ * minimal limit so they can never reach hypervisor memory. */
+void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
+{
+	unsigned int i;
+	struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN];
+
+	lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+	for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) {
+		struct decoded_gdt_entry dec = decode_gdt_entry(&tls[i]);
+
+		if (!dec.present)
+			continue;
+
+		/* We truncate to one byte/page (depending on G bit) to neuter
+		   it, so ensure it's more than 1 page below trap page. */
+		tls[i].a &= 0xFFFF0000;
+		lg->tls_limits[i] = dec.limit;
+		if (!check_desc(&dec) || dec.base > HYPE_ADDR - PAGE_SIZE)
+			kill_guest(lg, "bad TLS descriptor %i", i);
+	}
+	/* Live segment regs may have pointed at the old TLS entries. */
+	check_live_segments(lg->state->gdt_table, &lg->state->regs);
+}


2007-02-09 10:58:44

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 6c/10] lguest: the guest code

This is the guest code which replaces the parts of paravirt_ops with
hypercalls. It's fairly trivial. This patch also includes trivial
bus driver for lguest devices.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest.c
@@ -0,0 +1,595 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell <[email protected]> IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <asm/paravirt.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/pda.h>
+#include <asm/asm-offsets.h>
+
+extern int mce_disabled;
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+void async_hcall(unsigned long call,
+ unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ /* Note: This code assumes we're uniprocessor. */
+ static unsigned int next_call;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (lguest_data.hcall_status[next_call] != 0xFF) {
+ /* Table full, so do normal hcall which will flush table. */
+ hcall(call, arg1, arg2, arg3);
+ } else {
+ lguest_data.hcalls[next_call].eax = call;
+ lguest_data.hcalls[next_call].edx = arg1;
+ lguest_data.hcalls[next_call].ebx = arg2;
+ lguest_data.hcalls[next_call].ecx = arg3;
+ wmb();
+ lguest_data.hcall_status[next_call] = 0;
+ if (++next_call == LHCALL_RING_SIZE)
+ next_call = 0;
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE /* Not in 2.6.20. */
+static int lazy_mode;
+static void fastcall lguest_lazy_mode(int mode)
+{
+ lazy_mode = mode;
+ if (mode == PARAVIRT_LAZY_NONE)
+ hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+static void lazy_hcall(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3)
+{
+ if (lazy_mode == PARAVIRT_LAZY_NONE)
+ hcall(call, arg1, arg2, arg3);
+ else
+ async_hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+static unsigned long fastcall save_fl(void)
+{
+ return lguest_data.irq_enabled;
+}
+
+static void fastcall restore_fl(unsigned long flags)
+{
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = flags;
+}
+
+static void fastcall irq_disable(void)
+{
+ lguest_data.irq_enabled = 0;
+}
+
+static void fastcall irq_enable(void)
+{
+ /* Linux i386 code expects bit 9 set. */
+ /* FIXME: Check if interrupt pending... */
+ lguest_data.irq_enabled = 512;
+}
+
+static void fastcall lguest_load_gdt(const struct Xgt_desc_struct *desc)
+{
+ BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
+ hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+}
+
+static void fastcall lguest_load_idt(const struct Xgt_desc_struct *desc)
+{
+ unsigned int i;
+ struct desc_struct *idt = (void *)desc->address;
+
+ for (i = 0; i < (desc->size+1)/8; i++)
+ hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+}
+
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+ hcall(LHCALL_CRASH, __pa(p), 0, 0);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+ .notifier_call = lguest_panic
+};
+
+static cycle_t lguest_clock_read(void)
+{
+ /* FIXME: This is just the native one. Account stolen time! */
+ return paravirt_ops.read_tsc();
+}
+
+/* FIXME: Update iff tsc rate changes. */
+static struct clocksource lguest_clock = {
+ .name = "lguest",
+ .rating = 400,
+ .read = lguest_clock_read,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .is_continuous = 1,
+};
+
+static char *lguest_memory_setup(void)
+{
+ /* We do these here because lockcheck barfs if before start_kernel */
+ atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+ lguest_clock.mult = lguest_data.clock_mult;
+ clocksource_register(&lguest_clock);
+
+ e820.nr_map = 0;
+ add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+ return "LGUEST";
+}
+
+static fastcall void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ int is_feature = (*eax == 1);
+
+ asm volatile ("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+
+ if (is_feature) {
+ unsigned long *excap = (unsigned long *)ecx,
+ *features = (unsigned long *)edx;
+ /* Hypervisor needs to know when we flush kernel pages. */
+ set_bit(X86_FEATURE_PGE, features);
+ /* We don't have any features! */
+ clear_bit(X86_FEATURE_VME, features);
+ clear_bit(X86_FEATURE_DE, features);
+ clear_bit(X86_FEATURE_PSE, features);
+ clear_bit(X86_FEATURE_PAE, features);
+ clear_bit(X86_FEATURE_SEP, features);
+ clear_bit(X86_FEATURE_APIC, features);
+ clear_bit(X86_FEATURE_MTRR, features);
+ /* No MWAIT, either */
+ clear_bit(3, excap);
+ }
+}
+
+static unsigned long current_cr3;
+static void fastcall lguest_write_cr3(unsigned long cr3)
+{
+ hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+ current_cr3 = cr3;
+}
+
+static void fastcall lguest_flush_tlb(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
+}
+
+static void fastcall lguest_flush_tlb_kernel(void)
+{
+ lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
+}
+
+static void fastcall lguest_flush_tlb_single(u32 addr)
+{
+ /* Simply set it to zero, and it will fault back in. */
+ lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
+}
+
+/* FIXME: Eliminate all callers of this. */
+static fastcall void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+ /* Don't bother with hypercall before initial setup. */
+ if (current_cr3)
+ hcall(LHCALL_SET_UNKNOWN_PTE, 0, 0, 0);
+}
+
+static fastcall void lguest_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+ lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
+}
+
+/* We only support two-level pagetables at the moment. */
+static fastcall void lguest_set_pud(pmd_t *pmdp, pmd_t pmdval)
+{
+ *pmdp = pmdval;
+ lazy_hcall(LHCALL_SET_PUD, __pa(pmdp)&PAGE_MASK,
+ (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static fastcall void lguest_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall void lguest_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall unsigned long lguest_apic_read(unsigned long reg)
+{
+ return 0;
+}
+#endif
+
+/* We move eflags word to lguest_data.irq_enabled to restore interrupt
+ state. For page faults, gpfs and virtual interrupts, the
+ hypervisor has saved eflags manually, otherwise it was delivered
+ directly and so eflags reflects the real machine IF state,
+ ie. interrupts on. Since the kernel always dies if it takes such a
+ trap with interrupts disabled anyway, turning interrupts back on
+ unconditionally here is OK. */
+asm("lguest_iret:"
+ " pushl %eax;"
+ " movl 12(%esp), %eax;"
+ "lguest_noirq_start:;"
+ " movl %eax,%ss:lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";"
+ " popl %eax;"
+ " iret;"
+ "lguest_noirq_end:");
+extern void fastcall lguest_iret(void);
+extern char lguest_noirq_start[], lguest_noirq_end[];
+
+static void fastcall lguest_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+ THREAD_SIZE/PAGE_SIZE);
+}
+
+static fastcall void lguest_load_tr_desc(void)
+{
+}
+
+static fastcall void lguest_set_ldt(const void *addr, unsigned entries)
+{
+ /* FIXME: Implement. */
+ BUG_ON(entries);
+}
+
+static fastcall void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
+}
+
+static fastcall void lguest_set_debugreg(int regno, unsigned long value)
+{
+ /* FIXME: Implement */
+}
+
+static unsigned int lguest_cr0;
+static fastcall void lguest_clts(void)
+{
+ lazy_hcall(LHCALL_TS, 0, 0, 0);
+ lguest_cr0 &= ~8U;
+}
+
+static fastcall unsigned long lguest_read_cr0(void)
+{
+ return lguest_cr0;
+}
+
+static fastcall void lguest_write_cr0(unsigned long val)
+{
+ hcall(LHCALL_TS, val & 8, 0, 0);
+ lguest_cr0 = val;
+}
+
+static fastcall unsigned long lguest_read_cr2(void)
+{
+ return lguest_data.cr2;
+}
+
+static fastcall unsigned long lguest_read_cr3(void)
+{
+ return current_cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care. */
+static fastcall unsigned long lguest_read_cr4(void)
+{
+ return 0;
+}
+
+static fastcall void lguest_write_cr4(unsigned long val)
+{
+}
+
+/* FIXME: These should be in a header somewhere */
+extern unsigned long init_pg_tables_end;
+
+static void fastcall lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+ do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0));
+ update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static void disable_lguest_irq(unsigned int irq)
+{
+ set_bit(irq, lguest_data.interrupts);
+}
+
+static void enable_lguest_irq(unsigned int irq)
+{
+ clear_bit(irq, lguest_data.interrupts);
+ /* FIXME: If it's pending? */
+}
+
+static struct irq_chip lguest_irq_controller = {
+ .name = "lguest",
+ .mask = disable_lguest_irq,
+ .mask_ack = disable_lguest_irq,
+ .unmask = enable_lguest_irq,
+};
+
+static void lguest_time_init(void)
+{
+ set_irq_handler(0, lguest_time_irq);
+ hcall(LHCALL_TIMER_START,HZ,0,0);
+}
+
+static void __init lguest_init_IRQ(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_IRQS; i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ if (vector != SYSCALL_VECTOR) {
+ set_intr_gate(vector, interrupt[i]);
+ set_irq_chip_and_handler(i, &lguest_irq_controller,
+ handle_level_irq);
+ }
+ }
+ irq_ctx_init(smp_processor_id());
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+ u32 *lp = (u32 *)((char *)dt + entry*8);
+ lp[0] = entry_low;
+ lp[1] = entry_high;
+}
+
+static fastcall void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ /* FIXME: Allow this. */
+ BUG();
+}
+
+static fastcall void lguest_write_gdt_entry(void *dt, int entrynum,
+ u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static fastcall void lguest_write_idt_entry(void *dt, int entrynum,
+ u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled)
+#define DEF_LGUEST(name, code) \
+ extern const char start_##name[], end_##name[]; \
+ asm("start_" #name ": " code "; end_" #name ":")
+DEF_LGUEST(cli, "movl $0," LGUEST_IRQ);
+DEF_LGUEST(sti, "movl $512," LGUEST_IRQ);
+DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ);
+DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax");
+DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ);
+DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... */
+
+static const struct lguest_insns
+{
+ const char *start, *end;
+} lguest_insns[] = {
+ [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+ [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+ [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+ [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+ [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+ [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+ unsigned int insn_len;
+
+ /* Don't touch it if we don't have a replacement */
+ if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+ return len;
+
+ insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+ /* Similarly if we can't fit replacement. */
+ if (len < insn_len)
+ return len;
+
+ memcpy(insns, lguest_insns[type].start, insn_len);
+ if (type == PARAVIRT_INTERRUPT_RETURN) {
+ /* Jumps are relative. */
+ u32 off = (u32)lguest_iret - ((u32)insns + insn_len);
+ memcpy(insns+1, &off, sizeof(off));
+ }
+ return insn_len;
+}
+
+static void fastcall lguest_safe_halt(void)
+{
+ hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+ return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+ hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static __attribute_used__ __init void lguest_init(void)
+{
+ extern struct Xgt_desc_struct cpu_gdt_descr;
+ extern struct i386_pda boot_pda;
+
+ paravirt_ops.name = "lguest";
+ paravirt_ops.paravirt_enabled = 1;
+ paravirt_ops.kernel_rpl = 1;
+
+ paravirt_ops.save_fl = save_fl;
+ paravirt_ops.restore_fl = restore_fl;
+ paravirt_ops.irq_disable = irq_disable;
+ paravirt_ops.irq_enable = irq_enable;
+ paravirt_ops.load_gdt = lguest_load_gdt;
+ paravirt_ops.memory_setup = lguest_memory_setup;
+ paravirt_ops.cpuid = lguest_cpuid;
+ paravirt_ops.write_cr3 = lguest_write_cr3;
+ paravirt_ops.flush_tlb_user = lguest_flush_tlb;
+ paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+ paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+ paravirt_ops.set_pte = lguest_set_pte;
+ paravirt_ops.set_pte_at = lguest_set_pte_at;
+ paravirt_ops.set_pmd = lguest_set_pud;
+#ifdef CONFIG_X86_LOCAL_APIC
+ paravirt_ops.apic_write = lguest_apic_write;
+ paravirt_ops.apic_write_atomic = lguest_apic_write_atomic;
+ paravirt_ops.apic_read = lguest_apic_read;
+#endif
+ paravirt_ops.load_idt = lguest_load_idt;
+ paravirt_ops.iret = lguest_iret;
+ paravirt_ops.load_esp0 = lguest_load_esp0;
+ paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+ paravirt_ops.set_ldt = lguest_set_ldt;
+ paravirt_ops.load_tls = lguest_load_tls;
+ paravirt_ops.set_debugreg = lguest_set_debugreg;
+ paravirt_ops.clts = lguest_clts;
+ paravirt_ops.read_cr0 = lguest_read_cr0;
+ paravirt_ops.write_cr0 = lguest_write_cr0;
+ paravirt_ops.init_IRQ = lguest_init_IRQ;
+ paravirt_ops.read_cr2 = lguest_read_cr2;
+ paravirt_ops.read_cr3 = lguest_read_cr3;
+ paravirt_ops.read_cr4 = lguest_read_cr4;
+ paravirt_ops.write_cr4 = lguest_write_cr4;
+ paravirt_ops.write_ldt_entry = lguest_write_ldt_entry;
+ paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+ paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+ paravirt_ops.patch = lguest_patch;
+ paravirt_ops.safe_halt = lguest_safe_halt;
+ paravirt_ops.get_wallclock = lguest_get_wallclock;
+ paravirt_ops.time_init = lguest_time_init;
+#ifdef PARAVIRT_LAZY_NONE
+ paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+#endif
+
+ memset(lguest_data.hcall_status,0xFF,sizeof(lguest_data.hcall_status));
+ lguest_data.noirq_start = (u32)lguest_noirq_start;
+ lguest_data.noirq_end = (u32)lguest_noirq_end;
+ hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+ strncpy(saved_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+
+ /* We use top of mem for initial pagetables. */
+ init_pg_tables_end = __pa(pg0);
+
+ /* set up PDA descriptor */
+ pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a,
+ (u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b,
+ (unsigned)&boot_pda, sizeof(boot_pda)-1,
+ 0x80 | DESCTYPE_S | 0x02, 0);
+ load_gdt(&cpu_gdt_descr);
+ asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+
+ reserve_top_address(lguest_data.reserve_mem);
+
+ cpu_detect(&new_cpu_data);
+ /* Need this before paging_init. */
+ set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability);
+ /* Math is always hard! */
+ new_cpu_data.hard_math = 1;
+
+ /* FIXME: Better way? */
+ /* Suppress vgacon startup code */
+ SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+ add_preferred_console("hvc", 0, NULL);
+
+#ifdef CONFIG_X86_MCE
+ mce_disabled = 1;
+#endif
+
+#ifdef CONFIG_ACPI
+ acpi_disabled = 1;
+ acpi_ht = 0;
+#endif
+ if (boot->initrd_size) {
+ /* We stash this at top of memory. */
+ INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+ INITRD_SIZE = boot->initrd_size;
+ LOADER_TYPE = 0xFF;
+ }
+
+ pm_power_off = lguest_power_off;
+ start_kernel();
+}
+
+asm("lguest_maybe_init:\n"
+ " cmpl $"__stringify(LGUEST_MAGIC_EBP)", %ebp\n"
+ " jne 1f\n"
+ " cmpl $"__stringify(LGUEST_MAGIC_EDI)", %edi\n"
+ " jne 1f\n"
+ " cmpl $"__stringify(LGUEST_MAGIC_ESI)", %esi\n"
+ " je lguest_init\n"
+ "1: ret");
+extern void asmlinkage lguest_maybe_init(void);
+paravirt_probe(lguest_maybe_init);
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_bus.c
@@ -0,0 +1,180 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/lguest_device.h>
+#include <asm/lguest.h>
+#include <asm/io.h>
+
+static ssize_t type_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%hu", lguest_devices[dev->index].type);
+}
+static ssize_t features_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%hx", lguest_devices[dev->index].features);
+}
+static ssize_t pfn_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
+}
+static ssize_t status_show(struct device *_dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ return sprintf(buf, "%hx", lguest_devices[dev->index].status);
+}
+static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
+ return -EINVAL;
+ return count;
+}
+static struct device_attribute lguest_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_RO(features),
+ __ATTR_RO(pfn),
+ __ATTR(status, 0644, status_show, status_store),
+ __ATTR_NULL
+};
+
+static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
+
+ return (drv->device_type == lguest_devices[dev->index].type);
+}
+
+struct lguest_bus {
+ struct bus_type bus;
+ struct device dev;
+};
+
+static struct lguest_bus lguest_bus = {
+ .bus = {
+ .name = "lguest",
+ .match = lguest_dev_match,
+ .dev_attrs = lguest_dev_attrs,
+ },
+ .dev = {
+ .parent = NULL,
+ .bus_id = "lguest",
+ }
+};
+
+static int lguest_dev_probe(struct device *_dev)
+{
+ int ret;
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ struct lguest_driver *drv = container_of(dev->dev.driver,
+ struct lguest_driver, drv);
+
+ lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
+ ret = drv->probe(dev);
+ if (ret == 0)
+ lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
+ return ret;
+}
+
+static int lguest_dev_remove(struct device *_dev)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+ struct lguest_driver *drv = container_of(dev->dev.driver,
+ struct lguest_driver, drv);
+
+ if (dev->dev.driver && drv->remove)
+ drv->remove(dev);
+ put_device(&dev->dev);
+ return 0;
+}
+
+int register_lguest_driver(struct lguest_driver *drv)
+{
+ if (!lguest_devices)
+ return 0;
+
+ drv->drv.bus = &lguest_bus.bus;
+ drv->drv.name = drv->name;
+ drv->drv.owner = drv->owner;
+ drv->drv.probe = lguest_dev_probe;
+ drv->drv.remove = lguest_dev_remove;
+
+ return driver_register(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(register_lguest_driver);
+
+void unregister_lguest_driver(struct lguest_driver *drv)
+{
+ if (!lguest_devices)
+ return;
+
+ driver_unregister(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(unregister_lguest_driver);
+
+static void release_lguest_device(struct device *_dev)
+{
+ struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+
+ lguest_devices[dev->index].status |= LGUEST_DEVICE_S_REMOVED_ACK;
+ kfree(dev);
+}
+
+static void add_lguest_device(unsigned int index)
+{
+ struct lguest_device *new;
+
+ lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
+ new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
+ if (!new) {
+ printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
+ lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+ return;
+ }
+
+ new->index = index;
+ new->private = NULL;
+ memset(&new->dev, 0, sizeof(new->dev));
+ new->dev.parent = &lguest_bus.dev;
+ new->dev.bus = &lguest_bus.bus;
+ new->dev.release = release_lguest_device;
+ sprintf(new->dev.bus_id, "%u", index);
+ if (device_register(&new->dev) != 0) {
+ printk(KERN_EMERG "Cannot register lguest device %u\n", index);
+ lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+ kfree(new);
+ }
+}
+
+static void scan_devices(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_MAX_DEVICES; i++)
+ if (lguest_devices[i].type)
+ add_lguest_device(i);
+}
+
+static int __init lguest_bus_init(void)
+{
+ if (strcmp(paravirt_ops.name, "lguest") != 0)
+ return 0;
+
+ /* Devices are in page above top of "normal" mem. */
+ lguest_devices = ioremap(max_pfn << PAGE_SHIFT, PAGE_SIZE);
+
+ if (bus_register(&lguest_bus.bus) != 0
+ || device_register(&lguest_bus.dev) != 0)
+ panic("lguest bus registration failed");
+
+ scan_devices();
+ return 0;
+}
+postcore_initcall(lguest_bus_init);


2007-02-09 10:59:38

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 6d/10] lguest: the Makefiles

Finally, we put in the Makefile, so it will build.

You can see the pain involved in creating the switcher code
(hypervisor.S) ready to be copied into the top of memory.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -108,6 +108,7 @@ drivers-$(CONFIG_PCI) += arch/i386/pci
# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/
drivers-$(CONFIG_PM) += arch/i386/power/
+drivers-$(CONFIG_LGUEST_GUEST) += arch/i386/lguest/

CFLAGS += $(mflags-y)
AFLAGS += $(mflags-y)
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/Makefile
@@ -0,0 +1,22 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST) += lg.o
+lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+ segments.o io.o lguest_user.o
+
+# We use top 4MB for guest traps page, then hypervisor. */
+HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+$(obj)/core.o: $(obj)/hypervisor-blob.c
+# This links the hypervisor in the right place and turns it into a C array.
+$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+ @$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+ @od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
+clean-files := hypervisor-blob.c hypervisor-raw


2007-02-09 11:53:31

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/10] lguest: Don't rely on last-linked fallthru when no paravirt handler

On Fri, 2007-02-09 at 10:31 +0100, Andi Kleen wrote:
> On Friday 09 February 2007 10:14, Rusty Russell wrote:
>
> > +unhandled_paravirt:
> > + /* Nothing wanted us: try to die with dignity (impossible trap). */
> > + movl $0x1F, %edx
> > + pushl $0
> > + jmp early_fault
>
> Please print a real message with early_printk

If we make it through early_fault, this will do just that.

Given this is a "never happens" situation, however... if you're actually
under Xen or lguest, you won't make it that far (lguest, at least, will
kill you on the cr2 load in early_fault, but it doesn't matter because
we won't get anywhere with early_printk anyway).

Actually, if we did BUG() here at least lguest would print something...
I wonder what Xen would do...

Rusty.


2007-02-09 12:06:53

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 2/10] lguest: Export symbols for lguest as a module

On Fri, 2007-02-09 at 10:32 +0100, Andi Kleen wrote:
> On Friday 09 February 2007 10:15, Rusty Russell wrote:
>
> > tsc_khz:
> > Simplest way of telling the guest how to interpret the TSC
> > counter.
>
>
> Are you sure this will work with varying TSC frequencies?

I'm actually quite sure it doesn't (there's a FIXME in the lguest code).
Given the debate over how useful the TSC was, I originally didn't use
it, but (1) it's simple, and (2) when it doesn't change, it's pretty
accurate.

> In general you should get this from cpufreq.

Hmm, ok, I'll bite: how? Time is a mystery I've avoided so far 8)

Thanks!
Rusty.


2007-02-09 13:58:58

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2/10] lguest: Export symbols for lguest as a module

On Fri, Feb 09, 2007 at 11:06:06PM +1100, Rusty Russell wrote:
> On Fri, 2007-02-09 at 10:32 +0100, Andi Kleen wrote:
> > On Friday 09 February 2007 10:15, Rusty Russell wrote:
> >
> > > tsc_khz:
> > > Simplest way of telling the guest how to interpret the TSC
> > > counter.
> >
> >
> > Are you sure this will work with varying TSC frequencies?
>
> I'm actually quite sure it doesn't (there's a FIXME in the lguest code).
> Given the debate over how useful the TSC was, I originally didn't use
> it, but (1) it's simple, and (2) when it doesn't change, it's pretty
> accurate.

But when it changes, users become pretty unhappy.

>
> > In general you should get this from cpufreq.
>
> Hmm, ok, I'll bite: how? Time is a mystery I've avoided so far 8)

the old x86-64 time.c (before -mm) has a example in #ifdef CONFIG_CPUFREQ


-Andi

2007-02-09 17:08:06

by Len Brown

[permalink] [raw]
Subject: Re: [PATCH 6c/10] lguest: the guest code

On Friday 09 February 2007 05:57, Rusty Russell wrote:

> +#ifdef CONFIG_ACPI
> +       acpi_disabled = 1;
> +       acpi_ht = 0;
> +#endif

If this is hard-coded to have ACPI disabled, why isn't it enforced at build-time?

thanks,
-Len

2007-02-09 17:14:10

by James Morris

[permalink] [raw]
Subject: Re: [PATCH 6c/10] lguest: the guest code

On Fri, 9 Feb 2007, Len Brown wrote:

> On Friday 09 February 2007 05:57, Rusty Russell wrote:
>
> > +#ifdef CONFIG_ACPI
> > +        acpi_disabled = 1;
> > +        acpi_ht = 0;
> > +#endif
>
> If this is hard-coded to have ACPI disabled, why isn't it enforced at build-time?

This is being disabled in the guest kernel only. The host and guest
kernels are expected to be the same build.



- James
--
James Morris
<[email protected]>

2007-02-09 17:51:30

by Len Brown

[permalink] [raw]
Subject: Re: [PATCH 6c/10] lguest: the guest code

On Friday 09 February 2007 12:14, James Morris wrote:
> On Fri, 9 Feb 2007, Len Brown wrote:
>
> > On Friday 09 February 2007 05:57, Rusty Russell wrote:
> >
> > > +#ifdef CONFIG_ACPI
> > > +       acpi_disabled = 1;
> > > +       acpi_ht = 0;
> > > +#endif
> >
> > If this is hard-coded to have ACPI disabled, why isn't it enforced at build-time?
>
> This is being disabled in the guest kernel only. The host and guest
> kernels are expected to be the same build.

Okay, but better to use disable_acpi()
indeed, since this would be the first code not already inside CONFIG_ACPI
to invoke disable_acpi(), we could define the inline as empty and you could
then scratch the #ifdef too.

cheers,
-Len

2007-02-09 20:49:09

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 1/10] lguest: Don't rely on last-linked fallthru when no paravirt handler

Rusty Russell wrote:
> If we make it through early_fault, this will do just that.
>
> Given this is a "never happens" situation, however... if you're actually
> under Xen or lguest, you won't make it that far (lguest, at least, will
> kill you on the cr2 load in early_fault, but it doesn't matter because
> we won't get anywhere with early_printk anyway).
>
> Actually, if we did BUG() here at least lguest would print something...
> I wonder what Xen would do...

Xen would print a complete register dump and backtrace. There's not a
lot else you can do here; you could try early_printk, but if we're in a
strange virtual environment, there may be no device on which the output
could appear.

J

2007-02-09 23:48:56

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 11/10] lguest: use disable_acpi()

On Fri, 2007-02-09 at 12:49 -0500, Len Brown wrote:
> On Friday 09 February 2007 12:14, James Morris wrote:
> > This is being disabled in the guest kernel only. The host and guest
> > kernels are expected to be the same build.
>
> Okay, but better to use disable_acpi()
> indeed, since this would be the first code not already inside CONFIG_ACPI
> to invoke disable_acpi(), we could define the inline as empty and you could
> then scratch the #ifdef too.

Thanks Len!

This applies on top of that series.

==
Len Brown <[email protected]> said:
> Okay, but better to use disable_acpi()
> indeed, since this would be the first code not already inside CONFIG_ACPI
> to invoke disable_acpi(), we could define the inline as empty and you could
> then scratch the #ifdef too.

Signed-off-by: Rusty Russell <[email protected]>

diff -r 85363b87e20b arch/i386/lguest/lguest.c
--- a/arch/i386/lguest/lguest.c Sat Feb 10 01:52:37 2007 +1100
+++ b/arch/i386/lguest/lguest.c Sat Feb 10 10:28:36 2007 +1100
@@ -555,10 +555,7 @@ static __attribute_used__ __init void lg
mce_disabled = 1;
#endif

-#ifdef CONFIG_ACPI
- acpi_disabled = 1;
- acpi_ht = 0;
-#endif
+ disable_acpi();
if (boot->initrd_size) {
/* We stash this at top of memory. */
INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
diff -r 85363b87e20b include/asm-i386/acpi.h
--- a/include/asm-i386/acpi.h Sat Feb 10 01:52:37 2007 +1100
+++ b/include/asm-i386/acpi.h Sat Feb 10 10:43:43 2007 +1100
@@ -127,6 +127,7 @@ extern int acpi_irq_balance_set(char *st
#define acpi_ioapic 0
static inline void acpi_noirq_set(void) { }
static inline void acpi_disable_pci(void) { }
+static inline void disable_acpi(void) { }

#endif /* !CONFIG_ACPI */




2007-02-10 11:40:18

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 2/10] lguest: Export symbols for lguest as a module

On Fri, 2007-02-09 at 14:58 +0100, Andi Kleen wrote:
> On Fri, Feb 09, 2007 at 11:06:06PM +1100, Rusty Russell wrote:
> > On Fri, 2007-02-09 at 10:32 +0100, Andi Kleen wrote:
> > > Are you sure this will work with varying TSC frequencies?
> >
> > I'm actually quite sure it doesn't (there's a FIXME in the lguest code).
> > Given the debate over how useful the TSC was, I originally didn't use
> > it, but (1) it's simple, and (2) when it doesn't change, it's pretty
> > accurate.
>
> But when it changes users become pretty unhappy

True. Simplest fix is below. There are several time issues on the TODO
list, and I will simply add this one.

Thanks!
Rusty.

lguest: Don't use the TSC in guest.

Andi complained that lguest guests don't deal with host TSC speed
changing. Close this can of worms by not using the TSC in the guest.
Later on we could do something clever when we overhaul this to deal
with stolen time (also on the TODO list).

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -475,4 +475,3 @@ static int __init init_tsc_clocksource(v
}

module_init(init_tsc_clocksource);
-EXPORT_SYMBOL_GPL(tsc_khz);
===================================================================
--- a/arch/i386/lguest/hypercalls.c
+++ b/arch/i386/lguest/hypercalls.c
@@ -18,7 +18,6 @@
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
-#include <linux/clocksource.h>
#include <asm/lguest.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -179,8 +178,6 @@ int hypercall(struct lguest *lg, struct
/* We reserve the top pgd entry. */
put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
put_user(lg->guestid, &lg->lguest_data->guestid);
- put_user(clocksource_khz2mult(tsc_khz, 22),
- &lg->lguest_data->clock_mult);
return 0;
}
pending = do_hcall(lg, regs);
===================================================================
--- a/arch/i386/lguest/lguest.c
+++ b/arch/i386/lguest/lguest.c
@@ -25,7 +25,6 @@
#include <linux/screen_info.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
-#include <linux/clocksource.h>
#include <asm/paravirt.h>
#include <asm/lguest.h>
#include <asm/lguest_user.h>
@@ -138,29 +137,10 @@ static struct notifier_block paniced = {
.notifier_call = lguest_panic
};

-static cycle_t lguest_clock_read(void)
-{
- /* FIXME: This is just the native one. Account stolen time! */
- return paravirt_ops.read_tsc();
-}
-
-/* FIXME: Update iff tsc rate changes. */
-static struct clocksource lguest_clock = {
- .name = "lguest",
- .rating = 400,
- .read = lguest_clock_read,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .is_continuous = 1,
-};
-
static char *lguest_memory_setup(void)
{
/* We do these here because lockcheck barfs if before start_kernel */
atomic_notifier_chain_register(&panic_notifier_list, &paniced);
- lguest_clock.mult = lguest_data.clock_mult;
- clocksource_register(&lguest_clock);

e820.nr_map = 0;
add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
===================================================================
--- a/include/asm-i386/lguest.h
+++ b/include/asm-i386/lguest.h
@@ -74,8 +74,6 @@ struct lguest_data
unsigned long reserve_mem;
/* ID of this guest (used by network driver to set ethernet address) */
u16 guestid;
- /* Multiplier for TSC clock. */
- u32 clock_mult;

/* Fields initialized by the guest at boot: */
/* Instruction range to suppress interrupts even if enabled */