LinuxLists.cc - Xen VMM patch set

2004-11-30 02:08:15

Subject: Xen VMM patch set - take 3

We didn't get much feedback from take 2, so hopefully we're
converging on something that's acceptable. The only major
difference between this set and the previous is the way we handle
the /dev/mem changes. I think the new approach is rather cleaner.

To get a working arch xen system you'll need the following set of
patches:

1. add ptep_establish_new to make va available
2. return code for arch_free_page
3. runtime disable of VT console
4. ARCH_HAS_DEV_MEM enables Xen to use its own /dev/mem definition
5. split free_irq into teardown_irq
6. alloc_skb_from_cache (already accepted by Dave Miller)
7. bug fix: handle frag'ed skbs in icmp_filter (already accepted by Dave Miller)

The actual new architecture, arch xen, is too big to post to the list,
so here's a link:
8. http://www.cl.cam.ac.uk/netos/xen/downloads/arch-xen.patch

Likewise for the virtual block, network, and console drivers:
9. http://www.cl.cam.ac.uk/netos/xen/downloads/drivers-xen.patch

Arch xen will be maintained by myself, Keir Fraser, Christian Limpach
and Steve Hand.

Cheers,
Ian

2004-11-30 02:11:19

by Ian Pratt

[permalink] [raw]

Subject: [2/7] Xen VMM #3: return code for arch_free_page

This patch adds a return value to the existing arch_free_page function
that indicates whether the normal free routine still has work to
do. The only architecture that currently uses arch_free_page is arch
'um'. arch xen needs this for 'foreign pages' - pages that don't
belong to the page allocator but are instead managed by custom
allocators. Such pages are marked using PG_arch_1.

Signed-off-by: [email protected]

---

diff -Nurp pristine-linux-2.6.10-rc2/include/linux/gfp.h tmp-linux-2.6.10-rc2-xen.patch/include/linux/gfp.h
--- pristine-linux-2.6.10-rc2/include/linux/gfp.h 2004-10-18 22:53:44.000000000 +0100
+++ tmp-linux-2.6.10-rc2-xen.patch/include/linux/gfp.h 2004-11-30 00:41:24.000000000 +0000
@@ -74,8 +74,12 @@ struct vm_area_struct;
* optimized to &contig_page_data at compile-time.
*/

+/*
+ * If arch_free_page returns non-zero then the generic free_page code can
+ * immediately bail: the arch-specific function has done all the work.
+ */
#ifndef HAVE_ARCH_FREE_PAGE
-static inline void arch_free_page(struct page *page, int order) { }
+#define arch_free_page(page, order) 0
#endif

extern struct page *
diff -Nurp pristine-linux-2.6.10-rc2/mm/page_alloc.c tmp-linux-2.6.10-rc2-xen.patch/mm/page_alloc.c
--- pristine-linux-2.6.10-rc2/mm/page_alloc.c 2004-11-30 01:20:25.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/mm/page_alloc.c 2004-11-30 00:41:24.000000000 +0000
@@ -278,7 +278,8 @@ void __free_pages_ok(struct page *page,
LIST_HEAD(list);
int i;

- arch_free_page(page, order);
+ if (arch_free_page(page, order))
+ return;

mod_page_state(pgfree, 1 << order);
for (i = 0 ; i < (1 << order) ; ++i)
@@ -508,7 +509,8 @@ static void fastcall free_hot_cold_page(
struct per_cpu_pages *pcp;
unsigned long flags;

- arch_free_page(page, 0);
+ if (arch_free_page(page, 0))
+ return;

kernel_map_pages(page, 1, 0);
inc_page_state(pgfree);
diff -Nurp pristine-linux-2.6.10-rc2/arch/um/kernel/physmem.c tmp-linux-2.6.10-rc2-xen.patch/arch/um/kernel/physmem.c
--- pristine-linux-2.6.10-rc2/arch/um/kernel/physmem.c 2004-11-19 20:04:30.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/arch/um/kernel/physmem.c 2004-11-19 20:05:33.000000000 +0000
@@ -225,7 +225,7 @@ EXPORT_SYMBOL(physmem_forget_descriptor)
EXPORT_SYMBOL(physmem_remove_mapping);
EXPORT_SYMBOL(physmem_subst_mapping);

-void arch_free_page(struct page *page, int order)
+void __arch_free_page(struct page *page, int order)
{
void *virt;
int i;
diff -Nurp pristine-linux-2.6.10-rc2/include/asm-um/page.h tmp-linux-2.6.10-rc2-xen.patch/include/asm-um/page.h
--- pristine-linux-2.6.10-rc2/include/asm-um/page.h 2004-11-19 20:04:52.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/include/asm-um/page.h 2004-11-19 20:05:33.000000000 +0000
@@ -46,7 +46,8 @@ extern void *to_virt(unsigned long phys)
extern struct page *arch_validate(struct page *page, int mask, int order);
#define HAVE_ARCH_VALIDATE

-extern void arch_free_page(struct page *page, int order);
+extern void __arch_free_page(struct page *page, int order);
+#define arch_free_page(page, order) (__arch_free_page((page), (order)), 0)
#define HAVE_ARCH_FREE_PAGE

#endif

2004-11-30 02:15:08

by Ian Pratt

[permalink] [raw]

Subject: [4/7] Xen VMM #3: ARCH_HAS_DEV_MEM

This patch adds ARCH_HAS_DEV_MEM, enabling per-architecture
implementations of /dev/mem and thus avoiding a number of messy
#ifdef's. In arch xen we need to use different functions for mapping
bus vs physical addresses. This allows the X server and dmidecode etc
to work as per normal.

Signed-off-by: [email protected]

---

diff -Nurp pristine-linux-2.6.10-rc2/drivers/char/mem.c tmp-linux-2.6.10-rc2-xen.patch/drivers/char/mem.c
--- pristine-linux-2.6.10-rc2/drivers/char/mem.c 2004-11-30 01:19:43.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/drivers/char/mem.c 2004-11-30 01:42:12.000000000 +0000
@@ -143,7 +143,7 @@ static ssize_t do_write_mem(void *p, uns
return written;
}

-
+#ifndef ARCH_HAS_DEV_MEM
/*
* This funcion reads the *physical* memory. The f_pos points directly to the
* memory location.
@@ -189,8 +189,9 @@ static ssize_t write_mem(struct file * f
return -EFAULT;
return do_write_mem(__va(p), p, buf, count, ppos);
}
+#endif

-static int mmap_mem(struct file * file, struct vm_area_struct * vma)
+static int mmap_kmem(struct file * file, struct vm_area_struct * vma)
{
#ifdef pgprot_noncached
unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
@@ -208,6 +209,7 @@ static int mmap_mem(struct file * file,
vma->vm_end-vma->vm_start,
vma->vm_page_prot))
return -EAGAIN;
+
return 0;
}

@@ -567,7 +569,7 @@ static int open_port(struct inode * inod
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}

-#define mmap_kmem mmap_mem
+#define mmap_mem mmap_kmem
#define zero_lseek null_lseek
#define full_lseek null_lseek
#define write_zero write_null
@@ -575,6 +577,7 @@ static int open_port(struct inode * inod
#define open_mem open_port
#define open_kmem open_mem

+#ifndef ARCH_HAS_DEV_MEM
static struct file_operations mem_fops = {
.llseek = memory_lseek,
.read = read_mem,
@@ -582,6 +585,9 @@ static struct file_operations mem_fops =
.mmap = mmap_mem,
.open = open_mem,
};
+#else
+extern struct file_operations mem_fops;
+#endif

static struct file_operations kmem_fops = {
.llseek = memory_lseek,

2004-11-30 02:12:26

by Ian Pratt

[permalink] [raw]

Subject: [3/7] Xen VMM #3: runtime disable of VT console

This patch enables the VT console to be disabled at runtime even if it
is built into the kernel. Arch xen needs this to avoid trying to
initialise a VT in a virtual machine that doesn't have access to the
console hardware.

Signed-off-by: [email protected]

---

diff -Nurp pristine-linux-2.6.10-rc2/drivers/char/tty_io.c tmp-linux-2.6.10-rc2-xen.patch/drivers/char/tty_io.c
--- pristine-linux-2.6.10-rc2/drivers/char/tty_io.c 2004-11-30 01:19:44.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/drivers/char/tty_io.c 2004-11-30 00:41:24.000000000 +0000
@@ -131,6 +131,8 @@ LIST_HEAD(tty_drivers); /* linked list
vt.c for deeply disgusting hack reasons */
DECLARE_MUTEX(tty_sem);

+int console_use_vt = 1;
+
#ifdef CONFIG_UNIX98_PTYS
extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
extern int pty_limit; /* Config limit on Unix98 ptys */
@@ -2964,14 +2966,19 @@ static int __init tty_init(void)
#endif

#ifdef CONFIG_VT
- cdev_init(&vc0_cdev, &console_fops);
- if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
- register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
- panic("Couldn't register /dev/tty0 driver\n");
- devfs_mk_cdev(MKDEV(TTY_MAJOR, 0), S_IFCHR|S_IRUSR|S_IWUSR, "vc/0");
- class_simple_device_add(tty_class, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
+ if (console_use_vt) {
+ cdev_init(&vc0_cdev, &console_fops);
+ if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
+ register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1,
+ "/dev/vc/0") < 0)
+ panic("Couldn't register /dev/tty0 driver\n");
+ devfs_mk_cdev(MKDEV(TTY_MAJOR, 0), S_IFCHR|S_IRUSR|S_IWUSR,
+ "vc/0");
+ class_simple_device_add(tty_class, MKDEV(TTY_MAJOR, 0), NULL,
+ "tty0");

- vty_init();
+ vty_init();
+ }
#endif
return 0;
}

2004-11-30 02:20:04

by Ian Pratt

[permalink] [raw]

Subject: [5/7] Xen VMM #3: split free_irq into teardown_irq

This patch moves the `unregister the irqaction' part of free_irq into
a new function teardown_irq, leaving only the mapping from dev_id to
irqaction and freeing the irqaction in free_irq. free_irq
calls teardown_irq to unregister the irqaction. This is similar
to how setup_irq and request_irq work for registering irq's.
We need teardown_irq to allow us to unregister irq's which were
registered early during boot when memory management wasn't ready
yet, i.e. irq's which were registered using setup_irq and use a static
irqaction which cannot be kfree'd.

Signed-off-by: [email protected]

---
diff -Nurp pristine-linux-2.6.10-rc2/include/linux/irq.h tmp-linux-2.6.10-rc2-xen.patch/include/linux/irq.h
--- pristine-linux-2.6.10-rc2/include/linux/irq.h 2004-11-30 01:20:24.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/include/linux/irq.h 2004-11-30 00:41:24.000000000 +0000
@@ -73,6 +73,7 @@ extern irq_desc_t irq_desc [NR_IRQS];
#include <asm/hw_irq.h> /* the arch dependent stuff */

extern int setup_irq(unsigned int irq, struct irqaction * new);
+extern int teardown_irq(unsigned int irq, struct irqaction * old);

#ifdef CONFIG_GENERIC_HARDIRQS
extern cpumask_t irq_affinity[NR_IRQS];
diff -Nurp pristine-linux-2.6.10-rc2/kernel/irq/manage.c tmp-linux-2.6.10-rc2-xen.patch/kernel/irq/manage.c
--- pristine-linux-2.6.10-rc2/kernel/irq/manage.c 2004-11-30 01:20:25.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/kernel/irq/manage.c 2004-11-30 00:54:43.000000000 +0000
@@ -144,9 +144,14 @@ int can_request_irq(unsigned int irq, un
return !action;
}

-/*
- * Internal function to register an irqaction - typically used to
- * allocate special interrupts that are part of the architecture.
+/**
+ * setup_irq - register an irqaction structure
+ * @irq: Interrupt to register
+ * @irqaction: The irqaction structure to be registered
+ *
+ * Normally called by request_irq, this function can be used
+ * directly to allocate special interrupts that are part of the
+ * architecture.
*/
int setup_irq(unsigned int irq, struct irqaction * new)
{
@@ -215,28 +220,27 @@ int setup_irq(unsigned int irq, struct i
return 0;
}

-/**
- * free_irq - free an interrupt
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
- *
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
+/*
+ * teardown_irq - unregister an irqaction
+ * @irq: Interrupt line being freed
+ * @old: Pointer to the irqaction that is to be unregistered
+ *
+ * This function is called by free_irq and does the actual
+ * business of unregistering the handler. It exists as a
+ * separate function to enable handlers to be unregistered
+ * for irqactions that have been allocated statically at
+ * boot time.
*
* This function must not be called from interrupt context.
*/
-void free_irq(unsigned int irq, void *dev_id)
+int teardown_irq(unsigned int irq, struct irqaction * old)
{
struct irq_desc *desc;
struct irqaction **p;
unsigned long flags;

if (irq >= NR_IRQS)
- return;
+ return -ENOENT;

desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock,flags);
@@ -248,7 +252,7 @@ void free_irq(unsigned int irq, void *de
struct irqaction **pp = p;

p = &action->next;
- if (action->dev_id != dev_id)
+ if (action != old)
continue;

/* Found it - now remove it from the list of entries */
@@ -265,13 +269,52 @@ void free_irq(unsigned int irq, void *de

/* Make sure it's not being used on another CPU */
synchronize_irq(irq);
- kfree(action);
- return;
+ return 0;
}
- printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+ printk(KERN_ERR "Trying to teardown free IRQ%d\n",irq);
spin_unlock_irqrestore(&desc->lock,flags);
+ return -ENOENT;
+ }
+}
+
+/**
+ * free_irq - free an interrupt
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove an interrupt handler. The handler is removed and if the
+ * interrupt line is no longer in use by any driver it is disabled.
+ * On a shared IRQ the caller must ensure the interrupt is disabled
+ * on the card it drives before calling this function. The function
+ * does not return until any executing interrupts for this IRQ
+ * have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc;
+ struct irqaction *action;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS)
+ return;
+
+ desc = irq_desc + irq;
+ spin_lock_irqsave(&desc->lock,flags);
+ for (action = desc->action; action != NULL; action = action->next) {
+ if (action->dev_id != dev_id)
+ continue;
+
+ spin_unlock_irqrestore(&desc->lock,flags);
+
+ if (teardown_irq(irq, action) == 0)
+ kfree(action);
return;
}
+ printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+ spin_unlock_irqrestore(&desc->lock,flags);
+ return;
}

EXPORT_SYMBOL(free_irq);

2004-11-30 02:21:40

by Ian Pratt

[permalink] [raw]

Subject: [7/7] Xen VMM #3: handle fragmented skbs correctly in icmp_filter

[NB: This patch has already been accepted by Dave Miller. I'm
only resending it such that the set is complete.]

Simple bug fix to icmp_filter -- handle fragmented skbs correctly.

Signed-off-by: [email protected]

---
diff -Nurp pristine-linux-2.6.10-rc2/net/ipv4/raw.c tmp-linux-2.6.10-rc2-xen.patch/net/ipv4/raw.c
--- pristine-linux-2.6.10-rc2/net/ipv4/raw.c 2004-11-30 01:20:26.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/net/ipv4/raw.c 2004-11-30 00:41:24.000000000 +0000
@@ -130,6 +130,9 @@ static __inline__ int icmp_filter(struct
{
int type;

+ if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+ return 1;
+
type = skb->h.icmph->type;
if (type < 32) {
__u32 data = raw4_sk(sk)->filter.data;

2004-11-30 02:32:35

by Ian Pratt

[permalink] [raw]

Subject: [6/7] Xen VMM #3: alloc_skb_from_cache

[NB: This patch has already been accepted by Dave Miller. I'm
only resending it such that the set is complete.]

This patch adds a new alloc_skb_from_cache function. This serves two
purposes: firstly, we like to allocate skb's with page-sized data
fragments as this means we can do zero-copy transfer of network
buffers between guest operating systems. Secondly, it enables us to
have a cache of pages that have been used for network buffers that we
can be more lax about scrubbing when they change VM ownership (since
they could be sniffed on the wire).

Signed-off-by: [email protected]

---
diff -Nurp pristine-linux-2.6.10-rc2/net/core/skbuff.c tmp-linux-2.6.10-rc2-xen.patch/net/core/skbuff.c
--- pristine-linux-2.6.10-rc2/net/core/skbuff.c 2004-11-30 01:20:26.000000000 +0000
+++ tmp-linux-2.6.10-rc2-xen.patch/net/core/skbuff.c 2004-11-30 00:41:24.000000000 +0000
@@ -163,6 +163,59 @@ nodata:
goto out;
}

+/**
+ * alloc_skb_from_cache - allocate a network buffer
+ * @cp: kmem_cache from which to allocate the data area
+ * (object size must be big enough for @size bytes + skb overheads)
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of size bytes. The object has a reference count of one.
+ * The return is the buffer. On a failure the return is %NULL.
+ *
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
+ */
+struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
+ unsigned int size, int gfp_mask)
+{
+ struct sk_buff *skb;
+ u8 *data;
+
+ /* Get the HEAD */
+ skb = kmem_cache_alloc(skbuff_head_cache,
+ gfp_mask & ~__GFP_DMA);
+ if (!skb)
+ goto out;
+
+ /* Get the DATA. */
+ size = SKB_DATA_ALIGN(size);
+ data = kmem_cache_alloc(cp, gfp_mask);
+ if (!data)
+ goto nodata;
+
+ memset(skb, 0, offsetof(struct sk_buff, truesize));
+ skb->truesize = size + sizeof(struct sk_buff);
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb->tail = data;
+ skb->end = data + size;
+
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->tso_size = 0;
+ skb_shinfo(skb)->tso_segs = 0;
+ skb_shinfo(skb)->frag_list = NULL;
+out:
+ return skb;
+nodata:
+ kmem_cache_free(skbuff_head_cache, skb);
+ skb = NULL;
+ goto out;
+}
+

static void skb_drop_fraglist(struct sk_buff *skb)
{
diff -Nurp pristine-linux-2.6.10-rc2/include/linux/skbuff.h tmp-linux-2.6.10-rc2-xen.patch/include/linux/skbuff.h
--- pristine-linux-2.6.10-rc2/include/linux/skbuff.h 2004-10-18 22:55:36.000000000 +0100
+++ tmp-linux-2.6.10-rc2-xen.patch/include/linux/skbuff.h 2004-11-30 00:41:24.000000000 +0000
@@ -292,6 +292,8 @@ struct sk_buff {

extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *alloc_skb(unsigned int size, int priority);
+extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
+ unsigned int size, int priority);
extern void kfree_skbmem(struct sk_buff *skb);
extern struct sk_buff *skb_clone(struct sk_buff *skb, int priority);
extern struct sk_buff *skb_copy(const struct sk_buff *skb, int priority);
@@ -935,6 +937,7 @@ static inline void __skb_queue_purge(str
*
* %NULL is returned in there is no free memory.
*/
+#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
int gfp_mask)
{
@@ -943,6 +946,9 @@ static inline struct sk_buff *__dev_allo
skb_reserve(skb, 16);
return skb;
}
+#else
+extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
+#endif

/**
* dev_alloc_skb - allocate an skbuff for sending

2004-11-30 18:10:45

by Rik van Riel

[permalink] [raw]

Subject: Re: Xen VMM patch set - take 3

On Tue, 30 Nov 2004, Ian Pratt wrote:

> We didn't get much feedback from take 2, so hopefully we're
> converging on something that's acceptable.

I see it's all been cleaned up, it all looks good to me.

> The only major difference between this set and the previous is the way
> we handle the /dev/mem changes. I think the new approach is rather
> cleaner.

Yay.

--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan