2001-12-04 02:37:13

by Arjan van de Ven

Subject: patch to no longer use ia64's software mmu

Hi,

The patch below (against 2.4.16) makes the ia64 port no longer use the (VERY
slow) software I/O MMU; instead it uses the same mechanism the x86 PAE port
uses: it lets the higher layers take care of the proper bouncing of
PCI-unreachable memory. The implementation is pretty simple; instead of
having a 4Gb GFP_DMA zone and a <rest of ram> GFP_KERNEL zone, the ia64 port
now has a 4Gb GFP_DMA zone and a <rest of ram> GFP_HIGH zone.
Since the ia64 CPU can address all of this memory directly, kmap() and
related functions are basically nops.
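
For illustration, a minimal sketch of that zone split, condensed from the
paging_init() hunk in arch/ia64/mm/init.c further down in the patch:

	unsigned long zones_size[MAX_NR_ZONES] = { 0, };
	unsigned long max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;

	if (max_low_pfn < max_dma) {
		/* everything is PCI-reachable: one DMA zone is enough */
		zones_size[ZONE_DMA] = max_low_pfn;
	} else {
		/* memory below 4Gb stays directly DMA-able ... */
		zones_size[ZONE_DMA] = max_dma;
		/* ... and memory above 4Gb becomes "highmem", so the generic
		   bounce-buffer code handles PCI-unreachable pages */
		zones_size[ZONE_HIGHMEM] = max_low_pfn - max_dma;
	}
	free_area_init(zones_size);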

The result: 100 Mbit ethernet performance on an ia64 machine with 32Gb of RAM
increased more than 4x (from 20 Mbit to 95 Mbit)....

The only downside is that the current kernel will always bounce-buffer disk
I/O even if the SCSI card is 64-bit PCI capable; Jens Axboe's block-highmem
patch fixes that downside nicely, though.

Greetings,
Arjan van de Ven

diff -urN Linux/arch/ia64/config.in linux/arch/ia64/config.in
--- Linux/arch/ia64/config.in Fri Nov 9 22:26:17 2001
+++ linux/arch/ia64/config.in Mon Dec 3 20:34:17 2001
@@ -25,6 +25,7 @@
define_bool CONFIG_SBUS n
define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y
define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM n
+define_bool CONFIG_HIGHMEM y

if [ "$CONFIG_IA64_HP_SIM" = "n" ]; then
define_bool CONFIG_ACPI y
diff -urN Linux/arch/ia64/kernel/Makefile linux/arch/ia64/kernel/Makefile
--- Linux/arch/ia64/kernel/Makefile Fri Nov 9 22:26:17 2001
+++ linux/arch/ia64/kernel/Makefile Mon Dec 3 20:45:28 2001
@@ -11,11 +11,11 @@

O_TARGET := kernel.o

-export-objs := ia64_ksyms.o
+export-objs := ia64_ksyms.o pci-dma.o

obj-y := acpi.o entry.o gate.o efi.o efi_stub.o ia64_ksyms.o irq.o irq_ia64.o irq_lsapic.o ivt.o \
machvec.o pal.o process.o perfmon.o ptrace.o sal.o semaphore.o setup.o \
- signal.o sys_ia64.o traps.o time.o unaligned.o unwind.o
+ signal.o sys_ia64.o traps.o time.o unaligned.o unwind.o pci-dma.o
obj-$(CONFIG_IA64_GENERIC) += iosapic.o
obj-$(CONFIG_IA64_DIG) += iosapic.o
obj-$(CONFIG_IA64_PALINFO) += palinfo.o
diff -urN Linux/arch/ia64/kernel/ia64_ksyms.c linux/arch/ia64/kernel/ia64_ksyms.c
--- Linux/arch/ia64/kernel/ia64_ksyms.c Fri Nov 9 22:26:17 2001
+++ linux/arch/ia64/kernel/ia64_ksyms.c Mon Dec 3 20:45:46 2001
@@ -6,6 +6,8 @@
#include <linux/module.h>

#include <linux/string.h>
+#include <linux/pci.h>
+
EXPORT_SYMBOL_NOVERS(memset);
EXPORT_SYMBOL(memchr);
EXPORT_SYMBOL(memcmp);
@@ -147,3 +149,5 @@
#include <linux/proc_fs.h>
extern struct proc_dir_entry *efi_dir;
EXPORT_SYMBOL(efi_dir);
+EXPORT_SYMBOL(pci_alloc_consistent);
+EXPORT_SYMBOL(pci_free_consistent);
diff -urN Linux/arch/ia64/kernel/pci-dma.c linux/arch/ia64/kernel/pci-dma.c
--- Linux/arch/ia64/kernel/pci-dma.c Thu Jan 1 01:00:00 1970
+++ linux/arch/ia64/kernel/pci-dma.c Mon Dec 3 20:34:17 2001
@@ -0,0 +1,38 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * On IA64 there is no hardware dynamic DMA address translation,
+ * so consistent alloc/free are merely page allocation/freeing.
+ * The rest of the dynamic DMA mapping interface is implemented
+ * in asm/pci.h.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+
+void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
+ dma_addr_t *dma_handle)
+{
+ void *ret;
+ int gfp = GFP_ATOMIC;
+
+ if (hwdev == NULL || hwdev->dma_mask != 0xffffffff)
+ gfp |= GFP_DMA;
+ ret = (void *)__get_free_pages(gfp, get_order(size));
+
+ if (ret != NULL) {
+ memset(ret, 0, size);
+ *dma_handle = virt_to_bus(ret);
+ }
+ return ret;
+}
+
+void pci_free_consistent(struct pci_dev *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle)
+{
+ free_pages((unsigned long)vaddr, get_order(size));
+}
diff -urN Linux/arch/ia64/lib/Makefile linux/arch/ia64/lib/Makefile
--- Linux/arch/ia64/lib/Makefile Tue Jul 31 18:30:08 2001
+++ linux/arch/ia64/lib/Makefile Mon Dec 3 20:34:17 2001
@@ -7,14 +7,14 @@

L_TARGET = lib.a

-export-objs := io.o swiotlb.o
+export-objs := io.o

obj-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
checksum.o clear_page.o csum_partial_copy.o copy_page.o \
copy_user.o clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
flush.o io.o do_csum.o \
- memcpy.o memset.o strlen.o swiotlb.o
+ memcpy.o memset.o strlen.o

IGNORE_FLAGS_OBJS = __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
diff -urN Linux/arch/ia64/mm/init.c linux/arch/ia64/mm/init.c
--- Linux/arch/ia64/mm/init.c Fri Nov 9 22:26:17 2001
+++ linux/arch/ia64/mm/init.c Mon Dec 3 20:34:17 2001
@@ -13,6 +13,7 @@
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/highmem.h>

#include <asm/bitops.h>
#include <asm/dma.h>
@@ -36,6 +37,7 @@
unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;

static unsigned long totalram_pages;
+static unsigned long totalhigh_pages;

int
do_check_pgt_cache (int low, int high)
@@ -160,8 +162,8 @@
val->sharedram = 0;
val->freeram = nr_free_pages();
val->bufferram = atomic_read(&buffermem_pages);
- val->totalhigh = 0;
- val->freehigh = 0;
+ val->totalhigh = totalhigh_pages;
+ val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
return;
}
@@ -349,12 +352,13 @@

memset(zones_size, 0, sizeof(zones_size));

- max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ max_dma = virt_to_phys((void *) (MAX_DMA_ADDRESS)) >> PAGE_SHIFT;
+
if (max_low_pfn < max_dma)
zones_size[ZONE_DMA] = max_low_pfn;
else {
zones_size[ZONE_DMA] = max_dma;
- zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
+ zones_size[ZONE_HIGHMEM] = max_low_pfn - max_dma;
}
free_area_init(zones_size);
}
@@ -382,6 +386,23 @@
return 0;
}

+static int
+count_highmem_pages (u64 start, u64 end, void *arg)
+{
+ unsigned long num_high = 0;
+ unsigned long *count = arg;
+ struct page *pg;
+
+ for (pg = virt_to_page(start); pg < virt_to_page(end); ++pg)
+ if (page_to_phys(pg)>(0xffffffff)) {
+ ++num_high;
+ set_bit(PG_highmem, &pg->flags);
+ pg->virtual = __va(page_to_phys(pg));
+ }
+ *count += num_high;
+ return 0;
+}
+
void
mem_init (void)
{
@@ -395,7 +415,7 @@
* any drivers that may need the PCI DMA interface are initialized or bootmem has
* been freed.
*/
- platform_pci_dma_init();
+ /*platform_pci_dma_init();*/
#endif

if (!mem_map)
@@ -405,6 +425,8 @@
efi_memmap_walk(count_pages, &num_physpages);

max_mapnr = max_low_pfn;
+ highmem_start_page = mem_map + (0x100000000 >> PAGE_SHIFT);
+
high_memory = __va(max_low_pfn * PAGE_SIZE);

totalram_pages += free_all_bootmem();
@@ -412,6 +434,9 @@
reserved_pages = 0;
efi_memmap_walk(count_reserved_pages, &reserved_pages);

+ totalhigh_pages = 0;
+ efi_memmap_walk(count_highmem_pages, &totalhigh_pages);
+
codesize = (unsigned long) &_etext - (unsigned long) &_stext;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff -urN Linux/include/asm-ia64/highmem.h linux/include/asm-ia64/highmem.h
--- Linux/include/asm-ia64/highmem.h Thu Jan 1 01:00:00 1970
+++ linux/include/asm-ia64/highmem.h Mon Dec 3 20:34:17 2001
@@ -0,0 +1,61 @@
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * [email protected]
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ * Copyright (C) 1999 Ingo Molnar <[email protected]>
+ * Modified for use on IA64 by Arjan van de Ven <[email protected]>
+ */
+
+#ifndef _ASM_HIGHMEM_H
+#define _ASM_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/kmap_types.h>
+#include <asm/pgtable.h>
+
+/* undef for production */
+#define HIGHMEM_DEBUG 0
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+#define kmap_init(void) do {} while (0)
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+#define kmap(page) page_address(page)
+#define kunmap(page) do {} while (0)
+
+/*
+ * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
+ * gives a more generic (and caching) interface. But kmap_atomic can
+ * be used in IRQ contexts, so in some (very limited) cases we need
+ * it.
+ */
+#define kmap_atomic(page, type) page_address(page)
+#define kunmap_atomic(kvaddr, type) do {} while (0)
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_HIGHMEM_H */
diff -urN Linux/include/asm-ia64/kmap_types.h linux/include/asm-ia64/kmap_types.h
--- Linux/include/asm-ia64/kmap_types.h Thu Jan 1 01:00:00 1970
+++ linux/include/asm-ia64/kmap_types.h Mon Dec 3 20:34:17 2001
@@ -0,0 +1,15 @@
+#ifndef _ASM_KMAP_TYPES_H
+#define _ASM_KMAP_TYPES_H
+
+enum km_type {
+ KM_BOUNCE_READ,
+ KM_BOUNCE_WRITE,
+ KM_SKB_DATA,
+ KM_SKB_DATA_SOFTIRQ,
+ KM_USER0,
+ KM_USER1,
+ KM_BH_IRQ,
+ KM_TYPE_NR
+};
+
+#endif
diff -urN Linux/include/asm-ia64/machvec.h linux/include/asm-ia64/machvec.h
--- Linux/include/asm-ia64/machvec.h Fri Nov 9 22:26:17 2001
+++ linux/include/asm-ia64/machvec.h Mon Dec 3 20:34:17 2001
@@ -225,36 +225,6 @@
#ifndef platform_global_tlb_purge
# define platform_global_tlb_purge ia64_global_tlb_purge /* default to architected version */
#endif
-#ifndef platform_pci_dma_init
-# define platform_pci_dma_init swiotlb_init
-#endif
-#ifndef platform_pci_alloc_consistent
-# define platform_pci_alloc_consistent swiotlb_alloc_consistent
-#endif
-#ifndef platform_pci_free_consistent
-# define platform_pci_free_consistent swiotlb_free_consistent
-#endif
-#ifndef platform_pci_map_single
-# define platform_pci_map_single swiotlb_map_single
-#endif
-#ifndef platform_pci_unmap_single
-# define platform_pci_unmap_single swiotlb_unmap_single
-#endif
-#ifndef platform_pci_map_sg
-# define platform_pci_map_sg swiotlb_map_sg
-#endif
-#ifndef platform_pci_unmap_sg
-# define platform_pci_unmap_sg swiotlb_unmap_sg
-#endif
-#ifndef platform_pci_dma_sync_single
-# define platform_pci_dma_sync_single swiotlb_sync_single
-#endif
-#ifndef platform_pci_dma_sync_sg
-# define platform_pci_dma_sync_sg swiotlb_sync_sg
-#endif
-#ifndef platform_pci_dma_address
-# define platform_pci_dma_address swiotlb_dma_address
-#endif
#ifndef platform_irq_desc
# define platform_irq_desc __ia64_irq_desc
#endif
diff -urN Linux/include/asm-ia64/pci.h linux/include/asm-ia64/pci.h
--- Linux/include/asm-ia64/pci.h Fri Nov 9 22:26:17 2001
+++ linux/include/asm-ia64/pci.h Mon Dec 3 20:34:17 2001
@@ -36,15 +36,147 @@
/*
* Dynamic DMA mapping API. See Documentation/DMA-mapping.txt for details.
*/
-#define pci_alloc_consistent platform_pci_alloc_consistent
-#define pci_free_consistent platform_pci_free_consistent
-#define pci_map_single platform_pci_map_single
-#define pci_unmap_single platform_pci_unmap_single
-#define pci_map_sg platform_pci_map_sg
-#define pci_unmap_sg platform_pci_unmap_sg
-#define pci_dma_sync_single platform_pci_dma_sync_single
-#define pci_dma_sync_sg platform_pci_dma_sync_sg
-#define sg_dma_address platform_pci_dma_address
+
+#define flush_write_buffers() do {} while (0)
+
+
+/* Map a single buffer of the indicated size for DMA in streaming mode.
+ * The 32-bit bus address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory
+ * until either pci_unmap_single or pci_dma_sync_single is performed.
+ */
+static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+ flush_write_buffers();
+ return virt_to_bus(ptr);
+}
+
+/* Unmap a single streaming mode DMA translation. The dma_addr and size
+ * must match what was provided for in a previous pci_map_single call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+ /* Nothing to do */
+}
+
+/*
+ * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t. identical
+ * to pci_map_single, but takes a struct page instead of a virtual address
+ */
+static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+
+ return (page - mem_map) * PAGE_SIZE + offset;
+}
+
+static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+ /* Nothing to do */
+}
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA. This is the scatter-gather version of the
+ * above pci_map_single interface. Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length. They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ int i;
+
+ if (direction == PCI_DMA_NONE)
+ BUG();
+
+ /*
+ * temporary 2.4 hack
+ */
+ for (i = 0; i < nents; i++ ) {
+ if (sg[i].address && sg[i].page)
+ BUG();
+ else if (!sg[i].address && !sg[i].page)
+ BUG();
+
+ if (sg[i].address)
+ sg[i].dma_address = virt_to_bus(sg[i].address);
+ else
+ sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
+ }
+
+ flush_write_buffers();
+ return nents;
+}
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+ /* Nothing to do */
+}
+
+/* Make physical memory consistent for a single
+ * streaming mode DMA translation after a transfer.
+ *
+ * If you perform a pci_map_single() but wish to interrogate the
+ * buffer using the cpu, yet do not wish to teardown the PCI dma
+ * mapping, you must call this function before doing so. At the
+ * next point you give the PCI dma address back to the card, the
+ * device again owns the buffer.
+ */
+static inline void pci_dma_sync_single(struct pci_dev *hwdev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+ flush_write_buffers();
+}
+
+/* Make physical memory consistent for a set of streaming
+ * mode DMA translations after a transfer.
+ *
+ * The same as pci_dma_sync_single but for a scatter-gather list,
+ * same rules and usage.
+ */
+static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ BUG();
+ flush_write_buffers();
+}

/*
* Return whether the given PCI device DMA address mask can be supported properly. For
@@ -79,4 +211,27 @@
extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma,
enum pci_mmap_state mmap_state, int write_combine);

+#define PCI_DMA_BUS_IS_PHYS (1)
+#define pci_dac_dma_supported(pci_dev, mask) (0)
+
+/* Allocate and map kernel buffer using consistent mode DMA for a device.
+ * hwdev should be valid struct pci_dev pointer for PCI devices,
+ * NULL for PCI-like buses (ISA, EISA).
+ * Returns non-NULL cpu-view pointer to the buffer if successful and
+ * sets *dma_addrp to the pci side dma address as well, else *dma_addrp
+ * is undefined.
+ */
+extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
+ dma_addr_t *dma_handle);
+
+/* Free and unmap a consistent DMA buffer.
+ * cpu_addr is what was returned from pci_alloc_consistent,
+ * size must be the same as what as passed into pci_alloc_consistent,
+ * and likewise dma_addr must be the same as what *dma_addrp was set to.
+ *
+ * References to the memory and mappings associated with cpu_addr/dma_addr
+ * past this call are illegal.
+ */
+extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
#endif /* _ASM_IA64_PCI_H */
diff -urN Linux/include/asm-ia64/scatterlist.h linux/include/asm-ia64/scatterlist.h
--- Linux/include/asm-ia64/scatterlist.h Tue Nov 13 17:01:16 2001
+++ linux/include/asm-ia64/scatterlist.h Mon Dec 3 20:44:15 2001
@@ -14,10 +14,12 @@
/* These two are only valid if ADDRESS member of this struct is NULL. */
struct page *page;
unsigned int offset;
+ dma_addr_t dma_address;

unsigned int length; /* buffer length */
};

#define ISA_DMA_THRESHOLD (~0UL)
+#define sg_dma_address(sg) ((sg)->dma_address)

#endif /* _ASM_IA64_SCATTERLIST_H */
diff -urN Linux/include/asm-ia64/types.h linux/include/asm-ia64/types.h
--- Linux/include/asm-ia64/types.h Fri Apr 21 23:21:24 2000
+++ linux/include/asm-ia64/types.h Mon Dec 3 20:34:17 2001
@@ -63,6 +63,7 @@
/* DMA addresses are 64-bits wide, in general. */

typedef u64 dma_addr_t;
+typedef u64 dma64_addr_t;

# endif /* __KERNEL__ */
#endif /* !__ASSEMBLY__ */
diff -urN Linux/kernel/ksyms.c linux/kernel/ksyms.c
--- Linux/kernel/ksyms.c Wed Nov 21 22:07:25 2001
+++ linux/kernel/ksyms.c Mon Dec 3 20:41:11 2001
@@ -117,8 +117,11 @@
EXPORT_SYMBOL(get_unmapped_area);
EXPORT_SYMBOL(init_mm);
#ifdef CONFIG_HIGHMEM
+#ifndef __ia64__
+/* these are inlined on ia64 */
EXPORT_SYMBOL(kmap_high);
EXPORT_SYMBOL(kunmap_high);
+#endif
EXPORT_SYMBOL(highmem_start_page);
EXPORT_SYMBOL(create_bounce);
#endif
diff -urN Linux/mm/highmem.c linux/mm/highmem.c
--- Linux/mm/highmem.c Mon Oct 22 23:01:57 2001
+++ linux/mm/highmem.c Mon Dec 3 20:42:44 2001
@@ -22,6 +22,8 @@
#include <linux/swap.h>
#include <linux/slab.h>

+
+#ifndef __ia64__
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -186,6 +188,8 @@
wake_up(&pkmap_map_wait);
}

+#endif
+
#define POOL_SIZE 32

/*


2001-12-04 00:21:00

by David Mosberger

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu


Arjan> Hi, The patch below (against 2.4.16) makes the ia64 port no
Arjan> longer use the (VERY slow) software IO mmu but makes it use
Arjan> the same mechanism the x86 PAE port uses: it lets the higher
Arjan> layers take care of the proper bouncing of PCI-unreachable
Arjan> memory. The implementation is pretty simple; instead of having
Arjan> a 4Gb GFP_DMA zone and a <rest of ram> GFP_KERNEL zone, the
Arjan> ia64 port now has a 4Gb GFP_DMA zone and a <rest of ram>
Arjan> GFP_HIGH zone. Since the ia64 cpu can address all of this
Arjan> memory directly, the kmap() and related functions are
Arjan> basically nops.

Arjan> The result: 100 mbit ethernet performance on a ia64 machine
Arjan> with 32Gb of ram increased more than 4x (from 20 mbit to 95
Arjan> mbit)....

Arjan> The only downside is that the current kernel will always
Arjan> bounce buffer disk IO even if the scsi card is 64 bit PCI
Arjan> capable; Jens Axboe's block highmem patch fixes that downside
Arjan> nicely though.

How soon will Jens' patch make it into the official tree? I think
that would be a pre-requisite before switching to a highmem based
implementation.

Another concern I have is that, fundamentally, I dislike the idea of
penalizing all IA-64 platforms due to one chipset that is, shall we
say, "lacking" (i.e., doesn't have an I/O TLB).

Could someone comment on whether the 870 will have I/O TLB support
(private mail is fine, if you don't feel comfortable sending mail to
all the lists...).

Thanks,

--david

2001-12-04 00:20:52

by Arjan van de Ven

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

On Mon, Dec 03, 2001 at 01:47:41PM -0800, David Mosberger wrote:

> How soon will Jens' patch make it into the official tree? I think
> that would be a pre-requisite before switching to a highmem based
> implementation.

My understanding (and hope) is "soon".

> Another concern I have is that, fundamentally, I dislike the idea of
> penalizing all IA-64 platforms due to one chipset that is, shall we
> say, "lacking" (i.e., doesn't have an I/O TLB).

I think some of it (if not all) can be abstracted into the machine vectors;
setting CONFIG_HIGHMEM doesn't hurt anything; the only important part is
where you put >4Gb memory, e.g. in the NORMAL or HIGH zone. That choice,
while hardcoded in my patch, can obviously be made at runtime based on the
capabilities of the machine... (the remaining overhead due to kmap is
(almost) zero already, as the compiler will basically optimize the inline
away since it's a nop in the context of its users.)
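
A minimal sketch of what such a runtime choice could look like; the
ia64_platform_has_io_tlb() predicate is hypothetical (e.g. something the
machine vector would supply), not part of the patch:

	/* decide at boot where memory above 4Gb goes */
	zones_size[ZONE_DMA] = max_dma;
	if (ia64_platform_has_io_tlb()) {
		/* chipset can remap DMA: all memory is directly usable */
		zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
	} else {
		/* no I/O TLB: let the highmem/bounce machinery handle >4Gb pages */
		zones_size[ZONE_HIGHMEM] = max_low_pfn - max_dma;
	}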

Greetings,
Arjan van de Ven

2001-12-04 09:28:04

by Alan

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

> Another concern I have is that, fundamentally, I dislike the idea of
> penalizing all IA-64 platforms due to one chipset that is, shall we
> say, "lacking" (i.e., doesn't have an I/O TLB).

Allow me to introduce to you the concept of CONFIG_ options 8) It makes a
lot of sense to have a generic IA64 kernel, and an "IA64 designed by people
with a brain" kernel.

Alan

2001-12-04 14:44:29

by Andreas Schwab

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

Arjan van de Ven <[email protected]> writes:

|> Hi,
|>
|> The patch below (against 2.4.16) makes the ia64 port no longer use the (VERY
|> slow) software IO mmu but makes it use the same mechanism the x86 PAE port
|> uses: it lets the higher layers take care of the proper bouncing of
|> PCI-unreachable memory. The implementation is pretty simple; instead of
|> having a 4Gb GFP_DMA zone and a <rest of ram> GFP_KERNEL zone, the ia64 port
|> now has a 4Gb GFP_DMA zone and a <rest of ram> GFP_HIGH zone.
|> Since the ia64 cpu can address all of this memory directly, the kmap() and
|> related functions are basically nops.

I tried it, but it doesn't compile: kmap_prot and kmap_pte are undefined.
If they are not used on ia64, then the reference in kernel/ksyms.c must be
removed.

Andreas.

--
Andreas Schwab <[email protected]>
"And now for something completely different."
SuSE Labs, SuSE GmbH, Schanzäckerstr. 10, D-90443 Nürnberg
Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5

2001-12-04 16:28:03

by David Mosberger

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

>>>>> On Tue, 4 Dec 2001 09:36:33 +0000 (GMT), Alan Cox <[email protected]> said:

>> Another concern I have is that, fundamentally, I dislike the idea
>> of penalizing all IA-64 platforms due to one chipset that is,
>> shall we say, "lacking" (i.e., doesn't have an I/O TLB).

Alan> Allow me to introduce to you the concept of CONFIG_ options 8)
Alan> It makes a lot of sense to have a generic IA64 kernel, and an
Alan> IA64 designed by people with a brain kernel.

I think the issue at hand is whether, longer term, it is desirable to
move all bounce buffer handling into the PCI DMA layer or whether
Linux should continue to make bounce buffer management visible to
drivers. I'd be interested in hearing opinions.

--david

2001-12-04 16:37:37

by Arjan van de Ven

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

On Tue, Dec 04, 2001 at 08:26:59AM -0800, David Mosberger wrote:
> >>>>> On Tue, 4 Dec 2001 09:36:33 +0000 (GMT), Alan Cox <[email protected]> said:
>
> >> Another concern I have is that, fundamentally, I dislike the idea
> >> of penalizing all IA-64 platforms due to one chipset that is,
> >> shall we say, "lacking" (i.e., doesn't have an I/O TLB).
>
> Alan> Allow me to introduce to you the concept of CONFIG_ options 8)
> Alan> It makes a lot of sense to have a generic IA64 kernel, and an
> Alan> IA64 designed by people with a brain kernel.
>
> I think the issue at hand is whether, longer term, it is desirable to
> move all bounce buffer handling into the PCI DMA layer or whether
> Linux should continue to make bounce buffer management visible to
> drivers. I'd be interested in hearing opinions.

For "lacking" architectures (current ia64 machines, x86 with pae36) the only
real performing solution is to bounce in subsystems (not low level drivers).
The PCI layer doesn't have enough scope to decide what to do...

2001-12-04 17:11:26

by Alan

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

> I think the issue at hand is whether, longer term, it is desirable to
> move all bounce buffer handling into the PCI DMA layer or whether
> Linux should continue to make bounce buffer management visible to
> drivers. I'd be interested in hearing opinions.

I think the performance figures we see currently answer that already. Bounce
management in a sense is PCI layer, but it's PCI layer in the sense of
helpers called by subsystems or devices, not as a global layer in the middle.

On a box with 32-bit-limited cards you need to do zone stuff and play with
the high zone even though your kmap is a nop. It's not ideal, but it's the
real world. IA64 also needs to correct its GFP_DMA to mean "low 16Mb" for
ISA DMA. While there is no ISA DMA on ia64 (thankfully), many PCI cards have
26-31 bit limits.
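
As a rough illustration of "helpers called by subsystems": with CONFIG_HIGHMEM
a subsystem can bounce a buffer that a 32-bit-limited device cannot reach by
calling the generic create_bounce() helper (already exported, as visible in
the kernel/ksyms.c hunk above). This is a simplified sketch; prepare_for_io()
is a made-up wrapper, the real call sits in the 2.4 block layer's request path:

	#include <linux/fs.h>
	#include <linux/highmem.h>

	static struct buffer_head *prepare_for_io(int rw, struct buffer_head *bh)
	{
		/* pages at or above highmem_start_page may be PCI-unreachable,
		   so copy them through a low-memory bounce buffer first */
		if (bh->b_page >= highmem_start_page)
			bh = create_bounce(rw, bh);
		return bh;
	}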

Alan

2001-12-04 17:53:10

by Alan

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

> Alan> ISA DMA. While there is no ISA DMA on ia64 (thankfully) many
> Alan> PCI cards have 26-31 bit limits.
>
> We could do this if there were a GFP_4GB zone. Now that 2.5 is open
> for business, it won't be long, right?

I don't see the need: GFP_DMA is the ISA DMA zone; the pci_* API is used by
everyone else [for 2.5]. You want a 32-bit zone purely so you can fulfill
allocations in 32-bit PCI space, and an ISA DMA zone for back-compat and to
cover broken PCI cards (of which there are lots).

Alan

2001-12-04 18:08:12

by David Mosberger

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

>>>>> On Tue, 4 Dec 2001 17:59:28 +0000 (GMT), Alan Cox <[email protected]> said:

Alan> ISA DMA. While there is no ISA DMA on ia64 (thankfully) many
Alan> PCI cards have 26-31 bit limits.
>> We could do this if there were a GFP_4GB zone. Now that 2.5
>> is open for business, it won't be long, right?

Alan> I don't see the need: GFP_DMA is the ISA DMA zone. pci_* API
Alan> is used by everyone else [for 2.5].

Without a 4GB zone, you may end up creating bounce buffers needlessly
for 32-bit capable DMA devices, no?

--david

2001-12-04 18:14:12

by Alan

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

> Alan> I don't see the need: GFP_DMA is the ISA DMA zone. pci_* API
> Alan> is used by everyone else [for 2.5].
>
> Without a 4GB zone, you may end up creating bounce buffers needlessly
> for 32-bit capable DMA devices, no?

Yes - but it becomes an implementation detail. Drivers don't go around
asking for a kmalloc in the 4Gb zone any more; they ask for PCI memory that
a 32-bit PCI address can hit. I'm sure a 4Gb zone is what will be there
internally, but you don't need GFP_4GBZONE as a visible driver detail.
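
For illustration, roughly how a driver asks for PCI-reachable memory through
the pci_* API rather than through any zone flag. This is a sketch using the
interfaces declared in the patch above; "mydev", "buf", "ring" and the size
constants are made-up names:

	#include <linux/pci.h>

	dma_addr_t ring_dma, buf_dma;
	void *ring;

	/* consistent (coherent) memory for a descriptor ring: the arch code
	   falls back to GFP_DMA if the device cannot address all of memory */
	ring = pci_alloc_consistent(mydev, RING_BYTES, &ring_dma);

	/* streaming mapping of an existing buffer for one DMA transfer */
	buf_dma = pci_map_single(mydev, buf, BUF_BYTES, PCI_DMA_TODEVICE);
	/* ... device DMA happens here ... */
	pci_unmap_single(mydev, buf_dma, BUF_BYTES, PCI_DMA_TODEVICE);

	pci_free_consistent(mydev, RING_BYTES, ring, ring_dma);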

Alan


2001-12-04 18:37:20

by David Mosberger

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

>>>>> On Tue, 4 Dec 2001 18:19:55 +0000 (GMT), Alan Cox <[email protected]> said:

Alan> I don't see the need: GFP_DMA is the ISA DMA zone. pci_* API
Alan> is used by everyone else [for 2.5].
>> Without a 4GB zone, you may end up creating bounce buffers
>> needlessly for 32-bit capable DMA devices, no?

Alan> Yes - but it becomes an implementation detail. Drivers don't
Alan> go around asking for kmalloc in 4Gb zone anymore they ask for
Alan> PCI memory that a 32bit pci address can hit. I'm sure a 4Gb
Alan> zone is what will be there internally but you don't need
Alan> GFP_4GBZONE as a visible driver detail.

Oh, OK, we're in agreement then. When I looked at the zone stuff the
last time, I didn't think you could have an internal 4GB zone without
abusing an existing zone or making some changes to the header files in
linux/*.h, but perhaps I missed something.

--david

2001-12-04 18:14:10

by David Mosberger

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

>>>>> On Tue, 4 Dec 2001 17:18:17 +0000 (GMT), Alan Cox <[email protected]> said:

>> I think the issue at hand is whether, longer term, it is
>> desirable to move all bounce buffer handling into the PCI DMA
>> layer or whether Linux should continue to make bounce buffer
>> management visible to drivers. I'd be interested in hearing
>> opinions.

Alan> I think the performance figures we see currently answer that
Alan> already.

The numbers I have seen so far don't make this obvious. Tony Luck
reports 95Mbps with a CPU load of 20% after fixing a performance bug
in the software I/O TLB. Arjan reported the same 95Mbps figure with
the highmem approach. Arjan didn't report the CPU load, and neither
Tony nor Arjan specified the test environment they were using.

Alan> IA64 also needs to correct its GFP_DMA to mean "low 16Mb" for
Alan> ISA DMA. While there is no ISA DMA on ia64 (thankfully) many
Alan> PCI cards have 26-31 bit limits.

We could do this if there were a GFP_4GB zone. Now that 2.5 is open
for business, it won't be long, right?

--david

2001-12-04 20:25:02

by David Miller

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

From: David Mosberger <[email protected]>
Date: Tue, 4 Dec 2001 08:26:59 -0800

I think the issue at hand is whether, longer term, it is desirable to
move all bounce buffer handling into the PCI DMA layer or whether
Linux should continue to make bounce buffer management visible to
drivers. I'd be interested in hearing opinions.

Well, this whole ia64 situation should be the example that shows that
for severely broken 64-bit platforms, like IA64, doing the bounce
buffering in the PCI DMA layer is a lose. The HIGHMEM option is the
optimal one in this case, and I think that is fine.

If what you are asking is should we tweak the APIs again so that
situations like current IA64 can be done more sanely in the PCI DMA
layer, I say definitely no.

There really is no excuse for the current IA64 hardware situation; there
were probably well over 3 or 4 major 64-bit platforms from competitors,
whose PCI controllers were pretty well documented publicly, from which
Intel could have derived a working 64-bit platform PCI controller design.

When a saner IA64 hardware implementation comes about (if ever), you
can make CONFIG_IA64_WHATEVER_PLATFORM which undoes the HIGHMEM stuff
and enables PCI DMA support code for those chipsets. As Alan has
suggested. That is a perfectly fine way of dealing with this.

Franks a lot,
David S. Miller
[email protected]

2001-12-04 20:34:13

by David Mosberger

Subject: Re: [Linux-ia64] patch to no longer use ia64's software mmu

>>>>> On Tue, 04 Dec 2001 12:22:54 -0800 (PST), "David S. Miller" <[email protected]> said:

DaveM> If what you are asking is should we tweak the APIs again so
DaveM> that situations like current IA64 can be done more sanely in
DaveM> the PCI DMA layer, I say definitely no.

I certainly agree that the PCI DMA interface shouldn't be tweaked just
because of IA64. What I'm wondering is whether we'll have to tweak it
anyhow to more gracefully handle the case where a hardware I/O TLB
runs out of space. If so, I think the software I/O TLB makes sense.

DaveM> There really is no excuse for the current IA64 hardware
DaveM> situation, there were probably well over 3 or 4 major 64-bit
DaveM> platforms from competitors, whose PCI controllers were pretty
DaveM> well documented publicly, from which Intel could have derived
DaveM> a working 64-bit platform PCI controller design.

Well, I won't comment on *this* one! ;-))

--david