2005-09-11 16:59:22

by Andi Kleen

[permalink] [raw]
Subject: [1/3] Add 4GB DMA32 zone

Add 4GB DMA32 zone

Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.

As a bit of historical background: when the x86-64 port
was originally designed we had some discussion if we should
use a 16MB DMA zone like i386 or a 4GB DMA zone like IA64 or
both. Doing both was ruled out at that point because it was in early
2.4, when the VM was still quite shaky and had trouble even
dealing with one DMA zone. We settled on the 16MB DMA zone mainly
because we worried about older soundcards and the floppy.

But this has always caused problems since then because
device drivers had trouble getting enough DMA able memory. These days
the VM works much better and the wide use of NUMA has proven
it can deal with many zones successfully.

So this patch adds both zones.

This helps drivers who need a lot of memory below 4GB because
their hardware is not accessing more (graphic drivers - proprietary
and free ones, video frame buffer drivers, sound drivers etc.).
Previously they could only use IOMMU+16MB GFP_DMA, which
was not enough memory.

Another common problem is that hardware that has full memory
addressing for >4GB misses it for some control structures in memory
(like transmit rings or other metadata). They tended to allocate memory
in the 16MB GFP_DMA or the IOMMU/swiotlb then using pci_alloc_consistent,
but that can tie up a lot of precious 16MB GFP_DMA/IOMMU/swiotlb memory
(even on AMD systems the IOMMU tends to be quite small) especially if you have
many devices. With the new zone pci_alloc_consistent can just put
this stuff into memory below 4GB which works better.

One argument was still if the zone should be 4GB or 2GB. The main
motivation for 2GB would be an unnamed not so unpopular hardware
raid controller (mostly found in older machines from a particular four letter
company) which has a strange 2GB restriction in firmware. But
that one works ok with swiotlb/IOMMU anyways, so it doesn't really
need GFP_DMA32. I chose 4GB to be compatible with IA64 and because
it seems to be the most common restriction.

The new zone is so far added only for x86-64.

For other architectures who don't set up this
new zone nothing changes. Architectures can set a compatibility
define in Kconfig CONFIG_DMA_IS_DMA32 that will define GFP_DMA32
as GFP_DMA. Otherwise it's a nop because on 32bit architectures
it's normally not needed because GFP_NORMAL (=0) is DMA able
enough.

One problem is still that GFP_DMA means different things on different
architectures. e.g. some drivers used to have #ifdef ia64 use GFP_DMA
(trusting it to be 4GB) #elif __x86_64__ (use other hacks like
the swiotlb because 16MB is not enough) ... . This was quite
ugly and is now obsolete.

These should be now converted to use GFP_DMA32 unconditionally. I haven't done
this yet. Or best only use pci_alloc_consistent/dma_alloc_coherent
which will use GFP_DMA32 transparently.

Signed-off-by: Andi Kleen <[email protected]>

Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -318,32 +318,51 @@ void zap_low_mappings(void)
flush_tlb_all();
}

+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+ unsigned long w;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ z[i] = 0;
+
+ if (start_pfn < MAX_DMA_PFN)
+ z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+ if (start_pfn < MAX_DMA32_PFN) {
+ unsigned long dma32_pfn = MAX_DMA32_PFN;
+ if (dma32_pfn > end_pfn)
+ dma32_pfn = end_pfn;
+ z[ZONE_DMA32] = dma32_pfn - start_pfn;
+ }
+ z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+ /* Remove lower zones from higher ones. */
+ w = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (z[i])
+ z[i] -= w;
+ w += z[i];
+ }
+
+ /* Compute holes */
+ w = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ unsigned long s = w;
+ w += z[i];
+ h[i] = e820_hole_size(s, w);
+ }
+}
+
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
- {
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long holes[MAX_NR_ZONES];
- unsigned int max_dma;
-
- memset(zones_size, 0, sizeof(zones_size));
- memset(holes, 0, sizeof(holes));
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
- if (end_pfn < max_dma) {
- zones_size[ZONE_DMA] = end_pfn;
- holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
- } else {
- zones_size[ZONE_DMA] = max_dma;
- holes[ZONE_DMA] = e820_hole_size(0, max_dma);
- zones_size[ZONE_NORMAL] = end_pfn - max_dma;
- holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
- }
- free_area_init_node(0, NODE_DATA(0), zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
- }
- return;
+ unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+ size_zones(zones, holes, 0, end_pfn);
+ free_area_init_node(0, NODE_DATA(0), zones,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -132,29 +132,14 @@ void __init setup_node_zones(int nodeid)
unsigned long start_pfn, end_pfn;
unsigned long zones[MAX_NR_ZONES];
unsigned long holes[MAX_NR_ZONES];
- unsigned long dma_end_pfn;

- memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
- memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+ start_pfn = node_start_pfn(nodeid);
+ end_pfn = node_end_pfn(nodeid);

- start_pfn = node_start_pfn(nodeid);
- end_pfn = node_end_pfn(nodeid);
+ Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+ nodeid, start_pfn, end_pfn);

- Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-
- /* All nodes > 0 have a zero length zone DMA */
- dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- if (start_pfn < dma_end_pfn) {
- zones[ZONE_DMA] = dma_end_pfn - start_pfn;
- holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
- zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
- } else {
- zones[ZONE_NORMAL] = end_pfn - start_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
- }
-
+ size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
start_pfn, holes);
}
Index: linux/include/asm-x86_64/dma.h
===================================================================
--- linux.orig/include/asm-x86_64/dma.h
+++ linux/include/asm-x86_64/dma.h
@@ -72,8 +72,15 @@

#define MAX_DMA_CHANNELS 8

-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS (PAGE_OFFSET+0x1000000)
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN ((16*1024*1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))

/* 8237 DMA controllers */
#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -23,6 +23,8 @@ extern void mtrr_bp_init(void);
#define mtrr_bp_init() do {} while (0)
#endif
extern void init_memory_mapping(unsigned long start, unsigned long end);
+extern void size_zones(unsigned long *z, unsigned long *h,
+ unsigned long start_pfn, unsigned long end_pfn);

extern void system_call(void);
extern int kernel_syscall(void);
Index: linux/include/linux/gfp.h
===================================================================
--- linux.orig/include/linux/gfp.h
+++ linux/include/linux/gfp.h
@@ -14,6 +14,13 @@ struct vm_area_struct;
/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
#define __GFP_DMA 0x01u
#define __GFP_HIGHMEM 0x02u
+#ifdef CONFIG_DMA_IS_DMA32
+#define __GFP_DMA32 0x01 /* ZONE_DMA is ZONE_DMA32 */
+#elif BITS_PER_LONG < 64
+#define __GFP_DMA32 0x00 /* ZONE_NORMAL is ZONE_DMA32 */
+#else
+#define __GFP_DMA32 0x04 /* Has own ZONE_DMA32 */
+#endif

/*
* Action modifiers - doesn't change the zoning
@@ -64,6 +71,8 @@ struct vm_area_struct;

#define GFP_DMA __GFP_DMA

+/* 4GB DMA on some platforms */
+#define GFP_DMA32 __GFP_DMA32

/*
* There is only one page-allocator function, and two main namespaces to
Index: linux/include/linux/mmzone.h
===================================================================
--- linux.orig/include/linux/mmzone.h
+++ linux/include/linux/mmzone.h
@@ -70,11 +70,12 @@ struct per_cpu_pageset {
#endif

#define ZONE_DMA 0
-#define ZONE_NORMAL 1
-#define ZONE_HIGHMEM 2
+#define ZONE_DMA32 1
+#define ZONE_NORMAL 2
+#define ZONE_HIGHMEM 3

-#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
+#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
+#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */


/*
@@ -90,7 +91,7 @@ struct per_cpu_pageset {
* be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible
* combinations of zone modifiers in "zone modifier space".
*/
-#define GFP_ZONEMASK 0x03
+#define GFP_ZONEMASK 0x07
/*
* As an optimisation any zone modifier bits which are only valid when
* no other zone modifier bits are set (loners) should be placed in
@@ -110,6 +111,7 @@ struct per_cpu_pageset {
* into multiple physical zones. On a PC we have 3 zones:
*
* ZONE_DMA < 16 MB ISA DMA capable memory
+ * ZONE_DMA32 0 MB Empty
* ZONE_NORMAL 16-896 MB direct mapped by the kernel
* ZONE_HIGHMEM > 896 MB only page cache and user processes
*/
@@ -428,10 +430,10 @@ extern struct pglist_data contig_page_da

#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
/*
- * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
- * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+ * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
+ * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
*/
-#define FLAGS_RESERVED 8
+#define FLAGS_RESERVED 9

#elif BITS_PER_LONG == 64
/*
Index: linux/mm/page_alloc.c
===================================================================
--- linux.orig/mm/page_alloc.c
+++ linux/mm/page_alloc.c
@@ -58,8 +58,11 @@ long nr_swap_pages;
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we normally
+ * don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
@@ -71,7 +74,7 @@ EXPORT_SYMBOL(nr_swap_pages);
struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
EXPORT_SYMBOL(zone_table);

-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
int min_free_kbytes = 1024;

unsigned long __initdata nr_kernel_pages;
@@ -1418,6 +1421,10 @@ static int __init build_zonelists_node(p
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->present_pages)
zonelist->zones[j++] = zone;
+ case ZONE_DMA32:
+ zone = pgdat->node_zones + ZONE_DMA32;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->present_pages)
@@ -1526,6 +1533,8 @@ static void __init build_zonelists(pg_da
k = ZONE_NORMAL;
if (i & __GFP_HIGHMEM)
k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA32)
+ k = ZONE_DMA32;
if (i & __GFP_DMA)
k = ZONE_DMA;

@@ -1550,7 +1559,9 @@ static void __init build_zonelists(pg_da
j = 0;
k = ZONE_NORMAL;
if (i & __GFP_HIGHMEM)
- k = ZONE_HIGHMEM;
+ k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA32)
+ k = ZONE_DMA32;
if (i & __GFP_DMA)
k = ZONE_DMA;

@@ -1895,7 +1906,7 @@ static void __init free_area_init_core(s
if (zholes_size)
realsize -= zholes_size[j];

- if (j == ZONE_DMA || j == ZONE_NORMAL)
+ if (j < ZONE_HIGHMEM)
nr_kernel_pages += realsize;
nr_all_pages += realsize;


2005-09-12 07:43:38

by Jan Beulich

[permalink] [raw]
Subject: Re: [discuss] [1/3] Add 4GB DMA32 zone

It seems a little strange to add individual zones one by one. I remember
from an OS project I previously worked on that at some time our driver
developers ran into one or more devices that were able to consume 31-bit
physical addresses (but not 32-bit ones, and don't ask me for details on
what exact devices these were, I never knew). I thus wonder whether it
wouldn't make more sense to generalize the logic and allow drivers to
specify to the allocator how many physical address bits they can deal
with.

Jan

>>> "Andi Kleen" <[email protected]> 11.09.05 18:59:19 >>>
Add 4GB DMA32 zone

Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.


As a bit of historical background: when the x86-64 port
was originally designed we had some discussion if we should
use a 16MB DMA zone like i386 or a 4GB DMA zone like IA64 or
both. Doing both was ruled out at that point because it was in early
2.4, when the VM was still quite shaky and had trouble even
dealing with one DMA zone. We settled on the 16MB DMA zone mainly
because we worried about older soundcards and the floppy.

But this has always caused problems since then because
device drivers had trouble getting enough DMA able memory. These days
the VM works much better and the wide use of NUMA has proven
it can deal with many zones successfully.

So this patch adds both zones.

This helps drivers who need a lot of memory below 4GB because
their hardware is not accessing more (graphic drivers - proprietary
and free ones, video frame buffer drivers, sound drivers etc.).
Previously they could only use IOMMU+16MB GFP_DMA, which
was not enough memory.

Another common problem is that hardware that has full memory
addressing for >4GB misses it for some control structures in memory
(like transmit rings or other metadata). They tended to allocate
memory
in the 16MB GFP_DMA or the IOMMU/swiotlb then using
pci_alloc_consistent,
but that can tie up a lot of precious 16MB GFP_DMA/IOMMU/swiotlb memory

(even on AMD systems the IOMMU tends to be quite small) especially if
you have
many devices. With the new zone pci_alloc_consistent can just put
this stuff into memory below 4GB which works better.

One argument was still if the zone should be 4GB or 2GB. The main
motivation for 2GB would be an unnamed not so unpopular hardware
raid controller (mostly found in older machines from a particular four
letter
company) which has a strange 2GB restriction in firmware. But
that one works ok with swiotlb/IOMMU anyways, so it doesn't really
need GFP_DMA32. I chose 4GB to be compatible with IA64 and because
it seems to be the most common restriction.

The new zone is so far added only for x86-64.

For other architectures who don't set up this
new zone nothing changes. Architectures can set a compatibility
define in Kconfig CONFIG_DMA_IS_DMA32 that will define GFP_DMA32
as GFP_DMA. Otherwise it's a nop because on 32bit architectures
it's normally not needed because GFP_NORMAL (=0) is DMA able
enough.

One problem is still that GFP_DMA means different things on different
architectures. e.g. some drivers used to have #ifdef ia64 use GFP_DMA
(trusting it to be 4GB) #elif __x86_64__ (use other hacks like
the swiotlb because 16MB is not enough) ... . This was quite
ugly and is now obsolete.

These should be now converted to use GFP_DMA32 unconditionally. I
haven't done
this yet. Or best only use pci_alloc_consistent/dma_alloc_coherent
which will use GFP_DMA32 transparently.

Signed-off-by: Andi Kleen <[email protected]>

Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -318,32 +318,51 @@ void zap_low_mappings(void)
flush_tlb_all();
}

+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+ unsigned long w;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ z[i] = 0;
+
+ if (start_pfn < MAX_DMA_PFN)
+ z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+ if (start_pfn < MAX_DMA32_PFN) {
+ unsigned long dma32_pfn = MAX_DMA32_PFN;
+ if (dma32_pfn > end_pfn)
+ dma32_pfn = end_pfn;
+ z[ZONE_DMA32] = dma32_pfn - start_pfn;
+ }
+ z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+ /* Remove lower zones from higher ones. */
+ w = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (z[i])
+ z[i] -= w;
+ w += z[i];
+ }
+
+ /* Compute holes */
+ w = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ unsigned long s = w;
+ w += z[i];
+ h[i] = e820_hole_size(s, w);
+ }
+}
+
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
- {
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long holes[MAX_NR_ZONES];
- unsigned int max_dma;
-
- memset(zones_size, 0, sizeof(zones_size));
- memset(holes, 0, sizeof(holes));
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >>
PAGE_SHIFT;
-
- if (end_pfn < max_dma) {
- zones_size[ZONE_DMA] = end_pfn;
- holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
- } else {
- zones_size[ZONE_DMA] = max_dma;
- holes[ZONE_DMA] = e820_hole_size(0, max_dma);
- zones_size[ZONE_NORMAL] = end_pfn - max_dma;
- holes[ZONE_NORMAL] = e820_hole_size(max_dma,
end_pfn);
- }
- free_area_init_node(0, NODE_DATA(0), zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
- }
- return;
+ unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+ size_zones(zones, holes, 0, end_pfn);
+ free_area_init_node(0, NODE_DATA(0), zones,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif

Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -132,29 +132,14 @@ void __init setup_node_zones(int nodeid)
unsigned long start_pfn, end_pfn;
unsigned long zones[MAX_NR_ZONES];
unsigned long holes[MAX_NR_ZONES];
- unsigned long dma_end_pfn;

- memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
- memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+ start_pfn = node_start_pfn(nodeid);
+ end_pfn = node_end_pfn(nodeid);

- start_pfn = node_start_pfn(nodeid);
- end_pfn = node_end_pfn(nodeid);
+ Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+ nodeid, start_pfn, end_pfn);

- Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid,
start_pfn, end_pfn);
-
- /* All nodes > 0 have a zero length zone DMA */
- dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- if (start_pfn < dma_end_pfn) {
- zones[ZONE_DMA] = dma_end_pfn - start_pfn;
- holes[ZONE_DMA] = e820_hole_size(start_pfn,
dma_end_pfn);
- zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn,
end_pfn);
-
- } else {
- zones[ZONE_NORMAL] = end_pfn - start_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(start_pfn,
end_pfn);
- }
-
+ size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
start_pfn, holes);
}
Index: linux/include/asm-x86_64/dma.h
===================================================================
--- linux.orig/include/asm-x86_64/dma.h
+++ linux/include/asm-x86_64/dma.h
@@ -72,8 +72,15 @@

#define MAX_DMA_CHANNELS 8

-/* The maximum address that we can perform a DMA transfer to on this
platform */
-#define MAX_DMA_ADDRESS (PAGE_OFFSET+0x1000000)
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN ((16*1024*1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN <<
PAGE_SHIFT))

/* 8237 DMA controllers */
#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -23,6 +23,8 @@ extern void mtrr_bp_init(void);
#define mtrr_bp_init() do {} while (0)
#endif
extern void init_memory_mapping(unsigned long start, unsigned long
end);
+extern void size_zones(unsigned long *z, unsigned long *h,
+ unsigned long start_pfn, unsigned long
end_pfn);

extern void system_call(void);
extern int kernel_syscall(void);
Index: linux/include/linux/gfp.h
===================================================================
--- linux.orig/include/linux/gfp.h
+++ linux/include/linux/gfp.h
@@ -14,6 +14,13 @@ struct vm_area_struct;
/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits)
*/
#define __GFP_DMA 0x01u
#define __GFP_HIGHMEM 0x02u
+#ifdef CONFIG_DMA_IS_DMA32
+#define __GFP_DMA32 0x01 /* ZONE_DMA is ZONE_DMA32 */
+#elif BITS_PER_LONG < 64
+#define __GFP_DMA32 0x00 /* ZONE_NORMAL is ZONE_DMA32 */
+#else
+#define __GFP_DMA32 0x04 /* Has own ZONE_DMA32 */
+#endif

/*
* Action modifiers - doesn't change the zoning
@@ -64,6 +71,8 @@ struct vm_area_struct;

#define GFP_DMA __GFP_DMA

+/* 4GB DMA on some platforms */
+#define GFP_DMA32 __GFP_DMA32

/*
* There is only one page-allocator function, and two main namespaces
to
Index: linux/include/linux/mmzone.h
===================================================================
--- linux.orig/include/linux/mmzone.h
+++ linux/include/linux/mmzone.h
@@ -70,11 +70,12 @@ struct per_cpu_pageset {
#endif

#define ZONE_DMA 0
-#define ZONE_NORMAL 1
-#define ZONE_HIGHMEM 2
+#define ZONE_DMA32 1
+#define ZONE_NORMAL 2
+#define ZONE_HIGHMEM 3

-#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT
*/
-#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
+#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT
*/
+#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */


/*
@@ -90,7 +91,7 @@ struct per_cpu_pageset {
* be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of
possible
* combinations of zone modifiers in "zone modifier space".
*/
-#define GFP_ZONEMASK 0x03
+#define GFP_ZONEMASK 0x07
/*
* As an optimisation any zone modifier bits which are only valid
when
* no other zone modifier bits are set (loners) should be placed in
@@ -110,6 +111,7 @@ struct per_cpu_pageset {
* into multiple physical zones. On a PC we have 3 zones:
*
* ZONE_DMA < 16 MB ISA DMA capable memory
+ * ZONE_DMA32 0 MB Empty
* ZONE_NORMAL 16-896 MB direct mapped by the kernel
* ZONE_HIGHMEM > 896 MB only page cache and user
processes
*/
@@ -428,10 +430,10 @@ extern struct pglist_data contig_page_da

#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
/*
- * with 32 bit page->flags field, we reserve 8 bits for node/zone
info.
- * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+ * with 32 bit page->flags field, we reserve 9 bits for node/zone
info.
+ * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
*/
-#define FLAGS_RESERVED 8
+#define FLAGS_RESERVED 9

#elif BITS_PER_LONG == 64
/*
Index: linux/mm/page_alloc.c
===================================================================
--- linux.orig/mm/page_alloc.c
+++ linux/mm/page_alloc.c
@@ -58,8 +58,11 @@ long nr_swap_pages;
* NORMAL allocation will leave 784M/256 of ram reserved in the
ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in
ZONE_NORMAL
* HIGHMEM allocation will (224M+784M)/256 of ram reserved in
ZONE_DMA
+ *
+ * TBD: should special case ZONE_DMA32 machines here - in those we
normally
+ * don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };

EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
@@ -71,7 +74,7 @@ EXPORT_SYMBOL(nr_swap_pages);
struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
EXPORT_SYMBOL(zone_table);

-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem"
};
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal",
"HighMem" };
int min_free_kbytes = 1024;

unsigned long __initdata nr_kernel_pages;
@@ -1418,6 +1421,10 @@ static int __init build_zonelists_node(p
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->present_pages)
zonelist->zones[j++] = zone;
+ case ZONE_DMA32:
+ zone = pgdat->node_zones + ZONE_DMA32;
+ if (zone->present_pages)
+ zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->present_pages)
@@ -1526,6 +1533,8 @@ static void __init build_zonelists(pg_da
k = ZONE_NORMAL;
if (i & __GFP_HIGHMEM)
k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA32)
+ k = ZONE_DMA32;
if (i & __GFP_DMA)
k = ZONE_DMA;

@@ -1550,7 +1559,9 @@ static void __init build_zonelists(pg_da
j = 0;
k = ZONE_NORMAL;
if (i & __GFP_HIGHMEM)
- k = ZONE_HIGHMEM;
+ k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA32)
+ k = ZONE_DMA32;
if (i & __GFP_DMA)
k = ZONE_DMA;

@@ -1895,7 +1906,7 @@ static void __init free_area_init_core(s
if (zholes_size)
realsize -= zholes_size[j];

- if (j == ZONE_DMA || j == ZONE_NORMAL)
+ if (j < ZONE_HIGHMEM)
nr_kernel_pages += realsize;
nr_all_pages += realsize;

2005-09-12 07:59:04

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] [1/3] Add 4GB DMA32 zone

On Monday 12 September 2005 09:44, Jan Beulich wrote:
> It seems a little strange to add individual zones one by one. I remember
> from an OS project I previously worked on that at some time our driver
> developers ran into one or more devices that were able to consume 31-bit
> physical addresses

That's likely the unnamed RAID controller with the broken firmware referred to
below (they might actually have fixed the firmware now). But for block
devices it's not really needed anyways. From my experience and those
of other folks (IA64) the 4GB zone + a small fallback zone is a good
compromise.

> (but not 32-bit ones, and don't ask me for details on
> what exact devices these were, I never knew). I thus wonder whether it
> wouldn't make more sense to generalize the logic and allow drivers to
> specify to the allocator how many physical address bits they can deal
> with.

Because that would likely either impact the page allocation fast path
by having unsuited data structures for the normal case or make the code to
allocate pages with arbitrary boundaries really slow because the allocation
wouldn't be O(1). Didn't seem like a good tradeoff.

-Andi

2005-09-12 10:03:20

by Alan

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

> One argument was still if the zone should be 4GB or 2GB. The main
> motivation for 2GB would be an unnamed not so unpopular hardware
> raid controller (mostly found in older machines from a particular four letter
> company) who has a strange 2GB restriction in firmware. But

Adaptec AACRAID is one offender

> that one works ok with swiotlb/IOMMU anyways, so it doesn't really

Old aacraid actually cannot use IOMMU. It isn't alone in that
limitation. Most hardware that has a 30/31bit limit can't go via the
IOMMU because IOMMU space appears on the bus above 2GB so is itself
invisible to the hardware.

Other devices with similar limits include the broadcomm b44


2005-09-12 10:42:27

by Andi Kleen

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Monday 12 September 2005 12:28, Alan Cox wrote:
> > One argument was still if the zone should be 4GB or 2GB. The main
> > motivation for 2GB would be an unnamed not so unpopular hardware
> > raid controller (mostly found in older machines from a particular four
> > letter company) who has a strange 2GB restriction in firmware. But
>
> Adaptec AACRAID is one offender

Yes, that one was considered. Believe me we went over all
the broken hardware cases quite a lot before coming up with this patch.

I refuse to add unnecessary limits to everybody else just because of that
single broken firmware. 4GB limit is really common and the oddballs like
these have to use the same workarounds (custom bounce buffer in low GFP_DMA
memory) they always did on machines with enough memory.

Also the aacraid is not really a big issue on x86-64, because
afaik nobody shipped EM64T or AMD64 machines with these beasts.
(most aacraid is older Xeons without 64bit capability). There
are a few users who bought plugin cards later, but near all of these
ran into other problems because they didn't have enough memory
(I cannot in fact remember a report of someone running especially
into this problem) And the cards seem to be essentially dead in the market
now. So it's really more a theoretical problem than a practical one.
[Proof of it: the current sources don't seem to handle it, so
it cannot be that bad ;-]

And the probability of someone using a b44 in a machine with >2GB
of memory is small at best. Usually they are in really lowend
boxes where you couldn't even plug in more memory than that.
That is why I essentially ignored the b44. AFAIK the driver
has a GFP_DMA bounce workaround anyways, so it would work
anyways.

Yes I know some soundcards have similar limits, but for all
these we still have GFP_DMA and they always have been quite happy
with that.

Basically this a solution to make a lot of common hardware happy
and the oddballs and really broken cases are not worse than
they have been before.

> > that one works ok with swiotlb/IOMMU anyways, so it doesn't really
>
> Old aacraid actually cannot use IOMMU. It isn't alone in that
> limitation. Most hardware that has a 30/31bit limit can't go via the
> IOMMU because IOMMU space appears on the bus above 2GB so is itself
> invisible to the hardware.

Yes, true. Use GFP_DMA then.

Actually swiotlb would work in theory because it tends to be pretty
low, but that is not enabled on all machines and the code doesn't
attempt to handle it (and I don't plan to do it)

Hopefully the patch can go into 2.6.13.

-Andi

2005-09-12 11:08:18

by Alan

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Llu, 2005-09-12 at 12:42 +0200, Andi Kleen wrote:
> (I cannot in fact remember a report of someone running especially
> into this problem) And the cards seem to be essentially dead in the market
> now. So it's really more a theoretical problem than a practical one.
> [Proof of it: the current sources don't seem to handle it, so
> it cannot be that bad ;-]

Current sources don't handle DMA32, so that can't be bad either. Can we
stick to sensible discussion, it's quicker.

There have been various reports over time, quite a few early on when we
had a long series of on list discussions trying to debug what appeared
to be an iommu bug but was in fact a kernel bug.

You hit it on any size AMD64 with iommu, but since most aacraid users
are intel boxes it doesn't hurt too many, and the rest all know about
turning the iommu off on the box (but that hurts the rest), or just run
something else because Linux "doesn't work".

> That is why I essentially ignored the b44. AFAIK the driver
> has a GFP_DMA bounce workaround anyways, so it would work
> anyways.

Usually - the DMA zone at 16MB is too small so allocations sometimes
fail. It btw would want 1GB limits.

> Yes I know some soundcards have similar limits, but for all
> these we still have GFP_DMA and they always have been quite happy
> with that.

No current shipping card, also those that need it typically need small
amounts (they'll live with 8K)

> > Old aacraid actually cannot use IOMMU. It isn't alone in that
> > limitation. Most hardware that has a 30/31bit limit can't go via the
> > IOMMU because IOMMU space appears on the bus above 2GB so is itself
> > invisible to the hardware.
>
> Yes, true. Use GFP_DMA then.

Doesn't work. The DMA area is way too small with all the users already
and the aacraid can and does want a lot of outstanding I/O. Using a 1GB
or 2GB boundary line hurts nobody doing "allocate me some memory below
4Gb" because nobody asks for .5GB chunks. A 4GB zone means we need to
either increase the 16MB zone or add yet another one.


Alan

2005-09-12 11:23:50

by Andi Kleen

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Monday 12 September 2005 13:33, Alan Cox wrote:
> On Llu, 2005-09-12 at 12:42 +0200, Andi Kleen wrote:
> > (I cannot in fact remember a report of someone running especially
> > into this problem) And the cards seem to be essentially dead in the
> > market now. So it's really more a theoretical problem than a practical
> > one. [Proof of it: the current sources don't seem to handle it, so
> > it cannot be that bad ;-]
>
> Current sources don't handle DMA32, so that can't be bad either. Can we
> stick to sensible discussion, its quicker.

Well discussing broken hardware that is rarely used on 64bit systems
with enough memory doesn't seem very sensible to me.

But ok:

Even a 2GB DMA32 wouldn't magically fix it anyways.


>
> There have been various reports over time, quite a few early on when we
> had a long series of on list discussions trying to debug what appeared
> to be an iommu bug but was in fact a kernel bug.
>
> You hit it on any size AMD64 with iommu, but since most aacraid users
> are intel boxes it doesn't hurt too many, and the rest all know about
> turning the iommu off on the box (but that hurts the rest), or just run
> something else because Linux "doesn't work".

You need essentially the same code to fix it with GFP_DMA32 as with
GFP_DMA. Some bounce code that allocates low memory and does
the necessary bounces.

The only difference with GFP_DMA is that the bounce pool is smaller

Limiting everybody else just to get a bigger bounce pool on a single
broken device that isn't even shipping anymore doesn't seem like sensible
design approach to me.

>
> > That is why I essentially ignored the b44. AFAIK the driver
> > has a GFP_DMA bounce workaround anyways, so it would work
> > anyways.
>
> Usually - the DMA zone at 16MB is too small so allocations sometimes
> fail. It btw would want 1GB limits.

mempool - sleep on a waitqueue until someone frees. In fact the kernel's
normal allocator is already quite good at that so you might not even
need that.


>
> Doesn't work. The DMA area is way too small with all the users already
> and the aacraid can and does want a lot of outstanding I/O. Using a 1GB
> or 2GB boundary line hurts nobody doing "allocate me some memory below
> 4Gb" because nobody asks for .5GB chunks. A 4GB zone means we need to
> either increase the 16MB zone or add yet another one.

This is really only a problem if a significant fraction of your memory
is beyond the limit (otherwise most buffers can go directly).

And with the mempool sleep approach they will just get small queues. Yes
that will be slower, but if you want performance on boxes with a lot of memory
you should not buy broken hardware.

Basically aacraid was always broken and it is not more or not less broken
than it was before with DMA32.

-Andi

2005-09-12 11:44:11

by Mark Salyzyn

[permalink] [raw]
Subject: RE: [1/3] Add 4GB DMA32 zone

Andi Kleen writes:
>> Adaptec AACRAID is one offender
> 4GB limit is really common and the oddballs like
>these have to use the same workarounds (custom bounce buffer in low
GFP_DMA
>memory) they always did on machines with enough memory.

The 2GB limit is to deal with allocation of hardware command frames
(FIB) and thus only during initialization, all the adapters deliver DMA
to the full address range at 'run time' and the driver does open the
limit up at that point. The reason for this strangeness is the inability
of the Firmware to work around the Intel ATU when doing memcpy, where
the DMA engine had no such limits.

> Also the aacraid is not really an big issue on x86-64
Oh really? I have throngs (a technical term) of customers that would
bear witness otherwise.
> because afaik nobody shipped EM64T or AMD64 machines with these
beasts.
Patently false. The cards with the 2GB limits (2200S) are popular
sellers in the channel, the replacement (2230S) w/o the limit is ramping
though ...

>[Proof of it: the current sources don't seem to handle it, so it cannot
be that bad ;-]

The current sources do handle it, took nearly a year for the patches to
propagate from the scsi-list. Meanwhile I deliver a driver to all the
customers that experienced problems, while I waited ...

Sincerely -- Mark Salyzyn

2005-09-12 11:52:12

by Andi Kleen

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Thanks Mark for the corrections.

On Monday 12 September 2005 13:44, Salyzyn, Mark wrote:
> Andi Kleen writes:
> >> Adaptec AACRAID is one offender
> >
> > 4GB limit is really common and the oddballs like
> >these have to use the same workarounds (custom bounce buffer in low
>
> GFP_DMA
>
> >memory) they always did on machines with enough memory.
>
> The 2GB limit is to deal with allocation of hardware command frames
> (FIB) and thus only during initialization,
> all the adapters deliver DMA
> to the full address range at 'run time' and the driver does open the
> limit up at that point. The reason for this strangeness is the inability
> of the Firmware to work around the Intel ATU when doing memcpy, where
> the DMA engine had no such limits.

Ok that makes a lot of sense. You should probably be really using
pci_alloc_consistent() instead of GFP_DMA directly here, but other than
that it should just work.

(pci_alloc_consistent has some hacks to first try the higher zones
and only use the lower zones if the allocation didn't succeed here -
on a 2GB machine you have a 50% chance that a normal allocation
ends up below 1GB - which make this all a bit more reliable)

That probably explains the lack of reports about this issue
which I mistakenly assumed was because of the cards getting scarce.

Anyways, it shows the aacraid doesn't need GFP_DMA32 at all, which
is good.

I hope there are no other concerns about the patch and Linus
could just merge it now?

-Andi

2005-09-12 12:08:21

by Mark Salyzyn

[permalink] [raw]
Subject: RE: [1/3] Add 4GB DMA32 zone

Andi Kleen [mailto:[email protected]] writes:
> Ok that makes a lot of sense. You should probably be really using
> pci_alloc_consistent() instead of GFP_DMA directly here, but other
than that it should just work.
scsi-misc-2.6 version of the driver has but one left in an ioctl call
that could be converted over ... Mostly done.

> Anyways, it shows the aacraid doesn't need GFP_DMA32 at all, which is
good.
>
> I hope there are no other concerns about the patch and Linus could
just merge it now?

Few concerns from me once the remaining driver source propagates, we
will watch carefully and hopefully turn around bugfix patches quickly in
the driver should it be needed. I seem to lose sleep at night over the
legacy cards doing yet another strangeness ;-/

-- Mark

2005-09-12 12:09:23

by Alan

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Llu, 2005-09-12 at 13:22 +0200, Andi Kleen wrote:
> And with the mempool sleep approach they will just get small queues. Yes
> that will be slower, but if you want performance on boxes with a lot of memory
> you should not buy broken hardware.

Ironically its broken hardware it works best with. AMD64 is problematic
but Intel with the swiotlb works ;)

Ok - points made anyway, if you think the 4GB one is the best way to do
it even considering these then I've no problem with that.

2005-09-12 12:28:46

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] Re: [1/3] Add 4GB DMA32 zone

On Monday 12 September 2005 14:34, Alan Cox wrote:
> On Llu, 2005-09-12 at 13:22 +0200, Andi Kleen wrote:
> > And with the mempool sleep approach they will just get small queues. Yes
> > that will be slower, but if you want performance on boxes with a lot of
> > memory you should not buy broken hardware.
>
> Ironically its broken hardware it works best with. AMD64 is problematic
> but Intel with the swiotlb works ;)

Actually the swiotlb code currently doesn't attempt to handle dma masks
<4GB even when the bounce pool happens to be located lower - it will just fail
or use GFP_DMA. It could be fixed in theory, but it would be pretty
unreliable and sometimes work on one system and sometimes not, so I would be
reluctant to go down that path.

Also BTW on many systems which don't allocate the IOMMU aperture in BIOS and
Linux has to allocate it over memory it tends to be as low (or high) as the
swiotlb pool - it is bootmem allocated at roughly the same place in boot.
But again the code doesn't attempt to handle that, it just uses hardcoded
0xffffffff masks.

-Andi

2005-09-12 12:45:40

by Roman Zippel

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Hi,

On Sun, 11 Sep 2005, Andi Kleen wrote:

> -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */

Why needs ZONES_SHIFT to be increased?

> -#define FLAGS_RESERVED 8
> +#define FLAGS_RESERVED 9

I would prefer to keep this at 8.

bye, Roman

2005-09-12 12:47:18

by Andi Kleen

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Monday 12 September 2005 14:45, Roman Zippel wrote:
> Hi,
>
> On Sun, 11 Sep 2005, Andi Kleen wrote:
> > -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> > -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> > +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> > +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
>
> Why needs ZONES_SHIFT to be increased?
>
> > -#define FLAGS_RESERVED 8
> > +#define FLAGS_RESERVED 9
>
> I would prefer to keep this at 8.

sparsemem needs these two.

-Andi

2005-09-12 12:50:57

by Roman Zippel

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Hi,

On Mon, 12 Sep 2005, Andi Kleen wrote:

> > > -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> > > -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> > > +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> > > +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
> >
> > Why needs ZONES_SHIFT to be increased?
> >
> > > -#define FLAGS_RESERVED 8
> > > +#define FLAGS_RESERVED 9
> >
> > I would prefer to keep this at 8.
>
> sparsemem needs these two.

What two? What are you talking about???

bye, Roman

2005-09-12 12:54:43

by Andi Kleen

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Monday 12 September 2005 14:50, Roman Zippel wrote:
> Hi,
>
> On Mon, 12 Sep 2005, Andi Kleen wrote:
> > > > -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> > > > -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> > > > +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> > > > +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
> > >
> > > Why needs ZONES_SHIFT to be increased?
> > >
> > > > -#define FLAGS_RESERVED 8
> > > > +#define FLAGS_RESERVED 9
> > >
> > > I would prefer to keep this at 8.
> >
> > sparsemem needs these two.
>
> What two? What are you talking about???

The two changes you quoted.

-Andi

2005-09-12 13:01:53

by Roman Zippel

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Hi,

On Mon, 12 Sep 2005, Andi Kleen wrote:

> > On Mon, 12 Sep 2005, Andi Kleen wrote:
> > > > > -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> > > > > -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> > > > > +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> > > > > +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
> > > >
> > > > Why needs ZONES_SHIFT to be increased?
> > > >
> > > > > -#define FLAGS_RESERVED 8
> > > > > +#define FLAGS_RESERVED 9
> > > >
> > > > I would prefer to keep this at 8.
> > >
> > > sparsemem needs these two.
> >
> > What two? What are you talking about???
>
> The two changes you quoted.

Sorry, but I still don't see why. Could you _please_ be a little more
verbose? Thanks.

bye, Roman

2005-09-12 18:19:00

by Jeff Garzik

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Alan Cox wrote:
> On Llu, 2005-09-12 at 12:42 +0200, Andi Kleen wrote:
>>Yes I know some soundcards have similar limits, but for all
>>these we still have GFP_DMA and they always have been quite happy
>>with that.
>
>
> No current shipping card, also those that need it typically need small
> amounts (they'll live with 8K)


[...just because I love broken hardware, not because I've been following
this thread...]

RealTek's ALS4000 PCI card is a SoundBlaster ISA clone chip glued onto a
PCI bus. Its DMA mask is 24-bit, IIRC. :)

Jeff


2005-09-12 19:55:48

by Mark Lord

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Andi Kleen wrote:
>
> And the probability of someone using a b44 in a machine with >2GB
> of memory is small at best. Usually they are in really lowend
> boxes where you couldn't even plug in more memory than that.

Data point:

My current model Dell notebook as b44 and 2GB RAM.
The 2GB is the limit only until >1GB SODIMMs become available.

Cheers

2005-09-12 22:02:25

by Bart Hartgers

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Jeff Garzik wrote:
> Alan Cox wrote:
>
>> On Llu, 2005-09-12 at 12:42 +0200, Andi Kleen wrote:
>>
>>> Yes I know some soundcards have similar limits, but for all
>>> these we still have GFP_DMA and they always have been quite happy
>>> with that.
>>
>>
>>
>> No current shipping card, also those that need it typically need small
>> amounts (they'll live with 8K)
>
>
>
> [...just because I love broken hardware, not because I've been following
> this thread...]
>
> RealTek's ALS4000 PCI card is a SoundBlaster ISA clone chip glued onto a
> PCI bus. Its DMA mask is 24-bit, IIRC. :)
>
> Jeff
>
Yep. You're absolutely right about the card. Google doesn't find anyone
still selling them, though... Apart from ebay ;-)

(Wrote the driver and got rid of the d*mn thing 2 weeks later...)

Cheers,
Bart

2005-09-13 03:20:25

by Andi Kleen

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On Tuesday 13 September 2005 00:02, Bart Hartgers wrote:

> Yep. You're absolutely right about the card. Google doesn't find anyone
> still selling them, though... Apart from ebay ;-)
>
> (Wrote the driver and got rid of the d*mn thing 2 weeks later...)

Just to avoid any misconceptions - it should just work great on x86-64
because exactly for such hardware we kept the 16MB DMA zone.
(unless of course the driver has other problems than just DMAing)

-Andi

2005-09-13 09:15:51

by Roman Zippel

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

Hi,

On Mon, 12 Sep 2005, Andi Kleen wrote:

> > On Sun, 11 Sep 2005, Andi Kleen wrote:
> > > -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> > > -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> > > +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> > > +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
> >
> > Why needs ZONES_SHIFT to be increased?
> >
> > > -#define FLAGS_RESERVED 8
> > > +#define FLAGS_RESERVED 9
> >
> > I would prefer to keep this at 8.
>
> sparsemem needs these two.

Did I somehow offend you, that I don't deserve an answer?
The reason for my question is rather simple (and I thought obvious), the
four zone types fit into two bits, so what is sparsemem doing with this
extra bit?

bye, Roman

2005-09-13 09:47:48

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] Re: [1/3] Add 4GB DMA32 zone

On Tuesday 13 September 2005 11:15, Roman Zippel wrote:
> Hi,
>
> On Mon, 12 Sep 2005, Andi Kleen wrote:
> > > On Sun, 11 Sep 2005, Andi Kleen wrote:
> > > > -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> > > > -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> > > > +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> > > > +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
> > >
> > > Why needs ZONES_SHIFT to be increased?
> > >
> > > > -#define FLAGS_RESERVED 8
> > > > +#define FLAGS_RESERVED 9
> > >
> > > I would prefer to keep this at 8.
> >
> > sparsemem needs these two.
>
> Did I somehow offend you, that I don't deserve an answer?

Well, your aggressive tone definitely doesn't encourage speedy answers.

> The reason for my question is rather simple (and I thought obvious), the
> four zone types fit into two bits, so what is sparsemem doing with this
> extra bit?

iirc it was a patch that came in over Andrew. I can't find the email anymore
unfortunately. The argument looked plausible and I think it fixed
a boot problem for the submitter on some arch (probably IA64, on x86-64 it
worked fine without it, but I've never tried sparsemem and the code was
originally written before sparsemem). Andrew do you still have the patch with
the description? It must have been between 2.6.13mm1 and 2.6.13mm2.

You're right that four zones should in theory fit into 2 bits, so I'm
also not sure why it was needed.

-Andi

2005-09-13 10:16:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [discuss] Re: [1/3] Add 4GB DMA32 zone

Andi Kleen <[email protected]> wrote:
>
> Andrew do you still have the patch with
> the description? It must have been between 2.6.13mm1 and 2.6.13mm2.




From: KAMEZAWA Hiroyuki <[email protected]>

Because 2.6.13-mm2 adds new zone DMA32, ZONES_SHIFT becomes 3.
So, flags bits reserved for (SECTION | NODE | ZONE) should be increase.

ZONE_SHIFT is increased, FLAGS_RESERVED should be.

Signed-off-by Kamezawa Hiroyuki <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

include/linux/mmzone.h | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)

diff -puN include/linux/mmzone.h~x86_64-dma32-fix include/linux/mmzone.h
--- 25/include/linux/mmzone.h~x86_64-dma32-fix Fri Sep 9 17:13:41 2005
+++ 25-akpm/include/linux/mmzone.h Fri Sep 9 17:14:13 2005
@@ -431,9 +431,10 @@ extern struct pglist_data contig_page_da
#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
/*
* with 32 bit page->flags field, we reserve 8 bits for node/zone info.
- * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
+ * there are 4 zones (3 bits) and this leaves 8-2=6 bits for nodes.
+ * +6bits for sections if CONFIG_SPARSEMEM
*/
-#define FLAGS_RESERVED 8
+#define FLAGS_RESERVED 9

#elif BITS_PER_LONG == 64
/*
_

2005-09-13 11:32:30

by Andi Kleen

[permalink] [raw]
Subject: Re: [discuss] Re: [1/3] Add 4GB DMA32 zone


Hmm ok description is not very enlightening. 4 zones should indeed
still fit into 2 bits.

Kamezawa-san, can you please explain why exactly you did that change?

Thanks,
-Andi

On Tuesday 13 September 2005 12:15, Andrew Morton wrote:
> Andi Kleen <[email protected]> wrote:
> > Andrew do you still have the patch with
> > the description? It must have been between 2.6.13mm1 and 2.6.13mm2.
>
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Because 2.6.13-mm2 adds new zone DMA32, ZONES_SHIFT becomes 3.
> So, flags bits reserved for (SECTION | NODE | ZONE) should be increase.
>
> ZONE_SHIFT is increased, FLAGS_RESERVED should be.
>
> Signed-off-by Kamezawa Hiroyuki <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> include/linux/mmzone.h | 5 +++--
> 1 files changed, 3 insertions(+), 2 deletions(-)
>
> diff -puN include/linux/mmzone.h~x86_64-dma32-fix include/linux/mmzone.h
> --- 25/include/linux/mmzone.h~x86_64-dma32-fix Fri Sep 9 17:13:41 2005
> +++ 25-akpm/include/linux/mmzone.h Fri Sep 9 17:14:13 2005
> @@ -431,9 +431,10 @@ extern struct pglist_data contig_page_da
> #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
> /*
> * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
> - * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
> + * there are 4 zones (3 bits) and this leaves 8-2=6 bits for nodes.
> + * +6bits for sections if CONFIG_SPARSEMEM
> */
> -#define FLAGS_RESERVED 8
> +#define FLAGS_RESERVED 9
>
> #elif BITS_PER_LONG == 64
> /*
> _

2005-09-13 12:10:03

by Roman Zippel

[permalink] [raw]
Subject: Re: [discuss] Re: [1/3] Add 4GB DMA32 zone

Hi,

On Tue, 13 Sep 2005, Andi Kleen wrote:

> Hmm ok description is not very enlightening. 4 zones should indeed
> still fit into 2 bits.
>
> Kamezawa-san, can you please explain why exactly you did that change?

Probably because it triggers this check:

#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#endif

bye, Roman

2005-09-13 23:52:39

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [discuss] Re: [1/3] Add 4GB DMA32 zone

Roman Zippel wrote:
>>Kamezawa-san, can you please explain why exactly you did that change?
>
>
> Probably because it triggers this check:
>
> #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
> #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
> #endif
>
Yes, it was for this.
If ZONES_WIDTH = ZONES_SHIFT = 2 still holds, I have no problem.

Thanks,
-- Kame

> bye, Roman
>


2005-10-03 15:46:54

by Coywolf Qi Hunt

[permalink] [raw]
Subject: Re: [1/3] Add 4GB DMA32 zone

On 9/12/05, Andi Kleen <[email protected]> wrote:
> Add 4GB DMA32 zone
>
> Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.
>
> As a bit of historical background: when the x86-64 port
> was originally designed we had some discussion if we should
> use a 16MB DMA zone like i386 or a 4GB DMA zone like IA64 or
> both. Both were ruled out at this point because it was in early
> 2.4 when the VM was still quite shaky and had bad trouble even
> dealing with one DMA zone. We settled on the 16MB DMA zone mainly
> because we worried about older soundcards and the floppy.
>
> But this has always caused problems since then because
> device drivers had trouble getting enough DMA able memory. These days
> the VM works much better and the wide use of NUMA has proven
> it can deal with many zones successfully.
>
> So this patch adds both zones.
>
> This helps drivers who need a lot of memory below 4GB because
> their hardware is not accessing more (graphic drivers - proprietary
> and free ones, video frame buffer drivers, sound drivers etc.).
> Previously they could only use IOMMU+16MB GFP_DMA, which
> was not enough memory.
>
> Another common problem is that hardware who has full memory
> addressing for >4GB misses it for some control structures in memory
> (like transmit rings or other metadata). They tended to allocate memory
> in the 16MB GFP_DMA or the IOMMU/swiotlb then using pci_alloc_consistent,
> but that can tie up a lot of precious 16MB GFP_DMA/IOMMU/swiotlb memory
> (even on AMD systems the IOMMU tends to be quite small) especially if you have
> many devices. With the new zone pci_alloc_consistent can just put
> this stuff into memory below 4GB which works better.
>
> One argument was still if the zone should be 4GB or 2GB. The main
> motivation for 2GB would be an unnamed not so unpopular hardware
> raid controller (mostly found in older machines from a particular four letter
> company) who has a strange 2GB restriction in firmware. But
> that one works ok with swiotlb/IOMMU anyways, so it doesn't really
> need GFP_DMA32. I chose 4GB to be compatible with IA64 and because
> it seems to be the most common restriction.
>
> The new zone is so far added only for x86-64.
>
> For other architectures who don't set up this
> new zone nothing changes. Architectures can set a compatibility
> define in Kconfig CONFIG_DMA_IS_DMA32 that will define GFP_DMA32
> as GFP_DMA. Otherwise it's a nop because on 32bit architectures
> it's normally not needed because GFP_NORMAL (=0) is DMA able
> enough.
>
> One problem is still that GFP_DMA means different things on different
> architectures. e.g. some drivers used to have #ifdef ia64 use GFP_DMA
> (trusting it to be 4GB) #elif __x86_64__ (use other hacks like
> the swiotlb because 16MB is not enough) ... . This was quite
> ugly and is now obsolete.
>
> These should be now converted to use GFP_DMA32 unconditionally. I haven't done
> this yet. Or best only use pci_alloc_consistent/dma_alloc_coherent
> which will use GFP_DMA32 transparently.
>
> Signed-off-by: Andi Kleen <[email protected]>
>

<snip>

> Index: linux/include/linux/mmzone.h
> ===================================================================
> --- linux.orig/include/linux/mmzone.h
> +++ linux/include/linux/mmzone.h
> @@ -70,11 +70,12 @@ struct per_cpu_pageset {
> #endif
>
> #define ZONE_DMA 0
> -#define ZONE_NORMAL 1
> -#define ZONE_HIGHMEM 2
> +#define ZONE_DMA32 1
> +#define ZONE_NORMAL 2
> +#define ZONE_HIGHMEM 3
>
> -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
> -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
> +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */
> +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */
>
>
> /*
> @@ -90,7 +91,7 @@ struct per_cpu_pageset {
> * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible
> * combinations of zone modifiers in "zone modifier space".
> */
> -#define GFP_ZONEMASK 0x03
> +#define GFP_ZONEMASK 0x07
> /*
> * As an optimisation any zone modifier bits which are only valid when
> * no other zone modifier bits are set (loners) should be placed in
> @@ -110,6 +111,7 @@ struct per_cpu_pageset {
> * into multiple physical zones. On a PC we have 3 zones:

Now 4 zones.

> *
> * ZONE_DMA < 16 MB ISA DMA capable memory
> + * ZONE_DMA32 0 MB Empty
> * ZONE_NORMAL 16-896 MB direct mapped by the kernel
> * ZONE_HIGHMEM > 896 MB only page cache and user processes
> */

<snip>

--
Coywolf Qi Hunt
http://sosdg.org/~coywolf/