On X86_64, the swiotlb buffer is allocated in mem_init, after the memmap and vfs cache allocations.
On platforms with huge physical memory,
a large memmap and vfs cache may eat up all usable system memory
under 4G.
Moving swiotlb_init earlier, before the memmap is allocated,
solves this issue.
Signed-off-by: Zou Nan hai <[email protected]>
diff -Nraup linux-2.6.16-rc5/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
--- linux-2.6.16-rc5/arch/ia64/mm/init.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/ia64/mm/init.c 2006-03-01 17:40:58.000000000 +0800
@@ -585,7 +585,7 @@ mem_init (void)
* any drivers that may need the PCI DMA interface are initialized or bootmem has
* been freed.
*/
- platform_dma_init();
+ platform_dma_init(0);
#endif
#ifdef CONFIG_FLATMEM
diff -Nraup linux-2.6.16-rc5/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
--- linux-2.6.16-rc5/arch/x86_64/kernel/pci-swiotlb.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/kernel/pci-swiotlb.c 2006-03-01 17:41:01.000000000 +0800
@@ -36,7 +36,7 @@ void pci_swiotlb_init(void)
swiotlb = 1;
if (swiotlb) {
printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
+ swiotlb_init(__pa(MAX_DMA_ADDRESS));
dma_ops = &swiotlb_dma_ops;
}
}
diff -Nraup linux-2.6.16-rc5/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- linux-2.6.16-rc5/arch/x86_64/mm/init.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/mm/init.c 2006-03-01 17:41:01.000000000 +0800
@@ -437,6 +437,9 @@ void __init paging_init(void)
memory_present(0, 0, end_pfn);
sparse_init();
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
size_zones(zones, holes, 0, end_pfn);
free_area_init_node(0, NODE_DATA(0), zones,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ -528,9 +531,6 @@ void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
-#ifdef CONFIG_SWIOTLB
- pci_swiotlb_init();
-#endif
no_iommu_init();
/* How many end-of-memory variables you have, grandma! */
diff -Nraup linux-2.6.16-rc5/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
--- linux-2.6.16-rc5/arch/x86_64/mm/numa.c 2006-03-01 17:43:29.000000000 +0800
+++ b/arch/x86_64/mm/numa.c 2006-03-01 17:41:01.000000000 +0800
@@ -305,7 +305,9 @@ void __init paging_init(void)
int i;
arch_sparse_init();
-
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
for_each_online_node(i) {
setup_node_zones(i);
}
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec.h 2006-02-17 00:23:50.000000000 +0800
+++ b/include/asm-ia64/machvec.h 2006-03-01 17:41:10.000000000 +0800
@@ -36,7 +36,7 @@ typedef int ia64_mv_pci_legacy_write_t (
u8 size);
/* DMA-mapping interface: */
-typedef void ia64_mv_dma_init (void);
+typedef void ia64_mv_dma_init (size_t);
typedef void *ia64_mv_dma_alloc_coherent (struct device *, size_t, dma_addr_t *, gfp_t);
typedef void ia64_mv_dma_free_coherent (struct device *, size_t, void *, dma_addr_t);
typedef dma_addr_t ia64_mv_dma_map_single (struct device *, void *, size_t, int);
@@ -76,6 +76,11 @@ typedef unsigned int ia64_mv_readl_relax
typedef unsigned long ia64_mv_readq_relaxed_t (const volatile void __iomem *);
static inline void
+machvec_noop_size_t (size_t size)
+{
+}
+
+static inline void
machvec_noop (void)
{
}
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec_hpzx1.h b/include/asm-ia64/machvec_hpzx1.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec_hpzx1.h 2006-02-17 00:23:50.000000000 +0800
+++ b/include/asm-ia64/machvec_hpzx1.h 2006-03-01 17:41:10.000000000 +0800
@@ -20,7 +20,7 @@ extern ia64_mv_dma_mapping_error sba_dma
*/
#define platform_name "hpzx1"
#define platform_setup dig_setup
-#define platform_dma_init machvec_noop
+#define platform_dma_init machvec_noop_size_t
#define platform_dma_alloc_coherent sba_alloc_coherent
#define platform_dma_free_coherent sba_free_coherent
#define platform_dma_map_single sba_map_single
diff -Nraup linux-2.6.16-rc5/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h
--- linux-2.6.16-rc5/include/asm-ia64/machvec_sn2.h 2006-03-01 17:43:31.000000000 +0800
+++ b/include/asm-ia64/machvec_sn2.h 2006-03-01 17:41:10.000000000 +0800
@@ -102,7 +102,7 @@ extern ia64_mv_dma_supported sn_dma_sup
#define platform_pci_get_legacy_mem sn_pci_get_legacy_mem
#define platform_pci_legacy_read sn_pci_legacy_read
#define platform_pci_legacy_write sn_pci_legacy_write
-#define platform_dma_init machvec_noop
+#define platform_dma_init machvec_noop_size_t
#define platform_dma_alloc_coherent sn_dma_alloc_coherent
#define platform_dma_free_coherent sn_dma_free_coherent
#define platform_dma_map_single sn_dma_map_single
diff -Nraup linux-2.6.16-rc5/include/asm-x86_64/swiotlb.h b/include/asm-x86_64/swiotlb.h
--- linux-2.6.16-rc5/include/asm-x86_64/swiotlb.h 2006-03-01 17:43:31.000000000 +0800
+++ b/include/asm-x86_64/swiotlb.h 2006-03-01 17:41:11.000000000 +0800
@@ -41,7 +41,7 @@ extern int swiotlb_dma_mapping_error(dma
extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
void *vaddr, dma_addr_t dma_handle);
extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
-extern void swiotlb_init(void);
+extern void swiotlb_init(size_t);
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
diff -Nraup linux-2.6.16-rc5/include/linux/bootmem.h b/include/linux/bootmem.h
--- linux-2.6.16-rc5/include/linux/bootmem.h 2006-03-01 17:43:31.000000000 +0800
+++ b/include/linux/bootmem.h 2006-03-01 17:41:11.000000000 +0800
@@ -57,10 +57,14 @@ extern void __init reserve_bootmem (unsi
__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_goal(x,goal) \
+ __alloc_bootmem_low((x), SMP_CACHE_BYTES, goal)
#define alloc_bootmem_pages(x) \
__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_low((x), PAGE_SIZE, 0)
+#define alloc_bootmem_low_pages_goal(x,goal) \
+ __alloc_bootmem_low((x), PAGE_SIZE, goal)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
extern unsigned long __init free_all_bootmem (void);
extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
diff -Nraup linux-2.6.16-rc5/lib/swiotlb.c b/lib/swiotlb.c
--- linux-2.6.16-rc5/lib/swiotlb.c 2006-03-01 17:43:31.000000000 +0800
+++ b/lib/swiotlb.c 2006-03-01 17:41:12.000000000 +0800
@@ -129,8 +129,8 @@ __setup("swiotlb=", setup_io_tlb_npages)
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
*/
-void
-swiotlb_init_with_default_size (size_t default_size)
+static void
+swiotlb_init_with_default_size (size_t default_size, size_t goal)
{
unsigned long i;
@@ -142,7 +142,7 @@ swiotlb_init_with_default_size (size_t d
/*
* Get IO TLB memory from the low pages
*/
- io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+ io_tlb_start = alloc_bootmem_low_pages_goal(io_tlb_nslabs * (1 << IO_TLB_SHIFT), goal);
if (!io_tlb_start)
panic("Cannot allocate SWIOTLB buffer");
io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
@@ -161,15 +161,15 @@ swiotlb_init_with_default_size (size_t d
/*
* Get the overflow emergency buffer
*/
- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+ io_tlb_overflow_buffer = alloc_bootmem_low_goal(io_tlb_overflow, goal);
printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end));
}
void
-swiotlb_init (void)
+swiotlb_init (size_t goal)
{
- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
+ swiotlb_init_with_default_size(64 * (1<<20), goal); /* default to 64MB */
}
/*
On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <[email protected]> wrote:
> on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
>
> On platforms with huge physical memory,
> large memmap and vfs cache may eat up all usable system memory
> under 4G.
>
> Move swiotlb_init early before memmap is allocated can
> solve this issue.
Shouldn't memmap be allocated from memory above 4G (if available)? Using
up lots of <4G memory on something that doesn't need to be below 4G
sounds like a poor use of resources.
-Tony
On Thursday 02 March 2006 05:15, Tony Luck wrote:
> On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <[email protected]> wrote:
> > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> >
> > On platforms with huge physical memory,
> > large memmap and vfs cache may eat up all usable system memory
> > under 4G.
> >
> > Move swiotlb_init early before memmap is allocated can
> > solve this issue.
>
> Shouldn't memmap be allocated from memory above 4G (if available)? Using
> up lots of <4G memory on something that doesn't need to be below 4G
> sounds like a poor use of resources.
On the really large machines it will be distributed over the nodes anyways.
But yes the single node SMP case should probably allocate it higher.
-Andi
On Thu, 2006-03-02 at 12:30, Andi Kleen wrote:
> On Thursday 02 March 2006 05:15, Tony Luck wrote:
> > On 01 Mar 2006 09:10:58 +0800, Zou Nan hai <[email protected]> wrote:
> > > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> > >
> > > On platforms with huge physical memory,
> > > large memmap and vfs cache may eat up all usable system memory
> > > under 4G.
> > >
> > > Move swiotlb_init early before memmap is allocated can
> > > solve this issue.
> >
> > Shouldn't memmap be allocated from memory above 4G (if available)? Using
> > up lots of <4G memory on something that doesn't need to be below 4G
> > sounds like a poor use of resources.
>
> On the really large machines it will be distributed over the nodes anyways.
> But yes the single node SMP case should probably allocate it higher.
>
> -Andi
Really? Then how about the following patch?
Let the normal bootmem allocator try above 4G first.
This will save more of the memory below 4G.
Signed-off-by: Zou Nan hai <[email protected]>
--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
+++ b/mm/bootmem.c 2006-03-03 09:05:17.000000000 +0800
@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
return(free_all_bootmem_core(NODE_DATA(0)));
}
+#define LOW32LIMIT 0xffffffff
+
void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
+ if (goal < LOW32LIMIT) {
+ for_each_pgdat(pgdat)
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, LOW32LIMIT, 0)))
+ return(ptr);
+ }
+
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
align, goal, 0)))
return(ptr);
-
/*
* Whoops, we cannot satisfy the allocation request.
*/
@@ -405,6 +413,13 @@ void * __init __alloc_bootmem_node(pg_da
{
void *ptr;
+ if (goal < LOW32LIMIT) {
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align,
+ LOW32LIMIT, 0);
+ if (ptr)
+ return (ptr);
+ }
+
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
return (ptr);
@@ -412,7 +427,6 @@ void * __init __alloc_bootmem_node(pg_da
return __alloc_bootmem(size, align, goal);
}
-#define LOW32LIMIT 0xffffffff
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
{
>>-----Original Message-----
>>From: [email protected] [mailto:[email protected]] On Behalf Of Zou Nan hai
>>Sent: 2006年3月2日 12:33
>>
>>Really, then how about the following patch?
>>
>>Let normal bootmem allocator go above 4G first.
>>This will save more memory with address less than 4G.
>>
>>Signed-off-by: Zou Nan hai <[email protected]>
>>
>>--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
>>+++ b/mm/bootmem.c 2006-03-03 09:05:17.000000000 +0800
>>@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
>> return(free_all_bootmem_core(NODE_DATA(0)));
>> }
>>
>>+#define LOW32LIMIT 0xffffffff
>>+
>> void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
>> {
>> pg_data_t *pgdat = pgdat_list;
>> void *ptr;
>>
>>+ if (goal < LOW32LIMIT) {
On i386, above is always true.
>>+ for_each_pgdat(pgdat)
>>+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
>>+ align, LOW32LIMIT, 0)))
>>+ return(ptr);
>>+ }
On Thu, 2006-03-02 at 17:09, Zhang, Yanmin wrote:
> >>-----Original Message-----
> >>From: [email protected] [mailto:[email protected]] On Behalf Of Zou Nan hai
> >>Sent: 2006年3月2日 12:33
> >>
> >>Really, then how about the following patch?
> >>
> >>Let normal bootmem allocator go above 4G first.
> >>This will save more memory with address less than 4G.
> >>
> >>Signed-off-by: Zou Nan hai <[email protected]>
> >>
> >>--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
> >>+++ b/mm/bootmem.c 2006-03-03 09:05:17.000000000 +0800
> >>@@ -381,16 +381,24 @@ unsigned long __init free_all_bootmem (v
> >> return(free_all_bootmem_core(NODE_DATA(0)));
> >> }
> >>
> >>+#define LOW32LIMIT 0xffffffff
> >>+
> >> void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
> >> {
> >> pg_data_t *pgdat = pgdat_list;
> >> void *ptr;
> >>
> >>+ if (goal < LOW32LIMIT) {
> On i386, above is always true.
>
>
Ok, I modified the patch.
On a single-node SMP system with large physical memory,
bootmem allocations such as memmap and vfs_cache
may eat up the usable memory under 4G, so that the software I/O TLB is then unable to allocate its bounce buffer.
This patch modifies the bootmem allocator so that
normal bootmem allocations on 64-bit systems first try addresses
above 4G.
Signed-off-by: Zou Nan hai <[email protected]>
--- linux-2.6.16-rc5/mm/bootmem.c 2006-03-03 08:31:52.000000000 +0800
+++ b/mm/bootmem.c 2006-03-04 03:48:55.000000000 +0800
@@ -381,16 +381,25 @@ unsigned long __init free_all_bootmem (v
return(free_all_bootmem_core(NODE_DATA(0)));
}
+#define LOW32LIMIT 0xffffffff
+
void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
{
pg_data_t *pgdat = pgdat_list;
void *ptr;
+#if (BITS_PER_LONG == 64)
+ if (goal < LOW32LIMIT) {
+ for_each_pgdat(pgdat)
+ if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+ align, LOW32LIMIT, 0)))
+ return(ptr);
+ }
+#endif
for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
align, goal, 0)))
return(ptr);
-
/*
* Whoops, we cannot satisfy the allocation request.
*/
@@ -404,6 +413,14 @@ void * __init __alloc_bootmem_node(pg_da
unsigned long goal)
{
void *ptr;
+#if (BITS_PER_LONG == 64)
+ if (goal < LOW32LIMIT) {
+ ptr = __alloc_bootmem_core(pgdat->bdata, size, align,
+ LOW32LIMIT, 0);
+ if (ptr)
+ return (ptr);
+ }
+#endif
ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
@@ -412,7 +429,6 @@ void * __init __alloc_bootmem_node(pg_da
return __alloc_bootmem(size, align, goal);
}
-#define LOW32LIMIT 0xffffffff
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
{
On Friday 03 March 2006 00:35, Zou Nan hai wrote:
> This patch modify the bootmem allocator,
> let normal bootmem allocation on 64 bit system first go above 4G
> address.
That's very ugly and likely to break some architectures. Sorry
but #ifdefs is the wrong way to do this.
Passing a limit parameter is better and use that in the swiotlb
allocation. If you're worried about changing too many callers
you could add a new entry point.
-Andi
>-----Original Message-----
>From: Andi Kleen [mailto:[email protected]]
>Sent: Thursday, March 02, 2006 5:32 PM
>To: Zou, Nanhai
>Cc: Zhang, Yanmin; Luck, Tony; LKML; Andrew Morton; Pallipadi,
>Venkatesh
>Subject: Re: [Patch] Move swiotlb_init early on X86_64
>
>On Friday 03 March 2006 00:35, Zou Nan hai wrote:
>
>> This patch modify the bootmem allocator,
>> let normal bootmem allocation on 64 bit system first go above 4G
>> address.
>
>That's very ugly and likely to break some architectures. Sorry
>but #ifdefs is the wrong way to do this.
>
>Passing a limit parameter is better and use that in the swiotlb
>allocation. If you're worried about changing too many callers
>you could add a new entry point.
>
Another potential issue with this approach:
On a 64 bit system with less than 4G phys memory, we will fail
to get any memory above 4G and fall back to start from '0'.
This is different from original behaviour, where goal was
MAX_DMA_ADDRESS (16M) and we would allocate memory starting
from 16M. As a result, we will now eat up memory in 0-16M range
and may break some legacy drivers as they will not get any memory.
If we go this way, then we should fall back to the original goal if we
are not able to get any memory above 4G.
Thanks,
Venki
On Wednesday 01 March 2006 02:10, Zou Nan hai wrote:
> on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
>
> On platforms with huge physical memory,
> large memmap and vfs cache may eat up all usable system memory
> under 4G.
>
> Move swiotlb_init early before memmap is allocated can
> solve this issue.
>
> Signed-off-by: Zou Nan hai <[email protected]>
I have now come up with a simpler change that should fix the problem too.
It just tries to move the memmap to the end of the node. I don't have a system
big enough to test the original problem though.
It should be fairly safe because if the allocation fails we just fall back
to the normal old way of allocating it near the beginning.
Try to allocate node memmap near the end of node
This fixes problems with very large nodes (over 128GB) filling up all of
the first 4GB with their mem_map and not leaving enough
space for the swiotlb.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86_64/mm/numa.c | 12 +++++++++++-
include/linux/bootmem.h | 3 +++
mm/bootmem.c | 2 +-
3 files changed, 15 insertions(+), 2 deletions(-)
Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -172,7 +172,7 @@ void __init setup_node_bootmem(int nodei
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
- unsigned long start_pfn, end_pfn;
+ unsigned long start_pfn, end_pfn, memmapsize, limit;
unsigned long zones[MAX_NR_ZONES];
unsigned long holes[MAX_NR_ZONES];
@@ -182,6 +182,16 @@ void __init setup_node_zones(int nodeid)
Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
nodeid, start_pfn, end_pfn);
+ /* Try to allocate mem_map at end to not fill up precious <4GB
+ memory. */
+ memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
+ limit = end_pfn << PAGE_SHIFT;
+ NODE_DATA(nodeid)->node_mem_map =
+ __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
+ memmapsize, SMP_CACHE_BYTES,
+ limit,
+ round_down(limit - memmapsize, PAGE_SIZE));
+
size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
start_pfn, holes);
Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -52,6 +52,9 @@ extern void * __init __alloc_bootmem_low
unsigned long size,
unsigned long align,
unsigned long goal);
+extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
+ unsigned long size, unsigned long align, unsigned long goal,
+ unsigned long limit);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
#define alloc_bootmem(x) \
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(boo
*
* NOTE: This function is _not_ reentrant.
*/
-static void * __init
+void * __init
__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
unsigned long align, unsigned long goal, unsigned long limit)
{
On Tue, 2006-03-07 at 16:39, Andi Kleen wrote:
> On Wednesday 01 March 2006 02:10, Zou Nan hai wrote:
> > on X86_64, swiotlb buffer is allocated in mem_init, after memmap and vfs cache allocation.
> >
> > On platforms with huge physical memory,
> > large memmap and vfs cache may eat up all usable system memory
> > under 4G.
> >
> > Move swiotlb_init early before memmap is allocated can
> > solve this issue.
> >
> > Signed-off-by: Zou Nan hai <[email protected]>
>
>
> I came up with a simpler change now that should fix the problem too.
> It just try to move the memmap to the end of the node. I don't have a system
> big enough to test the original problem though.
>
> It should be fairly safe because if the allocation fails we just fallback
> to the normal old way of allocating it near the beginning.
>
> Try to allocate node memmap near the end of node
>
> This fixes problems with very large nodes (over 128GB) filling up all of
> the first 4GB with their mem_map and not leaving enough
> space for the swiotlb.
>
>
> Signed-off-by: Andi Kleen <[email protected]>
>
> ---
> arch/x86_64/mm/numa.c | 12 +++++++++++-
> include/linux/bootmem.h | 3 +++
> mm/bootmem.c | 2 +-
> 3 files changed, 15 insertions(+), 2 deletions(-)
>
> Index: linux/arch/x86_64/mm/numa.c
> ===================================================================
> --- linux.orig/arch/x86_64/mm/numa.c
> +++ linux/arch/x86_64/mm/numa.c
> @@ -172,7 +172,7 @@ void __init setup_node_bootmem(int nodei
> /* Initialize final allocator for a zone */
> void __init setup_node_zones(int nodeid)
> {
> - unsigned long start_pfn, end_pfn;
> + unsigned long start_pfn, end_pfn, memmapsize, limit;
> unsigned long zones[MAX_NR_ZONES];
> unsigned long holes[MAX_NR_ZONES];
>
> @@ -182,6 +182,16 @@ void __init setup_node_zones(int nodeid)
> Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
> nodeid, start_pfn, end_pfn);
>
> + /* Try to allocate mem_map at end to not fill up precious <4GB
> + memory. */
> + memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
> + limit = end_pfn << PAGE_SHIFT;
> + NODE_DATA(nodeid)->node_mem_map =
> + __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
> + memmapsize, SMP_CACHE_BYTES,
> + limit,
> + round_down(limit - memmapsize, PAGE_SIZE));
> +
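The goal and limit arguments look swapped here (__alloc_bootmem_core takes goal, then limit). Shouldn't this be:
, round_down(limit - memmapsize, PAGE_SIZE), limit);?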
Zou Nan hai
On Wednesday 08 March 2006 00:23, Zou Nan hai wrote:
>
> , round_down(limit - memmapsize, PAGE_SIZE), limit);?
Indeed. Thanks for catching that.
-Andi