2009-10-28 06:56:10

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 0/10] x86: handle HW IOMMU initialization failure gracefully

If HW IOMMU initialization fails (Intel VT-d often does, typically due
to BIOS bugs), we fall back to nommu. That doesn't work for most
systems, since nowadays machines have more than 4GB of memory and
therefore need swiotlb rather than nommu.

The problem is that by the time HW IOMMU initialization fails, it is
too late to initialize swiotlb; its memory must be allocated earlier,
from the bootmem allocator. Chris explained the issue in detail:

http://marc.info/?l=linux-kernel&m=125657444317079&w=2


The current x86 IOMMU initialization sequence is already too
complicated, and handling the above issue within it would only make it
hackier.

This series changes the x86 IOMMU initialization sequence to handle the
above issue cleanly.

The new x86 IOMMU initialization sequence is:

1. initializing swiotlb (and setting swiotlb to 1) in the case of
(max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to
swiotlb_dma_ops or nommu_dma_ops.

2. calling the detection functions of all the IOMMUs

3. the detection function that finds its IOMMU sets
x86_init.iommu.iommu_init to that IOMMU's initialization function (so
we avoid needlessly calling the initialization functions of all the
IOMMUs).

4. if the IOMMU initialization function doesn't need swiotlb (e.g. the
initialization was successful), it sets swiotlb back to zero.

5. if we find that swiotlb has been set back to zero, we free the
swiotlb resources.

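To make the ordering concrete, here is a rough sketch of the resulting
x86 flow (simplified: error handling and #ifdefs are elided; the
function names follow the patches in this series, everything else is
illustrative):

void __init pci_iommu_alloc(void)
{
        /* step 1: may set swiotlb to 1 and pick swiotlb/nommu dma_ops */
        pci_swiotlb_init();

        /*
         * steps 2-3: each detection routine that finds its hardware
         * installs its own init function in x86_init.iommu.iommu_init
         */
        gart_iommu_hole_init();
        detect_calgary();
        detect_intel_iommu();
        amd_iommu_detect();
}

static int __init pci_iommu_init(void)
{
        /* step 4: on success the IOMMU init function sets swiotlb to 0 */
        x86_init.iommu.iommu_init();

        if (swiotlb)
                swiotlb_print_info();   /* keep the bounce buffers */
        else
                swiotlb_free();         /* step 5: give the ~64MB back */
        return 0;
}
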
---
arch/ia64/kernel/pci-swiotlb.c | 4 +-
arch/powerpc/kernel/setup_32.c | 2 +-
arch/powerpc/kernel/setup_64.c | 2 +-
arch/x86/include/asm/amd_iommu.h | 2 -
arch/x86/include/asm/calgary.h | 2 -
arch/x86/include/asm/gart.h | 5 +--
arch/x86/include/asm/iommu.h | 2 +-
arch/x86/include/asm/x86_init.h | 9 ++++
arch/x86/kernel/amd_iommu.c | 2 +-
arch/x86/kernel/amd_iommu_init.c | 19 ++------
arch/x86/kernel/aperture_64.c | 4 +-
arch/x86/kernel/pci-calgary_64.c | 19 ++-----
arch/x86/kernel/pci-dma.c | 23 ++++-----
arch/x86/kernel/pci-gart_64.c | 16 ++----
arch/x86/kernel/pci-nommu.c | 9 ----
arch/x86/kernel/pci-swiotlb.c | 8 ++--
arch/x86/kernel/x86_init.c | 5 ++
drivers/pci/dmar.c | 7 ++-
drivers/pci/intel-iommu.c | 4 +-
include/linux/bootmem.h | 1 +
include/linux/dmar.h | 10 ----
include/linux/swiotlb.h | 5 +-
lib/swiotlb.c | 45 +++++++++++++++---
mm/bootmem.c | 98 +++++++++++++++++++++++++++++---------
24 files changed, 177 insertions(+), 126 deletions(-)




2009-10-28 06:54:17

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 01/10] Add iommu_init to x86_init_ops

We currently call the detection functions of all the IOMMUs, and then
all of their initialization functions. The latter is pointless since we
never detect multiple different IOMMUs. What we need to do is call only
the initialization function of the IOMMU that was actually detected.

This adds an iommu_init hook to x86_init_ops so that an IOMMU detection
function can install its initialization function in the hook.

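With the hook in place, a detection routine that finds its hardware
only has to register its own initialization function, roughly like this
(illustrative only: my_iommu_present() and my_iommu_init() are made-up
names; the real conversions follow in the next patches):

void __init detect_my_iommu(void)
{
        if (!my_iommu_present())        /* hypothetical probe */
                return;

        iommu_detected = 1;
        /* only this IOMMU's init gets called later from pci_iommu_init() */
        x86_init.iommu.iommu_init = my_iommu_init;
}
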
Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/include/asm/iommu.h | 1 +
arch/x86/include/asm/x86_init.h | 9 +++++++++
arch/x86/kernel/pci-dma.c | 2 ++
arch/x86/kernel/x86_init.c | 5 +++++
4 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21b..42aa977 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -3,6 +3,7 @@

extern void pci_iommu_shutdown(void);
extern void no_iommu_init(void);
+static inline int iommu_init_noop(void) { return 0; }
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd..a58644a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -91,6 +91,14 @@ struct x86_init_timers {
};

/**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init: platform specific iommu setup
+ */
+struct x86_init_iommu {
+ int (*iommu_init)(void);
+};
+
+/**
* struct x86_init_ops - functions for platform specific setup
*
*/
@@ -101,6 +109,7 @@ struct x86_init_ops {
struct x86_init_oem oem;
struct x86_init_paging paging;
struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
};

/**
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b2a71dc..5609973 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -292,6 +292,8 @@ static int __init pci_iommu_init(void)
dma_debug_add_bus(&pci_bus_type);
#endif

+ x86_init.iommu.iommu_init();
+
calgary_iommu_init();

intel_iommu_init();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a..d89ce53 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -14,6 +14,7 @@
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/tsc.h>
+#include <asm/iommu.h>

void __cpuinit x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
@@ -62,6 +63,10 @@ struct x86_init_ops x86_init __initdata = {
.tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
},
+
+ .iommu = {
+ .iommu_init = iommu_init_noop,
+ },
};

struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
--
1.5.6.5

2009-10-28 06:55:34

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 02/10] Calgary: convert detect_calgary to use iommu_init hook

This changes detect_calgary() to set calgary_iommu_init() as the
iommu_init hook when it finds the Calgary IOMMU.

We can kill the code in calgary_iommu_init() that checks whether the
IOMMU was found, since detect_calgary() installs the hook only when it
has found the IOMMU.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/include/asm/calgary.h | 2 --
arch/x86/kernel/pci-calgary_64.c | 11 +++++------
arch/x86/kernel/pci-dma.c | 2 --
3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb..0918654 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
extern int use_calgary;

#ifdef CONFIG_CALGARY_IOMMU
-extern int calgary_iommu_init(void);
extern void detect_calgary(void);
#else
-static inline int calgary_iommu_init(void) { return 1; }
static inline void detect_calgary(void) { return; }
#endif

diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3be..47bd419 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -46,6 +46,7 @@
#include <asm/dma.h>
#include <asm/rio.h>
#include <asm/bios_ebda.h>
+#include <asm/x86_init.h>

#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -1344,6 +1345,8 @@ static void __init get_tce_space_from_tar(void)
return;
}

+int __init calgary_iommu_init(void);
+
void __init detect_calgary(void)
{
int bus;
@@ -1445,6 +1448,8 @@ void __init detect_calgary(void)
/* swiotlb for devices that aren't behind the Calgary. */
if (max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
+
+ x86_init.iommu.iommu_init = calgary_iommu_init;
}
return;

@@ -1461,12 +1466,6 @@ int __init calgary_iommu_init(void)
{
int ret;

- if (no_iommu || (swiotlb && !calgary_detected))
- return -ENODEV;
-
- if (!calgary_detected)
- return -ENODEV;
-
/* ok, we're trying to use Calgary - let's roll */
printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5609973..1b06714 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -294,8 +294,6 @@ static int __init pci_iommu_init(void)

x86_init.iommu.iommu_init();

- calgary_iommu_init();
-
intel_iommu_init();

amd_iommu_init();
--
1.5.6.5

2009-10-28 06:56:39

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 03/10] GART: convert gart_iommu_hole_init to use iommu_init hook

This changes gart_iommu_hole_init() to set gart_iommu_init() as the
iommu_init hook when it finds the GART IOMMU.

We can kill the code in gart_iommu_init() that checks whether the IOMMU
was found, since gart_iommu_hole_init() installs the hook only when it
has found the IOMMU.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/include/asm/gart.h | 5 +----
arch/x86/kernel/aperture_64.c | 2 ++
arch/x86/kernel/pci-dma.c | 2 --
arch/x86/kernel/pci-gart_64.c | 15 +++++----------
4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa..51d28a0 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,7 +35,7 @@ extern int gart_iommu_aperture_allowed;
extern int gart_iommu_aperture_disabled;

extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
+extern int gart_iommu_init(void);
extern void gart_iommu_shutdown(void);
extern void __init gart_parse_options(char *);
extern void gart_iommu_hole_init(void);
@@ -48,9 +48,6 @@ extern void gart_iommu_hole_init(void);
static inline void early_gart_iommu_check(void)
{
}
-static inline void gart_iommu_init(void)
-{
-}
static inline void gart_iommu_shutdown(void)
{
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d..03933cf 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/k8.h>
+#include <asm/x86_init.h>

int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
@@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void)

iommu_detected = 1;
gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;

aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1b06714..9830f12 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -298,8 +298,6 @@ static int __init pci_iommu_init(void)

amd_iommu_init();

- gart_iommu_init();
-
no_iommu_init();
return 0;
}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64..3bcae8f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -708,7 +708,7 @@ void gart_iommu_shutdown(void)
}
}

-void __init gart_iommu_init(void)
+int __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long iommu_start;
@@ -718,7 +718,7 @@ void __init gart_iommu_init(void)
long i;

if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
- return;
+ return 0;

#ifndef CONFIG_AGP_AMD64
no_agp = 1;
@@ -730,13 +730,6 @@ void __init gart_iommu_init(void)
(agp_copy_info(agp_bridge, &info) < 0);
#endif

- if (swiotlb)
- return;
-
- /* Did we detect a different HW IOMMU? */
- if (iommu_detected && !gart_iommu_aperture)
- return;
-
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
@@ -746,7 +739,7 @@ void __init gart_iommu_init(void)
"but GART IOMMU not available.\n");
printk(KERN_WARNING "falling back to iommu=soft.\n");
}
- return;
+ return 0;
}

/* need to map that range */
@@ -838,6 +831,8 @@ void __init gart_iommu_init(void)

flush_gart();
dma_ops = &gart_dma_ops;
+
+ return 0;
}

void __init gart_parse_options(char *p)
--
1.5.6.5

2009-10-28 06:54:30

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 04/10] amd_iommu: convert amd_iommu_detect to use iommu_init hook

This changes amd_iommu_detect() to set amd_iommu_init() as the
iommu_init hook when it finds the AMD IOMMU.

We can kill the code in amd_iommu_init() that checks whether the IOMMU
was found, since amd_iommu_detect() installs the hook only when it has
found the IOMMU.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/include/asm/amd_iommu.h | 2 --
arch/x86/kernel/amd_iommu_init.c | 17 +++--------------
arch/x86/kernel/pci-dma.c | 2 --
3 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index ac95995..240617e 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -23,7 +23,6 @@
#include <linux/irqreturn.h>

#ifdef CONFIG_AMD_IOMMU
-extern int amd_iommu_init(void);
extern int amd_iommu_init_dma_ops(void);
extern int amd_iommu_init_passthrough(void);
extern void amd_iommu_detect(void);
@@ -32,7 +31,6 @@ extern void amd_iommu_flush_all_domains(void);
extern void amd_iommu_flush_all_devices(void);
extern void amd_iommu_shutdown(void);
#else
-static inline int amd_iommu_init(void) { return -ENODEV; }
static inline void amd_iommu_detect(void) { }
static inline void amd_iommu_shutdown(void) { }
#endif
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d4..52bf59f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -29,6 +29,7 @@
#include <asm/amd_iommu.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/x86_init.h>

/*
* definitions for the ACPI scanning code
@@ -1154,19 +1155,10 @@ static struct sys_device device_amd_iommu = {
* functions. Finally it prints some information about AMD IOMMUs and
* the driver state and enables the hardware.
*/
-int __init amd_iommu_init(void)
+static int __init amd_iommu_init(void)
{
int i, ret = 0;

-
- if (no_iommu) {
- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
- return 0;
- }
-
- if (!amd_iommu_detected)
- return -ENODEV;
-
/*
* First parse ACPI tables to find the largest Bus/Dev/Func
* we need to handle. Upon this information the shared data
@@ -1326,10 +1318,7 @@ void __init amd_iommu_detect(void)
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
amd_iommu_detected = 1;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
+ x86_init.iommu.iommu_init = amd_iommu_init;
}
}

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9830f12..6abb8c2 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -296,8 +296,6 @@ static int __init pci_iommu_init(void)

intel_iommu_init();

- amd_iommu_init();
-
no_iommu_init();
return 0;
}
--
1.5.6.5

2009-10-28 06:54:23

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 05/10] intel-iommu: convert detect_intel_iommu to use iommu_init hook

This changes detect_intel_iommu() to set intel_iommu_init() as the
iommu_init hook when it finds the IOMMU.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/kernel/pci-dma.c | 2 --
drivers/pci/dmar.c | 4 ++++
include/linux/dmar.h | 10 ----------
3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6abb8c2..f70b766 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -294,8 +294,6 @@ static int __init pci_iommu_init(void)

x86_init.iommu.iommu_init();

- intel_iommu_init();
-
no_iommu_init();
return 0;
}
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 22b02c6..bce9cd7 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -617,6 +617,10 @@ void __init detect_intel_iommu(void)
!dmar_disabled)
iommu_detected = 1;
#endif
+#ifdef CONFIG_X86
+ if (ret)
+ x86_init.iommu.iommu_init = intel_iommu_init;
+#endif
}
early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
dmar_tbl = NULL;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 4a2b162..3d12f28 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -208,16 +208,6 @@ struct dmar_atsr_unit {
u8 include_all:1; /* include all ports */
};

-/* Intel DMAR initialization functions */
extern int intel_iommu_init(void);
-#else
-static inline int intel_iommu_init(void)
-{
-#ifdef CONFIG_INTR_REMAP
- return dmar_dev_scope_init();
-#else
- return -ENODEV;
#endif
-}
-#endif /* !CONFIG_DMAR */
#endif /* __DMAR_H__ */
--
1.5.6.5

2009-10-28 06:55:37

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 06/10] bootmem: refactor free_all_bootmem_core

From: Chris Wright <[email protected]>

Move the loop that frees all bootmem pages back to the page allocator
into its own function. This should have no functional effect and allows
the function to be reused later.

Reviewed-by: FUJITA Tomonori <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
---
mm/bootmem.c | 61 +++++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2..94ef2e7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,17 +143,22 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}

-static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+/**
+ * free_bootmem_pages - frees bootmem pages to page allocator
+ * @start: start pfn
+ * @end: end pfn
+ * @map: bootmem bitmap of reserved pages
+ *
+ * This will free the pages in the range @start to @end, making them
+ * available to the page allocator. The @map will be used to skip
+ * reserved pages. Returns the count of pages freed.
+ */
+static unsigned long __init free_bootmem_pages(unsigned long start,
+ unsigned long end,
+ unsigned long *map)
{
+ unsigned long cursor, count = 0;
int aligned;
- struct page *page;
- unsigned long start, end, pages, count = 0;
-
- if (!bdata->node_bootmem_map)
- return 0;
-
- start = bdata->node_min_pfn;
- end = bdata->node_low_pfn;

/*
* If the start is aligned to the machines wordsize, we might
@@ -161,27 +166,25 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
*/
aligned = !(start & (BITS_PER_LONG - 1));

- bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
- bdata - bootmem_node_data, start, end, aligned);
+ for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
+ unsigned long idx, vec;

- while (start < end) {
- unsigned long *map, idx, vec;
-
- map = bdata->node_bootmem_map;
- idx = start - bdata->node_min_pfn;
+ idx = cursor - start;
vec = ~map[idx / BITS_PER_LONG];

- if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+ if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) {
int order = ilog2(BITS_PER_LONG);

- __free_pages_bootmem(pfn_to_page(start), order);
+ __free_pages_bootmem(pfn_to_page(cursor), order);
count += BITS_PER_LONG;
} else {
unsigned long off = 0;

while (vec && off < BITS_PER_LONG) {
if (vec & 1) {
- page = pfn_to_page(start + off);
+ struct page *page;
+
+ page = pfn_to_page(cursor + off);
__free_pages_bootmem(page, 0);
count++;
}
@@ -189,8 +192,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
off++;
}
}
- start += BITS_PER_LONG;
}
+ return count;
+}
+
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+{
+ struct page *page;
+ unsigned long start, end, *map, pages, count = 0;
+
+ if (!bdata->node_bootmem_map)
+ return 0;
+
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+ map = bdata->node_bootmem_map;
+
+ bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+ start, end);
+
+ count = free_bootmem_pages(start, end, map);

page = virt_to_page(bdata->node_bootmem_map);
pages = bdata->node_low_pfn - bdata->node_min_pfn;
--
1.5.6.5

2009-10-28 06:55:03

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 07/10] bootmem: add free_bootmem_late

From: Chris Wright <[email protected]>

Add a new function for freeing bootmem after the bootmem allocator has
been released and the unreserved pages given to the page allocator.
This allows us to reserve bootmem and then release it if we later
discover it was not needed.

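A minimal usage sketch (hypothetical caller: buf, size and needed are
illustrative; the swiotlb conversion later in this series is the real
user):

/* early boot, while the bootmem allocator is still alive */
buf = alloc_bootmem_low_pages(size);

/* ... later, after mem_init() has released bootmem to the buddy ... */
if (!needed)
        free_bootmem_late(__pa(buf), size);
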
Reviewed-by: FUJITA Tomonori <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
---
include/linux/bootmem.h | 1 +
mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index dd97fb8..b10ec49 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
unsigned long addr,
unsigned long size);
extern void free_bootmem(unsigned long addr, unsigned long size);
+extern void free_bootmem_late(unsigned long addr, unsigned long size);

/*
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 94ef2e7..6c04e6e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -151,7 +151,9 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
*
* This will free the pages in the range @start to @end, making them
* available to the page allocator. The @map will be used to skip
- * reserved pages. Returns the count of pages freed.
+ * reserved pages. In the case that @map is NULL, the bootmem allocator
+ * is already free and the range is contiguous. Returns the count of
+ * pages freed.
*/
static unsigned long __init free_bootmem_pages(unsigned long start,
unsigned long end,
@@ -164,13 +166,23 @@ static unsigned long __init free_bootmem_pages(unsigned long start,
* If the start is aligned to the machines wordsize, we might
* be able to free pages in bulks of that order.
*/
- aligned = !(start & (BITS_PER_LONG - 1));
+ if (map)
+ aligned = !(start & (BITS_PER_LONG - 1));
+ else
+ aligned = 1;

for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
- unsigned long idx, vec;
+ unsigned long vec;

- idx = cursor - start;
- vec = ~map[idx / BITS_PER_LONG];
+ if (map) {
+ unsigned long idx = cursor - start;
+ vec = ~map[idx / BITS_PER_LONG];
+ } else {
+ if (end - cursor >= BITS_PER_LONG)
+ vec = ~0UL;
+ else
+ vec = (1UL << (end - cursor)) - 1;
+ }

if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) {
int order = ilog2(BITS_PER_LONG);
@@ -387,6 +399,27 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
}

/**
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+ unsigned long start, end;
+
+ kmemleak_free_part(__va(addr), size);
+
+ start = PFN_UP(addr);
+ end = PFN_DOWN(addr + size);
+
+ totalram_pages += free_bootmem_pages(start, end, NULL);
+}
+
+/**
* reserve_bootmem_node - mark a page range as reserved
* @pgdat: node the range resides on
* @physaddr: starting address of the range
--
1.5.6.5

2009-10-28 06:54:29

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 08/10] swiotlb: add swiotlb_free function

The swiotlb_free() function frees all memory allocated for swiotlb.

We need to initialize swiotlb before IOMMU initialization (x86 and
powerpc need to allocate the swiotlb memory from the bootmem
allocator). If IOMMU initialization succeeds, we then want to free the
swiotlb resources (we don't want to waste 64MB).

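Roughly, the intended call sequence for an architecture that sets up
swiotlb speculatively looks like this (just a sketch: hw_iommu_ok is an
illustrative condition, the actual x86 wiring is done in the last patch
of this series):

swiotlb_init();                 /* early: ~64MB from bootmem */
/* ... HW IOMMU detection and initialization run ... */
if (hw_iommu_ok)
        swiotlb_free();         /* the IOMMU took over, give the memory back */
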
Signed-off-by: FUJITA Tomonori <[email protected]>
---
include/linux/swiotlb.h | 1 +
lib/swiotlb.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 73b1f1c..51b6cc2 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -88,4 +88,5 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
extern int
swiotlb_dma_supported(struct device *hwdev, u64 mask);

+extern void __init swiotlb_free(void);
#endif /* __LINUX_SWIOTLB_H */
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index ac25cd2..eee512b 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -97,6 +97,8 @@ static phys_addr_t *io_tlb_orig_addr;
*/
static DEFINE_SPINLOCK(io_tlb_lock);

+static int late_alloc;
+
static int __init
setup_io_tlb_npages(char *str)
{
@@ -262,6 +264,8 @@ swiotlb_late_init_with_default_size(size_t default_size)

swiotlb_print_info(bytes);

+ late_alloc = 1;
+
return 0;

cleanup4:
@@ -281,6 +285,32 @@ cleanup1:
return -ENOMEM;
}

+void __init swiotlb_free(void)
+{
+ if (!io_tlb_overflow_buffer)
+ return;
+
+ if (late_alloc) {
+ free_pages((unsigned long)io_tlb_overflow_buffer,
+ get_order(io_tlb_overflow));
+ free_pages((unsigned long)io_tlb_orig_addr,
+ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+ sizeof(int)));
+ free_pages((unsigned long)io_tlb_start,
+ get_order(io_tlb_nslabs << IO_TLB_SHIFT));
+ } else {
+ free_bootmem_late(__pa(io_tlb_overflow_buffer),
+ io_tlb_overflow);
+ free_bootmem_late(__pa(io_tlb_orig_addr),
+ io_tlb_nslabs * sizeof(phys_addr_t));
+ free_bootmem_late(__pa(io_tlb_list),
+ io_tlb_nslabs * sizeof(int));
+ free_bootmem_late(__pa(io_tlb_start),
+ io_tlb_nslabs << IO_TLB_SHIFT);
+ }
+}
+
static int is_swiotlb_buffer(phys_addr_t paddr)
{
return paddr >= virt_to_phys(io_tlb_start) &&
--
1.5.6.5

2009-10-28 06:54:42

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 09/10] swiotlb: export swiotlb_print_info

This enables us to avoid printing the swiotlb memory info when we
initialize swiotlb, because after swiotlb initialization we might find
that we don't need swiotlb after all.

This patch makes the printout in swiotlb_init() conditional on a new
'verbose' argument and exports swiotlb_print_info() so that it can be
called later, once we know swiotlb will really be used.

Cc: [email protected]
Cc: [email protected]
Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/ia64/kernel/pci-swiotlb.c | 4 ++--
arch/powerpc/kernel/setup_32.c | 2 +-
arch/powerpc/kernel/setup_64.c | 2 +-
arch/x86/kernel/pci-swiotlb.c | 3 +--
include/linux/swiotlb.h | 4 ++--
lib/swiotlb.c | 15 ++++++++-------
6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
index 285aae8..53292ab 100644
--- a/arch/ia64/kernel/pci-swiotlb.c
+++ b/arch/ia64/kernel/pci-swiotlb.c
@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = {
void __init swiotlb_dma_init(void)
{
dma_ops = &swiotlb_dma_ops;
- swiotlb_init();
+ swiotlb_init(1);
}

void __init pci_swiotlb_init(void)
@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void)
swiotlb = 1;
printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
machvec_init("dig");
- swiotlb_init();
+ swiotlb_init(1);
dma_ops = &swiotlb_dma_ops;
#else
panic("Unable to find Intel IOMMU");
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 53bcf3d..b152de3 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p)

#ifdef CONFIG_SWIOTLB
if (ppc_swiotlb_enable)
- swiotlb_init();
+ swiotlb_init(1);
#endif

paging_init();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 797ea95..1efbcaa 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -551,7 +551,7 @@ void __init setup_arch(char **cmdline_p)

#ifdef CONFIG_SWIOTLB
if (ppc_swiotlb_enable)
- swiotlb_init();
+ swiotlb_init(1);
#endif

paging_init();
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b78..ea20ef7 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -52,8 +52,7 @@ void __init pci_swiotlb_init(void)
if (swiotlb_force)
swiotlb = 1;
if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
- swiotlb_init();
+ swiotlb_init(0);
dma_ops = &swiotlb_dma_ops;
}
}
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 51b6cc2..6b70068 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -20,8 +20,7 @@ struct scatterlist;
*/
#define IO_TLB_SHIFT 11

-extern void
-swiotlb_init(void);
+extern void swiotlb_init(int verbose);

extern void
*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -89,4 +88,5 @@ extern int
swiotlb_dma_supported(struct device *hwdev, u64 mask);

extern void __init swiotlb_free(void);
+extern void swiotlb_print_info(void);
#endif /* __LINUX_SWIOTLB_H */
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index eee512b..0c12d7c 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -123,8 +123,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
return phys_to_dma(hwdev, virt_to_phys(address));
}

-static void swiotlb_print_info(unsigned long bytes)
+void swiotlb_print_info(void)
{
+ unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
phys_addr_t pstart, pend;

pstart = virt_to_phys(io_tlb_start);
@@ -142,7 +143,7 @@ static void swiotlb_print_info(unsigned long bytes)
* structures for the software IO TLB used to implement the DMA API.
*/
void __init
-swiotlb_init_with_default_size(size_t default_size)
+swiotlb_init_with_default_size(size_t default_size, int verbose)
{
unsigned long i, bytes;

@@ -178,14 +179,14 @@ swiotlb_init_with_default_size(size_t default_size)
io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
if (!io_tlb_overflow_buffer)
panic("Cannot allocate SWIOTLB overflow buffer!\n");
-
- swiotlb_print_info(bytes);
+ if (verbose)
+ swiotlb_print_info();
}

void __init
-swiotlb_init(void)
+swiotlb_init(int verbose)
{
- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
+ swiotlb_init_with_default_size(64 * (1<<20), verbose); /* default to 64MB */
}

/*
@@ -262,7 +263,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
if (!io_tlb_overflow_buffer)
goto cleanup4;

- swiotlb_print_info(bytes);
+ swiotlb_print_info();

late_alloc = 1;

--
1.5.6.5

2009-10-28 06:54:26

by FUJITA Tomonori

[permalink] [raw]
Subject: [PATCH 10/10] x86: handle HW IOMMU initialization failure gracefully

If HW IOMMU initialization fails (Intel VT-d often does, typically due
to BIOS bugs), we fall back to nommu. That doesn't work for most
systems, since nowadays machines have more than 4GB of memory and
therefore need swiotlb rather than nommu.

The problem is that by the time HW IOMMU initialization fails, it is
too late to initialize swiotlb; its memory must be allocated earlier,
from the bootmem allocator. Chris explained the issue in detail:

http://marc.info/?l=linux-kernel&m=125657444317079&w=2

The current x86 IOMMU initialization sequence is already too
complicated, and handling the above issue within it would only make it
hackier.

This patch changes the x86 IOMMU initialization sequence to handle the
above issue cleanly.

The new x86 IOMMU initialization sequence is:

1. initializing swiotlb (and setting swiotlb to 1) in the case of
(max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to
swiotlb_dma_ops or nommu_dma_ops.

2. calling the detection functions of all the IOMMUs

3. the detection function that finds its IOMMU sets
x86_init.iommu.iommu_init to that IOMMU's initialization function (so
we avoid needlessly calling the initialization functions of all the
IOMMUs).

4. if the IOMMU initialization function doesn't need swiotlb (e.g. the
initialization was successful), it sets swiotlb back to zero.

5. if we find that swiotlb has been set back to zero, we free the
swiotlb resources.

Signed-off-by: FUJITA Tomonori <[email protected]>
---
arch/x86/include/asm/iommu.h | 1 -
arch/x86/kernel/amd_iommu.c | 2 +-
arch/x86/kernel/amd_iommu_init.c | 2 +-
arch/x86/kernel/aperture_64.c | 2 +-
arch/x86/kernel/pci-calgary_64.c | 10 +---------
arch/x86/kernel/pci-dma.c | 17 +++++++++--------
arch/x86/kernel/pci-gart_64.c | 1 +
arch/x86/kernel/pci-nommu.c | 9 ---------
arch/x86/kernel/pci-swiotlb.c | 5 +++--
drivers/pci/dmar.c | 3 +--
drivers/pci/intel-iommu.c | 4 ++--
11 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 42aa977..9254199 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -2,7 +2,6 @@
#define _ASM_X86_IOMMU_H

extern void pci_iommu_shutdown(void);
-extern void no_iommu_init(void);
static inline int iommu_init_noop(void) { return 0; }
extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f..383bf83 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2108,8 +2108,8 @@ int __init amd_iommu_init_dma_ops(void)
prealloc_protection_domains();

iommu_detected = 1;
- force_iommu = 1;
bad_dma_address = 0;
+ swiotlb = 0;
#ifdef CONFIG_GART_IOMMU
gart_iommu_aperture_disabled = 1;
gart_iommu_aperture = 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 52bf59f..5eda21a 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1312,7 +1312,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)

void __init amd_iommu_detect(void)
{
- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
+ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
return;

if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 03933cf..e0dfb68 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -458,7 +458,7 @@ out:

if (aper_alloc) {
/* Got the aperture from the AGP bridge */
- } else if (swiotlb && !valid_agp) {
+ } else if (!valid_agp) {
/* Do nothing */
} else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
force_iommu ||
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 47bd419..833f491 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1360,7 +1360,7 @@ void __init detect_calgary(void)
* if the user specified iommu=off or iommu=soft or we found
* another HW IOMMU already, bail out.
*/
- if (swiotlb || no_iommu || iommu_detected)
+ if (no_iommu || iommu_detected)
return;

if (!use_calgary)
@@ -1445,10 +1445,6 @@ void __init detect_calgary(void)
printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
specified_table_size);

- /* swiotlb for devices that aren't behind the Calgary. */
- if (max_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
-
x86_init.iommu.iommu_init = calgary_iommu_init;
}
return;
@@ -1476,11 +1472,7 @@ int __init calgary_iommu_init(void)
return ret;
}

- force_iommu = 1;
bad_dma_address = 0x0;
- /* dma_ops is set to swiotlb or nommu */
- if (!dma_ops)
- dma_ops = &nommu_dma_ops;

return 0;
}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f70b766..28fd54a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -128,20 +128,16 @@ void __init pci_iommu_alloc(void)
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
#endif
+ pci_swiotlb_init();

- /*
- * The order of these functions is important for
- * fall-back/fail-over reasons
- */
gart_iommu_hole_init();

detect_calgary();

detect_intel_iommu();

+ /* needs to be called after gart_iommu_hole_init */
amd_iommu_detect();
-
- pci_swiotlb_init();
}

void *dma_generic_alloc_coherent(struct device *dev, size_t size,
@@ -291,10 +287,15 @@ static int __init pci_iommu_init(void)
#ifdef CONFIG_PCI
dma_debug_add_bus(&pci_bus_type);
#endif
-
x86_init.iommu.iommu_init();

- no_iommu_init();
+ if (swiotlb) {
+ printk(KERN_INFO "PCI-DMA: "
+ "Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ } else
+ swiotlb_free();
+
return 0;
}

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 3bcae8f..e36f5f4 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -831,6 +831,7 @@ int __init gart_iommu_init(void)

flush_gart();
dma_ops = &gart_dma_ops;
+ swiotlb = 0;

return 0;
}
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4..875e382 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
.sync_sg_for_device = nommu_sync_sg_for_device,
.is_phys = 1,
};
-
-void __init no_iommu_init(void)
-{
- if (dma_ops)
- return;
-
- force_iommu = 0; /* no HW IOMMU */
- dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index ea20ef7..607fbb6 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -46,7 +46,7 @@ void __init pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
#ifdef CONFIG_X86_64
- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
+ if (!no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
if (swiotlb_force)
@@ -54,5 +54,6 @@ void __init pci_swiotlb_init(void)
if (swiotlb) {
swiotlb_init(0);
dma_ops = &swiotlb_dma_ops;
- }
+ } else
+ dma_ops = &nommu_dma_ops;
}
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index bce9cd7..4373996 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -613,8 +613,7 @@ void __init detect_intel_iommu(void)
"x2apic and Intr-remapping.\n");
#endif
#ifdef CONFIG_DMAR
- if (ret && !no_iommu && !iommu_detected && !swiotlb &&
- !dmar_disabled)
+ if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
iommu_detected = 1;
#endif
#ifdef CONFIG_X86
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index b1e97e6..c7a8bcf 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3231,7 +3231,7 @@ int __init intel_iommu_init(void)
* Check the need for DMA-remapping initialization now.
* Above initialization will also be used by Interrupt-remapping.
*/
- if (no_iommu || swiotlb || dmar_disabled)
+ if (no_iommu || dmar_disabled)
return -ENODEV;

iommu_init_mempool();
@@ -3252,7 +3252,7 @@ int __init intel_iommu_init(void)
"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

init_timer(&unmap_timer);
- force_iommu = 1;
+ swiotlb = 0;
dma_ops = &intel_dma_ops;

init_iommu_sysfs();
--
1.5.6.5

2009-10-28 07:37:03

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 06/10] bootmem: refactor free_all_bootmem_core


* FUJITA Tomonori <[email protected]> wrote:

> From: Chris Wright <[email protected]>
>
> Move the loop that frees all bootmem pages back to page allocator into
> its own function. This should have not functional effect and allows the
> function to be reused later.
>
> Reviewed-by: FUJITA Tomonori <[email protected]>
> Signed-off-by: Chris Wright <[email protected]>

Small signoff-handling sidenote: in such cases (when you are handling
other people's patches) the expected workflow is to add your SOB line
after Chris's line, instead of the Reviewed-by line.

I.e. it should look like this:

Signed-off-by: Chris Wright <[email protected]>
Signed-off-by: FUJITA Tomonori <[email protected]>

Showing the path of the patch - ending with your signoff.

Thanks,

Ingo

2009-10-28 07:37:26

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 06/10] bootmem: refactor free_all_bootmem_core


* FUJITA Tomonori <[email protected]> wrote:

> From: Chris Wright <[email protected]>
>
> Move the loop that frees all bootmem pages back to page allocator into
> its own function. This should have not functional effect and allows the
> function to be reused later.
>
> Reviewed-by: FUJITA Tomonori <[email protected]>
> Signed-off-by: Chris Wright <[email protected]>
> ---
> mm/bootmem.c | 61 +++++++++++++++++++++++++++++++++++++++-------------------
> 1 files changed, 41 insertions(+), 20 deletions(-)
>
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index 555d5d2..94ef2e7 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -143,17 +143,22 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
> return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
> }
>
> -static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
> +/**
> + * free_bootmem_pages - frees bootmem pages to page allocator
> + * @start: start pfn
> + * @end: end pfn
> + * @map: bootmem bitmap of reserved pages
> + *
> + * This will free the pages in the range @start to @end, making them
> + * available to the page allocator. The @map will be used to skip
> + * reserved pages. Returns the count of pages freed.
> + */
> +static unsigned long __init free_bootmem_pages(unsigned long start,
> + unsigned long end,
> + unsigned long *map)
> {
> + unsigned long cursor, count = 0;
> int aligned;
> - struct page *page;
> - unsigned long start, end, pages, count = 0;
> -
> - if (!bdata->node_bootmem_map)
> - return 0;
> -
> - start = bdata->node_min_pfn;
> - end = bdata->node_low_pfn;
>
> /*
> * If the start is aligned to the machines wordsize, we might
> @@ -161,27 +166,25 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
> */
> aligned = !(start & (BITS_PER_LONG - 1));
>
> - bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
> - bdata - bootmem_node_data, start, end, aligned);
> + for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
> + unsigned long idx, vec;
>
> - while (start < end) {
> - unsigned long *map, idx, vec;
> -
> - map = bdata->node_bootmem_map;
> - idx = start - bdata->node_min_pfn;
> + idx = cursor - start;
> vec = ~map[idx / BITS_PER_LONG];
>
> - if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
> + if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) {
> int order = ilog2(BITS_PER_LONG);
>
> - __free_pages_bootmem(pfn_to_page(start), order);
> + __free_pages_bootmem(pfn_to_page(cursor), order);
> count += BITS_PER_LONG;
> } else {
> unsigned long off = 0;
>
> while (vec && off < BITS_PER_LONG) {
> if (vec & 1) {
> - page = pfn_to_page(start + off);
> + struct page *page;
> +
> + page = pfn_to_page(cursor + off);
> __free_pages_bootmem(page, 0);
> count++;
> }
> @@ -189,8 +192,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
> off++;
> }
> }
> - start += BITS_PER_LONG;
> }
> + return count;
> +}
> +
> +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
> +{
> + struct page *page;
> + unsigned long start, end, *map, pages, count = 0;
> +
> + if (!bdata->node_bootmem_map)
> + return 0;
> +
> + start = bdata->node_min_pfn;
> + end = bdata->node_low_pfn;
> + map = bdata->node_bootmem_map;
> +
> + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
> + start, end);
> +
> + count = free_bootmem_pages(start, end, map);
>
> page = virt_to_page(bdata->node_bootmem_map);
> pages = bdata->node_low_pfn - bdata->node_min_pfn;

Small nit: the initialization of count to 0 seems superfluous.

Ingo

2009-10-28 07:49:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late


* FUJITA Tomonori <[email protected]> wrote:

> From: Chris Wright <[email protected]>
>
> Add a new function for freeing bootmem after the bootmem allocator has
> been released and the unreserved pages given to the page allocator.
> This allows us to reserve bootmem and then release it if we later
> discover it was not needed.
>
> Reviewed-by: FUJITA Tomonori <[email protected]>
> Signed-off-by: Chris Wright <[email protected]>
> ---
> include/linux/bootmem.h | 1 +
> mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 39 insertions(+), 5 deletions(-)

Hm, we are now further complicating the bootmem model.

I think we could remove the bootmem allocator middle man altogether.

This can be done by initializing the page allocator sooner and by
extending (already existing) 'reserve memory early on' mechanisms in
architecture code. (the reserve_early*() APIs in x86 for example)

Right now we have 5 memory allocation models on x86, initialized
gradually:

- allocator (buddy) [generic]
- early allocator (bootmem) [generic]
- very early allocator (reserve_early*()) [x86]
- very very early allocator (early brk model) [x86]
- very very very early allocator (build time .data/.bss) [generic]

Seems excessive.

The reserve_early() method is list/range based and can handle vast
amounts of not very fragmented memory - perfect for basically all the
real bootmem purposes (which is to bootstrap the buddy).

reserve_early() allocated memory could be freed into the buddy later on
as well. The main reason why bootmem is 'destroyed' during free-to-buddy
is because it has excessive internal bitmaps we want to free. With a
list/range based reserve_early() mechanism there's no such problem -
they can linger indefinitely and there's near zero allocation management
overhead.

reserve_early() might need some small amount of extra work before it can
be used as a generic early allocator - like adding a node field to it
(so that the buddy can then pick those ranges up in a NUMA aware
fashion) - but nothing very complex.

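Today the x86 early_res entries are roughly of this shape (simplified
from arch/x86/kernel/e820.c; the nid field is the suggested addition,
it does not exist yet):

struct early_res {
        u64     start, end;     /* physical range */
        char    name[16];       /* owner, for debugging */
        char    overlap_ok;
        /* int  nid; */         /* would allow NUMA-aware handover to the buddy */
};

void __init reserve_early(u64 start, u64 end, char *name);
void __init free_early(u64 start, u64 end);
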
Thoughts?

Ingo

2009-10-28 08:00:21

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

Hello,

Ingo Molnar wrote:
> Hm, we are now further complicating the bootmem model.
>
> I think we could remove the bootmem allocator middle man altogether.
>
> This can be done by initializing the page allocator sooner and by
> extending (already existing) 'reserve memory early on' mechanisms in
> architecture code. (the reserve_early*() APIs in x86 for example)
...
> reserve_early() might need some small amount of extra work before it can
> be used as a generic early allocator - like adding a node field to it
> (so that the buddy can then pick those ranges up in a NUMA aware
> fashion) - but nothing very complex.

ISTR an attempt to initialize the kmalloc allocator much earlier
during boot such that it can completely replace the bootmem allocator,
which would nicely remove all the complications although it may
require the kmalloc allocator to go through more complex bootstrapping
steps. I didn't follow how that went. Did it prove to be
unworkable?

As for the percpu allocator, as long as it can grab memory in a
NUMA-aware way, changing shouldn't be a problem at all. The good thing
about the bootmem allocator is that it is generic and standardizes
memory bootstrapping across different architectures. As long as that
can be maintained, making things simpler would be great.

Thanks.

--
tejun

2009-10-28 11:38:39

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

Hi Tejun,

On Wed, Oct 28, 2009 at 10:00 AM, Tejun Heo <[email protected]> wrote:
> ISTR an attempt to initialize the kmalloc allocator much earlier
> during boot such that it can completely replace the bootmem allocator,
> which would nicely remove all the complications although it may
> require the kmalloc allocator to go through more complex boot
> strapping steps. I didn't follow how that went. Did it prove to be
> unworkable?

We're doing it before scheduler init now but I haven't put any effort
into moving it earlier than that yet. I don't see any fundamental
reason we can't do that but the practical problem is that we're going
to affect architecture specific boot code which is really hard to
test.

Pekka

2009-10-28 12:12:50

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

Hello,

Pekka Enberg wrote:
> On Wed, Oct 28, 2009 at 10:00 AM, Tejun Heo <[email protected]> wrote:
>> ISTR an attempt to initialize the kmalloc allocator much earlier
>> during boot such that it can completely replace the bootmem allocator,
>> which would nicely remove all the complications although it may
>> require the kmalloc allocator to go through more complex boot
>> strapping steps. I didn't follow how that went. Did it prove to be
>> unworkable?
>
> We're doing it before scheduler init now but I haven't put any effort
> into moving it earlier than that yet. I don't see any fundamental
> reason we can't do that but the practical problem is that we're going
> to affect architecture specific boot code which is really hard to
> test.

Thanks for the explanation. It would be really great if we can pull
that off someday. This should be doable architecture-by-architecture,
right? You can, for example, first convert x86 and then make the
bootmem allocator a thin wrapper around the slab allocator. After all archs
have been converted, the wrappers can be dropped.

--
tejun

2009-10-28 13:21:34

by Joerg Roedel

[permalink] [raw]
Subject: Re: [PATCH 0/10] x86: handle HW IOMMU initialization failure gracefully

On Wed, Oct 28, 2009 at 03:47:34PM +0900, FUJITA Tomonori wrote:
> If HW IOMMU initialization fails (Intel VT-d often does typically due
> to BIOS bugs), we fall back to nommu. It doesn't work for the majority
> since nowadays we have more than 4GB memory so we must use swiotlb
> instead of nommu.
>
> The problem is that it's too late to initialize swiotlb when HW IOMMU
> initialization fails. We need to allocate swiotlb memory earlier from
> bootmem allocator. Chris explained the issue in detail:
>
> http://marc.info/?l=linux-kernel&m=125657444317079&w=2
>
>
> The current x86 IOMMU initialization sequence is too complicated and
> handling the above issue makes it more hacky.
>
> This series changes x86 IOMMU initialization sequence to handle the
> above issue cleanly.
>
> The new x86 IOMMU initialization sequence are:
>
> 1. initializing swiotlb (and setting swiotlb to 1) in the case of
> (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to
> swiotlb_dma_ops or nommu_dma_ops.
>
> 2. calling the detection functions of all the IOMMUs
>
> 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU
> initialization function (so we can avoid calling the initialization
> functions of all the IOMMUs needlessly).
>
> 4. if the IOMMU initialization function doesn't need to swiotlb then
> sets swiotlb to zero (e.g. the initialization is sucessful).
>
> 5. if we find that swiotlb is set to zero, we free swiotlb resource.

I started to test this patchset. It breaks the iommu=soft selection
for forcing swiotlb usage. On my machine the AMD IOMMU driver always
gets selected and initialized.

Joerg

2009-10-28 13:39:43

by Muli Ben-Yehuda

[permalink] [raw]
Subject: Re: [PATCH 02/10] Calgary: convert detect_calgary to use iommu_init hook

On Wed, Oct 28, 2009 at 03:47:36PM +0900, FUJITA Tomonori wrote:
> This changes detect_calgary() to set init_calgary() to iommu_init hook
> if detect_calgary() finds the Calgary IOMMU.
>
> We can kill the code to check if we found the IOMMU in init_calgary()
> since detect_calgary() sets init_calgary() only when it found the
> IOMMU.

I like this patchset a lot. Out with the old cruft!

> Signed-off-by: FUJITA Tomonori <[email protected]>

Acked-by: Muli Ben-Yehuda <[email protected]>

Cheers,
Muli

2009-10-29 08:28:22

by FUJITA Tomonori

[permalink] [raw]
Subject: Re: [PATCH 0/10] x86: handle HW IOMMU initialization failure gracefully

On Wed, 28 Oct 2009 14:21:19 +0100
Joerg Roedel <[email protected]> wrote:

> > The new x86 IOMMU initialization sequence are:
> >
> > 1. initializing swiotlb (and setting swiotlb to 1) in the case of
> > (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to
> > swiotlb_dma_ops or nommu_dma_ops.
> >
> > 2. calling the detection functions of all the IOMMUs
> >
> > 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU
> > initialization function (so we can avoid calling the initialization
> > functions of all the IOMMUs needlessly).
> >
> > 4. if the IOMMU initialization function doesn't need to swiotlb then
> > sets swiotlb to zero (e.g. the initialization is sucessful).
> >
> > 5. if we find that swiotlb is set to zero, we free swiotlb resource.
>
> I started to test this patchset. It breaks the iommu=soft selection to
> force using swiotlb usage. On my machine always the AMD IOMMU driver
> gots selected and initialized.

Thanks for testing the patchset.

Sorry about the bug. I need to add 1.5 to the above sequence:

1.5 if swiotlb usage is forced by the boot option, we skip the rest.

Here's a patch against the patchset.

diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
index dcbaea7..f0acde6 100644
--- a/arch/ia64/include/asm/swiotlb.h
+++ b/arch/ia64/include/asm/swiotlb.h
@@ -4,8 +4,6 @@
#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>

-extern int swiotlb_force;
-
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
extern void pci_swiotlb_init(void);
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20..193aa75 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -5,8 +5,6 @@

/* SWIOTLB interface */

-extern int swiotlb_force;
-
#ifdef CONFIG_SWIOTLB
extern int swiotlb;
extern void pci_swiotlb_init(void);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 28fd54a..2583a0b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -124,11 +124,15 @@ static void __init dma32_free_bootmem(void)

void __init pci_iommu_alloc(void)
{
+ /* swiotlb is forced by the boot option */
+ int use_swiotlb = swiotlb;
#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
#endif
pci_swiotlb_init();
+ if (use_swiotlb)
+ return;

gart_iommu_hole_init();

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 607fbb6..17ce422 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -49,8 +49,6 @@ void __init pci_swiotlb_init(void)
if (!no_iommu && max_pfn > MAX_DMA32_PFN)
swiotlb = 1;
#endif
- if (swiotlb_force)
- swiotlb = 1;
if (swiotlb) {
swiotlb_init(0);
dma_ops = &swiotlb_dma_ops;
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 0c12d7c..2eae139 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -57,7 +57,7 @@ enum dma_sync_target {
SYNC_FOR_DEVICE = 1,
};

-int swiotlb_force;
+static int swiotlb_force;

/*
* Used to do a quick range check in unmap_single and
@@ -109,8 +109,10 @@ setup_io_tlb_npages(char *str)
}
if (*str == ',')
++str;
- if (!strcmp(str, "force"))
+ if (!strcmp(str, "force")) {
swiotlb_force = 1;
+ swiotlb = 1;
+ }
return 1;
}
__setup("swiotlb=", setup_io_tlb_npages);



2009-10-29 11:19:51

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Wed, Oct 28, 2009 at 2:12 PM, Tejun Heo <[email protected]> wrote:
>> We're doing it before scheduler init now but I haven't put any effort
>> into moving it earlier than that yet. I don't see any fundamental
>> reason we can't do that but the practical problem is that we're going
>> to affect architecture specific boot code which is really hard to
>> test.
>
> Thanks for the explanation. It would be really great if we can pull
> that off someday. This should be doable architecture-by-architecture,
> right? You can, for example, first convert x86 and then make bootmem
> allocator thin wrapper around the slab allocator. After all archs
> have been converted, the wrappers can be dropped.

I am not sure how we could do that.

The main challenge in initializing slab earlier is the various
implicit dependencies between different parts of the kernel boot code.
If we initialize slab earlier, we also need to initialize the page
allocator earlier, which requires page table setup, mem_init(), and so
on. Unfortunately architectures don't do boot-time setup in "standard"
places which means that for some architectures mem_init() might need
traps but for other architectures we can't just enable traps earlier
unless we do something else before that as well.

So I think we need to untangle the mess in init/main.c some more first
before we try to initialize slab earlier.

Pekka

2009-11-06 01:51:55

by FUJITA Tomonori

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Wed, 28 Oct 2009 08:48:32 +0100
Ingo Molnar <[email protected]> wrote:

>
> * FUJITA Tomonori <[email protected]> wrote:
>
> > From: Chris Wright <[email protected]>
> >
> > Add a new function for freeing bootmem after the bootmem allocator has
> > been released and the unreserved pages given to the page allocator.
> > This allows us to reserve bootmem and then release it if we later
> > discover it was not needed.
> >
> > Reviewed-by: FUJITA Tomonori <[email protected]>
> > Signed-off-by: Chris Wright <[email protected]>
> > ---
> > include/linux/bootmem.h | 1 +
> > mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++-----
> > 2 files changed, 39 insertions(+), 5 deletions(-)
>
> Hm, we are now further complicating the bootmem model.

Yeah, agreed. But reorganizing the allocator during boot is not
easy. I'll investigate it later on but VT-d people want to fix this
IOMMU issue now. Can we accept this for now?

2009-11-08 09:58:14

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late


* Pekka Enberg <[email protected]> wrote:

> On Wed, Oct 28, 2009 at 2:12 PM, Tejun Heo <[email protected]> wrote:
> >> We're doing it before scheduler init now but I haven't put any effort
> >> into moving it earlier than that yet. I don't see any fundamental
> >> reason we can't do that but the practical problem is that we're going
> >> to affect architecture specific boot code which is really hard to
> >> test.
> >
> > Thanks for the explanation. It would be really great if we can pull
> > that off someday. This should be doable architecture-by-architecture,
> > right? You can, for example, first convert x86 and then make bootmem
> > allocator thin wrapper around the slab allocator. After all archs
> > have been converted, the wrappers can be dropped.
>
> I am not sure how we could do that.
>
> The main challenge in initializing slab earlier is the various
> implicit dependencies between different parts of the kernel boot code.
> If we initialize slab earlier, we also need to initialize page
> allocator earlier which requires page table setup, mem_init(), and so
> on. Unfortunately architectures don't do boot-time setup in "standard"
> places which means that for some architectures mem_init() might need
> traps but for other architectures we can't just enable traps earlier
> unless we do something else before that as well.
>
> So I think we need to untangle the mess in init/main.c some more first
> before we try to initialize slab earlier.

Page tables are the main dependency. x86 boots with a limited set of page
tables, the real ones are set up later.

We'd need to see what bootmem allocations are done before page table
init in practice. I think i did such tests a few years ago and i think
it's rather limited (if it happens at all).

If that's mapped out we can just convert x86 to an 'emulated' bootmem
allocator: buddy and slab is set up right when pagetables are set up,
and bootmem can just use kmalloc.

If that works and is without nasty surprises then we can repeat the same
for other architectures as well. (x86-only code could switch away from
bootmem at that point as well)

( It's not without mm/page_alloc.c challenges: we'd need new helpers to
be able to bootstrap the buddy straight out of arch code, without any
bootmem data structures. )

The elimination of bootmem would be an awesome simplification of our
memory bootstrap design, and universal kmalloc would be very nice for
platform code as well.
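
Just to make that concrete, such an 'emulated' bootmem could be little
more than the following once buddy and slab are up (hand-waving sketch,
untested; it ignores alignment, goal/limit and the per-node variants,
and assumes <linux/slab.h>):

void * __init alloc_bootmem_emulated(unsigned long size)
{
        /* bootmem hands out zeroed memory, so kzalloc() matches that */
        void *p = kzalloc(size, GFP_KERNEL);

        if (!p)
                panic("boot-time allocation of %lu bytes failed\n", size);
        return p;
}

void __init free_bootmem_emulated(void *addr, unsigned long size)
{
        kfree(addr);    /* the size is implicit for slab objects */
}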

Ingo

2009-11-08 10:01:12

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late


* FUJITA Tomonori <[email protected]> wrote:

> On Wed, 28 Oct 2009 08:48:32 +0100
> Ingo Molnar <[email protected]> wrote:
>
> >
> > * FUJITA Tomonori <[email protected]> wrote:
> >
> > > From: Chris Wright <[email protected]>
> > >
> > > Add a new function for freeing bootmem after the bootmem allocator has
> > > been released and the unreserved pages given to the page allocator.
> > > This allows us to reserve bootmem and then release it if we later
> > > discover it was not needed.
> > >
> > > Reviewed-by: FUJITA Tomonori <[email protected]>
> > > Signed-off-by: Chris Wright <[email protected]>
> > > ---
> > > include/linux/bootmem.h | 1 +
> > > mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++-----
> > > 2 files changed, 39 insertions(+), 5 deletions(-)
> >
> > Hm, we are now further complicating the bootmem model.
>
> Yeah, agreed. But reorganizing the allocator during boot is not easy.
> I'll investigate it later on but VT-d people want to fix this IOMMU
> issue now. Can we accept this for now?

Since the patchset weighs heavily towards the iommu and x86 side i can
do that in tip:core/iommu tree, if there's an Acked-by for this
bootmem.c patch from at least one of these gents:

Johannes Weiner <[email protected]>
Pekka Enberg <[email protected]>
Andrew Morton <[email protected]>
Tejun Heo <[email protected]>

Thanks,

Ingo

2009-11-09 19:25:11

by FUJITA Tomonori

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Sun, 8 Nov 2009 11:00:29 +0100
Ingo Molnar <[email protected]> wrote:

>
> * FUJITA Tomonori <[email protected]> wrote:
>
> > On Wed, 28 Oct 2009 08:48:32 +0100
> > Ingo Molnar <[email protected]> wrote:
> >
> > >
> > > * FUJITA Tomonori <[email protected]> wrote:
> > >
> > > > From: Chris Wright <[email protected]>
> > > >
> > > > Add a new function for freeing bootmem after the bootmem allocator has
> > > > been released and the unreserved pages given to the page allocator.
> > > > This allows us to reserve bootmem and then release it if we later
> > > > discover it was not needed.
> > > >
> > > > Reviewed-by: FUJITA Tomonori <[email protected]>
> > > > Signed-off-by: Chris Wright <[email protected]>
> > > > ---
> > > > include/linux/bootmem.h | 1 +
> > > > mm/bootmem.c | 43 ++++++++++++++++++++++++++++++++++++++-----
> > > > 2 files changed, 39 insertions(+), 5 deletions(-)
> > >
> > > Hm, we are now further complicating the bootmem model.
> >
> > Yeah, agreed. But reorganizing the allocator during boot is not easy.
> > I'll investigate it later on but VT-d people want to fix this IOMMU
> > issue now. Can we accept this for now?
>
> Since the patchset weighs heavily towards the iommu and x86 side i can
> do that in tip:core/iommu tree, if there's an Acked-by for this
> bootmem.c patch from at least one of these gents:

ok

> Johannes Weiner <[email protected]>
> Pekka Enberg <[email protected]>
> Andrew Morton <[email protected]>
> Tejun Heo <[email protected]>

Can you give ACKs?


Thanks,

2009-11-09 20:02:24

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 01/10] Add iommu_init to x86_init_ops

On Wed, Oct 28, 2009 at 8:47 AM, FUJITA Tomonori
<[email protected]> wrote:
> We call the detections functions of all the IOMMUs then all their
> initialization functions. THe latter is pointless since we don't
> detect multiple different IOMMUs. What we need to do is calling the
> initialization function of the detected IOMMU.
>
> This adds iommu_init hook to x86_init_ops so if an IOMMU detection
> function can set its initialization function to the hook.
>
> Signed-off-by: FUJITA Tomonori <[email protected]>
> ---
>  arch/x86/include/asm/iommu.h    |    1 +
>  arch/x86/include/asm/x86_init.h |    9 +++++++++
>  arch/x86/kernel/pci-dma.c       |    2 ++
>  arch/x86/kernel/x86_init.c      |    5 +++++
>  4 files changed, 17 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
> index fd6d21b..42aa977 100644
> --- a/arch/x86/include/asm/iommu.h
> +++ b/arch/x86/include/asm/iommu.h
> @@ -3,6 +3,7 @@
>
>  extern void pci_iommu_shutdown(void);
>  extern void no_iommu_init(void);
> +static inline int iommu_init_noop(void) { return 0; }

Why is this function not put in x86_init.c?

2009-11-09 20:11:56

by FUJITA Tomonori

[permalink] [raw]
Subject: Re: [PATCH 01/10] Add iommu_init to x86_init_ops

On Mon, 9 Nov 2009 22:02:24 +0200
Pekka Enberg <[email protected]> wrote:

> On Wed, Oct 28, 2009 at 8:47 AM, FUJITA Tomonori
> <[email protected]> wrote:
> > We call the detections functions of all the IOMMUs then all their
> > initialization functions. THe latter is pointless since we don't
> > detect multiple different IOMMUs. What we need to do is calling the
> > initialization function of the detected IOMMU.
> >
> > This adds iommu_init hook to x86_init_ops so if an IOMMU detection
> > function can set its initialization function to the hook.
> >
> > Signed-off-by: FUJITA Tomonori <[email protected]>
> > ---
> >  arch/x86/include/asm/iommu.h    |    1 +
> >  arch/x86/include/asm/x86_init.h |    9 +++++++++
> >  arch/x86/kernel/pci-dma.c       |    2 ++
> >  arch/x86/kernel/x86_init.c      |    5 +++++
> >  4 files changed, 17 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
> > index fd6d21b..42aa977 100644
> > --- a/arch/x86/include/asm/iommu.h
> > +++ b/arch/x86/include/asm/iommu.h
> > @@ -3,6 +3,7 @@
> >
> >  extern void pci_iommu_shutdown(void);
> >  extern void no_iommu_init(void);
> > +static inline int iommu_init_noop(void) { return 0; }
>
> Why is this function not put in x86_init.c?

If it's fine by the x86 maintainers, I'll do that in the next version.

2009-11-09 20:13:38

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Wed, Oct 28, 2009 at 8:47 AM, FUJITA Tomonori
<[email protected]> wrote:
> @@ -151,7 +151,9 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
>  *
>  * This will free the pages in the range @start to @end, making them
>  * available to the page allocator.  The @map will be used to skip
> - * reserved pages.  Returns the count of pages freed.
> + * reserved pages.  In the case that @map is NULL, the bootmem allocator
> + * is already free and the range is contiguous.  Returns the count of
> + * pages freed.
>  */
>  static unsigned long __init free_bootmem_pages(unsigned long start,
>                                                 unsigned long end,
> @@ -164,13 +166,23 @@ static unsigned long __init free_bootmem_pages(unsigned long start,
>          * If the start is aligned to the machines wordsize, we might
>          * be able to free pages in bulks of that order.
>          */
> -       aligned = !(start & (BITS_PER_LONG - 1));
> +       if (map)
> +               aligned = !(start & (BITS_PER_LONG - 1));
> +       else
> +               aligned = 1;

Why do we need this special casing here? Isn't start always aligned
properly for callers with map == NULL?

>
>        for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
> -               unsigned long idx, vec;
> +               unsigned long vec;
>
> -               idx = cursor - start;
> -               vec = ~map[idx / BITS_PER_LONG];
> +               if (map) {
> +                       unsigned long idx = cursor - start;
> +                       vec = ~map[idx / BITS_PER_LONG];
> +               } else {
> +                       if (end - cursor >= BITS_PER_LONG)
> +                               vec = ~0UL;

Why do we need the above?

> +                       else
> +                               vec = (1UL << (end - cursor)) - 1;
> +               }
>
>                if (aligned && vec == ~0UL && cursor + BITS_PER_LONG < end) {
>                        int order = ilog2(BITS_PER_LONG);
> @@ -387,6 +399,27 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
>  }
>
>  /**
> + * free_bootmem_late - free bootmem pages directly to page allocator
> + * @addr: starting address of the range
> + * @size: size of the range in bytes
> + *
> + * This is only useful when the bootmem allocator has already been torn
> + * down, but we are still initializing the system.  Pages are given directly
> + * to the page allocator, no bootmem metadata is updated because it is gone.
> + */
> +void __init free_bootmem_late(unsigned long addr, unsigned long size)
> +{
> +       unsigned long start, end;
> +
> +       kmemleak_free_part(__va(addr), size);
> +
> +       start = PFN_UP(addr);
> +       end = PFN_DOWN(addr + size);
> +
> +       totalram_pages += free_bootmem_pages(start, end, NULL);
> +}
> +
> +/**
>  * reserve_bootmem_node - mark a page range as reserved
>  * @pgdat: node the range resides on
>  * @physaddr: starting address of the range
> --
> 1.5.6.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at ?http://www.tux.org/lkml/
>

2009-11-09 20:21:45

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Mon, Nov 9, 2009 at 10:13 PM, Pekka Enberg <[email protected]> wrote:
>>        for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
>> -               unsigned long idx, vec;
>> +               unsigned long vec;
>>
>> -               idx = cursor - start;
>> -               vec = ~map[idx / BITS_PER_LONG];
>> +               if (map) {
>> +                       unsigned long idx = cursor - start;
>> +                       vec = ~map[idx / BITS_PER_LONG];
>> +               } else {
>> +                       if (end - cursor >= BITS_PER_LONG)
>> +                               vec = ~0UL;
>
> Why do we need the above?

OK, I figured that out. I'm not sure why you want to play tricks with
"vec" when you could just add a new helper that calls
__free_pages_bootmem() for the full contiguous page range.

Pekka

2009-11-09 21:48:11

by FUJITA Tomonori

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Mon, 9 Nov 2009 22:21:48 +0200
Pekka Enberg <[email protected]> wrote:

> On Mon, Nov 9, 2009 at 10:13 PM, Pekka Enberg <[email protected]> wrote:
> >>        for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
> >> -               unsigned long idx, vec;
> >> +               unsigned long vec;
> >>
> >> -               idx = cursor - start;
> >> -               vec = ~map[idx / BITS_PER_LONG];
> >> +               if (map) {
> >> +                       unsigned long idx = cursor - start;
> >> +                       vec = ~map[idx / BITS_PER_LONG];
> >> +               } else {
> >> +                       if (end - cursor >= BITS_PER_LONG)
> >> +                               vec = ~0UL;
> >
> > Why do we need the above?
>
> OK, I figured that out. I'm not sure why you want to play tricks with
> "vec" when you could just add a new helper that calls
> __free_pages_bootmem() for the full contiguous page range.

I'm not sure why Chris did that, but yeah, I think that it's cleaner
to leave free_all_bootmem_core alone.

There is just one user of free_bootmem_late and it only frees some
memory, so how about the following simple patch instead of 7/10?

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index dd97fb8..b10ec49 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
                               unsigned long addr,
                               unsigned long size);
 extern void free_bootmem(unsigned long addr, unsigned long size);
+extern void free_bootmem_late(unsigned long addr, unsigned long size);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 94ef2e7..b0b393b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -196,6 +196,30 @@ static unsigned long __init free_bootmem_pages(unsigned long start,
         return count;
 }
 
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+        unsigned long cursor, end;
+
+        kmemleak_free_part(__va(addr), size);
+
+        cursor = PFN_UP(addr);
+        end = PFN_DOWN(addr + size);
+
+        for (; cursor < end; cursor++) {
+                __free_pages_bootmem(pfn_to_page(cursor), 0);
+                totalram_pages++;
+        }
+}
+
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
         struct page *page;
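
The caller on the swiotlb side would then be something along these
lines (sketch only, untested; the variable names are the existing ones
in lib/swiotlb.c, and the real cleanup is what the swiotlb patch in
this series implements):

void __init swiotlb_free(void)
{
        if (!io_tlb_start)
                return;

        /* hand the bootmem-allocated bounce buffer back to the page allocator */
        free_bootmem_late(__pa(io_tlb_start),
                          io_tlb_nslabs << IO_TLB_SHIFT);
        io_tlb_nslabs = 0;
}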

2009-11-10 04:45:15

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 01/10] Add iommu_init to x86_init_ops


* FUJITA Tomonori <[email protected]> wrote:

> On Mon, 9 Nov 2009 22:02:24 +0200
> Pekka Enberg <[email protected]> wrote:
>
> > On Wed, Oct 28, 2009 at 8:47 AM, FUJITA Tomonori
> > <[email protected]> wrote:
> > > We call the detections functions of all the IOMMUs then all their
> > > initialization functions. THe latter is pointless since we don't
> > > detect multiple different IOMMUs. What we need to do is calling the
> > > initialization function of the detected IOMMU.
> > >
> > > This adds iommu_init hook to x86_init_ops so if an IOMMU detection
> > > function can set its initialization function to the hook.
> > >
> > > Signed-off-by: FUJITA Tomonori <[email protected]>
> > > ---
> > >  arch/x86/include/asm/iommu.h    |    1 +
> > >  arch/x86/include/asm/x86_init.h |    9 +++++++++
> > >  arch/x86/kernel/pci-dma.c       |    2 ++
> > >  arch/x86/kernel/x86_init.c      |    5 +++++
> > >  4 files changed, 17 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
> > > index fd6d21b..42aa977 100644
> > > --- a/arch/x86/include/asm/iommu.h
> > > +++ b/arch/x86/include/asm/iommu.h
> > > @@ -3,6 +3,7 @@
> > >
> > >  extern void pci_iommu_shutdown(void);
> > >  extern void no_iommu_init(void);
> > > +static inline int iommu_init_noop(void) { return 0; }
> >
> > Why is this function not put in x86_init.c?
>
> If it's fine by the x86 maintainers, I'll do that in the next version.

Sure, that looks good to me.

Ingo

2009-11-10 08:06:00

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

FUJITA Tomonori wrote:
> On Mon, 9 Nov 2009 22:21:48 +0200
> Pekka Enberg <[email protected]> wrote:
>
>> On Mon, Nov 9, 2009 at 10:13 PM, Pekka Enberg <[email protected]> wrote:
>>>> for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
>>>> - unsigned long idx, vec;
>>>> + unsigned long vec;
>>>>
>>>> - idx = cursor - start;
>>>> - vec = ~map[idx / BITS_PER_LONG];
>>>> + if (map) {
>>>> + unsigned long idx = cursor - start;
>>>> + vec = ~map[idx / BITS_PER_LONG];
>>>> + } else {
>>>> + if (end - cursor >= BITS_PER_LONG)
>>>> + vec = ~0UL;
>>> Why do we need the above?
>> OK, I figured that out. I'm not sure why you want to play tricks with
>> "vec" when you could just add a new helper that calls
>> __free_pages_bootmem() for the full contiguous page range.
>
> I'm not sure why Chris did that, but yeah, I think that it's cleaner
> to leave free_all_bootmem_core alone.
>
> There is just one user of free_bootmem_late and it only frees some
> memory, so how about the following simple patch instead of 7/10?

Looks good to me!

Acked-by: Pekka Enberg <[email protected]>

> diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
> index dd97fb8..b10ec49 100644
> --- a/include/linux/bootmem.h
> +++ b/include/linux/bootmem.h
> @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
> unsigned long addr,
> unsigned long size);
> extern void free_bootmem(unsigned long addr, unsigned long size);
> +extern void free_bootmem_late(unsigned long addr, unsigned long size);
>
> /*
> * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index 94ef2e7..b0b393b 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -196,6 +196,30 @@ static unsigned long __init free_bootmem_pages(unsigned long start,
> return count;
> }
>
> +/*
> + * free_bootmem_late - free bootmem pages directly to page allocator
> + * @addr: starting address of the range
> + * @size: size of the range in bytes
> + *
> + * This is only useful when the bootmem allocator has already been torn
> + * down, but we are still initializing the system. Pages are given directly
> + * to the page allocator, no bootmem metadata is updated because it is gone.
> + */
> +void __init free_bootmem_late(unsigned long addr, unsigned long size)
> +{
> + unsigned long cursor, end;
> +
> + kmemleak_free_part(__va(addr), size);
> +
> + cursor = PFN_UP(addr);
> + end = PFN_DOWN(addr + size);
> +
> + for (; cursor < end; cursor++) {
> + __free_pages_bootmem(pfn_to_page(cursor), 0);
> + totalram_pages++;
> + }
> +}
> +
> static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
> {
> struct page *page;
>
>

2009-11-13 21:12:31

by Chris Wright

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

* Pekka Enberg ([email protected]) wrote:
> On Mon, Nov 9, 2009 at 10:13 PM, Pekka Enberg <[email protected]> wrote:
> >>        for (cursor = start; cursor < end; cursor += BITS_PER_LONG) {
> >> -               unsigned long idx, vec;
> >> +               unsigned long vec;
> >>
> >> -               idx = cursor - start;
> >> -               vec = ~map[idx / BITS_PER_LONG];
> >> +               if (map) {
> >> +                       unsigned long idx = cursor - start;
> >> +                       vec = ~map[idx / BITS_PER_LONG];
> >> +               } else {
> >> +                       if (end - cursor >= BITS_PER_LONG)
> >> +                               vec = ~0UL;
> >
> > Why do we need the above?
>
> OK, I figured that out. I'm not sure why you want to play tricks with
> "vec" when you could just add a new helper that calls
> __free_pages_bootmem() for the full contiguous page range.

I did it this way since it's simple enough and allows for high-order
frees (these will nearly all be high-order) and keeps the same core
code exercised in each path. You can't do a higher order free
w/ __free_pages_bootmem() w/out conforming to its requirements.
I don't care either way.
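
To make that concrete, a high-order-capable variant would have to look
roughly like this (sketch, untested); the alignment and size checks are
exactly the requirements mentioned above:

void __init free_bootmem_late_highorder(unsigned long addr, unsigned long size)
{
        unsigned long cursor = PFN_UP(addr);
        unsigned long end = PFN_DOWN(addr + size);
        int order = ilog2(BITS_PER_LONG);

        while (cursor < end) {
                if (IS_ALIGNED(cursor, BITS_PER_LONG) &&
                    cursor + BITS_PER_LONG <= end) {
                        /* a whole aligned chunk: one high-order free */
                        __free_pages_bootmem(pfn_to_page(cursor), order);
                        totalram_pages += BITS_PER_LONG;
                        cursor += BITS_PER_LONG;
                } else {
                        /* unaligned edges fall back to order-0 frees */
                        __free_pages_bootmem(pfn_to_page(cursor), 0);
                        totalram_pages++;
                        cursor++;
                }
        }
}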

thanks,
-chris

2009-11-16 10:27:32

by Joerg Roedel

[permalink] [raw]
Subject: Re: [PATCH 07/10] bootmem: add free_bootmem_late

On Sun, Nov 08, 2009 at 10:57:19AM +0100, Ingo Molnar wrote:

> Page tables are the main dependency. x86 boots with a limited set of page
> tables, the real ones are set up later.
>
> We'd need to see what bootmem allocations are done before page table
> init in practice. I think i did such tests a few years ago and i think
> it's rather limited (if it happens at all).
>
> If that's mapped out we can just convert x86 to an 'emulated' bootmem
> allocator: buddy and slab is set up right when pagetables are set up,
> and bootmem can just use kmalloc.

That sounds like a good idea. But keep in mind that support for 1GB
pages currently depends on the bootmem allocator because the buddy
system cannot allocate 1GB of physically contiguous memory.
But I think this could also be handled from x86 arch code without the
bootmem allocator.

Joerg