2013-08-05 20:42:34

by Chris Metcalf

Subject: [PATCH 00/20] PCI root complex changes for tile architecture

These changes add new features and fix bugs in the Tilera PCI
root complex subsystem. Among the new tilegx features are support
for I/O space access, more MSI-X interrupt vectors, and support for
TRIO 0 MAC 0 on Gx72 systems.

Chris Metcalf (20):
tile PCI RC: cleanups for tilepro PCI RC
tile PCI RC: tilepro conflict with PCI and RAM addresses
tile PCI RC: support pci=off boot arg for tilepro
tile PCI RC: tweak the pcie_rc_delay support
tile PCI RC: handle case that PCI link is already up
tile: support LSI MEGARAID SAS HBA hybrid dma_ops
tile PCI RC: support more MSI-X interrupt vectors
tile PCI RC: gentler warning for missing plug-in PCI
tile PCI RC: support I/O space access
tile PCI DMA: handle a NULL dev argument properly
tile PCI RC: restructure TRIO initialization
tile PCI RC: eliminate pci_controller.mem_resources field
tile PCI RC: include pci/pcie/Kconfig
tile PCI RC: clean up comments and whitespace format
tile PCI RC: use proper accessor function
tile PCI RC: add dma_get_required_mask()
tile PCI DMA: fix bug in non-page-aligned accessors
tile PCI RC: support PCIe TRIO 0 MAC 0 on Gx72 system
tile PCI RC: reduce driver's vmalloc space usage
tile PCI RC: remove stale include of linux/numa.h

arch/tile/Kconfig | 12 +
arch/tile/gxio/iorpc_trio.c | 23 ++
arch/tile/include/arch/trio.h | 39 ++
arch/tile/include/asm/dma-mapping.h | 8 +-
arch/tile/include/asm/io.h | 126 ++++++-
arch/tile/include/asm/pci.h | 17 +-
arch/tile/include/gxio/iorpc_trio.h | 5 +
arch/tile/include/hv/drv_trio_intf.h | 8 +-
arch/tile/kernel/pci-dma.c | 44 ++-
arch/tile/kernel/pci.c | 33 +-
arch/tile/kernel/pci_gx.c | 710 ++++++++++++++++++++---------------
arch/tile/kernel/setup.c | 9 +-
12 files changed, 678 insertions(+), 356 deletions(-)

--
1.8.3.1


2013-08-05 20:42:36

by Chris Metcalf

Subject: [PATCH 01/20] tile PCI RC: cleanups for tilepro PCI RC

- remove unneeded <linux/bootmem.h> include in pci.c
- eliminate unused pci_controller.first_busno field
- prefer msleep() to mdelay(): this code runs in process context, so
  there is no need to busy-wait
- remove stale comment about pci_scan_bus_parented()

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/pci.h | 1 -
arch/tile/kernel/pci.c | 16 +++-------------
2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 54a9242..cd10e65 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -29,7 +29,6 @@ struct pci_controller {
int index; /* PCI domain number */
struct pci_bus *root_bus;

- int first_busno;
int last_busno;

int hv_cfg_fd[2]; /* config{0,1} fds for this PCIe controller */
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index 67237d3..1dae3b2 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -20,7 +20,6 @@
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/errno.h>
-#include <linux/bootmem.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/uaccess.h>
@@ -192,7 +191,6 @@ int __init tile_pci_init(void)
controller->hv_cfg_fd[0] = hv_cfg_fd0;
controller->hv_cfg_fd[1] = hv_cfg_fd1;
controller->hv_mem_fd = hv_mem_fd;
- controller->first_busno = 0;
controller->last_busno = 0xff;
controller->ops = &tile_cfg_ops;

@@ -283,7 +281,7 @@ int __init pcibios_init(void)
* known to require at least 20ms here, but we use a more
* conservative value.
*/
- mdelay(250);
+ msleep(250);

/* Scan all of the recorded PCI controllers. */
for (i = 0; i < TILE_NUM_PCIE; i++) {
@@ -304,18 +302,10 @@ int __init pcibios_init(void)

pr_info("PCI: initializing controller #%d\n", i);

- /*
- * This comes from the generic Linux PCI driver.
- *
- * It reads the PCI tree for this bus into the Linux
- * data structures.
- *
- * This is inlined in linux/pci.h and calls into
- * pci_scan_bus_parented() in probe.c.
- */
pci_add_resource(&resources, &ioport_resource);
pci_add_resource(&resources, &iomem_resource);
- bus = pci_scan_root_bus(NULL, 0, controller->ops, controller, &resources);
+ bus = pci_scan_root_bus(NULL, 0, controller->ops,
+ controller, &resources);
controller->root_bus = bus;
controller->last_busno = bus->busn_res.end;
}
--
1.8.3.1

2013-08-05 20:42:45

by Chris Metcalf

Subject: [PATCH 04/20] tile PCI RC: tweak the pcie_rc_delay support

Allow longer delays if requested (the MAX_RC_DELAY cap is removed),
and print the info message when the delay is actually performed,
not when the boot argument is parsed.
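
For example (values illustrative), booting with

    pcie_rc_delay=0,1,15

delays the bus probe on MAC 1 of TRIO 0 by 15 seconds, while

    pcie_rc_delay=0,1

falls back to DEFAULT_RC_DELAY (10 seconds).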

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci_gx.c | 35 ++++++++++++++++-------------------
1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 1142563..e99809e 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -69,17 +69,14 @@ static int pcie_rc[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
* a HW PCIe link-training bug. The exact delay is specified with
* a kernel boot argument in the form of "pcie_rc_delay=T,P,S",
* where T is the TRIO instance number, P is the port number and S is
- * the delay in seconds. If the delay is not provided, the value
- * will be DEFAULT_RC_DELAY.
+ * the delay in seconds. If the argument is specified, but the delay is
+ * not provided, the value will be DEFAULT_RC_DELAY.
*/
static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];

/* Default number of seconds that the PCIe RC port probe can be delayed. */
#define DEFAULT_RC_DELAY 10

-/* Max number of seconds that the PCIe RC port probe can be delayed. */
-#define MAX_RC_DELAY 20
-
/* Array of the PCIe ports configuration info obtained from the BIB. */
struct pcie_port_property pcie_ports[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];

@@ -570,14 +567,9 @@ static int setup_pcie_rc_delay(char *str)
if (!isdigit(*str))
return -EINVAL;
delay = simple_strtoul(str, (char **)&str, 10);
- if (delay > MAX_RC_DELAY)
- return -EINVAL;
}

rc_delay[trio_index][mac] = delay ? : DEFAULT_RC_DELAY;
- pr_info("Delaying PCIe RC link training for %u sec"
- " on MAC %lu on TRIO %lu\n", rc_delay[trio_index][mac],
- mac, trio_index);
return 0;
}
early_param("pcie_rc_delay", setup_pcie_rc_delay);
@@ -682,12 +674,6 @@ int __init pcibios_init(void)
continue;
}

- /*
- * Delay the RC link training if needed.
- */
- if (rc_delay[trio_index][mac])
- msleep(rc_delay[trio_index][mac] * 1000);
-
ret = gxio_trio_force_rc_link_up(trio_context, mac);
if (ret < 0)
pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
@@ -697,10 +683,21 @@ int __init pcibios_init(void)
trio_index, controller->mac);

/*
- * Wait a bit here because some EP devices take longer
- * to come up.
+ * Delay the bus probe if needed.
*/
- msleep(1000);
+ if (rc_delay[trio_index][mac]) {
+ pr_info("Delaying PCIe RC bus enumerating %d sec"
+ " on MAC %d on TRIO %d\n",
+ rc_delay[trio_index][mac], mac,
+ trio_index);
+ msleep(rc_delay[trio_index][mac] * 1000);
+ } else {
+ /*
+ * Wait a bit here because some EP devices
+ * take longer to come up.
+ */
+ msleep(1000);
+ }

/*
* Check for PCIe link-up status.
--
1.8.3.1

2013-08-05 20:42:56

by Chris Metcalf

Subject: [PATCH 08/20] tile PCI RC: gentler warning for missing plug-in PCI

For a plug-in slot, print the link-down status with pr_info() instead
of pr_err(), and add an extra note that a down link is expected if the
slot is empty.

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/hv/drv_trio_intf.h | 5 +++--
arch/tile/kernel/pci_gx.c | 10 ++++++++--
2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/tile/include/hv/drv_trio_intf.h b/arch/tile/include/hv/drv_trio_intf.h
index ef9f3f5..ec643a0 100644
--- a/arch/tile/include/hv/drv_trio_intf.h
+++ b/arch/tile/include/hv/drv_trio_intf.h
@@ -64,8 +64,9 @@ struct pcie_port_property
* will not consider it an error if the link comes up as a x8 link. */
uint8_t allow_x8: 1;

- /** Reserved. */
- uint8_t reserved: 1;
+ /** If true, this link is connected to a device which may or may not
+ * be present. */
+ uint8_t removable: 1;

};

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index e0d6664..bf8c69d 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -729,8 +729,14 @@ int __init pcibios_init(void)
__gxio_mmio_read(trio_context->mmio_base_mac +
reg_offset);
if (!port_status.dl_up) {
- pr_err("PCI: link is down, MAC %d on TRIO %d\n",
- mac, trio_index);
+ if (pcie_ports[trio_index][mac].removable) {
+ pr_info("PCI: link is down, MAC %d on TRIO %d\n",
+ mac, trio_index);
+ pr_info("This is expected if no PCIe card"
+ " is connected to this link\n");
+ } else
+ pr_err("PCI: link is down, MAC %d on TRIO %d\n",
+ mac, trio_index);
continue;
}

--
1.8.3.1

2013-08-05 20:42:39

by Chris Metcalf

Subject: [PATCH 02/20] tile PCI RC: tilepro conflict with PCI and RAM addresses

Fix a bug in the tilepro PCI resource allocation code that could make
the bootmem allocator unhappy if 4GB is installed on mshim 0: the code
reserved the entire PCI hole on any node whose range merely overlapped
it. Clamp the reservation to the part of the hole that actually falls
within the node's own range.

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/setup.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 68b5426..676e155 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -614,11 +614,12 @@ static void __init setup_bootmem_allocator_node(int i)
/*
* Throw away any memory aliased by the PCI region.
*/
- if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start)
- reserve_bootmem(PFN_PHYS(pci_reserve_start_pfn),
- PFN_PHYS(pci_reserve_end_pfn -
- pci_reserve_start_pfn),
+ if (pci_reserve_start_pfn < end && pci_reserve_end_pfn > start) {
+ start = max(pci_reserve_start_pfn, start);
+ end = min(pci_reserve_end_pfn, end);
+ reserve_bootmem(PFN_PHYS(start), PFN_PHYS(end - start),
BOOTMEM_EXCLUSIVE);
+ }
#endif
}

--
1.8.3.1

2013-08-05 20:42:48

by Chris Metcalf

Subject: [PATCH 06/20] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

The LSI MEGARAID SAS HBA can do 64-bit DMA to streaming buffers but
not to consistent buffers. In other words, 64-bit DMA is used for disk
data transfers, while 32-bit DMA must be used for control message
transfers. According to LSI, the firmware is not fully functional yet.
This change implements a kind of hybrid dma_ops to support this.
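
A condensed sketch of the idea, paraphrasing the dma_set_coherent_mask()
hunk below (not new logic): when such a device lowers its coherent mask
to 32 bits, only the coherent alloc/free hooks are redirected to the
swiotlb-backed routines, while the streaming map/unmap hooks keep the
direct 64-bit path.

    /* Sketch: hybridize the 64-bit ops for a <=32-bit coherent mask. */
    if (dma_ops == gx_pci_dma_map_ops && mask <= DMA_BIT_MASK(32)) {
            dma_ops->alloc = tile_swiotlb_alloc_coherent; /* 32-bit pool */
            dma_ops->free = tile_swiotlb_free_coherent;
            /* .map_page/.map_sg are untouched: still 64-bit streaming. */
    }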

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/dma-mapping.h | 4 ++--
arch/tile/kernel/pci-dma.c | 17 +++++++++++------
2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index f2ff191..6da0540 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -44,12 +44,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)

static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
{
- return paddr + get_dma_offset(dev);
+ return paddr;
}

static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
{
- return daddr - get_dma_offset(dev);
+ return daddr;
}

static inline void dma_mark_clean(void *addr, size_t size) {}
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index b9fe80e..7e98371 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -357,7 +357,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,

addr = page_to_phys(pg);

- *dma_handle = phys_to_dma(dev, addr);
+ *dma_handle = addr + get_dma_offset(dev);

return page_address(pg);
}
@@ -387,7 +387,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
sg->dma_address = sg_phys(sg);
__dma_prep_pa_range(sg->dma_address, sg->length, direction);

- sg->dma_address = phys_to_dma(dev, sg->dma_address);
+ sg->dma_address = sg->dma_address + get_dma_offset(dev);
#ifdef CONFIG_NEED_SG_DMA_LENGTH
sg->dma_length = sg->length;
#endif
@@ -422,7 +422,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
BUG_ON(offset + size > PAGE_SIZE);
__dma_prep_page(page, offset, size, direction);

- return phys_to_dma(dev, page_to_pa(page) + offset);
+ return page_to_pa(page) + offset + get_dma_offset(dev);
}

static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
@@ -432,7 +432,7 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
{
BUG_ON(!valid_dma_direction(direction));

- dma_address = dma_to_phys(dev, dma_address);
+ dma_address -= get_dma_offset(dev);

__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
dma_address & PAGE_OFFSET, size, direction);
@@ -445,7 +445,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
{
BUG_ON(!valid_dma_direction(direction));

- dma_handle = dma_to_phys(dev, dma_handle);
+ dma_handle -= get_dma_offset(dev);

__dma_complete_pa_range(dma_handle, size, direction);
}
@@ -456,7 +456,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
enum dma_data_direction
direction)
{
- dma_handle = dma_to_phys(dev, dma_handle);
+ dma_handle -= get_dma_offset(dev);

__dma_prep_pa_range(dma_handle, size, direction);
}
@@ -573,6 +573,11 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
if (((dma_ops == gx_pci_dma_map_ops) ||
(dma_ops == gx_legacy_pci_dma_map_ops)) &&
(mask <= DMA_BIT_MASK(32))) {
+ if (dma_ops == gx_pci_dma_map_ops) {
+ dma_ops->alloc = tile_swiotlb_alloc_coherent;
+ dma_ops->free = tile_swiotlb_free_coherent;
+ }
+
if (mask > dev->archdata.max_direct_dma_addr)
mask = dev->archdata.max_direct_dma_addr;
}
--
1.8.3.1

2013-08-05 20:43:09

by Chris Metcalf

Subject: [PATCH 09/20] tile PCI RC: support I/O space access

To enable this functionality, configure CONFIG_TILE_PCI_IO. Without
this flag, the kernel still assigns I/O address ranges to the devices,
but no TRIO resources or mappings are set up to back them.

We assign disjoint I/O address ranges to separate PCIe domains.
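
Concretely, per the pci_gx.c hunk below (a sketch; IO_SPACE_SIZE is
0x10000):

    /*
     * Each PCI domain i owns a disjoint 64KB window of I/O space:
     *   domain 0: [PCIBIOS_MIN_IO, PCIBIOS_MIN_IO + 0xffff]
     *   domain 1: [PCIBIOS_MIN_IO + 0x10000, PCIBIOS_MIN_IO + 0x1ffff]
     *   ...
     * PCIBIOS_MIN_IO is now PAGE_SIZE, so address 0 is never assigned.
     */
    controller->io_space.start = PCIBIOS_MIN_IO + (i * IO_SPACE_SIZE);
    controller->io_space.end = controller->io_space.start +
                               IO_SPACE_SIZE - 1;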

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/Kconfig | 10 ++++
arch/tile/include/asm/io.h | 126 +++++++++++++++++++++++++++++++++++++++----
arch/tile/include/asm/pci.h | 11 +++-
arch/tile/kernel/pci_gx.c | 128 +++++++++++++++++++++++++++++++++++++++++---
4 files changed, 257 insertions(+), 18 deletions(-)

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 24565a7..bfff769 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -396,6 +396,16 @@ config NO_IOMEM
config NO_IOPORT
def_bool !PCI

+config TILE_PCI_IO
+ bool "PCI I/O space support"
+ default n
+ depends on PCI
+ depends on TILEGX
+ ---help---
+ Enable PCI I/O space support on TILEGx. Since the PCI I/O space
+ is used by few modern PCIe endpoint devices, its support is disabled
+ by default to save the TRIO PIO Region resource for other purposes.
+
source "drivers/pci/Kconfig"

config TILE_USB
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 3167291..73b319e 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -19,7 +19,8 @@
#include <linux/bug.h>
#include <asm/page.h>

-#define IO_SPACE_LIMIT 0xfffffffful
+/* Maximum PCI I/O space address supported. */
+#define IO_SPACE_LIMIT 0xffffffff

/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
@@ -281,8 +282,108 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,

#endif

+#if CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO)
+
+static inline u8 inb(unsigned long addr)
+{
+ return readb((volatile void __iomem *) addr);
+}
+
+static inline u16 inw(unsigned long addr)
+{
+ return readw((volatile void __iomem *) addr);
+}
+
+static inline u32 inl(unsigned long addr)
+{
+ return readl((volatile void __iomem *) addr);
+}
+
+static inline void outb(u8 b, unsigned long addr)
+{
+ writeb(b, (volatile void __iomem *) addr);
+}
+
+static inline void outw(u16 b, unsigned long addr)
+{
+ writew(b, (volatile void __iomem *) addr);
+}
+
+static inline void outl(u32 b, unsigned long addr)
+{
+ writel(b, (volatile void __iomem *) addr);
+}
+
+static inline void insb(unsigned long addr, void *buffer, int count)
+{
+ if (count) {
+ u8 *buf = buffer;
+ do {
+ u8 x = inb(addr);
+ *buf++ = x;
+ } while (--count);
+ }
+}
+
+static inline void insw(unsigned long addr, void *buffer, int count)
+{
+ if (count) {
+ u16 *buf = buffer;
+ do {
+ u16 x = inw(addr);
+ *buf++ = x;
+ } while (--count);
+ }
+}
+
+static inline void insl(unsigned long addr, void *buffer, int count)
+{
+ if (count) {
+ u32 *buf = buffer;
+ do {
+ u32 x = inl(addr);
+ *buf++ = x;
+ } while (--count);
+ }
+}
+
+static inline void outsb(unsigned long addr, const void *buffer, int count)
+{
+ if (count) {
+ const u8 *buf = buffer;
+ do {
+ outb(*buf++, addr);
+ } while (--count);
+ }
+}
+
+static inline void outsw(unsigned long addr, const void *buffer, int count)
+{
+ if (count) {
+ const u16 *buf = buffer;
+ do {
+ outw(*buf++, addr);
+ } while (--count);
+ }
+}
+
+static inline void outsl(unsigned long addr, const void *buffer, int count)
+{
+ if (count) {
+ const u32 *buf = buffer;
+ do {
+ outl(*buf++, addr);
+ } while (--count);
+ }
+}
+
+extern void __iomem *ioport_map(unsigned long port, unsigned int len);
+extern void ioport_unmap(void __iomem *addr);
+
+#else
+
/*
- * The Tile architecture does not support IOPORT, even with PCI.
+ * The TilePro architecture does not support IOPORT, even with PCI.
* Unfortunately we can't yet simply not declare these methods,
* since some generic code that compiles into the kernel, but
* we never run, uses them unconditionally.
@@ -290,7 +391,12 @@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src,

static inline long ioport_panic(void)
{
+#ifdef __tilegx__
+ panic("PCI IO space support is disabled. Configure the kernel with"
+ " CONFIG_TILE_PCI_IO to enable it");
+#else
panic("inb/outb and friends do not exist on tile");
+#endif
return 0;
}

@@ -335,13 +441,6 @@ static inline void outl(u32 b, unsigned long addr)
ioport_panic();
}

-#define inb_p(addr) inb(addr)
-#define inw_p(addr) inw(addr)
-#define inl_p(addr) inl(addr)
-#define outb_p(x, addr) outb((x), (addr))
-#define outw_p(x, addr) outw((x), (addr))
-#define outl_p(x, addr) outl((x), (addr))
-
static inline void insb(unsigned long addr, void *buffer, int count)
{
ioport_panic();
@@ -372,6 +471,15 @@ static inline void outsl(unsigned long addr, const void *buffer, int count)
ioport_panic();
}

+#endif /* CHIP_HAS_MMIO() && defined(CONFIG_TILE_PCI_IO) */
+
+#define inb_p(addr) inb(addr)
+#define inw_p(addr) inw(addr)
+#define inl_p(addr) inl(addr)
+#define outb_p(x, addr) outb((x), (addr))
+#define outw_p(x, addr) outw((x), (addr))
+#define outl_p(x, addr) outl((x), (addr))
+
#define ioread16be(addr) be16_to_cpu(ioread16(addr))
#define ioread32be(addr) be32_to_cpu(ioread32(addr))
#define iowrite16be(v, addr) iowrite16(be16_to_cpu(v), (addr))
diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index cd10e65..9cf5308 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -144,6 +144,10 @@ struct pci_controller {

int pio_mem_index; /* PIO region index for memory access */

+#ifdef CONFIG_TILE_PCI_IO
+ int pio_io_index; /* PIO region index for I/O space access */
+#endif
+
/*
* Mem-Map regions for all the memory controllers so that Linux can
* map all of its physical memory space to the PCI bus.
@@ -153,6 +157,10 @@ struct pci_controller {
int index; /* PCI domain number */
struct pci_bus *root_bus;

+ /* PCI I/O space resource for this controller. */
+ struct resource io_space;
+ char io_space_name[32];
+
/* PCI memory space resource for this controller. */
struct resource mem_space;
char mem_space_name[32];
@@ -210,7 +218,8 @@ static inline int pcibios_assign_all_busses(void)
}

#define PCIBIOS_MIN_MEM 0
-#define PCIBIOS_MIN_IO 0
+/* Minimum PCI I/O address, starting at the page boundary. */
+#define PCIBIOS_MIN_IO PAGE_SIZE

/* Use any cpu for PCI. */
#define cpumask_of_pcibus(bus) cpu_online_mask
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index bf8c69d..fde3589 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -77,6 +77,9 @@ static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
/* Default number of seconds that the PCIe RC port probe can be delayed. */
#define DEFAULT_RC_DELAY 10

+/* The PCI I/O space size in each PCI domain. */
+#define IO_SPACE_SIZE 0x10000
+
/* Array of the PCIe ports configuration info obtained from the BIB. */
struct pcie_port_property pcie_ports[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];

@@ -421,6 +424,17 @@ out:
controller->index = i;
controller->ops = &tile_cfg_ops;

+ controller->io_space.start = PCIBIOS_MIN_IO +
+ (i * IO_SPACE_SIZE);
+ controller->io_space.end = controller->io_space.start +
+ IO_SPACE_SIZE - 1;
+ BUG_ON(controller->io_space.end > IO_SPACE_LIMIT);
+ controller->io_space.flags = IORESOURCE_IO;
+ snprintf(controller->io_space_name,
+ sizeof(controller->io_space_name),
+ "PCI I/O domain %d", i);
+ controller->io_space.name = controller->io_space_name;
+
/*
* The PCI memory resource is located above the PA space.
* For every host bridge, the BAR window or the MMIO aperture
@@ -861,17 +875,16 @@ int __init pcibios_init(void)
/*
* The PCI memory resource is located above the PA space.
* The memory range for the PCI root bus should not overlap
- * with the physical RAM
+ * with the physical RAM.
*/
pci_add_resource_offset(&resources, &controller->mem_space,
controller->mem_offset);
-
+ pci_add_resource(&resources, &controller->io_space);
controller->first_busno = next_busno;
bus = pci_scan_root_bus(NULL, next_busno, controller->ops,
controller, &resources);
controller->root_bus = bus;
next_busno = bus->busn_res.end + 1;
-
}

/* Do machine dependent PCI interrupt routing */
@@ -915,9 +928,9 @@ int __init pcibios_init(void)
pci_controllers[i].mem_resources[0] =
*next_bus->resource[0];
pci_controllers[i].mem_resources[1] =
- *next_bus->resource[1];
+ *next_bus->resource[1];
pci_controllers[i].mem_resources[2] =
- *next_bus->resource[2];
+ *next_bus->resource[2];

break;
}
@@ -967,6 +980,39 @@ int __init pcibios_init(void)
continue;
}

+#ifdef CONFIG_TILE_PCI_IO
+ /*
+ * Alloc a PIO region for PCI I/O space access for each RC port.
+ */
+ ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
+ if (ret < 0) {
+ pr_err("PCI: I/O PIO alloc failure on TRIO %d mac %d, "
+ "give up\n", controller->trio_index,
+ controller->mac);
+
+ continue;
+ }
+
+ controller->pio_io_index = ret;
+
+ /*
+ * For PIO IO, the bus_address_hi parameter is hard-coded 0
+ * because PCI I/O address space is 32-bit.
+ */
+ ret = gxio_trio_init_pio_region_aux(trio_context,
+ controller->pio_io_index,
+ controller->mac,
+ 0,
+ HV_TRIO_PIO_FLAG_IO_SPACE);
+ if (ret < 0) {
+ pr_err("PCI: I/O PIO init failure on TRIO %d mac %d, "
+ "give up\n", controller->trio_index,
+ controller->mac);
+
+ continue;
+ }
+#endif
+
/*
* Configure a Mem-Map region for each memory controller so
* that Linux can map all of its PA space to the PCI bus.
@@ -1052,8 +1098,7 @@ char *pcibios_setup(char *str)

/*
* Enable memory address decoding, as appropriate, for the
- * device described by the 'dev' struct. The I/O decoding
- * is disabled, though the TILE-Gx supports I/O addressing.
+ * device described by the 'dev' struct.
*
* This is called from the generic PCI layer, and can be called
* for bridges or endpoints.
@@ -1134,10 +1179,77 @@ got_it:
* We need to keep the PCI bus address's in-page offset in the VA.
*/
return iorpc_ioremap(trio_fd, offset, size) +
- (phys_addr & (PAGE_SIZE - 1));
+ (start & (PAGE_SIZE - 1));
}
EXPORT_SYMBOL(ioremap);

+#ifdef CONFIG_TILE_PCI_IO
+/* Map a PCI I/O address into VA space. */
+void __iomem *ioport_map(unsigned long port, unsigned int size)
+{
+ struct pci_controller *controller = NULL;
+ resource_size_t bar_start;
+ resource_size_t bar_end;
+ resource_size_t offset;
+ resource_size_t start;
+ resource_size_t end;
+ int trio_fd;
+ int i;
+
+ start = port;
+ end = port + size - 1;
+
+ /*
+ * In the following, each PCI controller's mem_resources[0]
+ * represents its PCI I/O resource. By searching port in each
+ * controller's mem_resources[0], we can determine the controller
+ * that should accept the PCI I/O access.
+ */
+
+ for (i = 0; i < num_rc_controllers; i++) {
+ /*
+ * Skip controllers that are not properly initialized or
+ * have down links.
+ */
+ if (pci_controllers[i].root_bus == NULL)
+ continue;
+
+ bar_start = pci_controllers[i].mem_resources[0].start;
+ bar_end = pci_controllers[i].mem_resources[0].end;
+
+ if ((start >= bar_start) && (end <= bar_end)) {
+
+ controller = &pci_controllers[i];
+
+ goto got_it;
+ }
+ }
+
+ if (controller == NULL)
+ return NULL;
+
+got_it:
+ trio_fd = controller->trio->fd;
+
+ /* Convert the resource start to the bus address offset. */
+ port -= controller->io_space.start;
+
+ offset = HV_TRIO_PIO_OFFSET(controller->pio_io_index) + port;
+
+ /*
+ * We need to keep the PCI bus address's in-page offset in the VA.
+ */
+ return iorpc_ioremap(trio_fd, offset, size) + (port & (PAGE_SIZE - 1));
+}
+EXPORT_SYMBOL(ioport_map);
+
+void ioport_unmap(void __iomem *addr)
+{
+ iounmap(addr);
+}
+EXPORT_SYMBOL(ioport_unmap);
+#endif
+
void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
{
iounmap(addr);
--
1.8.3.1

2013-08-05 20:43:15

by Chris Metcalf

Subject: [PATCH 16/20] tile PCI RC: add dma_get_required_mask()

The standard kernel function dma_get_required_mask() uses the
highest DRAM address to determine whether 32-bit or 64-bit DMA
addressing is needed. This only works on architectures that have a
direct mapping between the PA and the PCI address space, i.e. those
that either have no I/O TLB or have one but choose direct mapping.
Neither is true for tilegx. Whether to use 64-bit DMA should depend
only on the PCI device's capability, not on the amount of DRAM
installed, so we now advertise a 64-bit DMA mask unconditionally.
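
For context, this is the standard way a driver consumes the hint (a
generic sketch, not code from this series; "use_dac" is an illustrative
flag):

    /*
     * Only pay for 64-bit descriptors when the platform says they
     * could ever be needed.  After this patch, tilegx always reports
     * a 64-bit required mask, so capable devices take the first
     * branch regardless of installed DRAM.
     */
    if (dma_get_required_mask(&pdev->dev) > DMA_BIT_MASK(32) &&
        !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)))
            use_dac = 1;        /* 64-bit DMA */
    else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)))
            use_dac = 0;        /* 32-bit DMA */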

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/dma-mapping.h | 4 ++++
arch/tile/kernel/pci-dma.c | 18 ++++++++++++++++++
2 files changed, 22 insertions(+)

diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 6da0540..bd8387d 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -20,6 +20,10 @@
#include <linux/cache.h>
#include <linux/io.h>

+#ifdef __tilegx__
+#define ARCH_HAS_DMA_GET_REQUIRED_MASK
+#endif
+
extern struct dma_map_ops *tile_dma_map_ops;
extern struct dma_map_ops *gx_pci_dma_map_ops;
extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index d16cfc1..3570ff7 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -590,3 +590,21 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_coherent_mask);
#endif
+
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+/*
+ * The generic dma_get_required_mask() uses the highest physical address
+ * (max_pfn) to provide the hint to the PCI drivers regarding 32-bit or
+ * 64-bit DMA configuration. Since TILEGx has I/O TLB/MMU, allowing the
+ * DMAs to use the full 64-bit PCI address space and not limited by
+ * the physical memory space, we always let the PCI devices use
+ * 64-bit DMA if they have that capability, by returning the 64-bit
+ * DMA mask here. The device driver has the option to use 32-bit DMA if
+ * the device is not capable of 64-bit DMA.
+ */
+u64 dma_get_required_mask(struct device *dev)
+{
+ return DMA_BIT_MASK(64);
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+#endif
--
1.8.3.1

2013-08-05 20:43:24

by Chris Metcalf

Subject: [PATCH 20/20] tile PCI RC: remove stale include of linux/numa.h

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/pci.h | 1 -
1 file changed, 1 deletion(-)

diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 2c001b2..c99ad44 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -17,7 +17,6 @@

#include <linux/dma-mapping.h>
#include <linux/pci.h>
-#include <linux/numa.h>
#include <asm-generic/pci_iomap.h>

#ifndef __tilegx__
--
1.8.3.1

2013-08-05 20:43:11

by Chris Metcalf

Subject: [PATCH 12/20] tile PCI RC: eliminate pci_controller.mem_resources field

The .mem_resources[] field in the pci_controller struct
is now obsoleted by the .mem_space and .io_space fields.

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/pci.h | 3 --
arch/tile/kernel/pci_gx.c | 71 ++++++++-------------------------------------
2 files changed, 12 insertions(+), 62 deletions(-)

diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 1f1b654..2c001b2 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -173,9 +173,6 @@ struct pci_controller {

/* Table that maps the INTx numbers to Linux irq numbers. */
int irq_intx_table[4];
-
- /* Address ranges that are routed to this controller/bridge. */
- struct resource mem_resources[3];
};

extern struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 8051638..6837be2 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -929,9 +929,6 @@ int __init pcibios_init(void)
struct pci_controller *controller = &pci_controllers[i];
gxio_trio_context_t *trio_context = controller->trio;
struct pci_bus *root_bus = pci_controllers[i].root_bus;
- struct pci_bus *next_bus;
- uint32_t bus_address_hi;
- struct pci_dev *dev;
int ret;
int j;

@@ -945,35 +942,6 @@ int __init pcibios_init(void)
/* Configure the max_payload_size values for this domain. */
fixup_read_and_payload_sizes(controller);

- list_for_each_entry(dev, &root_bus->devices, bus_list) {
- /* Find the PCI host controller, ie. the 1st bridge. */
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI &&
- (PCI_SLOT(dev->devfn) == 0)) {
- next_bus = dev->subordinate;
- pci_controllers[i].mem_resources[0] =
- *next_bus->resource[0];
- pci_controllers[i].mem_resources[1] =
- *next_bus->resource[1];
- pci_controllers[i].mem_resources[2] =
- *next_bus->resource[2];
-
- break;
- }
- }
-
- if (pci_controllers[i].mem_resources[1].flags & IORESOURCE_MEM)
- bus_address_hi =
- pci_controllers[i].mem_resources[1].start >> 32;
- else if (pci_controllers[i].mem_resources[2].flags & IORESOURCE_PREFETCH)
- bus_address_hi =
- pci_controllers[i].mem_resources[2].start >> 32;
- else {
- /* This is unlikely. */
- pr_err("PCI: no memory resources on TRIO %d mac %d\n",
- controller->trio_index, controller->mac);
- continue;
- }
-
/*
* Alloc a PIO region for PCI memory access for each RC port.
*/
@@ -1153,16 +1121,13 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
resource_size_t start;
resource_size_t end;
int trio_fd;
- int i, j;
+ int i;

start = phys_addr;
end = phys_addr + size - 1;

/*
- * In the following, each PCI controller's mem_resources[1]
- * represents its (non-prefetchable) PCI memory resource and
- * mem_resources[2] refers to its prefetchable PCI memory resource.
- * By searching phys_addr in each controller's mem_resources[], we can
+ * By searching phys_addr in each controller's mem_space, we can
* determine the controller that should accept the PCI memory access.
*/

@@ -1174,25 +1139,18 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
if (pci_controllers[i].root_bus == NULL)
continue;

- for (j = 1; j < 3; j++) {
- bar_start =
- pci_controllers[i].mem_resources[j].start;
- bar_end =
- pci_controllers[i].mem_resources[j].end;
-
- if ((start >= bar_start) && (end <= bar_end)) {
+ bar_start = pci_controllers[i].mem_space.start;
+ bar_end = pci_controllers[i].mem_space.end;

- controller = &pci_controllers[i];
-
- goto got_it;
- }
+ if ((start >= bar_start) && (end <= bar_end)) {
+ controller = &pci_controllers[i];
+ break;
}
}

if (controller == NULL)
return NULL;

-got_it:
trio_fd = controller->trio->fd;

/* Convert the resource start to the bus address offset. */
@@ -1225,10 +1183,8 @@ void __iomem *ioport_map(unsigned long port, unsigned int size)
end = port + size - 1;

/*
- * In the following, each PCI controller's mem_resources[0]
- * represents its PCI I/O resource. By searching port in each
- * controller's mem_resources[0], we can determine the controller
- * that should accept the PCI I/O access.
+ * By searching the port in each controller's io_space, we can
+ * determine the controller that should accept the PCI I/O access.
*/

for (i = 0; i < num_rc_controllers; i++) {
@@ -1239,21 +1195,18 @@ void __iomem *ioport_map(unsigned long port, unsigned int size)
if (pci_controllers[i].root_bus == NULL)
continue;

- bar_start = pci_controllers[i].mem_resources[0].start;
- bar_end = pci_controllers[i].mem_resources[0].end;
+ bar_start = pci_controllers[i].io_space.start;
+ bar_end = pci_controllers[i].io_space.end;

if ((start >= bar_start) && (end <= bar_end)) {
-
controller = &pci_controllers[i];
-
- goto got_it;
+ break;
}
}

if (controller == NULL)
return NULL;

-got_it:
trio_fd = controller->trio->fd;

/* Convert the resource start to the bus address offset. */
--
1.8.3.1

2013-08-05 20:43:21

by Chris Metcalf

Subject: [PATCH 18/20] tile PCI RC: support PCIe TRIO 0 MAC 0 on Gx72 system

On Tilera Gx72 systems, the logic for figuring out whether
a given port is root complex is slightly different.
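
Condensing the rules spelled out in the comment below into the new
predicate (a paraphrase of the pci_gx.c hunk, not new logic):

    /*
     * Gx72: RC if the BIB allows RC and either forbids EP or the
     * strapping pins select RC.  Otherwise (Gx36): RC only if the
     * BIB allows RC and the strapping pins select RC.
     */
    if (pcie_ports[i].is_gx72 && pcie_ports[i].ports[j].allow_rc)
            is_rc = !pcie_ports[i].ports[j].allow_ep ||
                    strapped_for_rc(context, j);
    else
            is_rc = pcie_ports[i].ports[j].allow_rc &&
                    strapped_for_rc(context, j);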

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/hv/drv_trio_intf.h | 3 +++
arch/tile/kernel/pci_gx.c | 33 ++++++++++++++++++++++++++++++---
2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/arch/tile/include/hv/drv_trio_intf.h b/arch/tile/include/hv/drv_trio_intf.h
index ec643a0..237e04d 100644
--- a/arch/tile/include/hv/drv_trio_intf.h
+++ b/arch/tile/include/hv/drv_trio_intf.h
@@ -168,6 +168,9 @@ pcie_stream_intr_config_sel_t;
struct pcie_trio_ports_property
{
struct pcie_port_property ports[TILEGX_TRIO_PCIES];
+
+ /** Set if this TRIO belongs to a Gx72 device. */
+ uint8_t is_gx72;
};

/* Flags indicating traffic class. */
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index de5008b..f2bf200 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -436,9 +436,26 @@ int __init tile_pci_init(void)

/*
* Now determine which PCIe ports are configured to operate in RC
- * mode. To use a port, it must be allowed to be in RC mode by the
+ * mode. There is a difference in the port configuration capability
+ * between the Gx36 and Gx72 devices.
+ *
+ * The Gx36 has configuration capability for each of the 3 PCIe
+ * interfaces (disable, auto endpoint, auto RC, etc.).
+ * On the Gx72, you can only select one of the 3 PCIe interfaces per
+ * TRIO to train automatically. Further, the allowable training modes
+ * are reduced to four options (auto endpoint, auto RC, stream x1,
+ * stream x4).
+ *
+ * For Gx36 ports, it must be allowed to be in RC mode by the
* Board Information Block, and the hardware strapping pins must be
* set to RC mode.
+ *
+ * For Gx72 ports, the port will operate in RC mode if either of the
+ * following is true:
+ * 1. It is allowed to be in RC mode by the Board Information Block,
+ * and the BIB doesn't allow the EP mode.
+ * 2. It is allowed to be in either the RC or the EP mode by the BIB,
+ * and the hardware strapping pin is set to RC mode.
*/
for (i = 0; i < TILEGX_NUM_TRIO; i++) {
gxio_trio_context_t *context = &trio_contexts[i];
@@ -447,8 +464,18 @@ int __init tile_pci_init(void)
continue;

for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
- if (pcie_ports[i].ports[j].allow_rc &&
- strapped_for_rc(context, j)) {
+ int is_rc = 0;
+
+ if (pcie_ports[i].is_gx72 &&
+ pcie_ports[i].ports[j].allow_rc) {
+ if (!pcie_ports[i].ports[j].allow_ep ||
+ strapped_for_rc(context, j))
+ is_rc = 1;
+ } else if (pcie_ports[i].ports[j].allow_rc &&
+ strapped_for_rc(context, j)) {
+ is_rc = 1;
+ }
+ if (is_rc) {
pcie_rc[i][j] = 1;
num_rc_controllers++;
}
--
1.8.3.1

2013-08-05 20:44:08

by Chris Metcalf

Subject: [PATCH 19/20] tile PCI RC: reduce driver's vmalloc space usage

We can take advantage of the fact that bit 29 is hard-wired
to zero in register TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR.
This is handy since at the moment we only allocate one 4GB
region for vmalloc, and with this change we can map the config
space of four or more TRIO MACs without using up all the
vmalloc space.
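
To spell out the arithmetic from the comment added below: with
TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT at 30, a full per-MAC
config mapping covers 1 << 30 bytes = 1GB of VA. Bit 29 always reads
as zero, so mapping only 1 << 29 bytes = 512MB per MAC loses nothing,
and four MACs then occupy 2GB of the single 4GB vmalloc region instead
of all of it.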

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci_gx.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index f2bf200..4843881 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -860,9 +860,15 @@ int __init pcibios_init(void)

#endif

+ /*
+ * To save VMALLOC space, we take advantage of the fact that
+ * bit 29 in the PIO CFG address format is reserved 0. With
+ * TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT being 30,
+ * this cuts VMALLOC space usage from 1GB to 512MB per mac.
+ */
trio_context->mmio_base_pio_cfg[mac] =
- iorpc_ioremap(trio_context->fd, offset,
- (1 << TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT));
+ iorpc_ioremap(trio_context->fd, offset, (1UL <<
+ (TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT - 1)));
if (trio_context->mmio_base_pio_cfg[mac] == NULL) {
pr_err("PCI: PIO map failure for mac %d on TRIO %d\n",
mac, trio_index);
--
1.8.3.1

2013-08-05 20:44:32

by Chris Metcalf

Subject: [PATCH 17/20] tile PCI DMA: fix bug in non-page-aligned accessors

The code incorrectly masked with PAGE_OFFSET instead of
PAGE_SIZE-1. This only matters when doing a non-page-aligned
DMA; it was noticed during code inspection.
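
A sketch of why the old mask was wrong (page size illustrative):

    /*
     * PAGE_SIZE - 1 (e.g. 0xffff with 64KB pages) is the mask that
     * keeps the in-page byte offset; PAGE_OFFSET is the base VA of
     * the kernel's direct map -- a large address, not a mask.  So
     * "dma_address & PAGE_OFFSET" produced a bogus offset whenever
     * the DMA address was not page-aligned:
     */
    offset = dma_address & (PAGE_SIZE - 1); /* correct in-page offset */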

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci-dma.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 3570ff7..cbc6fc4 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -257,7 +257,7 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
BUG_ON(!valid_dma_direction(direction));

__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
- dma_address & PAGE_OFFSET, size, direction);
+ dma_address & (PAGE_SIZE - 1), size, direction);
}

static void tile_dma_sync_single_for_cpu(struct device *dev,
@@ -436,7 +436,7 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
dma_address -= get_dma_offset(dev);

__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
- dma_address & PAGE_OFFSET, size, direction);
+ dma_address & (PAGE_SIZE - 1), size, direction);
}

static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
--
1.8.3.1

2013-08-05 20:44:55

by Chris Metcalf

Subject: [PATCH 15/20] tile PCI RC: use proper accessor function

Using the low-level hv_dev_pread() API makes assumptions about the
layout of data structures in the Tilera hypervisor API; it's better to
use the gxio_XXX accessor and the pcie_trio_ports_property struct.

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci_gx.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 8352d85..de5008b 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -91,7 +91,7 @@ static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1

/* Array of the PCIe ports configuration info obtained from the BIB. */
-struct pcie_port_property pcie_ports[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
+struct pcie_trio_ports_property pcie_ports[TILEGX_NUM_TRIO];

/* Number of configured TRIO instances. */
int num_trio_shims;
@@ -195,10 +195,7 @@ static int tile_pcie_open(int trio_index)
#endif

/* Get the properties of the PCIe ports on this TRIO instance. */
- ret = hv_dev_pread(context->fd, 0,
- (HV_VirtAddr)&pcie_ports[trio_index][0],
- sizeof(struct pcie_port_property) * TILEGX_TRIO_PCIES,
- GXIO_TRIO_OP_GET_PORT_PROPERTY);
+ ret = gxio_trio_get_port_property(context, &pcie_ports[trio_index]);
if (ret < 0) {
pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
" on TRIO %d\n", ret, trio_index);
@@ -221,8 +218,8 @@ static int tile_pcie_open(int trio_index)
unsigned int reg_offset;

/* Ignore ports that are not specified in the BIB. */
- if (!pcie_ports[trio_index][mac].allow_rc &&
- !pcie_ports[trio_index][mac].allow_ep)
+ if (!pcie_ports[trio_index].ports[mac].allow_rc &&
+ !pcie_ports[trio_index].ports[mac].allow_ep)
continue;

reg_offset =
@@ -243,7 +240,7 @@ static int tile_pcie_open(int trio_index)
*/
if (port_config.strap_state == AUTO_CONFIG_EP ||
port_config.strap_state == AUTO_CONFIG_EP_G1)
- pcie_ports[trio_index][mac].allow_ep = 1;
+ pcie_ports[trio_index].ports[mac].allow_ep = 1;
}
}

@@ -438,9 +435,10 @@ int __init tile_pci_init(void)
return 0;

/*
- * Now determine which PCIe ports are configured to operate in RC mode.
- * We look at the Board Information Block first and then see if there
- * are any overriding configuration by the HW strapping pin.
+ * Now determine which PCIe ports are configured to operate in RC
+ * mode. To use a port, it must be allowed to be in RC mode by the
+ * Board Information Block, and the hardware strapping pins must be
+ * set to RC mode.
*/
for (i = 0; i < TILEGX_NUM_TRIO; i++) {
gxio_trio_context_t *context = &trio_contexts[i];
@@ -449,7 +447,7 @@ int __init tile_pci_init(void)
continue;

for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
- if (pcie_ports[i][j].allow_rc &&
+ if (pcie_ports[i].ports[j].allow_rc &&
strapped_for_rc(context, j)) {
pcie_rc[i][j] = 1;
num_rc_controllers++;
@@ -736,7 +734,7 @@ int __init pcibios_init(void)
__gxio_mmio_read(trio_context->mmio_base_mac +
reg_offset);
if (!port_status.dl_up) {
- if (pcie_ports[trio_index][mac].removable) {
+ if (pcie_ports[trio_index].ports[mac].removable) {
pr_info("PCI: link is down, MAC %d on TRIO %d\n",
mac, trio_index);
pr_info("This is expected if no PCIe card"
--
1.8.3.1

2013-08-05 20:46:24

by Chris Metcalf

Subject: [PATCH 14/20] tile PCI RC: clean up comments and whitespace format

This change is purely stylistic but improves the readability
of the tile PCI RC driver.

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci_gx.c | 180 +++++++++++++++-------------------------------
1 file changed, 56 insertions(+), 124 deletions(-)

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 6837be2..8352d85 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -108,17 +108,15 @@ static struct pci_ops tile_cfg_ops;
/* Mask of CPUs that should receive PCIe interrupts. */
static struct cpumask intr_cpus_map;

-/*
- * We don't need to worry about the alignment of resources.
- */
+/* We don't need to worry about the alignment of resources. */
resource_size_t pcibios_align_resource(void *data, const struct resource *res,
- resource_size_t size, resource_size_t align)
+ resource_size_t size,
+ resource_size_t align)
{
return res->start;
}
EXPORT_SYMBOL(pcibios_align_resource);

-
/*
* Pick a CPU to receive and handle the PCIe interrupts, based on the IRQ #.
* For now, we simply send interrupts to non-dataplane CPUs.
@@ -146,25 +144,19 @@ static int tile_irq_cpu(int irq)
return cpu;
}

-/*
- * Open a file descriptor to the TRIO shim.
- */
+/* Open a file descriptor to the TRIO shim. */
static int tile_pcie_open(int trio_index)
{
gxio_trio_context_t *context = &trio_contexts[trio_index];
int ret;
int mac;

- /*
- * This opens a file descriptor to the TRIO shim.
- */
+ /* This opens a file descriptor to the TRIO shim. */
ret = gxio_trio_init(context, trio_index);
if (ret < 0)
goto gxio_trio_init_failure;

- /*
- * Allocate an ASID for the kernel.
- */
+ /* Allocate an ASID for the kernel. */
ret = gxio_trio_alloc_asids(context, 1, 0, 0);
if (ret < 0) {
pr_err("PCI: ASID alloc failure on TRIO %d, give up\n",
@@ -285,20 +277,17 @@ static int __init tile_trio_init(void)
}
postcore_initcall(tile_trio_init);

-static void
-tilegx_legacy_irq_ack(struct irq_data *d)
+static void tilegx_legacy_irq_ack(struct irq_data *d)
{
__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
}

-static void
-tilegx_legacy_irq_mask(struct irq_data *d)
+static void tilegx_legacy_irq_mask(struct irq_data *d)
{
__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
}

-static void
-tilegx_legacy_irq_unmask(struct irq_data *d)
+static void tilegx_legacy_irq_unmask(struct irq_data *d)
{
__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
}
@@ -319,8 +308,7 @@ static struct irq_chip tilegx_legacy_irq_chip = {
* to Linux which just calls handle_level_irq() after clearing the
* MAC INTx Assert status bit associated with this interrupt.
*/
-static void
-trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
+static void trio_handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
struct pci_controller *controller = irq_desc_get_handler_data(desc);
gxio_trio_context_t *trio_context = controller->trio;
@@ -386,9 +374,7 @@ static int tile_init_irqs(struct pci_controller *controller)
goto free_irqs;
}

- /*
- * Register the IRQ handler with the kernel.
- */
+ /* Register the IRQ handler with the kernel. */
irq_set_chip_and_handler(irq, &tilegx_legacy_irq_chip,
trio_handle_level_irq);
irq_set_chip_data(irq, (void *)(uint64_t)i);
@@ -471,15 +457,11 @@ int __init tile_pci_init(void)
}
}

- /*
- * Return if no PCIe ports are configured to operate in RC mode.
- */
+ /* Return if no PCIe ports are configured to operate in RC mode. */
if (num_rc_controllers == 0)
return 0;

- /*
- * Set the TRIO pointer and MAC index for each PCIe RC port.
- */
+ /* Set the TRIO pointer and MAC index for each PCIe RC port. */
for (i = 0; i < TILEGX_NUM_TRIO; i++) {
for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
if (pcie_rc[i][j]) {
@@ -495,14 +477,10 @@ int __init tile_pci_init(void)
}

out:
- /*
- * Configure each PCIe RC port.
- */
+ /* Configure each PCIe RC port. */
for (i = 0; i < num_rc_controllers; i++) {
- /*
- * Configure the PCIe MAC to run in RC mode.
- */

+ /* Configure the PCIe MAC to run in RC mode. */
struct pci_controller *controller = &pci_controllers[i];

controller->index = i;
@@ -525,7 +503,6 @@ out:
* is in range [3GB, 4GB - 1] of a 4GB space beyond the
* PA space.
*/
-
controller->mem_offset = TILE_PCI_MEM_START +
(i * TILE_PCI_BAR_WINDOW_TOP);
controller->mem_space.start = controller->mem_offset +
@@ -553,7 +530,6 @@ static int tile_map_irq(const struct pci_dev *dev, u8 device, u8 pin)
return controller->irq_intx_table[pin - 1];
}

-
static void fixup_read_and_payload_sizes(struct pci_controller *controller)
{
gxio_trio_context_t *trio_context = controller->trio;
@@ -567,9 +543,7 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)

mac = controller->mac;

- /*
- * Set our max read request size to be 4KB.
- */
+ /* Set our max read request size to be 4KB. */
reg_offset =
(TRIO_PCIE_RC_DEVICE_CONTROL <<
TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -578,10 +552,10 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);

dev_control.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
- reg_offset);
+ reg_offset);
dev_control.max_read_req_sz = 5;
__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
- dev_control.word);
+ dev_control.word);

/*
* Set the max payload size supported by this Gx PCIe MAC.
@@ -597,10 +571,10 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
(mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);

rc_dev_cap.word = __gxio_mmio_read32(trio_context->mmio_base_mac +
- reg_offset);
+ reg_offset);
rc_dev_cap.mps_sup = 1;
__gxio_mmio_write32(trio_context->mmio_base_mac + reg_offset,
- rc_dev_cap.word);
+ rc_dev_cap.word);

/* Configure PCI Express MPS setting. */
list_for_each_entry(child, &root_bus->children, node) {
@@ -628,7 +602,7 @@ static void fixup_read_and_payload_sizes(struct pci_controller *controller)
dev_control.max_payload_size,
dev_control.max_read_req_sz,
mac);
- if (err < 0) {
+ if (err < 0) {
pr_err("PCI: PCIE_CONFIGURE_MAC_MPS_MRS failure, "
"MAC %d on TRIO %d\n",
mac, controller->trio_index);
@@ -672,9 +646,7 @@ static int setup_pcie_rc_delay(char *str)
}
early_param("pcie_rc_delay", setup_pcie_rc_delay);

-/*
- * PCI initialization entry point, called by subsys_initcall.
- */
+/* PCI initialization entry point, called by subsys_initcall. */
int __init pcibios_init(void)
{
resource_size_t offset;
@@ -744,9 +716,7 @@ int __init pcibios_init(void)
pr_info("PCI: Found PCI controller #%d on TRIO %d MAC %d\n", i,
trio_index, controller->mac);

- /*
- * Delay the bus probe if needed.
- */
+ /* Delay the bus probe if needed. */
if (rc_delay[trio_index][mac]) {
pr_info("Delaying PCIe RC bus enumerating %d sec"
" on MAC %d on TRIO %d\n",
@@ -761,9 +731,7 @@ int __init pcibios_init(void)
msleep(1000);
}

- /*
- * Check for PCIe link-up status again.
- */
+ /* Check for PCIe link-up status again. */
port_status.word =
__gxio_mmio_read(trio_context->mmio_base_mac +
reg_offset);
@@ -801,7 +769,6 @@ int __init pcibios_init(void)
* Change the device ID so that Linux bus crawl doesn't confuse
* the internal bridge with any Tilera endpoints.
*/
-
reg_offset =
(TRIO_PCIE_RC_DEVICE_ID_VEN_ID <<
TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -814,10 +781,7 @@ int __init pcibios_init(void)
TRIO_PCIE_RC_DEVICE_ID_VEN_ID__DEV_ID_SHIFT) |
TILERA_VENDOR_ID);

- /*
- * Set the internal P2P bridge class code.
- */
-
+ /* Set the internal P2P bridge class code. */
reg_offset =
(TRIO_PCIE_RC_REVISION_ID <<
TRIO_CFG_REGION_ADDR__REG_SHIFT) |
@@ -828,26 +792,22 @@ int __init pcibios_init(void)
class_code_revision =
__gxio_mmio_read32(trio_context->mmio_base_mac +
reg_offset);
- class_code_revision = (class_code_revision & 0xff ) |
- (PCI_CLASS_BRIDGE_PCI << 16);
+ class_code_revision = (class_code_revision & 0xff) |
+ (PCI_CLASS_BRIDGE_PCI << 16);

__gxio_mmio_write32(trio_context->mmio_base_mac +
reg_offset, class_code_revision);

#ifdef USE_SHARED_PCIE_CONFIG_REGION

- /*
- * Map in the MMIO space for the PIO region.
- */
+ /* Map in the MMIO space for the PIO region. */
offset = HV_TRIO_PIO_OFFSET(trio_context->pio_cfg_index) |
(((unsigned long long)mac) <<
TRIO_TILE_PIO_REGION_SETUP_CFG_ADDR__MAC_SHIFT);

#else

- /*
- * Alloc a PIO region for PCI config access per MAC.
- */
+ /* Alloc a PIO region for PCI config access per MAC. */
ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
if (ret < 0) {
pr_err("PCI: PCI CFG PIO alloc failure for mac %d "
@@ -858,9 +818,7 @@ int __init pcibios_init(void)

trio_context->pio_cfg_index[mac] = ret;

- /*
- * For PIO CFG, the bus_address_hi parameter is 0.
- */
+ /* For PIO CFG, the bus_address_hi parameter is 0. */
ret = gxio_trio_init_pio_region_aux(trio_context,
trio_context->pio_cfg_index[mac],
mac, 0, HV_TRIO_PIO_FLAG_CONFIG_SPACE);
@@ -887,9 +845,7 @@ int __init pcibios_init(void)
continue;
}

- /*
- * Initialize the PCIe interrupts.
- */
+ /* Initialize the PCIe interrupts. */
if (tile_init_irqs(controller)) {
pr_err("PCI: IRQs init failure for mac %d on TRIO %d\n",
mac, trio_index);
@@ -921,7 +877,6 @@ int __init pcibios_init(void)
* It allocates all of the resources (I/O memory, etc)
* associated with the devices read in above.
*/
-
pci_assign_unassigned_resources();

/* Record the I/O resources in the PCI controller structure. */
@@ -942,14 +897,12 @@ int __init pcibios_init(void)
/* Configure the max_payload_size values for this domain. */
fixup_read_and_payload_sizes(controller);

- /*
- * Alloc a PIO region for PCI memory access for each RC port.
- */
+ /* Alloc a PIO region for PCI memory access for each RC port. */
ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
if (ret < 0) {
pr_err("PCI: MEM PIO alloc failure on TRIO %d mac %d, "
- "give up\n", controller->trio_index,
- controller->mac);
+ "give up\n", controller->trio_index,
+ controller->mac);

continue;
}
@@ -967,8 +920,8 @@ int __init pcibios_init(void)
0);
if (ret < 0) {
pr_err("PCI: MEM PIO init failure on TRIO %d mac %d, "
- "give up\n", controller->trio_index,
- controller->mac);
+ "give up\n", controller->trio_index,
+ controller->mac);

continue;
}
@@ -980,8 +933,8 @@ int __init pcibios_init(void)
ret = gxio_trio_alloc_pio_regions(trio_context, 1, 0, 0);
if (ret < 0) {
pr_err("PCI: I/O PIO alloc failure on TRIO %d mac %d, "
- "give up\n", controller->trio_index,
- controller->mac);
+ "give up\n", controller->trio_index,
+ controller->mac);

continue;
}
@@ -999,8 +952,8 @@ int __init pcibios_init(void)
HV_TRIO_PIO_FLAG_IO_SPACE);
if (ret < 0) {
pr_err("PCI: I/O PIO init failure on TRIO %d mac %d, "
- "give up\n", controller->trio_index,
- controller->mac);
+ "give up\n", controller->trio_index,
+ controller->mac);

continue;
}
@@ -1020,9 +973,9 @@ int __init pcibios_init(void)
0);
if (ret < 0) {
pr_err("PCI: Mem-Map alloc failure on TRIO %d "
- "mac %d for MC %d, give up\n",
- controller->trio_index,
- controller->mac, j);
+ "mac %d for MC %d, give up\n",
+ controller->trio_index,
+ controller->mac, j);

goto alloc_mem_map_failed;
}
@@ -1053,9 +1006,9 @@ int __init pcibios_init(void)
GXIO_TRIO_ORDER_MODE_UNORDERED);
if (ret < 0) {
pr_err("PCI: Mem-Map init failure on TRIO %d "
- "mac %d for MC %d, give up\n",
- controller->trio_index,
- controller->mac, j);
+ "mac %d for MC %d, give up\n",
+ controller->trio_index,
+ controller->mac, j);

goto alloc_mem_map_failed;
}
@@ -1064,22 +1017,18 @@ int __init pcibios_init(void)
alloc_mem_map_failed:
break;
}
-
}

return 0;
}
subsys_initcall(pcibios_init);

-/* Note: to be deleted after Linux 3.6 merge. */
+/* No bus fixups needed. */
void pcibios_fixup_bus(struct pci_bus *bus)
{
}

-/*
- * This can be called from the generic PCI layer, but doesn't need to
- * do anything.
- */
+/* Process any "pci=" kernel boot arguments. */
char *pcibios_setup(char *str)
{
if (!strcmp(str, "off")) {
@@ -1130,7 +1079,6 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
* By searching phys_addr in each controller's mem_space, we can
* determine the controller that should accept the PCI memory access.
*/
-
for (i = 0; i < num_rc_controllers; i++) {
/*
* Skip controllers that are not properly initialized or
@@ -1158,9 +1106,7 @@ void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)

offset = HV_TRIO_PIO_OFFSET(controller->pio_mem_index) + start;

- /*
- * We need to keep the PCI bus address's in-page offset in the VA.
- */
+ /* We need to keep the PCI bus address's in-page offset in the VA. */
return iorpc_ioremap(trio_fd, offset, size) +
(start & (PAGE_SIZE - 1));
}
@@ -1186,7 +1132,6 @@ void __iomem *ioport_map(unsigned long port, unsigned int size)
* By searching the port in each controller's io_space, we can
* determine the controller that should accept the PCI I/O access.
*/
-
for (i = 0; i < num_rc_controllers; i++) {
/*
* Skip controllers that are not properly initialized or
@@ -1214,9 +1159,7 @@ void __iomem *ioport_map(unsigned long port, unsigned int size)

offset = HV_TRIO_PIO_OFFSET(controller->pio_io_index) + port;

- /*
- * We need to keep the PCI bus address's in-page offset in the VA.
- */
+ /* We need to keep the PCI bus address's in-page offset in the VA. */
return iorpc_ioremap(trio_fd, offset, size) + (port & (PAGE_SIZE - 1));
}
EXPORT_SYMBOL(ioport_map);
@@ -1249,7 +1192,6 @@ EXPORT_SYMBOL(pci_iounmap);
* offset is in bytes, from the start of config space for the
* specified bus & device.
*/
-
static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
int size, u32 *val)
{
@@ -1299,7 +1241,6 @@ static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
* Accesses to the directly attached device have to be
* sent as type-0 configs.
*/
-
if (busnum == (controller->first_busno + 1)) {
/*
* There is only one device off of our built-in P2P bridge.
@@ -1321,9 +1262,8 @@ static int tile_cfg_read(struct pci_bus *bus, unsigned int devfn, int offset,
* Note that we don't set the mac field in cfg_addr because the
* mapping is per port.
*/
-
mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
- cfg_addr.word;
+ cfg_addr.word;

valid_device:

@@ -1427,7 +1367,6 @@ static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
* Accesses to the directly attached device have to be
* sent as type-0 configs.
*/
-
if (busnum == (controller->first_busno + 1)) {
/*
* There is only one device off of our built-in P2P bridge.
@@ -1449,7 +1388,6 @@ static int tile_cfg_write(struct pci_bus *bus, unsigned int devfn, int offset,
* Note that we don't set the mac field in cfg_addr because the
* mapping is per port.
*/
-
mmio_addr = trio_context->mmio_base_pio_cfg[controller->mac] +
cfg_addr.word;

@@ -1487,11 +1425,8 @@ static struct pci_ops tile_cfg_ops = {
};


-/*
- * MSI support starts here.
- */
-static unsigned int
-tilegx_msi_startup(struct irq_data *d)
+/* MSI support starts here. */
+static unsigned int tilegx_msi_startup(struct irq_data *d)
{
if (d->msi_desc)
unmask_msi_irq(d);
@@ -1499,21 +1434,18 @@ tilegx_msi_startup(struct irq_data *d)
return 0;
}

-static void
-tilegx_msi_ack(struct irq_data *d)
+static void tilegx_msi_ack(struct irq_data *d)
{
__insn_mtspr(SPR_IPI_EVENT_RESET_K, 1UL << d->irq);
}

-static void
-tilegx_msi_mask(struct irq_data *d)
+static void tilegx_msi_mask(struct irq_data *d)
{
mask_msi_irq(d);
__insn_mtspr(SPR_IPI_MASK_SET_K, 1UL << d->irq);
}

-static void
-tilegx_msi_unmask(struct irq_data *d)
+static void tilegx_msi_unmask(struct irq_data *d)
{
__insn_mtspr(SPR_IPI_MASK_RESET_K, 1UL << d->irq);
unmask_msi_irq(d);
--
1.8.3.1

2013-08-05 20:43:07

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH 10/20] tile PCI DMA: handle a NULL dev argument properly

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci-dma.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 7e98371..d16cfc1 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -36,8 +36,9 @@ static void *tile_dma_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp,
struct dma_attrs *attrs)
{
- u64 dma_mask = dev->coherent_dma_mask ?: DMA_BIT_MASK(32);
- int node = dev_to_node(dev);
+ u64 dma_mask = (dev && dev->coherent_dma_mask) ?
+ dev->coherent_dma_mask : DMA_BIT_MASK(32);
+ int node = dev ? dev_to_node(dev) : 0;
int order = get_order(size);
struct page *pg;
dma_addr_t addr;
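
For context, a minimal sketch of the kind of caller this change guards
against (hypothetical; any code that invokes the DMA API without a
struct device reaches tile_dma_alloc_coherent() with dev == NULL):

        /* Hypothetical caller: no struct device is available, so the
         * DMA API is called with NULL and the allocator must fall
         * back to a 32-bit mask and node 0. */
        dma_addr_t handle;
        void *buf = dma_alloc_coherent(NULL, PAGE_SIZE, &handle, GFP_KERNEL);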
--
1.8.3.1

2013-08-05 20:47:23

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH 13/20] tile PCI RC: include pci/pcie/Kconfig

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/Kconfig | 2 ++
1 file changed, 2 insertions(+)

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index bfff769..e41a381 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -408,6 +408,8 @@ config TILE_PCI_IO

source "drivers/pci/Kconfig"

+source "drivers/pci/pcie/Kconfig"
+
config TILE_USB
tristate "Tilera USB host adapter support"
default y
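
(For reference, sourcing drivers/pci/pcie/Kconfig is what makes the
generic PCIe port-service options, such as PCIEPORTBUS and the AER
service it enables, selectable in the tile configuration.)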
--
1.8.3.1

2013-08-05 20:43:04

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH 11/20] tile PCI RC: restructure TRIO initialization

The TRIO shim initialization is shared with other kernel drivers
such as the endpoint and StreamIO drivers, so reorganize the
initialization flow to ensure that the root complex driver properly
initializes TRIO state regardless of what kind of TRIO driver will
end up using the shim.
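
Concretely, in the diff below the per-shim open moves out of
tile_pci_init() into a new tile_trio_init() run at postcore_initcall
time, and the shim count is exported as num_trio_shims so that any
TRIO client can consult it.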

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/pci.h | 1 +
arch/tile/kernel/pci_gx.c | 209 +++++++++++++++++++++++++-------------------
2 files changed, 118 insertions(+), 92 deletions(-)

diff --git a/arch/tile/include/asm/pci.h b/arch/tile/include/asm/pci.h
index 9cf5308..1f1b654 100644
--- a/arch/tile/include/asm/pci.h
+++ b/arch/tile/include/asm/pci.h
@@ -180,6 +180,7 @@ struct pci_controller {

extern struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
extern gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];
+extern int num_trio_shims;

extern void pci_iounmap(struct pci_dev *dev, void __iomem *);

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index fde3589..8051638 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -80,16 +80,28 @@ static int rc_delay[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];
/* The PCI I/O space size in each PCI domain. */
#define IO_SPACE_SIZE 0x10000

+/* Provide shorter versions of some very long constant names. */
+#define AUTO_CONFIG_RC \
+ TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC
+#define AUTO_CONFIG_RC_G1 \
+ TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1
+#define AUTO_CONFIG_EP \
+ TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT
+#define AUTO_CONFIG_EP_G1 \
+ TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1
+
/* Array of the PCIe ports configuration info obtained from the BIB. */
struct pcie_port_property pcie_ports[TILEGX_NUM_TRIO][TILEGX_TRIO_PCIES];

+/* Number of configured TRIO instances. */
+int num_trio_shims;
+
/* All drivers share the TRIO contexts defined here. */
gxio_trio_context_t trio_contexts[TILEGX_NUM_TRIO];

/* Pointer to an array of PCIe RC controllers. */
struct pci_controller pci_controllers[TILEGX_NUM_TRIO * TILEGX_TRIO_PCIES];
int num_rc_controllers;
-static int num_ep_controllers;

static struct pci_ops tile_cfg_ops;

@@ -141,13 +153,14 @@ static int tile_pcie_open(int trio_index)
{
gxio_trio_context_t *context = &trio_contexts[trio_index];
int ret;
+ int mac;

/*
* This opens a file descriptor to the TRIO shim.
*/
ret = gxio_trio_init(context, trio_index);
if (ret < 0)
- return ret;
+ goto gxio_trio_init_failure;

/*
* Allocate an ASID for the kernel.
@@ -189,17 +202,89 @@ static int tile_pcie_open(int trio_index)
}
#endif

+ /* Get the properties of the PCIe ports on this TRIO instance. */
+ ret = hv_dev_pread(context->fd, 0,
+ (HV_VirtAddr)&pcie_ports[trio_index][0],
+ sizeof(struct pcie_port_property) * TILEGX_TRIO_PCIES,
+ GXIO_TRIO_OP_GET_PORT_PROPERTY);
+ if (ret < 0) {
+ pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
+ " on TRIO %d\n", ret, trio_index);
+ goto get_port_property_failure;
+ }
+
+ context->mmio_base_mac =
+ iorpc_ioremap(context->fd, 0, HV_TRIO_CONFIG_IOREMAP_SIZE);
+ if (context->mmio_base_mac == NULL) {
+ pr_err("PCI: TRIO config space mapping failure, error %d,"
+ " on TRIO %d\n", ret, trio_index);
+ ret = -ENOMEM;
+
+ goto trio_mmio_mapping_failure;
+ }
+
+ /* Check the port strap state which will override the BIB setting. */
+ for (mac = 0; mac < TILEGX_TRIO_PCIES; mac++) {
+ TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+ unsigned int reg_offset;
+
+ /* Ignore ports that are not specified in the BIB. */
+ if (!pcie_ports[trio_index][mac].allow_rc &&
+ !pcie_ports[trio_index][mac].allow_ep)
+ continue;
+
+ reg_offset =
+ (TRIO_PCIE_INTFC_PORT_CONFIG <<
+ TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+ (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+ TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+ (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+ port_config.word =
+ __gxio_mmio_read(context->mmio_base_mac + reg_offset);
+
+ if (port_config.strap_state != AUTO_CONFIG_RC &&
+ port_config.strap_state != AUTO_CONFIG_RC_G1) {
+ /*
+ * If this is really intended to be an EP port, record
+ * it so that the endpoint driver will know about it.
+ */
+ if (port_config.strap_state == AUTO_CONFIG_EP ||
+ port_config.strap_state == AUTO_CONFIG_EP_G1)
+ pcie_ports[trio_index][mac].allow_ep = 1;
+ }
+ }
+
return ret;

+trio_mmio_mapping_failure:
+get_port_property_failure:
asid_alloc_failure:
#ifdef USE_SHARED_PCIE_CONFIG_REGION
pio_alloc_failure:
#endif
hv_dev_close(context->fd);
+gxio_trio_init_failure:
+ context->fd = -1;

return ret;
}

+static int __init tile_trio_init(void)
+{
+ int i;
+
+ /* We loop over all the TRIO shims. */
+ for (i = 0; i < TILEGX_NUM_TRIO; i++) {
+ if (tile_pcie_open(i) < 0)
+ continue;
+ num_trio_shims++;
+ }
+
+ return 0;
+}
+postcore_initcall(tile_trio_init);
+
static void
tilegx_legacy_irq_ack(struct irq_data *d)
{
@@ -320,14 +405,39 @@ free_irqs:
}

/*
+ * Return 1 if the port is strapped to operate in RC mode.
+ */
+static int
+strapped_for_rc(gxio_trio_context_t *trio_context, int mac)
+{
+ TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
+ unsigned int reg_offset;
+
+ /* Check the port configuration. */
+ reg_offset =
+ (TRIO_PCIE_INTFC_PORT_CONFIG <<
+ TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+ (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+ TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+ (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+ port_config.word =
+ __gxio_mmio_read(trio_context->mmio_base_mac + reg_offset);
+
+ if (port_config.strap_state == AUTO_CONFIG_RC ||
+ port_config.strap_state == AUTO_CONFIG_RC_G1)
+ return 1;
+ else
+ return 0;
+}
+
+/*
* Find valid controllers and fill in pci_controller structs for each
* of them.
*
- * Returns the number of controllers discovered.
+ * Return the number of controllers discovered.
*/
int __init tile_pci_init(void)
{
- int num_trio_shims = 0;
int ctl_index = 0;
int i, j;

@@ -338,19 +448,6 @@ int __init tile_pci_init(void)

pr_info("PCI: Searching for controllers...\n");

- /*
- * We loop over all the TRIO shims.
- */
- for (i = 0; i < TILEGX_NUM_TRIO; i++) {
- int ret;
-
- ret = tile_pcie_open(i);
- if (ret < 0)
- continue;
-
- num_trio_shims++;
- }
-
if (num_trio_shims == 0 || sim_is_simulator())
return 0;

@@ -361,29 +458,16 @@ int __init tile_pci_init(void)
*/
for (i = 0; i < TILEGX_NUM_TRIO; i++) {
gxio_trio_context_t *context = &trio_contexts[i];
- int ret;

if (context->fd < 0)
continue;

- ret = hv_dev_pread(context->fd, 0,
- (HV_VirtAddr)&pcie_ports[i][0],
- sizeof(struct pcie_port_property) * TILEGX_TRIO_PCIES,
- GXIO_TRIO_OP_GET_PORT_PROPERTY);
- if (ret < 0) {
- pr_err("PCI: PCIE_GET_PORT_PROPERTY failure, error %d,"
- " on TRIO %d\n", ret, i);
- continue;
- }
-
for (j = 0; j < TILEGX_TRIO_PCIES; j++) {
- if (pcie_ports[i][j].allow_rc) {
+ if (pcie_ports[i][j].allow_rc &&
+ strapped_for_rc(context, j)) {
pcie_rc[i][j] = 1;
num_rc_controllers++;
}
- else if (pcie_ports[i][j].allow_ep) {
- num_ep_controllers++;
- }
}
}

@@ -600,35 +684,10 @@ int __init pcibios_init(void)

tile_pci_init();

- if (num_rc_controllers == 0 && num_ep_controllers == 0)
+ if (num_rc_controllers == 0)
return 0;

/*
- * We loop over all the TRIO shims and set up the MMIO mappings.
- */
- for (i = 0; i < TILEGX_NUM_TRIO; i++) {
- gxio_trio_context_t *context = &trio_contexts[i];
-
- if (context->fd < 0)
- continue;
-
- /*
- * Map in the MMIO space for the MAC.
- */
- offset = 0;
- context->mmio_base_mac =
- iorpc_ioremap(context->fd, offset,
- HV_TRIO_CONFIG_IOREMAP_SIZE);
- if (context->mmio_base_mac == NULL) {
- pr_err("PCI: MAC map failure on TRIO %d\n", i);
-
- hv_dev_close(context->fd);
- context->fd = -1;
- continue;
- }
- }
-
- /*
* Delay a bit in case devices aren't ready. Some devices are
* known to require at least 20ms here, but we use a more
* conservative value.
@@ -639,7 +698,6 @@ int __init pcibios_init(void)
for (next_busno = 0, i = 0; i < num_rc_controllers; i++) {
struct pci_controller *controller = &pci_controllers[i];
gxio_trio_context_t *trio_context = controller->trio;
- TRIO_PCIE_INTFC_PORT_CONFIG_t port_config;
TRIO_PCIE_INTFC_PORT_STATUS_t port_status;
TRIO_PCIE_INTFC_TX_FIFO_CTL_t tx_fifo_ctl;
struct pci_bus *bus;
@@ -656,39 +714,6 @@ int __init pcibios_init(void)
mac = controller->mac;

/*
- * Check the port strap state which will override the BIB
- * setting.
- */
-
- reg_offset =
- (TRIO_PCIE_INTFC_PORT_CONFIG <<
- TRIO_CFG_REGION_ADDR__REG_SHIFT) |
- (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
- TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
- (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
-
- port_config.word =
- __gxio_mmio_read(trio_context->mmio_base_mac +
- reg_offset);
-
- if ((port_config.strap_state !=
- TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC) &&
- (port_config.strap_state !=
- TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_RC_G1)) {
- /*
- * If this is really intended to be an EP port,
- * record it so that the endpoint driver will know about it.
- */
- if (port_config.strap_state ==
- TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT ||
- port_config.strap_state ==
- TRIO_PCIE_INTFC_PORT_CONFIG__STRAP_STATE_VAL_AUTO_CONFIG_ENDPOINT_G1)
- pcie_ports[trio_index][mac].allow_ep = 1;
-
- continue;
- }
-
- /*
* Check for PCIe link-up status to decide if we need
* to force the link to come up.
*/
--
1.8.3.1

2013-08-05 20:42:52

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH 07/20] tile PCI RC: support more MSI-X interrupt vectors

To support PCIe devices with a higher number of MSI-X interrupt vectors,
e.g. 16 for the LSI RAID card, enhance the Gx RC stack to provide more
MSI-X vectors by using the TRIO Scatter Queues, which provide 8 more
vectors in addition to ~10 from the Map Mem regions.
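
In outline (a condensed sketch of the pci_gx.c change below, with the
error handling omitted), the MSI target allocation becomes:

        /* Prefer a TRIO scatter-queue for the MSI write target; if
         * none are left, fall back to a map-mem interrupt region. */
        mem_map = gxio_trio_alloc_scatter_queues(trio_context, 1, 0, 0);
        if (mem_map < 0)
                mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);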

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/gxio/iorpc_trio.c | 23 +++++++++++++++
arch/tile/include/arch/trio.h | 39 ++++++++++++++++++++++++
arch/tile/include/gxio/iorpc_trio.h | 5 ++++
arch/tile/kernel/pci_gx.c | 59 ++++++++++++++++++++++++-------------
4 files changed, 106 insertions(+), 20 deletions(-)

diff --git a/arch/tile/gxio/iorpc_trio.c b/arch/tile/gxio/iorpc_trio.c
index cef4b22..da6e18e 100644
--- a/arch/tile/gxio/iorpc_trio.c
+++ b/arch/tile/gxio/iorpc_trio.c
@@ -61,6 +61,29 @@ int gxio_trio_alloc_memory_maps(gxio_trio_context_t * context,

EXPORT_SYMBOL(gxio_trio_alloc_memory_maps);

+struct alloc_scatter_queues_param {
+ unsigned int count;
+ unsigned int first;
+ unsigned int flags;
+};
+
+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t * context,
+ unsigned int count, unsigned int first,
+ unsigned int flags)
+{
+ struct alloc_scatter_queues_param temp;
+ struct alloc_scatter_queues_param *params = &temp;
+
+ params->count = count;
+ params->first = first;
+ params->flags = flags;
+
+ return hv_dev_pwrite(context->fd, 0, (HV_VirtAddr) params,
+ sizeof(*params),
+ GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES);
+}
+
+EXPORT_SYMBOL(gxio_trio_alloc_scatter_queues);

struct alloc_pio_regions_param {
unsigned int count;
diff --git a/arch/tile/include/arch/trio.h b/arch/tile/include/arch/trio.h
index d3000a8..c0ddedc 100644
--- a/arch/tile/include/arch/trio.h
+++ b/arch/tile/include/arch/trio.h
@@ -23,6 +23,45 @@
#ifndef __ASSEMBLER__

/*
+ * Map SQ Doorbell Format.
+ * This describes the format of the write-only doorbell register that exists
+ * in the last 8-bytes of the MAP_SQ_BASE/LIM range. This register is only
+ * writable from PCIe space. Writes to this register will not be written to
+ * Tile memory space and thus no IO VA translation is required if the last
+ * page of the BASE/LIM range is not otherwise written.
+ */
+
+__extension__
+typedef union
+{
+ struct
+ {
+#ifndef __BIG_ENDIAN__
+ /*
+ * When written with a 1, the associated MAP_SQ region's doorbell
+ * interrupt will be triggered once all previous writes are visible to
+ * Tile software.
+ */
+ uint_reg_t doorbell : 1;
+ /*
+ * When written with a 1, the descriptor at the head of the associated
+ * MAP_SQ's FIFO will be dequeued.
+ */
+ uint_reg_t pop : 1;
+ /* Reserved. */
+ uint_reg_t __reserved : 62;
+#else /* __BIG_ENDIAN__ */
+ uint_reg_t __reserved : 62;
+ uint_reg_t pop : 1;
+ uint_reg_t doorbell : 1;
+#endif
+ };
+
+ uint_reg_t word;
+} TRIO_MAP_SQ_DOORBELL_FMT_t;
+
+
+/*
* Tile PIO Region Configuration - CFG Address Format.
* This register describes the address format for PIO accesses when the
* associated region is setup with TYPE=CFG.
diff --git a/arch/tile/include/gxio/iorpc_trio.h b/arch/tile/include/gxio/iorpc_trio.h
index 58105c3..d95b96f 100644
--- a/arch/tile/include/gxio/iorpc_trio.h
+++ b/arch/tile/include/gxio/iorpc_trio.h
@@ -30,6 +30,7 @@

#define GXIO_TRIO_OP_ALLOC_MEMORY_MAPS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1404)

+#define GXIO_TRIO_OP_ALLOC_SCATTER_QUEUES IORPC_OPCODE(IORPC_FORMAT_NONE, 0x140e)
#define GXIO_TRIO_OP_ALLOC_PIO_REGIONS IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1412)

#define GXIO_TRIO_OP_INIT_PIO_REGION_AUX IORPC_OPCODE(IORPC_FORMAT_NONE, 0x1414)
@@ -54,6 +55,10 @@ int gxio_trio_alloc_memory_maps(gxio_trio_context_t * context,
unsigned int flags);


+int gxio_trio_alloc_scatter_queues(gxio_trio_context_t * context,
+ unsigned int count, unsigned int first,
+ unsigned int flags);
+
int gxio_trio_alloc_pio_regions(gxio_trio_context_t * context,
unsigned int count, unsigned int first,
unsigned int flags);
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 2cc3e64..e0d6664 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -1474,32 +1474,55 @@ int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
trio_context = controller->trio;

/*
- * Allocate the Mem-Map that will accept the MSI write and
- * trigger the TILE-side interrupts.
+ * Allocate a scatter-queue that will accept the MSI write and
+ * trigger the TILE-side interrupts. We use the scatter-queue regions
+ * before the mem map regions, because the latter are needed by more
+ * applications.
*/
- mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
- if (mem_map < 0) {
- dev_printk(KERN_INFO, &pdev->dev,
- "%s Mem-Map alloc failure. "
- "Failed to initialize MSI interrupts. "
- "Falling back to legacy interrupts.\n",
- desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+ mem_map = gxio_trio_alloc_scatter_queues(trio_context, 1, 0, 0);
+ if (mem_map >= 0) {
+ TRIO_MAP_SQ_DOORBELL_FMT_t doorbell_template = {{
+ .pop = 0,
+ .doorbell = 1,
+ }};
+
+ mem_map += TRIO_NUM_MAP_MEM_REGIONS;
+ mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+ mem_map * MEM_MAP_INTR_REGION_SIZE;
+ mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+ msi_addr = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 8;
+ msg.data = (unsigned int)doorbell_template.word;
+ } else {
+ /* SQ regions are out, allocate from map mem regions. */
+ mem_map = gxio_trio_alloc_memory_maps(trio_context, 1, 0, 0);
+ if (mem_map < 0) {
+ dev_printk(KERN_INFO, &pdev->dev,
+ "%s Mem-Map alloc failure. "
+ "Failed to initialize MSI interrupts. "
+ "Falling back to legacy interrupts.\n",
+ desc->msi_attrib.is_msix ? "MSI-X" : "MSI");
+ ret = -ENOMEM;
+ goto msi_mem_map_alloc_failure;
+ }

- ret = -ENOMEM;
- goto msi_mem_map_alloc_failure;
+ mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
+ mem_map * MEM_MAP_INTR_REGION_SIZE;
+ mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
+
+ msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 -
+ TRIO_MAP_MEM_REG_INT0;
+
+ msg.data = mem_map;
}

/* We try to distribute different IRQs to different tiles. */
cpu = tile_irq_cpu(irq);

/*
- * Now call up to the HV to configure the Mem-Map interrupt and
+ * Now call up to the HV to configure the MSI interrupt and
* set up the IPI binding.
*/
- mem_map_base = MEM_MAP_INTR_REGIONS_BASE +
- mem_map * MEM_MAP_INTR_REGION_SIZE;
- mem_map_limit = mem_map_base + MEM_MAP_INTR_REGION_SIZE - 1;
-
ret = gxio_trio_config_msi_intr(trio_context, cpu_x(cpu), cpu_y(cpu),
KERNEL_PL, irq, controller->mac,
mem_map, mem_map_base, mem_map_limit,
@@ -1512,13 +1535,9 @@ int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)

irq_set_msi_desc(irq, desc);

- msi_addr = mem_map_base + TRIO_MAP_MEM_REG_INT3 - TRIO_MAP_MEM_REG_INT0;
-
msg.address_hi = msi_addr >> 32;
msg.address_lo = msi_addr & 0xffffffff;

- msg.data = mem_map;
-
write_msi_msg(irq, &msg);
irq_set_chip_and_handler(irq, &tilegx_msi_chip, handle_level_irq);
irq_set_handler_data(irq, controller);
--
1.8.3.1

2013-08-05 20:48:56

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH 05/20] tile PCI RC: handle case that PCI link is already up

If we are rebooting (e.g. via kexec), then the PCI RC link may
already be up. In that case, we don't want to do the software
fixup to force the link up, since that can degrade it to Gen1.
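
The essence of the change, condensed from the diff below:

        /* Only force the link up if the data-link layer is not
         * already up (e.g. across a kexec reboot). */
        port_status.word = __gxio_mmio_read(trio_context->mmio_base_mac +
                                            reg_offset);
        if (!port_status.dl_up)
                ret = gxio_trio_force_rc_link_up(trio_context, mac);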

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci_gx.c | 41 ++++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index e99809e..2cc3e64 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -674,10 +674,33 @@ int __init pcibios_init(void)
continue;
}

- ret = gxio_trio_force_rc_link_up(trio_context, mac);
- if (ret < 0)
- pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
- "MAC %d on TRIO %d\n", mac, trio_index);
+ /*
+ * Check for PCIe link-up status to decide if we need
+ * to force the link to come up.
+ */
+ reg_offset =
+ (TRIO_PCIE_INTFC_PORT_STATUS <<
+ TRIO_CFG_REGION_ADDR__REG_SHIFT) |
+ (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
+ TRIO_CFG_REGION_ADDR__INTFC_SHIFT) |
+ (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
+
+ port_status.word =
+ __gxio_mmio_read(trio_context->mmio_base_mac +
+ reg_offset);
+ if (!port_status.dl_up) {
+ if (rc_delay[trio_index][mac]) {
+ pr_info("Delaying PCIe RC TRIO init %d sec"
+ " on MAC %d on TRIO %d\n",
+ rc_delay[trio_index][mac], mac,
+ trio_index);
+ msleep(rc_delay[trio_index][mac] * 1000);
+ }
+ ret = gxio_trio_force_rc_link_up(trio_context, mac);
+ if (ret < 0)
+ pr_err("PCI: PCIE_FORCE_LINK_UP failure, "
+ "MAC %d on TRIO %d\n", mac, trio_index);
+ }

pr_info("PCI: Found PCI controller #%d on TRIO %d MAC %d\n", i,
trio_index, controller->mac);
@@ -700,16 +723,8 @@ int __init pcibios_init(void)
}

/*
- * Check for PCIe link-up status.
+ * Check for PCIe link-up status again.
*/
-
- reg_offset =
- (TRIO_PCIE_INTFC_PORT_STATUS <<
- TRIO_CFG_REGION_ADDR__REG_SHIFT) |
- (TRIO_CFG_REGION_ADDR__INTFC_VAL_MAC_INTERFACE <<
- TRIO_CFG_REGION_ADDR__INTFC_SHIFT ) |
- (mac << TRIO_CFG_REGION_ADDR__MAC_SEL_SHIFT);
-
port_status.word =
__gxio_mmio_read(trio_context->mmio_base_mac +
reg_offset);
--
1.8.3.1

2013-08-05 20:49:17

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH 03/20] tile PCI RC: support pci=off boot arg for tilepro

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/kernel/pci.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index 1dae3b2..af75835 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -51,6 +51,8 @@
*
*/

+static int pci_probe = 1;
+
/*
* This flag tells if the platform is TILEmpower that needs
* special configuration for the PLX switch chip.
@@ -143,6 +145,11 @@ int __init tile_pci_init(void)
{
int i;

+ if (!pci_probe) {
+ pr_info("PCI: disabled by boot argument\n");
+ return 0;
+ }
+
pr_info("PCI: Searching for controllers...\n");

/* Re-init number of PCIe controllers to support hot-plug feature. */
@@ -378,6 +385,16 @@ void pcibios_set_master(struct pci_dev *dev)
/* No special bus mastering setup handling. */
}

+/* Process any "pci=" kernel boot arguments. */
+char * __init pcibios_setup(char *str)
+{
+ if (!strcmp(str, "off")) {
+ pci_probe = 0;
+ return NULL;
+ }
+ return str;
+}
+
/*
* Enable memory and/or address decoding, as appropriate, for the
* device described by the 'dev' struct.
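
(Usage note: with this change, booting with "pci=off" on the kernel
command line makes tile_pci_init() return early with "PCI: disabled by
boot argument"; the generic PCI layer hands each "pci=" option string
to pcibios_setup(), and returning NULL marks the option as consumed.)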
--
1.8.3.1

2013-08-05 20:53:14

by Konrad Rzeszutek Wilk

[permalink] [raw]
Subject: Re: [PATCH 06/20] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

On Mon, Aug 05, 2013 at 04:06:20PM -0400, Chris Metcalf wrote:
> The LSI MEGARAID SAS HBA suffers from the problem where it can do
> 64-bit DMA to streaming buffers but not to consistent buffers.
> In other words, 64-bit DMA is used for disk data transfers and 32-bit
> DMA must be used for control message transfers. According to LSI,
> the firmware is not fully functional yet. This change implements a
> kind of hybrid dma_ops to support this.

If this is generic to LSI MegaRAID HBA shouldn't this change also be
done for the other platforms?

Or perhaps some other changes to make it work with other devices?

>
> Signed-off-by: Chris Metcalf <[email protected]>
> ---
> arch/tile/include/asm/dma-mapping.h | 4 ++--
> arch/tile/kernel/pci-dma.c | 17 +++++++++++------
> 2 files changed, 13 insertions(+), 8 deletions(-)
>
> diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
> index f2ff191..6da0540 100644
> --- a/arch/tile/include/asm/dma-mapping.h
> +++ b/arch/tile/include/asm/dma-mapping.h
> @@ -44,12 +44,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
>
> static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
> {
> - return paddr + get_dma_offset(dev);
> + return paddr;
> }
>
> static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
> {
> - return daddr - get_dma_offset(dev);
> + return daddr;
> }
>
> static inline void dma_mark_clean(void *addr, size_t size) {}
> diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
> index b9fe80e..7e98371 100644
> --- a/arch/tile/kernel/pci-dma.c
> +++ b/arch/tile/kernel/pci-dma.c
> @@ -357,7 +357,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,
>
> addr = page_to_phys(pg);
>
> - *dma_handle = phys_to_dma(dev, addr);
> + *dma_handle = addr + get_dma_offset(dev);
>
> return page_address(pg);
> }
> @@ -387,7 +387,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
> sg->dma_address = sg_phys(sg);
> __dma_prep_pa_range(sg->dma_address, sg->length, direction);
>
> - sg->dma_address = phys_to_dma(dev, sg->dma_address);
> + sg->dma_address = sg->dma_address + get_dma_offset(dev);
> #ifdef CONFIG_NEED_SG_DMA_LENGTH
> sg->dma_length = sg->length;
> #endif
> @@ -422,7 +422,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
> BUG_ON(offset + size > PAGE_SIZE);
> __dma_prep_page(page, offset, size, direction);
>
> - return phys_to_dma(dev, page_to_pa(page) + offset);
> + return page_to_pa(page) + offset + get_dma_offset(dev);
> }
>
> static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
> @@ -432,7 +432,7 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
> {
> BUG_ON(!valid_dma_direction(direction));
>
> - dma_address = dma_to_phys(dev, dma_address);
> + dma_address -= get_dma_offset(dev);
>
> __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
> dma_address & PAGE_OFFSET, size, direction);
> @@ -445,7 +445,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
> {
> BUG_ON(!valid_dma_direction(direction));
>
> - dma_handle = dma_to_phys(dev, dma_handle);
> + dma_handle -= get_dma_offset(dev);
>
> __dma_complete_pa_range(dma_handle, size, direction);
> }
> @@ -456,7 +456,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
> enum dma_data_direction
> direction)
> {
> - dma_handle = dma_to_phys(dev, dma_handle);
> + dma_handle -= get_dma_offset(dev);
>
> __dma_prep_pa_range(dma_handle, size, direction);
> }
> @@ -573,6 +573,11 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
> if (((dma_ops == gx_pci_dma_map_ops) ||
> (dma_ops == gx_legacy_pci_dma_map_ops)) &&
> (mask <= DMA_BIT_MASK(32))) {
> + if (dma_ops == gx_pci_dma_map_ops) {
> + dma_ops->alloc = tile_swiotlb_alloc_coherent;
> + dma_ops->free = tile_swiotlb_free_coherent;
> + }
> +

So.. that would change it for all of the devices on the host, not just
for this specific one. That is not good is it?

> if (mask > dev->archdata.max_direct_dma_addr)
> mask = dev->archdata.max_direct_dma_addr;
> }
> --
> 1.8.3.1
>

2013-08-06 17:00:10

by Chris Metcalf

[permalink] [raw]
Subject: Re: [PATCH 06/20] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

On 8/5/2013 4:52 PM, Konrad Rzeszutek Wilk wrote:
> On Mon, Aug 05, 2013 at 04:06:20PM -0400, Chris Metcalf wrote:
>> The LSI MEGARAID SAS HBA suffers from the problem where it can do
>> 64-bit DMA to streaming buffers but not to consistent buffers.
>> In other words, 64-bit DMA is used for disk data transfers and 32-bit
>> DMA must be used for control message transfers. According to LSI,
>> the firmware is not fully functional yet. This change implements a
>> kind of hybrid dma_ops to support this.
> If this is generic to LSI MegaRAID HBA shouldn't this change also be
> done for the other platforms?
>
> Or perhaps some other changes to make it work with other devices?

No, it's really specific to how tilegx maps the 64-bit PCI space, 32-bit PCI space, and memory. I'll add more commentary to the git commit message to make this clearer.

>> @@ -573,6 +573,11 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
>> if (((dma_ops == gx_pci_dma_map_ops) ||
>> (dma_ops == gx_legacy_pci_dma_map_ops)) &&
>> (mask <= DMA_BIT_MASK(32))) {
>> + if (dma_ops == gx_pci_dma_map_ops) {
>> + dma_ops->alloc = tile_swiotlb_alloc_coherent;
>> + dma_ops->free = tile_swiotlb_free_coherent;
>> + }
>> +
> So.. that would change it for all of the devices on the host, not just
> for this specific one. That is not good is it?

Yes, good point; doing it this way might have some performance impact on other devices on the same host. See the following email for a revised change that updates the dma_ops pointer for that single device instead.

--
Chris Metcalf, Tilera Corp.
http://www.tilera.com

2013-08-06 17:02:27

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

The LSI MEGARAID SAS HBA suffers from the problem where it can do
64-bit DMA to streaming buffers but not to consistent buffers.
In other words, 64-bit DMA is used for disk data transfers and 32-bit
DMA must be used for control message transfers. According to LSI,
the firmware is not fully functional yet. This change implements a
kind of hybrid dma_ops to support this.

Note that on most other platforms, the 64-bit DMA addressing space is the
same as the 32-bit DMA space and they overlap the physical memory space.
No special arrangement is needed to support this kind of mixed DMA
capability. On TILE-Gx, the 64-bit DMA space is completely separate
from the 32-bit DMA space. Due to the use of the IOMMU, the 64-bit DMA
space doesn't overlap the physical memory space. On the other hand,
the 32-bit DMA space overlaps the physical memory space under 4GB.
The separate address spaces make it necessary to have separate dma_ops.
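
As a sketch of the driver-visible behavior (standard DMA API calls;
pdev stands for the device in question), a device that streams with
64-bit DMA but needs its consistent allocations below 4GB ends up on
the hybrid ops:

        dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
        /* Switches a gx_pci_dma_map_ops device to the hybrid ops,
         * whose alloc/free go through swiotlb bounce buffers. */
        dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));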

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/dma-mapping.h | 10 +++++++---
arch/tile/kernel/pci-dma.c | 40 ++++++++++++++++++++++++++++---------
2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index f2ff191..4a60059 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -23,6 +23,7 @@
extern struct dma_map_ops *tile_dma_map_ops;
extern struct dma_map_ops *gx_pci_dma_map_ops;
extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;

static inline struct dma_map_ops *get_dma_ops(struct device *dev)
{
@@ -44,12 +45,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)

static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
{
- return paddr + get_dma_offset(dev);
+ return paddr;
}

static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
{
- return daddr - get_dma_offset(dev);
+ return daddr;
}

static inline void dma_mark_clean(void *addr, size_t size) {}
@@ -88,7 +89,10 @@ dma_set_mask(struct device *dev, u64 mask)
struct dma_map_ops *dma_ops = get_dma_ops(dev);

/* Handle legacy PCI devices with limited memory addressability. */
- if ((dma_ops == gx_pci_dma_map_ops) && (mask <= DMA_BIT_MASK(32))) {
+ if ((dma_ops == gx_pci_dma_map_ops ||
+ dma_ops == gx_hybrid_pci_dma_map_ops ||
+ dma_ops == gx_legacy_pci_dma_map_ops) &&
+ (mask <= DMA_BIT_MASK(32))) {
set_dma_ops(dev, gx_legacy_pci_dma_map_ops);
set_dma_offset(dev, 0);
if (mask > dev->archdata.max_direct_dma_addr)
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index b9fe80e..7e22e73 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -357,7 +357,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,

addr = page_to_phys(pg);

- *dma_handle = phys_to_dma(dev, addr);
+ *dma_handle = addr + get_dma_offset(dev);

return page_address(pg);
}
@@ -387,7 +387,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
sg->dma_address = sg_phys(sg);
__dma_prep_pa_range(sg->dma_address, sg->length, direction);

- sg->dma_address = phys_to_dma(dev, sg->dma_address);
+ sg->dma_address = sg->dma_address + get_dma_offset(dev);
#ifdef CONFIG_NEED_SG_DMA_LENGTH
sg->dma_length = sg->length;
#endif
@@ -422,7 +422,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
BUG_ON(offset + size > PAGE_SIZE);
__dma_prep_page(page, offset, size, direction);

- return phys_to_dma(dev, page_to_pa(page) + offset);
+ return page_to_pa(page) + offset + get_dma_offset(dev);
}

static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
@@ -432,7 +432,7 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
{
BUG_ON(!valid_dma_direction(direction));

- dma_address = dma_to_phys(dev, dma_address);
+ dma_address -= get_dma_offset(dev);

__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
dma_address & PAGE_OFFSET, size, direction);
@@ -445,7 +445,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
{
BUG_ON(!valid_dma_direction(direction));

- dma_handle = dma_to_phys(dev, dma_handle);
+ dma_handle -= get_dma_offset(dev);

__dma_complete_pa_range(dma_handle, size, direction);
}
@@ -456,7 +456,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
enum dma_data_direction
direction)
{
- dma_handle = dma_to_phys(dev, dma_handle);
+ dma_handle -= get_dma_offset(dev);

__dma_prep_pa_range(dma_handle, size, direction);
}
@@ -558,21 +558,43 @@ static struct dma_map_ops pci_swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
};

+static struct dma_map_ops pci_hybrid_dma_ops = {
+ .alloc = tile_swiotlb_alloc_coherent,
+ .free = tile_swiotlb_free_coherent,
+ .map_page = tile_pci_dma_map_page,
+ .unmap_page = tile_pci_dma_unmap_page,
+ .map_sg = tile_pci_dma_map_sg,
+ .unmap_sg = tile_pci_dma_unmap_sg,
+ .sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
+ .sync_single_for_device = tile_pci_dma_sync_single_for_device,
+ .sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
+ .sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
+ .mapping_error = tile_pci_dma_mapping_error,
+ .dma_supported = tile_pci_dma_supported
+};
+
struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
#else
struct dma_map_ops *gx_legacy_pci_dma_map_ops;
+struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
#endif
EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
+EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);

#ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
struct dma_map_ops *dma_ops = get_dma_ops(dev);

- /* Handle legacy PCI devices with limited memory addressability. */
- if (((dma_ops == gx_pci_dma_map_ops) ||
- (dma_ops == gx_legacy_pci_dma_map_ops)) &&
+ /* Handle hybrid PCI devices with limited memory addressability. */
+ if ((dma_ops == gx_pci_dma_map_ops ||
+ dma_ops == gx_hybrid_pci_dma_map_ops ||
+ dma_ops == gx_legacy_pci_dma_map_ops) &&
(mask <= DMA_BIT_MASK(32))) {
+ if (dma_ops == gx_pci_dma_map_ops)
+ set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
+
if (mask > dev->archdata.max_direct_dma_addr)
mask = dev->archdata.max_direct_dma_addr;
}
--
1.8.3.1

2013-08-06 17:48:28

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

[+cc Myron, Adam]

On Fri, Aug 2, 2013 at 10:24 AM, Chris Metcalf <[email protected]> wrote:
> The LSI MEGARAID SAS HBA suffers from the problem where it can do
> 64-bit DMA to streaming buffers but not to consistent buffers.
> In other words, 64-bit DMA is used for disk data transfers and 32-bit
> DMA must be used for control message transfers.

Is this related at all to the make_local_pdev() hacks in megaraid.c?
The intent there seems to be to use a 32-bit DMA mask for certain
transactions and a 64-bit mask for others. But I think it's actually
broken, because the implementation changes the mask to 32-bit for
*all* future transactions, not just the one using make_local_pdev().

> According to LSI,
> the firmware is not fully functional yet. This change implements a
> kind of hybrid dma_ops to support this.
>
> Note that on most other platforms, the 64-bit DMA addressing space is the
> same as the 32-bit DMA space and they overlap the physical memory space.
> No special arrangement is needed to support this kind of mixed DMA
> capability. On TILE-Gx, the 64-bit DMA space is completely separate
> from the 32-bit DMA space.

Help me understand what's going on here. My understanding is that on
typical systems, the 32-bit DMA space is a subset of the 64-bit DMA
space. In conventional PCI, "a master that supports 64-bit addressing
must generate a SAC, instead of a DAC, when the upper 32 bits of the
address are zero" (PCI spec r3.0, sec 3.9). PCIe doesn't have
SAC/DAC, but it has both 32-bit and 64-bit address headers and has a
similar requirement: "For Addresses below 4GB, Requesters must use the
32-bit format" (PCIe spec r3.0, sec 2.2.4.1).

Those imply to me that the 0-4GB region of the 64-bit DMA space must
be identical to the 0-4GB 32-bit DMA space, and in fact, the receiver
of a transaction shouldn't be able to distinguish them.

But it sounds like something's different on TILE-Gx? Does it
translate bus addresses to physical memory addresses based on the type
of the transaction (SAC vs DAC, or 32-bit vs 64-bit header) in
addition to the address? Even if it does, the spec doesn't allow a
DAC cycle or a 64-bit header where the 32 high-order bits are zero, so
it shouldn't matter.

Bjorn

> Due to the use of the IOMMU, the 64-bit DMA
> space doesn't overlap the physical memory space. On the other hand,
> the 32-bit DMA space overlaps the physical memory space under 4GB.
> The separate address spaces make it necessary to have separate dma_ops.
>
> Signed-off-by: Chris Metcalf <[email protected]>
> ---
> arch/tile/include/asm/dma-mapping.h | 10 +++++++---
> arch/tile/kernel/pci-dma.c | 40 ++++++++++++++++++++++++++++---------
> 2 files changed, 38 insertions(+), 12 deletions(-)
>
> diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
> index f2ff191..4a60059 100644
> --- a/arch/tile/include/asm/dma-mapping.h
> +++ b/arch/tile/include/asm/dma-mapping.h
> @@ -23,6 +23,7 @@
> extern struct dma_map_ops *tile_dma_map_ops;
> extern struct dma_map_ops *gx_pci_dma_map_ops;
> extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
> +extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
>
> static inline struct dma_map_ops *get_dma_ops(struct device *dev)
> {
> @@ -44,12 +45,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
>
> static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
> {
> - return paddr + get_dma_offset(dev);
> + return paddr;
> }
>
> static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
> {
> - return daddr - get_dma_offset(dev);
> + return daddr;
> }
>
> static inline void dma_mark_clean(void *addr, size_t size) {}
> @@ -88,7 +89,10 @@ dma_set_mask(struct device *dev, u64 mask)
> struct dma_map_ops *dma_ops = get_dma_ops(dev);
>
> /* Handle legacy PCI devices with limited memory addressability. */
> - if ((dma_ops == gx_pci_dma_map_ops) && (mask <= DMA_BIT_MASK(32))) {
> + if ((dma_ops == gx_pci_dma_map_ops ||
> + dma_ops == gx_hybrid_pci_dma_map_ops ||
> + dma_ops == gx_legacy_pci_dma_map_ops) &&
> + (mask <= DMA_BIT_MASK(32))) {
> set_dma_ops(dev, gx_legacy_pci_dma_map_ops);
> set_dma_offset(dev, 0);
> if (mask > dev->archdata.max_direct_dma_addr)
> diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
> index b9fe80e..7e22e73 100644
> --- a/arch/tile/kernel/pci-dma.c
> +++ b/arch/tile/kernel/pci-dma.c
> @@ -357,7 +357,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,
>
> addr = page_to_phys(pg);
>
> - *dma_handle = phys_to_dma(dev, addr);
> + *dma_handle = addr + get_dma_offset(dev);
>
> return page_address(pg);
> }
> @@ -387,7 +387,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
> sg->dma_address = sg_phys(sg);
> __dma_prep_pa_range(sg->dma_address, sg->length, direction);
>
> - sg->dma_address = phys_to_dma(dev, sg->dma_address);
> + sg->dma_address = sg->dma_address + get_dma_offset(dev);
> #ifdef CONFIG_NEED_SG_DMA_LENGTH
> sg->dma_length = sg->length;
> #endif
> @@ -422,7 +422,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
> BUG_ON(offset + size > PAGE_SIZE);
> __dma_prep_page(page, offset, size, direction);
>
> - return phys_to_dma(dev, page_to_pa(page) + offset);
> + return page_to_pa(page) + offset + get_dma_offset(dev);
> }
>
> static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
> @@ -432,7 +432,7 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
> {
> BUG_ON(!valid_dma_direction(direction));
>
> - dma_address = dma_to_phys(dev, dma_address);
> + dma_address -= get_dma_offset(dev);
>
> __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
> dma_address & PAGE_OFFSET, size, direction);
> @@ -445,7 +445,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
> {
> BUG_ON(!valid_dma_direction(direction));
>
> - dma_handle = dma_to_phys(dev, dma_handle);
> + dma_handle -= get_dma_offset(dev);
>
> __dma_complete_pa_range(dma_handle, size, direction);
> }
> @@ -456,7 +456,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
> enum dma_data_direction
> direction)
> {
> - dma_handle = dma_to_phys(dev, dma_handle);
> + dma_handle -= get_dma_offset(dev);
>
> __dma_prep_pa_range(dma_handle, size, direction);
> }
> @@ -558,21 +558,43 @@ static struct dma_map_ops pci_swiotlb_dma_ops = {
> .mapping_error = swiotlb_dma_mapping_error,
> };
>
> +static struct dma_map_ops pci_hybrid_dma_ops = {
> + .alloc = tile_swiotlb_alloc_coherent,
> + .free = tile_swiotlb_free_coherent,
> + .map_page = tile_pci_dma_map_page,
> + .unmap_page = tile_pci_dma_unmap_page,
> + .map_sg = tile_pci_dma_map_sg,
> + .unmap_sg = tile_pci_dma_unmap_sg,
> + .sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
> + .sync_single_for_device = tile_pci_dma_sync_single_for_device,
> + .sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
> + .sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
> + .mapping_error = tile_pci_dma_mapping_error,
> + .dma_supported = tile_pci_dma_supported
> +};
> +
> struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
> +struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
> #else
> struct dma_map_ops *gx_legacy_pci_dma_map_ops;
> +struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
> #endif
> EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
> +EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);
>
> #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
> int dma_set_coherent_mask(struct device *dev, u64 mask)
> {
> struct dma_map_ops *dma_ops = get_dma_ops(dev);
>
> - /* Handle legacy PCI devices with limited memory addressability. */
> - if (((dma_ops == gx_pci_dma_map_ops) ||
> - (dma_ops == gx_legacy_pci_dma_map_ops)) &&
> + /* Handle hybrid PCI devices with limited memory addressability. */
> + if ((dma_ops == gx_pci_dma_map_ops ||
> + dma_ops == gx_hybrid_pci_dma_map_ops ||
> + dma_ops == gx_legacy_pci_dma_map_ops) &&
> (mask <= DMA_BIT_MASK(32))) {
> + if (dma_ops == gx_pci_dma_map_ops)
> + set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
> +
> if (mask > dev->archdata.max_direct_dma_addr)
> mask = dev->archdata.max_direct_dma_addr;
> }
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2013-08-09 22:42:34

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

On Thu, Aug 08, 2013 at 12:47:10PM -0400, Chris Metcalf wrote:
> On 8/6/2013 1:48 PM, Bjorn Helgaas wrote:
> > [+cc Myron, Adam]
> >
> > On Fri, Aug 2, 2013 at 10:24 AM, Chris Metcalf <[email protected]> wrote:
> >> According to LSI,
> >> the firmware is not fully functional yet. This change implements a
> >> kind of hybrid dma_ops to support this.
> >>
> >> Note that on most other platforms, the 64-bit DMA addressing space is the
> >> same as the 32-bit DMA space and they overlap the physical memory space.
> >> No special arrangement is needed to support this kind of mixed DMA
> >> capability. On TILE-Gx, the 64-bit DMA space is completely separate
> >> from the 32-bit DMA space.
> > Help me understand what's going on here. My understanding is that on
> > typical systems, the 32-bit DMA space is a subset of the 64-bit DMA
> > space. In conventional PCI, "a master that supports 64-bit addressing
> > must generate a SAC, instead of a DAC, when the upper 32 bits of the
> > address are zero" (PCI spec r3.0, sec 3.9). PCIe doesn't have
> > SAC/DAC, but it has both 32-bit and 64-bit address headers and has a
> > similar requirement: "For Addresses below 4GB, Requesters must use the
> > 32-bit format" (PCIe spec r3.0, sec 2.2.4.1).
> >
> > Those imply to me that the 0-4GB region of the 64-bit DMA space must
> > be identical to the 0-4GB 32-bit DMA space, and in fact, the receiver
> > of a transaction shouldn't be able to distinguish them.
> >
> > But it sounds like something's different on TILE-Gx? Does it
> > translate bus addresses to physical memory addresses based on the type
> > of the transaction (SAC vs DAC, or 32-bit vs 64-bit header) in
> > addition to the address? Even if it does, the spec doesn't allow a
> > DAC cycle or a 64-bit header where the 32 high-order bits are zero, so
> > it shouldn't matter.
>
> No, we don't translate based on the type of the transaction. Using
> "DMA space" in the commit message was probably misleading. What's
> really going on is different DMA windows. 32-bit DMA has the
> obvious naive implementation where [0,4GB] in DMA space maps to
> [0,4GB] in PA space. However, for 64-bit DMA, we use DMA
> addresses with a non-zero high 32 bits, in the [1TB,2TB] range,
> but map the results down to PA [0,1TB] using our IOMMU.

I guess this means devices can DMA to physical addresses [0,3GB]
using either 32-bit bus addresses in the [0,3GB] range or 64-bit bus
addresses in the [1TB,1TB+3GB] range, right?

> We did consider having the 64-bit DMA window be [0,1TB] and map
> directly to PA space, like the 32-bit window. But this design
> suffers from the “PCI hole” problem. Basically, the BAR space is
> usually under 4GB (it occupies the range [3GB, 4GB] on tilegx) and
> the host bridge uses negative decoding in passing DMA traffic
> upstream. That is, DMA traffic with a target address in [3GB, 4GB]
> is not passed to the host memory. This means that the same amount
> of physical memory as the BAR space cannot be used for DMA
> purposes. And because it is not easy to avoid this region in
> allocating DMA memory, the kernel is simply told to not use this
> chunk of PA at all, so it is wasted.

OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
as you describe. And even if DMA *could* reach it, the CPU couldn't
see it because CPU accesses to that range would go to PCI for the
memory-mapped BAR space, not to memory.

But I can't figure out why Tile needs to do something special. I
think other arches handle the PCI hole for MMIO space the same way.

I don't know if other arches alias the [0,3GB] physical address
range in both 32-bit and 64-bit DMA space like you do, but if that's
part of the problem, it seems like you could easily avoid the
aliasing by making the 64-bit DMA space [1TB+4GB,2TB] instead of
[1TB,2TB].

> For the LSI device, the way we manage it is to ensure that the
> device’s streaming buffers and the consistent buffers come from
> different pools, with the latter using the under-4GB bounce
> buffers. Obviously, normal devices use the same buffer pool for
> both streaming and consistent, either under 4GB or the whole PA.

It seems like you could make your DMA space be the union of [0,3GB]
and [1TB+4GB,2TB], then use pci_set_dma_mask(dev, DMA_BIT_MASK(64))
and pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)) (I assume the
driver already sets those masks correctly if it works on other
arches).
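
For concreteness, that mask setup with the standard PCI DMA API of
this era (pdev being the device in question) would be:

        pci_set_dma_mask(pdev, DMA_BIT_MASK(64));            /* streaming */
        pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); /* consistent */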

> Given all of that, does this change make sense? I can certainly
> amend the commit description to include more commentary.

Obviously, I'm missing something. I guess it really doesn't matter
because this is all arch code and I don't need to understand it, but
it does niggle at me somehow.

Bjorn

> >> Due to the use of the IOMMU, the 64-bit DMA
> >> space doesn't overlap the physical memory space. On the other hand,
> >> the 32-bit DMA space overlaps the physical memory space under 4GB.
> >> The separate address spaces make it necessary to have separate dma_ops.
> >>
> >> Signed-off-by: Chris Metcalf <[email protected]>
> >> ---
> >> arch/tile/include/asm/dma-mapping.h | 10 +++++++---
> >> arch/tile/kernel/pci-dma.c | 40 ++++++++++++++++++++++++++++---------
> >> 2 files changed, 38 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
> >> index f2ff191..4a60059 100644
> >> --- a/arch/tile/include/asm/dma-mapping.h
> >> +++ b/arch/tile/include/asm/dma-mapping.h
> >> @@ -23,6 +23,7 @@
> >> extern struct dma_map_ops *tile_dma_map_ops;
> >> extern struct dma_map_ops *gx_pci_dma_map_ops;
> >> extern struct dma_map_ops *gx_legacy_pci_dma_map_ops;
> >> +extern struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
> >>
> >> static inline struct dma_map_ops *get_dma_ops(struct device *dev)
> >> {
> >> @@ -44,12 +45,12 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
> >>
> >> static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
> >> {
> >> - return paddr + get_dma_offset(dev);
> >> + return paddr;
> >> }
> >>
> >> static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
> >> {
> >> - return daddr - get_dma_offset(dev);
> >> + return daddr;
> >> }
> >>
> >> static inline void dma_mark_clean(void *addr, size_t size) {}
> >> @@ -88,7 +89,10 @@ dma_set_mask(struct device *dev, u64 mask)
> >> struct dma_map_ops *dma_ops = get_dma_ops(dev);
> >>
> >> /* Handle legacy PCI devices with limited memory addressability. */
> >> - if ((dma_ops == gx_pci_dma_map_ops) && (mask <= DMA_BIT_MASK(32))) {
> >> + if ((dma_ops == gx_pci_dma_map_ops ||
> >> + dma_ops == gx_hybrid_pci_dma_map_ops ||
> >> + dma_ops == gx_legacy_pci_dma_map_ops) &&
> >> + (mask <= DMA_BIT_MASK(32))) {
> >> set_dma_ops(dev, gx_legacy_pci_dma_map_ops);
> >> set_dma_offset(dev, 0);
> >> if (mask > dev->archdata.max_direct_dma_addr)
> >> diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
> >> index b9fe80e..7e22e73 100644
> >> --- a/arch/tile/kernel/pci-dma.c
> >> +++ b/arch/tile/kernel/pci-dma.c
> >> @@ -357,7 +357,7 @@ static void *tile_pci_dma_alloc_coherent(struct device *dev, size_t size,
> >>
> >> addr = page_to_phys(pg);
> >>
> >> - *dma_handle = phys_to_dma(dev, addr);
> >> + *dma_handle = addr + get_dma_offset(dev);
> >>
> >> return page_address(pg);
> >> }
> >> @@ -387,7 +387,7 @@ static int tile_pci_dma_map_sg(struct device *dev, struct scatterlist *sglist,
> >> sg->dma_address = sg_phys(sg);
> >> __dma_prep_pa_range(sg->dma_address, sg->length, direction);
> >>
> >> - sg->dma_address = phys_to_dma(dev, sg->dma_address);
> >> + sg->dma_address = sg->dma_address + get_dma_offset(dev);
> >> #ifdef CONFIG_NEED_SG_DMA_LENGTH
> >> sg->dma_length = sg->length;
> >> #endif
> >> @@ -422,7 +422,7 @@ static dma_addr_t tile_pci_dma_map_page(struct device *dev, struct page *page,
> >> BUG_ON(offset + size > PAGE_SIZE);
> >> __dma_prep_page(page, offset, size, direction);
> >>
> >> - return phys_to_dma(dev, page_to_pa(page) + offset);
> >> + return page_to_pa(page) + offset + get_dma_offset(dev);
> >> }
> >>
> >> static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
> >> @@ -432,7 +432,7 @@ static void tile_pci_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
> >> {
> >> BUG_ON(!valid_dma_direction(direction));
> >>
> >> - dma_address = dma_to_phys(dev, dma_address);
> >> + dma_address -= get_dma_offset(dev);
> >>
> >> __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
> >> dma_address & PAGE_OFFSET, size, direction);
> >> @@ -445,7 +445,7 @@ static void tile_pci_dma_sync_single_for_cpu(struct device *dev,
> >> {
> >> BUG_ON(!valid_dma_direction(direction));
> >>
> >> - dma_handle = dma_to_phys(dev, dma_handle);
> >> + dma_handle -= get_dma_offset(dev);
> >>
> >> __dma_complete_pa_range(dma_handle, size, direction);
> >> }
> >> @@ -456,7 +456,7 @@ static void tile_pci_dma_sync_single_for_device(struct device *dev,
> >> enum dma_data_direction
> >> direction)
> >> {
> >> - dma_handle = dma_to_phys(dev, dma_handle);
> >> + dma_handle -= get_dma_offset(dev);
> >>
> >> __dma_prep_pa_range(dma_handle, size, direction);
> >> }
> >> @@ -558,21 +558,43 @@ static struct dma_map_ops pci_swiotlb_dma_ops = {
> >> .mapping_error = swiotlb_dma_mapping_error,
> >> };
> >>
> >> +static struct dma_map_ops pci_hybrid_dma_ops = {
> >> + .alloc = tile_swiotlb_alloc_coherent,
> >> + .free = tile_swiotlb_free_coherent,
> >> + .map_page = tile_pci_dma_map_page,
> >> + .unmap_page = tile_pci_dma_unmap_page,
> >> + .map_sg = tile_pci_dma_map_sg,
> >> + .unmap_sg = tile_pci_dma_unmap_sg,
> >> + .sync_single_for_cpu = tile_pci_dma_sync_single_for_cpu,
> >> + .sync_single_for_device = tile_pci_dma_sync_single_for_device,
> >> + .sync_sg_for_cpu = tile_pci_dma_sync_sg_for_cpu,
> >> + .sync_sg_for_device = tile_pci_dma_sync_sg_for_device,
> >> + .mapping_error = tile_pci_dma_mapping_error,
> >> + .dma_supported = tile_pci_dma_supported
> >> +};
> >> +
> >> struct dma_map_ops *gx_legacy_pci_dma_map_ops = &pci_swiotlb_dma_ops;
> >> +struct dma_map_ops *gx_hybrid_pci_dma_map_ops = &pci_hybrid_dma_ops;
> >> #else
> >> struct dma_map_ops *gx_legacy_pci_dma_map_ops;
> >> +struct dma_map_ops *gx_hybrid_pci_dma_map_ops;
> >> #endif
> >> EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops);
> >> +EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops);
> >>
> >> #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
> >> int dma_set_coherent_mask(struct device *dev, u64 mask)
> >> {
> >> struct dma_map_ops *dma_ops = get_dma_ops(dev);
> >>
> >> - /* Handle legacy PCI devices with limited memory addressability. */
> >> - if (((dma_ops == gx_pci_dma_map_ops) ||
> >> - (dma_ops == gx_legacy_pci_dma_map_ops)) &&
> >> + /* Handle hybrid PCI devices with limited memory addressability. */
> >> + if ((dma_ops == gx_pci_dma_map_ops ||
> >> + dma_ops == gx_hybrid_pci_dma_map_ops ||
> >> + dma_ops == gx_legacy_pci_dma_map_ops) &&
> >> (mask <= DMA_BIT_MASK(32))) {
> >> + if (dma_ops == gx_pci_dma_map_ops)
> >> + set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
> >> +
> >> if (mask > dev->archdata.max_direct_dma_addr)
> >> mask = dev->archdata.max_direct_dma_addr;
> >> }
> >> --
> >> 1.8.3.1
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> >> the body of a message to [email protected]
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
>

2013-08-12 19:44:13

by Chris Metcalf

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

(Oops, resending without the helpful [SPAM] marker that our
mail system appears to have injected into the subject line.)

On 8/9/2013 6:42 PM, Bjorn Helgaas wrote:
> On Thu, Aug 08, 2013 at 12:47:10PM -0400, Chris Metcalf wrote:
>> On 8/6/2013 1:48 PM, Bjorn Helgaas wrote:
>>> [+cc Myron, Adam]
>>>
>>> On Fri, Aug 2, 2013 at 10:24 AM, Chris Metcalf <[email protected]> wrote:
>>>> According to LSI,
>>>> the firmware is not fully functional yet. This change implements a
>>>> kind of hybrid dma_ops to support this.
>>>>
>>>> Note that on most other platforms, the 64-bit DMA addressing space is the
>>>> same as the 32-bit DMA space and they overlap the physical memory space.
>>>> No special arrangement is needed to support this kind of mixed DMA
>>>> capability. On TILE-Gx, the 64-bit DMA space is completely separate
>>>> from the 32-bit DMA space.
>>> Help me understand what's going on here. My understanding is that on
>>> typical systems, the 32-bit DMA space is a subset of the 64-bit DMA
>>> space. In conventional PCI, "a master that supports 64-bit addressing
>>> must generate a SAC, instead of a DAC, when the upper 32 bits of the
>>> address are zero" (PCI spec r3.0, sec 3.9). PCIe doesn't have
>>> SAC/DAC, but it has both 32-bit and 64-bit address headers and has a
>>> similar requirement: "For Addresses below 4GB, Requesters must use the
>>> 32-bit format" (PCIe spec r3.0, sec 2.2.4.1).
>>>
>>> Those imply to me that the 0-4GB region of the 64-bit DMA space must
>>> be identical to the 0-4GB 32-bit DMA space, and in fact, the receiver
>>> of a transaction shouldn't be able to distinguish them.
>>>
>>> But it sounds like something's different on TILE-Gx? Does it
>>> translate bus addresses to physical memory addresses based on the type
>>> of the transaction (SAC vs DAC, or 32-bit vs 64-bit header) in
>>> addition to the address? Even if it does, the spec doesn't allow a
>>> DAC cycle or a 64-bit header where the 32 high-order bits are zero, so
>>> it shouldn't matter.
>> No, we don't translate based on the type of the transaction. Using
>> "DMA space" in the commit message was probably misleading. What's
>> really going on is different DMA windows. 32-bit DMA has the
>> obvious naive implementation where [0,4GB] in DMA space maps to
>> [0,4GB] in PA space. However, for 64-bit DMA, we use DMA
>> addresses with a non-zero high 32 bits, in the [1TB,2TB] range,
>> but map the results down to PA [0,1TB] using our IOMMU.
> I guess this means devices can DMA to physical addresses [0,3GB]
> using either 32-bit bus addresses in the [0,3GB] range or 64-bit bus
> addresses in the [1TB,1TB+3GB] range, right?

True in general, but not true for any specific individual device.

64-bit capable devices won’t generate 32-bit bus addresses, because
the dma_ops makes sure that only bus/DMA addresses in [1TB,1TB+3GB]
are handed out to the devices.

32-bit-only devices use bus addresses in [0,3GB] to access PA [0,3GB].
PA in [3GB, 4GB] is not accessed by 32-bit-only devices because the
bounce buffers are allocated under the 3GB limit.
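
(For concreteness, here is a minimal sketch of how a streaming map
operation could implement the two windows just described; the helper and
its shape are hypothetical, and only TILE_PCI_MEM_MAP_BASE_OFFSET and the
window layout come from this thread.)

#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>

/*
 * Sketch only, not the actual arch/tile code. 64-bit capable devices
 * get bus addresses in [1TB,2TB]; 32-bit-only devices get an identity
 * mapping of [0,3GB], with swiotlb bouncing anything above it.
 */
static dma_addr_t tile_map_page_sketch(struct device *dev,
				       struct page *page,
				       unsigned long offset, size_t size,
				       enum dma_data_direction dir)
{
	phys_addr_t pa = page_to_phys(page) + offset;

	if (dma_get_mask(dev) > DMA_BIT_MASK(32))
		/* 64-bit window: PA plus the 1TB offset. */
		return pa + TILE_PCI_MEM_MAP_BASE_OFFSET;

	/* 32-bit window: identity map, bouncing pages above 3GB. */
	return swiotlb_map_page(dev, page, offset, size, dir, NULL);
}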

>> We did consider having the 64-bit DMA window be [0,1TB] and map
>> directly to PA space, like the 32-bit window. But this design
>> suffers from the “PCI hole” problem. Basically, the BAR space is
>> usually under 4GB (it occupies the range [3GB, 4GB] on tilegx) and
>> the host bridge uses negative decoding in passing DMA traffic
>> upstream. That is, DMA traffic with target addresses in [3GB, 4GB]
>> is not passed to host memory. This means that the same amount
>> of physical memory as the BAR space cannot be used for DMA
>> purposes. And because it is not easy to avoid this region when
>> allocating DMA memory, the kernel is simply told not to use this
>> chunk of PA at all, so it is wasted.
> OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
> as you describe. And even if DMA *could* reach it, the CPU couldn't
> see it because CPU accesses to that range would go to PCI for the
> memory-mapped BAR space, not to memory.

Right. Unreachability is only a problem if the DMA window overlaps
[3G, 4G], and since the 64-bit DMA window is [1TB,2TB], the whole PA
space can be reached by 64-bit capable devices.

> But I can't figure out why Tile needs to do something special. I
> think other arches handle the PCI hole for MMIO space the same way.
>
> I don't know if other arches alias the [0,3GB] physical address
> range in both 32-bit and 64-bit DMA space like you do, but if that's
> part of the problem, it seems like you could easily avoid the
> aliasing by making the 64-bit DMA space [1TB+4GB,2TB] instead of
> [1TB,2TB].

Perhaps, but since 64-bit capable devices can't actually see the
aliasing (they aren't offered the [0,4GB] address range), they
only see an un-aliased space.

>> For the LSI device, the way we manage it is to ensure that the
>> device’s streaming buffers and the consistent buffers come from
>> different pools, with the latter using the under-4GB bounce
>> buffers. Obviously, normal devices use the same buffer pool for
>> both streaming and consistent, either under 4GB or the whole PA.
> It seems like you could make your DMA space be the union of [0,3GB]
> and [1TB+4GB,2TB], then use pci_set_dma_mask(dev, DMA_BIT_MASK(64))
> and pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)) (I assume the
> driver already sets those masks correctly if it works on other
> arches).

Unfortunately, the Megaraid driver doesn’t even call
pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)).

More generally, the DMA space you propose isn't optimal, because
then the PA in [3GB, 4GB] can’t be reached by 64-bit capable devices.

>> Given all of that, does this change make sense? I can certainly
>> amend the commit description to include more commentary.
> Obviously, I'm missing something. I guess it really doesn't matter
> because this is all arch code and I don't need to understand it, but
> it does niggle at me somehow.

I will add the following comment to <asm/pci.h> in hopes of making it a
bit clearer:

/*
[...]
+ * This design lets us avoid the "PCI hole" problem where the host bridge
+ * won't pass DMA traffic with target addresses that happen to fall within the
+ * BAR space. This enables us to use all the physical memory for DMA, instead
+ * of wasting the same amount of physical memory as the BAR window size.
*/
#define TILE_PCI_MEM_MAP_BASE_OFFSET (1ULL << CHIP_PA_WIDTH())
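
(To make the arithmetic concrete: on a chip where CHIP_PA_WIDTH() is 40,
which the [1TB,2TB] window discussed above implies, the offset works out
to 1TB, so a 64-bit mapping is simply:)

	/*
	 * Assumption for illustration: CHIP_PA_WIDTH() == 40, so the
	 * offset is 1ULL << 40 == 1TB, and PA [0,1TB] maps to bus
	 * addresses [1TB,2TB].
	 */
	dma_addr_t bus = pa + TILE_PCI_MEM_MAP_BASE_OFFSET;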

--
Chris Metcalf, Tilera Corp.
http://www.tilera.com

2013-08-12 20:42:24

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

On Mon, Aug 12, 2013 at 1:42 PM, Chris Metcalf <[email protected]> wrote:
> On 8/9/2013 6:42 PM, Bjorn Helgaas wrote:
>> On Thu, Aug 08, 2013 at 12:47:10PM -0400, Chris Metcalf wrote:
>>> On 8/6/2013 1:48 PM, Bjorn Helgaas wrote:
>>>> [+cc Myron, Adam]
>>>>
>>>> On Fri, Aug 2, 2013 at 10:24 AM, Chris Metcalf <[email protected]> wrote:
>>>>> According to LSI,
>>>>> the firmware is not fully functional yet. This change implements a
>>>>> kind of hybrid dma_ops to support this.
>>>>>
>>>>> Note that on most other platforms, the 64-bit DMA addressing space is the
>>>>> same as the 32-bit DMA space and they overlap the physical memory space.
>>>>> No special arrangement is needed to support this kind of mixed DMA
>>>>> capability. On TILE-Gx, the 64-bit DMA space is completely separate
>>>>> from the 32-bit DMA space.
>>>> Help me understand what's going on here. My understanding is that on
>>>> typical systems, the 32-bit DMA space is a subset of the 64-bit DMA
>>>> space. In conventional PCI, "a master that supports 64-bit addressing
>>>> must generate a SAC, instead of a DAC, when the upper 32 bits of the
>>>> address are zero" (PCI spec r3.0, sec 3.9). PCIe doesn't have
>>>> SAC/DAC, but it has both 32-bit and 64-bit address headers and has a
>>>> similar requirement: "For Addresses below 4GB, Requesters must use the
>>>> 32-bit format" (PCIe spec r3.0, sec 2.2.4.1).
>>>>
>>>> Those imply to me that the 0-4GB region of the 64-bit DMA space must
>>>> be identical to the 0-4GB 32-bit DMA space, and in fact, the receiver
>>>> of a transaction shouldn't be able to distinguish them.
>>>>
>>>> But it sounds like something's different on TILE-Gx? Does it
>>>> translate bus addresses to physical memory addresses based on the type
>>>> of the transaction (SAC vs DAC, or 32-bit vs 64-bit header) in
>>>> addition to the address? Even if it does, the spec doesn't allow a
>>>> DAC cycle or a 64-bit header where the 32 high-order bits are zero, so
>>>> it shouldn't matter.
>>> No, we don't translate based on the type of the transaction. Using
>>> "DMA space" in the commit message was probably misleading. What's
>>> really going on is different DMA windows. 32-bit DMA has the
>>> obvious naive implementation where [0,4GB] in DMA space maps to
>>> [0,4GB] in PA space. However, for 64-bit DMA, we use DMA
>>> addresses with a non-zero high 32 bits, in the [1TB,2TB] range,
>>> but map the results down to PA [0,1TB] using our IOMMU.
>> I guess this means devices can DMA to physical addresses [0,3GB]
>> using either 32-bit bus addresses in the [0,3GB] range or 64-bit bus
>> addresses in the [1TB,1TB+3GB] range, right?
>
> True in general, but not true for any specific individual device.
>
> 64-bit capable devices won’t generate 32-bit bus addresses, because the dma_ops makes sure that only bus/DMA addresses in [1TB,1TB+3GB] are handed out to the devices.
>
> 32-bit-only devices use bus addresses in [0,3GB] to access PA [0,3GB]. PA in [3GB, 4GB] is not accessed by 32-bit-only devices because the bounce buffers are allocated under the 3GB limit.
>
>>> We did consider having the 64-bit DMA window be [0,1TB] and map
>>> directly to PA space, like the 32-bit window. But this design
> >>> suffers from the “PCI hole” problem. Basically, the BAR space is
> >>> usually under 4GB (it occupies the range [3GB, 4GB] on tilegx) and
> >>> the host bridge uses negative decoding in passing DMA traffic
> >>> upstream. That is, DMA traffic with target addresses in [3GB, 4GB]
> >>> is not passed to host memory. This means that the same amount
> >>> of physical memory as the BAR space cannot be used for DMA
> >>> purposes. And because it is not easy to avoid this region when
> >>> allocating DMA memory, the kernel is simply told not to use this
> >>> chunk of PA at all, so it is wasted.
>> OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
>> as you describe. And even if DMA *could* reach it, the CPU couldn't
>> see it because CPU accesses to that range would go to PCI for the
>> memory-mapped BAR space, not to memory.
>
> Right. Unreachability is only a problem if the DMA window overlaps [3G, 4G], and since the 64-bit DMA window is [1TB,2TB], the whole PA space can be reached by 64-bit capable devices.

So the [0-1TB] memory range (including [3GB-4GB]) is reachable by
64-bit DMA to bus addresses [1TB-2TB]. But if the CPU can't see
physical memory from [3GB-4GB], how is it useful to DMA there?

>> But I can't figure out why Tile needs to do something special. I
>> think other arches handle the PCI hole for MMIO space the same way.
>>
>> I don't know if other arches alias the [0,3GB] physical address
>> range in both 32-bit and 64-bit DMA space like you do, but if that's
>> part of the problem, it seems like you could easily avoid the
>> aliasing by making the 64-bit DMA space [1TB+4GB,2TB] instead of
>> [1TB,2TB].
>
> Perhaps, but since 64-bit capable devices can't actually see the aliasing (they aren't offered the [0,4GB] address range), they only see an un-aliased space.
>
>>> For the LSI device, the way we manage it is to ensure that the
> >>> device’s streaming buffers and the consistent buffers come from
>>> different pools, with the latter using the under-4GB bounce
>>> buffers. Obviously, normal devices use the same buffer pool for
>>> both streaming and consistent, either under 4GB or the whole PA.
>> It seems like you could make your DMA space be the union of [0,3GB]
>> and [1TB+4GB,2TB], then use pci_set_dma_mask(dev, DMA_BIT_MASK(64))
>> and pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)) (I assume the
>> driver already sets those masks correctly if it works on other
>> arches).
>
> Unfortunately, the Megaraid driver doesn’t even call pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)).

If the Megaraid driver needs that call, but it's missing, why wouldn't
we just add it?

> More generally, the DMA space you propose isn't optimal, because then the PA in [3GB, 4GB] can’t be reached by 64-bit capable devices.

True. I assumed it wasn't useful to DMA there because the CPU
couldn't see that memory anyway. But apparently that assumption was
wrong?

>>> Given all of that, does this change make sense? I can certainly
>>> amend the commit description to include more commentary.
>> Obviously, I'm missing something. I guess it really doesn't matter
>> because this is all arch code and I don't need to understand it, but
>> it does niggle at me somehow.
>
> I will add the following comment to <asm/pci.h> in hopes of making it a bit clearer:
>
> /*
> [...]
> + * This design lets us avoid the "PCI hole" problem where the host bridge
> + * won't pass DMA traffic with target addresses that happen to fall within the
> + * BAR space. This enables us to use all the physical memory for DMA, instead
> + * of wasting the same amount of physical memory as the BAR window size.

By "target addresses", I guess you mean the bus address, not the CPU
address, right?

The whole reason I'm interested in this is to figure out whether this
change is really specific to Tile, or whether other architectures need
similar changes. I think host bridges on other arches behave the same
way (they don't allow DMA to addresses in the PCI hole), so I still
haven't figured out what is truly Tile-specific.

I guess the ability for 64-bit DMA to reach the PCI hole (3GB-4GB)
might be unique, but it doesn't sound useful.

> */
> #define TILE_PCI_MEM_MAP_BASE_OFFSET (1ULL << CHIP_PA_WIDTH())
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
>

2013-08-13 16:12:53

by Chris Metcalf

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

(Trimming the quoted material a little to try to keep this email under control.)

On 8/12/2013 4:42 PM, Bjorn Helgaas wrote:
> On Mon, Aug 12, 2013 at 1:42 PM, Chris Metcalf <[email protected]> wrote:
>> On 8/9/2013 6:42 PM, Bjorn Helgaas wrote:
>>> OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
>>> as you describe. And even if DMA *could* reach it, the CPU couldn't
>>> see it because CPU accesses to that range would go to PCI for the
>>> memory-mapped BAR space, not to memory.
>> Right. Unreachability is only a problem if the DMA window overlaps [3G, 4G], and since the 64-bit DMA window is [1TB,2TB], the whole PA space can be reached by 64-bit capable devices.
> So the [0-1TB] memory range (including [3GB-4GB]) is reachable by
> 64-bit DMA to bus addresses [1TB-2TB]. But if the CPU can't see
> physical memory from [3GB-4GB], how is it useful to DMA there?

Sorry, looking back I can see that the thread is a little confusing.
The CPU can see the whole PA space. The confusion comes from the BAR space
in [3GB, 4GB].

On Tile, we define the CPU memory space as follows:

[0, 1TB]: PA
[1TB + 3GB, 1TB + 4GB]: BAR space for RC port 0, in [3GB, 4GB]
[1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: BAR space for RC port N, in [3GB, 4GB]

The mapping from [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB] to [3GB, 4GB] is done by a
hardware PIO region, which generates PCI bus addresses in [3GB, 4GB] for MMIOs to
the BAR space.
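
(As a sketch of that window arithmetic, with made-up macro names; only
the numbers come from the layout above:)

	/*
	 * Hypothetical macros illustrating the CPU-side window for RC
	 * port N: a 1GB window at [1TB + 3GB + N*4GB, 1TB + (N+1)*4GB]
	 * that the PIO hardware translates to bus [3GB, 4GB].
	 */
	#define RC_BAR_WINDOW_BASE(n) \
		((1ULL << 40) + (3ULL << 30) + (u64)(n) * (4ULL << 30))
	#define RC_BAR_WINDOW_SIZE	(1ULL << 30)	/* 1GB */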

>> Unfortunately, the Megaraid driver doesn’t even call pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)).
> If the Megaraid driver needs that call, but it's missing, why wouldn't
> we just add it?

The Megaraid driver doesn’t strictly need that call on other platforms, because
by default the device coherent_dma_mask is DMA_BIT_MASK(32) and the consistent
memory pool doesn’t come from the bounce buffers on most other platforms.

Of course, for the sake of correctness, this call should be added across all platforms.

> >> More generally, the DMA space you propose isn't optimal, because then the PA in [3GB, 4GB] can’t be reached by 64-bit capable devices.
> True. I assumed it wasn't useful to DMA there because the CPU
> couldn't see that memory anyway. But apparently that assumption was
> wrong?

Correct.

>>>> Given all of that, does this change make sense? I can certainly
>>>> amend the commit description to include more commentary.
>>> Obviously, I'm missing something. I guess it really doesn't matter
>>> because this is all arch code and I don't need to understand it, but
>>> it does niggle at me somehow.
>> I will add the following comment to <asm/pci.h> in hopes of making it a bit clearer:
>>
>> /*
>> [...]
>> + * This design lets us avoid the "PCI hole" problem where the host bridge
>> + * won't pass DMA traffic with target addresses that happen to fall within the
>> + * BAR space. This enables us to use all the physical memory for DMA, instead
>> + * of wasting the same amount of physical memory as the BAR window size.
> By "target addresses", I guess you mean the bus address, not the CPU
> address, right?

Correct.

> The whole reason I'm interested in this is to figure out whether this
> change is really specific to Tile, or whether other architectures need
> similar changes. I think host bridges on other arches behave the same
> way (they don't allow DMA to addresses in the PCI hole), so I still
> haven't figured out what is truly Tile-specific.

What is unique about Tile is that the PCI drivers must explicitly declare
their DMA capability by calling pci_set_dma_mask() and pci_set_consistent_dma_mask().

This is why we must patch those drivers that don’t call pci_set_consistent_dma_mask(),
as is the case with the Megaraid driver.
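
(For concreteness, the kind of driver-side patch meant here would look
roughly like the following; the probe function is hypothetical, but the
two mask calls are the standard PCI DMA API.)

#include <linux/pci.h>
#include <linux/dma-mapping.h>

/*
 * Sketch of the driver fix discussed above: declare both the streaming
 * and the consistent DMA capability explicitly, so the tile hooks can
 * install the matching dma_ops.
 */
static int example_hba_probe(struct pci_dev *pdev)
{
	int err;

	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
	if (err)
		return err;

	/* This HBA's consistent memory must stay below 4GB. */
	return pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
}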

> I guess the ability for 64-bit DMA to reach the PCI hole (3GB-4GB)
> might be unique, but it doesn't sound useful.

It seems like other architectures might benefit from the approach we've taken
with tile, but it's certainly disruptive enough that it might not be worth it.

--
Chris Metcalf, Tilera Corp.
http://www.tilera.com

2013-08-13 20:30:54

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

[+cc James in case he has opinions on the DMA mask question]

On Tue, Aug 13, 2013 at 10:12 AM, Chris Metcalf <[email protected]> wrote:
> (Trimming the quoted material a little to try to keep this email under control.)
>
> On 8/12/2013 4:42 PM, Bjorn Helgaas wrote:
>> On Mon, Aug 12, 2013 at 1:42 PM, Chris Metcalf <[email protected]> wrote:
>>> On 8/9/2013 6:42 PM, Bjorn Helgaas wrote:
>>>> OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
>>>> as you describe. And even if DMA *could* reach it, the CPU couldn't
>>>> see it because CPU accesses to that range would go to PCI for the
>>>> memory-mapped BAR space, not to memory.
>>> Right. Unreachability is only a problem if the DMA window overlaps [3G, 4G], and since the 64-bit DMA window is [1TB,2TB], the whole PA space can be reached by 64-bit capable devices.
>> So the [0-1TB] memory range (including [3GB-4GB]) is reachable by
>> 64-bit DMA to bus addresses [1TB-2TB]. But if the CPU can't see
>> physical memory from [3GB-4GB], how is it useful to DMA there?
>
> Sorry, looking back I can see that the thread is a little confusing.
> The CPU can see the whole PA space. The confusion comes from the BAR space
> in [3GB, 4GB].
>
> On Tile, we define the CPU memory space as follows:
>
> [0, 1TB]: PA
> [1TB + 3GB, 1TB + 4GB]: BAR space for RC port 0, in [3GB, 4GB]
> [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: BAR space for RC port N, in [3GB, 4GB]
>
> The mapping from [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB] to [3GB, 4GB] is done by a
> hardware PIO region, which generates PCI bus addresses in [3GB, 4GB] for MMIOs to
> the BAR space.

OK, I think I get it now. CPU address space:
[0, 1TB]: physical memory
[1TB + 3GB, 1TB + 4GB]: translated to bus address [3GB, 4GB] under RC port 0
[1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: translated to bus address
[3GB, 4GB] under RC port N

Bus address space:
[0, 3GB]: 32-bit DMA reaches physical memory [0, 3GB]
[3GB, 4GB]: 32-bit DMA (peer-to-peer DMA under local RC port, I guess?)
[1TB, 2TB]: 64-bit DMA mapped via IOMMU to physical memory [0, 1TB]

I guess the problem is that 32-bit DMA can't reach physical memory
[3GB, 4GB], so you're using bounce buffers so the bus address is in
[0, 3GB]. That makes sense, and I don't see another possibility other
than just throwing away the [3GB, 4GB] range by leaving it out of the
kernel allocator altogether, or using hardware (which tilegx probably
doesn't have) to remap it somewhere else.

So it seems like just a question of how you wrap this all up in
dma_ops, and *that* is all arch stuff that I don't have an opinion on.

>>> Unfortunately, the Megaraid driver doesn’t even call pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)).
>> If the Megaraid driver needs that call, but it's missing, why wouldn't
>> we just add it?
>
> The Megaraid driver doesn’t strictly need that call on other platforms, because
> by default the device coherent_dma_mask is DMA_BIT_MASK(32) and the consistent
> memory pool doesn’t come from the bounce buffers on most other platforms.
>
> Of course, for the sake of correctness, this call should be added across all platforms.
> ...

> What is unique about Tile is that the PCI drivers must explicitly declare
> their DMA capability by calling pci_set_dma_mask() and pci_set_consistent_dma_mask().

It looks like the reason you need drivers to explicitly call
pci_set_dma_mask() and pci_set_consistent_dma_mask() is because you
have hooks in those functions to tweak the dma_ops, even though the
mask itself might not be changed.

That doesn't sound like a robust solution: we have well-known defaults
for the mask sizes, and I don't think it's reasonable to expect
drivers to explicitly set the mask even if they are happy with the
defaults (though Documentation/DMA-API-HOWTO.txt does say that being
explicit is good style). I'm afraid you'll just keep tripping over
drivers that don't work on tilegx because they don't set the mask.

Bjorn

2013-08-13 21:53:17

by James Bottomley

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

On Tue, 2013-08-13 at 14:30 -0600, Bjorn Helgaas wrote:
> [+cc James in case he has opinions on the DMA mask question]
>
> On Tue, Aug 13, 2013 at 10:12 AM, Chris Metcalf <[email protected]> wrote:
> > (Trimming the quoted material a little to try to keep this email under control.)
> >
> > On 8/12/2013 4:42 PM, Bjorn Helgaas wrote:
> >> On Mon, Aug 12, 2013 at 1:42 PM, Chris Metcalf <[email protected]> wrote:
> >>> On 8/9/2013 6:42 PM, Bjorn Helgaas wrote:
> >>>> OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
> >>>> as you describe. And even if DMA *could* reach it, the CPU couldn't
> >>>> see it because CPU accesses to that range would go to PCI for the
> >>>> memory-mapped BAR space, not to memory.
> >>> Right. Unreachability is only a problem if the DMA window overlaps [3G, 4G], and since the 64-bit DMA window is [1TB,2TB], the whole PA space can be reached by 64-bit capable devices.
> >> So the [0-1TB] memory range (including [3GB-4GB]) is reachable by
> >> 64-bit DMA to bus addresses [1TB-2TB]. But if the CPU can't see
> >> physical memory from [3GB-4GB], how is it useful to DMA there?
> >
> > Sorry, looking back I can see that the thread is a little confusing.
> > The CPU can see the whole PA space. The confusion comes from the BAR space
> > in [3GB, 4GB].
> >
> > On Tile, we define the CPU memory space as follows:
> >
> > [0, 1TB]: PA
> > [1TB + 3GB, 1TB + 4GB]: BAR space for RC port 0, in [3GB, 4GB]
> > [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: BAR space for RC port N, in [3GB, 4GB]
> >
> > The mapping from [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB] to [3GB, 4GB] is done by a
> > hardware PIO region, which generates PCI bus addresses in [3GB, 4GB] for MMIOs to
> > the BAR space.
>
> OK, I think I get it now. CPU address space:
> [0, 1TB]: physical memory
> [1TB + 3GB, 1TB + 4GB]: translated to bus address [3GB, 4GB] under RC port 0
> [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: translated to bus address
> [3GB, 4GB] under RC port N
>
> Bus address space:
> [0, 3GB]: 32-bit DMA reaches physical memory [0, 3GB]
> [3GB, 4GB]: 32-bit DMA (peer-to-peer DMA under local RC port, I guess?)
> [1TB, 2TB]: 64-bit DMA mapped via IOMMU to physical memory [0, 1TB]
>
> I guess the problem is that 32-bit DMA can't reach physical memory
> [3GB, 4GB], so you're using bounce buffers so the bus address is in
> [0, 3GB]. That makes sense, and I don't see another possibility other
> than just throwing away the [3GB, 4GB] range by leaving it out of the
> kernel allocator altogether, or using hardware (which tilegx probably
> doesn't have) to remap it somewhere else.

This is remarkably familiar. I think almost every system on earth has a
configuration similar to this. On PARISC, the top 256MB of memory on a
32 bit system is reserved for I/O access and is designated as "F Space".

What is unusual is that you seem to have responding memory behind the F
Space which is accessible to some bus entities. On PARISC 32 bit, the
memory is just lost (inaccessible); on 64 bit, it's remapped above 32GB
(and the low F Space window expanded to 1GB).

> So it seems like just a question of how you wrap this all up in
> dma_ops, and *that* is all arch stuff that I don't have an opinion on.
>
> >>> Unfortunately, the Megaraid driver doesn’t even call pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)).
> >> If the Megaraid driver needs that call, but it's missing, why wouldn't
> >> we just add it?
> >
> > The Megaraid driver doesn’t strictly need that call on other platforms, because
> > by default the device coherent_dma_mask is DMA_BIT_MASK(32) and the consistent
> > memory pool doesn’t come from the bounce buffers on most other platforms.
> >
> > Of course, for the sake of correctness, this call should be added across all platforms.
> > ...
>
> > What is unique about Tile is that the PCI drivers must explicitly declare
> > its DMA capability by calling pci_set_dma_mask() and pci_set_consistent_dma_mask().
>
> It looks like the reason you need drivers to explicitly call
> pci_set_dma_mask() and pci_set_consistent_dma_mask() is because you
> have hooks in those functions to tweak the dma_ops, even though the
> mask itself might not be changed.
>
> That doesn't sound like a robust solution: we have well-known defaults
> for the mask sizes, and I don't think it's reasonable to expect
> drivers to explicitly set the mask even if they are happy with the
> defaults (though Documentation/DMA-API-HOWTO.txt does say that being
> explicit is good style). I'm afraid you'll just keep tripping over
> drivers that don't work on tilegx because they don't set the mask.

Right, it's not a robust solution at all. A DMA mask is just that: an
accessibility mask. The founding assumption is that an address line is
either connected or not, which is why the mask works. What you have is
two different classes of memory: 0-3GB which is usable for I/O and 3-4GB
which isn't. Surely what you need to do is implement ZONE_DMA32? which
stretches from 0-3GB, which means all kmallocs in the driver will be in
the right range. Then you need a bounce pfn at 3GB which means that
user space which gets the 3-4GB is bounced. There's a magic pfn in
BLK_BOUNCE_HIGH that was designed for this, but I'm not sure the design
contemplated BLK_BOUNCE_HIGH being different for 64 and 32 bit.
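
(In block-layer terms the bounce limit is set per queue, so the sketch
would be something like the line below; whether BLK_BOUNCE_HIGH could be
taught to express a 3GB limit on 64 bit is exactly the open question.)

	/*
	 * Sketch, in a hypothetical block driver: bounce any page above
	 * the 3GB boundary before I/O; "q" is the driver's request queue.
	 */
	blk_queue_bounce_limit(q, (3ULL << 30) - 1);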

James


2013-08-14 19:10:29

by Chris Metcalf

[permalink] [raw]
Subject: Re: [PATCH v2] tile: support LSI MEGARAID SAS HBA hybrid dma_ops

On 8/13/2013 4:30 PM, Bjorn Helgaas wrote:
> [+cc James in case he has opinions on the DMA mask question]
>
> On Tue, Aug 13, 2013 at 10:12 AM, Chris Metcalf <[email protected]> wrote:
>> (Trimming the quoted material a little to try to keep this email under control.)
>>
>> On 8/12/2013 4:42 PM, Bjorn Helgaas wrote:
>>> On Mon, Aug 12, 2013 at 1:42 PM, Chris Metcalf <[email protected]> wrote:
>>>> On 8/9/2013 6:42 PM, Bjorn Helgaas wrote:
>>>>> OK, so physical memory in the [3GB,4GB] range is unreachable via DMA
>>>>> as you describe. And even if DMA *could* reach it, the CPU couldn't
>>>>> see it because CPU accesses to that range would go to PCI for the
>>>>> memory-mapped BAR space, not to memory.
>>>> Right. Unreachability is only a problem if the DMA window overlaps [3G, 4G], and since the 64-bit DMA window is [1TB,2TB], the whole PA space can be reached by 64-bit capable devices.
>>> So the [0-1TB] memory range (including [3GB-4GB]) is reachable by
>>> 64-bit DMA to bus addresses [1TB-2TB]. But if the CPU can't see
>>> physical memory from [3GB-4GB], how is it useful to DMA there?
>> Sorry, looking back I can see that the thread is a little confusing.
>> The CPU can see the whole PA space. The confusion comes from the BAR space
>> in [3GB, 4GB].
>>
>> On Tile, we define the CPU memory space as follows:
>>
>> [0, 1TB]: PA
>> [1TB + 3GB, 1TB + 4GB]: BAR space for RC port 0, in [3GB, 4GB]
>> [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: BAR space for RC port N, in [3GB, 4GB]
>>
>> The mapping from [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB] to [3GB, 4GB] is done by a
>> hardware PIO region, which generates PCI bus addresses in [3GB, 4GB] for MMIOs to
>> the BAR space.
> OK, I think I get it now. CPU address space:
> [0, 1TB]: physical memory
> [1TB + 3GB, 1TB + 4GB]: translated to bus address [3GB, 4GB] under RC port 0
> [1TB + 3GB + N*4GB, 1TB + (1 + N)*4GB]: translated to bus address
> [3GB, 4GB] under RC port N
>
> Bus address space:
> [0, 3GB]: 32-bit DMA reaches physical memory [0, 3GB]
> [3GB, 4GB]: 32-bit DMA (peer-to-peer DMA under local RC port, I guess?)

Correct.

> [1TB, 2TB]: 64-bit DMA mapped via IOMMU to physical memory [0, 1TB]
>
> I guess the problem is that 32-bit DMA can't reach physical memory
> [3GB, 4GB], so you're using bounce buffers so the bus address is in
> [0, 3GB]. That makes sense, and I don't see another possibility other
> than just throwing away the [3GB, 4GB] range by leaving it out of the
> kernel allocator altogether, or using hardware (which tilegx probably
> doesn't have) to remap it somewhere else.
>
> So it seems like just a question of how you wrap this all up in
> dma_ops, and *that* is all arch stuff that I don't have an opinion on.
>
>>>> Unfortunately, the Megaraid driver doesn’t even call pci_set_consistent_dma_mask(dev, DMA_BIT_MASK(32)).
>>> If the Megaraid driver needs that call, but it's missing, why wouldn't
>>> we just add it?
>> The Megaraid driver doesn’t strictly need that call on other platforms, because
>> by default the device coherent_dma_mask is DMA_BIT_MASK(32) and the consistent
>> memory pool doesn’t come from the bounce buffers on most other platforms.
>>
>> Of course, for the sake of correctness, this call should be added across all platforms.
>> ...
>> What is unique about Tile is that the PCI drivers must explicitly declare
>> their DMA capability by calling pci_set_dma_mask() and pci_set_consistent_dma_mask().
> It looks like the reason you need drivers to explicitly call
> pci_set_dma_mask() and pci_set_consistent_dma_mask() is because you
> have hooks in those functions to tweak the dma_ops, even though the
> mask itself might not be changed.
>
> That doesn't sound like a robust solution: we have well-known defaults
> for the mask sizes, and I don't think it's reasonable to expect
> drivers to explicitly set the mask even if they are happy with the
> defaults (though Documentation/DMA-API-HOWTO.txt does say that being
> explicit is good style). I'm afraid you'll just keep tripping over
> drivers that don't work on tilegx because they don't set the mask.

Yes, I think you (and James) are right. We will look at changing to a DMA_BIT_MASK(32) default, do some testing, and see what that ends up looking like.

James also wrote:
> What you have is
> two different classes of memory: 0-3GB which is usable for I/O and 3-4GB
> which isn't. Surely what you need to do is implement ZONE_DMA32? which
> stretches from 0-3GB, which means all kmallocs in the driver will be in
> the right range. Then you need a bounce pfn at 3GB which means that
> user space which gets the 3-4GB is bounced. There's a magic pfn in
> BLK_BOUNCE_HIGH that was designed for this, but I'm not sure the design
> contemplated BLK_BOUNCE_HIGH being different for 64 and 32 bit.

Well, we do have ZONE_DMA already, for PAs 0-4GB. This is a sort of generally reasonable definition, and works well for 32-bit USB devices, for example. But you're right that it does mean that sometimes something like swiotlb_alloc_coherent() might try to allocate memory for 32-bit PCI and get something that falls in the 3-4GB hole, and it then throws it away and uses map_single(). So arguably we should just shift our ZONE_DMA ceiling down to 3GB to make PCI more likely to get a usable page, though that penalizes 32-bit USB, and if we ever thought about expanding the size of the PCI BAR there, it would penalize it more.
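
(If we did shift the ceiling, the change would be a small tweak at
zone-init time, something like the following sketch; free_area_init_nodes()
is the generic mechanism, but whether our setup.c is structured exactly
this way is an assumption.)

	/*
	 * Sketch only: cap ZONE_DMA at PA 3GB instead of 4GB so that
	 * DMA-zone allocations cannot land in the [3GB, 4GB] PCI hole.
	 * The function name is hypothetical.
	 */
	static void __init zone_sizes_init_sketch(void)
	{
		unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

		max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
					      (3UL << 30) >> PAGE_SHIFT);
		max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
		free_area_init_nodes(max_zone_pfns);
	}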

Bottom line, though, it does seem like in general the 32-bit devices (both PCI and USB) that we see in practice are pretty low performance, so it's hard to worry too much about it, as long as they work.

I don't know that we've looked at the BLK_BOUNCE_xxx stuff much; I'm personally not very familiar with it, except that as you say, it seems to be hard-wired to -1ULL for 64-bit systems.

--
Chris Metcalf, Tilera Corp.
http://www.tilera.com

2013-08-30 14:15:17

by Chris Metcalf

[permalink] [raw]
Subject: [PATCH] tile PCI RC: make default consistent DMA mask 32-bit

This change sets the PCI devices' initial DMA capabilities
conservatively and promotes them at the request of the driver,
as opposed to assuming advanced DMA capabilities. The old design
runs the risk of breaking drivers that assume default capabilities.

Signed-off-by: Chris Metcalf <[email protected]>
---
arch/tile/include/asm/device.h | 5 ++++-
arch/tile/include/asm/dma-mapping.h | 21 +++++++++++++--------
arch/tile/kernel/pci-dma.c | 21 ++++++++++++---------
arch/tile/kernel/pci_gx.c | 15 +++++++++++++--
4 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/arch/tile/include/asm/device.h b/arch/tile/include/asm/device.h
index 5182705..6ab8bf14 100644
--- a/arch/tile/include/asm/device.h
+++ b/arch/tile/include/asm/device.h
@@ -23,7 +23,10 @@ struct dev_archdata {
/* Offset of the DMA address from the PA. */
dma_addr_t dma_offset;

- /* Highest DMA address that can be generated by this device. */
+ /*
+ * Highest DMA address that can be generated by devices that
+ * have limited DMA capability, i.e. non 64-bit capable.
+ */
dma_addr_t max_direct_dma_addr;
};

diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index 6f522d5..1eae359 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -92,14 +92,19 @@ dma_set_mask(struct device *dev, u64 mask)
{
struct dma_map_ops *dma_ops = get_dma_ops(dev);

- /* Handle legacy PCI devices with limited memory addressability. */
- if ((dma_ops == gx_pci_dma_map_ops ||
- dma_ops == gx_hybrid_pci_dma_map_ops ||
- dma_ops == gx_legacy_pci_dma_map_ops) &&
- (mask <= DMA_BIT_MASK(32))) {
- set_dma_ops(dev, gx_legacy_pci_dma_map_ops);
- set_dma_offset(dev, 0);
- if (mask > dev->archdata.max_direct_dma_addr)
+ /*
+ * For PCI devices with 64-bit DMA addressing capability, promote
+ * the dma_ops to hybrid, with the consistent memory DMA space limited
+ * to 32-bit. For 32-bit capable devices, limit the streaming DMA
+ * address range to max_direct_dma_addr.
+ */
+ if (dma_ops == gx_pci_dma_map_ops ||
+ dma_ops == gx_hybrid_pci_dma_map_ops ||
+ dma_ops == gx_legacy_pci_dma_map_ops) {
+ if (mask == DMA_BIT_MASK(64) &&
+ dma_ops == gx_legacy_pci_dma_map_ops)
+ set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
+ else if (mask > dev->archdata.max_direct_dma_addr)
mask = dev->archdata.max_direct_dma_addr;
}

diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index d94f487..09b5870 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -588,15 +588,18 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
{
struct dma_map_ops *dma_ops = get_dma_ops(dev);

- /* Handle hybrid PCI devices with limited memory addressability. */
- if ((dma_ops == gx_pci_dma_map_ops ||
- dma_ops == gx_hybrid_pci_dma_map_ops ||
- dma_ops == gx_legacy_pci_dma_map_ops) &&
- (mask <= DMA_BIT_MASK(32))) {
- if (dma_ops == gx_pci_dma_map_ops)
- set_dma_ops(dev, gx_hybrid_pci_dma_map_ops);
-
- if (mask > dev->archdata.max_direct_dma_addr)
+ /*
+ * For PCI devices with 64-bit DMA addressing capability, promote
+ * the dma_ops to full capability for both streams and consistent
+ * memory access. For 32-bit capable devices, limit the consistent
+ * memory DMA range to max_direct_dma_addr.
+ */
+ if (dma_ops == gx_pci_dma_map_ops ||
+ dma_ops == gx_hybrid_pci_dma_map_ops ||
+ dma_ops == gx_legacy_pci_dma_map_ops) {
+ if (mask == DMA_BIT_MASK(64))
+ set_dma_ops(dev, gx_pci_dma_map_ops);
+ else if (mask > dev->archdata.max_direct_dma_addr)
mask = dev->archdata.max_direct_dma_addr;
}

diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 66ef9db..29acac6 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -1081,13 +1081,24 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
return pci_enable_resources(dev, mask);
}

-/* Called for each device after PCI setup is done. */
+/*
+ * Called for each device after PCI setup is done.
+ * We initialize the PCI device capabilities conservatively, assuming that
+ * all devices can only address the 32-bit DMA space. The exception here is
+ * that the device dma_offset is set to the value that matches the 64-bit
+ * capable devices. This is OK because dma_offset is not used by legacy
+ * dma_ops, nor by the hybrid dma_ops's streaming DMAs, which are 64-bit ops.
+ * This implementation matches the kernel design of setting PCI devices'
+ * coherent_dma_mask to 0xffffffffull by default, allowing the device drivers
+ * to skip calling pci_set_consistent_dma_mask(DMA_BIT_MASK(32)).
+ */
static void pcibios_fixup_final(struct pci_dev *pdev)
{
- set_dma_ops(&pdev->dev, gx_pci_dma_map_ops);
+ set_dma_ops(&pdev->dev, gx_legacy_pci_dma_map_ops);
set_dma_offset(&pdev->dev, TILE_PCI_MEM_MAP_BASE_OFFSET);
pdev->dev.archdata.max_direct_dma_addr =
TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
+ pdev->dev.coherent_dma_mask = TILE_PCI_MAX_DIRECT_DMA_ADDRESS;
}
DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_final);

--
1.8.3.1
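
(Under this patch, a fully 64-bit capable driver promotes itself from the
conservative default in two steps; a usage sketch, with a hypothetical
probe function:)

#include <linux/pci.h>
#include <linux/dma-mapping.h>

/*
 * Usage sketch for the patch above. After pcibios_fixup_final(), a
 * device starts with gx_legacy_pci_dma_map_ops and a 32-bit coherent
 * mask; these calls promote a 64-bit capable device first to the hybrid
 * and then to the full 64-bit dma_ops.
 */
static int example_64bit_probe(struct pci_dev *pdev)
{
	int err;

	/* legacy -> hybrid: 64-bit streaming, 32-bit consistent. */
	err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
	if (err)
		return err;

	/* hybrid -> full: 64-bit consistent memory as well. */
	return dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
}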