Hi,
This patchset implements Non-Transparent Bridge (NTB) support for the
Microsemi Switchtec series of switches. We're looking for some
review from the community at this point but hope to get it upstreamed
for v4.14.
Switchtec NTB support is configured over the same function and bar
as the management endpoint. Thus, the new driver hooks into the
management driver which we had merged in v4.12. We use the class
interface API to register an NTB device for every switchtec device
which supports NTB (not all do).
The Switchtec hardware supports doorbells, memory windows and messages.
Seeing there is no native scratchpad support, 128 spads are emulated
through the use of a pre-setup memory window. The switch has 64
doorbells which are shared between the two partitions and a
configurable set of memory windows. While the hardware supports more
than 2 partitions, this driver only supports the first two seeing
the current NTB API only supports two hosts.
The driver has been tested with ntb_netdev and fully passes the
ntb_test script.
This patchset is based off of v4.12-rc5 and can be found in this
git repo:
https://github.com/sbates130272/linux-p2pmem.git switchtec_ntb
Thanks,
Logan
Logan Gunthorpe (13):
switchtec: move structure definitions into a common header
switchtec: export class symbol for use in upper layer driver
switchtec: add ntb hardware register definitions
switchtec: add link event notifier block
switchtec_ntb: introduce initial ntb driver
switchtec_ntb: initialize hardware for memory windows
switchtec_ntb: initialize hardware for doorbells and messages
switchtec_ntb: add skeleton ntb driver
switchtec_ntb: add link management
switchtec_ntb: implement doorbell registers
switchtec_ntb: implement scratchpad registers
switchtec_ntb: add memory window support
switchtec_ntb: update switchtec documentation with notes for ntb
Documentation/switchtec.txt | 12 +
MAINTAINERS | 2 +
drivers/ntb/hw/Kconfig | 1 +
drivers/ntb/hw/Makefile | 1 +
drivers/ntb/hw/mscc/Kconfig | 9 +
drivers/ntb/hw/mscc/Makefile | 1 +
drivers/ntb/hw/mscc/switchtec_ntb.c | 1144 +++++++++++++++++++++++++++++++++++
drivers/pci/switch/switchtec.c | 319 ++--------
include/linux/ntb.h | 3 +
include/linux/switchtec.h | 365 +++++++++++
10 files changed, 1601 insertions(+), 256 deletions(-)
create mode 100644 drivers/ntb/hw/mscc/Kconfig
create mode 100644 drivers/ntb/hw/mscc/Makefile
create mode 100644 drivers/ntb/hw/mscc/switchtec_ntb.c
create mode 100644 include/linux/switchtec.h
--
2.11.0
switchtec_ntb checks for a link by looking at the shared memory
window. If the magic number is correct and the otherside indicates
their link is enabled then we take the link to be up.
Whenever we change our local link status we send a msg to the
otherside to check whether it's up and change their status.
The current status is maintained in a flag so ntb_is_link_up
can return quickly.
We utilize switchtec's link status notifier to also check link changes
when the switch notices a port changes state.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 160 +++++++++++++++++++++++++++++++++++-
1 file changed, 159 insertions(+), 1 deletion(-)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index dadbf3c38d0e..19fa8fb0f15c 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -57,6 +57,7 @@ static inline void _iowrite64(u64 val, void __iomem *mmio)
struct shared_mw {
u32 magic;
+ u32 link_sta;
u32 partition_id;
};
@@ -92,8 +93,18 @@ struct switchtec_ntb {
int nr_direct_mw;
int nr_lut_mw;
int direct_mw_to_bar[MAX_DIRECT_MW];
+
+ bool link_is_up;
+ enum ntb_speed link_speed;
+ enum ntb_width link_width;
+ struct notifier_block link_notifier;
};
+static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
+{
+ return container_of(ntb, struct switchtec_ntb, ntb);
+}
+
static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
struct ntb_ctrl_regs __iomem *ctl,
u32 op, int wait_status)
@@ -147,6 +158,17 @@ static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
return -EIO;
}
+static int switchtec_ntb_send_msg(struct switchtec_ntb *sndev, int idx,
+ u32 val)
+{
+ if (idx < 0 || idx >= ARRAY_SIZE(sndev->mmio_self_dbmsg->omsg))
+ return -EINVAL;
+
+ iowrite32(val, &sndev->mmio_self_dbmsg->omsg[idx].msg);
+
+ return 0;
+}
+
static int switchtec_ntb_mw_count(struct ntb_dev *ntb)
{
return 0;
@@ -167,22 +189,148 @@ static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
return 0;
}
+static void switchtec_ntb_part_link_speed(struct switchtec_ntb *sndev,
+ int partition,
+ enum ntb_speed *speed,
+ enum ntb_width *width)
+{
+ struct switchtec_dev *stdev = sndev->stdev;
+
+ u32 pff = ioread32(&stdev->mmio_part_cfg[partition].vep_pff_inst_id);
+ u32 linksta = ioread32(&stdev->mmio_pff_csr[pff].pci_cap_region[13]);
+
+ if (speed)
+ *speed = (linksta >> 16) & 0xF;
+
+ if (width)
+ *width = (linksta >> 20) & 0x3F;
+}
+
+static void switchtec_ntb_set_link_speed(struct switchtec_ntb *sndev)
+{
+ enum ntb_speed self_speed, peer_speed;
+ enum ntb_width self_width, peer_width;
+
+ if (!sndev->link_is_up) {
+ sndev->link_speed = NTB_SPEED_NONE;
+ sndev->link_width = NTB_WIDTH_NONE;
+ return;
+ }
+
+ switchtec_ntb_part_link_speed(sndev, sndev->self_partition,
+ &self_speed, &self_width);
+ switchtec_ntb_part_link_speed(sndev, sndev->peer_partition,
+ &peer_speed, &peer_width);
+
+ sndev->link_speed = min(self_speed, peer_speed);
+ sndev->link_width = min(self_width, peer_width);
+}
+
+enum {
+ LINK_MESSAGE = 0,
+ MSG_LINK_UP = 1,
+ MSG_LINK_DOWN = 2,
+ MSG_CHECK_LINK = 3,
+};
+
+static void switchtec_ntb_check_link(struct switchtec_ntb *sndev)
+{
+ int link_sta;
+ int old = sndev->link_is_up;
+
+ link_sta = sndev->self_shared->link_sta;
+ if (link_sta) {
+ u64 peer = ioread64(&sndev->peer_shared->magic);
+
+ if ((peer & 0xFFFFFFFF) == SWITCHTEC_NTB_MAGIC)
+ link_sta = peer >> 32;
+ else
+ link_sta = 0;
+ }
+
+ sndev->link_is_up = link_sta;
+ switchtec_ntb_set_link_speed(sndev);
+
+ if (link_sta != old) {
+ switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_CHECK_LINK);
+ ntb_link_event(&sndev->ntb);
+ dev_info(&sndev->stdev->dev, "ntb link %s",
+ link_sta ? "up" : "down");
+ }
+}
+
+static void switchtec_ntb_link_event(struct switchtec_ntb *sndev, int msg)
+{
+ switchtec_ntb_check_link(sndev);
+}
+
+static int switchtec_ntb_link_notification(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct switchtec_ntb *sndev = container_of(nb, struct switchtec_ntb,
+ link_notifier);
+
+ dev_dbg(&sndev->stdev->dev, "%s", __func__);
+ switchtec_ntb_check_link(sndev);
+
+ return NOTIFY_OK;
+}
+
+static int switchtec_ntb_init_link_notifier(struct switchtec_ntb *sndev)
+{
+ sndev->link_notifier.notifier_call = switchtec_ntb_link_notification;
+
+ return blocking_notifier_chain_register(&sndev->stdev->link_notifier,
+ &sndev->link_notifier);
+}
+
+static void switchtec_ntb_deinit_link_notifier(struct switchtec_ntb *sndev)
+{
+ blocking_notifier_chain_unregister(&sndev->stdev->link_notifier,
+ &sndev->link_notifier);
+}
+
static int switchtec_ntb_link_is_up(struct ntb_dev *ntb,
enum ntb_speed *speed,
enum ntb_width *width)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (speed)
+ *speed = sndev->link_speed;
+ if (width)
+ *width = sndev->link_width;
+
+ return sndev->link_is_up;
}
static int switchtec_ntb_link_enable(struct ntb_dev *ntb,
enum ntb_speed max_speed,
enum ntb_width max_width)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ dev_dbg(&sndev->stdev->dev, "enabling link");
+
+ sndev->self_shared->link_sta = 1;
+ switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
+
+ switchtec_ntb_check_link(sndev);
+
return 0;
}
static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ dev_dbg(&sndev->stdev->dev, "disabling link");
+
+ sndev->self_shared->link_sta = 0;
+ switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_LINK_UP);
+
+ switchtec_ntb_check_link(sndev);
+
return 0;
}
@@ -499,6 +647,9 @@ static irqreturn_t switchtec_ntb_message_isr(int irq, void *dev)
dev_dbg(&sndev->stdev->dev, "message: %d %08x\n", i,
(u32)msg);
iowrite8(1, &sndev->mmio_self_dbmsg->imsg[i].status);
+
+ if (i == LINK_MESSAGE)
+ switchtec_ntb_link_event(sndev, msg);
}
}
@@ -596,6 +747,10 @@ static int switchtec_ntb_add(struct device *dev,
if (rc)
goto deinit_shared_and_exit;
+ rc = switchtec_ntb_init_link_notifier(sndev);
+ if (rc)
+ goto denit_db_msg_and_exit;
+
rc = ntb_register_device(&sndev->ntb);
if (rc)
goto deinit_and_exit;
@@ -606,6 +761,8 @@ static int switchtec_ntb_add(struct device *dev,
return 0;
deinit_and_exit:
+ switchtec_ntb_deinit_link_notifier(sndev);
+denit_db_msg_and_exit:
switchtec_ntb_deinit_db_msg_irq(sndev);
deinit_shared_and_exit:
switchtec_ntb_deinit_shared_mw(sndev);
@@ -626,6 +783,7 @@ void switchtec_ntb_remove(struct device *dev,
stdev->sndev = NULL;
ntb_unregister_device(&sndev->ntb);
+ switchtec_ntb_deinit_link_notifier(sndev);
switchtec_ntb_deinit_db_msg_irq(sndev);
switchtec_ntb_deinit_shared_mw(sndev);
kfree(sndev);
--
2.11.0
Create the switchtec.h header in include/linux with hardware defines
and the switchtec_dev structure moved directly from switchtec.c.
This is a prep patch for created an NTB driver for switchtec.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
MAINTAINERS | 1 +
drivers/pci/switch/switchtec.c | 249 +------------------------------------
include/linux/switchtec.h | 270 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 272 insertions(+), 248 deletions(-)
create mode 100644 include/linux/switchtec.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 09b5ab6a8a5c..6cee5e253ec3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9841,6 +9841,7 @@ F: Documentation/switchtec.txt
F: Documentation/ABI/testing/sysfs-class-switchtec
F: drivers/pci/switch/switchtec*
F: include/uapi/linux/switchtec_ioctl.h
+F: include/linux/switchtec.h
PCI DRIVER FOR NVIDIA TEGRA
M: Thierry Reding <[email protected]>
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index f6a63406c76e..c4369ba7bbc1 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -13,6 +13,7 @@
*
*/
+#include <linux/switchtec.h>
#include <linux/switchtec_ioctl.h>
#include <linux/interrupt.h>
@@ -37,254 +38,6 @@ static dev_t switchtec_devt;
static struct class *switchtec_class;
static DEFINE_IDA(switchtec_minor_ida);
-#define MICROSEMI_VENDOR_ID 0x11f8
-#define MICROSEMI_NTB_CLASSCODE 0x068000
-#define MICROSEMI_MGMT_CLASSCODE 0x058000
-
-#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024
-#define SWITCHTEC_MAX_PFF_CSR 48
-
-#define SWITCHTEC_EVENT_OCCURRED BIT(0)
-#define SWITCHTEC_EVENT_CLEAR BIT(0)
-#define SWITCHTEC_EVENT_EN_LOG BIT(1)
-#define SWITCHTEC_EVENT_EN_CLI BIT(2)
-#define SWITCHTEC_EVENT_EN_IRQ BIT(3)
-#define SWITCHTEC_EVENT_FATAL BIT(4)
-
-enum {
- SWITCHTEC_GAS_MRPC_OFFSET = 0x0000,
- SWITCHTEC_GAS_TOP_CFG_OFFSET = 0x1000,
- SWITCHTEC_GAS_SW_EVENT_OFFSET = 0x1800,
- SWITCHTEC_GAS_SYS_INFO_OFFSET = 0x2000,
- SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200,
- SWITCHTEC_GAS_PART_CFG_OFFSET = 0x4000,
- SWITCHTEC_GAS_NTB_OFFSET = 0x10000,
- SWITCHTEC_GAS_PFF_CSR_OFFSET = 0x134000,
-};
-
-struct mrpc_regs {
- u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
- u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
- u32 cmd;
- u32 status;
- u32 ret_value;
-} __packed;
-
-enum mrpc_status {
- SWITCHTEC_MRPC_STATUS_INPROGRESS = 1,
- SWITCHTEC_MRPC_STATUS_DONE = 2,
- SWITCHTEC_MRPC_STATUS_ERROR = 0xFF,
- SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100,
-};
-
-struct sw_event_regs {
- u64 event_report_ctrl;
- u64 reserved1;
- u64 part_event_bitmap;
- u64 reserved2;
- u32 global_summary;
- u32 reserved3[3];
- u32 stack_error_event_hdr;
- u32 stack_error_event_data;
- u32 reserved4[4];
- u32 ppu_error_event_hdr;
- u32 ppu_error_event_data;
- u32 reserved5[4];
- u32 isp_error_event_hdr;
- u32 isp_error_event_data;
- u32 reserved6[4];
- u32 sys_reset_event_hdr;
- u32 reserved7[5];
- u32 fw_exception_hdr;
- u32 reserved8[5];
- u32 fw_nmi_hdr;
- u32 reserved9[5];
- u32 fw_non_fatal_hdr;
- u32 reserved10[5];
- u32 fw_fatal_hdr;
- u32 reserved11[5];
- u32 twi_mrpc_comp_hdr;
- u32 twi_mrpc_comp_data;
- u32 reserved12[4];
- u32 twi_mrpc_comp_async_hdr;
- u32 twi_mrpc_comp_async_data;
- u32 reserved13[4];
- u32 cli_mrpc_comp_hdr;
- u32 cli_mrpc_comp_data;
- u32 reserved14[4];
- u32 cli_mrpc_comp_async_hdr;
- u32 cli_mrpc_comp_async_data;
- u32 reserved15[4];
- u32 gpio_interrupt_hdr;
- u32 gpio_interrupt_data;
- u32 reserved16[4];
-} __packed;
-
-struct sys_info_regs {
- u32 device_id;
- u32 device_version;
- u32 firmware_version;
- u32 reserved1;
- u32 vendor_table_revision;
- u32 table_format_version;
- u32 partition_id;
- u32 cfg_file_fmt_version;
- u32 reserved2[58];
- char vendor_id[8];
- char product_id[16];
- char product_revision[4];
- char component_vendor[8];
- u16 component_id;
- u8 component_revision;
-} __packed;
-
-struct flash_info_regs {
- u32 flash_part_map_upd_idx;
-
- struct active_partition_info {
- u32 address;
- u32 build_version;
- u32 build_string;
- } active_img;
-
- struct active_partition_info active_cfg;
- struct active_partition_info inactive_img;
- struct active_partition_info inactive_cfg;
-
- u32 flash_length;
-
- struct partition_info {
- u32 address;
- u32 length;
- } cfg0;
-
- struct partition_info cfg1;
- struct partition_info img0;
- struct partition_info img1;
- struct partition_info nvlog;
- struct partition_info vendor[8];
-};
-
-struct ntb_info_regs {
- u8 partition_count;
- u8 partition_id;
- u16 reserved1;
- u64 ep_map;
- u16 requester_id;
-} __packed;
-
-struct part_cfg_regs {
- u32 status;
- u32 state;
- u32 port_cnt;
- u32 usp_port_mode;
- u32 usp_pff_inst_id;
- u32 vep_pff_inst_id;
- u32 dsp_pff_inst_id[47];
- u32 reserved1[11];
- u16 vep_vector_number;
- u16 usp_vector_number;
- u32 port_event_bitmap;
- u32 reserved2[3];
- u32 part_event_summary;
- u32 reserved3[3];
- u32 part_reset_hdr;
- u32 part_reset_data[5];
- u32 mrpc_comp_hdr;
- u32 mrpc_comp_data[5];
- u32 mrpc_comp_async_hdr;
- u32 mrpc_comp_async_data[5];
- u32 dyn_binding_hdr;
- u32 dyn_binding_data[5];
- u32 reserved4[159];
-} __packed;
-
-enum {
- SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0,
- SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1,
- SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2,
- SWITCHTEC_PART_CFG_EVENT_DYN_PART_CMP = 1 << 3,
-};
-
-struct pff_csr_regs {
- u16 vendor_id;
- u16 device_id;
- u32 pci_cfg_header[15];
- u32 pci_cap_region[48];
- u32 pcie_cap_region[448];
- u32 indirect_gas_window[128];
- u32 indirect_gas_window_off;
- u32 reserved[127];
- u32 pff_event_summary;
- u32 reserved2[3];
- u32 aer_in_p2p_hdr;
- u32 aer_in_p2p_data[5];
- u32 aer_in_vep_hdr;
- u32 aer_in_vep_data[5];
- u32 dpc_hdr;
- u32 dpc_data[5];
- u32 cts_hdr;
- u32 cts_data[5];
- u32 reserved3[6];
- u32 hotplug_hdr;
- u32 hotplug_data[5];
- u32 ier_hdr;
- u32 ier_data[5];
- u32 threshold_hdr;
- u32 threshold_data[5];
- u32 power_mgmt_hdr;
- u32 power_mgmt_data[5];
- u32 tlp_throttling_hdr;
- u32 tlp_throttling_data[5];
- u32 force_speed_hdr;
- u32 force_speed_data[5];
- u32 credit_timeout_hdr;
- u32 credit_timeout_data[5];
- u32 link_state_hdr;
- u32 link_state_data[5];
- u32 reserved4[174];
-} __packed;
-
-struct switchtec_dev {
- struct pci_dev *pdev;
- struct device dev;
- struct cdev cdev;
-
- int partition;
- int partition_count;
- int pff_csr_count;
- char pff_local[SWITCHTEC_MAX_PFF_CSR];
-
- void __iomem *mmio;
- struct mrpc_regs __iomem *mmio_mrpc;
- struct sw_event_regs __iomem *mmio_sw_event;
- struct sys_info_regs __iomem *mmio_sys_info;
- struct flash_info_regs __iomem *mmio_flash_info;
- struct ntb_info_regs __iomem *mmio_ntb;
- struct part_cfg_regs __iomem *mmio_part_cfg;
- struct part_cfg_regs __iomem *mmio_part_cfg_all;
- struct pff_csr_regs __iomem *mmio_pff_csr;
-
- /*
- * The mrpc mutex must be held when accessing the other
- * mrpc_ fields, alive flag and stuser->state field
- */
- struct mutex mrpc_mutex;
- struct list_head mrpc_queue;
- int mrpc_busy;
- struct work_struct mrpc_work;
- struct delayed_work mrpc_timeout;
- bool alive;
-
- wait_queue_head_t event_wq;
- atomic_t event_cnt;
-};
-
-static struct switchtec_dev *to_stdev(struct device *dev)
-{
- return container_of(dev, struct switchtec_dev, dev);
-}
-
enum mrpc_state {
MRPC_IDLE = 0,
MRPC_QUEUED,
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
new file mode 100644
index 000000000000..508cda78a430
--- /dev/null
+++ b/include/linux/switchtec.h
@@ -0,0 +1,270 @@
+/*
+ * Microsemi Switchtec PCIe Driver
+ * Copyright (c) 2017, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef _SWITCHTEC_H
+#define _SWITCHTEC_H
+
+#include <linux/pci.h>
+#include <linux/cdev.h>
+
+#define MICROSEMI_VENDOR_ID 0x11f8
+#define MICROSEMI_NTB_CLASSCODE 0x068000
+#define MICROSEMI_MGMT_CLASSCODE 0x058000
+
+#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024
+#define SWITCHTEC_MAX_PFF_CSR 48
+
+#define SWITCHTEC_EVENT_OCCURRED BIT(0)
+#define SWITCHTEC_EVENT_CLEAR BIT(0)
+#define SWITCHTEC_EVENT_EN_LOG BIT(1)
+#define SWITCHTEC_EVENT_EN_CLI BIT(2)
+#define SWITCHTEC_EVENT_EN_IRQ BIT(3)
+#define SWITCHTEC_EVENT_FATAL BIT(4)
+
+enum {
+ SWITCHTEC_GAS_MRPC_OFFSET = 0x0000,
+ SWITCHTEC_GAS_TOP_CFG_OFFSET = 0x1000,
+ SWITCHTEC_GAS_SW_EVENT_OFFSET = 0x1800,
+ SWITCHTEC_GAS_SYS_INFO_OFFSET = 0x2000,
+ SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200,
+ SWITCHTEC_GAS_PART_CFG_OFFSET = 0x4000,
+ SWITCHTEC_GAS_NTB_OFFSET = 0x10000,
+ SWITCHTEC_GAS_PFF_CSR_OFFSET = 0x134000,
+};
+
+struct mrpc_regs {
+ u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
+ u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
+ u32 cmd;
+ u32 status;
+ u32 ret_value;
+} __packed;
+
+enum mrpc_status {
+ SWITCHTEC_MRPC_STATUS_INPROGRESS = 1,
+ SWITCHTEC_MRPC_STATUS_DONE = 2,
+ SWITCHTEC_MRPC_STATUS_ERROR = 0xFF,
+ SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100,
+};
+
+struct sw_event_regs {
+ u64 event_report_ctrl;
+ u64 reserved1;
+ u64 part_event_bitmap;
+ u64 reserved2;
+ u32 global_summary;
+ u32 reserved3[3];
+ u32 stack_error_event_hdr;
+ u32 stack_error_event_data;
+ u32 reserved4[4];
+ u32 ppu_error_event_hdr;
+ u32 ppu_error_event_data;
+ u32 reserved5[4];
+ u32 isp_error_event_hdr;
+ u32 isp_error_event_data;
+ u32 reserved6[4];
+ u32 sys_reset_event_hdr;
+ u32 reserved7[5];
+ u32 fw_exception_hdr;
+ u32 reserved8[5];
+ u32 fw_nmi_hdr;
+ u32 reserved9[5];
+ u32 fw_non_fatal_hdr;
+ u32 reserved10[5];
+ u32 fw_fatal_hdr;
+ u32 reserved11[5];
+ u32 twi_mrpc_comp_hdr;
+ u32 twi_mrpc_comp_data;
+ u32 reserved12[4];
+ u32 twi_mrpc_comp_async_hdr;
+ u32 twi_mrpc_comp_async_data;
+ u32 reserved13[4];
+ u32 cli_mrpc_comp_hdr;
+ u32 cli_mrpc_comp_data;
+ u32 reserved14[4];
+ u32 cli_mrpc_comp_async_hdr;
+ u32 cli_mrpc_comp_async_data;
+ u32 reserved15[4];
+ u32 gpio_interrupt_hdr;
+ u32 gpio_interrupt_data;
+ u32 reserved16[4];
+} __packed;
+
+struct sys_info_regs {
+ u32 device_id;
+ u32 device_version;
+ u32 firmware_version;
+ u32 reserved1;
+ u32 vendor_table_revision;
+ u32 table_format_version;
+ u32 partition_id;
+ u32 cfg_file_fmt_version;
+ u32 reserved2[58];
+ char vendor_id[8];
+ char product_id[16];
+ char product_revision[4];
+ char component_vendor[8];
+ u16 component_id;
+ u8 component_revision;
+} __packed;
+
+struct flash_info_regs {
+ u32 flash_part_map_upd_idx;
+
+ struct active_partition_info {
+ u32 address;
+ u32 build_version;
+ u32 build_string;
+ } active_img;
+
+ struct active_partition_info active_cfg;
+ struct active_partition_info inactive_img;
+ struct active_partition_info inactive_cfg;
+
+ u32 flash_length;
+
+ struct partition_info {
+ u32 address;
+ u32 length;
+ } cfg0;
+
+ struct partition_info cfg1;
+ struct partition_info img0;
+ struct partition_info img1;
+ struct partition_info nvlog;
+ struct partition_info vendor[8];
+};
+
+struct ntb_info_regs {
+ u8 partition_count;
+ u8 partition_id;
+ u16 reserved1;
+ u64 ep_map;
+ u16 requester_id;
+} __packed;
+
+struct part_cfg_regs {
+ u32 status;
+ u32 state;
+ u32 port_cnt;
+ u32 usp_port_mode;
+ u32 usp_pff_inst_id;
+ u32 vep_pff_inst_id;
+ u32 dsp_pff_inst_id[47];
+ u32 reserved1[11];
+ u16 vep_vector_number;
+ u16 usp_vector_number;
+ u32 port_event_bitmap;
+ u32 reserved2[3];
+ u32 part_event_summary;
+ u32 reserved3[3];
+ u32 part_reset_hdr;
+ u32 part_reset_data[5];
+ u32 mrpc_comp_hdr;
+ u32 mrpc_comp_data[5];
+ u32 mrpc_comp_async_hdr;
+ u32 mrpc_comp_async_data[5];
+ u32 dyn_binding_hdr;
+ u32 dyn_binding_data[5];
+ u32 reserved4[159];
+} __packed;
+
+enum {
+ SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0,
+ SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1,
+ SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2,
+ SWITCHTEC_PART_CFG_EVENT_DYN_PART_CMP = 1 << 3,
+};
+
+struct pff_csr_regs {
+ u16 vendor_id;
+ u16 device_id;
+ u32 pci_cfg_header[15];
+ u32 pci_cap_region[48];
+ u32 pcie_cap_region[448];
+ u32 indirect_gas_window[128];
+ u32 indirect_gas_window_off;
+ u32 reserved[127];
+ u32 pff_event_summary;
+ u32 reserved2[3];
+ u32 aer_in_p2p_hdr;
+ u32 aer_in_p2p_data[5];
+ u32 aer_in_vep_hdr;
+ u32 aer_in_vep_data[5];
+ u32 dpc_hdr;
+ u32 dpc_data[5];
+ u32 cts_hdr;
+ u32 cts_data[5];
+ u32 reserved3[6];
+ u32 hotplug_hdr;
+ u32 hotplug_data[5];
+ u32 ier_hdr;
+ u32 ier_data[5];
+ u32 threshold_hdr;
+ u32 threshold_data[5];
+ u32 power_mgmt_hdr;
+ u32 power_mgmt_data[5];
+ u32 tlp_throttling_hdr;
+ u32 tlp_throttling_data[5];
+ u32 force_speed_hdr;
+ u32 force_speed_data[5];
+ u32 credit_timeout_hdr;
+ u32 credit_timeout_data[5];
+ u32 link_state_hdr;
+ u32 link_state_data[5];
+ u32 reserved4[174];
+} __packed;
+
+struct switchtec_dev {
+ struct pci_dev *pdev;
+ struct device dev;
+ struct cdev cdev;
+
+ int partition;
+ int partition_count;
+ int pff_csr_count;
+ char pff_local[SWITCHTEC_MAX_PFF_CSR];
+
+ void __iomem *mmio;
+ struct mrpc_regs __iomem *mmio_mrpc;
+ struct sw_event_regs __iomem *mmio_sw_event;
+ struct sys_info_regs __iomem *mmio_sys_info;
+ struct flash_info_regs __iomem *mmio_flash_info;
+ struct ntb_info_regs __iomem *mmio_ntb;
+ struct part_cfg_regs __iomem *mmio_part_cfg;
+ struct part_cfg_regs __iomem *mmio_part_cfg_all;
+ struct pff_csr_regs __iomem *mmio_pff_csr;
+
+ /*
+ * The mrpc mutex must be held when accessing the other
+ * mrpc_ fields, alive flag and stuser->state field
+ */
+ struct mutex mrpc_mutex;
+ struct list_head mrpc_queue;
+ int mrpc_busy;
+ struct work_struct mrpc_work;
+ struct delayed_work mrpc_timeout;
+ bool alive;
+
+ wait_queue_head_t event_wq;
+ atomic_t event_cnt;
+};
+
+static inline struct switchtec_dev *to_stdev(struct device *dev)
+{
+ return container_of(dev, struct switchtec_dev, dev);
+}
+
+#endif
--
2.11.0
This patch sets up some hardware registers and creates interrupt service
routines for the doorbells and messages.
There are 64 doorbells in the switch that are shared between all
partitions. The upper 4 doorbells are also shared with the messages
and are there for not used. Thus, this code provides 28 doorbells
for each partition.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 142 ++++++++++++++++++++++++++++++++++++
1 file changed, 142 insertions(+)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 756307d1a8a3..6c9b4e337cdf 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -16,6 +16,7 @@
#include <linux/switchtec.h>
#include <linux/module.h>
#include <linux/delay.h>
+#include <linux/interrupt.h>
MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
MODULE_VERSION("0.1");
@@ -67,6 +68,9 @@ struct switchtec_ntb {
int self_partition;
int peer_partition;
+ int doorbell_irq;
+ int message_irq;
+
struct ntb_info_regs __iomem *mmio_ntb;
struct ntb_ctrl_regs __iomem *mmio_ctrl;
struct ntb_dbmsg_regs __iomem *mmio_dbmsg;
@@ -78,6 +82,11 @@ struct switchtec_ntb {
struct shared_mw __iomem *peer_shared;
dma_addr_t self_shared_dma;
+ u64 db_mask;
+ u64 db_valid_mask;
+ int db_shift;
+ int db_peer_shift;
+
int nr_direct_mw;
int nr_lut_mw;
int direct_mw_to_bar[MAX_DIRECT_MW];
@@ -180,6 +189,49 @@ static void switchtec_ntb_init_mw(struct switchtec_ntb *sndev)
sndev->nr_direct_mw, sndev->nr_lut_mw);
}
+/*
+ * There are 64 doorbells in the switch hardware but this is
+ * shared among all partitions. So we must split them in half
+ * (32 for each partition). However, the message interrupts are
+ * also shared with the top 4 doorbells so we just limit this to
+ * 28 doorbells per partition
+ */
+static void switchtec_ntb_init_db(struct switchtec_ntb *sndev)
+{
+ sndev->db_valid_mask = 0x0FFFFFFF;
+
+ if (sndev->self_partition < sndev->peer_partition) {
+ sndev->db_shift = 0;
+ sndev->db_peer_shift = 32;
+ } else {
+ sndev->db_shift = 32;
+ sndev->db_peer_shift = 0;
+ }
+
+ sndev->db_mask = 0x0FFFFFFFFFFFFFFFULL;
+ iowrite64(~sndev->db_mask, &sndev->mmio_self_dbmsg->idb_mask);
+ iowrite64(sndev->db_valid_mask << sndev->db_peer_shift,
+ &sndev->mmio_self_dbmsg->odb_mask);
+}
+
+static void switchtec_ntb_init_msgs(struct switchtec_ntb *sndev)
+{
+ int i;
+ u32 msg_map = 0;
+
+ for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_dbmsg->imsg); i++) {
+ int m = i | sndev->peer_partition << 2;
+
+ msg_map |= m << i * 8;
+ }
+
+ iowrite32(msg_map, &sndev->mmio_self_dbmsg->msg_map);
+
+ for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_dbmsg->imsg); i++)
+ iowrite64(NTB_DBMSG_IMSG_STATUS | NTB_DBMSG_IMSG_MASK,
+ &sndev->mmio_self_dbmsg->imsg[i]);
+}
+
static int switchtec_ntb_init_req_id_table(struct switchtec_ntb *sndev)
{
int rc = 0;
@@ -300,6 +352,87 @@ static void switchtec_ntb_deinit_shared_mw(struct switchtec_ntb *sndev)
sndev->self_shared_dma);
}
+static irqreturn_t switchtec_ntb_doorbell_isr(int irq, void *dev)
+{
+ struct switchtec_ntb *sndev = dev;
+
+ dev_dbg(&sndev->stdev->dev, "doorbell\n");
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t switchtec_ntb_message_isr(int irq, void *dev)
+{
+ int i;
+ struct switchtec_ntb *sndev = dev;
+
+ for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_dbmsg->imsg); i++) {
+ u64 msg = ioread64(&sndev->mmio_self_dbmsg->imsg[i]);
+
+ if (msg & NTB_DBMSG_IMSG_STATUS) {
+ dev_dbg(&sndev->stdev->dev, "message: %d %08x\n", i,
+ (u32)msg);
+ iowrite8(1, &sndev->mmio_self_dbmsg->imsg[i].status);
+ }
+ }
+
+ return IRQ_HANDLED;
+}
+
+static int switchtec_ntb_init_db_msg_irq(struct switchtec_ntb *sndev)
+{
+ int i;
+ int rc;
+ int doorbell_irq = 0;
+ int message_irq = 0;
+ int event_irq;
+ int idb_vecs = sizeof(sndev->mmio_self_dbmsg->idb_vec_map);
+
+ event_irq = ioread32(&sndev->stdev->mmio_part_cfg->vep_vector_number);
+
+ while (doorbell_irq == event_irq)
+ doorbell_irq++;
+ while (message_irq == doorbell_irq ||
+ message_irq == event_irq)
+ message_irq++;
+
+ dev_dbg(&sndev->stdev->dev, "irqs - event: %d, db: %d, msgs: %d",
+ event_irq, doorbell_irq, message_irq);
+
+ for (i = 0; i < idb_vecs - 4; i++)
+ iowrite8(doorbell_irq,
+ &sndev->mmio_self_dbmsg->idb_vec_map[i]);
+
+ for (; i < idb_vecs; i++)
+ iowrite8(message_irq,
+ &sndev->mmio_self_dbmsg->idb_vec_map[i]);
+
+ sndev->doorbell_irq = pci_irq_vector(sndev->stdev->pdev, doorbell_irq);
+ sndev->message_irq = pci_irq_vector(sndev->stdev->pdev, message_irq);
+
+ rc = request_irq(sndev->doorbell_irq,
+ switchtec_ntb_doorbell_isr, 0,
+ "switchtec_ntb_doorbell", sndev);
+ if (rc)
+ return rc;
+
+ rc = request_irq(sndev->message_irq,
+ switchtec_ntb_message_isr, 0,
+ "switchtec_ntb_message", sndev);
+ if (rc) {
+ free_irq(sndev->doorbell_irq, sndev);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void switchtec_ntb_deinit_db_msg_irq(struct switchtec_ntb *sndev)
+{
+ free_irq(sndev->doorbell_irq, sndev);
+ free_irq(sndev->message_irq, sndev);
+}
+
static int switchtec_ntb_add(struct device *dev,
struct class_interface *class_intf)
{
@@ -323,6 +456,8 @@ static int switchtec_ntb_add(struct device *dev,
switchtec_ntb_init_sndev(sndev);
switchtec_ntb_init_mw(sndev);
+ switchtec_ntb_init_db(sndev);
+ switchtec_ntb_init_msgs(sndev);
rc = switchtec_ntb_init_req_id_table(sndev);
if (rc)
@@ -332,11 +467,17 @@ static int switchtec_ntb_add(struct device *dev,
if (rc)
goto free_and_exit;
+ rc = switchtec_ntb_init_db_msg_irq(sndev);
+ if (rc)
+ goto deinit_shared_and_exit;
+
stdev->sndev = sndev;
dev_info(dev, "NTB device registered");
return 0;
+deinit_shared_and_exit:
+ switchtec_ntb_deinit_shared_mw(sndev);
free_and_exit:
kfree(sndev);
dev_err(dev, "failed to register ntb device: %d", rc);
@@ -353,6 +494,7 @@ void switchtec_ntb_remove(struct device *dev,
return;
stdev->sndev = NULL;
+ switchtec_ntb_deinit_db_msg_irq(sndev);
switchtec_ntb_deinit_shared_mw(sndev);
kfree(sndev);
dev_info(dev, "ntb device unregistered");
--
2.11.0
In order for the switchtec NTB code to handle link change events we
create a notifier block in the switchtec code which gets called
whenever an appropriate event interrupt occurs.
In order to preserve userspace's ability to follow these events,
we compare the event count with a stored copy from last time we
checked.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/pci/switch/switchtec.c | 53 ++++++++++++++++++++++++++++++++++++++++++
include/linux/switchtec.h | 5 ++++
2 files changed, 58 insertions(+)
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index e9bf17b1934e..63e305b24fb9 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -972,6 +972,50 @@ static const struct file_operations switchtec_fops = {
.compat_ioctl = switchtec_dev_ioctl,
};
+static void link_event_work(struct work_struct *work)
+{
+ struct switchtec_dev *stdev;
+
+ stdev = container_of(work, struct switchtec_dev, link_event_work);
+
+ dev_dbg(&stdev->dev, "%s\n", __func__);
+
+ blocking_notifier_call_chain(&stdev->link_notifier, 0, stdev);
+}
+
+static void check_link_state_events(struct switchtec_dev *stdev)
+{
+ int idx;
+ u32 reg;
+ int count;
+ int occurred = 0;
+
+ for (idx = 0; idx < stdev->pff_csr_count; idx++) {
+ reg = ioread32(&stdev->mmio_pff_csr[idx].link_state_hdr);
+ dev_dbg(&stdev->dev, "%s: %d->%08x\n", __func__, idx, reg);
+ count = (reg >> 5) & 0xFF;
+
+ if (count != stdev->link_event_count[idx]) {
+ occurred = 1;
+ stdev->link_event_count[idx] = count;
+ }
+ }
+
+ if (occurred)
+ schedule_work(&stdev->link_event_work);
+}
+
+static void enable_link_state_events(struct switchtec_dev *stdev)
+{
+ int idx;
+
+ for (idx = 0; idx < stdev->pff_csr_count; idx++) {
+ iowrite32(SWITCHTEC_EVENT_CLEAR |
+ SWITCHTEC_EVENT_EN_IRQ,
+ &stdev->mmio_pff_csr[idx].link_state_hdr);
+ }
+}
+
static void stdev_release(struct device *dev)
{
struct switchtec_dev *stdev = to_stdev(dev);
@@ -1024,6 +1068,8 @@ static struct switchtec_dev *stdev_create(struct pci_dev *pdev)
stdev->mrpc_busy = 0;
INIT_WORK(&stdev->mrpc_work, mrpc_event_work);
INIT_DELAYED_WORK(&stdev->mrpc_timeout, mrpc_timeout_work);
+ INIT_WORK(&stdev->link_event_work, link_event_work);
+ BLOCKING_INIT_NOTIFIER_HEAD(&stdev->link_notifier);
init_waitqueue_head(&stdev->event_wq);
atomic_set(&stdev->event_cnt, 0);
@@ -1067,6 +1113,9 @@ static int mask_event(struct switchtec_dev *stdev, int eid, int idx)
if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ))
return 0;
+ if (eid == SWITCHTEC_IOCTL_EVENT_LINK_STATE)
+ return 0;
+
dev_dbg(&stdev->dev, "%s: %d %d %x\n", __func__, eid, idx, hdr);
hdr &= ~(SWITCHTEC_EVENT_EN_IRQ | SWITCHTEC_EVENT_OCCURRED);
iowrite32(hdr, hdr_reg);
@@ -1086,6 +1135,7 @@ static int mask_all_events(struct switchtec_dev *stdev, int eid)
for (idx = 0; idx < stdev->pff_csr_count; idx++) {
if (!stdev->pff_local[idx])
continue;
+
count += mask_event(stdev, eid, idx);
}
} else {
@@ -1110,6 +1160,8 @@ static irqreturn_t switchtec_event_isr(int irq, void *dev)
iowrite32(reg, &stdev->mmio_part_cfg->mrpc_comp_hdr);
}
+ check_link_state_events(stdev);
+
for (eid = 0; eid < SWITCHTEC_IOCTL_MAX_EVENTS; eid++)
event_count += mask_all_events(stdev, eid);
@@ -1236,6 +1288,7 @@ static int switchtec_pci_probe(struct pci_dev *pdev,
iowrite32(SWITCHTEC_EVENT_CLEAR |
SWITCHTEC_EVENT_EN_IRQ,
&stdev->mmio_part_cfg->mrpc_comp_hdr);
+ enable_link_state_events(stdev);
rc = cdev_device_add(&stdev->cdev, &stdev->dev);
if (rc)
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 3c6b964885e9..8d66a0659cef 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -18,6 +18,7 @@
#include <linux/pci.h>
#include <linux/cdev.h>
+#include <linux/notifier.h>
#define MICROSEMI_VENDOR_ID 0x11f8
#define MICROSEMI_NTB_CLASSCODE 0x068000
@@ -344,6 +345,10 @@ struct switchtec_dev {
wait_queue_head_t event_wq;
atomic_t event_cnt;
+
+ struct work_struct link_event_work;
+ struct blocking_notifier_head link_notifier;
+ u8 link_event_count[SWITCHTEC_MAX_PFF_CSR];
};
static inline struct switchtec_dev *to_stdev(struct device *dev)
--
2.11.0
There are two additional regions: ctrl and dbmsg. The first is
for generic ntb control and memory windows. The second for doorbells
and message registers. This patch also adds a number of related
constants for using these registers.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
include/linux/switchtec.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 3b87618fc42f..3c6b964885e9 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -147,6 +147,12 @@ struct flash_info_regs {
struct partition_info vendor[8];
};
+enum {
+ SWITCHTEC_NTB_REG_INFO_OFFSET = 0x0000,
+ SWITCHTEC_NTB_REG_CTRL_OFFSET = 0x4000,
+ SWITCHTEC_NTB_REG_DBMSG_OFFSET = 0x64000,
+};
+
struct ntb_info_regs {
u8 partition_count;
u8 partition_id;
@@ -182,6 +188,84 @@ struct part_cfg_regs {
} __packed;
enum {
+ NTB_CTRL_PART_OP_LOCK = 0x1,
+ NTB_CTRL_PART_OP_CFG = 0x2,
+ NTB_CTRL_PART_OP_RESET = 0x3,
+
+ NTB_CTRL_PART_STATUS_NORMAL = 0x1,
+ NTB_CTRL_PART_STATUS_LOCKED = 0x2,
+ NTB_CTRL_PART_STATUS_LOCKING = 0x3,
+ NTB_CTRL_PART_STATUS_CONFIGURING = 0x4,
+ NTB_CTRL_PART_STATUS_RESETTING = 0x5,
+
+ NTB_CTRL_BAR_VALID = 1 << 0,
+ NTB_CTRL_BAR_DIR_WIN_EN = 1 << 4,
+ NTB_CTRL_BAR_LUT_WIN_EN = 1 << 5,
+
+ NTB_CTRL_REQ_ID_EN = 1 << 0,
+
+ NTB_CTRL_LUT_EN = 1 << 0,
+
+ NTB_PART_CTRL_ID_PROT_DIS = 1 << 0,
+};
+
+struct ntb_ctrl_regs {
+ u32 partition_status;
+ u32 partition_op;
+ u32 partition_ctrl;
+ u32 bar_setup;
+ u32 bar_error;
+ u16 lut_table_entries;
+ u16 lut_table_offset;
+ u32 lut_error;
+ u16 req_id_table_size;
+ u16 req_id_table_offset;
+ u32 req_id_error;
+ u32 reserved1[7];
+ struct {
+ u32 ctl;
+ u32 win_size;
+ u64 xlate_addr;
+ } bar_entry[6];
+ u32 reserved2[216];
+ u32 req_id_table[256];
+ u32 reserved3[512];
+ u64 lut_entry[512];
+} __packed;
+
+#define NTB_DBMSG_IMSG_STATUS BIT_ULL(32)
+#define NTB_DBMSG_IMSG_MASK BIT_ULL(40)
+
+struct ntb_dbmsg_regs {
+ u32 reserved1[1024];
+ u64 odb;
+ u64 odb_mask;
+ u64 idb;
+ u64 idb_mask;
+ u8 idb_vec_map[64];
+ u32 msg_map;
+ u32 reserved2;
+ struct {
+ u32 msg;
+ u32 status;
+ } omsg[4];
+
+ struct {
+ u32 msg;
+ u8 status;
+ u8 mask;
+ u8 src;
+ u8 reserved;
+ } imsg[4];
+
+ u8 reserved3[3928];
+ u8 msix_table[1024];
+ u8 reserved4[3072];
+ u8 pba[24];
+ u8 reserved5[4072];
+} __packed;
+
+enum {
SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0,
SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1,
SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2,
--
2.11.0
Pretty straightforward implementation of doorbell registers.
The shift and mask were setup in an earlier patch and this just hooks
up the approprirate portion of the idb register as the local doorbells
and the opposite portion of odb as the peer doorbells. The db mask is
protected by a spinlock to avoid concurrent read-modify-write accesses.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 89 +++++++++++++++++++++++++++++++++++--
1 file changed, 85 insertions(+), 4 deletions(-)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 19fa8fb0f15c..53604c22be67 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -90,6 +90,9 @@ struct switchtec_ntb {
int db_shift;
int db_peer_shift;
+ /* synchronize rmw access of db_mask and hw reg */
+ spinlock_t db_mask_lock;
+
int nr_direct_mw;
int nr_lut_mw;
int direct_mw_to_bar[MAX_DIRECT_MW];
@@ -336,41 +339,115 @@ static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
static u64 switchtec_ntb_db_valid_mask(struct ntb_dev *ntb)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ return sndev->db_valid_mask;
}
static int switchtec_ntb_db_vector_count(struct ntb_dev *ntb)
{
- return 0;
+ return 1;
}
static u64 switchtec_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (db_vector < 0 || db_vector > 1)
+ return 0;
+
+ return sndev->db_valid_mask;
}
static u64 switchtec_ntb_db_read(struct ntb_dev *ntb)
{
- return 0;
+ u64 ret;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ ret = ioread64(&sndev->mmio_self_dbmsg->idb) >> sndev->db_shift;
+
+ return ret & sndev->db_valid_mask;
}
static int switchtec_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ iowrite64(db_bits << sndev->db_shift, &sndev->mmio_self_dbmsg->idb);
+
return 0;
}
static int switchtec_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
{
+ unsigned long irqflags;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (db_bits & ~sndev->db_valid_mask)
+ return -EINVAL;
+
+ spin_lock_irqsave(&sndev->db_mask_lock, irqflags);
+ {
+ sndev->db_mask |= db_bits << sndev->db_shift;
+ iowrite64(~sndev->db_mask, &sndev->mmio_self_dbmsg->idb_mask);
+ }
+ spin_unlock_irqrestore(&sndev->db_mask_lock, irqflags);
+
return 0;
}
static int switchtec_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
{
+ unsigned long irqflags;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (db_bits & ~sndev->db_valid_mask)
+ return -EINVAL;
+
+ spin_lock_irqsave(&sndev->db_mask_lock, irqflags);
+ {
+ sndev->db_mask &= ~(db_bits << sndev->db_shift);
+ iowrite64(~sndev->db_mask, &sndev->mmio_self_dbmsg->idb_mask);
+ }
+ spin_unlock_irqrestore(&sndev->db_mask_lock, irqflags);
+
+ return 0;
+}
+
+static u64 switchtec_ntb_db_read_mask(struct ntb_dev *ntb)
+{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ return (sndev->db_mask >> sndev->db_shift) & sndev->db_valid_mask;
+}
+
+static int switchtec_ntb_peer_db_addr(struct ntb_dev *ntb,
+ phys_addr_t *db_addr,
+ resource_size_t *db_size)
+{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+ unsigned long offset;
+
+ offset = (unsigned long)sndev->mmio_self_dbmsg->odb -
+ (unsigned long)sndev->stdev->mmio;
+
+ offset += sndev->db_shift / 8;
+
+ if (db_addr)
+ *db_addr = pci_resource_start(ntb->pdev, 0) + offset;
+ if (db_size)
+ *db_size = sizeof(u32);
+
return 0;
}
static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ iowrite64(db_bits << sndev->db_peer_shift,
+ &sndev->mmio_self_dbmsg->odb);
+
return 0;
}
@@ -408,6 +485,8 @@ static const struct ntb_dev_ops switchtec_ntb_ops = {
.db_clear = switchtec_ntb_db_clear,
.db_set_mask = switchtec_ntb_db_set_mask,
.db_clear_mask = switchtec_ntb_db_clear_mask,
+ .db_read_mask = switchtec_ntb_db_read_mask,
+ .peer_db_addr = switchtec_ntb_peer_db_addr,
.peer_db_set = switchtec_ntb_peer_db_set,
.spad_count = switchtec_ntb_spad_count,
.spad_read = switchtec_ntb_spad_read,
@@ -632,6 +711,8 @@ static irqreturn_t switchtec_ntb_doorbell_isr(int irq, void *dev)
dev_dbg(&sndev->stdev->dev, "doorbell\n");
+ ntb_db_event(&sndev->ntb, 0);
+
return IRQ_HANDLED;
}
--
2.11.0
The switchtec hardware has two types of memory windows: LUTs and Direct.
The first area in each BAR is for LUT windows and the remaining area is
for the direct region. The total number of LUT entries is set by a
configuration setting in hardware and they all must be the same
size. (This is fixed by switchtec_ntb to be 64K.)
switchtec_ntb enables the LUTs only for the first bar and enables the
highest power of two possible. Seeing the LUTs are at the beginning of
the BAR, the direct memory window's alignment is affected. Therefore,
the maximum direct memory window size can not be greater than the number
of LUTs times 64K. The direct window in other bars will not have this
restriction as the LUTs will not be enabled there. LUTs will only be
exposed through the NTB api if the use_lut_mw parameter is set.
Seeing the switchtec hardware, by default, configures BARs to be 4G a
module parameter is given to limit the size of the advertised memory
windows. Higher layers tend to allocate the maximum BAR size and this
has a tendancy of failing when they try to allocate 4GB of continuous
memory.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 197 +++++++++++++++++++++++++++++++++++-
1 file changed, 195 insertions(+), 2 deletions(-)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index aabe20388695..3c4c5acc9ae7 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -24,6 +24,16 @@ MODULE_VERSION("0.1");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Microsemi Corporation");
+static ulong max_mw_size = SZ_2M;
+module_param(max_mw_size, ulong, 0644);
+MODULE_PARM_DESC(max_mw_size,
+ "Limit the size of the memory windows reported to the upper layer");
+
+static bool use_lut_mws;
+module_param(use_lut_mws, bool, 0644);
+MODULE_PARM_DESC(use_lut_mws,
+ "Enable the use of the LUT based memory windows");
+
#ifndef ioread64
#ifdef readq
#define ioread64 readq
@@ -175,6 +185,85 @@ static int switchtec_ntb_send_msg(struct switchtec_ntb *sndev, int idx,
static int switchtec_ntb_mw_count(struct ntb_dev *ntb)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (use_lut_mws)
+ return sndev->nr_direct_mw + sndev->nr_lut_mw - 1;
+ else
+ return sndev->nr_direct_mw;
+}
+
+static int lut_index(struct switchtec_ntb *sndev, int mw_idx)
+{
+ return mw_idx - sndev->nr_direct_mw + 1;
+}
+
+static int switchtec_ntb_mw_direct_get_range(struct switchtec_ntb *sndev,
+ int idx, phys_addr_t *base,
+ resource_size_t *size,
+ resource_size_t *align,
+ resource_size_t *align_size)
+{
+ int bar = sndev->direct_mw_to_bar[idx];
+ size_t offset = 0;
+
+ if (bar < 0)
+ return -EINVAL;
+
+ if (idx == 0) {
+ /*
+ * This is the direct BAR shared with the LUTs
+ * which means the actual window will be offset
+ * by the size of all the LUT entries.
+ */
+
+ offset = LUT_SIZE * sndev->nr_lut_mw;
+ }
+
+ if (base)
+ *base = pci_resource_start(sndev->ntb.pdev, bar) + offset;
+
+ if (size) {
+ *size = pci_resource_len(sndev->ntb.pdev, bar) - offset;
+ if (offset && *size > offset)
+ *size = offset;
+
+ if (*size > max_mw_size)
+ *size = max_mw_size;
+ }
+
+ if (align)
+ *align = SZ_4K;
+
+ if (align_size)
+ *align_size = SZ_4K;
+
+ return 0;
+}
+
+static int switchtec_ntb_mw_lut_get_range(struct switchtec_ntb *sndev,
+ int idx, phys_addr_t *base,
+ resource_size_t *size,
+ resource_size_t *align,
+ resource_size_t *align_size)
+{
+ int bar = sndev->direct_mw_to_bar[0];
+ int offset;
+
+ offset = LUT_SIZE * lut_index(sndev, idx);
+
+ if (base)
+ *base = pci_resource_start(sndev->ntb.pdev, bar) + offset;
+
+ if (size)
+ *size = LUT_SIZE;
+
+ if (align)
+ *align = LUT_SIZE;
+
+ if (align_size)
+ *align_size = LUT_SIZE;
+
return 0;
}
@@ -184,13 +273,117 @@ static int switchtec_ntb_mw_get_range(struct ntb_dev *ntb, int idx,
resource_size_t *align,
resource_size_t *align_size)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (idx < sndev->nr_direct_mw)
+ return switchtec_ntb_mw_direct_get_range(sndev, idx, base,
+ size, align,
+ align_size);
+ else if (idx < switchtec_ntb_mw_count(ntb))
+ return switchtec_ntb_mw_lut_get_range(sndev, idx, base,
+ size, align,
+ align_size);
+ else
+ return -EINVAL;
+}
+
+static void switchtec_ntb_mw_clr_direct(struct switchtec_ntb *sndev, int idx)
+{
+ struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+ int bar = sndev->direct_mw_to_bar[idx];
+ u32 ctl_val;
+
+ ctl_val = ioread32(&ctl->bar_entry[bar].ctl);
+ ctl_val &= ~NTB_CTRL_BAR_DIR_WIN_EN;
+ iowrite32(ctl_val, &ctl->bar_entry[bar].ctl);
+ iowrite32(0, &ctl->bar_entry[bar].win_size);
+ iowrite64(sndev->self_partition, &ctl->bar_entry[bar].xlate_addr);
+}
+
+static void switchtec_ntb_mw_clr_lut(struct switchtec_ntb *sndev, int idx)
+{
+ struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+
+ iowrite64(0, &ctl->lut_entry[lut_index(sndev, idx)]);
+}
+
+static void switchtec_ntb_mw_set_direct(struct switchtec_ntb *sndev, int idx,
+ dma_addr_t addr, resource_size_t size)
+{
+ int xlate_pos = ilog2(size);
+ int bar = sndev->direct_mw_to_bar[idx];
+ struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+ u32 ctl_val;
+
+ ctl_val = ioread32(&ctl->bar_entry[bar].ctl);
+ ctl_val |= NTB_CTRL_BAR_DIR_WIN_EN;
+
+ iowrite32(ctl_val, &ctl->bar_entry[bar].ctl);
+ iowrite32(xlate_pos | size, &ctl->bar_entry[bar].win_size);
+ iowrite64(sndev->self_partition | addr,
+ &ctl->bar_entry[bar].xlate_addr);
+}
+
+static void switchtec_ntb_mw_set_lut(struct switchtec_ntb *sndev, int idx,
+ dma_addr_t addr, resource_size_t size)
+{
+ struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+
+ iowrite64((NTB_CTRL_LUT_EN | (sndev->self_partition << 1) | addr),
+ &ctl->lut_entry[lut_index(sndev, idx)]);
}
static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
dma_addr_t addr, resource_size_t size)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+ struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+ int xlate_pos = ilog2(size);
+ int rc;
+
+ dev_dbg(&sndev->stdev->dev, "MW %d: %pad size %pap", idx, &addr, &size);
+
+ if (idx >= switchtec_ntb_mw_count(ntb))
+ return -EINVAL;
+
+ if (xlate_pos < 12)
+ return -EINVAL;
+
+ rc = switchtec_ntb_part_op(sndev, ctl, NTB_CTRL_PART_OP_LOCK,
+ NTB_CTRL_PART_STATUS_LOCKED);
+ if (rc)
+ return rc;
+
+ if (addr == 0 || size == 0) {
+ if (idx < sndev->nr_direct_mw)
+ switchtec_ntb_mw_clr_direct(sndev, idx);
+ else
+ switchtec_ntb_mw_clr_lut(sndev, idx);
+ } else {
+ if (idx < sndev->nr_direct_mw)
+ switchtec_ntb_mw_set_direct(sndev, idx, addr, size);
+ else
+ switchtec_ntb_mw_set_lut(sndev, idx, addr, size);
+ }
+
+ rc = switchtec_ntb_part_op(sndev, ctl, NTB_CTRL_PART_OP_CFG,
+ NTB_CTRL_PART_STATUS_NORMAL);
+
+ if (rc == -EIO) {
+ dev_err(&sndev->stdev->dev,
+ "Hardware reported an error configuring mw %d: %08x",
+ idx, ioread32(&ctl->bar_error));
+
+ if (idx < sndev->nr_direct_mw)
+ switchtec_ntb_mw_clr_direct(sndev, idx);
+ else
+ switchtec_ntb_mw_clr_lut(sndev, idx);
+
+ switchtec_ntb_part_op(sndev, ctl, NTB_CTRL_PART_OP_CFG,
+ NTB_CTRL_PART_STATUS_NORMAL);
+ }
+
+ return rc;
}
static void switchtec_ntb_part_link_speed(struct switchtec_ntb *sndev,
--
2.11.0
Seeing there is no dedicated hardware for this, we simply add
these as entries in the shared memory window. Thus, we could support
any number of them but 128 seems like enough, for now.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 65 +++++++++++++++++++++++++++++++++++--
1 file changed, 63 insertions(+), 2 deletions(-)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 53604c22be67..aabe20388695 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -59,6 +59,7 @@ struct shared_mw {
u32 magic;
u32 link_sta;
u32 partition_id;
+ u32 spad[128];
};
#define MAX_DIRECT_MW ARRAY_SIZE(((struct ntb_ctrl_regs *)(0))->bar_entry)
@@ -453,21 +454,79 @@ static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
static int switchtec_ntb_spad_count(struct ntb_dev *ntb)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ return ARRAY_SIZE(sndev->self_shared->spad);
}
static u32 switchtec_ntb_spad_read(struct ntb_dev *ntb, int idx)
{
- return 0;
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (idx < 0 || idx >= ARRAY_SIZE(sndev->self_shared->spad))
+ return 0;
+
+ if (!sndev->self_shared)
+ return 0;
+
+ return sndev->self_shared->spad[idx];
}
static int switchtec_ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (idx < 0 || idx >= ARRAY_SIZE(sndev->self_shared->spad))
+ return -EINVAL;
+
+ if (!sndev->self_shared)
+ return -EIO;
+
+ sndev->self_shared->spad[idx] = val;
+
return 0;
}
+static u32 switchtec_ntb_peer_spad_read(struct ntb_dev *ntb, int idx)
+{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (idx < 0 || idx >= ARRAY_SIZE(sndev->peer_shared->spad))
+ return 0;
+
+ if (!sndev->peer_shared)
+ return 0;
+
+ return ioread32(&sndev->peer_shared->spad[idx]);
+}
+
static int switchtec_ntb_peer_spad_write(struct ntb_dev *ntb, int idx, u32 val)
{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+ if (idx < 0 || idx >= ARRAY_SIZE(sndev->peer_shared->spad))
+ return -EINVAL;
+
+ if (!sndev->peer_shared)
+ return -EIO;
+
+ iowrite32(val, &sndev->peer_shared->spad[idx]);
+
+ return 0;
+}
+
+static int switchtec_ntb_peer_spad_addr(struct ntb_dev *ntb, int idx,
+ phys_addr_t *spad_addr)
+{
+ struct switchtec_ntb *sndev = ntb_sndev(ntb);
+ unsigned long offset;
+
+ offset = (unsigned long)&sndev->peer_shared->spad[idx] -
+ (unsigned long)sndev->stdev->mmio;
+
+ if (spad_addr)
+ *spad_addr = pci_resource_start(ntb->pdev, 0) + offset;
+
return 0;
}
@@ -491,7 +550,9 @@ static const struct ntb_dev_ops switchtec_ntb_ops = {
.spad_count = switchtec_ntb_spad_count,
.spad_read = switchtec_ntb_spad_read,
.spad_write = switchtec_ntb_spad_write,
+ .peer_spad_read = switchtec_ntb_peer_spad_read,
.peer_spad_write = switchtec_ntb_peer_spad_write,
+ .peer_spad_addr = switchtec_ntb_peer_spad_addr,
};
static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev)
--
2.11.0
This commit adds the code to initialize the memory windows in the
hardware. This includes setting up the requester ID table, and
figuring out which bar corresponds to which memory window. (Seeing
the switch can be configured with any number of bars.)
Also, seeing the device doesn't have hardware for scratchpads or
determining the link status, we create a shared memory window that has
these features. A magic number with a version copmonent will be used
to determine if the otherside's driver is actually up.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 296 ++++++++++++++++++++++++++++++++++++
1 file changed, 296 insertions(+)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 1f094216aa1c..756307d1a8a3 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -15,37 +15,332 @@
#include <linux/switchtec.h>
#include <linux/module.h>
+#include <linux/delay.h>
MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
MODULE_VERSION("0.1");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Microsemi Corporation");
+#ifndef ioread64
+#ifdef readq
+#define ioread64 readq
+#else
+#define ioread64 _ioread64
+static inline u64 _ioread64(void __iomem *mmio)
+{
+ u64 low, high;
+
+ low = ioread32(mmio);
+ high = ioread32(mmio + sizeof(u32));
+ return low | (high << 32);
+}
+#endif
+#endif
+
+#ifndef iowrite64
+#ifdef writeq
+#define iowrite64 writeq
+#else
+#define iowrite64 _iowrite64
+static inline void _iowrite64(u64 val, void __iomem *mmio)
+{
+ iowrite32(val, mmio);
+ iowrite32(val >> 32, mmio + sizeof(u32));
+}
+#endif
+#endif
+
+#define SWITCHTEC_NTB_MAGIC 0x45CC0001
+
+struct shared_mw {
+ u32 magic;
+ u32 partition_id;
+};
+
+#define MAX_DIRECT_MW ARRAY_SIZE(((struct ntb_ctrl_regs *)(0))->bar_entry)
+#define LUT_SIZE SZ_64K
+
struct switchtec_ntb {
struct switchtec_dev *stdev;
+
+ int self_partition;
+ int peer_partition;
+
+ struct ntb_info_regs __iomem *mmio_ntb;
+ struct ntb_ctrl_regs __iomem *mmio_ctrl;
+ struct ntb_dbmsg_regs __iomem *mmio_dbmsg;
+ struct ntb_ctrl_regs __iomem *mmio_self_ctrl;
+ struct ntb_ctrl_regs __iomem *mmio_peer_ctrl;
+ struct ntb_dbmsg_regs __iomem *mmio_self_dbmsg;
+
+ struct shared_mw *self_shared;
+ struct shared_mw __iomem *peer_shared;
+ dma_addr_t self_shared_dma;
+
+ int nr_direct_mw;
+ int nr_lut_mw;
+ int direct_mw_to_bar[MAX_DIRECT_MW];
};
+static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
+ struct ntb_ctrl_regs __iomem *ctl,
+ u32 op, int wait_status)
+{
+ static const char * const op_text[] = {
+ [NTB_CTRL_PART_OP_LOCK] = "lock",
+ [NTB_CTRL_PART_OP_CFG] = "configure",
+ [NTB_CTRL_PART_OP_RESET] = "reset",
+ };
+
+ int i;
+ u32 ps;
+ int status;
+
+ switch (op) {
+ case NTB_CTRL_PART_OP_LOCK:
+ status = NTB_CTRL_PART_STATUS_LOCKING;
+ break;
+ case NTB_CTRL_PART_OP_CFG:
+ status = NTB_CTRL_PART_STATUS_CONFIGURING;
+ break;
+ case NTB_CTRL_PART_OP_RESET:
+ status = NTB_CTRL_PART_STATUS_RESETTING;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ iowrite32(op, &ctl->partition_op);
+
+ for (i = 0; i < 1000; i++) {
+ mdelay(50);
+ ps = ioread32(&ctl->partition_status) & 0xFFFF;
+
+ if (ps != status)
+ break;
+ }
+
+ if (ps == wait_status)
+ return 0;
+
+ if (ps == status) {
+ dev_err(&sndev->stdev->dev,
+ "Timed out while peforming %s (%d). (%08x)",
+ op_text[op], op,
+ ioread32(&ctl->partition_status));
+
+ return -ETIMEDOUT;
+ }
+
+ return -EIO;
+}
+
+static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev)
+{
+ u64 part_map;
+
+ sndev->self_partition = sndev->stdev->partition;
+
+ sndev->mmio_ntb = sndev->stdev->mmio_ntb;
+ part_map = ioread64(&sndev->mmio_ntb->ep_map);
+ part_map &= ~(1 << sndev->self_partition);
+ sndev->peer_partition = ffs(part_map) - 1;
+
+ dev_dbg(&sndev->stdev->dev, "Partition ID %d of %d (%llx)",
+ sndev->self_partition, sndev->stdev->partition_count,
+ part_map);
+
+ sndev->mmio_ctrl = (void * __iomem)sndev->mmio_ntb +
+ SWITCHTEC_NTB_REG_CTRL_OFFSET;
+ sndev->mmio_dbmsg = (void * __iomem)sndev->mmio_ntb +
+ SWITCHTEC_NTB_REG_DBMSG_OFFSET;
+
+ sndev->mmio_self_ctrl = &sndev->mmio_ctrl[sndev->self_partition];
+ sndev->mmio_peer_ctrl = &sndev->mmio_ctrl[sndev->peer_partition];
+ sndev->mmio_self_dbmsg = &sndev->mmio_dbmsg[sndev->self_partition];
+}
+
+static void switchtec_ntb_init_mw(struct switchtec_ntb *sndev)
+{
+ int i;
+
+ sndev->nr_direct_mw = 0;
+ for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_ctrl->bar_entry); i++) {
+ u32 r = ioread32(&sndev->mmio_self_ctrl->bar_entry[i].ctl);
+
+ if (r & NTB_CTRL_BAR_VALID)
+ sndev->direct_mw_to_bar[sndev->nr_direct_mw++] = i;
+ }
+
+ sndev->nr_lut_mw = ioread16(&sndev->mmio_self_ctrl->lut_table_entries);
+ sndev->nr_lut_mw = rounddown_pow_of_two(sndev->nr_lut_mw);
+
+ dev_dbg(&sndev->stdev->dev, "MWs: %d direct, %d lut",
+ sndev->nr_direct_mw, sndev->nr_lut_mw);
+}
+
+static int switchtec_ntb_init_req_id_table(struct switchtec_ntb *sndev)
+{
+ int rc = 0;
+ u16 req_id = ioread16(&sndev->mmio_ntb->requester_id);
+ u32 error;
+
+ if (ioread32(&sndev->mmio_self_ctrl->req_id_table_size) < 2) {
+ dev_err(&sndev->stdev->dev,
+ "Not enough requester IDs available.");
+ return -EFAULT;
+ }
+
+ rc = switchtec_ntb_part_op(sndev, sndev->mmio_self_ctrl,
+ NTB_CTRL_PART_OP_LOCK,
+ NTB_CTRL_PART_STATUS_LOCKED);
+ if (rc)
+ return rc;
+
+ iowrite32(NTB_PART_CTRL_ID_PROT_DIS,
+ &sndev->mmio_self_ctrl->partition_ctrl);
+
+ // Root Complex Requester ID
+ iowrite32(0 << 16 | NTB_CTRL_REQ_ID_EN,
+ &sndev->mmio_self_ctrl->req_id_table[0]);
+
+ // Host Bridge Requester ID
+ iowrite32(req_id << 16 | NTB_CTRL_REQ_ID_EN,
+ &sndev->mmio_self_ctrl->req_id_table[1]);
+
+ rc = switchtec_ntb_part_op(sndev, sndev->mmio_self_ctrl,
+ NTB_CTRL_PART_OP_CFG,
+ NTB_CTRL_PART_STATUS_NORMAL);
+
+ if (rc == -EIO) {
+ error = ioread32(&sndev->mmio_self_ctrl->req_id_error);
+ dev_err(&sndev->stdev->dev,
+ "Error setting up the requester ID table: %08x",
+ error);
+ }
+
+ return rc;
+}
+
+static int switchtec_ntb_init_shared_mw(struct switchtec_ntb *sndev)
+{
+ struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+ int bar = sndev->direct_mw_to_bar[0];
+ u32 ctl_val;
+ int rc;
+
+ sndev->self_shared = dma_zalloc_coherent(&sndev->stdev->pdev->dev,
+ LUT_SIZE,
+ &sndev->self_shared_dma,
+ GFP_KERNEL);
+ if (!sndev->self_shared) {
+ dev_err(&sndev->stdev->dev,
+ "unable to allocate memory for shared mw");
+ return -ENOMEM;
+ }
+
+ memset(sndev->self_shared, 0, LUT_SIZE);
+ sndev->self_shared->magic = SWITCHTEC_NTB_MAGIC;
+ sndev->self_shared->partition_id = sndev->stdev->partition;
+
+ rc = switchtec_ntb_part_op(sndev, ctl, NTB_CTRL_PART_OP_LOCK,
+ NTB_CTRL_PART_STATUS_LOCKED);
+ if (rc)
+ goto unalloc_and_exit;
+
+ ctl_val = ioread32(&ctl->bar_entry[bar].ctl);
+ ctl_val |= NTB_CTRL_BAR_LUT_WIN_EN;
+ ctl_val &= 0xFF;
+ ctl_val |= ilog2(LUT_SIZE) << 8;
+ ctl_val |= (sndev->nr_lut_mw - 1) << 14;
+ iowrite32(ctl_val, &ctl->bar_entry[bar].ctl);
+
+ iowrite64((NTB_CTRL_LUT_EN | (sndev->self_partition << 1) |
+ sndev->self_shared_dma),
+ &ctl->lut_entry[0]);
+
+ rc = switchtec_ntb_part_op(sndev, ctl, NTB_CTRL_PART_OP_CFG,
+ NTB_CTRL_PART_STATUS_NORMAL);
+ if (rc) {
+ u32 bar_error, lut_error;
+
+ bar_error = ioread32(&ctl->bar_error);
+ lut_error = ioread32(&ctl->lut_error);
+ dev_err(&sndev->stdev->dev,
+ "Error setting up shared MW: %08x / %08x",
+ bar_error, lut_error);
+ goto unalloc_and_exit;
+ }
+
+ sndev->peer_shared = pci_iomap(sndev->stdev->pdev, bar, LUT_SIZE);
+ if (!sndev->peer_shared) {
+ rc = -ENOMEM;
+ goto unalloc_and_exit;
+ }
+
+ dev_dbg(&sndev->stdev->dev, "Shared MW Ready");
+ return 0;
+
+unalloc_and_exit:
+ dma_free_coherent(&sndev->stdev->pdev->dev, LUT_SIZE,
+ sndev->self_shared, sndev->self_shared_dma);
+
+ return rc;
+}
+
+static void switchtec_ntb_deinit_shared_mw(struct switchtec_ntb *sndev)
+{
+ if (sndev->peer_shared)
+ pci_iounmap(sndev->stdev->pdev, sndev->peer_shared);
+
+ if (sndev->self_shared)
+ dma_free_coherent(&sndev->stdev->pdev->dev, LUT_SIZE,
+ sndev->self_shared,
+ sndev->self_shared_dma);
+}
+
static int switchtec_ntb_add(struct device *dev,
struct class_interface *class_intf)
{
struct switchtec_dev *stdev = to_stdev(dev);
struct switchtec_ntb *sndev;
+ int rc;
stdev->sndev = NULL;
if (stdev->pdev->class != MICROSEMI_NTB_CLASSCODE)
return -ENODEV;
+ if (stdev->partition_count != 2)
+ dev_warn(dev, "ntb driver only supports 2 partitions");
+
sndev = kzalloc_node(sizeof(*sndev), GFP_KERNEL, dev_to_node(dev));
if (!sndev)
return -ENOMEM;
sndev->stdev = stdev;
+ switchtec_ntb_init_sndev(sndev);
+ switchtec_ntb_init_mw(sndev);
+
+ rc = switchtec_ntb_init_req_id_table(sndev);
+ if (rc)
+ goto free_and_exit;
+
+ rc = switchtec_ntb_init_shared_mw(sndev);
+ if (rc)
+ goto free_and_exit;
+
stdev->sndev = sndev;
dev_info(dev, "NTB device registered");
return 0;
+
+free_and_exit:
+ kfree(sndev);
+ dev_err(dev, "failed to register ntb device: %d", rc);
+ return rc;
}
void switchtec_ntb_remove(struct device *dev,
@@ -58,6 +353,7 @@ void switchtec_ntb_remove(struct device *dev,
return;
stdev->sndev = NULL;
+ switchtec_ntb_deinit_shared_mw(sndev);
kfree(sndev);
dev_info(dev, "ntb device unregistered");
}
--
2.11.0
The switchtec_ntb driver has a couple requirements on the switchec's
hardware configuration so we add these notes to the documentation.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
Documentation/switchtec.txt | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
index a0a9c7b3d4d5..978a346e1fb3 100644
--- a/Documentation/switchtec.txt
+++ b/Documentation/switchtec.txt
@@ -78,3 +78,15 @@ The following IOCTLs are also supported by the device:
between PCI Function Framework number (used by the event system)
and Switchtec Logic Port ID and Partition number (which is more
user friendly).
+
+
+Non-Transparent Bridge (NTB) Driver
+===================================
+
+An NTB driver is provided for the switchtec hardware in switchec_ntb.
+Currently, it only supports switches configured with exactly 2
+partitions. It also requires the following configuration settings:
+
+* Both partitions must be able to access each other's GAS spaces.
+ Thus, the bits in the GAS Access Vector under Management Settings
+ must be set to support this.
--
2.11.0
Seeing the switchtec NTB hardware shares the same endpoint as the
management endpoint we utilize the class_interface api to register
an NTB driver for every switchtec device in the system that has the
NTB class code.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
MAINTAINERS | 1 +
drivers/ntb/hw/Kconfig | 1 +
drivers/ntb/hw/Makefile | 1 +
drivers/ntb/hw/mscc/Kconfig | 9 +++++
drivers/ntb/hw/mscc/Makefile | 1 +
drivers/ntb/hw/mscc/switchtec_ntb.c | 81 +++++++++++++++++++++++++++++++++++++
include/linux/switchtec.h | 4 ++
7 files changed, 98 insertions(+)
create mode 100644 drivers/ntb/hw/mscc/Kconfig
create mode 100644 drivers/ntb/hw/mscc/Makefile
create mode 100644 drivers/ntb/hw/mscc/switchtec_ntb.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 6cee5e253ec3..fbd88be1be03 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9842,6 +9842,7 @@ F: Documentation/ABI/testing/sysfs-class-switchtec
F: drivers/pci/switch/switchtec*
F: include/uapi/linux/switchtec_ioctl.h
F: include/linux/switchtec.h
+F: drivers/ntb/hw/mscc/
PCI DRIVER FOR NVIDIA TEGRA
M: Thierry Reding <[email protected]>
diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig
index 7116472b4625..1d1360db87e1 100644
--- a/drivers/ntb/hw/Kconfig
+++ b/drivers/ntb/hw/Kconfig
@@ -1,2 +1,3 @@
source "drivers/ntb/hw/amd/Kconfig"
source "drivers/ntb/hw/intel/Kconfig"
+source "drivers/ntb/hw/mscc/Kconfig"
diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile
index 532e0859b4a1..690c00274174 100644
--- a/drivers/ntb/hw/Makefile
+++ b/drivers/ntb/hw/Makefile
@@ -1,2 +1,3 @@
obj-$(CONFIG_NTB_AMD) += amd/
obj-$(CONFIG_NTB_INTEL) += intel/
+obj-$(CONFIG_NTB_SWITCHTEC) += mscc/
diff --git a/drivers/ntb/hw/mscc/Kconfig b/drivers/ntb/hw/mscc/Kconfig
new file mode 100644
index 000000000000..013ed6716438
--- /dev/null
+++ b/drivers/ntb/hw/mscc/Kconfig
@@ -0,0 +1,9 @@
+config NTB_SWITCHTEC
+ tristate "MicroSemi Switchtec Non-Transparent Bridge Support"
+ select PCI_SW_SWITCHTEC
+ help
+ Enables NTB support for Switchtec PCI switches. This also
+ selects the Switchtec management driver as they share the same
+ hardware interface.
+
+ If unsure, say N.
diff --git a/drivers/ntb/hw/mscc/Makefile b/drivers/ntb/hw/mscc/Makefile
new file mode 100644
index 000000000000..21907b364e19
--- /dev/null
+++ b/drivers/ntb/hw/mscc/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_SWITCHTEC) += switchtec_ntb.o
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
new file mode 100644
index 000000000000..1f094216aa1c
--- /dev/null
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -0,0 +1,81 @@
+/*
+ * Microsemi Switchtec(tm) PCIe Management Driver
+ * Copyright (c) 2017, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/switchtec.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
+MODULE_VERSION("0.1");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Microsemi Corporation");
+
+struct switchtec_ntb {
+ struct switchtec_dev *stdev;
+};
+
+static int switchtec_ntb_add(struct device *dev,
+ struct class_interface *class_intf)
+{
+ struct switchtec_dev *stdev = to_stdev(dev);
+ struct switchtec_ntb *sndev;
+
+ stdev->sndev = NULL;
+
+ if (stdev->pdev->class != MICROSEMI_NTB_CLASSCODE)
+ return -ENODEV;
+
+ sndev = kzalloc_node(sizeof(*sndev), GFP_KERNEL, dev_to_node(dev));
+ if (!sndev)
+ return -ENOMEM;
+
+ sndev->stdev = stdev;
+
+ stdev->sndev = sndev;
+ dev_info(dev, "NTB device registered");
+
+ return 0;
+}
+
+void switchtec_ntb_remove(struct device *dev,
+ struct class_interface *class_intf)
+{
+ struct switchtec_dev *stdev = to_stdev(dev);
+ struct switchtec_ntb *sndev = stdev->sndev;
+
+ if (!sndev)
+ return;
+
+ stdev->sndev = NULL;
+ kfree(sndev);
+ dev_info(dev, "ntb device unregistered");
+}
+
+static struct class_interface switchtec_interface = {
+ .class = &switchtec_class,
+ .add_dev = switchtec_ntb_add,
+ .remove_dev = switchtec_ntb_remove,
+};
+
+static int __init switchtec_ntb_init(void)
+{
+ return class_interface_register(&switchtec_interface);
+}
+module_init(switchtec_ntb_init);
+
+static void __exit switchtec_ntb_exit(void)
+{
+ class_interface_unregister(&switchtec_interface);
+}
+module_exit(switchtec_ntb_exit);
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 8d66a0659cef..0a7d0ed77507 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -312,6 +312,8 @@ struct pff_csr_regs {
u32 reserved4[174];
} __packed;
+struct switchtec_ntb;
+
struct switchtec_dev {
struct pci_dev *pdev;
struct device dev;
@@ -349,6 +351,8 @@ struct switchtec_dev {
struct work_struct link_event_work;
struct blocking_notifier_head link_notifier;
u8 link_event_count[SWITCHTEC_MAX_PFF_CSR];
+
+ struct switchtec_ntb *sndev;
};
static inline struct switchtec_dev *to_stdev(struct device *dev)
--
2.11.0
We switch to class_register/unregister and a declared class which
is exported for use in the switchtec_ntb driver.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/pci/switch/switchtec.c | 21 +++++++++++----------
include/linux/switchtec.h | 2 ++
2 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index c4369ba7bbc1..e9bf17b1934e 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -21,8 +21,6 @@
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/poll.h>
-#include <linux/pci.h>
-#include <linux/cdev.h>
#include <linux/wait.h>
MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
@@ -35,9 +33,14 @@ module_param(max_devices, int, 0644);
MODULE_PARM_DESC(max_devices, "max number of switchtec device instances");
static dev_t switchtec_devt;
-static struct class *switchtec_class;
static DEFINE_IDA(switchtec_minor_ida);
+struct class switchtec_class = {
+ .owner = THIS_MODULE,
+ .name = "switchtec",
+};
+EXPORT_SYMBOL(switchtec_class);
+
enum mrpc_state {
MRPC_IDLE = 0,
MRPC_QUEUED,
@@ -1026,7 +1029,7 @@ static struct switchtec_dev *stdev_create(struct pci_dev *pdev)
dev = &stdev->dev;
device_initialize(dev);
- dev->class = switchtec_class;
+ dev->class = &switchtec_class;
dev->parent = &pdev->dev;
dev->groups = switchtec_device_groups;
dev->release = stdev_release;
@@ -1313,11 +1316,9 @@ static int __init switchtec_init(void)
if (rc)
return rc;
- switchtec_class = class_create(THIS_MODULE, "switchtec");
- if (IS_ERR(switchtec_class)) {
- rc = PTR_ERR(switchtec_class);
+ rc = class_register(&switchtec_class);
+ if (rc)
goto err_create_class;
- }
rc = pci_register_driver(&switchtec_pci_driver);
if (rc)
@@ -1328,7 +1329,7 @@ static int __init switchtec_init(void)
return 0;
err_pci_register:
- class_destroy(switchtec_class);
+ class_unregister(&switchtec_class);
err_create_class:
unregister_chrdev_region(switchtec_devt, max_devices);
@@ -1340,7 +1341,7 @@ module_init(switchtec_init);
static void __exit switchtec_exit(void)
{
pci_unregister_driver(&switchtec_pci_driver);
- class_destroy(switchtec_class);
+ class_unregister(&switchtec_class);
unregister_chrdev_region(switchtec_devt, max_devices);
ida_destroy(&switchtec_minor_ida);
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 508cda78a430..3b87618fc42f 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -267,4 +267,6 @@ static inline struct switchtec_dev *to_stdev(struct device *dev)
return container_of(dev, struct switchtec_dev, dev);
}
+extern struct class switchtec_class;
+
#endif
--
2.11.0
This patch simply adds a skeleton NTB driver which will be filled
out in subsequent patches.
Signed-off-by: Logan Gunthorpe <[email protected]>
Reviewed-by: Stephen Bates <[email protected]>
Reviewed-by: Kurt Schwemmer <[email protected]>
---
drivers/ntb/hw/mscc/switchtec_ntb.c | 134 +++++++++++++++++++++++++++++++++++-
include/linux/ntb.h | 3 +
2 files changed, 136 insertions(+), 1 deletion(-)
diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 6c9b4e337cdf..dadbf3c38d0e 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
+#include <linux/ntb.h>
MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
MODULE_VERSION("0.1");
@@ -63,6 +64,7 @@ struct shared_mw {
#define LUT_SIZE SZ_64K
struct switchtec_ntb {
+ struct ntb_dev ntb;
struct switchtec_dev *stdev;
int self_partition;
@@ -145,10 +147,134 @@ static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
return -EIO;
}
+static int switchtec_ntb_mw_count(struct ntb_dev *ntb)
+{
+ return 0;
+}
+
+static int switchtec_ntb_mw_get_range(struct ntb_dev *ntb, int idx,
+ phys_addr_t *base,
+ resource_size_t *size,
+ resource_size_t *align,
+ resource_size_t *align_size)
+{
+ return 0;
+}
+
+static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
+ dma_addr_t addr, resource_size_t size)
+{
+ return 0;
+}
+
+static int switchtec_ntb_link_is_up(struct ntb_dev *ntb,
+ enum ntb_speed *speed,
+ enum ntb_width *width)
+{
+ return 0;
+}
+
+static int switchtec_ntb_link_enable(struct ntb_dev *ntb,
+ enum ntb_speed max_speed,
+ enum ntb_width max_width)
+{
+ return 0;
+}
+
+static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
+{
+ return 0;
+}
+
+static u64 switchtec_ntb_db_valid_mask(struct ntb_dev *ntb)
+{
+ return 0;
+}
+
+static int switchtec_ntb_db_vector_count(struct ntb_dev *ntb)
+{
+ return 0;
+}
+
+static u64 switchtec_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
+{
+ return 0;
+}
+
+static u64 switchtec_ntb_db_read(struct ntb_dev *ntb)
+{
+ return 0;
+}
+
+static int switchtec_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+ return 0;
+}
+
+static int switchtec_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+ return 0;
+}
+
+static int switchtec_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+ return 0;
+}
+
+static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+ return 0;
+}
+
+static int switchtec_ntb_spad_count(struct ntb_dev *ntb)
+{
+ return 0;
+}
+
+static u32 switchtec_ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+ return 0;
+}
+
+static int switchtec_ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+ return 0;
+}
+
+static int switchtec_ntb_peer_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+ return 0;
+}
+
+static const struct ntb_dev_ops switchtec_ntb_ops = {
+ .mw_count = switchtec_ntb_mw_count,
+ .mw_get_range = switchtec_ntb_mw_get_range,
+ .mw_set_trans = switchtec_ntb_mw_set_trans,
+ .link_is_up = switchtec_ntb_link_is_up,
+ .link_enable = switchtec_ntb_link_enable,
+ .link_disable = switchtec_ntb_link_disable,
+ .db_valid_mask = switchtec_ntb_db_valid_mask,
+ .db_vector_count = switchtec_ntb_db_vector_count,
+ .db_vector_mask = switchtec_ntb_db_vector_mask,
+ .db_read = switchtec_ntb_db_read,
+ .db_clear = switchtec_ntb_db_clear,
+ .db_set_mask = switchtec_ntb_db_set_mask,
+ .db_clear_mask = switchtec_ntb_db_clear_mask,
+ .peer_db_set = switchtec_ntb_peer_db_set,
+ .spad_count = switchtec_ntb_spad_count,
+ .spad_read = switchtec_ntb_spad_read,
+ .spad_write = switchtec_ntb_spad_write,
+ .peer_spad_write = switchtec_ntb_peer_spad_write,
+};
+
static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev)
{
u64 part_map;
+ sndev->ntb.pdev = sndev->stdev->pdev;
+ sndev->ntb.topo = NTB_TOPO_SWITCH;
+ sndev->ntb.ops = &switchtec_ntb_ops;
+
sndev->self_partition = sndev->stdev->partition;
sndev->mmio_ntb = sndev->stdev->mmio_ntb;
@@ -453,7 +579,6 @@ static int switchtec_ntb_add(struct device *dev,
return -ENOMEM;
sndev->stdev = stdev;
-
switchtec_ntb_init_sndev(sndev);
switchtec_ntb_init_mw(sndev);
switchtec_ntb_init_db(sndev);
@@ -471,11 +596,17 @@ static int switchtec_ntb_add(struct device *dev,
if (rc)
goto deinit_shared_and_exit;
+ rc = ntb_register_device(&sndev->ntb);
+ if (rc)
+ goto deinit_and_exit;
+
stdev->sndev = sndev;
dev_info(dev, "NTB device registered");
return 0;
+deinit_and_exit:
+ switchtec_ntb_deinit_db_msg_irq(sndev);
deinit_shared_and_exit:
switchtec_ntb_deinit_shared_mw(sndev);
free_and_exit:
@@ -494,6 +625,7 @@ void switchtec_ntb_remove(struct device *dev,
return;
stdev->sndev = NULL;
+ ntb_unregister_device(&sndev->ntb);
switchtec_ntb_deinit_db_msg_irq(sndev);
switchtec_ntb_deinit_shared_mw(sndev);
kfree(sndev);
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index de87ceac110e..ae351f0bd4ae 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -68,6 +68,7 @@ struct pci_dev;
* @NTB_TOPO_SEC: On secondary side of remote ntb.
* @NTB_TOPO_B2B_USD: On primary side of local ntb upstream of remote ntb.
* @NTB_TOPO_B2B_DSD: On primary side of local ntb downstream of remote ntb.
+ * @NTB_TOPO_SWITCH: Connected via a switch which supports ntb.
*/
enum ntb_topo {
NTB_TOPO_NONE = -1,
@@ -75,6 +76,7 @@ enum ntb_topo {
NTB_TOPO_SEC,
NTB_TOPO_B2B_USD,
NTB_TOPO_B2B_DSD,
+ NTB_TOPO_SWITCH,
};
static inline int ntb_topo_is_b2b(enum ntb_topo topo)
@@ -95,6 +97,7 @@ static inline char *ntb_topo_string(enum ntb_topo topo)
case NTB_TOPO_SEC: return "NTB_TOPO_SEC";
case NTB_TOPO_B2B_USD: return "NTB_TOPO_B2B_USD";
case NTB_TOPO_B2B_DSD: return "NTB_TOPO_B2B_DSD";
+ case NTB_TOPO_SWITCH: return "NTB_TOPO_SWITCH";
}
return "NTB_TOPO_INVALID";
}
--
2.11.0
From: Logan Gunthorpe
> Hi,
>
> This patchset implements Non-Transparent Bridge (NTB) support for the
> Microsemi Switchtec series of switches. We're looking for some
> review from the community at this point but hope to get it upstreamed
> for v4.14.
>
> Switchtec NTB support is configured over the same function and bar
> as the management endpoint. Thus, the new driver hooks into the
> management driver which we had merged in v4.12. We use the class
> interface API to register an NTB device for every switchtec device
> which supports NTB (not all do).
>
> The Switchtec hardware supports doorbells, memory windows and messages.
> Seeing there is no native scratchpad support, 128 spads are emulated
> through the use of a pre-setup memory window. The switch has 64
> doorbells which are shared between the two partitions and a
> configurable set of memory windows. While the hardware supports more
> than 2 partitions, this driver only supports the first two seeing
> the current NTB API only supports two hosts.
See what is staged in https://github.com/jonmason/ntb.git ntb-next, with the addition of multi-peer support by Serge. It would be good at this stage to understand whether the api changes there would also support the Switchtec driver, and what if anything must change, or be planned to change, to support the Switchtec driver.
Thanks for providing the patch set for the Switchtec driver. My first impression is that it is a good patch set. Only to be included, it needs to be reconciled with the api changes in ntb-next. I will follow up with a more detailed review of patches in this series, but sending this now as I don't want to delay your review of ntb-next.
>
> The driver has been tested with ntb_netdev and fully passes the
> ntb_test script.
>
> This patchset is based off of v4.12-rc5 and can be found in this
> git repo:
>
> https://github.com/sbates130272/linux-p2pmem.git switchtec_ntb
>
> Thanks,
>
> Logan
>
>
> Logan Gunthorpe (13):
> switchtec: move structure definitions into a common header
> switchtec: export class symbol for use in upper layer driver
> switchtec: add ntb hardware register definitions
> switchtec: add link event notifier block
> switchtec_ntb: introduce initial ntb driver
> switchtec_ntb: initialize hardware for memory windows
> switchtec_ntb: initialize hardware for doorbells and messages
> switchtec_ntb: add skeleton ntb driver
> switchtec_ntb: add link management
> switchtec_ntb: implement doorbell registers
> switchtec_ntb: implement scratchpad registers
> switchtec_ntb: add memory window support
> switchtec_ntb: update switchtec documentation with notes for ntb
>
> Documentation/switchtec.txt | 12 +
> MAINTAINERS | 2 +
> drivers/ntb/hw/Kconfig | 1 +
> drivers/ntb/hw/Makefile | 1 +
> drivers/ntb/hw/mscc/Kconfig | 9 +
> drivers/ntb/hw/mscc/Makefile | 1 +
> drivers/ntb/hw/mscc/switchtec_ntb.c | 1144 +++++++++++++++++++++++++++++++++++
> drivers/pci/switch/switchtec.c | 319 ++--------
> include/linux/ntb.h | 3 +
> include/linux/switchtec.h | 365 +++++++++++
> 10 files changed, 1601 insertions(+), 256 deletions(-)
> create mode 100644 drivers/ntb/hw/mscc/Kconfig
> create mode 100644 drivers/ntb/hw/mscc/Makefile
> create mode 100644 drivers/ntb/hw/mscc/switchtec_ntb.c
> create mode 100644 include/linux/switchtec.h
>
> --
> 2.11.0
On 16/06/17 07:53 AM, Allen Hubbe wrote:
> See what is staged in https://github.com/jonmason/ntb.git ntb-next, with the addition of multi-peer support by Serge. It would be good at this stage to understand whether the api changes there would also support the Switchtec driver, and what if anything must change, or be planned to change, to support the Switchtec driver.
Ah, yes I had seen that patchset some time ago but I wasn't aware of
it's status or that it was queued up in ntb-next. I think it will be no
problem to reconcile with the switchtec driver and I'll rebase onto
ntb-next for the next posting of the patch set. However, I *may* save
full multi-host switchtec support for a follow up submission. My initial
impression is the new API will support the switchtec hardware well.
Thanks,
Logan
From: Logan Gunthorpe
> On 16/06/17 07:53 AM, Allen Hubbe wrote:
> > See what is staged in https://github.com/jonmason/ntb.git ntb-next, with the addition of multi-peer
> support by Serge. It would be good at this stage to understand whether the api changes there would
> also support the Switchtec driver, and what if anything must change, or be planned to change, to
> support the Switchtec driver.
>
> Ah, yes I had seen that patchset some time ago but I wasn't aware of
> it's status or that it was queued up in ntb-next. I think it will be no
> problem to reconcile with the switchtec driver and I'll rebase onto
> ntb-next for the next posting of the patch set. However, I *may* save
> full multi-host switchtec support for a follow up submission. My initial
> impression is the new API will support the switchtec hardware well.
Alright!
In code review, I really only have found minor nits. Overall, the driver looks good.
In switchtec_ntb_part_op, there is a delay of up to 50s (1000 * 50ms). This looks like a thread context, so it could involve the scheduler for the delay instead of spinning for up to 50s before bailing.
There are a few instances like this:
> + dev_dbg(&stdev->dev, "%s\n", __func__);
Where the printing of __func__ could be controlled by dyndbg=+pf. The debug message could be more useful.
In switchtec_ntb_db_set_mask and friends, an in-memory copy of the mask bits is protected by a spinlock. Elsewhere, you noted that the db bits are shared between all ports, so the db bitset is chopped up to be shared between the ports. Is the db mask also shared, and how is the spinlock sufficient for synchronizing access to the mask bits between multiple ports?
The IDT switch also does not have hardware scratchpads. Could the code you wrote for emulated scratchpads be made into shared library code for ntb drivers? Also, some ntb clients may not need scratchpad support. If it is not natively supported by a driver, can the emulated scratchpad support be an optional feature?
>
> Thanks,
>
> Logan
Hello Logan.
Thanks for the new hardware driver. It's really great to see NTB subsystem
being developed.
New NTB API is going to be merged to mainline kernel within next features
merge window, so it's really recommended to use that API for new hardware.
Could you please rabase your driver on top of the tree?
https://github.com/jonmason/ntb.git
> See what is staged in https://github.com/jonmason/ntb.git ntb-next, with
> the addition of multi-peer support by Serge. It would be good at this
> stage to understand whether the api changes there would also support the
> Switchtec driver, and what if anything must change, or be planned to change,
> to support the Switchtec driver.
My impression is that, there is no need for new NTB API changes, since it
was specifically designed for new type of multi-port switches with
additional message registers, which is exactly supported by switchtec
device.
> The Switchtec hardware supports doorbells, memory windows and messages.
> Seeing there is no native scratchpad support, 128 spads are emulated
> through the use of a pre-setup memory window.
According to the NTB philosophy, it's not good to have any hardware
emulation within hardware driver. Hardware driver must reflect the only
hardware abilities, nothing else. Could you please get rid of Scratchpad
emulation and add messaging as new NTB API has got a proper callback
functions for it?
Hmmm, I haven't seen the actual code (see my last comment), but
according to my impression of API, it's impossible to have memory window
accessed while NTB link is down, but Scratchpads still can be used.
In this case, if you got Scratchpads emulated over memory windows,
you must have got NTB link enabled before NTB device is registered, which
makes ntb_link_* methods kind of being useless unless Switchtec hardware
supports NTB link getting up/down for individual memory windows...
> configurable set of memory windows. While the hardware supports more
> than 2 partitions, this driver only supports the first two seeing
> the current NTB API only supports two hosts.
New NTB API is updated so to have access to any of peer ports. IDT driver
has got a special translation table to access peer functionality just by
providing an index to corresponding API callback. You can use it as
reference to have Switchtec driver accordingly altered. It would be vastly
useful to have one more multi-port hardware driver in the tree.
> switchtec: move structure definitions into a common header
> switchtec: export class symbol for use in upper layer driver
> switchtec: add ntb hardware register definitions
> switchtec: add link event notifier block
> switchtec_ntb: introduce initial ntb driver
> switchtec_ntb: initialize hardware for memory windows
> switchtec_ntb: initialize hardware for doorbells and messages
> switchtec_ntb: add skeleton ntb driver
> switchtec_ntb: add link management
> switchtec_ntb: implement doorbell registers
> switchtec_ntb: implement scratchpad registers
> switchtec_ntb: add memory window support
> switchtec_ntb: update switchtec documentation with notes for ntb
If I'm not mistaken, these patches can be combined the way so to have
just two big functionally split patches:
1) NTB: Microsemt Switchtec switch management driver alterations for NTB
2) NTB: Add Microsemi Switchtec PCIe-switches support
It would really simplify the review. Could you please combine them?
Thanks for submitting the patches. We really appreciate this and looking
forward to have it completely reviewed and added to the kernel tree.
Regards
-Sergey
On Fri, Jun 16, 2017 at 08:09:55AM -0600, Logan Gunthorpe <[email protected]> wrote:
>
>
> On 16/06/17 07:53 AM, Allen Hubbe wrote:
> > See what is staged in https://github.com/jonmason/ntb.git ntb-next, with the addition of multi-peer support by Serge. It would be good at this stage to understand whether the api changes there would also support the Switchtec driver, and what if anything must change, or be planned to change, to support the Switchtec driver.
>
> Ah, yes I had seen that patchset some time ago but I wasn't aware of
> it's status or that it was queued up in ntb-next. I think it will be no
> problem to reconcile with the switchtec driver and I'll rebase onto
> ntb-next for the next posting of the patch set. However, I *may* save
> full multi-host switchtec support for a follow up submission. My initial
> impression is the new API will support the switchtec hardware well.
>
> Thanks,
>
> Logan
On 16/06/17 09:34 AM, Allen Hubbe wrote:
> In code review, I really only have found minor nits. Overall, the driver looks good.
Great, thanks for such a quick review!
> In switchtec_ntb_part_op, there is a delay of up to 50s (1000 * 50ms). This looks like a thread context, so it could involve the scheduler for the delay instead of spinning for up to 50s before bailing.
Good point. If I were to change this to msleep_interruptible would that
be acceptable?
> There are a few instances like this:
>> + dev_dbg(&stdev->dev, "%s\n", __func__);
> Where the printing of __func__ could be controlled by dyndbg=+pf. The debug message could be more useful.
Ok, I'll change that.
> In switchtec_ntb_db_set_mask and friends, an in-memory copy of the mask bits is protected by a spinlock. Elsewhere, you noted that the db bits are shared between all ports, so the db bitset is chopped up to be shared between the ports. Is the db mask also shared, and how is the spinlock sufficient for synchronizing access to the mask bits between multiple ports?
Well, there are 64 doorbells that are shared between ports but each port
has it's own in and out registers for the doorbells. So triggering
doorbell one on one port's ODB actually triggers it on every ports IDB.
So these are shared only in the sense that each port needs to know which
dbs it cares about. Seeing each port has their own registers they don't
have to worry about synchronization.
The mask is only protected by a spin lock seeing multiple callers of
db_set_mask and db_clr_mask on the same port may step on each others
toes. So if two processes try to mask different bits they both must get
masked in the end and therefore some kind of synchronization must be
involved.
> The IDT switch also does not have hardware scratchpads. Could the code you wrote for emulated scratchpads be made into shared library code for ntb drivers? Also, some ntb clients may not need scratchpad support. If it is not natively supported by a driver, can the emulated scratchpad support be an optional feature?
Hmm, interesting idea. A few pieces could possibly be made common but it
depends mostly on hardware having the resources to make use of it.
Switchtec has extra LUT memory windows that made this possible. Unless
you object I'm inclined to leave it as is and I'd be happy to work with
the IDT folks to create a common solution in the future.
Logan
On 16/06/17 10:33 AM, Serge Semin wrote:
> New NTB API is going to be merged to mainline kernel within next features
> merge window, so it's really recommended to use that API for new hardware.
> Could you please rabase your driver on top of the tree?
> https://github.com/jonmason/ntb.git
Yes, Allen's already pointed this out. I'll be sure to fix it up before
a final submission.
> According to the NTB philosophy, it's not good to have any hardware
> emulation within hardware driver. Hardware driver must reflect the only
> hardware abilities, nothing else. Could you please get rid of Scratchpad
> emulation and add messaging as new NTB API has got a proper callback
> functions for it?
I disagree completely. Practicality trumps philosophy in every case. I
need emulated scratchpads for ntb_transport to work and I'm not getting
rid of it (thus breaking things that work well) just because of your
philosophical beliefs.
> Hmmm, I haven't seen the actual code (see my last comment), but
> according to my impression of API, it's impossible to have memory window
> accessed while NTB link is down, but Scratchpads still can be used.
> In this case, if you got Scratchpads emulated over memory windows,
> you must have got NTB link enabled before NTB device is registered, which
> makes ntb_link_* methods kind of being useless unless Switchtec hardware
> supports NTB link getting up/down for individual memory windows...
Nothing in-kernel actually uses the peer's scratchpads while the link is
down and all clients seem specifically designed to wait until the link
event to set them. So I think you're either wrong about that rule or we
should change the rule going forward.
I'm not sure what you're referring to about the link stuff; as
implemented, our link management works just fine.
> New NTB API is updated so to have access to any of peer ports. IDT driver
> has got a special translation table to access peer functionality just by
> providing an index to corresponding API callback. You can use it as
> reference to have Switchtec driver accordingly altered. It would be vastly
> useful to have one more multi-port hardware driver in the tree.
Yes, I expect we will get there eventually, it doesn't sound like much
work. However, it's client support that's really going to make this
interesting and worthwhile. That seems like the real challenge right now.
> If I'm not mistaken, these patches can be combined the way so to have
> just two big functionally split patches:
> 1) NTB: Microsemt Switchtec switch management driver alterations for NTB
> 2) NTB: Add Microsemi Switchtec PCIe-switches support
> It would really simplify the review. Could you please combine them?
Seems like every time I make a submission, someone either wants patches
to be smaller and split up more or bigger and combined. I happen to
agree with the people who prefer smaller patches and I think these
provide good chunks for reviewers to look at. So, no, I'm not going to
change this. Feel free to apply the patches to a git tree or view it on
our github branch if you want to see the code combined.
Thanks,
Logan
On Fri, Jun 16, 2017 at 10:47:21AM -0600, Logan Gunthorpe <[email protected]> wrote:
>
>
> On 16/06/17 09:34 AM, Allen Hubbe wrote:
> > In code review, I really only have found minor nits. Overall, the driver looks good.
>
> Great, thanks for such a quick review!
>
> > In switchtec_ntb_part_op, there is a delay of up to 50s (1000 * 50ms). This looks like a thread context, so it could involve the scheduler for the delay instead of spinning for up to 50s before bailing.
>
> Good point. If I were to change this to msleep_interruptible would that
> be acceptable?
>
> > There are a few instances like this:
> >> + dev_dbg(&stdev->dev, "%s\n", __func__);
>
> > Where the printing of __func__ could be controlled by dyndbg=+pf. The debug message could be more useful.
>
> Ok, I'll change that.
>
> > In switchtec_ntb_db_set_mask and friends, an in-memory copy of the mask bits is protected by a spinlock. Elsewhere, you noted that the db bits are shared between all ports, so the db bitset is chopped up to be shared between the ports. Is the db mask also shared, and how is the spinlock sufficient for synchronizing access to the mask bits between multiple ports?
>
> Well, there are 64 doorbells that are shared between ports but each port
> has it's own in and out registers for the doorbells. So triggering
> doorbell one on one port's ODB actually triggers it on every ports IDB.
> So these are shared only in the sense that each port needs to know which
> dbs it cares about. Seeing each port has their own registers they don't
> have to worry about synchronization.
>
It's exactly the way the IDT hardware works. There is Global doorbell
registers, directly connected to doorbell registers of each port, unless
doorbell routing isn't configured differently by global switch configuration.
Each port has got it's own in and out doorbell registers as well as the
doorbell mask preventing the corresponding doorbell bit from generating
interrupt.
>
> The mask is only protected by a spin lock seeing multiple callers of
> db_set_mask and db_clr_mask on the same port may step on each others
> toes. So if two processes try to mask different bits they both must get
> masked in the end and therefore some kind of synchronization must be
> involved.
>
> > The IDT switch also does not have hardware scratchpads. Could the code you wrote for emulated scratchpads be made into shared library code for ntb drivers? Also, some ntb clients may not need scratchpad support. If it is not natively supported by a driver, can the emulated scratchpad support be an optional feature?
>
> Hmm, interesting idea. A few pieces could possibly be made common but it
> depends mostly on hardware having the resources to make use of it.
> Switchtec has extra LUT memory windows that made this possible. Unless
> you object I'm inclined to leave it as is and I'd be happy to work with
> the IDT folks to create a common solution in the future.
>
> Logan
Alas there are no IDT folks here. It's only me for now, who was responsible
for NTB API alteration (together with much of help from others NTB guys) and
IDT NTB driver development fitting that API.
Regards,
-Sergey
From: Logan Gunthorpe
> On 16/06/17 09:34 AM, Allen Hubbe wrote:
> > In code review, I really only have found minor nits. Overall, the driver looks good.
>
> Great, thanks for such a quick review!
>
> > In switchtec_ntb_part_op, there is a delay of up to 50s (1000 * 50ms). This looks like a thread
> context, so it could involve the scheduler for the delay instead of spinning for up to 50s before
> bailing.
>
> Good point. If I were to change this to msleep_interruptible would that
> be acceptable?
I would be satisfied.
>
> > There are a few instances like this:
> >> + dev_dbg(&stdev->dev, "%s\n", __func__);
>
> > Where the printing of __func__ could be controlled by dyndbg=+pf. The debug message could be more
> useful.
>
> Ok, I'll change that.
>
> > In switchtec_ntb_db_set_mask and friends, an in-memory copy of the mask bits is protected by a
> spinlock. Elsewhere, you noted that the db bits are shared between all ports, so the db bitset is
> chopped up to be shared between the ports. Is the db mask also shared, and how is the spinlock
> sufficient for synchronizing access to the mask bits between multiple ports?
>
> Well, there are 64 doorbells that are shared between ports but each port
> has it's own in and out registers for the doorbells. So triggering
> doorbell one on one port's ODB actually triggers it on every ports IDB.
> So these are shared only in the sense that each port needs to know which
> dbs it cares about. Seeing each port has their own registers they don't
> have to worry about synchronization.
>
> The mask is only protected by a spin lock seeing multiple callers of
> db_set_mask and db_clr_mask on the same port may step on each others
> toes. So if two processes try to mask different bits they both must get
> masked in the end and therefore some kind of synchronization must be
> involved.
Thanks for clearing that up. Now I understand, each port has its own independent set of mask bits. So, while the doorbell numbers are assigned globally, the registers themselves are per port. For the mask bits, the mask behavior only affects the local port.
>
> > The IDT switch also does not have hardware scratchpads. Could the code you wrote for emulated
> scratchpads be made into shared library code for ntb drivers? Also, some ntb clients may not need
> scratchpad support. If it is not natively supported by a driver, can the emulated scratchpad support
> be an optional feature?
>
> Hmm, interesting idea. A few pieces could possibly be made common but it
> depends mostly on hardware having the resources to make use of it.
> Switchtec has extra LUT memory windows that made this possible. Unless
> you object I'm inclined to leave it as is and I'd be happy to work with
> the IDT folks to create a common solution in the future.
Alright. I'll leave it to you to find and reconcile common functionalities of the drivers. What about making spad emulation optional?
>
> Logan
There was a comment on irc.oftc.net #ntb wishing for patch v2 to be fewer patches. Something like, 1/2: prep the existing switch driver, 2/2: introduce the ntb driver.
On Fri, Jun 16, 2017 at 11:08:52AM -0600, Logan Gunthorpe <[email protected]> wrote:
>
>
> On 16/06/17 10:33 AM, Serge Semin wrote:
> > New NTB API is going to be merged to mainline kernel within next features
> > merge window, so it's really recommended to use that API for new hardware.
> > Could you please rabase your driver on top of the tree?
> > https://github.com/jonmason/ntb.git
>
> Yes, Allen's already pointed this out. I'll be sure to fix it up before
> a final submission.
>
> > According to the NTB philosophy, it's not good to have any hardware
> > emulation within hardware driver. Hardware driver must reflect the only
> > hardware abilities, nothing else. Could you please get rid of Scratchpad
> > emulation and add messaging as new NTB API has got a proper callback
> > functions for it?
>
> I disagree completely. Practicality trumps philosophy in every case. I
> need emulated scratchpads for ntb_transport to work and I'm not getting
> rid of it (thus breaking things that work well) just because of your
> philosophical beliefs.
>
It's the way the NTB API was created for, to have set of functions to access
NTB devices in the similar way. These aren't my beliefs, it's the way it was
created. I agree it can be optional, but it shouldn't be made as the basics
of the driver. It is called NTB "hardware" driver after all, not "emulating" or
"abstracting" driver.
ntb_transport could work without Scratchpads, if it's properly altered to
use NTB messaging. This should be the way to make things compatible, but not
making the hardware driver suitable for just one ntb_transport.
> > Hmmm, I haven't seen the actual code (see my last comment), but
> > according to my impression of API, it's impossible to have memory window
> > accessed while NTB link is down, but Scratchpads still can be used.
> > In this case, if you got Scratchpads emulated over memory windows,
> > you must have got NTB link enabled before NTB device is registered, which
> > makes ntb_link_* methods kind of being useless unless Switchtec hardware
> > supports NTB link getting up/down for individual memory windows...
>
> Nothing in-kernel actually uses the peer's scratchpads while the link is
> down and all clients seem specifically designed to wait until the link
> event to set them. So I think you're either wrong about that rule or we
> should change the rule going forward.
>
> I'm not sure what you're referring to about the link stuff; as
> implemented, our link management works just fine.
>
> > New NTB API is updated so to have access to any of peer ports. IDT driver
> > has got a special translation table to access peer functionality just by
> > providing an index to corresponding API callback. You can use it as
> > reference to have Switchtec driver accordingly altered. It would be vastly
> > useful to have one more multi-port hardware driver in the tree.
>
> Yes, I expect we will get there eventually, it doesn't sound like much
> work. However, it's client support that's really going to make this
> interesting and worthwhile. That seems like the real challenge right now.
>
> > If I'm not mistaken, these patches can be combined the way so to have
> > just two big functionally split patches:
> > 1) NTB: Microsemy Switchtec switch management driver alterations for NTB
> > 2) NTB: Add Microsemi Switchtec PCIe-switches support
> > It would really simplify the review. Could you please combine them?
>
> Seems like every time I make a submission, someone either wants patches
> to be smaller and split up more or bigger and combined. I happen to
> agree with the people who prefer smaller patches and I think these
> provide good chunks for reviewers to look at. So, no, I'm not going to
> change this. Feel free to apply the patches to a git tree or view it on
> our github branch if you want to see the code combined.
It's not like my whim or something, but the way it's usually done.
https://kernelnewbies.org/PatchPhilosophy
Cite from there:
"Each patch should group changes into a logical sequence. Bug fixes must
come first in the patchset, then new features. This is because we need to be
able to backport bug fixes to older kernels, and they should not depend on
new features."
You grouped the patches in according to your logical view or development
progress (I don't know for sure), but it's not obvious for reviewers.
>From my perspective your new Microsemi Switchtec NTB driver is just one
feature. I don't know who would think differently so to split the solid
driver up for review. Switchtec management driver alteration might be the
same - just one fix. It's much easier for you to have your commits squashed,
than for me to look at your git tree, than get back to your patchset looking
for a necessary peace of patch and commenting it there.
Regards,
-Sergey
>
> Thanks,
>
> Logan
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> To post to this group, send email to [email protected].
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/883bdb76-972c-7de9-0208-2d0933f192d4%40deltatee.com.
> For more options, visit https://groups.google.com/d/optout.
On 16/06/17 12:08 PM, Allen Hubbe wrote:
> Alright. I'll leave it to you to find and reconcile common functionalities of the drivers. What about making spad emulation optional?
Ok.
I don't see the point of making spad emulation optional. Who would want
to disable it and what would be the benefit?
Logan
On 16/06/17 12:38 PM, Serge Semin wrote:
> On Fri, Jun 16, 2017 at 11:08:52AM -0600, Logan Gunthorpe <[email protected]> wrote:
> It's the way the NTB API was created for, to have set of functions to access
> NTB devices in the similar way. These aren't my beliefs, it's the way it was
> created. I agree it can be optional, but it shouldn't be made as the basics
> of the driver. It is called NTB "hardware" driver after all, not "emulating" or
> "abstracting" driver.
Just more philosophy. You haven't given any good reason to remove the
functionality. Vague references to the way things were created aren't
compelling arguments. Better to cite code and point out actual problems.
> ntb_transport could work without Scratchpads, if it's properly altered to
> use NTB messaging. This should be the way to make things compatible, but not
> making the hardware driver suitable for just one ntb_transport.
Ok, well when all the NTB clients no longer require using scratchpads
and we can all abide by the rule that clients must function without
them. Then, I'll remove the emulation. Until then, it stays.
> It's not like my whim or something, but the way it's usually done.
> https://kernelnewbies.org/PatchPhilosophy
> Cite from there:
> "Each patch should group changes into a logical sequence. Bug fixes must
> come first in the patchset, then new features. This is because we need to be
> able to backport bug fixes to older kernels, and they should not depend on
> new features."
You should probably read that again because it doesn't actually support
your point (in fact it's saying something quite unrelated). It is also
probably a good idea to read the rest of the seciton you cite:
"The idea here is that you should break changes up in such a way that it
will be easy to review."
"When creating a new feature patchset, you may need to break up your
changes into multiple commits. "
"Clean up patches that are over 200 lines long are discouraged, because
they are hard to review. Break those patches up into smaller patches. "
Also, to quote Greg Kroah-Hartman from my last series[1]:
"That's one big patch to review, would you want to do that?
Can you break it up into smaller parts?"
> You grouped the patches in according to your logical view or development
> progress (I don't know for sure), but it's not obvious for reviewers.
> From my perspective your new Microsemi Switchtec NTB driver is just one
> feature. I don't know who would think differently so to split the solid
> driver up for review. Switchtec management driver alteration might be the
> same - just one fix. It's much easier for you to have your commits squashed,
> than for me to look at your git tree, than get back to your patchset looking
> for a necessary peace of patch and commenting it there.
Well you're free to think that but, in my experience, your opinion
differs significantly from the rest of the kernel community which I
personally agree with.
Now, if you'd like to actually review the code I'd be happy to address
any concerns you find. I won't be responding to any more philosophical
arguments or bike-shedding over the format of the patch.
Logan
[1] https://lkml.org/lkml/2017/1/31/637
On Fri, Jun 16, 2017 at 01:34:59PM -0600, Logan Gunthorpe <[email protected]> wrote:
>
>
> On 16/06/17 12:38 PM, Serge Semin wrote:
> > On Fri, Jun 16, 2017 at 11:08:52AM -0600, Logan Gunthorpe <[email protected]> wrote:
> > It's the way the NTB API was created for, to have set of functions to access
> > NTB devices in the similar way. These aren't my beliefs, it's the way it was
> > created. I agree it can be optional, but it shouldn't be made as the basics
> > of the driver. It is called NTB "hardware" driver after all, not "emulating" or
> > "abstracting" driver.
>
> Just more philosophy. You haven't given any good reason to remove the
> functionality. Vague references to the way things were created aren't
> compelling arguments. Better to cite code and point out actual problems.
>
Actual problem is the design of your driver. Of course you can disagree as much as
you want.
> > ntb_transport could work without Scratchpads, if it's properly altered to
> > use NTB messaging. This should be the way to make things compatible, but not
> > making the hardware driver suitable for just one ntb_transport.
>
> Ok, well when all the NTB clients no longer require using scratchpads
> and we can all abide by the rule that clients must function without
> them. Then, I'll remove the emulation. Until then, it stays.
>
> > It's not like my whim or something, but the way it's usually done.
> > https://kernelnewbies.org/PatchPhilosophy
>
> > Cite from there:
> > "Each patch should group changes into a logical sequence. Bug fixes must
> > come first in the patchset, then new features. This is because we need to be
> > able to backport bug fixes to older kernels, and they should not depend on
> > new features."
>
> You should probably read that again because it doesn't actually support
> your point (in fact it's saying something quite unrelated). It is also
> probably a good idea to read the rest of the seciton you cite:
>
> "The idea here is that you should break changes up in such a way that it
> will be easy to review."
>
> "When creating a new feature patchset, you may need to break up your
> changes into multiple commits. "
>
> "Clean up patches that are over 200 lines long are discouraged, because
> they are hard to review. Break those patches up into smaller patches. "
>
This doesn't prove your way of splitting patchset is correct, but supports
my point. As well as the sentence about the logical sentence in addition
to the thing about easy review.
> Also, to quote Greg Kroah-Hartman from my last series[1]:
>
> "That's one big patch to review, would you want to do that?
>
> Can you break it up into smaller parts?"
>
> > You grouped the patches in according to your logical view or development
> > progress (I don't know for sure), but it's not obvious for reviewers.
> > From my perspective your new Microsemi Switchtec NTB driver is just one
> > feature. I don't know who would think differently so to split the solid
> > driver up for review. Switchtec management driver alteration might be the
> > same - just one fix. It's much easier for you to have your commits squashed,
> > than for me to look at your git tree, than get back to your patchset looking
> > for a necessary peace of patch and commenting it there.
>
> Well you're free to think that but, in my experience, your opinion
> differs significantly from the rest of the kernel community which I
> personally agree with.
>
And your quotation doesn't prove you are right. Greg asked you to split at
least the documentation. He had point to ask it, since it's logically correct.
You wasn't arguing with him, was you? But in this case you have sent the
set of incremental patches of your own code, so I don't see how it can be
easier for review, than a combined text.
> Now, if you'd like to actually review the code I'd be happy to address
> any concerns you find. I won't be responding to any more philosophical
> arguments or bike-shedding over the format of the patch.
>
I don't want to review a patchset, which isn't properly formated.
> Logan
>
> [1] https://lkml.org/lkml/2017/1/31/637
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> To post to this group, send email to [email protected].
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/33b6c321-c0af-7340-8e8e-e929a00005c7%40deltatee.com.
> For more options, visit https://groups.google.com/d/optout.
On Fri, Jun 16, 2017 at 11:21:00PM +0300, Serge Semin wrote:
> On Fri, Jun 16, 2017 at 01:34:59PM -0600, Logan Gunthorpe <[email protected]> wrote:
> > Now, if you'd like to actually review the code I'd be happy to address
> > any concerns you find. I won't be responding to any more philosophical
> > arguments or bike-shedding over the format of the patch.
> >
>
> I don't want to review a patchset, which isn't properly formated.
Ah, but the patchset does seem to properly formatted. At least it's
easy for me to review as-published, while a much smaller number of
patches, making much larger individual patches, would be much much
harder to review.
But what do I know...
Oh wait, I review more kernel patches than anyone else :)
Logan, given that you need to rebase these on the "new" ntb api (and why
the hell is that tree on github? We can't take kernel git pulls from
github), is it worth reviewing this patch series as-is, or do you want
us to wait?
thanks,
greg k-h
On Thu, Jun 15, 2017 at 02:37:18PM -0600, Logan Gunthorpe wrote:
> We switch to class_register/unregister and a declared class which
> is exported for use in the switchtec_ntb driver.
>
> Signed-off-by: Logan Gunthorpe <[email protected]>
> Reviewed-by: Stephen Bates <[email protected]>
> Reviewed-by: Kurt Schwemmer <[email protected]>
> ---
> drivers/pci/switch/switchtec.c | 21 +++++++++++----------
> include/linux/switchtec.h | 2 ++
> 2 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index c4369ba7bbc1..e9bf17b1934e 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -21,8 +21,6 @@
> #include <linux/fs.h>
> #include <linux/uaccess.h>
> #include <linux/poll.h>
> -#include <linux/pci.h>
> -#include <linux/cdev.h>
> #include <linux/wait.h>
>
> MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
> @@ -35,9 +33,14 @@ module_param(max_devices, int, 0644);
> MODULE_PARM_DESC(max_devices, "max number of switchtec device instances");
>
> static dev_t switchtec_devt;
> -static struct class *switchtec_class;
> static DEFINE_IDA(switchtec_minor_ida);
>
> +struct class switchtec_class = {
> + .owner = THIS_MODULE,
> + .name = "switchtec",
> +};
> +EXPORT_SYMBOL(switchtec_class);
EXPORT_SYMBOL_GPL()?
And do you really have to move from a dynamic class to a static one? I
know it will work just the same, but I hate seeing static structures
that have reference counts on them :)
thanks,
greg k-h
> +
> enum mrpc_state {
> MRPC_IDLE = 0,
> MRPC_QUEUED,
> @@ -1026,7 +1029,7 @@ static struct switchtec_dev *stdev_create(struct pci_dev *pdev)
>
> dev = &stdev->dev;
> device_initialize(dev);
> - dev->class = switchtec_class;
> + dev->class = &switchtec_class;
> dev->parent = &pdev->dev;
> dev->groups = switchtec_device_groups;
> dev->release = stdev_release;
> @@ -1313,11 +1316,9 @@ static int __init switchtec_init(void)
> if (rc)
> return rc;
>
> - switchtec_class = class_create(THIS_MODULE, "switchtec");
> - if (IS_ERR(switchtec_class)) {
> - rc = PTR_ERR(switchtec_class);
> + rc = class_register(&switchtec_class);
> + if (rc)
> goto err_create_class;
> - }
>
> rc = pci_register_driver(&switchtec_pci_driver);
> if (rc)
> @@ -1328,7 +1329,7 @@ static int __init switchtec_init(void)
> return 0;
>
> err_pci_register:
> - class_destroy(switchtec_class);
> + class_unregister(&switchtec_class);
>
> err_create_class:
> unregister_chrdev_region(switchtec_devt, max_devices);
> @@ -1340,7 +1341,7 @@ module_init(switchtec_init);
> static void __exit switchtec_exit(void)
> {
> pci_unregister_driver(&switchtec_pci_driver);
> - class_destroy(switchtec_class);
> + class_unregister(&switchtec_class);
> unregister_chrdev_region(switchtec_devt, max_devices);
> ida_destroy(&switchtec_minor_ida);
>
> diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
> index 508cda78a430..3b87618fc42f 100644
> --- a/include/linux/switchtec.h
> +++ b/include/linux/switchtec.h
> @@ -267,4 +267,6 @@ static inline struct switchtec_dev *to_stdev(struct device *dev)
> return container_of(dev, struct switchtec_dev, dev);
> }
>
> +extern struct class switchtec_class;
> +
> #endif
> --
> 2.11.0
On Thu, Jun 15, 2017 at 02:37:17PM -0600, Logan Gunthorpe wrote:
> Create the switchtec.h header in include/linux with hardware defines
> and the switchtec_dev structure moved directly from switchtec.c.
> This is a prep patch for created an NTB driver for switchtec.
>
> Signed-off-by: Logan Gunthorpe <[email protected]>
> Reviewed-by: Stephen Bates <[email protected]>
> Reviewed-by: Kurt Schwemmer <[email protected]>
Acked-by: Greg Kroah-Hartman <[email protected]>
On Thu, Jun 15, 2017 at 02:37:20PM -0600, Logan Gunthorpe wrote:
> In order for the switchtec NTB code to handle link change events we
> create a notifier block in the switchtec code which gets called
> whenever an appropriate event interrupt occurs.
>
> In order to preserve userspace's ability to follow these events,
> we compare the event count with a stored copy from last time we
> checked.
>
> Signed-off-by: Logan Gunthorpe <[email protected]>
> Reviewed-by: Stephen Bates <[email protected]>
> Reviewed-by: Kurt Schwemmer <[email protected]>
> ---
> drivers/pci/switch/switchtec.c | 53 ++++++++++++++++++++++++++++++++++++++++++
> include/linux/switchtec.h | 5 ++++
> 2 files changed, 58 insertions(+)
>
> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
> index e9bf17b1934e..63e305b24fb9 100644
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -972,6 +972,50 @@ static const struct file_operations switchtec_fops = {
> .compat_ioctl = switchtec_dev_ioctl,
> };
>
> +static void link_event_work(struct work_struct *work)
> +{
> + struct switchtec_dev *stdev;
> +
> + stdev = container_of(work, struct switchtec_dev, link_event_work);
> +
> + dev_dbg(&stdev->dev, "%s\n", __func__);
You do know about ftrace, right? It's good to drop debugging code like
this for "final" versions.
> +
> + blocking_notifier_call_chain(&stdev->link_notifier, 0, stdev);
> +}
Do you really need a notifier call chain? How many different things are
going to "hook up" to this? I ask as they tend to get really messy over
time while direct callbacks are easier to handle and manage.
thanks,
greg k-h
On Thu, Jun 15, 2017 at 02:37:22PM -0600, Logan Gunthorpe wrote:
> This commit adds the code to initialize the memory windows in the
> hardware. This includes setting up the requester ID table, and
> figuring out which bar corresponds to which memory window. (Seeing
> the switch can be configured with any number of bars.)
>
> Also, seeing the device doesn't have hardware for scratchpads or
> determining the link status, we create a shared memory window that has
> these features. A magic number with a version copmonent will be used
> to determine if the otherside's driver is actually up.
>
> Signed-off-by: Logan Gunthorpe <[email protected]>
> Reviewed-by: Stephen Bates <[email protected]>
> Reviewed-by: Kurt Schwemmer <[email protected]>
> ---
> drivers/ntb/hw/mscc/switchtec_ntb.c | 296 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 296 insertions(+)
>
> diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c b/drivers/ntb/hw/mscc/switchtec_ntb.c
> index 1f094216aa1c..756307d1a8a3 100644
> --- a/drivers/ntb/hw/mscc/switchtec_ntb.c
> +++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
> @@ -15,37 +15,332 @@
>
> #include <linux/switchtec.h>
> #include <linux/module.h>
> +#include <linux/delay.h>
>
> MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
> MODULE_VERSION("0.1");
> MODULE_LICENSE("GPL");
> MODULE_AUTHOR("Microsemi Corporation");
>
> +#ifndef ioread64
> +#ifdef readq
> +#define ioread64 readq
> +#else
> +#define ioread64 _ioread64
> +static inline u64 _ioread64(void __iomem *mmio)
> +{
> + u64 low, high;
> +
> + low = ioread32(mmio);
> + high = ioread32(mmio + sizeof(u32));
> + return low | (high << 32);
> +}
> +#endif
> +#endif
Really? Don't we have ioread64 in generic code for all arches? If not,
that should be fixed, don't hide this in a random driver please. Or
just restrict your driver to only building on those arches that does
provide this api.
thanks,
greg k-h
On 16/06/17 11:09 PM, 'Greg Kroah-Hartman' wrote:
> Ah, but the patchset does seem to properly formatted. At least it's
> easy for me to review as-published, while a much smaller number of
> patches, making much larger individual patches, would be much much
> harder to review.
>
> But what do I know...
>
> Oh wait, I review more kernel patches than anyone else :)
Thanks Greg.
> Logan, given that you need to rebase these on the "new" ntb api (and why
> the hell is that tree on github? We can't take kernel git pulls from
> github), is it worth reviewing this patch series as-is, or do you want
> us to wait?
I think initial review at this time will still be useful. I don't expect
the patchset will change _that_ much.
Logan
On 16/06/17 11:09 PM, 'Greg Kroah-Hartman' wrote:
> Ah, but the patchset does seem to properly formatted. At least it's
> easy for me to review as-published, while a much smaller number of
> patches, making much larger individual patches, would be much much
> harder to review.
>
> But what do I know...
>
> Oh wait, I review more kernel patches than anyone else :)
Thanks Greg!
> Logan, given that you need to rebase these on the "new" ntb api (and why
> the hell is that tree on github? We can't take kernel git pulls from
> github), is it worth reviewing this patch series as-is, or do you want
> us to wait?
I think review at this time is still appropriate. The new ntb api mostly
just adds indexes for the port so I don't expect things to change too much.
Thanks for the review so far.
Logan
On 16/06/17 11:11 PM, Greg Kroah-Hartman wrote:
> EXPORT_SYMBOL_GPL()?
>
> And do you really have to move from a dynamic class to a static one? I
> know it will work just the same, but I hate seeing static structures
> that have reference counts on them :)
I'll change both for v1. I didn't really need a static one. I just saw
an example of some other user of class_interface which and it seemed
appropriate.
Logan
On 16/06/17 11:14 PM, Greg Kroah-Hartman wrote:
> You do know about ftrace, right? It's good to drop debugging code like
> this for "final" versions.
I've never actually used it but maybe I should give it a try. I'll
remove these debug lines.
>> +
>> + blocking_notifier_call_chain(&stdev->link_notifier, 0, stdev);
>> +}
>
> Do you really need a notifier call chain? How many different things are
> going to "hook up" to this? I ask as they tend to get really messy over
> time while direct callbacks are easier to handle and manage.
Ok, understood. I only expect the one callback at this time so I'll
change it to a single function pointer.
Logan
On 16/06/17 11:16 PM, Greg Kroah-Hartman wrote:
>> +#ifndef ioread64
>> +#ifdef readq
>> +#define ioread64 readq
>> +#else
>> +#define ioread64 _ioread64
>> +static inline u64 _ioread64(void __iomem *mmio)
>> +{
>> + u64 low, high;
>> +
>> + low = ioread32(mmio);
>> + high = ioread32(mmio + sizeof(u32));
>> + return low | (high << 32);
>> +}
>> +#endif
>> +#endif
>
> Really? Don't we have ioread64 in generic code for all arches? If not,
> that should be fixed, don't hide this in a random driver please. Or
> just restrict your driver to only building on those arches that does
> provide this api.
Yes, I know these are _very_ ugly. Unfortunately, the other ntb drivers
each have the exact same thing. ioread64 is not provided universally at
this time and I did spend a bit of time digging and things are a bit
messy so I wasn't at all sure of the correct solution.
For starters, ioread64 is only defined on 64 bit machines. They are
surrounded by ifdef CONFIG_64BIT and it's not clear to me if the above
wrapper (around two ioread32s) would be acceptable universally.
Second, the x86_64 version doesn't even compile. This is because the
arch doesn't provide any ioread64 implementation anywhere and the macro
in io.h isn't used because CONFIG_GENERIC_IOMAP is defined.
The only arch where I _think_ ioread64 would work is powerpc or any that
define CONFIG_64BIT and not CONFIG_GENERIC_IOMAP.
If you have any guidance on this I'd be happy to try and make a patch or
two for it.
Thanks,
Logan
On Sat, Jun 17, 2017 at 10:39:35AM -0600, Logan Gunthorpe wrote:
>
>
> On 16/06/17 11:16 PM, Greg Kroah-Hartman wrote:
> >> +#ifndef ioread64
> >> +#ifdef readq
> >> +#define ioread64 readq
> >> +#else
> >> +#define ioread64 _ioread64
> >> +static inline u64 _ioread64(void __iomem *mmio)
> >> +{
> >> + u64 low, high;
> >> +
> >> + low = ioread32(mmio);
> >> + high = ioread32(mmio + sizeof(u32));
> >> + return low | (high << 32);
> >> +}
> >> +#endif
> >> +#endif
> >
> > Really? Don't we have ioread64 in generic code for all arches? If not,
> > that should be fixed, don't hide this in a random driver please. Or
> > just restrict your driver to only building on those arches that does
> > provide this api.
>
> Yes, I know these are _very_ ugly. Unfortunately, the other ntb drivers
> each have the exact same thing. ioread64 is not provided universally at
> this time and I did spend a bit of time digging and things are a bit
> messy so I wasn't at all sure of the correct solution.
That implies that this needs to be fixed, no driver should ever have to
define this themselves. The fact that they all do it at the same time
is a huge hint that it's wrong.
> For starters, ioread64 is only defined on 64 bit machines. They are
> surrounded by ifdef CONFIG_64BIT and it's not clear to me if the above
> wrapper (around two ioread32s) would be acceptable universally.
>
> Second, the x86_64 version doesn't even compile. This is because the
> arch doesn't provide any ioread64 implementation anywhere and the macro
> in io.h isn't used because CONFIG_GENERIC_IOMAP is defined.
>
> The only arch where I _think_ ioread64 would work is powerpc or any that
> define CONFIG_64BIT and not CONFIG_GENERIC_IOMAP.
>
> If you have any guidance on this I'd be happy to try and make a patch or
> two for it.
I suggest coming up with something simple like what you have here,
adding it to the arch-generic code and posting it to the linux-arch
mailing list and seeing if anyone screams :)
thanks,
greg k-h
On Sat, Jun 17, 2017 at 07:09:59AM +0200, 'Greg Kroah-Hartman' wrote:
> On Fri, Jun 16, 2017 at 11:21:00PM +0300, Serge Semin wrote:
> > On Fri, Jun 16, 2017 at 01:34:59PM -0600, Logan Gunthorpe <[email protected]> wrote:
> > > Now, if you'd like to actually review the code I'd be happy to address
> > > any concerns you find. I won't be responding to any more philosophical
> > > arguments or bike-shedding over the format of the patch.
> > >
> >
> > I don't want to review a patchset, which isn't properly formated.
>
> Ah, but the patchset does seem to properly formatted. At least it's
> easy for me to review as-published, while a much smaller number of
> patches, making much larger individual patches, would be much much
> harder to review.
>
> But what do I know...
>
> Oh wait, I review more kernel patches than anyone else :)
>
> Logan, given that you need to rebase these on the "new" ntb api (and why
> the hell is that tree on github? We can't take kernel git pulls from
> github), is it worth reviewing this patch series as-is, or do you want
Well, Linus has been taking my pull request from it since v3.12. He did
call me out initially for requesting it initially, but was amenible
after I GPG signed all of my pull requests (and had a sufficient number
of people he "knew" in my ring). But all of that has been sorted out
now.
The reason it is on Github is for the Wiki of NTB HW and it's usage
https://github.com/jonmason/ntb/wiki
It's gotten a bit stale, but was very useful back in the v3.12 days :)
(Also, I am using this as a call to update the Wiki!)
Thanks,
Jon
> us to wait?
>
> thanks,
>
> greg k-h
On Thu, Jun 15, 2017 at 02:37:16PM -0600, Logan Gunthorpe wrote:
> Hi,
>
> This patchset implements Non-Transparent Bridge (NTB) support for the
> Microsemi Switchtec series of switches. We're looking for some
> review from the community at this point but hope to get it upstreamed
> for v4.14.
>
> Switchtec NTB support is configured over the same function and bar
> as the management endpoint. Thus, the new driver hooks into the
> management driver which we had merged in v4.12. We use the class
> interface API to register an NTB device for every switchtec device
> which supports NTB (not all do).
>
> The Switchtec hardware supports doorbells, memory windows and messages.
> Seeing there is no native scratchpad support, 128 spads are emulated
> through the use of a pre-setup memory window. The switch has 64
> doorbells which are shared between the two partitions and a
> configurable set of memory windows. While the hardware supports more
> than 2 partitions, this driver only supports the first two seeing
> the current NTB API only supports two hosts.
>
> The driver has been tested with ntb_netdev and fully passes the
> ntb_test script.
>
> This patchset is based off of v4.12-rc5 and can be found in this
> git repo:
>
> https://github.com/sbates130272/linux-p2pmem.git switchtec_ntb
I think this code is of quality enough to go from an RFC to a standard
patch, and we can nit pick it to death there ;-)
Please rebase on ntb-next (which I believe you are already doing), and
resbutmit.
Also, we'll need to decide how to handle the patches to
drivers/pci/switches
We have 2 options here.
1. We can split it up into 2 separate series, and have Bjorn take the
drivers/pci/switches in his, and I'll take the NTB portions. The down
side of this is that I'll have to wait for his to get pulled in first,
which will push this back a full kernel release (which based on your
comment above doesn't seem your prefered choice)
2. I can pull in the drivers/pci/switches patches into my NTB tree,
but I'll want Bjorn's ack on those patches before I'll pull them in.
I'm thinking that we'll want to keep this series all in one place.
So, #2 sounds like the best option. But, I need Bjorns $0.02 on this.
FYI, I ran smatch on the patches and got this. Please correct them in
v2 (or v1 of the non-RFC...however you want to think of it).
drivers/pci/switch/switchtec.c:484 switchtec_dev_read() error: double unlock 'mutex:&stdev->mrpc_mutex'
drivers/pci/switch/switchtec.c:506 switchtec_dev_read() error: double unlock 'mutex:&stdev->mrpc_mutex'
drivers/pci/switch/switchtec.c:513 switchtec_dev_read() warn: inconsistent returns 'mutex:&stdev->mrpc_mutex'.
Thanks,
Jon
>
> Thanks,
>
> Logan
>
>
> Logan Gunthorpe (13):
> switchtec: move structure definitions into a common header
> switchtec: export class symbol for use in upper layer driver
> switchtec: add ntb hardware register definitions
> switchtec: add link event notifier block
> switchtec_ntb: introduce initial ntb driver
> switchtec_ntb: initialize hardware for memory windows
> switchtec_ntb: initialize hardware for doorbells and messages
> switchtec_ntb: add skeleton ntb driver
> switchtec_ntb: add link management
> switchtec_ntb: implement doorbell registers
> switchtec_ntb: implement scratchpad registers
> switchtec_ntb: add memory window support
> switchtec_ntb: update switchtec documentation with notes for ntb
>
> Documentation/switchtec.txt | 12 +
> MAINTAINERS | 2 +
> drivers/ntb/hw/Kconfig | 1 +
> drivers/ntb/hw/Makefile | 1 +
> drivers/ntb/hw/mscc/Kconfig | 9 +
> drivers/ntb/hw/mscc/Makefile | 1 +
> drivers/ntb/hw/mscc/switchtec_ntb.c | 1144 +++++++++++++++++++++++++++++++++++
> drivers/pci/switch/switchtec.c | 319 ++--------
> include/linux/ntb.h | 3 +
> include/linux/switchtec.h | 365 +++++++++++
> 10 files changed, 1601 insertions(+), 256 deletions(-)
> create mode 100644 drivers/ntb/hw/mscc/Kconfig
> create mode 100644 drivers/ntb/hw/mscc/Makefile
> create mode 100644 drivers/ntb/hw/mscc/switchtec_ntb.c
> create mode 100644 include/linux/switchtec.h
>
> --
> 2.11.0
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> To post to this group, send email to [email protected].
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/20170615203729.9009-1-logang%40deltatee.com.
> For more options, visit https://groups.google.com/d/optout.
On 19/06/17 02:07 PM, Jon Mason wrote:
> I think this code is of quality enough to go from an RFC to a standard
> patch, and we can nit pick it to death there ;-)
Thanks!
> Please rebase on ntb-next (which I believe you are already doing), and
> resbutmit.
I'll try to get the rebase done and all the feedback so far applied by
the end of the week and resend a v1.
> I'm thinking that we'll want to keep this series all in one place.
> So, #2 sounds like the best option. But, I need Bjorns $0.02 on this.
I was thinking #2 was the best choice as well but really it's for you
maintainers to decide. And, yes, we'd want to get Bjorn's ack.
> FYI, I ran smatch on the patches and got this. Please correct them in
> v2 (or v1 of the non-RFC...however you want to think of it).
> drivers/pci/switch/switchtec.c:484 switchtec_dev_read() error: double unlock 'mutex:&stdev->mrpc_mutex'
> drivers/pci/switch/switchtec.c:506 switchtec_dev_read() error: double unlock 'mutex:&stdev->mrpc_mutex'
> drivers/pci/switch/switchtec.c:513 switchtec_dev_read() warn: inconsistent returns 'mutex:&stdev->mrpc_mutex'.
This looks like a false positive to me. The code looks correct. smatch
may have been confused by the fact that the lock is taken by two calls
to the static function 'lock_mutex_and_test_alive'.
This is also part of the switchtec management driver that's already in
the kernel and not part of the NTB related patches I sent.
Logan
On Mon, Jun 19, 2017 at 4:27 PM, Logan Gunthorpe <[email protected]> wrote:
>
>
> On 19/06/17 02:07 PM, Jon Mason wrote:
>> I think this code is of quality enough to go from an RFC to a standard
>> patch, and we can nit pick it to death there ;-)
>
> Thanks!
>
>> Please rebase on ntb-next (which I believe you are already doing), and
>> resbutmit.
>
> I'll try to get the rebase done and all the feedback so far applied by
> the end of the week and resend a v1.
>
>> I'm thinking that we'll want to keep this series all in one place.
>> So, #2 sounds like the best option. But, I need Bjorns $0.02 on this.
>
> I was thinking #2 was the best choice as well but really it's for you
> maintainers to decide. And, yes, we'd want to get Bjorn's ack.
Bjorn is a busy guy, but I'm sure he'll get to it soon enough.
>> FYI, I ran smatch on the patches and got this. Please correct them in
>> v2 (or v1 of the non-RFC...however you want to think of it).
>> drivers/pci/switch/switchtec.c:484 switchtec_dev_read() error: double unlock 'mutex:&stdev->mrpc_mutex'
>> drivers/pci/switch/switchtec.c:506 switchtec_dev_read() error: double unlock 'mutex:&stdev->mrpc_mutex'
>> drivers/pci/switch/switchtec.c:513 switchtec_dev_read() warn: inconsistent returns 'mutex:&stdev->mrpc_mutex'.
>
> This looks like a false positive to me. The code looks correct. smatch
> may have been confused by the fact that the lock is taken by two calls
> to the static function 'lock_mutex_and_test_alive'.
>
> This is also part of the switchtec management driver that's already in
> the kernel and not part of the NTB related patches I sent.
Ah, no problem. I'm not a master user of smatch, but I do find it
very useful. So, I do not know of a way to run it against only the
patches being applied, and I'm not sure that is possible to do so.
Either way, I'm sure someone will address that then, given that it is
already in the kernel.
BTW, I don't think I said so, but I'm really excited to have another
piece of NTB HW supported in the kernel.
Thanks,
Jon
>
> Logan
Hey Guys,
I've run into some subtle issues with the new API:
It has to do with splitting mw_get_range into mw_get_align and
peer_mw_get_addr.
The original mw_get_range returned the size of the /local/ memory
window's size, address and alignment requirements. The ntb clients then
take the local size and transmit it via spads to the peer which would
use it in setting up the memory window. However, it made the assumption
that the alignment restrictions were symmetric on both hosts seeing they
were not sent across the link.
The new API makes a sensible change for this in that mw_get_align
appears to be intended to return the alignment restrictions (and now
size) of the peer. This helps a bit for the Switchtec driver but appears
to be a semantic change that wasn't really reflected in the changes to
the other NTB code. So, I see a couple of issues:
1) With our hardware, we can't actually know anything about the peer's
memory windows until the peer has finished its setup (ie. the link is
up). However, all the clients call the function during probe, before the
link is ready. There's really no good reason for this, so I think we
should change the clients so that mw_get_align is called only when the
link is up.
2) The changes to the Intel and AMD driver for mw_get_align sets
*max_size to the local pci resource size. (Thus making the assumption
that the local is the same as the peer, which is wrong). max_size isn't
actually used for anything so it's not _really_ an issue, but I do think
it's confusing and incorrect. I'd suggest we remove max_size until
something actually needs it, or at least set it to zero in cases where
the hardware doesn't support returning the size of the peer's memory
window (ie. in the Intel and AMD drivers).
Thoughts?
Logan
On 6/22/2017 12:32 PM, Allen Hubbe wrote:
> From: Logan Gunthorpe
>> Hey Guys,
>>
>> I've run into some subtle issues with the new API:
>>
>> It has to do with splitting mw_get_range into mw_get_align and
>> peer_mw_get_addr.
>>
>> The original mw_get_range returned the size of the /local/ memory
>> window's size, address and alignment requirements. The ntb clients then
>> take the local size and transmit it via spads to the peer which would
>> use it in setting up the memory window. However, it made the assumption
>> that the alignment restrictions were symmetric on both hosts seeing they
>> were not sent across the link.
>>
>> The new API makes a sensible change for this in that mw_get_align
>> appears to be intended to return the alignment restrictions (and now
>> size) of the peer. This helps a bit for the Switchtec driver but appears
>> to be a semantic change that wasn't really reflected in the changes to
>> the other NTB code. So, I see a couple of issues:
>>
>> 1) With our hardware, we can't actually know anything about the peer's
>> memory windows until the peer has finished its setup (ie. the link is
>> up). However, all the clients call the function during probe, before the
>> link is ready. There's really no good reason for this, so I think we
>> should change the clients so that mw_get_align is called only when the
>> link is up.
>>
>> 2) The changes to the Intel and AMD driver for mw_get_align sets
>> *max_size to the local pci resource size. (Thus making the assumption
>> that the local is the same as the peer, which is wrong). max_size isn't
>> actually used for anything so it's not _really_ an issue, but I do think
>> it's confusing and incorrect. I'd suggest we remove max_size until
>> something actually needs it, or at least set it to zero in cases where
>> the hardware doesn't support returning the size of the peer's memory
>> window (ie. in the Intel and AMD drivers).
>
> You're right, and the b2b_split in the Intel driver even makes use of different primary/secondary bar sizes. For Intel and AMD, it would make more sense to use the secondary bar size here. The size of the secondary bar still not necessarily valid end-to-end, because in b2b the peer's primary bar size could be even smaller.
>
> I'm not entirely convinced that this should represent the end-to-end size of local and peer memory window configurations. I think it should represent the largest side that would be valid to pass to ntb_mw_set_trans(). Then, the peers should communicate their respective max sizes (along with translation addresses, etc) before setting up the translations, and that exchange will ensure that the size finally used is valid end-to-end.
But why would the client ever need to use the max_size instead of the
actual size of the bar as retrieved and exchanged from peer_mw_get_addr?
Logan
From: Logan Gunthorpe
> Hey Guys,
>
> I've run into some subtle issues with the new API:
>
> It has to do with splitting mw_get_range into mw_get_align and
> peer_mw_get_addr.
>
> The original mw_get_range returned the size of the /local/ memory
> window's size, address and alignment requirements. The ntb clients then
> take the local size and transmit it via spads to the peer which would
> use it in setting up the memory window. However, it made the assumption
> that the alignment restrictions were symmetric on both hosts seeing they
> were not sent across the link.
>
> The new API makes a sensible change for this in that mw_get_align
> appears to be intended to return the alignment restrictions (and now
> size) of the peer. This helps a bit for the Switchtec driver but appears
> to be a semantic change that wasn't really reflected in the changes to
> the other NTB code. So, I see a couple of issues:
>
> 1) With our hardware, we can't actually know anything about the peer's
> memory windows until the peer has finished its setup (ie. the link is
> up). However, all the clients call the function during probe, before the
> link is ready. There's really no good reason for this, so I think we
> should change the clients so that mw_get_align is called only when the
> link is up.
>
> 2) The changes to the Intel and AMD driver for mw_get_align sets
> *max_size to the local pci resource size. (Thus making the assumption
> that the local is the same as the peer, which is wrong). max_size isn't
> actually used for anything so it's not _really_ an issue, but I do think
> it's confusing and incorrect. I'd suggest we remove max_size until
> something actually needs it, or at least set it to zero in cases where
> the hardware doesn't support returning the size of the peer's memory
> window (ie. in the Intel and AMD drivers).
You're right, and the b2b_split in the Intel driver even makes use of different primary/secondary bar sizes. For Intel and AMD, it would make more sense to use the secondary bar size here. The size of the secondary bar still not necessarily valid end-to-end, because in b2b the peer's primary bar size could be even smaller.
I'm not entirely convinced that this should represent the end-to-end size of local and peer memory window configurations. I think it should represent the largest side that would be valid to pass to ntb_mw_set_trans(). Then, the peers should communicate their respective max sizes (along with translation addresses, etc) before setting up the translations, and that exchange will ensure that the size finally used is valid end-to-end.
>
> Thoughts?
>
> Logan
From: Logan Gunthorpe
> On 6/22/2017 12:32 PM, Allen Hubbe wrote:
> > From: Logan Gunthorpe
> >> 2) The changes to the Intel and AMD driver for mw_get_align sets
> >> *max_size to the local pci resource size. (Thus making the assumption
> >> that the local is the same as the peer, which is wrong). max_size isn't
> >> actually used for anything so it's not _really_ an issue, but I do think
> >> it's confusing and incorrect. I'd suggest we remove max_size until
> >> something actually needs it, or at least set it to zero in cases where
> >> the hardware doesn't support returning the size of the peer's memory
> >> window (ie. in the Intel and AMD drivers).
> >
> > You're right, and the b2b_split in the Intel driver even makes use of different primary/secondary
> bar sizes. For Intel and AMD, it would make more sense to use the secondary bar size here. The size
> of the secondary bar still not necessarily valid end-to-end, because in b2b the peer's primary bar
> size could be even smaller.
> >
> > I'm not entirely convinced that this should represent the end-to-end size of local and peer memory
> window configurations. I think it should represent the largest side that would be valid to pass to
> ntb_mw_set_trans(). Then, the peers should communicate their respective max sizes (along with
> translation addresses, etc) before setting up the translations, and that exchange will ensure that the
> size finally used is valid end-to-end.
>
> But why would the client ever need to use the max_size instead of the
> actual size of the bar as retrieved and exchanged from peer_mw_get_addr?
The resource size given by peer_mw_get_addr might be different than the max_size given by ntb_mw_get_align.
I am most familiar with the ntb_hw_intel driver and that type of ntb hardware. The peer_mw_get_addr size is of the primary bar on the side to be the source of the translated writes (or reads). In b2b topology, at least, the first translation of that write lands it on the secondary bar of the peer ntb. That size of that bar could different than the first. The second translation lands the write in memory (eg). So, the end-to-end translation is limited by the first AND second sizes.
The first point is, the *max_size returned by intel_ntb_mw_get_align looks wrong. That should be the size of the secondary bar, not the resource size of the primary bar, of that device.
The second point is, because the sizes returned by peer_mw_get_addr, and ntb_mw_get_align, may be different, the two sides should communicate and reconcile the address and size information when setting up the translations.
On 6/22/2017 4:12 PM, Allen Hubbe wrote:
> The resource size given by peer_mw_get_addr might be different than the max_size given by ntb_mw_get_align.
>
> I am most familiar with the ntb_hw_intel driver and that type of ntb hardware. The peer_mw_get_addr size is of the primary bar on the side to be the source of the translated writes (or reads). In b2b topology, at least, the first translation of that write lands it on the secondary bar of the peer ntb. That size of that bar could different than the first. The second translation lands the write in memory (eg). So, the end-to-end translation is limited by the first AND second sizes.
>
> The first point is, the *max_size returned by intel_ntb_mw_get_align looks wrong. That should be the size of the secondary bar, not the resource size of the primary bar, of that device.
>
> The second point is, because the sizes returned by peer_mw_get_addr, and ntb_mw_get_align, may be different, the two sides should communicate and reconcile the address and size information when setting up the translations.
Ok, that makes some sense. So drivers that don't need this should return
-1 or something like that? Though, I'd still suggest that until a driver
actually needs this, and it's implemented correctly in the clients, we
should leave it out.
Any thoughts on changing the semantics of mw_get_align so it must be
called with the link up?
Logan
From: Logan Gunthorpe
> Any thoughts on changing the semantics of mw_get_align so it must be
> called with the link up?
The intention of these is that these calls return information from the local port. The calls themselves don't reach across the link to the peer, but the information returned from the local port needs to be communicated for setting up the translation end-to-end.
I would like to understand why this hardware needs link up. Are there registers on the local port that are only valid after link up?
Can you post snippets of how ntb_mw_get_align and ntb_peer_mw_get_addr might be implemented for the Switchtec?
On 6/22/2017 4:42 PM, Allen Hubbe wrote:
> From: Logan Gunthorpe
>> Any thoughts on changing the semantics of mw_get_align so it must be
>> called with the link up?
>
> The intention of these is that these calls return information from the local port. The calls themselves don't reach across the link to the peer, but the information returned from the local port needs to be communicated for setting up the translation end-to-end.
Ok, well if it's from the local port, then splitting up mw_get_range
into peer_mw_get_addr and mw_get_align is confusing because one has the
peer designation and the other doesn't. And all the clients apply the
alignments to the remote bar so they'd technically need to transfer them
across the link somehow.
> I would like to understand why this hardware needs link up. Are there registers on the local port that are only valid after link up?
We only need the link up if we are trying to find the alignment
requirements (and max_size) of the peer's bar. In theory, switchtec
could have different sizes of bars on both sides of the link and
different alignment requirements. Though, in practice, this is probably
unlikely.
> Can you post snippets of how ntb_mw_get_align and ntb_peer_mw_get_addr might be implemented for the Switchtec?
Hmm, yes, but lets sort out my confusion on the semantics per above first.
Logan
From: Logan Gunthorpe
> On 6/22/2017 4:42 PM, Allen Hubbe wrote:
> > From: Logan Gunthorpe
> >> Any thoughts on changing the semantics of mw_get_align so it must be
> >> called with the link up?
> >
> > The intention of these is that these calls return information from the local port. The calls
> themselves don't reach across the link to the peer, but the information returned from the local port
> needs to be communicated for setting up the translation end-to-end.
>
> Ok, well if it's from the local port, then splitting up mw_get_range
> into peer_mw_get_addr and mw_get_align is confusing because one has the
> peer designation and the other doesn't. And all the clients apply the
> alignments to the remote bar so they'd technically need to transfer them
> across the link somehow.
By "remote" do you mean the source or destination of a write?
Yes, clients should transfer the address and size information to the peer.
> > I would like to understand why this hardware needs link up. Are there registers on the local port
> that are only valid after link up?
>
> We only need the link up if we are trying to find the alignment
> requirements (and max_size) of the peer's bar. In theory, switchtec
Maybe this is the confusion. None of these api calls are to reach across to the peer port, as to get the size of the peer's bar. They are to get information from the local port, or to configure the local port.
Some mw configuration is done at the destination, as for Intel and AMD, and some configuration is done on the source side, for IDT. The local configuration of the port on one side could depend on information from the remote port on the other side. For example in IDT, the mw translation configured on the source side needs to know destination address information. Likewise, if there is any difference in the size of the range that can be translated by ports on opposing sides, that needs to be negotiated.
> could have different sizes of bars on both sides of the link and
> different alignment requirements. Though, in practice, this is probably
> unlikely.
>
> > Can you post snippets of how ntb_mw_get_align and ntb_peer_mw_get_addr might be implemented for the
> Switchtec?
>
> Hmm, yes, but lets sort out my confusion on the semantics per above first.
Some snippets of code would help me understand your interpretation of the api semantics more exactly.
> Logan
On 23/06/17 07:18 AM, Allen Hubbe wrote:
> By "remote" do you mean the source or destination of a write?
Look at how these calls are used in ntb_transport and ntb_perf:
They both call ntb_peer_mw_get_addr to get the size of the BAR. The size
is sent via spads to the other side. The other side then uses
ntb_mw_get_align and applies align_size to the received size.
> Yes, clients should transfer the address and size information to the peer.
But then they also need to technically transfer the alignment
information as well. Which neither of the clients do.
> Maybe this is the confusion. None of these api calls are to reach across to the peer port, as to get the size of the peer's bar. They are to get information from the local port, or to configure the local port.
I like the rule that these api calls are not to reach across the port.
But then API looks very wrong. Why are we calling one peer_mw_get addr
and the other mw_get_align? And why does mw_get_align have a peer index?
> Some snippets of code would help me understand your interpretation of the api semantics more exactly.
I'm not sure the code to best show this in code, but let me try
describing an example situation:
Lets say we have the following mws on each side (this is something that
is possible with Switchtec hardware):
Host A BARs:
mwA0: 2MB size, aligned to 4k, size aligned to 4k
mwA1: 4MB size, aligned to 4k, size aligned to 4k
mwA2: 64k size, aligned to 64k, size aligned to 64k
Host B BARs:
mwB0: 2MB size, aligned to 4k, size aligned to 4k
mwB1: 64k size, aligned to 64k, size aligned to 64k
So on Host A: peer_mw_get_addr(idx=1) should return size=4M (from mwA1),
but it's not clear what mw_get_align(widx=1) should do. I see two
possibilities:
1) Given that it has the opposite sense of peer_mw_get_addr (ie. there
is no peer in it's name) and that this new api also has a peer index, it
should return align_size=64k (from mwB1). However, in order to do this,
Host B must be fully configured (technically the link doesn't have to be
fully up, but having a link up is the only way for a client to tell if
Host B is configured or not).
2) Given your assertion that these APIs should never reach across the
link, then one could say it should return align_size=4k. However, in
this situation the name is really confusing. And the fact that it has a
pidx argument makes no sense. And the way ntb_transport and ntb_perf use
it is wrong because they will end up masking the 64K size of mwB1 with
the 4k align_size from mwA1.
Does that make more sense?
Thanks,
Logan
From: Logan Gunthorpe
> On 23/06/17 07:18 AM, Allen Hubbe wrote:
> > By "remote" do you mean the source or destination of a write?
>
> Look at how these calls are used in ntb_transport and ntb_perf:
>
> They both call ntb_peer_mw_get_addr to get the size of the BAR. The size
> is sent via spads to the other side. The other side then uses
> ntb_mw_get_align and applies align_size to the received size.
>
> > Yes, clients should transfer the address and size information to the peer.
>
> But then they also need to technically transfer the alignment
> information as well. Which neither of the clients do.
The client's haven't been fully ported to the multi-port api yet. They were only minimally changed to call the new api, but so far other than that they have only been made to work as they had before.
> > Maybe this is the confusion. None of these api calls are to reach across to the peer port, as to
> get the size of the peer's bar. They are to get information from the local port, or to configure the
> local port.
>
> I like the rule that these api calls are not to reach across the port.
> But then API looks very wrong. Why are we calling one peer_mw_get addr
> and the other mw_get_align? And why does mw_get_align have a peer index?
I regret that the term "peer" is used to distinguish the mw api. Better names perhaps should be ntb_outbound_mw_foo, ntb_inbound_mw_foo; or ntb_src_mw_foo, ntb_dest_mw_foo. I like outbound/inbound, although the names are longer, maybe out/in would be ok.
> And why does mw_get_align have a peer index?
Good question. Serge?
For that matter, why do we not also have peer mw idx in the set of parameters. Working through the example below, it looks like we are lacking a way to say Outbound MW1 on A corresponds with Inbound MW0 on B. It looks like we can only indicate that Port A (not which Outbound MW of Port A) corresponds with Inbound MW0 on B.
> > Some snippets of code would help me understand your interpretation of the api semantics more
> exactly.
>
> I'm not sure the code to best show this in code, but let me try
> describing an example situation:
>
> Lets say we have the following mws on each side (this is something that
> is possible with Switchtec hardware):
>
> Host A BARs:
> mwA0: 2MB size, aligned to 4k, size aligned to 4k
> mwA1: 4MB size, aligned to 4k, size aligned to 4k
> mwA2: 64k size, aligned to 64k, size aligned to 64k
>
> Host B BARs:
> mwB0: 2MB size, aligned to 4k, size aligned to 4k
> mwB1: 64k size, aligned to 64k, size aligned to 64k
If those are BARs, that corresponds to "outbound", writing something to the BAR at mwA0.
A more complete picture might be:
Host A BARs (aka "outbound" or "peer" memory windows):
peer_mwA0: resource at 0xA00000000 - 0xA00200000 (2MB)
peer_mwA1: resource at 0xA10000000 - 0xA10400000 (4MB)
peer_mwA2: resource at 0xA20000000 - 0xa20010000 (64k)
Host A MWs (aka "inbound" memory windows):
mwA0: 64k max size, aligned to 64k, size aligned to 64k
mwA1: 2MB max size, aligned to 4k, size aligned to 4k
Host A sees Host B on port index 1
Host B BARs (aka "outbound" or "peer" memory windows):
peer_mwB0: resource at 0xB00000000 - 0xB00200000 (2MB)
peer_mwB1: resource at 0xB10000000 - 0xB10010000 (64k)
Host B MWs (aka "inbound" memory windows):
mwB0: 1MB size, aligned to 4k, size aligned to 4k
mwB1: 2MB size, aligned to 4k, size aligned to 4k
Host B sees Host A on port index 4
Outbound memory (aka "peer mw") windows come with a pci resource. We can get the size of the resource, it's physical address, and set up outbound translation if the hardware has that (IDT).
Inbound memory windows (aka "mw") are only used to set up inbound translation, if the hardware has that (Intel, AMD).
To set up end-to-end memory window so that A can write to B, let's use peer_mwA1 and mwB0.
A: ntb_peer_mw_get_addr(peer_mwA1) -> base 0xA10000000, size 4MB
B: ntb_mw_get_align(port4**, mwB0) -> aligned 4k, aligned 4k, max size 1MB
** Serge: do we need port info here, why?
Side A has a resource size of 4MB, but B only supports inbound translation up to 1MB. Side A can only use the first quarter of the 4MB resource.
Side B needs to allocate memory aligned to 4k (the dma address must be aligned to 4k after dma mapping), and a multiple of 4k in size. B may need to set inbound translation so that incoming writes go into this memory. A may also need to set outbound translation.
A: ntb_peer_mw_set_trans(port1**, peer_mwA1, dma_mem_addr, dma_mem_size)
B: ntb_mw_set_trans(port4**, mwB0, dma_mem_addr, dma_mem_size)
** Serge: do we also need the opposing side MW index here?
** Logan: would those changes to the api suit your needs?
> So on Host A: peer_mw_get_addr(idx=1) should return size=4M (from mwA1),
> but it's not clear what mw_get_align(widx=1) should do. I see two
> possibilities:
>
> 1) Given that it has the opposite sense of peer_mw_get_addr (ie. there
> is no peer in it's name) and that this new api also has a peer index, it
> should return align_size=64k (from mwB1). However, in order to do this,
> Host B must be fully configured (technically the link doesn't have to be
> fully up, but having a link up is the only way for a client to tell if
> Host B is configured or not).
>
> 2) Given your assertion that these APIs should never reach across the
> link, then one could say it should return align_size=4k. However, in
> this situation the name is really confusing. And the fact that it has a
> pidx argument makes no sense. And the way ntb_transport and ntb_perf use
> it is wrong because they will end up masking the 64K size of mwB1 with
> the 4k align_size from mwA1.
>
> Does that make more sense?
>
> Thanks,
>
> Logan
>
On 23/06/17 01:07 PM, Allen Hubbe wrote:
> The client's haven't been fully ported to the multi-port api yet. They were only minimally changed to call the new api, but so far other than that they have only been made to work as they had before.
So is it intended to eventually send the align parameters via spads?
This seems like it would take a lot of spads or multiplexing the spads
with a few doorbells. This gets a bit nasty.
> If those are BARs, that corresponds to "outbound", writing something to the BAR at mwA0.
> A more complete picture might be:
>
> Host A BARs (aka "outbound" or "peer" memory windows):
> peer_mwA0: resource at 0xA00000000 - 0xA00200000 (2MB)
> peer_mwA1: resource at 0xA10000000 - 0xA10400000 (4MB)
> peer_mwA2: resource at 0xA20000000 - 0xa20010000 (64k)
>
> Host A MWs (aka "inbound" memory windows):
> mwA0: 64k max size, aligned to 64k, size aligned to 64k
> mwA1: 2MB max size, aligned to 4k, size aligned to 4k
I don't really like the separation of inbound and output as you describe
it. It doesn't really match my hardware. In switchtec, each partition
has some number of BARs and each BAR has a single translation which sets
the peer and destination address. The translation really exists inside
the switch hardware, not on either side. But any translation can be
programmed by any peer. Saying that there's an opposite inbound window
to every outbound window is not an accurate abstraction for us.
I _suspect_ the IDT hardware is similar but, based on Serge's driver, I
think the translation can only be programmed by the peer that the BAR is
resident in (as opposed to from any side like in the switchtec hardwer).
(This poses some problems for getting the IDT code to actually work with
existing clients.)
> Outbound memory (aka "peer mw") windows come with a pci resource. We can get the size of the resource, it's physical address, and set up outbound translation if the hardware has that (IDT).
>
> Inbound memory windows (aka "mw") are only used to set up inbound translation, if the hardware has that (Intel, AMD).
>
> To set up end-to-end memory window so that A can write to B, let's use peer_mwA1 and mwB0.
>
> A: ntb_peer_mw_get_addr(peer_mwA1) -> base 0xA10000000, size 4MB
> B: ntb_mw_get_align(port4**, mwB0) -> aligned 4k, aligned 4k, max size 1MB
> ** Serge: do we need port info here, why?
>
> Side A has a resource size of 4MB, but B only supports inbound translation up to 1MB. Side A can only use the first quarter of the 4MB resource.
>
> Side B needs to allocate memory aligned to 4k (the dma address must be aligned to 4k after dma mapping), and a multiple of 4k in size. B may need to set inbound translation so that incoming writes go into this memory. A may also need to set outbound translation.
>
> A: ntb_peer_mw_set_trans(port1**, peer_mwA1, dma_mem_addr, dma_mem_size)
> B: ntb_mw_set_trans(port4**, mwB0, dma_mem_addr, dma_mem_size)
> ** Serge: do we also need the opposing side MW index here?
>
> ** Logan: would those changes to the api suit your needs?
Not really, no. Except for the confusion with the mw_get_align issue the
new API, as it is, suits my hardware well. What you're proposing doesn't
fix my issue and doesn't match my hardware. Though, I interpreted
ntb_peer_mw_set_trans somewhat differently from what you describe. I did
not expect the client would need to call both functions but some clients
could optionally use ntb_peer_mw_set_trans to set the translation from
the opposite side (thus needing to send the DMA address over spads or
msgs). Though, without an actual in-kernel user it's hard to know what
is actually intended.
It's worth noticing that the IDT driver only provides peer_mw_set_trans
and not mw_set_trans. I assumed it's because the hardware's memory
windows can only be configured from the opposite side.
Pragmatically, the only change I need for everything to work as I expect
is for mw_get_align to be called only after link up. However, given all
the confusion I'm wondering if these changes are even ready for
upstream. Without actual in-kernel client code it's hard to know if the
API is correct or that everyone is even interpreting it in the same way.
Logan
On Fri, Jun 23, 2017 at 03:07:19PM -0400, Allen Hubbe <[email protected]> wrote:
Hello Allen,
> From: Logan Gunthorpe
> > On 23/06/17 07:18 AM, Allen Hubbe wrote:
> > > By "remote" do you mean the source or destination of a write?
> >
> > Look at how these calls are used in ntb_transport and ntb_perf:
> >
> > They both call ntb_peer_mw_get_addr to get the size of the BAR. The size
> > is sent via spads to the other side. The other side then uses
> > ntb_mw_get_align and applies align_size to the received size.
> >
> > > Yes, clients should transfer the address and size information to the peer.
> >
> > But then they also need to technically transfer the alignment
> > information as well. Which neither of the clients do.
>
> The client's haven't been fully ported to the multi-port api yet. They were only minimally changed to call the new api, but so far other than that they have only been made to work as they had before.
>
> > > Maybe this is the confusion. None of these api calls are to reach across to the peer port, as to
> > get the size of the peer's bar. They are to get information from the local port, or to configure the
> > local port.
> >
> > I like the rule that these api calls are not to reach across the port.
> > But then API looks very wrong. Why are we calling one peer_mw_get addr
> > and the other mw_get_align? And why does mw_get_align have a peer index?
>
> I regret that the term "peer" is used to distinguish the mw api. Better names perhaps should be ntb_outbound_mw_foo, ntb_inbound_mw_foo; or ntb_src_mw_foo, ntb_dest_mw_foo. I like outbound/inbound, although the names are longer, maybe out/in would be ok.
>
As you remember we discussed the matter when new NTB API was being developed.
So we decided to have the following designation:
ntb_mw_* - NTB MW API connected with inbound memory windows configurations,
ntb_peer_mw_* - NTB MW API connected with outbound memory windows configurations.
Yes, suffixes like "in/out" might give better notion of the methods purpose,
but the current notation is in coherency with the rest of API, and as long as
user gets into NTB API, one won't have much difficulties with understanding.
> > And why does mw_get_align have a peer index?
>
> Good question. Serge?
>
> For that matter, why do we not also have peer mw idx in the set of parameters. Working through the example below, it looks like we are lacking a way to say Outbound MW1 on A corresponds with Inbound MW0 on B. It looks like we can only indicate that Port A (not which Outbound MW of Port A) corresponds with Inbound MW0 on B.
>
Before I give any explanation, could you please study the new NTB API documentation:
https://github.com/jonmason/ntb/blob/ntb-next/Documentation/ntb.txt
Particularly you are interested in lines 31 - 111. It will give you a better
description of the way all MW-related methods work for the start. The detailed
multi-port example is given in the end of this email.
> > > Some snippets of code would help me understand your interpretation of the api semantics more
> > exactly.
> >
> > I'm not sure the code to best show this in code, but let me try
> > describing an example situation:
> >
> > Lets say we have the following mws on each side (this is something that
> > is possible with Switchtec hardware):
> >
> > Host A BARs:
> > mwA0: 2MB size, aligned to 4k, size aligned to 4k
> > mwA1: 4MB size, aligned to 4k, size aligned to 4k
> > mwA2: 64k size, aligned to 64k, size aligned to 64k
> >
> > Host B BARs:
> > mwB0: 2MB size, aligned to 4k, size aligned to 4k
> > mwB1: 64k size, aligned to 64k, size aligned to 64k
>
> If those are BARs, that corresponds to "outbound", writing something to the BAR at mwA0.
>
> A more complete picture might be:
>
> Host A BARs (aka "outbound" or "peer" memory windows):
> peer_mwA0: resource at 0xA00000000 - 0xA00200000 (2MB)
> peer_mwA1: resource at 0xA10000000 - 0xA10400000 (4MB)
> peer_mwA2: resource at 0xA20000000 - 0xa20010000 (64k)
>
> Host A MWs (aka "inbound" memory windows):
> mwA0: 64k max size, aligned to 64k, size aligned to 64k
> mwA1: 2MB max size, aligned to 4k, size aligned to 4k
>
> Host A sees Host B on port index 1
>
>
> Host B BARs (aka "outbound" or "peer" memory windows):
> peer_mwB0: resource at 0xB00000000 - 0xB00200000 (2MB)
> peer_mwB1: resource at 0xB10000000 - 0xB10010000 (64k)
>
> Host B MWs (aka "inbound" memory windows):
> mwB0: 1MB size, aligned to 4k, size aligned to 4k
> mwB1: 2MB size, aligned to 4k, size aligned to 4k
>
> Host B sees Host A on port index 4
>
>
> Outbound memory (aka "peer mw") windows come with a pci resource. We can get the size of the resource, it's physical address, and set up outbound translation if the hardware has that (IDT).
>
> Inbound memory windows (aka "mw") are only used to set up inbound translation, if the hardware has that (Intel, AMD).
>
> To set up end-to-end memory window so that A can write to B, let's use peer_mwA1 and mwB0.
That's not actually right. We can't connect any outbound to any inbound MW,
since each of them can be of different type, thus having different
restrictions. Like IDT got MWs with direct address translation and MWs
based on Look-up tables (I suppose the same might be with Switchtec). All
inbound MWs information provided by new NTB API actually belongs to
corresponding outbound MWs.
>
> A: ntb_peer_mw_get_addr(peer_mwA1) -> base 0xA10000000, size 4MB
> B: ntb_mw_get_align(port4**, mwB0) -> aligned 4k, aligned 4k, max size 1MB
> ** Serge: do we need port info here, why?
>
> Side A has a resource size of 4MB, but B only supports inbound translation up to 1MB. Side A can only use the first quarter of the 4MB resource.
>
> Side B needs to allocate memory aligned to 4k (the dma address must be aligned to 4k after dma mapping), and a multiple of 4k in size. B may need to set inbound translation so that incoming writes go into this memory. A may also need to set outbound translation.
>
> A: ntb_peer_mw_set_trans(port1**, peer_mwA1, dma_mem_addr, dma_mem_size)
> B: ntb_mw_set_trans(port4**, mwB0, dma_mem_addr, dma_mem_size)
> ** Serge: do we also need the opposing side MW index here?
>
> ** Logan: would those changes to the api suit your needs?
>
Alright folks. I'll explain the whole architecture by example of simple
multi-port device. Suppose we got some abstract three-NTB-ports
device Z: {Port A, Port B, Port C}. For our particular case lets say, that each
port can have up to three outbound memory windows configured (obviously it's more
than enough to have ports connected to each other, but in general each port can
have different number of outbound memory windows). So to speak each port
can access up to three remote memory regions. In this way, each port
shall have the following set of inbound and outbound memory windows:
Port A: 3 outbound MWs to access Port B or Port C memory,
6 inbound MWs to have Port B and Port C accessing Port A memory
Port B: 3 outbound MWs to access Port A or Port C memory,
6 inbound MWs to have Port A and Port C accessing Port B memory
Port C: 3 outbound MWs to access Port A or Port B memory,
6 inbound MWs to have Port A and Port B accessing Port C memory
Suppose also, that each outbound memory window got different restrictions like:
translation address alignment, memory window size alignment and maximum size of
memory window. Lets say, that our abstract device has some similarity with
Intel/AMD device (I'll explain the difference with IDT later), but obviously
got multiple ports.
Now when description is over, I would like to have Port A accessing Port B memory
over outbound memory window with index 1 - midx1 (index starts from zero).
To set the connection up, the following steps must be performed within new NTB API.
1) Port B needs to allocated a memory region for inbound memory window. But first
it needs to know what kind of address and size should be of this region. For this
purpose it calls:
ntb_mw_get_align(Port A, midx1, &addr_align, &size_align, &size_max);
As you can see, we get translation address alignment (addr_align), memory region
alignment (size_align) and its maximum possible size (size_max) to have inbound
memory allocated properly for Port A outbound memory windows - midx1.
2) The actual inbound memory region can be allocated using say
dma_alloc_coherent() or something else. Using the restrictions from 1) we can
make sure, that it is correct for corresponding outbound MW.
3) Now the translation address of the memory window needs to be set to NTBi device.
For this purpose we need to call:
ntb_mw_set_trans(Port A, midx1, trans_addr, mw_size);
Which actually implies, that the memory region with translation address trans_addr
and size mw_size is set for outbound memory window midx1 of Port A.
4) Since Port B finished the inbound memory window configuration, Port A
needs to be somehow notified of it and informed about inbound memory window
parameters, particularly memory window index (if it isn't predefined by logic)
and memory window size (again if it isn't predefined by logic). It might be done
using any of preferable API methods like doorbell and scratchpads or message
registers. After this Port A start it's configuration.
5) Port A got information from Port B about inbound MW configured for its
outbound memory window midx1. The only thing left to do is to get actual
mapping info like outbound memory window base address and size. It's done
by calling
ntb_peer_mw_get_addr(midx1, map_base, map_size);
>From now map_base and map_size can be used to map actual outbound MW using
for instance ioremap_wc(), so to have Port B memory accessed over retrieved
virtual address.
It must be noted in this matter, that in general map_size doesn't need to
be equal to mw_size. But obviously Port A logic needs to know mw_size for
proper communications.
Hopefully the explanation was full enough to grab a gist of multi-port NTB API.
As I said it was based on assumption, that the abstract device got more
similarities with Intel/AMD hardware (except multiportness of course).
On the other way, the IDT hardware is configured a bit different.
The difference is located at step 3) of the algorithm above. Due to
IDT hardware peculiarity (particularly due to Lookup table entry access
registers), the translation address and size of memory region can't
be setup on the inbound MW side. Instead the translation address (trans_addr)
and size (mw_addr) must be transfered to Port A, so it would set them up using
the method:
ntb_peer_mw_set_trans(Port B, midx1, trans_addr, mw_size);
The rest of the configuration steps are the same.
Regards,
-Sergey
> > So on Host A: peer_mw_get_addr(idx=1) should return size=4M (from mwA1),
> > but it's not clear what mw_get_align(widx=1) should do. I see two
> > possibilities:
> >
> > 1) Given that it has the opposite sense of peer_mw_get_addr (ie. there
> > is no peer in it's name) and that this new api also has a peer index, it
> > should return align_size=64k (from mwB1). However, in order to do this,
> > Host B must be fully configured (technically the link doesn't have to be
> > fully up, but having a link up is the only way for a client to tell if
> > Host B is configured or not).
> >
> > 2) Given your assertion that these APIs should never reach across the
> > link, then one could say it should return align_size=4k. However, in
> > this situation the name is really confusing. And the fact that it has a
> > pidx argument makes no sense. And the way ntb_transport and ntb_perf use
> > it is wrong because they will end up masking the 64K size of mwB1 with
> > the 4k align_size from mwA1.
> >
> > Does that make more sense?
> >
> > Thanks,
> >
> > Logan
> >
>
> --
> You received this message because you are subscribed to the Google Groups "linux-ntb" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> To post to this group, send email to [email protected].
> To view this discussion on the web visit https://groups.google.com/d/msgid/linux-ntb/000101d2ec53%24f2830840%24d78918c0%24%40dell.com.
> For more options, visit https://groups.google.com/d/optout.
Hey,
Thanks Serge for the detailed explanation. This is pretty much exactly
as I had thought it should be interpreted. My only problem remains that
my hardware can't provide ntb_mw_get_align until the port it is asking
about has configured itself. The easiest way to solve this is to only
allow access when the link to that port is up. It's not a complicated
change and would actually simplify ntb_transport and ntb_perf a little.
Is this ok with you Allen? If it is, I can include a patch in my next
switchtec submission.
Thanks,
Logan
From: Logan Gunthorpe
> Hey,
>
> Thanks Serge for the detailed explanation. This is pretty much exactly
> as I had thought it should be interpreted. My only problem remains that
> my hardware can't provide ntb_mw_get_align until the port it is asking
> about has configured itself. The easiest way to solve this is to only
> allow access when the link to that port is up. It's not a complicated
> change and would actually simplify ntb_transport and ntb_perf a little.
>
> Is this ok with you Allen? If it is, I can include a patch in my next
> switchtec submission.
Ok.
>
> Thanks,
>
> Logan
From: Logan Gunthorpe
> But any translation can be
> programmed by any peer.
That doesn't seem safe. Even though it can be done as you say, would it not be better to have each specific translation under the control of exactly one driver?
If drivers can reach across and set the translation of any peer bar, they would still need to negotiate among N peers which one sets which other's translation.
On 23/06/17 04:06 PM, Allen Hubbe wrote:
> From: Logan Gunthorpe
>> But any translation can be
>> programmed by any peer.
>
> That doesn't seem safe. Even though it can be done as you say, would it not be better to have each specific translation under the control of exactly one driver?
>
> If drivers can reach across and set the translation of any peer bar, they would still need to negotiate among N peers which one sets which other's translation.
Yup. In a dual host setup its not a problem seeing only the one peer
will ever set any of the memory windows. In the multi-host setup, there
has to be implicit or explicit negotiation as to which memory window
connects to which peer.
The hardware also implements locking so if two peers try to mess with
the same translation, the worst that can happen is the last peer's
settings will take effect or one peer will see an error.
Logan