2005-03-04 00:27:24

by Roland Dreier

Subject: [PATCH][0/26] InfiniBand merge

Here's another series of patches that applies on top of the fixes I
posted yesterday. This series syncs the kernel with everything ready
for merging from the OpenIB subversion tree.

Most of these patches add more support for "mem-free" mode to mthca.
This allows PCI Express HCAs to operate by storing context in the host
system's memory rather than in dedicated memory attached to the HCA.
With this series, mem-free mode is usable -- in fact, these patches
are being posted from a system whose only network connection is
IP-over-IB running on a mem-free HCA.

Thanks,
Roland


2005-03-03 23:30:46

by Roland Dreier

Subject: [PATCH][25/26] IB/mthca: implement query of device caps

From: Michael S. Tsirkin <[email protected]>

Set device_cap_flags field in mthca's query_device method.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cmd.h 2005-01-25 20:48:02.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cmd.h 2005-03-03 14:13:03.934043620 -0800
@@ -95,7 +95,21 @@
};

enum {
- DEV_LIM_FLAG_SRQ = 1 << 6
+ DEV_LIM_FLAG_RC = 1 << 0,
+ DEV_LIM_FLAG_UC = 1 << 1,
+ DEV_LIM_FLAG_UD = 1 << 2,
+ DEV_LIM_FLAG_RD = 1 << 3,
+ DEV_LIM_FLAG_RAW_IPV6 = 1 << 4,
+ DEV_LIM_FLAG_RAW_ETHER = 1 << 5,
+ DEV_LIM_FLAG_SRQ = 1 << 6,
+ DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8,
+ DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9,
+ DEV_LIM_FLAG_MW = 1 << 16,
+ DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17,
+ DEV_LIM_FLAG_ATOMIC = 1 << 18,
+ DEV_LIM_FLAG_RAW_MULTI = 1 << 19,
+ DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+ DEV_LIM_FLAG_UD_MULTI = 1 << 21,
};

struct mthca_dev_lim {
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:03.005245231 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:03.932044054 -0800
@@ -218,6 +218,7 @@

int hca_type;
unsigned long mthca_flags;
+ unsigned long device_cap_flags;

u32 rev_id;

--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:13:03.005245231 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:13:03.933043837 -0800
@@ -171,6 +171,33 @@
mdev->limits.reserved_uars = dev_lim->reserved_uars;
mdev->limits.reserved_pds = dev_lim->reserved_pds;

+ /* IB_DEVICE_RESIZE_MAX_WR not supported by driver.
+ May be doable since hardware supports it for SRQ.
+
+ IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver.
+
+ IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not
+ supported by driver. */
+ mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+ mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+ mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+ mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
mdev->mthca_flags |= MTHCA_FLAG_SRQ;

--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:13:02.566340502 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:13:03.933043837 -0800
@@ -43,6 +43,8 @@
struct ib_smp *in_mad = NULL;
struct ib_smp *out_mad = NULL;
int err = -ENOMEM;
+ struct mthca_dev* mdev = to_mdev(ibdev);
+
u8 status;

in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
@@ -50,7 +52,7 @@
if (!in_mad || !out_mad)
goto out;

- props->fw_ver = to_mdev(ibdev)->fw_ver;
+ props->fw_ver = mdev->fw_ver;

memset(in_mad, 0, sizeof *in_mad);
in_mad->base_version = 1;
@@ -59,7 +61,7 @@
in_mad->method = IB_MGMT_METHOD_GET;
in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;

- err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ err = mthca_MAD_IFC(mdev, 1, 1,
1, NULL, NULL, in_mad, out_mad,
&status);
if (err)
@@ -69,10 +71,11 @@
goto out;
}

- props->vendor_id = be32_to_cpup((u32 *) (out_mad->data + 36)) &
+ props->device_cap_flags = mdev->device_cap_flags;
+ props->vendor_id = be32_to_cpup((u32 *) (out_mad->data + 36)) &
0xffffff;
- props->vendor_part_id = be16_to_cpup((u16 *) (out_mad->data + 30));
- props->hw_ver = be16_to_cpup((u16 *) (out_mad->data + 32));
+ props->vendor_part_id = be16_to_cpup((u16 *) (out_mad->data + 30));
+ props->hw_ver = be16_to_cpup((u16 *) (out_mad->data + 32));
memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
memcpy(&props->node_guid, out_mad->data + 12, 8);


2005-03-03 23:30:45

by Roland Dreier

Subject: [PATCH][23/26] IB/mthca: mem-free multicast table

Tie up one last loose end by mapping enough context memory to cover
the whole multicast table during initialization, and then enable
mem-free mode. mthca now supports enough of mem-free mode so that
IPoIB works with a mem-free HCA.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:02.565340719 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:03.005245231 -0800
@@ -207,8 +207,9 @@
};

struct mthca_mcg_table {
- struct semaphore sem;
- struct mthca_alloc alloc;
+ struct semaphore sem;
+ struct mthca_alloc alloc;
+ struct mthca_icm_table *table;
};

struct mthca_dev {
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:57.858362446 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:13:03.005245231 -0800
@@ -412,8 +412,29 @@
goto err_unmap_eqp;
}

+ /*
+ * It's not strictly required, but for simplicity just map the
+ * whole multicast group table now. The table isn't very big
+ * and it's a lot easier than trying to track ref counts.
+ */
+ mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+ MTHCA_MGM_ENTRY_SIZE,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ 0);
+ if (!mdev->mcg_table.table) {
+ mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_cq;
+ }
+
return 0;

+err_unmap_cq:
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+
err_unmap_eqp:
mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);

@@ -587,7 +608,7 @@
goto err_uar_free;
}

- err = mthca_init_pd_table(dev);
+ err = mthca_init_pd_table(dev);
if (err) {
mthca_err(dev, "Failed to initialize "
"protection domain table, aborting.\n");
@@ -635,13 +656,6 @@

mthca_dbg(dev, "NOP command IRQ test passed\n");

- if (dev->hca_type == ARBEL_NATIVE) {
- mthca_warn(dev, "Sorry, native MT25208 mode support is not complete, "
- "aborting.\n");
- err = -ENODEV;
- goto err_cmd_poll;
- }
-
err = mthca_init_cq_table(dev);
if (err) {
mthca_err(dev, "Failed to initialize "
@@ -704,7 +718,7 @@

err_uar_table_free:
mthca_cleanup_uar_table(dev);
- return err;
+ return err;
}

static int __devinit mthca_request_regions(struct pci_dev *pdev,
@@ -814,6 +828,7 @@
const struct pci_device_id *id)
{
static int mthca_version_printed = 0;
+ static int mthca_memfree_warned = 0;
int ddr_hidden = 0;
int err;
struct mthca_dev *mdev;
@@ -893,6 +908,10 @@
mdev->pdev = pdev;
mdev->hca_type = id->driver_data;

+ if (mdev->hca_type == ARBEL_NATIVE && !mthca_memfree_warned++)
+ mthca_warn(mdev, "Warning: native MT25208 mode support is incomplete. "
+ "Your HCA may not work properly.\n");
+
if (ddr_hidden)
mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;


2005-03-03 23:35:38

by Roland Dreier

Subject: [PATCH][26/26] IB: MAD cancel callbacks from thread

From: Sean Hefty <[email protected]>

Modify ib_cancel_mad() to invoke a user's send completion callback from
a different thread context than that used by the caller. This allows
a caller to hold a lock while calling ib_cancel_mad() even though the
same lock is also acquired from its send handler, without deadlocking.

Signed-off-by: Sean Hefty <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/core/mad.c 2005-03-03 14:12:54.671054304 -0800
+++ linux-export/drivers/infiniband/core/mad.c 2005-03-03 14:13:04.375947697 -0800
@@ -68,6 +68,7 @@
static void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
struct ib_mad_send_wc *mad_send_wc);
static void timeout_sends(void *data);
+static void cancel_sends(void *data);
static void local_completions(void *data);
static int solicited_mad(struct ib_mad *mad);
static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
@@ -341,6 +342,8 @@
INIT_LIST_HEAD(&mad_agent_priv->local_list);
INIT_WORK(&mad_agent_priv->local_work, local_completions,
mad_agent_priv);
+ INIT_LIST_HEAD(&mad_agent_priv->canceled_list);
+ INIT_WORK(&mad_agent_priv->canceled_work, cancel_sends, mad_agent_priv);
atomic_set(&mad_agent_priv->refcount, 1);
init_waitqueue_head(&mad_agent_priv->wait);

@@ -2004,12 +2007,44 @@
return NULL;
}

+void cancel_sends(void *data)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ mad_agent_priv = (struct ib_mad_agent_private *)data;
+
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->canceled_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->canceled_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ mad_send_wc.wr_id = mad_send_wr->wr_id;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ kfree(mad_send_wr);
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ wake_up(&mad_agent_priv->wait);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
void ib_cancel_mad(struct ib_mad_agent *mad_agent,
u64 wr_id)
{
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_send_wr_private *mad_send_wr;
- struct ib_mad_send_wc mad_send_wc;
unsigned long flags;

mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
@@ -2031,19 +2066,12 @@
}

list_del(&mad_send_wr->agent_list);
+ list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->canceled_list);
adjust_timeout(mad_agent_priv);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);

- mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.wr_id = mad_send_wr->wr_id;
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
-
- kfree(mad_send_wr);
- if (atomic_dec_and_test(&mad_agent_priv->refcount))
- wake_up(&mad_agent_priv->wait);
-
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->canceled_work);
out:
return;
}
--- linux-export.orig/drivers/infiniband/core/mad_priv.h 2005-03-02 20:53:21.000000000 -0800
+++ linux-export/drivers/infiniband/core/mad_priv.h 2005-03-03 14:13:04.375947697 -0800
@@ -95,6 +95,8 @@
unsigned long timeout;
struct list_head local_list;
struct work_struct local_work;
+ struct list_head canceled_list;
+ struct work_struct canceled_work;

atomic_t refcount;
wait_queue_head_t wait;

2005-03-03 23:35:36

by Roland Dreier

Subject: [PATCH][22/26] IB/mthca: mem-free work request posting

Implement posting send and receive work requests for mem-free mode.
Also tidy up a few things in send/receive posting for Tavor mode: fix
smp_wmb()s that should really be wmb()s, since the stores must be
ordered with respect to the HCA's DMA reads of the descriptors and not
just with respect to other CPUs, and annotate tests in the fast path
with likely()/unlikely().

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:01.213634129 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:02.565340719 -0800
@@ -380,10 +380,14 @@
void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
enum ib_event_type event_type);
int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask);
-int mthca_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
- struct ib_send_wr **bad_wr);
-int mthca_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
- struct ib_recv_wr **bad_wr);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
int index, int *dbd, u32 *new_wqe);
int mthca_alloc_qp(struct mthca_dev *dev,
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:13:01.213634129 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:13:02.566340502 -0800
@@ -613,8 +613,6 @@
dev->ib_dev.create_qp = mthca_create_qp;
dev->ib_dev.modify_qp = mthca_modify_qp;
dev->ib_dev.destroy_qp = mthca_destroy_qp;
- dev->ib_dev.post_send = mthca_post_send;
- dev->ib_dev.post_recv = mthca_post_receive;
dev->ib_dev.create_cq = mthca_create_cq;
dev->ib_dev.destroy_cq = mthca_destroy_cq;
dev->ib_dev.poll_cq = mthca_poll_cq;
@@ -625,10 +623,15 @@
dev->ib_dev.detach_mcast = mthca_multicast_detach;
dev->ib_dev.process_mad = mthca_process_mad;

- if (dev->hca_type == ARBEL_NATIVE)
+ if (dev->hca_type == ARBEL_NATIVE) {
dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
- else
+ dev->ib_dev.post_send = mthca_arbel_post_send;
+ dev->ib_dev.post_recv = mthca_arbel_post_receive;
+ } else {
dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+ dev->ib_dev.post_send = mthca_tavor_post_send;
+ dev->ib_dev.post_recv = mthca_tavor_post_receive;
+ }

init_MUTEX(&dev->cap_mask_mutex);

--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:01.713525620 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:02.567340285 -0800
@@ -253,6 +253,16 @@
u16 vcrc;
};

+static const u8 mthca_opcode[] = {
+ [IB_WR_SEND] = MTHCA_OPCODE_SEND,
+ [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM,
+ [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM,
+ [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
{
return qp->qpn >= dev->qp_table.sqp_start &&
@@ -637,9 +647,8 @@

if (qp->transport == MLX || qp->transport == UD)
qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
- else if (attr_mask & IB_QP_PATH_MTU) {
+ else if (attr_mask & IB_QP_PATH_MTU)
qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
- }

if (dev->hca_type == ARBEL_NATIVE) {
qp_context->rq_size_stride =
@@ -1385,8 +1394,8 @@
return 0;
}

-int mthca_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
- struct ib_send_wr **bad_wr)
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
@@ -1402,16 +1411,6 @@
int ind;
u8 op0 = 0;

- static const u8 opcode[] = {
- [IB_WR_SEND] = MTHCA_OPCODE_SEND,
- [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM,
- [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE,
- [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM,
- [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ,
- [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS,
- [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
- };
-
spin_lock_irqsave(&qp->lock, flags);

/* XXX check that state is OK to post send */
@@ -1550,7 +1549,7 @@

qp->wrid[ind + qp->rq.max] = wr->wr_id;

- if (wr->opcode >= ARRAY_SIZE(opcode)) {
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
mthca_err(dev, "opcode invalid\n");
err = -EINVAL;
*bad_wr = wr;
@@ -1561,15 +1560,15 @@
((struct mthca_next_seg *) prev_wqe)->nda_op =
cpu_to_be32(((ind << qp->sq.wqe_shift) +
qp->send_wqe_offset) |
- opcode[wr->opcode]);
- smp_wmb();
+ mthca_opcode[wr->opcode]);
+ wmb();
((struct mthca_next_seg *) prev_wqe)->ee_nds =
cpu_to_be32((size0 ? 0 : MTHCA_NEXT_DBD) | size);
}

if (!size0) {
size0 = size;
- op0 = opcode[wr->opcode];
+ op0 = mthca_opcode[wr->opcode];
}

++ind;
@@ -1578,7 +1577,7 @@
}

out:
- if (nreq) {
+ if (likely(nreq)) {
u32 doorbell[2];

doorbell[0] = cpu_to_be32(((qp->sq.next << qp->sq.wqe_shift) +
@@ -1599,8 +1598,8 @@
return err;
}

-int mthca_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
- struct ib_recv_wr **bad_wr)
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
@@ -1621,7 +1620,7 @@
ind = qp->rq.next;

for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (qp->rq.cur + nreq >= qp->rq.max) {
+ if (unlikely(qp->rq.cur + nreq >= qp->rq.max)) {
mthca_err(dev, "RQ %06x full\n", qp->qpn);
err = -ENOMEM;
*bad_wr = wr;
@@ -1640,7 +1639,7 @@
wqe += sizeof (struct mthca_next_seg);
size = sizeof (struct mthca_next_seg) / 16;

- if (wr->num_sge > qp->rq.max_gs) {
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
err = -EINVAL;
*bad_wr = wr;
goto out;
@@ -1659,10 +1658,10 @@

qp->wrid[ind] = wr->wr_id;

- if (prev_wqe) {
+ if (likely(prev_wqe)) {
((struct mthca_next_seg *) prev_wqe)->nda_op =
cpu_to_be32((ind << qp->rq.wqe_shift) | 1);
- smp_wmb();
+ wmb();
((struct mthca_next_seg *) prev_wqe)->ee_nds =
cpu_to_be32(MTHCA_NEXT_DBD | size);
}
@@ -1676,7 +1675,7 @@
}

out:
- if (nreq) {
+ if (likely(nreq)) {
u32 doorbell[2];

doorbell[0] = cpu_to_be32((qp->rq.next << qp->rq.wqe_shift) | size0);
@@ -1696,6 +1695,247 @@
return err;
}

+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ int size0 = 0;
+ u32 f0 = 0;
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.next & (qp->sq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (qp->sq.cur + nreq >= qp->sq.max) {
+ mthca_err(dev, "SQ full (%d posted, %d max, %d nreq)\n",
+ qp->sq.cur, qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->flags = wr->imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case UD:
+ memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
+ to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+ ((struct mthca_arbel_ud_seg *) wqe)->dqpn =
+ cpu_to_be32(wr->wr.ud.remote_qpn);
+ ((struct mthca_arbel_ud_seg *) wqe)->qkey =
+ cpu_to_be32(wr->wr.ud.remote_qkey);
+
+ wqe += sizeof (struct mthca_arbel_ud_seg);
+ size += sizeof (struct mthca_arbel_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mthca_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mthca_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (likely(prev_wqe)) {
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size);
+ }
+
+ if (!size0) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ u32 doorbell[2];
+
+ doorbell[0] = cpu_to_be32((nreq << 24) |
+ ((qp->sq.next & 0xffff) << 8) |
+ f0 | op0);
+ doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);
+
+ qp->sq.cur += nreq;
+ qp->sq.next += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->sq.db = cpu_to_be32(qp->sq.next & 0xffff);
+
+ /*
+ * Make sure doorbell record is written before we
+ * write MMIO send doorbell.
+ */
+ wmb();
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ spin_unlock_irqrestore(&qp->lock, flags);
+ return err;
+}
+
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+ void *wqe;
+
+ spin_lock_irqsave(&qp->lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.next & (qp->rq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(qp->rq.cur + nreq >= qp->rq.max)) {
+ mthca_err(dev, "RQ %06x full\n", qp->qpn);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32(wr->sg_list[i].length);
+ ((struct mthca_data_seg *) wqe)->lkey =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ ((struct mthca_data_seg *) wqe)->addr =
+ cpu_to_be64(wr->sg_list[i].addr);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < qp->rq.max_gs) {
+ ((struct mthca_data_seg *) wqe)->byte_count = 0;
+ ((struct mthca_data_seg *) wqe)->lkey = cpu_to_be32(0x100);
+ ((struct mthca_data_seg *) wqe)->addr = 0;
+ }
+
+ qp->wrid[ind] = wr->wr_id;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+ }
+out:
+ if (likely(nreq)) {
+ qp->rq.cur += nreq;
+ qp->rq.next += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->rq.db = cpu_to_be32(qp->rq.next & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->lock, flags);
+ return err;
+}
+
int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
int index, int *dbd, u32 *new_wqe)
{

2005-03-03 23:40:40

by Roland Dreier

Subject: [PATCH][20/26] IB/mthca: mem-free QP initialization

Update QP initialization and cleanup to handle mem-free mode. In
mem-free mode, work queue sizes have to be rounded up to a power of 2,
doorbell records have to be allocated, memory must be mapped for the
entries of the QP and extended QP context tables that we use, and the
entries of the receive queue must be initialized.
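
For illustration, the power-of-two rounding is nothing more than the
small counting loop below -- a standalone userspace sketch of the same
idea as mthca_align_qp_size() in this patch (the helper name here is
made up, and this is not code from the driver):

#include <stdio.h>

/* Round x up to the next power of 2 with the same counting loop
 * that mthca_align_qp_size() uses. */
static int round_up_pow2(int x)
{
	int i;

	for (i = 0; 1 << i < x; ++i)
		; /* nothing */

	return 1 << i;
}

int main(void)
{
	/* e.g. a requested RQ depth of 100 becomes 128 */
	printf("%d -> %d\n", 100, round_up_pow2(100));
	return 0;
}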

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:01.213634129 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:01.712525837 -0800
@@ -167,6 +167,9 @@
void *last;
int max_gs;
int wqe_shift;
+
+ int db_index; /* Arbel only */
+ u32 *db;
};

struct mthca_qp {
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:01.215633695 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:01.713525620 -0800
@@ -40,6 +40,7 @@

#include "mthca_dev.h"
#include "mthca_cmd.h"
+#include "mthca_memfree.h"

enum {
MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
@@ -105,8 +106,11 @@

struct mthca_qp_context {
u32 flags;
- u32 sched_queue;
- u32 mtu_msgmax;
+ u32 tavor_sched_queue; /* Reserved on Arbel */
+ u8 mtu_msgmax;
+ u8 rq_size_stride; /* Reserved on Tavor */
+ u8 sq_size_stride; /* Reserved on Tavor */
+ u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */
u32 usr_page;
u32 local_qpn;
u32 remote_qpn;
@@ -121,18 +125,22 @@
u32 reserved2;
u32 next_send_psn;
u32 cqn_snd;
- u32 next_snd_wqe[2];
+ u32 snd_wqe_base_l; /* Next send WQE on Tavor */
+ u32 snd_db_index; /* (debugging only entries) */
u32 last_acked_psn;
u32 ssn;
u32 params2;
u32 rnr_nextrecvpsn;
u32 ra_buff_indx;
u32 cqn_rcv;
- u32 next_rcv_wqe[2];
+ u32 rcv_wqe_base_l; /* Next recv WQE on Tavor */
+ u32 rcv_db_index; /* (debugging only entries) */
u32 qkey;
u32 srqn;
u32 rmsn;
- u32 reserved3[19];
+ u16 rq_wqe_counter; /* reserved on Tavor */
+ u16 sq_wqe_counter; /* reserved on Tavor */
+ u32 reserved3[18];
} __attribute__((packed));

struct mthca_qp_param {
@@ -193,7 +201,7 @@
u32 imm; /* immediate data */
};

-struct mthca_ud_seg {
+struct mthca_tavor_ud_seg {
u32 reserved1;
u32 lkey;
u64 av_addr;
@@ -203,6 +211,13 @@
u32 reserved3[2];
};

+struct mthca_arbel_ud_seg {
+ u32 av[8];
+ u32 dqpn;
+ u32 qkey;
+ u32 reserved[2];
+};
+
struct mthca_bind_seg {
u32 flags; /* [31] Atomic [30] rem write [29] rem read */
u32 reserved;
@@ -617,14 +632,24 @@
break;
}
}
- /* leave sched_queue as 0 */
+
+ /* leave tavor_sched_queue as 0 */
+
if (qp->transport == MLX || qp->transport == UD)
- qp_context->mtu_msgmax = cpu_to_be32((IB_MTU_2048 << 29) |
- (11 << 24));
+ qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
else if (attr_mask & IB_QP_PATH_MTU) {
- qp_context->mtu_msgmax = cpu_to_be32((attr->path_mtu << 29) |
- (31 << 24));
+ qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+ }
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ qp_context->rq_size_stride =
+ ((ffs(qp->rq.max) - 1) << 3) | (qp->rq.wqe_shift - 4);
+ qp_context->sq_size_stride =
+ ((ffs(qp->sq.max) - 1) << 3) | (qp->sq.wqe_shift - 4);
}
+
+ /* leave arbel_sched_queue as 0 */
+
qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
qp_context->local_qpn = cpu_to_be32(qp->qpn);
if (attr_mask & IB_QP_DEST_QPN) {
@@ -708,6 +733,11 @@
qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);

+ if (dev->hca_type == ARBEL_NATIVE) {
+ qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+ qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index);
+ }
+
if (attr_mask & IB_QP_ACCESS_FLAGS) {
/*
* Only enable RDMA/atomics if we have responder
@@ -787,12 +817,16 @@
if (attr_mask & IB_QP_RQ_PSN)
qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);

- qp_context->ra_buff_indx = dev->qp_table.rdb_base +
- ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
- dev->qp_table.rdb_shift);
+ qp_context->ra_buff_indx =
+ cpu_to_be32(dev->qp_table.rdb_base +
+ ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+ dev->qp_table.rdb_shift));

qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);

+ if (dev->hca_type == ARBEL_NATIVE)
+ qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index);
+
if (attr_mask & IB_QP_QKEY) {
qp_context->qkey = cpu_to_be32(attr->qkey);
qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
@@ -860,12 +894,20 @@

size = sizeof (struct mthca_next_seg) +
qp->sq.max_gs * sizeof (struct mthca_data_seg);
- if (qp->transport == MLX)
+ switch (qp->transport) {
+ case MLX:
size += 2 * sizeof (struct mthca_data_seg);
- else if (qp->transport == UD)
- size += sizeof (struct mthca_ud_seg);
- else /* bind seg is as big as atomic + raddr segs */
+ break;
+ case UD:
+ if (dev->hca_type == ARBEL_NATIVE)
+ size += sizeof (struct mthca_arbel_ud_seg);
+ else
+ size += sizeof (struct mthca_tavor_ud_seg);
+ break;
+ default:
+ /* bind seg is as big as atomic + raddr segs */
size += sizeof (struct mthca_bind_seg);
+ }

for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
qp->sq.wqe_shift++)
@@ -942,7 +984,6 @@

err = mthca_mr_alloc_phys(dev, pd->pd_num, dma_list, shift,
npages, 0, size,
- MTHCA_MPT_FLAG_LOCAL_WRITE |
MTHCA_MPT_FLAG_LOCAL_READ,
&qp->mr);
if (err)
@@ -972,6 +1013,60 @@
return err;
}

+static int mthca_alloc_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ int ret = 0;
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+ if (ret)
+ return ret;
+
+ ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+ if (ret)
+ goto err_qpc;
+
+ qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+ qp->qpn, &qp->rq.db);
+ if (qp->rq.db_index < 0) {
+ ret = -ENOMEM;
+ goto err_eqpc;
+ }
+
+ qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+ qp->qpn, &qp->sq.db);
+ if (qp->sq.db_index < 0) {
+ ret = -ENOMEM;
+ goto err_rq_db;
+ }
+ }
+
+ return 0;
+
+err_rq_db:
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+
+err_eqpc:
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+
+err_qpc:
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+
+ return ret;
+}
+
+static void mthca_free_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+ }
+}
+
static int mthca_alloc_qp_common(struct mthca_dev *dev,
struct mthca_pd *pd,
struct mthca_cq *send_cq,
@@ -979,7 +1074,9 @@
enum ib_sig_type send_policy,
struct mthca_qp *qp)
{
- int err;
+ struct mthca_next_seg *wqe;
+ int ret;
+ int i;

spin_lock_init(&qp->lock);
atomic_set(&qp->refcount, 1);
@@ -996,8 +1093,51 @@
qp->rq.last = NULL;
qp->sq.last = NULL;

- err = mthca_alloc_wqe_buf(dev, pd, qp);
- return err;
+ ret = mthca_alloc_memfree(dev, qp);
+ if (ret)
+ return ret;
+
+ ret = mthca_alloc_wqe_buf(dev, pd, qp);
+ if (ret) {
+ mthca_free_memfree(dev, qp);
+ return ret;
+ }
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ for (i = 0; i < qp->rq.max; ++i) {
+ wqe = get_recv_wqe(qp, i);
+ wqe->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) <<
+ qp->rq.wqe_shift);
+ wqe->ee_nds = cpu_to_be32(1 << (qp->rq.wqe_shift - 4));
+ }
+
+ for (i = 0; i < qp->sq.max; ++i) {
+ wqe = get_send_wqe(qp, i);
+ wqe->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) <<
+ qp->sq.wqe_shift) +
+ qp->send_wqe_offset);
+ }
+ }
+
+ return 0;
+}
+
+static void mthca_align_qp_size(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ int i;
+
+ if (dev->hca_type != ARBEL_NATIVE)
+ return;
+
+ for (i = 0; 1 << i < qp->rq.max; ++i)
+ ; /* nothing */
+
+ qp->rq.max = 1 << i;
+
+ for (i = 0; 1 << i < qp->sq.max; ++i)
+ ; /* nothing */
+
+ qp->sq.max = 1 << i;
}

int mthca_alloc_qp(struct mthca_dev *dev,
@@ -1010,6 +1150,8 @@
{
int err;

+ mthca_align_qp_size(dev, qp);
+
switch (type) {
case IB_QPT_RC: qp->transport = RC; break;
case IB_QPT_UC: qp->transport = UC; break;
@@ -1048,6 +1190,8 @@
int err = 0;
u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;

+ mthca_align_qp_size(dev, &sqp->qp);
+
sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
&sqp->header_dma, GFP_KERNEL);
@@ -1160,14 +1304,15 @@

kfree(qp->wrid);

+ mthca_free_memfree(dev, qp);
+
if (is_sqp(dev, qp)) {
atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
dma_free_coherent(&dev->pdev->dev,
to_msqp(qp)->header_buf_size,
to_msqp(qp)->header_buf,
to_msqp(qp)->header_dma);
- }
- else
+ } else
mthca_free(&dev->qp_table.alloc, qp->qpn);
}

@@ -1350,17 +1495,17 @@
break;

case UD:
- ((struct mthca_ud_seg *) wqe)->lkey =
+ ((struct mthca_tavor_ud_seg *) wqe)->lkey =
cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
- ((struct mthca_ud_seg *) wqe)->av_addr =
+ ((struct mthca_tavor_ud_seg *) wqe)->av_addr =
cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
- ((struct mthca_ud_seg *) wqe)->dqpn =
+ ((struct mthca_tavor_ud_seg *) wqe)->dqpn =
cpu_to_be32(wr->wr.ud.remote_qpn);
- ((struct mthca_ud_seg *) wqe)->qkey =
+ ((struct mthca_tavor_ud_seg *) wqe)->qkey =
cpu_to_be32(wr->wr.ud.remote_qkey);

- wqe += sizeof (struct mthca_ud_seg);
- size += sizeof (struct mthca_ud_seg) / 16;
+ wqe += sizeof (struct mthca_tavor_ud_seg);
+ size += sizeof (struct mthca_tavor_ud_seg) / 16;
break;

case MLX:

2005-03-03 23:40:44

by Roland Dreier

Subject: [PATCH][24/26] IB/mthca: QP locking optimization

From: Michael S. Tsirkin <[email protected]>

1. Split the QP spinlock into separate send and receive locks.

The only place where we have to lock both is upon modify_qp, and
that is not on the data path.

2. Avoid taking any QP locks when polling a CQ.

This last part is achieved by getting rid of the cur field in
mthca_wq, and calculating the number of outstanding WQEs by
comparing the head and tail fields. head is only updated by
post, tail is only updated by poll.

In the rare case where a possible overrun is detected, the CQ is
locked and the overrun condition is re-tested, to avoid acting on a
stale tail value.
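
As a minimal standalone illustration of the head/tail accounting (a
userspace sketch with made-up variable names, not driver code): with
free-running unsigned counters, the number of outstanding WQEs is
simply head - tail, and unsigned wraparound keeps the subtraction
correct even after a counter overflows.

#include <stdio.h>

int main(void)
{
	unsigned head = 5;		/* post counter, wrapped past 2^32 */
	unsigned tail = 4294967291u;	/* poll counter, not wrapped yet */
	unsigned max  = 64;		/* queue capacity */

	unsigned outstanding = head - tail;	/* == 10 despite the wrap */

	printf("outstanding %u, would overflow posting one more: %d\n",
	       outstanding, outstanding + 1 >= max);
	return 0;
}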

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:13:01.214633912 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:13:03.417155819 -0800
@@ -423,15 +423,6 @@
is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;

if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
- if (*cur_qp) {
- if (*freed) {
- wmb();
- update_cons_index(dev, cq, *freed);
- *freed = 0;
- }
- spin_unlock(&(*cur_qp)->lock);
- }
-
/*
* We do not have to take the QP table lock here,
* because CQs will be locked while QPs are removed
@@ -446,8 +437,6 @@
err = -EINVAL;
goto out;
}
-
- spin_lock(&(*cur_qp)->lock);
}

entry->qp_num = (*cur_qp)->qpn;
@@ -465,9 +454,9 @@
}

if (wq->last_comp < wqe_index)
- wq->cur -= wqe_index - wq->last_comp;
+ wq->tail += wqe_index - wq->last_comp;
else
- wq->cur -= wq->max - wq->last_comp + wqe_index;
+ wq->tail += wqe_index + wq->max - wq->last_comp;

wq->last_comp = wqe_index;

@@ -551,9 +540,6 @@
update_cons_index(dev, cq, freed);
}

- if (qp)
- spin_unlock(&qp->lock);
-
spin_unlock_irqrestore(&cq->lock, flags);

return err == 0 || err == -EAGAIN ? npolled : err;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:02.120437293 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:03.416156036 -0800
@@ -166,21 +166,22 @@
};

struct mthca_wq {
- int max;
- int cur;
- int next;
- int last_comp;
- void *last;
- int max_gs;
- int wqe_shift;
+ spinlock_t lock;
+ int max;
+ unsigned next_ind;
+ unsigned last_comp;
+ unsigned head;
+ unsigned tail;
+ void *last;
+ int max_gs;
+ int wqe_shift;

- int db_index; /* Arbel only */
- u32 *db;
+ int db_index; /* Arbel only */
+ u32 *db;
};

struct mthca_qp {
struct ib_qp ibqp;
- spinlock_t lock;
atomic_t refcount;
u32 qpn;
int is_direct;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:02.567340285 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:03.418155602 -0800
@@ -577,9 +577,11 @@
else
cur_state = attr->cur_qp_state;
} else {
- spin_lock_irq(&qp->lock);
+ spin_lock_irq(&qp->sq.lock);
+ spin_lock(&qp->rq.lock);
cur_state = qp->state;
- spin_unlock_irq(&qp->lock);
+ spin_unlock(&qp->rq.lock);
+ spin_unlock_irq(&qp->sq.lock);
}

if (attr_mask & IB_QP_STATE) {
@@ -1076,6 +1078,16 @@
}
}

+static void mthca_wq_init(struct mthca_wq* wq)
+{
+ spin_lock_init(&wq->lock);
+ wq->next_ind = 0;
+ wq->last_comp = wq->max - 1;
+ wq->head = 0;
+ wq->tail = 0;
+ wq->last = NULL;
+}
+
static int mthca_alloc_qp_common(struct mthca_dev *dev,
struct mthca_pd *pd,
struct mthca_cq *send_cq,
@@ -1087,20 +1099,13 @@
int ret;
int i;

- spin_lock_init(&qp->lock);
atomic_set(&qp->refcount, 1);
qp->state = IB_QPS_RESET;
qp->atomic_rd_en = 0;
qp->resp_depth = 0;
qp->sq_policy = send_policy;
- qp->rq.cur = 0;
- qp->sq.cur = 0;
- qp->rq.next = 0;
- qp->sq.next = 0;
- qp->rq.last_comp = qp->rq.max - 1;
- qp->sq.last_comp = qp->sq.max - 1;
- qp->rq.last = NULL;
- qp->sq.last = NULL;
+ mthca_wq_init(&qp->sq);
+ mthca_wq_init(&qp->rq);

ret = mthca_alloc_memfree(dev, qp);
if (ret)
@@ -1394,6 +1399,24 @@
return 0;
}

+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+ struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mthca_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max;
+}
+
int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
struct ib_send_wr **bad_wr)
{
@@ -1411,16 +1434,18 @@
int ind;
u8 op0 = 0;

- spin_lock_irqsave(&qp->lock, flags);
+ spin_lock_irqsave(&qp->sq.lock, flags);

/* XXX check that state is OK to post send */

- ind = qp->sq.next;
+ ind = qp->sq.next_ind;

for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (qp->sq.cur + nreq >= qp->sq.max) {
- mthca_err(dev, "SQ full (%d posted, %d max, %d nreq)\n",
- qp->sq.cur, qp->sq.max, nreq);
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
err = -ENOMEM;
*bad_wr = wr;
goto out;
@@ -1580,7 +1605,7 @@
if (likely(nreq)) {
u32 doorbell[2];

- doorbell[0] = cpu_to_be32(((qp->sq.next << qp->sq.wqe_shift) +
+ doorbell[0] = cpu_to_be32(((qp->sq.next_ind << qp->sq.wqe_shift) +
qp->send_wqe_offset) | f0 | op0);
doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);

@@ -1591,10 +1616,10 @@
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

- qp->sq.cur += nreq;
- qp->sq.next = ind;
+ qp->sq.next_ind = ind;
+ qp->sq.head += nreq;

- spin_unlock_irqrestore(&qp->lock, flags);
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
return err;
}

@@ -1613,15 +1638,18 @@
void *wqe;
void *prev_wqe;

- spin_lock_irqsave(&qp->lock, flags);
+ spin_lock_irqsave(&qp->rq.lock, flags);

/* XXX check that state is OK to post receive */

- ind = qp->rq.next;
+ ind = qp->rq.next_ind;

for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (unlikely(qp->rq.cur + nreq >= qp->rq.max)) {
- mthca_err(dev, "RQ %06x full\n", qp->qpn);
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
err = -ENOMEM;
*bad_wr = wr;
goto out;
@@ -1678,7 +1706,7 @@
if (likely(nreq)) {
u32 doorbell[2];

- doorbell[0] = cpu_to_be32((qp->rq.next << qp->rq.wqe_shift) | size0);
+ doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq);

wmb();
@@ -1688,10 +1716,10 @@
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

- qp->rq.cur += nreq;
- qp->rq.next = ind;
+ qp->rq.next_ind = ind;
+ qp->rq.head += nreq;

- spin_unlock_irqrestore(&qp->lock, flags);
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
return err;
}

@@ -1712,16 +1740,18 @@
int ind;
u8 op0 = 0;

- spin_lock_irqsave(&qp->lock, flags);
+ spin_lock_irqsave(&qp->sq.lock, flags);

/* XXX check that state is OK to post send */

- ind = qp->sq.next & (qp->sq.max - 1);
+ ind = qp->sq.head & (qp->sq.max - 1);

for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (qp->sq.cur + nreq >= qp->sq.max) {
- mthca_err(dev, "SQ full (%d posted, %d max, %d nreq)\n",
- qp->sq.cur, qp->sq.max, nreq);
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
err = -ENOMEM;
*bad_wr = wr;
goto out;
@@ -1831,19 +1861,18 @@
u32 doorbell[2];

doorbell[0] = cpu_to_be32((nreq << 24) |
- ((qp->sq.next & 0xffff) << 8) |
+ ((qp->sq.head & 0xffff) << 8) |
f0 | op0);
doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0);

- qp->sq.cur += nreq;
- qp->sq.next += nreq;
+ qp->sq.head += nreq;

/*
* Make sure that descriptors are written before
* doorbell record.
*/
wmb();
- *qp->sq.db = cpu_to_be32(qp->sq.next & 0xffff);
+ *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);

/*
* Make sure doorbell record is written before we
@@ -1855,7 +1884,7 @@
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

- spin_unlock_irqrestore(&qp->lock, flags);
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
return err;
}

@@ -1871,15 +1900,18 @@
int i;
void *wqe;

- spin_lock_irqsave(&qp->lock, flags);
+ spin_lock_irqsave(&qp->rq.lock, flags);

/* XXX check that state is OK to post receive */

- ind = qp->rq.next & (qp->rq.max - 1);
+ ind = qp->rq.head & (qp->rq.max - 1);

for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (unlikely(qp->rq.cur + nreq >= qp->rq.max)) {
- mthca_err(dev, "RQ %06x full\n", qp->qpn);
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
err = -ENOMEM;
*bad_wr = wr;
goto out;
@@ -1921,18 +1953,17 @@
}
out:
if (likely(nreq)) {
- qp->rq.cur += nreq;
- qp->rq.next += nreq;
+ qp->rq.head += nreq;

/*
* Make sure that descriptors are written before
* doorbell record.
*/
wmb();
- *qp->rq.db = cpu_to_be32(qp->rq.next & 0xffff);
+ *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);
}

- spin_unlock_irqrestore(&qp->lock, flags);
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
return err;
}


2005-03-03 23:45:40

by Roland Dreier

Subject: [PATCH][18/26] IB/mthca: mem-free CQ initialization

Update CQ initialization and cleanup to handle mem-free mode: we need
to make sure the HCA has memory mapped for the entry in the CQ context
table that we will use, and we also need to allocate doorbell records.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:59.925913650 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:13:00.312829664 -0800
@@ -39,6 +39,7 @@

#include "mthca_dev.h"
#include "mthca_cmd.h"
+#include "mthca_memfree.h"

enum {
MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
@@ -55,7 +56,7 @@
u32 flags;
u64 start;
u32 logsize_usrpage;
- u32 error_eqn;
+ u32 error_eqn; /* Tavor only */
u32 comp_eqn;
u32 pd;
u32 lkey;
@@ -64,7 +65,9 @@
u32 consumer_index;
u32 producer_index;
u32 cqn;
- u32 reserved[3];
+ u32 ci_db; /* Arbel only */
+ u32 state_db; /* Arbel only */
+ u32 reserved;
} __attribute__((packed));

#define MTHCA_CQ_STATUS_OK ( 0 << 28)
@@ -685,10 +688,30 @@
if (cq->cqn == -1)
return -ENOMEM;

+ if (dev->hca_type == ARBEL_NATIVE) {
+ cq->arm_sn = 1;
+
+ err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+ if (err)
+ goto err_out;
+
+ err = -ENOMEM;
+
+ cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+ cq->cqn, &cq->set_ci_db);
+ if (cq->set_ci_db_index < 0)
+ goto err_out_icm;
+
+ cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+ cq->cqn, &cq->arm_db);
+ if (cq->arm_db_index < 0)
+ goto err_out_ci;
+ }
+
mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
GFP_KERNEL);
if (!mailbox)
- goto err_out;
+ goto err_out_mailbox;

cq_context = MAILBOX_ALIGN(mailbox);

@@ -716,6 +739,11 @@
cq_context->lkey = cpu_to_be32(cq->mr.ibmr.lkey);
cq_context->cqn = cpu_to_be32(cq->cqn);

+ if (dev->hca_type == ARBEL_NATIVE) {
+ cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index);
+ cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+ }
+
err = mthca_SW2HW_CQ(dev, cq_context, cq->cqn, &status);
if (err) {
mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
@@ -751,6 +779,14 @@
err_out_mailbox:
kfree(mailbox);

+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
err_out:
mthca_free(&dev->cq_table.alloc, cq->cqn);

@@ -806,6 +842,12 @@
mthca_free_mr(dev, &cq->mr);
mthca_free_cq_buf(dev, cq);

+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+ }
+
mthca_free(&dev->cq_table.alloc, cq->cqn);
kfree(mailbox);
}
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:12:57.858362446 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:00.312829664 -0800
@@ -143,6 +143,14 @@
int cqn;
int cons_index;
int is_direct;
+
+ /* Next fields are Arbel only */
+ int set_ci_db_index;
+ u32 *set_ci_db;
+ int arm_db_index;
+ u32 *arm_db;
+ int arm_sn;
+
union {
struct mthca_buf_list direct;
struct mthca_buf_list *page_list;

2005-03-03 23:45:41

by Roland Dreier

Subject: [PATCH][19/26] IB/mthca: mem-free CQ operations

Add support for CQ data path operations (request notification, update
consumer index) in mem-free mode.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:13:00.312829664 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:13:01.214633912 -0800
@@ -136,11 +136,15 @@
#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7)
#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7)

-#define MTHCA_CQ_DB_INC_CI (1 << 24)
-#define MTHCA_CQ_DB_REQ_NOT (2 << 24)
-#define MTHCA_CQ_DB_REQ_NOT_SOL (3 << 24)
-#define MTHCA_CQ_DB_SET_CI (4 << 24)
-#define MTHCA_CQ_DB_REQ_NOT_MULT (5 << 24)
+#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)

static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
{
@@ -159,7 +163,7 @@

static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
{
- return cqe_sw(cq, cq->cons_index);
+ return cqe_sw(cq, cq->cons_index & cq->ibcq.cqe);
}

static inline void set_cqe_hw(struct mthca_cqe *cqe)
@@ -167,17 +171,26 @@
cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
}

-static inline void inc_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
- int nent)
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+ int incr)
{
u32 doorbell[2];

- doorbell[0] = cpu_to_be32(MTHCA_CQ_DB_INC_CI | cq->cqn);
- doorbell[1] = cpu_to_be32(nent - 1);
+ if (dev->hca_type == ARBEL_NATIVE) {
+ *cq->set_ci_db = cpu_to_be32(cq->cons_index);
+ wmb();
+ } else {
+ doorbell[0] = cpu_to_be32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);
+ doorbell[1] = cpu_to_be32(incr - 1);

- mthca_write64(doorbell,
- dev->kar + MTHCA_CQ_DOORBELL,
- MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
}

void mthca_cq_event(struct mthca_dev *dev, u32 cqn)
@@ -191,6 +204,8 @@
return;
}

+ ++cq->arm_sn;
+
cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
}

@@ -247,8 +262,8 @@

if (nfreed) {
wmb();
- inc_cons_index(dev, cq, nfreed);
- cq->cons_index = (cq->cons_index + nfreed) & cq->ibcq.cqe;
+ cq->cons_index += nfreed;
+ update_cons_index(dev, cq, nfreed);
}

spin_unlock_irq(&cq->lock);
@@ -341,7 +356,7 @@
break;
}

- err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe);
+ err = mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
if (err)
return err;

@@ -411,7 +426,7 @@
if (*cur_qp) {
if (*freed) {
wmb();
- inc_cons_index(dev, cq, *freed);
+ update_cons_index(dev, cq, *freed);
*freed = 0;
}
spin_unlock(&(*cur_qp)->lock);
@@ -505,7 +520,7 @@
if (likely(free_cqe)) {
set_cqe_hw(cqe);
++(*freed);
- cq->cons_index = (cq->cons_index + 1) & cq->ibcq.cqe;
+ ++cq->cons_index;
}

return err;
@@ -533,7 +548,7 @@

if (freed) {
wmb();
- inc_cons_index(dev, cq, freed);
+ update_cons_index(dev, cq, freed);
}

if (qp)
@@ -544,20 +559,57 @@
return err == 0 || err == -EAGAIN ? npolled : err;
}

-void mthca_arm_cq(struct mthca_dev *dev, struct mthca_cq *cq,
- int solicited)
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify)
{
u32 doorbell[2];

- doorbell[0] = cpu_to_be32((solicited ?
- MTHCA_CQ_DB_REQ_NOT_SOL :
- MTHCA_CQ_DB_REQ_NOT) |
- cq->cqn);
+ doorbell[0] = cpu_to_be32((notify == IB_CQ_SOLICITED ?
+ MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+ MTHCA_TAVOR_CQ_DB_REQ_NOT) |
+ to_mcq(cq)->cqn);
doorbell[1] = 0xffffffff;

mthca_write64(doorbell,
- dev->kar + MTHCA_CQ_DOORBELL,
- MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+ return 0;
+}
+
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+ struct mthca_cq *cq = to_mcq(ibcq);
+ u32 doorbell[2];
+ u32 sn;
+ u32 ci;
+
+ sn = cq->arm_sn & 3;
+ ci = cpu_to_be32(cq->cons_index);
+
+ doorbell[0] = ci;
+ doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+ (notify == IB_CQ_SOLICITED ? 1 : 2));
+
+ mthca_write_db_rec(doorbell, cq->arm_db);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ wmb();
+
+ doorbell[0] = cpu_to_be32((sn << 28) |
+ (notify == IB_CQ_SOLICITED ?
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+ MTHCA_ARBEL_CQ_DB_REQ_NOT) |
+ cq->cqn);
+ doorbell[1] = ci;
+
+ mthca_write64(doorbell,
+ to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
+
+ return 0;
}

static void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq *cq)
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:59.077097900 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:13:01.213634129 -0800
@@ -368,8 +368,8 @@

int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
struct ib_wc *entry);
-void mthca_arm_cq(struct mthca_dev *dev, struct mthca_cq *cq,
- int solicited);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
int mthca_init_cq(struct mthca_dev *dev, int nent,
struct mthca_cq *cq);
void mthca_free_cq(struct mthca_dev *dev,
@@ -384,7 +384,7 @@
struct ib_send_wr **bad_wr);
int mthca_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
-int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
+int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
int index, int *dbd, u32 *new_wqe);
int mthca_alloc_qp(struct mthca_dev *dev,
struct mthca_pd *pd,
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:12:59.925913650 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:13:01.213634129 -0800
@@ -421,13 +421,6 @@
return 0;
}

-static int mthca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify notify)
-{
- mthca_arm_cq(to_mdev(cq->device), to_mcq(cq),
- notify == IB_CQ_SOLICITED);
- return 0;
-}
-
static inline u32 convert_access(int acc)
{
return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) |
@@ -625,7 +618,6 @@
dev->ib_dev.create_cq = mthca_create_cq;
dev->ib_dev.destroy_cq = mthca_destroy_cq;
dev->ib_dev.poll_cq = mthca_poll_cq;
- dev->ib_dev.req_notify_cq = mthca_req_notify_cq;
dev->ib_dev.get_dma_mr = mthca_get_dma_mr;
dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr;
dev->ib_dev.dereg_mr = mthca_dereg_mr;
@@ -633,6 +625,11 @@
dev->ib_dev.detach_mcast = mthca_multicast_detach;
dev->ib_dev.process_mad = mthca_process_mad;

+ if (dev->hca_type == ARBEL_NATIVE)
+ dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+ else
+ dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+
init_MUTEX(&dev->cap_mask_mutex);

ret = ib_register_device(&dev->ib_dev);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:00.312829664 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:01.213634129 -0800
@@ -141,7 +141,7 @@
spinlock_t lock;
atomic_t refcount;
int cqn;
- int cons_index;
+ u32 cons_index;
int is_direct;

/* Next fields are Arbel only */
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:12:56.155732030 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:13:01.215633695 -0800
@@ -1551,7 +1551,7 @@
return err;
}

-int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
+int mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
int index, int *dbd, u32 *new_wqe)
{
struct mthca_next_seg *next;
@@ -1561,7 +1561,10 @@
else
next = get_recv_wqe(qp, index);

- *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+ if (dev->hca_type == ARBEL_NATIVE)
+ *dbd = 1;
+ else
+ *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
if (next->ee_nds & cpu_to_be32(0x3f))
*new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
(next->ee_nds & cpu_to_be32(0x3f));

2005-03-03 23:50:22

by Roland Dreier

Subject: [PATCH][13/26] IB/mthca: tweak firmware command debug messages

Slightly improve debugging output for UNMAP_ICM and MODIFY_QP firmware commands.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cmd.c 2005-01-25 20:48:02.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cmd.c 2005-03-03 14:12:58.283270213 -0800
@@ -1305,6 +1305,9 @@

int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status)
{
+ mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+ page_count, (unsigned long long) virt);
+
return mthca_cmd(dev, virt, page_count, 0, CMD_UNMAP_ICM, CMD_TIME_CLASS_B, status);
}

@@ -1538,10 +1541,10 @@
if (0) {
int i;
mthca_dbg(dev, "Dumping QP context:\n");
- printk(" %08x\n", be32_to_cpup(qp_context));
+ printk(" opt param mask: %08x\n", be32_to_cpup(qp_context));
for (i = 0; i < 0x100 / 4; ++i) {
if (i % 8 == 0)
- printk("[%02x] ", i * 4);
+ printk(" [%02x] ", i * 4);
printk(" %08x", be32_to_cpu(((u32 *) qp_context)[i + 2]));
if ((i + 1) % 8 == 0)
printk("\n");

2005-03-03 23:50:24

by Roland Dreier

Subject: [PATCH][16/26] IB/mthca: mem-free doorbell record writing

Add a mthca_write_db_rec() function to wrap writing doorbell records.
On 64-bit archs this is just a single 64-bit write, while on 32-bit
archs it splits the write into two 32-bit writes with a memory barrier
in between to make sure the two halves of the record are written in
the correct order.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_doorbell.h 2005-01-25 20:49:05.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_doorbell.h 2005-03-03 14:12:59.570990692 -0800
@@ -57,6 +57,11 @@
__raw_writeq(*(u64 *) val, dest);
}

+static inline void mthca_write_db_rec(u32 val[2], u32 *db)
+{
+ *(u64 *) db = *(u64 *) val;
+}
+
#else

/*
@@ -80,4 +85,11 @@
spin_unlock_irqrestore(doorbell_lock, flags);
}

+static inline void mthca_write_db_rec(u32 val[2], u32 *db)
+{
+ db[0] = val[0];
+ wmb();
+ db[1] = val[1];
+}
+
#endif

2005-03-03 23:55:22

by Roland Dreier

Subject: [PATCH][10/26] IB/mthca: mem-free memory region support

Add support for mem-free mode to the memory region code. This mostly
amounts to properly munging between memory keys and MPT table indices.
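
Purely as an illustration of the key/index munging, the two 8-bit
rotations used for the Arbel (mem-free) case undo each other. The
userspace sketch below mirrors the hw_index_to_key()/key_to_hw_index()
helpers added by this patch, minus the struct mthca_dev argument and
the Tavor pass-through:

#include <stdio.h>
#include <stdint.h>

static uint32_t hw_index_to_key(uint32_t ind)
{
	return (ind >> 24) | (ind << 8);	/* rotate left by 8 bits */
}

static uint32_t key_to_hw_index(uint32_t key)
{
	return (key << 24) | (key >> 8);	/* rotate right by 8 bits */
}

int main(void)
{
	uint32_t ind = 0x00012345;		/* MPT index from the allocator */
	uint32_t key = hw_index_to_key(ind);	/* becomes the lkey/rkey */

	/* The inverse rotation recovers the original index. */
	printf("index %#x -> key %#x -> index %#x\n",
	       (unsigned) ind, (unsigned) key,
	       (unsigned) key_to_hw_index(key));
	return 0;
}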

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_mr.c 2005-01-15 15:16:11.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_mr.c 2005-03-03 14:12:57.165512841 -0800
@@ -53,7 +53,8 @@
u32 window_count;
u32 window_count_limit;
u64 mtt_seg;
- u32 reserved[3];
+ u32 mtt_sz; /* Arbel only */
+ u32 reserved[2];
} __attribute__((packed));

#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28)
@@ -121,21 +122,38 @@
spin_unlock(&dev->mr_table.mpt_alloc.lock);
}

+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ return (ind >> 24) | (ind << 8);
+ else
+ return ind;
+}
+
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ return (key << 24) | (key >> 8);
+ else
+ return key;
+}
+
int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
u32 access, struct mthca_mr *mr)
{
void *mailbox;
struct mthca_mpt_entry *mpt_entry;
+ u32 key;
int err;
u8 status;

might_sleep();

mr->order = -1;
- mr->ibmr.lkey = mthca_alloc(&dev->mr_table.mpt_alloc);
- if (mr->ibmr.lkey == -1)
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
return -ENOMEM;
- mr->ibmr.rkey = mr->ibmr.lkey;
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);

mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
GFP_KERNEL);
@@ -151,7 +169,7 @@
MTHCA_MPT_FLAG_REGION |
access);
mpt_entry->page_size = 0;
- mpt_entry->key = cpu_to_be32(mr->ibmr.lkey);
+ mpt_entry->key = cpu_to_be32(key);
mpt_entry->pd = cpu_to_be32(pd);
mpt_entry->start = 0;
mpt_entry->length = ~0ULL;
@@ -160,7 +178,7 @@
sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));

err = mthca_SW2HW_MPT(dev, mpt_entry,
- mr->ibmr.lkey & (dev->limits.num_mpts - 1),
+ key & (dev->limits.num_mpts - 1),
&status);
if (err)
mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
@@ -182,6 +200,7 @@
void *mailbox;
u64 *mtt_entry;
struct mthca_mpt_entry *mpt_entry;
+ u32 key;
int err = -ENOMEM;
u8 status;
int i;
@@ -189,10 +208,10 @@
might_sleep();
WARN_ON(buffer_size_shift >= 32);

- mr->ibmr.lkey = mthca_alloc(&dev->mr_table.mpt_alloc);
- if (mr->ibmr.lkey == -1)
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
return -ENOMEM;
- mr->ibmr.rkey = mr->ibmr.lkey;
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);

for (i = dev->limits.mtt_seg_size / 8, mr->order = 0;
i < list_len;
@@ -254,7 +273,7 @@
access);

mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
- mpt_entry->key = cpu_to_be32(mr->ibmr.lkey);
+ mpt_entry->key = cpu_to_be32(key);
mpt_entry->pd = cpu_to_be32(pd);
mpt_entry->start = cpu_to_be64(iova);
mpt_entry->length = cpu_to_be64(total_size);
@@ -275,7 +294,7 @@
}

err = mthca_SW2HW_MPT(dev, mpt_entry,
- mr->ibmr.lkey & (dev->limits.num_mpts - 1),
+ key & (dev->limits.num_mpts - 1),
&status);
if (err)
mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
@@ -307,7 +326,8 @@
might_sleep();

err = mthca_HW2SW_MPT(dev, NULL,
- mr->ibmr.lkey & (dev->limits.num_mpts - 1),
+ key_to_hw_index(dev, mr->ibmr.lkey) &
+ (dev->limits.num_mpts - 1),
&status);
if (err)
mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
@@ -318,7 +338,7 @@
if (mr->order >= 0)
mthca_free_mtt(dev, mr->first_seg, mr->order);

- mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
+ mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
}

int __devinit mthca_init_mr_table(struct mthca_dev *dev)

2005-03-03 23:55:23

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][12/26] IB/mthca: mem-free interrupt handling

Update interrupt handling code to handle mem-free mode. While we're
at it, improve the Tavor interrupt handling to avoid an extra MMIO
read of the event cause register.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:56.152732681 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:57.857362663 -0800
@@ -171,6 +171,7 @@
struct mthca_alloc alloc;
void __iomem *clr_int;
u32 clr_mask;
+ u32 arm_mask;
struct mthca_eq eq[MTHCA_NUM_EQ];
u64 icm_virt;
struct page *icm_page;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:57.462448386 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:57.859362229 -0800
@@ -165,19 +165,46 @@
MTHCA_ASYNC_EVENT_MASK;
}

-static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
{
u32 doorbell[2];

doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_SET_CI | eq->eqn);
doorbell[1] = cpu_to_be32(ci & (eq->nent - 1));

+ /*
+ * This barrier makes sure that all updates to ownership bits
+ * done by set_eqe_hw() hit memory before the consumer index
+ * is updated. set_eq_ci() allows the HCA to possibly write
+ * more EQ entries, and we want to avoid the exceedingly
+ * unlikely possibility of the HCA writing an entry and then
+ * having set_eqe_hw() overwrite the owner field.
+ */
+ wmb();
mthca_write64(doorbell,
dev->kar + MTHCA_EQ_DOORBELL,
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

-static inline void eq_req_not(struct mthca_dev *dev, int eqn)
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ /* See comment in tavor_set_eq_ci() above. */
+ wmb();
+ __raw_writel(cpu_to_be32(ci), dev->eq_regs.arbel.eq_set_ci_base +
+ eq->eqn * 8);
+ /* We still want ordering, just not swabbing, so add a barrier */
+ mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ if (dev->hca_type == ARBEL_NATIVE)
+ arbel_set_eq_ci(dev, eq, ci);
+ else
+ tavor_set_eq_ci(dev, eq, ci);
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
{
u32 doorbell[2];

@@ -189,16 +216,23 @@
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+ writel(eqn_mask, dev->eq_regs.arbel.eq_arm);
+}
+
static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
{
- u32 doorbell[2];
+ if (dev->hca_type != ARBEL_NATIVE) {
+ u32 doorbell[2];

- doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn);
- doorbell[1] = cpu_to_be32(cqn);
+ doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn);
+ doorbell[1] = cpu_to_be32(cqn);

- mthca_write64(doorbell,
- dev->kar + MTHCA_EQ_DOORBELL,
- MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
}

static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
@@ -233,7 +267,7 @@
ib_dispatch_event(&record);
}

-static void mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
{
struct mthca_eqe *eqe;
int disarm_cqn;
@@ -334,60 +368,93 @@
++eq->cons_index;
eqes_found = 1;

- if (set_ci) {
- wmb(); /* see comment below */
+ if (unlikely(set_ci)) {
+ /*
+ * Conditional on hca_type is OK here because
+ * this is a rare case, not the fast path.
+ */
set_eq_ci(dev, eq, eq->cons_index);
set_ci = 0;
}
}

/*
- * This barrier makes sure that all updates to
- * ownership bits done by set_eqe_hw() hit memory
- * before the consumer index is updated. set_eq_ci()
- * allows the HCA to possibly write more EQ entries,
- * and we want to avoid the exceedingly unlikely
- * possibility of the HCA writing an entry and then
- * having set_eqe_hw() overwrite the owner field.
+ * Rely on caller to set consumer index so that we don't have
+ * to test hca_type in our interrupt handling fast path.
*/
- if (likely(eqes_found)) {
- wmb();
- set_eq_ci(dev, eq, eq->cons_index);
- }
- eq_req_not(dev, eq->eqn);
+ return eqes_found;
}

-static irqreturn_t mthca_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
{
struct mthca_dev *dev = dev_ptr;
u32 ecr;
- int work = 0;
int i;

if (dev->eq_table.clr_mask)
writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);

- if ((ecr = readl(dev->eq_regs.tavor.ecr_base + 4)) != 0) {
- work = 1;
-
+ ecr = readl(dev->eq_regs.tavor.ecr_base + 4);
+ if (ecr) {
writel(ecr, dev->eq_regs.tavor.ecr_base +
MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);

for (i = 0; i < MTHCA_NUM_EQ; ++i)
- if (ecr & dev->eq_table.eq[i].ecr_mask)
- mthca_eq_int(dev, &dev->eq_table.eq[i]);
+ if (ecr & dev->eq_table.eq[i].eqn_mask &&
+ mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+ tavor_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+ }
}

- return IRQ_RETVAL(work);
+ return IRQ_RETVAL(ecr);
}

-static irqreturn_t mthca_msi_x_interrupt(int irq, void *eq_ptr,
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr,
struct pt_regs *regs)
{
struct mthca_eq *eq = eq_ptr;
struct mthca_dev *dev = eq->dev;

mthca_eq_int(dev, eq);
+ tavor_set_eq_ci(dev, eq, eq->cons_index);
+ tavor_eq_req_not(dev, eq->eqn);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr, struct pt_regs *regs)
+{
+ struct mthca_dev *dev = dev_ptr;
+ int work = 0;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+ work = 1;
+ arbel_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ }
+
+ arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+ return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr,
+ struct pt_regs *regs)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ arbel_set_eq_ci(dev, eq, eq->cons_index);
+ arbel_eq_req_not(dev, eq->eqn_mask);

/* MSI-X vectors always belong to us */
return IRQ_HANDLED;
@@ -496,10 +563,10 @@
kfree(dma_list);
kfree(mailbox);

- eq->ecr_mask = swab32(1 << eq->eqn);
+ eq->eqn_mask = swab32(1 << eq->eqn);
eq->cons_index = 0;

- eq_req_not(dev, eq->eqn);
+ dev->eq_table.arm_mask |= eq->eqn_mask;

mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
eq->eqn, nent);
@@ -551,6 +618,8 @@
mthca_warn(dev, "HW2SW_EQ returned status 0x%02x\n",
status);

+ dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
if (0) {
mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
@@ -562,7 +631,6 @@
}
}

-
mthca_free_mr(dev, &eq->mr);
for (i = 0; i < npages; ++i)
pci_free_consistent(dev->pdev, PAGE_SIZE,
@@ -780,6 +848,8 @@
(dev->eq_table.inta_pin < 31 ? 4 : 0);
}

+ dev->eq_table.arm_mask = 0;
+
intr = (dev->mthca_flags & MTHCA_FLAG_MSI) ?
128 : dev->eq_table.inta_pin;

@@ -810,15 +880,20 @@

for (i = 0; i < MTHCA_NUM_EQ; ++i) {
err = request_irq(dev->eq_table.eq[i].msi_x_vector,
- mthca_msi_x_interrupt, 0,
- eq_name[i], dev->eq_table.eq + i);
+ dev->hca_type == ARBEL_NATIVE ?
+ mthca_arbel_msi_x_interrupt :
+ mthca_tavor_msi_x_interrupt,
+ 0, eq_name[i], dev->eq_table.eq + i);
if (err)
goto err_out_cmd;
dev->eq_table.eq[i].have_irq = 1;
}
} else {
- err = request_irq(dev->pdev->irq, mthca_interrupt, SA_SHIRQ,
- DRV_NAME, dev);
+ err = request_irq(dev->pdev->irq,
+ dev->hca_type == ARBEL_NATIVE ?
+ mthca_arbel_interrupt :
+ mthca_tavor_interrupt,
+ SA_SHIRQ, DRV_NAME, dev);
if (err)
goto err_out_cmd;
dev->eq_table.have_irq = 1;
@@ -842,6 +917,12 @@
mthca_warn(dev, "MAP_EQ for cmd EQ %d returned status 0x%02x\n",
dev->eq_table.eq[MTHCA_EQ_CMD].eqn, status);

+ for (i = 0; i < MTHCA_EQ_CMD; ++i)
+ if (dev->hca_type == ARBEL_NATIVE)
+ arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+ else
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
return 0;

err_out_cmd:
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:56.772598129 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:57.858362446 -0800
@@ -608,13 +608,6 @@
goto err_mr_table_free;
}

- if (dev->hca_type == ARBEL_NATIVE) {
- mthca_warn(dev, "Sorry, native MT25208 mode support is not done, "
- "aborting.\n");
- err = -ENODEV;
- goto err_pd_free;
- }
-
err = mthca_init_eq_table(dev);
if (err) {
mthca_err(dev, "Failed to initialize "
@@ -638,8 +631,16 @@
mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");

goto err_cmd_poll;
- } else
- mthca_dbg(dev, "NOP command IRQ test passed\n");
+ }
+
+ mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_warn(dev, "Sorry, native MT25208 mode support is not complete, "
+ "aborting.\n");
+ err = -ENODEV;
+ goto err_cmd_poll;
+ }

err = mthca_init_cq_table(dev);
if (err) {
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:12:56.153732464 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:12:57.858362446 -0800
@@ -70,7 +70,7 @@
struct mthca_eq {
struct mthca_dev *dev;
int eqn;
- u32 ecr_mask;
+ u32 eqn_mask;
u32 cons_index;
u16 msi_x_vector;
u16 msi_x_entry;

2005-03-04 00:00:02

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][9/26] IB/mthca: dynamic context memory mapping for mem-free mode

Add support for mapping more memory into the HCA's context to cover
context tables when new objects are allocated. Pass the object
size into mthca_alloc_icm_table(), reference count the ICM chunks,
and add new mthca_table_get() and mthca_table_put() functions to
handle mapping memory when allocating or destroying objects.
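
As a rough usage sketch (not part of this patch -- the real callers are
added as each object type gains mem-free support), an allocation path
pins the ICM chunk backing its context entry and the destroy path drops
the reference:

    /* Hypothetical caller, for illustration only: pin the ICM chunk
     * that backs the CQ context entry for "cqn" before the CQ is
     * handed to the hardware, and release it again on destroy. */
    static int example_get_cq_context(struct mthca_dev *dev, int cqn)
    {
            /* maps (and refcounts) the chunk covering this object if needed */
            return mthca_table_get(dev, dev->cq_table.table, cqn);
    }

    static void example_put_cq_context(struct mthca_dev *dev, int cqn)
    {
            /* unmaps and frees the chunk once its refcount reaches zero */
            mthca_table_put(dev, dev->cq_table.table, cqn);
    }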

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:56.152732681 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:56.772598129 -0800
@@ -363,10 +363,9 @@
}

mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
- mdev->limits.num_mtt_segs *
init_hca->mtt_seg_sz,
- mdev->limits.reserved_mtts *
- init_hca->mtt_seg_sz, 1);
+ mdev->limits.num_mtt_segs,
+ mdev->limits.reserved_mtts, 1);
if (!mdev->mr_table.mtt_table) {
mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
err = -ENOMEM;
@@ -374,10 +373,9 @@
}

mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
- mdev->limits.num_mpts *
dev_lim->mpt_entry_sz,
- mdev->limits.reserved_mrws *
- dev_lim->mpt_entry_sz, 1);
+ mdev->limits.num_mpts,
+ mdev->limits.reserved_mrws, 1);
if (!mdev->mr_table.mpt_table) {
mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
err = -ENOMEM;
@@ -385,10 +383,9 @@
}

mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
- mdev->limits.num_qps *
dev_lim->qpc_entry_sz,
- mdev->limits.reserved_qps *
- dev_lim->qpc_entry_sz, 1);
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps, 0);
if (!mdev->qp_table.qp_table) {
mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
err = -ENOMEM;
@@ -396,10 +393,9 @@
}

mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
- mdev->limits.num_qps *
dev_lim->eqpc_entry_sz,
- mdev->limits.reserved_qps *
- dev_lim->eqpc_entry_sz, 1);
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps, 0);
if (!mdev->qp_table.eqp_table) {
mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
err = -ENOMEM;
@@ -407,10 +403,9 @@
}

mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
- mdev->limits.num_cqs *
dev_lim->cqc_entry_sz,
- mdev->limits.reserved_cqs *
- dev_lim->cqc_entry_sz, 1);
+ mdev->limits.num_cqs,
+ mdev->limits.reserved_cqs, 0);
if (!mdev->cq_table.table) {
mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
err = -ENOMEM;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_memfree.c 2005-01-25 20:46:29.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_memfree.c 2005-03-03 14:12:56.773597912 -0800
@@ -79,6 +79,7 @@
if (!icm)
return icm;

+ icm->refcount = 0;
INIT_LIST_HEAD(&icm->chunk_list);

cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
@@ -138,9 +139,62 @@
return NULL;
}

+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+ int ret = 0;
+ u8 status;
+
+ down(&table->mutex);
+
+ if (table->icm[i]) {
+ ++table->icm[i]->refcount;
+ goto out;
+ }
+
+ table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+ (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN);
+ if (!table->icm[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ &status) || status) {
+ mthca_free_icm(dev, table->icm[i]);
+ table->icm[i] = NULL;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ++table->icm[i]->refcount;
+
+out:
+ up(&table->mutex);
+ return ret;
+}
+
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+ u8 status;
+
+ down(&table->mutex);
+
+ if (--table->icm[i]->refcount == 0) {
+ mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE >> 12, &status);
+ mthca_free_icm(dev, table->icm[i]);
+ table->icm[i] = NULL;
+ }
+
+ up(&table->mutex);
+}
+
struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
- u64 virt, unsigned size,
- unsigned reserved,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
int use_lowmem)
{
struct mthca_icm_table *table;
@@ -148,20 +202,23 @@
int i;
u8 status;

- num_icm = size / MTHCA_TABLE_CHUNK_SIZE;
+ num_icm = obj_size * nobj / MTHCA_TABLE_CHUNK_SIZE;

table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
if (!table)
return NULL;

- table->virt = virt;
- table->num_icm = num_icm;
- init_MUTEX(&table->sem);
+ table->virt = virt;
+ table->num_icm = num_icm;
+ table->num_obj = nobj;
+ table->obj_size = obj_size;
+ table->lowmem = use_lowmem;
+ init_MUTEX(&table->mutex);

for (i = 0; i < num_icm; ++i)
table->icm[i] = NULL;

- for (i = 0; i < (reserved + MTHCA_TABLE_CHUNK_SIZE - 1) / MTHCA_TABLE_CHUNK_SIZE; ++i) {
+ for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
(use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
__GFP_NOWARN);
@@ -173,6 +230,12 @@
table->icm[i] = NULL;
goto err;
}
+
+ /*
+ * Add a reference to this ICM chunk so that it never
+ * gets freed (since it contains reserved firmware objects).
+ */
+ ++table->icm[i]->refcount;
}

return table;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_memfree.h 2005-01-25 20:46:29.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_memfree.h 2005-03-03 14:12:56.773597912 -0800
@@ -53,12 +53,16 @@

struct mthca_icm {
struct list_head chunk_list;
+ int refcount;
};

struct mthca_icm_table {
u64 virt;
int num_icm;
- struct semaphore sem;
+ int num_obj;
+ int obj_size;
+ int lowmem;
+ struct semaphore mutex;
struct mthca_icm *icm[0];
};

@@ -75,10 +79,12 @@
void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm);

struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
- u64 virt, unsigned size,
- unsigned reserved,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
int use_lowmem);
void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);

static inline void mthca_icm_first(struct mthca_icm *icm,
struct mthca_icm_iter *iter)

2005-03-04 00:00:20

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][8/26] IB/mthca: add UAR allocation

Add support for allocating user access regions (UARs). Use this to
allocate a region for the kernel at driver init instead of using the
hard-coded MTHCA_KAR_PAGE index.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/Makefile 2005-01-15 15:16:40.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/Makefile 2005-03-03 14:12:56.155732030 -0800
@@ -9,4 +9,4 @@
ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
- mthca_provider.o mthca_memfree.o
+ mthca_provider.o mthca_memfree.o mthca_uar.o
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:53.538300187 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:56.153732464 -0800
@@ -666,7 +666,7 @@
MTHCA_CQ_FLAG_TR);
cq_context->start = cpu_to_be64(0);
cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
- MTHCA_KAR_PAGE);
+ dev->driver_uar.index);
cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
cq_context->pd = cpu_to_be32(dev->driver_pd.pd_num);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:55.515870922 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:56.152732681 -0800
@@ -65,7 +65,6 @@
};

enum {
- MTHCA_KAR_PAGE = 1,
MTHCA_MAX_PORTS = 2
};

@@ -108,6 +107,7 @@
int gid_table_len;
int pkey_table_len;
int local_ca_ack_delay;
+ int num_uars;
int max_sg;
int num_qps;
int reserved_qps;
@@ -148,6 +148,12 @@
} *page_list;
};

+struct mthca_uar_table {
+ struct mthca_alloc alloc;
+ u64 uarc_base;
+ int uarc_size;
+};
+
struct mthca_pd_table {
struct mthca_alloc alloc;
};
@@ -252,6 +258,7 @@
struct mthca_cmd cmd;
struct mthca_limits limits;

+ struct mthca_uar_table uar_table;
struct mthca_pd_table pd_table;
struct mthca_mr_table mr_table;
struct mthca_eq_table eq_table;
@@ -260,6 +267,7 @@
struct mthca_av_table av_table;
struct mthca_mcg_table mcg_table;

+ struct mthca_uar driver_uar;
struct mthca_pd driver_pd;
struct mthca_mr driver_mr;

@@ -318,6 +326,7 @@
int mthca_array_init(struct mthca_array *array, int nent);
void mthca_array_cleanup(struct mthca_array *array, int nent);

+int mthca_init_uar_table(struct mthca_dev *dev);
int mthca_init_pd_table(struct mthca_dev *dev);
int mthca_init_mr_table(struct mthca_dev *dev);
int mthca_init_eq_table(struct mthca_dev *dev);
@@ -326,6 +335,7 @@
int mthca_init_av_table(struct mthca_dev *dev);
int mthca_init_mcg_table(struct mthca_dev *dev);

+void mthca_cleanup_uar_table(struct mthca_dev *dev);
void mthca_cleanup_pd_table(struct mthca_dev *dev);
void mthca_cleanup_mr_table(struct mthca_dev *dev);
void mthca_cleanup_eq_table(struct mthca_dev *dev);
@@ -337,6 +347,9 @@
int mthca_register_device(struct mthca_dev *dev);
void mthca_unregister_device(struct mthca_dev *dev);

+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
int mthca_pd_alloc(struct mthca_dev *dev, struct mthca_pd *pd);
void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);

--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:55.516870705 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:56.154732247 -0800
@@ -469,7 +469,7 @@
MTHCA_EQ_FLAG_TR);
eq_context->start = cpu_to_be64(0);
eq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
- MTHCA_KAR_PAGE);
+ dev->driver_uar.index);
eq_context->pd = cpu_to_be32(dev->driver_pd.pd_num);
eq_context->intr = intr;
eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:55.516870705 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:56.152732681 -0800
@@ -570,13 +570,35 @@

MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);

- err = mthca_init_pd_table(dev);
+ err = mthca_init_uar_table(dev);
if (err) {
mthca_err(dev, "Failed to initialize "
- "protection domain table, aborting.\n");
+ "user access region table, aborting.\n");
return err;
}

+ err = mthca_uar_alloc(dev, &dev->driver_uar);
+ if (err) {
+ mthca_err(dev, "Failed to allocate driver access region, "
+ "aborting.\n");
+ goto err_uar_table_free;
+ }
+
+ dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!dev->kar) {
+ mthca_err(dev, "Couldn't map kernel access region, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_uar_free;
+ }
+
+ err = mthca_init_pd_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "protection domain table, aborting.\n");
+ goto err_kar_unmap;
+ }
+
err = mthca_init_mr_table(dev);
if (err) {
mthca_err(dev, "Failed to initialize "
@@ -677,7 +699,16 @@

err_pd_table_free:
mthca_cleanup_pd_table(dev);
- return err;
+
+err_kar_unmap:
+ iounmap(dev->kar);
+
+err_uar_free:
+ mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+ mthca_cleanup_uar_table(dev);
+ return err;
}

static int __devinit mthca_request_regions(struct pci_dev *pdev,
@@ -789,7 +820,6 @@
static int mthca_version_printed = 0;
int ddr_hidden = 0;
int err;
- unsigned long mthca_base;
struct mthca_dev *mdev;

if (!mthca_version_printed) {
@@ -891,8 +921,7 @@
sema_init(&mdev->cmd.poll_sem, 1);
mdev->cmd.use_events = 0;

- mthca_base = pci_resource_start(pdev, 0);
- mdev->hcr = ioremap(mthca_base + MTHCA_HCR_BASE, MTHCA_HCR_SIZE);
+ mdev->hcr = ioremap(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE, MTHCA_HCR_SIZE);
if (!mdev->hcr) {
mthca_err(mdev, "Couldn't map command register, "
"aborting.\n");
@@ -900,22 +929,13 @@
goto err_free_dev;
}

- mthca_base = pci_resource_start(pdev, 2);
- mdev->kar = ioremap(mthca_base + PAGE_SIZE * MTHCA_KAR_PAGE, PAGE_SIZE);
- if (!mdev->kar) {
- mthca_err(mdev, "Couldn't map kernel access region, "
- "aborting.\n");
- err = -ENOMEM;
- goto err_iounmap;
- }
-
err = mthca_tune_pci(mdev);
if (err)
- goto err_iounmap_kar;
+ goto err_iounmap;

err = mthca_init_hca(mdev);
if (err)
- goto err_iounmap_kar;
+ goto err_iounmap;

err = mthca_setup_hca(mdev);
if (err)
@@ -948,13 +968,11 @@

mthca_cleanup_mr_table(mdev);
mthca_cleanup_pd_table(mdev);
+ mthca_cleanup_uar_table(mdev);

err_close:
mthca_close_hca(mdev);

-err_iounmap_kar:
- iounmap(mdev->kar);
-
err_iounmap:
iounmap(mdev->hcr);

@@ -1000,9 +1018,12 @@
mthca_cleanup_mr_table(mdev);
mthca_cleanup_pd_table(mdev);

+ iounmap(mdev->kar);
+ mthca_uar_free(mdev, &mdev->driver_uar);
+ mthca_cleanup_uar_table(mdev);
+
mthca_close_hca(mdev);

- iounmap(mdev->kar);
iounmap(mdev->hcr);

if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_profile.c 2005-03-02 20:53:21.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_profile.c 2005-03-03 14:12:56.153732464 -0800
@@ -236,6 +236,7 @@
init_hca->mtt_seg_sz = ffs(dev_lim->mtt_seg_sz) - 7;
break;
case MTHCA_RES_UAR:
+ dev->limits.num_uars = profile[i].num;
init_hca->uar_scratch_base = profile[i].start;
break;
case MTHCA_RES_UDAV:
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:12:54.674053653 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:12:56.153732464 -0800
@@ -49,6 +49,11 @@
DECLARE_PCI_UNMAP_ADDR(mapping)
};

+struct mthca_uar {
+ unsigned long pfn;
+ int index;
+};
+
struct mthca_mr {
struct ib_mr ibmr;
int order;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:12:54.675053436 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:12:56.155732030 -0800
@@ -625,7 +625,7 @@
qp_context->mtu_msgmax = cpu_to_be32((attr->path_mtu << 29) |
(31 << 24));
}
- qp_context->usr_page = cpu_to_be32(MTHCA_KAR_PAGE);
+ qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
qp_context->local_qpn = cpu_to_be32(qp->qpn);
if (attr_mask & IB_QP_DEST_QPN) {
qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/hw/mthca/mthca_uar.c 2005-03-03 14:12:56.152732681 -0800
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include "mthca_dev.h"
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ uar->index = mthca_alloc(&dev->uar_table.alloc);
+ if (uar->index == -1)
+ return -ENOMEM;
+
+ uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+ return 0;
+}
+
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+ int ret;
+
+ ret = mthca_alloc_init(&dev->uar_table.alloc,
+ dev->limits.num_uars,
+ dev->limits.num_uars - 1,
+ dev->limits.reserved_uars);
+
+ return ret;
+}
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+ /* XXX check if any UARs are still allocated? */
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+}

2005-03-04 00:04:55

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][4/26] IB/mthca: improve CQ locking part 2

From: Michael S. Tsirkin <[email protected]>

Locking during the poll CQ operation can be reduced by locking the CQ
while a QP is being removed from the QP array. This also avoids an
extra atomic operation for reference counting.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:52.368554099 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:52.923433653 -0800
@@ -418,14 +418,14 @@
spin_unlock(&(*cur_qp)->lock);
}

- spin_lock(&dev->qp_table.lock);
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
*cur_qp = mthca_array_get(&dev->qp_table.qp,
be32_to_cpu(cqe->my_qpn) &
(dev->limits.num_qps - 1));
- if (*cur_qp)
- atomic_inc(&(*cur_qp)->refcount);
- spin_unlock(&dev->qp_table.lock);
-
if (!*cur_qp) {
mthca_warn(dev, "CQ entry for unknown QP %06x\n",
be32_to_cpu(cqe->my_qpn) & 0xffffff);
@@ -537,12 +537,8 @@
inc_cons_index(dev, cq, freed);
}

- if (qp) {
+ if (qp)
spin_unlock(&qp->lock);
- if (atomic_dec_and_test(&qp->refcount))
- wake_up(&qp->wait);
- }
-

spin_unlock_irqrestore(&cq->lock, flags);

--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-02-03 16:59:28.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:12:52.924433436 -0800
@@ -1083,9 +1083,21 @@
return 0;

err_out_free:
- spin_lock_irq(&dev->qp_table.lock);
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ spin_lock_irq(&send_cq->lock);
+ if (send_cq != recv_cq)
+ spin_lock(&recv_cq->lock);
+
+ spin_lock(&dev->qp_table.lock);
mthca_array_clear(&dev->qp_table.qp, mqpn);
- spin_unlock_irq(&dev->qp_table.lock);
+ spin_unlock(&dev->qp_table.lock);
+
+ if (send_cq != recv_cq)
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);

err_out:
dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
@@ -1100,11 +1112,28 @@
u8 status;
int size;
int i;
+ struct mthca_cq *send_cq;
+ struct mthca_cq *recv_cq;
+
+ send_cq = to_mcq(qp->ibqp.send_cq);
+ recv_cq = to_mcq(qp->ibqp.recv_cq);

- spin_lock_irq(&dev->qp_table.lock);
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ spin_lock_irq(&send_cq->lock);
+ if (send_cq != recv_cq)
+ spin_lock(&recv_cq->lock);
+
+ spin_lock(&dev->qp_table.lock);
mthca_array_clear(&dev->qp_table.qp,
qp->qpn & (dev->limits.num_qps - 1));
- spin_unlock_irq(&dev->qp_table.lock);
+ spin_unlock(&dev->qp_table.lock);
+
+ if (send_cq != recv_cq)
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);

atomic_dec(&qp->refcount);
wait_event(qp->wait, !atomic_read(&qp->refcount));

2005-03-04 00:04:54

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][3/26] IB/mthca: improve CQ locking part 1

From: Michael S. Tsirkin <[email protected]>

Avoid taking the CQ table lock in the fast path by using
synchronize_irq() after removing a CQ from the table to make sure that
no completion events are still in progress. This gets a nice speedup
(about 4%) in IP over IB on my hardware.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:51.832670421 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:52.368554099 -0800
@@ -33,6 +33,7 @@
*/

#include <linux/init.h>
+#include <linux/hardirq.h>

#include <ib_pack.h>

@@ -181,11 +182,7 @@
{
struct mthca_cq *cq;

- spin_lock(&dev->cq_table.lock);
cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
- if (cq)
- atomic_inc(&cq->refcount);
- spin_unlock(&dev->cq_table.lock);

if (!cq) {
mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
@@ -193,9 +190,6 @@
}

cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
-
- if (atomic_dec_and_test(&cq->refcount))
- wake_up(&cq->wait);
}

void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
@@ -783,6 +777,11 @@
cq->cqn & (dev->limits.num_cqs - 1));
spin_unlock_irq(&dev->cq_table.lock);

+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+ synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+ else
+ synchronize_irq(dev->pdev->irq);
+
atomic_dec(&cq->refcount);
wait_event(cq->wait, !atomic_read(&cq->refcount));


2005-03-04 00:19:20

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

Roland Dreier wrote:
> +void cancel_sends(void *data)
> +{
> + struct ib_mad_agent_private *mad_agent_priv;
> + struct ib_mad_send_wr_private *mad_send_wr;
> + struct ib_mad_send_wc mad_send_wc;
> + unsigned long flags;
> +
> + mad_agent_priv = (struct ib_mad_agent_private *)data;

don't add casts to a void pointer, that's silly.



> + mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
> + mad_send_wc.vendor_err = 0;
> +
> + spin_lock_irqsave(&mad_agent_priv->lock, flags);
> + while (!list_empty(&mad_agent_priv->canceled_list)) {
> + mad_send_wr = list_entry(mad_agent_priv->canceled_list.next,
> + struct ib_mad_send_wr_private,
> + agent_list);
> +
> + list_del(&mad_send_wr->agent_list);
> + spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
> +
> + mad_send_wc.wr_id = mad_send_wr->wr_id;
> + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
> + &mad_send_wc);
> +
> + kfree(mad_send_wr);
> + if (atomic_dec_and_test(&mad_agent_priv->refcount))
> + wake_up(&mad_agent_priv->wait);
> + spin_lock_irqsave(&mad_agent_priv->lock, flags);
> + }
> + spin_unlock_irqrestore(&mad_agent_priv->lock, flags);

dumb question... why is the lock dropped? is it just for the
send_handler(), or also for wr_id assigned, kfree, and wake_up() ?

2005-03-04 00:39:59

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH][16/26] IB/mthca: mem-free doorbell record writing

Jeff> Are you concerned about ordering, or write-combining?

ordering... write combining would be fine.

Jeff> I am unaware of a situation where writes are re-ordered into
Jeff> a reversed, descending order for no apparent reason.

Hmm... I've seen ppc64 do some pretty freaky reordering but on the
other hand that's a 64-bit arch so we don't care in this case. I
guess I'd rather keep the barrier there so we don't have the
possibility of a rare hardware crash when the HCA just happens to read
the doorbell record in a corrupt state.

- R.

2005-03-04 00:35:14

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

Jeff> don't add casts to a void pointer, that's silly.

Fair enough...

Jeff> dumb question... why is the lock dropped? is it just for
Jeff> the send_handler(), or also for wr_id assigned, kfree, and
Jeff> wake_up() ?

Not sure... Sean?

- R.

2005-03-04 00:40:00

by Hefty, Sean

[permalink] [raw]
Subject: RE: [openib-general] Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

>Roland Dreier wrote:
>> +void cancel_sends(void *data)
>> +{
>> + struct ib_mad_agent_private *mad_agent_priv;
>> + struct ib_mad_send_wr_private *mad_send_wr;
>> + struct ib_mad_send_wc mad_send_wc;
>> + unsigned long flags;
>> +
>> + mad_agent_priv = (struct ib_mad_agent_private *)data;
>
>don't add casts to a void pointer, that's silly.

This is my bad.

>> + mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
>> + mad_send_wc.vendor_err = 0;
>> +
>> + spin_lock_irqsave(&mad_agent_priv->lock, flags);
>> + while (!list_empty(&mad_agent_priv->canceled_list)) {
>> + mad_send_wr = list_entry(mad_agent_priv->canceled_list.next,
>> + struct ib_mad_send_wr_private,
>> + agent_list);
>> +
>> + list_del(&mad_send_wr->agent_list);
>> + spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>> +
>> + mad_send_wc.wr_id = mad_send_wr->wr_id;
>> + mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
>> + &mad_send_wc);
>> +
>> + kfree(mad_send_wr);
>> + if (atomic_dec_and_test(&mad_agent_priv->refcount))
>> + wake_up(&mad_agent_priv->wait);
>> + spin_lock_irqsave(&mad_agent_priv->lock, flags);
>> + }
>> + spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>
>dumb question... why is the lock dropped? is it just for the
>send_handler(), or also for wr_id assigned, kfree, and wake_up() ?

The lock is dropped to avoid calling the user back with it held. The if
statement / wake_up call near the bottom of the loop can be replaced with a
simple atomic_dec. The test should always fail. The lock is to protect
access to the canceled_list.
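
In other words, the loop follows the usual drop-the-lock-around-the-callback
pattern; roughly (a simplified sketch, not the exact MAD code):

    spin_lock_irqsave(&priv->lock, flags);
    while (!list_empty(&priv->canceled_list)) {
            /* unlink one entry while the list is protected */
            wr = list_entry(priv->canceled_list.next,
                            struct send_wr, agent_list);
            list_del(&wr->agent_list);

            /* drop the lock: the handler may take its own locks or
             * call back into this code */
            spin_unlock_irqrestore(&priv->lock, flags);
            user_send_handler(wr);
            kfree(wr);
            spin_lock_irqsave(&priv->lock, flags);
    }
    spin_unlock_irqrestore(&priv->lock, flags);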

(Sorry about the mailer...)

- Sean

2005-03-04 00:44:24

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH][3/26] IB/mthca: improve CQ locking part 1

Roland Dreier wrote:
> @@ -783,6 +777,11 @@
> cq->cqn & (dev->limits.num_cqs - 1));
> spin_unlock_irq(&dev->cq_table.lock);
>
> + if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
> + synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
> + else
> + synchronize_irq(dev->pdev->irq);
> +


Tangent: I think we need a pci_irq_sync() rather than putting the above
code into each driver.

Jeff


2005-03-04 00:48:31

by Roland Dreier

[permalink] [raw]
Subject: Re: [openib-general] Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

>> don't add casts to a void pointer, that's silly.

How should we handle this nit? Should I post a new version of this
patch or an incremental diff that fixes it up?

- R.

2005-03-04 00:48:31

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH][16/26] IB/mthca: mem-free doorbell record writing

Roland Dreier wrote:
> Jeff> Are you concerned about ordering, or write-combining?
>
> ordering... write combining would be fine.
>
> Jeff> I am unaware of a situation where writes are re-ordered into
> Jeff> a reversed, descending order for no apparent reason.
>
> Hmm... I've seen ppc64 do some pretty freaky reordering but on the
> other hand that's a 64-bit arch so we don't care in this case. I
> guess I'd rather keep the barrier there so we don't have the
> possibility of a rare hardware crash when the HCA just happens to read
> the doorbell record in a corrupt state.

Well, we don't just add code to "hope and pray" for an event that nobody
is sure can even occur...

Does someone have a concrete case where this could happen? ever?

Jeff


2005-03-04 01:04:49

by Greg KH

[permalink] [raw]
Subject: Re: [PATCH][3/26] IB/mthca: improve CQ locking part 1

On Thu, Mar 03, 2005 at 07:35:00PM -0500, Jeff Garzik wrote:
> Roland Dreier wrote:
> >@@ -783,6 +777,11 @@
> > cq->cqn & (dev->limits.num_cqs - 1));
> > spin_unlock_irq(&dev->cq_table.lock);
> >
> >+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
> >+ synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
> >+ else
> >+ synchronize_irq(dev->pdev->irq);
> >+
>
>
> Tangent: I think we need a pci_irq_sync() rather than putting the above
> code into each driver.

Sure, I have no problem accepting that into the pci core.

thanks,

greg k-h

2005-03-04 01:09:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [openib-general] Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

Roland Dreier <[email protected]> wrote:
>
> >> don't add casts to a void pointer, that's silly.
>
> How should we handle this nit? Should I post a new version of this
> patch or an incremental diff that fixes it up?
>

I'll fix it up.

2005-03-04 01:09:24

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH][3/26] IB/mthca: improve CQ locking part 1

Greg> Sure, I have no problem accepting that into the pci core.

What would pci_irq_sync() do exactly?

- R.

2005-03-04 01:13:40

by Andrew Morton

[permalink] [raw]
Subject: Re: [openib-general] Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

Andrew Morton <[email protected]> wrote:
>
> Roland Dreier <[email protected]> wrote:
> >
> > >> don't add casts to a void pointer, that's silly.
> >
> > How should we handle this nit? Should I post a new version of this
> > patch or an incremental diff that fixes it up?
> >
>
> I'll fix it up.

Actually, seeing as 15/26 has vanished into the ether and there have been
quite a few comments, please resend everything.

2005-03-04 01:30:00

by Andrew Morton

[permalink] [raw]
Subject: Re: [openib-general] Re: [PATCH][26/26] IB: MAD cancel callbacks from thread

Andrew Morton <[email protected]> wrote:
>
> Andrew Morton <[email protected]> wrote:
> >
> > Roland Dreier <[email protected]> wrote:
> > >
> > > >> don't add casts to a void pointer, that's silly.
> > >
> > > How should we handle this nit? Should I post a new version of this
> > > patch or an incremental diff that fixes it up?
> > >
> >
> > I'll fix it up.
>
> Actually, seeing as 15/26 has vanished into the ether and there have been
> quite a few comments, please resend everything.

I seem to have forgotten how to operate this computer thingy. I have all
26 patches.

2005-03-04 00:48:30

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH][3/26] IB/mthca: improve CQ locking part 1

> @@ -783,6 +777,11 @@
> cq->cqn & (dev->limits.num_cqs - 1));
> spin_unlock_irq(&dev->cq_table.lock);
> + if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
> + synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
> + else
> + synchronize_irq(dev->pdev->irq);
> +

Jeff> Tangent: I think we need a pci_irq_sync() rather than
Jeff> putting the above code into each driver.

The problem with trying to make it generic is that mthca has multiple
MSI-X vectors, and only the driver author could know that we only need
to synchronize with the completion event vector.
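
For example, the obvious generic helper (purely hypothetical, just to
show the limitation) would be something like:

    /* Hypothetical: only covers the legacy INTx/MSI case.  It can't
     * know that an MSI-X device such as mthca only needs to
     * synchronize against its completion event vector. */
    static inline void pci_irq_sync(struct pci_dev *pdev)
    {
            synchronize_irq(pdev->irq);
    }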

- R.

2005-03-04 01:34:12

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH][16/26] IB/mthca: mem-free doorbell record writing

Jeff> Well, we don't just add code to "hope and pray" for an event
Jeff> that nobody is sure can even occur...

The hardware requires that if the record is written in two 32-bit
chunks, then they must be written in order. Of course the hardware
probably won't be reading just as we're writing, so almost all of the
time we won't notice the problem.

It feels more like "hope and pray" to me to leave the barrier out and
assume that every possible implementation of every architecture will
always write them in order.

Jeff> Does someone have a concrete case where this could happen? ever?

I don't see how you can rule it out on out-of-order architectures. If
the second word becomes ready before the first, then the CPU may
execute the second write before the first.

It's not precisely the same situation, but if you look at mthca_eq.c
you'll see an rmb() in mthca_eq_int(). That's there because on ppc64,
I really saw a situation where code like:

while (foo->x) {
switch (foo->y) {

was behaving as if foo->y was being read before foo->x. Even though
both foo->x and foo->y are in the same cache line, and foo->x was
written by the hardware after foo->y.
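
A sketch of the fix (this is just the shape of the rmb() that
mthca_eq_int() already carries; foo and the event type are made up for
the example):

    while (foo->x) {
            /* Don't let the foo->y load be issued before we have
             * actually observed foo->x set by the hardware. */
            rmb();

            switch (foo->y) {
            case EXAMPLE_EVENT:
                    /* handle the event */
                    break;
            default:
                    break;
            }
    }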

- R.

2005-03-03 23:55:22

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][11/26] IB/mthca: mem-free EQ initialization

Add code to initialize EQ context properly in both Tavor and mem-free mode.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:56.154732247 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:57.462448386 -0800
@@ -54,10 +54,10 @@
u32 flags;
u64 start;
u32 logsize_usrpage;
- u32 pd;
+ u32 tavor_pd; /* reserved for Arbel */
u8 reserved1[3];
u8 intr;
- u32 lost_count;
+ u32 arbel_pd; /* lost_count for Tavor */
u32 lkey;
u32 reserved2[2];
u32 consumer_index;
@@ -75,6 +75,7 @@
#define MTHCA_EQ_STATE_ARMED ( 1 << 8)
#define MTHCA_EQ_STATE_FIRED ( 2 << 8)
#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8)
+#define MTHCA_EQ_STATE_ARBEL ( 8 << 8)

enum {
MTHCA_EVENT_TYPE_COMP = 0x00,
@@ -467,10 +468,16 @@
MTHCA_EQ_OWNER_HW |
MTHCA_EQ_STATE_ARMED |
MTHCA_EQ_FLAG_TR);
- eq_context->start = cpu_to_be64(0);
- eq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
- dev->driver_uar.index);
- eq_context->pd = cpu_to_be32(dev->driver_pd.pd_num);
+ if (dev->hca_type == ARBEL_NATIVE)
+ eq_context->flags |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+ eq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+ if (dev->hca_type == ARBEL_NATIVE) {
+ eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ } else {
+ eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ }
eq_context->intr = intr;
eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey);


2005-03-03 23:50:23

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][15/26] IB/mthca: mem-free doorbell record allocation

Mem-free mode requires the driver to allocate additional doorbell pages
for each user access region. Add support for this in mthca_memfree.c,
and have the driver allocate a table in db_tab for kernel use.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:57.857362663 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:59.077097900 -0800
@@ -268,9 +268,10 @@
struct mthca_av_table av_table;
struct mthca_mcg_table mcg_table;

- struct mthca_uar driver_uar;
- struct mthca_pd driver_pd;
- struct mthca_mr driver_mr;
+ struct mthca_uar driver_uar;
+ struct mthca_db_table *db_tab;
+ struct mthca_pd driver_pd;
+ struct mthca_mr driver_mr;

struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2];
struct ib_ah *sm_ah[MTHCA_MAX_PORTS];
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_memfree.c 2005-03-03 14:12:56.773597912 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_memfree.c 2005-03-03 14:12:59.079097466 -0800
@@ -267,3 +267,199 @@

kfree(table);
}
+
+static u64 mthca_uarc_virt(struct mthca_dev *dev, int page)
+{
+ return dev->uar_table.uarc_base +
+ dev->driver_uar.index * dev->uar_table.uarc_size +
+ page * 4096;
+}
+
+int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db)
+{
+ int group;
+ int start, end, dir;
+ int i, j;
+ struct mthca_db_page *page;
+ int ret = 0;
+ u8 status;
+
+ down(&dev->db_tab->mutex);
+
+ switch (type) {
+ case MTHCA_DB_TYPE_CQ_ARM:
+ case MTHCA_DB_TYPE_SQ:
+ group = 0;
+ start = 0;
+ end = dev->db_tab->max_group1;
+ dir = 1;
+ break;
+
+ case MTHCA_DB_TYPE_CQ_SET_CI:
+ case MTHCA_DB_TYPE_RQ:
+ case MTHCA_DB_TYPE_SRQ:
+ group = 1;
+ start = dev->db_tab->npages - 1;
+ end = dev->db_tab->min_group2;
+ dir = -1;
+ break;
+
+ default:
+ return -1;
+ }
+
+ for (i = start; i != end; i += dir)
+ if (dev->db_tab->page[i].db_rec &&
+ !bitmap_full(dev->db_tab->page[i].used,
+ MTHCA_DB_REC_PER_PAGE)) {
+ page = dev->db_tab->page + i;
+ goto found;
+ }
+
+ if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = dev->db_tab->page + end;
+ page->db_rec = dma_alloc_coherent(&dev->pdev->dev, 4096,
+ &page->mapping, GFP_KERNEL);
+ if (!page->db_rec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memset(page->db_rec, 0, 4096);
+
+ ret = mthca_MAP_ICM_page(dev, page->mapping, mthca_uarc_virt(dev, i), &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ dma_free_coherent(&dev->pdev->dev, 4096,
+ page->db_rec, page->mapping);
+ goto out;
+ }
+
+ bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+ if (group == 0)
+ ++dev->db_tab->max_group1;
+ else
+ --dev->db_tab->min_group2;
+
+found:
+ j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+ set_bit(j, page->used);
+
+ if (group == 1)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+ ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+ page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+ *db = (u32 *) &page->db_rec[j];
+
+out:
+ up(&dev->db_tab->mutex);
+
+ return ret;
+}
+
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+ int i, j;
+ struct mthca_db_page *page;
+ u8 status;
+
+ i = db_index / MTHCA_DB_REC_PER_PAGE;
+ j = db_index % MTHCA_DB_REC_PER_PAGE;
+
+ page = dev->db_tab->page + i;
+
+ down(&dev->db_tab->mutex);
+
+ page->db_rec[j] = 0;
+ if (i >= dev->db_tab->min_group2)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+ clear_bit(j, page->used);
+
+ if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
+ i >= dev->db_tab->max_group1 - 1) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+
+ dma_free_coherent(&dev->pdev->dev, 4096,
+ page->db_rec, page->mapping);
+ page->db_rec = NULL;
+
+ if (i == dev->db_tab->max_group1) {
+ --dev->db_tab->max_group1;
+ /* XXX may be able to unmap more pages now */
+ }
+ if (i == dev->db_tab->min_group2)
+ ++dev->db_tab->min_group2;
+ }
+
+ up(&dev->db_tab->mutex);
+}
+
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+ int i;
+
+ if (dev->hca_type != ARBEL_NATIVE)
+ return 0;
+
+ dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL);
+ if (!dev->db_tab)
+ return -ENOMEM;
+
+ init_MUTEX(&dev->db_tab->mutex);
+
+ dev->db_tab->npages = dev->uar_table.uarc_size / PAGE_SIZE;
+ dev->db_tab->max_group1 = 0;
+ dev->db_tab->min_group2 = dev->db_tab->npages - 1;
+
+ dev->db_tab->page = kmalloc(dev->db_tab->npages *
+ sizeof *dev->db_tab->page,
+ GFP_KERNEL);
+ if (!dev->db_tab->page) {
+ kfree(dev->db_tab);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < dev->db_tab->npages; ++i)
+ dev->db_tab->page[i].db_rec = NULL;
+
+ return 0;
+}
+
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+ int i;
+ u8 status;
+
+ if (dev->hca_type != ARBEL_NATIVE)
+ return;
+
+ /*
+ * Because we don't always free our UARC pages when they
+ * become empty to make mthca_free_db() simpler we need to
+ * make a sweep through the doorbell pages and free any
+ * leftover pages now.
+ */
+ for (i = 0; i < dev->db_tab->npages; ++i) {
+ if (!dev->db_tab->page[i].db_rec)
+ continue;
+
+ if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
+ mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
+
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+
+ dma_free_coherent(&dev->pdev->dev, 4096,
+ dev->db_tab->page[i].db_rec,
+ dev->db_tab->page[i].mapping);
+ }
+
+ kfree(dev->db_tab->page);
+ kfree(dev->db_tab);
+}
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_memfree.h 2005-03-03 14:12:56.773597912 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_memfree.h 2005-03-03 14:12:59.078097683 -0800
@@ -125,4 +125,37 @@
return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
}

+enum {
+ MTHCA_DB_REC_PER_PAGE = 4096 / 8
+};
+
+struct mthca_db_page {
+ DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+ u64 *db_rec;
+ dma_addr_t mapping;
+};
+
+struct mthca_db_table {
+ int npages;
+ int max_group1;
+ int min_group2;
+ struct mthca_db_page *page;
+ struct semaphore mutex;
+};
+
+enum {
+ MTHCA_DB_TYPE_INVALID = 0x0,
+ MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+ MTHCA_DB_TYPE_CQ_ARM = 0x2,
+ MTHCA_DB_TYPE_SQ = 0x3,
+ MTHCA_DB_TYPE_RQ = 0x4,
+ MTHCA_DB_TYPE_SRQ = 0x5,
+ MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
#endif /* MTHCA_MEMFREE_H */
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_profile.c 2005-03-03 14:12:56.153732464 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_profile.c 2005-03-03 14:12:59.078097683 -0800
@@ -244,9 +244,11 @@
dev->av_table.num_ddr_avs = profile[i].num;
break;
case MTHCA_RES_UARC:
- init_hca->uarc_base = profile[i].start;
- init_hca->log_uarc_sz = ffs(request->uarc_size) - 13;
- init_hca->log_uar_sz = ffs(request->num_uar) - 1;
+ dev->uar_table.uarc_size = request->uarc_size;
+ dev->uar_table.uarc_base = profile[i].start;
+ init_hca->uarc_base = profile[i].start;
+ init_hca->log_uarc_sz = ffs(request->uarc_size) - 13;
+ init_hca->log_uar_sz = ffs(request->num_uar) - 1;
break;
default:
break;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_uar.c 2005-03-03 14:12:56.152732681 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_uar.c 2005-03-03 14:12:59.078097683 -0800
@@ -33,6 +33,7 @@
*/

#include "mthca_dev.h"
+#include "mthca_memfree.h"

int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
{
@@ -58,12 +59,20 @@
dev->limits.num_uars,
dev->limits.num_uars - 1,
dev->limits.reserved_uars);
+ if (ret)
+ return ret;
+
+ ret = mthca_init_db_tab(dev);
+ if (ret)
+ mthca_alloc_cleanup(&dev->uar_table.alloc);

return ret;
}

void mthca_cleanup_uar_table(struct mthca_dev *dev)
{
+ mthca_cleanup_db_tab(dev);
+
/* XXX check if any UARs are still allocated? */
mthca_alloc_cleanup(&dev->uar_table.alloc);
}

2005-03-04 01:47:24

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH][16/26] IB/mthca: mem-free doorbell record writing

Roland Dreier wrote:
> Add a mthca_write_db_rec() to wrap writing doorbell records. On
> 64-bit archs, this is just a 64-bit write, while on 32-bit archs it
> splits the write into two 32-bit writes with a memory barrier to make
> sure the two halves of the record are written in the correct order.

> +static inline void mthca_write_db_rec(u32 val[2], u32 *db)
> +{
> + db[0] = val[0];
> + wmb();
> + db[1] = val[1];
> +}
> +


Are you concerned about ordering, or write-combining?

I am unaware of a situation where writes are re-ordered into a reversed,
descending order for no apparent reason.

Jeff


2005-03-04 01:44:37

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][5/26] IB/mthca: CQ cleanups

Simplify some of the code for CQ handling slightly.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:52.923433653 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:53.538300187 -0800
@@ -150,9 +150,8 @@

static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
{
- struct mthca_cqe *cqe;
- cqe = get_cqe(cq, i);
- return (MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner) ? NULL : cqe;
+ struct mthca_cqe *cqe = get_cqe(cq, i);
+ return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
}

static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
@@ -378,7 +377,7 @@
struct mthca_wq *wq;
struct mthca_cqe *cqe;
int wqe_index;
- int is_error = 0;
+ int is_error;
int is_send;
int free_cqe = 1;
int err = 0;
@@ -401,12 +400,9 @@
dump_cqe(cqe);
}

- if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
- MTHCA_ERROR_CQE_OPCODE_MASK) {
- is_error = 1;
- is_send = cqe->opcode & 1;
- } else
- is_send = cqe->is_send & 0x80;
+ is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+ MTHCA_ERROR_CQE_OPCODE_MASK;
+ is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;

if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
if (*cur_qp) {

2005-03-04 01:44:36

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][2/26] IB/mthca: CQ minor tweaks

From: "Michael S. Tsirkin" <[email protected]>

Clean up CQ code so that we only calculate the address of a CQ entry
once when using it.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-02-03 16:59:43.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:51.832670421 -0800
@@ -147,20 +147,21 @@
+ (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
}

-static inline int cqe_sw(struct mthca_cq *cq, int i)
+static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
{
- return !(MTHCA_CQ_ENTRY_OWNER_HW &
- get_cqe(cq, i)->owner);
+ struct mthca_cqe *cqe;
+ cqe = get_cqe(cq, i);
+ return (MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner) ? NULL : cqe;
}

-static inline int next_cqe_sw(struct mthca_cq *cq)
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
{
return cqe_sw(cq, cq->cons_index);
}

-static inline void set_cqe_hw(struct mthca_cq *cq, int entry)
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
{
- get_cqe(cq, entry)->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+ cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
}

static inline void inc_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
@@ -388,7 +389,8 @@
int free_cqe = 1;
int err = 0;

- if (!next_cqe_sw(cq))
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
return -EAGAIN;

/*
@@ -397,8 +399,6 @@
*/
rmb();

- cqe = get_cqe(cq, cq->cons_index);
-
if (0) {
mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
@@ -509,8 +509,8 @@
entry->status = IB_WC_SUCCESS;

out:
- if (free_cqe) {
- set_cqe_hw(cq, cq->cons_index);
+ if (likely(free_cqe)) {
+ set_cqe_hw(cqe);
++(*freed);
cq->cons_index = (cq->cons_index + 1) & cq->ibcq.cqe;
}
@@ -655,7 +655,7 @@
}

for (i = 0; i < nent; ++i)
- set_cqe_hw(cq, i);
+ set_cqe_hw(get_cqe(cq, i));

cq->cqn = mthca_alloc(&dev->cq_table.alloc);
if (cq->cqn == -1)
@@ -773,7 +773,7 @@
int j;

printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
- cq->cqn, cq->cons_index, next_cqe_sw(cq));
+ cq->cqn, cq->cons_index, !!next_cqe_sw(cq));
for (j = 0; j < 16; ++j)
printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
}

2005-03-04 01:43:40

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][7/26] IB/mthca: map registers for mem-free mode

Move the request/ioremap of regions related to event handling into
mthca_eq.c. Map the correct regions depending on whether we're in
Tavor or native mem-free mode.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_config_reg.h 2005-01-25 20:48:48.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_config_reg.h 2005-03-03 14:12:55.516870705 -0800
@@ -46,5 +46,6 @@
#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
#define MTHCA_CLR_INT_BASE 0xf00d8
#define MTHCA_CLR_INT_SIZE 0x00008
+#define MTHCA_EQ_SET_CI_SIZE (8 * 32)

#endif /* MTHCA_CONFIG_REG_H */
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:54.672054087 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:55.515870922 -0800
@@ -237,9 +237,17 @@
struct semaphore cap_mask_mutex;

void __iomem *hcr;
- void __iomem *ecr_base;
- void __iomem *clr_base;
void __iomem *kar;
+ void __iomem *clr_base;
+ union {
+ struct {
+ void __iomem *ecr_base;
+ } tavor;
+ struct {
+ void __iomem *eq_arm;
+ void __iomem *eq_set_ci_base;
+ } arbel;
+ } eq_regs;

struct mthca_cmd cmd;
struct mthca_limits limits;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_eq.c 2005-01-25 20:48:48.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_eq.c 2005-03-03 14:12:55.516870705 -0800
@@ -366,10 +366,10 @@
if (dev->eq_table.clr_mask)
writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);

- if ((ecr = readl(dev->ecr_base + 4)) != 0) {
+ if ((ecr = readl(dev->eq_regs.tavor.ecr_base + 4)) != 0) {
work = 1;

- writel(ecr, dev->ecr_base +
+ writel(ecr, dev->eq_regs.tavor.ecr_base +
MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);

for (i = 0; i < MTHCA_NUM_EQ; ++i)
@@ -578,6 +578,129 @@
dev->eq_table.eq + i);
}

+static int __devinit mthca_map_reg(struct mthca_dev *dev,
+ unsigned long offset, unsigned long size,
+ void __iomem **map)
+{
+ unsigned long base = pci_resource_start(dev->pdev, 0);
+
+ if (!request_mem_region(base + offset, size, DRV_NAME))
+ return -EBUSY;
+
+ *map = ioremap(base + offset, size);
+ if (!*map) {
+ release_mem_region(base + offset, size);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mthca_unmap_reg(struct mthca_dev *dev, unsigned long offset,
+ unsigned long size, void __iomem *map)
+{
+ unsigned long base = pci_resource_start(dev->pdev, 0);
+
+ release_mem_region(base + offset, size);
+ iounmap(map);
+}
+
+static int __devinit mthca_map_eq_regs(struct mthca_dev *dev)
+{
+ unsigned long mthca_base;
+
+ mthca_base = pci_resource_start(dev->pdev, 0);
+
+ if (dev->hca_type == ARBEL_NATIVE) {
+ /*
+ * We assume that the EQ arm and EQ set CI registers
+ * fall within the first BAR. We can't trust the
+ * values firmware gives us, since those addresses are
+ * valid on the HCA's side of the PCI bus but not
+ * necessarily the host side.
+ */
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Add 4 because we limit ourselves to EQs 0 ... 31,
+ * so we only need the low word of the register.
+ */
+ if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ &dev->eq_regs.arbel.eq_arm)) {
+ mthca_err(dev, "Couldn't map EQ arm register, "
+ "aborting.\n");
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_set_ci_base,
+ MTHCA_EQ_SET_CI_SIZE,
+ &dev->eq_regs.arbel.eq_set_ci_base)) {
+ mthca_err(dev, "Couldn't map EQ set CI register, "
+ "aborting.\n");
+ mthca_unmap_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ dev->eq_regs.arbel.eq_arm);
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ return -ENOMEM;
+ }
+ } else {
+ if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+ MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+ &dev->eq_regs.tavor.ecr_base)) {
+ mthca_err(dev, "Couldn't map ecr register, "
+ "aborting.\n");
+ mthca_unmap_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+
+}
+
+static void __devexit mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+ if (dev->hca_type == ARBEL_NATIVE) {
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_set_ci_base,
+ MTHCA_EQ_SET_CI_SIZE,
+ dev->eq_regs.arbel.eq_set_ci_base);
+ mthca_unmap_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ dev->eq_regs.arbel.eq_arm);
+ mthca_unmap_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ } else {
+ mthca_unmap_reg(dev, MTHCA_ECR_BASE,
+ MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+ dev->eq_regs.tavor.ecr_base);
+ mthca_unmap_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ dev->clr_base);
+ }
+}
+
int __devinit mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
{
int ret;
@@ -636,6 +759,10 @@
if (err)
return err;

+ err = mthca_map_eq_regs(dev);
+ if (err)
+ goto err_out_free;
+
if (dev->mthca_flags & MTHCA_FLAG_MSI ||
dev->mthca_flags & MTHCA_FLAG_MSI_X) {
dev->eq_table.clr_mask = 0;
@@ -653,7 +780,7 @@
(dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
&dev->eq_table.eq[MTHCA_EQ_COMP]);
if (err)
- goto err_out_free;
+ goto err_out_unmap;

err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE,
(dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
@@ -720,6 +847,9 @@
err_out_comp:
mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);

+err_out_unmap:
+ mthca_unmap_eq_regs(dev);
+
err_out_free:
mthca_alloc_cleanup(&dev->eq_table.alloc);
return err;
@@ -740,5 +870,7 @@
for (i = 0; i < MTHCA_NUM_EQ; ++i)
mthca_free_eq(dev, &dev->eq_table.eq[i]);

+ mthca_unmap_eq_regs(dev);
+
mthca_alloc_cleanup(&dev->eq_table.alloc);
}
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_main.c 2005-01-25 20:49:05.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_main.c 2005-03-03 14:12:55.516870705 -0800
@@ -686,37 +686,18 @@
int err;

/*
- * We request our first BAR in two chunks, since the MSI-X
- * vector table is right in the middle.
+ * We can't just use pci_request_regions() because the MSI-X
+ * table is right in the middle of the first BAR. If we did
+ * pci_request_region and grab all of the first BAR, then
+ * setting up MSI-X would fail, since the PCI core wants to do
+ * request_mem_region on the MSI-X vector table.
*
- * This is why we can't just use pci_request_regions() -- if
- * we did then setting up MSI-X would fail, since the PCI core
- * wants to do request_mem_region on the MSI-X vector table.
+ * So just request what we need right now, and request any
+ * other regions we need when setting up EQs.
*/
- if (!request_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_HCR_BASE,
- MTHCA_HCR_SIZE,
- DRV_NAME)) {
- err = -EBUSY;
- goto err_hcr_failed;
- }
-
- if (!request_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_ECR_BASE,
- MTHCA_MAP_ECR_SIZE,
- DRV_NAME)) {
- err = -EBUSY;
- goto err_ecr_failed;
- }
-
- if (!request_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_CLR_INT_BASE,
- MTHCA_CLR_INT_SIZE,
- DRV_NAME)) {
- err = -EBUSY;
- goto err_int_failed;
- }
-
+ if (!request_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
+ MTHCA_HCR_SIZE, DRV_NAME))
+ return -EBUSY;

err = pci_request_region(pdev, 2, DRV_NAME);
if (err)
@@ -731,24 +712,11 @@
return 0;

err_bar4_failed:
-
pci_release_region(pdev, 2);
-err_bar2_failed:
-
- release_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_CLR_INT_BASE,
- MTHCA_CLR_INT_SIZE);
-err_int_failed:
-
- release_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_ECR_BASE,
- MTHCA_MAP_ECR_SIZE);
-err_ecr_failed:

- release_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_HCR_BASE,
+err_bar2_failed:
+ release_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
MTHCA_HCR_SIZE);
-err_hcr_failed:

return err;
}
@@ -761,16 +729,7 @@

pci_release_region(pdev, 2);

- release_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_CLR_INT_BASE,
- MTHCA_CLR_INT_SIZE);
-
- release_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_ECR_BASE,
- MTHCA_MAP_ECR_SIZE);
-
- release_mem_region(pci_resource_start(pdev, 0) +
- MTHCA_HCR_BASE,
+ release_mem_region(pci_resource_start(pdev, 0) + MTHCA_HCR_BASE,
MTHCA_HCR_SIZE);
}

@@ -941,31 +900,13 @@
goto err_free_dev;
}

- mdev->clr_base = ioremap(mthca_base + MTHCA_CLR_INT_BASE,
- MTHCA_CLR_INT_SIZE);
- if (!mdev->clr_base) {
- mthca_err(mdev, "Couldn't map interrupt clear register, "
- "aborting.\n");
- err = -ENOMEM;
- goto err_iounmap;
- }
-
- mdev->ecr_base = ioremap(mthca_base + MTHCA_ECR_BASE,
- MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE);
- if (!mdev->ecr_base) {
- mthca_err(mdev, "Couldn't map ecr register, "
- "aborting.\n");
- err = -ENOMEM;
- goto err_iounmap_clr;
- }
-
mthca_base = pci_resource_start(pdev, 2);
mdev->kar = ioremap(mthca_base + PAGE_SIZE * MTHCA_KAR_PAGE, PAGE_SIZE);
if (!mdev->kar) {
mthca_err(mdev, "Couldn't map kernel access region, "
"aborting.\n");
err = -ENOMEM;
- goto err_iounmap_ecr;
+ goto err_iounmap;
}

err = mthca_tune_pci(mdev);
@@ -1014,12 +955,6 @@
err_iounmap_kar:
iounmap(mdev->kar);

-err_iounmap_ecr:
- iounmap(mdev->ecr_base);
-
-err_iounmap_clr:
- iounmap(mdev->clr_base);
-
err_iounmap:
iounmap(mdev->hcr);

@@ -1067,9 +1002,8 @@

mthca_close_hca(mdev);

+ iounmap(mdev->kar);
iounmap(mdev->hcr);
- iounmap(mdev->ecr_base);
- iounmap(mdev->clr_base);

if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
pci_disable_msix(pdev);

2005-03-04 01:43:40

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][6/26] IB: remove unsignaled receives

From: Michael S. Tsirkin <[email protected]>

Remove support for unsignaled receive requests. This is a
non-standard extension to the IB spec that is not used by any known
applications or protocols, and is not supported by newer hardware.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/core/mad.c 2005-03-02 20:53:21.000000000 -0800
+++ linux-export/drivers/infiniband/core/mad.c 2005-03-03 14:12:54.671054304 -0800
@@ -2191,7 +2191,6 @@
recv_wr.next = NULL;
recv_wr.sg_list = &sg_list;
recv_wr.num_sge = 1;
- recv_wr.recv_flags = IB_RECV_SIGNALED;

do {
/* Allocate and map receive buffer */
@@ -2386,7 +2385,6 @@
qp_init_attr.send_cq = qp_info->port_priv->cq;
qp_init_attr.recv_cq = qp_info->port_priv->cq;
qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
- qp_init_attr.rq_sig_type = IB_SIGNAL_ALL_WR;
qp_init_attr.cap.max_send_wr = IB_MAD_QP_SEND_SIZE;
qp_init_attr.cap.max_recv_wr = IB_MAD_QP_RECV_SIZE;
qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h 2005-01-25 20:48:48.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h 2005-03-03 14:12:54.672054087 -0800
@@ -369,14 +369,12 @@
struct mthca_cq *recv_cq,
enum ib_qp_type type,
enum ib_sig_type send_policy,
- enum ib_sig_type recv_policy,
struct mthca_qp *qp);
int mthca_alloc_sqp(struct mthca_dev *dev,
struct mthca_pd *pd,
struct mthca_cq *send_cq,
struct mthca_cq *recv_cq,
enum ib_sig_type send_policy,
- enum ib_sig_type recv_policy,
int qpn,
int port,
struct mthca_sqp *sqp);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.c 2005-01-25 20:49:23.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:12:54.673053870 -0800
@@ -343,7 +343,7 @@
to_mcq(init_attr->send_cq),
to_mcq(init_attr->recv_cq),
init_attr->qp_type, init_attr->sq_sig_type,
- init_attr->rq_sig_type, qp);
+ qp);
qp->ibqp.qp_num = qp->qpn;
break;
}
@@ -364,7 +364,7 @@
err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
to_mcq(init_attr->send_cq),
to_mcq(init_attr->recv_cq),
- init_attr->sq_sig_type, init_attr->rq_sig_type,
+ init_attr->sq_sig_type,
qp->ibqp.qp_num, init_attr->port_num,
to_msqp(qp));
break;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-01-25 20:47:46.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:12:54.674053653 -0800
@@ -154,7 +154,6 @@
void *last;
int max_gs;
int wqe_shift;
- enum ib_sig_type policy;
};

struct mthca_qp {
@@ -172,6 +171,7 @@

struct mthca_wq rq;
struct mthca_wq sq;
+ enum ib_sig_type sq_policy;
int send_wqe_offset;

u64 *wrid;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:12:52.924433436 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c 2005-03-03 14:12:54.675053436 -0800
@@ -690,7 +690,7 @@
MTHCA_QP_BIT_SRE |
MTHCA_QP_BIT_SWE |
MTHCA_QP_BIT_SAE);
- if (qp->sq.policy == IB_SIGNAL_ALL_WR)
+ if (qp->sq_policy == IB_SIGNAL_ALL_WR)
qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
if (attr_mask & IB_QP_RETRY_CNT) {
qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
@@ -778,8 +778,8 @@
qp->resp_depth = attr->max_rd_atomic;
}

- if (qp->rq.policy == IB_SIGNAL_ALL_WR)
- qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+ qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
if (attr_mask & IB_QP_MIN_RNR_TIMER) {
qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
@@ -977,7 +977,6 @@
struct mthca_cq *send_cq,
struct mthca_cq *recv_cq,
enum ib_sig_type send_policy,
- enum ib_sig_type recv_policy,
struct mthca_qp *qp)
{
int err;
@@ -987,8 +986,7 @@
qp->state = IB_QPS_RESET;
qp->atomic_rd_en = 0;
qp->resp_depth = 0;
- qp->sq.policy = send_policy;
- qp->rq.policy = recv_policy;
+ qp->sq_policy = send_policy;
qp->rq.cur = 0;
qp->sq.cur = 0;
qp->rq.next = 0;
@@ -1008,7 +1006,6 @@
struct mthca_cq *recv_cq,
enum ib_qp_type type,
enum ib_sig_type send_policy,
- enum ib_sig_type recv_policy,
struct mthca_qp *qp)
{
int err;
@@ -1025,7 +1022,7 @@
return -ENOMEM;

err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
- send_policy, recv_policy, qp);
+ send_policy, qp);
if (err) {
mthca_free(&dev->qp_table.alloc, qp->qpn);
return err;
@@ -1044,7 +1041,6 @@
struct mthca_cq *send_cq,
struct mthca_cq *recv_cq,
enum ib_sig_type send_policy,
- enum ib_sig_type recv_policy,
int qpn,
int port,
struct mthca_sqp *sqp)
@@ -1073,8 +1069,7 @@
sqp->qp.transport = MLX;

err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
- send_policy, recv_policy,
- &sqp->qp);
+ send_policy, &sqp->qp);
if (err)
goto err_out_free;

@@ -1495,9 +1490,7 @@
((struct mthca_next_seg *) wqe)->nda_op = 0;
((struct mthca_next_seg *) wqe)->ee_nds =
cpu_to_be32(MTHCA_NEXT_DBD);
- ((struct mthca_next_seg *) wqe)->flags =
- (wr->recv_flags & IB_RECV_SIGNALED) ?
- cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0;
+ ((struct mthca_next_seg *) wqe)->flags = 0;

wqe += sizeof (struct mthca_next_seg);
size = sizeof (struct mthca_next_seg) / 16;
--- linux-export.orig/drivers/infiniband/include/ib_verbs.h 2005-01-25 20:47:00.000000000 -0800
+++ linux-export/drivers/infiniband/include/ib_verbs.h 2005-03-03 14:12:54.669054738 -0800
@@ -73,7 +73,6 @@
IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
IB_DEVICE_SRQ_RESIZE = (1<<13),
IB_DEVICE_N_NOTIFY_CQ = (1<<14),
- IB_DEVICE_RQ_SIG_TYPE = (1<<15)
};

enum ib_atomic_cap {
@@ -408,7 +407,6 @@
struct ib_srq *srq;
struct ib_qp_cap cap;
enum ib_sig_type sq_sig_type;
- enum ib_sig_type rq_sig_type;
enum ib_qp_type qp_type;
u8 port_num; /* special QP types only */
};
@@ -533,10 +531,6 @@
IB_SEND_INLINE = (1<<3)
};

-enum ib_recv_flags {
- IB_RECV_SIGNALED = 1
-};
-
struct ib_sge {
u64 addr;
u32 length;
@@ -579,7 +573,6 @@
u64 wr_id;
struct ib_sge *sg_list;
int num_sge;
- int recv_flags;
};

enum ib_access_flags {
--- linux-export.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2005-03-02 20:53:21.000000000 -0800
+++ linux-export/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2005-03-03 14:12:54.668054955 -0800
@@ -105,7 +105,6 @@
.wr_id = wr_id | IPOIB_OP_RECV,
.sg_list = &list,
.num_sge = 1,
- .recv_flags = IB_RECV_SIGNALED
};
struct ib_recv_wr *bad_wr;

--- linux-export.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2005-01-15 15:19:59.000000000 -0800
+++ linux-export/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2005-03-03 14:12:54.667055172 -0800
@@ -165,7 +165,6 @@
.max_recv_sge = 1
},
.sq_sig_type = IB_SIGNAL_ALL_WR,
- .rq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_UD
};


2005-03-04 01:43:39

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][1/26] IB: fix ib_find_cached_gid() port numbering

From: Sean Hefty <[email protected]>

Fix ib_find_cached_gid() to return the correct port number relative to
the port numbering used by the device.

Signed-off-by: Sean Hefty <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>

--- linux-export.orig/drivers/infiniband/core/cache.c 2005-03-02 20:53:21.000000000 -0800
+++ linux-export/drivers/infiniband/core/cache.c 2005-03-03 15:02:57.180310444 -0800
@@ -114,7 +114,7 @@
cache = device->cache.gid_cache[p];
for (i = 0; i < cache->table_len; ++i) {
if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
- *port_num = p;
+ *port_num = p + start_port(device);
if (index)
*index = i;
ret = 0;
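
The fix works because the GID cache is indexed from 0 while the device's
own port numbering starts at start_port().  A rough sketch of what
start_port() is assumed to look like in cache.c (switches number ports
from 0, CAs from 1):

	static inline int start_port(struct ib_device *device)
	{
		return device->node_type == IB_NODE_SWITCH ? 0 : 1;
	}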

2005-03-03 23:50:22

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][14/26] IB/mthca: tweak MAP_ICM_page firmware command

Have the MAP_ICM_page() firmware command wrapper assume pages are
always the HCA-native 4K size rather than the kernel's page size.
This will make handling doorbell pages for mem-free mode simpler.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cmd.c 2005-03-03 14:12:58.283270213 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cmd.c 2005-03-03 14:12:58.619197294 -0800
@@ -1290,7 +1290,7 @@
return -ENOMEM;

inbox[0] = cpu_to_be64(virt);
- inbox[1] = cpu_to_be64(dma_addr | (PAGE_SHIFT - 12));
+ inbox[1] = cpu_to_be64(dma_addr);

err = mthca_cmd(dev, indma, 1, 0, CMD_MAP_ICM, CMD_TIME_CLASS_B, status);
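
With the page size fixed at 4K on the firmware side, a caller on an arch
with a larger kernel page size would map one kernel page as a series of
4K chunks.  A hedged sketch, assuming the mthca_MAP_ICM_page(dev,
dma_addr, virt, status) wrapper signature used elsewhere in the driver:

	static int map_kernel_page(struct mthca_dev *dev, dma_addr_t dma,
				   u64 virt, u8 *status)
	{
		int i, err;

		/* PAGE_SIZE is a multiple of 4K on all supported archs. */
		for (i = 0; i < PAGE_SIZE / 4096; ++i) {
			err = mthca_MAP_ICM_page(dev, dma + i * 4096,
						 virt + i * 4096, status);
			if (err)
				return err;
		}

		return 0;
	}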


2005-03-04 02:40:39

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][17/26] IB/mthca: refactor CQ buffer allocate/free

Factor the allocation and freeing of completion queue buffers into
mthca_alloc_cq_buf() and mthca_free_cq_buf(). This makes the code
more readable and will eventually make handling userspace CQs simpler
(for a userspace CQ, the kernel doesn't have to allocate a buffer at all).

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:56.153732464 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c 2005-03-03 14:12:59.925913650 -0800
@@ -557,32 +557,40 @@
MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

-int mthca_init_cq(struct mthca_dev *dev, int nent,
- struct mthca_cq *cq)
+static void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq *cq)
{
- int size = nent * MTHCA_CQ_ENTRY_SIZE;
- dma_addr_t t;
- void *mailbox = NULL;
- int npages, shift;
- u64 *dma_list = NULL;
- struct mthca_cq_context *cq_context;
- int err = -ENOMEM;
- u8 status;
int i;
+ int size;

- might_sleep();
+ if (cq->is_direct)
+ pci_free_consistent(dev->pdev,
+ (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE,
+ cq->queue.direct.buf,
+ pci_unmap_addr(&cq->queue.direct,
+ mapping));
+ else {
+ size = (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE;
+ for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ if (cq->queue.page_list[i].buf)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ cq->queue.page_list[i].buf,
+ pci_unmap_addr(&cq->queue.page_list[i],
+ mapping));

- mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
- GFP_KERNEL);
- if (!mailbox)
- goto err_out;
+ kfree(cq->queue.page_list);
+ }
+}

- cq_context = MAILBOX_ALIGN(mailbox);
+static int mthca_alloc_cq_buf(struct mthca_dev *dev, int size,
+ struct mthca_cq *cq)
+{
+ int err = -ENOMEM;
+ int npages, shift;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ int i;

if (size <= MTHCA_MAX_DIRECT_CQ_SIZE) {
- if (0)
- mthca_dbg(dev, "Creating direct CQ of size %d\n", size);
-
cq->is_direct = 1;
npages = 1;
shift = get_order(size) + PAGE_SHIFT;
@@ -590,7 +598,7 @@
cq->queue.direct.buf = pci_alloc_consistent(dev->pdev,
size, &t);
if (!cq->queue.direct.buf)
- goto err_out;
+ return -ENOMEM;

pci_unmap_addr_set(&cq->queue.direct, mapping, t);

@@ -603,7 +611,7 @@

dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
if (!dma_list)
- goto err_out_free;
+ goto err_free;

for (i = 0; i < npages; ++i)
dma_list[i] = t + i * (1 << shift);
@@ -612,12 +620,9 @@
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
shift = PAGE_SHIFT;

- if (0)
- mthca_dbg(dev, "Creating indirect CQ with %d pages\n", npages);
-
dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
if (!dma_list)
- goto err_out;
+ return -ENOMEM;

cq->queue.page_list = kmalloc(npages * sizeof *cq->queue.page_list,
GFP_KERNEL);
@@ -631,7 +636,7 @@
cq->queue.page_list[i].buf =
pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
if (!cq->queue.page_list[i].buf)
- goto err_out_free;
+ goto err_free;

dma_list[i] = t;
pci_unmap_addr_set(&cq->queue.page_list[i], mapping, t);
@@ -640,13 +645,6 @@
}
}

- for (i = 0; i < nent; ++i)
- set_cqe_hw(get_cqe(cq, i));
-
- cq->cqn = mthca_alloc(&dev->cq_table.alloc);
- if (cq->cqn == -1)
- goto err_out_free;
-
err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
dma_list, shift, npages,
0, size,
@@ -654,7 +652,52 @@
MTHCA_MPT_FLAG_LOCAL_READ,
&cq->mr);
if (err)
- goto err_out_free_cq;
+ goto err_free;
+
+ kfree(dma_list);
+
+ return 0;
+
+err_free:
+ mthca_free_cq_buf(dev, cq);
+
+err_out:
+ kfree(dma_list);
+
+ return err;
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_cq *cq)
+{
+ int size = nent * MTHCA_CQ_ENTRY_SIZE;
+ void *mailbox = NULL;
+ struct mthca_cq_context *cq_context;
+ int err = -ENOMEM;
+ u8 status;
+ int i;
+
+ might_sleep();
+
+ cq->ibcq.cqe = nent - 1;
+
+ cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+ if (cq->cqn == -1)
+ return -ENOMEM;
+
+ mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
+ GFP_KERNEL);
+ if (!mailbox)
+ goto err_out;
+
+ cq_context = MAILBOX_ALIGN(mailbox);
+
+ err = mthca_alloc_cq_buf(dev, size, cq);
+ if (err)
+ goto err_out_mailbox;
+
+ for (i = 0; i < nent; ++i)
+ set_cqe_hw(get_cqe(cq, i));

spin_lock_init(&cq->lock);
atomic_set(&cq->refcount, 1);
@@ -697,37 +740,20 @@

cq->cons_index = 0;

- kfree(dma_list);
kfree(mailbox);

return 0;

- err_out_free_mr:
+err_out_free_mr:
mthca_free_mr(dev, &cq->mr);
+ mthca_free_cq_buf(dev, cq);

- err_out_free_cq:
- mthca_free(&dev->cq_table.alloc, cq->cqn);
-
- err_out_free:
- if (cq->is_direct)
- pci_free_consistent(dev->pdev, size,
- cq->queue.direct.buf,
- pci_unmap_addr(&cq->queue.direct, mapping));
- else {
- for (i = 0; i < npages; ++i)
- if (cq->queue.page_list[i].buf)
- pci_free_consistent(dev->pdev, PAGE_SIZE,
- cq->queue.page_list[i].buf,
- pci_unmap_addr(&cq->queue.page_list[i],
- mapping));
-
- kfree(cq->queue.page_list);
- }
-
- err_out:
- kfree(dma_list);
+err_out_mailbox:
kfree(mailbox);

+err_out:
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+
return err;
}

@@ -778,27 +804,7 @@
wait_event(cq->wait, !atomic_read(&cq->refcount));

mthca_free_mr(dev, &cq->mr);
-
- if (cq->is_direct)
- pci_free_consistent(dev->pdev,
- (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE,
- cq->queue.direct.buf,
- pci_unmap_addr(&cq->queue.direct,
- mapping));
- else {
- int i;
-
- for (i = 0;
- i < ((cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE + PAGE_SIZE - 1) /
- PAGE_SIZE;
- ++i)
- pci_free_consistent(dev->pdev, PAGE_SIZE,
- cq->queue.page_list[i].buf,
- pci_unmap_addr(&cq->queue.page_list[i],
- mapping));
-
- kfree(cq->queue.page_list);
- }
+ mthca_free_cq_buf(dev, cq);

mthca_free(&dev->cq_table.alloc, cq->cqn);
kfree(mailbox);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:12:54.673053870 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.c 2005-03-03 14:12:59.925913650 -0800
@@ -408,8 +408,7 @@
if (err) {
kfree(cq);
cq = ERR_PTR(err);
- } else
- cq->ibcq.cqe = nent - 1;
+ }

return &cq->ibcq;
}

2005-03-04 02:45:29

by Roland Dreier

[permalink] [raw]
Subject: [PATCH][21/26] IB/mthca: mem-free address vectors

Update address vector handling to support mem-free mode. In mem-free
mode, the address vector (in hardware format) is copied by the driver
into each send work queue entry, so our address handle creation can
become pretty trivial: we just kmalloc() a buffer to hold the
formatted address vector.

Signed-off-by: Roland Dreier <[email protected]>


--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_av.c 2005-01-15 15:19:30.000000000 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_av.c 2005-03-03 14:13:02.121437076 -0800
@@ -60,27 +60,34 @@
u32 index = -1;
struct mthca_av *av = NULL;

- ah->on_hca = 0;
+ ah->type = MTHCA_AH_PCI_POOL;

- if (!atomic_read(&pd->sqp_count) &&
- !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ if (dev->hca_type == ARBEL_NATIVE) {
+ ah->av = kmalloc(sizeof *ah->av, GFP_KERNEL);
+ if (!ah->av)
+ return -ENOMEM;
+
+ ah->type = MTHCA_AH_KMALLOC;
+ av = ah->av;
+ } else if (!atomic_read(&pd->sqp_count) &&
+ !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
index = mthca_alloc(&dev->av_table.alloc);

/* fall back to allocate in host memory */
if (index == -1)
- goto host_alloc;
+ goto on_hca_fail;

av = kmalloc(sizeof *av, GFP_KERNEL);
if (!av)
- goto host_alloc;
+ goto on_hca_fail;

- ah->on_hca = 1;
+ ah->type = MTHCA_AH_ON_HCA;
ah->avdma = dev->av_table.ddr_av_base +
index * MTHCA_AV_SIZE;
}

- host_alloc:
- if (!ah->on_hca) {
+on_hca_fail:
+ if (ah->type == MTHCA_AH_PCI_POOL) {
ah->av = pci_pool_alloc(dev->av_table.pool,
SLAB_KERNEL, &ah->avdma);
if (!ah->av)
@@ -123,7 +130,7 @@
j * 4, be32_to_cpu(((u32 *) av)[j]));
}

- if (ah->on_hca) {
+ if (ah->type == MTHCA_AH_ON_HCA) {
memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
av, MTHCA_AV_SIZE);
kfree(av);
@@ -134,12 +141,21 @@

int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
{
- if (ah->on_hca)
+ switch (ah->type) {
+ case MTHCA_AH_ON_HCA:
mthca_free(&dev->av_table.alloc,
(ah->avdma - dev->av_table.ddr_av_base) /
MTHCA_AV_SIZE);
- else
+ break;
+
+ case MTHCA_AH_PCI_POOL:
pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+ break;
+
+ case MTHCA_AH_KMALLOC:
+ kfree(ah->av);
+ break;
+ }

return 0;
}
@@ -147,7 +163,7 @@
int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
struct ib_ud_header *header)
{
- if (ah->on_hca)
+ if (ah->type == MTHCA_AH_ON_HCA)
return -EINVAL;

header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
@@ -176,6 +192,9 @@
{
int err;

+ if (dev->hca_type == ARBEL_NATIVE)
+ return 0;
+
err = mthca_alloc_init(&dev->av_table.alloc,
dev->av_table.num_ddr_avs,
dev->av_table.num_ddr_avs - 1,
@@ -212,6 +231,9 @@

void __devexit mthca_cleanup_av_table(struct mthca_dev *dev)
{
+ if (dev->hca_type == ARBEL_NATIVE)
+ return;
+
if (dev->av_table.av_map)
iounmap(dev->av_table.av_map);
pci_pool_destroy(dev->av_table.pool);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:01.712525837 -0800
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h 2005-03-03 14:13:02.120437293 -0800
@@ -82,12 +82,18 @@

struct mthca_av;

+enum mthca_ah_type {
+ MTHCA_AH_ON_HCA,
+ MTHCA_AH_PCI_POOL,
+ MTHCA_AH_KMALLOC
+};
+
struct mthca_ah {
- struct ib_ah ibah;
- int on_hca;
- u32 key;
- struct mthca_av *av;
- dma_addr_t avdma;
+ struct ib_ah ibah;
+ enum mthca_ah_type type;
+ u32 key;
+ struct mthca_av *av;
+ dma_addr_t avdma;
};

/*
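
For context, the "copied by the driver into each send work queue entry"
in the description above happens in the mem-free send path; a minimal
sketch, where ud_seg is a hypothetical pointer to the UD segment of a
send WQE:

	static void set_ud_seg_av(void *ud_seg, struct mthca_ah *ah)
	{
		/* In mem-free mode the preformatted hardware AV is copied
		 * directly into the WQE rather than referenced by address. */
		memcpy(ud_seg, ah->av, MTHCA_AV_SIZE);
	}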

2005-03-04 16:37:55

by Greg KH

[permalink] [raw]
Subject: Re: [PATCH][3/26] IB/mthca: improve CQ locking part 1

On Thu, Mar 03, 2005 at 05:02:36PM -0800, Roland Dreier wrote:
> Greg> Sure, I have no problem accepting that into the pci core.
>
> What would pci_irq_sync() do exactly?

Consolidate common code like this? :)

thanks,

greg k-h

2005-03-04 16:44:55

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH][3/26] IB/mthca: improve CQ locking part 1

Roland> What would pci_irq_sync() do exactly?

Greg> Consolidate common code like this? :)

I don't see how one can do that. As I pointed out in my reply to
Jeff, it actually requires understanding how the driver uses the
different MSI-X vectors to know which vector we need to synchronize
against. So it seems pci_irq_sync() would have to be psychic.

If we can figure out how to do that, maybe we can consolidate a lot
more code into an API like

void do_what_i_mean(void);

;)

- R.
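
A sketch of the driver-specific knowledge being referred to, assuming
mthca's EQ table layout (the msi_x_vector field name is an assumption):
only the driver knows which MSI-X vector serves the completion EQ, so a
generic pci_irq_sync() couldn't pick the right one.

	static void sync_cq_vector(struct mthca_dev *mdev)
	{
		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
			/* Only the completion EQ's vector matters when
			 * tearing down a CQ. */
			synchronize_irq(mdev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
		else
			synchronize_irq(mdev->pdev->irq);
	}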