2005-11-10 18:32:03

by Roland Dreier

Subject: [git patch review 1/7] [IB] Have cq_resize() method take an int, not int*

Change the struct ib_device.resize_cq() method to take a plain integer
that holds the new CQ size, rather than a pointer to an integer that
it uses to return the new size. This makes the interface match the
exported ib_resize_cq() signature, and allows the low-level driver to
update the CQ size with proper locking if necessary.

No in-tree drivers implement this method yet.
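
For reference, a low-level driver that does implement the new method
becomes responsible for updating cq->cqe itself, under whatever locking
it needs.  A minimal sketch (hypothetical foo_* driver names, not part
of this patch) might look like:

	static int foo_resize_cq(struct ib_cq *ibcq, int cqe)
	{
		struct foo_cq *cq = to_foo_cq(ibcq);	/* hypothetical driver CQ type */
		int ret;

		spin_lock_irq(&cq->lock);
		ret = foo_hw_resize_cq(cq, cqe);	/* hypothetical firmware command */
		if (!ret)
			ibcq->cqe = cqe;		/* publish new size under the driver's lock */
		spin_unlock_irq(&cq->lock);

		return ret;
	}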

Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/core/verbs.c | 12 ++----------
include/rdma/ib_verbs.h | 2 +-
2 files changed, 3 insertions(+), 11 deletions(-)

applies-to: 08d94f59d6f80937db5d87f0bb60eafcedd811d1
40de2e548c225e3ef859e3c60de9785e37e1b5b1
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 72d3ef7..4f51d79 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -324,16 +324,8 @@ EXPORT_SYMBOL(ib_destroy_cq);
int ib_resize_cq(struct ib_cq *cq,
int cqe)
{
- int ret;
-
- if (!cq->device->resize_cq)
- return -ENOSYS;
-
- ret = cq->device->resize_cq(cq, &cqe);
- if (!ret)
- cq->cqe = cqe;
-
- return ret;
+ return cq->device->resize_cq ?
+ cq->device->resize_cq(cq, cqe) : -ENOSYS;
}
EXPORT_SYMBOL(ib_resize_cq);

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f72d46d..a7f4c35 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -881,7 +881,7 @@ struct ib_device {
struct ib_ucontext *context,
struct ib_udata *udata);
int (*destroy_cq)(struct ib_cq *cq);
- int (*resize_cq)(struct ib_cq *cq, int *cqe);
+ int (*resize_cq)(struct ib_cq *cq, int cqe);
int (*poll_cq)(struct ib_cq *cq, int num_entries,
struct ib_wc *wc);
int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
---
0.99.9e


2005-11-10 18:32:38

by Roland Dreier

Subject: [git patch review 4/7] [IB] mthca: fix posting of atomic operations

The size of work requests for atomic operations was computed
incorrectly in mthca: all sizeofs need to be divided by 16.
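
Concretely, the WQE size field counts 16-byte chunks.  Assuming the
usual 16-byte mthca_raddr_seg and mthca_atomic_seg, the old expression

	sizeof (struct mthca_raddr_seg) / 16 + sizeof (struct mthca_atomic_seg)

works out to 1 + 16 = 17 chunks, while the intended value is
(16 + 16) / 16 = 2 chunks.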

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/hw/mthca/mthca_qp.c | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)

applies-to: 308dce81364b1cbb563942a1a57146c1808e8911
62abb8416f1923f4cef50ce9ce841b919275e3fb
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 7f39af4..190c1dc 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1556,8 +1556,8 @@ int mthca_tavor_post_send(struct ib_qp *
}

wqe += sizeof (struct mthca_atomic_seg);
- size += sizeof (struct mthca_raddr_seg) / 16 +
- sizeof (struct mthca_atomic_seg);
+ size += (sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_atomic_seg)) / 16;
break;

case IB_WR_RDMA_WRITE:
@@ -1876,8 +1876,8 @@ int mthca_arbel_post_send(struct ib_qp *
}

wqe += sizeof (struct mthca_atomic_seg);
- size += sizeof (struct mthca_raddr_seg) / 16 +
- sizeof (struct mthca_atomic_seg);
+ size += (sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_atomic_seg)) / 16;
break;

case IB_WR_RDMA_READ:
---
0.99.9e

2005-11-10 18:32:39

by Roland Dreier

Subject: [git patch review 2/7] [IB] umad: get rid of unused mr array

Now that ib_umad uses the new MAD sending interface, it no longer
needs its own L_Key. So just delete the array of MRs that it keeps.

Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/core/user_mad.c | 29 ++++-------------------------
1 files changed, 4 insertions(+), 25 deletions(-)

applies-to: e7b9ffe6fca9246f29a0a3cdf6417770f5821cef
ec914c52d6208d8752dfd85b48a9aff304911434
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index f5ed36c..d61f544 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -116,7 +116,6 @@ struct ib_umad_file {
spinlock_t recv_lock;
wait_queue_head_t recv_wait;
struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
- struct ib_mr *mr[IB_UMAD_MAX_AGENTS];
};

struct ib_umad_packet {
@@ -505,29 +504,16 @@ found:
goto out;
}

- file->mr[agent_id] = ib_get_dma_mr(agent->qp->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(file->mr[agent_id])) {
- ret = -ENOMEM;
- goto err;
- }
-
if (put_user(agent_id,
(u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
ret = -EFAULT;
- goto err_mr;
+ ib_unregister_mad_agent(agent);
+ goto out;
}

file->agent[agent_id] = agent;
ret = 0;

- goto out;
-
-err_mr:
- ib_dereg_mr(file->mr[agent_id]);
-
-err:
- ib_unregister_mad_agent(agent);
-
out:
up_write(&file->port->mutex);
return ret;
@@ -536,7 +522,6 @@ out:
static int ib_umad_unreg_agent(struct ib_umad_file *file, unsigned long arg)
{
struct ib_mad_agent *agent = NULL;
- struct ib_mr *mr = NULL;
u32 id;
int ret = 0;

@@ -551,16 +536,13 @@ static int ib_umad_unreg_agent(struct ib
}

agent = file->agent[id];
- mr = file->mr[id];
file->agent[id] = NULL;

out:
up_write(&file->port->mutex);

- if (agent) {
+ if (agent)
ib_unregister_mad_agent(agent);
- ib_dereg_mr(mr);
- }

return ret;
}
@@ -629,10 +611,8 @@ static int ib_umad_close(struct inode *i
int i;

for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
- if (file->agent[i]) {
+ if (file->agent[i])
ib_unregister_mad_agent(file->agent[i]);
- ib_dereg_mr(file->mr[i]);
- }

list_for_each_entry_safe(packet, tmp, &file->recv_list, list)
kfree(packet);
@@ -872,7 +852,6 @@ static void ib_umad_kill_port(struct ib_
for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) {
if (!file->agent[id])
continue;
- ib_dereg_mr(file->mr[id]);
ib_unregister_mad_agent(file->agent[id]);
file->agent[id] = NULL;
}
---
0.99.9e

2005-11-10 18:32:05

by Roland Dreier

Subject: [git patch review 3/7] [IB] uverbs: have kernel return QP capabilities

Move the computation of QP capabilities (max scatter/gather entries,
max inline data, etc.) into the kernel, and have the uverbs module
return the values as part of the create QP response. This keeps
precise knowledge of device limits in the low-level kernel driver,
instead of requiring userspace libraries to duplicate it.

This requires an ABI bump, so while we're making changes, get rid of
the max_sge parameter for the modify SRQ command -- it's not used and
shouldn't be there.
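
With the new response, a userspace verbs library can report the
kernel-computed limits back to its caller instead of recomputing them.
Schematically (read_resp() and the cap pointer are hypothetical
stand-ins for whatever the library actually uses):

	struct ib_uverbs_create_qp_resp resp;

	if (read_resp(cmd_fd, &resp, sizeof resp))
		return -1;

	cap->max_send_wr     = resp.max_send_wr;
	cap->max_recv_wr     = resp.max_recv_wr;
	cap->max_send_sge    = resp.max_send_sge;
	cap->max_recv_sge    = resp.max_recv_sge;
	cap->max_inline_data = resp.max_inline_data;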

Signed-off-by: Jack Morgenstein <[email protected]>
Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/core/uverbs_cmd.c | 12 ++--
drivers/infiniband/hw/mthca/mthca_cmd.c | 2 +
drivers/infiniband/hw/mthca/mthca_dev.h | 1
drivers/infiniband/hw/mthca/mthca_main.c | 1
drivers/infiniband/hw/mthca/mthca_provider.c | 2 -
drivers/infiniband/hw/mthca/mthca_provider.h | 1
drivers/infiniband/hw/mthca/mthca_qp.c | 86 ++++++++++++++++++++++++--
include/rdma/ib_user_verbs.h | 9 ++-
8 files changed, 98 insertions(+), 16 deletions(-)

applies-to: 2741f22c820fb664f6958becc4f3d415eea0e61b
77369ed31daac51f4827c50d30f233c45480235a
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 63a7415..ed45da8 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -708,7 +708,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uver
resp->wc[i].opcode = wc[i].opcode;
resp->wc[i].vendor_err = wc[i].vendor_err;
resp->wc[i].byte_len = wc[i].byte_len;
- resp->wc[i].imm_data = wc[i].imm_data;
+ resp->wc[i].imm_data = (__u32 __force) wc[i].imm_data;
resp->wc[i].qp_num = wc[i].qp_num;
resp->wc[i].src_qp = wc[i].src_qp;
resp->wc[i].wc_flags = wc[i].wc_flags;
@@ -908,7 +908,12 @@ retry:
if (ret)
goto err_destroy;

- resp.qp_handle = uobj->uobject.id;
+ resp.qp_handle = uobj->uobject.id;
+ resp.max_recv_sge = attr.cap.max_recv_sge;
+ resp.max_send_sge = attr.cap.max_send_sge;
+ resp.max_recv_wr = attr.cap.max_recv_wr;
+ resp.max_send_wr = attr.cap.max_send_wr;
+ resp.max_inline_data = attr.cap.max_inline_data;

if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp)) {
@@ -1135,7 +1140,7 @@ ssize_t ib_uverbs_post_send(struct ib_uv
next->num_sge = user_wr->num_sge;
next->opcode = user_wr->opcode;
next->send_flags = user_wr->send_flags;
- next->imm_data = user_wr->imm_data;
+ next->imm_data = (__be32 __force) user_wr->imm_data;

if (qp->qp_type == IB_QPT_UD) {
next->wr.ud.ah = idr_find(&ib_uverbs_ah_idr,
@@ -1701,7 +1706,6 @@ ssize_t ib_uverbs_modify_srq(struct ib_u
}

attr.max_wr = cmd.max_wr;
- attr.max_sge = cmd.max_sge;
attr.srq_limit = cmd.srq_limit;

ret = ib_modify_srq(srq, &attr, cmd.attr_mask);
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 49f211d..9ed3458 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -1060,6 +1060,8 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev
dev_lim->hca.arbel.resize_srq = field & 1;
MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET);
dev_lim->max_sg = min_t(int, field, dev_lim->max_sg);
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET);
+ dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz);
MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
dev_lim->mpt_entry_sz = size;
MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 808037f..497ff79 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -131,6 +131,7 @@ struct mthca_limits {
int max_sg;
int num_qps;
int max_wqes;
+ int max_desc_sz;
int max_qp_init_rdma;
int reserved_qps;
int num_srqs;
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 16594d1..147f248 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -168,6 +168,7 @@ static int __devinit mthca_dev_lim(struc
mdev->limits.max_srq_wqes = dev_lim->max_srq_sz;
mdev->limits.reserved_srqs = dev_lim->reserved_srqs;
mdev->limits.reserved_eecs = dev_lim->reserved_eecs;
+ mdev->limits.max_desc_sz = dev_lim->max_desc_sz;
/*
* Subtract 1 from the limit because we need to allocate a
* spare CQE so the HCA HW can tell the difference between an
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index e78259b..4cc7e28 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -616,11 +616,11 @@ static struct ib_qp *mthca_create_qp(str
return ERR_PTR(err);
}

- init_attr->cap.max_inline_data = 0;
init_attr->cap.max_send_wr = qp->sq.max;
init_attr->cap.max_recv_wr = qp->rq.max;
init_attr->cap.max_send_sge = qp->sq.max_gs;
init_attr->cap.max_recv_sge = qp->rq.max_gs;
+ init_attr->cap.max_inline_data = qp->max_inline_data;

return &qp->ibqp;
}
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
index bcd4b01..1e73947 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.h
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -251,6 +251,7 @@ struct mthca_qp {
struct mthca_wq sq;
enum ib_sig_type sq_policy;
int send_wqe_offset;
+ int max_inline_data;

u64 *wrid;
union mthca_buf queue;
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 8852ea4..7f39af4 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -885,6 +885,48 @@ int mthca_modify_qp(struct ib_qp *ibqp,
return err;
}

+static void mthca_adjust_qp_caps(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_qp *qp)
+{
+ int max_data_size;
+
+ /*
+ * Calculate the maximum size of WQE s/g segments, excluding
+ * the next segment and other non-data segments.
+ */
+ max_data_size = min(dev->limits.max_desc_sz, 1 << qp->sq.wqe_shift) -
+ sizeof (struct mthca_next_seg);
+
+ switch (qp->transport) {
+ case MLX:
+ max_data_size -= 2 * sizeof (struct mthca_data_seg);
+ break;
+
+ case UD:
+ if (mthca_is_memfree(dev))
+ max_data_size -= sizeof (struct mthca_arbel_ud_seg);
+ else
+ max_data_size -= sizeof (struct mthca_tavor_ud_seg);
+ break;
+
+ default:
+ max_data_size -= sizeof (struct mthca_raddr_seg);
+ break;
+ }
+
+ /* We don't support inline data for kernel QPs (yet). */
+ if (!pd->ibpd.uobject)
+ qp->max_inline_data = 0;
+ else
+ qp->max_inline_data = max_data_size - MTHCA_INLINE_HEADER_SIZE;
+
+ qp->sq.max_gs = max_data_size / sizeof (struct mthca_data_seg);
+ qp->rq.max_gs = (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) -
+ sizeof (struct mthca_next_seg)) /
+ sizeof (struct mthca_data_seg);
+}
+
/*
* Allocate and register buffer for WQEs. qp->rq.max, sq.max,
* rq.max_gs and sq.max_gs must all be assigned.
@@ -902,27 +944,53 @@ static int mthca_alloc_wqe_buf(struct mt
size = sizeof (struct mthca_next_seg) +
qp->rq.max_gs * sizeof (struct mthca_data_seg);

+ if (size > dev->limits.max_desc_sz)
+ return -EINVAL;
+
for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
qp->rq.wqe_shift++)
; /* nothing */

- size = sizeof (struct mthca_next_seg) +
- qp->sq.max_gs * sizeof (struct mthca_data_seg);
+ size = qp->sq.max_gs * sizeof (struct mthca_data_seg);
switch (qp->transport) {
case MLX:
size += 2 * sizeof (struct mthca_data_seg);
break;
+
case UD:
- if (mthca_is_memfree(dev))
- size += sizeof (struct mthca_arbel_ud_seg);
- else
- size += sizeof (struct mthca_tavor_ud_seg);
+ size += mthca_is_memfree(dev) ?
+ sizeof (struct mthca_arbel_ud_seg) :
+ sizeof (struct mthca_tavor_ud_seg);
+ break;
+
+ case UC:
+ size += sizeof (struct mthca_raddr_seg);
+ break;
+
+ case RC:
+ size += sizeof (struct mthca_raddr_seg);
+ /*
+ * An atomic op will require an atomic segment, a
+ * remote address segment and one scatter entry.
+ */
+ size = max_t(int, size,
+ sizeof (struct mthca_atomic_seg) +
+ sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_data_seg));
break;
+
default:
- /* bind seg is as big as atomic + raddr segs */
- size += sizeof (struct mthca_bind_seg);
+ break;
}

+ /* Make sure that we have enough space for a bind request */
+ size = max_t(int, size, sizeof (struct mthca_bind_seg));
+
+ size += sizeof (struct mthca_next_seg);
+
+ if (size > dev->limits.max_desc_sz)
+ return -EINVAL;
+
for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
qp->sq.wqe_shift++)
; /* nothing */
@@ -1066,6 +1134,8 @@ static int mthca_alloc_qp_common(struct
return ret;
}

+ mthca_adjust_qp_caps(dev, pd, qp);
+
/*
* If this is a userspace QP, we're done now. The doorbells
* will be allocated and buffers will be initialized in
diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h
index 072f3a2..5ff1490 100644
--- a/include/rdma/ib_user_verbs.h
+++ b/include/rdma/ib_user_verbs.h
@@ -43,7 +43,7 @@
* Increment this value if any changes that break userspace ABI
* compatibility are made.
*/
-#define IB_USER_VERBS_ABI_VERSION 3
+#define IB_USER_VERBS_ABI_VERSION 4

enum {
IB_USER_VERBS_CMD_GET_CONTEXT,
@@ -333,6 +333,11 @@ struct ib_uverbs_create_qp {
struct ib_uverbs_create_qp_resp {
__u32 qp_handle;
__u32 qpn;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
};

/*
@@ -552,9 +557,7 @@ struct ib_uverbs_modify_srq {
__u32 srq_handle;
__u32 attr_mask;
__u32 max_wr;
- __u32 max_sge;
__u32 srq_limit;
- __u32 reserved;
__u64 driver_data[0];
};

---
0.99.9e

2005-11-10 18:33:03

by Roland Dreier

Subject: [git patch review 5/7] [IB] mthca: fix wraparound handling in mthca_cq_clean()

Handle the case where prod_index has wrapped around and become less than
cq->cons_index by checking that their difference as a signed int is
positive, rather than comparing the unsigned values directly.
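
For example (hypothetical index values): if prod_index has wrapped
around to 3 while cq->cons_index is still 0xfffffffd, the old test
prod_index > cq->cons_index is false even though six CQEs remain to be
swept, whereas the signed difference (int) 3 - (int) 0xfffffffd =
3 - (-3) = 6 is positive, so the new loop condition still walks those
entries.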

Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/hw/mthca/mthca_cq.c | 16 ++++++----------
1 files changed, 6 insertions(+), 10 deletions(-)

applies-to: 704990abeb22a51ed2722e92536d22135f60957f
64044bcf75063cb5a6d42712886a712449df2ce3
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index f98e235..4a8adce 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -258,7 +258,7 @@ void mthca_cq_clean(struct mthca_dev *de
{
struct mthca_cq *cq;
struct mthca_cqe *cqe;
- int prod_index;
+ u32 prod_index;
int nfreed = 0;

spin_lock_irq(&dev->cq_table.lock);
@@ -293,19 +293,15 @@ void mthca_cq_clean(struct mthca_dev *de
* Now sweep backwards through the CQ, removing CQ entries
* that match our QP by copying older entries on top of them.
*/
- while (prod_index > cq->cons_index) {
- cqe = get_cqe(cq, (prod_index - 1) & cq->ibcq.cqe);
+ while ((int) --prod_index - (int) cq->cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
if (cqe->my_qpn == cpu_to_be32(qpn)) {
if (srq)
mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
++nfreed;
- }
- else if (nfreed)
- memcpy(get_cqe(cq, (prod_index - 1 + nfreed) &
- cq->ibcq.cqe),
- cqe,
- MTHCA_CQ_ENTRY_SIZE);
- --prod_index;
+ } else if (nfreed)
+ memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+ cqe, MTHCA_CQ_ENTRY_SIZE);
}

if (nfreed) {
---
0.99.9e

2005-11-10 18:32:35

by Roland Dreier

Subject: [git patch review 7/7] [IB] umad: further ib_unregister_mad_agent() deadlock fixes

The previous umad deadlock fix left ib_umad_kill_port() still
vulnerable to deadlocking. This patch fixes that by downgrading the
port mutex from a write lock to a read lock before calling
ib_unregister_mad_agent(), which can call back into code that takes
the lock for reading.
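
Schematically, both ib_umad_close() and ib_umad_kill_port() now follow
the same pattern (simplified sketch; list handling and error paths
omitted):

	down_write(&port->mutex);
	file->agents_dead = 1;		/* readers now see no agents */
	downgrade_write(&port->mutex);

	for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
		if (file->agent[id])
			/*
			 * May call back into queue_packet(), which only
			 * needs the mutex for reading -- safe now that
			 * we hold just a read lock ourselves.
			 */
			ib_unregister_mad_agent(file->agent[id]);

	up_read(&port->mutex);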

Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/core/user_mad.c | 87 ++++++++++++++++++++++++++----------
1 files changed, 63 insertions(+), 24 deletions(-)

applies-to: 17115437026be55dcd74641be21561fecf33dcdb
94382f3562e350ed7c8f7dcd6fc968bdece31328
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index d61f544..5ea741f 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -31,7 +31,7 @@
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
- * $Id: user_mad.c 2814 2005-07-06 19:14:09Z halr $
+ * $Id: user_mad.c 4010 2005-11-09 23:11:56Z roland $
*/

#include <linux/module.h>
@@ -110,12 +110,13 @@ struct ib_umad_device {
};

struct ib_umad_file {
- struct ib_umad_port *port;
- struct list_head recv_list;
- struct list_head port_list;
- spinlock_t recv_lock;
- wait_queue_head_t recv_wait;
- struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ struct ib_umad_port *port;
+ struct list_head recv_list;
+ struct list_head port_list;
+ spinlock_t recv_lock;
+ wait_queue_head_t recv_wait;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ int agents_dead;
};

struct ib_umad_packet {
@@ -144,6 +145,12 @@ static void ib_umad_release_dev(struct k
kfree(dev);
}

+/* caller must hold port->mutex at least for reading */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+ return file->agents_dead ? NULL : file->agent[id];
+}
+
static int queue_packet(struct ib_umad_file *file,
struct ib_mad_agent *agent,
struct ib_umad_packet *packet)
@@ -151,10 +158,11 @@ static int queue_packet(struct ib_umad_f
int ret = 1;

down_read(&file->port->mutex);
+
for (packet->mad.hdr.id = 0;
packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
packet->mad.hdr.id++)
- if (agent == file->agent[packet->mad.hdr.id]) {
+ if (agent == __get_agent(file, packet->mad.hdr.id)) {
spin_lock_irq(&file->recv_lock);
list_add_tail(&packet->list, &file->recv_list);
spin_unlock_irq(&file->recv_lock);
@@ -326,7 +334,7 @@ static ssize_t ib_umad_write(struct file

down_read(&file->port->mutex);

- agent = file->agent[packet->mad.hdr.id];
+ agent = __get_agent(file, packet->mad.hdr.id);
if (!agent) {
ret = -EINVAL;
goto err_up;
@@ -480,7 +488,7 @@ static int ib_umad_reg_agent(struct ib_u
}

for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
- if (!file->agent[agent_id])
+ if (!__get_agent(file, agent_id))
goto found;

ret = -ENOMEM;
@@ -530,7 +538,7 @@ static int ib_umad_unreg_agent(struct ib

down_write(&file->port->mutex);

- if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !file->agent[id]) {
+ if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
ret = -EINVAL;
goto out;
}
@@ -608,21 +616,29 @@ static int ib_umad_close(struct inode *i
struct ib_umad_file *file = filp->private_data;
struct ib_umad_device *dev = file->port->umad_dev;
struct ib_umad_packet *packet, *tmp;
+ int already_dead;
int i;

- for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
- if (file->agent[i])
- ib_unregister_mad_agent(file->agent[i]);
+ down_write(&file->port->mutex);
+
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;

list_for_each_entry_safe(packet, tmp, &file->recv_list, list)
kfree(packet);

- down_write(&file->port->mutex);
list_del(&file->port_list);
- up_write(&file->port->mutex);

- kfree(file);
+ downgrade_write(&file->port->mutex);
+
+ if (!already_dead)
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i])
+ ib_unregister_mad_agent(file->agent[i]);

+ up_read(&file->port->mutex);
+
+ kfree(file);
kref_put(&dev->ref, ib_umad_release_dev);

return 0;
@@ -848,13 +864,36 @@ static void ib_umad_kill_port(struct ib_

port->ib_dev = NULL;

- list_for_each_entry(file, &port->file_list, port_list)
- for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) {
- if (!file->agent[id])
- continue;
- ib_unregister_mad_agent(file->agent[id]);
- file->agent[id] = NULL;
- }
+ /*
+ * Now go through the list of files attached to this port and
+ * unregister all of their MAD agents. We need to hold
+ * port->mutex while doing this to avoid racing with
+ * ib_umad_close(), but we can't hold the mutex for writing
+ * while calling ib_unregister_mad_agent(), since that might
+ * deadlock by calling back into queue_packet(). So we
+ * downgrade our lock to a read lock, and then drop and
+ * reacquire the write lock for the next iteration.
+ *
+ * We do list_del_init() on the file's list_head so that the
+ * list_del in ib_umad_close() is still OK, even after the
+ * file is removed from the list.
+ */
+ while (!list_empty(&port->file_list)) {
+ file = list_entry(port->file_list.next, struct ib_umad_file,
+ port_list);
+
+ file->agents_dead = 1;
+ list_del_init(&file->port_list);
+
+ downgrade_write(&port->mutex);
+
+ for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+ if (file->agent[id])
+ ib_unregister_mad_agent(file->agent[id]);
+
+ up_read(&port->mutex);
+ down_write(&port->mutex);
+ }

up_write(&port->mutex);

---
0.99.9e

2005-11-10 18:32:39

by Roland Dreier

Subject: [git patch review 6/7] [IB] mthca: fix posting long lists of receive work requests

In Tavor mode, when posting a long list of receive work requests, a
doorbell must be rung every 256 requests. Add code to do this when
required.
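
For example, posting a list of 600 receive requests in a single call
now rings an intermediate doorbell after the first 256 requests
(MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) and again after the next 256, with
the final doorbell at the end of the loop covering the remaining 88.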

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>

---

drivers/infiniband/hw/mthca/mthca_qp.c | 19 +++++++++++++++++--
drivers/infiniband/hw/mthca/mthca_srq.c | 22 ++++++++++++++++++++--
drivers/infiniband/hw/mthca/mthca_wqe.h | 3 ++-
3 files changed, 39 insertions(+), 5 deletions(-)

applies-to: 984d2fc62c548af3d01450135f33b5b97aecf00b
ae57e24a4006fd46b73d842ee99db9580ef74a02
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 190c1dc..760c418 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1707,6 +1707,7 @@ int mthca_tavor_post_receive(struct ib_q
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
+ __be32 doorbell[2];
unsigned long flags;
int err = 0;
int nreq;
@@ -1724,6 +1725,22 @@ int mthca_tavor_post_receive(struct ib_q
ind = qp->rq.next_ind;

for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+ nreq = 0;
+
+ doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
+ doorbell[1] = cpu_to_be32(qp->qpn << 8);
+
+ wmb();
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+ qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
+ size0 = 0;
+ }
+
if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
mthca_err(dev, "RQ %06x full (%u head, %u tail,"
" %d max, %d nreq)\n", qp->qpn,
@@ -1781,8 +1798,6 @@ int mthca_tavor_post_receive(struct ib_q

out:
if (likely(nreq)) {
- __be32 doorbell[2];
-
doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq);

diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 292f55b..c3c0331 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -414,6 +414,7 @@ int mthca_tavor_post_srq_recv(struct ib_
{
struct mthca_dev *dev = to_mdev(ibsrq->device);
struct mthca_srq *srq = to_msrq(ibsrq);
+ __be32 doorbell[2];
unsigned long flags;
int err = 0;
int first_ind;
@@ -429,6 +430,25 @@ int mthca_tavor_post_srq_recv(struct ib_
first_ind = srq->first_free;

for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+ nreq = 0;
+
+ doorbell[0] = cpu_to_be32(first_ind << srq->wqe_shift);
+ doorbell[1] = cpu_to_be32(srq->srqn << 8);
+
+ /*
+ * Make sure that descriptors are written
+ * before doorbell is rung.
+ */
+ wmb();
+
+ mthca_write64(doorbell,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+ first_ind = srq->first_free;
+ }
+
ind = srq->first_free;

if (ind < 0) {
@@ -491,8 +511,6 @@ int mthca_tavor_post_srq_recv(struct ib_
}

if (likely(nreq)) {
- __be32 doorbell[2];
-
doorbell[0] = cpu_to_be32(first_ind << srq->wqe_shift);
doorbell[1] = cpu_to_be32((srq->srqn << 8) | nreq);

diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h
index 1f4c0ff..73f1c0b 100644
--- a/drivers/infiniband/hw/mthca/mthca_wqe.h
+++ b/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -49,7 +49,8 @@ enum {
};

enum {
- MTHCA_INVAL_LKEY = 0x100
+ MTHCA_INVAL_LKEY = 0x100,
+ MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256
};

struct mthca_next_seg {
---
0.99.9e