Support userspace configuring congestion control algorithm with
QP granularity. If the algorithm is not specified in userspace,
use the default one.
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 +---
drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 +-
drivers/infiniband/hw/hns/hns_roce_main.c | 3 +
drivers/infiniband/hw/hns/hns_roce_qp.c | 71 +++++++++++++++++++++
include/uapi/rdma/hns-abi.h | 17 +++++
6 files changed, 112 insertions(+), 19 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index c88ba7e053bf..55f2f54e15fb 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -594,11 +594,19 @@ struct hns_roce_work {
u32 queue_num;
};
-enum cong_type {
- CONG_TYPE_DCQCN,
- CONG_TYPE_LDCP,
- CONG_TYPE_HC3,
- CONG_TYPE_DIP,
+enum hns_roce_scc_algo {
+ HNS_ROCE_SCC_ALGO_DCQCN = 0,
+ HNS_ROCE_SCC_ALGO_LDCP,
+ HNS_ROCE_SCC_ALGO_HC3,
+ HNS_ROCE_SCC_ALGO_DIP,
+ HNS_ROCE_SCC_ALGO_TOTAL,
+};
+
+enum hns_roce_cong_type {
+ CONG_TYPE_DCQCN = 1 << HNS_ROCE_SCC_ALGO_DCQCN,
+ CONG_TYPE_LDCP = 1 << HNS_ROCE_SCC_ALGO_LDCP,
+ CONG_TYPE_HC3 = 1 << HNS_ROCE_SCC_ALGO_HC3,
+ CONG_TYPE_DIP = 1 << HNS_ROCE_SCC_ALGO_DIP,
};
struct hns_roce_qp {
@@ -644,7 +652,7 @@ struct hns_roce_qp {
struct list_head sq_node; /* all send qps are on a list */
struct hns_user_mmap_entry *dwqe_mmap_entry;
u32 config;
- enum cong_type cong_type;
+ enum hns_roce_cong_type cong_type;
};
struct hns_roce_ib_iboe {
@@ -845,7 +853,8 @@ struct hns_roce_caps {
u16 default_aeq_period;
u16 default_aeq_arm_st;
u16 default_ceq_arm_st;
- enum cong_type cong_type;
+ u8 cong_cap;
+ enum hns_roce_cong_type default_cong_type;
};
enum hns_roce_device_state {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 42e28586cefa..21532f213b0f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -2209,11 +2209,12 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS);
- caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE);
+ caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP);
caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
+ caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG);
caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
@@ -4737,14 +4738,8 @@ enum {
static int check_cong_type(struct ib_qp *ibqp,
struct hns_roce_congestion_algorithm *cong_alg)
{
- struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
- if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_GSI)
- hr_qp->cong_type = CONG_TYPE_DCQCN;
- else
- hr_qp->cong_type = hr_dev->caps.cong_type;
-
/* different congestion types match different configurations */
switch (hr_qp->cong_type) {
case CONG_TYPE_DCQCN:
@@ -4772,9 +4767,6 @@ static int check_cong_type(struct ib_qp *ibqp,
cong_alg->wnd_mode_sel = WND_LIMIT;
break;
default:
- ibdev_warn(&hr_dev->ib_dev,
- "invalid type(%u) for congestion selection.\n",
- hr_qp->cong_type);
hr_qp->cong_type = CONG_TYPE_DCQCN;
cong_alg->alg_sel = CONG_DCQCN;
cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
@@ -4807,7 +4799,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
return ret;
hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
- hr_qp->cong_type * HNS_ROCE_CONG_SIZE);
+ ilog2(hr_qp->cong_type) * HNS_ROCE_CONG_SIZE);
hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID);
hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index cd97cbee682a..359a74672ba1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -1214,12 +1214,13 @@ struct hns_roce_query_pf_caps_d {
#define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20)
#define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22)
#define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24)
-#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26)
+#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26)
#define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64)
#define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86)
#define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96)
#define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118)
#define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120)
+#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122)
#define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128)
#define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148)
#define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index b55fe6911f9f..e5b678814f58 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -394,6 +394,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
}
+ if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
+ resp.congest_type = hr_dev->caps.cong_cap;
+
ret = hns_roce_uar_alloc(hr_dev, &context->uar);
if (ret)
goto error_out;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 31b147210688..e22911d6b6a9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1004,6 +1004,70 @@ static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
kfree(hr_qp->sq.wrid);
}
+static void default_congest_type(struct hns_roce_dev *hr_dev,
+ struct hns_roce_qp *hr_qp)
+{
+ struct hns_roce_caps *caps = &hr_dev->caps;
+
+ if (hr_qp->ibqp.qp_type == IB_QPT_UD ||
+ hr_qp->ibqp.qp_type == IB_QPT_GSI)
+ hr_qp->cong_type = CONG_TYPE_DCQCN;
+ else
+ hr_qp->cong_type = 1 << caps->default_cong_type;
+}
+
+static int set_congest_type(struct hns_roce_qp *hr_qp,
+ struct hns_roce_ib_create_qp *ucmd)
+{
+ struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
+
+ switch (ucmd->cong_type_flags) {
+ case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
+ hr_qp->cong_type = CONG_TYPE_DCQCN;
+ break;
+ case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
+ hr_qp->cong_type = CONG_TYPE_LDCP;
+ break;
+ case HNS_ROCE_CREATE_QP_FLAGS_HC3:
+ hr_qp->cong_type = CONG_TYPE_HC3;
+ break;
+ case HNS_ROCE_CREATE_QP_FLAGS_DIP:
+ hr_qp->cong_type = CONG_TYPE_DIP;
+ break;
+ default:
+ hr_qp->cong_type = 0;
+ }
+
+ if (!(hr_qp->cong_type & hr_dev->caps.cong_cap)) {
+ ibdev_err_ratelimited(&hr_dev->ib_dev,
+ "Unsupported congest type 0x%x, cong_cap = 0x%x.\n",
+ hr_qp->cong_type, hr_dev->caps.cong_cap);
+ return -EOPNOTSUPP;
+ }
+
+ if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
+ !(hr_qp->cong_type & CONG_TYPE_DCQCN)) {
+ ibdev_err_ratelimited(&hr_dev->ib_dev,
+ "Only DCQCN supported for UD. Unsupported congest type 0x%x.\n",
+ hr_qp->cong_type);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int set_congest_param(struct hns_roce_dev *hr_dev,
+ struct hns_roce_qp *hr_qp,
+ struct hns_roce_ib_create_qp *ucmd)
+{
+ if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE)
+ return set_congest_type(hr_qp, ucmd);
+
+ default_congest_type(hr_dev, hr_qp);
+
+ return 0;
+}
+
static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata,
@@ -1026,6 +1090,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
return ret;
}
+ if (init_attr->qp_type == IB_QPT_XRC_TGT)
+ default_congest_type(hr_dev, hr_qp);
+
if (udata) {
ret = ib_copy_from_udata(ucmd, udata,
min(udata->inlen, sizeof(*ucmd)));
@@ -1043,6 +1110,10 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
ibdev_err(ibdev,
"failed to set user SQ size, ret = %d.\n",
ret);
+
+ ret = set_congest_param(hr_dev, hr_qp, ucmd);
+ if (ret)
+ return ret;
} else {
if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
hr_qp->config = HNS_ROCE_EXSGE_FLAGS;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index c996e151081e..757095a6c6fc 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -81,6 +81,9 @@ struct hns_roce_ib_create_qp {
__u8 sq_no_prefetch;
__u8 reserved[5];
__aligned_u64 sdb_addr;
+ __aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
+ __aligned_u64 create_flags;
+ __aligned_u64 cong_type_flags;
};
enum hns_roce_qp_cap_flags {
@@ -107,6 +110,17 @@ enum {
HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
};
+enum hns_roce_congest_type_flags {
+ HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
+ HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
+ HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
+ HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
+};
+
+enum hns_roce_create_qp_comp_mask {
+ HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
+};
+
struct hns_roce_ib_alloc_ucontext_resp {
__u32 qp_tab_size;
__u32 cqe_size;
@@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
__u32 reserved;
__u32 config;
__u32 max_inline_data;
+ __u8 reserved0;
+ __u8 congest_type;
+ __u8 reserved1[6];
};
struct hns_roce_ib_alloc_ucontext {
--
2.30.0
On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
> Support userspace configuring congestion control algorithm with
> QP granularity. If the algorithm is not specified in userspace,
> use the default one.
>
> Signed-off-by: Junxian Huang <[email protected]>
> ---
> drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
> drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 +---
> drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 +-
> drivers/infiniband/hw/hns/hns_roce_main.c | 3 +
> drivers/infiniband/hw/hns/hns_roce_qp.c | 71 +++++++++++++++++++++
> include/uapi/rdma/hns-abi.h | 17 +++++
> 6 files changed, 112 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
> index c88ba7e053bf..55f2f54e15fb 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -594,11 +594,19 @@ struct hns_roce_work {
> u32 queue_num;
> };
>
> -enum cong_type {
> - CONG_TYPE_DCQCN,
> - CONG_TYPE_LDCP,
> - CONG_TYPE_HC3,
> - CONG_TYPE_DIP,
> +enum hns_roce_scc_algo {
> + HNS_ROCE_SCC_ALGO_DCQCN = 0,
> + HNS_ROCE_SCC_ALGO_LDCP,
> + HNS_ROCE_SCC_ALGO_HC3,
> + HNS_ROCE_SCC_ALGO_DIP,
> + HNS_ROCE_SCC_ALGO_TOTAL,
> +};
> +
> +enum hns_roce_cong_type {
> + CONG_TYPE_DCQCN = 1 << HNS_ROCE_SCC_ALGO_DCQCN,
> + CONG_TYPE_LDCP = 1 << HNS_ROCE_SCC_ALGO_LDCP,
> + CONG_TYPE_HC3 = 1 << HNS_ROCE_SCC_ALGO_HC3,
> + CONG_TYPE_DIP = 1 << HNS_ROCE_SCC_ALGO_DIP,
> };
>
> struct hns_roce_qp {
> @@ -644,7 +652,7 @@ struct hns_roce_qp {
> struct list_head sq_node; /* all send qps are on a list */
> struct hns_user_mmap_entry *dwqe_mmap_entry;
> u32 config;
> - enum cong_type cong_type;
> + enum hns_roce_cong_type cong_type;
> };
>
> struct hns_roce_ib_iboe {
> @@ -845,7 +853,8 @@ struct hns_roce_caps {
> u16 default_aeq_period;
> u16 default_aeq_arm_st;
> u16 default_ceq_arm_st;
> - enum cong_type cong_type;
> + u8 cong_cap;
> + enum hns_roce_cong_type default_cong_type;
> };
>
> enum hns_roce_device_state {
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> index 42e28586cefa..21532f213b0f 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> @@ -2209,11 +2209,12 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
> caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
>
> caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS);
> - caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE);
> + caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP);
> caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
> caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
> caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
> caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
> + caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG);
> caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
> caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
> caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
> @@ -4737,14 +4738,8 @@ enum {
> static int check_cong_type(struct ib_qp *ibqp,
> struct hns_roce_congestion_algorithm *cong_alg)
> {
> - struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
> struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
>
> - if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_GSI)
> - hr_qp->cong_type = CONG_TYPE_DCQCN;
> - else
> - hr_qp->cong_type = hr_dev->caps.cong_type;
> -
> /* different congestion types match different configurations */
> switch (hr_qp->cong_type) {
> case CONG_TYPE_DCQCN:
> @@ -4772,9 +4767,6 @@ static int check_cong_type(struct ib_qp *ibqp,
> cong_alg->wnd_mode_sel = WND_LIMIT;
> break;
> default:
> - ibdev_warn(&hr_dev->ib_dev,
> - "invalid type(%u) for congestion selection.\n",
> - hr_qp->cong_type);
> hr_qp->cong_type = CONG_TYPE_DCQCN;
> cong_alg->alg_sel = CONG_DCQCN;
> cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
> @@ -4807,7 +4799,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
> return ret;
>
> hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
> - hr_qp->cong_type * HNS_ROCE_CONG_SIZE);
> + ilog2(hr_qp->cong_type) * HNS_ROCE_CONG_SIZE);
> hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID);
> hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
> hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL);
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
> index cd97cbee682a..359a74672ba1 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
> @@ -1214,12 +1214,13 @@ struct hns_roce_query_pf_caps_d {
> #define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20)
> #define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22)
> #define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24)
> -#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26)
> +#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26)
> #define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64)
> #define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86)
> #define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96)
> #define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118)
> #define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120)
> +#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122)
> #define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128)
> #define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148)
> #define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160)
> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
> index b55fe6911f9f..e5b678814f58 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
> @@ -394,6 +394,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
> resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
> }
>
> + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
> + resp.congest_type = hr_dev->caps.cong_cap;
> +
> ret = hns_roce_uar_alloc(hr_dev, &context->uar);
> if (ret)
> goto error_out;
> diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
> index 31b147210688..e22911d6b6a9 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_qp.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
> @@ -1004,6 +1004,70 @@ static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
> kfree(hr_qp->sq.wrid);
> }
>
> +static void default_congest_type(struct hns_roce_dev *hr_dev,
> + struct hns_roce_qp *hr_qp)
> +{
> + struct hns_roce_caps *caps = &hr_dev->caps;
> +
> + if (hr_qp->ibqp.qp_type == IB_QPT_UD ||
> + hr_qp->ibqp.qp_type == IB_QPT_GSI)
> + hr_qp->cong_type = CONG_TYPE_DCQCN;
> + else
> + hr_qp->cong_type = 1 << caps->default_cong_type;
> +}
> +
> +static int set_congest_type(struct hns_roce_qp *hr_qp,
> + struct hns_roce_ib_create_qp *ucmd)
> +{
> + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
> +
> + switch (ucmd->cong_type_flags) {
> + case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
> + hr_qp->cong_type = CONG_TYPE_DCQCN;
> + break;
> + case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
> + hr_qp->cong_type = CONG_TYPE_LDCP;
> + break;
> + case HNS_ROCE_CREATE_QP_FLAGS_HC3:
> + hr_qp->cong_type = CONG_TYPE_HC3;
> + break;
> + case HNS_ROCE_CREATE_QP_FLAGS_DIP:
> + hr_qp->cong_type = CONG_TYPE_DIP;
> + break;
> + default:
> + hr_qp->cong_type = 0;
> + }
> +
> + if (!(hr_qp->cong_type & hr_dev->caps.cong_cap)) {
> + ibdev_err_ratelimited(&hr_dev->ib_dev,
> + "Unsupported congest type 0x%x, cong_cap = 0x%x.\n",
> + hr_qp->cong_type, hr_dev->caps.cong_cap);
> + return -EOPNOTSUPP;
> + }
> +
> + if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
> + !(hr_qp->cong_type & CONG_TYPE_DCQCN)) {
> + ibdev_err_ratelimited(&hr_dev->ib_dev,
> + "Only DCQCN supported for UD. Unsupported congest type 0x%x.\n",
> + hr_qp->cong_type);
Do not print kernel messages triggered by bad userspace input.
Jason
> + return -EOPNOTSUPP;
> + }
> +
> + return 0;
> +}
> +
> +static int set_congest_param(struct hns_roce_dev *hr_dev,
> + struct hns_roce_qp *hr_qp,
> + struct hns_roce_ib_create_qp *ucmd)
> +{
> + if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE)
> + return set_congest_type(hr_qp, ucmd);
> +
> + default_congest_type(hr_dev, hr_qp);
> +
> + return 0;
> +}
> +
> static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
> struct ib_qp_init_attr *init_attr,
> struct ib_udata *udata,
> @@ -1026,6 +1090,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
> return ret;
> }
>
> + if (init_attr->qp_type == IB_QPT_XRC_TGT)
> + default_congest_type(hr_dev, hr_qp);
> +
> if (udata) {
> ret = ib_copy_from_udata(ucmd, udata,
> min(udata->inlen, sizeof(*ucmd)));
> @@ -1043,6 +1110,10 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
> ibdev_err(ibdev,
> "failed to set user SQ size, ret = %d.\n",
> ret);
> +
> + ret = set_congest_param(hr_dev, hr_qp, ucmd);
> + if (ret)
> + return ret;
> } else {
> if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
> hr_qp->config = HNS_ROCE_EXSGE_FLAGS;
> diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
> index c996e151081e..757095a6c6fc 100644
> --- a/include/uapi/rdma/hns-abi.h
> +++ b/include/uapi/rdma/hns-abi.h
> @@ -81,6 +81,9 @@ struct hns_roce_ib_create_qp {
> __u8 sq_no_prefetch;
> __u8 reserved[5];
> __aligned_u64 sdb_addr;
> + __aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
> + __aligned_u64 create_flags;
> + __aligned_u64 cong_type_flags;
> };
>
> enum hns_roce_qp_cap_flags {
> @@ -107,6 +110,17 @@ enum {
> HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
> };
>
> +enum hns_roce_congest_type_flags {
> + HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
> + HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
> + HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
> + HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
> +};
Why are these bit flags if they are exclusive?
> +
> +enum hns_roce_create_qp_comp_mask {
> + HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
Why 1<<1 not 1<<0?
These are in the wrong order in the file too, they should be before
their first "use", ie move above struct hns_roce_ib_create_qp
> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
> __u32 reserved;
> __u32 config;
> __u32 max_inline_data;
> + __u8 reserved0;
> + __u8 congest_type;
Why this layout?
Jason
On 2024/2/21 23:52, Jason Gunthorpe wrote:
> On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
>> Support userspace configuring congestion control algorithm with
>> QP granularity. If the algorithm is not specified in userspace,
>> use the default one.
>>
>> Signed-off-by: Junxian Huang <[email protected]>
>> ---
>> drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
>> drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 +---
>> drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 +-
>> drivers/infiniband/hw/hns/hns_roce_main.c | 3 +
>> drivers/infiniband/hw/hns/hns_roce_qp.c | 71 +++++++++++++++++++++
>> include/uapi/rdma/hns-abi.h | 17 +++++
>> 6 files changed, 112 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
>> index c88ba7e053bf..55f2f54e15fb 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
>> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
>> @@ -594,11 +594,19 @@ struct hns_roce_work {
>> u32 queue_num;
>> };
>>
>> -enum cong_type {
>> - CONG_TYPE_DCQCN,
>> - CONG_TYPE_LDCP,
>> - CONG_TYPE_HC3,
>> - CONG_TYPE_DIP,
>> +enum hns_roce_scc_algo {
>> + HNS_ROCE_SCC_ALGO_DCQCN = 0,
>> + HNS_ROCE_SCC_ALGO_LDCP,
>> + HNS_ROCE_SCC_ALGO_HC3,
>> + HNS_ROCE_SCC_ALGO_DIP,
>> + HNS_ROCE_SCC_ALGO_TOTAL,
>> +};
>> +
>> +enum hns_roce_cong_type {
>> + CONG_TYPE_DCQCN = 1 << HNS_ROCE_SCC_ALGO_DCQCN,
>> + CONG_TYPE_LDCP = 1 << HNS_ROCE_SCC_ALGO_LDCP,
>> + CONG_TYPE_HC3 = 1 << HNS_ROCE_SCC_ALGO_HC3,
>> + CONG_TYPE_DIP = 1 << HNS_ROCE_SCC_ALGO_DIP,
>> };
>>
>> struct hns_roce_qp {
>> @@ -644,7 +652,7 @@ struct hns_roce_qp {
>> struct list_head sq_node; /* all send qps are on a list */
>> struct hns_user_mmap_entry *dwqe_mmap_entry;
>> u32 config;
>> - enum cong_type cong_type;
>> + enum hns_roce_cong_type cong_type;
>> };
>>
>> struct hns_roce_ib_iboe {
>> @@ -845,7 +853,8 @@ struct hns_roce_caps {
>> u16 default_aeq_period;
>> u16 default_aeq_arm_st;
>> u16 default_ceq_arm_st;
>> - enum cong_type cong_type;
>> + u8 cong_cap;
>> + enum hns_roce_cong_type default_cong_type;
>> };
>>
>> enum hns_roce_device_state {
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> index 42e28586cefa..21532f213b0f 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>> @@ -2209,11 +2209,12 @@ static int hns_roce_query_caps(struct hns_roce_dev *hr_dev)
>> caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
>>
>> caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS);
>> - caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE);
>> + caps->cong_cap = hr_reg_read(resp_d, PF_CAPS_D_CONG_CAP);
>> caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
>> caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH);
>> caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS);
>> caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH);
>> + caps->default_cong_type = hr_reg_read(resp_d, PF_CAPS_D_DEFAULT_ALG);
>> caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS);
>> caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS);
>> caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS);
>> @@ -4737,14 +4738,8 @@ enum {
>> static int check_cong_type(struct ib_qp *ibqp,
>> struct hns_roce_congestion_algorithm *cong_alg)
>> {
>> - struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
>> struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
>>
>> - if (ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_GSI)
>> - hr_qp->cong_type = CONG_TYPE_DCQCN;
>> - else
>> - hr_qp->cong_type = hr_dev->caps.cong_type;
>> -
>> /* different congestion types match different configurations */
>> switch (hr_qp->cong_type) {
>> case CONG_TYPE_DCQCN:
>> @@ -4772,9 +4767,6 @@ static int check_cong_type(struct ib_qp *ibqp,
>> cong_alg->wnd_mode_sel = WND_LIMIT;
>> break;
>> default:
>> - ibdev_warn(&hr_dev->ib_dev,
>> - "invalid type(%u) for congestion selection.\n",
>> - hr_qp->cong_type);
>> hr_qp->cong_type = CONG_TYPE_DCQCN;
>> cong_alg->alg_sel = CONG_DCQCN;
>> cong_alg->alg_sub_sel = UNSUPPORT_CONG_LEVEL;
>> @@ -4807,7 +4799,7 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
>> return ret;
>>
>> hr_reg_write(context, QPC_CONG_ALGO_TMPL_ID, hr_dev->cong_algo_tmpl_id +
>> - hr_qp->cong_type * HNS_ROCE_CONG_SIZE);
>> + ilog2(hr_qp->cong_type) * HNS_ROCE_CONG_SIZE);
>> hr_reg_clear(qpc_mask, QPC_CONG_ALGO_TMPL_ID);
>> hr_reg_write(&context->ext, QPCEX_CONG_ALG_SEL, cong_field.alg_sel);
>> hr_reg_clear(&qpc_mask->ext, QPCEX_CONG_ALG_SEL);
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
>> index cd97cbee682a..359a74672ba1 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
>> @@ -1214,12 +1214,13 @@ struct hns_roce_query_pf_caps_d {
>> #define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20)
>> #define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22)
>> #define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24)
>> -#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26)
>> +#define PF_CAPS_D_CONG_CAP PF_CAPS_D_FIELD_LOC(29, 26)
>> #define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64)
>> #define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86)
>> #define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96)
>> #define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118)
>> #define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120)
>> +#define PF_CAPS_D_DEFAULT_ALG PF_CAPS_D_FIELD_LOC(127, 122)
>> #define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128)
>> #define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148)
>> #define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160)
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
>> index b55fe6911f9f..e5b678814f58 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
>> @@ -394,6 +394,9 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
>> resp.config |= HNS_ROCE_RSP_CQE_INLINE_FLAGS;
>> }
>>
>> + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
>> + resp.congest_type = hr_dev->caps.cong_cap;
>> +
>> ret = hns_roce_uar_alloc(hr_dev, &context->uar);
>> if (ret)
>> goto error_out;
>> diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
>> index 31b147210688..e22911d6b6a9 100644
>> --- a/drivers/infiniband/hw/hns/hns_roce_qp.c
>> +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
>> @@ -1004,6 +1004,70 @@ static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
>> kfree(hr_qp->sq.wrid);
>> }
>>
>> +static void default_congest_type(struct hns_roce_dev *hr_dev,
>> + struct hns_roce_qp *hr_qp)
>> +{
>> + struct hns_roce_caps *caps = &hr_dev->caps;
>> +
>> + if (hr_qp->ibqp.qp_type == IB_QPT_UD ||
>> + hr_qp->ibqp.qp_type == IB_QPT_GSI)
>> + hr_qp->cong_type = CONG_TYPE_DCQCN;
>> + else
>> + hr_qp->cong_type = 1 << caps->default_cong_type;
>> +}
>> +
>> +static int set_congest_type(struct hns_roce_qp *hr_qp,
>> + struct hns_roce_ib_create_qp *ucmd)
>> +{
>> + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device);
>> +
>> + switch (ucmd->cong_type_flags) {
>> + case HNS_ROCE_CREATE_QP_FLAGS_DCQCN:
>> + hr_qp->cong_type = CONG_TYPE_DCQCN;
>> + break;
>> + case HNS_ROCE_CREATE_QP_FLAGS_LDCP:
>> + hr_qp->cong_type = CONG_TYPE_LDCP;
>> + break;
>> + case HNS_ROCE_CREATE_QP_FLAGS_HC3:
>> + hr_qp->cong_type = CONG_TYPE_HC3;
>> + break;
>> + case HNS_ROCE_CREATE_QP_FLAGS_DIP:
>> + hr_qp->cong_type = CONG_TYPE_DIP;
>> + break;
>> + default:
>> + hr_qp->cong_type = 0;
>> + }
>> +
>> + if (!(hr_qp->cong_type & hr_dev->caps.cong_cap)) {
>> + ibdev_err_ratelimited(&hr_dev->ib_dev,
>> + "Unsupported congest type 0x%x, cong_cap = 0x%x.\n",
>> + hr_qp->cong_type, hr_dev->caps.cong_cap);
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + if (hr_qp->ibqp.qp_type == IB_QPT_UD &&
>> + !(hr_qp->cong_type & CONG_TYPE_DCQCN)) {
>> + ibdev_err_ratelimited(&hr_dev->ib_dev,
>> + "Only DCQCN supported for UD. Unsupported congest type 0x%x.\n",
>> + hr_qp->cong_type);
>
> Do not print kernel messages triggered by bad userspace input.
>
> Jason
>
OK. Will remove these printing in next version.
>> + return -EOPNOTSUPP;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int set_congest_param(struct hns_roce_dev *hr_dev,
>> + struct hns_roce_qp *hr_qp,
>> + struct hns_roce_ib_create_qp *ucmd)
>> +{
>> + if (ucmd->comp_mask & HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE)
>> + return set_congest_type(hr_qp, ucmd);
>> +
>> + default_congest_type(hr_dev, hr_qp);
>> +
>> + return 0;
>> +}
>> +
>> static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>> struct ib_qp_init_attr *init_attr,
>> struct ib_udata *udata,
>> @@ -1026,6 +1090,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>> return ret;
>> }
>>
>> + if (init_attr->qp_type == IB_QPT_XRC_TGT)
>> + default_congest_type(hr_dev, hr_qp);
>> +
>> if (udata) {
>> ret = ib_copy_from_udata(ucmd, udata,
>> min(udata->inlen, sizeof(*ucmd)));
>> @@ -1043,6 +1110,10 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
>> ibdev_err(ibdev,
>> "failed to set user SQ size, ret = %d.\n",
>> ret);
>> +
>> + ret = set_congest_param(hr_dev, hr_qp, ucmd);
>> + if (ret)
>> + return ret;
>> } else {
>> if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
>> hr_qp->config = HNS_ROCE_EXSGE_FLAGS;
>> diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
>> index c996e151081e..757095a6c6fc 100644
>> --- a/include/uapi/rdma/hns-abi.h
>> +++ b/include/uapi/rdma/hns-abi.h
>> @@ -81,6 +81,9 @@ struct hns_roce_ib_create_qp {
>> __u8 sq_no_prefetch;
>> __u8 reserved[5];
>> __aligned_u64 sdb_addr;
>> + __aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
>> + __aligned_u64 create_flags;
>> + __aligned_u64 cong_type_flags;
>> };
>>
>> enum hns_roce_qp_cap_flags {
>> @@ -107,6 +110,17 @@ enum {
>> HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
>> };
>>
>> +enum hns_roce_congest_type_flags {
>> + HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
>> + HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
>> + HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
>> + HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
>> +};
>
> Why are these bit flags if they are exclusive?
>
Our FW uses bit flags. Although there is no direct relationship between
FW and ABI, but from the perspective of readability, bit flags are also
used consistently here in ABI.
>> +
>> +enum hns_roce_create_qp_comp_mask {
>> + HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
>
> Why 1<<1 not 1<<0?
This is to keep consistent with our internal ABI, there are some
features not upstream yet.
>
> These are in the wrong order in the file too, they should be before
> their first "use", ie move above struct hns_roce_ib_create_qp
>
OK.
>> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>> __u32 reserved;
>> __u32 config;
>> __u32 max_inline_data;
>> + __u8 reserved0;
>> + __u8 congest_type;
>
> Why this layout?
> > Jason
Same as the 1<<1 issue, to keep consistent with our internal ABI.
Thanks,
Junxian
On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
>
>
> On 2024/2/21 23:52, Jason Gunthorpe wrote:
> > On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
> >> Support userspace configuring congestion control algorithm with
> >> QP granularity. If the algorithm is not specified in userspace,
> >> use the default one.
> >>
> >> Signed-off-by: Junxian Huang <[email protected]>
> >> ---
> >> drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
> >> drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 +---
> >> drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 +-
> >> drivers/infiniband/hw/hns/hns_roce_main.c | 3 +
> >> drivers/infiniband/hw/hns/hns_roce_qp.c | 71 +++++++++++++++++++++
> >> include/uapi/rdma/hns-abi.h | 17 +++++
> >> 6 files changed, 112 insertions(+), 19 deletions(-)
<...>
> >> +
> >> +enum hns_roce_create_qp_comp_mask {
> >> + HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
> >
> > Why 1<<1 not 1<<0?
>
> This is to keep consistent with our internal ABI, there are some
> features not upstream yet.
>
<...>
> >> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
> >> __u32 reserved;
> >> __u32 config;
> >> __u32 max_inline_data;
> >> + __u8 reserved0;
> >> + __u8 congest_type;
> >
> > Why this layout?
> > > Jason
>
> Same as the 1<<1 issue, to keep consistent with our internal ABI.
We are talking about upstream kernel UAPI, there is no internal ABI here.
Please fix it.
Thanks
>
> Thanks,
> Junxian
On 2024/2/26 16:09, Leon Romanovsky wrote:
> On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
>>
>>
>> On 2024/2/21 23:52, Jason Gunthorpe wrote:
>>> On Thu, Feb 08, 2024 at 11:50:38AM +0800, Junxian Huang wrote:
>>>> Support userspace configuring congestion control algorithm with
>>>> QP granularity. If the algorithm is not specified in userspace,
>>>> use the default one.
>>>>
>>>> Signed-off-by: Junxian Huang <[email protected]>
>>>> ---
>>>> drivers/infiniband/hw/hns/hns_roce_device.h | 23 +++++--
>>>> drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 +---
>>>> drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 3 +-
>>>> drivers/infiniband/hw/hns/hns_roce_main.c | 3 +
>>>> drivers/infiniband/hw/hns/hns_roce_qp.c | 71 +++++++++++++++++++++
>>>> include/uapi/rdma/hns-abi.h | 17 +++++
>>>> 6 files changed, 112 insertions(+), 19 deletions(-)
>
> <...>
>
>>>> +
>>>> +enum hns_roce_create_qp_comp_mask {
>>>> + HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
>>>
>>> Why 1<<1 not 1<<0?
>>
>> This is to keep consistent with our internal ABI, there are some
>> features not upstream yet.
>>
>
> <...>
>
>>>> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>>>> __u32 reserved;
>>>> __u32 config;
>>>> __u32 max_inline_data;
>>>> + __u8 reserved0;
>>>> + __u8 congest_type;
>>>
>>> Why this layout?
>>>> Jason
>>
>> Same as the 1<<1 issue, to keep consistent with our internal ABI.
>
> We are talking about upstream kernel UAPI, there is no internal ABI here.
>
> Please fix it.
>
> Thanks
>
Sure. Will fix it in next version.
Thanks,
Junxian
>>
>> Thanks,
>> Junxian
On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
> >> +enum hns_roce_congest_type_flags {
> >> + HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
> >> + HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
> >> + HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
> >> + HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
> >> +};
> >
> > Why are these bit flags if they are exclusive?
> >
>
> Our FW uses bit flags. Although there is no direct relationship between
> FW and ABI, but from the perspective of readability, bit flags are also
> used consistently here in ABI.
Don't do that in uapi.
> >> +enum hns_roce_create_qp_comp_mask {
> >> + HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
> >
> > Why 1<<1 not 1<<0?
>
> This is to keep consistent with our internal ABI, there are some
> features not upstream yet.
Nope, pack them tightly. Don't keep an "internal ABI"
> >> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
> >> __u32 reserved;
> >> __u32 config;
> >> __u32 max_inline_data;
> >> + __u8 reserved0;
> >> + __u8 congest_type;
> >
> > Why this layout?
>
> Same as the 1<<1 issue, to keep consistent with our internal ABI.
Same answer
Jason
Problems below will be fixed in next version.
Thanks,
Junxian
On 2024/2/26 22:06, Jason Gunthorpe wrote:
> On Thu, Feb 22, 2024 at 03:06:20PM +0800, Junxian Huang wrote:
>>>> +enum hns_roce_congest_type_flags {
>>>> + HNS_ROCE_CREATE_QP_FLAGS_DCQCN = 1 << 0,
>>>> + HNS_ROCE_CREATE_QP_FLAGS_LDCP = 1 << 1,
>>>> + HNS_ROCE_CREATE_QP_FLAGS_HC3 = 1 << 2,
>>>> + HNS_ROCE_CREATE_QP_FLAGS_DIP = 1 << 3,
>>>> +};
>>>
>>> Why are these bit flags if they are exclusive?
>>>
>>
>> Our FW uses bit flags. Although there is no direct relationship between
>> FW and ABI, but from the perspective of readability, bit flags are also
>> used consistently here in ABI.
>
> Don't do that in uapi.
>
>>>> +enum hns_roce_create_qp_comp_mask {
>>>> + HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE = 1 << 1,
>>>
>>> Why 1<<1 not 1<<0?
>>
>> This is to keep consistent with our internal ABI, there are some
>> features not upstream yet.
>
> Nope, pack them tightly. Don't keep an "internal ABI"
>
>>>> @@ -114,6 +128,9 @@ struct hns_roce_ib_alloc_ucontext_resp {
>>>> __u32 reserved;
>>>> __u32 config;
>>>> __u32 max_inline_data;
>>>> + __u8 reserved0;
>>>> + __u8 congest_type;
>>>
>>> Why this layout?
>>
>> Same as the 1<<1 issue, to keep consistent with our internal ABI.
>
> Same answer
>
> Jason