2023-12-25 07:57:31

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 0/6] RDMA/hns: Improvement for multi-level addressing

This series optimizes multi-level addressing for hns.

Patch #1, #2 and #6 are optimization of multi-level addressing codes.

Patch #3 is prepared for the following optimizations.

Patch #4 and #5 introduce adaptive pagesize and hopnum to improve HW
performance.

Chengchang Tang (5):
RDMA/hns: Refactor mtr find
RDMA/hns: Refactor mtr_init_buf_cfg()
RDMA/hns: Alloc MTR memory before alloc_mtt()
RDMA/hns: Support flexible pagesize
RDMA/hns: Support adaptive PBL hopnum

Yunsheng Lin (1):
RDMA/hns: Simplify 'struct hns_roce_hem' allocation

drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
drivers/infiniband/hw/hns/hns_roce_cq.c | 11 +-
drivers/infiniband/hw/hns/hns_roce_device.h | 16 +-
drivers/infiniband/hw/hns/hns_roce_hem.c | 95 +----
drivers/infiniband/hw/hns/hns_roce_hem.h | 56 +--
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 111 +++---
drivers/infiniband/hw/hns/hns_roce_mr.c | 386 +++++++++++++++-----
7 files changed, 386 insertions(+), 295 deletions(-)

--
2.30.0



2023-12-25 07:57:50

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 1/6] RDMA/hns: Refactor mtr find

From: Chengchang Tang <[email protected]>

hns_roce_mtr_find() is a collection of multiple functions, and the
return value is also difficult to understand, which is not conducive
to modification and maintenance.

Separate the function of obtaining MTR root BA from this function.
And some adjustments has been made to improve readability.

Signed-off-by: Chengchang Tang <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_cq.c | 11 +--
drivers/infiniband/hw/hns/hns_roce_device.h | 7 +-
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 102 ++++++++++----------
drivers/infiniband/hw/hns/hns_roce_mr.c | 86 +++++++++++------
4 files changed, 121 insertions(+), 85 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 1b6d16af8c12..7250d0643b5c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -133,14 +133,12 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
struct ib_device *ibdev = &hr_dev->ib_dev;
u64 mtts[MTT_MIN_COUNT] = {};
- dma_addr_t dma_handle;
int ret;

- ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts),
- &dma_handle);
- if (!ret) {
+ ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts));
+ if (ret) {
ibdev_err(ibdev, "failed to find CQ mtr, ret = %d.\n", ret);
- return -EINVAL;
+ return ret;
}

/* Get CQC memory HEM(Hardware Entry Memory) table */
@@ -157,7 +155,8 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
goto err_put;
}

- ret = hns_roce_create_cqc(hr_dev, hr_cq, mtts, dma_handle);
+ ret = hns_roce_create_cqc(hr_dev, hr_cq, mtts,
+ hns_roce_get_mtr_ba(&hr_cq->mtr));
if (ret)
goto err_xa;

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index b1fce5ddf631..dd652dc090b0 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1152,8 +1152,13 @@ void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev);

/* hns roce hw need current block and next block addr from mtt */
#define MTT_MIN_COUNT 2
+static inline dma_addr_t hns_roce_get_mtr_ba(struct hns_roce_mtr *mtr)
+{
+ return mtr->hem_cfg.root_ba;
+}
+
int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
- u32 offset, u64 *mtt_buf, int mtt_max, u64 *base_addr);
+ u32 offset, u64 *mtt_buf, int mtt_max);
int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
struct hns_roce_buf_attr *buf_attr,
unsigned int page_shift, struct ib_udata *udata,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 8206daea6767..94e9e6a237cf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3195,21 +3195,22 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev,
u64 pages[HNS_ROCE_V2_MAX_INNER_MTPT_NUM] = { 0 };
struct ib_device *ibdev = &hr_dev->ib_dev;
dma_addr_t pbl_ba;
- int i, count;
+ int ret;
+ int i;

- count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages,
- min_t(int, ARRAY_SIZE(pages), mr->npages),
- &pbl_ba);
- if (count < 1) {
- ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n",
- count);
- return -ENOBUFS;
+ ret = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages,
+ min_t(int, ARRAY_SIZE(pages), mr->npages));
+ if (ret) {
+ ibdev_err(ibdev, "failed to find PBL mtr, ret = %d.\n", ret);
+ return ret;
}

/* Aligned to the hardware address access unit */
- for (i = 0; i < count; i++)
+ for (i = 0; i < ARRAY_SIZE(pages); i++)
pages[i] >>= 6;

+ pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr);
+
mpt_entry->pbl_size = cpu_to_le32(mr->npages);
mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3);
hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3));
@@ -3308,18 +3309,12 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev,
static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev,
void *mb_buf, struct hns_roce_mr *mr)
{
- struct ib_device *ibdev = &hr_dev->ib_dev;
+ dma_addr_t pbl_ba = hns_roce_get_mtr_ba(&mr->pbl_mtr);
struct hns_roce_v2_mpt_entry *mpt_entry;
- dma_addr_t pbl_ba = 0;

mpt_entry = mb_buf;
memset(mpt_entry, 0, sizeof(*mpt_entry));

- if (hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, NULL, 0, &pbl_ba) < 0) {
- ibdev_err(ibdev, "failed to find frmr mtr.\n");
- return -ENOBUFS;
- }
-
hr_reg_write(mpt_entry, MPT_ST, V2_MPT_ST_FREE);
hr_reg_write(mpt_entry, MPT_PD, mr->pd);

@@ -4346,17 +4341,20 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
{
u64 mtts[MTT_MIN_COUNT] = { 0 };
u64 wqe_sge_ba;
- int count;
+ int ret;

/* Search qp buf's mtts */
- count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts,
- MTT_MIN_COUNT, &wqe_sge_ba);
- if (hr_qp->rq.wqe_cnt && count < 1) {
+ ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts,
+ MTT_MIN_COUNT);
+ if (hr_qp->rq.wqe_cnt && ret) {
ibdev_err(&hr_dev->ib_dev,
- "failed to find RQ WQE, QPN = 0x%lx.\n", hr_qp->qpn);
- return -EINVAL;
+ "failed to find QP(0x%lx) RQ WQE buf, ret = %d.\n",
+ hr_qp->qpn, ret);
+ return ret;
}

+ wqe_sge_ba = hns_roce_get_mtr_ba(&hr_qp->mtr);
+
context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3);
qpc_mask->wqe_sge_ba = 0;

@@ -4418,23 +4416,23 @@ static int config_qp_sq_buf(struct hns_roce_dev *hr_dev,
struct ib_device *ibdev = &hr_dev->ib_dev;
u64 sge_cur_blk = 0;
u64 sq_cur_blk = 0;
- int count;
+ int ret;

/* search qp buf's mtts */
- count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL);
- if (count < 1) {
- ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf.\n",
- hr_qp->qpn);
- return -EINVAL;
+ ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->sq.offset,
+ &sq_cur_blk, 1);
+ if (ret) {
+ ibdev_err(ibdev, "failed to find QP(0x%lx) SQ WQE buf, ret = %d.\n",
+ hr_qp->qpn, ret);
+ return ret;
}
if (hr_qp->sge.sge_cnt > 0) {
- count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
- hr_qp->sge.offset,
- &sge_cur_blk, 1, NULL);
- if (count < 1) {
- ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n",
- hr_qp->qpn);
- return -EINVAL;
+ ret = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+ hr_qp->sge.offset, &sge_cur_blk, 1);
+ if (ret) {
+ ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf, ret = %d.\n",
+ hr_qp->qpn, ret);
+ return ret;
}
}

@@ -5581,18 +5579,20 @@ static int hns_roce_v2_write_srqc_index_queue(struct hns_roce_srq *srq,
struct ib_device *ibdev = srq->ibsrq.device;
struct hns_roce_dev *hr_dev = to_hr_dev(ibdev);
u64 mtts_idx[MTT_MIN_COUNT] = {};
- dma_addr_t dma_handle_idx = 0;
+ dma_addr_t dma_handle_idx;
int ret;

/* Get physical address of idx que buf */
ret = hns_roce_mtr_find(hr_dev, &idx_que->mtr, 0, mtts_idx,
- ARRAY_SIZE(mtts_idx), &dma_handle_idx);
- if (ret < 1) {
+ ARRAY_SIZE(mtts_idx));
+ if (ret) {
ibdev_err(ibdev, "failed to find mtr for SRQ idx, ret = %d.\n",
ret);
- return -ENOBUFS;
+ return ret;
}

+ dma_handle_idx = hns_roce_get_mtr_ba(&idx_que->mtr);
+
hr_reg_write(ctx, SRQC_IDX_HOP_NUM,
to_hr_hem_hopnum(hr_dev->caps.idx_hop_num, srq->wqe_cnt));

@@ -5624,20 +5624,22 @@ static int hns_roce_v2_write_srqc(struct hns_roce_srq *srq, void *mb_buf)
struct hns_roce_dev *hr_dev = to_hr_dev(ibdev);
struct hns_roce_srq_context *ctx = mb_buf;
u64 mtts_wqe[MTT_MIN_COUNT] = {};
- dma_addr_t dma_handle_wqe = 0;
+ dma_addr_t dma_handle_wqe;
int ret;

memset(ctx, 0, sizeof(*ctx));

/* Get the physical address of srq buf */
ret = hns_roce_mtr_find(hr_dev, &srq->buf_mtr, 0, mtts_wqe,
- ARRAY_SIZE(mtts_wqe), &dma_handle_wqe);
- if (ret < 1) {
+ ARRAY_SIZE(mtts_wqe));
+ if (ret) {
ibdev_err(ibdev, "failed to find mtr for SRQ WQE, ret = %d.\n",
ret);
- return -ENOBUFS;
+ return ret;
}

+ dma_handle_wqe = hns_roce_get_mtr_ba(&srq->buf_mtr);
+
hr_reg_write(ctx, SRQC_SRQ_ST, 1);
hr_reg_write_bool(ctx, SRQC_SRQ_TYPE,
srq->ibsrq.srq_type == IB_SRQT_XRC);
@@ -6353,7 +6355,7 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
u64 eqe_ba[MTT_MIN_COUNT] = { 0 };
struct hns_roce_eq_context *eqc;
u64 bt_ba = 0;
- int count;
+ int ret;

eqc = mb_buf;
memset(eqc, 0, sizeof(struct hns_roce_eq_context));
@@ -6361,13 +6363,15 @@ static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
init_eq_config(hr_dev, eq);

/* if not multi-hop, eqe buffer only use one trunk */
- count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba, MTT_MIN_COUNT,
- &bt_ba);
- if (count < 1) {
- dev_err(hr_dev->dev, "failed to find EQE mtr\n");
- return -ENOBUFS;
+ ret = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba,
+ ARRAY_SIZE(eqe_ba));
+ if (ret) {
+ dev_err(hr_dev->dev, "failed to find EQE mtr, ret = %d\n", ret);
+ return ret;
}

+ bt_ba = hns_roce_get_mtr_ba(&eq->mtr);
+
hr_reg_write(eqc, EQC_EQ_ST, HNS_ROCE_V2_EQ_STATE_VALID);
hr_reg_write(eqc, EQC_EQE_HOP_NUM, eq->hop_num);
hr_reg_write(eqc, EQC_OVER_IGNORE, eq->over_ignore);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 91cd580480fe..9537a2c00bb6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -809,47 +809,53 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
return ret;
}

-int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
- u32 offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
+static int hns_roce_get_direct_addr_mtt(struct hns_roce_hem_cfg *cfg,
+ u32 start_index, u64 *mtt_buf,
+ int mtt_cnt)
{
- struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
- int mtt_count, left;
- u32 start_index;
+ int mtt_count;
int total = 0;
- __le64 *mtts;
u32 npage;
u64 addr;

- if (!mtt_buf || mtt_max < 1)
- goto done;
-
- /* no mtt memory in direct mode, so just return the buffer address */
- if (cfg->is_direct) {
- start_index = offset >> HNS_HW_PAGE_SHIFT;
- for (mtt_count = 0; mtt_count < cfg->region_count &&
- total < mtt_max; mtt_count++) {
- npage = cfg->region[mtt_count].offset;
- if (npage < start_index)
- continue;
+ if (mtt_cnt > cfg->region_count)
+ return -EINVAL;

- addr = cfg->root_ba + (npage << HNS_HW_PAGE_SHIFT);
- mtt_buf[total] = addr;
+ for (mtt_count = 0; mtt_count < cfg->region_count && total < mtt_cnt;
+ mtt_count++) {
+ npage = cfg->region[mtt_count].offset;
+ if (npage < start_index)
+ continue;

- total++;
- }
+ addr = cfg->root_ba + (npage << HNS_HW_PAGE_SHIFT);
+ mtt_buf[total] = addr;

- goto done;
+ total++;
}

- start_index = offset >> cfg->buf_pg_shift;
- left = mtt_max;
+ if (!total)
+ return -ENOENT;
+
+ return 0;
+}
+
+static int hns_roce_get_mhop_mtt(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr, u32 start_index,
+ u64 *mtt_buf, int mtt_cnt)
+{
+ int left = mtt_cnt;
+ int total = 0;
+ int mtt_count;
+ __le64 *mtts;
+ u32 npage;
+
while (left > 0) {
mtt_count = 0;
mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
start_index + total,
&mtt_count);
if (!mtts || !mtt_count)
- goto done;
+ break;

npage = min(mtt_count, left);
left -= npage;
@@ -857,11 +863,33 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
mtt_buf[total++] = le64_to_cpu(mtts[mtt_count]);
}

-done:
- if (base_addr)
- *base_addr = cfg->root_ba;
+ if (!total)
+ return -ENOENT;
+
+ return 0;
+}
+
+int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+ u32 offset, u64 *mtt_buf, int mtt_max)
+{
+ struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
+ u32 start_index;
+ int ret;
+
+ if (!mtt_buf || mtt_max < 1)
+ return -EINVAL;

- return total;
+ /* no mtt memory in direct mode, so just return the buffer address */
+ if (cfg->is_direct) {
+ start_index = offset >> HNS_HW_PAGE_SHIFT;
+ ret = hns_roce_get_direct_addr_mtt(cfg, start_index,
+ mtt_buf, mtt_max);
+ } else {
+ start_index = offset >> cfg->buf_pg_shift;
+ ret = hns_roce_get_mhop_mtt(hr_dev, mtr, start_index,
+ mtt_buf, mtt_max);
+ }
+ return ret;
}

static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
--
2.30.0


2023-12-25 07:58:08

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 2/6] RDMA/hns: Refactor mtr_init_buf_cfg()

From: Chengchang Tang <[email protected]>

page_shift and page_cnt is only used in mtr_map_bufs(). And these
parameter could be calculated indepedently.

Strip the computation of page_shift and page_cnt from mtr_init_buf_cfg(),
reducing the number of parameters of it. This helps reducing coupling
between mtr_init_buf_cfg() and mtr_map_bufs().

Signed-off-by: Chengchang Tang <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_mr.c | 76 +++++++++++++++----------
1 file changed, 45 insertions(+), 31 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 9537a2c00bb6..adc401aea8df 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -707,14 +707,37 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
return 0;
}

-static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
- int page_count, unsigned int page_shift)
+static int cal_mtr_pg_cnt(struct hns_roce_mtr *mtr)
+{
+ struct hns_roce_buf_region *region;
+ int page_cnt = 0;
+ int i;
+
+ for (i = 0; i < mtr->hem_cfg.region_count; i++) {
+ region = &mtr->hem_cfg.region[i];
+ page_cnt += region->count;
+ }
+
+ return page_cnt;
+}
+
+static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
{
struct ib_device *ibdev = &hr_dev->ib_dev;
+ int page_count = cal_mtr_pg_cnt(mtr);
+ unsigned int page_shift;
dma_addr_t *pages;
int npage;
int ret;

+ /* When HEM buffer uses 0-level addressing, the page size is
+ * equal to the whole buffer size, and we split the buffer into
+ * small pages which is used to check whether the adjacent
+ * units are in the continuous space and its size is fixed to
+ * 4K based on hns ROCEE's requirement.
+ */
+ page_shift = mtr->hem_cfg.is_direct ? HNS_HW_PAGE_SHIFT :
+ mtr->hem_cfg.buf_pg_shift;
/* alloc a tmp array to store buffer's dma address */
pages = kvcalloc(page_count, sizeof(dma_addr_t), GFP_KERNEL);
if (!pages)
@@ -894,37 +917,30 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,

static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
struct hns_roce_buf_attr *attr,
- struct hns_roce_hem_cfg *cfg,
- unsigned int *buf_page_shift, u64 unalinged_size)
+ struct hns_roce_hem_cfg *cfg, u64 unalinged_size)
{
+ struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_buf_region *r;
u64 first_region_padding;
int page_cnt, region_cnt;
- unsigned int page_shift;
+ size_t buf_pg_sz;
size_t buf_size;

/* If mtt is disabled, all pages must be within a continuous range */
cfg->is_direct = !mtr_has_mtt(attr);
buf_size = mtr_bufs_size(attr);
if (cfg->is_direct) {
- /* When HEM buffer uses 0-level addressing, the page size is
- * equal to the whole buffer size, and we split the buffer into
- * small pages which is used to check whether the adjacent
- * units are in the continuous space and its size is fixed to
- * 4K based on hns ROCEE's requirement.
- */
- page_shift = HNS_HW_PAGE_SHIFT;
-
- /* The ROCEE requires the page size to be 4K * 2 ^ N. */
+ buf_pg_sz = HNS_HW_PAGE_SIZE;
cfg->buf_pg_count = 1;
+ /* The ROCEE requires the page size to be 4K * 2 ^ N. */
cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT +
order_base_2(DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE));
first_region_padding = 0;
} else {
- page_shift = attr->page_shift;
cfg->buf_pg_count = DIV_ROUND_UP(buf_size + unalinged_size,
- 1 << page_shift);
- cfg->buf_pg_shift = page_shift;
+ 1 << attr->page_shift);
+ cfg->buf_pg_shift = attr->page_shift;
+ buf_pg_sz = 1 << cfg->buf_pg_shift;
first_region_padding = unalinged_size;
}

@@ -937,7 +953,7 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
r->offset = page_cnt;
buf_size = hr_hw_page_align(attr->region[region_cnt].size +
first_region_padding);
- r->count = DIV_ROUND_UP(buf_size, 1 << page_shift);
+ r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
first_region_padding = 0;
page_cnt += r->count;
r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum,
@@ -945,9 +961,13 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
}

cfg->region_count = region_cnt;
- *buf_page_shift = page_shift;
+ if (cfg->region_count < 1 || cfg->buf_pg_shift < HNS_HW_PAGE_SHIFT) {
+ ibdev_err(ibdev, "failed to init mtr cfg, count %d shift %u.\n",
+ cfg->region_count, cfg->buf_pg_shift);
+ return -EINVAL;
+ }

- return page_cnt;
+ return 0;
}

static u64 cal_pages_per_l1ba(unsigned int ba_per_bt, unsigned int hopnum)
@@ -1035,18 +1055,12 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
unsigned long user_addr)
{
struct ib_device *ibdev = &hr_dev->ib_dev;
- unsigned int buf_page_shift = 0;
- int buf_page_cnt;
int ret;

- buf_page_cnt = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg,
- &buf_page_shift,
- udata ? user_addr & ~PAGE_MASK : 0);
- if (buf_page_cnt < 1 || buf_page_shift < HNS_HW_PAGE_SHIFT) {
- ibdev_err(ibdev, "failed to init mtr cfg, count %d shift %u.\n",
- buf_page_cnt, buf_page_shift);
- return -EINVAL;
- }
+ ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg,
+ udata ? user_addr & ~PAGE_MASK : 0);
+ if (ret)
+ return ret;

ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift);
if (ret) {
@@ -1070,7 +1084,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
}

/* Write buffer's dma address to MTT */
- ret = mtr_map_bufs(hr_dev, mtr, buf_page_cnt, buf_page_shift);
+ ret = mtr_map_bufs(hr_dev, mtr);
if (ret)
ibdev_err(ibdev, "failed to map mtr bufs, ret = %d.\n", ret);
else
--
2.30.0


2023-12-25 07:58:27

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 3/6] RDMA/hns: Alloc MTR memory before alloc_mtt()

From: Chengchang Tang <[email protected]>

MTR memory allocation do not depend on allocation of mtt.

This patch moves the allocation of mtr before mtt in preparation for
the following optimization.

Signed-off-by: Chengchang Tang <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_mr.c | 47 ++++++++++++++-----------
1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index adc401aea8df..74ea9d8482b9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -695,7 +695,7 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
mtr->umem = NULL;
mtr->kmem = hns_roce_buf_alloc(hr_dev, total_size,
buf_attr->page_shift,
- mtr->hem_cfg.is_direct ?
+ !mtr_has_mtt(buf_attr) ?
HNS_ROCE_BUF_DIRECT : 0);
if (IS_ERR(mtr->kmem)) {
ibdev_err(ibdev, "failed to alloc kmem, ret = %ld.\n",
@@ -1054,45 +1054,52 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
unsigned int ba_page_shift, struct ib_udata *udata,
unsigned long user_addr)
{
+ u64 pgoff = udata ? user_addr & ~PAGE_MASK : 0;
struct ib_device *ibdev = &hr_dev->ib_dev;
int ret;

- ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg,
- udata ? user_addr & ~PAGE_MASK : 0);
- if (ret)
- return ret;
-
- ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift);
- if (ret) {
- ibdev_err(ibdev, "failed to alloc mtr mtt, ret = %d.\n", ret);
- return ret;
- }
-
/* The caller has its own buffer list and invokes the hns_roce_mtr_map()
* to finish the MTT configuration.
*/
if (buf_attr->mtt_only) {
mtr->umem = NULL;
mtr->kmem = NULL;
- return 0;
+ } else {
+ ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr);
+ if (ret) {
+ ibdev_err(ibdev,
+ "failed to alloc mtr bufs, ret = %d.\n", ret);
+ return ret;
+ }
}

- ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, udata, user_addr);
+ ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, pgoff);
+ if (ret)
+ goto err_init_buf;
+
+ ret = mtr_alloc_mtt(hr_dev, mtr, ba_page_shift);
if (ret) {
- ibdev_err(ibdev, "failed to alloc mtr bufs, ret = %d.\n", ret);
- goto err_alloc_mtt;
+ ibdev_err(ibdev, "failed to alloc mtr mtt, ret = %d.\n", ret);
+ goto err_init_buf;
}

+ if (buf_attr->mtt_only)
+ return 0;
+
/* Write buffer's dma address to MTT */
ret = mtr_map_bufs(hr_dev, mtr);
- if (ret)
+ if (ret) {
ibdev_err(ibdev, "failed to map mtr bufs, ret = %d.\n", ret);
- else
- return 0;
+ goto err_alloc_mtt;
+ }
+
+ return 0;

- mtr_free_bufs(hr_dev, mtr);
err_alloc_mtt:
mtr_free_mtt(hr_dev, mtr);
+err_init_buf:
+ mtr_free_bufs(hr_dev, mtr);
+
return ret;
}

--
2.30.0


2023-12-25 07:59:03

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize

From: Chengchang Tang <[email protected]>

In the current implementation, a fixed page size is used to
configure the PBL, which is not flexible enough and is not
conducive to the performance of the HW.

Signed-off-by: Chengchang Tang <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
3 files changed, 139 insertions(+), 44 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c
index 11a78ceae568..60269322ba98 100644
--- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
+++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
@@ -137,12 +137,6 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
int total = 0;
int i;

- if (page_shift > buf->trunk_shift) {
- dev_err(hr_dev->dev, "failed to check kmem buf shift %u > %u\n",
- page_shift, buf->trunk_shift);
- return -EINVAL;
- }
-
offset = 0;
max_size = buf->ntrunks << buf->trunk_shift;
for (i = 0; i < buf_cnt && offset < max_size; i++) {
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index dd652dc090b0..1a8516019516 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -179,6 +179,7 @@ enum {

#define HNS_ROCE_CMD_SUCCESS 1

+#define HNS_ROCE_MAX_HOP_NUM 3
/* The minimum page size is 4K for hardware */
#define HNS_HW_PAGE_SHIFT 12
#define HNS_HW_PAGE_SIZE (1 << HNS_HW_PAGE_SHIFT)
@@ -269,6 +270,11 @@ struct hns_roce_hem_list {
dma_addr_t root_ba; /* pointer to the root ba table */
};

+enum mtr_type {
+ MTR_DEFAULT = 0,
+ MTR_PBL,
+};
+
struct hns_roce_buf_attr {
struct {
size_t size; /* region size */
@@ -277,7 +283,10 @@ struct hns_roce_buf_attr {
unsigned int region_count; /* valid region count */
unsigned int page_shift; /* buffer page shift */
unsigned int user_access; /* umem access flag */
+ u64 iova;
+ enum mtr_type type;
bool mtt_only; /* only alloc buffer-required MTT memory */
+ bool adaptive; /* adaptive for page_shift and hopnum */
};

struct hns_roce_hem_cfg {
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 74ea9d8482b9..d3a8b342c38c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -32,6 +32,7 @@
*/

#include <linux/vmalloc.h>
+#include <linux/count_zeros.h>
#include <rdma/ib_umem.h>
#include <linux/math.h>
#include "hns_roce_device.h"
@@ -103,6 +104,10 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
buf_attr.user_access = mr->access;
/* fast MR's buffer is alloced before mapping, not at creation */
buf_attr.mtt_only = is_fast;
+ buf_attr.iova = mr->iova;
+ /* pagesize and hopnum is fixed for fast MR */
+ buf_attr.adaptive = !is_fast;
+ buf_attr.type = MTR_PBL;

err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT,
@@ -721,6 +726,16 @@ static int cal_mtr_pg_cnt(struct hns_roce_mtr *mtr)
return page_cnt;
}

+static bool need_split_huge_page(struct hns_roce_mtr *mtr)
+{
+ /* When HEM buffer uses 0-level addressing, the page size is
+ * equal to the whole buffer size. If the current MTR has multiple
+ * regions, we split the buffer into small pages(4k, required by hns
+ * ROCEE). These pages will be used in multiple regions.
+ */
+ return mtr->hem_cfg.is_direct && mtr->hem_cfg.region_count > 1;
+}
+
static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
{
struct ib_device *ibdev = &hr_dev->ib_dev;
@@ -730,14 +745,8 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
int npage;
int ret;

- /* When HEM buffer uses 0-level addressing, the page size is
- * equal to the whole buffer size, and we split the buffer into
- * small pages which is used to check whether the adjacent
- * units are in the continuous space and its size is fixed to
- * 4K based on hns ROCEE's requirement.
- */
- page_shift = mtr->hem_cfg.is_direct ? HNS_HW_PAGE_SHIFT :
- mtr->hem_cfg.buf_pg_shift;
+ page_shift = need_split_huge_page(mtr) ? HNS_HW_PAGE_SHIFT :
+ mtr->hem_cfg.buf_pg_shift;
/* alloc a tmp array to store buffer's dma address */
pages = kvcalloc(page_count, sizeof(dma_addr_t), GFP_KERNEL);
if (!pages)
@@ -757,7 +766,7 @@ static int mtr_map_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
goto err_alloc_list;
}

- if (mtr->hem_cfg.is_direct && npage > 1) {
+ if (need_split_huge_page(mtr) && npage > 1) {
ret = mtr_check_direct_pages(pages, npage, page_shift);
if (ret) {
ibdev_err(ibdev, "failed to check %s page: %d / %d.\n",
@@ -915,56 +924,136 @@ int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
return ret;
}

-static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
- struct hns_roce_buf_attr *attr,
- struct hns_roce_hem_cfg *cfg, u64 unalinged_size)
+/**
+ * hns_roce_find_buf_best_pgsz - Find best page size of the kmem.
+ *
+ * @hr_dev: hns_roce_dev struct
+ * @buf: kmem
+ *
+ * This function helps all DMA regions using multi-level addressing to
+ * find the best page size in kmem.
+ *
+ * Returns 0 if the find the best pagesize failed.
+ */
+static unsigned int hns_roce_find_buf_best_pgsz(struct hns_roce_dev *hr_dev,
+ struct hns_roce_buf *buf)
+{
+ unsigned long pgsz_bitmap = hr_dev->caps.page_size_cap;
+ u64 trunk_size = 1 << buf->trunk_shift;
+ u64 buf_size = trunk_size * buf->ntrunks;
+ dma_addr_t dma_addr = 0;
+ dma_addr_t mask;
+ int i;
+
+ /* trunk_shift determines the size of each buf not PAGE_SIZE. */
+ pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, buf->trunk_shift);
+ /* Best page size should smaller than the actual size of the block. */
+ mask = pgsz_bitmap &
+ GENMASK(BITS_PER_LONG - 1,
+ bits_per((buf_size + dma_addr) ^ dma_addr));
+
+ for (i = 0; i < buf->ntrunks; i++) {
+ /* Walk kmem bufs to make sure that the start address of the
+ * current DMA block and the end address of the previous DMA
+ * block have the same offset, otherwise the page will be
+ * reduced.
+ */
+ mask |= dma_addr ^ buf->trunk_list[i].map;
+ dma_addr = buf->trunk_list[i].map + trunk_size;
+ }
+
+ if (mask)
+ pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
+ return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0;
+}
+
+static int get_best_page_shift(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr,
+ struct hns_roce_buf_attr *buf_attr)
+{
+ unsigned int page_sz;
+
+ if (!buf_attr->adaptive || buf_attr->type != MTR_PBL)
+ return 0;
+
+ if (mtr->umem)
+ page_sz = ib_umem_find_best_pgsz(mtr->umem,
+ hr_dev->caps.page_size_cap,
+ buf_attr->iova);
+ else
+ page_sz = hns_roce_find_buf_best_pgsz(hr_dev, mtr->kmem);
+
+ if (!page_sz)
+ return -EINVAL;
+
+ buf_attr->page_shift = order_base_2(page_sz);
+ return 0;
+}
+
+static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
+ struct hns_roce_buf_attr *attr)
{
struct ib_device *ibdev = &hr_dev->ib_dev;
+
+ if (attr->region_count > ARRAY_SIZE(attr->region) ||
+ attr->region_count < 1 || attr->page_shift < HNS_HW_PAGE_SHIFT) {
+ ibdev_err(ibdev,
+ "invalid buf attr, region count %d, page shift %u.\n",
+ attr->region_count, attr->page_shift);
+ return false;
+ }
+
+ return true;
+}
+
+static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr,
+ struct hns_roce_buf_attr *attr)
+{
+ struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
struct hns_roce_buf_region *r;
- u64 first_region_padding;
- int page_cnt, region_cnt;
size_t buf_pg_sz;
size_t buf_size;
+ int page_cnt, i;
+ u64 pgoff = 0;
+
+ if (!is_buf_attr_valid(hr_dev, attr))
+ return -EINVAL;

/* If mtt is disabled, all pages must be within a continuous range */
cfg->is_direct = !mtr_has_mtt(attr);
+ cfg->region_count = attr->region_count;
buf_size = mtr_bufs_size(attr);
- if (cfg->is_direct) {
+ if (need_split_huge_page(mtr)) {
buf_pg_sz = HNS_HW_PAGE_SIZE;
cfg->buf_pg_count = 1;
/* The ROCEE requires the page size to be 4K * 2 ^ N. */
cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT +
order_base_2(DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE));
- first_region_padding = 0;
} else {
- cfg->buf_pg_count = DIV_ROUND_UP(buf_size + unalinged_size,
- 1 << attr->page_shift);
+ buf_pg_sz = 1 << attr->page_shift;
+ cfg->buf_pg_count = mtr->umem ?
+ ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz) :
+ DIV_ROUND_UP(buf_size, buf_pg_sz);
cfg->buf_pg_shift = attr->page_shift;
- buf_pg_sz = 1 << cfg->buf_pg_shift;
- first_region_padding = unalinged_size;
+ pgoff = mtr->umem ? mtr->umem->address & ~PAGE_MASK : 0;
}

/* Convert buffer size to page index and page count for each region and
* the buffer's offset needs to be appended to the first region.
*/
- for (page_cnt = 0, region_cnt = 0; region_cnt < attr->region_count &&
- region_cnt < ARRAY_SIZE(cfg->region); region_cnt++) {
- r = &cfg->region[region_cnt];
+ for (page_cnt = 0, i = 0; i < attr->region_count; i++) {
+ r = &cfg->region[i];
r->offset = page_cnt;
- buf_size = hr_hw_page_align(attr->region[region_cnt].size +
- first_region_padding);
- r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);
- first_region_padding = 0;
- page_cnt += r->count;
- r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum,
- r->count);
- }
+ buf_size = hr_hw_page_align(attr->region[i].size + pgoff);
+ if (attr->type == MTR_PBL && mtr->umem)
+ r->count = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz);
+ else
+ r->count = DIV_ROUND_UP(buf_size, buf_pg_sz);

- cfg->region_count = region_cnt;
- if (cfg->region_count < 1 || cfg->buf_pg_shift < HNS_HW_PAGE_SHIFT) {
- ibdev_err(ibdev, "failed to init mtr cfg, count %d shift %u.\n",
- cfg->region_count, cfg->buf_pg_shift);
- return -EINVAL;
+ pgoff = 0;
+ page_cnt += r->count;
+ r->hopnum = to_hr_hem_hopnum(attr->region[i].hopnum, r->count);
}

return 0;
@@ -1054,7 +1143,6 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
unsigned int ba_page_shift, struct ib_udata *udata,
unsigned long user_addr)
{
- u64 pgoff = udata ? user_addr & ~PAGE_MASK : 0;
struct ib_device *ibdev = &hr_dev->ib_dev;
int ret;

@@ -1071,9 +1159,13 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
"failed to alloc mtr bufs, ret = %d.\n", ret);
return ret;
}
+
+ ret = get_best_page_shift(hr_dev, mtr, buf_attr);
+ if (ret)
+ goto err_init_buf;
}

- ret = mtr_init_buf_cfg(hr_dev, buf_attr, &mtr->hem_cfg, pgoff);
+ ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr);
if (ret)
goto err_init_buf;

--
2.30.0


2023-12-25 07:59:21

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 6/6] RDMA/hns: Simplify 'struct hns_roce_hem' allocation

From: Yunsheng Lin <[email protected]>

'struct hns_roce_hem' is used to refer to the last level of
dma buffer managed by the hw, pointed by a single BA(base
address) in the previous level of BT(base table), so the dma
buffer in 'struct hns_roce_hem' must be contiguous.

Right now the size of dma buffer in 'struct hns_roce_hem' is
decided by mhop->buf_chunk_size in get_hem_table_config(),
which ensure the mhop->buf_chunk_size is power of two of
PAGE_SIZE, so there will be only one contiguous dma buffer
allocated in hns_roce_alloc_hem(), which means hem->chunk_list
and chunk->mem for linking multi dma buffers is unnecessary.

This patch removes the hem->chunk_list and chunk->mem and other
related macro and function accordingly.

Signed-off-by: Yunsheng Lin <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_hem.c | 95 +++++-----------------
drivers/infiniband/hw/hns/hns_roce_hem.h | 56 +------------
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +-
3 files changed, 24 insertions(+), 136 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index c4ac06a33869..a4b3f19161dc 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -249,61 +249,34 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev,
}

static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
- int npages,
unsigned long hem_alloc_size,
gfp_t gfp_mask)
{
- struct hns_roce_hem_chunk *chunk = NULL;
struct hns_roce_hem *hem;
- struct scatterlist *mem;
int order;
void *buf;

WARN_ON(gfp_mask & __GFP_HIGHMEM);

+ order = get_order(hem_alloc_size);
+ if (PAGE_SIZE << order != hem_alloc_size) {
+ dev_err(hr_dev->dev, "invalid hem_alloc_size: %lu!\n",
+ hem_alloc_size);
+ return NULL;
+ }
+
hem = kmalloc(sizeof(*hem),
gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
if (!hem)
return NULL;

- INIT_LIST_HEAD(&hem->chunk_list);
-
- order = get_order(hem_alloc_size);
-
- while (npages > 0) {
- if (!chunk) {
- chunk = kmalloc(sizeof(*chunk),
- gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
- if (!chunk)
- goto fail;
-
- sg_init_table(chunk->mem, HNS_ROCE_HEM_CHUNK_LEN);
- chunk->npages = 0;
- chunk->nsg = 0;
- memset(chunk->buf, 0, sizeof(chunk->buf));
- list_add_tail(&chunk->list, &hem->chunk_list);
- }
+ buf = dma_alloc_coherent(hr_dev->dev, hem_alloc_size,
+ &hem->dma, gfp_mask);
+ if (!buf)
+ goto fail;

- while (1 << order > npages)
- --order;
-
- /*
- * Alloc memory one time. If failed, don't alloc small block
- * memory, directly return fail.
- */
- mem = &chunk->mem[chunk->npages];
- buf = dma_alloc_coherent(hr_dev->dev, PAGE_SIZE << order,
- &sg_dma_address(mem), gfp_mask);
- if (!buf)
- goto fail;
-
- chunk->buf[chunk->npages] = buf;
- sg_dma_len(mem) = PAGE_SIZE << order;
-
- ++chunk->npages;
- ++chunk->nsg;
- npages -= 1 << order;
- }
+ hem->buf = buf;
+ hem->size = hem_alloc_size;

return hem;

@@ -314,20 +287,10 @@ static struct hns_roce_hem *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,

void hns_roce_free_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem *hem)
{
- struct hns_roce_hem_chunk *chunk, *tmp;
- int i;
-
if (!hem)
return;

- list_for_each_entry_safe(chunk, tmp, &hem->chunk_list, list) {
- for (i = 0; i < chunk->npages; ++i)
- dma_free_coherent(hr_dev->dev,
- sg_dma_len(&chunk->mem[i]),
- chunk->buf[i],
- sg_dma_address(&chunk->mem[i]));
- kfree(chunk);
- }
+ dma_free_coherent(hr_dev->dev, hem->size, hem->buf, hem->dma);

kfree(hem);
}
@@ -415,7 +378,6 @@ static int alloc_mhop_hem(struct hns_roce_dev *hr_dev,
{
u32 bt_size = mhop->bt_chunk_size;
struct device *dev = hr_dev->dev;
- struct hns_roce_hem_iter iter;
gfp_t flag;
u64 bt_ba;
u32 size;
@@ -456,16 +418,15 @@ static int alloc_mhop_hem(struct hns_roce_dev *hr_dev,
*/
size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size : bt_size;
flag = GFP_KERNEL | __GFP_NOWARN;
- table->hem[index->buf] = hns_roce_alloc_hem(hr_dev, size >> PAGE_SHIFT,
- size, flag);
+ table->hem[index->buf] = hns_roce_alloc_hem(hr_dev, size, flag);
if (!table->hem[index->buf]) {
ret = -ENOMEM;
goto err_alloc_hem;
}

index->inited |= HEM_INDEX_BUF;
- hns_roce_hem_first(table->hem[index->buf], &iter);
- bt_ba = hns_roce_hem_addr(&iter);
+ bt_ba = table->hem[index->buf]->dma;
+
if (table->type < HEM_TYPE_MTT) {
if (mhop->hop_num == 2)
*(table->bt_l1[index->l1] + mhop->l2_idx) = bt_ba;
@@ -586,7 +547,6 @@ int hns_roce_table_get(struct hns_roce_dev *hr_dev,
}

table->hem[i] = hns_roce_alloc_hem(hr_dev,
- table->table_chunk_size >> PAGE_SHIFT,
table->table_chunk_size,
GFP_KERNEL | __GFP_NOWARN);
if (!table->hem[i]) {
@@ -725,7 +685,6 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_table *table,
unsigned long obj, dma_addr_t *dma_handle)
{
- struct hns_roce_hem_chunk *chunk;
struct hns_roce_hem_mhop mhop;
struct hns_roce_hem *hem;
unsigned long mhop_obj = obj;
@@ -734,7 +693,6 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
int offset, dma_offset;
void *addr = NULL;
u32 hem_idx = 0;
- int length;
int i, j;

mutex_lock(&table->mutex);
@@ -767,23 +725,8 @@ void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
if (!hem)
goto out;

- list_for_each_entry(chunk, &hem->chunk_list, list) {
- for (i = 0; i < chunk->npages; ++i) {
- length = sg_dma_len(&chunk->mem[i]);
- if (dma_handle && dma_offset >= 0) {
- if (length > (u32)dma_offset)
- *dma_handle = sg_dma_address(
- &chunk->mem[i]) + dma_offset;
- dma_offset -= length;
- }
-
- if (length > (u32)offset) {
- addr = chunk->buf[i] + offset;
- goto out;
- }
- offset -= length;
- }
- }
+ *dma_handle = hem->dma + dma_offset;
+ addr = hem->buf + offset;

out:
mutex_unlock(&table->mutex);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h
index 7d23d3c51da4..6fb51db9682b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
@@ -56,10 +56,6 @@ enum {
HEM_TYPE_TRRL,
};

-#define HNS_ROCE_HEM_CHUNK_LEN \
- ((256 - sizeof(struct list_head) - 2 * sizeof(int)) / \
- (sizeof(struct scatterlist) + sizeof(void *)))
-
#define check_whether_bt_num_3(type, hop_num) \
(type < HEM_TYPE_MTT && hop_num == 2)

@@ -72,25 +68,13 @@ enum {
(type >= HEM_TYPE_MTT && hop_num == 1) || \
(type >= HEM_TYPE_MTT && hop_num == HNS_ROCE_HOP_NUM_0))

-struct hns_roce_hem_chunk {
- struct list_head list;
- int npages;
- int nsg;
- struct scatterlist mem[HNS_ROCE_HEM_CHUNK_LEN];
- void *buf[HNS_ROCE_HEM_CHUNK_LEN];
-};
-
struct hns_roce_hem {
- struct list_head chunk_list;
+ void *buf;
+ dma_addr_t dma;
+ unsigned long size;
refcount_t refcount;
};

-struct hns_roce_hem_iter {
- struct hns_roce_hem *hem;
- struct hns_roce_hem_chunk *chunk;
- int page_idx;
-};
-
struct hns_roce_hem_mhop {
u32 hop_num;
u32 buf_chunk_size;
@@ -133,38 +117,4 @@ void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_list *hem_list,
int offset, int *mtt_cnt);

-static inline void hns_roce_hem_first(struct hns_roce_hem *hem,
- struct hns_roce_hem_iter *iter)
-{
- iter->hem = hem;
- iter->chunk = list_empty(&hem->chunk_list) ? NULL :
- list_entry(hem->chunk_list.next,
- struct hns_roce_hem_chunk, list);
- iter->page_idx = 0;
-}
-
-static inline int hns_roce_hem_last(struct hns_roce_hem_iter *iter)
-{
- return !iter->chunk;
-}
-
-static inline void hns_roce_hem_next(struct hns_roce_hem_iter *iter)
-{
- if (++iter->page_idx >= iter->chunk->nsg) {
- if (iter->chunk->list.next == &iter->hem->chunk_list) {
- iter->chunk = NULL;
- return;
- }
-
- iter->chunk = list_entry(iter->chunk->list.next,
- struct hns_roce_hem_chunk, list);
- iter->page_idx = 0;
- }
-}
-
-static inline dma_addr_t hns_roce_hem_addr(struct hns_roce_hem_iter *iter)
-{
- return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
-}
-
#endif /* _HNS_ROCE_HEM_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 94e9e6a237cf..de56dc6e3226 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -4058,7 +4058,6 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,
struct hns_roce_hem_table *table, int obj,
u32 step_idx)
{
- struct hns_roce_hem_iter iter;
struct hns_roce_hem_mhop mhop;
struct hns_roce_hem *hem;
unsigned long mhop_obj = obj;
@@ -4095,12 +4094,8 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,

if (check_whether_last_step(hop_num, step_idx)) {
hem = table->hem[hem_idx];
- for (hns_roce_hem_first(hem, &iter);
- !hns_roce_hem_last(&iter); hns_roce_hem_next(&iter)) {
- bt_ba = hns_roce_hem_addr(&iter);
- ret = set_hem_to_hw(hr_dev, obj, bt_ba, table->type,
- step_idx);
- }
+
+ ret = set_hem_to_hw(hr_dev, obj, hem->dma, table->type, step_idx);
} else {
if (step_idx == 0)
bt_ba = table->bt_l0_dma_addr[i];
--
2.30.0


2023-12-25 07:59:25

by Junxian Huang

[permalink] [raw]
Subject: [PATCH for-next 5/6] RDMA/hns: Support adaptive PBL hopnum

From: Chengchang Tang <[email protected]>

In the current implementation, a fixed addressing level is used for
PBL. But in fact, the necessary addressing level is related to page
size and the size of MR.

This patch calculates the addressing level according to page size
and the size of MR, and uses the addressing level to configure the
PBL.

Signed-off-by: Chengchang Tang <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_mr.c | 57 +++++++++++++++++++++++--
1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index d3a8b342c38c..2a8e02dd6884 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -112,10 +112,13 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT,
udata, start);
- if (err)
+ if (err) {
ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err);
- else
- mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count;
+ return err;
+ }
+
+ mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count;
+ mr->pbl_hop_num = buf_attr.region[0].hopnum;

return err;
}
@@ -990,6 +993,50 @@ static int get_best_page_shift(struct hns_roce_dev *hr_dev,
return 0;
}

+static int get_best_hop_num(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr,
+ struct hns_roce_buf_attr *buf_attr,
+ unsigned int ba_pg_shift)
+{
+#define INVALID_HOPNUM -1
+#define MIN_BA_CNT 1
+ size_t buf_pg_sz = 1 << buf_attr->page_shift;
+ struct ib_device *ibdev = &hr_dev->ib_dev;
+ size_t ba_pg_sz = 1 << ba_pg_shift;
+ int hop_num = INVALID_HOPNUM;
+ size_t unit = MIN_BA_CNT;
+ size_t ba_cnt;
+ int j;
+
+ if (!buf_attr->adaptive || buf_attr->type != MTR_PBL)
+ return 0;
+
+ /* Caculating the number of buf pages, each buf page need a BA */
+ if (mtr->umem)
+ ba_cnt = ib_umem_num_dma_blocks(mtr->umem, buf_pg_sz);
+ else
+ ba_cnt = DIV_ROUND_UP(buf_attr->region[0].size, buf_pg_sz);
+
+ for (j = 0; j <= HNS_ROCE_MAX_HOP_NUM; j++) {
+ if (ba_cnt <= unit) {
+ hop_num = j;
+ break;
+ }
+ /* Number of BAs can be represented at per hop */
+ unit *= ba_pg_sz / BA_BYTE_LEN;
+ }
+
+ if (hop_num < 0) {
+ ibdev_err(ibdev,
+ "failed to calculate a valid hopnum.\n");
+ return -EINVAL;
+ }
+
+ buf_attr->region[0].hopnum = hop_num;
+
+ return 0;
+}
+
static bool is_buf_attr_valid(struct hns_roce_dev *hr_dev,
struct hns_roce_buf_attr *attr)
{
@@ -1163,6 +1210,10 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
ret = get_best_page_shift(hr_dev, mtr, buf_attr);
if (ret)
goto err_init_buf;
+
+ ret = get_best_hop_num(hr_dev, mtr, buf_attr, ba_page_shift);
+ if (ret)
+ goto err_init_buf;
}

ret = mtr_init_buf_cfg(hr_dev, mtr, buf_attr);
--
2.30.0


2023-12-26 08:52:17

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize

On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
> From: Chengchang Tang <[email protected]>
>
> In the current implementation, a fixed page size is used to
> configure the PBL, which is not flexible enough and is not
> conducive to the performance of the HW.
>
> Signed-off-by: Chengchang Tang <[email protected]>
> Signed-off-by: Junxian Huang <[email protected]>
> ---
> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
> 3 files changed, 139 insertions(+), 44 deletions(-)

I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
What is missing there?

Thanks

2023-12-26 09:16:54

by Junxian Huang

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize



On 2023/12/26 16:52, Leon Romanovsky wrote:
> On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
>> From: Chengchang Tang <[email protected]>
>>
>> In the current implementation, a fixed page size is used to
>> configure the PBL, which is not flexible enough and is not
>> conducive to the performance of the HW.
>>
>> Signed-off-by: Chengchang Tang <[email protected]>
>> Signed-off-by: Junxian Huang <[email protected]>
>> ---
>> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
>> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
>> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
>> 3 files changed, 139 insertions(+), 44 deletions(-)
>
> I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
> What is missing there?
>
> Thanks

Actually this API is used for umem.
For kmem, we add hns_roce_find_buf_best_pgsz() to do a similar job.

+static int get_best_page_shift(struct hns_roce_dev *hr_dev,
+ struct hns_roce_mtr *mtr,
+ struct hns_roce_buf_attr *buf_attr)
+{
+ unsigned int page_sz;
+
+ if (!buf_attr->adaptive || buf_attr->type != MTR_PBL)
+ return 0;
+
+ if (mtr->umem)
+ page_sz = ib_umem_find_best_pgsz(mtr->umem,
+ hr_dev->caps.page_size_cap,
+ buf_attr->iova);
+ else
+ page_sz = hns_roce_find_buf_best_pgsz(hr_dev, mtr->kmem);
+
+ if (!page_sz)
+ return -EINVAL;
+
+ buf_attr->page_shift = order_base_2(page_sz);
+ return 0;
+}

Thanks,
Junxian

2023-12-26 09:42:31

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize

On Tue, Dec 26, 2023 at 05:16:33PM +0800, Junxian Huang wrote:
>
>
> On 2023/12/26 16:52, Leon Romanovsky wrote:
> > On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
> >> From: Chengchang Tang <[email protected]>
> >>
> >> In the current implementation, a fixed page size is used to
> >> configure the PBL, which is not flexible enough and is not
> >> conducive to the performance of the HW.
> >>
> >> Signed-off-by: Chengchang Tang <[email protected]>
> >> Signed-off-by: Junxian Huang <[email protected]>
> >> ---
> >> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
> >> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
> >> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
> >> 3 files changed, 139 insertions(+), 44 deletions(-)
> >
> > I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
> > What is missing there?
> >
> > Thanks
>
> Actually this API is used for umem.
> For kmem, we add hns_roce_find_buf_best_pgsz() to do a similar job.

Thanks, let's give a chance to Jason to provide his feedback. I have a
strong feeling that this code duplicates something in the kernel.

Thanks

>
> +static int get_best_page_shift(struct hns_roce_dev *hr_dev,
> + struct hns_roce_mtr *mtr,
> + struct hns_roce_buf_attr *buf_attr)
> +{
> + unsigned int page_sz;
> +
> + if (!buf_attr->adaptive || buf_attr->type != MTR_PBL)
> + return 0;
> +
> + if (mtr->umem)
> + page_sz = ib_umem_find_best_pgsz(mtr->umem,
> + hr_dev->caps.page_size_cap,
> + buf_attr->iova);
> + else
> + page_sz = hns_roce_find_buf_best_pgsz(hr_dev, mtr->kmem);
> +
> + if (!page_sz)
> + return -EINVAL;
> +
> + buf_attr->page_shift = order_base_2(page_sz);
> + return 0;
> +}
>
> Thanks,
> Junxian

2024-01-04 20:29:23

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize

On Tue, Dec 26, 2023 at 05:16:33PM +0800, Junxian Huang wrote:
>
>
> On 2023/12/26 16:52, Leon Romanovsky wrote:
> > On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
> >> From: Chengchang Tang <[email protected]>
> >>
> >> In the current implementation, a fixed page size is used to
> >> configure the PBL, which is not flexible enough and is not
> >> conducive to the performance of the HW.
> >>
> >> Signed-off-by: Chengchang Tang <[email protected]>
> >> Signed-off-by: Junxian Huang <[email protected]>
> >> ---
> >> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
> >> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
> >> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
> >> 3 files changed, 139 insertions(+), 44 deletions(-)
> >
> > I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
> > What is missing there?
> >
> > Thanks
>
> Actually this API is used for umem.
> For kmem, we add hns_roce_find_buf_best_pgsz() to do a similar job.

But why do you need to do something like this for kmem? It looked to
me like kmem knows its allocation size when it was allocated, how come
you need to iterate over all of it again?

Jason

2024-01-05 05:55:27

by Junxian Huang

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize



On 2024/1/5 4:29, Jason Gunthorpe wrote:
> On Tue, Dec 26, 2023 at 05:16:33PM +0800, Junxian Huang wrote:
>>
>>
>> On 2023/12/26 16:52, Leon Romanovsky wrote:
>>> On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
>>>> From: Chengchang Tang <[email protected]>
>>>>
>>>> In the current implementation, a fixed page size is used to
>>>> configure the PBL, which is not flexible enough and is not
>>>> conducive to the performance of the HW.
>>>>
>>>> Signed-off-by: Chengchang Tang <[email protected]>
>>>> Signed-off-by: Junxian Huang <[email protected]>
>>>> ---
>>>> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
>>>> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
>>>> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
>>>> 3 files changed, 139 insertions(+), 44 deletions(-)
>>>
>>> I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
>>> What is missing there?
>>>
>>> Thanks
>>
>> Actually this API is used for umem.
>> For kmem, we add hns_roce_find_buf_best_pgsz() to do a similar job.
>
> But why do you need to do something like this for kmem? It looked to
> me like kmem knows its allocation size when it was allocated, how come
> you need to iterate over all of it again?
>
> Jason
>

kmem was split into multiple small pages for allocation to prevent allocation
failure due to memory fragmentation.

And now we add this function to confirm whether these small pages have contiguous
address. If so, they can be combined into one huge page for use, which is more
likely when iommu/smmu is enabled.

Junxian

2024-01-05 14:19:02

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize

On Fri, Jan 05, 2024 at 01:55:11PM +0800, Junxian Huang wrote:
>
>
> On 2024/1/5 4:29, Jason Gunthorpe wrote:
> > On Tue, Dec 26, 2023 at 05:16:33PM +0800, Junxian Huang wrote:
> >>
> >>
> >> On 2023/12/26 16:52, Leon Romanovsky wrote:
> >>> On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
> >>>> From: Chengchang Tang <[email protected]>
> >>>>
> >>>> In the current implementation, a fixed page size is used to
> >>>> configure the PBL, which is not flexible enough and is not
> >>>> conducive to the performance of the HW.
> >>>>
> >>>> Signed-off-by: Chengchang Tang <[email protected]>
> >>>> Signed-off-by: Junxian Huang <[email protected]>
> >>>> ---
> >>>> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
> >>>> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
> >>>> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
> >>>> 3 files changed, 139 insertions(+), 44 deletions(-)
> >>>
> >>> I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
> >>> What is missing there?
> >>>
> >>> Thanks
> >>
> >> Actually this API is used for umem.
> >> For kmem, we add hns_roce_find_buf_best_pgsz() to do a similar job.
> >
> > But why do you need to do something like this for kmem? It looked to
> > me like kmem knows its allocation size when it was allocated, how come
> > you need to iterate over all of it again?
> >
> > Jason
> >
>
> kmem was split into multiple small pages for allocation to prevent allocation
> failure due to memory fragmentation.
>
> And now we add this function to confirm whether these small pages have contiguous
> address. If so, they can be combined into one huge page for use, which is more
> likely when iommu/smmu is enabled.

That seems unncessary. The chances you get contiguous pages from
a lot of random allocations is really slim.

If you care about this optimization then you should have the allocator
explicitly request contiguous pages with high order allocations in a
manner that quickly fails and falls back to PAGE_SIZE.

Then you just use the size that the allocator was able to get, not try
to figure it out after the fact.

Jason

2024-01-09 01:47:54

by Junxian Huang

[permalink] [raw]
Subject: Re: [PATCH for-next 4/6] RDMA/hns: Support flexible pagesize



On 2024/1/5 22:18, Jason Gunthorpe wrote:
> On Fri, Jan 05, 2024 at 01:55:11PM +0800, Junxian Huang wrote:
>>
>>
>> On 2024/1/5 4:29, Jason Gunthorpe wrote:
>>> On Tue, Dec 26, 2023 at 05:16:33PM +0800, Junxian Huang wrote:
>>>>
>>>>
>>>> On 2023/12/26 16:52, Leon Romanovsky wrote:
>>>>> On Mon, Dec 25, 2023 at 03:53:28PM +0800, Junxian Huang wrote:
>>>>>> From: Chengchang Tang <[email protected]>
>>>>>>
>>>>>> In the current implementation, a fixed page size is used to
>>>>>> configure the PBL, which is not flexible enough and is not
>>>>>> conducive to the performance of the HW.
>>>>>>
>>>>>> Signed-off-by: Chengchang Tang <[email protected]>
>>>>>> Signed-off-by: Junxian Huang <[email protected]>
>>>>>> ---
>>>>>> drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 -
>>>>>> drivers/infiniband/hw/hns/hns_roce_device.h | 9 ++
>>>>>> drivers/infiniband/hw/hns/hns_roce_mr.c | 168 +++++++++++++++-----
>>>>>> 3 files changed, 139 insertions(+), 44 deletions(-)
>>>>>
>>>>> I'm wonder if the ib_umem_find_best_pgsz() API should be used instead.
>>>>> What is missing there?
>>>>>
>>>>> Thanks
>>>>
>>>> Actually this API is used for umem.
>>>> For kmem, we add hns_roce_find_buf_best_pgsz() to do a similar job.
>>>
>>> But why do you need to do something like this for kmem? It looked to
>>> me like kmem knows its allocation size when it was allocated, how come
>>> you need to iterate over all of it again?
>>>
>>> Jason
>>>
>>
>> kmem was split into multiple small pages for allocation to prevent allocation
>> failure due to memory fragmentation.
>>
>> And now we add this function to confirm whether these small pages have contiguous
>> address. If so, they can be combined into one huge page for use, which is more
>> likely when iommu/smmu is enabled.
>
> That seems unncessary. The chances you get contiguous pages from
> a lot of random allocations is really slim.
>
> If you care about this optimization then you should have the allocator
> explicitly request contiguous pages with high order allocations in a
> manner that quickly fails and falls back to PAGE_SIZE.
>
> Then you just use the size that the allocator was able to get, not try
> to figure it out after the fact.
>
> Jason

Thanks for the suggestion! We'll consider a new implementation.
For this patch, I'll drop the kmem modification and re-send a v2.

Junxian