From: Chengchang Tang <[email protected]>
Currently, the driver always allocates 4K pages for the userspace WQE buffer,
which results in HW reading WQEs with a granularity of 4K even on a system
with 64K pages. HW has to switch pages every 4K, leading to a loss of
performance.
In order to improve performance, add support for userspace to allocate a
flexible WQE buffer page size between 4K and the system PAGE_SIZE.
For older userspace drivers that do not support this feature, the kernel
driver will use a fixed 4K page size.
Signed-off-by: Chengchang Tang <[email protected]>
Signed-off-by: Junxian Huang <[email protected]>
---
drivers/infiniband/hw/hns/hns_roce_main.c | 5 ++++
drivers/infiniband/hw/hns/hns_roce_qp.c | 32 ++++++++++++++---------
include/uapi/rdma/hns-abi.h | 5 +++-
3 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 4cb0af733587..19b13c79b67b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -405,6 +405,11 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
resp.congest_type = hr_dev->caps.cong_cap;
+ if (ucmd.config & HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS) {
+ context->config |= HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS;
+ resp.config |= HNS_ROCE_RSP_UCTX_DYN_QP_PGSZ_FLAGS;
+ }
+
ret = hns_roce_uar_alloc(hr_dev, &context->uar);
if (ret)
goto error_out;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index db34665d1dfb..df8aba6a7840 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -643,18 +643,21 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev,
}
static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
- struct hns_roce_qp *hr_qp,
+ struct hns_roce_qp *hr_qp, u8 page_shift,
struct hns_roce_buf_attr *buf_attr)
{
+ unsigned int page_size = BIT(page_shift);
int buf_size;
int idx = 0;
hr_qp->buff_size = 0;
+ if (page_shift > PAGE_SHIFT || page_shift < HNS_HW_PAGE_SHIFT)
+ return -EOPNOTSUPP;
+
/* SQ WQE */
hr_qp->sq.offset = 0;
- buf_size = to_hr_hem_entries_size(hr_qp->sq.wqe_cnt,
- hr_qp->sq.wqe_shift);
+ buf_size = ALIGN(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift, page_size);
if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
buf_attr->region[idx].size = buf_size;
buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sq_hop_num;
@@ -664,8 +667,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
/* extend SGE WQE in SQ */
hr_qp->sge.offset = hr_qp->buff_size;
- buf_size = to_hr_hem_entries_size(hr_qp->sge.sge_cnt,
- hr_qp->sge.sge_shift);
+ buf_size = ALIGN(hr_qp->sge.sge_cnt << hr_qp->sge.sge_shift, page_size);
if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
buf_attr->region[idx].size = buf_size;
buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sge_hop_num;
@@ -675,8 +677,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
/* RQ WQE */
hr_qp->rq.offset = hr_qp->buff_size;
- buf_size = to_hr_hem_entries_size(hr_qp->rq.wqe_cnt,
- hr_qp->rq.wqe_shift);
+ buf_size = ALIGN(hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift, page_size);
if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
buf_attr->region[idx].size = buf_size;
buf_attr->region[idx].hopnum = hr_dev->caps.wqe_rq_hop_num;
@@ -687,8 +688,8 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
if (hr_qp->buff_size < 1)
return -EINVAL;
- buf_attr->page_shift = HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
buf_attr->region_count = idx;
+ buf_attr->page_shift = page_shift;
return 0;
}
@@ -744,20 +745,27 @@ static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr)
static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
struct ib_qp_init_attr *init_attr,
- struct ib_udata *udata, unsigned long addr)
+ struct ib_udata *udata,
+ struct hns_roce_ib_create_qp *ucmd)
{
+ struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata,
+ struct hns_roce_ucontext, ibucontext);
struct ib_device *ibdev = &hr_dev->ib_dev;
struct hns_roce_buf_attr buf_attr = {};
+ u8 page_shift = HNS_HW_PAGE_SHIFT;
int ret;
- ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr);
+ if (uctx && (uctx->config & HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS))
+ page_shift = ucmd->pageshift;
+
+ ret = set_wqe_buf_attr(hr_dev, hr_qp, page_shift, &buf_attr);
if (ret) {
ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret);
goto err_inline;
}
ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr,
PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz,
- udata, addr);
+ udata, ucmd->buf_addr);
if (ret) {
ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret);
goto err_inline;
@@ -1152,7 +1160,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
}
}
- ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
+ ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, &ucmd);
if (ret) {
ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret);
goto err_buf;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index 94e861870e27..c5211b8dbf91 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
__u8 log_sq_bb_count;
__u8 log_sq_stride;
__u8 sq_no_prefetch;
- __u8 reserved[5];
+ __u8 pageshift;
+ __u8 reserved[4];
__aligned_u64 sdb_addr;
__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
__aligned_u64 create_flags;
@@ -119,12 +120,14 @@ enum {
HNS_ROCE_EXSGE_FLAGS = 1 << 0,
HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+ HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS = 1 << 3,
};
enum {
HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0,
HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+ HNS_ROCE_RSP_UCTX_DYN_QP_PGSZ_FLAGS = 1 << 3,
};
struct hns_roce_ib_alloc_ucontext_resp {
--
2.30.0
On Tue, Apr 30, 2024 at 05:28:45PM +0800, Junxian Huang wrote:
> From: Chengchang Tang <[email protected]>
>
> Currently, driver fixedly allocates 4K pages for userspace WQE buffer
> and results in HW reading WQE with a granularity of 4K even in a 64K
> system. HW has to switch pages every 4K, leading to a loss of performance.
> In order to improve performance, add support for userspace to allocate
> flexible WQE buffer page size between 4K to system PAGESIZE.
> @@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
> __u8 log_sq_bb_count;
> __u8 log_sq_stride;
> __u8 sq_no_prefetch;
> - __u8 reserved[5];
> + __u8 pageshift;
> + __u8 reserved[4];
It doesn't make any sense to pass in a pageshift from userspace.
Kernel should detect whatever underlying physical contiguity userspace
has been able to create and configure the hardware optimally. The umem
already has all the tools to do this trivially.
Why would you need to specify anything?
Jason
On 2024/4/30 21:41, Jason Gunthorpe wrote:
> On Tue, Apr 30, 2024 at 05:28:45PM +0800, Junxian Huang wrote:
>> From: Chengchang Tang <[email protected]>
>>
>> Currently, driver fixedly allocates 4K pages for userspace WQE buffer
>> and results in HW reading WQE with a granularity of 4K even in a 64K
>> system. HW has to switch pages every 4K, leading to a loss of performance.
>
>> In order to improve performance, add support for userspace to allocate
>> flexible WQE buffer page size between 4K to system PAGESIZE.
>> @@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
>> __u8 log_sq_bb_count;
>> __u8 log_sq_stride;
>> __u8 sq_no_prefetch;
>> - __u8 reserved[5];
>> + __u8 pageshift;
>> + __u8 reserved[4];
>
> It doesn't make any sense to pass in a pageshift from userspace.
>
> Kernel should detect whatever underlying physical contiguity userspace
> has been able to create and configure the hardware optimally. The umem
> already has all the tools to do this trivially.
>
> Why would you need to specify anything?
>
> Jason
>
For hns roce, a QP requires three WQE buffers, namely the SQ WQE buffer, the
RQ WQE buffer and the EXT_SGE buffer. Due to HW constraints, they need to be
configured with the same page size. The memory of these three buffers is
currently allocated by the user-mode driver. The user-mode driver calculates
the size of each region and aligns it to the page size. Finally, the driver
merges the memory of these three regions, allocates one buffer with
contiguous virtual addresses, and sends the address to the kernel-mode driver
(during this process, the user-mode driver and the kernel-mode driver only
exchange addresses, but not the sizes of these three regions or other
information).
Since the three regions share one umem, umem's tools, such as
ib_umem_find_best_pgsz(), will calculate the best page size of the entire
umem, not of each region. For this reason, and because currently only the
address is passed when the kernel-mode driver interacts with the user-mode
driver, and no other information is passed, it is difficult for the
kernel-mode driver to calculate the page size used by the user-mode driver.
In this case, it is relatively simpler to let user mode directly tell kernel
mode which page shift it uses, and this is also easier in terms of forward
and backward compatibility.
Chengchang
On Mon, May 06, 2024 at 02:47:01PM +0800, Chengchang Tang wrote:
>
>
> On 2024/4/30 21:41, Jason Gunthorpe wrote:
> > On Tue, Apr 30, 2024 at 05:28:45PM +0800, Junxian Huang wrote:
> >> From: Chengchang Tang <[email protected]>
> >>
> >> Currently, driver fixedly allocates 4K pages for userspace WQE buffer
> >> and results in HW reading WQE with a granularity of 4K even in a 64K
> >> system. HW has to switch pages every 4K, leading to a loss of performance.
> >
> >> In order to improve performance, add support for userspace to allocate
> >> flexible WQE buffer page size between 4K to system PAGESIZE.
> >> @@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
> >> __u8 log_sq_bb_count;
> >> __u8 log_sq_stride;
> >> __u8 sq_no_prefetch;
> >> - __u8 reserved[5];
> >> + __u8 pageshift;
> >> + __u8 reserved[4];
> >
> > It doesn't make any sense to pass in a pageshift from userspace.
> >
> > Kernel should detect whatever underlying physical contiguity userspace
> > has been able to create and configure the hardware optimally. The umem
> > already has all the tools to do this trivially.
> >
> > Why would you need to specify anything?
> >
>
> For hns roce, QPs requires three wqe buffers, namely SQ wqe buffer, RQ wqe
> buffer and EXT_SGE buffer. Due to HW constraints, they need to be configured
> with the same page size. The memory of these three buffers is allocated by
> the user-mode driver now. The user-mode driver will calculate the size of
> each region and align them to the page size. Finally, the driver will merge
> the memories of these three regions together, apply for a memory with
> continuous virtual addresses, and send the address to the kernel-mode driver
> (during this process, the user-mode driver and the kernel-mode driver only
> exchange addresses, but not the the sizes of these three areas or other
> information).
So you get a umem and the driver is slicing it up. What is the
problem? The kernel has the umem and the kernel knows the uniform page
size of that umem.
> Since the three regions share one umem, through umem's tools, such as
> ib_umem_find_best_pgsz(), they will eventually calculate the best page size
> of the entire umem, not each region.
That is what you want, you said? Each region has to have the same page
size. So the global page size of the umem is the correct one?
> For this reason, coupled with the fact
> that currently only the address is passed when the kernel mode driver interacts
> with the user mode driver, and no other information is passed, it makes it more
> difficult to calculate the page size used by the user mode driver from the
> kernel mode driver.
Even if it is difficult, this has to be done like this. You can't pass
a page size in from userspace, there is no good way for userspace to
do this correctly in all cases.
It sounds like you have it right, just get the page size from the
shared umem.
Jason
On 2024/4/30 21:41, Jason Gunthorpe wrote:
> On Tue, Apr 30, 2024 at 05:28:45PM +0800, Junxian Huang wrote:
>> From: Chengchang Tang <[email protected]>
>>
>> Currently, driver fixedly allocates 4K pages for userspace WQE buffer
>> and results in HW reading WQE with a granularity of 4K even in a 64K
>> system. HW has to switch pages every 4K, leading to a loss of performance.
>
>> In order to improve performance, add support for userspace to allocate
>> flexible WQE buffer page size between 4K to system PAGESIZE.
>> @@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
>> __u8 log_sq_bb_count;
>> __u8 log_sq_stride;
>> __u8 sq_no_prefetch;
>> - __u8 reserved[5];
>> + __u8 pageshift;
>> + __u8 reserved[4];
>
> It doesn't make any sense to pass in a pageshift from userspace.
>
> Kernel should detect whatever underlying physical contiguity userspace
> has been able to create and configure the hardware optimally. The umem
> already has all the tools to do this trivially.
>
> Why would you need to specify anything?
>
> Jason
Hi Jason. Sorry for the late response.
The WQE buffer of hns HW actually consists of 3 regions: SQ WQE, RQ WQE and
ext SGE. Userspace and the kernel driver both compute the buffer size and
start offset of these 3 regions based on the page shift. The kernel needs to
obtain the page shift from userspace to ensure that the buffer size and start
offset are the same in kernel and userspace, and to avoid invalid memory
access.
By the "tools of umem" I assume you are referring to ib_umem_find_best_pgsz().
This API cannot guarantee that it returns the same page size as userspace
uses, and in that case the kernel cannot determine the start offset of the 3
regions in userspace.
Junxian
On 2024/5/6 23:11, Jason Gunthorpe wrote:
> On Mon, May 06, 2024 at 02:47:01PM +0800, Chengchang Tang wrote:
>>
>>
>> On 2024/4/30 21:41, Jason Gunthorpe wrote:
>>> On Tue, Apr 30, 2024 at 05:28:45PM +0800, Junxian Huang wrote:
>>>> From: Chengchang Tang <[email protected]>
>>>>
>>>> Currently, driver fixedly allocates 4K pages for userspace WQE buffer
>>>> and results in HW reading WQE with a granularity of 4K even in a 64K
>>>> system. HW has to switch pages every 4K, leading to a loss of performance.
>>>
>>>> In order to improve performance, add support for userspace to allocate
>>>> flexible WQE buffer page size between 4K to system PAGESIZE.
>>>> @@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
>>>> __u8 log_sq_bb_count;
>>>> __u8 log_sq_stride;
>>>> __u8 sq_no_prefetch;
>>>> - __u8 reserved[5];
>>>> + __u8 pageshift;
>>>> + __u8 reserved[4];
>>>
>>> It doesn't make any sense to pass in a pageshift from userspace.
>>>
>>> Kernel should detect whatever underlying physical contiguity userspace
>>> has been able to create and configure the hardware optimally. The umem
>>> already has all the tools to do this trivially.
>>>
>>> Why would you need to specify anything?
>>>
>>
>> For hns roce, QPs requires three wqe buffers, namely SQ wqe buffer, RQ wqe
>> buffer and EXT_SGE buffer. Due to HW constraints, they need to be configured
>> with the same page size. The memory of these three buffers is allocated by
>> the user-mode driver now. The user-mode driver will calculate the size of
>> each region and align them to the page size. Finally, the driver will merge
>> the memories of these three regions together, apply for a memory with
>> continuous virtual addresses, and send the address to the kernel-mode driver
>> (during this process, the user-mode driver and the kernel-mode driver only
>> exchange addresses, but not the the sizes of these three areas or other
>> information).
>
> So you get a umem and the driver is slicing it up. What is the
> problem? The kernel has the umem and the kernel knows the uniform page
> size of that umem.
Currently, because the user-mode driver and the kernel-mode driver only
exchange addresses, from the perspective of the kernel-mode driver, if the
page size is not negotiated, it cannot even calculate the size of each region,
and thus cannot complete ib_umem_get().
Of course, we can add some information to be passed to the kernel mode driver,
such as size and offset of each region, but is there any essential difference
between this and directly passing page shift?
>
>
>> Since the three regions share one umem, through umem's tools, such as
>> ib_umem_find_best_pgsz(), they will eventually calculate the best page size
>> of the entire umem, not each region.
>
> That is what you want, you said? Each region has to have the same page
> size. So the global page size of the umem is the correct one?
No, the global page size may be bigger than the page size of each region.
If we use the global page size, the hardware may have out-of-bounds access.
>
>> For this reason, coupled with the fact
>> that currently only the address is passed when the kernel mode driver interacts
>> with the user mode driver, and no other information is passed, it makes it more
>> difficult to calculate the page size used by the user mode driver from the
>> kernel mode driver.
>
> Even if it is difficult, this has to be done like this. You can't pass
> a page size in from userspace, there is no good way for userspace to
> do this correctly in all cases.
Userspace may indeed get this wrong, but in the current scenario the page size
can only be set within the allowed range [4K, 64K], and an error only affects
the current QP. Is this acceptable?
Chengchang
On Tue, May 07, 2024 at 10:21:09PM +0800, Chengchang Tang wrote:
>
>
> On 2024/5/6 23:11, Jason Gunthorpe wrote:
> > On Mon, May 06, 2024 at 02:47:01PM +0800, Chengchang Tang wrote:
> >>
> >>
> >> On 2024/4/30 21:41, Jason Gunthorpe wrote:
> >>> On Tue, Apr 30, 2024 at 05:28:45PM +0800, Junxian Huang wrote:
> >>>> From: Chengchang Tang <[email protected]>
> >>>>
> >>>> Currently, driver fixedly allocates 4K pages for userspace WQE buffer
> >>>> and results in HW reading WQE with a granularity of 4K even in a 64K
> >>>> system. HW has to switch pages every 4K, leading to a loss of performance.
> >>>
> >>>> In order to improve performance, add support for userspace to allocate
> >>>> flexible WQE buffer page size between 4K to system PAGESIZE.
> >>>> @@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
> >>>> __u8 log_sq_bb_count;
> >>>> __u8 log_sq_stride;
> >>>> __u8 sq_no_prefetch;
> >>>> - __u8 reserved[5];
> >>>> + __u8 pageshift;
> >>>> + __u8 reserved[4];
> >>>
> >>> It doesn't make any sense to pass in a pageshift from userspace.
> >>>
> >>> Kernel should detect whatever underlying physical contiguity userspace
> >>> has been able to create and configure the hardware optimally. The umem
> >>> already has all the tools to do this trivially.
> >>>
> >>> Why would you need to specify anything?
> >>>
> >>
> >> For hns roce, QPs requires three wqe buffers, namely SQ wqe buffer, RQ wqe
> >> buffer and EXT_SGE buffer. Due to HW constraints, they need to be configured
> >> with the same page size. The memory of these three buffers is allocated by
> >> the user-mode driver now. The user-mode driver will calculate the size of
> >> each region and align them to the page size. Finally, the driver will merge
> >> the memories of these three regions together, apply for a memory with
> >> continuous virtual addresses, and send the address to the kernel-mode driver
> >> (during this process, the user-mode driver and the kernel-mode driver only
> >> exchange addresses, but not the the sizes of these three areas or other
> >> information).
> >
> > So you get a umem and the driver is slicing it up. What is the
> > problem? The kernel has the umem and the kernel knows the uniform page
> > size of that umem.
>
> Currently, because the user-mode driver and the kernel-mode driver only
> exchange addresses, from the perspective of the kernel-mode driver, if the
> page size is not negotiated, it cannot even calculate the size of each region,
> and thus cannot complete ib_umem_get().
That seems like what you should correct instead of sending in a page
shift.
> Of course, we can add some information to be passed to the kernel mode driver,
> such as size and offset of each region, but is there any essential difference
> between this and directly passing page shift?
Yes, userspace can reliably compute the start, length and any
offsets. It cannot reliably compute any sort of page size for DMA.
> >> Since the three regions share one umem, through umem's tools, such as
> >> ib_umem_find_best_pgsz(), they will eventually calculate the best page size
> >> of the entire umem, not each region.
> >
> > That is what you want, you said? Each region has to have the same page
> > size. So the global page size of the umem is the correct one?
>
> No, the global page size may be bigger than the page size of each region.
> If we use the global page size, the hardware may have out-of-bounds
> access.
Because the page size may be the entire umem?
But this is simple to deal with, the API even has it built in. Just be
sure to limit the page size bit map to only include sizes you can
handle based on the smallest sub region and possibly region
alignment. It should be a couple of bit ops.
> > Even if it is difficult, this has to be done like this. You can't pass
> > a page size in from userspace, there is no good way for userspace to
> > do this correctly in all cases.
>
> Userspace may indeed go wrong, but in the current scenario, the page size is
> only set within the allowed range [4k, 64k], and its errors only affects the
> current QP. Is this acceptable?
It is bad uAPI design to allow for such a failure mode.
Jason