As the CDC message length is 44 bytes, CDC messages can be sent inline
on most RDMA devices, which helps reduce send latency.
In my test environment, two VMs running on the same physical host
with NICs (ConnectX-4 Lx) working in SR-IOV mode, qperf shows a
0.4 us to 0.7 us improvement in latency.
Test command:
server: smc_run taskset -c 1 qperf
client: smc_run taskset -c 1 qperf <server ip> -oo \
msg_size:1:2K:*2 -t 30 -vu tcp_lat
The results are shown below:
msgsize before after
1B 11.9 us 11.2 us (-0.7 us)
2B 11.7 us 11.2 us (-0.5 us)
4B 11.7 us 11.3 us (-0.4 us)
8B 11.6 us 11.2 us (-0.4 us)
16B 11.7 us 11.3 us (-0.4 us)
32B 11.7 us 11.3 us (-0.4 us)
64B 11.7 us 11.2 us (-0.5 us)
128B 11.6 us 11.2 us (-0.4 us)
256B 11.8 us 11.2 us (-0.6 us)
512B 11.8 us 11.4 us (-0.4 us)
1KB 11.9 us 11.4 us (-0.5 us)
2KB 12.1 us 11.5 us (-0.6 us)
Signed-off-by: Guangguan Wang <[email protected]>
---
net/smc/smc_ib.c | 1 +
net/smc/smc_wr.c | 5 ++++-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a3e2d3b89568..1dcce9e4f4ca 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -671,6 +671,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
.max_recv_wr = SMC_WR_BUF_CNT * 3,
.max_send_sge = SMC_IB_MAX_SEND_SGE,
.max_recv_sge = sges_per_buf,
+ .max_inline_data = SMC_WR_TX_SIZE,
},
.sq_sig_type = IB_SIGNAL_REQ_WR,
.qp_type = IB_QPT_RC,
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 24be1d03fef9..8a2f9a561197 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
static void smc_wr_init_sge(struct smc_link *lnk)
{
int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
+ bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
u32 i;
for (i = 0; i < lnk->wr_tx_cnt; i++) {
- lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
@@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
lnk->wr_tx_ibs[i].send_flags =
IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ if (send_inline)
+ lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
--
2.24.3 (Apple Git-128)
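For context on what the patch relies on: IB_SEND_INLINE tells the provider
to copy the payload into the work queue entry at post time, so the sge
address is read as a plain kernel virtual address and the lkey/DMA mapping
is not consulted -- which is why the patch switches wr_tx_sges[i].addr from
the DMA address to the buffer address. A minimal sketch of an inline send
(illustrative only; post_inline_send() is a hypothetical helper, and "qp",
"buf" and "len" are placeholders):

	#include <rdma/ib_verbs.h>

	/* Illustrative only: post one inline send on an RC QP.
	 * "len" must fit within the QP's cap.max_inline_data.
	 */
	static int post_inline_send(struct ib_qp *qp, void *buf, u32 len)
	{
		struct ib_sge sge = {
			.addr   = (u64)(uintptr_t)buf,	/* kernel virtual address, not DMA */
			.length = len,
			.lkey   = 0,			/* ignored for inline sends */
		};
		struct ib_send_wr wr = {
			.opcode     = IB_WR_SEND,
			.send_flags = IB_SEND_SIGNALED | IB_SEND_INLINE,
			.sg_list    = &sge,
			.num_sge    = 1,
		};

		/* the payload is copied into the WQE here, so "buf" may be
		 * reused as soon as ib_post_send() returns
		 */
		return ib_post_send(qp, &wr, NULL);
	}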
Hi Guangguan,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on net-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Guangguan-Wang/net-smc-send-and-write-inline-optimization-for-smc/20220513-151715
base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git b67fd3d9d94223b424674f45eeadeff58b4b03ef
config: nios2-allyesconfig (https://download.01.org/0day-ci/archive/20220513/[email protected]/config)
compiler: nios2-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/bac726bf950dac20959af52c6884b7bb07772dac
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Guangguan-Wang/net-smc-send-and-write-inline-optimization-for-smc/20220513-151715
git checkout bac726bf950dac20959af52c6884b7bb07772dac
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=nios2 SHELL=/bin/bash net/smc/
If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All warnings (new ones prefixed by >>):
net/smc/smc_wr.c: In function 'smc_wr_init_sge':
>> net/smc/smc_wr.c:561:57: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
561 | lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
| ^
vim +561 net/smc/smc_wr.c
553
554 static void smc_wr_init_sge(struct smc_link *lnk)
555 {
556 int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
557 bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
558 u32 i;
559
560 for (i = 0; i < lnk->wr_tx_cnt; i++) {
> 561 lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
562 lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
563 lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
564 lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
565 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
566 lnk->roce_pd->local_dma_lkey;
567 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
568 lnk->roce_pd->local_dma_lkey;
569 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
570 lnk->roce_pd->local_dma_lkey;
571 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
572 lnk->roce_pd->local_dma_lkey;
573 lnk->wr_tx_ibs[i].next = NULL;
574 lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
575 lnk->wr_tx_ibs[i].num_sge = 1;
576 lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
577 lnk->wr_tx_ibs[i].send_flags =
578 IB_SEND_SIGNALED | IB_SEND_SOLICITED;
579 if (send_inline)
580 lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
581 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
582 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
583 lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
584 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
585 lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
586 lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
587 }
588
589 if (lnk->lgr->smc_version == SMC_V2) {
590 lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
591 lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
592 lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
593
594 lnk->wr_tx_v2_ib->next = NULL;
595 lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
596 lnk->wr_tx_v2_ib->num_sge = 1;
597 lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
598 lnk->wr_tx_v2_ib->send_flags =
599 IB_SEND_SIGNALED | IB_SEND_SOLICITED;
600 }
601
602 /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
603 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
604 * and the same buffer for all sges. When a larger message arrived then
605 * the content of the first small sge is copied to the beginning of
606 * the larger spillover buffer, allowing easy data mapping.
607 */
608 for (i = 0; i < lnk->wr_rx_cnt; i++) {
609 int x = i * sges_per_buf;
610
611 lnk->wr_rx_sges[x].addr =
612 lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
613 lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
614 lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
615 if (lnk->lgr->smc_version == SMC_V2) {
616 lnk->wr_rx_sges[x + 1].addr =
617 lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
618 lnk->wr_rx_sges[x + 1].length =
619 SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
620 lnk->wr_rx_sges[x + 1].lkey =
621 lnk->roce_pd->local_dma_lkey;
622 }
623 lnk->wr_rx_ibs[i].next = NULL;
624 lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
625 lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
626 }
627 lnk->wr_reg.wr.next = NULL;
628 lnk->wr_reg.wr.num_sge = 0;
629 lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
630 lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
631 lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
632 }
633
--
0-DAY CI Kernel Test Service
https://01.org/lkp
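The warning comes from casting a pointer directly to u64 on a 32-bit
architecture (nios2 here); the usual kernel idiom is to cast through
uintptr_t first. A hypothetical amendment to the flagged line (not the
posted patch):

	lnk->wr_tx_sges[i].addr = send_inline ?
		(u64)(uintptr_t)&lnk->wr_tx_bufs[i] :
		lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;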
On Fri, May 13, 2022 at 03:15:50PM +0800, Guangguan Wang wrote:
> As the CDC message length is 44 bytes, CDC messages can be sent inline
> on most RDMA devices, which helps reduce send latency.
>
> In my test environment, two VMs running on the same physical host
> with NICs (ConnectX-4 Lx) working in SR-IOV mode, qperf shows a
> 0.4 us to 0.7 us improvement in latency.
>
> Test command:
> server: smc_run taskset -c 1 qperf
> client: smc_run taskset -c 1 qperf <server ip> -oo \
> msg_size:1:2K:*2 -t 30 -vu tcp_lat
>
> The results are shown below:
> msgsize before after
> 1B 11.9 us 11.2 us (-0.7 us)
> 2B 11.7 us 11.2 us (-0.5 us)
> 4B 11.7 us 11.3 us (-0.4 us)
> 8B 11.6 us 11.2 us (-0.4 us)
> 16B 11.7 us 11.3 us (-0.4 us)
> 32B 11.7 us 11.3 us (-0.4 us)
> 64B 11.7 us 11.2 us (-0.5 us)
> 128B 11.6 us 11.2 us (-0.4 us)
> 256B 11.8 us 11.2 us (-0.6 us)
> 512B 11.8 us 11.4 us (-0.4 us)
> 1KB 11.9 us 11.4 us (-0.5 us)
> 2KB 12.1 us 11.5 us (-0.6 us)
>
> Signed-off-by: Guangguan Wang <[email protected]>
> ---
> net/smc/smc_ib.c | 1 +
> net/smc/smc_wr.c | 5 ++++-
> 2 files changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
> index a3e2d3b89568..1dcce9e4f4ca 100644
> --- a/net/smc/smc_ib.c
> +++ b/net/smc/smc_ib.c
> @@ -671,6 +671,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
> .max_recv_wr = SMC_WR_BUF_CNT * 3,
> .max_send_sge = SMC_IB_MAX_SEND_SGE,
> .max_recv_sge = sges_per_buf,
> + .max_inline_data = SMC_WR_TX_SIZE,
> },
> .sq_sig_type = IB_SIGNAL_REQ_WR,
> .qp_type = IB_QPT_RC,
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 24be1d03fef9..8a2f9a561197 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
> static void smc_wr_init_sge(struct smc_link *lnk)
> {
> int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
> + bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
When will it be false? You are creating QPs with max_inline_data == SMC_WR_TX_SIZE?
> u32 i;
>
> for (i = 0; i < lnk->wr_tx_cnt; i++) {
> - lnk->wr_tx_sges[i].addr =
> + lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
> lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
> lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
> lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
> @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
> lnk->wr_tx_ibs[i].send_flags =
> IB_SEND_SIGNALED | IB_SEND_SOLICITED;
> + if (send_inline)
> + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
> If you try to transfer data of length == SMC_WR_TX_SIZE, you will get a -ENOMEM error.
IB drivers check that length < qp->max_inline_data.
Thanks
> lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
> lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
> lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
> --
> 2.24.3 (Apple Git-128)
>
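To illustrate Leon's point, the provider-side validation is roughly of
this shape (a simplified, hypothetical sketch; the exact comparison and
error code vary by driver):

	/* Simplified provider-side check in the post_send inline path,
	 * per the review: inline length must stay below max_inline_data.
	 */
	if (wr->sg_list[0].length >= qp->max_inline_data)
		return -ENOMEM;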
On 2022/5/14 14:02, Leon Romanovsky wrote:
>> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
>> index 24be1d03fef9..8a2f9a561197 100644
>> --- a/net/smc/smc_wr.c
>> +++ b/net/smc/smc_wr.c
>> @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
>> static void smc_wr_init_sge(struct smc_link *lnk)
>> {
>> int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
>> + bool send_inline = (lnk->qp_attr.cap.max_inline_data >= SMC_WR_TX_SIZE);
>
> When will it be false? You are creating QPs with max_inline_data == SMC_WR_TX_SIZE?
>
>> u32 i;
>>
>> for (i = 0; i < lnk->wr_tx_cnt; i++) {
>> - lnk->wr_tx_sges[i].addr =
>> + lnk->wr_tx_sges[i].addr = send_inline ? (u64)(&lnk->wr_tx_bufs[i]) :
>> lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
>> lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
>> lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
>> @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
>> lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
>> lnk->wr_tx_ibs[i].send_flags =
>> IB_SEND_SIGNALED | IB_SEND_SOLICITED;
>> + if (send_inline)
>> + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
>
> If you try to transfer data == SMC_WR_TX_SIZE, you will get -ENOMEM error.
> IB drivers check that length < qp->max_inline_data.
>
> Thanks
>
Got it.
I should create QPs with max_inline_data == 0 and get the actual max_inline_data via query_qp().
Then I should use lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE to decide whether to send inline or not.
Thank you.
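A sketch of what that revised flow could look like (an assumption based on
the plan above, not the actual follow-up patch; smc_wr_can_send_inline() is
a hypothetical helper name):

	/* Decide once, after QP creation, whether CDC messages can go
	 * inline. smc_wr_remember_qp_attr() fills lnk->qp_attr via
	 * ib_query_qp(), so the max_inline_data actually granted by the
	 * provider is available there even when the QP was created with
	 * cap.max_inline_data == 0.
	 */
	static bool smc_wr_can_send_inline(struct smc_link *lnk)
	{
		/* strictly greater-than, since drivers may reject
		 * length == max_inline_data
		 */
		return lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE;
	}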