Date: 2021-10-07 12:51:35
From: Gal Pressman

Subject: [RFC PATCH 0/2] EFA dmabuf memory regions

Hey all,

This is a follow-up to my previous RFC [1], which now makes use of the
dynamic attachment API implemented in the RDMA subsystem, but calls
dma_buf_pin() in order to make sure that the move_notify callback will
not be used, as suggested by Christian.
As explained in the previous RFC, move_notify requires the RDMA device
to support on-demand paging (ODP), which most devices do not support
(currently only mlx5 does).
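
For reference, the importer-side sequence this relies on looks roughly
like the following. This is only a minimal sketch against the existing
dma-buf API, not the EFA code itself, and the pin_and_map() helper name
is made up:

#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/err.h>

static struct sg_table *pin_and_map(struct dma_buf_attachment *attach)
{
	struct sg_table *sgt;
	int err;

	/* dma_buf_pin() and the mapping must be done under the
	 * buffer's reservation lock.
	 */
	dma_resv_lock(attach->dmabuf->resv, NULL);
	err = dma_buf_pin(attach);
	if (err) {
		sgt = ERR_PTR(err);
		goto out_unlock;
	}

	/* With the buffer pinned the exporter may not move it, so
	 * move_notify is never expected to fire for this attachment.
	 */
	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
	if (IS_ERR(sgt))
		dma_buf_unpin(attach);

out_unlock:
	dma_resv_unlock(attach->dmabuf->resv);
	return sgt;
}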

While the dynamic requirement makes sense for certain GPUs, some devices
(such as habanalabs) have device memory that is always "pinned" and do
not need/use the move_notify operation.

The first patch changes the dmabuf documentation to make it clear that
pinning does not necessarily mean the memory must be moved to system
memory; it is up to the exporter to decide.

The motivation for this RFC is to use habanalabs as the dmabuf exporter
and EFA as the importer, allowing peer2peer access through libibverbs.
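
From userspace, such a region would be registered through the dmabuf MR
API in recent rdma-core (ibv_reg_dmabuf_mr()). A rough sketch, where
dmabuf_fd is a hypothetical fd obtained from the exporting device:

#include <infiniband/verbs.h>

/* Hypothetical helper: 'dmabuf_fd' comes from the exporter (e.g.
 * habanalabs); offset and iova are both 0 for simplicity.
 */
static struct ibv_mr *reg_peer_mem(struct ibv_pd *pd, int dmabuf_fd,
				   size_t length)
{
	return ibv_reg_dmabuf_mr(pd, 0, length, 0, dmabuf_fd,
				 IBV_ACCESS_LOCAL_WRITE |
				 IBV_ACCESS_REMOTE_READ);
}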

[1] https://lore.kernel.org/linux-rdma/[email protected]/

Thanks

Gal Pressman (2):
dma-buf: Fix pin callback comment
RDMA/efa: Add support for dmabuf memory regions

drivers/infiniband/hw/efa/efa.h | 4 +
drivers/infiniband/hw/efa/efa_main.c | 1 +
drivers/infiniband/hw/efa/efa_verbs.c | 166 +++++++++++++++++++++-----
include/linux/dma-buf.h | 4 +-
4 files changed, 142 insertions(+), 33 deletions(-)

--
2.33.0


Date: 2021-10-07 12:51:43
From: Gal Pressman

Subject: [RFC PATCH 2/2] RDMA/efa: Add support for dmabuf memory regions

Implement a dmabuf importer for the EFA driver. As ODP is not supported,
the dmabuf memory regions always pin the buffers to prevent the
move_notify callback from being called.

Signed-off-by: Gal Pressman <[email protected]>
---
drivers/infiniband/hw/efa/efa.h | 4 +
drivers/infiniband/hw/efa/efa_main.c | 1 +
drivers/infiniband/hw/efa/efa_verbs.c | 166 +++++++++++++++++++++-----
3 files changed, 141 insertions(+), 30 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 2b8ca099b381..407d7c4baa16 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -141,6 +141,10 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct ib_udata *udata);
int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
struct ib_port_immutable *immutable);
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 203e6ddcacbc..72cd7d952a07 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -267,6 +267,7 @@ static const struct ib_device_ops efa_dev_ops = {
.query_port = efa_query_port,
.query_qp = efa_query_qp,
.reg_user_mr = efa_reg_mr,
+ .reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf,

INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq),
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index be6d3ff0f1be..ca907853a84f 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -3,6 +3,8 @@
* Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
*/

+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
#include <linux/vmalloc.h>
#include <linux/log2.h>

@@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
return 0;
}

-struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
- u64 virt_addr, int access_flags,
- struct ib_udata *udata)
+static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+ WARN_ONCE(1,
+ "Invalidate callback should not be called when memory is pinned\n");
+}
+
+static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
+ .allow_peer2peer = true,
+ .move_notify = efa_dmabuf_invalidate_cb,
+};
+
+static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
+ struct ib_udata *udata)
{
struct efa_dev *dev = to_edev(ibpd->device);
- struct efa_com_reg_mr_params params = {};
- struct efa_com_reg_mr_result result = {};
- struct pbl_context pbl;
int supp_access_flags;
- unsigned int pg_sz;
struct efa_mr *mr;
- int inline_size;
- int err;

if (udata && udata->inlen &&
!ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
ibdev_dbg(&dev->ibdev,
"Incompatible ABI params, udata not cleared\n");
- err = -EINVAL;
- goto err_out;
+ return ERR_PTR(-EINVAL);
}

supp_access_flags =
@@ -1522,23 +1527,26 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
ibdev_dbg(&dev->ibdev,
"Unsupported access flags[%#x], supported[%#x]\n",
access_flags, supp_access_flags);
- err = -EOPNOTSUPP;
- goto err_out;
+ return ERR_PTR(-EOPNOTSUPP);
}

mr = kzalloc(sizeof(*mr), GFP_KERNEL);
- if (!mr) {
- err = -ENOMEM;
- goto err_out;
- }
+ if (!mr)
+ return ERR_PTR(-ENOMEM);

- mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
- if (IS_ERR(mr->umem)) {
- err = PTR_ERR(mr->umem);
- ibdev_dbg(&dev->ibdev,
- "Failed to pin and map user space memory[%d]\n", err);
- goto err_free;
- }
+ return mr;
+}
+
+static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
+ u64 length, u64 virt_addr, int access_flags)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ struct efa_com_reg_mr_params params = {};
+ struct efa_com_reg_mr_result result = {};
+ struct pbl_context pbl;
+ unsigned int pg_sz;
+ int inline_size;
+ int err;

params.pd = to_epd(ibpd)->pdn;
params.iova = virt_addr;
@@ -1549,10 +1557,9 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
dev->dev_attr.page_size_cap,
virt_addr);
if (!pg_sz) {
- err = -EOPNOTSUPP;
ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
dev->dev_attr.page_size_cap);
- goto err_unmap;
+ return -EOPNOTSUPP;
}

params.page_shift = order_base_2(pg_sz);
@@ -1566,21 +1573,21 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
if (params.page_num <= inline_size) {
err = efa_create_inline_pbl(dev, mr, &params);
if (err)
- goto err_unmap;
+ return err;

err = efa_com_register_mr(&dev->edev, &params, &result);
if (err)
- goto err_unmap;
+ return err;
} else {
err = efa_create_pbl(dev, &pbl, mr, &params);
if (err)
- goto err_unmap;
+ return err;

err = efa_com_register_mr(&dev->edev, &params, &result);
pbl_destroy(dev, &pbl);

if (err)
- goto err_unmap;
+ return err;
}

mr->ibmr.lkey = result.l_key;
@@ -1588,9 +1595,98 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
mr->ibmr.length = length;
ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);

+ return 0;
+}
+
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+ u64 length, u64 virt_addr,
+ int fd, int access_flags,
+ struct ib_udata *udata)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ struct ib_umem_dmabuf *umem_dmabuf;
+ struct efa_mr *mr;
+ int err;
+
+ mr = efa_alloc_mr(ibpd, access_flags, udata);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+ goto err_out;
+ }
+
+ umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
+ access_flags, &efa_dmabuf_attach_ops);
+ if (IS_ERR(umem_dmabuf)) {
+ err = PTR_ERR(umem_dmabuf);
+ ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
+ goto err_free;
+ }
+
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ err = dma_buf_pin(umem_dmabuf->attach);
+ if (err) {
+ ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
+ goto err_release;
+ }
+
+ err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+ if (err) {
+ ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
+ goto err_unpin;
+ }
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+ mr->umem = &umem_dmabuf->umem;
+ err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+ if (err)
+ goto err_unmap;
+
return &mr->ibmr;

err_unmap:
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+err_unpin:
+ dma_buf_unpin(umem_dmabuf->attach);
+err_release:
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+ ib_umem_release(&umem_dmabuf->umem);
+err_free:
+ kfree(mr);
+err_out:
+ atomic64_inc(&dev->stats.reg_mr_err);
+ return ERR_PTR(err);
+}
+
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct efa_dev *dev = to_edev(ibpd->device);
+ struct efa_mr *mr;
+ int err;
+
+ mr = efa_alloc_mr(ibpd, access_flags, udata);
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+ goto err_out;
+ }
+
+ mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ ibdev_dbg(&dev->ibdev,
+ "Failed to pin and map user space memory[%d]\n", err);
+ goto err_free;
+ }
+
+ err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+ if (err)
+ goto err_release;
+
+ return &mr->ibmr;
+
+err_release:
ib_umem_release(mr->umem);
err_free:
kfree(mr);
@@ -1603,6 +1699,7 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
struct efa_dev *dev = to_edev(ibmr->device);
struct efa_com_dereg_mr_params params;
+ struct ib_umem_dmabuf *umem_dmabuf;
struct efa_mr *mr = to_emr(ibmr);
int err;

@@ -1613,6 +1710,15 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
if (err)
return err;

+ if (mr->umem->is_dmabuf) {
+ umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+ dma_buf_unpin(umem_dmabuf->attach);
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+ }
+
ib_umem_release(mr->umem);
kfree(mr);

--
2.33.0