2022-09-27 06:17:25

by Li Zhijian

[permalink] [raw]
Subject: [for-next PATCH v5 03/11] RDMA: Extend RDMA kernel verbs ABI to support flush

This commit extends the RDMA kernel verbs ABI to support the flush
operation defined in IBA A19.4.1. These changes are
backwards compatible with the existing RDMA kernel verbs ABI.

It makes device/HCA support new FLUSH attributes/capabilities, and it
also makes memory region support new FLUSH access flags.

Users can use ibv_reg_mr(3) to register flush access flags. Only access
flags that are also supported by the device's capabilities can be
registered successfully.

Once registration succeeds, the MR is flushable. Like the device/HCA,
a flushable MR should also have one or both of the GLOBAL_VISIBILITY and
PERSISTENT attributes/capabilities.

Signed-off-by: Li Zhijian <[email protected]>
---
V5: new names and new patch split scheme, suggested by Bob
---
include/rdma/ib_pack.h | 3 +++
include/rdma/ib_verbs.h | 20 +++++++++++++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index a9162f25beaf..56211d1cc9f9 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -84,6 +84,7 @@ enum {
/* opcode 0x15 is reserved */
IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+ IB_OPCODE_FLUSH = 0x1C,

/* real constants follow -- see comment about above IB_OPCODE()
macro for more details */
@@ -112,6 +113,7 @@ enum {
IB_OPCODE(RC, FETCH_ADD),
IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+ IB_OPCODE(RC, FLUSH),

/* UC */
IB_OPCODE(UC, SEND_FIRST),
@@ -149,6 +151,7 @@ enum {
IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
IB_OPCODE(RD, COMPARE_SWAP),
IB_OPCODE(RD, FETCH_ADD),
+ IB_OPCODE(RD, FLUSH),

/* UD */
IB_OPCODE(UD, SEND_ONLY),
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 975d6e9efbcb..571838dd06eb 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -270,6 +270,9 @@ enum ib_device_cap_flags {
/* The device supports padding incoming writes to cacheline. */
IB_DEVICE_PCI_WRITE_END_PADDING =
IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING,
+ /* Placement type attributes */
+ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL,
+ IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT,
};

enum ib_kernel_cap_flags {
@@ -985,6 +988,7 @@ enum ib_wc_opcode {
IB_WC_REG_MR,
IB_WC_MASKED_COMP_SWAP,
IB_WC_MASKED_FETCH_ADD,
+ IB_WC_FLUSH = IB_UVERBS_WC_FLUSH,
/*
* Set value of IB_WC_RECV so consumers can test if a completion is a
* receive by testing (opcode & IB_WC_RECV).
@@ -1325,6 +1329,7 @@ enum ib_wr_opcode {
IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ IB_WR_FLUSH = IB_UVERBS_WR_FLUSH,

/* These are kernel only and can not be issued by userspace */
IB_WR_REG_MR = 0x20,
@@ -1458,10 +1463,14 @@ enum ib_access_flags {
IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND,
IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB,
IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING,
+ IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL,
+ IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT,
+ IB_ACCESS_FLUSHABLE = IB_ACCESS_FLUSH_GLOBAL |
+ IB_ACCESS_FLUSH_PERSISTENT,

IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE,
IB_ACCESS_SUPPORTED =
- ((IB_ACCESS_HUGETLB << 1) - 1) | IB_ACCESS_OPTIONAL,
+ ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL,
};

/*
@@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
static inline int ib_check_mr_access(struct ib_device *ib_dev,
unsigned int flags)
{
+ u64 device_cap = ib_dev->attrs.device_cap_flags;
+
/*
* Local write permission is required if remote write or
* remote atomic permission is also requested.
@@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
if (flags & IB_ACCESS_ON_DEMAND &&
!(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
return -EINVAL;
+
+ if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
+ !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
+ (flags & IB_ACCESS_FLUSH_PERSISTENT &&
+ !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
+ return -EINVAL;
+
return 0;
}

--
2.31.1


2022-09-29 07:02:37

by Li Zhijian

[permalink] [raw]
Subject: Re: [for-next PATCH v5 03/11] RDMA: Extend RDMA kernel verbs ABI to support flush

Leon, Jason


On 27/09/2022 13:53, Li Zhijian wrote:
> /*
> @@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
> static inline int ib_check_mr_access(struct ib_device *ib_dev,
> unsigned int flags)
> {
> + u64 device_cap = ib_dev->attrs.device_cap_flags;
> +
> /*
> * Local write permission is required if remote write or
> * remote atomic permission is also requested.
> @@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
> if (flags & IB_ACCESS_ON_DEMAND &&
> !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
> return -EINVAL;
> +
> + if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
> + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
> + (flags & IB_ACCESS_FLUSH_PERSISTENT &&
> + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
> + return -EINVAL;
> +
Regarding the return value of ib_check_mr_access: while updating the ibv_reg_mr(3) man page in rdma-core,
```
       IBV_ACCESS_REMOTE_READ Enable Remote Read Access
       IBV_ACCESS_REMOTE_ATOMIC Enable Remote Atomic Operation Access (if supported)
       IBV_ACCESS_MW_BIND Enable Memory Window Binding
       IBV_ACCESS_ZERO_BASED  Use  byte offset from beginning of MR to access this MR, instead of a pointer address
       IBV_ACCESS_ON_DEMAND Create an on-demand paging MR (if supported)
...
RETURN VALUE
       ibv_reg_mr() / ibv_reg_mr_iova() / ibv_reg_dmabuf_mr() returns a pointer to the registered MR, or NULL if the request fails.  The local key (L_Key) field lkey is used as the lkey field of struct  ibv_sge  when  posting
       buffers  with  ibv_post_* verbs, and the the remote key (R_Key) field rkey is used by remote processes to perform Atomic and RDMA operations.  The remote process places this rkey as the rkey field of struct ibv_send_wr
       passed to the ibv_post_send function.
```
we can see that IBV_ACCESS_REMOTE_ATOMIC and IBV_ACCESS_ON_DEMAND are tagged "if supported", but currently the kernel
just returns EINVAL when a user registers an MR with IB_ACCESS_ON_DEMAND on RXE.

I wonder whether we should return -EOPNOTSUPP if the device doesn't support the requested capabilities.

Thanks
Li


> return 0;
> }
>

2022-09-30 19:00:11

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [for-next PATCH v5 03/11] RDMA: Extend RDMA kernel verbs ABI to support flush

On Thu, Sep 29, 2022 at 02:21:24PM +0800, Li Zhijian wrote:

> we can see, IBV_ACCESS_REMOTE_ATOMIC and IBV_ACCESS_ON_DEMAND are
> tagged "if supported" . but currently kernel just returns EINVAL
> when user registers a MR with IB_ACCESS_ON_DEMAND to RXE.
>
> I wonder we should return -EOPNOTSUPP if the device doesn't support requested capabilities

Yes, unsupported combinations of access flags should trigger
EOPNOTSUPP

Jason

2022-10-28 18:58:37

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [for-next PATCH v5 03/11] RDMA: Extend RDMA kernel verbs ABI to support flush

On Tue, Sep 27, 2022 at 01:53:29PM +0800, Li Zhijian wrote:
> @@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
> static inline int ib_check_mr_access(struct ib_device *ib_dev,
> unsigned int flags)
> {
> + u64 device_cap = ib_dev->attrs.device_cap_flags;
> +
> /*
> * Local write permission is required if remote write or
> * remote atomic permission is also requested.
> @@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
> if (flags & IB_ACCESS_ON_DEMAND &&
> !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
> return -EINVAL;
> +
> + if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
> + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
> + (flags & IB_ACCESS_FLUSH_PERSISTENT &&
> + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
> + return -EINVAL;

This should be -EOPNOTSUPP, as the check above has been changed to that in for-next

Jason

2022-10-29 03:34:19

by Li Zhijian

[permalink] [raw]
Subject: Re: [for-next PATCH v5 03/11] RDMA: Extend RDMA kernel verbs ABI to support flush



On 29/10/2022 01:44, Jason Gunthorpe wrote:
> On Tue, Sep 27, 2022 at 01:53:29PM +0800, Li Zhijian wrote:
>> @@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
>> static inline int ib_check_mr_access(struct ib_device *ib_dev,
>> unsigned int flags)
>> {
>> + u64 device_cap = ib_dev->attrs.device_cap_flags;
>> +
>> /*
>> * Local write permission is required if remote write or
>> * remote atomic permission is also requested.
>> @@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
>> if (flags & IB_ACCESS_ON_DEMAND &&
>> !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
>> return -EINVAL;
>> +
>> + if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
>> + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
>> + (flags & IB_ACCESS_FLUSH_PERSISTENT &&
>> + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
>> + return -EINVAL;
> This should be -EOPNOTSUPP as the above is changed to in for-next
Yes, my local tree (V6) already has this update. I will repost it later.



>
> Jason