This commit extends the RDMA kernel verbs ABI to support the flush
operation defined in IBA A19.4.1. These changes are
backwards compatible with the existing RDMA kernel verbs ABI.
It makes device/HCA support new FLUSH attributes/capabilities, and it
also makes memory region support new FLUSH access flags.
Users can use ibv_reg_mr(3) to register flush access flags. Only the
access flags also supported by device's capabilities can be registered
successfully.
Once registered successfully, it means the MR is flushable. Similarly,
A flushable MR should also have one or both of GLOBAL_VISIBILITY and
PERSISTENT attributes/capabilities like device/HCA.
Signed-off-by: Li Zhijian <[email protected]>
---
V5: new names and new patch split scheme, suggested by Bob
---
include/rdma/ib_pack.h | 3 +++
include/rdma/ib_verbs.h | 20 +++++++++++++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index a9162f25beaf..56211d1cc9f9 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -84,6 +84,7 @@ enum {
/* opcode 0x15 is reserved */
IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+ IB_OPCODE_FLUSH = 0x1C,
/* real constants follow -- see comment about above IB_OPCODE()
macro for more details */
@@ -112,6 +113,7 @@ enum {
IB_OPCODE(RC, FETCH_ADD),
IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+ IB_OPCODE(RC, FLUSH),
/* UC */
IB_OPCODE(UC, SEND_FIRST),
@@ -149,6 +151,7 @@ enum {
IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
IB_OPCODE(RD, COMPARE_SWAP),
IB_OPCODE(RD, FETCH_ADD),
+ IB_OPCODE(RD, FLUSH),
/* UD */
IB_OPCODE(UD, SEND_ONLY),
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 975d6e9efbcb..571838dd06eb 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -270,6 +270,9 @@ enum ib_device_cap_flags {
/* The device supports padding incoming writes to cacheline. */
IB_DEVICE_PCI_WRITE_END_PADDING =
IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING,
+ /* Placement type attributes */
+ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL,
+ IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT,
};
enum ib_kernel_cap_flags {
@@ -985,6 +988,7 @@ enum ib_wc_opcode {
IB_WC_REG_MR,
IB_WC_MASKED_COMP_SWAP,
IB_WC_MASKED_FETCH_ADD,
+ IB_WC_FLUSH = IB_UVERBS_WC_FLUSH,
/*
* Set value of IB_WC_RECV so consumers can test if a completion is a
* receive by testing (opcode & IB_WC_RECV).
@@ -1325,6 +1329,7 @@ enum ib_wr_opcode {
IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ IB_WR_FLUSH = IB_UVERBS_WR_FLUSH,
/* These are kernel only and can not be issued by userspace */
IB_WR_REG_MR = 0x20,
@@ -1458,10 +1463,14 @@ enum ib_access_flags {
IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND,
IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB,
IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING,
+ IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL,
+ IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT,
+ IB_ACCESS_FLUSHABLE = IB_ACCESS_FLUSH_GLOBAL |
+ IB_ACCESS_FLUSH_PERSISTENT,
IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE,
IB_ACCESS_SUPPORTED =
- ((IB_ACCESS_HUGETLB << 1) - 1) | IB_ACCESS_OPTIONAL,
+ ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL,
};
/*
@@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
static inline int ib_check_mr_access(struct ib_device *ib_dev,
unsigned int flags)
{
+ u64 device_cap = ib_dev->attrs.device_cap_flags;
+
/*
* Local write permission is required if remote write or
* remote atomic permission is also requested.
@@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
if (flags & IB_ACCESS_ON_DEMAND &&
!(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
return -EINVAL;
+
+ if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
+ !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
+ (flags & IB_ACCESS_FLUSH_PERSISTENT &&
+ !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
+ return -EINVAL;
+
return 0;
}
--
2.31.1
Leon, Jason
On 27/09/2022 13:53, Li Zhijian wrote:
> /*
> @@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
> static inline int ib_check_mr_access(struct ib_device *ib_dev,
> unsigned int flags)
> {
> + u64 device_cap = ib_dev->attrs.device_cap_flags;
> +
> /*
> * Local write permission is required if remote write or
> * remote atomic permission is also requested.
> @@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
> if (flags & IB_ACCESS_ON_DEMAND &&
> !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
> return -EINVAL;
> +
> + if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
> + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
> + (flags & IB_ACCESS_FLUSH_PERSISTENT &&
> + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
> + return -EINVAL;
> +
Regarding of the return value of ib_check_mr_access. While updating the man page of ibv_reg_mr(3) of rdma-core,
```
IBV_ACCESS_REMOTE_READ Enable Remote Read Access
IBV_ACCESS_REMOTE_ATOMIC Enable Remote Atomic Operation Access (if supported)
IBV_ACCESS_MW_BIND Enable Memory Window Binding
IBV_ACCESS_ZERO_BASED Use byte offset from beginning of MR to access this MR, instead of a pointer address
IBV_ACCESS_ON_DEMAND Create an on-demand paging MR (if supported)
...
RETURN VALUE
ibv_reg_mr() / ibv_reg_mr_iova() / ibv_reg_dmabuf_mr() returns a pointer to the registered MR, or NULL if the request fails. The local key (L_Key) field lkey is used as the lkey field of struct ibv_sge when posting
buffers with ibv_post_* verbs, and the the remote key (R_Key) field rkey is used by remote processes to perform Atomic and RDMA operations. The remote process places this rkey as the rkey field of struct ibv_send_wr
passed to the ibv_post_send function.
```
we can see, IBV_ACCESS_REMOTE_ATOMIC and IBV_ACCESS_ON_DEMAND are tagged "if supported" . but currently kernel
just returns EINVAL when user registers a MR with IB_ACCESS_ON_DEMAND to RXE.
I wonder we should return -EOPNOTSUPP if the device doesn't support requested capabilities
Thanks
Li
> return 0;
> }
>
On Thu, Sep 29, 2022 at 02:21:24PM +0800, Li Zhijian wrote:
> we can see, IBV_ACCESS_REMOTE_ATOMIC and IBV_ACCESS_ON_DEMAND are
> tagged "if supported" . but currently kernel just returns EINVAL
> when user registers a MR with IB_ACCESS_ON_DEMAND to RXE.
>
> I wonder we should return -EOPNOTSUPP if the device doesn't support requested capabilities
Yes, unsupported combinations of access flags should trigger
EOPNOTSUPP
Jason
On Tue, Sep 27, 2022 at 01:53:29PM +0800, Li Zhijian wrote:
> @@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
> static inline int ib_check_mr_access(struct ib_device *ib_dev,
> unsigned int flags)
> {
> + u64 device_cap = ib_dev->attrs.device_cap_flags;
> +
> /*
> * Local write permission is required if remote write or
> * remote atomic permission is also requested.
> @@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
> if (flags & IB_ACCESS_ON_DEMAND &&
> !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
> return -EINVAL;
> +
> + if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
> + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
> + (flags & IB_ACCESS_FLUSH_PERSISTENT &&
> + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
> + return -EINVAL;
This should be -EOPNOTSUPP as the above is changed to in for-next
Jason
On 29/10/2022 01:44, Jason Gunthorpe wrote:
> On Tue, Sep 27, 2022 at 01:53:29PM +0800, Li Zhijian wrote:
>> @@ -4321,6 +4330,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
>> static inline int ib_check_mr_access(struct ib_device *ib_dev,
>> unsigned int flags)
>> {
>> + u64 device_cap = ib_dev->attrs.device_cap_flags;
>> +
>> /*
>> * Local write permission is required if remote write or
>> * remote atomic permission is also requested.
>> @@ -4335,6 +4346,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev,
>> if (flags & IB_ACCESS_ON_DEMAND &&
>> !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
>> return -EINVAL;
>> +
>> + if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
>> + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
>> + (flags & IB_ACCESS_FLUSH_PERSISTENT &&
>> + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
>> + return -EINVAL;
> This should be -EOPNOTSUPP as the above is changed to in for-next
Yes, my local tree(V6) had updated this. will repost this later.
>
> Jason