2023-09-18 17:45:12

by Jørgen Hansen

[permalink] [raw]
Subject: Re: [PATCH RFC v2 12/18] cxl/region: Notify regions of DC changes

On 8/29/23 07:21, Ira Weiny wrote:
>
> In order for a user to use dynamic capacity effectively they need to
> know when dynamic capacity is available. Thus when Dynamic Capacity
> (DC) extents are added or removed by a DC device the regions affected
> need to be notified. Ultimately the DAX region uses the memory
> associated with DC extents. However, remember that CXL DAX regions
> maintain any interleave details between devices.
>
> When a DCD event occurs, iterate all CXL endpoint decoders and notify
> regions which contain the endpoints affected by the event. In turn
> notify the DAX regions of the changes to the DAX region extents.
>
> For now interleave is handled by creating simple 1:1 mappings between
> the CXL DAX region and DAX region layers. Future implementations will
> need to resolve when to actually surface a DAX region extent and pass
> the notification along.
>
> Remember that adding capacity is safe because there is no chance of the
> memory being in use. Also remember at this point releasing capacity is
> straight forward because DAX devices do not yet have references to the
> extents. Future patches will handle that complication.
>
> Signed-off-by: Ira Weiny <[email protected]>
>
> ---
> Changes from v1:
> [iweiny: Rewrite]
> ---
> drivers/cxl/core/mbox.c | 39 +++++++++++++--
> drivers/cxl/core/region.c | 123 +++++++++++++++++++++++++++++++++++++++++-----
> drivers/cxl/cxl.h | 22 +++++++++
> drivers/cxl/mem.c | 50 +++++++++++++++++++
> drivers/dax/cxl.c | 99 ++++++++++++++++++++++++++++++-------
> drivers/dax/dax-private.h | 3 ++
> drivers/dax/extent.c | 14 ++++++
> 7 files changed, 317 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 5472ab1d0370..9d9c13e13ecf 100644

[snip]

> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 0aeea50550f6..a0c1f2793dd7 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1547,8 +1547,8 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
> return 0;
> }
>
> -static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> - struct cxl_dc_extent_data *extent)
> +bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> + struct cxl_dc_extent_data *extent)
> {
> struct range dpa_range = (struct range){
> .start = extent->dpa_start,
> @@ -1567,14 +1567,66 @@ static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> return (cxled->dpa_res->start <= dpa_range.start &&
> dpa_range.end <= cxled->dpa_res->end);
> }
> +EXPORT_SYMBOL_NS_GPL(cxl_dc_extent_in_ed, CXL);
> +
> +static int cxl_region_notify_extent(struct cxl_endpoint_decoder *cxled,
> + enum dc_event event,
> + struct cxl_dr_extent *cxl_dr_ext)
> +{
> + struct cxl_dax_region *cxlr_dax;
> + struct device *dev;
> + int rc = 0;
> +
> + cxlr_dax = cxled->cxld.region->cxlr_dax;
> + dev = &cxlr_dax->dev;
> + dev_dbg(dev, "Trying notify: type %d HPA:%llx LEN:%llx\n",
> + event, cxl_dr_ext->hpa_offset, cxl_dr_ext->hpa_length);
> +
> + device_lock(dev);
> + if (dev->driver) {
> + struct cxl_driver *reg_drv = to_cxl_drv(dev->driver);
> + struct cxl_drv_nd nd = (struct cxl_drv_nd) {
> + .event = event,
> + .cxl_dr_ext = cxl_dr_ext,
> + };
> +
> + if (reg_drv->notify) {
> + dev_dbg(dev, "Notify: type %d HPA:%llx LEN:%llx\n",
> + event, cxl_dr_ext->hpa_offset,
> + cxl_dr_ext->hpa_length);
> + rc = reg_drv->notify(dev, &nd);
> + }
> + }
> + device_unlock(dev);
> + return rc;
> +}
> +
> +static resource_size_t
> +cxl_dc_extent_to_hpa_offset(struct cxl_endpoint_decoder *cxled,
> + struct cxl_dc_extent_data *extent)
> +{
> + struct cxl_dax_region *cxlr_dax;
> + resource_size_t dpa_offset, hpa;
> + struct range *ed_hpa_range;
> +
> + cxlr_dax = cxled->cxld.region->cxlr_dax;
> +
> + /*
> + * Without interleave...
> + * HPA offset == DPA offset
> + * ... but do the math anyway
> + */
> + dpa_offset = extent->dpa_start - cxled->dpa_res->start;
> + ed_hpa_range = &cxled->cxld.hpa_range;
> + hpa = ed_hpa_range->start + dpa_offset;
> + return hpa - cxlr_dax->hpa_range.start;
> +}
>
> static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> struct cxl_dc_extent_data *extent)
> {
> struct cxl_dr_extent *cxl_dr_ext;
> struct cxl_dax_region *cxlr_dax;
> - resource_size_t dpa_offset, hpa;
> - struct range *ed_hpa_range;
> struct device *dev;
> int rc;
>
> @@ -1601,15 +1653,7 @@ static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> cxl_dr_ext->extent = extent;
> kref_init(&cxl_dr_ext->region_ref);
>
> - /*
> - * Without interleave...
> - * HPA offset == DPA offset
> - * ... but do the math anyway
> - */
> - dpa_offset = extent->dpa_start - cxled->dpa_res->start;
> - ed_hpa_range = &cxled->cxld.hpa_range;
> - hpa = ed_hpa_range->start + dpa_offset;
> - cxl_dr_ext->hpa_offset = hpa - cxlr_dax->hpa_range.start;
> + cxl_dr_ext->hpa_offset = cxl_dc_extent_to_hpa_offset(cxled, extent);
>
> /* Without interleave carry length and label through */
> cxl_dr_ext->hpa_length = extent->length;
> @@ -1626,6 +1670,7 @@ static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> }
> /* Put in cxl_dr_release() */
> cxl_dc_extent_get(cxl_dr_ext->extent);
> + cxl_region_notify_extent(cxled, DCD_ADD_CAPACITY, cxl_dr_ext);
> return 0;
> }
>
> @@ -1663,6 +1708,58 @@ static int cxl_ed_add_extents(struct cxl_endpoint_decoder *cxled)
> return 0;
> }
>
> +static int cxl_ed_rm_dc_extent(struct cxl_endpoint_decoder *cxled,
> + enum dc_event event,
> + struct cxl_dc_extent_data *extent)
> +{
> + struct cxl_region *cxlr = cxled->cxld.region;
> + struct cxl_dax_region *cxlr_dax = cxlr->cxlr_dax;
> + struct cxl_dr_extent *cxl_dr_ext;
> + resource_size_t hpa_offset;
> +
> + hpa_offset = cxl_dc_extent_to_hpa_offset(cxled, extent);
> +
> + /*
> + * NOTE on Interleaving: There is no need to 'break up' the cxl_dr_ext.
> + * If one of the extents comprising it is gone it should be removed
> + * from the region to prevent future use. Later code may save other
> + * extents for future processing. But for now the corelation is 1:1:1
> + * so just erase the extent.
> + */
> + cxl_dr_ext = xa_erase(&cxlr_dax->extents, hpa_offset);
> +
> + dev_dbg(&cxlr_dax->dev, "Remove DAX region ext HPA:%llx\n",
> + cxl_dr_ext->hpa_offset);
> + cxl_region_notify_extent(cxled, event, cxl_dr_ext);
> + cxl_dr_extent_put(cxl_dr_ext);
> + return 0;
> +}
> +
> +int cxl_ed_notify_extent(struct cxl_endpoint_decoder *cxled,
> + struct cxl_drv_nd *nd)
> +{
> + int rc = 0;
> +
> + switch (nd->event) {
> + case DCD_ADD_CAPACITY:
> + if (cxl_dc_extent_get_not_zero(nd->extent)) {
> + rc = cxl_ed_add_one_extent(cxled, nd->extent);
> + if (rc)
> + cxl_dc_extent_put(nd->extent);

Hi,
when playing around with adding and releasing DCD extents through the
qmp interface for the QEMU DCD emulation, I noticed that extents weren't
handed back to the device. It looks like there is a refcounting issue,
as the kref never drops below 2 for the dc extents. So I was wondering
whether we should only put the dc extent here on error or maybe always
put it? cxl_ed_add_one_extent() also grabs a reference to the dc
extent, and that one is put in cxl_dr_release(), but I couldn't find a
matching put for this get_not_zero.


> + }
> + break;
> + case DCD_RELEASE_CAPACITY:
> + case DCD_FORCED_CAPACITY_RELEASE:
> + rc = cxl_ed_rm_dc_extent(cxled, nd->event, nd->extent);
> + break;
> + default:
> + dev_err(&cxled->cxld.dev, "Unknown DC event %d\n", nd->event);
> + break;
> + }
> + return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_ed_notify_extent, CXL);
> +
> static int cxl_region_attach_position(struct cxl_region *cxlr,
> struct cxl_root_decoder *cxlrd,
> struct cxl_endpoint_decoder *cxled,

[snip]

>
> --
> 2.41.0
>

Thanks,
Jorgen


2023-09-18 18:11:40

by Ira Weiny

[permalink] [raw]
Subject: Re: [PATCH RFC v2 12/18] cxl/region: Notify regions of DC changes

Jørgen Hansen wrote:
> On 8/29/23 07:21, Ira Weiny wrote:
> >
> > In order for a user to use dynamic capacity effectively they need to
> > know when dynamic capacity is available. Thus when Dynamic Capacity
> > (DC) extents are added or removed by a DC device the regions affected
> > need to be notified. Ultimately the DAX region uses the memory
> > associated with DC extents. However, remember that CXL DAX regions
> > maintain any interleave details between devices.
> >
> > When a DCD event occurs, iterate all CXL endpoint decoders and notify
> > regions which contain the endpoints affected by the event. In turn
> > notify the DAX regions of the changes to the DAX region extents.
> >
> > For now interleave is handled by creating simple 1:1 mappings between
> > the CXL DAX region and DAX region layers. Future implementations will
> > need to resolve when to actually surface a DAX region extent and pass
> > the notification along.
> >
> > Remember that adding capacity is safe because there is no chance of the
> > memory being in use. Also remember at this point releasing capacity is
> > straight forward because DAX devices do not yet have references to the
> > extents. Future patches will handle that complication.
> >
> > Signed-off-by: Ira Weiny <[email protected]>
> >
> > ---
> > Changes from v1:
> > [iweiny: Rewrite]
> > ---
> > drivers/cxl/core/mbox.c | 39 +++++++++++++--
> > drivers/cxl/core/region.c | 123 +++++++++++++++++++++++++++++++++++++++++-----
> > drivers/cxl/cxl.h | 22 +++++++++
> > drivers/cxl/mem.c | 50 +++++++++++++++++++
> > drivers/dax/cxl.c | 99 ++++++++++++++++++++++++++++++-------
> > drivers/dax/dax-private.h | 3 ++
> > drivers/dax/extent.c | 14 ++++++
> > 7 files changed, 317 insertions(+), 33 deletions(-)
> >
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 5472ab1d0370..9d9c13e13ecf 100644
>
> [snip]
>
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index 0aeea50550f6..a0c1f2793dd7 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -1547,8 +1547,8 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
> > return 0;
> > }
> >
> > -static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > - struct cxl_dc_extent_data *extent)
> > +bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent_data *extent)
> > {
> > struct range dpa_range = (struct range){
> > .start = extent->dpa_start,
> > @@ -1567,14 +1567,66 @@ static bool cxl_dc_extent_in_ed(struct cxl_endpoint_decoder *cxled,
> > return (cxled->dpa_res->start <= dpa_range.start &&
> > dpa_range.end <= cxled->dpa_res->end);
> > }
> > +EXPORT_SYMBOL_NS_GPL(cxl_dc_extent_in_ed, CXL);
> > +
> > +static int cxl_region_notify_extent(struct cxl_endpoint_decoder *cxled,
> > + enum dc_event event,
> > + struct cxl_dr_extent *cxl_dr_ext)
> > +{
> > + struct cxl_dax_region *cxlr_dax;
> > + struct device *dev;
> > + int rc = 0;
> > +
> > + cxlr_dax = cxled->cxld.region->cxlr_dax;
> > + dev = &cxlr_dax->dev;
> > + dev_dbg(dev, "Trying notify: type %d HPA:%llx LEN:%llx\n",
> > + event, cxl_dr_ext->hpa_offset, cxl_dr_ext->hpa_length);
> > +
> > + device_lock(dev);
> > + if (dev->driver) {
> > + struct cxl_driver *reg_drv = to_cxl_drv(dev->driver);
> > + struct cxl_drv_nd nd = (struct cxl_drv_nd) {
> > + .event = event,
> > + .cxl_dr_ext = cxl_dr_ext,
> > + };
> > +
> > + if (reg_drv->notify) {
> > + dev_dbg(dev, "Notify: type %d HPA:%llx LEN:%llx\n",
> > + event, cxl_dr_ext->hpa_offset,
> > + cxl_dr_ext->hpa_length);
> > + rc = reg_drv->notify(dev, &nd);
> > + }
> > + }
> > + device_unlock(dev);
> > + return rc;
> > +}
> > +
> > +static resource_size_t
> > +cxl_dc_extent_to_hpa_offset(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_dc_extent_data *extent)
> > +{
> > + struct cxl_dax_region *cxlr_dax;
> > + resource_size_t dpa_offset, hpa;
> > + struct range *ed_hpa_range;
> > +
> > + cxlr_dax = cxled->cxld.region->cxlr_dax;
> > +
> > + /*
> > + * Without interleave...
> > + * HPA offset == DPA offset
> > + * ... but do the math anyway
> > + */
> > + dpa_offset = extent->dpa_start - cxled->dpa_res->start;
> > + ed_hpa_range = &cxled->cxld.hpa_range;
> > + hpa = ed_hpa_range->start + dpa_offset;
> > + return hpa - cxlr_dax->hpa_range.start;
> > +}
> >
> > static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > struct cxl_dc_extent_data *extent)
> > {
> > struct cxl_dr_extent *cxl_dr_ext;
> > struct cxl_dax_region *cxlr_dax;
> > - resource_size_t dpa_offset, hpa;
> > - struct range *ed_hpa_range;
> > struct device *dev;
> > int rc;
> >
> > @@ -1601,15 +1653,7 @@ static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > cxl_dr_ext->extent = extent;
> > kref_init(&cxl_dr_ext->region_ref);
> >
> > - /*
> > - * Without interleave...
> > - * HPA offset == DPA offset
> > - * ... but do the math anyway
> > - */
> > - dpa_offset = extent->dpa_start - cxled->dpa_res->start;
> > - ed_hpa_range = &cxled->cxld.hpa_range;
> > - hpa = ed_hpa_range->start + dpa_offset;
> > - cxl_dr_ext->hpa_offset = hpa - cxlr_dax->hpa_range.start;
> > + cxl_dr_ext->hpa_offset = cxl_dc_extent_to_hpa_offset(cxled, extent);
> >
> > /* Without interleave carry length and label through */
> > cxl_dr_ext->hpa_length = extent->length;
> > @@ -1626,6 +1670,7 @@ static int cxl_ed_add_one_extent(struct cxl_endpoint_decoder *cxled,
> > }
> > /* Put in cxl_dr_release() */
> > cxl_dc_extent_get(cxl_dr_ext->extent);
> > + cxl_region_notify_extent(cxled, DCD_ADD_CAPACITY, cxl_dr_ext);
> > return 0;
> > }
> >
> > @@ -1663,6 +1708,58 @@ static int cxl_ed_add_extents(struct cxl_endpoint_decoder *cxled)
> > return 0;
> > }
> >
> > +static int cxl_ed_rm_dc_extent(struct cxl_endpoint_decoder *cxled,
> > + enum dc_event event,
> > + struct cxl_dc_extent_data *extent)
> > +{
> > + struct cxl_region *cxlr = cxled->cxld.region;
> > + struct cxl_dax_region *cxlr_dax = cxlr->cxlr_dax;
> > + struct cxl_dr_extent *cxl_dr_ext;
> > + resource_size_t hpa_offset;
> > +
> > + hpa_offset = cxl_dc_extent_to_hpa_offset(cxled, extent);
> > +
> > + /*
> > + * NOTE on Interleaving: There is no need to 'break up' the cxl_dr_ext.
> > + * If one of the extents comprising it is gone it should be removed
> > + * from the region to prevent future use. Later code may save other
> > + * extents for future processing. But for now the corelation is 1:1:1
> > + * so just erase the extent.
> > + */
> > + cxl_dr_ext = xa_erase(&cxlr_dax->extents, hpa_offset);
> > +
> > + dev_dbg(&cxlr_dax->dev, "Remove DAX region ext HPA:%llx\n",
> > + cxl_dr_ext->hpa_offset);
> > + cxl_region_notify_extent(cxled, event, cxl_dr_ext);
> > + cxl_dr_extent_put(cxl_dr_ext);
> > + return 0;
> > +}
> > +
> > +int cxl_ed_notify_extent(struct cxl_endpoint_decoder *cxled,
> > + struct cxl_drv_nd *nd)
> > +{
> > + int rc = 0;
> > +
> > + switch (nd->event) {
> > + case DCD_ADD_CAPACITY:
> > + if (cxl_dc_extent_get_not_zero(nd->extent)) {
> > + rc = cxl_ed_add_one_extent(cxled, nd->extent);
> > + if (rc)
> > + cxl_dc_extent_put(nd->extent);
>
> Hi,
> when playing around with adding and releasing DCD extents through the
> qmp interface for the QEMU DCD emulation, I noticed that extents weren't
> handed back to the device. It looks like there is a refcounting issue,
> as the kref never drops below 2 for the dc extents. So I was wondering
> whether we should only put the dc extent here on error or maybe always
> put it? cxl_ed_add_one_extent() also grabs a reference to the dc
> extent, and that one is put in cxl_dr_release(), but I couldn't find a
> matching put for this get_not_zero.

This is a bug I have fixed in the next version.

Yes, the put needs to happen regardless of the return value.

...
case DCD_ADD_CAPACITY:
if (cxl_dc_extent_get_not_zero(nd->extent)) {
rc = cxl_ed_add_one_extent(cxled, nd->extent);
cxl_dc_extent_put(nd->extent);
}
...

Please let me know if that does not work. And thanks for the testing,
Ira