Add PCI P2PDMA support for dma_direct_map_sg() so that it can map
PCI P2PDMA pages directly without a hack in the callers. This allows
for heterogeneous SGLs that contain both P2PDMA and regular pages.
SGL segments that contain PCI bus addresses are marked with
sg_mark_pci_p2pdma() and are ignored when unmapped.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
kernel/dma/direct.c | 35 ++++++++++++++++++++++++++++++++---
kernel/dma/mapping.c | 13 ++++++++++---
2 files changed, 42 insertions(+), 6 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 002268262c9a..f326d32062dd 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
#include "direct.h"
/*
@@ -387,19 +388,47 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
struct scatterlist *sg;
int i;
- for_each_sg(sgl, sg, nents, i)
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_is_pci_p2pdma(sg))
+ continue;
+
dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
attrs);
+ }
}
#endif
int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
- int i;
+ struct dev_pagemap *pgmap = NULL;
+ int i, map = -1, ret = 0;
struct scatterlist *sg;
+ u64 bus_off;
for_each_sg(sgl, sg, nents, i) {
+ if (is_pci_p2pdma_page(sg_page(sg))) {
+ if (sg_page(sg)->pgmap != pgmap) {
+ pgmap = sg_page(sg)->pgmap;
+ map = pci_p2pdma_dma_map_type(dev, pgmap);
+ bus_off = pci_p2pdma_bus_offset(sg_page(sg));
+ }
+
+ if (map < 0) {
+ sg->dma_address = DMA_MAPPING_ERROR;
+ ret = -EREMOTEIO;
+ goto out_unmap;
+ }
+
+ if (map) {
+ sg->dma_address = sg_phys(sg) + sg->offset -
+ bus_off;
+ sg_dma_len(sg) = sg->length;
+ sg_mark_pci_p2pdma(sg);
+ continue;
+ }
+ }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR)
@@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
- return 0;
+ return ret;
}
dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b6a633679933..adc1a83950be 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
EXPORT_SYMBOL(dma_unmap_page_attrs);
/*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
+ * dma_maps_sg_attrs returns 0 on any resource error and > 0 on success.
+ *
+ * If 0 is returned, the mapping can be retried and will succeed once
+ * sufficient resources are available.
+ *
+ * If there are P2PDMA pages in the scatterlist then this function may
+ * return -EREMOTEIO to indicate that the pages are not mappable by the
+ * device. In this case, an error should be returned for the IO as it
+ * will never be successfully retried.
*/
int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs)
@@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
- BUG_ON(ents < 0);
+
debug_dma_map_sg(dev, sg, nents, ents, dir);
return ents;
--
2.20.1
On 2021-03-11 23:31, Logan Gunthorpe wrote:
> Add PCI P2PDMA support for dma_direct_map_sg() so that it can map
> PCI P2PDMA pages directly without a hack in the callers. This allows
> for heterogeneous SGLs that contain both P2PDMA and regular pages.
>
> SGL segments that contain PCI bus addresses are marked with
> sg_mark_pci_p2pdma() and are ignored when unmapped.
>
> Signed-off-by: Logan Gunthorpe <[email protected]>
> ---
> kernel/dma/direct.c | 35 ++++++++++++++++++++++++++++++++---
> kernel/dma/mapping.c | 13 ++++++++++---
> 2 files changed, 42 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 002268262c9a..f326d32062dd 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -13,6 +13,7 @@
> #include <linux/vmalloc.h>
> #include <linux/set_memory.h>
> #include <linux/slab.h>
> +#include <linux/pci-p2pdma.h>
> #include "direct.h"
>
> /*
> @@ -387,19 +388,47 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
> struct scatterlist *sg;
> int i;
>
> - for_each_sg(sgl, sg, nents, i)
> + for_each_sg(sgl, sg, nents, i) {
> + if (sg_is_pci_p2pdma(sg))
> + continue;
> +
> dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
> attrs);
> + }
> }
> #endif
>
> int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> enum dma_data_direction dir, unsigned long attrs)
> {
> - int i;
> + struct dev_pagemap *pgmap = NULL;
> + int i, map = -1, ret = 0;
> struct scatterlist *sg;
> + u64 bus_off;
>
> for_each_sg(sgl, sg, nents, i) {
> + if (is_pci_p2pdma_page(sg_page(sg))) {
> + if (sg_page(sg)->pgmap != pgmap) {
> + pgmap = sg_page(sg)->pgmap;
> + map = pci_p2pdma_dma_map_type(dev, pgmap);
> + bus_off = pci_p2pdma_bus_offset(sg_page(sg));
> + }
> +
> + if (map < 0) {
> + sg->dma_address = DMA_MAPPING_ERROR;
> + ret = -EREMOTEIO;
> + goto out_unmap;
> + }
> +
> + if (map) {
> + sg->dma_address = sg_phys(sg) + sg->offset -
> + bus_off;
> + sg_dma_len(sg) = sg->length;
> + sg_mark_pci_p2pdma(sg);
> + continue;
> + }
> + }
> +
> sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
> sg->offset, sg->length, dir, attrs);
> if (sg->dma_address == DMA_MAPPING_ERROR)
> @@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>
> out_unmap:
> dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
> - return 0;
> + return ret;
> }
>
> dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
> index b6a633679933..adc1a83950be 100644
> --- a/kernel/dma/mapping.c
> +++ b/kernel/dma/mapping.c
> @@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
> EXPORT_SYMBOL(dma_unmap_page_attrs);
>
> /*
> - * dma_maps_sg_attrs returns 0 on error and > 0 on success.
> - * It should never return a value < 0.
> + * dma_maps_sg_attrs returns 0 on any resource error and > 0 on success.
> + *
> + * If 0 is returned, the mapping can be retried and will succeed once
> + * sufficient resources are available.
That's not a guarantee we can uphold. Retrying forever in the vain hope
that a device might evolve some extra address bits, or a bounce buffer
might magically grow big enough for a gigantic mapping, isn't
necessarily the best idea.
> + *
> + * If there are P2PDMA pages in the scatterlist then this function may
> + * return -EREMOTEIO to indicate that the pages are not mappable by the
> + * device. In this case, an error should be returned for the IO as it
> + * will never be successfully retried.
> */
> int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
> enum dma_data_direction dir, unsigned long attrs)
> @@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
> ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
> else
> ents = ops->map_sg(dev, sg, nents, dir, attrs);
> - BUG_ON(ents < 0);
> +
This scares me - I hesitate to imagine the amount of driver/subsystem
code out there that will see nonzero and merrily set off iterating a
negative number of segments, if we open the floodgates of allowing
implementations to return error codes here.
Robin.
> debug_dma_map_sg(dev, sg, nents, ents, dir);
>
> return ents;
>
On 2021-03-12 8:52 a.m., Robin Murphy wrote:
>> +
>> sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
>> sg->offset, sg->length, dir, attrs);
>> if (sg->dma_address == DMA_MAPPING_ERROR)
>> @@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct
>> scatterlist *sgl, int nents,
>> out_unmap:
>> dma_direct_unmap_sg(dev, sgl, i, dir, attrs |
>> DMA_ATTR_SKIP_CPU_SYNC);
>> - return 0;
>> + return ret;
>> }
>> dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t
>> paddr,
>> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
>> index b6a633679933..adc1a83950be 100644
>> --- a/kernel/dma/mapping.c
>> +++ b/kernel/dma/mapping.c
>> @@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev,
>> dma_addr_t addr, size_t size,
>> EXPORT_SYMBOL(dma_unmap_page_attrs);
>> /*
>> - * dma_maps_sg_attrs returns 0 on error and > 0 on success.
>> - * It should never return a value < 0.
>> + * dma_maps_sg_attrs returns 0 on any resource error and > 0 on success.
>> + *
>> + * If 0 is returned, the mapping can be retried and will succeed once
>> + * sufficient resources are available.
>
> That's not a guarantee we can uphold. Retrying forever in the vain hope
> that a device might evolve some extra address bits, or a bounce buffer
> might magically grow big enough for a gigantic mapping, isn't
> necessarily the best idea.
Perhaps this is just poorly worded. Returning 0 is the normal case and
nothing has changed there. The block layer, for example, will retry if
zero is returned as this only happens if it failed to allocate resources
for the mapping. The reason we have to return -1 is to tell the block
layer not to retry these requests as they will never succeed in the future.
>> + *
>> + * If there are P2PDMA pages in the scatterlist then this function may
>> + * return -EREMOTEIO to indicate that the pages are not mappable by the
>> + * device. In this case, an error should be returned for the IO as it
>> + * will never be successfully retried.
>> */
>> int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int
>> nents,
>> enum dma_data_direction dir, unsigned long attrs)
>> @@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct
>> scatterlist *sg, int nents,
>> ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
>> else
>> ents = ops->map_sg(dev, sg, nents, dir, attrs);
>> - BUG_ON(ents < 0);
>> +
>
> This scares me - I hesitate to imagine the amount of driver/subsystem
> code out there that will see nonzero and merrily set off iterating a
> negative number of segments, if we open the floodgates of allowing
> implementations to return error codes here.
Yes, but it will never happen on existing drivers/subsystems. The only
way it can return a negative number is if the driver passes in P2PDMA
pages which can't happen without changes in the driver. We are careful
about where P2PDMA pages can get into so we don't have to worry about
all the existing driver code out there.
Logan
On 2021-03-12 16:24, Logan Gunthorpe wrote:
>
>
> On 2021-03-12 8:52 a.m., Robin Murphy wrote:
>>> +
>>> sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
>>> sg->offset, sg->length, dir, attrs);
>>> if (sg->dma_address == DMA_MAPPING_ERROR)
>>> @@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct
>>> scatterlist *sgl, int nents,
>>> out_unmap:
>>> dma_direct_unmap_sg(dev, sgl, i, dir, attrs |
>>> DMA_ATTR_SKIP_CPU_SYNC);
>>> - return 0;
>>> + return ret;
>>> }
>>> dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t
>>> paddr,
>>> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
>>> index b6a633679933..adc1a83950be 100644
>>> --- a/kernel/dma/mapping.c
>>> +++ b/kernel/dma/mapping.c
>>> @@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev,
>>> dma_addr_t addr, size_t size,
>>> EXPORT_SYMBOL(dma_unmap_page_attrs);
>>> /*
>>> - * dma_maps_sg_attrs returns 0 on error and > 0 on success.
>>> - * It should never return a value < 0.
>>> + * dma_maps_sg_attrs returns 0 on any resource error and > 0 on success.
>>> + *
>>> + * If 0 is returned, the mapping can be retried and will succeed once
>>> + * sufficient resources are available.
>>
>> That's not a guarantee we can uphold. Retrying forever in the vain hope
>> that a device might evolve some extra address bits, or a bounce buffer
>> might magically grow big enough for a gigantic mapping, isn't
>> necessarily the best idea.
>
> Perhaps this is just poorly worded. Returning 0 is the normal case and
> nothing has changed there. The block layer, for example, will retry if
> zero is returned as this only happens if it failed to allocate resources
> for the mapping. The reason we have to return -1 is to tell the block
> layer not to retry these requests as they will never succeed in the future.
>
>>> + *
>>> + * If there are P2PDMA pages in the scatterlist then this function may
>>> + * return -EREMOTEIO to indicate that the pages are not mappable by the
>>> + * device. In this case, an error should be returned for the IO as it
>>> + * will never be successfully retried.
>>> */
>>> int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int
>>> nents,
>>> enum dma_data_direction dir, unsigned long attrs)
>>> @@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct
>>> scatterlist *sg, int nents,
>>> ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
>>> else
>>> ents = ops->map_sg(dev, sg, nents, dir, attrs);
>>> - BUG_ON(ents < 0);
>>> +
>>
>> This scares me - I hesitate to imagine the amount of driver/subsystem
>> code out there that will see nonzero and merrily set off iterating a
>> negative number of segments, if we open the floodgates of allowing
>> implementations to return error codes here.
>
> Yes, but it will never happen on existing drivers/subsystems. The only
> way it can return a negative number is if the driver passes in P2PDMA
> pages which can't happen without changes in the driver. We are careful
> about where P2PDMA pages can get into so we don't have to worry about
> all the existing driver code out there.
Sure, that's how things stand immediately after this patch. But then
someone comes along with the perfectly reasonable argument for returning
more expressive error information for regular mapping failures as well
(because sometimes those can be terminal too, as above), we start to get
divergent behaviour across architectures and random bits of old code
subtly breaking down the line. *That* is what makes me wary of making a
fundamental change to a long-standing "nonzero means success" interface...
Robin.
On 2021-03-12 11:11 a.m., Robin Murphy wrote:
> On 2021-03-12 16:24, Logan Gunthorpe wrote:
>>
>>
>> On 2021-03-12 8:52 a.m., Robin Murphy wrote:
>>>> +
>>>> sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
>>>> sg->offset, sg->length, dir, attrs);
>>>> if (sg->dma_address == DMA_MAPPING_ERROR)
>>>> @@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct
>>>> scatterlist *sgl, int nents,
>>>> out_unmap:
>>>> dma_direct_unmap_sg(dev, sgl, i, dir, attrs |
>>>> DMA_ATTR_SKIP_CPU_SYNC);
>>>> - return 0;
>>>> + return ret;
>>>> }
>>>> dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t
>>>> paddr,
>>>> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
>>>> index b6a633679933..adc1a83950be 100644
>>>> --- a/kernel/dma/mapping.c
>>>> +++ b/kernel/dma/mapping.c
>>>> @@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev,
>>>> dma_addr_t addr, size_t size,
>>>> EXPORT_SYMBOL(dma_unmap_page_attrs);
>>>> /*
>>>> - * dma_maps_sg_attrs returns 0 on error and > 0 on success.
>>>> - * It should never return a value < 0.
>>>> + * dma_maps_sg_attrs returns 0 on any resource error and > 0 on
>>>> success.
>>>> + *
>>>> + * If 0 is returned, the mapping can be retried and will succeed once
>>>> + * sufficient resources are available.
>>>
>>> That's not a guarantee we can uphold. Retrying forever in the vain hope
>>> that a device might evolve some extra address bits, or a bounce buffer
>>> might magically grow big enough for a gigantic mapping, isn't
>>> necessarily the best idea.
>>
>> Perhaps this is just poorly worded. Returning 0 is the normal case and
>> nothing has changed there. The block layer, for example, will retry if
>> zero is returned as this only happens if it failed to allocate resources
>> for the mapping. The reason we have to return -1 is to tell the block
>> layer not to retry these requests as they will never succeed in the
>> future.
>>
>>>> + *
>>>> + * If there are P2PDMA pages in the scatterlist then this function may
>>>> + * return -EREMOTEIO to indicate that the pages are not mappable by
>>>> the
>>>> + * device. In this case, an error should be returned for the IO as it
>>>> + * will never be successfully retried.
>>>> */
>>>> int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int
>>>> nents,
>>>> enum dma_data_direction dir, unsigned long attrs)
>>>> @@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct
>>>> scatterlist *sg, int nents,
>>>> ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
>>>> else
>>>> ents = ops->map_sg(dev, sg, nents, dir, attrs);
>>>> - BUG_ON(ents < 0);
>>>> +
>>>
>>> This scares me - I hesitate to imagine the amount of driver/subsystem
>>> code out there that will see nonzero and merrily set off iterating a
>>> negative number of segments, if we open the floodgates of allowing
>>> implementations to return error codes here.
>>
>> Yes, but it will never happen on existing drivers/subsystems. The only
>> way it can return a negative number is if the driver passes in P2PDMA
>> pages which can't happen without changes in the driver. We are careful
>> about where P2PDMA pages can get into so we don't have to worry about
>> all the existing driver code out there.
>
> Sure, that's how things stand immediately after this patch. But then
> someone comes along with the perfectly reasonable argument for returning
> more expressive error information for regular mapping failures as well
> (because sometimes those can be terminal too, as above), we start to get
> divergent behaviour across architectures and random bits of old code
> subtly breaking down the line. *That* is what makes me wary of making a
> fundamental change to a long-standing "nonzero means success" interface...
So then we reject the patches that make that change. Seems like an odd
argument to say that we can't do something that won't cause problems
because someone might use it as an example and do something that will
cause problems. Reject the change that causes the problem.
Logan
On Fri, Mar 12, 2021 at 11:27:46AM -0700, Logan Gunthorpe wrote:
> So then we reject the patches that make that change. Seems like an odd
> argument to say that we can't do something that won't cause problems
> because someone might use it as an example and do something that will
> cause problems. Reject the change that causes the problem.
No, the problem is a mess of calling conventions. A calling convention
returning 0 for error, positive values for success is fine. One returning
a negative errno for error and positive values for success is fine as well.
One returning 0 for the usual errors and negative errnos for an unusual
corner case is just a complete mess.
On Thu, Mar 11, 2021 at 04:31:36PM -0700, Logan Gunthorpe wrote:
> for_each_sg(sgl, sg, nents, i) {
> + if (is_pci_p2pdma_page(sg_page(sg))) {
> + if (sg_page(sg)->pgmap != pgmap) {
> + pgmap = sg_page(sg)->pgmap;
> + map = pci_p2pdma_dma_map_type(dev, pgmap);
> + bus_off = pci_p2pdma_bus_offset(sg_page(sg));
> + }
> +
> + if (map < 0) {
> + sg->dma_address = DMA_MAPPING_ERROR;
> + ret = -EREMOTEIO;
> + goto out_unmap;
> + }
> +
> + if (map) {
> + sg->dma_address = sg_phys(sg) + sg->offset -
> + bus_off;
> + sg_dma_len(sg) = sg->length;
> + sg_mark_pci_p2pdma(sg);
> + continue;
> + }
> + }
This code needs to go into a separate noinline helper to reduce the impact
on the fast path. Also as Robin noted the offset is already
accounted for in sg_phys. We also don't ever set the dma_address in
the scatterlist to DMA_MAPPING_ERROR, that is just a return value
for the single entry mapping routines.
On Fri, Mar 12, 2021 at 06:11:17PM +0000, Robin Murphy wrote:
> Sure, that's how things stand immediately after this patch. But then
> someone comes along with the perfectly reasonable argument for returning
> more expressive error information for regular mapping failures as well
> (because sometimes those can be terminal too, as above), we start to get
> divergent behaviour across architectures and random bits of old code subtly
> breaking down the line. *That* is what makes me wary of making a
> fundamental change to a long-standing "nonzero means success" interface...
Agreed. IMHO dma_map_sg actually needs to be switched to return
unsigned to help root this out, going the other way is not helpful.
On 2021-03-16 1:58 a.m., Christoph Hellwig wrote:
> On Fri, Mar 12, 2021 at 11:27:46AM -0700, Logan Gunthorpe wrote:
>> So then we reject the patches that make that change. Seems like an odd
>> argument to say that we can't do something that won't cause problems
>> because someone might use it as an example and do something that will
>> cause problems. Reject the change that causes the problem.
>
> No, the problem is a mess of calling conventions. A calling convention
> returning 0 for error, positive values for success is fine. One returning
> a negative errno for error and positive values for success is fine as well.
> One returning 0 for the usual errors and negative errnos for an unusual
> corner case is just a complete mess.
Fair enough. I can try implementing a dma_map_sg_p2p() roughly as Robin
suggested that has a more reasonable calling convention.
Most of your other feedback seems easy enough so I'll address it in a
future series.
Thanks,
Logan