Dan Williams started to look into addressing I/O to and from
Persistent Memory in his series from June:
http://thread.gmane.org/gmane.linux.kernel.cross-arch/27944
I've started looking into DMA mapping of these SGLs specifically, instead
of the map_pfn method used there. In addition to supporting NVDIMM-backed
I/O, I also suspect this would be highly useful for media drivers that
jump through nasty hoops to be able to DMA from/to their ioremapped regions,
with vb2_dc_get_userptr in drivers/media/v4l2-core/videobuf2-dma-contig.c
being a prime example of the unsafe hacks currently used.
It turns out most DMA mapping implementations can handle SGLs without
page structures with some fairly simple mechanical work. Most of it
is just about consistently using sg_phys. For implementations that
need to flush caches we need a new helper that skips these cache
flushes if an entry doesn't have a kernel virtual address.
However, the ccio (parisc) and sba_iommu (parisc & ia64) IOMMUs seem
to operate mostly on virtual addresses. It's a fairly odd concept
that I don't fully grasp, so I'll need some help with those if we want
to bring this forward.
Additionally, this series skips ARM entirely for now. The reason is
that most ARM implementations of the .map_sg operation just iterate
over all entries and call ->map_page for each of them, which means
we'd need to convert those to a ->map_pfn similar to Dan's previous
approach.
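To illustrate the pattern the series converts architectures to, here is a
rough sketch of a trivial direct-mapped ->map_sg implementation using the
new helpers. This is not taken from any patch in the series;
example_dma_map_sg and example_flush_for_device are made-up names, the
latter standing in for whatever cache maintenance routine an architecture
provides:

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

/* hypothetical stand-in for an architecture's cache writeback routine */
static void example_flush_for_device(void *vaddr, size_t len,
		enum dma_data_direction dir);

static int example_dma_map_sg(struct device *dev, struct scatterlist *sgl,
		int nents, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nents, i) {
		/* sg_phys() works whether or not the entry has a struct page */
		sg->dma_address = sg_phys(sg);

		/* only flush entries that have a kernel virtual mapping */
		if (sg_has_page(sg))
			example_flush_for_device(sg_virt(sg), sg->length, dir);
	}
	return nents;
}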
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/scatterlist.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 9b1ef0c..b1056bf 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -230,6 +230,16 @@ static inline dma_addr_t sg_phys(struct scatterlist *sg)
return page_to_phys(sg_page(sg)) + sg->offset;
}
+static inline unsigned long sg_pfn(struct scatterlist *sg)
+{
+ return page_to_pfn(sg_page(sg));
+}
+
+static inline bool sg_has_page(struct scatterlist *sg)
+{
+ return true;
+}
+
/**
* sg_virt - Return virtual address of an sg entry
* @sg: SG entry
--
1.9.1
From: Dan Williams <[email protected]>
Coccinelle cleanup to replace open coded sg to physical address
translations. This is in preparation for introducing scatterlists that
reference __pfn_t.
// sg_phys.cocci: convert usage page_to_phys(sg_page(sg)) to sg_phys(sg)
// usage: make coccicheck COCCI=sg_phys.cocci MODE=patch
virtual patch
@@
struct scatterlist *sg;
@@
- page_to_phys(sg_page(sg)) + sg->offset
+ sg_phys(sg)
@@
struct scatterlist *sg;
@@
- page_to_phys(sg_page(sg))
+ sg_phys(sg) & PAGE_MASK
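As an illustration (the variable names below are made up and not taken
from the semantic patch or any of the diffs), the two rules turn
open-coded translations like:

	dma_addr_t addr = page_to_phys(sg_page(sg)) + sg->offset;
	phys_addr_t base = page_to_phys(sg_page(sg));

into:

	dma_addr_t addr = sg_phys(sg);
	phys_addr_t base = sg_phys(sg) & PAGE_MASK;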
Signed-off-by: Dan Williams <[email protected]>
---
arch/arm/mm/dma-mapping.c | 2 +-
arch/microblaze/kernel/dma.c | 3 +--
drivers/iommu/intel-iommu.c | 4 ++--
drivers/iommu/iommu.c | 2 +-
drivers/staging/android/ion/ion_chunk_heap.c | 4 ++--
5 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index cba12f3..3d3d6aa 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1520,7 +1520,7 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg,
return -ENOMEM;
for (count = 0, s = sg; count < (size >> PAGE_SHIFT); s = sg_next(s)) {
- phys_addr_t phys = page_to_phys(sg_page(s));
+ phys_addr_t phys = sg_phys(s) & PAGE_MASK;
unsigned int len = PAGE_ALIGN(s->offset + s->length);
if (!is_coherent &&
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index bf4dec2..c89da63 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -61,8 +61,7 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
/* FIXME this part of code is untested */
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = sg_phys(sg);
- __dma_sync(page_to_phys(sg_page(sg)) + sg->offset,
- sg->length, direction);
+ __dma_sync(sg_phys(sg), sg->length, direction);
}
return nents;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 0649b94..3541d65 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2097,7 +2097,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
sg_res = aligned_nrpages(sg->offset, sg->length);
sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
sg->dma_length = sg->length;
- pteval = page_to_phys(sg_page(sg)) | prot;
+ pteval = (sg_phys(sg) & PAGE_MASK) | prot;
phys_pfn = pteval >> VTD_PAGE_SHIFT;
}
@@ -3623,7 +3623,7 @@ static int intel_nontranslate_map_sg(struct device *hddev,
for_each_sg(sglist, sg, nelems, i) {
BUG_ON(!sg_page(sg));
- sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
+ sg->dma_address = sg_phys(sg);
sg->dma_length = sg->length;
}
return nelems;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index f286090..049df49 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1408,7 +1408,7 @@ size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
for_each_sg(sg, s, nents, i) {
- phys_addr_t phys = page_to_phys(sg_page(s)) + s->offset;
+ phys_addr_t phys = sg_phys(s);
/*
* We are mapping on IOMMU page boundaries, so offset within
diff --git a/drivers/staging/android/ion/ion_chunk_heap.c b/drivers/staging/android/ion/ion_chunk_heap.c
index 5474615..f7b6ef9 100644
--- a/drivers/staging/android/ion/ion_chunk_heap.c
+++ b/drivers/staging/android/ion/ion_chunk_heap.c
@@ -81,7 +81,7 @@ static int ion_chunk_heap_allocate(struct ion_heap *heap,
err:
sg = table->sgl;
for (i -= 1; i >= 0; i--) {
- gen_pool_free(chunk_heap->pool, page_to_phys(sg_page(sg)),
+ gen_pool_free(chunk_heap->pool, sg_phys(sg) & PAGE_MASK,
sg->length);
sg = sg_next(sg);
}
@@ -109,7 +109,7 @@ static void ion_chunk_heap_free(struct ion_buffer *buffer)
DMA_BIDIRECTIONAL);
for_each_sg(table->sgl, sg, table->nents, i) {
- gen_pool_free(chunk_heap->pool, page_to_phys(sg_page(sg)),
+ gen_pool_free(chunk_heap->pool, sg_phys(sg) & PAGE_MASK,
sg->length);
}
chunk_heap->allocated -= allocated_size;
--
1.9.1
Use sg_pfn to get the PFN and skip checks that require a kernel
virtual address.
Signed-off-by: Christoph Hellwig <[email protected]>
---
lib/dma-debug.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index dace71f..a215a80 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -1368,7 +1368,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->type = dma_debug_sg;
entry->dev = dev;
- entry->pfn = page_to_pfn(sg_page(s));
+ entry->pfn = sg_pfn(s);
entry->offset = s->offset,
entry->size = sg_dma_len(s);
entry->dev_addr = sg_dma_address(s);
@@ -1376,7 +1376,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
- if (!PageHighMem(sg_page(s))) {
+ if (sg_has_page(s) && !PageHighMem(sg_page(s))) {
check_for_stack(dev, sg_virt(s));
check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
}
@@ -1419,7 +1419,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
+ .pfn = sg_pfn(s),
.offset = s->offset,
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
@@ -1580,7 +1580,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
+ .pfn = sg_pfn(s),
.offset = s->offset,
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
@@ -1613,7 +1613,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
struct dma_debug_entry ref = {
.type = dma_debug_sg,
.dev = dev,
- .pfn = page_to_pfn(sg_page(s)),
+ .pfn = sg_pfn(s),
.offset = s->offset,
.dev_addr = sg_dma_address(s),
.size = sg_dma_len(s),
--
1.9.1
Just remove a BUG_ON; the code handles SG entries without a page backing
just fine as-is.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/x86/kernel/pci-nommu.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index da15918..a218059 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -63,7 +63,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
WARN_ON(nents == 0 || sg[0].length == 0);
for_each_sg(sg, s, nents, i) {
- BUG_ON(!sg_page(s));
s->dma_address = sg_phys(s);
if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
return 0;
--
1.9.1
For the iommu offset we just need an offset into the page. Calculate
that using the physical address instead of using the virtual address
so that we don't require a virtual mapping.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/x86/kernel/pci-calgary_64.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 0497f71..8f1581d 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -368,16 +368,14 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
{
struct iommu_table *tbl = find_iommu_table(dev);
struct scatterlist *s;
- unsigned long vaddr;
+ unsigned long paddr;
unsigned int npages;
unsigned long entry;
int i;
for_each_sg(sg, s, nelems, i) {
- BUG_ON(!sg_page(s));
-
- vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
+ paddr = sg_phys(s);
+ npages = iommu_num_pages(paddr, s->length, PAGE_SIZE);
entry = iommu_range_alloc(dev, tbl, npages);
if (entry == DMA_ERROR_CODE) {
@@ -389,7 +387,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
s->dma_address = (entry << PAGE_SHIFT) | s->offset;
/* insert into HW table */
- tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
+ tce_build(tbl, entry, npages, paddr & PAGE_MASK, dir);
s->dma_length = s->length;
}
--
1.9.1
Use sg_phys() instead of virt_to_phys(sg_virt(sg)) so that we don't
require a kernel virtual address.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/alpha/kernel/pci-noop.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/arch/alpha/kernel/pci-noop.c b/arch/alpha/kernel/pci-noop.c
index df24b76..7319151 100644
--- a/arch/alpha/kernel/pci-noop.c
+++ b/arch/alpha/kernel/pci-noop.c
@@ -145,11 +145,7 @@ static int alpha_noop_map_sg(struct device *dev, struct scatterlist *sgl, int ne
struct scatterlist *sg;
for_each_sg(sgl, sg, nents, i) {
- void *va;
-
- BUG_ON(!sg_page(sg));
- va = sg_virt(sg);
- sg_dma_address(sg) = (dma_addr_t)virt_to_phys(va);
+ sg_dma_address(sg) = (dma_addr_t)sg_phys(sg);
sg_dma_len(sg) = sg->length;
}
--
1.9.1
Use sg_phys() instead of virt_to_phys(sg_virt(sg)) so that we don't
require a kernel virtual address, and switch a few debug printfs to
print physical instead of virtual addresses.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/alpha/kernel/pci_iommu.c | 36 +++++++++++++++---------------------
1 file changed, 15 insertions(+), 21 deletions(-)
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index eddee77..5d46b49 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -248,20 +248,17 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask)
until either pci_unmap_single or pci_dma_sync_single is performed. */
static dma_addr_t
-pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
+pci_map_single_1(struct pci_dev *pdev, unsigned long paddr, size_t size,
int dac_allowed)
{
struct pci_controller *hose = pdev ? pdev->sysdata : pci_isa_hose;
dma_addr_t max_dma = pdev ? pdev->dma_mask : ISA_DMA_MASK;
struct pci_iommu_arena *arena;
long npages, dma_ofs, i;
- unsigned long paddr;
dma_addr_t ret;
unsigned int align = 0;
struct device *dev = pdev ? &pdev->dev : NULL;
- paddr = __pa(cpu_addr);
-
#if !DEBUG_NODIRECT
/* First check to see if we can use the direct map window. */
if (paddr + size + __direct_map_base - 1 <= max_dma
@@ -269,7 +266,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
ret = paddr + __direct_map_base;
DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %pf\n",
- cpu_addr, size, ret, __builtin_return_address(0));
+ paddr, size, ret, __builtin_return_address(0));
return ret;
}
@@ -280,7 +277,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
ret = paddr + alpha_mv.pci_dac_offset;
DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %pf\n",
- cpu_addr, size, ret, __builtin_return_address(0));
+ paddr, size, ret, __builtin_return_address(0));
return ret;
}
@@ -309,15 +306,15 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
return 0;
}
+ offset = paddr & ~PAGE_MASK;
paddr &= PAGE_MASK;
for (i = 0; i < npages; ++i, paddr += PAGE_SIZE)
arena->ptes[i + dma_ofs] = mk_iommu_pte(paddr);
- ret = arena->dma_base + dma_ofs * PAGE_SIZE;
- ret += (unsigned long)cpu_addr & ~PAGE_MASK;
+ ret = arena->dma_base + dma_ofs * PAGE_SIZE + offset;
DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %pf\n",
- cpu_addr, size, npages, ret, __builtin_return_address(0));
+ paddr, size, npages, ret, __builtin_return_address(0));
return ret;
}
@@ -357,7 +354,7 @@ static dma_addr_t alpha_pci_map_page(struct device *dev, struct page *page,
BUG_ON(dir == PCI_DMA_NONE);
dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0;
- return pci_map_single_1(pdev, (char *)page_address(page) + offset,
+ return pci_map_single_1(pdev, page_to_phys(page) + offset,
size, dac_allowed);
}
@@ -453,7 +450,7 @@ try_again:
}
memset(cpu_addr, 0, size);
- *dma_addrp = pci_map_single_1(pdev, cpu_addr, size, 0);
+ *dma_addrp = pci_map_single_1(pdev, __pa(cpu_addr), size, 0);
if (*dma_addrp == 0) {
free_pages((unsigned long)cpu_addr, order);
if (alpha_mv.mv_pci_tbi || (gfp & GFP_DMA))
@@ -497,9 +494,6 @@ static void alpha_pci_free_coherent(struct device *dev, size_t size,
Write dma_length of each leader with the combined lengths of
the mergable followers. */
-#define SG_ENT_VIRT_ADDRESS(SG) (sg_virt((SG)))
-#define SG_ENT_PHYS_ADDRESS(SG) __pa(SG_ENT_VIRT_ADDRESS(SG))
-
static void
sg_classify(struct device *dev, struct scatterlist *sg, struct scatterlist *end,
int virt_ok)
@@ -512,13 +506,13 @@ sg_classify(struct device *dev, struct scatterlist *sg, struct scatterlist *end,
leader = sg;
leader_flag = 0;
leader_length = leader->length;
- next_paddr = SG_ENT_PHYS_ADDRESS(leader) + leader_length;
+ next_paddr = sg_phys(leader) + leader_length;
/* we will not marge sg without device. */
max_seg_size = dev ? dma_get_max_seg_size(dev) : 0;
for (++sg; sg < end; ++sg) {
unsigned long addr, len;
- addr = SG_ENT_PHYS_ADDRESS(sg);
+ addr = sg_phys(sg);
len = sg->length;
if (leader_length + len > max_seg_size)
@@ -555,7 +549,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
struct scatterlist *out, struct pci_iommu_arena *arena,
dma_addr_t max_dma, int dac_allowed)
{
- unsigned long paddr = SG_ENT_PHYS_ADDRESS(leader);
+ unsigned long paddr = sg_phys(leader);
long size = leader->dma_length;
struct scatterlist *sg;
unsigned long *ptes;
@@ -621,7 +615,7 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
#endif
size = sg->length;
- paddr = SG_ENT_PHYS_ADDRESS(sg);
+ paddr = sg_phys(sg);
while (sg+1 < end && (int) sg[1].dma_address == -1) {
size += sg[1].length;
@@ -636,11 +630,11 @@ sg_fill(struct device *dev, struct scatterlist *leader, struct scatterlist *end,
#if DEBUG_ALLOC > 0
DBGA(" (%ld) [%p,%x] np %ld\n",
- last_sg - leader, SG_ENT_VIRT_ADDRESS(last_sg),
+ last_sg - leader, sg_phys(last_sg),
last_sg->length, npages);
while (++last_sg <= sg) {
DBGA(" (%ld) [%p,%x] cont\n",
- last_sg - leader, SG_ENT_VIRT_ADDRESS(last_sg),
+ last_sg - leader, sg_phys(last_sg),
last_sg->length);
}
#endif
@@ -668,7 +662,7 @@ static int alpha_pci_map_sg(struct device *dev, struct scatterlist *sg,
if (nents == 1) {
sg->dma_length = sg->length;
sg->dma_address
- = pci_map_single_1(pdev, SG_ENT_VIRT_ADDRESS(sg),
+ = pci_map_single_1(pdev, sg_phys(sg),
sg->length, dac_allowed);
return sg->dma_address != 0;
}
--
1.9.1
Use sg_phys() instead of virt_to_phys(sg_virt(sg)) so that we don't
require a kernel virtual address.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/c6x/kernel/dma.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index ab7b12d..79cae03 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
@@ -68,8 +68,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist,
int i;
for_each_sg(sglist, sg, nents, i)
- sg->dma_address = dma_map_single(dev, sg_virt(sg), sg->length,
- dir);
+ sg->dma_address = sg_phys(sg);
debug_dma_map_sg(dev, sglist, nents, nents, dir);
--
1.9.1
Use sg_phys() instead of virt_to_phys(sg_virt(sg)) so that we don't
require a kernel virtual address.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/ia64/sn/pci/pci_dma.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index d0853e8..8f713c8 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -18,9 +18,6 @@
#include <asm/sn/pcidev.h>
#include <asm/sn/sn_sal.h>
-#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
-#define SG_ENT_PHYS_ADDRESS(SG) virt_to_phys(SG_ENT_VIRT_ADDRESS(SG))
-
/**
* sn_dma_supported - test a DMA mask
* @dev: device to test
@@ -291,7 +288,7 @@ static int sn_dma_map_sg(struct device *dev, struct scatterlist *sgl,
*/
for_each_sg(sgl, sg, nhwentries, i) {
dma_addr_t dma_addr;
- phys_addr = SG_ENT_PHYS_ADDRESS(sg);
+ phys_addr = sg_phys(sg);
if (dmabarr)
dma_addr = provider->dma_map_consistent(pdev,
phys_addr,
--
1.9.1
For the iommu offset we just need an offset into the page. Calculate
that using the physical address instead of using the virtual address
so that we don't require a virtual mapping.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/powerpc/kernel/iommu.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index a8e3490..0f52e40 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -457,7 +457,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
max_seg_size = dma_get_max_seg_size(dev);
for_each_sg(sglist, s, nelems, i) {
- unsigned long vaddr, npages, entry, slen;
+ unsigned long paddr, npages, entry, slen;
slen = s->length;
/* Sanity check */
@@ -466,22 +466,22 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
continue;
}
/* Allocate iommu entries for that segment */
- vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
+ paddr = sg_phys(s);
+ npages = iommu_num_pages(paddr, slen, IOMMU_PAGE_SIZE(tbl));
align = 0;
if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
- (vaddr & ~PAGE_MASK) == 0)
+ (paddr & ~PAGE_MASK) == 0)
align = PAGE_SHIFT - tbl->it_page_shift;
entry = iommu_range_alloc(dev, tbl, npages, &handle,
mask >> tbl->it_page_shift, align);
- DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
+ DBG(" - paddr: %lx, size: %lx\n", paddr, slen);
/* Handle failure */
if (unlikely(entry == DMA_ERROR_CODE)) {
if (printk_ratelimit())
dev_info(dev, "iommu_alloc failed, tbl %p "
- "vaddr %lx npages %lu\n", tbl, vaddr,
+ "paddr %lx npages %lu\n", tbl, paddr,
npages);
goto failure;
}
@@ -496,7 +496,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
/* Insert into HW table */
build_fail = tbl->it_ops->set(tbl, entry, npages,
- vaddr & IOMMU_PAGE_MASK(tbl),
+ paddr & IOMMU_PAGE_MASK(tbl),
direction, attrs);
if(unlikely(build_fail))
goto failure;
--
1.9.1
Use sg_phys() instead of __pa(sg_virt(sg)) so that we don't
require a kernel virtual address.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/sparc/kernel/iommu.c | 2 +-
arch/sparc/kernel/iommu_common.h | 4 +---
arch/sparc/kernel/pci_sun4v.c | 2 +-
3 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 5320689..2ad89d2 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -486,7 +486,7 @@ static int dma_4u_map_sg(struct device *dev, struct scatterlist *sglist,
continue;
}
/* Allocate iommu entries for that segment */
- paddr = (unsigned long) SG_ENT_PHYS_ADDRESS(s);
+ paddr = sg_phys(s);
npages = iommu_num_pages(paddr, slen, IO_PAGE_SIZE);
entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages,
&handle, (unsigned long)(-1), 0);
diff --git a/arch/sparc/kernel/iommu_common.h b/arch/sparc/kernel/iommu_common.h
index b40cec2..8e2c211 100644
--- a/arch/sparc/kernel/iommu_common.h
+++ b/arch/sparc/kernel/iommu_common.h
@@ -33,15 +33,13 @@
*/
#define IOMMU_PAGE_SHIFT 13
-#define SG_ENT_PHYS_ADDRESS(SG) (__pa(sg_virt((SG))))
-
static inline int is_span_boundary(unsigned long entry,
unsigned long shift,
unsigned long boundary_size,
struct scatterlist *outs,
struct scatterlist *sg)
{
- unsigned long paddr = SG_ENT_PHYS_ADDRESS(outs);
+ unsigned long paddr = sg_phys(outs);
int nr = iommu_num_pages(paddr, outs->dma_length + sg->length,
IO_PAGE_SIZE);
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index d2fe57d..a7a6e41 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -370,7 +370,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist,
continue;
}
/* Allocate iommu entries for that segment */
- paddr = (unsigned long) SG_ENT_PHYS_ADDRESS(s);
+ paddr = sg_phys(s);
npages = iommu_num_pages(paddr, slen, IO_PAGE_SIZE);
entry = iommu_tbl_range_alloc(dev, &iommu->tbl, npages,
&handle, (unsigned long)(-1), 0);
--
1.9.1
Just remove a BUG_ON; the code handles SG entries without a page backing
just fine as-is.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/mn10300/include/asm/dma-mapping.h | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h
index a18abfc..b1b1050 100644
--- a/arch/mn10300/include/asm/dma-mapping.h
+++ b/arch/mn10300/include/asm/dma-mapping.h
@@ -57,11 +57,8 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
BUG_ON(!valid_dma_direction(direction));
WARN_ON(nents == 0 || sglist[0].length == 0);
- for_each_sg(sglist, sg, nents, i) {
- BUG_ON(!sg_page(sg));
-
+ for_each_sg(sglist, sg, nents, i)
sg->dma_address = sg_phys(sg);
- }
mn10300_dcache_flush_inv();
return nents;
--
1.9.1
Use
sg_phys(sg) & PAGE_MASK
instead of
page_to_pfn(sg_page(sg)) << PAGE_SHIFT
to get the page-aligned physical address of an SG entry, so that we don't
require a page backing for SG entries. For page-backed entries the two
expressions are equivalent, as masking sg_phys() with PAGE_MASK simply
drops sg->offset.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/sparc/kernel/ldc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 1ae5eb1..0a29974 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -2051,7 +2051,7 @@ static void fill_cookies(struct cookie_state *sp, unsigned long pa,
static int sg_count_one(struct scatterlist *sg)
{
- unsigned long base = page_to_pfn(sg_page(sg)) << PAGE_SHIFT;
+ unsigned long base = sg_phys(sg) & PAGE_MASK;
long len = sg->length;
if ((sg->offset | len) & (8UL - 1))
@@ -2114,7 +2114,7 @@ int ldc_map_sg(struct ldc_channel *lp,
state.nc = 0;
for_each_sg(sg, s, num_sg, i) {
- fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT,
+ fill_cookies(&state, sg_phys(s) & PAGE_MASK,
s->offset, s->length);
}
--
1.9.1
For the iommu offset we just need an offset into the page. Calculate
that using the physical address instead of using the virtual address
so that we don't require a virtual mapping.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/sparc/mm/io-unit.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c
index f311bf2..82f97ae 100644
--- a/arch/sparc/mm/io-unit.c
+++ b/arch/sparc/mm/io-unit.c
@@ -91,13 +91,14 @@ static int __init iounit_init(void)
subsys_initcall(iounit_init);
/* One has to hold iounit->lock to call this */
-static unsigned long iounit_get_area(struct iounit_struct *iounit, unsigned long vaddr, int size)
+static dma_addr_t iounit_get_area(struct iounit_struct *iounit,
+ unsigned long paddr, int size)
{
int i, j, k, npages;
- unsigned long rotor, scan, limit;
+ unsigned long rotor, scan, limit, dma_addr;
iopte_t iopte;
- npages = ((vaddr & ~PAGE_MASK) + size + (PAGE_SIZE-1)) >> PAGE_SHIFT;
+ npages = ((paddr & ~PAGE_MASK) + size + (PAGE_SIZE-1)) >> PAGE_SHIFT;
/* A tiny bit of magic ingredience :) */
switch (npages) {
@@ -106,7 +107,7 @@ static unsigned long iounit_get_area(struct iounit_struct *iounit, unsigned long
default: i = 0x0213; break;
}
- IOD(("iounit_get_area(%08lx,%d[%d])=", vaddr, size, npages));
+ IOD(("iounit_get_area(%08lx,%d[%d])=", paddr, size, npages));
next: j = (i & 15);
rotor = iounit->rotor[j - 1];
@@ -121,7 +122,7 @@ nexti: scan = find_next_zero_bit(iounit->bmap, limit, scan);
}
i >>= 4;
if (!(i & 15))
- panic("iounit_get_area: Couldn't find free iopte slots for (%08lx,%d)\n", vaddr, size);
+ panic("iounit_get_area: Couldn't find free iopte slots for (%08lx,%d)\n", paddr, size);
goto next;
}
for (k = 1, scan++; k < npages; k++)
@@ -129,14 +130,14 @@ nexti: scan = find_next_zero_bit(iounit->bmap, limit, scan);
goto nexti;
iounit->rotor[j - 1] = (scan < limit) ? scan : iounit->limit[j - 1];
scan -= npages;
- iopte = MKIOPTE(__pa(vaddr & PAGE_MASK));
- vaddr = IOUNIT_DMA_BASE + (scan << PAGE_SHIFT) + (vaddr & ~PAGE_MASK);
+ iopte = MKIOPTE(paddr & PAGE_MASK);
+ dma_addr = IOUNIT_DMA_BASE + (scan << PAGE_SHIFT) + (paddr & ~PAGE_MASK);
for (k = 0; k < npages; k++, iopte = __iopte(iopte_val(iopte) + 0x100), scan++) {
set_bit(scan, iounit->bmap);
sbus_writel(iopte, &iounit->page_table[scan]);
}
- IOD(("%08lx\n", vaddr));
- return vaddr;
+ IOD(("%08lx\n", dma_addr));
+ return dma_addr;
}
static __u32 iounit_get_scsi_one(struct device *dev, char *vaddr, unsigned long len)
@@ -145,7 +146,7 @@ static __u32 iounit_get_scsi_one(struct device *dev, char *vaddr, unsigned long
unsigned long ret, flags;
spin_lock_irqsave(&iounit->lock, flags);
- ret = iounit_get_area(iounit, (unsigned long)vaddr, len);
+ ret = iounit_get_area(iounit, virt_to_phys(vaddr), len);
spin_unlock_irqrestore(&iounit->lock, flags);
return ret;
}
@@ -159,7 +160,7 @@ static void iounit_get_scsi_sgl(struct device *dev, struct scatterlist *sg, int
spin_lock_irqsave(&iounit->lock, flags);
while (sz != 0) {
--sz;
- sg->dma_address = iounit_get_area(iounit, (unsigned long) sg_virt(sg), sg->length);
+ sg->dma_address = iounit_get_area(iounit, sg_phys(sg), sg->length);
sg->dma_length = sg->length;
sg = sg_next(sg);
}
--
1.9.1
Pass a PFN to iommu_get_one instead of calculating it locally from a
page structure, so that we don't need pages for every address we can
DMA to or from.
Also further restrict the cache flushing, as we now have a non-highmem
way to detect physical addresses that are not kernel virtual mapped.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/sparc/mm/iommu.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 491511d..3ed53d7 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -174,7 +174,7 @@ static void iommu_flush_iotlb(iopte_t *iopte, unsigned int niopte)
}
}
-static u32 iommu_get_one(struct device *dev, struct page *page, int npages)
+static u32 iommu_get_one(struct device *dev, unsigned long pfn, int npages)
{
struct iommu_struct *iommu = dev->archdata.iommu;
int ioptex;
@@ -183,7 +183,7 @@ static u32 iommu_get_one(struct device *dev, struct page *page, int npages)
int i;
/* page color = pfn of page */
- ioptex = bit_map_string_get(&iommu->usemap, npages, page_to_pfn(page));
+ ioptex = bit_map_string_get(&iommu->usemap, npages, pfn);
if (ioptex < 0)
panic("iommu out");
busa0 = iommu->start + (ioptex << PAGE_SHIFT);
@@ -192,11 +192,11 @@ static u32 iommu_get_one(struct device *dev, struct page *page, int npages)
busa = busa0;
iopte = iopte0;
for (i = 0; i < npages; i++) {
- iopte_val(*iopte) = MKIOPTE(page_to_pfn(page), IOPERM);
+ iopte_val(*iopte) = MKIOPTE(pfn, IOPERM);
iommu_invalidate_page(iommu->regs, busa);
busa += PAGE_SIZE;
iopte++;
- page++;
+ pfn++;
}
iommu_flush_iotlb(iopte0, npages);
@@ -214,7 +214,7 @@ static u32 iommu_get_scsi_one(struct device *dev, char *vaddr, unsigned int len)
off = (unsigned long)vaddr & ~PAGE_MASK;
npages = (off + len + PAGE_SIZE-1) >> PAGE_SHIFT;
page = virt_to_page((unsigned long)vaddr & PAGE_MASK);
- busa = iommu_get_one(dev, page, npages);
+ busa = iommu_get_one(dev, page_to_pfn(page), npages);
return busa + off;
}
@@ -243,7 +243,7 @@ static void iommu_get_scsi_sgl_gflush(struct device *dev, struct scatterlist *sg
while (sz != 0) {
--sz;
n = (sg->length + sg->offset + PAGE_SIZE-1) >> PAGE_SHIFT;
- sg->dma_address = iommu_get_one(dev, sg_page(sg), n) + sg->offset;
+ sg->dma_address = iommu_get_one(dev, sg_pfn(sg), n) + sg->offset;
sg->dma_length = sg->length;
sg = sg_next(sg);
}
@@ -264,7 +264,8 @@ static void iommu_get_scsi_sgl_pflush(struct device *dev, struct scatterlist *sg
* XXX Is this a good assumption?
* XXX What if someone else unmaps it here and races us?
*/
- if ((page = (unsigned long) page_address(sg_page(sg))) != 0) {
+ if (sg_has_page(sg) &&
+ (page = (unsigned long) page_address(sg_page(sg))) != 0) {
for (i = 0; i < n; i++) {
if (page != oldpage) { /* Already flushed? */
flush_page_for_dma(page);
@@ -274,7 +275,7 @@ static void iommu_get_scsi_sgl_pflush(struct device *dev, struct scatterlist *sg
}
}
- sg->dma_address = iommu_get_one(dev, sg_page(sg), n) + sg->offset;
+ sg->dma_address = iommu_get_one(dev, sg_pfn(sg), n) + sg->offset;
sg->dma_length = sg->length;
sg = sg_next(sg);
}
--
1.9.1
Use sg_phys() instead of page_to_phys(sg_page(sg)) so that we don't
require a page structure for all DMA memory.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/s390/pci/pci_dma.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 6fd8d58..aae5a47 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -272,14 +272,13 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL_GPL(dma_set_mask);
-static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
+static dma_addr_t s390_dma_map_phys(struct device *dev, unsigned long pa,
+ size_t size,
enum dma_data_direction direction,
struct dma_attrs *attrs)
{
struct zpci_dev *zdev = get_zdev(to_pci_dev(dev));
unsigned long nr_pages, iommu_page_index;
- unsigned long pa = page_to_phys(page) + offset;
int flags = ZPCI_PTE_VALID;
dma_addr_t dma_addr;
@@ -301,7 +300,7 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
if (!dma_update_trans(zdev, pa, dma_addr, size, flags)) {
atomic64_add(nr_pages, &zdev->mapped_pages);
- return dma_addr + (offset & ~PAGE_MASK);
+ return dma_addr + (pa & ~PAGE_MASK);
}
out_free:
@@ -312,6 +311,16 @@ out_err:
return DMA_ERROR_CODE;
}
+static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ unsigned long pa = page_to_phys(page) + offset;
+
+ return s390_dma_map_phys(dev, pa, size, direction, attrs);
+}
+
static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction direction,
struct dma_attrs *attrs)
@@ -384,8 +393,7 @@ static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
int i;
for_each_sg(sg, s, nr_elements, i) {
- struct page *page = sg_page(s);
- s->dma_address = s390_dma_map_pages(dev, page, s->offset,
+ s->dma_address = s390_dma_map_phys(dev, sg_phys(s),
s->length, dir, NULL);
if (!dma_mapping_error(dev, s->dma_address)) {
s->dma_length = s->length;
--
1.9.1
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/ia64/hp/common/sba_iommu.c | 22 ++++++++++------------
1 file changed, 10 insertions(+), 12 deletions(-)
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 344387a..9e5aa8e 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -248,8 +248,6 @@ static int reserve_sba_gart = 1;
static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);
-#define sba_sg_address(sg) sg_virt((sg))
-
#ifdef FULL_VALID_PDIR
static u64 prefetch_spill_page;
#endif
@@ -397,7 +395,7 @@ sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
while (nents-- > 0) {
printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
startsg->dma_address, startsg->dma_length,
- sba_sg_address(startsg));
+ sg_virt(startsg));
startsg = sg_next(startsg);
}
}
@@ -409,7 +407,7 @@ sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
int the_nents = nents;
while (the_nents-- > 0) {
- if (sba_sg_address(the_sg) == 0x0UL)
+ if (sg_virt(the_sg) == 0x0UL)
sba_dump_sg(NULL, startsg, nents);
the_sg = sg_next(the_sg);
}
@@ -1243,11 +1241,11 @@ sba_fill_pdir(
if (dump_run_sg)
printk(" %2d : %08lx/%05x %p\n",
nents, startsg->dma_address, cnt,
- sba_sg_address(startsg));
+ sg_virt(startsg));
#else
DBG_RUN_SG(" %d : %08lx/%05x %p\n",
nents, startsg->dma_address, cnt,
- sba_sg_address(startsg));
+ sg_virt(startsg));
#endif
/*
** Look for the start of a new DMA stream
@@ -1267,7 +1265,7 @@ sba_fill_pdir(
** Look for a VCONTIG chunk
*/
if (cnt) {
- unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
+ unsigned long vaddr = (unsigned long) sg_virt(startsg);
ASSERT(pdirp);
/* Since multiple Vcontig blocks could make up
@@ -1335,7 +1333,7 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
int idx;
while (nents > 0) {
- unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
+ unsigned long vaddr = (unsigned long) sg_virt(startsg);
/*
** Prepare for first/next DMA stream
@@ -1380,7 +1378,7 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
**
** append the next transaction?
*/
- vaddr = (unsigned long) sba_sg_address(startsg);
+ vaddr = (unsigned long) sg_virt(startsg);
if (vcontig_end == vaddr)
{
vcontig_len += startsg->length;
@@ -1479,7 +1477,7 @@ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) {
for_each_sg(sglist, sg, nents, filled) {
sg->dma_length = sg->length;
- sg->dma_address = virt_to_phys(sba_sg_address(sg));
+ sg->dma_address = virt_to_phys(sg_virt(sg));
}
return filled;
}
@@ -1487,7 +1485,7 @@ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
/* Fast path single entry scatterlists. */
if (nents == 1) {
sglist->dma_length = sglist->length;
- sglist->dma_address = sba_map_single_attrs(dev, sba_sg_address(sglist), sglist->length, dir, attrs);
+ sglist->dma_address = sba_map_single_attrs(dev, sg_virt(sglist), sglist->length, dir, attrs);
return 1;
}
@@ -1563,7 +1561,7 @@ static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
#endif
DBG_RUN_SG("%s() START %d entries, %p,%x\n",
- __func__, nents, sba_sg_address(sglist), sglist->length);
+ __func__, nents, sg_virt(sglist), sglist->length);
#ifdef ASSERT_PDIR_SANITY
ioc = GET_IOC(dev);
--
1.9.1
Make all cache invalidation conditional on sg_has_page() and use
sg_phys to get the physical address directly.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/nios2/mm/dma-mapping.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c
index ac5da75..1a0a68d 100644
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
@@ -64,13 +64,11 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
BUG_ON(!valid_dma_direction(direction));
for_each_sg(sg, sg, nents, i) {
- void *addr;
-
- addr = sg_virt(sg);
- if (addr) {
- __dma_sync_for_device(addr, sg->length, direction);
- sg->dma_address = sg_phys(sg);
+ if (sg_has_page(sg)) {
+ __dma_sync_for_device(sg_virt(sg), sg->length,
+ direction);
}
+ sg->dma_address = sg_phys(sg);
}
return nents;
@@ -113,9 +111,8 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
return;
for_each_sg(sg, sg, nhwentries, i) {
- addr = sg_virt(sg);
- if (addr)
- __dma_sync_for_cpu(addr, sg->length, direction);
+ if (sg_has_page(sg))
+ __dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
}
}
EXPORT_SYMBOL(dma_unmap_sg);
@@ -166,8 +163,10 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
BUG_ON(!valid_dma_direction(direction));
/* Make sure that gcc doesn't leave the empty loop body. */
- for_each_sg(sg, sg, nelems, i)
- __dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
+ for_each_sg(sg, sg, nelems, i) {
+ if (sg_has_page(sg))
+ __dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
+ }
}
EXPORT_SYMBOL(dma_sync_sg_for_cpu);
@@ -179,8 +178,10 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(direction));
/* Make sure that gcc doesn't leave the empty loop body. */
- for_each_sg(sg, sg, nelems, i)
- __dma_sync_for_device(sg_virt(sg), sg->length, direction);
-
+ for_each_sg(sg, sg, nelems, i) {
+ if (sg_has_page(sg))
+ __dma_sync_for_device(sg_virt(sg), sg->length,
+ direction);
+ }
}
EXPORT_SYMBOL(dma_sync_sg_for_device);
--
1.9.1
Make all cache invalidation conditional on sg_has_page() and use
sg_phys to get the physical address directly.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/arc/include/asm/dma-mapping.h | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
index 2d28ba9..42eb526 100644
--- a/arch/arc/include/asm/dma-mapping.h
+++ b/arch/arc/include/asm/dma-mapping.h
@@ -108,9 +108,13 @@ dma_map_sg(struct device *dev, struct scatterlist *sg,
struct scatterlist *s;
int i;
- for_each_sg(sg, s, nents, i)
- s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
- s->length, dir);
+ for_each_sg(sg, s, nents, i) {
+ if (sg_has_page(s)) {
+ _dma_cache_sync((unsigned long)sg_virt(s), s->length,
+ dir);
+ }
+ s->dma_address = sg_phys(s);
+ }
return nents;
}
@@ -163,8 +167,12 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nelems, i)
- _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir);
+ for_each_sg(sglist, sg, nelems, i) {
+ if (sg_has_page(sg)) {
+ _dma_cache_sync((unsigned int)sg_virt(sg), sg->length,
+ dir);
+ }
+ }
}
static inline void
@@ -174,8 +182,12 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nelems, i)
- _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir);
+ for_each_sg(sglist, sg, nelems, i) {
+ if (sg_has_page(sg)) {
+ _dma_cache_sync((unsigned int)sg_virt(sg), sg->length,
+ dir);
+ }
+ }
}
static inline int dma_supported(struct device *dev, u64 dma_mask)
--
1.9.1
Make all cache invalidation conditional on sg_has_page() and use
sg_phys to get the physical address directly, bypassing the noop
page_to_bus.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/avr32/include/asm/dma-mapping.h | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
index ae7ac92..a662ce2 100644
--- a/arch/avr32/include/asm/dma-mapping.h
+++ b/arch/avr32/include/asm/dma-mapping.h
@@ -216,11 +216,9 @@ dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
struct scatterlist *sg;
for_each_sg(sglist, sg, nents, i) {
- char *virt;
-
- sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
- virt = sg_virt(sg);
- dma_cache_sync(dev, virt, sg->length, direction);
+ sg->dma_address = sg_phys(sg);
+ if (sg_has_page(sg))
+ dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
}
return nents;
@@ -328,8 +326,10 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nents, i)
- dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
+ for_each_sg(sglist, sg, nents, i) {
+ if (sg_has_page(sg))
+ dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
+ }
}
/* Now for the API extensions over the pci_ one */
--
1.9.1
Switch from sg_virt to sg_phys, as blackfin, like all nommu architectures,
has a 1:1 virtual to physical mapping.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/blackfin/kernel/dma-mapping.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index df437e5..e2c4d1a 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -120,7 +120,7 @@ dma_map_sg(struct device *dev, struct scatterlist *sg_list, int nents,
int i;
for_each_sg(sg_list, sg, nents, i) {
- sg->dma_address = (dma_addr_t) sg_virt(sg);
+ sg->dma_address = sg_phys(sg);
__dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction);
}
@@ -135,7 +135,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg_list,
int i;
for_each_sg(sg_list, sg, nelems, i) {
- sg->dma_address = (dma_addr_t) sg_virt(sg);
+ sg->dma_address = sg_phys(sg);
__dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction);
}
}
--
1.9.1
Make all cache invalidation conditional on sg_has_page().
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/metag/include/asm/dma-mapping.h | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index eb5cdec..2ae9057 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -55,10 +55,9 @@ dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
WARN_ON(nents == 0 || sglist[0].length == 0);
for_each_sg(sglist, sg, nents, i) {
- BUG_ON(!sg_page(sg));
-
sg->dma_address = sg_phys(sg);
- dma_sync_for_device(sg_virt(sg), sg->length, direction);
+ if (sg_has_page(sg))
+ dma_sync_for_device(sg_virt(sg), sg->length, direction);
}
return nents;
@@ -94,10 +93,9 @@ dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nhwentries,
WARN_ON(nhwentries == 0 || sglist[0].length == 0);
for_each_sg(sglist, sg, nhwentries, i) {
- BUG_ON(!sg_page(sg));
-
sg->dma_address = sg_phys(sg);
- dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
+ if (sg_has_page(sg))
+ dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
}
}
@@ -140,8 +138,10 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nelems, i)
- dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
+ for_each_sg(sglist, sg, nelems, i) {
+ if (sg_has_page(sg))
+ dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
+ }
}
static inline void
@@ -151,8 +151,10 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nelems, i)
- dma_sync_for_device(sg_virt(sg), sg->length, direction);
+ for_each_sg(sglist, sg, nelems, i) {
+ if (sg_has_page(sg))
+ dma_sync_for_device(sg_virt(sg), sg->length, direction);
+ }
}
static inline int
--
1.9.1
Make all cache invalidation conditional on sg_has_page().
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/sh/kernel/dma-nommu.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index 5b0bfcd..3b64dc7 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -33,9 +33,8 @@ static int nommu_map_sg(struct device *dev, struct scatterlist *sg,
WARN_ON(nents == 0 || sg[0].length == 0);
for_each_sg(sg, s, nents, i) {
- BUG_ON(!sg_page(s));
-
- dma_cache_sync(dev, sg_virt(s), s->length, dir);
+ if (sg_has_page(s))
+ dma_cache_sync(dev, sg_virt(s), s->length, dir);
s->dma_address = sg_phys(s);
s->dma_length = s->length;
@@ -57,8 +56,10 @@ static void nommu_sync_sg(struct device *dev, struct scatterlist *sg,
struct scatterlist *s;
int i;
- for_each_sg(sg, s, nelems, i)
- dma_cache_sync(dev, sg_virt(s), s->length, dir);
+ for_each_sg(sg, s, nelems, i) {
+ if (sg_has_page(s))
+ dma_cache_sync(dev, sg_virt(s), s->length, dir);
+ }
}
#endif
--
1.9.1
Make all cache invalidation conditional on sg_has_page().
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/xtensa/include/asm/dma-mapping.h | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 1f5f6dc..262a1d1 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -61,10 +61,9 @@ dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
BUG_ON(direction == DMA_NONE);
for_each_sg(sglist, sg, nents, i) {
- BUG_ON(!sg_page(sg));
-
sg->dma_address = sg_phys(sg);
- consistent_sync(sg_virt(sg), sg->length, direction);
+ if (sg_has_page(sg))
+ consistent_sync(sg_virt(sg), sg->length, direction);
}
return nents;
@@ -131,8 +130,10 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nelems, i)
- consistent_sync(sg_virt(sg), sg->length, dir);
+ for_each_sg(sglist, sg, nelems, i) {
+ if (sg_has_page(sg))
+ consistent_sync(sg_virt(sg), sg->length, dir);
+ }
}
static inline void
@@ -142,8 +143,10 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
int i;
struct scatterlist *sg;
- for_each_sg(sglist, sg, nelems, i)
- consistent_sync(sg_virt(sg), sg->length, dir);
+ for_each_sg(sglist, sg, nelems, i) {
+ if (sg_has_page(sg))
+ consistent_sync(sg_virt(sg), sg->length, dir);
+ }
}
static inline int
dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
--
1.9.1
Only call kmap_atomic_primary when the SG entry is mapped into
kernel virtual space.
XXX: the code already looks odd due to the lack of pairing between
kmap_atomic_primary and kunmap_atomic_primary. Does it work either
before or after this patch?
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/frv/mb93090-mb00/pci-dma.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index 4d1f01d..77b3a1c 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -63,6 +63,9 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
dampr2 = __get_DAMPR(2);
for_each_sg(sglist, sg, nents, i) {
+ if (!sg_has_page(sg))
+ continue;
+
vaddr = kmap_atomic_primary(sg_page(sg));
frv_dcache_writeback((unsigned long) vaddr,
--
1.9.1
Make all cache invalidation conditional on sg_has_page() and use
sg_phys to get the physical address directly.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/openrisc/kernel/dma.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 0b77ddb..94ed052 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -184,8 +184,13 @@ or1k_map_sg(struct device *dev, struct scatterlist *sg,
int i;
for_each_sg(sg, s, nents, i) {
- s->dma_address = or1k_map_page(dev, sg_page(s), s->offset,
- s->length, dir, NULL);
+ if (sg_has_page(s)) {
+ s->dma_address = or1k_map_page(dev, sg_page(s),
+ s->offset, s->length, dir,
+ NULL);
+ } else {
+ s->dma_address = sg_phys(s);
+ }
}
return nents;
--
1.9.1
Make all cache invalidation conditional on sg_has_page() and use
sg_phys to get the physical address directly. To do this, consolidate
the two platform callouts that take pages and virtual addresses into a
single one that takes a physical address.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/mips/bmips/dma.c | 9 ++------
arch/mips/include/asm/mach-ath25/dma-coherence.h | 10 ++-------
arch/mips/include/asm/mach-bmips/dma-coherence.h | 4 ++--
.../include/asm/mach-cavium-octeon/dma-coherence.h | 11 ++--------
arch/mips/include/asm/mach-generic/dma-coherence.h | 12 +++--------
arch/mips/include/asm/mach-ip27/dma-coherence.h | 16 +++-----------
arch/mips/include/asm/mach-ip32/dma-coherence.h | 19 +++-------------
arch/mips/include/asm/mach-jazz/dma-coherence.h | 11 +++-------
.../include/asm/mach-loongson64/dma-coherence.h | 16 +++-----------
arch/mips/mm/dma-default.c | 25 ++++++++++++----------
10 files changed, 37 insertions(+), 96 deletions(-)
diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c
index 04790f4..13fc891 100644
--- a/arch/mips/bmips/dma.c
+++ b/arch/mips/bmips/dma.c
@@ -52,14 +52,9 @@ static dma_addr_t bmips_phys_to_dma(struct device *dev, phys_addr_t pa)
return pa;
}
-dma_addr_t plat_map_dma_mem(struct device *dev, void *addr, size_t size)
+dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys, size_t size)
{
- return bmips_phys_to_dma(dev, virt_to_phys(addr));
-}
-
-dma_addr_t plat_map_dma_mem_page(struct device *dev, struct page *page)
-{
- return bmips_phys_to_dma(dev, page_to_phys(page));
+ return bmips_phys_to_dma(dev, phys);
}
unsigned long plat_dma_addr_to_phys(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/mips/include/asm/mach-ath25/dma-coherence.h b/arch/mips/include/asm/mach-ath25/dma-coherence.h
index d5defdd..4330de6 100644
--- a/arch/mips/include/asm/mach-ath25/dma-coherence.h
+++ b/arch/mips/include/asm/mach-ath25/dma-coherence.h
@@ -31,15 +31,9 @@ static inline dma_addr_t ath25_dev_offset(struct device *dev)
}
static inline dma_addr_t
-plat_map_dma_mem(struct device *dev, void *addr, size_t size)
+plat_map_dma_mem(struct device *dev, phys_addr_t phys, size_t size)
{
- return virt_to_phys(addr) + ath25_dev_offset(dev);
-}
-
-static inline dma_addr_t
-plat_map_dma_mem_page(struct device *dev, struct page *page)
-{
- return page_to_phys(page) + ath25_dev_offset(dev);
+ return phys + ath25_dev_offset(dev);
}
static inline unsigned long
diff --git a/arch/mips/include/asm/mach-bmips/dma-coherence.h b/arch/mips/include/asm/mach-bmips/dma-coherence.h
index d29781f..1b9a7f4 100644
--- a/arch/mips/include/asm/mach-bmips/dma-coherence.h
+++ b/arch/mips/include/asm/mach-bmips/dma-coherence.h
@@ -21,8 +21,8 @@
struct device;
-extern dma_addr_t plat_map_dma_mem(struct device *dev, void *addr, size_t size);
-extern dma_addr_t plat_map_dma_mem_page(struct device *dev, struct page *page);
+extern dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
+ size_t size);
extern unsigned long plat_dma_addr_to_phys(struct device *dev,
dma_addr_t dma_addr);
diff --git a/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h b/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h
index 460042e..d0988c7 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/dma-coherence.h
@@ -19,15 +19,8 @@ struct device;
extern void octeon_pci_dma_init(void);
-static inline dma_addr_t plat_map_dma_mem(struct device *dev, void *addr,
- size_t size)
-{
- BUG();
- return 0;
-}
-
-static inline dma_addr_t plat_map_dma_mem_page(struct device *dev,
- struct page *page)
+static inline dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
+ size_t size)
{
BUG();
return 0;
diff --git a/arch/mips/include/asm/mach-generic/dma-coherence.h b/arch/mips/include/asm/mach-generic/dma-coherence.h
index 0f8a354..2dfb133 100644
--- a/arch/mips/include/asm/mach-generic/dma-coherence.h
+++ b/arch/mips/include/asm/mach-generic/dma-coherence.h
@@ -11,16 +11,10 @@
struct device;
-static inline dma_addr_t plat_map_dma_mem(struct device *dev, void *addr,
- size_t size)
+static inline dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
+ size_t size)
{
- return virt_to_phys(addr);
-}
-
-static inline dma_addr_t plat_map_dma_mem_page(struct device *dev,
- struct page *page)
-{
- return page_to_phys(page);
+ return phys;
}
static inline unsigned long plat_dma_addr_to_phys(struct device *dev,
diff --git a/arch/mips/include/asm/mach-ip27/dma-coherence.h b/arch/mips/include/asm/mach-ip27/dma-coherence.h
index 1daa644..2578b9d 100644
--- a/arch/mips/include/asm/mach-ip27/dma-coherence.h
+++ b/arch/mips/include/asm/mach-ip27/dma-coherence.h
@@ -18,20 +18,10 @@
struct device;
-static inline dma_addr_t plat_map_dma_mem(struct device *dev, void *addr,
- size_t size)
+static inline dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
+ size_t size)
{
- dma_addr_t pa = dev_to_baddr(dev, virt_to_phys(addr));
-
- return pa;
-}
-
-static inline dma_addr_t plat_map_dma_mem_page(struct device *dev,
- struct page *page)
-{
- dma_addr_t pa = dev_to_baddr(dev, page_to_phys(page));
-
- return pa;
+ return dev_to_baddr(dev, phys);
}
static inline unsigned long plat_dma_addr_to_phys(struct device *dev,
diff --git a/arch/mips/include/asm/mach-ip32/dma-coherence.h b/arch/mips/include/asm/mach-ip32/dma-coherence.h
index 0a0b0e2..a5e8d75 100644
--- a/arch/mips/include/asm/mach-ip32/dma-coherence.h
+++ b/arch/mips/include/asm/mach-ip32/dma-coherence.h
@@ -26,23 +26,10 @@ struct device;
#define RAM_OFFSET_MASK 0x3fffffffUL
-static inline dma_addr_t plat_map_dma_mem(struct device *dev, void *addr,
- size_t size)
+static inline dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
+ size_t size)
{
- dma_addr_t pa = virt_to_phys(addr) & RAM_OFFSET_MASK;
-
- if (dev == NULL)
- pa += CRIME_HI_MEM_BASE;
-
- return pa;
-}
-
-static inline dma_addr_t plat_map_dma_mem_page(struct device *dev,
- struct page *page)
-{
- dma_addr_t pa;
-
- pa = page_to_phys(page) & RAM_OFFSET_MASK;
+ dma_addr_t pa = phys & RAM_OFFSET_MASK;
if (dev == NULL)
pa += CRIME_HI_MEM_BASE;
diff --git a/arch/mips/include/asm/mach-jazz/dma-coherence.h b/arch/mips/include/asm/mach-jazz/dma-coherence.h
index dc347c2..7739782 100644
--- a/arch/mips/include/asm/mach-jazz/dma-coherence.h
+++ b/arch/mips/include/asm/mach-jazz/dma-coherence.h
@@ -12,15 +12,10 @@
struct device;
-static inline dma_addr_t plat_map_dma_mem(struct device *dev, void *addr, size_t size)
+static inline dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
+ size_t size)
{
- return vdma_alloc(virt_to_phys(addr), size);
-}
-
-static inline dma_addr_t plat_map_dma_mem_page(struct device *dev,
- struct page *page)
-{
- return vdma_alloc(page_to_phys(page), PAGE_SIZE);
+ return vdma_alloc(phys, size);
}
static inline unsigned long plat_dma_addr_to_phys(struct device *dev,
diff --git a/arch/mips/include/asm/mach-loongson64/dma-coherence.h b/arch/mips/include/asm/mach-loongson64/dma-coherence.h
index 1602a9e..a75d4ba 100644
--- a/arch/mips/include/asm/mach-loongson64/dma-coherence.h
+++ b/arch/mips/include/asm/mach-loongson64/dma-coherence.h
@@ -19,23 +19,13 @@ struct device;
extern dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr);
extern phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr);
-static inline dma_addr_t plat_map_dma_mem(struct device *dev, void *addr,
+static inline dma_addr_t plat_map_dma_mem(struct device *dev, phys_addr_t phys,
size_t size)
{
#ifdef CONFIG_CPU_LOONGSON3
- return phys_to_dma(dev, virt_to_phys(addr));
+ return phys_to_dma(dev, phys);
#else
- return virt_to_phys(addr) | 0x80000000;
-#endif
-}
-
-static inline dma_addr_t plat_map_dma_mem_page(struct device *dev,
- struct page *page)
-{
-#ifdef CONFIG_CPU_LOONGSON3
- return phys_to_dma(dev, page_to_phys(page));
-#else
- return page_to_phys(page) | 0x80000000;
+ return phys | 0x80000000;
#endif
}
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index eeaf024..409fdc8 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -123,7 +123,7 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
if (ret != NULL) {
memset(ret, 0, size);
- *dma_handle = plat_map_dma_mem(dev, ret, size);
+ *dma_handle = plat_map_dma_mem(dev, virt_to_phys(ret), size);
}
return ret;
@@ -153,7 +153,7 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
ret = page_address(page);
memset(ret, 0, size);
- *dma_handle = plat_map_dma_mem(dev, ret, size);
+ *dma_handle = plat_map_dma_mem(dev, virt_to_phys(ret), size);
if (!plat_device_is_coherent(dev)) {
dma_cache_wback_inv((unsigned long) ret, size);
if (!hw_coherentio)
@@ -269,14 +269,13 @@ static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist,
struct scatterlist *sg;
for_each_sg(sglist, sg, nents, i) {
- if (!plat_device_is_coherent(dev))
+ if (sg_has_page(sg) && !plat_device_is_coherent(dev))
__dma_sync(sg_page(sg), sg->offset, sg->length,
direction);
#ifdef CONFIG_NEED_SG_DMA_LENGTH
sg->dma_length = sg->length;
#endif
- sg->dma_address = plat_map_dma_mem_page(dev, sg_page(sg)) +
- sg->offset;
+ sg->dma_address = plat_map_dma_mem(dev, sg_phys(sg), PAGE_SIZE);
}
return nents;
@@ -289,7 +288,7 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page,
if (!plat_device_is_coherent(dev))
__dma_sync(page, offset, size, direction);
- return plat_map_dma_mem_page(dev, page) + offset;
+ return plat_map_dma_mem(dev, page_to_phys(page), PAGE_SIZE) + offset;
}
static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
@@ -300,7 +299,7 @@ static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
struct scatterlist *sg;
for_each_sg(sglist, sg, nhwentries, i) {
- if (!plat_device_is_coherent(dev) &&
+ if (sg_has_page(sg) && !plat_device_is_coherent(dev) &&
direction != DMA_TO_DEVICE)
__dma_sync(sg_page(sg), sg->offset, sg->length,
direction);
@@ -334,8 +333,10 @@ static void mips_dma_sync_sg_for_cpu(struct device *dev,
if (cpu_needs_post_dma_flush(dev)) {
for_each_sg(sglist, sg, nelems, i) {
- __dma_sync(sg_page(sg), sg->offset, sg->length,
- direction);
+ if (sg_has_page(sg)) {
+ __dma_sync(sg_page(sg), sg->offset, sg->length,
+ direction);
+ }
}
}
plat_post_dma_flush(dev);
@@ -350,8 +351,10 @@ static void mips_dma_sync_sg_for_device(struct device *dev,
if (!plat_device_is_coherent(dev)) {
for_each_sg(sglist, sg, nelems, i) {
- __dma_sync(sg_page(sg), sg->offset, sg->length,
- direction);
+ if (sg_has_page(sg)) {
+ __dma_sync(sg_page(sg), sg->offset, sg->length,
+ direction);
+ }
}
}
}
--
1.9.1
Make all cache invalidation conditional on sg_has_page().
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/powerpc/kernel/dma.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 35e4dcc..cece40b 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -135,7 +135,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
sg->dma_length = sg->length;
- __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+ if (sg_has_page(sg)) {
+ __dma_sync_page(sg_page(sg), sg->offset, sg->length,
+ direction);
+ }
}
return nents;
@@ -200,7 +203,10 @@ static inline void dma_direct_sync_sg(struct device *dev,
int i;
for_each_sg(sgl, sg, nents, i)
- __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
+ if (sg_has_page(sg)) {
+ __dma_sync_page(sg_page(sg), sg->offset, sg->length,
+ direction);
+ }
}
static inline void dma_direct_sync_single(struct device *dev,
--
1.9.1
Make all cache invalidation conditional on sg_has_page() and use
sg_phys to get the physical address directly.
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/parisc/kernel/pci-dma.c | 29 ++++++++++++++++++-----------
1 file changed, 18 insertions(+), 11 deletions(-)
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index b9402c9..6cad0e0 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -483,11 +483,13 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n
BUG_ON(direction == DMA_NONE);
for_each_sg(sglist, sg, nents, i) {
- unsigned long vaddr = (unsigned long)sg_virt(sg);
-
- sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
+ sg_dma_address(sg) = sg_phys(sg);
sg_dma_len(sg) = sg->length;
- flush_kernel_dcache_range(vaddr, sg->length);
+
+ if (sg_has_page(sg)) {
+ flush_kernel_dcache_range((unsigned long)sg_virt(sg),
+ sg->length);
+ }
}
return nents;
}
@@ -504,9 +506,10 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in
/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
- for_each_sg(sglist, sg, nents, i)
- flush_kernel_vmap_range(sg_virt(sg), sg->length);
- return;
+ for_each_sg(sglist, sg, nents, i) {
+ if (sg_has_page(sg))
+ flush_kernel_vmap_range(sg_virt(sg), sg->length);
+ }
}
static void pa11_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction)
@@ -530,8 +533,10 @@ static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl
/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
- for_each_sg(sglist, sg, nents, i)
- flush_kernel_vmap_range(sg_virt(sg), sg->length);
+ for_each_sg(sglist, sg, nents, i) {
+ if (sg_has_page(sg))
+ flush_kernel_vmap_range(sg_virt(sg), sg->length);
+ }
}
static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
@@ -541,8 +546,10 @@ static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *
/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
- for_each_sg(sglist, sg, nents, i)
- flush_kernel_vmap_range(sg_virt(sg), sg->length);
+ for_each_sg(sglist, sg, nents, i) {
+ if (sg_has_page(sg))
+ flush_kernel_vmap_range(sg_virt(sg), sg->length);
+ }
}
struct hppa_dma_ops pcxl_dma_ops = {
--
1.9.1
Just remove a BUG_ON; the code handles page-less SG entries just fine as-is.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/iommu/intel-iommu.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 3541d65..ae10573 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3622,7 +3622,6 @@ static int intel_nontranslate_map_sg(struct device *hddev,
struct scatterlist *sg;
for_each_sg(sglist, sg, nelems, i) {
- BUG_ON(!sg_page(sg));
sg->dma_address = sg_phys(sg);
sg->dma_length = sg->length;
}
--
1.9.1
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/asm-generic/dma-mapping-common.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
index 940d5ec..afc3eaf 100644
--- a/include/asm-generic/dma-mapping-common.h
+++ b/include/asm-generic/dma-mapping-common.h
@@ -51,8 +51,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int i, ents;
struct scatterlist *s;
- for_each_sg(sg, s, nents, i)
- kmemcheck_mark_initialized(sg_virt(s), s->length);
+ for_each_sg(sg, s, nents, i) {
+ if (sg_has_page(s))
+ kmemcheck_mark_initialized(sg_virt(s), s->length);
+ }
BUG_ON(!valid_dma_direction(dir));
ents = ops->map_sg(dev, sg, nents, dir, attrs);
BUG_ON(ents < 0);
--
1.9.1
Around Wed 12 Aug 2015 09:05:39 +0200 or thereabout, Christoph Hellwig wrote:
> Make all cache invalidation conditional on sg_has_page() and use
> sg_phys to get the physical address directly, bypassing the noop
> page_to_bus.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Hans-Christian Egtvedt <[email protected]>
> ---
> arch/avr32/include/asm/dma-mapping.h | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
> index ae7ac92..a662ce2 100644
> --- a/arch/avr32/include/asm/dma-mapping.h
> +++ b/arch/avr32/include/asm/dma-mapping.h
> @@ -216,11 +216,9 @@ dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
> struct scatterlist *sg;
>
> for_each_sg(sglist, sg, nents, i) {
> - char *virt;
> -
> - sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
> - virt = sg_virt(sg);
> - dma_cache_sync(dev, virt, sg->length, direction);
> + sg->dma_address = sg_phys(sg);
> + if (sg_has_page(sg))
> + dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
> }
>
> return nents;
> @@ -328,8 +326,10 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
> int i;
> struct scatterlist *sg;
>
> - for_each_sg(sglist, sg, nents, i)
> - dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
> + for_each_sg(sglist, sg, nents, i) {
> + if (sg_has_page(sg))
> + dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
> + }
> }
>
> /* Now for the API extensions over the pci_ one */
--
mvh
Hans-Christian Egtvedt
On Wed, 2015-08-12 at 09:05 +0200, Christoph Hellwig wrote:
> Just remove a BUG_ON, the code handles them just fine as-is.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: David Woodhouse <[email protected]>
--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation
On Wednesday 12 August 2015 12:39 PM, Christoph Hellwig wrote:
> Make all cache invalidation conditional on sg_has_page() and use
> sg_phys to get the physical address directly.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
With a minor nit below.
Acked-by: Vineet Gupta <[email protected]>
> ---
> arch/arc/include/asm/dma-mapping.h | 26 +++++++++++++++++++-------
> 1 file changed, 19 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
> index 2d28ba9..42eb526 100644
> --- a/arch/arc/include/asm/dma-mapping.h
> +++ b/arch/arc/include/asm/dma-mapping.h
> @@ -108,9 +108,13 @@ dma_map_sg(struct device *dev, struct scatterlist *sg,
> struct scatterlist *s;
> int i;
>
> - for_each_sg(sg, s, nents, i)
> - s->dma_address = dma_map_page(dev, sg_page(s), s->offset,
> - s->length, dir);
> + for_each_sg(sg, s, nents, i) {
> + if (sg_has_page(s)) {
> + _dma_cache_sync((unsigned long)sg_virt(s), s->length,
> + dir);
> + }
> + s->dma_address = sg_phys(s);
> + }
>
> return nents;
> }
> @@ -163,8 +167,12 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
> int i;
> struct scatterlist *sg;
>
> - for_each_sg(sglist, sg, nelems, i)
> - _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir);
> + for_each_sg(sglist, sg, nelems, i) {
> + if (sg_has_page(sg)) {
> + _dma_cache_sync((unsigned int)sg_virt(sg), sg->length,
> + dir);
> + }
> + }
> }
>
> static inline void
> @@ -174,8 +182,12 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
> int i;
> struct scatterlist *sg;
>
> - for_each_sg(sglist, sg, nelems, i)
> - _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir);
> + for_each_sg(sglist, sg, nelems, i) {
> + if (sg_has_page(sg)) {
> + _dma_cache_sync((unsigned int)sg_virt(sg), sg->length,
> + dir);
For consistency, could you please fix the left alignment of @dir above? Another tab,
perhaps?
> + }
> + }
> }
>
> static inline int dma_supported(struct device *dev, u64 dma_mask)
On Wed, 12 Aug 2015, Christoph Hellwig wrote:
> Use sg_phys() instead of page_to_phys(sg_page(sg)) so that we don't
> require a page structure for all DMA memory.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Sebastian Ott <[email protected]>
> ---
> arch/s390/pci/pci_dma.c | 20 ++++++++++++++------
> 1 file changed, 14 insertions(+), 6 deletions(-)
>
> diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
> index 6fd8d58..aae5a47 100644
> --- a/arch/s390/pci/pci_dma.c
> +++ b/arch/s390/pci/pci_dma.c
> @@ -272,14 +272,13 @@ int dma_set_mask(struct device *dev, u64 mask)
> }
> EXPORT_SYMBOL_GPL(dma_set_mask);
>
> -static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
> - unsigned long offset, size_t size,
> +static dma_addr_t s390_dma_map_phys(struct device *dev, unsigned long pa,
> + size_t size,
> enum dma_data_direction direction,
> struct dma_attrs *attrs)
> {
> struct zpci_dev *zdev = get_zdev(to_pci_dev(dev));
> unsigned long nr_pages, iommu_page_index;
> - unsigned long pa = page_to_phys(page) + offset;
> int flags = ZPCI_PTE_VALID;
> dma_addr_t dma_addr;
>
> @@ -301,7 +300,7 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
>
> if (!dma_update_trans(zdev, pa, dma_addr, size, flags)) {
> atomic64_add(nr_pages, &zdev->mapped_pages);
> - return dma_addr + (offset & ~PAGE_MASK);
> + return dma_addr + (pa & ~PAGE_MASK);
> }
>
> out_free:
> @@ -312,6 +311,16 @@ out_err:
> return DMA_ERROR_CODE;
> }
>
> +static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
> + unsigned long offset, size_t size,
> + enum dma_data_direction direction,
> + struct dma_attrs *attrs)
> +{
> + unsigned long pa = page_to_phys(page) + offset;
> +
> + return s390_dma_map_phys(dev, pa, size, direction, attrs);
> +}
> +
> static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
> size_t size, enum dma_data_direction direction,
> struct dma_attrs *attrs)
> @@ -384,8 +393,7 @@ static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
> int i;
>
> for_each_sg(sg, s, nr_elements, i) {
> - struct page *page = sg_page(s);
> - s->dma_address = s390_dma_map_pages(dev, page, s->offset,
> + s->dma_address = s390_dma_map_phys(dev, sg_phys(s),
> s->length, dir, NULL);
> if (!dma_mapping_error(dev, s->dma_address)) {
> s->dma_length = s->length;
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
On 08/12/2015 10:05 AM, Christoph Hellwig wrote:
> Dan Williams started to look into addressing I/O to and from
> Persistent Memory in his series from June:
>
> http://thread.gmane.org/gmane.linux.kernel.cross-arch/27944
>
> I've started looking into DMA mapping of these SGLs specifically instead
> of the map_pfn method in there. In addition to supporting NVDIMM backed
> I/O I also suspect this would be highly useful for media drivers that
> go through nasty hoops to be able to DMA from/to their ioremapped regions,
> with vb2_dc_get_userptr in drivers/media/v4l2-core/videobuf2-dma-contig.c
> being a prime example for the unsafe hacks currently used.
>
The support I have suggested and submitted for zone-less sections
(in my add_persistent_memory() patchset) would work perfectly well and
transparently for all such multimedia cases, with all the hacks removed.
In fact I have loaded pmem (with pages) onto VRAM a few times and it is
great easy fun. (I wanted to experiment with cached memory over a PCIe
link.)
> It turns out most DMA mapping implementation can handle SGLs without
> page structures with some fairly simple mechanical work. Most of it
> is just about consistently using sg_phys. For implementations that
> need to flush caches we need a new helper that skips these cache
> flushes if a entry doesn't have a kernel virtual address.
>
> However the ccio (parisc) and sba_iommu (parisc & ia64) IOMMUs seem
> to be operate mostly on virtual addresses. It's a fairly odd concept
> that I don't fully grasp, so I'll need some help with those if we want
> to bring this forward.
>
> Additional this series skips ARM entirely for now. The reason is
> that most arm implementations of the .map_sg operation just iterate
> over all entries and call ->map_page for it, which means we'd need
> to convert those to a ->map_pfn similar to Dan's previous approach.
>
All this endless work does nothing more than uglify the kernel, and
it will never end, while a real and fully working solution has been right
here for more than a year.
If you are really up for a deep audit and a mammoth testing effort,
why not do a more worthy, and an order of magnitude smaller, piece of work
and support 2M and 1G variable-sized "pages"? All the
virtual-vs-physical-vs-caching handling just works.
Most of the core work is there. The block layer and lots of other subsystems
already support sending a single page pointer with a bvec_offset/bvec_len
bigger than 4K. Other subsystems will need small fixes sprinkled around,
but not at all this endless stream of patches, subsystem after subsystem.
And for what?
The novelty of struct page is the section object: the section is reached
from a page * as well as from the virtual and physical planes, and it is
the center that translates between all of them. You keep this concept and
only add 2M-page sections and 1G-page sections.
It is a bit of work, but it is worthwhile, and it would tremendously
accelerate lots of workloads. Not like this abomination, which only
branches things more and more and makes them fatter and slower.
It all feels like a typhoon, the inertia of tons and tons of man-hours
of work in a huge wave. How will you ever stop such a rushing mass?
I'm trying to duck under, but surely it makes me sad.
Thanks
Boaz
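For concreteness, a minimal sketch of the "single bvec bigger than 4K" idea referred to above; huge_section_page is a hypothetical 2M-backed page, and nothing here comes from the posted patches:

#include <linux/bio.h>
#include <linux/sizes.h>

/*
 * Sketch only: a single bio_vec can describe more than 4K as long as the
 * memory behind it is physically contiguous, e.g. one 2M "section page".
 */
static void sketch_fill_huge_bvec(struct bio_vec *bv,
				  struct page *huge_section_page)
{
	bv->bv_page   = huge_section_page;	/* hypothetical 2M-backed page */
	bv->bv_offset = 0;
	bv->bv_len    = SZ_2M;			/* not limited to PAGE_SIZE */
}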
On Wed, Aug 12, 2015 at 12:05 AM, Christoph Hellwig <[email protected]> wrote:
> Make all cache invalidation conditional on sg_has_page() and use
> sg_phys to get the physical address directly.
So this worries me a bit (I'm just reacting to one random patch in the series).
The reason?
I think this wants a big honking comment somewhere saying "non-sg_page
accesses are not necessarily cache coherent").
Now, I don't think that's _wrong_, but it's an important distinction:
if you look up pages in the page tables directly, there's a very
subtle difference between then saving just the pfn and saving the
"struct page" of the result.
On sane architectures, this whole cache flushing thing doesn't matter.
Which just means that it's going to be even more subtle on the odd
broken ones..
I'm assuming that anybody who wants to use the page-less
scatter-gather lists always does so on memory that isn't actually
virtually mapped at all, or only does so on sane architectures that
are cache coherent at a physical level, but I'd like that assumption
*documented* somewhere.
(And maybe it is, and I just didn't get to that patch yet)
Linus
On Wed, Aug 12, 2015 at 12:05 AM, Christoph Hellwig <[email protected]> wrote:
> + for_each_sg(sg, s, nents, i) {
> + if (sg_has_page(s))
> + kmemcheck_mark_initialized(sg_virt(s), s->length);
> + }
[ Again, I'm responding to one random patch - this pattern was in
other patches too. ]
A question: do we actually expect to mix page-less and pageful SG
entries in the same SG list?
How does that happen?
(I'm not saying it can't, I'm just wondering where people expect this
to happen).
IOW, maybe it would be valid to have a rule saying "a SG list is
either all pageful or pageless, never mixed", and then have the "if"
statement outside the loop rather than inside.
Linus
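A minimal sketch of the hoisted variant suggested here; sg_list_has_pages() is a hypothetical helper that would only be valid under the "all pageful or all pageless, never mixed" rule, not an existing API:

#include <linux/kmemcheck.h>
#include <linux/scatterlist.h>

/*
 * Sketch only: with a never-mixed rule the page check moves out of the
 * hot loop.  sg_list_has_pages() does not exist; under that rule it
 * could simply test the first entry.
 */
static inline void sketch_mark_sg_initialized(struct scatterlist *sgl, int nents)
{
	struct scatterlist *s;
	int i;

	if (!sg_list_has_pages(sgl))
		return;

	for_each_sg(sgl, s, nents, i)
		kmemcheck_mark_initialized(sg_virt(s), s->length);
}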
Christoph,
On 12 August 2015 at 08:05, Christoph Hellwig <[email protected]> wrote:
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> include/asm-generic/dma-mapping-common.h | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h
> index 940d5ec..afc3eaf 100644
> --- a/include/asm-generic/dma-mapping-common.h
> +++ b/include/asm-generic/dma-mapping-common.h
> @@ -51,8 +51,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
> int i, ents;
> struct scatterlist *s;
>
> - for_each_sg(sg, s, nents, i)
> - kmemcheck_mark_initialized(sg_virt(s), s->length);
> + for_each_sg(sg, s, nents, i) {
> + if (sg_has_page(s))
> + kmemcheck_mark_initialized(sg_virt(s), s->length);
> + }
Just a nitpick for the subject, it should say "kmemcheck" rather than
"kmemleak" (different features ;)).
--
Catalin
On Wed, 2015-08-12 at 09:05 +0200, Christoph Hellwig wrote:
> Dan Williams started to look into addressing I/O to and from
> Persistent Memory in his series from June:
>
> http://thread.gmane.org/gmane.linux.kernel.cross-arch/27944
>
> I've started looking into DMA mapping of these SGLs specifically instead
> of the map_pfn method in there. In addition to supporting NVDIMM backed
> I/O I also suspect this would be highly useful for media drivers that
> go through nasty hoops to be able to DMA from/to their ioremapped regions,
> with vb2_dc_get_userptr in drivers/media/v4l2-core/videobuf2-dma-contig.c
> being a prime example for the unsafe hacks currently used.
>
> It turns out most DMA mapping implementation can handle SGLs without
> page structures with some fairly simple mechanical work. Most of it
> is just about consistently using sg_phys. For implementations that
> need to flush caches we need a new helper that skips these cache
> flushes if a entry doesn't have a kernel virtual address.
>
> However the ccio (parisc) and sba_iommu (parisc & ia64) IOMMUs seem
> to be operate mostly on virtual addresses. It's a fairly odd concept
> that I don't fully grasp, so I'll need some help with those if we want
> to bring this forward.
I can explain that. I think this doesn't apply to ia64 because its
cache is PIPT, but on parisc we have a VIPT cache.
On normal physically indexed architectures, when the iommu sees a DMA
transfer to/from physical memory, it also notifies the CPU to flush the
internal CPU caches of those lines. This is usually an interlocking
step of the transfer to make sure the page is coherent before transfer
to/from the device (it's why the ia32 for instance is a coherent
architecture). Because the system is physically indexed, there's no
need to worry about aliases.
On Virtually Indexed systems, like parisc, there is an aliasing problem.
The CCIO iommu unit (and all other iommu systems on parisc) have what's
called a local coherence index (LCI). You program it as part of the
IOMMU page table and it tells the system which Virtual line in the cache
to flush as part of the IO transaction, thus still ensuring cache
coherence. That's why we have to know the virtual as well as physical
addresses for the page. The problem we have in Linux is that we have
two virtual addresses, which are often incoherent aliases: the user
virtual address and a kernel virtual address but we can only make the
page coherent with a single alias (only one LCI). The way I/O on Linux
currently works is that get_user_pages actually flushes the user virtual
address, so that alias is expected to be coherent, and the address we program
into the LCI is the kernel virtual address. Usually nothing in the
kernel has ever touched the page, so there's nothing to flush, but we do
it just in case.
In theory, for these non kernel page backed SG entries, we can make the
process more efficient by not flushing in gup and instead programming
the user virtual address into the local coherence index. However,
simply zeroing the LCI will also work (except that poor VI zero line
will get flushed repeatedly, so it's probably best to pick a known
untouched line in the kernel).
James
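A rough sketch of the concept described above, purely illustrative: the entry layout is not the real ccio/sba IO pdir format and sketch_fill_io_pdir_entry() is a made-up helper, but it shows why every IO mapping on these IOMMUs wants a virtual alias (real or dummy) in addition to the physical address:

#include <linux/scatterlist.h>

/*
 * Illustrative only: on the VIPT parisc IOMMUs each IO page table entry
 * carries a coherence index derived from a virtual alias of the page,
 * in addition to the physical address, so the hardware knows which
 * cache lines to keep coherent during the transfer.
 */
struct sketch_io_pdir_entry {
	unsigned long phys_page;	/* page the device DMAs to/from */
	unsigned long coherence_alias;	/* virtual alias the LCI is derived from */
};

static void sketch_fill_io_pdir_entry(struct sketch_io_pdir_entry *e,
				      struct scatterlist *sg)
{
	e->phys_page = sg_phys(sg) >> PAGE_SHIFT;
	/*
	 * Page-backed entries have a kernel virtual alias to hand to the
	 * IOMMU; a page-less entry would need the user virtual address or
	 * a known-untouched kernel line instead, as discussed above.
	 */
	e->coherence_alias = sg_has_page(sg) ? (unsigned long)sg_virt(sg) : 0;
}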
On Wed, Aug 12, 2015 at 10:00 AM, James Bottomley
<[email protected]> wrote:
> On Wed, 2015-08-12 at 09:05 +0200, Christoph Hellwig wrote:
...
>> However the ccio (parisc) and sba_iommu (parisc & ia64) IOMMUs seem
>> to be operate mostly on virtual addresses. It's a fairly odd concept
>> that I don't fully grasp, so I'll need some help with those if we want
>> to bring this forward.
James explained the primary function of IOMMUs on parisc (DMA-Cache
coherency) much better than I ever could.
Three more observations:
1) the IOMMU can be bypassed by 64-bit DMA devices on IA64.
2) IOMMU enables 32-bit DMA devices to reach > 32-bit physical memory
and thus avoid bounce buffers. parisc and older IA-64 have some
32-bit PCI devices - e.g. the IDE boot HDD.
3) IOMMU acts as a proxy for IO devices by fetching cachelines of data
on PA-RISC systems whose memory controllers ONLY serve cacheline-sized
transactions, i.e. a 32-bit DMA results in the IOMMU fetching the
cacheline and updating just those 32 bits in a DMA cache coherent
fashion.
Bonus thought:
4) IOMMU can improve DMA performance in some cases using "hints"
provided by the OS (e.g. prefetching DMA data or using READ_CURRENT
bus transactions instead of normal memory fetches.)
cheers,
grant
Hi,
On Wed, Aug 12, 2015 at 10:42 PM, Boaz Harrosh <[email protected]> wrote:
> On 08/12/2015 10:05 AM, Christoph Hellwig wrote:
>> It turns out most DMA mapping implementation can handle SGLs without
>> page structures with some fairly simple mechanical work. Most of it
>> is just about consistently using sg_phys. For implementations that
>> need to flush caches we need a new helper that skips these cache
>> flushes if a entry doesn't have a kernel virtual address.
>>
>> However the ccio (parisc) and sba_iommu (parisc & ia64) IOMMUs seem
>> to be operate mostly on virtual addresses. It's a fairly odd concept
>> that I don't fully grasp, so I'll need some help with those if we want
>> to bring this forward.
>>
>> Additional this series skips ARM entirely for now. The reason is
>> that most arm implementations of the .map_sg operation just iterate
>> over all entries and call ->map_page for it, which means we'd need
>> to convert those to a ->map_pfn similar to Dan's previous approach.
>>
>
[snip]
>
> It is a bit of work but is worth while, and accelerating tremendously
> lots of workloads. Not like this abomination which only branches
> things more and more, and making things fatter and slower.
As a random guy reading a big bunch of patches on code I know almost
nothing about, parts of this comment really resonated with me:
overall, we seem to be adding a lot of if statements to code that
appears to be in a hot path.
I.e. ~90% of this patch set seems to be just mechanically dropping
BUG_ON()s and converting open coded stuff to use accessor functions
(which should be macros or get inlined, right?) - and the remaining
bit is not flushing if we don't have a physical page somewhere.
Would it make sense to split this patch set into a few bits: one to
drop all the useless BUG_ON()s, one to convert all the open coded
stuff to accessor functions, then another to do the actual page-less
sg stuff?
Thanks,
--
Julian Calaby
Email: [email protected]
Profile: http://www.google.com/profiles/julian.calaby/
On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote:
> I'm assuming that anybody who wants to use the page-less
> scatter-gather lists always does so on memory that isn't actually
> virtually mapped at all, or only does so on sane architectures that
> are cache coherent at a physical level, but I'd like that assumption
> *documented* somewhere.
It's temporarily mapped by kmap-like helpers. That code isn't in
this series. The most recent version of it is here:
https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0
note that it's not doing the cache flushing it would have to do yet, but
it's also only enabled for x86 at the moment.
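A sketch of what the missing cache flushing might look like on top of such a helper; kmap_atomic_pfn_t()/kunmap_atomic_pfn_t() and sg_pfn_t() are stand-ins for whatever the branch above ends up exporting, not a committed API:

/*
 * Sketch only: temporarily map the pfn of a page-less sg entry so its
 * kernel alias can be flushed on virtually tagged caches.
 */
static void sketch_sync_pageless_sg(struct scatterlist *sg)
{
	void *addr = kmap_atomic_pfn_t(sg_pfn_t(sg));	/* stand-in helpers */

	if (!addr)
		return;
	flush_kernel_vmap_range(addr + sg->offset, sg->length);
	kunmap_atomic_pfn_t(addr);
}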
On Wed, Aug 12, 2015 at 09:05:15AM -0700, Linus Torvalds wrote:
> [ Again, I'm responding to one random patch - this pattern was in
> other patches too. ]
>
> A question: do we actually expect to mix page-less and pageful SG
> entries in the same SG list?
>
> How does that happen?
Both for DAX and the video buffer case people could do direct I/O
spanning the boundary between such a VMA and a normal one unless
we add special code to prevent that. Right now I don't think it's
all that useful, but then again it doesn't seem harmful either,
and the cost of adding checks to prevent it might add up.
On Thu, Aug 13, 2015 at 09:37:37AM +1000, Julian Calaby wrote:
> I.e. ~90% of this patch set seems to be just mechanically dropping
> BUG_ON()s and converting open coded stuff to use accessor functions
> (which should be macros or get inlined, right?) - and the remaining
> bit is not flushing if we don't have a physical page somewhere.
Which it was: 90%. By lines changed, most of it actually is the diffs
for the cache flushing.
> Would it make sense to split this patch set into a few bits: one to
> drop all the useless BUG_ON()s, one to convert all the open coded
> stuff to accessor functions, then another to do the actual page-less
> sg stuff?
Without the ifs, the BUG_ON()s actually are useful to assert that we
never feed in the sort of physical addresses we can't otherwise support,
so I don't think that part is doable.
A simple series to make more use of sg_phys and add sg_pfn might
still be useful, though.
On Wed, Aug 12, 2015 at 03:42:47PM +0300, Boaz Harrosh wrote:
> The support I have suggested and submitted for zone-less sections.
> (In my add_persistent_memory() patchset)
>
> Would work perfectly well and transparent for all such multimedia cases.
> (All hacks removed). In fact I have loaded pmem (with-pages) on a VRAM
> a few times and it is great easy fun. (I wanted to experiment with cached
> memory over a pcie)
And everyone agreed that it was both buggy and incomplete.
Dan has done a respin of the page backed nvdimm work with most of
these comments addressed.
I have to say I hate both pfn-based I/O [1] and page backed nvdimms with
passion, so we're looking into the lesser evil with an open mind.
[1] not the SGL part posted here, which I think is quite sane. The bio
side is much worse, though.
On 08/13/2015 05:40 PM, Christoph Hellwig wrote:
> On Wed, Aug 12, 2015 at 03:42:47PM +0300, Boaz Harrosh wrote:
>> The support I have suggested and submitted for zone-less sections.
>> (In my add_persistent_memory() patchset)
>>
>> Would work perfectly well and transparent for all such multimedia cases.
>> (All hacks removed). In fact I have loaded pmem (with-pages) on a VRAM
>> a few times and it is great easy fun. (I wanted to experiment with cached
>> memory over a pcie)
>
> And everyone agree that it was both buggy and incomplete.
>
What? No one ever said anything about bugs. This is the first I have ever heard of it.
I was always under the impression that no one even tried it out.
I've been running these page-full nvdimms for more than a year, with RDMA to
peers and swap-out to disks. So it is not that bad, I would say.
> Dan has done a respin of the page backed nvdimm work with most of
> these comments addressed.
>
I would love some comments. All I got so far is silence. (And I do not
like Dan's patches; comments will come next week.)
> I have to say I hate both pfn-based I/O [1] and page backed nvdimms with
> passion, so we're looking into the lesser evil with an open mind.
>
> [1] not the SGL part posted here, which I think is quite sane. The bio
> side is much worse, though.
>
What can I say, I like the page-backed nvdimms. And the long term for me
is 2M pages. I hope we can sit down one day soon and you can explain to me
what's evil about it. I would really, really like to understand.
Thanks though
Boaz
Hi Christoph,
On Fri, Aug 14, 2015 at 12:35 AM, Christoph Hellwig <[email protected]> wrote:
> On Thu, Aug 13, 2015 at 09:37:37AM +1000, Julian Calaby wrote:
>> I.e. ~90% of this patch set seems to be just mechanically dropping
>> BUG_ON()s and converting open coded stuff to use accessor functions
>> (which should be macros or get inlined, right?) - and the remaining
>> bit is not flushing if we don't have a physical page somewhere.
>
> Which is was 90%. By lines changed most actually is the diffs for
> the cache flushing.
I was talking in terms of changes made, not lines changed: by my
recollection, about a third of the patches didn't touch flush calls
and most of the lines changed looked like refactoring so that making
the flush call conditional would be easier.
I guess it smelled like you were doing lots of distinct changes in a
single patch and I got my numbers wrong.
>> Would it make sense to split this patch set into a few bits: one to
>> drop all the useless BUG_ON()s, one to convert all the open coded
>> stuff to accessor functions, then another to do the actual page-less
>> sg stuff?
>
> Without the ifs the BUG_ON() actually are useful to assert we
> never feed the sort of physical addresses we can't otherwise support,
> so I don't think that part is doable.
My point is that there's a couple of patches that only remove
BUG_ON()s, which implies that for that particular driver it doesn't
matter if there's a physical page or not, so therefore that code is
purely "documentation".
Thanks,
--
Julian Calaby
Email: [email protected]
Profile: http://www.google.com/profiles/julian.calaby/
On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig <[email protected]> wrote:
> On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote:
>> I'm assuming that anybody who wants to use the page-less
>> scatter-gather lists always does so on memory that isn't actually
>> virtually mapped at all, or only does so on sane architectures that
>> are cache coherent at a physical level, but I'd like that assumption
>> *documented* somewhere.
>
> It's temporarily mapped by kmap-like helpers. That code isn't in
> this series. The most recent version of it is here:
>
> https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0
>
> note that it's not doing the cache flushing it would have to do yet, but
> it's also only enabled for x86 at the moment.
For virtually tagged caches I assume we would temporarily map with
kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements
powerpc support. However with DAX we could end up with multiple
virtual aliases for a page-less pfn.
On Thu, 2015-08-13 at 20:30 -0700, Dan Williams wrote:
> On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig <[email protected]> wrote:
> > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote:
> >> I'm assuming that anybody who wants to use the page-less
> >> scatter-gather lists always does so on memory that isn't actually
> >> virtually mapped at all, or only does so on sane architectures that
> >> are cache coherent at a physical level, but I'd like that assumption
> >> *documented* somewhere.
> >
> > It's temporarily mapped by kmap-like helpers. That code isn't in
> > this series. The most recent version of it is here:
> >
> > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0
> >
> > note that it's not doing the cache flushing it would have to do yet, but
> > it's also only enabled for x86 at the moment.
>
> For virtually tagged caches I assume we would temporarily map with
> kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements
> powerpc support. However with DAX we could end up with multiple
> virtual aliases for a page-less pfn.
At least on some PA architectures, you have to be very careful.
Improperly managed, multiple aliases will cause the system to crash
(actually a machine check in the cache chequerboard). For the most
temperamental systems, we need the cache line flushed and the alias
mapping ejected from the TLB cache before we access the same page at an
inequivalent alias.
James
From: James Bottomley <[email protected]>
Date: Thu, 13 Aug 2015 20:59:20 -0700
> On Thu, 2015-08-13 at 20:30 -0700, Dan Williams wrote:
>> On Thu, Aug 13, 2015 at 7:31 AM, Christoph Hellwig <[email protected]> wrote:
>> > On Wed, Aug 12, 2015 at 09:01:02AM -0700, Linus Torvalds wrote:
>> >> I'm assuming that anybody who wants to use the page-less
>> >> scatter-gather lists always does so on memory that isn't actually
>> >> virtually mapped at all, or only does so on sane architectures that
>> >> are cache coherent at a physical level, but I'd like that assumption
>> >> *documented* somewhere.
>> >
>> > It's temporarily mapped by kmap-like helpers. That code isn't in
>> > this series. The most recent version of it is here:
>> >
>> > https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git/commit/?h=pfn&id=de8237c99fdb4352be2193f3a7610e902b9bb2f0
>> >
>> > note that it's not doing the cache flushing it would have to do yet, but
>> > it's also only enabled for x86 at the moment.
>>
>> For virtually tagged caches I assume we would temporarily map with
>> kmap_atomic_pfn_t(), similar to how drm_clflush_pages() implements
>> powerpc support. However with DAX we could end up with multiple
>> virtual aliases for a page-less pfn.
>
> At least on some PA architectures, you have to be very careful.
> Improperly managed, multiple aliases will cause the system to crash
> (actually a machine check in the cache chequerboard). For the most
> temperamental systems, we need the cache line flushed and the alias
> mapping ejected from the TLB cache before we access the same page at an
> inequivalent alias.
Also, I want to mention that on sparc64 we manage the cache aliasing
state in the page struct.
Until a page is mapped into userspace, we just record the most recent
cpu to store into that page with kernel side mappings. Once the page
ends up being mapped or the cpu doing kernel side stores changes, we
actually perform the cache flush.
Generally speaking, I think that all actual physical memory the kernel
operates on should have a struct page backing it. So this whole
discussion of operating on physical memory in scatter lists without
backing page structs feels really foreign to me.
On Thu, Aug 13, 2015 at 9:11 PM, David Miller <[email protected]> wrote:
> From: James Bottomley <[email protected]>
>> At least on some PA architectures, you have to be very careful.
>> Improperly managed, multiple aliases will cause the system to crash
>> (actually a machine check in the cache chequerboard). For the most
>> temperamental systems, we need the cache line flushed and the alias
>> mapping ejected from the TLB cache before we access the same page at an
>> inequivalent alias.
>
> Also, I want to mention that on sparc64 we manage the cache aliasing
> state in the page struct.
>
> Until a page is mapped into userspace, we just record the most recent
> cpu to store into that page with kernel side mappings. Once the page
> ends up being mapped or the cpu doing kernel side stores changes, we
> actually perform the cache flush.
>
> Generally speaking, I think that all actual physical memory the kernel
> operates on should have a struct page backing it. So this whole
> discussion of operating on physical memory in scatter lists without
> backing page structs feels really foreign to me.
So the only way for page-less pfns to enter the system is through the
->direct_access() method provided by a pmem device's struct
block_device_operations. Architectures that require struct page for
cache management must disable ->direct_access() in this case.
If an arch still wants to support pmem+DAX then it needs something
like this patchset (feedback welcome) to map pmem pfns:
https://lkml.org/lkml/2015/8/12/970
Effectively this would disable ->direct_access() on /dev/pmem0, but
permit ->direct_access() on /dev/pmem0m.
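A sketch of the gating described here; CONFIG_ARCH_WANTS_PAGELESS_DAX is a hypothetical symbol and the pmem driver details are elided, but the idea is simply that an architecture which needs struct page for cache management never wires up ->direct_access():

/*
 * Sketch only: without ->direct_access() no page-less pfns can enter
 * the system through this block device.
 */
static const struct block_device_operations sketch_pmem_fops = {
	.owner		= THIS_MODULE,
#ifdef CONFIG_ARCH_WANTS_PAGELESS_DAX		/* hypothetical gate */
	.direct_access	= pmem_direct_access,	/* provided elsewhere in the driver */
#endif
};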