2019-08-06 16:08:31

by Christoph Hellwig

Subject: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

There is only a single place where the pgmap is passed over a function
call, so replace it with local variables in the places where we deal
with the pgmap.

Signed-off-by: Christoph Hellwig <[email protected]>
---
mm/hmm.c | 62 ++++++++++++++++++++++++--------------------------------
1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 9a908902e4cc..d66fa29b42e0 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -278,7 +278,6 @@ EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
struct hmm_range *range;
- struct dev_pagemap *pgmap;
unsigned long last;
unsigned int flags;
};
@@ -475,6 +474,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
+ struct dev_pagemap *pgmap = NULL;
unsigned long pfn, npages, i;
bool fault, write_fault;
uint64_t cpu_flags;
@@ -490,17 +490,14 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
pfn = pmd_pfn(pmd) + pte_index(addr);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
if (pmd_devmap(pmd)) {
- hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
- hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap))
return -EBUSY;
}
pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
}
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
+ if (pgmap)
+ put_dev_pagemap(pgmap);
hmm_vma_walk->last = end;
return 0;
#else
@@ -520,7 +517,7 @@ static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long end, pmd_t *pmdp, pte_t *ptep,
- uint64_t *pfn)
+ uint64_t *pfn, struct dev_pagemap **pgmap)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
@@ -591,9 +588,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
goto fault;

if (pte_devmap(pte)) {
- hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
- hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
+ if (unlikely(!*pgmap))
return -EBUSY;
} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
*pfn = range->values[HMM_PFN_SPECIAL];
@@ -604,10 +600,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
return 0;

fault:
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
+ if (*pgmap)
+ put_dev_pagemap(*pgmap);
+ *pgmap = NULL;
+
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -620,6 +616,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
+ struct dev_pagemap *pgmap = NULL;
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
@@ -683,23 +680,21 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
int r;

- r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i],
+ &pgmap);
if (r) {
/* hmm_vma_handle_pte() did unmap pte directory */
hmm_vma_walk->last = addr;
return r;
}
}
- if (hmm_vma_walk->pgmap) {
- /*
- * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
- * so that we can leverage get_dev_pagemap() optimization which
- * will not re-take a reference on a pgmap if we already have
- * one.
- */
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
+ /*
+ * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() so that
+ * we can leverage the get_dev_pagemap() optimization which will not
+ * re-take a reference on a pgmap if we already have one.
+ */
+ if (pgmap)
+ put_dev_pagemap(pgmap);
pte_unmap(ptep - 1);

hmm_vma_walk->last = addr;
@@ -714,6 +709,7 @@ static int hmm_vma_walk_pud(pud_t *pudp,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long addr = start, next;
+ struct dev_pagemap *pgmap = NULL;
pmd_t *pmdp;
pud_t pud;
int ret;
@@ -744,17 +740,14 @@ static int hmm_vma_walk_pud(pud_t *pudp,

pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn) {
- hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
- hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap))
return -EBUSY;
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
}
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
+ if (pgmap)
+ put_dev_pagemap(pgmap);
hmm_vma_walk->last = end;
return 0;
}
@@ -1002,7 +995,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
return -EPERM;
}

- hmm_vma_walk.pgmap = NULL;
hmm_vma_walk.last = start;
hmm_vma_walk.flags = flags;
hmm_vma_walk.range = range;
--
2.20.1


2019-08-07 19:20:16

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Tue, Aug 06, 2019 at 07:05:42PM +0300, Christoph Hellwig wrote:
> There is only a single place where the pgmap is passed over a function
> call, so replace it with local variables in the places where we deal
> with the pgmap.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> mm/hmm.c | 62 ++++++++++++++++++++++++--------------------------------
> 1 file changed, 27 insertions(+), 35 deletions(-)
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 9a908902e4cc..d66fa29b42e0 100644
> +++ b/mm/hmm.c
> @@ -278,7 +278,6 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
>
> struct hmm_vma_walk {
> struct hmm_range *range;
> - struct dev_pagemap *pgmap;
> unsigned long last;
> unsigned int flags;
> };
> @@ -475,6 +474,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> + struct dev_pagemap *pgmap = NULL;
> unsigned long pfn, npages, i;
> bool fault, write_fault;
> uint64_t cpu_flags;
> @@ -490,17 +490,14 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
> pfn = pmd_pfn(pmd) + pte_index(addr);
> for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
> if (pmd_devmap(pmd)) {
> - hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
> - hmm_vma_walk->pgmap);
> - if (unlikely(!hmm_vma_walk->pgmap))
> + pgmap = get_dev_pagemap(pfn, pgmap);
> + if (unlikely(!pgmap))
> return -EBUSY;

Unrelated to this patch, but what is the point of getting and checking
that the pgmap exists for the page and then immediately releasing it?
This code has this pattern in several places.

It feels racy

> }
> pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
> }
> - if (hmm_vma_walk->pgmap) {
> - put_dev_pagemap(hmm_vma_walk->pgmap);
> - hmm_vma_walk->pgmap = NULL;

Putting the value in the hmm_vma_walk would have made some sense to me
if the pgmap was not set to NULL all over the place. Then most of the
xa_loads would be eliminated, as I would expect the pgmap tends to be
mostly uniform for these use cases.
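
For reference, the caching optimization I mean is roughly this
(paraphrased, not the exact mm/memremap.c code):

struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
				    struct dev_pagemap *pgmap)
{
	resource_size_t phys = PFN_PHYS(pfn);

	/* fast path: the pfn is covered by the pgmap we already hold */
	if (pgmap) {
		if (phys >= pgmap->res.start && phys <= pgmap->res.end)
			return pgmap;
		put_dev_pagemap(pgmap);
	}

	/* slow path: look the pgmap up and take a new reference */
	rcu_read_lock();
	pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
	if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
		pgmap = NULL;
	rcu_read_unlock();

	return pgmap;
}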

Is there some reason the pgmap ref can't be held across
faulting/sleeping? ie like below.

Anyhow, I looked over this pretty carefully and the change looks
functionally OK, I just don't know why the code is like this in the
first place.

Reviewed-by: Jason Gunthorpe <[email protected]>

diff --git a/mm/hmm.c b/mm/hmm.c
index 9a908902e4cc38..4e30128c23a505 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -497,10 +497,6 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
}
pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
}
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
hmm_vma_walk->last = end;
return 0;
#else
@@ -604,10 +600,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
return 0;

fault:
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -690,16 +682,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return r;
}
}
- if (hmm_vma_walk->pgmap) {
- /*
- * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
- * so that we can leverage get_dev_pagemap() optimization which
- * will not re-take a reference on a pgmap if we already have
- * one.
- */
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
pte_unmap(ptep - 1);

hmm_vma_walk->last = addr;
@@ -751,10 +733,6 @@ static int hmm_vma_walk_pud(pud_t *pudp,
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
}
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
hmm_vma_walk->last = end;
return 0;
}
@@ -1026,6 +1004,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
/* Keep trying while the range is valid. */
} while (ret == -EBUSY && range->valid);

+ /*
+ * We do put_dev_pagemap() here so that we can leverage
+ * get_dev_pagemap() optimization which will not re-take a
+ * reference on a pgmap if we already have one.
+ */
+ if (hmm_vma_walk->pgmap)
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+
if (ret) {
unsigned long i;

2019-08-07 19:43:03

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Wed, Aug 7, 2019 at 10:45 AM Jason Gunthorpe <[email protected]> wrote:
>
> On Tue, Aug 06, 2019 at 07:05:42PM +0300, Christoph Hellwig wrote:
> > There is only a single place where the pgmap is passed over a function
> > call, so replace it with local variables in the places where we deal
> > with the pgmap.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > mm/hmm.c | 62 ++++++++++++++++++++++++--------------------------------
> > 1 file changed, 27 insertions(+), 35 deletions(-)
> >
> > diff --git a/mm/hmm.c b/mm/hmm.c
> > index 9a908902e4cc..d66fa29b42e0 100644
> > +++ b/mm/hmm.c
> > @@ -278,7 +278,6 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
> >
> > struct hmm_vma_walk {
> > struct hmm_range *range;
> > - struct dev_pagemap *pgmap;
> > unsigned long last;
> > unsigned int flags;
> > };
> > @@ -475,6 +474,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
> > #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > struct hmm_vma_walk *hmm_vma_walk = walk->private;
> > struct hmm_range *range = hmm_vma_walk->range;
> > + struct dev_pagemap *pgmap = NULL;
> > unsigned long pfn, npages, i;
> > bool fault, write_fault;
> > uint64_t cpu_flags;
> > @@ -490,17 +490,14 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
> > pfn = pmd_pfn(pmd) + pte_index(addr);
> > for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
> > if (pmd_devmap(pmd)) {
> > - hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
> > - hmm_vma_walk->pgmap);
> > - if (unlikely(!hmm_vma_walk->pgmap))
> > + pgmap = get_dev_pagemap(pfn, pgmap);
> > + if (unlikely(!pgmap))
> > return -EBUSY;
>
> Unrelated to this patch, but what is the point of getting checking
> that the pgmap exists for the page and then immediately releasing it?
> This code has this pattern in several places.
>
> It feels racy

Agree, not sure what the intent is here. The only other reason to call
get_dev_pagemap() is to just check in general if the pfn is indeed
owned by some ZONE_DEVICE instance, but if the intent is to make sure
the device is still attached/enabled that check is invalidated at
put_dev_pagemap().

If it's the former case, validating ZONE_DEVICE pfns, I imagine we can
do something cheaper with a helper that is on the order of the same
cost as pfn_valid(). I.e. replace PTE_DEVMAP with a mem_section flag
or something similar.
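
A very rough sketch of the kind of helper I mean, assuming a
hypothetical SECTION_IS_ZONE_DEVICE flag in the mem_section flags
(nothing like that exists today; it's only to illustrate the cost
being on the order of pfn_valid()):

static inline bool pfn_is_zone_device(unsigned long pfn)
{
	if (!pfn_valid(pfn))
		return false;
	/* hypothetical flag, set while ZONE_DEVICE pages occupy the section */
	return __pfn_to_section(pfn)->section_mem_map & SECTION_IS_ZONE_DEVICE;
}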

>
> > }
> > pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
> > }
> > - if (hmm_vma_walk->pgmap) {
> > - put_dev_pagemap(hmm_vma_walk->pgmap);
> > - hmm_vma_walk->pgmap = NULL;
>
> Putting the value in the hmm_vma_walk would have made some sense to me
> if the pgmap was not set to NULL all over the place. Then the most
> xa_loads would be eliminated, as I would expect the pgmap tends to be
> mostly uniform for these use cases.
>
> Is there some reason the pgmap ref can't be held across
> faulting/sleeping? ie like below.

No restriction on holding refs over faulting / sleeping.

>
> Anyhow, I looked over this pretty carefully and the change looks
> functionally OK, I just don't know why the code is like this in the
> first place.
>
> Reviewed-by: Jason Gunthorpe <[email protected]>
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 9a908902e4cc38..4e30128c23a505 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -497,10 +497,6 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
> }
> pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
> }
> - if (hmm_vma_walk->pgmap) {
> - put_dev_pagemap(hmm_vma_walk->pgmap);
> - hmm_vma_walk->pgmap = NULL;
> - }
> hmm_vma_walk->last = end;
> return 0;
> #else
> @@ -604,10 +600,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> return 0;
>
> fault:
> - if (hmm_vma_walk->pgmap) {
> - put_dev_pagemap(hmm_vma_walk->pgmap);
> - hmm_vma_walk->pgmap = NULL;
> - }
> pte_unmap(ptep);
> /* Fault any virtual address we were asked to fault */
> return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
> @@ -690,16 +682,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> return r;
> }
> }
> - if (hmm_vma_walk->pgmap) {
> - /*
> - * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
> - * so that we can leverage get_dev_pagemap() optimization which
> - * will not re-take a reference on a pgmap if we already have
> - * one.
> - */
> - put_dev_pagemap(hmm_vma_walk->pgmap);
> - hmm_vma_walk->pgmap = NULL;
> - }
> pte_unmap(ptep - 1);
>
> hmm_vma_walk->last = addr;
> @@ -751,10 +733,6 @@ static int hmm_vma_walk_pud(pud_t *pudp,
> pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
> cpu_flags;
> }
> - if (hmm_vma_walk->pgmap) {
> - put_dev_pagemap(hmm_vma_walk->pgmap);
> - hmm_vma_walk->pgmap = NULL;
> - }
> hmm_vma_walk->last = end;
> return 0;
> }
> @@ -1026,6 +1004,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
> /* Keep trying while the range is valid. */
> } while (ret == -EBUSY && range->valid);
>
> + /*
> + * We do put_dev_pagemap() here so that we can leverage
> + * get_dev_pagemap() optimization which will not re-take a
> + * reference on a pgmap if we already have one.
> + */
> + if (hmm_vma_walk->pgmap)
> + put_dev_pagemap(hmm_vma_walk->pgmap);
> +

Seems ok, but only if the caller is guaranteeing that the range does
not span outside of a single pagemap instance. If that guarantee is
met why not just have the caller pass in a pinned pagemap? If that
guarantee is not met, then I think we're back to your race concern.

2019-08-08 07:01:46

by Christoph Hellwig

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Wed, Aug 07, 2019 at 11:47:22AM -0700, Dan Williams wrote:
> > Unrelated to this patch, but what is the point of getting checking
> > that the pgmap exists for the page and then immediately releasing it?
> > This code has this pattern in several places.
> >
> > It feels racy
>
> Agree, not sure what the intent is here. The only other reason call
> get_dev_pagemap() is to just check in general if the pfn is indeed
> owned by some ZONE_DEVICE instance, but if the intent is to make sure
> the device is still attached/enabled that check is invalidated at
> put_dev_pagemap().
>
> If it's the former case, validating ZONE_DEVICE pfns, I imagine we can
> do something cheaper with a helper that is on the order of the same
> cost as pfn_valid(). I.e. replace PTE_DEVMAP with a mem_section flag
> or something similar.

The hmm literally never dereferences the pgmap, so validity checking is
the only explanation for it.

> > + /*
> > + * We do put_dev_pagemap() here so that we can leverage
> > + * get_dev_pagemap() optimization which will not re-take a
> > + * reference on a pgmap if we already have one.
> > + */
> > + if (hmm_vma_walk->pgmap)
> > + put_dev_pagemap(hmm_vma_walk->pgmap);
> > +
>
> Seems ok, but only if the caller is guaranteeing that the range does
> not span outside of a single pagemap instance. If that guarantee is
> met why not just have the caller pass in a pinned pagemap? If that
> guarantee is not met, then I think we're back to your race concern.

It iterates over multiple ptes in a non-huge pmd. Is there any kind of
limitation on different pgmap instances inside a pmd? I can't think
of one, so this might actually be a bug.

2019-08-14 01:38:49

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Wed, Aug 7, 2019 at 11:59 PM Christoph Hellwig <[email protected]> wrote:
>
> On Wed, Aug 07, 2019 at 11:47:22AM -0700, Dan Williams wrote:
> > > Unrelated to this patch, but what is the point of getting checking
> > > that the pgmap exists for the page and then immediately releasing it?
> > > This code has this pattern in several places.
> > >
> > > It feels racy
> >
> > Agree, not sure what the intent is here. The only other reason call
> > get_dev_pagemap() is to just check in general if the pfn is indeed
> > owned by some ZONE_DEVICE instance, but if the intent is to make sure
> > the device is still attached/enabled that check is invalidated at
> > put_dev_pagemap().
> >
> > If it's the former case, validating ZONE_DEVICE pfns, I imagine we can
> > do something cheaper with a helper that is on the order of the same
> > cost as pfn_valid(). I.e. replace PTE_DEVMAP with a mem_section flag
> > or something similar.
>
> The hmm literally never dereferences the pgmap, so validity checking is
> the only explanation for it.
>
> > > + /*
> > > + * We do put_dev_pagemap() here so that we can leverage
> > > + * get_dev_pagemap() optimization which will not re-take a
> > > + * reference on a pgmap if we already have one.
> > > + */
> > > + if (hmm_vma_walk->pgmap)
> > > + put_dev_pagemap(hmm_vma_walk->pgmap);
> > > +
> >
> > Seems ok, but only if the caller is guaranteeing that the range does
> > not span outside of a single pagemap instance. If that guarantee is
> > met why not just have the caller pass in a pinned pagemap? If that
> > guarantee is not met, then I think we're back to your race concern.
>
> It iterates over multiple ptes in a non-huge pmd. Is there any kind of
> limitations on different pgmap instances inside a pmd? I can't think
> of one, so this might actually be a bug.

Section alignment constraints somewhat save us here. The only example
I can think of where a PMD does not contain a uniform pgmap association
for each pte is the case where the pgmap overlaps normal dram, i.e.
shares the same 'struct memory_section' for a given span. Otherwise,
distinct pgmaps arrange to manage their own exclusive sections (and now
subsections as of v5.3); the implementation could not otherwise
guarantee different mapping lifetimes.

That said, this seems to want a better mechanism to determine "pfn is
ZONE_DEVICE".

2019-08-14 07:40:16

by Christoph Hellwig

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> Section alignment constraints somewhat save us here. The only example
> I can think of a PMD not containing a uniform pgmap association for
> each pte is the case when the pgmap overlaps normal dram, i.e. shares
> the same 'struct memory_section' for a given span. Otherwise, distinct
> pgmaps arrange to manage their own exclusive sections (and now
> subsections as of v5.3). Otherwise the implementation could not
> guarantee different mapping lifetimes.
>
> That said, this seems to want a better mechanism to determine "pfn is
> ZONE_DEVICE".

So I guess this patch is fine for now, and once you provide a better
mechanism we can switch over to it?

2019-08-14 13:29:04

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > Section alignment constraints somewhat save us here. The only example
> > I can think of a PMD not containing a uniform pgmap association for
> > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > the same 'struct memory_section' for a given span. Otherwise, distinct
> > pgmaps arrange to manage their own exclusive sections (and now
> > subsections as of v5.3). Otherwise the implementation could not
> > guarantee different mapping lifetimes.
> >
> > That said, this seems to want a better mechanism to determine "pfn is
> > ZONE_DEVICE".
>
> So I guess this patch is fine for now, and once you provide a better
> mechanism we can switch over to it?

What about the version I sent to just get rid of all the strange
put_dev_pagemaps while scanning? Odds are good we will work with only
a single pagemap, so it makes some sense to cache it once we find it?

diff --git a/mm/hmm.c b/mm/hmm.c
index 9a908902e4cc38..4e30128c23a505 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -497,10 +497,6 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
}
pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
}
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
hmm_vma_walk->last = end;
return 0;
#else
@@ -604,10 +600,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
return 0;

fault:
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -690,16 +682,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
return r;
}
}
- if (hmm_vma_walk->pgmap) {
- /*
- * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
- * so that we can leverage get_dev_pagemap() optimization which
- * will not re-take a reference on a pgmap if we already have
- * one.
- */
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
pte_unmap(ptep - 1);

hmm_vma_walk->last = addr;
@@ -751,10 +733,6 @@ static int hmm_vma_walk_pud(pud_t *pudp,
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
}
- if (hmm_vma_walk->pgmap) {
- put_dev_pagemap(hmm_vma_walk->pgmap);
- hmm_vma_walk->pgmap = NULL;
- }
hmm_vma_walk->last = end;
return 0;
}
@@ -1026,6 +1004,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
/* Keep trying while the range is valid. */
} while (ret == -EBUSY && range->valid);

+ /*
+ * We do put_dev_pagemap() here so that we can leverage
+ * get_dev_pagemap() optimization which will not re-take a
+ * reference on a pgmap if we already have one.
+ */
+ if (hmm_vma_walk->pgmap)
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+
if (ret) {
unsigned long i;


2019-08-14 14:49:42

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
>
> On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > Section alignment constraints somewhat save us here. The only example
> > > I can think of a PMD not containing a uniform pgmap association for
> > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > pgmaps arrange to manage their own exclusive sections (and now
> > > subsections as of v5.3). Otherwise the implementation could not
> > > guarantee different mapping lifetimes.
> > >
> > > That said, this seems to want a better mechanism to determine "pfn is
> > > ZONE_DEVICE".
> >
> > So I guess this patch is fine for now, and once you provide a better
> > mechanism we can switch over to it?
>
> What about the version I sent to just get rid of all the strange
> put_dev_pagemaps while scanning? Odds are good we will work with only
> a single pagemap, so it makes some sense to cache it once we find it?

Yes, if the scan is over a single pmd then caching it makes sense.

2019-08-15 19:59:47

by Jerome Glisse

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Wed, Aug 14, 2019 at 07:48:28AM -0700, Dan Williams wrote:
> On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
> >
> > On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > > Section alignment constraints somewhat save us here. The only example
> > > > I can think of a PMD not containing a uniform pgmap association for
> > > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > > pgmaps arrange to manage their own exclusive sections (and now
> > > > subsections as of v5.3). Otherwise the implementation could not
> > > > guarantee different mapping lifetimes.
> > > >
> > > > That said, this seems to want a better mechanism to determine "pfn is
> > > > ZONE_DEVICE".
> > >
> > > So I guess this patch is fine for now, and once you provide a better
> > > mechanism we can switch over to it?
> >
> > What about the version I sent to just get rid of all the strange
> > put_dev_pagemaps while scanning? Odds are good we will work with only
> > a single pagemap, so it makes some sense to cache it once we find it?
>
> Yes, if the scan is over a single pmd then caching it makes sense.

Quite frankly an easier and better solution is to remove the pagemap
lookup: HMM users abide by the mmu notifier, which means we will not
make use of or dereference the struct page, so we are safe from any
racing hotunplug of dax memory (as long as device drivers using hmm
do not have a bug).

Cheers,
Jérôme

2019-08-15 20:14:14

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 02:03:25PM -0400, Jerome Glisse wrote:
> On Wed, Aug 14, 2019 at 07:48:28AM -0700, Dan Williams wrote:
> > On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
> > >
> > > On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > > > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > > > Section alignment constraints somewhat save us here. The only example
> > > > > I can think of a PMD not containing a uniform pgmap association for
> > > > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > > > pgmaps arrange to manage their own exclusive sections (and now
> > > > > subsections as of v5.3). Otherwise the implementation could not
> > > > > guarantee different mapping lifetimes.
> > > > >
> > > > > That said, this seems to want a better mechanism to determine "pfn is
> > > > > ZONE_DEVICE".
> > > >
> > > > So I guess this patch is fine for now, and once you provide a better
> > > > mechanism we can switch over to it?
> > >
> > > What about the version I sent to just get rid of all the strange
> > > put_dev_pagemaps while scanning? Odds are good we will work with only
> > > a single pagemap, so it makes some sense to cache it once we find it?
> >
> > Yes, if the scan is over a single pmd then caching it makes sense.
>
> Quite frankly an easier an better solution is to remove the pagemap
> lookup as HMM user abide by mmu notifier it means we will not make
> use or dereference the struct page so that we are safe from any
> racing hotunplug of dax memory (as long as device driver using hmm
> do not have a bug).

Yes, I also would prefer to drop the confusing checks entirely -
Christoph can you resend this patch?

Thanks,
Jason

2019-08-15 20:24:53

by Jerome Glisse

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 12:36:58PM -0700, Dan Williams wrote:
> On Thu, Aug 15, 2019 at 11:07 AM Jerome Glisse <[email protected]> wrote:
> >
> > On Wed, Aug 14, 2019 at 07:48:28AM -0700, Dan Williams wrote:
> > > On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
> > > >
> > > > On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > > > > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > > > > Section alignment constraints somewhat save us here. The only example
> > > > > > I can think of a PMD not containing a uniform pgmap association for
> > > > > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > > > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > > > > pgmaps arrange to manage their own exclusive sections (and now
> > > > > > subsections as of v5.3). Otherwise the implementation could not
> > > > > > guarantee different mapping lifetimes.
> > > > > >
> > > > > > That said, this seems to want a better mechanism to determine "pfn is
> > > > > > ZONE_DEVICE".
> > > > >
> > > > > So I guess this patch is fine for now, and once you provide a better
> > > > > mechanism we can switch over to it?
> > > >
> > > > What about the version I sent to just get rid of all the strange
> > > > put_dev_pagemaps while scanning? Odds are good we will work with only
> > > > a single pagemap, so it makes some sense to cache it once we find it?
> > >
> > > Yes, if the scan is over a single pmd then caching it makes sense.
> >
> > Quite frankly an easier an better solution is to remove the pagemap
> > lookup as HMM user abide by mmu notifier it means we will not make
> > use or dereference the struct page so that we are safe from any
> > racing hotunplug of dax memory (as long as device driver using hmm
> > do not have a bug).
>
> Yes, as long as the driver remove is synchronized against HMM
> operations via another mechanism then there is no need to take pagemap
> references. Can you briefly describe what that other mechanism is?

So if you hotunplug some dax memory I assume that this can only
happen once all the pages are unmapped (as they must have a zero
refcount, well 1 because of the bias), and any unmap will trigger
a mmu notifier callback. Users of the hmm mirror abiding by the
API will never make use of information they get through the fault
or snapshot function until checking for a racing notifier under
lock.

Cheers,
Jérôme

2019-08-15 21:04:39

by Jerome Glisse

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 01:12:22PM -0700, Dan Williams wrote:
> On Thu, Aug 15, 2019 at 12:44 PM Jerome Glisse <[email protected]> wrote:
> >
> > On Thu, Aug 15, 2019 at 12:36:58PM -0700, Dan Williams wrote:
> > > On Thu, Aug 15, 2019 at 11:07 AM Jerome Glisse <[email protected]> wrote:
> > > >
> > > > On Wed, Aug 14, 2019 at 07:48:28AM -0700, Dan Williams wrote:
> > > > > On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
> > > > > >
> > > > > > On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > > > > > > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > > > > > > Section alignment constraints somewhat save us here. The only example
> > > > > > > > I can think of a PMD not containing a uniform pgmap association for
> > > > > > > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > > > > > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > > > > > > pgmaps arrange to manage their own exclusive sections (and now
> > > > > > > > subsections as of v5.3). Otherwise the implementation could not
> > > > > > > > guarantee different mapping lifetimes.
> > > > > > > >
> > > > > > > > That said, this seems to want a better mechanism to determine "pfn is
> > > > > > > > ZONE_DEVICE".
> > > > > > >
> > > > > > > So I guess this patch is fine for now, and once you provide a better
> > > > > > > mechanism we can switch over to it?
> > > > > >
> > > > > > What about the version I sent to just get rid of all the strange
> > > > > > put_dev_pagemaps while scanning? Odds are good we will work with only
> > > > > > a single pagemap, so it makes some sense to cache it once we find it?
> > > > >
> > > > > Yes, if the scan is over a single pmd then caching it makes sense.
> > > >
> > > > Quite frankly an easier an better solution is to remove the pagemap
> > > > lookup as HMM user abide by mmu notifier it means we will not make
> > > > use or dereference the struct page so that we are safe from any
> > > > racing hotunplug of dax memory (as long as device driver using hmm
> > > > do not have a bug).
> > >
> > > Yes, as long as the driver remove is synchronized against HMM
> > > operations via another mechanism then there is no need to take pagemap
> > > references. Can you briefly describe what that other mechanism is?
> >
> > So if you hotunplug some dax memory i assume that this can only
> > happens once all the pages are unmapped (as it must have the
> > zero refcount, well 1 because of the bias) and any unmap will
> > trigger a mmu notifier callback. User of hmm mirror abiding by
> > the API will never make use of information they get through the
> > fault or snapshot function until checking for racing notifier
> > under lock.
>
> Hmm that first assumption is not guaranteed by the dev_pagemap core.
> The dev_pagemap end of life model is "disable, invalidate, drain" so
> it's possible to call devm_munmap_pages() while pages are still mapped
> it just won't complete the teardown of the pagemap until the last
> reference is dropped. New references are blocked during this teardown.
>
> However, if the driver is validating the liveness of the mapping in
> the mmu-notifier path and blocking new references it sounds like it
> should be ok. Might there be GPU driver unit tests that cover this
> racing teardown case?

So neither HMM nor the driver should dereference the struct page (I do
not think any iommu driver would either); they only care about the pfn.
So even if we race with a teardown, as soon as we get the mmu notifier
callback to invalidate the mapping we will do so. The pattern is:

mydriver_populate_vaddr_range(start, end) {
	hmm_range_register(range, start, end)
again:
	ret = hmm_range_fault(start, end)
	if (ret < 0)
		return ret;

	take_driver_page_table_lock();
	if (range.valid) {
		populate_device_page_table();
		release_driver_page_table_lock();
	} else {
		release_driver_page_table_lock();
		goto again;
	}
}

The mmu notifier callback does use the same page table lock, and we
also have the range tracking going on. So either we populate the
device page table before racing with teardown, in which case the
device page table entries are cleared through the mmu notifier
callback, or if we race we will see the racing mmu notifier calls
and retry, which will trigger a regular page fault that will return
an error I assume.
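
The invalidation side of that pattern (all names hypothetical, to
match the sketch above) would be something like:

/*
 * Called from the driver's mmu notifier / hmm mirror update path for
 * the invalidated [start, end) range.  It takes the same lock as the
 * populate path, so a concurrent mydriver_populate_vaddr_range()
 * either sees the cleared entries or notices range.valid went false
 * and retries.
 */
static void mydriver_sync_pagetables(struct mydriver *drv,
				     unsigned long start,
				     unsigned long end)
{
	take_driver_page_table_lock();
	clear_device_page_table(drv, start, end);
	release_driver_page_table_lock();
}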

So in the end we have the exact same behavior as if a CPU was trying
to access that virtual address. This is the whole point of HMM, to
behave exactly as if it was a CPU access: it fails in the same way and
races in the same way. So if DAX teardown is safe versus racing CPU
access to some vma that has that memory mapped, it will be the same
for HMM users.


GPU driver test suites are not good at testing this. They are geared
to test the GPU itself, not the interaction of the GPU driver with the
rest of the kernel.

Cheers,
Jérôme

2019-08-15 21:07:48

by Jerome Glisse

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 08:41:33PM +0000, Jason Gunthorpe wrote:
> On Thu, Aug 15, 2019 at 04:33:06PM -0400, Jerome Glisse wrote:
>
> > So nor HMM nor driver should dereference the struct page (i do not
> > think any iommu driver would either),
>
> Er, they do technically deref the struct page:
>
> nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> struct hmm_range *range)
> struct page *page;
> page = hmm_pfn_to_page(range, range->pfns[i]);
> if (!nouveau_dmem_page(drm, page)) {
>
>
> nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
> {
> return is_device_private_page(page) && drm->dmem == page_to_dmem(page)
>
>
> Which does touch 'page->pgmap'
>
> Is this OK without having a get_dev_pagemap() ?
>
> Noting that the collision-retry scheme doesn't protect anything here
> as we can have a concurrent invalidation while doing the above deref.

Uh? How so? We are not reading the same code, I think.

My read is that the function is called while holding the device lock,
which excludes any racing mmu notifier from making forward progress,
and it is also protected by the range, so at the time this happens it
is safe to dereference the struct page. In any case we can update
nouveau_dmem_page() to check that page->pgmap == the expected pgmap.

Cheers,
Jérôme

2019-08-15 22:20:07

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 11:07 AM Jerome Glisse <[email protected]> wrote:
>
> On Wed, Aug 14, 2019 at 07:48:28AM -0700, Dan Williams wrote:
> > On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
> > >
> > > On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > > > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > > > Section alignment constraints somewhat save us here. The only example
> > > > > I can think of a PMD not containing a uniform pgmap association for
> > > > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > > > pgmaps arrange to manage their own exclusive sections (and now
> > > > > subsections as of v5.3). Otherwise the implementation could not
> > > > > guarantee different mapping lifetimes.
> > > > >
> > > > > That said, this seems to want a better mechanism to determine "pfn is
> > > > > ZONE_DEVICE".
> > > >
> > > > So I guess this patch is fine for now, and once you provide a better
> > > > mechanism we can switch over to it?
> > >
> > > What about the version I sent to just get rid of all the strange
> > > put_dev_pagemaps while scanning? Odds are good we will work with only
> > > a single pagemap, so it makes some sense to cache it once we find it?
> >
> > Yes, if the scan is over a single pmd then caching it makes sense.
>
> Quite frankly an easier an better solution is to remove the pagemap
> lookup as HMM user abide by mmu notifier it means we will not make
> use or dereference the struct page so that we are safe from any
> racing hotunplug of dax memory (as long as device driver using hmm
> do not have a bug).

Yes, as long as the driver remove is synchronized against HMM
operations via another mechanism then there is no need to take pagemap
references. Can you briefly describe what that other mechanism is?

2019-08-15 23:10:50

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 12:44 PM Jerome Glisse <[email protected]> wrote:
>
> On Thu, Aug 15, 2019 at 12:36:58PM -0700, Dan Williams wrote:
> > On Thu, Aug 15, 2019 at 11:07 AM Jerome Glisse <[email protected]> wrote:
> > >
> > > On Wed, Aug 14, 2019 at 07:48:28AM -0700, Dan Williams wrote:
> > > > On Wed, Aug 14, 2019 at 6:28 AM Jason Gunthorpe <[email protected]> wrote:
> > > > >
> > > > > On Wed, Aug 14, 2019 at 09:38:54AM +0200, Christoph Hellwig wrote:
> > > > > > On Tue, Aug 13, 2019 at 06:36:33PM -0700, Dan Williams wrote:
> > > > > > > Section alignment constraints somewhat save us here. The only example
> > > > > > > I can think of a PMD not containing a uniform pgmap association for
> > > > > > > each pte is the case when the pgmap overlaps normal dram, i.e. shares
> > > > > > > the same 'struct memory_section' for a given span. Otherwise, distinct
> > > > > > > pgmaps arrange to manage their own exclusive sections (and now
> > > > > > > subsections as of v5.3). Otherwise the implementation could not
> > > > > > > guarantee different mapping lifetimes.
> > > > > > >
> > > > > > > That said, this seems to want a better mechanism to determine "pfn is
> > > > > > > ZONE_DEVICE".
> > > > > >
> > > > > > So I guess this patch is fine for now, and once you provide a better
> > > > > > mechanism we can switch over to it?
> > > > >
> > > > > What about the version I sent to just get rid of all the strange
> > > > > put_dev_pagemaps while scanning? Odds are good we will work with only
> > > > > a single pagemap, so it makes some sense to cache it once we find it?
> > > >
> > > > Yes, if the scan is over a single pmd then caching it makes sense.
> > >
> > > Quite frankly an easier an better solution is to remove the pagemap
> > > lookup as HMM user abide by mmu notifier it means we will not make
> > > use or dereference the struct page so that we are safe from any
> > > racing hotunplug of dax memory (as long as device driver using hmm
> > > do not have a bug).
> >
> > Yes, as long as the driver remove is synchronized against HMM
> > operations via another mechanism then there is no need to take pagemap
> > references. Can you briefly describe what that other mechanism is?
>
> So if you hotunplug some dax memory i assume that this can only
> happens once all the pages are unmapped (as it must have the
> zero refcount, well 1 because of the bias) and any unmap will
> trigger a mmu notifier callback. User of hmm mirror abiding by
> the API will never make use of information they get through the
> fault or snapshot function until checking for racing notifier
> under lock.

Hmm, that first assumption is not guaranteed by the dev_pagemap core.
The dev_pagemap end of life model is "disable, invalidate, drain", so
it's possible to call devm_memunmap_pages() while pages are still
mapped; it just won't complete the teardown of the pagemap until the
last reference is dropped. New references are blocked during this
teardown.

However, if the driver is validating the liveness of the mapping in
the mmu-notifier path and blocking new references, it sounds like it
should be ok. Might there be GPU driver unit tests that cover this
racing teardown case?

2019-08-15 23:26:17

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 04:33:06PM -0400, Jerome Glisse wrote:

> So nor HMM nor driver should dereference the struct page (i do not
> think any iommu driver would either),

Er, they do technically deref the struct page:

nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
struct hmm_range *range)
struct page *page;
page = hmm_pfn_to_page(range, range->pfns[i]);
if (!nouveau_dmem_page(drm, page)) {


nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
{
return is_device_private_page(page) && drm->dmem == page_to_dmem(page)


Which does touch 'page->pgmap'

Is this OK without having a get_dev_pagemap() ?

Noting that the collision-retry scheme doesn't protect anything here
as we can have a concurrent invalidation while doing the above deref.

Jason

2019-08-15 23:32:42

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 1:41 PM Jason Gunthorpe <[email protected]> wrote:
>
> On Thu, Aug 15, 2019 at 04:33:06PM -0400, Jerome Glisse wrote:
>
> > So nor HMM nor driver should dereference the struct page (i do not
> > think any iommu driver would either),
>
> Er, they do technically deref the struct page:
>
> nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> struct hmm_range *range)
> struct page *page;
> page = hmm_pfn_to_page(range, range->pfns[i]);
> if (!nouveau_dmem_page(drm, page)) {
>
>
> nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
> {
> return is_device_private_page(page) && drm->dmem == page_to_dmem(page)
>
>
> Which does touch 'page->pgmap'
>
> Is this OK without having a get_dev_pagemap() ?
>
> Noting that the collision-retry scheme doesn't protect anything here
> as we can have a concurrent invalidation while doing the above deref.

As long as take_driver_page_table_lock() in Jerome's flow can replace
percpu_ref_tryget_live() on the pagemap reference. It seems
nouveau_dmem_convert_pfn() happens after:

mutex_lock(&svmm->mutex);
if (!nouveau_range_done(&range)) {

...so I would expect that to be functionally equivalent to validating
the reference count.

2019-08-16 00:42:32

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 01:47:12PM -0700, Dan Williams wrote:
> On Thu, Aug 15, 2019 at 1:41 PM Jason Gunthorpe <[email protected]> wrote:
> >
> > On Thu, Aug 15, 2019 at 04:33:06PM -0400, Jerome Glisse wrote:
> >
> > > So nor HMM nor driver should dereference the struct page (i do not
> > > think any iommu driver would either),
> >
> > Er, they do technically deref the struct page:
> >
> > nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> > struct hmm_range *range)
> > struct page *page;
> > page = hmm_pfn_to_page(range, range->pfns[i]);
> > if (!nouveau_dmem_page(drm, page)) {
> >
> >
> > nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
> > {
> > return is_device_private_page(page) && drm->dmem == page_to_dmem(page)
> >
> >
> > Which does touch 'page->pgmap'
> >
> > Is this OK without having a get_dev_pagemap() ?
> >
> > Noting that the collision-retry scheme doesn't protect anything here
> > as we can have a concurrent invalidation while doing the above deref.
>
> As long take_driver_page_table_lock() in Jerome's flow can replace
> percpu_ref_tryget_live() on the pagemap reference. It seems
> nouveau_dmem_convert_pfn() happens after:
>
> mutex_lock(&svmm->mutex);
> if (!nouveau_range_done(&range)) {
>
> ...so I would expect that to be functionally equivalent to validating
> the reference count.

Yes, OK, that makes sense. I was mostly surprised by the statement that
the driver doesn't touch the struct page.

I suppose "doesn't touch the struct page out of the driver lock" is
the case.

However, this means we cannot do any processing of ZONE_DEVICE pages
outside the driver lock, so eg, doing any DMA map that might rely on
MEMORY_DEVICE_PCI_P2PDMA has to be done in the driver lock, which is
a bit unfortunate.

Jason

2019-08-16 00:45:26

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 04:51:33PM -0400, Jerome Glisse wrote:

> struct page. In this case any way we can update the
> nouveau_dmem_page() to check that page page->pgmap == the
> expected pgmap.

I was also wondering if that is a problem.. just blindly doing a
container_of on the page->pgmap does seem like it assumes that only
this driver is using DEVICE_PRIVATE.

It seems like something missing in hmm_range_fault, it should be told
what DEVICE_PRIVATE is acceptable to trigger HMM_PFN_DEVICE_PRIVATE
and fault all others?

Jason

2019-08-16 03:56:02

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 5:41 PM Jason Gunthorpe <[email protected]> wrote:
>
> On Thu, Aug 15, 2019 at 01:47:12PM -0700, Dan Williams wrote:
> > On Thu, Aug 15, 2019 at 1:41 PM Jason Gunthorpe <[email protected]> wrote:
> > >
> > > On Thu, Aug 15, 2019 at 04:33:06PM -0400, Jerome Glisse wrote:
> > >
> > > > So nor HMM nor driver should dereference the struct page (i do not
> > > > think any iommu driver would either),
> > >
> > > Er, they do technically deref the struct page:
> > >
> > > nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
> > > struct hmm_range *range)
> > > struct page *page;
> > > page = hmm_pfn_to_page(range, range->pfns[i]);
> > > if (!nouveau_dmem_page(drm, page)) {
> > >
> > >
> > > nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
> > > {
> > > return is_device_private_page(page) && drm->dmem == page_to_dmem(page)
> > >
> > >
> > > Which does touch 'page->pgmap'
> > >
> > > Is this OK without having a get_dev_pagemap() ?
> > >
> > > Noting that the collision-retry scheme doesn't protect anything here
> > > as we can have a concurrent invalidation while doing the above deref.
> >
> > As long take_driver_page_table_lock() in Jerome's flow can replace
> > percpu_ref_tryget_live() on the pagemap reference. It seems
> > nouveau_dmem_convert_pfn() happens after:
> >
> > mutex_lock(&svmm->mutex);
> > if (!nouveau_range_done(&range)) {
> >
> > ...so I would expect that to be functionally equivalent to validating
> > the reference count.
>
> Yes, OK, that makes sense, I was mostly surprised by the statement the
> driver doesn't touch the struct page..
>
> I suppose "doesn't touch the struct page out of the driver lock" is
> the case.
>
> However, this means we cannot do any processing of ZONE_DEVICE pages
> outside the driver lock, so eg, doing any DMA map that might rely on
> MEMORY_DEVICE_PCI_P2PDMA has to be done in the driver lock, which is
> a bit unfortunate.

Wouldn't P2PDMA use page pins? Not needing to hold a lock over
ZONE_DEVICE page operations was one of the motivations for plumbing
get_dev_pagemap() with a percpu-ref.

2019-08-16 04:42:55

by Christoph Hellwig

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 04:33:06PM -0400, Jerome Glisse wrote:
> So nor HMM nor driver should dereference the struct page (i do not
> think any iommu driver would either),

Both current hmm drivers convert the hmm pfn back to a page and
eventually call dma_map_page on it. As do the ODP patches from you.

2019-08-16 04:47:14

by Christoph Hellwig

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Fri, Aug 16, 2019 at 12:43:07AM +0000, Jason Gunthorpe wrote:
> On Thu, Aug 15, 2019 at 04:51:33PM -0400, Jerome Glisse wrote:
>
> > struct page. In this case any way we can update the
> > nouveau_dmem_page() to check that page page->pgmap == the
> > expected pgmap.
>
> I was also wondering if that is a problem.. just blindly doing a
> container_of on the page->pgmap does seem like it assumes that only
> this driver is using DEVICE_PRIVATE.
>
> It seems like something missing in hmm_range_fault, it should be told
> what DEVICE_PRIVATE is acceptable to trigger HMM_PFN_DEVICE_PRIVATE
> and fault all others?

The whole device private handling in hmm and migrate_vma seems pretty
broken as far as I can tell, and I have some WIP patches. Basically we
should not touch (or possibly, eventually, migrate to ram) device
private pages not owned by the caller, where I try to define the
caller by the dev_pagemap_ops instance.

2019-08-16 12:25:36

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Thu, Aug 15, 2019 at 08:54:46PM -0700, Dan Williams wrote:

> > However, this means we cannot do any processing of ZONE_DEVICE pages
> > outside the driver lock, so eg, doing any DMA map that might rely on
> > MEMORY_DEVICE_PCI_P2PDMA has to be done in the driver lock, which is
> > a bit unfortunate.
>
> Wouldn't P2PDMA use page pins? Not needing to hold a lock over
> ZONE_DEVICE page operations was one of the motivations for plumbing
> get_dev_pagemap() with a percpu-ref.

hmm_range_fault() doesn't use page pins at all, so if a ZONE_DEVICE
page comes out of it then it needs to use another locking pattern.

If I follow it all right:

We can do a get_dev_pagemap inside the page_walk and touch the pgmap,
or we can do the 'device mutex && retry' pattern and touch the pgmap
in the driver, under that lock.

However in all cases the current get_dev_pagemap()'s in the page walk
are not necessary, and we can delete them.

?

Jason

2019-08-16 12:33:42

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Fri, Aug 16, 2019 at 06:44:48AM +0200, Christoph Hellwig wrote:
> On Fri, Aug 16, 2019 at 12:43:07AM +0000, Jason Gunthorpe wrote:
> > On Thu, Aug 15, 2019 at 04:51:33PM -0400, Jerome Glisse wrote:
> >
> > > struct page. In this case any way we can update the
> > > nouveau_dmem_page() to check that page page->pgmap == the
> > > expected pgmap.
> >
> > I was also wondering if that is a problem.. just blindly doing a
> > container_of on the page->pgmap does seem like it assumes that only
> > this driver is using DEVICE_PRIVATE.
> >
> > It seems like something missing in hmm_range_fault, it should be told
> > what DEVICE_PRIVATE is acceptable to trigger HMM_PFN_DEVICE_PRIVATE
> > and fault all others?
>
> The whole device private handling in hmm and migrate_vma seems pretty
> broken as far as I can tell, and I have some WIP patches. Basically we
> should not touch (or possibly eventually call migrate to ram eventually
> in the future) device private pages not owned by the caller, where I
> try to defined the caller by the dev_pagemap_ops instance.

I think it needs to be more elaborate.

For instance, a system may have multiple DEVICE_PRIVATE maps owned by
the same driver - but multiple physical devices using that driver.

Each physical device's driver should only ever get DEVICE_PRIVATE
pages for its own on-device memory. Never a DEVICE_PRIVATE for
another device's memory.

The dev_pagemap_ops would not be unique enough, right?

Probably also clusters of same-driver struct device can share a
DEVICE_PRIVATE, at least high end GPUs now have private memory
coherency busses between their devices.

Since we want to trigger migration to CPU on incompatible
DEVICE_PRIVATE pages, it seems best to sort this out in the
hmm_range_fault?

Maybe some sort of unique ID inside the page->pgmap and passed as
input?

Jason

2019-08-16 12:35:52

by Christoph Hellwig

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Fri, Aug 16, 2019 at 12:30:41PM +0000, Jason Gunthorpe wrote:
>
> For instance, a system may have multiple DEVICE_PRIVATE map's owned by
> the same driver - but multiple physical devices using that driver.
>
> Each physical device's driver should only ever get DEVICE_PRIVATE
> pages for it's own on-device memory. Never a DEVICE_PRIVATE for
> another device's memory.
>
> The dev_pagemap_ops would not be unique enough, right?

True.

>
> Probably also clusters of same-driver struct device can share a
> DEVICE_PRIVATE, at least high end GPU's now have private memory
> coherency busses between their devices.
>
> Since we want to trigger migration to CPU on incompatible
> DEVICE_PRIVATE pages, it seems best to sort this out in the
> hmm_range_fault?
>
> Maybe some sort of unique ID inside the page->pgmap and passed as
> input?

Yes, we'll probably need an owner field. And it's not just
hmm_range_fault; the migrate_vma_* routines are affected in the same
way.
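
Roughly, and purely as an illustration (assuming struct dev_pagemap
grows a "void *owner" cookie set by the creating driver, and that
hmm_range_fault() / migrate_vma_*() callers pass in the owner they
accept - none of this exists yet):

static bool device_private_owned_by(struct page *page, void *owner)
{
	/* page->pgmap->owner is the assumed new field */
	return is_device_private_page(page) &&
		page->pgmap->owner == owner;
}

Anything failing that test would be treated like an ordinary fault,
i.e. migrated back to system memory.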

2019-08-16 17:23:54

by Dan Williams

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Fri, Aug 16, 2019 at 5:24 AM Jason Gunthorpe <[email protected]> wrote:
>
> On Thu, Aug 15, 2019 at 08:54:46PM -0700, Dan Williams wrote:
>
> > > However, this means we cannot do any processing of ZONE_DEVICE pages
> > > outside the driver lock, so eg, doing any DMA map that might rely on
> > > MEMORY_DEVICE_PCI_P2PDMA has to be done in the driver lock, which is
> > > a bit unfortunate.
> >
> > Wouldn't P2PDMA use page pins? Not needing to hold a lock over
> > ZONE_DEVICE page operations was one of the motivations for plumbing
> > get_dev_pagemap() with a percpu-ref.
>
> hmm_range_fault() doesn't use page pins at all, so if a ZONE_DEVICE
> page comes out of it then it needs to use another locking pattern.
>
> If I follow it all right:
>
> We can do a get_dev_pagemap inside the page_walk and touch the pgmap,
> or we can do the 'device mutex && retry' pattern and touch the pgmap
> in the driver, under that lock.
>
> However in all cases the current get_dev_pagemap()'s in the page walk
> are not necessary, and we can delete them.

Yes, as long as 'struct page' instances resulting from that lookup are
not passed outside of that lock.

2019-08-16 17:29:42

by Jason Gunthorpe

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

On Fri, Aug 16, 2019 at 10:21:41AM -0700, Dan Williams wrote:

> > We can do a get_dev_pagemap inside the page_walk and touch the pgmap,
> > or we can do the 'device mutex && retry' pattern and touch the pgmap
> > in the driver, under that lock.
> >
> > However in all cases the current get_dev_pagemap()'s in the page walk
> > are not necessary, and we can delete them.
>
> Yes, as long as 'struct page' instances resulting from that lookup are
> not passed outside of that lock.

Indeed.

Also, I was reflecting over lunch that the hmm_range_fault should only
return DEVICE_PRIVATE pages for the caller's device (see other thread
with HCH), and in this case the caller should also be responsible for
ensuring that the driver is not calling hmm_range_fault at the same
time it is deleting its own DEVICE_PRIVATE mapping - ie by fencing its
page fault handler.
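
By "fencing" I mean something along these lines in the driver teardown
path (purely illustrative driver-side names):

static void mydriver_teardown_dmem(struct mydriver *drv)
{
	/* stop the device page fault handler from starting new faults */
	mydriver_disable_fault_handler(drv);

	/* wait for any hmm_range_fault() already in flight */
	flush_workqueue(drv->fault_wq);

	/* now nothing can look up our DEVICE_PRIVATE pages any more,
	 * so the pagemap can be torn down, e.g. devm_memunmap_pages() */
	mydriver_release_dmem(drv);
}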

This does not apply to PCI_P2PDMA, but, lets see how that looks when
we get there.

So the whole thing seems pretty safe.

Jason

2019-08-16 21:12:58

by Ralph Campbell

Subject: Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk


On 8/16/19 10:28 AM, Jason Gunthorpe wrote:
> On Fri, Aug 16, 2019 at 10:21:41AM -0700, Dan Williams wrote:
>
>>> We can do a get_dev_pagemap inside the page_walk and touch the pgmap,
>>> or we can do the 'device mutex && retry' pattern and touch the pgmap
>>> in the driver, under that lock.
>>>
>>> However in all cases the current get_dev_pagemap()'s in the page walk
>>> are not necessary, and we can delete them.
>>
>> Yes, as long as 'struct page' instances resulting from that lookup are
>> not passed outside of that lock.
>
> Indeed.
>
> Also, I was reflecting over lunch that the hmm_range_fault should only
> return DEVICE_PRIVATE pages for the caller's device (see other thread
> with HCH), and in this case, the caller should also be responsible to
> ensure that the driver is not calling hmm_range_fault at the same time
> it is deleting it's own DEVICE_PRIVATE mapping - ie by fencing its
> page fault handler.

Yes, that would make it a one-step process to access another
device's migrated memory pages.
Right now, it has to be a two-step process where the caller calls
hmm_range_fault, checks the struct page to see if it is device
private and not owned, then calls hmm_range_fault again with
range->pfns[i] |= range->flags[HMM_PFN_DEVICE_PRIVATE] to cause
the other device to migrate the page back to system memory.
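
Roughly, the two-step flow looks like this (error handling and the
usual lock/retry around hmm_range_fault() omitted; owned_by_us() is a
placeholder for the driver's ownership check):

	bool need_retry = false;
	unsigned long i;
	long ret;

	ret = hmm_range_fault(&range, 0);

	for (i = 0; i < npages; i++) {
		struct page *page = hmm_pfn_to_page(&range, range.pfns[i]);

		if (page && is_device_private_page(page) &&
		    !owned_by_us(page)) {
			/* ask for it to be migrated back to system memory */
			range.pfns[i] |= range.flags[HMM_PFN_DEVICE_PRIVATE];
			need_retry = true;
		}
	}
	if (need_retry)
		ret = hmm_range_fault(&range, 0);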