From: Jérôme Glisse <[email protected]>
A few fixes that only affect HMM users. Improve the synchronization
callback so that it matches what other mmu_notifier listeners do, and add
proper support for the new blockable flag in the process.
For curious folks here are branches to leverage HMM in various existing
device drivers:
https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-nouveau-v01
https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-radeon-v00
https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-intel-v00
More to come (amd gpu, Mellanox, ...)
I expect more of the preparatory work for nouveau to be merged in 4.20
(as we have been doing since 4.16), and I will wait until this patchset
is upstream before pushing the patches that actually make use of HMM (to
avoid complex tree inter-dependencies).
Jérôme Glisse (5):
mm/hmm: fix utf8 ...
mm/hmm: properly handle migration pmd
mm/hmm: use a structure for update callback parameters
mm/hmm: invalidate device page table at start of invalidation
mm/hmm: proper support for blockable mmu_notifier
Ralph Campbell (2):
mm/rmap: map_pte() was not handling private ZONE_DEVICE page properly
mm/hmm: fix race between hmm_mirror_unregister() and mmu_notifier
callback
include/linux/hmm.h | 37 +++++++----
mm/hmm.c | 150 ++++++++++++++++++++++++++++++-------------
mm/page_vma_mapped.c | 9 +++
3 files changed, 142 insertions(+), 54 deletions(-)
--
2.17.1
From: Jérôme Glisse <[email protected]>
Invalidate the device page table at the start of an invalidation, and
invalidate any in-progress CPU page table snapshot at both the start and
the end of the invalidation.
This is helpful when the device needs to dirty a page because the device
page table reports the page as dirty: dirtying the page must happen in
the start mmu_notifier callback, not in the end one.
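To illustrate, a driver-side callback might look like the rough sketch
below. The my_device_*() helpers are made up for this example and are not
part of HMM or of any real driver; the point is only that dirty bits are
harvested in the start callback, while the CPU page table entries still
exist.

static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					  const struct hmm_update *update)
{
	struct my_device *mydev = my_device_from_mirror(mirror);
	unsigned long addr;

	for (addr = update->start; addr < update->end; addr += PAGE_SIZE) {
		struct page *page;

		/* Hypothetical helper: clear the device dirty bit and
		 * return the backing page if the entry was dirty.
		 */
		page = my_device_clear_and_test_dirty(mydev, addr);
		if (page)
			set_page_dirty(page);
	}

	/* Tear down device page table entries and flush device TLBs. */
	my_device_flush_range(mydev, update->start, update->end);
}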
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/hmm.h | 2 +-
mm/hmm.c | 21 ++++++++++++++-------
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index a7f7600b6bb0..064924bce75c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -325,7 +325,7 @@ struct hmm_mirror_ops {
* synchronous call.
*/
void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
- const struct hmm_update *update);
+ const struct hmm_update *update);
};
/*
diff --git a/mm/hmm.c b/mm/hmm.c
index debd2f734ab5..6fe31e2bfa1e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -43,7 +43,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
*
* @mm: mm struct this HMM struct is bound to
* @lock: lock protecting ranges list
- * @sequence: we track updates to the CPU page table with a sequence number
* @ranges: list of range being snapshotted
* @mirrors: list of mirrors for this mm
* @mmu_notifier: mmu notifier to track updates to CPU page table
@@ -52,7 +51,6 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
struct hmm {
struct mm_struct *mm;
spinlock_t lock;
- atomic_t sequence;
struct list_head ranges;
struct list_head mirrors;
struct mmu_notifier mmu_notifier;
@@ -85,7 +83,6 @@ static struct hmm *hmm_register(struct mm_struct *mm)
return NULL;
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
- atomic_set(&hmm->sequence, 0);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
spin_lock_init(&hmm->lock);
@@ -126,8 +123,8 @@ void hmm_mm_destroy(struct mm_struct *mm)
kfree(mm->hmm);
}
-static void hmm_invalidate_range(struct hmm *hmm,
- const struct hmm_update *update)
+static void hmm_invalidate_range(struct hmm *hmm, bool device,
+ const struct hmm_update *update)
{
struct hmm_mirror *mirror;
struct hmm_range *range;
@@ -147,6 +144,9 @@ static void hmm_invalidate_range(struct hmm *hmm,
}
spin_unlock(&hmm->lock);
+ if (!device)
+ return;
+
down_read(&hmm->mirrors_sem);
list_for_each_entry(mirror, &hmm->mirrors, list)
mirror->ops->sync_cpu_device_pagetables(mirror, update);
@@ -185,11 +185,18 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
unsigned long end,
bool blockable)
{
+ struct hmm_update update;
struct hmm *hmm = mm->hmm;
+ if (!blockable)
+ return -EAGAIN;
+
VM_BUG_ON(!hmm);
- atomic_inc(&hmm->sequence);
+ update.start = start;
+ update.end = end;
+ update.event = HMM_UPDATE_INVALIDATE;
+ hmm_invalidate_range(hmm, true, &update);
return 0;
}
@@ -207,7 +214,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
update.start = start;
update.end = end;
update.event = HMM_UPDATE_INVALIDATE;
- hmm_invalidate_range(hmm, &update);
+ hmm_invalidate_range(hmm, false, &update);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
--
2.17.1
From: Jérôme Glisse <[email protected]>
When mmu_notifier calls the invalidate_range_start callback with blockable
set to false, we should not sleep. Properly propagate this to HMM users.
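For driver authors, a minimal sketch of a callback honoring the new return
convention could look like this (the my_device_*() helpers are
hypothetical, not part of HMM):

static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					 const struct hmm_update *update)
{
	struct my_device *mydev = my_device_from_mirror(mirror);

	if (!update->blockable) {
		/* Not allowed to sleep: only try to take the lock. */
		if (!my_device_trylock(mydev))
			return -EAGAIN;
	} else {
		my_device_lock(mydev);
	}

	my_device_invalidate_range(mydev, update->start, update->end);
	my_device_unlock(mydev);
	return 0;
}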
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/hmm.h | 12 +++++++++---
mm/hmm.c | 39 ++++++++++++++++++++++++++++-----------
2 files changed, 37 insertions(+), 14 deletions(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 064924bce75c..c783916f8732 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -287,11 +287,13 @@ enum hmm_update_event {
* @start: virtual start address of the range to update
* @end: virtual end address of the range to update
* @event: event triggering the update (what is happening)
+ * @blockable: can the callback block/sleep ?
*/
struct hmm_update {
unsigned long start;
unsigned long end;
enum hmm_update_event event;
+ bool blockable;
};
/*
@@ -314,6 +316,8 @@ struct hmm_mirror_ops {
*
* @mirror: pointer to struct hmm_mirror
* @update: update informations (see struct hmm_update)
+ * Returns: -EAGAIN if update.blockable false and callback need to
+ * block, 0 otherwise.
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
@@ -322,10 +326,12 @@ struct hmm_mirror_ops {
*
* The device driver must not return from this callback until the device
* page tables are completely updated (TLBs flushed, etc); this is a
- * synchronous call.
+ * synchronous call. If driver need to sleep and update->blockable is
+ * false then you need to abort (do not do anything that would sleep or
+ * block) and return -EAGAIN.
*/
- void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
- const struct hmm_update *update);
+ int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+ const struct hmm_update *update);
};
/*
diff --git a/mm/hmm.c b/mm/hmm.c
index 6fe31e2bfa1e..1d8fcaa0606f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -123,12 +123,18 @@ void hmm_mm_destroy(struct mm_struct *mm)
kfree(mm->hmm);
}
-static void hmm_invalidate_range(struct hmm *hmm, bool device,
- const struct hmm_update *update)
+static int hmm_invalidate_range(struct hmm *hmm, bool device,
+ const struct hmm_update *update)
{
struct hmm_mirror *mirror;
struct hmm_range *range;
+ /*
+ * It is fine to wait on lock here even if update->blockable is false
+ * as the hmm->lock is only held for short period of time (when adding
+ * or walking the ranges list). We could also convert the range list
+ * into a lru list and avoid the spinlock all together.
+ */
spin_lock(&hmm->lock);
list_for_each_entry(range, &hmm->ranges, list) {
unsigned long addr, idx, npages;
@@ -145,12 +151,26 @@ static void hmm_invalidate_range(struct hmm *hmm, bool device,
spin_unlock(&hmm->lock);
if (!device)
- return;
+ return 0;
+ /*
+ * It is fine to wait on mirrors_sem here even if update->blockable is
+ * false as this semaphore is only taken in write mode for short period
+ * when adding a new mirror to the list.
+ */
down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list)
- mirror->ops->sync_cpu_device_pagetables(mirror, update);
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int ret;
+
+ ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
+ if (!update->blockable && ret == -EAGAIN) {
+ up_read(&hmm->mirrors_sem);
+ return -EAGAIN;
+ }
+ }
up_read(&hmm->mirrors_sem);
+
+ return 0;
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
@@ -188,17 +208,13 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
struct hmm_update update;
struct hmm *hmm = mm->hmm;
- if (!blockable)
- return -EAGAIN;
-
VM_BUG_ON(!hmm);
update.start = start;
update.end = end;
update.event = HMM_UPDATE_INVALIDATE;
- hmm_invalidate_range(hmm, true, &update);
-
- return 0;
+ update.blockable = blockable;
+ return hmm_invalidate_range(hmm, true, &update);
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@ -214,6 +230,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
update.start = start;
update.end = end;
update.event = HMM_UPDATE_INVALIDATE;
+ update.blockable = true;
hmm_invalidate_range(hmm, false, &update);
}
--
2.17.1
From: Jérôme Glisse <[email protected]>
Before this patch a migration pmd entry (!pmd_present()) would have
been treated as a bad entry (pmd_bad() returns true on a migration
pmd entry). The outcome was that the device driver would believe that
the range covered by the pmd was bad and would either SIGBUS or
simply kill all the device's threads (each device driver decides
how to react when the device tries to access a poisonous or invalid
range of memory).
This patch explicitly handles the case of migration pmd entries, which
are non-present pmd entries, and either waits for the migration to
finish or reports an empty range (when the device is just trying to
pre-fill a range of virtual addresses and thus does not want to wait
or trigger a page fault).
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Andrew Morton <[email protected]>
---
mm/hmm.c | 45 +++++++++++++++++++++++++++++++++++++++------
1 file changed, 39 insertions(+), 6 deletions(-)
diff --git a/mm/hmm.c b/mm/hmm.c
index a16678d08127..659efc9aada6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -577,22 +577,47 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
+ pmd_t pmd;
- i = (addr - range->start) >> PAGE_SHIFT;
again:
- if (pmd_none(*pmdp))
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
+ if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
return hmm_pfns_bad(start, end, walk);
- if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
- pmd_t pmd;
+ if (!pmd_present(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ if (is_migration_entry(entry)) {
+ bool fault, write_fault;
+ unsigned long npages;
+ uint64_t *pfns;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ 0, &fault, &write_fault);
+ if (fault || write_fault) {
+ hmm_vma_walk->last = addr;
+ pmd_migration_entry_wait(vma->vm_mm, pmdp);
+ return -EAGAIN;
+ }
+ return 0;
+ }
+
+ return hmm_pfns_bad(start, end, walk);
+ }
+ if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
* No need to take pmd_lock here, even if some other threads
* is splitting the huge pmd we will get that event through
@@ -607,13 +632,21 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
+ i = (addr - range->start) >> PAGE_SHIFT;
return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
}
- if (pmd_bad(*pmdp))
+ /*
+ * We have handled all the valid case above ie either none, migration,
+ * huge or transparent huge. At this point either it is a valid pmd
+ * entry pointing to pte directory or it is a bad pmd that will not
+ * recover.
+ */
+ if (pmd_bad(pmd))
return hmm_pfns_bad(start, end, walk);
ptep = pte_offset_map(pmdp, addr);
+ i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
int r;
--
2.17.1
From: Ralph Campbell <[email protected]>
In hmm_mirror_unregister(), mm->hmm is set to NULL and then
mmu_notifier_unregister_no_release() is called. That creates a small
window where mmu_notifier can call mmu_notifier_ops with mm->hmm equal
to NULL. Fix this by first unregistering mmu notifier callbacks and
then setting mm->hmm to NULL.
Similarly in hmm_register(), set mm->hmm before registering mmu_notifier
callbacks so callback functions always see mm->hmm set.
Signed-off-by: Ralph Campbell <[email protected]>
Reviewed-by: John Hubbard <[email protected]>
Reviewed-by: Jérôme Glisse <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
---
mm/hmm.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/hmm.c b/mm/hmm.c
index 9a068a1da487..a16678d08127 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -91,16 +91,6 @@ static struct hmm *hmm_register(struct mm_struct *mm)
spin_lock_init(&hmm->lock);
hmm->mm = mm;
- /*
- * We should only get here if hold the mmap_sem in write mode ie on
- * registration of first mirror through hmm_mirror_register()
- */
- hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
- if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
- kfree(hmm);
- return NULL;
- }
-
spin_lock(&mm->page_table_lock);
if (!mm->hmm)
mm->hmm = hmm;
@@ -108,12 +98,27 @@ static struct hmm *hmm_register(struct mm_struct *mm)
cleanup = true;
spin_unlock(&mm->page_table_lock);
- if (cleanup) {
- mmu_notifier_unregister(&hmm->mmu_notifier, mm);
- kfree(hmm);
- }
+ if (cleanup)
+ goto error;
+
+ /*
+ * We should only get here if hold the mmap_sem in write mode ie on
+ * registration of first mirror through hmm_mirror_register()
+ */
+ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+ if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
+ goto error_mm;
return mm->hmm;
+
+error_mm:
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
+error:
+ kfree(hmm);
+ return NULL;
}
void hmm_mm_destroy(struct mm_struct *mm)
@@ -278,12 +283,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
if (!should_unregister || mm == NULL)
return;
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
+
spin_lock(&mm->page_table_lock);
if (mm->hmm == hmm)
mm->hmm = NULL;
spin_unlock(&mm->page_table_lock);
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
--
2.17.1
From: Jérôme Glisse <[email protected]>
Use a structure to gather all the parameters for the update callback.
This makes it easier to add new parameters, by avoiding having to update
every callback function signature.
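As an illustration, the driver-side conversion is mechanical; something
like the sketch below, where my_sync_cpu_device_pagetables() and
my_device_invalidate_range() are hypothetical driver functions:

/* Before: every parameter is passed explicitly. */
static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					  enum hmm_update_type update_type,
					  unsigned long start,
					  unsigned long end)
{
	my_device_invalidate_range(mirror, start, end);
}

/* After: everything comes through the hmm_update structure. */
static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					  const struct hmm_update *update)
{
	my_device_invalidate_range(mirror, update->start, update->end);
}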
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/hmm.h | 25 +++++++++++++++++--------
mm/hmm.c | 27 ++++++++++++++-------------
2 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 1ff4bae7ada7..a7f7600b6bb0 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -274,13 +274,26 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
struct hmm_mirror;
/*
- * enum hmm_update_type - type of update
+ * enum hmm_update_event - type of update
* @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
*/
-enum hmm_update_type {
+enum hmm_update_event {
HMM_UPDATE_INVALIDATE,
};
+/*
+ * struct hmm_update - HMM update informations for callback
+ *
+ * @start: virtual start address of the range to update
+ * @end: virtual end address of the range to update
+ * @event: event triggering the update (what is happening)
+ */
+struct hmm_update {
+ unsigned long start;
+ unsigned long end;
+ enum hmm_update_event event;
+};
+
/*
* struct hmm_mirror_ops - HMM mirror device operations callback
*
@@ -300,9 +313,7 @@ struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
- * @update_type: type of update that occurred to the CPU page table
- * @start: virtual start address of the range to update
- * @end: virtual end address of the range to update
+ * @update: update informations (see struct hmm_update)
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
@@ -314,9 +325,7 @@ struct hmm_mirror_ops {
* synchronous call.
*/
void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
- enum hmm_update_type update_type,
- unsigned long start,
- unsigned long end);
+ const struct hmm_update *update);
};
/*
diff --git a/mm/hmm.c b/mm/hmm.c
index 659efc9aada6..debd2f734ab5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -127,9 +127,7 @@ void hmm_mm_destroy(struct mm_struct *mm)
}
static void hmm_invalidate_range(struct hmm *hmm,
- enum hmm_update_type action,
- unsigned long start,
- unsigned long end)
+ const struct hmm_update *update)
{
struct hmm_mirror *mirror;
struct hmm_range *range;
@@ -138,21 +136,20 @@ static void hmm_invalidate_range(struct hmm *hmm,
list_for_each_entry(range, &hmm->ranges, list) {
unsigned long addr, idx, npages;
- if (end < range->start || start >= range->end)
+ if (update->end < range->start || update->start >= range->end)
continue;
range->valid = false;
- addr = max(start, range->start);
+ addr = max(update->start, range->start);
idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
+ npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
}
spin_unlock(&hmm->lock);
down_read(&hmm->mirrors_sem);
list_for_each_entry(mirror, &hmm->mirrors, list)
- mirror->ops->sync_cpu_device_pagetables(mirror, action,
- start, end);
+ mirror->ops->sync_cpu_device_pagetables(mirror, update);
up_read(&hmm->mirrors_sem);
}
@@ -183,10 +180,10 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ bool blockable)
{
struct hmm *hmm = mm->hmm;
@@ -202,11 +199,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
unsigned long start,
unsigned long end)
{
+ struct hmm_update update;
struct hmm *hmm = mm->hmm;
VM_BUG_ON(!hmm);
- hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+ update.start = start;
+ update.end = end;
+ update.event = HMM_UPDATE_INVALIDATE;
+ hmm_invalidate_range(hmm, &update);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
--
2.17.1
From: Ralph Campbell <[email protected]>
Private ZONE_DEVICE pages use a special pte entry and thus are not
present. Properly handle this case in map_pte(); it is already handled
in check_pte(), the map_pte() part was most probably lost in some rebase.
Without this patch the slow migration path cannot migrate private
ZONE_DEVICE memory back to regular memory. This was found after stress
testing migration back to system memory. It can ultimately lead the
CPU into an infinite page fault loop on the special swap entry.
Signed-off-by: Ralph Campbell <[email protected]>
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: [email protected]
---
mm/page_vma_mapped.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae3c2a35d61b..1cf5b9bfb559 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -21,6 +21,15 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
if (!is_swap_pte(*pvmw->pte))
return false;
} else {
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ /* Handle un-addressable ZONE_DEVICE memory */
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (is_device_private_entry(entry))
+ return true;
+ }
+
if (!pte_present(*pvmw->pte))
return false;
}
--
2.17.1
From: Jérôme Glisse <[email protected]>
Somehow the UTF-8 encoding must have been broken.
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/hmm.h | 2 +-
mm/hmm.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4c92e3ba3e16..1ff4bae7ada7 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * Authors: Jérôme Glisse <[email protected]>
+ * Authors: Jérôme Glisse <[email protected]>
*/
/*
* Heterogeneous Memory Management (HMM)
diff --git a/mm/hmm.c b/mm/hmm.c
index c968e49f7a0c..9a068a1da487 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * Authors: Jérôme Glisse <[email protected]>
+ * Authors: Jérôme Glisse <[email protected]>
*/
/*
* Refer to include/linux/hmm.h for information about heterogeneous memory
--
2.17.1
Hi Jérôme,
On 24 Aug 2018, at 15:25, [email protected] wrote:
> From: Jérôme Glisse <[email protected]>
>
> Before this patch migration pmd entry (!pmd_present()) would have
> been treated as a bad entry (pmd_bad() returns true on migration
> pmd entry). The outcome was that device driver would believe that
> the range covered by the pmd was bad and would either SIGBUS or
> simply kill all the device's threads (each device driver decide
> how to react when the device tries to access poisonnous or invalid
> range of memory).
>
> This patch explicitly handle the case of migration pmd entry which
> are non present pmd entry and either wait for the migration to
> finish or report empty range (when device is just trying to pre-
> fill a range of virtual address and thus do not want to wait or
> trigger page fault).
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> Signed-off-by: Jérôme Glisse <[email protected]>
> Cc: Ralph Campbell <[email protected]>
> Cc: John Hubbard <[email protected]>
> Cc: Andrew Morton <[email protected]>
> ---
> mm/hmm.c | 45 +++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 39 insertions(+), 6 deletions(-)
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index a16678d08127..659efc9aada6 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -577,22 +577,47 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> {
> struct hmm_vma_walk *hmm_vma_walk = walk->private;
> struct hmm_range *range = hmm_vma_walk->range;
> + struct vm_area_struct *vma = walk->vma;
> uint64_t *pfns = range->pfns;
> unsigned long addr = start, i;
> pte_t *ptep;
> + pmd_t pmd;
>
> - i = (addr - range->start) >> PAGE_SHIFT;
>
> again:
> - if (pmd_none(*pmdp))
> + pmd = READ_ONCE(*pmdp);
> + if (pmd_none(pmd))
> return hmm_vma_walk_hole(start, end, walk);
>
> - if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
> + if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
> return hmm_pfns_bad(start, end, walk);
>
> - if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
> - pmd_t pmd;
> + if (!pmd_present(pmd)) {
> + swp_entry_t entry = pmd_to_swp_entry(pmd);
> +
> + if (is_migration_entry(entry)) {
I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
Other architectures should treat PMD migration entries as bad.
> + bool fault, write_fault;
> + unsigned long npages;
> + uint64_t *pfns;
> +
> + i = (addr - range->start) >> PAGE_SHIFT;
> + npages = (end - addr) >> PAGE_SHIFT;
> + pfns = &range->pfns[i];
> +
> + hmm_range_need_fault(hmm_vma_walk, pfns, npages,
> + 0, &fault, &write_fault);
> + if (fault || write_fault) {
> + hmm_vma_walk->last = addr;
> + pmd_migration_entry_wait(vma->vm_mm, pmdp);
> + return -EAGAIN;
> + }
> + return 0;
> + }
> +
> + return hmm_pfns_bad(start, end, walk);
> + }
>
—
Best Regards,
Yan Zi
On Fri, Aug 24, 2018 at 08:05:46PM -0400, Zi Yan wrote:
> Hi Jérôme,
>
> On 24 Aug 2018, at 15:25, [email protected] wrote:
>
> > From: Jérôme Glisse <[email protected]>
> >
> > Before this patch migration pmd entry (!pmd_present()) would have
> > been treated as a bad entry (pmd_bad() returns true on migration
> > pmd entry). The outcome was that device driver would believe that
> > the range covered by the pmd was bad and would either SIGBUS or
> > simply kill all the device's threads (each device driver decide
> > how to react when the device tries to access poisonnous or invalid
> > range of memory).
> >
> > This patch explicitly handle the case of migration pmd entry which
> > are non present pmd entry and either wait for the migration to
> > finish or report empty range (when device is just trying to pre-
> > fill a range of virtual address and thus do not want to wait or
> > trigger page fault).
> >
> > Signed-off-by: Aneesh Kumar K.V <[email protected]>
> > Signed-off-by: Jérôme Glisse <[email protected]>
> > Cc: Ralph Campbell <[email protected]>
> > Cc: John Hubbard <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > ---
> > mm/hmm.c | 45 +++++++++++++++++++++++++++++++++++++++------
> > 1 file changed, 39 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/hmm.c b/mm/hmm.c
> > index a16678d08127..659efc9aada6 100644
> > --- a/mm/hmm.c
> > +++ b/mm/hmm.c
> > @@ -577,22 +577,47 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> > {
> > struct hmm_vma_walk *hmm_vma_walk = walk->private;
> > struct hmm_range *range = hmm_vma_walk->range;
> > + struct vm_area_struct *vma = walk->vma;
> > uint64_t *pfns = range->pfns;
> > unsigned long addr = start, i;
> > pte_t *ptep;
> > + pmd_t pmd;
> >
> > - i = (addr - range->start) >> PAGE_SHIFT;
> >
> > again:
> > - if (pmd_none(*pmdp))
> > + pmd = READ_ONCE(*pmdp);
> > + if (pmd_none(pmd))
> > return hmm_vma_walk_hole(start, end, walk);
> >
> > - if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
> > + if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
> > return hmm_pfns_bad(start, end, walk);
> >
> > - if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
> > - pmd_t pmd;
> > + if (!pmd_present(pmd)) {
> > + swp_entry_t entry = pmd_to_swp_entry(pmd);
> > +
> > + if (is_migration_entry(entry)) {
>
> I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> Other architectures should treat PMD migration entries as bad.
You are right. Andrew, do you want me to repost, or can you edit the
above if to:
if (thp_migration_supported() && is_migration_entry(entry)) {
Cheers,
Jérôme
On Fri 24-08-18 20:05:46, Zi Yan wrote:
[...]
> > + if (!pmd_present(pmd)) {
> > + swp_entry_t entry = pmd_to_swp_entry(pmd);
> > +
> > + if (is_migration_entry(entry)) {
>
> I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> Other architectures should treat PMD migration entries as bad.
How can we have a migration pmd entry when the migration is not
supported?
--
Michal Hocko
SUSE Labs
On Tue, Aug 28, 2018 at 05:24:14PM +0200, Michal Hocko wrote:
> On Fri 24-08-18 20:05:46, Zi Yan wrote:
> [...]
> > > + if (!pmd_present(pmd)) {
> > > + swp_entry_t entry = pmd_to_swp_entry(pmd);
> > > +
> > > + if (is_migration_entry(entry)) {
> >
> > I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> > Other architectures should treat PMD migration entries as bad.
>
> How can we have a migration pmd entry when the migration is not
> supported?
Not sure I follow here; migration can happen anywhere (assuming that
something like compaction or NUMA balancing is active, ...). So this
code can face a pmd migration entry on an architecture that supports
it. What is missing here is a thp_migration_supported() call to
protect the is_migration_entry() check and avoid false positives on
architectures which do not support THP migration.
Cheers,
Jérôme
On Tue 28-08-18 11:36:59, Jerome Glisse wrote:
> On Tue, Aug 28, 2018 at 05:24:14PM +0200, Michal Hocko wrote:
> > On Fri 24-08-18 20:05:46, Zi Yan wrote:
> > [...]
> > > > + if (!pmd_present(pmd)) {
> > > > + swp_entry_t entry = pmd_to_swp_entry(pmd);
> > > > +
> > > > + if (is_migration_entry(entry)) {
> > >
> > > I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> > > Other architectures should treat PMD migration entries as bad.
> >
> > How can we have a migration pmd entry when the migration is not
> > supported?
>
> Not sure i follow here, migration can happen anywhere (assuming
> that something like compaction is active or numa or ...). So this
> code can face pmd migration entry on architecture that support
> it. What is missing here is thp_migration_supported() call to
> protect the is_migration_entry() to avoid false positive on arch
> which do not support thp migration.
I mean that architectures which do not support THP migration shouldn't
ever see any migration entry. So is_migration_entry should be always
false. Or do I miss something?
--
Michal Hocko
SUSE Labs
On Tue 28-08-18 17:42:06, Michal Hocko wrote:
> On Tue 28-08-18 11:36:59, Jerome Glisse wrote:
> > On Tue, Aug 28, 2018 at 05:24:14PM +0200, Michal Hocko wrote:
> > > On Fri 24-08-18 20:05:46, Zi Yan wrote:
> > > [...]
> > > > > + if (!pmd_present(pmd)) {
> > > > > + swp_entry_t entry = pmd_to_swp_entry(pmd);
> > > > > +
> > > > > + if (is_migration_entry(entry)) {
> > > >
> > > > I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> > > > Other architectures should treat PMD migration entries as bad.
> > >
> > > How can we have a migration pmd entry when the migration is not
> > > supported?
> >
> > Not sure i follow here, migration can happen anywhere (assuming
> > that something like compaction is active or numa or ...). So this
> > code can face pmd migration entry on architecture that support
> > it. What is missing here is thp_migration_supported() call to
> > protect the is_migration_entry() to avoid false positive on arch
> > which do not support thp migration.
>
> I mean that architectures which do not support THP migration shouldn't
> ever see any migration entry. So is_migration_entry should be always
> false. Or do I miss something?
And just to be clear: thp_migration_supported should be checked only
when we actually _do_ the migration or evaluate the migratability of the
page. We definitely do not want to sprinkle this check to all places
where is_migration_entry is checked.
--
Michal Hocko
SUSE Labs
Hi Michal,
On 28 Aug 2018, at 11:45, Michal Hocko wrote:
> On Tue 28-08-18 17:42:06, Michal Hocko wrote:
>> On Tue 28-08-18 11:36:59, Jerome Glisse wrote:
>>> On Tue, Aug 28, 2018 at 05:24:14PM +0200, Michal Hocko wrote:
>>>> On Fri 24-08-18 20:05:46, Zi Yan wrote:
>>>> [...]
>>>>>> + if (!pmd_present(pmd)) {
>>>>>> + swp_entry_t entry = pmd_to_swp_entry(pmd);
>>>>>> +
>>>>>> + if (is_migration_entry(entry)) {
>>>>>
>>>>> I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
>>>>> Other architectures should treat PMD migration entries as bad.
>>>>
>>>> How can we have a migration pmd entry when the migration is not
>>>> supported?
>>>
>>> Not sure i follow here, migration can happen anywhere (assuming
>>> that something like compaction is active or numa or ...). So this
>>> code can face pmd migration entry on architecture that support
>>> it. What is missing here is thp_migration_supported() call to
>>> protect the is_migration_entry() to avoid false positive on arch
>>> which do not support thp migration.
>>
>> I mean that architectures which do not support THP migration shouldn't
>> ever see any migration entry. So is_migration_entry should be always
>> false. Or do I miss something?
>
> And just to be clear. thp_migration_supported should be checked only
> when we actually _do_ the migration or evaluate migratability of the
> page. We definitely do want to sprinkle this check to all places where
> is_migration_entry is checked.
is_migration_entry() is a general check for swp_entry_t, so it can return
true even if THP migration is not enabled. is_pmd_migration_entry() always
returns false when THP migration is not enabled.
So the code can be changed in two ways, either replacing is_migration_entry()
with is_pmd_migration_entry() or adding thp_migration_supported() check
like Jerome did.
Does this clarify your question?
—
Best Regards,
Yan Zi
On Tue, Aug 28, 2018 at 11:54:33AM -0400, Zi Yan wrote:
> Hi Michal,
>
> On 28 Aug 2018, at 11:45, Michal Hocko wrote:
>
> > On Tue 28-08-18 17:42:06, Michal Hocko wrote:
> >> On Tue 28-08-18 11:36:59, Jerome Glisse wrote:
> >>> On Tue, Aug 28, 2018 at 05:24:14PM +0200, Michal Hocko wrote:
> >>>> On Fri 24-08-18 20:05:46, Zi Yan wrote:
> >>>> [...]
> >>>>>> + if (!pmd_present(pmd)) {
> >>>>>> + swp_entry_t entry = pmd_to_swp_entry(pmd);
> >>>>>> +
> >>>>>> + if (is_migration_entry(entry)) {
> >>>>>
> >>>>> I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> >>>>> Other architectures should treat PMD migration entries as bad.
> >>>>
> >>>> How can we have a migration pmd entry when the migration is not
> >>>> supported?
> >>>
> >>> Not sure i follow here, migration can happen anywhere (assuming
> >>> that something like compaction is active or numa or ...). So this
> >>> code can face pmd migration entry on architecture that support
> >>> it. What is missing here is thp_migration_supported() call to
> >>> protect the is_migration_entry() to avoid false positive on arch
> >>> which do not support thp migration.
> >>
> >> I mean that architectures which do not support THP migration shouldn't
> >> ever see any migration entry. So is_migration_entry should be always
> >> false. Or do I miss something?
> >
> > And just to be clear. thp_migration_supported should be checked only
> > when we actually _do_ the migration or evaluate migratability of the
> > page. We definitely do want to sprinkle this check to all places where
> > is_migration_entry is checked.
>
> is_migration_entry() is a general check for swp_entry_t, so it can return
> true even if THP migration is not enabled. is_pmd_migration_entry() always
> returns false when THP migration is not enabled.
>
> So the code can be changed in two ways, either replacing is_migration_entry()
> with is_pmd_migration_entry() or adding thp_migration_supported() check
> like Jerome did.
>
> Does this clarify your question?
>
Well, looking back at the code, is_migration_entry() will return false
on architectures which do not have THP migration, because
pmd_to_swp_entry() will return swp_entry(0,0), which cannot be a valid
migration entry.
Maybe using is_pmd_migration_entry() would be better here? It seems
that is_pmd_migration_entry() is more common than the open coded
thp_migration_supported() && is_migration_entry()
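For reference, a sketch of the two equivalent forms (handle_migration()
stands in for the migration handling in the patch;
is_pmd_migration_entry() is defined to return 0 when
CONFIG_ARCH_ENABLE_THP_MIGRATION is not set, so it folds the
thp_migration_supported() check in):

	/* open coded */
	if (!pmd_present(pmd) && thp_migration_supported() &&
	    is_migration_entry(pmd_to_swp_entry(pmd)))
		return handle_migration();

	/* with the helper */
	if (is_pmd_migration_entry(pmd))
		return handle_migration();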
Cheers,
Jérôme
On Tue 28-08-18 11:54:33, Zi Yan wrote:
> Hi Michal,
>
> On 28 Aug 2018, at 11:45, Michal Hocko wrote:
>
> > On Tue 28-08-18 17:42:06, Michal Hocko wrote:
> >> On Tue 28-08-18 11:36:59, Jerome Glisse wrote:
> >>> On Tue, Aug 28, 2018 at 05:24:14PM +0200, Michal Hocko wrote:
> >>>> On Fri 24-08-18 20:05:46, Zi Yan wrote:
> >>>> [...]
> >>>>>> + if (!pmd_present(pmd)) {
> >>>>>> + swp_entry_t entry = pmd_to_swp_entry(pmd);
> >>>>>> +
> >>>>>> + if (is_migration_entry(entry)) {
> >>>>>
> >>>>> I think you should check thp_migration_supported() here, since PMD migration is only enabled in x86_64 systems.
> >>>>> Other architectures should treat PMD migration entries as bad.
> >>>>
> >>>> How can we have a migration pmd entry when the migration is not
> >>>> supported?
> >>>
> >>> Not sure i follow here, migration can happen anywhere (assuming
> >>> that something like compaction is active or numa or ...). So this
> >>> code can face pmd migration entry on architecture that support
> >>> it. What is missing here is thp_migration_supported() call to
> >>> protect the is_migration_entry() to avoid false positive on arch
> >>> which do not support thp migration.
> >>
> >> I mean that architectures which do not support THP migration shouldn't
> >> ever see any migration entry. So is_migration_entry should be always
> >> false. Or do I miss something?
> >
> > And just to be clear. thp_migration_supported should be checked only
> > when we actually _do_ the migration or evaluate migratability of the
> > page. We definitely do want to sprinkle this check to all places where
> > is_migration_entry is checked.
>
> is_migration_entry() is a general check for swp_entry_t, so it can return
> true even if THP migration is not enabled. is_pmd_migration_entry() always
> returns false when THP migration is not enabled.
>
> So the code can be changed in two ways, either replacing is_migration_entry()
> with is_pmd_migration_entry() or adding thp_migration_supported() check
> like Jerome did.
>
> Does this clarify your question?
Not really. IIUC the code checks for the pmd. So even though
is_migration_entry is a more generic check, it should never return true
when thp_migration_supported() == false, because we simply never have
those entries, unless I am missing something.
is_pmd_migration_entry is much more readable of course, and I suspect it
can save a few cycles as well.
--
Michal Hocko
SUSE Labs
From: Jérôme Glisse <[email protected]>
Before this patch a migration pmd entry (!pmd_present()) would have
been treated as a bad entry (pmd_bad() returns true on a migration
pmd entry). The outcome was that the device driver would believe that
the range covered by the pmd was bad and would either SIGBUS or
simply kill all the device's threads (each device driver decides
how to react when the device tries to access a poisonous or invalid
range of memory).
This patch explicitly handles the case of migration pmd entries, which
are non-present pmd entries, and either waits for the migration to
finish or reports an empty range (when the device is just trying to
pre-fill a range of virtual addresses and thus does not want to wait
or trigger a page fault).
Changed since v1:
- use is_pmd_migration_entry() instead of open coding the
equivalent.
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Zi Yan <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Andrew Morton <[email protected]>
---
mm/hmm.c | 42 ++++++++++++++++++++++++++++++++++++------
1 file changed, 36 insertions(+), 6 deletions(-)
diff --git a/mm/hmm.c b/mm/hmm.c
index a16678d08127..fd3d19d98070 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -577,22 +577,44 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
+ pmd_t pmd;
- i = (addr - range->start) >> PAGE_SHIFT;
again:
- if (pmd_none(*pmdp))
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
+ if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
return hmm_pfns_bad(start, end, walk);
- if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
- pmd_t pmd;
+ if (is_pmd_migration_entry(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ bool fault, write_fault;
+ unsigned long npages;
+ uint64_t *pfns;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ 0, &fault, &write_fault);
+ if (fault || write_fault) {
+ hmm_vma_walk->last = addr;
+ pmd_migration_entry_wait(vma->vm_mm, pmdp);
+ return -EAGAIN;
+ }
+ return 0;
+ } else if (!pmd_present(pmd))
+ return hmm_pfns_bad(start, end, walk);
+
+ if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
* No need to take pmd_lock here, even if some other threads
* is splitting the huge pmd we will get that event through
@@ -607,13 +629,21 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
+ i = (addr - range->start) >> PAGE_SHIFT;
return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
}
- if (pmd_bad(*pmdp))
+ /*
+ * We have handled all the valid case above ie either none, migration,
+ * huge or transparent huge. At this point either it is a valid pmd
+ * entry pointing to pte directory or it is a bad pmd that will not
+ * recover.
+ */
+ if (pmd_bad(pmd))
return hmm_pfns_bad(start, end, walk);
ptep = pte_offset_map(pmdp, addr);
+ i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
int r;
--
2.17.1
On Fri, Aug 24, 2018 at 03:25:44PM -0400, [email protected] wrote:
> From: Ralph Campbell <[email protected]>
>
> Private ZONE_DEVICE pages use a special pte entry and thus are not
> present. Properly handle this case in map_pte(), it is already handled
> in check_pte(), the map_pte() part was lost in some rebase most probably.
>
> Without this patch the slow migration path can not migrate back private
> ZONE_DEVICE memory to regular memory. This was found after stress
> testing migration back to system memory. This ultimatly can lead the
> CPU to an infinite page fault loop on the special swap entry.
>
> Signed-off-by: Ralph Campbell <[email protected]>
> Signed-off-by: Jérôme Glisse <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Kirill A. Shutemov <[email protected]>
> Cc: [email protected]
> ---
> mm/page_vma_mapped.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index ae3c2a35d61b..1cf5b9bfb559 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -21,6 +21,15 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
> if (!is_swap_pte(*pvmw->pte))
> return false;
> } else {
> + if (is_swap_pte(*pvmw->pte)) {
> + swp_entry_t entry;
> +
> + /* Handle un-addressable ZONE_DEVICE memory */
> + entry = pte_to_swp_entry(*pvmw->pte);
> + if (is_device_private_entry(entry))
> + return true;
> + }
> +
This happens just for !PVMW_SYNC && PVMW_MIGRATION? I presume this
is triggered via the remove_migration_pte() code path? Doesn't
returning true here imply that we've taken the ptl lock for the
pvmw?
Balbir
On Fri, Aug 24, 2018 at 03:25:45PM -0400, [email protected] wrote:
> From: Ralph Campbell <[email protected]>
>
> In hmm_mirror_unregister(), mm->hmm is set to NULL and then
> mmu_notifier_unregister_no_release() is called. That creates a small
> window where mmu_notifier can call mmu_notifier_ops with mm->hmm equal
> to NULL. Fix this by first unregistering mmu notifier callbacks and
> then setting mm->hmm to NULL.
>
> Similarly in hmm_register(), set mm->hmm before registering mmu_notifier
> callbacks so callback functions always see mm->hmm set.
>
> Signed-off-by: Ralph Campbell <[email protected]>
> Reviewed-by: John Hubbard <[email protected]>
> Reviewed-by: Jérôme Glisse <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: [email protected]
Reviewed-by: Balbir Singh <[email protected]>
On Fri, Aug 31, 2018 at 12:05:38AM +1000, Balbir Singh wrote:
> On Fri, Aug 24, 2018 at 03:25:44PM -0400, [email protected] wrote:
> > From: Ralph Campbell <[email protected]>
> >
> > Private ZONE_DEVICE pages use a special pte entry and thus are not
> > present. Properly handle this case in map_pte(), it is already handled
> > in check_pte(), the map_pte() part was lost in some rebase most probably.
> >
> > Without this patch the slow migration path can not migrate back private
> > ZONE_DEVICE memory to regular memory. This was found after stress
> > testing migration back to system memory. This ultimatly can lead the
> > CPU to an infinite page fault loop on the special swap entry.
> >
> > Signed-off-by: Ralph Campbell <[email protected]>
> > Signed-off-by: Jérôme Glisse <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > Cc: Kirill A. Shutemov <[email protected]>
> > Cc: [email protected]
> > ---
> > mm/page_vma_mapped.c | 9 +++++++++
> > 1 file changed, 9 insertions(+)
> >
> > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > index ae3c2a35d61b..1cf5b9bfb559 100644
> > --- a/mm/page_vma_mapped.c
> > +++ b/mm/page_vma_mapped.c
> > @@ -21,6 +21,15 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
> > if (!is_swap_pte(*pvmw->pte))
> > return false;
> > } else {
> > + if (is_swap_pte(*pvmw->pte)) {
> > + swp_entry_t entry;
> > +
> > + /* Handle un-addressable ZONE_DEVICE memory */
> > + entry = pte_to_swp_entry(*pvmw->pte);
> > + if (is_device_private_entry(entry))
> > + return true;
> > + }
> > +
>
> This happens just for !PVMW_SYNC && PVMW_MIGRATION? I presume this
> is triggered via the remove_migration_pte() code path? Doesn't
> returning true here imply that we've taken the ptl lock for the
> pvmw?
This happens through try_to_unmap() from migrate_vma_unmap() and thus
has !PVMW_SYNC and !PVMW_MIGRATION.
But you are right about the ptl lock: looking at the code, we were just
doing the pte modification without holding the pte lock, and
page_vma_mapped_walk() would not try to unlock since pvmw->ptl == NULL,
so this never triggered any warning.
I am going to post a v2 shortly which addresses that.
Cheers,
Jérôme
From: Ralph Campbell <[email protected]>
Private ZONE_DEVICE pages use a special pte entry and thus are not
present. Properly handle this case in map_pte(); it is already handled
in check_pte(), the map_pte() part was most probably lost in some rebase.
Without this patch the slow migration path cannot migrate private
ZONE_DEVICE memory back to regular memory. This was found after stress
testing migration back to system memory. It can ultimately lead the
CPU into an infinite page fault loop on the special swap entry.
Changes since v1:
- properly lock pte directory in map_pte()
Signed-off-by: Ralph Campbell <[email protected]>
Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: [email protected]
---
mm/page_vma_mapped.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index ae3c2a35d61b..bd67e23dce33 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -21,7 +21,14 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
if (!is_swap_pte(*pvmw->pte))
return false;
} else {
- if (!pte_present(*pvmw->pte))
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ /* Handle un-addressable ZONE_DEVICE memory */
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (!is_device_private_entry(entry))
+ return false;
+ } else if (!pte_present(*pvmw->pte))
return false;
}
}
--
2.17.1
On Fri, Aug 24, 2018 at 03:25:47PM -0400, [email protected] wrote:
> From: Jérôme Glisse <[email protected]>
>
> Use a structure to gather all the parameters for the update callback.
> This make it easier when adding new parameters by avoiding having to
> update all callback function signature.
>
> Signed-off-by: Jérôme Glisse <[email protected]>
> Cc: Ralph Campbell <[email protected]>
> Cc: John Hubbard <[email protected]>
> Cc: Andrew Morton <[email protected]>
> ---
> include/linux/hmm.h | 25 +++++++++++++++++--------
> mm/hmm.c | 27 ++++++++++++++-------------
> 2 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 1ff4bae7ada7..a7f7600b6bb0 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -274,13 +274,26 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
> struct hmm_mirror;
>
> /*
> - * enum hmm_update_type - type of update
> + * enum hmm_update_event - type of update
> * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
> */
> -enum hmm_update_type {
> +enum hmm_update_event {
> HMM_UPDATE_INVALIDATE,
> };
>
> +/*
> + * struct hmm_update - HMM update informations for callback
> + *
> + * @start: virtual start address of the range to update
> + * @end: virtual end address of the range to update
> + * @event: event triggering the update (what is happening)
> + */
> +struct hmm_update {
> + unsigned long start;
> + unsigned long end;
> + enum hmm_update_event event;
> +};
> +
I wonder if you want to add further information about the range,
like page_size, I guess the other side does not care about the
size. Do we care about sending multiple discontig ranges in
hmm_update? Should it be an array?
Balbir Singh
On Thu, Aug 30, 2018 at 10:41:56AM -0400, [email protected] wrote:
> From: Ralph Campbell <[email protected]>
>
> Private ZONE_DEVICE pages use a special pte entry and thus are not
> present. Properly handle this case in map_pte(), it is already handled
> in check_pte(), the map_pte() part was lost in some rebase most probably.
>
> Without this patch the slow migration path can not migrate back private
> ZONE_DEVICE memory to regular memory. This was found after stress
> testing migration back to system memory. This ultimatly can lead the
> CPU to an infinite page fault loop on the special swap entry.
>
> Changes since v1:
> - properly lock pte directory in map_pte()
>
> Signed-off-by: Ralph Campbell <[email protected]>
> > Signed-off-by: Jérôme Glisse <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Kirill A. Shutemov <[email protected]>
> Cc: Balbir Singh <[email protected]>
> Cc: [email protected]
> ---
> mm/page_vma_mapped.c | 9 ++++++++-
> 1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index ae3c2a35d61b..bd67e23dce33 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -21,7 +21,14 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
> if (!is_swap_pte(*pvmw->pte))
> return false;
> } else {
> - if (!pte_present(*pvmw->pte))
> + if (is_swap_pte(*pvmw->pte)) {
> + swp_entry_t entry;
> +
> + /* Handle un-addressable ZONE_DEVICE memory */
> + entry = pte_to_swp_entry(*pvmw->pte);
> + if (!is_device_private_entry(entry))
> + return false;
OK, so we skip this pte from unmap since it's already unmapped? This prevents
try_to_unmap from unmapping it and it gets restored with MIGRATE_PFN_MIGRATE
flag cleared?
Sounds like the right thing, if I understand it correctly
Acked-by: Balbir Singh <[email protected]>
Balbir Singh.
On Fri, Aug 31, 2018 at 09:11:48AM +1000, Balbir Singh wrote:
> On Fri, Aug 24, 2018 at 03:25:47PM -0400, [email protected] wrote:
> > From: Jérôme Glisse <[email protected]>
> >
> > Use a structure to gather all the parameters for the update callback.
> > This make it easier when adding new parameters by avoiding having to
> > update all callback function signature.
> >
> > Signed-off-by: Jérôme Glisse <[email protected]>
> > Cc: Ralph Campbell <[email protected]>
> > Cc: John Hubbard <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > ---
> > include/linux/hmm.h | 25 +++++++++++++++++--------
> > mm/hmm.c | 27 ++++++++++++++-------------
> > 2 files changed, 31 insertions(+), 21 deletions(-)
> >
> > diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> > index 1ff4bae7ada7..a7f7600b6bb0 100644
> > --- a/include/linux/hmm.h
> > +++ b/include/linux/hmm.h
> > @@ -274,13 +274,26 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
> > struct hmm_mirror;
> >
> > /*
> > - * enum hmm_update_type - type of update
> > + * enum hmm_update_event - type of update
> > * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
> > */
> > -enum hmm_update_type {
> > +enum hmm_update_event {
> > HMM_UPDATE_INVALIDATE,
> > };
> >
> > +/*
> > + * struct hmm_update - HMM update informations for callback
> > + *
> > + * @start: virtual start address of the range to update
> > + * @end: virtual end address of the range to update
> > + * @event: event triggering the update (what is happening)
> > + */
> > +struct hmm_update {
> > + unsigned long start;
> > + unsigned long end;
> > + enum hmm_update_event event;
> > +};
> > +
>
> I wonder if you want to add further information about the range,
> like page_size, I guess the other side does not care about the
> size. Do we care about sending multiple discontig ranges in
> hmm_update? Should it be an array?
>
> Balbir Singh
This is a range of virtual addresses; if a huge page is fully unmapped
then the range will cover the full huge page. It mirrors the
mmu_notifier range callback because 99% of the time it is just used to
pass down an mmu_notifier invalidation. So we don't care about multiple
ranges, at least not yet.
Nor do we care about the page size, as it might vary within the range
(a range can have a mix of THP and regular pages); moreover the device
driver usually ignores the page size.
Cheers,
Jérôme
On Fri, Aug 31, 2018 at 07:27:24PM +1000, Balbir Singh wrote:
> On Thu, Aug 30, 2018 at 10:41:56AM -0400, [email protected] wrote:
> > From: Ralph Campbell <[email protected]>
> >
> > Private ZONE_DEVICE pages use a special pte entry and thus are not
> > present. Properly handle this case in map_pte(), it is already handled
> > in check_pte(), the map_pte() part was lost in some rebase most probably.
> >
> > Without this patch the slow migration path can not migrate back private
> > ZONE_DEVICE memory to regular memory. This was found after stress
> > testing migration back to system memory. This ultimatly can lead the
> > CPU to an infinite page fault loop on the special swap entry.
> >
> > Changes since v1:
> > - properly lock pte directory in map_pte()
> >
> > Signed-off-by: Ralph Campbell <[email protected]>
> > Signed-off-by: Jérôme Glisse <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > Cc: Kirill A. Shutemov <[email protected]>
> > Cc: Balbir Singh <[email protected]>
> > Cc: [email protected]
> > ---
> > mm/page_vma_mapped.c | 9 ++++++++-
> > 1 file changed, 8 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > index ae3c2a35d61b..bd67e23dce33 100644
> > --- a/mm/page_vma_mapped.c
> > +++ b/mm/page_vma_mapped.c
> > @@ -21,7 +21,14 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
> > if (!is_swap_pte(*pvmw->pte))
> > return false;
> > } else {
> > - if (!pte_present(*pvmw->pte))
> > + if (is_swap_pte(*pvmw->pte)) {
> > + swp_entry_t entry;
> > +
> > + /* Handle un-addressable ZONE_DEVICE memory */
> > + entry = pte_to_swp_entry(*pvmw->pte);
> > + if (!is_device_private_entry(entry))
> > + return false;
>
> OK, so we skip this pte from unmap since it's already unmapped? This prevents
> try_to_unmap from unmapping it and it gets restored with MIGRATE_PFN_MIGRATE
> flag cleared?
>
> Sounds like the right thing, if I understand it correctly
Well, not exactly: we do not skip it, we replace it with a migration
pte; see try_to_unmap_one(), which gets called with the TTU_MIGRATION
flag set (which does not translate into PVMW_MIGRATION being set, on the
contrary). From the migration point of view, even if this is a swap pte,
it is still a valid mapping of the page and is counted as such for all
intents and purposes. The only thing we don't need is flushing the CPU
TLB or cache.
So this all happens when we are migrating something back to regular
memory, either because of a CPU fault or because the device driver wants
to make room in its memory and has decided to evict that page back to
regular memory.
Cheers,
Jérôme
On Fri, Aug 31, 2018 at 12:19:35PM -0400, Jerome Glisse wrote:
> On Fri, Aug 31, 2018 at 07:27:24PM +1000, Balbir Singh wrote:
> > On Thu, Aug 30, 2018 at 10:41:56AM -0400, [email protected] wrote:
> > > From: Ralph Campbell <[email protected]>
> > >
> > > Private ZONE_DEVICE pages use a special pte entry and thus are not
> > > present. Properly handle this case in map_pte(), it is already handled
> > > in check_pte(), the map_pte() part was lost in some rebase most probably.
> > >
> > > Without this patch the slow migration path can not migrate back private
> > > ZONE_DEVICE memory to regular memory. This was found after stress
> > > testing migration back to system memory. This ultimatly can lead the
> > > CPU to an infinite page fault loop on the special swap entry.
> > >
> > > Changes since v1:
> > > - properly lock pte directory in map_pte()
> > >
> > > Signed-off-by: Ralph Campbell <[email protected]>
> > > Signed-off-by: Jérôme Glisse <[email protected]>
> > > Cc: Andrew Morton <[email protected]>
> > > Cc: Kirill A. Shutemov <[email protected]>
> > > Cc: Balbir Singh <[email protected]>
> > > Cc: [email protected]
> > > ---
> > > mm/page_vma_mapped.c | 9 ++++++++-
> > > 1 file changed, 8 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> > > index ae3c2a35d61b..bd67e23dce33 100644
> > > --- a/mm/page_vma_mapped.c
> > > +++ b/mm/page_vma_mapped.c
> > > @@ -21,7 +21,14 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
> > > if (!is_swap_pte(*pvmw->pte))
> > > return false;
> > > } else {
> > > - if (!pte_present(*pvmw->pte))
> > > + if (is_swap_pte(*pvmw->pte)) {
> > > + swp_entry_t entry;
> > > +
> > > + /* Handle un-addressable ZONE_DEVICE memory */
> > > + entry = pte_to_swp_entry(*pvmw->pte);
> > > + if (!is_device_private_entry(entry))
> > > + return false;
> >
> > OK, so we skip this pte from unmap since it's already unmapped? This prevents
> > try_to_unmap from unmapping it and it gets restored with MIGRATE_PFN_MIGRATE
> > flag cleared?
> >
> > Sounds like the right thing, if I understand it correctly
>
> Well not exactly we do not skip it, we replace it with a migration
I think I missed the ! in !is_device_private_entry(), so that seems
reasonable.
Reviewed-by: Balbir Singh <[email protected]>
On Fri, Aug 24, 2018 at 03:25:42PM -0400, [email protected] wrote:
> From: Jérôme Glisse <[email protected]>
>
> Few fixes that only affect HMM users. Improve the synchronization call
> back so that we match was other mmu_notifier listener do and add proper
> support to the new blockable flags in the process.
>
> For curious folks here are branches to leverage HMM in various existing
> device drivers:
>
> https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-nouveau-v01
> https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-radeon-v00
> https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-intel-v00
>
> More to come (amd gpu, Mellanox, ...)
>
> I expect more of the preparatory work for nouveau will be merge in 4.20
> (like we have been doing since 4.16) and i will wait until this patchset
> is upstream before pushing the patches that actualy make use of HMM (to
> avoid complex tree inter-dependency).
>
Andrew, do you want me to repost this on top of the latest mmotm?
All conflicts should be pretty trivial to fix.
Cheers,
Jérôme
On Fri, 12 Oct 2018 14:15:45 -0400 Jerome Glisse <[email protected]> wrote:
> On Fri, Aug 24, 2018 at 03:25:42PM -0400, [email protected] wrote:
> > From: Jérôme Glisse <[email protected]>
> >
> > Few fixes that only affect HMM users. Improve the synchronization call
> > back so that we match was other mmu_notifier listener do and add proper
> > support to the new blockable flags in the process.
> >
> > For curious folks here are branches to leverage HMM in various existing
> > device drivers:
> >
> > https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-nouveau-v01
> > https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-radeon-v00
> > https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-intel-v00
> >
> > More to come (amd gpu, Mellanox, ...)
> >
> > I expect more of the preparatory work for nouveau will be merge in 4.20
> > (like we have been doing since 4.16) and i will wait until this patchset
> > is upstream before pushing the patches that actualy make use of HMM (to
> > avoid complex tree inter-dependency).
> >
>
> Andrew do you want me to repost this on top of lastest mmotm ?
> All conflict should be pretty trivial to fix.
Please. I ducked v1 because a v2 was in the works. It's very late in
the cycle so you might want to prepare an urgent-for-4.19 series and a
for-4.20 series. Or, better, a single series with the appropriate
Cc:stable tags.
Please ensure that all review questions which have thus far been
received are appropriately answered in code comments and in changelogs.
Because if one reader was wondering about something, others will
wonder the same thing in the future.