When clearing a PTE the TLB should be flushed whilst still holding the
PTL to avoid a potential race with madvise/munmap/etc. For example
consider the following sequence:
CPU0                            CPU1
----                            ----
migrate_vma_collect_pmd()
pte_unmap_unlock()
                                madvise(MADV_DONTNEED)
                                -> zap_pte_range()
                                pte_offset_map_lock()
                                [ PTE not present, TLB not flushed ]
                                pte_unmap_unlock()
                                [ page is still accessible via stale TLB ]
flush_tlb_range()
In this case the page may still be accessed via the stale TLB entry
after madvise returns. Fix this by flushing the TLB while holding the
PTL.
Signed-off-by: Alistair Popple <[email protected]>
Reported-by: Nadav Amit <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Cc: [email protected]
---
Changes for v4:
- Added Reviewed-by
Changes for v3:
- New for v3
---
mm/migrate_device.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d..6a5ef9f 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -254,13 +254,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
--
git-series 0.9.1
We were not correctly copying PTE dirty bits to pages during
migrate_vma_setup() calls. This could potentially lead to data loss, so
add a test for this.
Signed-off-by: Alistair Popple <[email protected]>
---
tools/testing/selftests/vm/hmm-tests.c | 124 ++++++++++++++++++++++++++-
1 file changed, 124 insertions(+)
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 529f53b..70fdb49 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1200,6 +1200,130 @@ TEST_F(hmm, migrate_multiple)
}
}
+static char cgroup[] = "/sys/fs/cgroup/hmm-test-XXXXXX";
+static int write_cgroup_param(char *cgroup_path, char *param, long value)
+{
+ int ret;
+ FILE *f;
+ char *filename;
+
+ if (asprintf(&filename, "%s/%s", cgroup_path, param) < 0)
+ return -1;
+
+ f = fopen(filename, "w");
+ if (!f) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = fprintf(f, "%ld\n", value);
+ if (ret < 0)
+ goto out1;
+
+ ret = 0;
+
+out1:
+ fclose(f);
+out:
+ free(filename);
+
+ return ret;
+}
+
+static int setup_cgroup(void)
+{
+ pid_t pid = getpid();
+ int ret;
+
+ if (!mkdtemp(cgroup))
+ return -1;
+
+ ret = write_cgroup_param(cgroup, "cgroup.procs", pid);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int destroy_cgroup(void)
+{
+ pid_t pid = getpid();
+ int ret;
+
+ ret = write_cgroup_param("/sys/fs/cgroup/cgroup.procs",
+ "cgroup.proc", pid);
+ if (ret)
+ return ret;
+
+ if (rmdir(cgroup))
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Try and migrate a dirty page that has previously been swapped to disk. This
+ * checks that we don't lose dirty bits.
+ */
+TEST_F(hmm, migrate_dirty_page)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int tmp = 0;
+
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ ASSERT_EQ(setup_cgroup(), 0);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = 0;
+
+ ASSERT_FALSE(write_cgroup_param(cgroup, "memory.reclaim", 1UL<<30));
+
+ /* Fault pages back in from swap as clean pages */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ tmp += ptr[i];
+
+ /* Dirty the pte */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /*
+ * Attempt to migrate memory to device, which should fail because
+ * hopefully some pages are backed by swap storage.
+ */
+ ASSERT_TRUE(hmm_migrate_sys_to_dev(self->fd, buffer, npages));
+
+ ASSERT_FALSE(write_cgroup_param(cgroup, "memory.reclaim", 1UL<<30));
+
+ /* Check we still see the updated data after restoring from swap. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+ destroy_cgroup();
+}
+
/*
* Read anonymous memory multiple times.
*/
--
git-series 0.9.1
Currently we only call flush_cache_page() for the anon_exclusive case,
however in both cases we clear the pte so should flush the cache.
Signed-off-by: Alistair Popple <[email protected]>
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Cc: [email protected]
---
New for v4
---
mm/migrate_device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6a5ef9f..4cc849c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
--
git-series 0.9.1
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over
to the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous
mappings. That means loss of the dirty bit usually doesn't result in
data loss because these pages are typically not file-backed. However
pages may be backed by swap storage which can result in data loss if an
attempt is made to migrate a dirty page that doesn't yet have the
PageDirty flag set.
In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte
to match what try_to_migrate_one() does.
Signed-off-by: Alistair Popple <[email protected]>
Acked-by: Peter Xu <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Reported-by: "Huang, Ying" <[email protected]>
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Cc: [email protected]
---
Changes for v4:
- Added Reviewed-by
Changes for v3:
- Defer TLB flushing
- Split a TLB flushing fix into a separate change.
Changes for v2:
- Fixed up Reported-by tag.
- Added Peter's Acked-by.
- Atomically read and clear the pte to prevent the dirty bit getting
set after reading it.
- Added fixes tag
---
mm/migrate_device.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 4cc849c..dbf6c7a 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
--
git-series 0.9.1
On 02.09.22 02:35, Alistair Popple wrote:
> When clearing a PTE the TLB should be flushed whilst still holding the
> PTL to avoid a potential race with madvise/munmap/etc. For example
> consider the following sequence:
>
> CPU0                            CPU1
> ----                            ----
>
> migrate_vma_collect_pmd()
> pte_unmap_unlock()
>                                 madvise(MADV_DONTNEED)
>                                 -> zap_pte_range()
>                                 pte_offset_map_lock()
>                                 [ PTE not present, TLB not flushed ]
>                                 pte_unmap_unlock()
>                                 [ page is still accessible via stale TLB ]
> flush_tlb_range()
>
> In this case the page may still be accessed via the stale TLB entry
> after madvise returns. Fix this by flushing the TLB while holding the
> PTL.
>
> Signed-off-by: Alistair Popple <[email protected]>
> Reported-by: Nadav Amit <[email protected]>
> Reviewed-by: "Huang, Ying" <[email protected]>
> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
> Cc: [email protected]
>
> ---
>
> Changes for v4:
>
> - Added Review-by
>
> Changes for v3:
>
> - New for v3
> ---
> mm/migrate_device.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 27fb37d..6a5ef9f 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -254,13 +254,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> migrate->dst[migrate->npages] = 0;
> migrate->src[migrate->npages++] = mpfn;
> }
> - arch_leave_lazy_mmu_mode();
> - pte_unmap_unlock(ptep - 1, ptl);
>
> /* Only flush the TLB if we actually modified any entries */
> if (unmapped)
> flush_tlb_range(walk->vma, start, end);
>
> + arch_leave_lazy_mmu_mode();
> + pte_unmap_unlock(ptep - 1, ptl);
> +
> return 0;
> }
>
>
> base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
Acked-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On 02.09.22 02:35, Alistair Popple wrote:
> migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
> installs migration entries directly if it can lock the migrating page.
> When removing a dirty pte the dirty bit is supposed to be carried over
> to the underlying page to prevent it being lost.
>
> Currently migrate_vma_*() can only be used for private anonymous
> mappings. That means loss of the dirty bit usually doesn't result in
> data loss because these pages are typically not file-backed. However
> pages may be backed by swap storage which can result in data loss if an
> attempt is made to migrate a dirty page that doesn't yet have the
> PageDirty flag set.
>
> In this case migration will fail due to unexpected references but the
> dirty pte bit will be lost. If the page is subsequently reclaimed data
> won't be written back to swap storage as it is considered uptodate,
> resulting in data loss if the page is subsequently accessed.
>
> Prevent this by copying the dirty bit to the page when removing the pte
> to match what try_to_migrate_one() does.
>
> Signed-off-by: Alistair Popple <[email protected]>
> Acked-by: Peter Xu <[email protected]>
> Reviewed-by: "Huang, Ying" <[email protected]>
> Reported-by: "Huang, Ying" <[email protected]>
> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
> Cc: [email protected]
>
> ---
>
> Changes for v4:
>
> - Added Reviewed-by
>
> Changes for v3:
>
> - Defer TLB flushing
> - Split a TLB flushing fix into a separate change.
>
> Changes for v2:
>
> - Fixed up Reported-by tag.
> - Added Peter's Acked-by.
> - Atomically read and clear the pte to prevent the dirty bit getting
> set after reading it.
> - Added fixes tag
> ---
> mm/migrate_device.c | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 4cc849c..dbf6c7a 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -7,6 +7,7 @@
> #include <linux/export.h>
> #include <linux/memremap.h>
> #include <linux/migrate.h>
> +#include <linux/mm.h>
> #include <linux/mm_inline.h>
> #include <linux/mmu_notifier.h>
> #include <linux/oom.h>
> @@ -196,7 +197,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> flush_cache_page(vma, addr, pte_pfn(*ptep));
> anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
> if (anon_exclusive) {
> - ptep_clear_flush(vma, addr, ptep);
> + pte = ptep_clear_flush(vma, addr, ptep);
>
> if (page_try_share_anon_rmap(page)) {
> set_pte_at(mm, addr, ptep, pte);
> @@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> goto next;
> }
> } else {
> - ptep_get_and_clear(mm, addr, ptep);
> + pte = ptep_get_and_clear(mm, addr, ptep);
> }
>
> migrate->cpages++;
>
> + /* Set the dirty flag on the folio now the pte is gone. */
> + if (pte_dirty(pte))
> + folio_mark_dirty(page_folio(page));
> +
> /* Setup special migration page table entry */
> if (mpfn & MIGRATE_PFN_WRITE)
> entry = make_writable_migration_entry(
This matches what we do in try_to_unmap_one()
Acked-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On 02.09.22 02:35, Alistair Popple wrote:
> Currently we only call flush_cache_page() for the anon_exclusive case,
> however in both cases we clear the pte so should flush the cache.
>
> Signed-off-by: Alistair Popple <[email protected]>
> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
> Cc: [email protected]
>
> ---
>
> New for v4
> ---
> mm/migrate_device.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 6a5ef9f..4cc849c 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> bool anon_exclusive;
> pte_t swp_pte;
>
> + flush_cache_page(vma, addr, pte_pfn(*ptep));
> anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
> if (anon_exclusive) {
> - flush_cache_page(vma, addr, pte_pfn(*ptep));
> ptep_clear_flush(vma, addr, ptep);
>
> if (page_try_share_anon_rmap(page)) {
Reviewed-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On Fri, Sep 02, 2022 at 10:35:52AM +1000, Alistair Popple wrote:
> Currently we only call flush_cache_page() for the anon_exclusive case,
> however in both cases we clear the pte so should flush the cache.
>
> Signed-off-by: Alistair Popple <[email protected]>
> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
> Cc: [email protected]
This is the patch that starts to collide with David's.
David's patch has also unified both paths with ptep_get_and_clear(), but
this patch itself also looks correct to me.
It'll probably just become a no-op after a rebase, though. I'm not sure what
the final ordering will be, but anyway I think this patch stands on its
own too.
Acked-by: Peter Xu <[email protected]>
Thanks for tolerating my nitpicking,
>
> ---
>
> New for v4
> ---
> mm/migrate_device.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 6a5ef9f..4cc849c 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -193,9 +193,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> bool anon_exclusive;
> pte_t swp_pte;
>
> + flush_cache_page(vma, addr, pte_pfn(*ptep));
> anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
> if (anon_exclusive) {
> - flush_cache_page(vma, addr, pte_pfn(*ptep));
> ptep_clear_flush(vma, addr, ptep);
>
> if (page_try_share_anon_rmap(page)) {
> --
> git-series 0.9.1
>
--
Peter Xu
On Fri, Sep 02, 2022 at 10:35:51AM +1000, Alistair Popple wrote:
> When clearing a PTE the TLB should be flushed whilst still holding the
> PTL to avoid a potential race with madvise/munmap/etc. For example
> consider the following sequence:
>
> CPU0                            CPU1
> ----                            ----
>
> migrate_vma_collect_pmd()
> pte_unmap_unlock()
>                                 madvise(MADV_DONTNEED)
>                                 -> zap_pte_range()
>                                 pte_offset_map_lock()
>                                 [ PTE not present, TLB not flushed ]
>                                 pte_unmap_unlock()
>                                 [ page is still accessible via stale TLB ]
> flush_tlb_range()
>
> In this case the page may still be accessed via the stale TLB entry
> after madvise returns. Fix this by flushing the TLB while holding the
> PTL.
>
> Signed-off-by: Alistair Popple <[email protected]>
> Reported-by: Nadav Amit <[email protected]>
> Reviewed-by: "Huang, Ying" <[email protected]>
> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
> Cc: [email protected]
Acked-by: Peter Xu <[email protected]>
--
Peter Xu
On 9/1/22 17:35, Alistair Popple wrote:
> We were not correctly copying PTE dirty bits to pages during
> migrate_vma_setup() calls. This could potentially lead to data loss, so
> add a test for this.
>
> Signed-off-by: Alistair Popple <[email protected]>
> ---
> tools/testing/selftests/vm/hmm-tests.c | 124 ++++++++++++++++++++++++++-
> 1 file changed, 124 insertions(+)
>
> diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
> index 529f53b..70fdb49 100644
> --- a/tools/testing/selftests/vm/hmm-tests.c
> +++ b/tools/testing/selftests/vm/hmm-tests.c
> @@ -1200,6 +1200,130 @@ TEST_F(hmm, migrate_multiple)
> }
> }
>
> +static char cgroup[] = "/sys/fs/cgroup/hmm-test-XXXXXX";
> +static int write_cgroup_param(char *cgroup_path, char *param, long value)
> +{
> + int ret;
> + FILE *f;
> + char *filename;
> +
> + if (asprintf(&filename, "%s/%s", cgroup_path, param) < 0)
> + return -1;
> +
> + f = fopen(filename, "w");
> + if (!f) {
> + ret = -1;
> + goto out;
> + }
> +
> + ret = fprintf(f, "%ld\n", value);
> + if (ret < 0)
> + goto out1;
> +
> + ret = 0;
> +
> +out1:
> + fclose(f);
> +out:
> + free(filename);
> +
> + return ret;
> +}
> +
> +static int setup_cgroup(void)
> +{
> + pid_t pid = getpid();
> + int ret;
> +
> + if (!mkdtemp(cgroup))
> + return -1;
> +
> + ret = write_cgroup_param(cgroup, "cgroup.procs", pid);
> + if (ret)
> + return ret;
> +
> + return 0;
> +}
> +
> +static int destroy_cgroup(void)
> +{
> + pid_t pid = getpid();
> + int ret;
> +
> + ret = write_cgroup_param("/sys/fs/cgroup/cgroup.procs",
> + "cgroup.proc", pid);
> + if (ret)
> + return ret;
> +
> + if (rmdir(cgroup))
> + return -1;
> +
> + return 0;
> +}
> +
> +/*
> + * Try and migrate a dirty page that has previously been swapped to disk. This
> + * checks that we don't loose dirty bits.
s/loose/lose/
> + */
> +TEST_F(hmm, migrate_dirty_page)
> +{
> + struct hmm_buffer *buffer;
> + unsigned long npages;
> + unsigned long size;
> + unsigned long i;
> + int *ptr;
> + int tmp = 0;
> +
> + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
> + ASSERT_NE(npages, 0);
> + size = npages << self->page_shift;
> +
> + buffer = malloc(sizeof(*buffer));
> + ASSERT_NE(buffer, NULL);
> +
> + buffer->fd = -1;
> + buffer->size = size;
> + buffer->mirror = malloc(size);
> + ASSERT_NE(buffer->mirror, NULL);
> +
> + ASSERT_EQ(setup_cgroup(), 0);
> +
> + buffer->ptr = mmap(NULL, size,
> + PROT_READ | PROT_WRITE,
> + MAP_PRIVATE | MAP_ANONYMOUS,
> + buffer->fd, 0);
> + ASSERT_NE(buffer->ptr, MAP_FAILED);
> +
> + /* Initialize buffer in system memory. */
> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
> + ptr[i] = 0;
> +
> + ASSERT_FALSE(write_cgroup_param(cgroup, "memory.reclaim", 1UL<<30));
> +
> + /* Fault pages back in from swap as clean pages */
> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
> + tmp += ptr[i];
> +
> + /* Dirty the pte */
> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
> + ptr[i] = i;
> +
> + /*
> + * Attempt to migrate memory to device, which should fail because
> + * hopefully some pages are backed by swap storage.
> + */
> + ASSERT_TRUE(hmm_migrate_sys_to_dev(self->fd, buffer, npages));
Are you really sure that you want to assert on that? Because doing so
guarantees a test failure if and when we ever upgrade the kernel to
be able to migrate swap-backed pages. And I seem to recall that this
current inability to migrate swap-backed pages is considered a flaw
to be fixed, right?
> +
> + ASSERT_FALSE(write_cgroup_param(cgroup, "memory.reclaim", 1UL<<30));
> +
> + /* Check we still see the updated data after restoring from swap. */
> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
> + ASSERT_EQ(ptr[i], i);
> +
> + hmm_buffer_free(buffer);
> + destroy_cgroup();
> +}
> +
> /*
> * Read anonymous memory multiple times.
> */
thanks,
--
John Hubbard
NVIDIA
John Hubbard <[email protected]> writes:
> On 9/1/22 17:35, Alistair Popple wrote:
[...]
>> +/*
>> + * Try and migrate a dirty page that has previously been swapped to disk. This
>> + * checks that we don't loose dirty bits.
>
> s/loose/lose/
Thanks.
>> + */
>> +TEST_F(hmm, migrate_dirty_page)
>> +{
>> + struct hmm_buffer *buffer;
>> + unsigned long npages;
>> + unsigned long size;
>> + unsigned long i;
>> + int *ptr;
>> + int tmp = 0;
>> +
>> + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
>> + ASSERT_NE(npages, 0);
>> + size = npages << self->page_shift;
>> +
>> + buffer = malloc(sizeof(*buffer));
>> + ASSERT_NE(buffer, NULL);
>> +
>> + buffer->fd = -1;
>> + buffer->size = size;
>> + buffer->mirror = malloc(size);
>> + ASSERT_NE(buffer->mirror, NULL);
>> +
>> + ASSERT_EQ(setup_cgroup(), 0);
>> +
>> + buffer->ptr = mmap(NULL, size,
>> + PROT_READ | PROT_WRITE,
>> + MAP_PRIVATE | MAP_ANONYMOUS,
>> + buffer->fd, 0);
>> + ASSERT_NE(buffer->ptr, MAP_FAILED);
>> +
>> + /* Initialize buffer in system memory. */
>> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
>> + ptr[i] = 0;
>> +
>> + ASSERT_FALSE(write_cgroup_param(cgroup, "memory.reclaim", 1UL<<30));
>> +
>> + /* Fault pages back in from swap as clean pages */
>> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
>> + tmp += ptr[i];
>> +
>> + /* Dirty the pte */
>> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
>> + ptr[i] = i;
>> +
>> + /*
>> + * Attempt to migrate memory to device, which should fail because
>> + * hopefully some pages are backed by swap storage.
>> + */
>> + ASSERT_TRUE(hmm_migrate_sys_to_dev(self->fd, buffer, npages));
>
> Are you really sure that you want to assert on that? Because doing so
> guarantees a test failure if and when we every upgrade the kernel to
> be able to migrate swap-backed pages. And I seem to recall that this
> current inability to migrate swap-backed pages is considered a flaw
> to be fixed, right?
Right, that's a good point. I was using failure (ASSERT_TRUE) here as a
way of detecting that at least some pages are swap-backed, because if no
pages end up being swap-backed the test is invalid.
I'm not really sure what to do about it though. It's likely the fix for
swap-backed migration may make this bug impossible to hit anyway,
because the obvious fix is to just drop the pages from the swapcache
during migration which would force writeback during subsequent reclaim.
So I'm inclined to leave this here even if it only serves to remind us
about it when we do fix migration of swap-backed pages, because we will
of course run hmm-tests before submitting that fix :-) We can then
either fix the test or drop it if we think it's no longer possible to
hit.
>> +
>> + ASSERT_FALSE(write_cgroup_param(cgroup, "memory.reclaim", 1UL<<30));
>> +
>> + /* Check we still see the updated data after restoring from swap. */
>> + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
>> + ASSERT_EQ(ptr[i], i);
>> +
>> + hmm_buffer_free(buffer);
>> + destroy_cgroup();
>> +}
>> +
>> /*
>> * Read anonymous memory multiple times.
>> */
>
> thanks,
On 9/7/22 04:13, Alistair Popple wrote:
>>> + /*
>>> + * Attempt to migrate memory to device, which should fail because
>>> + * hopefully some pages are backed by swap storage.
>>> + */
>>> + ASSERT_TRUE(hmm_migrate_sys_to_dev(self->fd, buffer, npages));
>>
>> Are you really sure that you want to assert on that? Because doing so
>> guarantees a test failure if and when we every upgrade the kernel to
>> be able to migrate swap-backed pages. And I seem to recall that this
>> current inability to migrate swap-backed pages is considered a flaw
>> to be fixed, right?
>
> Right, that's a good point. I was using failure (ASSERT_TRUE) here as a
> way of detecting that at least some pages are swap-backed, because if no
> pages end up being swap-backed the test is invalid.
Yes. But "invalid" or "waived" is a much different test result than
"failed".
>
> I'm not really sure what to do about it though. It's likely the fix for
Remove the assert. If the test framework allows and you prefer, you
can print a warning.
> swap-backed migration may make this bug impossible to hit anyway,
> because the obvious fix is to just drop the pages from the swapcache
> during migration which would force writeback during subsequent reclaim.
>
> So I'm inclined to leave this here even if it only serves to remind us
> about it when we do fix migration of swap-backed pages, because we will
> of course run hmm-tests before submitting that fix :-) We can then
> either fix the test or drop it if we think it's no longer possible to
> hit.
Oh no no no, please. This is not how to do tests. If you want a TODO
list somewhere, there are other ways. But tests that require maintenance
when you change something are an anti-pattern.
thanks,
--
John Hubbard
NVIDIA
John Hubbard <[email protected]> writes:
> On 9/7/22 04:13, Alistair Popple wrote:
>>>> + /*
>>>> + * Attempt to migrate memory to device, which should fail because
>>>> + * hopefully some pages are backed by swap storage.
>>>> + */
>>>> + ASSERT_TRUE(hmm_migrate_sys_to_dev(self->fd, buffer, npages));
>>>
>>> Are you really sure that you want to assert on that? Because doing so
>>> guarantees a test failure if and when we every upgrade the kernel to
>>> be able to migrate swap-backed pages. And I seem to recall that this
>>> current inability to migrate swap-backed pages is considered a flaw
>>> to be fixed, right?
>> Right, that's a good point. I was using failure (ASSERT_TRUE) here as a
>> way of detecting that at least some pages are swap-backed, because if no
>> pages end up being swap-backed the test is invalid.
>
> Yes. But "invalid" or "waived" is a much different test result than
> "failed".
True. Unfortunately our test framework needs some love as I don't think
it's possible to return a result of "invalid" or "waived". We can skip a
test though, so that might be the best option here.
>> I'm not really sure what to do about it though. It's likely the fix for
>
> Remove the assert. If the test framework allows and you prefer, you
> can print a warning.
>
>> swap-backed migration may make this bug impossible to hit anyway,
>> because the obvious fix is to just drop the pages from the swapcache
>> during migration which would force writeback during subsequent reclaim.
>> So I'm inclined to leave this here even if it only serves to remind us
>> about it when we do fix migration of swap-backed pages, because we will
>> of course run hmm-tests before submitting that fix :-) We can then
>> either fix the test or drop it if we think it's no longer possible to
>> hit.
>
> Oh no no no, please. This is not how to do tests. If you want a TODO
> list somewhere, there are other ways. But tests that require maintenance
> when you change something are an anti-pattern.
Fair enough, I think what you're asking for is a higher level test that
doesn't rely on implementation side-effects. I wrote this test mostly to
discover if we could hit problems with the current implementation hence
why it's a bit messy.
But I think I can fix this up without relying on implementation
side-effects - really I just want to confirm that at least some pages
got swapped to disk which I can do via looking at /proc/self/pagemap.
- Alistair
> thanks,