v4:
- Rebase to v5.17-rc4
- Add r-b, and suggested-by on patch 2 [David]
- Fix spelling (s/Quotting/Quoting/) [David]
RFC V1: https://lore.kernel.org/lkml/[email protected]
RFC V2: https://lore.kernel.org/lkml/[email protected]
V3: https://lore.kernel.org/lkml/[email protected]
Patch 1 should fix a long-standing bug in zap_pte_range()'s use of
zap_details. The risk is that some swap entries could be skipped when we
should have zapped them.
Migration entries are not the major concern, because file-backed memory is
always zapped in the pattern "first pass without the page lock, then re-zap
with the page lock held", hence the second zap will always make sure all
migration entries are already recovered.
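As a reference, that pattern looks roughly like below (a simplified sketch
modeled on mm/truncate.c's truncate_pagecache(); illustration only):

===8<===
/*
 * Roughly how the truncate path achieves the "zap twice" pattern
 * (modeled on mm/truncate.c:truncate_pagecache()): the first unmap
 * pass runs without page locks, truncate_inode_pages() then operates
 * on locked pages, and the final unmap pass catches anything that
 * faulted back in between, so stale migration entries cannot survive.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	unmap_mapping_range(mapping, holebegin, 0, 1);	/* 1st zap: no page lock */
	truncate_inode_pages(mapping, newsize);		/* works on locked pages */
	unmap_mapping_range(mapping, holebegin, 0, 1);	/* re-zap: closes the race */
}
===8<===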
However, genuine swap entries can be erroneously skipped. A reproducer is
provided in the commit message of patch 1.
Patches 2-4 are cleanups based on patch 1. After the whole patchset is
applied, we should have a very clean view of zap_pte_range().
Only patch 1 needs to be backported to stable if necessary.
Please review, thanks.
Peter Xu (4):
mm: Don't skip swap entry even if zap_details specified
mm: Rename zap_skip_check_mapping() to should_zap_page()
mm: Change zap_details.zap_mapping into even_cows
mm: Rework swap handling of zap_pte_range
mm/memory.c | 85 +++++++++++++++++++++++++++++++----------------------
1 file changed, 50 insertions(+), 35 deletions(-)
--
2.32.0
The "details" pointer shouldn't be the token to decide whether we should skip
swap entries. For example, when the user specified details->zap_mapping==NULL,
it means the user wants to zap all the pages (including COWed pages), so we
need to look into swap entries, because there can be private COWed pages that
were swapped out.
Skipping some swap entries when details is non-NULL may wrongly leave behind
swap entries that we should have zapped.
A reproducer of the problem:
===8<===
#define _GNU_SOURCE         /* See feature_test_macros(7) */
#include <stdio.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>

int page_size;
int shmem_fd;
char *buffer;

int main(void)
{
	int ret;
	char val;

	page_size = getpagesize();
	shmem_fd = memfd_create("test", 0);
	assert(shmem_fd >= 0);

	ret = ftruncate(shmem_fd, page_size * 2);
	assert(ret == 0);

	buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE, shmem_fd, 0);
	assert(buffer != MAP_FAILED);

	/* Write private page, swap it out */
	buffer[page_size] = 1;
	madvise(buffer, page_size * 2, MADV_PAGEOUT);

	/* This should drop private buffer[page_size] already */
	ret = ftruncate(shmem_fd, page_size);
	assert(ret == 0);
	/* Recover the size */
	ret = ftruncate(shmem_fd, page_size * 2);
	assert(ret == 0);

	/* Re-read the data, it should be all zero */
	val = buffer[page_size];
	if (val == 0)
		printf("Good\n");
	else
		printf("BUG\n");

	return 0;
}
===8<===
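To exercise the reproducer, an active swap device is required, otherwise
MADV_PAGEOUT has nowhere to push the private page and the buggy path is never
taken. Something like "gcc -o repro repro.c && ./repro" (the file name is
arbitrary) prints "BUG" on an affected kernel, and "Good" once patch 1 is
applied.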
We don't need to touch the pmd path, because pmd never had an issue with swap
entries. For example, shmem pmd migration will always be split into pte level
first, and the same applies to swapping on anonymous memory.
Add another helper should_zap_cows() so that we can also check whether we
should zap private mappings when there's no page pointer specified.
This patch drops that trick (the unconditional "if (unlikely(details))
continue;" upon swap entries), so we handle swap ptes coherently. Meanwhile we
should do the same check upon migration entries, hwpoison entries and genuine
swap entries too. To be explicit, we should still remember to keep the private
entries if even_cows==false, and always zap them when even_cows==true.
The issue seems to exist starting from the initial commit of git.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Peter Xu <[email protected]>
---
mm/memory.c | 45 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 36 insertions(+), 9 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..4bfeaca7cbc7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1313,6 +1313,17 @@ struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
};
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
+{
+ /* By default, zap all pages */
+ if (!details)
+ return true;
+
+ /* Or, we zap COWed pages only if the caller wants to */
+ return !details->zap_mapping;
+}
+
/*
* We set details->zap_mapping when we want to unmap shared but keep private
* pages. Return true if skip zapping this page, false otherwise.
@@ -1320,11 +1331,15 @@ struct zap_details {
static inline bool
zap_skip_check_mapping(struct zap_details *details, struct page *page)
{
- if (!details || !page)
+ /* If we can make a decision without *page.. */
+ if (should_zap_cows(details))
return false;
- return details->zap_mapping &&
- (details->zap_mapping != page_rmapping(page));
+ /* E.g. zero page */
+ if (!page)
+ return false;
+
+ return details->zap_mapping != page_rmapping(page);
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1405,17 +1420,29 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
- continue;
-
- if (!non_swap_entry(entry))
+ if (!non_swap_entry(entry)) {
+ /*
+ * If this is a genuine swap entry, then it must be a
+ * private anon page. If the caller wants to skip
+ * COWed pages, ignore it.
+ */
+ if (!should_zap_cows(details))
+ continue;
rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
+ } else if (is_migration_entry(entry)) {
struct page *page;
page = pfn_swap_entry_to_page(entry);
+ if (zap_skip_check_mapping(details, page))
+ continue;
rss[mm_counter(page)]--;
+ } else if (is_hwpoison_entry(entry)) {
+ /* If the caller wants to skip COWed pages, ignore it */
+ if (!should_zap_cows(details))
+ continue;
+ } else {
+ /* We should have covered all the swap entry types */
+ WARN_ON_ONCE(1);
}
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
--
2.32.0
The previous name is against the natural way people think. Invert the meaning
and also the return value. No functional change intended.
Suggested-by: David Hildenbrand <[email protected]>
Suggested-by: Hugh Dickins <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
Signed-off-by: Peter Xu <[email protected]>
---
mm/memory.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 4bfeaca7cbc7..14d8428ff4db 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1326,20 +1326,19 @@ static inline bool should_zap_cows(struct zap_details *details)
/*
* We set details->zap_mapping when we want to unmap shared but keep private
- * pages. Return true if skip zapping this page, false otherwise.
+ * pages. Return true if we should zap this page, false otherwise.
*/
-static inline bool
-zap_skip_check_mapping(struct zap_details *details, struct page *page)
+static inline bool should_zap_page(struct zap_details *details, struct page *page)
{
/* If we can make a decision without *page.. */
if (should_zap_cows(details))
- return false;
+ return true;
/* E.g. zero page */
if (!page)
- return false;
+ return true;
- return details->zap_mapping != page_rmapping(page);
+ return details->zap_mapping == page_rmapping(page);
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1374,7 +1373,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct page *page;
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(zap_skip_check_mapping(details, page)))
+ if (unlikely(!should_zap_page(details, page)))
continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
@@ -1408,7 +1407,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
is_device_exclusive_entry(entry)) {
struct page *page = pfn_swap_entry_to_page(entry);
- if (unlikely(zap_skip_check_mapping(details, page)))
+ if (unlikely(!should_zap_page(details, page)))
continue;
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
@@ -1433,7 +1432,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct page *page;
page = pfn_swap_entry_to_page(entry);
- if (zap_skip_check_mapping(details, page))
+ if (!should_zap_page(details, page))
continue;
rss[mm_counter(page)]--;
} else if (is_hwpoison_entry(entry)) {
--
2.32.0
Clean the code up by merging the device private/exclusive swap entry handling
with the rest, then we merge the pte clear operation too.
The "struct page *" variable is defined in multiple places in the function;
move it upward.
free_swap_and_cache() is only useful for the !non_swap_entry() case, since
only genuine swap entries hold a swap slot reference; move the call into that
branch of the condition.
No functional change intended.
Signed-off-by: Peter Xu <[email protected]>
---
mm/memory.c | 21 ++++++---------------
1 file changed, 6 insertions(+), 15 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index ffa8c7dfe9ad..cade96024349 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,6 +1361,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
+ struct page *page;
+
if (pte_none(ptent))
continue;
@@ -1368,8 +1370,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;
if (pte_present(ptent)) {
- struct page *page;
-
page = vm_normal_page(vma, addr, ptent);
if (unlikely(!should_zap_page(details, page)))
continue;
@@ -1403,21 +1403,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
entry = pte_to_swp_entry(ptent);
if (is_device_private_entry(entry) ||
is_device_exclusive_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
-
+ page = pfn_swap_entry_to_page(entry);
if (unlikely(!should_zap_page(details, page)))
continue;
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
-
if (is_device_private_entry(entry))
page_remove_rmap(page, false);
-
put_page(page);
- continue;
- }
-
- if (!non_swap_entry(entry)) {
+ } else if (!non_swap_entry(entry)) {
/*
* If this is a genuine swap entry, then it must be a
* private anon page. If the caller wants to skip
@@ -1426,9 +1419,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (!should_zap_cows(details))
continue;
rss[MM_SWAPENTS]--;
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
} else if (is_migration_entry(entry)) {
- struct page *page;
-
page = pfn_swap_entry_to_page(entry);
if (!should_zap_page(details, page))
continue;
@@ -1441,8 +1434,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
/* We should have covered all the swap entry types */
WARN_ON_ONCE(1);
}
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
--
2.32.0
Currently we have a zap_mapping pointer maintained in zap_details; when it is
specified, we only want to zap the pages that have the same mapping as the one
the caller specified.
But what we want to do is actually simpler: we want to skip zapping
private (COW-ed) pages in some cases. We can refer to unmap_mapping_pages()
callers where we could have passed in different even_cows values. The other
user is unmap_mapping_folio() where we always want to skip private pages.
According to Hugh, we used a mapping pointer for historical reasons, as
explained here:
https://lore.kernel.org/lkml/[email protected]/
Quoting partly from Hugh:
Which raises the question again of why I did not just use a boolean flag
there originally: aah, I think I've found why. In those days there was a
horrible "optimization", for better performance on some benchmark I guess,
which when you read from /dev/zero into a private mapping, would map the zero
page there (look up read_zero_pagealigned() and zeromap_page_range() if you
dare). So there was another category of page to be skipped along with the
anon COWs, and I didn't want multiple tests in the zap loop, so checking
check_mapping against page->mapping did both. I think nowadays you could do
it by checking for PageAnon page (or genuine swap entry) instead.
This patch replaces the zap_details.zap_mapping pointer with an even_cows
boolean, and then we check it against PageAnon.
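The net effect can be summarized by folding the two helpers into one (an
illustrative sketch only; it combines should_zap_cows() and should_zap_page()
as they read after this patch):

===8<===
/* Illustration: the combined zap decision after this patch */
static inline bool should_zap(struct zap_details *details, struct page *page)
{
	/* No details: zap everything */
	if (!details)
		return true;
	/* The caller asked to zap the COWed (private) pages too */
	if (details->even_cows)
		return true;
	/* No page to inspect (e.g. the zero page): zap */
	if (!page)
		return true;
	/* Otherwise keep anon (COWed) pages, zap the rest */
	return !PageAnon(page);
}
===8<===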
Suggested-by: Hugh Dickins <[email protected]>
Signed-off-by: Peter Xu <[email protected]>
---
mm/memory.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 14d8428ff4db..ffa8c7dfe9ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1309,8 +1309,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
- struct address_space *zap_mapping; /* Check page->mapping if set */
struct folio *single_folio; /* Locked folio to be unmapped */
+ bool even_cows; /* Zap COWed private pages too? */
};
/* Whether we should zap all COWed (private) pages too */
@@ -1321,13 +1321,10 @@ static inline bool should_zap_cows(struct zap_details *details)
return true;
/* Or, we zap COWed pages only if the caller wants to */
- return !details->zap_mapping;
+ return details->even_cows;
}
-/*
- * We set details->zap_mapping when we want to unmap shared but keep private
- * pages. Return true if we should zap this page, false otherwise.
- */
+/* Decides whether we should zap this page with the page pointer specified */
static inline bool should_zap_page(struct zap_details *details, struct page *page)
{
/* If we can make a decision without *page.. */
@@ -1338,7 +1335,8 @@ static inline bool should_zap_page(struct zap_details *details, struct page *pag
if (!page)
return true;
- return details->zap_mapping == page_rmapping(page);
+ /* Otherwise we should only zap non-anon pages */
+ return !PageAnon(page);
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -3403,7 +3401,7 @@ void unmap_mapping_folio(struct folio *folio)
first_index = folio->index;
last_index = folio->index + folio_nr_pages(folio) - 1;
- details.zap_mapping = mapping;
+ details.even_cows = false;
details.single_folio = folio;
i_mmap_lock_write(mapping);
@@ -3432,7 +3430,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t first_index = start;
pgoff_t last_index = start + nr - 1;
- details.zap_mapping = even_cows ? NULL : mapping;
+ details.even_cows = even_cows;
if (last_index < first_index)
last_index = ULONG_MAX;
--
2.32.0
On Wed, 16 Feb 2022 17:48:07 +0800 Peter Xu <[email protected]> wrote:
> The "details" pointer shouldn't be the token to decide whether we should skip
> swap entries. For example, when the user specified details->zap_mapping==NULL,
> it means the user wants to zap all the pages (including COWed pages), then we
> need to look into swap entries because there can be private COWed pages that
> was swapped out.
I assume "user" here means "caller".
> Skipping some swap entries when details is non-NULL may lead to wrongly leaving
> some of the swap entries while we should have zapped them.
>
> A reproducer of the problem:
>
> ...
>
> The issue seems to exist starting from the initial commit of git.
I'll add cc:stable to this one, thanks.
On 2/16/22 1:48 AM, Peter Xu wrote:
> Currently we have a zap_mapping pointer maintained in zap_details, when it is
> specified we only want to zap the pages that has the same mapping with what the
> caller has specified.
>
> But what we want to do is actually simpler: we want to skip zapping
> private (COW-ed) pages in some cases. We can refer to unmap_mapping_pages()
> callers where we could have passed in different even_cows values. The other
> user is unmap_mapping_folio() where we always want to skip private pages.
>
> According to Hugh, we used a mapping pointer for historical reason, as
> explained here:
>
> https://lore.kernel.org/lkml/[email protected]/
>
> Quoting partly from Hugh:
>
> Which raises the question again of why I did not just use a boolean flag
> there originally: aah, I think I've found why. In those days there was a
> horrible "optimization", for better performance on some benchmark I guess,
> which when you read from /dev/zero into a private mapping, would map the zero
> page there (look up read_zero_pagealigned() and zeromap_page_range() if you
> dare). So there was another category of page to be skipped along with the
> anon COWs, and I didn't want multiple tests in the zap loop, so checking
> check_mapping against page->mapping did both. I think nowadays you could do
> it by checking for PageAnon page (or genuine swap entry) instead.
>
> This patch replaced the zap_details.zap_mapping pointer into the even_cows
> boolean, then we check it against PageAnon.
>
> Suggested-by: Hugh Dickins <[email protected]>
> Signed-off-by: Peter Xu <[email protected]>
> ---
> mm/memory.c | 16 +++++++---------
> 1 file changed, 7 insertions(+), 9 deletions(-)
Reviewed-by: John Hubbard <[email protected]>
thanks,
--
John Hubbard
NVIDIA
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 14d8428ff4db..ffa8c7dfe9ad 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1309,8 +1309,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
> * Parameter block passed down to zap_pte_range in exceptional cases.
> */
> struct zap_details {
> - struct address_space *zap_mapping; /* Check page->mapping if set */
> struct folio *single_folio; /* Locked folio to be unmapped */
> + bool even_cows; /* Zap COWed private pages too? */
> };
>
> /* Whether we should zap all COWed (private) pages too */
> @@ -1321,13 +1321,10 @@ static inline bool should_zap_cows(struct zap_details *details)
> return true;
>
> /* Or, we zap COWed pages only if the caller wants to */
> - return !details->zap_mapping;
> + return details->even_cows;
> }
>
> -/*
> - * We set details->zap_mapping when we want to unmap shared but keep private
> - * pages. Return true if we should zap this page, false otherwise.
> - */
> +/* Decides whether we should zap this page with the page pointer specified */
> static inline bool should_zap_page(struct zap_details *details, struct page *page)
> {
> /* If we can make a decision without *page.. */
> @@ -1338,7 +1335,8 @@ static inline bool should_zap_page(struct zap_details *details, struct page *pag
> if (!page)
> return true;
>
> - return details->zap_mapping == page_rmapping(page);
> + /* Otherwise we should only zap non-anon pages */
> + return !PageAnon(page);
> }
>
> static unsigned long zap_pte_range(struct mmu_gather *tlb,
> @@ -3403,7 +3401,7 @@ void unmap_mapping_folio(struct folio *folio)
> first_index = folio->index;
> last_index = folio->index + folio_nr_pages(folio) - 1;
>
> - details.zap_mapping = mapping;
> + details.even_cows = false;
> details.single_folio = folio;
>
> i_mmap_lock_write(mapping);
> @@ -3432,7 +3430,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
> pgoff_t first_index = start;
> pgoff_t last_index = start + nr - 1;
>
> - details.zap_mapping = even_cows ? NULL : mapping;
> + details.even_cows = even_cows;
> if (last_index < first_index)
> last_index = ULONG_MAX;
>
On Wed, Feb 16, 2022 at 06:54:55PM -0800, Andrew Morton wrote:
> On Wed, 16 Feb 2022 17:48:07 +0800 Peter Xu <[email protected]> wrote:
>
> > The "details" pointer shouldn't be the token to decide whether we should skip
> > swap entries. For example, when the user specified details->zap_mapping==NULL,
> > it means the user wants to zap all the pages (including COWed pages), then we
> > need to look into swap entries because there can be private COWed pages that
> > was swapped out.
>
> I assume "user" here means "caller".
Right.
>
> > Skipping some swap entries when details is non-NULL may lead to wrongly leaving
> > some of the swap entries while we should have zapped them.
> >
> > A reproducer of the problem:
> >
> > ...
> >
> > The issue seems to exist starting from the initial commit of git.
>
> I'll add cc:stable to this one, thanks.
Yes, thanks.
I haven't checked the backports to stable yet; some of them might need some
tweaks to the patch. I'll follow that up when it hits the stable trees.
--
Peter Xu
On Wed, Feb 16, 2022 at 07:25:14PM -0800, John Hubbard wrote:
> On 2/16/22 1:48 AM, Peter Xu wrote:
> > Clean the code up by merging the device private/exclusive swap entry handling
> > with the rest, then we merge the pte clear operation too.
>
> Maybe also mention that you reduced the code duplication in the
> is_device_private_entry() area, by letting it fall through to the common
> pte_clear_not_present_full() at the end of the loop? Since you're listing
> the other changes, that one seems worth mentioning.
Isn't that the "we merge the pte clear operation" part? :)
I can add another sentence to it, if it looks better to you:
---8<---
Clean the code up by merging the device private/exclusive swap entry
handling with the rest, then we merge the pte clear operation too. We do
it by letting the private/exclusive block fall through to the last call to
pte_clear_not_present_full().
---8<---
Thanks for the review,
--
Peter Xu
On 2/16/22 20:46, Peter Xu wrote:
...
> I personally used 78/79 col width for a long time for different projects, but
> sure I can adjust my config. I found that the "official guide" points us to
> 75 instead:
>
> https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html
>
> The body of the explanation, line wrapped at 75 columns, which will be copied
> to the permanent changelog to describe this patch.
>
> I'll follow that.
>
Oh, I'll change to that too, then. Thanks for spotting that.
> [...]
>>> + if (!should_zap_cows(details))
>>> + continue;
>>> rss[MM_SWAPENTS]--;
>>> - else if (is_migration_entry(entry)) {
>>
>> Can we put a newline here, and before each "else" block? Because now it
>> is getting very dense, and the visual separation really helps.
>
> The thing is we don't have a rule to add empty lines here, or do we? Changing
> it could make it less like what we have had.
>
I'm not claiming rules or standards, this is just a special case of trying
to make it a little nicer. No big deal either way, of course.
thanks,
--
John Hubbard
NVIDIA
On 2/16/22 20:54, Peter Xu wrote:
> On Wed, Feb 16, 2022 at 07:25:14PM -0800, John Hubbard wrote:
>> On 2/16/22 1:48 AM, Peter Xu wrote:
>>> Clean the code up by merging the device private/exclusive swap entry handling
>>> with the rest, then we merge the pte clear operation too.
>>
>> Maybe also mention that you reduced the code duplication in the
>> is_device_private_entry() area, by letting it fall through to the common
>> pte_clear_not_present_full() at the end of the loop? Since you're listing
>> the other changes, that one seems worth mentioning.
>
> Isn't that the "we merge the pte clear operation" part? :)
>
Somehow that part wasn't as clear to me, but...
> I can add another sentence to it, if it looks better to you:
>
> ---8<---
> Clean the code up by merging the device private/exclusive swap entry
> handling with the rest, then we merge the pte clear operation too. We do
> it by letting the private/exclusive block fall through to the last call to
> pte_clear_not_present_full().
> ---8<---
>
...no need to change anything here. I just wanted the list to be complete,
and it is.
thanks,
--
John Hubbard
NVIDIA
On 2/16/22 1:48 AM, Peter Xu wrote:
> Clean the code up by merging the device private/exclusive swap entry handling
> with the rest, then we merge the pte clear operation too.
Maybe also mention that you reduced the code duplication in the
is_device_private_entry() area, by letting it fall through to the common
pte_clear_not_present_full() at the end of the loop? Since you're listing
the other changes, that one seems worth mentioning.
>
> struct* page is defined in multiple places in the function, move it upward.
>
> free_swap_and_cache() is only useful for !non_swap_entry() case, put it into
> the condition.
>
> No functional change intended.
>
> Signed-off-by: Peter Xu <[email protected]>
> ---
> mm/memory.c | 21 ++++++---------------
> 1 file changed, 6 insertions(+), 15 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index ffa8c7dfe9ad..cade96024349 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1361,6 +1361,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> arch_enter_lazy_mmu_mode();
> do {
> pte_t ptent = *pte;
> + struct page *page;
> +
> if (pte_none(ptent))
> continue;
>
> @@ -1368,8 +1370,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> break;
>
> if (pte_present(ptent)) {
> - struct page *page;
> -
> page = vm_normal_page(vma, addr, ptent);
> if (unlikely(!should_zap_page(details, page)))
> continue;
> @@ -1403,21 +1403,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> entry = pte_to_swp_entry(ptent);
> if (is_device_private_entry(entry) ||
> is_device_exclusive_entry(entry)) {
> - struct page *page = pfn_swap_entry_to_page(entry);
> -
> + page = pfn_swap_entry_to_page(entry);
> if (unlikely(!should_zap_page(details, page)))
> continue;
> - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
Yes! Good cleanup there.
> rss[mm_counter(page)]--;
> -
> if (is_device_private_entry(entry))
> page_remove_rmap(page, false);
> -
> put_page(page);
> - continue;
> - }
> -
> - if (!non_swap_entry(entry)) {
> + } else if (!non_swap_entry(entry)) {
> /*
> * If this is a genuine swap entry, then it must be an
> * private anon page. If the caller wants to skip
> @@ -1426,9 +1419,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> if (!should_zap_cows(details))
> continue;
> rss[MM_SWAPENTS]--;
> + if (unlikely(!free_swap_and_cache(entry)))
> + print_bad_pte(vma, addr, ptent, NULL);
> } else if (is_migration_entry(entry)) {
> - struct page *page;
> -
> page = pfn_swap_entry_to_page(entry);
> if (!should_zap_page(details, page))
> continue;
> @@ -1441,8 +1434,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> /* We should have covered all the swap entry types */
> WARN_ON_ONCE(1);
> }
> - if (unlikely(!free_swap_and_cache(entry)))
> - print_bad_pte(vma, addr, ptent, NULL);
> pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> } while (pte++, addr += PAGE_SIZE, addr != end);
>
Reviewed-by: John Hubbard <[email protected]>
thanks,
--
John Hubbard
NVIDIA
On Wed, Feb 16, 2022 at 07:15:30PM -0800, John Hubbard wrote:
> On 2/16/22 1:48 AM, Peter Xu wrote:
> > The "details" pointer shouldn't be the token to decide whether we should skip
> > swap entries. For example, when the user specified details->zap_mapping==NULL,
> > it means the user wants to zap all the pages (including COWed pages), then we
> > need to look into swap entries because there can be private COWed pages that
> > was swapped out.
>
> Hi Peter,
>
> The changes look good, just some minor readability suggestions below:
>
> (btw, hch is going to ask you to reflow all of the commit descriptions
> to 72 cols, so you might as well do it in advance. :)
Thanks for the heads-up. :)
I personally used 78/79 col width for a long time for different projects, but
sure I can adjust my config. I found that the "official guide" points us to
75 instead:
https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html
The body of the explanation, line wrapped at 75 columns, which will be copied
to the permanent changelog to describe this patch.
I'll follow that.
[...]
> > @@ -1320,11 +1331,15 @@ struct zap_details {
> > static inline bool
> > zap_skip_check_mapping(struct zap_details *details, struct page *page)
> > {
> > - if (!details || !page)
> > + /* If we can make a decision without *page.. */
> > + if (should_zap_cows(details))
> > return false;
> >
> > - return details->zap_mapping &&
> > - (details->zap_mapping != page_rmapping(page));
> > + /* E.g. zero page */
>
> It's a bit confusing to see a comment that "this could be the zero page", if
> the value is NULL. Maybe, "the caller passes NULL for the case of a zero
> page", or something along those lines?
It didn't show much difference to me.. but sure, I can reword it.
>
>
> > + if (!page)
> > + return false;
> > +
> > + return details->zap_mapping != page_rmapping(page);
> > }
> >
> > static unsigned long zap_pte_range(struct mmu_gather *tlb,
> > @@ -1405,17 +1420,29 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> > continue;
> > }
> >
> > - /* If details->check_mapping, we leave swap entries. */
> > - if (unlikely(details))
> > - continue;
> > -
> > - if (!non_swap_entry(entry))
> > + if (!non_swap_entry(entry)) {
> > + /*
> > + * If this is a genuine swap entry, then it must be an
> > + * private anon page. If the caller wants to skip
> > + * COWed pages, ignore it.
> > + */
>
> How about this instead:
>
> /* Genuine swap entry, and therefore a private anon page. */
Yes the last sentence is kind of redundant.
>
> > + if (!should_zap_cows(details))
> > + continue;
> > rss[MM_SWAPENTS]--;
> > - else if (is_migration_entry(entry)) {
>
> Can we put a newline here, and before each "else" block? Because now it
> is getting very dense, and the visual separation really helps.
The thing is we don't have a rule to add empty lines here, or do we? Changing
it could make the code less consistent with what we already have.
Personally it looks fine, because separations are done with either new lines or
indents. Here it's done via indents, IMHO.
>
> > + } else if (is_migration_entry(entry)) {
> > struct page *page;
> >
> > page = pfn_swap_entry_to_page(entry);
> > + if (zap_skip_check_mapping(details, page))
> > + continue;
> > rss[mm_counter(page)]--;
>
> Newline here.
>
> > + } else if (is_hwpoison_entry(entry)) {
> > + /* If the caller wants to skip COWed pages, ignore it */
>
> Likewise, I'd prefer we delete that comment, because it exactly matches
> what the following line of code says.
Will do.
>
> > + if (!should_zap_cows(details))
> > + continue;
>
> And newline here too.
>
> > + } else {
> > + /* We should have covered all the swap entry types */
> > + WARN_ON_ONCE(1);
> > }
> > if (unlikely(!free_swap_and_cache(entry)))
> > print_bad_pte(vma, addr, ptent, NULL);
>
> Those are all just nits, and as I mentioned, the actual changes look good
> to me, so:
>
> Reviewed-by: John Hubbard <[email protected]>
Thanks,
--
Peter Xu
On 2/16/22 1:48 AM, Peter Xu wrote:
> The "details" pointer shouldn't be the token to decide whether we should skip
> swap entries. For example, when the user specified details->zap_mapping==NULL,
> it means the user wants to zap all the pages (including COWed pages), then we
> need to look into swap entries because there can be private COWed pages that
> was swapped out.
Hi Peter,
The changes look good, just some minor readability suggestions below:
(btw, hch is going to ask you to reflow all of the commit descriptions
to 72 cols, so you might as well do it in advance. :)
>
> Skipping some swap entries when details is non-NULL may lead to wrongly leaving
> some of the swap entries while we should have zapped them.
>
> A reproducer of the problem:
>
> ===8<===
> #define _GNU_SOURCE /* See feature_test_macros(7) */
> #include <stdio.h>
> #include <assert.h>
> #include <unistd.h>
> #include <sys/mman.h>
> #include <sys/types.h>
>
> int page_size;
> int shmem_fd;
> char *buffer;
>
> void main(void)
> {
> int ret;
> char val;
>
> page_size = getpagesize();
> shmem_fd = memfd_create("test", 0);
> assert(shmem_fd >= 0);
>
> ret = ftruncate(shmem_fd, page_size * 2);
> assert(ret == 0);
>
> buffer = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
> MAP_PRIVATE, shmem_fd, 0);
> assert(buffer != MAP_FAILED);
>
> /* Write private page, swap it out */
> buffer[page_size] = 1;
> madvise(buffer, page_size * 2, MADV_PAGEOUT);
>
> /* This should drop private buffer[page_size] already */
> ret = ftruncate(shmem_fd, page_size);
> assert(ret == 0);
> /* Recover the size */
> ret = ftruncate(shmem_fd, page_size * 2);
> assert(ret == 0);
>
> /* Re-read the data, it should be all zero */
> val = buffer[page_size];
> if (val == 0)
> printf("Good\n");
> else
> printf("BUG\n");
> }
> ===8<===
>
> We don't need to touch up the pmd path, because pmd never had a issue with swap
> entries. For example, shmem pmd migration will always be split into pte level,
> and same to swapping on anonymous.
>
> Add another helper should_zap_cows() so that we can also check whether we
> should zap private mappings when there's no page pointer specified.
>
> This patch drops that trick, so we handle swap ptes coherently. Meanwhile we
> should do the same check upon migration entry, hwpoison entry and genuine swap
> entries too. To be explicit, we should still remember to keep the private
> entries if even_cows==false, and always zap them when even_cows==true.
>
> The issue seems to exist starting from the initial commit of git.
>
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Peter Xu <[email protected]>
> ---
> mm/memory.c | 45 ++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index c125c4969913..4bfeaca7cbc7 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1313,6 +1313,17 @@ struct zap_details {
> struct folio *single_folio; /* Locked folio to be unmapped */
> };
>
> +/* Whether we should zap all COWed (private) pages too */
> +static inline bool should_zap_cows(struct zap_details *details)
> +{
> + /* By default, zap all pages */
> + if (!details)
> + return true;
> +
> + /* Or, we zap COWed pages only if the caller wants to */
> + return !details->zap_mapping;
> +}
> +
> /*
> * We set details->zap_mapping when we want to unmap shared but keep private
> * pages. Return true if skip zapping this page, false otherwise.
> @@ -1320,11 +1331,15 @@ struct zap_details {
> static inline bool
> zap_skip_check_mapping(struct zap_details *details, struct page *page)
> {
> - if (!details || !page)
> + /* If we can make a decision without *page.. */
> + if (should_zap_cows(details))
> return false;
>
> - return details->zap_mapping &&
> - (details->zap_mapping != page_rmapping(page));
> + /* E.g. zero page */
It's a bit confusing to see a comment that "this could be the zero page", if
the value is NULL. Maybe, "the caller passes NULL for the case of a zero
page", or something along those lines?
> + if (!page)
> + return false;
> +
> + return details->zap_mapping != page_rmapping(page);
> }
>
> static unsigned long zap_pte_range(struct mmu_gather *tlb,
> @@ -1405,17 +1420,29 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> continue;
> }
>
> - /* If details->check_mapping, we leave swap entries. */
> - if (unlikely(details))
> - continue;
> -
> - if (!non_swap_entry(entry))
> + if (!non_swap_entry(entry)) {
> + /*
> + * If this is a genuine swap entry, then it must be an
> + * private anon page. If the caller wants to skip
> + * COWed pages, ignore it.
> + */
How about this instead:
/* Genuine swap entry, and therefore a private anon page. */
> + if (!should_zap_cows(details))
> + continue;
> rss[MM_SWAPENTS]--;
> - else if (is_migration_entry(entry)) {
Can we put a newline here, and before each "else" block? Because now it
is getting very dense, and the visual separation really helps.
> + } else if (is_migration_entry(entry)) {
> struct page *page;
>
> page = pfn_swap_entry_to_page(entry);
> + if (zap_skip_check_mapping(details, page))
> + continue;
> rss[mm_counter(page)]--;
Newline here.
> + } else if (is_hwpoison_entry(entry)) {
> + /* If the caller wants to skip COWed pages, ignore it */
Likewise, I'd prefer we delete that comment, because it exactly matches
what the following line of code says.
> + if (!should_zap_cows(details))
> + continue;
And newline here too.
> + } else {
> + /* We should have covered all the swap entry types */
> + WARN_ON_ONCE(1);
> }
> if (unlikely(!free_swap_and_cache(entry)))
> print_bad_pte(vma, addr, ptent, NULL);
Those are all just nits, and as I mentioned, the actual changes look good
to me, so:
Reviewed-by: John Hubbard <[email protected]>
thanks,
--
John Hubbard
NVIDIA
On 2/16/22 1:48 AM, Peter Xu wrote:
> The previous name is against the natural way people think. Invert the meaning
Definitely.
> and also the return value. No functional change intended.
>
> Suggested-by: David Hildenbrand <[email protected]>
> Suggested-by: Hugh Dickins <[email protected]>
> Reviewed-by: David Hildenbrand <[email protected]>
> Signed-off-by: Peter Xu <[email protected]>
> ---
> mm/memory.c | 17 ++++++++---------
> 1 file changed, 8 insertions(+), 9 deletions(-)
Reviewed-by: John Hubbard <[email protected]>
thanks,
--
John Hubbard
NVIDIA
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 4bfeaca7cbc7..14d8428ff4db 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1326,20 +1326,19 @@ static inline bool should_zap_cows(struct zap_details *details)
>
> /*
> * We set details->zap_mapping when we want to unmap shared but keep private
> - * pages. Return true if skip zapping this page, false otherwise.
> + * pages. Return true if we should zap this page, false otherwise.
> */
> -static inline bool
> -zap_skip_check_mapping(struct zap_details *details, struct page *page)
> +static inline bool should_zap_page(struct zap_details *details, struct page *page)
> {
> /* If we can make a decision without *page.. */
> if (should_zap_cows(details))
> - return false;
> + return true;
>
> /* E.g. zero page */
> if (!page)
> - return false;
> + return true;
>
> - return details->zap_mapping != page_rmapping(page);
> + return details->zap_mapping == page_rmapping(page);
> }
>
> static unsigned long zap_pte_range(struct mmu_gather *tlb,
> @@ -1374,7 +1373,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> struct page *page;
>
> page = vm_normal_page(vma, addr, ptent);
> - if (unlikely(zap_skip_check_mapping(details, page)))
> + if (unlikely(!should_zap_page(details, page)))
> continue;
> ptent = ptep_get_and_clear_full(mm, addr, pte,
> tlb->fullmm);
> @@ -1408,7 +1407,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> is_device_exclusive_entry(entry)) {
> struct page *page = pfn_swap_entry_to_page(entry);
>
> - if (unlikely(zap_skip_check_mapping(details, page)))
> + if (unlikely(!should_zap_page(details, page)))
> continue;
> pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> rss[mm_counter(page)]--;
> @@ -1433,7 +1432,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> struct page *page;
>
> page = pfn_swap_entry_to_page(entry);
> - if (zap_skip_check_mapping(details, page))
> + if (!should_zap_page(details, page))
> continue;
> rss[mm_counter(page)]--;
> } else if (is_hwpoison_entry(entry)) {