Currently try_to_unmap() returns a bool computed from page_mapcount(),
but this may yield a false positive, because page_mapcount() doesn't
check all subpages of a compound page. total_mapcount() could be used
instead, but it is more expensive since it traverses all subpages.

Actually, most callers of try_to_unmap() don't care about the return
value at all, so just check whether the page is still mapped with
page_mapped() when necessary; page_mapped() does bail out early once
it finds a mapped subpage.
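
For illustration, here is a simplified sketch of page_mapped() (not the
exact mm/util.c source), showing why it catches PTE-mapped subpages that
page_mapcount() on the head misses:

	/*
	 * Sketch only: page_mapcount() on a THP head reflects just the
	 * compound mapcount plus the head's own _mapcount, whereas
	 * page_mapped() also scans the subpages, returning true at the
	 * first mapped one it finds.
	 */
	bool page_mapped(struct page *page)
	{
		int i;

		if (likely(!PageCompound(page)))
			return atomic_read(&page->_mapcount) >= 0;
		page = compound_head(page);
		if (atomic_read(compound_mapcount_ptr(page)) >= 0)
			return true;	/* PMD-mapped */
		if (PageHuge(page))
			return false;
		for (i = 0; i < compound_nr(page); i++)
			if (atomic_read(&page[i]._mapcount) >= 0)
				return true;	/* bail at first mapped subpage */
		return false;
	}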
Suggested-by: Hugh Dickins <[email protected]>
Signed-off-by: Yang Shi <[email protected]>
---
include/linux/rmap.h | 2 +-
mm/huge_memory.c | 4 +---
mm/memory-failure.c | 13 ++++++-------
mm/rmap.c | 6 +-----
mm/vmscan.c | 3 ++-
5 files changed, 11 insertions(+), 17 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..116cb193110a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -194,7 +194,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
int page_referenced(struct page *, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags);
-bool try_to_unmap(struct page *, enum ttu_flags flags);
+void try_to_unmap(struct page *, enum ttu_flags flags);
/* Avoid racy checks */
#define PVMW_SYNC (1 << 0)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 19195fca1aee..80fe642d742d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2336,15 +2336,13 @@ static void unmap_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ try_to_unmap(page, ttu_flags);
}
static void remap_page(struct page *page, unsigned int nr)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9dcc9bcea731..6dd53ff34825 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1126,7 +1126,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
+ try_to_unmap(hpage, ttu);
} else {
if (!PageAnon(hpage)) {
/*
@@ -1138,17 +1138,16 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
*/
mapping = hugetlb_page_mapping_lock_write(hpage);
if (mapping) {
- unmap_success = try_to_unmap(hpage,
- ttu|TTU_RMAP_LOCKED);
+ try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
- } else {
+ } else
pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
- unmap_success = false;
- }
} else {
- unmap_success = try_to_unmap(hpage, ttu);
+ try_to_unmap(hpage, ttu);
}
}
+
+ unmap_success = !page_mapped(hpage);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
diff --git a/mm/rmap.c b/mm/rmap.c
index a35cbbbded0d..728de421e43a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1748,10 +1748,8 @@ static int page_not_mapped(struct page *page)
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
- *
- * If unmap is successful, return true. Otherwise, false.
*/
-bool try_to_unmap(struct page *page, enum ttu_flags flags)
+void try_to_unmap(struct page *page, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
@@ -1776,8 +1774,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
rmap_walk_locked(page, &rwc);
else
rmap_walk(page, &rwc);
-
- return !page_mapcount(page) ? true : false;
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f96d62159720..fa5052ace415 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1499,7 +1499,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
- if (!try_to_unmap(page, flags)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page)) {
stat->nr_unmap_fail += nr_pages;
if (!was_swapbacked && PageSwapBacked(page))
stat->nr_lazyfree_fail += nr_pages;
--
2.26.2
On Tue, May 25, 2021 at 09:21:44AM -0700, Yang Shi wrote:
> [...]
> @@ -1776,8 +1774,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
> rmap_walk_locked(page, &rwc);
> else
> rmap_walk(page, &rwc);
> -
> - return !page_mapcount(page) ? true : false;
Couldn't we use page_mapped instead of page_mapcount here?
With a boolean return, the try semantics look more reasonable to me
than void.
When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
return a false positive for a PTE-mapped THP, since page_mapcount() is
used to check whether the THP is unmapped: it only checks the compound
mapcount and the head page's mapcount. If the THP is PTE-mapped and its
head page is not mapped, it may report success falsely.

try_to_unmap() has been changed to a void function, so check
page_mapped() after it. Also change the BUG_ON to a WARN_ON, since
this is not a fatal issue.
[1] https://lore.kernel.org/linux-mm/[email protected]/
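
To make the failure mode concrete, here is a hypothetical mapcount
layout (illustrative values only) for a PTE-mapped THP whose head page
has been unmapped:

	/*
	 * compound_mapcount(head) == 0    no PMD mapping left
	 * page_mapcount(head)     == 0    head PTE unmapped
	 * page_mapcount(head + 1) == 1    a tail page still PTE-mapped
	 *
	 * page_mapcount(head) sees only the first two counters, so the
	 * old "return !page_mapcount(page)" claimed success, while
	 * page_mapped(head) walks the subpages and correctly reports
	 * the THP as still mapped.
	 */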
Reviewed-by: Zi Yan <[email protected]>
Signed-off-by: Yang Shi <[email protected]>
---
v3: Incorporated the comments from Hugh. Keep Zi Yan's reviewed-by tag
since there is no fundamental change against v2.
v2: Removed dead code and updated the comment of try_to_unmap() per Zi
Yan.
mm/huge_memory.c | 17 +++++------------
1 file changed, 5 insertions(+), 12 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 80fe642d742d..72d81d8e01b1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2343,6 +2343,8 @@ static void unmap_page(struct page *page)
ttu_flags |= TTU_SPLIT_FREEZE;
try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page, unsigned int nr)
@@ -2653,7 +2655,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
+ int mapcount, extra_pins, ret;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2712,7 +2714,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
@@ -2730,7 +2731,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
@@ -2752,16 +2752,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__split_huge_page(page, list, end);
ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
+fail:
+ if (mapping)
xa_unlock(&mapping->i_pages);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
--
2.26.2
On Tue, May 25, 2021 at 9:46 AM Minchan Kim <[email protected]> wrote:
>
> On Tue, May 25, 2021 at 09:21:44AM -0700, Yang Shi wrote:
> > [...]
> > @@ -1776,8 +1774,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
> > rmap_walk_locked(page, &rwc);
> > else
> > rmap_walk(page, &rwc);
> > -
> > - return !page_mapcount(page) ? true : false;
>
> Couldn't we use page_mapped instead of page_mapcount here?
Yes, of course. Actually this was discussed during the v2 review. About
half of the callers of try_to_unmap() don't check its return value at
all; only hwpoison, vmscan and THP split do. It seems suboptimal to make
every caller pay the cost of a reliable check, so Hugh's suggestion made
sense to me.
Quoted the discussion below:
> @@ -1777,7 +1779,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
> else
> rmap_walk(page, &rwc);
>
> - return !page_mapcount(page) ? true : false;
> + return !total_mapcount(page) ? true : false;
That always made me wince: "return !total_mapcount(page);" surely.
Or slightly better, "return !page_mapped(page);", since at least that
one breaks out as soon as it sees a mapcount. Though I guess I'm
being silly there, since that case should never occur, so both
total_mapcount() and page_mapped() scan through all pages.
Or better, change try_to_unmap() to void: most callers ignore its
return value anyway, and make their own decisions; the remaining
few could be changed to do the same. Though again, I may be
being silly, since the expensive THP case is not the common case.
On Tue, May 25, 2021 at 10:07:05AM -0700, Yang Shi wrote:
> On Tue, May 25, 2021 at 9:46 AM Minchan Kim <[email protected]> wrote:
> >
> > On Tue, May 25, 2021 at 09:21:44AM -0700, Yang Shi wrote:
> > > [...]
> > > @@ -1776,8 +1774,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
> > > rmap_walk_locked(page, &rwc);
> > > else
> > > rmap_walk(page, &rwc);
> > > -
> > > - return !page_mapcount(page) ? true : false;
> >
> > Couldn't we use page_mapped instead of page_mapcount here?
>
> Yes, of course. Actually this was discussed during the v2 review. About
> half of the callers of try_to_unmap() don't check its return value at
> all; only hwpoison, vmscan and THP split do. It seems suboptimal to make
> every caller pay the cost of a reliable check, so Hugh's suggestion made
> sense to me.
I'm not sure most callers ignore the return value: I see only
do_migrate_range() ignoring it. The others do check for success, with
page_mapped() in the end.

With a void return, it no longer has try semantics. If you still want
to go with it, I suggest documenting how to check whether the function
succeeded, in the comment block you removed above.
On Tue, May 25, 2021 at 10:34 AM Minchan Kim <[email protected]> wrote:
>
> On Tue, May 25, 2021 at 10:07:05AM -0700, Yang Shi wrote:
> > On Tue, May 25, 2021 at 9:46 AM Minchan Kim <[email protected]> wrote:
> > >
> > > On Tue, May 25, 2021 at 09:21:44AM -0700, Yang Shi wrote:
> > > > [...]
> > > > @@ -1776,8 +1774,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
> > > > rmap_walk_locked(page, &rwc);
> > > > else
> > > > rmap_walk(page, &rwc);
> > > > -
> > > > - return !page_mapcount(page) ? true : false;
> > >
> > > Couldn't we use page_mapped instead of page_mapcount here?
> >
> > Yes, of course. Actually this was discussed during the v2 review. About
> > half of the callers of try_to_unmap() don't check its return value at
> > all; only hwpoison, vmscan and THP split do. It seems suboptimal to make
> > every caller pay the cost of a reliable check, so Hugh's suggestion made
> > sense to me.
>
> I'm not sure most callers ignore the return value: I see only
> do_migrate_range() ignoring it. The others do check for success, with
> page_mapped() in the end.
I'd say that falls into the "ignore" category as well, since the code
doesn't check the return value of try_to_unmap() :-). The patch
converts the return-value check into a page_mapped() check right after
try_to_unmap().
>
> With a void return, it no longer has try semantics. If you still want
> to go with it, I suggest documenting how to check whether the function
> succeeded, in the comment block you removed above.
Thanks for the suggestion. Will add some notes about how to do the check.
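
Roughly along these lines, as a draft of that note for the
try_to_unmap() kernel-doc (wording still to be refined):

	/**
	 * try_to_unmap - try to remove all page table mappings to a page
	 * @page: the page to get unmapped
	 * @flags: action and flags
	 *
	 * Tries to remove all the page table entries which are mapping this
	 * page, used in the pageout path.  Caller must hold the page lock.
	 *
	 * Now returning void: it is the caller's responsibility to check
	 * whether the page is actually unmapped afterwards, e.g. with
	 * page_mapped(page), if it cares about the outcome.
	 */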
On Tue, 25 May 2021, Yang Shi wrote:
> Currently try_to_unmap() returns a bool computed from page_mapcount(),
> but this may yield a false positive, because page_mapcount() doesn't
> check all subpages of a compound page. total_mapcount() could be used
> instead, but it is more expensive since it traverses all subpages.
>
> Actually, most callers of try_to_unmap() don't care about the return
> value at all, so just check whether the page is still mapped with
> page_mapped() when necessary; page_mapped() does bail out early once
> it finds a mapped subpage.
>
> Suggested-by: Hugh Dickins <[email protected]>
> Signed-off-by: Yang Shi <[email protected]>
Thanks for doing this, I like it, you can add
Acked-by: Hugh Dickins <[email protected]>

Let's all ignore checkpatch's complaint on "struct page *":
that try_to_unmap() prototype is consistent with others in rmap.h,
and changing them to "struct page *page" adds no information at all.

But a nit in mm/memory-failure.c: unmap_success no longer needs
to be initialized to true on entry to hwpoison_user_mappings().

More seriously, I question the ordering of the two patches:
I think this 1/2 should be 2/2, and the other be 1/2. The other is
the one which replaces a BUG by a WARN, which will need Cc stable;
whereas this one is really just a cleanup - I don't think there's a
serious consequence from the faint possibility of a wrong unmap_success
on THP, in either memory-failure.c or vmscan.c - wrong message, wrong
stats, but more than that? And memory-failure.c on THP is somewhat a
work in progress IIUC.

Unfortunately, they can't just be swapped but need rediffing.
Or do you disagree that's a better ordering?

As it stands, I imagine my series of THP unmap fixes (most needing Cc
stable) coming on top of your "mm: thp: check page_mapped instead of
page_mapcount for split" (umm, that's a bad title), but before this
cleanup.

Though I'm responding before I've looked through the stable trees:
it's not of overriding importance, but I'll want to try to minimize
the number of patches that need redoing for older stables, which might
affect the ordering. (And it might end up easiest if I bring your two
patches into my series, then ask Andrew to replace.)

Further complicated by Andrew having brought in the nouveau reorg of
mm/rmap.c (oh, and that touches mm/huge_memory.c unmap_page() too).
I expect I'll have to send Andrew a series to go before all that,
together with fixups to that to sit on top of it. Sigh. Well,
we can only blame me for not getting those THP fixes in sooner.

Hugh
On Tue, May 25, 2021 at 2:12 PM Hugh Dickins <[email protected]> wrote:
>
> On Tue, 25 May 2021, Yang Shi wrote:
>
> > [...]
>
> Thanks for doing this, I like it, you can add
> Acked-by: Hugh Dickins <[email protected]>
Thank you.
>
> Let's all ignore checkpatch's complaint on "struct page *":
> that try_to_unmap() prototype is consistent with others in rmap.h,
> and changing them to "struct page *page" adds no information at all.
>
> But a nit in mm/memory-failure.c: unmap_success no longer needs
> to be initialized to true on entry to hwpoison_user_mappings().
Will fix it.
>
> More seriously, I question the ordering of the two patches:
> I think this 1/2 should be 2/2, and the other be 1/2. The other is
> the one which replaces a BUG by a WARN, which will need Cc stable;
> whereas this one is really just a cleanup - I don't think there's a
> serious consequence from the faint possibility of wrong unmap_success
> on THP, in either memory-failure.c or vmscan.c - wrong message, wrong
> stats, but more than that? And memory-failure.c on THP is somewhat a
> work in progress IIUC.
>
> Unfortunately, they can't just be swapped but need rediffing.
> Or do you disagree that's a better ordering?
No, I don't disagree - that's a better ordering. Will reorder them in
the next version; I need to fix some things anyway.
On Tue, May 25, 2021 at 3:06 PM Hugh Dickins <[email protected]> wrote:
>
> On Tue, 25 May 2021, Yang Shi wrote:
>
> > When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
> > return a false positive for a PTE-mapped THP, since page_mapcount() is
> > used to check whether the THP is unmapped: it only checks the compound
> > mapcount and the head page's mapcount. If the THP is PTE-mapped and its
> > head page is not mapped, it may report success falsely.
>
> But those false positives did not matter because there was a separate
> DEBUG_VM check later.
>
> It's good to have the link to Wang Yugui's report, but that paragraph
> is not really about this patch, as it has evolved now: this patch
> consolidates the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE.
>
> >
> > try_to_unmap() has been changed to a void function, so check
> > page_mapped() after it. Also change the BUG_ON to a WARN_ON, since
> > this is not a fatal issue.
>
> The change from DEBUG_VM BUG to VM_WARN_ON_ONCE is the most important
> part of this, and the reason it's good for stable: and the patch title
> ought to highlight that, not the page_mapcount business.
Will update the subject and the commit log accordingly.
>
> >
> > [1] https://lore.kernel.org/linux-mm/[email protected]/
> >
> > Reviewed-by: Zi Yan <[email protected]>
> > Signed-off-by: Yang Shi <[email protected]>
>
> This will be required Cc: [email protected]
> (but we don't want to Cc them on this mail).
>
> As I said on the other, I think this should be 1/2 not 2/2.
Sure.
>
> > ---
> > v3: Incorporated the comments from Hugh. Keep Zi Yan's reviewed-by tag
> > since there is no fundamental change against v2.
> > v2: Removed dead code and updated the comment of try_to_unmap() per Zi
> > Yan.
> > mm/huge_memory.c | 17 +++++------------
> > 1 file changed, 5 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 80fe642d742d..72d81d8e01b1 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2343,6 +2343,8 @@ static void unmap_page(struct page *page)
> > ttu_flags |= TTU_SPLIT_FREEZE;
> >
> > try_to_unmap(page, ttu_flags);
> > +
> > + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
>
> There is one useful piece of information that dump_page() will not show:
> total_mapcount(page). Is there a way of crafting that into the output?
>
> Not with the macros available, I think. Maybe we should be optimistic
> and assume I already have the fixes, so not worth trying to refine the
> message (but I'm not entirely convinced of that!).
>
> The trouble with
> if (VM_WARN_ON_ONCE_PAGE(page_mapped(page), page))
> pr_warn("total_mapcount:%d\n", total_mapcount(page));
> is that it's printed regardless of the ONCEness. Another "trouble"
> is that it's printed so long after the page_mapped(page) check that
> it may be 0 by now - but one can see that as itself informative.
We should be able to make dump_page() print the total mapcount, right?
dump_page() is only called on error paths, so taking some extra
overhead to dump more information seems harmless, or am I missing
something? Of course, this can be done in a separate patch.
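
A minimal sketch of what that separate patch might add to __dump_page()
(assuming its existing head local; the exact format strings in
mm/debug.c may differ):

	/* sketch: also report the total mapcount for compound pages */
	if (PageCompound(page))
		pr_warn("compound_mapcount:%d total_mapcount:%d\n",
			compound_mapcount(head), total_mapcount(head));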
>
> I guess leave it as you have it, I don't see an easy better
> (without going back to something like the old contortions).
>
> > }
> >
> > static void remap_page(struct page *page, unsigned int nr)
> > @@ -2653,7 +2655,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> > struct deferred_split *ds_queue = get_deferred_split_queue(head);
> > struct anon_vma *anon_vma = NULL;
> > struct address_space *mapping = NULL;
> > - int count, mapcount, extra_pins, ret;
> > + int mapcount, extra_pins, ret;
>
> Remove mapcount too.
>
> > pgoff_t end;
> >
> > VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
> > @@ -2712,7 +2714,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> > }
> >
> > unmap_page(head);
> > - VM_BUG_ON_PAGE(compound_mapcount(head), head);
> >
> > /* block interrupt reentry in xa_lock and spinlock */
> > local_irq_disable();
> > @@ -2730,7 +2731,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> >
> > /* Prevent deferred_split_scan() touching ->_refcount */
> > spin_lock(&ds_queue->split_queue_lock);
> > - count = page_count(head);
> > mapcount = total_mapcount(head);
> > if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
>
> mapcount was useful for printing in the hand-crafted message deleted,
> but serves no purpose now: just do the page_ref_freeze() without it.
Aha, yes, good catch. If the mapcount is not zero, the refcount freeze
won't succeed anyway.
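
(That follows from page_ref_freeze() being a strict compare-and-exchange
on the refcount; sketch of the include/linux/page_ref.h helper, with the
tracepoint hook omitted:

	static inline int page_ref_freeze(struct page *page, int count)
	{
		return likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
	}

Since every page table mapping holds a reference on the head page, a
nonzero total mapcount means _refcount exceeds 1 + extra_pins and the
cmpxchg fails.)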
On Tue, 25 May 2021, Yang Shi wrote:
> When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
> return a false positive for a PTE-mapped THP, since page_mapcount() is
> used to check whether the THP is unmapped: it only checks the compound
> mapcount and the head page's mapcount. If the THP is PTE-mapped and its
> head page is not mapped, it may report success falsely.
But those false positives did not matter because there was a separate
DEBUG_VM check later.

It's good to have the link to Wang Yugui's report, but that paragraph
is not really about this patch, as it has evolved now: this patch
consolidates the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE.
>
> try_to_unmap() has been changed to a void function, so check
> page_mapped() after it. Also change the BUG_ON to a WARN_ON, since
> this is not a fatal issue.
The change from DEBUG_VM BUG to VM_WARN_ON_ONCE is the most important
part of this, and the reason it's good for stable: and the patch title
ought to highlight that, not the page_mapcount business.
>
> [1] https://lore.kernel.org/linux-mm/[email protected]/
>
> Reviewed-by: Zi Yan <[email protected]>
> Signed-off-by: Yang Shi <[email protected]>
This will be required Cc: [email protected]
(but we don't want to Cc them on this mail).
As I said on the other, I think this should be 1/2 not 2/2.
> ---
> v3: Incorporated the comments from Hugh. Keep Zi Yan's reviewed-by tag
> since there is no fundamental change against v2.
> v2: Removed dead code and updated the comment of try_to_unmap() per Zi
> Yan.
> mm/huge_memory.c | 17 +++++------------
> 1 file changed, 5 insertions(+), 12 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 80fe642d742d..72d81d8e01b1 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2343,6 +2343,8 @@ static void unmap_page(struct page *page)
> ttu_flags |= TTU_SPLIT_FREEZE;
>
> try_to_unmap(page, ttu_flags);
> +
> + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
There is one useful piece of information that dump_page() will not show:
total_mapcount(page). Is there a way of crafting that into the output?

Not with the macros available, I think. Maybe we should be optimistic
and assume I already have the fixes, so not worth trying to refine the
message (but I'm not entirely convinced of that!).

The trouble with

	if (VM_WARN_ON_ONCE_PAGE(page_mapped(page), page))
		pr_warn("total_mapcount:%d\n", total_mapcount(page));

is that it's printed regardless of the ONCEness. Another "trouble"
is that it's printed so long after the page_mapped(page) check that
it may be 0 by now - but one can see that as itself informative.

I guess leave it as you have it, I don't see an easy better
(without going back to something like the old contortions).
> }
>
> static void remap_page(struct page *page, unsigned int nr)
> @@ -2653,7 +2655,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> struct deferred_split *ds_queue = get_deferred_split_queue(head);
> struct anon_vma *anon_vma = NULL;
> struct address_space *mapping = NULL;
> - int count, mapcount, extra_pins, ret;
> + int mapcount, extra_pins, ret;
Remove mapcount too.
> pgoff_t end;
>
> VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
> @@ -2712,7 +2714,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
> }
>
> unmap_page(head);
> - VM_BUG_ON_PAGE(compound_mapcount(head), head);
>
> /* block interrupt reentry in xa_lock and spinlock */
> local_irq_disable();
> @@ -2730,7 +2731,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>
> /* Prevent deferred_split_scan() touching ->_refcount */
> spin_lock(&ds_queue->split_queue_lock);
> - count = page_count(head);
> mapcount = total_mapcount(head);
> if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
mapcount was useful for printing in the hand-crafted message deleted,
but serves no purpose now: just do the page_ref_freeze() without it.
On Tue, 25 May 2021, Yang Shi wrote:
> On Tue, May 25, 2021 at 3:06 PM Hugh Dickins <[email protected]> wrote:
> > On Tue, 25 May 2021, Yang Shi wrote:
> >
> > > When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
> > > return a false positive for a PTE-mapped THP, since page_mapcount() is
> > > used to check whether the THP is unmapped: it only checks the compound
> > > mapcount and the head page's mapcount. If the THP is PTE-mapped and its
> > > head page is not mapped, it may report success falsely.
> >
> > But those false positives did not matter because there was a separate
> > DEBUG_VM check later.
> >
> > It's good to have the link to Wang Yugui's report, but that paragraph
> > is not really about this patch, as it has evolved now: this patch
> > consolidates the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE.
> >
> > >
> > > try_to_unmap() has been changed to a void function, so check
> > > page_mapped() after it. Also change the BUG_ON to a WARN_ON, since
> > > this is not a fatal issue.
> >
> > The change from DEBUG_VM BUG to VM_WARN_ON_ONCE is the most important
> > part of this, and the reason it's good for stable: and the patch title
> > ought to highlight that, not the page_mapcount business.
>
> Will update the subject and the commit log accordingly.
Thanks!
>
> >
> > >
> > > [1] https://lore.kernel.org/linux-mm/[email protected]/
> > >
> > > Reviewed-by: Zi Yan <[email protected]>
> > > Signed-off-by: Yang Shi <[email protected]>
> >
> > This will require Cc: [email protected]
> > (but we don't want to Cc them on this mail).
> >
> > As I said on the other, I think this should be 1/2 not 2/2.
>
> Sure.
Great.
>
> >
> > > ---
> > > v3: Incorporated the comments from Hugh. Keep Zi Yan's reviewed-by tag
> > > since there is no fundamental change against v2.
> > > v2: Removed dead code and updated the comment of try_to_unmap() per Zi
> > > Yan.
> > > mm/huge_memory.c | 17 +++++------------
> > > 1 file changed, 5 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 80fe642d742d..72d81d8e01b1 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -2343,6 +2343,8 @@ static void unmap_page(struct page *page)
> > > ttu_flags |= TTU_SPLIT_FREEZE;
> > >
> > > try_to_unmap(page, ttu_flags);
> > > +
> > > + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
> >
> > There is one useful piece of information that dump_page() will not show:
> > total_mapcount(page). Is there a way of crafting that into the output?
> >
> > Not with the macros available, I think. Maybe we should be optimistic
> > and assume I already have the fixes, so not worth trying to refine the
> > message (but I'm not entirely convinced of that!).
> >
> > The trouble with
> > if (VM_WARN_ON_ONCE_PAGE(page_mapped(page), page))
> > pr_warn("total_mapcount:%d\n", total_mapcount(page));
> > is that it's printed regardless of the ONCEness. Another "trouble"
> > is that it's printed so long after the page_mapped(page) check that
> > it may be 0 by now - but one can see that as itself informative.
>
> We should be able to make dump_page() print the total mapcount, right?
> dump_page() should only be called in some error paths, so taking some
> extra overhead to dump more information seems harmless, or am I
> missing something? Of course, this can be done in a separate patch.
I didn't want to ask that of you, but yes, if you're willing to add
total_mapcount() into dump_page(), I think that would be ideal; and
could be helpful for other cases too.
Looking through total_mapcount(), I think it's safe to call from
dump_page() - I always worry about extending crash info with
something that depends on a maybe-corrupted pointer which would
generate a further crash and either recurse or truncate the output -
but please check that carefully.
Yes, a separate patch please: which can come later on, and no
need for stable for that one, but good to know it's coming.
Thanks,
Hugh
On Tue, May 25, 2021 at 4:58 PM Hugh Dickins <[email protected]> wrote:
>
> On Tue, 25 May 2021, Yang Shi wrote:
> > On Tue, May 25, 2021 at 3:06 PM Hugh Dickins <[email protected]> wrote:
> > > On Tue, 25 May 2021, Yang Shi wrote:
> > >
> > > > When debugging the bug reported by Wang Yugui [1], try_to_unmap() may
> > > > return a false positive for PTE-mapped THP, since page_mapcount() is
> > > > used to check whether the THP is unmapped, but it only checks the
> > > > compound mapcount and the head page's mapcount. If the THP is
> > > > PTE-mapped and the head page is not mapped, it may return a false
> > > > positive.
> > >
> > > But those false positives did not matter because there was a separate
> > > DEBUG_VM check later.
> > >
> > > It's good to have the link to Wang Yugui's report, but that paragraph
> > > is not really about this patch, as it has evolved now: this patch
> > > consolidates the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE.
> > >
> > > >
> > > > try_to_unmap() has been changed to a void function, so check
> > > > page_mapped() after it. And the BUG_ON is changed to a WARN_ON, since
> > > > it is not a fatal issue.
> > >
> > > The change from DEBUG_VM BUG to VM_WARN_ON_ONCE is the most important
> > > part of this, and the reason it's good for stable: and the patch title
> > > ought to highlight that, not the page_mapcount business.
> >
> > Will update the subject and the commit log accordingly.
>
> Thanks!
>
> >
> > >
> > > >
> > > > [1] https://lore.kernel.org/linux-mm/[email protected]/
> > > >
> > > > Reviewed-by: Zi Yan <[email protected]>
> > > > Signed-off-by: Yang Shi <[email protected]>
> > >
> > > This will require Cc: [email protected]
> > > (but we don't want to Cc them on this mail).
> > >
> > > As I said on the other, I think this should be 1/2 not 2/2.
> >
> > Sure.
>
> Great.
>
> >
> > >
> > > > ---
> > > > v3: Incorporated the comments from Hugh. Keep Zi Yan's reviewed-by tag
> > > > since there is no fundamental change against v2.
> > > > v2: Removed dead code and updated the comment of try_to_unmap() per Zi
> > > > Yan.
> > > > mm/huge_memory.c | 17 +++++------------
> > > > 1 file changed, 5 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > > index 80fe642d742d..72d81d8e01b1 100644
> > > > --- a/mm/huge_memory.c
> > > > +++ b/mm/huge_memory.c
> > > > @@ -2343,6 +2343,8 @@ static void unmap_page(struct page *page)
> > > > ttu_flags |= TTU_SPLIT_FREEZE;
> > > >
> > > > try_to_unmap(page, ttu_flags);
> > > > +
> > > > + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
> > >
> > > There is one useful piece of information that dump_page() will not show:
> > > total_mapcount(page). Is there a way of crafting that into the output?
> > >
> > > Not with the macros available, I think. Maybe we should be optimistic
> > > and assume I already have the fixes, so not worth trying to refine the
> > > message (but I'm not entirely convinced of that!).
> > >
> > > The trouble with
> > > if (VM_WARN_ON_ONCE_PAGE(page_mapped(page), page))
> > > pr_warn("total_mapcount:%d\n", total_mapcount(page));
> > > is that it's printed regardless of the ONCEness. Another "trouble"
> > > is that it's printed so long after the page_mapped(page) check that
> > > it may be 0 by now - but one can see that as itself informative.
> >
> > We should be able to make dump_page() print the total mapcount, right?
> > dump_page() should only be called in some error paths, so taking some
> > extra overhead to dump more information seems harmless, or am I
> > missing something? Of course, this can be done in a separate patch.
>
> I didn't want to ask that of you, but yes, if you're willing to add
> total_mapcount() into dump_page(), I think that would be ideal; and
> could be helpful for other cases too.
>
> Looking through total_mapcount(), I think it's safe to call from
> dump_page() - I always worry about extending crash info with
> something that depends on a maybe-corrupted pointer which would
> generate a further crash and either recurse or truncate the output -
> but please check that carefully.
Yes, it is possible. If the THP is being split, some VM_BUG_* might be
triggered when total_mapcount() is called. But it is still feasible to
print the total mapcount, as long as we implement a more robust version
for dump_page().
>
> Yes, a separate patch please: which can come later on, and no
> need for stable for that one, but good to know it's coming.
>
> Thanks,
> Hugh
On Wed, 26 May 2021, Yang Shi wrote:
> On Tue, May 25, 2021 at 4:58 PM Hugh Dickins <[email protected]> wrote:
> > On Tue, 25 May 2021, Yang Shi wrote:
> > >
> > > We should be able to make dump_page() print the total mapcount, right?
> > > dump_page() should only be called in some error paths, so taking some
> > > extra overhead to dump more information seems harmless, or am I
> > > missing something? Of course, this can be done in a separate patch.
> >
> > I didn't want to ask that of you, but yes, if you're willing to add
> > total_mapcount() into dump_page(), I think that would be ideal; and
> > could be helpful for other cases too.
> >
> > Looking through total_mapcount(), I think it's safe to call from
> > dump_page() - I always worry about extending crash info with
> > something that depends on a maybe-corrupted pointer which would
> > generate a further crash and either recurse or truncate the output -
> > but please check that carefully.
>
> Yes, it is possible. If the THP is being split, some VM_BUG_* might be
> triggered when total_mapcount() is called. But it is still feasible to
> print the total mapcount, as long as we implement a more robust version
> for dump_page().
Oh dear. I think the very last thing the kernel needs is yet another
subtly different variant of *mapcount*().
Do you have a specific VM_BUG_* in mind there? Of course there's
the VM_BUG_ON_PAGE(PageTail) at the start of it, and you'd want to
print total_mapcount(head) to avoid that one.
Looks like __dump_page() is already careful about "head", checking
whether "page" is within the expected bounds. Of course, once we're
in serious VM_WARN territory, there might be races which could flip
fields midway: PageTail set by the time it reaches total_mapcount()?
Narrow the race (rather like it does with PageSlab) by testing
PageTail immediately before calling total_mapcount(head)?
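A minimal sketch of that narrowing, assuming __dump_page() has already
computed "head" for its existing bounds check:

	/*
	 * Sketch: recheck PageTail immediately before the call, to
	 * narrow (not close) the race with a concurrent split.
	 */
	if (!PageTail(head))
		pr_warn("total_mapcount:%d\n", total_mapcount(head));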
Hugh
On Wed, May 26, 2021 at 3:48 PM Hugh Dickins <[email protected]> wrote:
>
> On Wed, 26 May 2021, Yang Shi wrote:
> > On Tue, May 25, 2021 at 4:58 PM Hugh Dickins <[email protected]> wrote:
> > > On Tue, 25 May 2021, Yang Shi wrote:
> > > >
> > > > We should be able to make dump_page() print the total mapcount, right?
> > > > dump_page() should only be called in some error paths, so taking some
> > > > extra overhead to dump more information seems harmless, or am I
> > > > missing something? Of course, this can be done in a separate patch.
> > >
> > > I didn't want to ask that of you, but yes, if you're willing to add
> > > total_mapcount() into dump_page(), I think that would be ideal; and
> > > could be helpful for other cases too.
> > >
> > > Looking through total_mapcount(), I think it's safe to call from
> > > dump_page() - I always worry about extending crash info with
> > > something that depends on a maybe-corrupted pointer which would
> > > generate a further crash and either recurse or truncate the output -
> > > but please check that carefully.
> >
> > Yes, it is possible. If the THP is being split, some VM_BUG_* might be
> > triggered when total_mapcount() is called. But it is still feasible to
> > print the total mapcount, as long as we implement a more robust version
> > for dump_page().
>
> Oh dear. I think the very last thing the kernel needs is yet another
> subtly different variant of *mapcount*().
>
> Do you have a specific VM_BUG_* in mind there? Of course there's
> the VM_BUG_ON_PAGE(PageTail) at the start of it, and you'd want to
> print total_mapcount(head) to avoid that one.
There are two more such places in total_mapcount(), besides the tail
page assertion.
#1. compound_mapcount() has a !PageCompound assertion. A similar
problem has been hit before; see commit 6dc5ea16c86f ("mm,
dump_page: do not crash with bad compound_mapcount()").
#2. PageDoubleMap has a !PageHead assertion.
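For reference, the assertions in question are roughly these
(paraphrased, not exact quotes of the source):

	/* in total_mapcount(): */
	VM_BUG_ON_PAGE(PageTail(page), page);
	/* in compound_mapcount(): */
	VM_BUG_ON_PAGE(!PageCompound(page), page);
	/* for PageDoubleMap(), via its page-flags policy: */
	VM_BUG_ON_PGFLAGS(!PageHead(page), page);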
>
> Looks like __dump_page() is already careful about "head", checking
> whether "page" is within the expected bounds. Of course, once we're
> in serious VM_WARN territory, there might be races which could flip
> fields midway: PageTail set by the time it reaches total_mapcount()?
It seems possible, at least theoretically.
> Narrow the race (rather like it does with PageSlab) by testing
> PageTail immediately before calling total_mapcount(head)?
TBH I can't think of a simple test that would narrow all the races. We
would have to add multiple tests in total_mapcount(), which seems too
hacky. Might another variant, like the one below, be neater?
+static inline int __total_mapcount(struct page *head)
+{
+ int i, compound, nr, ret;
+
+ compound = head_compound_mapcount(head);
+ nr = compound_nr(head);
+ if (PageHuge(head))
+ return compound;
+ ret = compound;
+ for (i = 0; i < nr; i++)
+ ret += atomic_read(&head[i]._mapcount) + 1;
+ /* File pages have compound_mapcount included in _mapcount */
+ if (!PageAnon(head))
+ return ret - compound * nr;
+ if (test_bit(PG_double_map, &head[1].flags))
+ ret -= nr;
+ return ret;
+}
>
> Hugh
On Wed, 26 May 2021, Yang Shi wrote:
> On Wed, May 26, 2021 at 3:48 PM Hugh Dickins <[email protected]> wrote:
> > On Wed, 26 May 2021, Yang Shi wrote:
> > > On Tue, May 25, 2021 at 4:58 PM Hugh Dickins <[email protected]> wrote:
> > > > On Tue, 25 May 2021, Yang Shi wrote:
> > > > >
> > > > > We should be able to make dump_page() print total mapcount, right? The
> > > > > dump_page() should be just called in some error paths so taking some
> > > > > extra overhead to dump more information seems harmless, or am I
> > > > > missing something? Of course, this can be done in a separate patch.
> > > >
> > > > I didn't want to ask that of you, but yes, if you're willing to add
> > > > total_mapcount() into dump_page(), I think that would be ideal; and
> > > > could be helpful for other cases too.
> > > >
> > > > Looking through total_mapcount(), I think it's safe to call from
> > > > dump_page() - I always worry about extending crash info with
> > > > something that depends on a maybe-corrupted pointer which would
> > > > generate a further crash and either recurse or truncate the output -
> > > > but please check that carefully.
> > >
> > > Yes, it is possible. If the THP is being split, some VM_BUG_* might be
> > > triggered when total_mapcount() is called. But it is still feasible to
> > > print the total mapcount, as long as we implement a more robust version
> > > for dump_page().
> >
> > Oh dear. I think the very last thing the kernel needs is yet another
> > subtly different variant of *mapcount*().
> >
> > Do you have a specific VM_BUG_* in mind there? Of course there's
> > the VM_BUG_ON_PAGE(PageTail) at the start of it, and you'd want to
> > print total_mapcount(head) to avoid that one.
>
> There are two more such places in total_mapcount(), besides the tail
> page assertion.
>
> #1. compound_mapcount() has a !PageCompound assertion. A similar
> problem has been hit before; see commit 6dc5ea16c86f ("mm,
> dump_page: do not crash with bad compound_mapcount()").
Thanks for the useful reference.
> #2. PageDoubleMap has a !PageHead assertion.
>
> >
> > Looks like __dump_page() is already careful about "head", checking
> > whether "page" is within the expected bounds. Of course, once we're
> > in serious VM_WARN territory, there might be races which could flip
> > fields midway: PageTail set by the time it reaches total_mapcount()?
>
> It seems possible, at least theoretically.
>
> > Narrow the race (rather like it does with PageSlab) by testing
> > PageTail immediately before calling total_mapcount(head)?
>
> TBH I can't think of a simple test that would narrow all the races. We
> would have to add multiple tests in total_mapcount(), which seems too
> hacky. Might another variant, like the one below, be neater?
>
> +static inline int __total_mapcount(struct page *head)
> +{
> + int i, compound, nr, ret;
> +
> + compound = head_compound_mapcount(head);
> + nr = compound_nr(head);
> + if (PageHuge(head))
> + return compound;
> + ret = compound;
> + for (i = 0; i < nr; i++)
> + ret += atomic_read(&head[i]._mapcount) + 1;
> + /* File pages have compound_mapcount included in _mapcount */
> + if (!PageAnon(head))
> + return ret - compound * nr;
> + if (test_bit(PG_double_map, &head[1].flags))
> + ret -= nr;
> + return ret;
> +}
I still don't want any more of those lovely functions.
My current preference is just to drop the idea of trying
to show total_mapcount from __dump_page().
I might end up compromising on printing the result of
for (i = 0; i < nr; i++)
ret += atomic_read(&head[i]._mapcount) + 1;
(if we can safely decide nr), and leave all the convoluted
flags logic to the poor reader of the page dump.
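As a sketch, say, assuming nr can be taken from compound_nr(head):

	/* raw sum of subpage _mapcounts, no compound/DoubleMap logic */
	int i, nr = compound_nr(head), raw = 0;

	for (i = 0; i < nr; i++)
		raw += atomic_read(&head[i]._mapcount) + 1;
	pr_warn("raw _mapcount sum:%d\n", raw);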
But that in itself shows the limitation of printing total_mapcount(),
when individual corrupt _mapcounts might be -1, +2, -3, +4, ...
To some extent (modulo racing references to the THP), a good
total_mapcount can be inferred from the page reference count.
(But that probably describes better the situation when everything
is going correctly: maybe problems tend to come precisely when there
are multiple racing references.)
Dunno. My mind is not on it at the moment. I'm more concerned that
Wang Yugui's crash turns out not to be solved by any of the fixes
I have lined up: we certainly did not promise that it would be,
and it shouldn't stop advancing the fixes we already know, but I do
want to give it a little more thought before resuming on my patches.
Hugh
On Wed, May 26, 2021 at 5:57 PM Hugh Dickins <[email protected]> wrote:
>
> On Wed, 26 May 2021, Yang Shi wrote:
> > On Wed, May 26, 2021 at 3:48 PM Hugh Dickins <[email protected]> wrote:
> > > On Wed, 26 May 2021, Yang Shi wrote:
> > > > On Tue, May 25, 2021 at 4:58 PM Hugh Dickins <[email protected]> wrote:
> > > > > On Tue, 25 May 2021, Yang Shi wrote:
> > > > > >
> > > > > > We should be able to make dump_page() print total mapcount, right? The
> > > > > > dump_page() should be just called in some error paths so taking some
> > > > > > extra overhead to dump more information seems harmless, or am I
> > > > > > missing something? Of course, this can be done in a separate patch.
> > > > >
> > > > > I didn't want to ask that of you, but yes, if you're willing to add
> > > > > total_mapcount() into dump_page(), I think that would be ideal; and
> > > > > could be helpful for other cases too.
> > > > >
> > > > > Looking through total_mapcount(), I think it's safe to call from
> > > > > dump_page() - I always worry about extending crash info with
> > > > > something that depends on a maybe-corrupted pointer which would
> > > > > generate a further crash and either recurse or truncate the output -
> > > > > but please check that carefully.
> > > >
> > > > Yes, it is possible. If the THP is being split, some VM_BUG_* might be
> > > > triggered when total_mapcount() is called. But it is still feasible to
> > > > print the total mapcount, as long as we implement a more robust version
> > > > for dump_page().
> > >
> > > Oh dear. I think the very last thing the kernel needs is yet another
> > > subtly different variant of *mapcount*().
> > >
> > > Do you have a specific VM_BUG_* in mind there? Of course there's
> > > the VM_BUG_ON_PAGE(PageTail) at the start of it, and you'd want to
> > > print total_mapcount(head) to avoid that one.
> >
> > There are two more such places in total_mapcount(), besides the tail
> > page assertion.
> >
> > #1. compound_mapcount() has a !PageCompound assertion. A similar
> > problem has been hit before; see commit 6dc5ea16c86f ("mm,
> > dump_page: do not crash with bad compound_mapcount()").
>
> Thanks for the useful reference.
>
> > #2. PageDoubleMap has a !PageHead assertion.
> >
> > >
> > > Looks like __dump_page() is already careful about "head", checking
> > > whether "page" is within the expected bounds. Of course, once we're
> > > in serious VM_WARN territory, there might be races which could flip
> > > fields midway: PageTail set by the time it reaches total_mapcount()?
> >
> > It seems possible, at least theoretically.
> >
> > > Narrow the race (rather like it does with PageSlab) by testing
> > > PageTail immediately before calling total_mapcount(head)?
> >
> > TBH I can't think of a simple test that would narrow all the races. We
> > would have to add multiple tests in total_mapcount(), which seems too
> > hacky. Might another variant, like the one below, be neater?
> >
> > +static inline int __total_mapcount(struct page *head)
> > +{
> > + int i, compound, nr, ret;
> > +
> > + compound = head_compound_mapcount(head);
> > + nr = compound_nr(head);
> > + if (PageHuge(head))
> > + return compound;
> > + ret = compound;
> > + for (i = 0; i < nr; i++)
> > + ret += atomic_read(&head[i]._mapcount) + 1;
> > + /* File pages have compound_mapcount included in _mapcount */
> > + if (!PageAnon(head))
> > + return ret - compound * nr;
> > + if (test_bit(PG_double_map, &head[1].flags))
> > + ret -= nr;
> > + return ret;
> > +}
>
> I still don't want any more of those lovely functions.
>
> My current preference is just to drop the idea of trying
> to show total_mapcount from __dump_page().
>
> I might end up compromising on printing the result of
> for (i = 0; i < nr; i++)
> ret += atomic_read(&head[i]._mapcount) + 1;
> (if we can safely decide nr), and leave all the convoluted
> flags logic to the poor reader of the page dump.
>
> But that in itself shows the limitation of printing total_mapcount(),
> when individual corrupt _mapcounts might be -1, +2, -3, +4, ...
>
> To some extent (modulo racing references to the THP), a good
> total_mapcount can be inferred from the page reference count.
> (But that probably describes better the situation when everything
> is going correctly: maybe problems tend to come precisely when there
> are multiple racing references.)
Yes, the total_mapcount could be decoded (not very precisely in some
cases, as you said) from the refcount, the pincount, the page type
(anon or page cache), etc.
We could do the calculation and print the number in dump_page(), or
just leave it to the reader of the page dump.
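Roughly, when nothing is going wrong and ignoring transient references,
the bookkeeping mirrors the check in can_split_huge_page():

	/*
	 * Rough relation, not an invariant to rely on:
	 *   page_count(head) == 1 + extra_pins + total_mapcount(head)
	 * where extra_pins counts the page cache (or swap cache)
	 * references to the subpages.
	 */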
>
> Dunno. My mind is not on it at the moment. I'm more concerned that
> Wang Yugui's crash turns out not to be solved by any of the fixes
> I have lined up: we certainly did not promise that it would be,
> and it shouldn't stop advancing the fixes we already know, but I do
> want to give it a little more thought before resuming on my patches.
>
> Hugh