From: Matthew Wilcox <[email protected]>
This patch set changes how we use the _mapcount field in struct page
so that we can store an extra 20+ bits of information about why the
page was allocated. We expose that information to userspace through
/proc/kpageflags to help diagnose memory usage. It can also help
debugging if we know what a page was most recently allocated for.
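For reference, a minimal userspace sketch (not part of the series) of how the
new bits can be read back: /proc/kpageflags exports one little-endian u64 of
KPF bits per page frame and requires root; the KPF values below match the
definitions added by these patches.

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#define KPF_VMALLOC	26	/* as defined by this series */
	#define KPF_PGTABLE	27

	int main(int argc, char **argv)
	{
		uint64_t pfn, flags;
		int fd = open("/proc/kpageflags", O_RDONLY);

		if (argc != 2 || fd < 0)
			return 1;
		pfn = strtoull(argv[1], NULL, 0);
		if (pread(fd, &flags, sizeof(flags), pfn * 8) != 8)
			return 1;
		printf("pfn %llu: vmalloc=%d pgtable=%d\n",
		       (unsigned long long)pfn,
		       !!(flags & (1ULL << KPF_VMALLOC)),
		       !!(flags & (1ULL << KPF_PGTABLE)));
		return 0;
	}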
Changes since v4:
- Added Kirill's acks
- Fixed spelling (Kirill)
- Allow a few extra bits to be used in page_type.
Matthew Wilcox (4):
s390: Use _refcount for pgtables
mm: Split page_type out from _mapcount
mm: Mark pages allocated through vmalloc
mm: Mark pages in use for page tables
arch/s390/mm/pgalloc.c | 21 +++++++------
arch/tile/mm/pgtable.c | 3 ++
fs/proc/page.c | 4 +++
include/linux/mm.h | 2 ++
include/linux/mm_types.h | 13 +++++---
include/linux/page-flags.h | 57 ++++++++++++++++++++++------------
include/uapi/linux/kernel-page-flags.h | 3 +-
kernel/crash_core.c | 1 +
mm/page_alloc.c | 13 +++-----
mm/vmalloc.c | 2 ++
scripts/tags.sh | 6 ++--
tools/vm/page-types.c | 2 ++
12 files changed, 82 insertions(+), 45 deletions(-)
--
2.16.1
From: Matthew Wilcox <[email protected]>
Define a new PageTable bit in the page_type and use it to mark pages in
use as page tables. This can be helpful when debugging crashdumps or
analysing memory fragmentation. Add a KPF flag to report these pages
to userspace and update page-types.c to interpret that flag.
Note that only pages currently accounted as NR_PAGETABLE are tracked
as PageTable; this does not include pgd/p4d/pud/pmd pages. Those will
be the subject of a later patch.
Signed-off-by: Matthew Wilcox <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
---
arch/tile/mm/pgtable.c | 3 +++
fs/proc/page.c | 2 ++
include/linux/mm.h | 2 ++
include/linux/page-flags.h | 6 ++++++
include/uapi/linux/kernel-page-flags.h | 1 +
tools/vm/page-types.c | 1 +
6 files changed, 15 insertions(+)
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index ec5576fd3a86..6dff12db335d 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -206,6 +206,7 @@ struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
*/
for (i = 1; i < order; ++i) {
init_page_count(p+i);
+ __SetPageTable(p+i);
inc_zone_page_state(p+i, NR_PAGETABLE);
}
@@ -226,6 +227,7 @@ void pgtable_free(struct mm_struct *mm, struct page *p, int order)
for (i = 1; i < order; ++i) {
+		__ClearPageTable(p+i);
 		__free_page(p+i);
dec_zone_page_state(p+i, NR_PAGETABLE);
}
}
@@ -240,6 +242,7 @@ void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
for (i = 1; i < order; ++i) {
+		__ClearPageTable(pte + i);
 		tlb_remove_page(tlb, pte + i);
dec_zone_page_state(pte + i, NR_PAGETABLE);
}
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index c9757af919a3..80275e7a963b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -156,6 +156,8 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_BALLOON;
if (PageVmalloc(page))
u |= 1 << KPF_VMALLOC;
+ if (PageTable(page))
+ u |= 1 << KPF_PGTABLE;
if (page_is_idle(page))
u |= 1 << KPF_IDLE;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ad06d42adb1a..7a15042d6828 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1829,6 +1829,7 @@ static inline bool pgtable_page_ctor(struct page *page)
{
if (!ptlock_init(page))
return false;
+ __SetPageTable(page);
inc_zone_page_state(page, NR_PAGETABLE);
return true;
}
@@ -1836,6 +1837,7 @@ static inline bool pgtable_page_ctor(struct page *page)
static inline void pgtable_page_dtor(struct page *page)
{
pte_lock_deinit(page);
+ __ClearPageTable(page);
dec_zone_page_state(page, NR_PAGETABLE);
}
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1503d314bb3d..aa5c8c1c6d38 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -644,6 +644,7 @@ PAGEFLAG_FALSE(DoubleMap)
#define PG_balloon 0x00000100
#define PG_kmemcg 0x00000200
#define PG_vmalloc 0x00000400
+#define PG_table 0x00000800
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -687,6 +688,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg)
*/
PAGE_TYPE_OPS(Vmalloc, vmalloc)
+/*
+ * Marks pages in use as page tables.
+ */
+PAGE_TYPE_OPS(Table, table)
+
extern bool is_free_buddy_page(struct page *page);
__PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index 5f1735ff05b3..3c51d8bf8b7b 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -36,5 +36,6 @@
#define KPF_ZERO_PAGE 24
#define KPF_IDLE 25
#define KPF_VMALLOC 26
+#define KPF_PGTABLE 27
#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 116f59eff5e2..bbb992694f05 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -132,6 +132,7 @@ static const char * const page_flag_names[] = {
[KPF_THP] = "t:thp",
[KPF_BALLOON] = "o:balloon",
[KPF_VMALLOC] = "V:vmalloc",
+ [KPF_PGTABLE] = "g:pgtable",
[KPF_ZERO_PAGE] = "z:zero_page",
[KPF_IDLE] = "i:idle_page",
--
2.16.1
From: Matthew Wilcox <[email protected]>
Use a bit in page_type to mark pages which have been allocated through
vmalloc. This can be helpful when debugging crashdumps or analysing
memory fragmentation. Add a KPF flag to report these pages to userspace
and update page-types.c to interpret that flag.
Signed-off-by: Matthew Wilcox <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
---
fs/proc/page.c | 2 ++
include/linux/page-flags.h | 6 ++++++
include/uapi/linux/kernel-page-flags.h | 2 +-
mm/vmalloc.c | 2 ++
tools/vm/page-types.c | 1 +
5 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1491918a33c3..c9757af919a3 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (PageVmalloc(page))
+ u |= 1 << KPF_VMALLOC;
if (page_is_idle(page))
u |= 1 << KPF_IDLE;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index be3cd45efeb6..1503d314bb3d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -643,6 +643,7 @@ PAGEFLAG_FALSE(DoubleMap)
#define PG_buddy 0x00000080
#define PG_balloon 0x00000100
#define PG_kmemcg 0x00000200
+#define PG_vmalloc 0x00000400
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -681,6 +682,11 @@ PAGE_TYPE_OPS(Balloon, balloon)
*/
PAGE_TYPE_OPS(Kmemcg, kmemcg)
+/*
+ * Pages allocated through vmalloc are tagged with this bit.
+ */
+PAGE_TYPE_OPS(Vmalloc, vmalloc)
+
extern bool is_free_buddy_page(struct page *page);
__PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h
index fa139841ec18..5f1735ff05b3 100644
--- a/include/uapi/linux/kernel-page-flags.h
+++ b/include/uapi/linux/kernel-page-flags.h
@@ -35,6 +35,6 @@
#define KPF_BALLOON 23
#define KPF_ZERO_PAGE 24
#define KPF_IDLE 25
-
+#define KPF_VMALLOC 26
#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ebff729cc956..3bc0538fc21b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1536,6 +1536,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
struct page *page = area->pages[i];
BUG_ON(!page);
+ __ClearPageVmalloc(page);
__free_pages(page, 0);
}
@@ -1705,6 +1706,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = i;
goto fail;
}
+ __SetPageVmalloc(page);
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index a8783f48f77f..116f59eff5e2 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -131,6 +131,7 @@ static const char * const page_flag_names[] = {
[KPF_KSM] = "x:ksm",
[KPF_THP] = "t:thp",
[KPF_BALLOON] = "o:balloon",
+ [KPF_VMALLOC] = "V:vmalloc",
[KPF_ZERO_PAGE] = "z:zero_page",
[KPF_IDLE] = "i:idle_page",
--
2.16.1
From: Matthew Wilcox <[email protected]>
We're already using a union of many fields here, so stop abusing
_mapcount and make page_type its own field. That implies renaming some
of the machinery that creates PageBuddy, PageBalloon and PageKmemcg;
bring back the PG_buddy, PG_balloon and PG_kmemcg names.
As suggested by Kirill, make page_type a bitmask. Because it starts out
life as -1 (thanks to sharing the storage with _mapcount), setting a
page flag means clearing the appropriate bit. This gives us space for
probably twenty or so extra bits (depending on how paranoid we want to be
about _mapcount underflow).
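To illustrate the inverted sense, a standalone userspace sketch (constants
copied from this patch; an illustration, not kernel code):

	#include <assert.h>

	#define PAGE_TYPE_BASE	0xf0000000
	#define PG_buddy	0x00000080

	#define PageType(type, flag) \
		(((type) & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)

	int main(void)
	{
		unsigned int page_type = -1;	/* fresh page: _mapcount == -1 */

		assert(!PageType(page_type, PG_buddy));
		page_type &= ~PG_buddy;		/* __SetPageBuddy() */
		assert(PageType(page_type, PG_buddy));
		page_type |= PG_buddy;		/* __ClearPageBuddy() */
		assert(!PageType(page_type, PG_buddy));
		return 0;
	}

Once a page has ever been mapped, _mapcount has been raised towards zero, so
the PAGE_TYPE_BASE high bits no longer match and PageType() correctly fails.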
Signed-off-by: Matthew Wilcox <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
---
include/linux/mm_types.h | 13 ++++++++-----
include/linux/page-flags.h | 45 ++++++++++++++++++++++++++-------------------
kernel/crash_core.c | 1 +
mm/page_alloc.c | 13 +++++--------
scripts/tags.sh | 6 +++---
5 files changed, 43 insertions(+), 35 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fd1af6b9591d..1c5dea402501 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -94,6 +94,14 @@ struct page {
};
union {
+ /*
+ * If the page is neither PageSlab nor PageAnon, the value
+ * stored here may help distinguish it from page cache pages.
+ * See page-flags.h for a list of page types which are
+ * currently stored here.
+ */
+ unsigned int page_type;
+
_slub_counter_t counters;
unsigned int active; /* SLAB */
struct { /* SLUB */
@@ -107,11 +115,6 @@ struct page {
/*
* Count of ptes mapped in mms, to show when
* page is mapped & limit reverse map searches.
- *
- * Extra information about page type may be
- * stored here for pages that are never mapped,
- * in which case the value MUST BE <= -2.
- * See page-flags.h for more details.
*/
atomic_t _mapcount;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 50c2b8786831..be3cd45efeb6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -630,49 +630,56 @@ PAGEFLAG_FALSE(DoubleMap)
#endif
/*
- * For pages that are never mapped to userspace, page->mapcount may be
- * used for storing extra information about page type. Any value used
- * for this purpose must be <= -2, but it's better start not too close
- * to -2 so that an underflow of the page_mapcount() won't be mistaken
- * for a special page.
+ * For pages that are never mapped to userspace (and aren't PageSlab),
+ * page_type may be used. Because it is initialised to -1, we invert the
+ * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
+ * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
+ * low bits so that an underflow or overflow of page_mapcount() won't be
+ * mistaken for a page type value.
*/
-#define PAGE_MAPCOUNT_OPS(uname, lname) \
+
+#define PAGE_TYPE_BASE 0xf0000000
+/* Reserve 0x0000007f to catch underflows of page_mapcount */
+#define PG_buddy 0x00000080
+#define PG_balloon 0x00000100
+#define PG_kmemcg 0x00000200
+
+#define PageType(page, flag) \
+ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+
+#define PAGE_TYPE_OPS(uname, lname) \
static __always_inline int Page##uname(struct page *page) \
{ \
- return atomic_read(&page->_mapcount) == \
- PAGE_##lname##_MAPCOUNT_VALUE; \
+ return PageType(page, PG_##lname); \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
- VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \
- atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \
+ VM_BUG_ON_PAGE(!PageType(page, 0), page); \
+ page->page_type &= ~PG_##lname; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
- atomic_set(&page->_mapcount, -1); \
+ page->page_type |= PG_##lname; \
}
/*
- * PageBuddy() indicate that the page is free and in the buddy system
+ * PageBuddy() indicates that the page is free and in the buddy system
* (see mm/page_alloc.c).
*/
-#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
-PAGE_MAPCOUNT_OPS(Buddy, BUDDY)
+PAGE_TYPE_OPS(Buddy, buddy)
/*
- * PageBalloon() is set on pages that are on the balloon page list
+ * PageBalloon() is true for pages that are on the balloon page list
* (see mm/balloon_compaction.c).
*/
-#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
-PAGE_MAPCOUNT_OPS(Balloon, BALLOON)
+PAGE_TYPE_OPS(Balloon, balloon)
/*
* If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
* pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
*/
-#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512)
-PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG)
+PAGE_TYPE_OPS(Kmemcg, kmemcg)
extern bool is_free_buddy_page(struct page *page);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4f63597c824d..b02340fb99ff 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -458,6 +458,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
VMCOREINFO_NUMBER(PG_head_mask);
+#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb416723538f..ac0b24603030 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -744,16 +744,14 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
+ * we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we set ->_mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE.
- * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
- * serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set PageBuddy.
+ * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -798,9 +796,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
- * field.
+ * free pages of length of (1 << order) and marked with PageBuddy.
+ * Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 78e546ff689c..8c3ae36d4ea8 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -188,9 +188,9 @@ regex_c=(
'/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/'
'/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/'
'/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/'
- '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/Page\1/'
- '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__SetPage\1/'
- '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/'
+ '/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/Page\1/'
+ '/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/__SetPage\1/'
+ '/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/'
'/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/'
'/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/'
'/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/'
--
2.16.1
From: Matthew Wilcox <[email protected]>
s390 borrows the storage used for _mapcount in struct page in order to
account whether the bottom or top half is being used for 2kB page
tables. I want to use that for something else, so use the top byte of
_refcount instead of the bottom byte of _mapcount. _refcount may
temporarily be incremented by other CPUs that see a stale pointer to
this page in the page cache, but each CPU can only increment it by one,
and there are no systems with 2^24 CPUs today, so they will not change
the upper byte of _refcount. We do have to be a little careful not to
lose any of their writes (as they will subsequently decrement the
counter).
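The xor matters here: atomic_set() would store all 32 bits and could wipe out
one of those concurrent increments. A userspace sketch of the idea, with GCC
builtins standing in for the kernel's atomic_xor_bits() (an illustration, not
the s390 code):

	#include <stdio.h>

	static unsigned int refcount = 1;	/* stands in for page->_refcount */

	/* like s390's atomic_xor_bits(): atomic xor returning the new value */
	static unsigned int xor_bits(unsigned int *v, unsigned int bits)
	{
		return __atomic_xor_fetch(v, bits, __ATOMIC_SEQ_CST);
	}

	int main(void)
	{
		unsigned int mask;

		xor_bits(&refcount, 1U << 24);	/* claim the first 2K half */
		/* a racing speculative get_page()/put_page() pair */
		__atomic_add_fetch(&refcount, 1, __ATOMIC_SEQ_CST);
		__atomic_sub_fetch(&refcount, 1, __ATOMIC_SEQ_CST);
		mask = xor_bits(&refcount, 1U << 24) >> 24;	/* release it */
		printf("upper byte %u, low bits intact: %u\n",
		       mask, refcount & 0xffffff);
		return 0;
	}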
Signed-off-by: Matthew Wilcox <[email protected]>
Acked-by: Martin Schwidefsky <[email protected]>
---
arch/s390/mm/pgalloc.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index cb364153c43c..412c5f48a8e7 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -189,14 +189,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
- mask = atomic_read(&page->_mapcount);
+ mask = atomic_read(&page->_refcount) >> 24;
mask = (mask | (mask >> 4)) & 3;
if (mask != 3) {
table = (unsigned long *) page_to_phys(page);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
- atomic_xor_bits(&page->_mapcount, 1U << bit);
+ atomic_xor_bits(&page->_refcount,
+ 1U << (bit + 24));
list_del(&page->lru);
}
}
@@ -217,12 +218,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = (unsigned long *) page_to_phys(page);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
- atomic_set(&page->_mapcount, 3);
+ atomic_xor_bits(&page->_refcount, 3 << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
- atomic_set(&page->_mapcount, 1);
+ atomic_xor_bits(&page->_refcount, 1 << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
@@ -241,7 +242,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
+ mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+ mask >>= 24;
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
@@ -252,7 +254,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
}
pgtable_page_dtor(page);
- atomic_set(&page->_mapcount, -1);
__free_page(page);
}
@@ -273,7 +274,8 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
}
bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask >>= 24;
if (mask & 3)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
@@ -295,12 +297,13 @@ static void __tlb_remove_table(void *_table)
break;
case 1: /* lower 2K of a 4K page table */
case 2: /* higher 2K of a 4K page table */
- if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
+ mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+ mask >>= 24;
+ if (mask != 0)
break;
/* fallthrough */
case 3: /* 4K page table with pgstes */
pgtable_page_dtor(page);
- atomic_set(&page->_mapcount, -1);
__free_page(page);
break;
}
--
2.16.1
On Wed, Mar 07, 2018 at 05:44:43AM -0800, Matthew Wilcox wrote:
> From: Matthew Wilcox <[email protected]>
>
> Define a new PageTable bit in the page_type and use it to mark pages in
> use as page tables. This can be helpful when debugging crashdumps or
> analysing memory fragmentation. Add a KPF flag to report these pages
> to userspace and update page-types.c to interpret that flag.
>
> Note that only pages currently accounted as NR_PAGETABLE are tracked
> as PageTable; this does not include pgd/p4d/pud/pmd pages. Those will
> be the subject of a later patch.
>
> Signed-off-by: Matthew Wilcox <[email protected]>
> Acked-by: Kirill A. Shutemov <[email protected]>
> ---
> arch/tile/mm/pgtable.c | 3 +++
> fs/proc/page.c | 2 ++
> include/linux/mm.h | 2 ++
> include/linux/page-flags.h | 6 ++++++
> include/uapi/linux/kernel-page-flags.h | 1 +
> tools/vm/page-types.c | 1 +
> 6 files changed, 15 insertions(+)
>
Hello Matthew,
I have bisected a regression on OpenRISC in v4.18-rc1 to this commit. Using
our defconfig, after boot I am getting:
BUG: Bad page state in process hostname pfn:00b5c
page:c1ff0b80 count:0 mapcount:-1024 mapping:00000000 index:0x0
flags: 0x0()
raw: 00000000 00000000 00000000 fffffbff 00000000 00000100 00000200 00000000
page dumped because: nonzero mapcount
Modules linked in:
CPU: 1 PID: 38 Comm: hostname Tainted: G B
4.17.0-simple-smp-07461-g1d40a5ea01d5-dirty #993
Call trace:
[<(ptrval)>] show_stack+0x44/0x54
[<(ptrval)>] dump_stack+0xb0/0xe8
[<(ptrval)>] bad_page+0x138/0x174
[<(ptrval)>] ? ipi_icache_page_inv+0x0/0x24
[<(ptrval)>] ? cpumask_next+0x24/0x34
[<(ptrval)>] free_pages_check_bad+0x6c/0xd0
[<(ptrval)>] free_pcppages_bulk+0x174/0x42c
[<(ptrval)>] free_unref_page_commit.isra.17+0xb8/0xc8
[<(ptrval)>] free_unref_page_list+0x10c/0x190
[<(ptrval)>] ? set_reset_devices+0x0/0x2c
[<(ptrval)>] release_pages+0x3a0/0x414
[<(ptrval)>] tlb_flush_mmu_free+0x5c/0x90
[<(ptrval)>] tlb_flush_mmu+0x90/0xa4
[<(ptrval)>] arch_tlb_finish_mmu+0x50/0x94
[<(ptrval)>] tlb_finish_mmu+0x30/0x64
[<(ptrval)>] exit_mmap+0x110/0x1e0
[<(ptrval)>] mmput+0x50/0xf0
[<(ptrval)>] do_exit+0x274/0xa94
[<(ptrval)>] ? _raw_spin_unlock_irqrestore+0x1c/0x2c
[<(ptrval)>] ? __up_read+0x70/0x88
[<(ptrval)>] do_group_exit+0x50/0x110
[<(ptrval)>] __wake_up_parent+0x0/0x38
[<(ptrval)>] _syscall_return+0x0/0x4
In this series we are overloading _mapcount with page_type; the above is caused
by this check in mm/page_alloc.c (free_pages_check_bad):

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
We can see in the dump above that _mapcount is fffffbff, which corresponds to
the 'PG_table' flag added here. But it seems that in some case on OpenRISC it's
not getting cleared during page free.
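(For reference, decoding the raw value: ~0xfffffbff == 0x00000400, which is
PG_table in the tree as merged; the vmalloc patch was dropped from the final
series, so PG_table sits one bit lower there than the 0x800 in this posting.)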
This is as far as I got tracing it. It might be an issue with OpenRISC, but our
implementation is mostly generic. I will look into it more in the next few days
but I figured you might be able to spot something more quickly.
-Stafford
On Mon, Jun 18, 2018 at 12:09:31AM +0900, Stafford Horne wrote:
> On Wed, Mar 07, 2018 at 05:44:43AM -0800, Matthew Wilcox wrote:
> > Define a new PageTable bit in the page_type and use it to mark pages in
> > use as page tables. This can be helpful when debugging crashdumps or
> > analysing memory fragmentation. Add a KPF flag to report these pages
> > to userspace and update page-types.c to interpret that flag.
>
> I have bisected a regression on OpenRISC in v4.18-rc1 to this commit. Using
> our defconfig, after boot I am getting:
Hi Stafford. Thanks for the report!
> [...]
>
> In this series we are overloading _mapcount with page_type; the above is caused
> by this check in mm/page_alloc.c (free_pages_check_bad):
>
> 	if (unlikely(atomic_read(&page->_mapcount) != -1))
> 		bad_reason = "nonzero mapcount";
>
> We can see in the dump above that _mapcount is fffffbff, which corresponds to
> the 'PG_table' flag added here. But it seems that in some case on OpenRISC it's
> not getting cleared during page free.
>
> This is as far as I got tracing it. It might be an issue with OpenRISC, but our
> implementation is mostly generic. I will look into it more in the next few days
> but I figured you might be able to spot something more quickly.
More than happy to help. You've done a great job of debugging this.
I think the problem is in your __pte_free_tlb definition. Most other
architectures are doing:
#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte)
while you're doing:
#define __pte_free_tlb(tlb, pte, addr) tlb_remove_page((tlb), (pte))
and that doesn't call pgtable_page_dtor().
Up to you how you want to fix this ;-) As an example, x86 defines a
___pte_free_tlb() which calls pgtable_page_dtor() before calling
tlb_remove_table().
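Something like this (an untested sketch) ought to do it:

	/* arch/openrisc/include/asm/pgalloc.h: run the dtor so that
	 * __ClearPageTable() happens before the page can hit the buddy
	 * allocator's free-time checks */
	#define __pte_free_tlb(tlb, pte, addr)		\
	do {						\
		pgtable_page_dtor(pte);			\
		tlb_remove_page((tlb), (pte));		\
	} while (0)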
On Sun, Jun 17, 2018 at 11:52:22AM -0700, Matthew Wilcox wrote:
> On Mon, Jun 18, 2018 at 12:09:31AM +0900, Stafford Horne wrote:
> > On Wed, Mar 07, 2018 at 05:44:43AM -0800, Matthew Wilcox wrote:
> > > Define a new PageTable bit in the page_type and use it to mark pages in
> > > use as page tables. This can be helpful when debugging crashdumps or
> > > analysing memory fragmentation. Add a KPF flag to report these pages
> > > to userspace and update page-types.c to interpret that flag.
> >
> > I have bisected a regression on OpenRISC in v4.18-rc1 to this commit. Using
> > our defconfig, after boot I am getting:
>
> Hi Stafford. Thanks for the report!
>
> > [...]
>
> More than happy to help. You've done a great job of debugging this.
> I think the problem is in your __pte_free_tlb definition. Most other
> architectures are doing:
>
> #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte)
>
> while you're doing:
>
> #define __pte_free_tlb(tlb, pte, addr) tlb_remove_page((tlb), (pte))
>
> and that doesn't call pgtable_page_dtor().
>
> Up to you how you want to fix this ;-) As an example, x86 defines a
> ___pte_free_tlb() which calls pgtable_page_dtor() before calling
> tlb_remove_table().
I will do it the x86 way unless anyone has a concern; I notice a few others do
it this way too. I have tested it out and it works fine.
Thanks a lot for your help.
-Stafford