From: Matthew Wilcox <[email protected]>
I want to use the _mapcount field to record what a page is in use as.
This can help with debugging and we can also expose that information to
userspace through /proc/kpageflags to help diagnose memory usage (not
included as part of this patch set).
First, we need s390 to stop using _mapcount for its own purposes;
Martin, I hope you have time to look at this patch. I must confess I
don't quite understand what the different bits are used for in the upper
nybble of the _mapcount, but I tried to replicate what you were doing
faithfully.
Matthew Wilcox (4):
s390: Use _refcount for pgtables
mm: Split page_type out from _mapcount
mm: Mark pages allocated through vmalloc
mm: Mark pages in use for page tables
arch/s390/mm/pgalloc.c | 21 +++++++++--------
fs/proc/page.c | 2 +-
include/linux/mm.h | 2 ++
include/linux/mm_types.h | 13 +++++++----
include/linux/page-flags.h | 57 ++++++++++++++++++++++++++++++----------------
kernel/crash_core.c | 1 +
mm/page_alloc.c | 13 ++++-------
mm/vmalloc.c | 2 ++
scripts/tags.sh | 6 ++---
9 files changed, 72 insertions(+), 45 deletions(-)
--
2.16.1
From: Matthew Wilcox <[email protected]>
Subject: [PATCH 1/4] s390: Use _refcount for pgtables
s390 borrows the storage used for _mapcount in struct page in order to
account whether the bottom or top half is being used for 2kB page
tables. I want to use that for something else, so use the top byte of
_refcount instead of the bottom byte of _mapcount. _refcount may
temporarily be incremented by other CPUs that see a stale pointer to
this page in the page cache, but each CPU can only increment it by one,
and there are no systems with 2^24 CPUs today, so they will not change
the upper byte of _refcount. We do have to be a little careful not to
lose any of their writes (as they will subsequently decrement the
counter).
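To make that concrete, here is a model of the helper this patch relies on,
written with C11 atomics purely for illustration; the real atomic_xor_bits()
lives in arch/s390/mm/pgalloc.c and this sketch only makes the
read-modify-write explicit:

#include <stdatomic.h>

/* Illustrative model only, not kernel code: flip the type bits in the
 * top byte without losing a concurrent increment of the low 24 bits. */
static unsigned int atomic_xor_bits(_Atomic unsigned int *v, unsigned int bits)
{
	unsigned int old = atomic_load(v);

	/* On failure the CAS reloads 'old', so a racing increment or
	 * decrement is folded into the retry instead of overwritten. */
	while (!atomic_compare_exchange_weak(v, &old, old ^ bits))
		;
	return old ^ bits;	/* callers below want the new value */
}

A plain atomic_set() could overwrite an increment that slipped in between
the read and the write; the retry loop is the "little careful" part.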
Signed-off-by: Matthew Wilcox <[email protected]>
---
arch/s390/mm/pgalloc.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index cb364153c43c..412c5f48a8e7 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -189,14 +189,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 		if (!list_empty(&mm->context.pgtable_list)) {
 			page = list_first_entry(&mm->context.pgtable_list,
 						struct page, lru);
-			mask = atomic_read(&page->_mapcount);
+			mask = atomic_read(&page->_refcount) >> 24;
 			mask = (mask | (mask >> 4)) & 3;
 			if (mask != 3) {
 				table = (unsigned long *) page_to_phys(page);
 				bit = mask & 1;		/* =1 -> second 2K */
 				if (bit)
 					table += PTRS_PER_PTE;
-				atomic_xor_bits(&page->_mapcount, 1U << bit);
+				atomic_xor_bits(&page->_refcount,
+						1U << (bit + 24));
 				list_del(&page->lru);
 			}
 		}
@@ -217,12 +218,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	table = (unsigned long *) page_to_phys(page);
 	if (mm_alloc_pgste(mm)) {
 		/* Return 4K page table with PGSTEs */
-		atomic_set(&page->_mapcount, 3);
+		atomic_xor_bits(&page->_refcount, 3 << 24);
 		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
 		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	} else {
 		/* Return the first 2K fragment of the page */
-		atomic_set(&page->_mapcount, 1);
+		atomic_xor_bits(&page->_refcount, 1 << 24);
 		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
 		spin_lock_bh(&mm->context.lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
@@ -241,7 +242,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 		/* Free 2K page table fragment of a 4K page */
 		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 		spin_lock_bh(&mm->context.lock);
-		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
+		mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+		mask >>= 24;
 		if (mask & 3)
 			list_add(&page->lru, &mm->context.pgtable_list);
 		else
@@ -252,7 +254,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	}
 
 	pgtable_page_dtor(page);
-	atomic_set(&page->_mapcount, -1);
 	__free_page(page);
 }
 
@@ -273,7 +274,8 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 	}
 	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
 	spin_lock_bh(&mm->context.lock);
-	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
+	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+	mask >>= 24;
 	if (mask & 3)
 		list_add_tail(&page->lru, &mm->context.pgtable_list);
 	else
@@ -295,12 +297,13 @@ static void __tlb_remove_table(void *_table)
 		break;
 	case 1:		/* lower 2K of a 4K page table */
 	case 2:		/* higher 2K of a 4K page table */
-		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
+		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+		mask >>= 24;
+		if (mask != 0)
 			break;
 		/* fallthrough */
 	case 3:		/* 4K page table with pgstes */
 		pgtable_page_dtor(page);
-		atomic_set(&page->_mapcount, -1);
 		__free_page(page);
 		break;
 	}
--
2.16.1
From: Matthew Wilcox <[email protected]>
Subject: [PATCH 4/4] mm: Mark pages in use for page tables
Define a new PageTable bit in the page_type and use it to mark pages in
use as page tables.
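Architectures that already call the constructor need no further changes; for
example, x86's pte_alloc_one() (paraphrased here from arch/x86/mm/pgtable.c
of this era, not part of this patch) picks up the marking automatically:

pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

	pte = alloc_pages(__userpte_alloc_gfp, 0);
	if (!pte)
		return NULL;
	if (!pgtable_page_ctor(pte)) {	/* now also does __SetPageTable() */
		__free_page(pte);
		return NULL;
	}
	return pte;
}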
Signed-off-by: Matthew Wilcox <[email protected]>
---
include/linux/mm.h | 2 ++
include/linux/page-flags.h | 6 ++++++
2 files changed, 8 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ad06d42adb1a..7a15042d6828 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1829,6 +1829,7 @@ static inline bool pgtable_page_ctor(struct page *page)
 {
 	if (!ptlock_init(page))
 		return false;
+	__SetPageTable(page);
 	inc_zone_page_state(page, NR_PAGETABLE);
 	return true;
 }
@@ -1836,6 +1837,7 @@ static inline bool pgtable_page_ctor(struct page *page)
 
 static inline void pgtable_page_dtor(struct page *page)
 {
 	pte_lock_deinit(page);
+	__ClearPageTable(page);
 	dec_zone_page_state(page, NR_PAGETABLE);
 }
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8142ab716e90..ac6bab90849c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -644,6 +644,7 @@ PAGEFLAG_FALSE(DoubleMap)
 #define PG_balloon	0x00000100
 #define PG_kmemcg	0x00000200
 #define PG_vmalloc	0x00000400
+#define PG_table	0x00000800
 
 #define PageType(page, flag)						\
 	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -687,6 +688,11 @@ PAGE_TYPE_OPS(Kmemcg, kmemcg)
  */
 PAGE_TYPE_OPS(Vmalloc, vmalloc)
 
+/*
+ * Marks pages in use as page tables.
+ */
+PAGE_TYPE_OPS(Table, table)
+
 extern bool is_free_buddy_page(struct page *page);
 
 __PAGEFLAG(Isolated, isolated, PF_ANY);
--
2.16.1
From: Matthew Wilcox <[email protected]>
Subject: [PATCH 3/4] mm: Mark pages allocated through vmalloc
Use a bit in page_type to mark pages which have been allocated through
vmalloc. This can be helpful when debugging or analysing crashdumps.
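The /proc/kpageflags plumbing mentioned in the cover letter is not part of
this series; if it were added, it might look like the sketch below inside
fs/proc/page.c's stable_page_flags() (KPF_VMALLOC is a constant invented for
this sketch, not an existing KPF bit):

	/* Hypothetical follow-up, not in this series */
	if (PageVmalloc(page))
		u |= 1 << KPF_VMALLOC;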
Signed-off-by: Matthew Wilcox <[email protected]>
---
include/linux/page-flags.h | 6 ++++++
mm/vmalloc.c | 2 ++
2 files changed, 8 insertions(+)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d151f590bbc6..8142ab716e90 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -643,6 +643,7 @@ PAGEFLAG_FALSE(DoubleMap)
 #define PG_buddy	0x00000080
 #define PG_balloon	0x00000100
 #define PG_kmemcg	0x00000200
+#define PG_vmalloc	0x00000400
 
 #define PageType(page, flag)						\
 	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -681,6 +682,11 @@ PAGE_TYPE_OPS(Balloon, balloon)
  */
 PAGE_TYPE_OPS(Kmemcg, kmemcg)
 
+/*
+ * Pages allocated through vmalloc are tagged with this bit.
+ */
+PAGE_TYPE_OPS(Vmalloc, vmalloc)
+
 extern bool is_free_buddy_page(struct page *page);
 
 __PAGEFLAG(Isolated, isolated, PF_ANY);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ebff729cc956..3bc0538fc21b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1536,6 +1536,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
+			__ClearPageVmalloc(page);
 			__free_pages(page, 0);
 		}
@@ -1705,6 +1706,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			area->nr_pages = i;
 			goto fail;
 		}
+		__SetPageVmalloc(page);
 		area->pages[i] = page;
 		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
 			cond_resched();
--
2.16.1
From: Matthew Wilcox <[email protected]>
Subject: [PATCH 2/4] mm: Split page_type out from _mapcount
We're already using a union of many fields here, so stop abusing the
_mapcount and make page_type its own field. That implies renaming some
of the machinery that creates PageBuddy, PageBalloon and PageKmemcg;
bring back the PG_buddy, PG_balloon and PG_kmemcg names.

As suggested by Kirill, make page_type a bitmask. Because it starts out
life as -1 (thanks to sharing the storage with _mapcount), setting a
page flag means clearing the appropriate bit. This gives us space for
probably twenty or so extra bits (depending on how paranoid we want to
be about _mapcount underflow).
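A worked example of the inverted sense, with values derived from the macros
in this patch:

	page->page_type == 0xffffffff	/* fresh page: shares storage with
					 * _mapcount, which starts at -1 */
	__SetPageBuddy(page);		/* clears PG_buddy (0x80): 0xffffff7f */
	PageBuddy(page);		/* (0xffffff7f & 0xff000080)
					 * == 0xff000000, so true */
	__ClearPageBuddy(page);		/* sets PG_buddy back: 0xffffffff */

Once a page has been mapped, _mapcount rises past -1, the top byte is no
longer 0xff, and every PageType() test fails; that is what keeps mapped
pages from being mistaken for special pages.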
Signed-off-by: Matthew Wilcox <[email protected]>
---
fs/proc/page.c | 2 +-
include/linux/mm_types.h | 13 ++++++++-----
include/linux/page-flags.h | 45 ++++++++++++++++++++++++++-------------------
kernel/crash_core.c | 1 +
mm/page_alloc.c | 13 +++++--------
scripts/tags.sh | 6 +++---
6 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1491918a33c3..35a67069fa02 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -106,7 +106,7 @@ u64 stable_page_flags(struct page *page)
 	 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
 	 * simple test in page_mapped() is not enough.
 	 */
-	if (!PageSlab(page) && page_mapped(page))
+	if (!PageSlab(page) && !PageType(page, 0) && page_mapped(page))
 		u |= 1 << KPF_MMAP;
 	if (PageAnon(page))
 		u |= 1 << KPF_ANON;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fd1af6b9591d..1c5dea402501 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -94,6 +94,14 @@ struct page {
 	};
 
 	union {
+		/*
+		 * If the page is neither PageSlab nor PageAnon, the value
+		 * stored here may help distinguish it from page cache pages.
+		 * See page-flags.h for a list of page types which are
+		 * currently stored here.
+		 */
+		unsigned int page_type;
+
 		_slub_counter_t counters;
 		unsigned int active;		/* SLAB */
 		struct {			/* SLUB */
@@ -107,11 +115,6 @@ struct page {
 		/*
 		 * Count of ptes mapped in mms, to show when
 		 * page is mapped & limit reverse map searches.
-		 *
-		 * Extra information about page type may be
-		 * stored here for pages that are never mapped,
-		 * in which case the value MUST BE <= -2.
-		 * See page-flags.h for more details.
 		 */
 		atomic_t _mapcount;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 50c2b8786831..d151f590bbc6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -630,49 +630,56 @@ PAGEFLAG_FALSE(DoubleMap)
 #endif
 
 /*
- * For pages that are never mapped to userspace, page->mapcount may be
- * used for storing extra information about page type. Any value used
- * for this purpose must be <= -2, but it's better start not too close
- * to -2 so that an underflow of the page_mapcount() won't be mistaken
- * for a special page.
+ * For pages that are never mapped to userspace (and aren't PageSlab),
+ * page_type may be used.  Because it is initialised to -1, we invert the
+ * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
+ * __ClearPageFoo *sets* the bit used for PageFoo.  We leave a gap in the bit
+ * assignments so that an underflow of page_mapcount() won't be mistaken for
+ * a special page.
  */
-#define PAGE_MAPCOUNT_OPS(uname, lname)					\
+
+#define PAGE_TYPE_BASE	0xff000000
+/* Reserve	0x0000007f to catch underflows of page_mapcount */
+#define PG_buddy	0x00000080
+#define PG_balloon	0x00000100
+#define PG_kmemcg	0x00000200
+
+#define PageType(page, flag)						\
+	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+
+#define PAGE_TYPE_OPS(uname, lname)					\
 static __always_inline int Page##uname(struct page *page)		\
 {									\
-	return atomic_read(&page->_mapcount) ==				\
-				PAGE_##lname##_MAPCOUNT_VALUE;		\
+	return PageType(page, PG_##lname);				\
 }									\
 static __always_inline void __SetPage##uname(struct page *page)	\
 {									\
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);	\
-	atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE);	\
+	VM_BUG_ON_PAGE(!PageType(page, 0), page);			\
+	page->page_type &= ~PG_##lname;					\
 }									\
 static __always_inline void __ClearPage##uname(struct page *page)	\
 {									\
 	VM_BUG_ON_PAGE(!Page##uname(page), page);			\
-	atomic_set(&page->_mapcount, -1);				\
+	page->page_type |= PG_##lname;					\
 }
 
 /*
- * PageBuddy() indicate that the page is free and in the buddy system
+ * PageBuddy() indicates that the page is free and in the buddy system
  * (see mm/page_alloc.c).
  */
-#define PAGE_BUDDY_MAPCOUNT_VALUE		(-128)
-PAGE_MAPCOUNT_OPS(Buddy, BUDDY)
+PAGE_TYPE_OPS(Buddy, buddy)
 
 /*
- * PageBalloon() is set on pages that are on the balloon page list
+ * PageBalloon() is true for pages that are on the balloon page list
  * (see mm/balloon_compaction.c).
  */
-#define PAGE_BALLOON_MAPCOUNT_VALUE	(-256)
-PAGE_MAPCOUNT_OPS(Balloon, BALLOON)
+PAGE_TYPE_OPS(Balloon, balloon)
 
 /*
  * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
  * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
  */
-#define PAGE_KMEMCG_MAPCOUNT_VALUE	(-512)
-PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG)
+PAGE_TYPE_OPS(Kmemcg, kmemcg)
 
 extern bool is_free_buddy_page(struct page *page);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4f63597c824d..b02340fb99ff 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -458,6 +458,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_NUMBER(PG_hwpoison);
 #endif
 	VMCOREINFO_NUMBER(PG_head_mask);
+#define PAGE_BUDDY_MAPCOUNT_VALUE	(~PG_buddy)
 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
 #ifdef CONFIG_HUGETLB_PAGE
 	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb416723538f..41e56cf5bd7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -744,16 +744,14 @@ static inline void rmv_page_order(struct page *page)
 
 /*
  * This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
+ * we can coalesce a page and its buddy if
  * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
- * For recording whether a page is in the buddy system, we set ->_mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE.
- * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
- * serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set PG_buddy.
+ * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
@@ -798,9 +796,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
- * field.
+ * free pages of length of (1 << order) and marked with PageBuddy().
+ * Page's order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
diff --git a/scripts/tags.sh b/scripts/tags.sh
index 78e546ff689c..8c3ae36d4ea8 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -188,9 +188,9 @@ regex_c=(
 	'/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/'
 	'/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/'
 	'/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/'
-	'/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/Page\1/'
-	'/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__SetPage\1/'
-	'/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/'
+	'/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/Page\1/'
+	'/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/__SetPage\1/'
+	'/^PAGE_TYPE_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/'
 	'/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/'
 	'/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/'
 	'/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/'
--
2.16.1
On 02/28/2018 02:31 PM, Matthew Wilcox wrote:
> From: Matthew Wilcox <[email protected]>
>
> I want to use the _mapcount field to record what a page is in use as.
> This can help with debugging and we can also expose that information to
> userspace through /proc/kpageflags to help diagnose memory usage (not
> included as part of this patch set).
Hey,
Will there be updates to tools/vm/ also, or are these a different set of
(many) flags?
thanks,
--
~Randy
On Wed, Feb 28, 2018 at 03:22:49PM -0800, Randy Dunlap wrote:
> On 02/28/2018 02:31 PM, Matthew Wilcox wrote:
> > From: Matthew Wilcox <[email protected]>
> >
> > I want to use the _mapcount field to record what a page is in use as.
> > This can help with debugging and we can also expose that information to
> > userspace through /proc/kpageflags to help diagnose memory usage (not
> > included as part of this patch set).
>
> Hey,
>
> Will there be updates to tools/vm/ also, or are these a different set of
> (many) flags?
Those KPF flags are the ones I was talking about. I haven't looked into
what it takes to assign those flags yet.
On Wed, 28 Feb 2018 14:31:53 -0800
Matthew Wilcox <[email protected]> wrote:
> From: Matthew Wilcox <[email protected]>
>
> I want to use the _mapcount field to record what a page is in use as.
> This can help with debugging and we can also expose that information to
> userspace through /proc/kpageflags to help diagnose memory usage (not
> included as part of this patch set).
>
> First, we need s390 to stop using _mapcount for its own purposes;
> Martin, I hope you have time to look at this patch. I must confess I
> don't quite understand what the different bits are used for in the upper
> nybble of the _mapcount, but I tried to replicate what you were doing
> faithfully.
Yeah, that is a nasty bit of code. On s390 we have 2K page tables (pte)
but 4K pages. If we use full pages for the pte tables we waste 2K of
memory for each of the tables. So we allocate 4K and split it into two
2K pieces. Now we have to keep track of the pieces to be able to free
them again.
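For reference, the bits seem to encode the following; this annotation is
inferred from arch/s390/mm/pgalloc.c rather than documented there, so treat
it as a best guess:

	/* Inferred, not from the source:
	 * bit 0 - lower 2K in use	bit 4 - lower 2K pending RCU free
	 * bit 1 - upper 2K in use	bit 5 - upper 2K pending RCU free
	 * page_table_alloc() folds the nybbles together so a half still
	 * waiting out its RCU grace period is never handed out again:
	 */
	mask = atomic_read(&page->_refcount) >> 24;
	mask = (mask | (mask >> 4)) & 3;	/* 3 -> neither half usable */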
I'll try to give your patch a spin today. It should be stand-alone, no?
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Thu, 1 Mar 2018 08:17:50 +0100
Martin Schwidefsky <[email protected]> wrote:
> On Wed, 28 Feb 2018 14:31:53 -0800
> Matthew Wilcox <[email protected]> wrote:
>
> > From: Matthew Wilcox <[email protected]>
> >
> > I want to use the _mapcount field to record what a page is in use as.
> > This can help with debugging and we can also expose that information to
> > userspace through /proc/kpageflags to help diagnose memory usage (not
> > included as part of this patch set).
> >
> > First, we need s390 to stop using _mapcount for its own purposes;
> > Martin, I hope you have time to look at this patch. I must confess I
> > don't quite understand what the different bits are used for in the upper
> > nybble of the _mapcount, but I tried to replicate what you were doing
> > faithfully.
>
> Yeah, that is a nasty bit of code. On s390 we have 2K page tables (pte)
> but 4K pages. If we use full pages for the pte tables we waste 2K of
> memory for each of the tables. So we allocate 4K and split it into two
> 2K pieces. Now we have to keep track of the pieces to be able to free
> them again.
>
> I'll try to give your patch a spin today. It should be stand-alone, no?
Ok, that seems to work just fine. The system boots and survived some stress
without losing memory.
Acked-by: Martin Schwidefsky <[email protected]>
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Thu, Mar 01, 2018 at 08:17:50AM +0100, Martin Schwidefsky wrote:
> On Wed, 28 Feb 2018 14:31:53 -0800
> Matthew Wilcox <[email protected]> wrote:
>
> > From: Matthew Wilcox <[email protected]>
> >
> > I want to use the _mapcount field to record what a page is in use as.
> > This can help with debugging and we can also expose that information to
> > userspace through /proc/kpageflags to help diagnose memory usage (not
> > included as part of this patch set).
> >
> > First, we need s390 to stop using _mapcount for its own purposes;
> > Martin, I hope you have time to look at this patch. I must confess I
> > don't quite understand what the different bits are used for in the upper
> > nybble of the _mapcount, but I tried to replicate what you were doing
> > faithfully.
>
> Yeah, that is a nasty bit of code. On s390 we have 2K page tables (pte)
> but 4K pages. If we use full pages for the pte tables we waste 2K of
> memory for each of the tables. So we allocate 4K and split it into two
> 2K pieces. Now we have to keep track of the pieces to be able to free
> them again.
Have you considered using slab for page table allocation instead?
IIRC some architectures already do this.
--
Kirill A. Shutemov
On Wed, Feb 28, 2018 at 02:31:54PM -0800, Matthew Wilcox wrote:
> From: Matthew Wilcox <[email protected]>
>
> s390 borrows the storage used for _mapcount in struct page in order to
> account whether the bottom or top half is being used for 2kB page
> tables. I want to use that for something else, so use the top byte of
> _refcount instead of the bottom byte of _mapcount. _refcount may
> temporarily be incremented by other CPUs that see a stale pointer to
> this page in the page cache, but each CPU can only increment it by one,
> and there are no systems with 2^24 CPUs today, so they will not change
> the upper byte of _refcount. We do have to be a little careful not to
> lose any of their writes (as they will subsequently decrement the
> counter).
Hm. I'm more worried about false-negative put_page_testzero().
Are you sure it won't lead to leaks? I cannot tell from the code changes.
And page-table pages should have plenty of space in other fields.
IIRC page->mapping is unused there.
--
Kirill A. Shutemov
On Thu, 1 Mar 2018 15:53:10 +0300
"Kirill A. Shutemov" <[email protected]> wrote:
> On Wed, Feb 28, 2018 at 02:31:54PM -0800, Matthew Wilcox wrote:
> > From: Matthew Wilcox <[email protected]>
> >
> > s390 borrows the storage used for _mapcount in struct page in order to
> > account whether the bottom or top half is being used for 2kB page
> > tables. I want to use that for something else, so use the top byte of
> > _refcount instead of the bottom byte of _mapcount. _refcount may
> > temporarily be incremented by other CPUs that see a stale pointer to
> > this page in the page cache, but each CPU can only increment it by one,
> > and there are no systems with 2^24 CPUs today, so they will not change
> > the upper byte of _refcount. We do have to be a little careful not to
> > lose any of their writes (as they will subsequently decrement the
> > counter).
>
> Hm. I'm more worried about false-negative put_page_testzero().
> Are you sure it won't lead to leaks? I cannot tell from the code changes.
>
> And page-table pages should have plenty of space in other fields.
> IIRC page->mapping is unused there.
2^24 put_page_testzero() calls for page table pages? I don't think so.
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Thu, Mar 01, 2018 at 03:04:20PM +0100, Martin Schwidefsky wrote:
> On Thu, 1 Mar 2018 15:53:10 +0300
> "Kirill A. Shutemov" <[email protected]> wrote:
>
> > On Wed, Feb 28, 2018 at 02:31:54PM -0800, Matthew Wilcox wrote:
> > > From: Matthew Wilcox <[email protected]>
> > >
> > > s390 borrows the storage used for _mapcount in struct page in order to
> > > account whether the bottom or top half is being used for 2kB page
> > > tables. I want to use that for something else, so use the top byte of
> > > _refcount instead of the bottom byte of _mapcount. _refcount may
> > > temporarily be incremented by other CPUs that see a stale pointer to
> > > this page in the page cache, but each CPU can only increment it by one,
> > > and there are no systems with 2^24 CPUs today, so they will not change
> > > the upper byte of _refcount. We do have to be a little careful not to
> > > lose any of their writes (as they will subsequently decrement the
> > > counter).
> >
> > Hm. I'm more worried about false-negative put_page_testzero().
> > Are you sure it won't lead to leaks? I cannot tell from the code changes.
> >
> > And page-table pages should have plenty of space in other fields.
> > IIRC page->mapping is unused there.
>
> 2^24 put_page_testzero() calls for page table pages? I don't think so.
No, I mean the opposite: we don't free the page when we should. 2^24 is
not zero, and the page won't be freed if the actual refcount (without the
flag in the upper bits) drops to zero.
--
Kirill A. Shutemov
On Thu, 1 Mar 2018 17:28:55 +0300
"Kirill A. Shutemov" <[email protected]> wrote:
> On Thu, Mar 01, 2018 at 03:04:20PM +0100, Martin Schwidefsky wrote:
> > On Thu, 1 Mar 2018 15:53:10 +0300
> > "Kirill A. Shutemov" <[email protected]> wrote:
> >
> > > On Wed, Feb 28, 2018 at 02:31:54PM -0800, Matthew Wilcox wrote:
> > > > From: Matthew Wilcox <[email protected]>
> > > >
> > > > s390 borrows the storage used for _mapcount in struct page in order to
> > > > account whether the bottom or top half is being used for 2kB page
> > > > tables. I want to use that for something else, so use the top byte of
> > > > _refcount instead of the bottom byte of _mapcount. _refcount may
> > > > temporarily be incremented by other CPUs that see a stale pointer to
> > > > this page in the page cache, but each CPU can only increment it by one,
> > > > and there are no systems with 2^24 CPUs today, so they will not change
> > > > the upper byte of _refcount. We do have to be a little careful not to
> > > > lose any of their writes (as they will subsequently decrement the
> > > > counter).
> > >
> > > Hm. I'm more worried about false-negative put_page_testzero().
> > > Are you sure it won't lead to leaks? I cannot tell from the code changes.
> > >
> > > And page-table pages should have plenty of space in other fields.
> > > IIRC page->mapping is unused there.
> >
> > 2^24 put_page_testzero() calls for page table pages? I don't think so.
>
> No, I mean the opposite: we don't free the page when we should. 2^24 is
> not zero, and the page won't be freed if the actual refcount (without the
> flag in the upper bits) drops to zero.
Ah, ok. But this is not a problem, as the page is only freed after both
bits for the two 2K pieces have been set to zero.
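Spelled out as a trace of the upper byte of _refcount, following what the
patch computes:

	alloc lower 2K:  0x00 ^ 0x01 = 0x01
	alloc upper 2K:  0x01 ^ 0x02 = 0x03
	free upper 2K:   0x03 ^ 0x02 = 0x01	(mask & 3 != 0: page kept)
	free lower 2K:   0x01 ^ 0x01 = 0x00	(mask & 3 == 0: __free_page)

so the type bits are already zero by the time the page's refcount can
actually drop to zero.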
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Thu, 1 Mar 2018 15:44:12 +0300
"Kirill A. Shutemov" <[email protected]> wrote:
> On Thu, Mar 01, 2018 at 08:17:50AM +0100, Martin Schwidefsky wrote:
> > On Wed, 28 Feb 2018 14:31:53 -0800
> > Matthew Wilcox <[email protected]> wrote:
> >
> > > From: Matthew Wilcox <[email protected]>
> > >
> > > I want to use the _mapcount field to record what a page is in use as.
> > > This can help with debugging and we can also expose that information to
> > > userspace through /proc/kpageflags to help diagnose memory usage (not
> > > included as part of this patch set).
> > >
> > > First, we need s390 to stop using _mapcount for its own purposes;
> > > Martin, I hope you have time to look at this patch. I must confess I
> > > don't quite understand what the different bits are used for in the upper
> > > nybble of the _mapcount, but I tried to replicate what you were doing
> > > faithfully.
> >
> > Yeah, that is a nasty bit of code. On s390 we have 2K page tables (pte)
> > but 4K pages. If we use full pages for the pte tables we waste 2K of
> > memory for each of the tables. So we allocate 4K and split it into two
> > 2K pieces. Now we have to keep track of the pieces to be able to free
> > them again.
>
> Have you considered using slab for page table allocation instead?
> IIRC some architectures already do this.
Well, there is a complication with KVM and the page table management for
gmaps. If mm_alloc_pgste(mm) == true then a full 4K page table has to be
allocated. For the gmap I need a place to store an 8-byte value; currently
we use page->index. But the slab/slub code uses page->index for its own
purpose. This creates a conflict, but maybe doing a get_free_page() for
mm_alloc_pgste(mm) == true and using a slab cache for normal page tables
might work.
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.
On Thu, Mar 01, 2018 at 03:44:12PM +0300, Kirill A. Shutemov wrote:
> On Thu, Mar 01, 2018 at 08:17:50AM +0100, Martin Schwidefsky wrote:
> > Yeah, that is a nasty bit of code. On s390 we have 2K page tables (pte)
> > but 4K pages. If we use full pages for the pte tables we waste 2K of
> > memory for each of the tables. So we allocate 4K and split it into two
> > 2K pieces. Now we have to keep track of the pieces to be able to free
> > them again.
>
> Have you considered using slab for page table allocation instead?
> IIRC some architectures already do this.
You're not allowed to do that any more. Look at pgtable_page_ctor(),
or rather ptlock_init().
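The comment in ptlock_init() spells out exactly this conflict; quoted from
include/linux/mm.h of this era, with the comment lightly tidied:

static inline bool ptlock_init(struct page *page)
{
	/*
	 * prep_new_page() initializes page->private (and therefore
	 * page->ptl) with 0. Make sure nobody took it in use in between.
	 *
	 * It can happen if an arch tries to use slab for page table
	 * allocation: the slab code uses page->slab_cache, which shares
	 * storage with page->ptl.
	 */
	VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
	if (!ptlock_alloc(page))
		return false;
	spin_lock_init(ptlock_ptr(page));
	return true;
}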
On Thu, 1 Mar 2018 06:50:58 -0800
Matthew Wilcox <[email protected]> wrote:
> On Thu, Mar 01, 2018 at 03:44:12PM +0300, Kirill A. Shutemov wrote:
> > On Thu, Mar 01, 2018 at 08:17:50AM +0100, Martin Schwidefsky wrote:
> > > Yeah, that is a nasty bit of code. On s390 we have 2K page tables (pte)
> > > but 4K pages. If we use full pages for the pte tables we waste 2K of
> > > memory for each of the tables. So we allocate 4K and split it into two
> > > 2K pieces. Now we have to keep track of the pieces to be able to free
> > > them again.
> >
> > Have you considered using slab for page table allocation instead?
> > IIRC some architectures already do this.
>
> You're not allowed to do that any more. Look at pgtable_page_ctor(),
> or rather ptlock_init().
Oh yes, I forgot about the ptl. This takes up some fields in struct page
which the slab/slub cache want to use as well.
--
blue skies,
Martin.
"Reality continues to ruin my life." - Calvin.