This series consists of a refactoring/correctness of updating the metadata
of tail pages and a couple of fixups for the refcounting part.
From this series on, instead of counting the stacks, we count the outstanding
nr_base_pages each stack has, which gives us a much better memory overview.
The other fixup is for the migration part.
A more detailed explanation can be found in the changelog of the respective
patches.
Oscar Salvador (3):
mm,page_owner: Update metada for tail pages
mm,page_owner: Fix refcount imbalance
mm,page_owner: Fix accounting of pages when migrating
Documentation/mm/page_owner.rst | 73 +++++++------
mm/page_owner.c | 184 +++++++++++++++++++-------------
2 files changed, 146 insertions(+), 111 deletions(-)
--
2.44.0
__set_page_owner_handle() and __reset_page_owner() update the metadata
of all pages when the page is of a higher-order, but we fail to do the
same when the pages are migrated.
__folio_copy_owner() only updates the metadata of the head page, meaning
that the information stored in the first page and the tail pages will not
match.
Strictly speaking that is not a big problem because 1) we do not print
tail pages and 2) upon splitting all tail pages will inherit the
metadata of the head page, but it is better to have all metadata in check
should there be any problem, so it can ease debugging.
For that purpose, a couple of helpers are created
__update_page_owner_handle() which updates the metadata on allocation,
and __update_page_owner_free_handle() which does the same when the page
is freed.
__folio_copy_owner() will make use of both as it needs to entirely replace
the page_owner metadata for the new page.
Signed-off-by: Oscar Salvador <[email protected]>
---
mm/page_owner.c | 137 ++++++++++++++++++++++++++----------------------
1 file changed, 74 insertions(+), 63 deletions(-)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d17d1351ec84..52d1ced0b57f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -228,9 +228,58 @@ static void dec_stack_record_count(depot_stack_handle_t handle)
refcount_dec(&stack_record->count);
}
-void __reset_page_owner(struct page *page, unsigned short order)
+static inline void __update_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned short order,
+ gfp_t gfp_mask,
+ short last_migrate_reason, u64 ts_nsec,
+ pid_t pid, pid_t tgid, char *comm)
{
int i;
+ struct page_owner *page_owner;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->handle = handle;
+ page_owner->order = order;
+ page_owner->gfp_mask = gfp_mask;
+ page_owner->last_migrate_reason = last_migrate_reason;
+ page_owner->pid = pid;
+ page_owner->tgid = tgid;
+ page_owner->ts_nsec = ts_nsec;
+ strscpy(page_owner->comm, comm,
+ sizeof(page_owner->comm));
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle,
+ unsigned short order,
+ pid_t pid, pid_t tgid,
+ u64 free_ts_nsec)
+{
+ int i;
+ struct page_owner *page_owner;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ /* Only __reset_page_owner() wants to clear the bit */
+ if (handle) {
+ __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_owner->free_handle = handle;
+ }
+ page_owner->free_ts_nsec = free_ts_nsec;
+ page_owner->free_pid = current->pid;
+ page_owner->free_tgid = current->tgid;
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+void __reset_page_owner(struct page *page, unsigned short order)
+{
struct page_ext *page_ext;
depot_stack_handle_t handle;
depot_stack_handle_t alloc_handle;
@@ -245,16 +294,10 @@ void __reset_page_owner(struct page *page, unsigned short order)
alloc_handle = page_owner->handle;
handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
- for (i = 0; i < (1 << order); i++) {
- __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
- page_owner->free_handle = handle;
- page_owner->free_ts_nsec = free_ts_nsec;
- page_owner->free_pid = current->pid;
- page_owner->free_tgid = current->tgid;
- page_ext = page_ext_next(page_ext);
- page_owner = get_page_owner(page_ext);
- }
+ __update_page_owner_free_handle(page_ext, handle, order, current->pid,
+ current->tgid, free_ts_nsec);
page_ext_put(page_ext);
+
if (alloc_handle != early_handle)
/*
* early_handle is being set as a handle for all those
@@ -266,36 +309,11 @@ void __reset_page_owner(struct page *page, unsigned short order)
dec_stack_record_count(alloc_handle);
}
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
- depot_stack_handle_t handle,
- unsigned short order, gfp_t gfp_mask)
-{
- struct page_owner *page_owner;
- int i;
- u64 ts_nsec = local_clock();
-
- for (i = 0; i < (1 << order); i++) {
- page_owner = get_page_owner(page_ext);
- page_owner->handle = handle;
- page_owner->order = order;
- page_owner->gfp_mask = gfp_mask;
- page_owner->last_migrate_reason = -1;
- page_owner->pid = current->pid;
- page_owner->tgid = current->tgid;
- page_owner->ts_nsec = ts_nsec;
- strscpy(page_owner->comm, current->comm,
- sizeof(page_owner->comm));
- __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
- __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
-
- page_ext = page_ext_next(page_ext);
- }
-}
-
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
struct page_ext *page_ext;
+ u64 ts_nsec = local_clock();
depot_stack_handle_t handle;
handle = save_stack(gfp_mask);
@@ -303,7 +321,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
- __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+ current->pid, current->tgid, ts_nsec,
+ current->comm);
page_ext_put(page_ext);
inc_stack_record_count(handle, gfp_mask);
}
@@ -342,7 +362,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
struct page_ext *old_ext;
struct page_ext *new_ext;
- struct page_owner *old_page_owner, *new_page_owner;
+ struct page_owner *old_page_owner;
old_ext = page_ext_get(&old->page);
if (unlikely(!old_ext))
@@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
}
old_page_owner = get_page_owner(old_ext);
- new_page_owner = get_page_owner(new_ext);
- new_page_owner->order = old_page_owner->order;
- new_page_owner->gfp_mask = old_page_owner->gfp_mask;
- new_page_owner->last_migrate_reason =
- old_page_owner->last_migrate_reason;
- new_page_owner->handle = old_page_owner->handle;
- new_page_owner->pid = old_page_owner->pid;
- new_page_owner->tgid = old_page_owner->tgid;
- new_page_owner->free_pid = old_page_owner->free_pid;
- new_page_owner->free_tgid = old_page_owner->free_tgid;
- new_page_owner->ts_nsec = old_page_owner->ts_nsec;
- new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
- strcpy(new_page_owner->comm, old_page_owner->comm);
-
+ __update_page_owner_handle(new_ext, old_page_owner->handle,
+ old_page_owner->order, old_page_owner->gfp_mask,
+ old_page_owner->last_migrate_reason,
+ old_page_owner->ts_nsec, old_page_owner->pid,
+ old_page_owner->tgid, old_page_owner->comm);
/*
- * We don't clear the bit on the old folio as it's going to be freed
- * after migration. Until then, the info can be useful in case of
- * a bug, and the overall stats will be off a bit only temporarily.
- * Also, migrate_misplaced_transhuge_page() can still fail the
- * migration and then we want the old folio to retain the info. But
- * in that case we also don't need to explicitly clear the info from
- * the new page, which will be freed.
+ * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
+ * will be freed after migration. Keep them until then as they may be
+ * useful.
*/
- __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
- __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+ __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+ old_page_owner->free_pid,
+ old_page_owner->free_tgid,
+ old_page_owner->free_ts_nsec);
+
page_ext_put(new_ext);
page_ext_put(old_ext);
}
@@ -787,8 +797,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
goto ext_put_continue;
/* Found early allocated page */
- __set_page_owner_handle(page_ext, early_handle,
- 0, 0);
+ __update_page_owner_handle(page_ext, early_handle, 0, 0,
+ -1, local_clock(), current->pid,
+ current->tgid, current->comm);
count++;
ext_put_continue:
page_ext_put(page_ext);
--
2.44.0
Upon migration, newly allocated pages are being given the handle of the old
pages. This is problematic because it means that for the stack which
allocated the old page, we will be subtracting the old page + the new one
when that page is freed, creating an accounting imbalance.
There is an interest in keeping it that way, as otherwise the output will
be biased towards migration stacks should those operations occur often, but
that is not really helpful.
The link from the new page to the old stack is being performed by calling
__update_page_owner_handle() in __folio_copy_owner().
The only thing that is left is to link the migrate stack to the old
page, so the old page will be subtracted from the migrate stack,
avoiding by doing so any possible imbalance.
Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
Signed-off-by: Oscar Salvador <[email protected]>
---
mm/page_owner.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 5df0d6892bdc..b4476f45b376 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -366,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order)
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
+ int i;
struct page_ext *old_ext;
struct page_ext *new_ext;
struct page_owner *old_page_owner;
+ struct page_owner *new_page_owner;
+ depot_stack_handle_t migrate_handle;
old_ext = page_ext_get(&old->page);
if (unlikely(!old_ext))
@@ -381,6 +384,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
}
old_page_owner = get_page_owner(old_ext);
+ new_page_owner = get_page_owner(new_ext);
+ migrate_handle = new_page_owner->handle;
__update_page_owner_handle(new_ext, old_page_owner->handle,
old_page_owner->order, old_page_owner->gfp_mask,
old_page_owner->last_migrate_reason,
@@ -395,6 +400,16 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
old_page_owner->free_pid,
old_page_owner->free_tgid,
old_page_owner->free_ts_nsec);
+ /*
+ * We linked the original stack to the new folio, we need to do the same
+ * for the new one and the old folio otherwise there will be an imbalance
+ * when subtracting those pages from the stack.
+ */
+ for (i = 0; i < (1 << new_page_owner->order); i++) {
+ old_page_owner->handle = migrate_handle;
+ old_ext = page_ext_next(old_ext);
+ old_page_owner = get_page_owner(old_ext);
+ }
page_ext_put(new_ext);
page_ext_put(old_ext);
--
2.44.0
Current code does not contemplate scenarios where an allocation and
free operation on the same pages do not handle it in the same amount
at once.
To give an example, page_alloc_exact(), where we will allocate a page
of enough order to satisfy the size request, but we will free the
remainder right away.
In the above example, we will increment the stack_record refcount
only once, but we will decrease it the same number of times as number
of unused pages we have to free.
This will lead to a warning because of refcount imbalance.
Fix this by recording the number of base pages in the refcount field.
Reported-by: [email protected]
Closes: https://lore.kernel.org/linux-mm/[email protected]
Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
Signed-off-by: Oscar Salvador <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
---
Documentation/mm/page_owner.rst | 73 +++++++++++++++++----------------
mm/page_owner.c | 34 ++++++++-------
2 files changed, 58 insertions(+), 49 deletions(-)
diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst
index 0d0334cd5179..3a45a20fc05a 100644
--- a/Documentation/mm/page_owner.rst
+++ b/Documentation/mm/page_owner.rst
@@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of
each page. It is already implemented and activated if page owner is
enabled. Other usages are more than welcome.
-It can also be used to show all the stacks and their outstanding
-allocations, which gives us a quick overview of where the memory is going
-without the need to screen through all the pages and match the allocation
-and free operation.
+It can also be used to show all the stacks and their current number of
+allocated base pages, which gives us a quick overview of where the memory
+is going without the need to screen through all the pages and match the
+allocation and free operation.
page owner is disabled by default. So, if you'd like to use it, you need
to add "page_owner=on" to your boot cmdline. If the kernel is built
@@ -75,42 +75,45 @@ Usage
cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
cat stacks.txt
- prep_new_page+0xa9/0x120
- get_page_from_freelist+0x7e6/0x2140
- __alloc_pages+0x18a/0x370
- new_slab+0xc8/0x580
- ___slab_alloc+0x1f2/0xaf0
- __slab_alloc.isra.86+0x22/0x40
- kmem_cache_alloc+0x31b/0x350
- __khugepaged_enter+0x39/0x100
- dup_mmap+0x1c7/0x5ce
- copy_process+0x1afe/0x1c90
- kernel_clone+0x9a/0x3c0
- __do_sys_clone+0x66/0x90
- do_syscall_64+0x7f/0x160
- entry_SYSCALL_64_after_hwframe+0x6c/0x74
- stack_count: 234
+ post_alloc_hook+0x177/0x1a0
+ get_page_from_freelist+0xd01/0xd80
+ __alloc_pages+0x39e/0x7e0
+ allocate_slab+0xbc/0x3f0
+ ___slab_alloc+0x528/0x8a0
+ kmem_cache_alloc+0x224/0x3b0
+ sk_prot_alloc+0x58/0x1a0
+ sk_alloc+0x32/0x4f0
+ inet_create+0x427/0xb50
+ __sock_create+0x2e4/0x650
+ inet_ctl_sock_create+0x30/0x180
+ igmp_net_init+0xc1/0x130
+ ops_init+0x167/0x410
+ setup_net+0x304/0xa60
+ copy_net_ns+0x29b/0x4a0
+ create_new_namespaces+0x4a1/0x820
+ nr_base_pages: 16
...
...
echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt
cat stacks_7000.txt
- prep_new_page+0xa9/0x120
- get_page_from_freelist+0x7e6/0x2140
- __alloc_pages+0x18a/0x370
- alloc_pages_mpol+0xdf/0x1e0
- folio_alloc+0x14/0x50
- filemap_alloc_folio+0xb0/0x100
- page_cache_ra_unbounded+0x97/0x180
- filemap_fault+0x4b4/0x1200
- __do_fault+0x2d/0x110
- do_pte_missing+0x4b0/0xa30
- __handle_mm_fault+0x7fa/0xb70
- handle_mm_fault+0x125/0x300
- do_user_addr_fault+0x3c9/0x840
- exc_page_fault+0x68/0x150
- asm_exc_page_fault+0x22/0x30
- stack_count: 8248
+ post_alloc_hook+0x177/0x1a0
+ get_page_from_freelist+0xd01/0xd80
+ __alloc_pages+0x39e/0x7e0
+ alloc_pages_mpol+0x22e/0x490
+ folio_alloc+0xd5/0x110
+ filemap_alloc_folio+0x78/0x230
+ page_cache_ra_order+0x287/0x6f0
+ filemap_get_pages+0x517/0x1160
+ filemap_read+0x304/0x9f0
+ xfs_file_buffered_read+0xe6/0x1d0 [xfs]
+ xfs_file_read_iter+0x1f0/0x380 [xfs]
+ __kernel_read+0x3b9/0x730
+ kernel_read_file+0x309/0x4d0
+ __do_sys_finit_module+0x381/0x730
+ do_syscall_64+0x8d/0x150
+ entry_SYSCALL_64_after_hwframe+0x62/0x6a
+ nr_base_pages: 20824
...
cat /sys/kernel/debug/page_owner > page_owner_full.txt
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 52d1ced0b57f..5df0d6892bdc 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record,
spin_unlock_irqrestore(&stack_list_lock, flags);
}
-static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
+static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
+ int nr_base_pages)
{
struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
@@ -217,15 +218,20 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
/* Add the new stack_record to our list */
add_stack_record_to_list(stack_record, gfp_mask);
}
- refcount_inc(&stack_record->count);
+ refcount_add(nr_base_pages, &stack_record->count);
}
-static void dec_stack_record_count(depot_stack_handle_t handle)
+static void dec_stack_record_count(depot_stack_handle_t handle,
+ int nr_base_pages)
{
struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
- if (stack_record)
- refcount_dec(&stack_record->count);
+ if (!stack_record)
+ return;
+
+ if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
+ pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
+ handle);
}
static inline void __update_page_owner_handle(struct page_ext *page_ext,
@@ -306,7 +312,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
* the machinery is not ready yet, we cannot decrement
* their refcount either.
*/
- dec_stack_record_count(alloc_handle);
+ dec_stack_record_count(alloc_handle, 1 << order);
}
noinline void __set_page_owner(struct page *page, unsigned short order,
@@ -325,7 +331,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
current->pid, current->tgid, ts_nsec,
current->comm);
page_ext_put(page_ext);
- inc_stack_record_count(handle, gfp_mask);
+ inc_stack_record_count(handle, gfp_mask, 1 << order);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -872,11 +878,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
return stack;
}
-static unsigned long page_owner_stack_threshold;
+static unsigned long page_owner_pages_threshold;
static int stack_print(struct seq_file *m, void *v)
{
- int i, stack_count;
+ int i, nr_base_pages;
struct stack *stack = v;
unsigned long *entries;
unsigned long nr_entries;
@@ -887,14 +893,14 @@ static int stack_print(struct seq_file *m, void *v)
nr_entries = stack_record->size;
entries = stack_record->entries;
- stack_count = refcount_read(&stack_record->count) - 1;
+ nr_base_pages = refcount_read(&stack_record->count) - 1;
- if (stack_count < 1 || stack_count < page_owner_stack_threshold)
+ if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
return 0;
for (i = 0; i < nr_entries; i++)
seq_printf(m, " %pS\n", (void *)entries[i]);
- seq_printf(m, "stack_count: %d\n\n", stack_count);
+ seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
return 0;
}
@@ -924,13 +930,13 @@ static const struct file_operations page_owner_stack_operations = {
static int page_owner_threshold_get(void *data, u64 *val)
{
- *val = READ_ONCE(page_owner_stack_threshold);
+ *val = READ_ONCE(page_owner_pages_threshold);
return 0;
}
static int page_owner_threshold_set(void *data, u64 val)
{
- WRITE_ONCE(page_owner_stack_threshold, val);
+ WRITE_ONCE(page_owner_pages_threshold, val);
return 0;
}
--
2.44.0
Hi Oscar,
On 26/03/2024 07:30, Oscar Salvador wrote:
> This series consists of a refactoring/correctness of updating the metadata
> of tail pages and a couple of fixups for the refcounting part.
>
> From this series on, instead of counting the stacks, we count the outstanding
> nr_base_pages each stack has, which gives us a much better memory overview.
> The other fixup is for the migration part.
>
> A more detailed explanation can be found in the changelog of the respective
> patches.
>
> Oscar Salvador (3):
> mm,page_owner: Update metada for tail pages
> mm,page_owner: Fix refcount imbalance
> mm,page_owner: Fix accounting of pages when migrating
>
> Documentation/mm/page_owner.rst | 73 +++++++------
> mm/page_owner.c | 184 +++++++++++++++++++-------------
> 2 files changed, 146 insertions(+), 111 deletions(-)
>
This fixes the following report from syzbot:
https://lore.kernel.org/linux-riscv/[email protected]/T/#t
So you can add:
Tested-by: Alexandre Ghiti <[email protected]>
Thanks,
Alex
On 2024/3/26 14:30, Oscar Salvador wrote:
> This series consists of a refactoring/correctness of updating the metadata
> of tail pages and a couple of fixups for the refcounting part.
>
>>From this series on, instead of counting the stacks, we count the outstanding
> nr_base_pages each stack has, which gives us a much better memory overview.
> The other fixup is for the migration part.
>
> A more detailed explanation can be found in the changelog of the respective
> patches.
Hi Oscar, this patchset fix the following issue when I test my migration
changes, but
[ 31.478715] ------------[ cut here ]------------
[ 31.480491] refcount_t: decrement hit 0; leaking memory.
[ 31.482724] WARNING: CPU: 7 PID: 113 at lib/refcount.c:31
refcount_warn_saturate+0x13c/0x148
[ 31.485390] Modules linked in:
[ 31.486800] CPU: 7 PID: 113 Comm: sh Not tainted
6.9.0-rc1-00178-g317c7bc0ef03 #139
[ 31.487883] Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0
02/06/2015
[ 31.489066] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS
BTYPE=--)
[ 31.489872] pc : refcount_warn_saturate+0x13c/0x148
[ 31.490350] lr : refcount_warn_saturate+0x13c/0x148
[ 31.490829] sp : ffff800087913650
[ 31.491182] x29: ffff800087913650 x28: ffff00020624d010 x27:
0000000000000000
[ 31.492419] x26: 0000000000000001 x25: ffff800082dd0d60 x24:
0000000000000000
[ 31.493006] x23: 0000000754405c20 x22: ffff800082ec0000 x21:
00000000028c008e
[ 31.493527] x20: ffff0000d1638208 x19: ffff0000d1638200 x18:
0000000000000010
[ 31.493984] x17: 0000000000000006 x16: 0000000000000000 x15:
ffff0000c6253c18
[ 31.494330] x14: 0000000000000000 x13: 0000000000000002 x12:
ffff800082dff318
[ 31.494669] x11: ffff800082e6f640 x10: ffff800082e57600 x9 :
ffff80008015869c
[ 31.495056] x8 : 0000000000017fe8 x7 : c0000000ffffefff x6 :
0000000000000001
[ 31.495383] x5 : 0000000000057fa8 x4 : 0000000000000000 x3 :
0000000000000000
[ 31.495756] x2 : ffff800082dff228 x1 : 0000000000000000 x0 :
0000000000000000
[ 31.496367] Call trace:
[ 31.496528] refcount_warn_saturate+0x13c/0x148
[ 31.496760] __reset_page_owner+0x124/0x158
[ 31.496965] free_unref_page_prepare+0x2a4/0x440
[ 31.497183] free_unref_folios+0x118/0x460
[ 31.497368] folios_put_refs+0x11c/0x250
[ 31.497559] free_pages_and_swap_cache+0xd4/0x160
[ 31.497772] tlb_flush_mmu+0x8c/0x188
[ 31.497952] tlb_finish_mmu+0x54/0x160
[ 31.498123] exit_mmap+0x174/0x4e8
[ 31.498295] mmput+0xb4/0x1a0
[ 31.498429] begin_new_exec+0x474/0xd30
[ 31.498606] load_elf_binary+0x378/0x1488
[ 31.498792] bprm_execve+0x2a0/0x7e0
[ 31.498956] do_execveat_common.isra.0+0x19c/0x240
[ 31.499174] __arm64_sys_execve+0x48/0x68
[ 31.499358] invoke_syscall+0x4c/0x118
[ 31.499534] el0_svc_common.constprop.0+0x48/0xf0
[ 31.499816] do_el0_svc+0x24/0x38
[ 31.500063] el0_svc+0x4c/0x120
[ 31.500314] el0t_64_sync_handler+0xc0/0xc8
[ 31.500584] el0t_64_sync+0x190/0x198
I still see the following memory leak, could you check it?
/mnt/arm64 # cat /sys//kernel/debug/kmemleak
unreferenced object 0xffff000200d91000 (size 16):
comm "kworker/4:0", pid 42, jiffies 4294892753
hex dump (first 16 bytes):
60 37 57 c1 00 00 ff ff 00 00 00 00 00 00 00 00 `7W.............
backtrace (crc 4458f477):
[<(____ptrval____)>] kmemleak_alloc+0x3c/0x50
[<(____ptrval____)>] kmalloc_trace+0x20c/0x2e0
[<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
[<(____ptrval____)>] prep_new_page+0x108/0x138
[<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
[<(____ptrval____)>] __alloc_pages+0x1bc/0x440
[<(____ptrval____)>] new_slab+0x104/0x3c8
[<(____ptrval____)>] ___slab_alloc+0x368/0xb20
[<(____ptrval____)>] __slab_alloc.isra.0+0x3c/0x88
[<(____ptrval____)>] kmalloc_trace+0x280/0x2e0
[<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
[<(____ptrval____)>] prep_new_page+0x108/0x138
[<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
[<(____ptrval____)>] __alloc_pages+0x1bc/0x440
[<(____ptrval____)>] new_slab+0x104/0x3c8
[<(____ptrval____)>] ___slab_alloc+0x368/0xb20
unreferenced object 0xffff000200d90000 (size 16):
comm "kworker/4:0", pid 42, jiffies 4294892753
hex dump (first 16 bytes):
20 38 57 c1 00 00 ff ff 00 10 d9 00 02 00 ff ff 8W.............
backtrace (crc 786eca4d):
[<(____ptrval____)>] kmemleak_alloc+0x3c/0x50
[<(____ptrval____)>] kmalloc_trace+0x20c/0x2e0
[<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
[<(____ptrval____)>] prep_new_page+0x108/0x138
[<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
[<(____ptrval____)>] __alloc_pages+0x1bc/0x440
[<(____ptrval____)>] new_slab+0x104/0x3c8
[<(____ptrval____)>] ___slab_alloc+0x368/0xb20
[<(____ptrval____)>] __slab_alloc.isra.0+0x3c/0x88
[<(____ptrval____)>] kmalloc_node_trace+0x274/0x2f0
[<(____ptrval____)>] alloc_worker+0x2c/0x70
[<(____ptrval____)>] create_worker+0x58/0x278
[<(____ptrval____)>] worker_thread+0x260/0x320
[<(____ptrval____)>] kthread+0x130/0x148
[<(____ptrval____)>] ret_from_fork+0x10/0x20
unreferenced object 0xffff000200d90010 (size 16):
comm "kworker/4:0", pid 42, jiffies 4294892753
hex dump (first 16 bytes):
e0 39 57 c1 00 00 ff ff 00 00 d9 00 02 00 ff ff .9W.............
backtrace (crc d4c89665):
[<(____ptrval____)>] kmemleak_alloc+0x3c/0x50
[<(____ptrval____)>] kmalloc_trace+0x20c/0x2e0
[<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
[<(____ptrval____)>] prep_new_page+0x108/0x138
[<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
[<(____ptrval____)>] __alloc_pages+0x1bc/0x440
[<(____ptrval____)>] new_slab+0x104/0x3c8
[<(____ptrval____)>] ___slab_alloc+0x368/0xb20
[<(____ptrval____)>] __slab_alloc.isra.0+0x3c/0x88
[<(____ptrval____)>] kmalloc_trace+0x280/0x2e0
[<(____ptrval____)>] __kthread_create_on_node+0x7c/0x190
[<(____ptrval____)>] kthread_create_on_node+0x60/0x90
[<(____ptrval____)>] create_worker+0xd0/0x278
[<(____ptrval____)>] worker_thread+0x260/0x320
[<(____ptrval____)>] kthread+0x130/0x148
[<(____ptrval____)>] ret_from_fork+0x10/0x20
Thanks.
>
> Oscar Salvador (3):
> mm,page_owner: Update metada for tail pages
> mm,page_owner: Fix refcount imbalance
> mm,page_owner: Fix accounting of pages when migrating
>
> Documentation/mm/page_owner.rst | 73 +++++++------
> mm/page_owner.c | 184 +++++++++++++++++++-------------
> 2 files changed, 146 insertions(+), 111 deletions(-)
>
Subject: metada -> metadata
On 3/26/24 7:30 AM, Oscar Salvador wrote:
> __set_page_owner_handle() and __reset_page_owner() update the metadata
> of all pages when the page is of a higher-order, but we miss to do the
> same when the pages are migrated.
> __folio_copy_owner() only updates the metadata of the head page, meaning
> that the information stored in the first page and the tail pages will not
> match.
>
> Strictly speaking that is not a big problem because 1) we do not print
> tail pages and 2) upon splitting all tail pages will inherit the
> metada of the head page, but it is better to have all metadata in check
metadata
> should there be any problem, so it can ease debugging.
>
> For that purpose, a couple of helpers are created
> __update_page_owner_handle() which updates the metadata on allocation,
> and __update_page_owner_free_handle() which does the same when the page
> is freed.
>
> __folio_copy_owner() will make use of both as it needs to entirely replace
> the page_owner metadata for the new page.
>
> Signed-off-by: Oscar Salvador <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
Also I think this series should move to mm-hotfixes due to fixing bugs from rc1.
Some more nits:
> @@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
> }
>
> old_page_owner = get_page_owner(old_ext);
> - new_page_owner = get_page_owner(new_ext);
> - new_page_owner->order = old_page_owner->order;
> - new_page_owner->gfp_mask = old_page_owner->gfp_mask;
> - new_page_owner->last_migrate_reason =
> - old_page_owner->last_migrate_reason;
> - new_page_owner->handle = old_page_owner->handle;
> - new_page_owner->pid = old_page_owner->pid;
> - new_page_owner->tgid = old_page_owner->tgid;
> - new_page_owner->free_pid = old_page_owner->free_pid;
> - new_page_owner->free_tgid = old_page_owner->free_tgid;
> - new_page_owner->ts_nsec = old_page_owner->ts_nsec;
> - new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
> - strcpy(new_page_owner->comm, old_page_owner->comm);
> -
> + __update_page_owner_handle(new_ext, old_page_owner->handle,
> + old_page_owner->order, old_page_owner->gfp_mask,
> + old_page_owner->last_migrate_reason,
> + old_page_owner->ts_nsec, old_page_owner->pid,
> + old_page_owner->tgid, old_page_owner->comm);
> /*
> - * We don't clear the bit on the old folio as it's going to be freed
> - * after migration. Until then, the info can be useful in case of
> - * a bug, and the overall stats will be off a bit only temporarily.
> - * Also, migrate_misplaced_transhuge_page() can still fail the
> - * migration and then we want the old folio to retain the info. But
> - * in that case we also don't need to explicitly clear the info from
> - * the new page, which will be freed.
> + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
> + * will be freed after migration. Keep them until then as they may be
> + * useful.
> */
The full old comment made sense, the new one sounds like it's talking about
the old folio ("will be freed after migration") but we're modifying the new
folio here. IIUC it means the case of migration failing and then the new
folio MIGHT be freed. So I think you made the comment too much concise to be
immediately clear?
> - __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
> - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
> + __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
> + old_page_owner->free_pid,
> + old_page_owner->free_tgid,
> + old_page_owner->free_ts_nsec);
> +
> page_ext_put(new_ext);
> page_ext_put(old_ext);
> }
> @@ -787,8 +797,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
> goto ext_put_continue;
>
> /* Found early allocated page */
> - __set_page_owner_handle(page_ext, early_handle,
> - 0, 0);
> + __update_page_owner_handle(page_ext, early_handle, 0, 0,
> + -1, local_clock(), current->pid,
> + current->tgid, current->comm);
> count++;
> ext_put_continue:
> page_ext_put(page_ext);
On 3/26/24 7:30 AM, Oscar Salvador wrote:
> Upon migration, new allocated pages are being given the handle of the old
> pages. This is problematic because it means that for the stack which
> allocated the old page, we will be substracting the old page + the new one
> when that page is freed, creating an accounting imbalance.
>
> There is an interest in keeping it that way, as otherwise the output will
> biased towards migration stacks should those operations occur often, but
> that is not really helpful.
> The link from the new page to the old stack is being performed by calling
> __update_page_owner_handle() in __folio_copy_owner().
> The only thing that is left is to link the migrate stack to the old
> page, so the old page will be subtracted from the migrate stack,
> avoiding by doing so any possible imbalance.
>
> Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
> Signed-off-by: Oscar Salvador <[email protected]>
> ---
> mm/page_owner.c | 15 +++++++++++++++
> 1 file changed, 15 insertions(+)
>
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 5df0d6892bdc..b4476f45b376 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -366,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order)
>
> void __folio_copy_owner(struct folio *newfolio, struct folio *old)
> {
> + int i;
> struct page_ext *old_ext;
> struct page_ext *new_ext;
> struct page_owner *old_page_owner;
> + struct page_owner *new_page_owner;
> + depot_stack_handle_t migrate_handle;
>
> old_ext = page_ext_get(&old->page);
> if (unlikely(!old_ext))
> @@ -381,6 +384,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
> }
>
> old_page_owner = get_page_owner(old_ext);
> + new_page_owner = get_page_owner(new_ext);
> + migrate_handle = new_page_owner->handle;
> __update_page_owner_handle(new_ext, old_page_owner->handle,
> old_page_owner->order, old_page_owner->gfp_mask,
> old_page_owner->last_migrate_reason,
> @@ -395,6 +400,16 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
> old_page_owner->free_pid,
> old_page_owner->free_tgid,
> old_page_owner->free_ts_nsec);
> + /*
> + * We linked the original stack to the new folio, we need to do the same
> + * for the new one and the old folio otherwise there will be an imbalance
> + * when subtracting those pages from the stack.
> + */
> + for (i = 0; i < (1 << new_page_owner->order); i++) {
> + old_page_owner->handle = migrate_handle;
> + old_ext = page_ext_next(old_ext);
> + old_page_owner = get_page_owner(old_ext);
> + }
Can the migration still fail after __folio_copy_owner()? That goes again to
the comment you changed in patch 1/3. If it can, this will kinda create a
mess with the old folio's handles not reflecting reality? (although
refcounts will be ok)
So if that case is possible, could we instead just dec_stack_record_count()
for the handle that allocated the new folio (IIUC "migrate_handle" here) and
inc_stack_record_count() for the original handle that we duplicated from the
old to new. Then if either old is freed (successful migration) or new is
freed (failed migration), we'll have the correct refcounts.
>
> page_ext_put(new_ext);
> page_ext_put(old_ext);
On Tue, Apr 02, 2024 at 12:26:51PM +0200, Vlastimil Babka wrote:
> Can the migration still fail after __folio_copy_owner()? That goes again to
> the comment you changed in patch 1/3. If it can, this will kinda create a
> mess with the old folio's handles not reflecting reality? (although
> refcounts will be ok)
According to my research (I replied in patch#1), no, migration cannot
fail after __folio_copy_owner(), so we are safe here (Tm).
--
Oscar Salvador
SUSE Labs
On Tue, Apr 02, 2024 at 12:13:37PM +0200, Vlastimil Babka wrote:
> Subject: metada -> metadata
Ooops.
> > Signed-off-by: Oscar Salvador <[email protected]>
>
> Reviewed-by: Vlastimil Babka <[email protected]>
Thanks!
> > @@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
> > - * We don't clear the bit on the old folio as it's going to be freed
> > - * after migration. Until then, the info can be useful in case of
> > - * a bug, and the overall stats will be off a bit only temporarily.
> > - * Also, migrate_misplaced_transhuge_page() can still fail the
> > - * migration and then we want the old folio to retain the info. But
> > - * in that case we also don't need to explicitly clear the info from
> > - * the new page, which will be freed.
> > + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
> > + * will be freed after migration. Keep them until then as they may be
> > + * useful.
> > */
>
> The full old comment made sense, the new one sounds like it's talking about
> the old folio ("will be freed after migration") but we're modifying the new
> folio here. IIUC it means the case of migration failing and then the new
folio MIGHT be freed. So I think you made the comment too concise to be
> immediately clear?
It probably could be improved by saying that there is no need to clear
the bit from the old folio since that will be done when __reset_page_owner()
gets called on the old folio.
Now, answering your question about whether we can fail or not at this
stage.
I looked into this a few weeks ago and I made up my mind that no, we cannot
fail at this stage, and the following is my reasoning.
This is the callchain that leads to folio_copy_owner:
migrate_folio_move
move_to_new_folio
migrate_folio
migrate_folio_extra
folio_migrate_copy
folio_copy
folio_migrate_flags
folio_copy_owner
folio_copy_owner() gets called only from folio_migrate_flags().
And all the functions that call folio_migrate_flags(), return
MIGRATEPAGE_SUCCESS right after calling it, so it is kinda the last
step of the migration.
So no, we cannot fail at this stage, so we do not have to worry about
undoing this.
--
Oscar Salvador
SUSE Labs
On 4/2/24 1:22 PM, Oscar Salvador wrote:
> On Tue, Apr 02, 2024 at 12:26:51PM +0200, Vlastimil Babka wrote:
>> Can the migration still fail after __folio_copy_owner()? That goes again to
>> the comment you changed in patch 1/3. If it can, this will kinda create a
>> mess with the old folio's handles not reflecting reality? (although
>> refcounts will be ok)
>
> According to my research (I replied in patch#1), no, migration cannot
> fail after __folio_copy_owner(), so we are safe here (Tm).
OK then,
Reviewed-by: Vlastimil Babka <[email protected]>
On Fri, Mar 29, 2024 at 12:54:30PM +0800, Kefeng Wang wrote:
> I still see the following memory leak, could you check it?
>
> /mnt/arm64 # cat /sys//kernel/debug/kmemleak
> unreferenced object 0xffff000200d91000 (size 16):
> comm "kworker/4:0", pid 42, jiffies 4294892753
> hex dump (first 16 bytes):
> 60 37 57 c1 00 00 ff ff 00 00 00 00 00 00 00 00 `7W.............
> backtrace (crc 4458f477):
> [<(____ptrval____)>] kmemleak_alloc+0x3c/0x50
> [<(____ptrval____)>] kmalloc_trace+0x20c/0x2e0
> [<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
> [<(____ptrval____)>] prep_new_page+0x108/0x138
> [<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
> [<(____ptrval____)>] __alloc_pages+0x1bc/0x440
> [<(____ptrval____)>] new_slab+0x104/0x3c8
> [<(____ptrval____)>] ___slab_alloc+0x368/0xb20
> [<(____ptrval____)>] __slab_alloc.isra.0+0x3c/0x88
> [<(____ptrval____)>] kmalloc_trace+0x280/0x2e0
> [<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
> [<(____ptrval____)>] prep_new_page+0x108/0x138
> [<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
> [<(____ptrval____)>] __alloc_pages+0x1bc/0x440
> [<(____ptrval____)>] new_slab+0x104/0x3c8
> [<(____ptrval____)>] ___slab_alloc+0x368/0xb20
Hi Kefeng Wang
You seem to be missing [1]
Could you try with that patch applied?
[1] https://lore.kernel.org/linux-mm/[email protected]/
Thanks
--
Oscar Salvador
SUSE Labs
On Thu, 28 Mar 2024 23:39:33 PDT (-0700), [email protected] wrote:
> Hi Oscar,
>
> On 26/03/2024 07:30, Oscar Salvador wrote:
>> This series consists of a refactoring/correctness of updating the metadata
>> of tail pages and a couple of fixups for the refcounting part.
>>
>> From this series on, instead of counting the stacks, we count the outstanding
>> nr_base_pages each stack has, which gives us a much better memory overview.
>> The other fixup is for the migration part.
>>
>> A more detailed explanation can be found in the changelog of the respective
>> patches.
>>
>> Oscar Salvador (3):
>> mm,page_owner: Update metada for tail pages
>> mm,page_owner: Fix refcount imbalance
>> mm,page_owner: Fix accounting of pages when migrating
>>
>> Documentation/mm/page_owner.rst | 73 +++++++------
>> mm/page_owner.c | 184 +++++++++++++++++++-------------
>> 2 files changed, 146 insertions(+), 111 deletions(-)
>>
>
> This fixes the following report from syzbot:
> https://lore.kernel.org/linux-riscv/[email protected]/T/#t
>
> So you can add:
>
> Tested-by: Alexandre Ghiti <[email protected]>
Acked-by: Palmer Dabbelt <[email protected]>
in case that helps any, but I think this is one for the MM folks. Thanks
for chasing down the fix!
> Thanks,
>
> Alex
On 2024/4/2 22:20, Oscar Salvador wrote:
> On Fri, Mar 29, 2024 at 12:54:30PM +0800, Kefeng Wang wrote:
>> I still see the following memory leak, could you check it?
>>
>> /mnt/arm64 # cat /sys//kernel/debug/kmemleak
>> unreferenced object 0xffff000200d91000 (size 16):
>> comm "kworker/4:0", pid 42, jiffies 4294892753
>> hex dump (first 16 bytes):
>> 60 37 57 c1 00 00 ff ff 00 00 00 00 00 00 00 00 `7W.............
>> backtrace (crc 4458f477):
>> [<(____ptrval____)>] kmemleak_alloc+0x3c/0x50
>> [<(____ptrval____)>] kmalloc_trace+0x20c/0x2e0
>> [<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
>> [<(____ptrval____)>] prep_new_page+0x108/0x138
>> [<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
>> [<(____ptrval____)>] __alloc_pages+0x1bc/0x440
>> [<(____ptrval____)>] new_slab+0x104/0x3c8
>> [<(____ptrval____)>] ___slab_alloc+0x368/0xb20
>> [<(____ptrval____)>] __slab_alloc.isra.0+0x3c/0x88
>> [<(____ptrval____)>] kmalloc_trace+0x280/0x2e0
>> [<(____ptrval____)>] __set_page_owner+0x1d0/0x2a0
>> [<(____ptrval____)>] prep_new_page+0x108/0x138
>> [<(____ptrval____)>] get_page_from_freelist+0x79c/0x16b8
>> [<(____ptrval____)>] __alloc_pages+0x1bc/0x440
>> [<(____ptrval____)>] new_slab+0x104/0x3c8
>> [<(____ptrval____)>] ___slab_alloc+0x368/0xb20
>
> Hi Kefeng Wang
>
> You seem to be missing [1]
>
> Could you try with that patch applied?
>
> [1] https://lore.kernel.org/linux-mm/[email protected]/
Hi, re-run with clean v6.9-rc2 (already include 7844c0147211
mm,page_owner: fix recursion), after booting and wait some time,
the kmemleak still occurs.
>
> Thanks
>