Currently, THP allocation event data is fairly opaque, since it is
only available system-wide. This patch makes it easier to reason about
transparent hugepage behaviour on a per-memcg basis.
For anonymous THP-backed pages, we already have MEMCG_RSS_HUGE in v1,
which is used for v1's rss_huge [sic]. It is reused here because making
NR_ANON_THPS per-memcg is fairly involved right now: some of that
accounting is delegated to rmap before we have any memcg actually
assigned to the page. It's a good idea to rework that, but let's leave
untangling NR_ANON_THPS for a future patch.
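As a usage illustration (not part of this patch), the new counters can
be read straight out of a cgroup's memory.stat file. A minimal sketch
in C, assuming a hypothetical cgroup mounted at /sys/fs/cgroup/foo:

#include <stdio.h>
#include <string.h>

/*
 * Illustrative only: print the new per-memcg THP statistics for one
 * cgroup. The cgroup path below is an assumption for the example.
 */
int main(void)
{
	const char *path = "/sys/fs/cgroup/foo/memory.stat";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}

	/* memory.stat is "key value\n" pairs; keep only the THP lines. */
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "anon_thp ", 9) ||
		    !strncmp(line, "thp_fault_alloc ", 16) ||
		    !strncmp(line, "thp_collapse_alloc ", 19))
			fputs(line, stdout);
	}

	fclose(f);
	return 0;
}

Note that anon_thp is reported in bytes, while the thp_fault_alloc and
thp_collapse_alloc lines are raw event counts.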
Signed-off-by: Chris Down <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
Documentation/admin-guide/cgroup-v2.rst | 14 ++++++++++++++
mm/huge_memory.c | 2 ++
mm/khugepaged.c | 2 ++
mm/memcontrol.c | 13 +++++++++++++
4 files changed, 31 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7bf3f129c68b..b6989b39ed8e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1189,6 +1189,10 @@ PAGE_SIZE multiple when read back.
Amount of cached filesystem data that was modified and
is currently being written back to disk
+ anon_thp
+ Amount of memory used in anonymous mappings backed by
+ transparent hugepages
+
inactive_anon, active_anon, inactive_file, active_file, unevictable
Amount of memory, swap-backed and filesystem-backed,
on the internal memory management lists used by the
@@ -1248,6 +1252,16 @@ PAGE_SIZE multiple when read back.
Amount of reclaimed lazyfree pages
+ thp_fault_alloc
+
+ Number of transparent hugepages which were allocated to satisfy
+ a page fault, including COW faults
+
+ thp_collapse_alloc
+
+ Number of transparent hugepages which were allocated to
+ allow collapsing an existing range of pages
+
memory.swap.current
A read-only single value file which exists on non-root
cgroups.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f5f1d4324fe2..6cb7a748aa33 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -617,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
}
return 0;
@@ -1339,6 +1340,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
}
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ceb242ca6ef6..54f3d33f897a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1075,6 +1075,7 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1503,6 +1504,7 @@ static void collapse_shmem(struct mm_struct *mm,
page_ref_add(new_page, HPAGE_PMD_NR - 1);
set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_anon(new_page);
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 18f4aefbe0bf..2f4fe2fb9046 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5603,6 +5603,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "file_writeback %llu\n",
(u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_printf(m, "anon_thp %llu\n",
+ (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
(u64)acc.lru_pages[i] * PAGE_SIZE);
@@ -5634,6 +5643,10 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
+ seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+ seq_printf(m, "thp_collapse_alloc %lu\n",
+ acc.events[THP_COLLAPSE_ALLOC]);
+
return 0;
}
--
2.20.1
On Tue, Jan 29, 2019 at 03:58:52PM -0500, Chris Down wrote:
> Currently, THP allocation event data is fairly opaque, since it is
> only available system-wide. This patch makes it easier to reason about
> transparent hugepage behaviour on a per-memcg basis.
>
> For anonymous THP-backed pages, we already have MEMCG_RSS_HUGE in v1,
> which is used for v1's rss_huge [sic]. It is reused here because making
> NR_ANON_THPS per-memcg is fairly involved right now: some of that
> accounting is delegated to rmap before we have any memcg actually
> assigned to the page. It's a good idea to rework that, but let's leave
> untangling NR_ANON_THPS for a future patch.
>
> Signed-off-by: Chris Down <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
Looks good to me. It's useful to know if a cgroup is getting the THP
coverage and allocation policy it's asking for.
Acked-by: Johannes Weiner <[email protected]>
The fallback numbers could be useful as well, but they're tricky to
obtain as there isn't an obvious memcg context. We can do them later.
Hi Chris,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v5.0-rc4]
[cannot apply to next-20190131]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Chris-Down/mm-memcontrol-Expose-THP-events-on-a-per-memcg-basis/20190201-022143
config: x86_64-randconfig-j1-01290405 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
All errors (new ones prefixed by >>):
mm/memcontrol.c: In function 'memory_stat_show':
>> mm/memcontrol.c:5625:52: error: 'THP_FAULT_ALLOC' undeclared (first use in this function)
seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
^
mm/memcontrol.c:5625:52: note: each undeclared identifier is reported only once for each function it appears in
mm/memcontrol.c:5627:17: error: 'THP_COLLAPSE_ALLOC' undeclared (first use in this function)
acc.events[THP_COLLAPSE_ALLOC]);
^
vim +/THP_FAULT_ALLOC +5625 mm/memcontrol.c
5541
5542 static int memory_stat_show(struct seq_file *m, void *v)
5543 {
5544 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5545 struct accumulated_stats acc;
5546 int i;
5547
5548 /*
5549 * Provide statistics on the state of the memory subsystem as
5550 * well as cumulative event counters that show past behavior.
5551 *
5552 * This list is ordered following a combination of these gradients:
5553 * 1) generic big picture -> specifics and details
5554 * 2) reflecting userspace activity -> reflecting kernel heuristics
5555 *
5556 * Current memory state:
5557 */
5558
5559 memset(&acc, 0, sizeof(acc));
5560 acc.stats_size = MEMCG_NR_STAT;
5561 acc.events_size = NR_VM_EVENT_ITEMS;
5562 accumulate_memcg_tree(memcg, &acc);
5563
5564 seq_printf(m, "anon %llu\n",
5565 (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5566 seq_printf(m, "file %llu\n",
5567 (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5568 seq_printf(m, "kernel_stack %llu\n",
5569 (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5570 seq_printf(m, "slab %llu\n",
5571 (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5572 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5573 seq_printf(m, "sock %llu\n",
5574 (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5575
5576 seq_printf(m, "shmem %llu\n",
5577 (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5578 seq_printf(m, "file_mapped %llu\n",
5579 (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5580 seq_printf(m, "file_dirty %llu\n",
5581 (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5582 seq_printf(m, "file_writeback %llu\n",
5583 (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5584
5585 /*
5586 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
5587 * with the NR_ANON_THP vm counter, but right now it's a pain in the
5588 * arse because it requires migrating the work out of rmap to a place
5589 * where the page->mem_cgroup is set up and stable.
5590 */
5591 seq_printf(m, "anon_thp %llu\n",
5592 (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
5593
5594 for (i = 0; i < NR_LRU_LISTS; i++)
5595 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5596 (u64)acc.lru_pages[i] * PAGE_SIZE);
5597
5598 seq_printf(m, "slab_reclaimable %llu\n",
5599 (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5600 seq_printf(m, "slab_unreclaimable %llu\n",
5601 (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5602
5603 /* Accumulated memory events */
5604
5605 seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5606 seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5607
5608 seq_printf(m, "workingset_refault %lu\n",
5609 acc.stat[WORKINGSET_REFAULT]);
5610 seq_printf(m, "workingset_activate %lu\n",
5611 acc.stat[WORKINGSET_ACTIVATE]);
5612 seq_printf(m, "workingset_nodereclaim %lu\n",
5613 acc.stat[WORKINGSET_NODERECLAIM]);
5614
5615 seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5616 seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5617 acc.events[PGSCAN_DIRECT]);
5618 seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5619 acc.events[PGSTEAL_DIRECT]);
5620 seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5621 seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5622 seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5623 seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5624
> 5625 seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
5626 seq_printf(m, "thp_collapse_alloc %lu\n",
5627 acc.events[THP_COLLAPSE_ALLOC]);
5628
5629 return 0;
5630 }
5631
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Chris,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v5.0-rc4]
[cannot apply to next-20190131]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Chris-Down/mm-memcontrol-Expose-THP-events-on-a-per-memcg-basis/20190201-022143
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gnu-gcc (Debian 8.2.0-11) 8.2.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.2.0 make.cross ARCH=sh
All errors (new ones prefixed by >>):
mm/memcontrol.c: In function 'memory_stat_show':
>> mm/memcontrol.c:5625:52: error: 'THP_FAULT_ALLOC' undeclared (first use in this function); did you mean 'THP_FILE_ALLOC'?
seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
^~~~~~~~~~~~~~~
THP_FILE_ALLOC
mm/memcontrol.c:5625:52: note: each undeclared identifier is reported only once for each function it appears in
>> mm/memcontrol.c:5627:17: error: 'THP_COLLAPSE_ALLOC' undeclared (first use in this function); did you mean 'THP_FILE_ALLOC'?
acc.events[THP_COLLAPSE_ALLOC]);
^~~~~~~~~~~~~~~~~~
THP_FILE_ALLOC
kbuild test robot writes:
>Thank you for the patch! Yet something to improve:
>
>[auto build test ERROR on linus/master]
>[also build test ERROR on v5.0-rc4]
This was already fixed and is now in linux-next.
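For reference, the robot failures above come from configs without
CONFIG_TRANSPARENT_HUGEPAGE, where the THP_FAULT_ALLOC and
THP_COLLAPSE_ALLOC vmevent items are not defined. A minimal sketch of
the kind of guard that avoids the build error (the actual fix that
landed in linux-next may look different):

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Only emit the THP event counters when THP is built in. */
	seq_printf(m, "thp_fault_alloc %lu\n",
		   acc.events[THP_FAULT_ALLOC]);
	seq_printf(m, "thp_collapse_alloc %lu\n",
		   acc.events[THP_COLLAPSE_ALLOC]);
#endif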