2024-04-12 07:38:06

by Barry Song

Subject: [PATCH v5 0/4] mm: add per-order mTHP alloc and swpout counters

From: Barry Song <[email protected]>

The patchset introduces a framework to facilitate mTHP counters, starting
with the allocation and swap-out counters. Currently, only five new nodes
are appended to the stats directory for each mTHP size:

/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats
anon_fault_alloc
anon_fault_fallback
anon_fault_fallback_charge
anon_swpout
anon_swpout_fallback

These nodes are crucial for us to monitor the fragmentation levels of
both the buddy system and the swap partitions. In the future, we may
consider adding additional nodes for further insights.
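
For reference, a minimal userspace sketch for sampling one of these counters
(the 64kB size and the chosen counter below are only examples and assume that
mTHP size is enabled on the running kernel):

#include <stdio.h>

int main(void)
{
	/* Example path; substitute the mTHP size and counter of interest. */
	const char *path =
		"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/anon_fault_alloc";
	unsigned long long val;
	FILE *f = fopen(path, "r");

	if (!f || fscanf(f, "%llu", &val) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("anon_fault_alloc: %llu\n", val);
	return 0;
}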

-v5:
* rename anon_alloc to anon_fault_alloc, Barry/Ryan;
* add anon_fault_fallback_charge, Ryan;
* move to dynamic alloc_percpu as powerpc's PMD_ORDER is not const,
kernel test robot;
* make anon_fault_alloc and anon_fault_fallback more consistent
with thp_fault_alloc and thp_fault_fallback, Ryan;
* handle cpu hotplug properly, Ryan;
* add docs for new sysfs nodes and ABI, Andrew.

-v4:
* Many thanks to David and Ryan for your patience and valuable insights
throughout the numerous renaming efforts!
* Guard the case order > PMD_ORDER in count func rather than in callers,
Ryan;
* Add swpout counters;
* Add a helper DEFINE_MTHP_STAT_ATTR to avoid code duplication for various
counters;
link:
https://lore.kernel.org/linux-mm/[email protected]/

-v3:
https://lore.kernel.org/linux-mm/[email protected]/

Barry Song (4):
mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters
mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters
mm: add docs for per-order mTHP counters and transhuge_page ABI
mm: correct the docs for thp_fault_alloc and thp_fault_fallback

.../sys-kernel-mm-transparent-hugepage | 17 +++++
Documentation/admin-guide/mm/transhuge.rst | 32 ++++++++-
include/linux/huge_mm.h | 53 +++++++++++++++
mm/huge_memory.c | 65 +++++++++++++++++++
mm/memory.c | 3 +
mm/page_alloc.c | 4 ++
mm/page_io.c | 1 +
mm/vmscan.c | 3 +
8 files changed, 176 insertions(+), 2 deletions(-)
create mode 100644 Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage

--
2.34.1



2024-04-12 07:38:16

by Barry Song

Subject: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

From: Barry Song <[email protected]>

Profiling a system blindly with mTHP has become challenging due to the
lack of visibility into its operations. Presenting the success rate of
mTHP allocations appears to be a pressing need.

Recently, I've been experiencing significant difficulty debugging
performance improvements and regressions without these figures. It's
crucial for us to understand the true effectiveness of mTHP in real-world
scenarios, especially in systems with fragmented memory.

This patch establishes the framework for per-order mTHP
counters. It begins by introducing the anon_fault_alloc and
anon_fault_fallback counters. Additionally, to maintain consistency
with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
Incorporating additional counters should now be straightforward as well.
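
As an illustration, wiring up one more counter is then only a few lines (a
rough sketch, mirroring what the swpout patch later in this series does):

/* 1. Add the new item to enum mthp_stat_item, before __MTHP_STAT_COUNT: */
	MTHP_STAT_ANON_SWPOUT,

/* 2. Define and register its sysfs attribute in mm/huge_memory.c: */
DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
/* ... plus an &anon_swpout_attr.attr entry in stats_attrs[]. */

/* 3. Bump the counter at the relevant call site: */
count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT);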

Signed-off-by: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Domenico Cerasuolo <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Cc: Yu Zhao <[email protected]>
---
include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++
mm/memory.c | 3 ++
mm/page_alloc.c | 4 +++
4 files changed, 119 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f6..c5beb54b97cb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
enforce_sysfs, orders);
}

+enum mthp_stat_item {
+ MTHP_STAT_ANON_FAULT_ALLOC,
+ MTHP_STAT_ANON_FAULT_FALLBACK,
+ MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ __MTHP_STAT_COUNT
+};
+
+struct mthp_stat {
+ unsigned long stats[0][__MTHP_STAT_COUNT];
+};
+
+extern struct mthp_stat __percpu *mthp_stats;
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+ if (order <= 0 || order > PMD_ORDER || !mthp_stats)
+ return;
+
+ this_cpu_inc(mthp_stats->stats[order][item]);
+}
+
+static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
+{
+ if (order <= 0 || order > PMD_ORDER || !mthp_stats)
+ return;
+
+ this_cpu_add(mthp_stats->stats[order][item], delta);
+}
+
+/*
+ * Fold the foreign cpu mthp stats into our own.
+ *
+ * This is adding to the stats on one processor
+ * but keeps the global counts constant.
+ */
+static inline void mthp_stats_fold_cpu(int cpu)
+{
+ struct mthp_stat *fold_stat;
+ int i, j;
+
+ if (!mthp_stats)
+ return;
+ fold_stat = per_cpu_ptr(mthp_stats, cpu);
+ for (i = 1; i <= PMD_ORDER; i++) {
+ for (j = 0; j < __MTHP_STAT_COUNT; j++) {
+ count_mthp_stats(i, j, fold_stat->stats[i][j]);
+ fold_stat->stats[i][j] = 0;
+ }
+ }
+}
+
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dc30139590e6..21c4ac74b484 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
};

+struct mthp_stat __percpu *mthp_stats;
+
+static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
+{
+ unsigned long sum = 0;
+ int cpu;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
+
+ sum += this->stats[order][item];
+ }
+ cpus_read_unlock();
+
+ return sum;
+}
+
+#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ int order = to_thpsize(kobj)->order; \
+ \
+ return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
+} \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+
+static struct attribute *stats_attrs[] = {
+ &anon_fault_alloc_attr.attr,
+ &anon_fault_fallback_attr.attr,
+ &anon_fault_fallback_charge_attr.attr,
+ NULL,
+};
+
+static struct attribute_group stats_attr_group = {
+ .name = "stats",
+ .attrs = stats_attrs,
+};
+
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
unsigned long size = (PAGE_SIZE << order) / SZ_1K;
@@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
return ERR_PTR(ret);
}

+ ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
+ if (ret) {
+ kobject_put(&thpsize->kobj);
+ return ERR_PTR(ret);
+ }
+
thpsize->order = order;
return thpsize;
}
@@ -691,6 +741,11 @@ static int __init hugepage_init(void)
*/
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

+ mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
+ sizeof(unsigned long));
+ if (!mthp_stats)
+ return -ENOMEM;
+
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
goto err_sysfs;
@@ -725,6 +780,8 @@ static int __init hugepage_init(void)
err_slab:
hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
+ free_percpu(mthp_stats);
+ mthp_stats = NULL;
return err;
}
subsys_initcall(hugepage_init);
@@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
folio_throttle_swaprate(folio, gfp);
@@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}

@@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
if (unlikely(!folio)) {
count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
diff --git a/mm/memory.c b/mm/memory.c
index 649a547fe8e3..06048af7cf9a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
folio = vma_alloc_folio(gfp, order, vma, addr, true);
if (folio) {
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
folio_put(folio);
goto next;
}
@@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
return folio;
}
next:
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
order = next_order(&orders, order);
}

@@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)

folio_ref_add(folio, nr_pages - 1);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
folio_add_new_anon_rmap(folio, vma, addr);
folio_add_lru_vma(folio, vma);
setpte:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b51becf03d1e..3135b5ca2457 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
*/
vm_events_fold_cpu(cpu);

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ mthp_stats_fold_cpu(cpu);
+#endif
+
/*
* Zero the differential counters of the dead processor
* so that the vm statistics are consistent.
--
2.34.1


2024-04-12 07:38:40

by Barry Song

Subject: [PATCH v5 3/4] mm: add docs for per-order mTHP counters and transhuge_page ABI

From: Barry Song <[email protected]>

This patch includes documentation for mTHP counters and an ABI file
for sys-kernel-mm-transparent-hugepage, which appears to have been
missing for some time.

Signed-off-by: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Domenico Cerasuolo <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Cc: Yu Zhao <[email protected]>
Cc: Jonathan Corbet <[email protected]>
---
.../sys-kernel-mm-transparent-hugepage | 17 +++++++++++
Documentation/admin-guide/mm/transhuge.rst | 28 +++++++++++++++++++
2 files changed, 45 insertions(+)
create mode 100644 Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage

diff --git a/Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage b/Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage
new file mode 100644
index 000000000000..80dde0fd576c
--- /dev/null
+++ b/Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage
@@ -0,0 +1,17 @@
+What: /sys/kernel/mm/transparent_hugepage/
+Date: April 2024
+Contact: Barry Song <[email protected]>
+Description:
+ /sys/kernel/mm/transparent_hugepage/ contains a number of files and
+ subdirectories,
+ - defrag
+ - enabled
+ - hpage_pmd_size
+ - khugepaged
+ - shmem_enabled
+ - use_zero_page
+ - subdirectories of the form hugepages-<size>kB, where <size>
+ is the page size of the hugepages supported by the kernel/CPU
+ combination.
+
+ See Documentation/admin-guide/mm/transhuge.rst for details.
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 04eb45a2f940..f436ff982f22 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -447,6 +447,34 @@ thp_swpout_fallback
Usually because failed to allocate some continuous swap space
for the huge page.

+In /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats, there are
+also individual counters for each huge page size, which can be utilized to
+monitor the system's effectiveness in providing huge pages for use. Each
+counter has its own corresponding file.
+
+anon_fault_alloc
+ is incremented every time a huge page is successfully
+ allocated and charged to handle a page fault.
+
+anon_fault_fallback
+ is incremented if a page fault fails to allocate or charge
+ a huge page and instead falls back to using huge pages with
+ lower orders or small pages.
+
+anon_fault_fallback_charge
+ is incremented if a page fault fails to charge a huge page and
+ instead falls back to using huge pages with lower orders or
+ small pages even though the allocation was successful.
+
+anon_swpout
+ is incremented every time a huge page is swapped out in one
+ piece without splitting.
+
+anon_swpout_fallback
+ is incremented if a huge page has to be split before swapout.
+ Usually because the kernel failed to allocate some contiguous
+ swap space for the huge page.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in ``/proc/vmstat`` to help
--
2.34.1


2024-04-12 07:38:52

by Barry Song

Subject: [PATCH v5 4/4] mm: correct the docs for thp_fault_alloc and thp_fault_fallback

From: Barry Song <[email protected]>

The documentation does not align with the code. In
__do_huge_pmd_anonymous_page(), THP_FAULT_FALLBACK is incremented when
mem_cgroup_charge() fails, despite the allocation succeeding, whereas
THP_FAULT_ALLOC is only incremented after a successful charge.

Signed-off-by: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Domenico Cerasuolo <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Cc: Yu Zhao <[email protected]>
Cc: Jonathan Corbet <[email protected]>
---
Documentation/admin-guide/mm/transhuge.rst | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index f436ff982f22..98e3a99ea780 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -369,7 +369,7 @@ monitor how successfully the system is providing huge pages for use.

thp_fault_alloc
is incremented every time a huge page is successfully
- allocated to handle a page fault.
+ allocated and charged to handle a page fault.

thp_collapse_alloc
is incremented by khugepaged when it has found
@@ -377,7 +377,7 @@ thp_collapse_alloc
successfully allocated a new huge page to store the data.

thp_fault_fallback
- is incremented if a page fault fails to allocate
+ is incremented if a page fault fails to allocate or charge
a huge page and instead falls back to using small pages.

thp_fault_fallback_charge
--
2.34.1


2024-04-12 07:47:50

by Barry Song

Subject: [PATCH v5 2/4] mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters

From: Barry Song <[email protected]>

This helps to display the fragmentation situation of the swapfile: we can
see what proportion of large folios have been swapped out without being
split. So far, we only support non-split swapout for anon memory, with the
possibility of expanding to shmem in the future. So, we add the "anon"
prefix to the counter names.
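
For example, the proportion of large folios of a given size that had to be
split before swap-out can be derived from the two counters as:

    anon_swpout_fallback / (anon_swpout + anon_swpout_fallback)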

Signed-off-by: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Domenico Cerasuolo <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Cc: Yu Zhao <[email protected]>
---
include/linux/huge_mm.h | 2 ++
mm/huge_memory.c | 4 ++++
mm/page_io.c | 1 +
mm/vmscan.c | 3 +++
4 files changed, 10 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c5beb54b97cb..b69c3b3e1436 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -268,6 +268,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_ALLOC,
MTHP_STAT_ANON_FAULT_FALLBACK,
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_ANON_SWPOUT,
+ MTHP_STAT_ANON_SWPOUT_FALLBACK,
__MTHP_STAT_COUNT
};

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 21c4ac74b484..13e74724d0c3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -557,11 +557,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
+DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);

static struct attribute *stats_attrs[] = {
&anon_fault_alloc_attr.attr,
&anon_fault_fallback_attr.attr,
&anon_fault_fallback_charge_attr.attr,
+ &anon_swpout_attr.attr,
+ &anon_swpout_fallback_attr.attr,
NULL,
};

diff --git a/mm/page_io.c b/mm/page_io.c
index a9a7c236aecc..46c603dddf04 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -217,6 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
count_memcg_folio_events(folio, THP_SWPOUT, 1);
count_vm_event(THP_SWPOUT);
}
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT);
#endif
count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bca2d9981c95..49bd94423961 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1231,6 +1231,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
goto activate_locked;
}
if (!add_to_swap(folio)) {
+ int __maybe_unused order = folio_order(folio);
+
if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
@@ -1242,6 +1244,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
}
+ count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(folio))
goto activate_locked_split;
--
2.34.1


2024-04-12 09:27:10

by Ryan Roberts

Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

Hi Barry,

2 remaining comments - otherwise looks good. (same comments I just made in the
v4 conversation).

On 12/04/2024 08:37, Barry Song wrote:
> From: Barry Song <[email protected]>
>
> Profiling a system blindly with mTHP has become challenging due to the
> lack of visibility into its operations. Presenting the success rate of
> mTHP allocations appears to be pressing need.
>
> Recently, I've been experiencing significant difficulty debugging
> performance improvements and regressions without these figures. It's
> crucial for us to understand the true effectiveness of mTHP in real-world
> scenarios, especially in systems with fragmented memory.
>
> This patch establishes the framework for per-order mTHP
> counters. It begins by introducing the anon_fault_alloc and
> anon_fault_fallback counters. Additionally, to maintain consistency
> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> Incorporating additional counters should now be straightforward as well.
>
> Signed-off-by: Barry Song <[email protected]>
> Cc: Chris Li <[email protected]>
> Cc: David Hildenbrand <[email protected]>
> Cc: Domenico Cerasuolo <[email protected]>
> Cc: Kairui Song <[email protected]>
> Cc: Matthew Wilcox (Oracle) <[email protected]>
> Cc: Peter Xu <[email protected]>
> Cc: Ryan Roberts <[email protected]>
> Cc: Suren Baghdasaryan <[email protected]>
> Cc: Yosry Ahmed <[email protected]>
> Cc: Yu Zhao <[email protected]>
> ---
> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++
> mm/memory.c | 3 ++
> mm/page_alloc.c | 4 +++
> 4 files changed, 119 insertions(+)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e896ca4760f6..c5beb54b97cb 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> enforce_sysfs, orders);
> }
>
> +enum mthp_stat_item {
> + MTHP_STAT_ANON_FAULT_ALLOC,
> + MTHP_STAT_ANON_FAULT_FALLBACK,
> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> + __MTHP_STAT_COUNT
> +};
> +
> +struct mthp_stat {
> + unsigned long stats[0][__MTHP_STAT_COUNT];
> +};
> +
> +extern struct mthp_stat __percpu *mthp_stats;
> +
> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> +{
> + if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> + return;
> +
> + this_cpu_inc(mthp_stats->stats[order][item]);
> +}
> +
> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> +{
> + if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> + return;
> +
> + this_cpu_add(mthp_stats->stats[order][item], delta);
> +}
> +
> +/*
> + * Fold the foreign cpu mthp stats into our own.
> + *
> + * This is adding to the stats on one processor
> + * but keeps the global counts constant.
> + */
> +static inline void mthp_stats_fold_cpu(int cpu)
> +{
> + struct mthp_stat *fold_stat;
> + int i, j;
> +
> + if (!mthp_stats)
> + return;
> + fold_stat = per_cpu_ptr(mthp_stats, cpu);
> + for (i = 1; i <= PMD_ORDER; i++) {
> + for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> + count_mthp_stats(i, j, fold_stat->stats[i][j]);
> + fold_stat->stats[i][j] = 0;
> + }
> + }
> +}

This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
cpus should work.
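
A sketch of that alternative (untested); it would also let you drop the
cpus_read_lock() and the hotplug folding entirely:

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	/* Dead CPUs keep their per-cpu counts, so no folding is needed. */
	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}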

> +
> #define transparent_hugepage_use_zero_page() \
> (transparent_hugepage_flags & \
> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index dc30139590e6..21c4ac74b484 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> .sysfs_ops = &kobj_sysfs_ops,
> };
>
> +struct mthp_stat __percpu *mthp_stats;
> +
> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> +{
> + unsigned long sum = 0;
> + int cpu;
> +
> + cpus_read_lock();
> + for_each_online_cpu(cpu) {
> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> +
> + sum += this->stats[order][item];
> + }
> + cpus_read_unlock();
> +
> + return sum;
> +}
> +
> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
> +static ssize_t _name##_show(struct kobject *kobj, \
> + struct kobj_attribute *attr, char *buf) \
> +{ \
> + int order = to_thpsize(kobj)->order; \
> + \
> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
> +} \
> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> +
> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +
> +static struct attribute *stats_attrs[] = {
> + &anon_fault_alloc_attr.attr,
> + &anon_fault_fallback_attr.attr,
> + &anon_fault_fallback_charge_attr.attr,
> + NULL,
> +};
> +
> +static struct attribute_group stats_attr_group = {
> + .name = "stats",
> + .attrs = stats_attrs,
> +};
> +
> static struct thpsize *thpsize_create(int order, struct kobject *parent)
> {
> unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> return ERR_PTR(ret);
> }
>
> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> + if (ret) {
> + kobject_put(&thpsize->kobj);
> + return ERR_PTR(ret);
> + }
> +
> thpsize->order = order;
> return thpsize;
> }
> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> */
> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>
> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> + sizeof(unsigned long));

Personally I think it would be cleaner to allocate statically using
ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
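
A sketch of that alternative (untested):

/* Size the array at build time; no __alloc_percpu() and no NULL checks. */
struct mthp_stat {
	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats);

static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
	if (order <= 0 || order > PMD_ORDER)
		return;

	this_cpu_inc(mthp_stats.stats[order][item]);
}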

> + if (!mthp_stats)
> + return -ENOMEM;
> +
> err = hugepage_init_sysfs(&hugepage_kobj);
> if (err)
> goto err_sysfs;
> @@ -725,6 +780,8 @@ static int __init hugepage_init(void)
> err_slab:
> hugepage_exit_sysfs(hugepage_kobj);
> err_sysfs:
> + free_percpu(mthp_stats);
> + mthp_stats = NULL;
> return err;
> }
> subsys_initcall(hugepage_init);
> @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> folio_put(folio);
> count_vm_event(THP_FAULT_FALLBACK);
> count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> return VM_FAULT_FALLBACK;
> }
> folio_throttle_swaprate(folio, gfp);
> @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> mm_inc_nr_ptes(vma->vm_mm);
> spin_unlock(vmf->ptl);
> count_vm_event(THP_FAULT_ALLOC);
> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> }
>
> @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
> if (unlikely(!folio)) {
> count_vm_event(THP_FAULT_FALLBACK);
> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> return VM_FAULT_FALLBACK;
> }
> return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> diff --git a/mm/memory.c b/mm/memory.c
> index 649a547fe8e3..06048af7cf9a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> folio = vma_alloc_folio(gfp, order, vma, addr, true);
> if (folio) {
> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> folio_put(folio);
> goto next;
> }
> @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> return folio;
> }
> next:
> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> order = next_order(&orders, order);
> }
>
> @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>
> folio_ref_add(folio, nr_pages - 1);
> add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
> folio_add_new_anon_rmap(folio, vma, addr);
> folio_add_lru_vma(folio, vma);
> setpte:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b51becf03d1e..3135b5ca2457 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
> */
> vm_events_fold_cpu(cpu);
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + mthp_stats_fold_cpu(cpu);
> +#endif
> +
> /*
> * Zero the differential counters of the dead processor
> * so that the vm statistics are consistent.


2024-04-12 09:43:44

by Ryan Roberts

Subject: Re: [PATCH v5 2/4] mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters

On 12/04/2024 08:37, Barry Song wrote:
> From: Barry Song <[email protected]>
>
> This helps to display the fragmentation situation of the swapfile, knowing
> the proportion of how much we haven't split large folios. So far, we only
> support non-split swapout for anon memory, with the possibility of
> expanding to shmem in the future. So, we add the "anon" prefix to the
> counter names.
>
> Signed-off-by: Barry Song <[email protected]>
> Cc: Chris Li <[email protected]>
> Cc: David Hildenbrand <[email protected]>
> Cc: Domenico Cerasuolo <[email protected]>
> Cc: Kairui Song <[email protected]>
> Cc: Matthew Wilcox (Oracle) <[email protected]>
> Cc: Peter Xu <[email protected]>
> Cc: Ryan Roberts <[email protected]>
> Cc: Suren Baghdasaryan <[email protected]>
> Cc: Yosry Ahmed <[email protected]>
> Cc: Yu Zhao <[email protected]>


LGTM!

Reviewed-by: Ryan Roberts <[email protected]>


> ---
> include/linux/huge_mm.h | 2 ++
> mm/huge_memory.c | 4 ++++
> mm/page_io.c | 1 +
> mm/vmscan.c | 3 +++
> 4 files changed, 10 insertions(+)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index c5beb54b97cb..b69c3b3e1436 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -268,6 +268,8 @@ enum mthp_stat_item {
> MTHP_STAT_ANON_FAULT_ALLOC,
> MTHP_STAT_ANON_FAULT_FALLBACK,
> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> + MTHP_STAT_ANON_SWPOUT,
> + MTHP_STAT_ANON_SWPOUT_FALLBACK,
> __MTHP_STAT_COUNT
> };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 21c4ac74b484..13e74724d0c3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -557,11 +557,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
> +DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
>
> static struct attribute *stats_attrs[] = {
> &anon_fault_alloc_attr.attr,
> &anon_fault_fallback_attr.attr,
> &anon_fault_fallback_charge_attr.attr,
> + &anon_swpout_attr.attr,
> + &anon_swpout_fallback_attr.attr,
> NULL,
> };
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index a9a7c236aecc..46c603dddf04 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -217,6 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
> count_memcg_folio_events(folio, THP_SWPOUT, 1);
> count_vm_event(THP_SWPOUT);
> }
> + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT);
> #endif
> count_vm_events(PSWPOUT, folio_nr_pages(folio));
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index bca2d9981c95..49bd94423961 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1231,6 +1231,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> goto activate_locked;
> }
> if (!add_to_swap(folio)) {
> + int __maybe_unused order = folio_order(folio);
> +
> if (!folio_test_large(folio))
> goto activate_locked_split;
> /* Fallback to swap normal pages */
> @@ -1242,6 +1244,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> THP_SWPOUT_FALLBACK, 1);
> count_vm_event(THP_SWPOUT_FALLBACK);
> }
> + count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK);
> #endif
> if (!add_to_swap(folio))
> goto activate_locked_split;


2024-04-12 09:43:57

by Barry Song

Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <[email protected]> wrote:
>
> Hi Barry,
>
> 2 remaining comments - otherwise looks good. (same comments I just made in the
> v4 conversation).
>
> On 12/04/2024 08:37, Barry Song wrote:
> > From: Barry Song <[email protected]>
> >
> > Profiling a system blindly with mTHP has become challenging due to the
> > lack of visibility into its operations. Presenting the success rate of
> > mTHP allocations appears to be pressing need.
> >
> > Recently, I've been experiencing significant difficulty debugging
> > performance improvements and regressions without these figures. It's
> > crucial for us to understand the true effectiveness of mTHP in real-world
> > scenarios, especially in systems with fragmented memory.
> >
> > This patch establishes the framework for per-order mTHP
> > counters. It begins by introducing the anon_fault_alloc and
> > anon_fault_fallback counters. Additionally, to maintain consistency
> > with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> > anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> > Incorporating additional counters should now be straightforward as well.
> >
> > Signed-off-by: Barry Song <[email protected]>
> > Cc: Chris Li <[email protected]>
> > Cc: David Hildenbrand <[email protected]>
> > Cc: Domenico Cerasuolo <[email protected]>
> > Cc: Kairui Song <[email protected]>
> > Cc: Matthew Wilcox (Oracle) <[email protected]>
> > Cc: Peter Xu <[email protected]>
> > Cc: Ryan Roberts <[email protected]>
> > Cc: Suren Baghdasaryan <[email protected]>
> > Cc: Yosry Ahmed <[email protected]>
> > Cc: Yu Zhao <[email protected]>
> > ---
> > include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> > mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++
> > mm/memory.c | 3 ++
> > mm/page_alloc.c | 4 +++
> > 4 files changed, 119 insertions(+)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index e896ca4760f6..c5beb54b97cb 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> > enforce_sysfs, orders);
> > }
> >
> > +enum mthp_stat_item {
> > + MTHP_STAT_ANON_FAULT_ALLOC,
> > + MTHP_STAT_ANON_FAULT_FALLBACK,
> > + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> > + __MTHP_STAT_COUNT
> > +};
> > +
> > +struct mthp_stat {
> > + unsigned long stats[0][__MTHP_STAT_COUNT];
> > +};
> > +
> > +extern struct mthp_stat __percpu *mthp_stats;
> > +
> > +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> > +{
> > + if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> > + return;
> > +
> > + this_cpu_inc(mthp_stats->stats[order][item]);
> > +}
> > +
> > +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> > +{
> > + if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> > + return;
> > +
> > + this_cpu_add(mthp_stats->stats[order][item], delta);
> > +}
> > +
> > +/*
> > + * Fold the foreign cpu mthp stats into our own.
> > + *
> > + * This is adding to the stats on one processor
> > + * but keeps the global counts constant.
> > + */
> > +static inline void mthp_stats_fold_cpu(int cpu)
> > +{
> > + struct mthp_stat *fold_stat;
> > + int i, j;
> > +
> > + if (!mthp_stats)
> > + return;
> > + fold_stat = per_cpu_ptr(mthp_stats, cpu);
> > + for (i = 1; i <= PMD_ORDER; i++) {
> > + for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> > + count_mthp_stats(i, j, fold_stat->stats[i][j]);
> > + fold_stat->stats[i][j] = 0;
> > + }
> > + }
> > +}
>
> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
> cpus should work.
>
> > +
> > #define transparent_hugepage_use_zero_page() \
> > (transparent_hugepage_flags & \
> > (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index dc30139590e6..21c4ac74b484 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> > .sysfs_ops = &kobj_sysfs_ops,
> > };
> >
> > +struct mthp_stat __percpu *mthp_stats;
> > +
> > +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> > +{
> > + unsigned long sum = 0;
> > + int cpu;
> > +
> > + cpus_read_lock();
> > + for_each_online_cpu(cpu) {
> > + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> > +
> > + sum += this->stats[order][item];
> > + }
> > + cpus_read_unlock();
> > +
> > + return sum;
> > +}
> > +
> > +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
> > +static ssize_t _name##_show(struct kobject *kobj, \
> > + struct kobj_attribute *attr, char *buf) \
> > +{ \
> > + int order = to_thpsize(kobj)->order; \
> > + \
> > + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
> > +} \
> > +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> > +
> > +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> > +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > +
> > +static struct attribute *stats_attrs[] = {
> > + &anon_fault_alloc_attr.attr,
> > + &anon_fault_fallback_attr.attr,
> > + &anon_fault_fallback_charge_attr.attr,
> > + NULL,
> > +};
> > +
> > +static struct attribute_group stats_attr_group = {
> > + .name = "stats",
> > + .attrs = stats_attrs,
> > +};
> > +
> > static struct thpsize *thpsize_create(int order, struct kobject *parent)
> > {
> > unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> > @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> > return ERR_PTR(ret);
> > }
> >
> > + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> > + if (ret) {
> > + kobject_put(&thpsize->kobj);
> > + return ERR_PTR(ret);
> > + }
> > +
> > thpsize->order = order;
> > return thpsize;
> > }
> > @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> > */
> > MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> >
> > + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> > + sizeof(unsigned long));
>
> Personally I think it would be cleaner to allocate statically using
> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.

Hi Ryan,

I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,

#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)

#define MAX_PTRS_PER_PTE PTRS_PER_PTE

#define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3))

When PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?


Am I missing something?

>
> > + if (!mthp_stats)
> > + return -ENOMEM;
> > +
> > err = hugepage_init_sysfs(&hugepage_kobj);
> > if (err)
> > goto err_sysfs;
> > @@ -725,6 +780,8 @@ static int __init hugepage_init(void)
> > err_slab:
> > hugepage_exit_sysfs(hugepage_kobj);
> > err_sysfs:
> > + free_percpu(mthp_stats);
> > + mthp_stats = NULL;
> > return err;
> > }
> > subsys_initcall(hugepage_init);
> > @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> > folio_put(folio);
> > count_vm_event(THP_FAULT_FALLBACK);
> > count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > return VM_FAULT_FALLBACK;
> > }
> > folio_throttle_swaprate(folio, gfp);
> > @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> > mm_inc_nr_ptes(vma->vm_mm);
> > spin_unlock(vmf->ptl);
> > count_vm_event(THP_FAULT_ALLOC);
> > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> > count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> > }
> >
> > @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> > folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
> > if (unlikely(!folio)) {
> > count_vm_event(THP_FAULT_FALLBACK);
> > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> > return VM_FAULT_FALLBACK;
> > }
> > return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 649a547fe8e3..06048af7cf9a 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> > folio = vma_alloc_folio(gfp, order, vma, addr, true);
> > if (folio) {
> > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
> > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> > folio_put(folio);
> > goto next;
> > }
> > @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> > return folio;
> > }
> > next:
> > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> > order = next_order(&orders, order);
> > }
> >
> > @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >
> > folio_ref_add(folio, nr_pages - 1);
> > add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> > + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
> > folio_add_new_anon_rmap(folio, vma, addr);
> > folio_add_lru_vma(folio, vma);
> > setpte:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index b51becf03d1e..3135b5ca2457 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
> > */
> > vm_events_fold_cpu(cpu);
> >
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > + mthp_stats_fold_cpu(cpu);
> > +#endif
> > +
> > /*
> > * Zero the differential counters of the dead processor
> > * so that the vm statistics are consistent.
>

2024-04-12 09:57:30

by Ryan Roberts

Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On 12/04/2024 10:43, Barry Song wrote:
> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <[email protected]> wrote:
>>
>> Hi Barry,
>>
>> 2 remaining comments - otherwise looks good. (same comments I just made in the
>> v4 conversation).
>>
>> On 12/04/2024 08:37, Barry Song wrote:
>>> From: Barry Song <[email protected]>
>>>
>>> Profiling a system blindly with mTHP has become challenging due to the
>>> lack of visibility into its operations. Presenting the success rate of
>>> mTHP allocations appears to be pressing need.
>>>
>>> Recently, I've been experiencing significant difficulty debugging
>>> performance improvements and regressions without these figures. It's
>>> crucial for us to understand the true effectiveness of mTHP in real-world
>>> scenarios, especially in systems with fragmented memory.
>>>
>>> This patch establishes the framework for per-order mTHP
>>> counters. It begins by introducing the anon_fault_alloc and
>>> anon_fault_fallback counters. Additionally, to maintain consistency
>>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
>>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
>>> Incorporating additional counters should now be straightforward as well.
>>>
>>> Signed-off-by: Barry Song <[email protected]>
>>> Cc: Chris Li <[email protected]>
>>> Cc: David Hildenbrand <[email protected]>
>>> Cc: Domenico Cerasuolo <[email protected]>
>>> Cc: Kairui Song <[email protected]>
>>> Cc: Matthew Wilcox (Oracle) <[email protected]>
>>> Cc: Peter Xu <[email protected]>
>>> Cc: Ryan Roberts <[email protected]>
>>> Cc: Suren Baghdasaryan <[email protected]>
>>> Cc: Yosry Ahmed <[email protected]>
>>> Cc: Yu Zhao <[email protected]>
>>> ---
>>> include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
>>> mm/huge_memory.c | 61 +++++++++++++++++++++++++++++++++++++++++
>>> mm/memory.c | 3 ++
>>> mm/page_alloc.c | 4 +++
>>> 4 files changed, 119 insertions(+)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e896ca4760f6..c5beb54b97cb 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>>> enforce_sysfs, orders);
>>> }
>>>
>>> +enum mthp_stat_item {
>>> + MTHP_STAT_ANON_FAULT_ALLOC,
>>> + MTHP_STAT_ANON_FAULT_FALLBACK,
>>> + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>>> + __MTHP_STAT_COUNT
>>> +};
>>> +
>>> +struct mthp_stat {
>>> + unsigned long stats[0][__MTHP_STAT_COUNT];
>>> +};
>>> +
>>> +extern struct mthp_stat __percpu *mthp_stats;
>>> +
>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>> +{
>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>> + return;
>>> +
>>> + this_cpu_inc(mthp_stats->stats[order][item]);
>>> +}
>>> +
>>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
>>> +{
>>> + if (order <= 0 || order > PMD_ORDER || !mthp_stats)
>>> + return;
>>> +
>>> + this_cpu_add(mthp_stats->stats[order][item], delta);
>>> +}
>>> +
>>> +/*
>>> + * Fold the foreign cpu mthp stats into our own.
>>> + *
>>> + * This is adding to the stats on one processor
>>> + * but keeps the global counts constant.
>>> + */
>>> +static inline void mthp_stats_fold_cpu(int cpu)
>>> +{
>>> + struct mthp_stat *fold_stat;
>>> + int i, j;
>>> +
>>> + if (!mthp_stats)
>>> + return;
>>> + fold_stat = per_cpu_ptr(mthp_stats, cpu);
>>> + for (i = 1; i <= PMD_ORDER; i++) {
>>> + for (j = 0; j < __MTHP_STAT_COUNT; j++) {
>>> + count_mthp_stats(i, j, fold_stat->stats[i][j]);
>>> + fold_stat->stats[i][j] = 0;
>>> + }
>>> + }
>>> +}
>>
>> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
>> cpus should work.
>>
>>> +
>>> #define transparent_hugepage_use_zero_page() \
>>> (transparent_hugepage_flags & \
>>> (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index dc30139590e6..21c4ac74b484 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
>>> .sysfs_ops = &kobj_sysfs_ops,
>>> };
>>>
>>> +struct mthp_stat __percpu *mthp_stats;
>>> +
>>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
>>> +{
>>> + unsigned long sum = 0;
>>> + int cpu;
>>> +
>>> + cpus_read_lock();
>>> + for_each_online_cpu(cpu) {
>>> + struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
>>> +
>>> + sum += this->stats[order][item];
>>> + }
>>> + cpus_read_unlock();
>>> +
>>> + return sum;
>>> +}
>>> +
>>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
>>> +static ssize_t _name##_show(struct kobject *kobj, \
>>> + struct kobj_attribute *attr, char *buf) \
>>> +{ \
>>> + int order = to_thpsize(kobj)->order; \
>>> + \
>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
>>> +} \
>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>> +
>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>> +
>>> +static struct attribute *stats_attrs[] = {
>>> + &anon_fault_alloc_attr.attr,
>>> + &anon_fault_fallback_attr.attr,
>>> + &anon_fault_fallback_charge_attr.attr,
>>> + NULL,
>>> +};
>>> +
>>> +static struct attribute_group stats_attr_group = {
>>> + .name = "stats",
>>> + .attrs = stats_attrs,
>>> +};
>>> +
>>> static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>> {
>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K;
>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>> return ERR_PTR(ret);
>>> }
>>>
>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
>>> + if (ret) {
>>> + kobject_put(&thpsize->kobj);
>>> + return ERR_PTR(ret);
>>> + }
>>> +
>>> thpsize->order = order;
>>> return thpsize;
>>> }
>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>>> */
>>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>>>
>>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
>>> + sizeof(unsigned long));
>>
>> Personally I think it would be cleaner to allocate statically using
>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
>
> Hi Ryan,
>
> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
>
> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
>
> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
>
> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3))
>
> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
>
>
> Am I missing something?

PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 it's as follows:

PAGE_SIZE    PAGE_SHIFT    PTRS_PER_PTE
4K           12            512
16K          14            2048
64K          16            8192

So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE

PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)

MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have
(and it's equal to PTRS_PER_PTE, except on powerpc).
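
Working the 64K case through: PAGE_SHIFT = 16 and PMD_SHIFT = 29, so
PMD_ORDER = 29 - 16 = 13, while PTRS_PER_PTE = 1 << (16 - 3) = 8192 = 2^13,
i.e. ilog2(PTRS_PER_PTE) = 13 = PMD_ORDER.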

Pretty sure the math is correct?

>
>>
>>> + if (!mthp_stats)
>>> + return -ENOMEM;
>>> +
>>> err = hugepage_init_sysfs(&hugepage_kobj);
>>> if (err)
>>> goto err_sysfs;
>>> @@ -725,6 +780,8 @@ static int __init hugepage_init(void)
>>> err_slab:
>>> hugepage_exit_sysfs(hugepage_kobj);
>>> err_sysfs:
>>> + free_percpu(mthp_stats);
>>> + mthp_stats = NULL;
>>> return err;
>>> }
>>> subsys_initcall(hugepage_init);
>>> @@ -880,6 +937,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>> folio_put(folio);
>>> count_vm_event(THP_FAULT_FALLBACK);
>>> count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>> return VM_FAULT_FALLBACK;
>>> }
>>> folio_throttle_swaprate(folio, gfp);
>>> @@ -929,6 +988,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>> mm_inc_nr_ptes(vma->vm_mm);
>>> spin_unlock(vmf->ptl);
>>> count_vm_event(THP_FAULT_ALLOC);
>>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>>> count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>> }
>>>
>>> @@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>> folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>>> if (unlikely(!folio)) {
>>> count_vm_event(THP_FAULT_FALLBACK);
>>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> return VM_FAULT_FALLBACK;
>>> }
>>> return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 649a547fe8e3..06048af7cf9a 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>> folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>> if (folio) {
>>> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>> folio_put(folio);
>>> goto next;
>>> }
>>> @@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>> return folio;
>>> }
>>> next:
>>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>>> order = next_order(&orders, order);
>>> }
>>>
>>> @@ -4485,6 +4487,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>
>>> folio_ref_add(folio, nr_pages - 1);
>>> add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
>>> + count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
>>> folio_add_new_anon_rmap(folio, vma, addr);
>>> folio_add_lru_vma(folio, vma);
>>> setpte:
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index b51becf03d1e..3135b5ca2457 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -5840,6 +5840,10 @@ static int page_alloc_cpu_dead(unsigned int cpu)
>>> */
>>> vm_events_fold_cpu(cpu);
>>>
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> + mthp_stats_fold_cpu(cpu);
>>> +#endif
>>> +
>>> /*
>>> * Zero the differential counters of the dead processor
>>> * so that the vm statistics are consistent.
>>


2024-04-12 10:18:34

by Barry Song

Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <[email protected]> wrote:
>
> On 12/04/2024 10:43, Barry Song wrote:
> > On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <[email protected]> wrote:
> >>
> >> Hi Barry,
> >>
> >> 2 remaining comments - otherwise looks good. (same comments I just made in the
> >> v4 conversation).
> >>
> >> On 12/04/2024 08:37, Barry Song wrote:
> >>> From: Barry Song <[email protected]>
> >>>
> >>> Profiling a system blindly with mTHP has become challenging due to the
> >>> lack of visibility into its operations.  Presenting the success rate of
> >>> mTHP allocations appears to be pressing need.
> >>>
> >>> Recently, I've been experiencing significant difficulty debugging
> >>> performance improvements and regressions without these figures.  It's
> >>> crucial for us to understand the true effectiveness of mTHP in real-world
> >>> scenarios, especially in systems with fragmented memory.
> >>>
> >>> This patch establishes the framework for per-order mTHP
> >>> counters. It begins by introducing the anon_fault_alloc and
> >>> anon_fault_fallback counters. Additionally, to maintain consistency
> >>> with thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
> >>> anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
> >>> Incorporating additional counters should now be straightforward as well.
> >>>
> >>> Signed-off-by: Barry Song <[email protected]>
> >>> Cc: Chris Li <[email protected]>
> >>> Cc: David Hildenbrand <[email protected]>
> >>> Cc: Domenico Cerasuolo <[email protected]>
> >>> Cc: Kairui Song <[email protected]>
> >>> Cc: Matthew Wilcox (Oracle) <[email protected]>
> >>> Cc: Peter Xu <[email protected]>
> >>> Cc: Ryan Roberts <[email protected]>
> >>> Cc: Suren Baghdasaryan <[email protected]>
> >>> Cc: Yosry Ahmed <[email protected]>
> >>> Cc: Yu Zhao <[email protected]>
> >>> ---
> >>>  include/linux/huge_mm.h | 51 ++++++++++++++++++++++++++++++++++
> >>>  mm/huge_memory.c        | 61 +++++++++++++++++++++++++++++++++++++++++
> >>>  mm/memory.c             |  3 ++
> >>>  mm/page_alloc.c         |  4 +++
> >>>  4 files changed, 119 insertions(+)
> >>>
> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>> index e896ca4760f6..c5beb54b97cb 100644
> >>> --- a/include/linux/huge_mm.h
> >>> +++ b/include/linux/huge_mm.h
> >>> @@ -264,6 +264,57 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> >>>                                         enforce_sysfs, orders);
> >>>  }
> >>>
> >>> +enum mthp_stat_item {
> >>> +     MTHP_STAT_ANON_FAULT_ALLOC,
> >>> +     MTHP_STAT_ANON_FAULT_FALLBACK,
> >>> +     MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> >>> +     __MTHP_STAT_COUNT
> >>> +};
> >>> +
> >>> +struct mthp_stat {
> >>> +     unsigned long stats[0][__MTHP_STAT_COUNT];
> >>> +};
> >>> +
> >>> +extern struct mthp_stat __percpu *mthp_stats;
> >>> +
> >>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>> +{
> >>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>> +             return;
> >>> +
> >>> +     this_cpu_inc(mthp_stats->stats[order][item]);
> >>> +}
> >>> +
> >>> +static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> >>> +{
> >>> +     if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> >>> +             return;
> >>> +
> >>> +     this_cpu_add(mthp_stats->stats[order][item], delta);
> >>> +}
> >>> +
> >>> +/*
> >>> + * Fold the foreign cpu mthp stats into our own.
> >>> + *
> >>> + * This is adding to the stats on one processor
> >>> + * but keeps the global counts constant.
> >>> + */
> >>> +static inline void mthp_stats_fold_cpu(int cpu)
> >>> +{
> >>> +     struct mthp_stat *fold_stat;
> >>> +     int i, j;
> >>> +
> >>> +     if (!mthp_stats)
> >>> +             return;
> >>> +     fold_stat = per_cpu_ptr(mthp_stats, cpu);
> >>> +     for (i = 1; i <= PMD_ORDER; i++) {
> >>> +             for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> >>> +                     count_mthp_stats(i, j, fold_stat->stats[i][j]);
> >>> +                     fold_stat->stats[i][j] = 0;
> >>> +             }
> >>> +     }
> >>> +}
> >>
> >> This is a pretty horrible hack; I'm pretty sure just summing for all *possible*
> >> cpus should work.
> >>
> >>> +
> >>>  #define transparent_hugepage_use_zero_page()                         \
> >>>       (transparent_hugepage_flags &                                   \
> >>>        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>> index dc30139590e6..21c4ac74b484 100644
> >>> --- a/mm/huge_memory.c
> >>> +++ b/mm/huge_memory.c
> >>> @@ -526,6 +526,50 @@ static const struct kobj_type thpsize_ktype = {
> >>>       .sysfs_ops = &kobj_sysfs_ops,
> >>>  };
> >>>
> >>> +struct mthp_stat __percpu *mthp_stats;
> >>> +
> >>> +static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> >>> +{
> >>> +     unsigned long sum = 0;
> >>> +     int cpu;
> >>> +
> >>> +     cpus_read_lock();
> >>> +     for_each_online_cpu(cpu) {
> >>> +             struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> >>> +
> >>> +             sum += this->stats[order][item];
> >>> +     }
> >>> +     cpus_read_unlock();
> >>> +
> >>> +     return sum;
> >>> +}
> >>> +
> >>> +#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                 \
> >>> +static ssize_t _name##_show(struct kobject *kobj,                    \
> >>> +                     struct kobj_attribute *attr, char *buf)         \
> >>> +{                                                                    \
> >>> +     int order = to_thpsize(kobj)->order;                            \
> >>> +                                                                     \
> >>> +     return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));  \
> >>> +}                                                                    \
> >>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> >>> +
> >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> >>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> >>> +
> >>> +static struct attribute *stats_attrs[] = {
> >>> +     &anon_fault_alloc_attr.attr,
> >>> +     &anon_fault_fallback_attr.attr,
> >>> +     &anon_fault_fallback_charge_attr.attr,
> >>> +     NULL,
> >>> +};
> >>> +
> >>> +static struct attribute_group stats_attr_group = {
> >>> +     .name = "stats",
> >>> +     .attrs = stats_attrs,
> >>> +};
> >>> +
> >>>  static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>  {
> >>>       unsigned long size = (PAGE_SIZE << order) / SZ_1K;
> >>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
> >>>               return ERR_PTR(ret);
> >>>       }
> >>>
> >>> +     ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
> >>> +     if (ret) {
> >>> +             kobject_put(&thpsize->kobj);
> >>> +             return ERR_PTR(ret);
> >>> +     }
> >>> +
> >>>       thpsize->order = order;
> >>>       return thpsize;
> >>>  }
> >>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
> >>>        */
> >>>       MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
> >>>
> >>> +     mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> >>> +                     sizeof(unsigned long));
> >>
> >> Personally I think it would be cleaner to allocate statically using
> >> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
> >
> > Hi Ryan,
> >
> > I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
> >
> > #define PMD_ORDER       (PMD_SHIFT - PAGE_SHIFT)
> >
> > #define MAX_PTRS_PER_PTE PTRS_PER_PTE
> >
> > #define PTRS_PER_PTE            (1 << (PAGE_SHIFT - 3))
> >
> > while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
> >
> >
> > Am I missing something?
>
> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
>
> PAGE_SIZE       PAGE_SHIFT      PTRS_PER_PTE
> 4K              12              512
> 16K             14              2048
> 64K             16              8192
>
> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
>
> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
>
> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
> (and its equal to PTRS_PER_PTE except for powerpc).
>
> Pretty sure the math is correct?

I am not convinced the math is correct :-)

When the page size is 64KiB, the page table layout is as below:
PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)


+--------+--------+--------+--------+--------+--------+--------+--------+
|63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
+--------+--------+--------+--------+--------+--------+--------+--------+
 |                 |    |               |              |
 |                 |    |               |              v
 |                 |    |               |            [15:0]  in-page offset
 |                 |    |               +----------> [28:16] L3 index
 |                 |    +--------------------------> [41:29] L2 index
 |                 +-------------------------------> [47:42] L1 index (48-bit)
 |                                                   [51:42] L1 index (52-bit)
 +-------------------------------------------------> [63] TTBR0/1

When the page size is 4KiB, the page table layout is as below:

+--------+--------+--------+--------+--------+--------+--------+--------+
|63    56|55    48|47    40|39    32|31    24|23    16|15     8|7      0|
+--------+--------+--------+--------+--------+--------+--------+--------+
 |                 |         |         |         |         |
 |                 |         |         |         |         v
 |                 |         |         |         |   [11:0]  in-page offset
 |                 |         |         |         +-> [20:12] L3 index
 |                 |         |         +-----------> [29:21] L2 index
 |                 |         +---------------------> [38:30] L1 index
 |                 +-------------------------------> [47:39] L0 index
 +-------------------------------------------------> [63] TTBR0/1

PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512).

You are only correct when the page size is 4KiB.





2024-04-12 10:20:08

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v5 3/4] mm: add docs for per-order mTHP counters and transhuge_page ABI

On 12/04/2024 08:37, Barry Song wrote:
> From: Barry Song <[email protected]>
>
> This patch includes documentation for mTHP counters and an ABI file
> for sys-kernel-mm-transparent-hugepage, which appears to have been
> missing for some time.
>
> Signed-off-by: Barry Song <[email protected]>
> Cc: Chris Li <[email protected]>
> Cc: David Hildenbrand <[email protected]>
> Cc: Domenico Cerasuolo <[email protected]>
> Cc: Kairui Song <[email protected]>
> Cc: Matthew Wilcox (Oracle) <[email protected]>
> Cc: Peter Xu <[email protected]>
> Cc: Ryan Roberts <[email protected]>
> Cc: Suren Baghdasaryan <[email protected]>
> Cc: Yosry Ahmed <[email protected]>
> Cc: Yu Zhao <[email protected]>
> Cc: Jonathan Corbet <[email protected]>

A few nits, but regardless:

Reviewed-by: Ryan Roberts <[email protected]>

> ---
> .../sys-kernel-mm-transparent-hugepage | 17 +++++++++++
> Documentation/admin-guide/mm/transhuge.rst | 28 +++++++++++++++++++
> 2 files changed, 45 insertions(+)
> create mode 100644 Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage
>
> diff --git a/Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage b/Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage
> new file mode 100644
> index 000000000000..80dde0fd576c
> --- /dev/null
> +++ b/Documentation/ABI/testing/sys-kernel-mm-transparent-hugepage
> @@ -0,0 +1,17 @@
> +What: /sys/kernel/mm/hugepages/

Err, transparent_hugepage, right? copy/paste error?

> +Date: April 2024
> +Contact: Barry Song <[email protected]>

Looks like a bunch of mm sysfs interfaces use:

Contact: Linux memory management mailing list <[email protected]>

I'll leave that up to you!

> +Description:
> + /sys/kernel/mm/transparent_hugepage/ contains a number of files and
> + subdirectories,
> + - defrag
> + - enabled
> + - hpage_pmd_size
> + - khugepaged
> + - shmem_enabled
> + - use_zero_page
> + - subdirectories of the form hugepages-<size>kB, where <size>
> + is the page size of the hugepages supported by the kernel/CPU
> + combination.
> +
> + See Documentation/admin-guide/mm/transhuge.rst for details.
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 04eb45a2f940..f436ff982f22 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -447,6 +447,34 @@ thp_swpout_fallback
> Usually because failed to allocate some continuous swap space
> for the huge page.
>
> +In /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats, There are
> +also individual counters for each huge page size, which can be utilized to
> +monitor the system's effectiveness in providing huge pages for usage. Each
> +counter has its own corresponding file.
> +
> +anon_fault_alloc
> + is incremented every time a huge page is successfully
> + allocated and charged to handle a page fault.
> +
> +anon_fault_fallback
> + is incremented if a page fault fails to allocate or charge
> + a huge page and instead falls back to using huge pages with
> + lower orders or small pages.
> +
> +anon_fault_fallback_charge
> + is incremented if a page fault fails to charge a huge page and
> + instead falls back to using huge pages with lower orders or
> + small pages even though the allocation was successful.
> +
> +anon_swpout
> + is incremented every time a huge page is swapout in one

nit: swapout -> "swapped out"? Although I see this is just a copy/paste of the
description of the existing counter...

> + piece without splitting.
> +
> +anon_swpout_fallback
> + is incremented if a huge page has to be split before swapout.
> + Usually because failed to allocate some continuous swap space
> + for the huge page.
> +
> As the system ages, allocating huge pages may be expensive as the
> system uses memory compaction to copy data around memory to free a
> huge page for use. There are some counters in ``/proc/vmstat`` to help


2024-04-12 10:21:25

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v5 4/4] mm: correct the docs for thp_fault_alloc and thp_fault_fallback

On 12/04/2024 08:37, Barry Song wrote:
> From: Barry Song <[email protected]>
>
> The documentation does not align with the code. In
> __do_huge_pmd_anonymous_page(), THP_FAULT_FALLBACK is incremented when
> mem_cgroup_charge() fails, despite the allocation succeeding, whereas
> THP_FAULT_ALLOC is only incremented after a successful charge.
>
> Signed-off-by: Barry Song <[email protected]>
> Cc: Chris Li <[email protected]>
> Cc: David Hildenbrand <[email protected]>
> Cc: Domenico Cerasuolo <[email protected]>
> Cc: Kairui Song <[email protected]>
> Cc: Matthew Wilcox (Oracle) <[email protected]>
> Cc: Peter Xu <[email protected]>
> Cc: Ryan Roberts <[email protected]>
> Cc: Suren Baghdasaryan <[email protected]>
> Cc: Yosry Ahmed <[email protected]>
> Cc: Yu Zhao <[email protected]>
> Cc: Jonathan Corbet <[email protected]>

Reviewed-by: Ryan Roberts <[email protected]>

> ---
> Documentation/admin-guide/mm/transhuge.rst | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index f436ff982f22..98e3a99ea780 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -369,7 +369,7 @@ monitor how successfully the system is providing huge pages for use.
>
> thp_fault_alloc
> is incremented every time a huge page is successfully
> - allocated to handle a page fault.
> + allocated and charged to handle a page fault.
>
> thp_collapse_alloc
> is incremented by khugepaged when it has found
> @@ -377,7 +377,7 @@ thp_collapse_alloc
> successfully allocated a new huge page to store the data.
>
> thp_fault_fallback
> - is incremented if a page fault fails to allocate
> + is incremented if a page fault fails to allocate or charge
> a huge page and instead falls back to using small pages.
>
> thp_fault_fallback_charge


2024-04-12 10:30:02

by Barry Song

[permalink] [raw]
Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On Fri, Apr 12, 2024 at 10:25 PM Ryan Roberts <[email protected]> wrote:
>
> On 12/04/2024 11:17, Barry Song wrote:
> > On Fri, Apr 12, 2024 at 9:56 PM Ryan Roberts <[email protected]> wrote:
> >>
> >> On 12/04/2024 10:43, Barry Song wrote:
> >>> On Fri, Apr 12, 2024 at 9:27 PM Ryan Roberts <[email protected]> wrote:
> [...]
> >
> > I am not convinced the math is correct :-)
> >
> > while page size is 64KiB, the page table is as below,
> > PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)
>
> 1 << 13 = 8192
>
> Right? So:
>
> ilog2(8192) = 13
>
> What's wrong with that?
>
> I even checked in Python to make sure I'm not going mad:
>
> >>> import math
> >>> math.log2(8192)
> 13.0

You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.
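
[Aside, not part of the thread: to make the arithmetic above concrete, a small
standalone C sketch reproducing PMD_ORDER = PAGE_SHIFT - 3 = ilog2(PTRS_PER_PTE)
for the arm64 4K/16K/64K granules discussed here. This is not kernel code; the
tiny toy_ilog2() helper is only a stand-in for the kernel's ilog2().]

#include <stdio.h>

/* Toy stand-in for the kernel's ilog2(): position of the most-significant set bit. */
static int toy_ilog2(unsigned long v)
{
        int n = -1;

        while (v) {
                v >>= 1;
                n++;
        }
        return n;
}

int main(void)
{
        /* arm64 granules: 4K, 16K, 64K */
        int page_shifts[] = { 12, 14, 16 };

        for (int i = 0; i < 3; i++) {
                int page_shift = page_shifts[i];
                /* PTRS_PER_PTE = 1 << (PAGE_SHIFT - 3): 8-byte PTEs per page */
                unsigned long ptrs_per_pte = 1UL << (page_shift - 3);
                /* PMD_ORDER = PMD_SHIFT - PAGE_SHIFT = PAGE_SHIFT - 3 */
                int pmd_order = page_shift - 3;

                printf("PAGE_SIZE=%2luK PTRS_PER_PTE=%4lu PMD_ORDER=%2d ilog2(PTRS_PER_PTE)=%2d\n",
                       (1UL << page_shift) / 1024, ptrs_per_pte,
                       pmd_order, toy_ilog2(ptrs_per_pte));
        }
        return 0;
}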


2024-04-12 10:33:16

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On 12/04/2024 11:17, Barry Song wrote:
> [...]
> I am not convinced the math is correct :-)
>
> while page size is 64KiB, the page table is as below,
> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)

1 << 13 = 8192

Right? So:

ilog2(8192) = 13

What's wrong with that?

I even checked in Python to make sure I'm not going mad:

>>> import math
>>> math.log2(8192)
13.0



2024-04-12 10:45:42

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On 12/04/2024 11:29, Barry Song wrote:
> [...]
> You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
> I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.

No worries! We got there in the end :)

Of course my suggestion relies on being able to get a compile-time constant from
ilog2(MAX_PTRS_PER_PTE). I think that should work, right?
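
[Aside, not part of the thread: a standalone sketch of why a constant-folding
ilog2() can size a static array. This is not kernel code; CONST_ILOG2 and the
8192 value are toy stand-ins for the kernel's ilog2() and for MAX_PTRS_PER_PTE
on arm64 with 64K pages. The point is only that the whole expression is a
compile-time constant.]

#include <assert.h>
#include <stdio.h>

/* Toy stand-in for ilog2() on the handful of constants that matter here. */
#define CONST_ILOG2(n) \
        ((n) == 8192 ? 13 : (n) == 2048 ? 11 : (n) == 512 ? 9 : -1)

/* Illustrative value: arm64 with 64K pages has 8192 PTEs per PTE table. */
#define DEMO_MAX_PTRS_PER_PTE   8192

/* A constant-folded ilog2 can size a static per-order array directly. */
static unsigned long demo_stats[CONST_ILOG2(DEMO_MAX_PTRS_PER_PTE) + 1][3];

static_assert(CONST_ILOG2(DEMO_MAX_PTRS_PER_PTE) == 13,
              "ilog2(8192) is 13, giving 14 order slots (0..13)");

int main(void)
{
        printf("order slots: %zu\n", sizeof(demo_stats) / sizeof(demo_stats[0]));
        return 0;
}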

>
>>
>>>
>>>
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>> |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0|
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>> | | | | |
>>> | | | | v
>>> | | | | [15:0] in-page offset
>>> | | | +----------> [28:16] L3 index
>>> | | +--------------------------> [41:29] L2 index
>>> | +-------------------------------> [47:42] L1 index (48-bit)
>>> | [51:42] L1 index (52-bit)
>>> +-------------------------------------------------> [63] TTBR0/1
>>>
>>> while page size is 4KiB, the page table is as below,
>>>
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>> |63 56|55 48|47 40|39 32|31 24|23 16|15 8|7 0|
>>> +--------+--------+--------+--------+--------+--------+--------+--------+
>>> | | | | | |
>>> | | | | | v
>>> | | | | | [11:0] in-page offset
>>> | | | | +-> [20:12] L3 index
>>> | | | +-----------> [29:21] L2 index
>>> | | +---------------------> [38:30] L1 index
>>> | +-------------------------------> [47:39] L0 index
>>> +-------------------------------------------------> [63] TTBR0/1
>>>
>>> PMD_ORDER = L2 index bits = [29:21] = 9 = ilog2(512).
>>>
>>> You are only correct while page size = 4KiB.
>>>
>>>
>>>
>>>
>>


2024-04-12 10:53:27

by Barry Song

[permalink] [raw]
Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On Fri, Apr 12, 2024 at 10:38 PM Ryan Roberts <[email protected]> wrote:
> [...]
> >
> > You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
> > I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.
>
> No worries! We got there in the end :)
>
> Of course my suggestion relies on being able to get a compile-time constant from
> ilog2(MAX_PTRS_PER_PTE). I think that should work, right?

I guess so; ilog2() detects a compile-time constant and folds at compile time,
otherwise it finds the last (most-significant) bit set at runtime.

I've implemented the following change, and all builds passed.
Testing is currently underway.

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index c5beb54b97cb..d4fdb2641070 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -272,47 +272,17 @@ enum mthp_stat_item {
};

struct mthp_stat {
- unsigned long stats[0][__MTHP_STAT_COUNT];
+ unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

-extern struct mthp_stat __percpu *mthp_stats;
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);

static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
- if (order <= 0 || order > PMD_ORDER || !mthp_stats)
+ if (order <= 0 || order > PMD_ORDER)
return;

- this_cpu_inc(mthp_stats->stats[order][item]);
-}
-
-static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
-{
- if (order <= 0 || order > PMD_ORDER || !mthp_stats)
- return;
-
- this_cpu_add(mthp_stats->stats[order][item], delta);
-}
-
-/*
- * Fold the foreign cpu mthp stats into our own.
- *
- * This is adding to the stats on one processor
- * but keeps the global counts constant.
- */
-static inline void mthp_stats_fold_cpu(int cpu)
-{
- struct mthp_stat *fold_stat;
- int i, j;
-
- if (!mthp_stats)
- return;
- fold_stat = per_cpu_ptr(mthp_stats, cpu);
- for (i = 1; i <= PMD_ORDER; i++) {
- for (j = 0; j < __MTHP_STAT_COUNT; j++) {
- count_mthp_stats(i, j, fold_stat->stats[i][j]);
- fold_stat->stats[i][j] = 0;
- }
- }
+ this_cpu_inc(mthp_stats.stats[order][item]);
}

#define transparent_hugepage_use_zero_page() \
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 21c4ac74b484..e88961ffc398 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,20 +526,18 @@ static const struct kobj_type thpsize_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
};

-struct mthp_stat __percpu *mthp_stats;
+DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
unsigned long sum = 0;
int cpu;

- cpus_read_lock();
- for_each_online_cpu(cpu) {
- struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
+ for_each_possible_cpu(cpu) {
+ struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

sum += this->stats[order][item];
}
- cpus_read_unlock();

return sum;
}
@@ -741,11 +739,6 @@ static int __init hugepage_init(void)
*/
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

- mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
- sizeof(unsigned long));
- if (!mthp_stats)
- return -ENOMEM;
-
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
goto err_sysfs;
@@ -780,8 +773,6 @@ static int __init hugepage_init(void)
err_slab:
hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
- free_percpu(mthp_stats);
- mthp_stats = NULL;
return err;
}
subsys_initcall(hugepage_init);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3135b5ca2457..b51becf03d1e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5840,10 +5840,6 @@ static int page_alloc_cpu_dead(unsigned int cpu)
*/
vm_events_fold_cpu(cpu);

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- mthp_stats_fold_cpu(cpu);
-#endif
-
/*
* Zero the differential counters of the dead processor
* so that the vm statistics are consistent.


Thanks
Barry
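
[Aside, not part of the thread: a userspace analogue of the
for_each_possible_cpu() pattern in Barry's diff above. Because statically
allocated per-CPU counters are zero-initialised and never freed, a reader can
simply sum across every possible CPU, so no hotplug fold callback or
cpus_read_lock() is needed. The NR_* sizes below are illustrative only.]

#include <stdio.h>

#define NR_POSSIBLE_CPUS        8
#define NR_ORDERS               14      /* e.g. ilog2(8192) + 1 on arm64/64K */
#define NR_ITEMS                3       /* alloc, fallback, fallback_charge */

/* Zero-initialised "per-CPU" counters, standing in for a static DEFINE_PER_CPU array. */
static unsigned long stats[NR_POSSIBLE_CPUS][NR_ORDERS][NR_ITEMS];

/* Readers walk every possible CPU; CPUs that never came online contribute zero. */
static unsigned long sum_stat(int order, int item)
{
        unsigned long sum = 0;

        for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
                sum += stats[cpu][order][item];
        return sum;
}

int main(void)
{
        stats[0][4][0] = 3;     /* pretend CPU0 allocated three order-4 mTHPs */
        stats[5][4][0] = 2;     /* CPU5 too; CPUs 1-4 and 6-7 stayed offline */
        printf("order-4 anon_fault_alloc = %lu\n", sum_stat(4, 0));
        return 0;
}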

2024-04-12 11:08:12

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v5 1/4] mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters

On 12/04/2024 11:53, Barry Song wrote:
> [...]
>>>>>>>>> +static ssize_t _name##_show(struct kobject *kobj, \
>>>>>>>>> + struct kobj_attribute *attr, char *buf) \
>>>>>>>>> +{ \
>>>>>>>>> + int order = to_thpsize(kobj)->order; \
>>>>>>>>> + \
>>>>>>>>> + return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
>>>>>>>>> +} \
>>>>>>>>> +static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>>>>>>>> +
>>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>>>>>>>> +DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>>>>>>>>> +
>>>>>>>>> +static struct attribute *stats_attrs[] = {
>>>>>>>>> + &anon_fault_alloc_attr.attr,
>>>>>>>>> + &anon_fault_fallback_attr.attr,
>>>>>>>>> + &anon_fault_fallback_charge_attr.attr,
>>>>>>>>> + NULL,
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +static struct attribute_group stats_attr_group = {
>>>>>>>>> + .name = "stats",
>>>>>>>>> + .attrs = stats_attrs,
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>>>>> {
>>>>>>>>> unsigned long size = (PAGE_SIZE << order) / SZ_1K;
>>>>>>>>> @@ -549,6 +593,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
>>>>>>>>> return ERR_PTR(ret);
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> + ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
>>>>>>>>> + if (ret) {
>>>>>>>>> + kobject_put(&thpsize->kobj);
>>>>>>>>> + return ERR_PTR(ret);
>>>>>>>>> + }
>>>>>>>>> +
>>>>>>>>> thpsize->order = order;
>>>>>>>>> return thpsize;
>>>>>>>>> }
>>>>>>>>> @@ -691,6 +741,11 @@ static int __init hugepage_init(void)
>>>>>>>>> */
>>>>>>>>> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>>>>>>>>>
>>>>>>>>> + mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
>>>>>>>>> + sizeof(unsigned long));
>>>>>>>>
>>>>>>>> Personally I think it would be cleaner to allocate statically using
>>>>>>>> ilog2(MAX_PTRS_PER_PTE) instead of PMD_ORDER.
>>>>>>>
>>>>>>> Hi Ryan,
>>>>>>>
>>>>>>> I don't understand why MAX_PTRS_PER_PTE is the correct size. For ARM64,
>>>>>>>
>>>>>>> #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
>>>>>>>
>>>>>>> #define MAX_PTRS_PER_PTE PTRS_PER_PTE
>>>>>>>
>>>>>>> #define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3))
>>>>>>>
>>>>>>> while PAGE_SIZE is 16KiB or 64KiB, PTRS_PER_PTE can be a huge number?
>>>>>>>
>>>>>>>
>>>>>>> Am I missing something?
>>>>>>
>>>>>> PTRS_PER_PTE is the number of PTE entries in a PTE table. On arm64 its as follows:
>>>>>>
>>>>>> PAGE_SIZE   PAGE_SHIFT   PTRS_PER_PTE
>>>>>> 4K          12           512
>>>>>> 16K         14           2048
>>>>>> 64K         16           8192
>>>>>>
>>>>>> So (PTRS_PER_PTE * PAGE_SIZE) = PMD_SIZE
>>>>>>
>>>>>> PMD_ORDER is ilog2(PMD_SIZE / PAGE_SIZE) = ilog2(PTRS_PER_PTE)
>>>>>>
>>>>>> MAX_PTRS_PER_PTE is just the maximum value that PTRS_PER_PTE will ever have,
>>>>>> (and its equal to PTRS_PER_PTE except for powerpc).
>>>>>>
>>>>>> Pretty sure the math is correct?
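
To make the identity above concrete, here is a standalone check (plain
userspace C with the arm64 shifts re-derived locally; an illustration, not
kernel code) that PMD_ORDER equals ilog2(PTRS_PER_PTE) for the three page
sizes in the table:

/* Verify PMD_ORDER == ilog2(PTRS_PER_PTE) for 4K/16K/64K pages. */
#include <assert.h>
#include <stdio.h>

static int ilog2_u(unsigned long x)
{
	int r = -1;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static void check(int page_shift)
{
	unsigned long ptrs_per_pte = 1UL << (page_shift - 3);	/* 8-byte PTEs */
	int pmd_shift = page_shift + (page_shift - 3);		/* next table level */
	int pmd_order = pmd_shift - page_shift;

	printf("PAGE_SIZE=%luK PTRS_PER_PTE=%lu PMD_ORDER=%d ilog2(PTRS_PER_PTE)=%d\n",
	       (1UL << page_shift) / 1024, ptrs_per_pte, pmd_order,
	       ilog2_u(ptrs_per_pte));
	assert(pmd_order == ilog2_u(ptrs_per_pte));
}

int main(void)
{
	check(12);	/* 4K:  512 PTEs per table, PMD_ORDER = 9  */
	check(14);	/* 16K: 2048 PTEs per table, PMD_ORDER = 11 */
	check(16);	/* 64K: 8192 PTEs per table, PMD_ORDER = 13 */
	return 0;
}
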
>>>>>
>>>>> I am not convinced the math is correct :-)
>>>>>
>>>>> when the page size is 64KiB, the page table layout is as below,
>>>>> PMD_ORDER = L2 index bits = [41:29] = 13 != ilog2(8192)
>>>>
>>>> 1 << 13 = 8192
>>>>
>>>> Right? So:
>>>>
>>>> ilog2(8192) = 13
>>>>
>>>> What's wrong with that?
>>>>
>>>> I even checked in Python to make sure I'm not going mad:
>>>>
>>>>>>> import math
>>>>>>> math.log2(8192)
>>>> 13.0
>>>
>>> You're correct. My mind fixated on the '16' in the line '64K 16 8192'.
>>> I mistakenly thought ilog2(8192) equals 16. Apologies for the confusion.
>>
>> No worries! We got there in the end :)
>>
>> Of course my suggestion relies on being able to get a compile-time constant from
>> ilog2(MAX_PTRS_PER_PTE). I think that should work, right?
>
> I guess so; ilog2() detects a compile-time constant and evaluates it at
> compile time, otherwise it falls back to finding the last (most-significant)
> set bit at runtime.
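
A simplified sketch of that pattern (plain userspace C; this is NOT the
kernel's actual ilog2() macro, and the 8192 below is just the arm64/64K value
used as an example). It relies on the compiler folding __builtin_constant_p()
and the selected branch, the same GCC/Clang behaviour the kernel depends on
when the result is used to size the stats[] array:

#include <limits.h>
#include <stdio.h>

static inline int runtime_ilog2(unsigned long x)
{
	/* Runtime path: index of the most-significant set bit (x must be nonzero). */
	return (int)(sizeof(unsigned long) * CHAR_BIT) - 1 - __builtin_clzl(x);
}

#define my_ilog2(n)							\
	(__builtin_constant_p(n) ?					\
		((n) < 2 ? 0 :						\
		 (int)(sizeof(unsigned long long) * CHAR_BIT) - 1 -	\
			__builtin_clzll(n)) :				\
		runtime_ilog2(n))

#define MAX_PTRS_PER_PTE_EXAMPLE 8192	/* illustrative, e.g. arm64 with 64K pages */

struct stat_sketch {
	/* One row per order up to the largest possible PMD_ORDER, as in the patch. */
	unsigned long stats[my_ilog2(MAX_PTRS_PER_PTE_EXAMPLE) + 1][3];
};

int main(void)
{
	volatile unsigned long x = 2048;	/* volatile forces the runtime path */

	printf("compile-time my_ilog2(8192) = %d\n", my_ilog2(8192));
	printf("runtime      my_ilog2(2048) = %d\n", my_ilog2(x));
	printf("rows in stat_sketch.stats   = %zu\n",
	       sizeof(((struct stat_sketch *)0)->stats) /
	       sizeof(((struct stat_sketch *)0)->stats[0]));
	return 0;
}
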
>
> I've implemented the following change, and all builds have passed.
> I'm currently running tests.

LGTM - much cleaner!

>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index c5beb54b97cb..d4fdb2641070 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -272,47 +272,17 @@ enum mthp_stat_item {
> };
>
> struct mthp_stat {
> - unsigned long stats[0][__MTHP_STAT_COUNT];
> + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
> };
>
> -extern struct mthp_stat __percpu *mthp_stats;
> +DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>
> static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> {
> - if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> + if (order <= 0 || order > PMD_ORDER)
> return;
>
> - this_cpu_inc(mthp_stats->stats[order][item]);
> -}
> -
> -static inline void count_mthp_stats(int order, enum mthp_stat_item item, long delta)
> -{
> - if (order <= 0 || order > PMD_ORDER || !mthp_stats)
> - return;
> -
> - this_cpu_add(mthp_stats->stats[order][item], delta);
> -}
> -
> -/*
> - * Fold the foreign cpu mthp stats into our own.
> - *
> - * This is adding to the stats on one processor
> - * but keeps the global counts constant.
> - */
> -static inline void mthp_stats_fold_cpu(int cpu)
> -{
> - struct mthp_stat *fold_stat;
> - int i, j;
> -
> - if (!mthp_stats)
> - return;
> - fold_stat = per_cpu_ptr(mthp_stats, cpu);
> - for (i = 1; i <= PMD_ORDER; i++) {
> - for (j = 0; j < __MTHP_STAT_COUNT; j++) {
> - count_mthp_stats(i, j, fold_stat->stats[i][j]);
> - fold_stat->stats[i][j] = 0;
> - }
> - }
> + this_cpu_inc(mthp_stats.stats[order][item]);
> }
>
> #define transparent_hugepage_use_zero_page() \
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 21c4ac74b484..e88961ffc398 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -526,20 +526,18 @@ static const struct kobj_type thpsize_ktype = {
> .sysfs_ops = &kobj_sysfs_ops,
> };
>
> -struct mthp_stat __percpu *mthp_stats;
> +DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
>
> static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
> {
> unsigned long sum = 0;
> int cpu;
>
> - cpus_read_lock();
> - for_each_online_cpu(cpu) {
> - struct mthp_stat *this = per_cpu_ptr(mthp_stats, cpu);
> + for_each_possible_cpu(cpu) {
> + struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
>
> sum += this->stats[order][item];
> }
> - cpus_read_unlock();
>
> return sum;
> }
> @@ -741,11 +739,6 @@ static int __init hugepage_init(void)
> */
> MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
>
> - mthp_stats = __alloc_percpu((PMD_ORDER + 1) * sizeof(mthp_stats->stats[0]),
> - sizeof(unsigned long));
> - if (!mthp_stats)
> - return -ENOMEM;
> -
> err = hugepage_init_sysfs(&hugepage_kobj);
> if (err)
> goto err_sysfs;
> @@ -780,8 +773,6 @@ static int __init hugepage_init(void)
> err_slab:
> hugepage_exit_sysfs(hugepage_kobj);
> err_sysfs:
> - free_percpu(mthp_stats);
> - mthp_stats = NULL;
> return err;
> }
> subsys_initcall(hugepage_init);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3135b5ca2457..b51becf03d1e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5840,10 +5840,6 @@ static int page_alloc_cpu_dead(unsigned int cpu)
> */
> vm_events_fold_cpu(cpu);
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> - mthp_stats_fold_cpu(cpu);
> -#endif
> -
> /*
> * Zero the differential counters of the dead processor
> * so that the vm statistics are consistent.
>
>
> Thanks
> Barry
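
For completeness, a minimal sketch of how the resulting per-size counters
could be read from userspace once the series lands (an illustration against
the sysfs layout proposed in this series; the hugepages-64kB path is only an
example, and the sizes actually present depend on the platform's page size
and the mTHP orders it exposes):

#include <stdio.h>

int main(void)
{
	const char *base =
		"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats";
	const char *names[] = {
		"anon_fault_alloc",
		"anon_fault_fallback",
		"anon_fault_fallback_charge",
	};
	char path[256];
	unsigned long val;

	for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "%s/%s", base, names[i]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);
			continue;
		}
		if (fscanf(f, "%lu", &val) == 1)
			printf("%s: %lu\n", names[i], val);
		fclose(f);
	}
	return 0;
}
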