2020-10-01 14:00:47

by Liang, Kan

[permalink] [raw]
Subject: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

From: Kan Liang <[email protected]>

Current perf can report both virtual addresses and physical addresses,
but not the MMU page size. Without the MMU page size information of the
utilized page, users cannot decide whether to promote/demote large pages
to optimize memory usage.

Add a new sample type for the data MMU page size.

Current perf already has a facility to collect data virtual addresses.
A page walker is required to walk the page tables and calculate the
MMU page size from a given virtual address.

On some platforms, e.g., X86, the page walker is invoked in an NMI
handler. So the page walker must be NMI-safe and low overhead. Besides,
the page walker should work for both user and kernel virtual addresses.
The existing generic page walkers, e.g., walk_page_range_novma(), are a
little bit complex and don't guarantee NMI safety. follow_page() works
only for user virtual addresses.

Add a new function perf_get_page_size() to walk the page tables and
calculate the MMU page size. In the function:
- Interrupts have to be disabled to prevent any teardown of the page
tables.
- For user space threads, current->mm is used for the page walker.
For kernel threads and the like, current->mm is NULL, so init_mm
is used for the page walker instead. The active_mm is not used here,
because it can be NULL.
Quote from Peter Zijlstra,
"context_switch() can set prev->active_mm to NULL when it transfers it
to @next. It does this before @current is updated. So an NMI that
comes in between this active_mm swizzling and updating @current will
see !active_mm."
- The MMU page size is calculated from the page table level.

The method should work for all architectures, but it has only been
verified on X86. If there are architectures which support perf but
where the method doesn't work, they can be fixed later separately.
Reporting a wrong page size is not fatal for an architecture.

Some features still under discussion may impact the method in the future.
Quote from Dave Hansen,
"There are lots of weird things folks are trying to do with the page
tables, like Address Space Isolation. For instance, if you get a
perf NMI when running userspace, current->mm->pgd is *different* than
the PGD that was in use when userspace was running. It's close enough
today, but it might not stay that way."
If that case happens later, lots of consecutive page walk errors will
occur. The worst case is that lots of page-size '0' samples are
returned, which would not be fatal.
In the perf tool, a check is implemented to detect this case. Once it
happens, a kernel patch can be implemented accordingly.
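
For reference, a minimal user-space sketch of requesting the new sample
type via perf_event_open() (illustrative only; the event selection,
period, and function name are placeholders, not part of this patch):

  #include <linux/perf_event.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int open_page_size_event(void)
  {
  	struct perf_event_attr attr;

  	memset(&attr, 0, sizeof(attr));
  	attr.size = sizeof(attr);
  	/* placeholder event; on x86 a precise memory event is what
  	 * actually produces a meaningful data->addr */
  	attr.type = PERF_TYPE_HARDWARE;
  	attr.config = PERF_COUNT_HW_CPU_CYCLES;
  	attr.sample_period = 10007;
  	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
  			   PERF_SAMPLE_DATA_PAGE_SIZE;
  	attr.disabled = 1;

  	/* pid == 0: measure self; cpu == -1: any CPU */
  	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  }

Each PERF_RECORD_SAMPLE then carries an extra u64 data_page_size field,
per the layout change in include/uapi/linux/perf_event.h below.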

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
---
include/linux/perf_event.h | 1 +
include/uapi/linux/perf_event.h | 4 +-
kernel/events/core.c | 103 ++++++++++++++++++++++++++++++++
3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0c19d279b97f..7e3785dd27d9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1034,6 +1034,7 @@ struct perf_sample_data {

u64 phys_addr;
u64 cgroup;
+ u64 data_page_size;
} ____cacheline_aligned;

/* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 077e7ee69e3d..cc6ea346e9f9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,8 +143,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
PERF_SAMPLE_AUX = 1U << 20,
PERF_SAMPLE_CGROUP = 1U << 21,
+ PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22,

- PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */

__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
@@ -896,6 +897,7 @@ enum perf_event_type {
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* { u64 size;
* char data[size]; } && PERF_SAMPLE_AUX
+ * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 45edb85344a1..dc0ae692e32b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,7 @@
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
+#include <linux/highmem.h>

#include "internal.h"

@@ -1894,6 +1895,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);

+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
event->header_size = size;
}

@@ -6937,6 +6941,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CGROUP)
perf_output_put(handle, data->cgroup);

+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);

@@ -6994,6 +7001,94 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}

+#ifdef CONFIG_MMU
+
+/*
+ * Return the MMU page size of a given virtual address
+ */
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ if (p4d_leaf(*p4d))
+ return 1ULL << P4D_SHIFT;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_leaf(*pud))
+ return 1ULL << PUD_SHIFT;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_leaf(*pmd))
+ return 1ULL << PMD_SHIFT;
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return 0;
+ }
+
+ pte_unmap(pte);
+ return PAGE_SIZE;
+}
+
+#else
+
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ return 0;
+}
+
+#endif
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = __perf_get_page_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

struct perf_callchain_entry *
@@ -7149,6 +7244,14 @@ void perf_prepare_sample(struct perf_event_header *header,
}
#endif

+ /*
+ * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr,
+ * but the value is not dumped to userspace.
+ */
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ data->data_page_size = perf_get_page_size(data->addr);
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;

--
2.17.1


2020-10-09 12:21:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Fri, Oct 09, 2020 at 11:09:27AM +0200, Peter Zijlstra wrote:
> @@ -7046,6 +7059,10 @@ static u64 __perf_get_page_size(struct m
> return 0;
> }
>
> + page = pte_page(*pte);
> + if (PageHuge(page))
> + return page_size(compound_head(page));

Argh, this misses pte_unmap()..

> +
> pte_unmap(pte);
> return PAGE_SIZE;
> }

2020-10-09 12:27:17

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Fri, Oct 09, 2020 at 11:09:27AM +0200, Peter Zijlstra wrote:
> On Thu, Oct 01, 2020 at 06:57:46AM -0700, [email protected] wrote:
> > +/*
> > + * Return the MMU page size of a given virtual address
> > + */
> > +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> > +{
> > + pgd_t *pgd;
> > + p4d_t *p4d;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > + pte_t *pte;
> > +
> > + pgd = pgd_offset(mm, addr);
> > + if (pgd_none(*pgd))
> > + return 0;
> > +
> > + p4d = p4d_offset(pgd, addr);
> > + if (!p4d_present(*p4d))
> > + return 0;
> > +
> > + if (p4d_leaf(*p4d))
> > + return 1ULL << P4D_SHIFT;
> > +
> > + pud = pud_offset(p4d, addr);
> > + if (!pud_present(*pud))
> > + return 0;
> > +
> > + if (pud_leaf(*pud))
> > + return 1ULL << PUD_SHIFT;
> > +
> > + pmd = pmd_offset(pud, addr);
> > + if (!pmd_present(*pmd))
> > + return 0;
> > +
> > + if (pmd_leaf(*pmd))
> > + return 1ULL << PMD_SHIFT;
> > +
> > + pte = pte_offset_map(pmd, addr);
> > + if (!pte_present(*pte)) {
> > + pte_unmap(pte);
> > + return 0;
> > + }
> > +
> > + pte_unmap(pte);
> > + return PAGE_SIZE;
> > +}
>
> So this mostly works, but gets a number of hugetlb and arch specific
> things wrong.
>
> With the first 3 patches, this is only exposed to x86 and Power.
> Michael, does the above work for you?
>
> Looking at:
>
> arch/powerpc/include/asm/book3s/64/hugetlb.h:check_and_get_huge_psize()
>
> You seem to limit yourself to page-table sizes, however if I then look
> at the same function in:
>
> arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
> arch/powerpc/include/asm/nohash/hugetlb-book3e.h
>
> it doesn't seem to constrain itself so.
>
> Patch 4 makes it all far worse by exposing it to pretty much everybody.
>
> Now, I think we can fix at least the user mappings with the below delta,
> but if archs are using non-page-table MMU sizes we'll need arch helpers.
>
> ARM64 is in that last boat.
>
> Will, can you live with the below, if not, what would you like to do,
> make the entire function __weak so that you can override it, or hook
> into it somewhere?

Hmm, so I don't think we currently have any PMUs that set 'data->addr'
on arm64, in which case maybe none of this currently matters for us.

However, I must admit that I couldn't figure out exactly what gets exposed
to userspace when the backend drivers don't look at the sample_type or
do anything with the addr field.

Will

2020-10-09 13:24:06

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE



On 10/9/2020 5:09 AM, Peter Zijlstra wrote:
> (we might not need the #ifdef gunk, but I've not yet dug out my cross
> compilers this morning)
>
> ---
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7009,6 +7009,7 @@ static u64 perf_virt_to_phys(u64 virt)
> */
> static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> + struct page *page;
> pgd_t *pgd;
> p4d_t *p4d;
> pud_t *pud;
> @@ -7030,15 +7031,27 @@ static u64 __perf_get_page_size(struct m
> if (!pud_present(*pud))
> return 0;
>
> - if (pud_leaf(*pud))
> + if (pud_leaf(*pud)) {
> +#ifdef pud_page
> + page = pud_page(*pud);
> + if (PageHuge(page))
> + return page_size(compound_head(page));

I think page_size() returns the kernel page size of a compound page.
What we want is the MMU page size.

If it's in the generic code, I think it could be a problem for X86.

Thanks,
Kan

> +#endif
> return 1ULL << PUD_SHIFT;
> + }
>
> pmd = pmd_offset(pud, addr);
> if (!pmd_present(*pmd))
> return 0;
>
> - if (pmd_leaf(*pmd))
> + if (pmd_leaf(*pmd)) {
> +#ifdef pmd_page
> + page = pmd_page(*pmd);
> + if (PageHuge(page))
> + return page_size(compound_head(page));
> +#endif
> return 1ULL << PMD_SHIFT;
> + }
>
> pte = pte_offset_map(pmd, addr);
> if (!pte_present(*pte)) {
> @@ -7046,6 +7059,10 @@ static u64 __perf_get_page_size(struct m
> return 0;
> }
>
> + page = pte_page(*pte);
> + if (PageHuge(page))
> + return page_size(compound_head(page));
> +
> pte_unmap(pte);
> return PAGE_SIZE;
> }

2020-10-09 17:13:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Thu, Oct 01, 2020 at 06:57:46AM -0700, [email protected] wrote:
> +/*
> + * Return the MMU page size of a given virtual address
> + */
> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> +{
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> +
> + pgd = pgd_offset(mm, addr);
> + if (pgd_none(*pgd))
> + return 0;
> +
> + p4d = p4d_offset(pgd, addr);
> + if (!p4d_present(*p4d))
> + return 0;
> +
> + if (p4d_leaf(*p4d))
> + return 1ULL << P4D_SHIFT;
> +
> + pud = pud_offset(p4d, addr);
> + if (!pud_present(*pud))
> + return 0;
> +
> + if (pud_leaf(*pud))
> + return 1ULL << PUD_SHIFT;
> +
> + pmd = pmd_offset(pud, addr);
> + if (!pmd_present(*pmd))
> + return 0;
> +
> + if (pmd_leaf(*pmd))
> + return 1ULL << PMD_SHIFT;
> +
> + pte = pte_offset_map(pmd, addr);
> + if (!pte_present(*pte)) {
> + pte_unmap(pte);
> + return 0;
> + }
> +
> + pte_unmap(pte);
> + return PAGE_SIZE;
> +}

So this mostly works, but gets a number of hugetlb and arch specific
things wrong.

With the first 3 patches, this is only exposed to x86 and Power.
Michael, does the above work for you?

Looking at:

arch/powerpc/include/asm/book3s/64/hugetlb.h:check_and_get_huge_psize()

You seem to limit yourself to page-table sizes, however if I then look
at the same function in:

arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
arch/powerpc/include/asm/nohash/hugetlb-book3e.h

it doesn't seem to constrain itself so.

Patch 4 makes it all far worse by exposing it to pretty much everybody.

Now, I think we can fix at least the user mappings with the below delta,
but if archs are using non-page-table MMU sizes we'll need arch helpers.

ARM64 is in that last boat.

Will, can you live with the below, if not, what would you like to do,
make the entire function __weak so that you can override it, or hook
into it somewhere?

DaveM, I saw Sparc64 also has 'funny' hugetlb sizes. Are those only for
hugetlb, or are you also using them for the kernel map?

(we might not need the #ifdef gunk, but I've not yet dug out my cross
compilers this morning)

---
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7009,6 +7009,7 @@ static u64 perf_virt_to_phys(u64 virt)
*/
static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
+ struct page *page;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -7030,15 +7031,27 @@ static u64 __perf_get_page_size(struct m
if (!pud_present(*pud))
return 0;

- if (pud_leaf(*pud))
+ if (pud_leaf(*pud)) {
+#ifdef pud_page
+ page = pud_page(*pud);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+#endif
return 1ULL << PUD_SHIFT;
+ }

pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
return 0;

- if (pmd_leaf(*pmd))
+ if (pmd_leaf(*pmd)) {
+#ifdef pmd_page
+ page = pmd_page(*pmd);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+#endif
return 1ULL << PMD_SHIFT;
+ }

pte = pte_offset_map(pmd, addr);
if (!pte_present(*pte)) {
@@ -7046,6 +7059,10 @@ static u64 __perf_get_page_size(struct m
return 0;
}

+ page = pte_page(*pte);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+
pte_unmap(pte);
return PAGE_SIZE;
}

2020-10-09 17:15:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Fri, Oct 09, 2020 at 10:37:51AM +0100, Will Deacon wrote:
> On Fri, Oct 09, 2020 at 11:09:27AM +0200, Peter Zijlstra wrote:

> > Patch 4 makes it all far worse by exposing it to pretty much everybody.
> >
> > Now, I think we can fix at least the user mappings with the below delta,
> > but if archs are using non-page-table MMU sizes we'll need arch helpers.
> >
> > ARM64 is in that last boat.
> >
> > Will, can you live with the below, if not, what would you like to do,
> > make the entire function __weak so that you can override it, or hook
> > into it somewhere?
>
> Hmm, so I don't think we currently have any PMUs that set 'data->addr'
> on arm64, in which case maybe none of this currently matters for us.
>
> However, I must admit that I couldn't figure out exactly what gets exposed
> to userspace when the backend drivers don't look at the sample_type or
> do anything with the addr field.

Patch 4:

https://lkml.kernel.org/r/[email protected]

is the one that exposes this to everybody with perf support. It will
then report the page-size for the code address (SAMPLE_IP).

2020-10-09 18:38:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Fri, Oct 09, 2020 at 08:29:25AM -0400, Liang, Kan wrote:
>
>
> On 10/9/2020 5:09 AM, Peter Zijlstra wrote:
> > (we might not need the #ifdef gunk, but I've not yet dug out my cross
> > compilers this morning)
> >
> > ---
> > --- a/kernel/events/core.c
> > +++ b/kernel/events/core.c
> > @@ -7009,6 +7009,7 @@ static u64 perf_virt_to_phys(u64 virt)
> > */
> > static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> > {
> > + struct page *page;
> > pgd_t *pgd;
> > p4d_t *p4d;
> > pud_t *pud;
> > @@ -7030,15 +7031,27 @@ static u64 __perf_get_page_size(struct m
> > if (!pud_present(*pud))
> > return 0;
> > - if (pud_leaf(*pud))
> > + if (pud_leaf(*pud)) {
> > +#ifdef pud_page
> > + page = pud_page(*pud);
> > + if (PageHuge(page))
> > + return page_size(compound_head(page));
>
> I think the page_size() returns the Kernel Page Size of a compound page.
> What we want is the MMU page size.
>
> If it's for the generic code, I think it should be a problem for X86.

See the PageHuge() condition before it. It only makes sense to provide a
hugetlb page-size if the actual hardware supports it.

For x86 hugetlb only supports PMD and PUD sized pages, so the added code
is pointless and should result in identical behaviour.

For architectures that have hugetlb page sizes that do not align with
the page-table levels (arm64, sparc64 and possibly power) this will
(hopefully) give the right number.

2020-10-09 19:43:41

by Michael Ellerman

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

Peter Zijlstra <[email protected]> writes:
> On Thu, Oct 01, 2020 at 06:57:46AM -0700, [email protected] wrote:
>> +/*
>> + * Return the MMU page size of a given virtual address
>> + */
>> +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
>> +{
>> + pgd_t *pgd;
>> + p4d_t *p4d;
>> + pud_t *pud;
>> + pmd_t *pmd;
>> + pte_t *pte;
>> +
>> + pgd = pgd_offset(mm, addr);
>> + if (pgd_none(*pgd))
>> + return 0;
>> +
>> + p4d = p4d_offset(pgd, addr);
>> + if (!p4d_present(*p4d))
>> + return 0;
>> +
>> + if (p4d_leaf(*p4d))
>> + return 1ULL << P4D_SHIFT;
>> +
>> + pud = pud_offset(p4d, addr);
>> + if (!pud_present(*pud))
>> + return 0;
>> +
>> + if (pud_leaf(*pud))
>> + return 1ULL << PUD_SHIFT;
>> +
>> + pmd = pmd_offset(pud, addr);
>> + if (!pmd_present(*pmd))
>> + return 0;
>> +
>> + if (pmd_leaf(*pmd))
>> + return 1ULL << PMD_SHIFT;
>> +
>> + pte = pte_offset_map(pmd, addr);
>> + if (!pte_present(*pte)) {
>> + pte_unmap(pte);
>> + return 0;
>> + }
>> +
>> + pte_unmap(pte);
>> + return PAGE_SIZE;
>> +}
>
> So this mostly works, but gets a number of hugetlb and arch specific
> things wrong.
>
> With the first 3 patches, this is only exposed to x86 and Power.
> Michael, does the above work for you?

It might work on our server CPUs (book3s), but in general, no, I don't
think it will work for all powerpc.

> Looking at:
>
> arch/powerpc/include/asm/book3s/64/hugetlb.h:check_and_get_huge_psize()
>
> You seem to limit yourself to page-table sizes, however if I then look
> at the same function in:
>
> arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
> arch/powerpc/include/asm/nohash/hugetlb-book3e.h
>
> it doesn't seem to constrain itself so.

Yeah the embedded CPUs have more weird sizes.

I think we have all the logic in our __find_linux_pte().

> Patch 4 makes it all far worse by exposing it to pretty much everybody.
>
> Now, I think we can fix at least the user mappings with the below delta,
> but if archs are using non-page-table MMU sizes we'll need arch helpers.
>
> ARM64 is in that last boat.

I think we probably need it to be weak so we can provide our own
version.

cheers

2020-10-12 08:50:48

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Sat, Oct 10, 2020 at 12:28:39AM +1100, Michael Ellerman wrote:
> Peter Zijlstra <[email protected]> writes:
> > Patch 4 makes it all far worse by exposing it to pretty much everybody.
> >
> > Now, I think we can fix at least the user mappings with the below delta,
> > but if archs are using non-page-table MMU sizes we'll need arch helpers.
> >
> > ARM64 is in that last boat.
>
> I think we probably need it to be weak so we can provide our own
> version.

I guess the same thing applies to us, but I can't really tell how accurate
this stuff needs to be for userspace. If it's trying to use the page-table
configuration to infer properties of the TLB, that's never going to be
reliable because the TLB can choose both to split and coalesce entries
as long as software can't tell.

Will

2020-10-13 17:23:38

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE



On 10/12/2020 4:48 AM, Will Deacon wrote:
> On Sat, Oct 10, 2020 at 12:28:39AM +1100, Michael Ellerman wrote:
>> Peter Zijlstra <[email protected]> writes:
>>> Patch 4 makes it all far worse by exposing it to pretty much everybody.
>>>
>>> Now, I think we can fix at least the user mappings with the below delta,
>>> but if archs are using non-page-table MMU sizes we'll need arch helpers.
>>>
>>> ARM64 is in that last boat.
>>
>> I think we probably need it to be weak so we can provide our own
>> version.
>
> I guess the same thing applies to us, but I can't really tell how accurate
> this stuff needs to be for userspace. If it's trying to use the page-table
> configuration to infer properties of the TLB, that's never going to be
> reliable because the TLB can choose both to split and coalesce entries
> as long as software can't tell.
>

Hi Peter,

It looks like everybody wants a __weak function. If so, I guess we
should drop the generic code in this patch. For X86, we have existing
functions to retrieve the page level and the page size. I think we don't
need the generic code either.
https://lkml.kernel.org/r/[email protected]/

Should I send a V10 patch to drop the generic code and implement an X86
specific perf_get_page_size()? Will, Michael, and others can implement
their version later separately.
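
For example, an X86 version could be built on the existing
lookup_address_in_pgd() and page_level_size() helpers (untested sketch
only; the function name is made up, not a final patch):

  static u64 x86_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
  {
  	unsigned int level;
  	/* level-aware walk of the given mm's page tables */
  	pte_t *pte = lookup_address_in_pgd(pgd_offset(mm, addr), addr, &level);

  	if (!pte || !pte_present(*pte))
  		return 0;

  	/* map the page-table level back to a size in bytes */
  	return (u64)page_level_size(level);
  }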

Thanks,
Kan

2020-10-13 18:45:52

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Tue, Oct 13, 2020 at 05:46:15PM +0200, Peter Zijlstra wrote:
> Nah, that generic function should work for 90% of all archs; it's just
> a few oddballs that need something else.
>
> Also, if we add that hugetlb exception, we'll even get the usermap for
> those oddballs right.
>
> I'll take this version after the merge window, I'll add __weak for the
> oddballs and also add the hugetlb userspace thing on top.

Like so..

---
Subject: perf,mm: Handle non-page-table-aligned hugetlbfs
From: Peter Zijlstra <[email protected]>
Date: Fri, 9 Oct 2020 11:09:27 +0200

A limited number of architectures support hugetlbfs sizes that do not
align with the page tables (ARM64, Power, Sparc64). Add support for
this to the generic perf_get_page_size() implementation, and also
allow an architecture to override this implementation.

The latter is only needed when an architecture uses non-page-table
aligned huge pages in its kernel map.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/events/core.c | 39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6996,10 +6996,18 @@ static u64 perf_virt_to_phys(u64 virt)
#ifdef CONFIG_MMU

/*
- * Return the MMU page size of a given virtual address
+ * Return the MMU page size of a given virtual address.
+ *
+ * This generic implementation handles page-table aligned huge pages, as well
+ * as non-page-table aligned hugetlbfs compound pages.
+ *
+ * If an architecture supports and uses non-page-table aligned pages in its
+ * kernel mapping, it will need to provide its own implementation of this
+ * function.
*/
-static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
+ struct page *page;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -7021,15 +7029,27 @@ static u64 __perf_get_page_size(struct m
if (!pud_present(*pud))
return 0;

- if (pud_leaf(*pud))
+ if (pud_leaf(*pud)) {
+#ifdef pud_page
+ page = pud_page(*pud);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+#endif
return 1ULL << PUD_SHIFT;
+ }

pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
return 0;

- if (pmd_leaf(*pmd))
+ if (pmd_leaf(*pmd)) {
+#ifdef pmd_page
+ page = pmd_page(*pmd);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+#endif
return 1ULL << PMD_SHIFT;
+ }

pte = pte_offset_map(pmd, addr);
if (!pte_present(*pte)) {
@@ -7037,13 +7057,20 @@ static u64 __perf_get_page_size(struct m
return 0;
}

+ page = pte_page(*pte);
+ if (PageHuge(page)) {
+ u64 size = page_size(compound_head(page));
+ pte_unmap(pte);
+ return size;
+ }
+
pte_unmap(pte);
return PAGE_SIZE;
}

#else

-static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
return 0;
}
@@ -7074,7 +7101,7 @@ static u64 perf_get_page_size(unsigned l
mm = &init_mm;
}

- size = __perf_get_page_size(mm, addr);
+ size = arch_perf_get_page_size(mm, addr);

local_irq_restore(flags);

2020-10-14 00:08:37

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Tue, Oct 13, 2020 at 10:57:41AM -0400, Liang, Kan wrote:
>
>
> On 10/12/2020 4:48 AM, Will Deacon wrote:
> > On Sat, Oct 10, 2020 at 12:28:39AM +1100, Michael Ellerman wrote:
> > > Peter Zijlstra <[email protected]> writes:
> > > > Patch 4 makes it all far worse by exposing it to pretty much everybody.
> > > >
> > > > Now, I think we can fix at least the user mappings with the below delta,
> > > > but if archs are using non-page-table MMU sizes we'll need arch helpers.
> > > >
> > > > ARM64 is in that last boat.
> > >
> > > I think we probably need it to be weak so we can provide our own
> > > version.
> >
> > I guess the same thing applies to us, but I can't really tell how accurate
> > this stuff needs to be for userspace. If it's trying to use the page-table
> > configuration to infer properties of the TLB, that's never going to be
> > reliable because the TLB can choose both to split and coalesce entries
> > as long as software can't tell.
> >
>
> Hi Peter,
>
> It looks like everybody wants a __weak function. If so, I guess we should
> drop the generic code in this patch. For X86, we have existing functions to
> retrieve the page level and the page size. I think we don't need the generic
> code either.
> https://lkml.kernel.org/r/[email protected]/
>
> Should I send a V10 patch to drop the generic code and implement an X86
> specific perf_get_page_size()? Will, Michael, and others can implement their
> version later separately.

Nah, that generic function should work for 90% of all archs; it's just
a few oddballs that need something else.

Also, if we add that hugetlb exception, we'll even get the usermap for
those oddballs right.

I'll take this version after the merge window, I'll add __weak for the
oddballs and also add the hugetlb userspace thing on top.
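
The override mechanism then has roughly this shape (sketch; the
arch-side body is hypothetical):

  /* kernel/events/core.c: generic default, replaceable at link time */
  __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
  {
  	return 0;	/* stands in for the generic page-table walk */
  }

  /* e.g. arch/powerpc/... (hypothetical): a non-weak definition of the
   * same symbol overrides the generic one at link time */
  u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
  {
  	return 0;	/* arch-specific lookup, e.g. via __find_linux_pte() */
  }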

2020-10-20 17:19:56

by Leo Yan

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Fri, Oct 09, 2020 at 11:53:00AM +0200, Peter Zijlstra wrote:
> On Fri, Oct 09, 2020 at 10:37:51AM +0100, Will Deacon wrote:
> > On Fri, Oct 09, 2020 at 11:09:27AM +0200, Peter Zijlstra wrote:
>
> > > Patch 4 makes it all far worse by exposing it to pretty much everybody.
> > >
> > > Now, I think we can fix at least the user mappings with the below delta,
> > > but if archs are using non-page-table MMU sizes we'll need arch helpers.
> > >
> > > ARM64 is in that last boat.
> > >
> > > Will, can you live with the below, if not, what would you like to do,
> > > make the entire function __weak so that you can override it, or hook
> > > into it somewhere?
> >
> > Hmm, so I don't think we currently have any PMUs that set 'data->addr'
> > on arm64, in which case maybe none of this currently matters for us.
> >
> > However, I must admit that I couldn't figure out exactly what gets exposed
> > to userspace when the backend drivers don't look at the sample_type or
> > do anything with the addr field.
>
> Patch 4:
>
> https://lkml.kernel.org/r/[email protected]
>
> is the one that exposes this to everybody with perf support. It will
> then report the page-size for the code address (SAMPLE_IP).

I can see another potential customer for the page size: Arm SPE. But
Arm SPE sampling is hardware-trace based rather than interrupt based,
so I think this patch set cannot be directly applied to the AUX trace
data.

Thanks,
Leo

2020-10-20 20:31:59

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Tue, Oct 20, 2020 at 10:49:25AM +0800, Leo Yan wrote:
> I can see another potential customer for the page size: Arm SPE. But
> Arm SPE sampling is hardware-trace based rather than interrupt based,
> so I think this patch set cannot be directly applied to the AUX trace
> data.

IIRC SPE is decoded in userspace, at which point we simply cannot access
this data.

2020-10-20 21:24:26

by Leo Yan

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Tue, Oct 20, 2020 at 09:19:45AM +0200, Peter Zijlstra wrote:
> On Tue, Oct 20, 2020 at 10:49:25AM +0800, Leo Yan wrote:
> > I can see another potential customer for the page size: Arm SPE. But
> > Arm SPE sampling is hardware-trace based rather than interrupt based,
> > so I think this patch set cannot be directly applied to the AUX trace
> > data.
>
> IIRC SPE is decoded in userspace, at which point we simply cannot access
> this data.

Yes, it's decoded in userspace.

Thanks for clarification!
Leo

Subject: [tip: perf/core] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

The following commit has been merged into the perf/core branch of tip:

Commit-ID: 8d97e71811aaafe4abf611dc24822fd6e73df1a1
Gitweb: https://git.kernel.org/tip/8d97e71811aaafe4abf611dc24822fd6e73df1a1
Author: Kan Liang <[email protected]>
AuthorDate: Thu, 01 Oct 2020 06:57:46 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 29 Oct 2020 11:00:38 +01:00

perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

Current perf can report both virtual addresses and physical addresses,
but not the MMU page size. Without the MMU page size information of the
utilized page, users cannot decide whether to promote/demote large pages
to optimize memory usage.

Add a new sample type for the data MMU page size.

Current perf already has a facility to collect data virtual addresses.
A page walker is required to walk the page tables and calculate the
MMU page size from a given virtual address.

On some platforms, e.g., X86, the page walker is invoked in an NMI
handler. So the page walker must be NMI-safe and low overhead. Besides,
the page walker should work for both user and kernel virtual addresses.
The existing generic page walkers, e.g., walk_page_range_novma(), are a
little bit complex and don't guarantee NMI safety. follow_page() works
only for user virtual addresses.

Add a new function perf_get_page_size() to walk the page tables and
calculate the MMU page size. In the function:
- Interrupts have to be disabled to prevent any teardown of the page
tables.
- For user space threads, current->mm is used for the page walker.
For kernel threads and the like, current->mm is NULL, so init_mm
is used for the page walker instead. The active_mm is not used here,
because it can be NULL.
Quote from Peter Zijlstra,
"context_switch() can set prev->active_mm to NULL when it transfers it
to @next. It does this before @current is updated. So an NMI that
comes in between this active_mm swizzling and updating @current will
see !active_mm."
- The MMU page size is calculated from the page table level.

The method should work for all architectures, but it has only been
verified on X86. If there are architectures which support perf but
where the method doesn't work, they can be fixed later separately.
Reporting a wrong page size is not fatal for an architecture.

Some features still under discussion may impact the method in the future.
Quote from Dave Hansen,
"There are lots of weird things folks are trying to do with the page
tables, like Address Space Isolation. For instance, if you get a
perf NMI when running userspace, current->mm->pgd is *different* than
the PGD that was in use when userspace was running. It's close enough
today, but it might not stay that way."
If that case happens later, lots of consecutive page walk errors will
occur. The worst case is that lots of page-size '0' samples are
returned, which would not be fatal.
In the perf tool, a check is implemented to detect this case. Once it
happens, a kernel patch can be implemented accordingly.

Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/perf_event.h | 1 +-
include/uapi/linux/perf_event.h | 4 +-
kernel/events/core.c | 103 +++++++++++++++++++++++++++++++-
3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0c19d27..7e3785d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1034,6 +1034,7 @@ struct perf_sample_data {

u64 phys_addr;
u64 cgroup;
+ u64 data_page_size;
} ____cacheline_aligned;

/* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 077e7ee..cc6ea34 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,8 +143,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
PERF_SAMPLE_AUX = 1U << 20,
PERF_SAMPLE_CGROUP = 1U << 21,
+ PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22,

- PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */

__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
@@ -896,6 +897,7 @@ enum perf_event_type {
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* { u64 size;
* char data[size]; } && PERF_SAMPLE_AUX
+ * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fb662eb..a796db2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,7 @@
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
+#include <linux/highmem.h>

#include "internal.h"

@@ -1894,6 +1895,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);

+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
event->header_size = size;
}

@@ -6938,6 +6942,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CGROUP)
perf_output_put(handle, data->cgroup);

+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);

@@ -6995,6 +7002,94 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}

+#ifdef CONFIG_MMU
+
+/*
+ * Return the MMU page size of a given virtual address
+ */
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ if (p4d_leaf(*p4d))
+ return 1ULL << P4D_SHIFT;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_leaf(*pud))
+ return 1ULL << PUD_SHIFT;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_leaf(*pmd))
+ return 1ULL << PMD_SHIFT;
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return 0;
+ }
+
+ pte_unmap(pte);
+ return PAGE_SIZE;
+}
+
+#else
+
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ return 0;
+}
+
+#endif
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = __perf_get_page_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

struct perf_callchain_entry *
@@ -7150,6 +7245,14 @@ void perf_prepare_sample(struct perf_event_header *header,
}
#endif

+ /*
+ * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr,
+ * but the value is not dumped to userspace.
+ */
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ data->data_page_size = perf_get_page_size(data->addr);
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;

2020-11-04 18:41:09

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE



On 10/13/2020 12:34 PM, Peter Zijlstra wrote:
> Subject: perf,mm: Handle non-page-table-aligned hugetlbfs
> From: Peter Zijlstra<[email protected]>
> Date: Fri, 9 Oct 2020 11:09:27 +0200
>
> A limited number of architectures support hugetlbfs sizes that do not
> align with the page tables (ARM64, Power, Sparc64). Add support for
> this to the generic perf_get_page_size() implementation, and also
> allow an architecture to override this implementation.
>
> The latter is only needed when an architecture uses non-page-table
> aligned huge pages in its kernel map.
>
> Signed-off-by: Peter Zijlstra (Intel)<[email protected]>
> ---
> kernel/events/core.c | 39 +++++++++++++++++++++++++++++++++------
> 1 file changed, 33 insertions(+), 6 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6996,10 +6996,18 @@ static u64 perf_virt_to_phys(u64 virt)
> #ifdef CONFIG_MMU
>
> /*
> - * Return the MMU page size of a given virtual address
> + * Return the MMU page size of a given virtual address.
> + *
> + * This generic implementation handles page-table aligned huge pages, as well
> + * as non-page-table aligned hugetlbfs compound pages.
> + *
> + * If an architecture supports and uses non-page-table aligned pages in its
> + * kernel mapping, it will need to provide its own implementation of this
> + * function.
> */
> -static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> +__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> {
> + struct page *page;
> pgd_t *pgd;
> p4d_t *p4d;
> pud_t *pud;
> @@ -7021,15 +7029,27 @@ static u64 __perf_get_page_size(struct m
> if (!pud_present(*pud))
> return 0;
>
> - if (pud_leaf(*pud))
> + if (pud_leaf(*pud)) {
> +#ifdef pud_page
> + page = pud_page(*pud);
> + if (PageHuge(page))
> + return page_size(compound_head(page));
> +#endif
> return 1ULL << PUD_SHIFT;
> + }
>
> pmd = pmd_offset(pud, addr);
> if (!pmd_present(*pmd))
> return 0;
>
> - if (pmd_leaf(*pmd))
> + if (pmd_leaf(*pmd)) {
> +#ifdef pmd_page
> + page = pmd_page(*pmd);
> + if (PageHuge(page))
> + return page_size(compound_head(page));
> +#endif
> return 1ULL << PMD_SHIFT;
> + }
>
> pte = pte_offset_map(pmd, addr);
> if (!pte_present(*pte)) {
> @@ -7037,13 +7057,20 @@ static u64 __perf_get_page_size(struct m
> return 0;
> }
>
> + page = pte_page(*pte);
> + if (PageHuge(page)) {
> + u64 size = page_size(compound_head(page));
> + pte_unmap(pte);
> + return size;
> + }
> +

The PageHuge() check for the PTE crashes my machine when I run the page
size test. (Sorry, I didn't find the issue earlier. I just found some
time to re-run the page size test.)

It seems we don't need the check for the PTE here. The size should
always be PAGE_SIZE, no? After I remove the check, everything looks good.
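
I.e., something like this on top (untested sketch):

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ ... @@ arch_perf_get_page_size
 	pte = pte_offset_map(pmd, addr);
 	if (!pte_present(*pte)) {
 		pte_unmap(pte);
 		return 0;
 	}
 
-	page = pte_page(*pte);
-	if (PageHuge(page)) {
-		u64 size = page_size(compound_head(page));
-		pte_unmap(pte);
-		return size;
-	}
-
 	pte_unmap(pte);
 	return PAGE_SIZE;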

Thanks,
Kan

[ 167.383120] BUG: unable to handle page fault for address:
fffffca803fb8000
[ 167.383121] #PF: supervisor read access in kernel mode
[ 167.383121] #PF: error_code(0x0000) - not-present page
[ 167.383121] PGD 4adfc8067 P4D 4adfc8067 PUD 4adfc7067 PMD 0
[ 167.383122] Oops: 0000 [#1] SMP NOPTI
[ 167.383122] CPU: 0 PID: 2461 Comm: perf Not tainted
5.10.0-rc1_page_size+ #54
[ 167.383123] Hardware name: Intel Corporation Ice Lake Client
Platform/IceLake U DDR4 SODIMM PD RVP TLC, BIOS
ICLSFWR1.R00.3162.A00.1904162000 04/16/2019
[ 167.383123] traps: PANIC: double fault, error_code: 0x0
[ 167.383123] double fault: 0000 [#2] SMP NOPTI
[ 167.383123] CPU: 0 PID: 2461 Comm: perf Not tainted
5.10.0-rc1_page_size+ #54
[ 167.383124] Hardware name: Intel Corporation Ice Lake Client
Platform/IceLake U DDR4 SODIMM PD RVP TLC, BIOS
ICLSFWR1.R00.3162.A00.1904162000 04/16/2019
[ 167.383124] RIP: 0010:__sprint_symbol+0xb/0x100
[ 167.383124] Code: 85 e4 75 b9 48 85 c0 75 bc 49 89 d8 4c 89 e1 4c 89
fa 4c 89 f6 4c 89 ef e8 42 70 04 00 eb a6 0f 1f 44 00 00 55 48 89 e5 41
57 <41> 56 49 89 f6 41 55 41 89 cd 48 8d 4d b8 41 54 49 89 fc 53 48 63
[ 167.383125] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
[ 167.383125] RAX: 0000000000000053 RBX: 00000000ffff0a00 RCX:
0000000000000001
[ 167.383125] RDX: 0000000000000000 RSI: ffffffff9f8a6176 RDI:
fffffe000000b029
[ 167.383126] RBP: fffffe000000b008 R08: ffffffffa0bf8661 R09:
0000000000000001
[ 167.383126] R10: 000000000000000f R11: ffff9e641dc189c8 R12:
ffff9e641dc189c9
[ 167.383126] R13: ffff9e641dc1a7e0 R14: ffff0a00ffffff05 R15:
fffffe000000b029
[ 167.383126] FS: 0000000000000000(0000) GS:ffff9e641dc00000(0000)
knlGS:0000000000000000
[ 167.383127] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 167.383127] CR2: fffffe000000aff8 CR3: 000000014a0ba004 CR4:
0000000000770ef0
[ 167.383127] PKRU: 55555554
[ 167.383127] Call Trace:
[ 167.383127] <NMI>
[ 167.383128] sprint_symbol+0x15/0x20
[ 167.383128] symbol_string+0x49/0x90
[ 167.383128] pointer+0x118/0x3e0
[ 167.383128] vsnprintf+0x1ec/0x4e0
[ 167.383128] vscnprintf+0xd/0x30
[ 167.383129] printk_safe_log_store+0x65/0xe0
[ 167.383129] vprintk_func+0x8d/0x100
[ 167.383129] printk+0x58/0x6f
[ 167.383129] ? PageHuge+0x6/0x40
[ 167.383129] show_ip+0x2c/0x3d
[ 167.383130] show_iret_regs+0x17/0x40
[ 167.383130] __show_regs+0x27/0x40
[ 167.383130] ? dump_stack_print_info+0x9e/0xb0
[ 167.383130] show_regs+0x3b/0x50
[ 167.383130] __die_body+0x20/0x70
[ 167.383131] __die+0x2b/0x33
[ 167.383131] no_context+0x152/0x350
[ 167.383131] __bad_area_nosemaphore+0x50/0x160
[ 167.383131] bad_area_nosemaphore+0x16/0x20
[ 167.383131] do_kern_addr_fault+0x62/0x70
[ 167.383132] exc_page_fault+0xcd/0x150
[ 167.383132] asm_exc_page_fault+0x1e/0x30
[ 167.383132] RIP: 0010:PageHuge+0x6/0x40
[ 167.383132] Code: c8 48 89 0e 48 39 47 08 48 0f 46 47 08 48 89 02 c3
0f 1f 00 55 be 00 04 00 00 48 89 e5 e8 72 90 2a 00 5d c3 0f 1f 44 00 00
55 <48> 8b 07 48 89 e5 a9 00 00 01 00 75 09 48 8b 47 08 83 e0 01 74 17
[ 167.383133] RSP: 0018:fffffe000000b5c0 EFLAGS: 00010086
[ 167.383133] RAX: 00000000fee0017b RBX: fffffca803fb8000 RCX:
0000000000000000
[ 167.383133] RDX: 7fffffff011ffe84 RSI: ffff9e635b614fe8 RDI:
fffffca803fb8000
[ 167.383134] RBP: fffffe000000b5d8 R08: 000fffffc0000000 R09:
000000000000000d
[ 167.383134] R10: 0000000000000001 R11: 00000000000011da R12:
000000000048c14f
[ 167.383134] R13: fffffe000000b8c0 R14: ffff9e60955aa800 R15:
ffffffffff5fd340
[ 167.383134] ? arch_perf_get_page_size+0x2e4/0x330
[ 167.383135] perf_get_page_size.part.0+0x3c/0x50
[ 167.383135] perf_prepare_sample+0x1cc/0x740
[ 167.383135] perf_event_output_forward+0x30/0x90
[ 167.383135] ? sched_clock+0x9/0x10
[ 167.383135] ? __perf_event_account_interrupt+0xcc/0xe0
[ 167.383136] __perf_event_overflow+0x57/0xf0
[ 167.383136] perf_event_overflow+0x14/0x20
[ 167.383136] __intel_pmu_pebs_event+0x2a8/0x3a0
[ 167.383136] ? get_data_src.isra.0+0xc0/0xc0
[ 167.383136] ? perf_event_stop+0xa0/0xa0
[ 167.383137] ? native_apic_mem_write+0x2/0x10
[ 167.383137] ? native_apic_mem_write+0x2/0x10
[ 167.383137] intel_pmu_drain_pebs_icl+0x1bf/0x1f0
[ 167.383137] ? get_data_src.isra.0+0xc0/0xc0
[ 167.383137] handle_pmi_common+0xb6/0x2a0
[ 167.383138] intel_pmu_handle_irq+0xca/0x170
[ 167.383138] perf_event_nmi_handler+0x2d/0x50
[ 167.383138] nmi_handle+0x5d/0x100
[ 167.383138] default_do_nmi+0x45/0x110
[ 167.383138] exc_nmi+0x15a/0x1a0
[ 167.383139] end_repeat_nmi+0x16/0x55
[ 167.383139] RIP: 0010:perf_iterate_ctx+0x0/0x170
[ 167.383139] Code: b0 95 01 41 89 c5 72 d9 48 c7 c7 40 a0 fb a0 e8 16
47 b8 00 b8 f4 ff ff ff e9 cb fe ff ff 66 90 66 2e 0f 1f 84 00 00 00 00
00 <55> 48 89 e5 41 57 41 56 41 55 41 54 4c 8d 67 60 53 48 83 ec 08 48
[ 167.383140] RSP: 0018:ffffb51d01907ca8 EFLAGS: 00000246
[ 167.383140] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
0000000000000001
[ 167.383140] RDX: 0000000000000000 RSI: ffffffff9f815460 RDI:
ffff9e60c5423500
[ 167.383140] RBP: ffffb51d01907d08 R08: 00000026f8cdcae0 R09:
0000000000000021
[ 167.383141] R10: 0000000000000000 R11: ffff9e641dc11ec8 R12:
0000000000000000
[ 167.383141] R13: ffff9e641dc33300 R14: ffff9e60aaf69e00 R15:
ffff9e60c5423500
[ 167.383141] ? perf_event_stop+0xa0/0xa0
[ 167.383141] ? perf_swevent_init+0x190/0x190
[ 167.383142] ? perf_swevent_init+0x190/0x190
[ 167.383142] </NMI>
[ 167.383142] ? perf_event_exec+0x1ca/0x210
[ 167.383142] begin_new_exec+0x5ba/0x980
[ 167.383142] load_elf_binary+0x6ae/0x17a0
[ 167.383143] ? __kernel_read+0x1a0/0x2d0
[ 167.383143] ? __kernel_read+0x1a0/0x2d0
[ 167.383143] bprm_execve+0x2f6/0x660
[ 167.383143] do_execveat_common.isra.0+0x189/0x1c0
[ 167.383143] __x64_sys_execve+0x37/0x50
[ 167.383144] do_syscall_64+0x38/0x90
[ 167.383144] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 167.383144] RIP: 0033:0x7f2c347dae97
[ 167.383144] Code: Unable to access opcode bytes at RIP 0x7f2c347dae6d.
[ 167.383144] RSP: 002b:00007ffe73f584a8 EFLAGS: 00000202 ORIG_RAX:
000000000000003b
[ 167.383145] RAX: ffffffffffffffda RBX: 00005625a990cd70 RCX:
00007f2c347dae97
[ 167.383145] RDX: 00005625a990caf0 RSI: 00005625a990cd70 RDI:
00007ffe73f6069f
[ 167.383145] RBP: 00007ffe73f58540 R08: 00007ffe73f584a0 R09:
00007f2c368502b0
[ 167.383146] R10: 0000000000000016 R11: 0000000000000202 R12:
00005625a990caf0
[ 167.383146] R13: 00005625a8c06870 R14: 0000000000000000 R15:
00007ffe73f6069f
[ 167.383146] Modules linked in: snd_hda_codec_hdmi
snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio
nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo xt_addrtype
br_netfilter xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat nf_nat
xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c
ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter
ebtables ip6table_filter ip6_tables iptable_filter overlay bnep
hid_sensor_magn_3d hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d
hid_sensor_rotation hid_sensor_incl_3d hid_sensor_trigger
industrialio_triggered_buffer kfifo_buf hid_sensor_iio_common
industrialio hid_sensor_custom hid_sensor_hub intel_ishtp_hid
binfmt_misc spi_pxa2xx_platform dw_dmac dw_dmac_core 8250_dw mei_hdcp
i2c_designware_platform i2c_designware_core intel_rapl_msr wmi_bmof
intl_wmi_thunderbolt x86_pkg_temp_thermal snd_hda_intel intel_powerclamp
coretemp snd_intel_dspcfg nls_iso8859_1 snd_hda_codec kvm_intel
snd_hda_core snd_hwdep kvm
[ 167.383158] snd_pcm irqbypass snd_seq_midi crct10dif_pclmul
crc32_pclmul ghash_clmulni_intel snd_seq_midi_event snd_rawmidi
aesni_intel crypto_simd cryptd snd_seq glue_helper rapl snd_seq_device
intel_cstate snd_timer ofpart cmdlinepart intel_spi_pci snd e1000e
intel_spi spi_nor soundcore mtd iwlmvm mei_me mei mac80211 libarc4
iwlwifi intel_lpss_pci intel_lpss cfg80211 asix usbnet mii joydev btusb
btrtl btbcm btintel bluetooth ecdh_generic ecc intel_ish_ipc
processor_thermal_device intel_ishtp intel_rapl_common
int340x_thermal_zone thunderbolt intel_soc_dts_iosf ucsi_acpi typec_ucsi
typec wmi intel_hid int3400_thermal sparse_keymap acpi_pad
acpi_thermal_rel acpi_tad sch_fq_codel parport_pc ppdev lp parport
ip_tables x_tables hid_generic usbhid hid input_leds serio_raw mac_hid
pinctrl_icelake
[ 168.121417] ---[ end trace 7f4c6d2d09e5e90f ]---
[ 168.121417] RIP: 0010:PageHuge+0x6/0x40
[ 168.121417] Code: c8 48 89 0e 48 39 47 08 48 0f 46 47 08 48 89 02 c3
0f 1f 00 55 be 00 04 00 00 48 89 e5 e8 72 90 2a 00 5d c3 0f 1f 44 00 00
55 <48> 8b 07 48 89 e5 a9 00 00 01 00 75 09 48 8b 47 08 83 e0 01 74 17
[ 168.121418] RSP: 0018:fffffe000000b5c0 EFLAGS: 00010086
[ 168.121418] RAX: 00000000fee0017b RBX: fffffca803fb8000 RCX:
0000000000000000
[ 168.121419] RDX: 7fffffff011ffe84 RSI: ffff9e635b614fe8 RDI:
fffffca803fb8000
[ 168.121419] RBP: fffffe000000b5d8 R08: 000fffffc0000000 R09:
000000000000000d
[ 168.121419] R10: 0000000000000001 R11: 00000000000011da R12:
000000000048c14f
[ 168.121419] R13: fffffe000000b8c0 R14: ffff9e60955aa800 R15:
ffffffffff5fd340
[ 168.121420] FS: 0000000000000000(0000) GS:ffff9e641dc00000(0000)
knlGS:0000000000000000
[ 168.121420] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 168.121420] CR2: 00007f2c347dae6d CR3: 000000014a0ba004 CR4:
0000000000770ef0
[ 168.121420] PKRU: 55555554
[ 168.121421] Kernel panic - not syncing: Fatal exception in interrupt
[ 168.121520] Kernel Offset: 0x1e600000 from 0xffffffff81000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)

2020-11-10 15:24:45

by Liang, Kan

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE



On 11/4/2020 12:11 PM, Liang, Kan wrote:
>
>
> On 10/13/2020 12:34 PM, Peter Zijlstra wrote:
>> Subject: perf,mm: Handle non-page-table-aligned hugetlbfs
>> From: Peter Zijlstra<[email protected]>
>> Date: Fri, 9 Oct 2020 11:09:27 +0200
>>
>> A limited number of architectures support hugetlbfs sizes that do not
>> align with the page tables (ARM64, Power, Sparc64). Add support for
>> this to the generic perf_get_page_size() implementation, and also
>> allow an architecture to override this implementation.
>>
>> The latter is only needed when an architecture uses non-page-table
>> aligned huge pages in its kernel map.
>>
>> Signed-off-by: Peter Zijlstra (Intel)<[email protected]>
>> ---
>>   kernel/events/core.c |   39 +++++++++++++++++++++++++++++++++------
>>   1 file changed, 33 insertions(+), 6 deletions(-)
>>
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6996,10 +6996,18 @@ static u64 perf_virt_to_phys(u64 virt)
>>   #ifdef CONFIG_MMU
>>   /*
>> - * Return the MMU page size of a given virtual address
>> + * Return the MMU page size of a given virtual address.
>> + *
>> + * This generic implementation handles page-table aligned huge pages,
>> as well
>> + * as non-page-table aligned hugetlbfs compound pages.
>> + *
>> + * If an architecture supports and uses non-page-table aligned pages in its
>> + * kernel mapping, it will need to provide its own implementation of this
>> + * function.
>>    */
>> -static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long
>> addr)
>> +__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned
>> long addr)
>>   {
>> +    struct page *page;
>>       pgd_t *pgd;
>>       p4d_t *p4d;
>>       pud_t *pud;
>> @@ -7021,15 +7029,27 @@ static u64 __perf_get_page_size(struct m
>>       if (!pud_present(*pud))
>>           return 0;
>> -    if (pud_leaf(*pud))
>> +    if (pud_leaf(*pud)) {
>> +#ifdef pud_page
>> +        page = pud_page(*pud);
>> +        if (PageHuge(page))
>> +            return page_size(compound_head(page));
>> +#endif
>>           return 1ULL << PUD_SHIFT;
>> +    }
>>       pmd = pmd_offset(pud, addr);
>>       if (!pmd_present(*pmd))
>>           return 0;
>> -    if (pmd_leaf(*pmd))
>> +    if (pmd_leaf(*pmd)) {
>> +#ifdef pmd_page
>> +        page = pmd_page(*pmd);
>> +        if (PageHuge(page))
>> +            return page_size(compound_head(page));
>> +#endif
>>           return 1ULL << PMD_SHIFT;
>> +    }
>>       pte = pte_offset_map(pmd, addr);
>>       if (!pte_present(*pte)) {
>> @@ -7037,13 +7057,20 @@ static u64 __perf_get_page_size(struct m
>>           return 0;
>>       }
>> +    page = pte_page(*pte);
>> +    if (PageHuge(page)) {
>> +        u64 size = page_size(compound_head(page));
>> +        pte_unmap(pte);
>> +        return size;
>> +    }
>> +
>
> The PageHuge() check for the PTE crashes my machine when I run the page
> size test. (Sorry, I didn't find the issue earlier. I just found some
> time to re-run the page size test.)
>
> It seems we don't need the check for the PTE here. The size should
> always be PAGE_SIZE, no? After I remove the check, everything looks good.
>

Hi Peter,

Any comments on this issue?

To my understanding, we don't need the PageHuge() check for the PTE
page here. However, I don't expect the page fault to crash here, because
perf already checks pte_present(*pte) before invoking PageHuge().

I'm not sure if it's triggered by other potential issues, so I'm asking
here rather than submitting a patch to simply remove the PageHuge() check.

Thanks,
Kan
> [ crash log snipped; identical to the one in my previous mail ]
> 00007f2c368502b0
> [  167.383146] R10: 0000000000000016 R11: 0000000000000202 R12:
> 00005625a990caf0
> [  167.383146] R13: 00005625a8c06870 R14: 0000000000000000 R15:
> 00007ffe73f6069f
> [  167.383146] Modules linked in: snd_hda_codec_hdmi
> snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio
> nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo xt_addrtype
> br_netfilter xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat nf_nat
> xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c
> ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter
> ebtables ip6table_filter ip6_tables iptable_filter overlay bnep
> hid_sensor_magn_3d hid_sensor_gyro_3d hid_sensor_als hid_sensor_accel_3d
> hid_sensor_rotation hid_sensor_incl_3d hid_sensor_trigger
> industrialio_triggered_buffer kfifo_buf hid_sensor_iio_common
> industrialio hid_sensor_custom hid_sensor_hub intel_ishtp_hid
> binfmt_misc spi_pxa2xx_platform dw_dmac dw_dmac_core 8250_dw mei_hdcp
> i2c_designware_platform i2c_designware_core intel_rapl_msr wmi_bmof
> intel_wmi_thunderbolt x86_pkg_temp_thermal snd_hda_intel intel_powerclamp
> coretemp snd_intel_dspcfg nls_iso8859_1 snd_hda_codec kvm_intel
> snd_hda_core snd_hwdep kvm
> [  167.383158]  snd_pcm irqbypass snd_seq_midi crct10dif_pclmul
> crc32_pclmul ghash_clmulni_intel snd_seq_midi_event snd_rawmidi
> aesni_intel crypto_simd cryptd snd_seq glue_helper rapl snd_seq_device
> intel_cstate snd_timer ofpart cmdlinepart intel_spi_pci snd e1000e
> intel_spi spi_nor soundcore mtd iwlmvm mei_me mei mac80211 libarc4
> iwlwifi intel_lpss_pci intel_lpss cfg80211 asix usbnet mii joydev btusb
> btrtl btbcm btintel bluetooth ecdh_generic ecc intel_ish_ipc
> processor_thermal_device intel_ishtp intel_rapl_common
> int340x_thermal_zone thunderbolt intel_soc_dts_iosf ucsi_acpi typec_ucsi
> typec wmi intel_hid int3400_thermal sparse_keymap acpi_pad
> acpi_thermal_rel acpi_tad sch_fq_codel parport_pc ppdev lp parport
> ip_tables x_tables hid_generic usbhid hid input_leds serio_raw mac_hid
> pinctrl_icelake
> [  168.121417] ---[ end trace 7f4c6d2d09e5e90f ]---
> [  168.121417] RIP: 0010:PageHuge+0x6/0x40
> [  168.121417] Code: c8 48 89 0e 48 39 47 08 48 0f 46 47 08 48 89 02 c3
> 0f 1f 00 55 be 00 04 00 00 48 89 e5 e8 72 90 2a 00 5d c3 0f 1f 44 00 00
> 55 <48> 8b 07 48 89 e5 a9 00 00 01 00 75 09 48 8b 47 08 83 e0 01 74 17
> [  168.121418] RSP: 0018:fffffe000000b5c0 EFLAGS: 00010086
> [  168.121418] RAX: 00000000fee0017b RBX: fffffca803fb8000 RCX:
> 0000000000000000
> [  168.121419] RDX: 7fffffff011ffe84 RSI: ffff9e635b614fe8 RDI:
> fffffca803fb8000
> [  168.121419] RBP: fffffe000000b5d8 R08: 000fffffc0000000 R09:
> 000000000000000d
> [  168.121419] R10: 0000000000000001 R11: 00000000000011da R12:
> 000000000048c14f
> [  168.121419] R13: fffffe000000b8c0 R14: ffff9e60955aa800 R15:
> ffffffffff5fd340
> [  168.121420] FS:  0000000000000000(0000) GS:ffff9e641dc00000(0000)
> knlGS:0000000000000000
> [  168.121420] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  168.121420] CR2: 00007f2c347dae6d CR3: 000000014a0ba004 CR4:
> 0000000000770ef0
> [  168.121420] PKRU: 55555554
> [  168.121421] Kernel panic - not syncing: Fatal exception in interrupt
> [  168.121520] Kernel Offset: 0x1e600000 from 0xffffffff81000000
> (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

2020-11-11 10:00:18

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 04, 2020 at 12:11:16PM -0500, Liang, Kan wrote:
> On 10/13/2020 12:34 PM, Peter Zijlstra wrote:

> > @@ -7037,13 +7057,20 @@ static u64 __perf_get_page_size(struct m
> > return 0;
> > }
> > + page = pte_page(*pte);
> > + if (PageHuge(page)) {
> > + u64 size = page_size(compound_head(page));
> > + pte_unmap(pte);
> > + return size;
> > + }
> > +
>
> The PageHuge() check for PTE crashes my machine when I did the page size test.
> (Sorry, I didn't find the issue earlier. I just found some time to re-run
> the page size test.)
>
> It seems we don't need the check for PTE here. The size should always
> be PAGE_SIZE, no? After I remove the check, everything looks good.

That's the thing: an architecture could have non-page-table-aligned
huge pages, for example using 4 consecutive 4k pages to create a 16k
page. In that case the above code would trigger and find a 16k compound
page with HUGETLB_PAGE_DTOR (assuming it was created through hugetlbfs).

What is this page size test? I'd like to reproduce it.

2020-11-11 11:25:29

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 10:57:50AM +0100, Peter Zijlstra wrote:
> On Wed, Nov 04, 2020 at 12:11:16PM -0500, Liang, Kan wrote:
> > On 10/13/2020 12:34 PM, Peter Zijlstra wrote:
>
> > > @@ -7037,13 +7057,20 @@ static u64 __perf_get_page_size(struct m
> > > return 0;
> > > }
> > > + page = pte_page(*pte);
> > > + if (PageHuge(page)) {
> > > + u64 size = page_size(compound_head(page));
> > > + pte_unmap(pte);
> > > + return size;
> > > + }
> > > +
> >
> > The PageHuge() check for PTE crashes my machine when I did the page size test.
> > (Sorry, I didn't find the issue earlier. I just found some time to re-run
> > the page size test.)
> >
> > It seems we don't need the check for PTE here. The size should always
> > be PAGE_SIZE, no? After I remove the check, everything looks good.
>
> That's the thing: an architecture could have non-page-table-aligned
> huge pages, for example using 4 consecutive 4k pages to create a 16k
> page. In that case the above code would trigger and find a 16k compound
> page with HUGETLB_PAGE_DTOR (assuming it was created through hugetlbfs).

So, from your splat:

> [ 167.383120] BUG: unable to handle page fault for address: fffffca803fb8000
> [ 167.383121] #PF: supervisor read access in kernel mode
> [ 167.383121] #PF: error_code(0x0000) - not-present page
> [ 167.383121] PGD 4adfc8067 P4D 4adfc8067 PUD 4adfc7067 PMD 0
> [ 167.383122] Oops: 0000 [#1] SMP NOPTI
> [ 167.383122] CPU: 0 PID: 2461 Comm: perf Not tainted 5.10.0-rc1_page_size+ #54
> [ 167.383123] Hardware name: Intel Corporation Ice Lake Client Platform/IceLake U DDR4 SODIMM PD RVP TLC, BIOS ICLSFWR1.R00.3162.A00.1904162000 04/16/2019
> [ 167.383123] traps: PANIC: double fault, error_code: 0x0
> [ 167.383123] double fault: 0000 [#2] SMP NOPTI
> [ 167.383123] CPU: 0 PID: 2461 Comm: perf Not tainted 5.10.0-rc1_page_size+ #54
> [ 167.383124] Hardware name: Intel Corporation Ice Lake Client Platform/IceLake U DDR4 SODIMM PD RVP TLC, BIOS ICLSFWR1.R00.3162.A00.1904162000 04/16/2019
> [ 167.383124] RIP: 0010:__sprint_symbol+0xb/0x100
> [ 167.383124] Code: 85 e4 75 b9 48 85 c0 75 bc 49 89 d8 4c 89 e1 4c 89 fa 4c 89 f6 4c 89 ef e8 42 70 04 00 eb a6 0f 1f 44 00 00 55 48 89 e5 41 57 <41> 56 49 89 f6 41 55 41 89 cd 48 8d 4d b8 41 54 49 89 fc 53 48 63
> [ 167.383125] RSP: 0018:fffffe000000b000 EFLAGS: 00010046
> [ 167.383125] RAX: 0000000000000053 RBX: 00000000ffff0a00 RCX: 0000000000000001
> [ 167.383125] RDX: 0000000000000000 RSI: ffffffff9f8a6176 RDI: fffffe000000b029
> [ 167.383126] RBP: fffffe000000b008 R08: ffffffffa0bf8661 R09: 0000000000000001
> [ 167.383126] R10: 000000000000000f R11: ffff9e641dc189c8 R12: ffff9e641dc189c9
> [ 167.383126] R13: ffff9e641dc1a7e0 R14: ffff0a00ffffff05 R15: fffffe000000b029
> [ 167.383126] FS: 0000000000000000(0000) GS:ffff9e641dc00000(0000) knlGS:0000000000000000
> [ 167.383127] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 167.383127] CR2: fffffe000000aff8 CR3: 000000014a0ba004 CR4: 0000000000770ef0
> [ 167.383127] PKRU: 55555554
> [ 167.383127] Call Trace:
> [ 167.383127] <NMI>
> [ 167.383128] sprint_symbol+0x15/0x20
> [ 167.383128] symbol_string+0x49/0x90
> [ 167.383128] pointer+0x118/0x3e0
> [ 167.383128] vsnprintf+0x1ec/0x4e0
> [ 167.383128] vscnprintf+0xd/0x30
> [ 167.383129] printk_safe_log_store+0x65/0xe0
> [ 167.383129] vprintk_func+0x8d/0x100
> [ 167.383129] printk+0x58/0x6f
> [ 167.383129] ? PageHuge+0x6/0x40
> [ 167.383129] show_ip+0x2c/0x3d
> [ 167.383130] show_iret_regs+0x17/0x40
> [ 167.383130] __show_regs+0x27/0x40
> [ 167.383130] ? dump_stack_print_info+0x9e/0xb0
> [ 167.383130] show_regs+0x3b/0x50
> [ 167.383130] __die_body+0x20/0x70
> [ 167.383131] __die+0x2b/0x33
> [ 167.383131] no_context+0x152/0x350
> [ 167.383131] __bad_area_nosemaphore+0x50/0x160
> [ 167.383131] bad_area_nosemaphore+0x16/0x20
> [ 167.383131] do_kern_addr_fault+0x62/0x70
> [ 167.383132] exc_page_fault+0xcd/0x150
> [ 167.383132] asm_exc_page_fault+0x1e/0x30
> [ 167.383132] RIP: 0010:PageHuge+0x6/0x40
> [ 167.383132] Code: c8 48 89 0e 48 39 47 08 48 0f 46 47 08 48 89 02 c3 0f 1f 00 55 be 00 04 00 00 48 89 e5 e8 72 90 2a 00 5d c3 0f 1f 44 00 00 55 <48> 8b 07 48 89 e5 a9 00 00 01 00 75 09 48 8b 47 08 83 e0 01 74 17
> [ 167.383133] RSP: 0018:fffffe000000b5c0 EFLAGS: 00010086
> [ 167.383133] RAX: 00000000fee0017b RBX: fffffca803fb8000 RCX: 0000000000000000
> [ 167.383133] RDX: 7fffffff011ffe84 RSI: ffff9e635b614fe8 RDI: fffffca803fb8000
> [ 167.383134] RBP: fffffe000000b5d8 R08: 000fffffc0000000 R09: 000000000000000d
> [ 167.383134] R10: 0000000000000001 R11: 00000000000011da R12: 000000000048c14f
> [ 167.383134] R13: fffffe000000b8c0 R14: ffff9e60955aa800 R15: ffffffffff5fd340
> [ 167.383134] ? arch_perf_get_page_size+0x2e4/0x330
> [ 167.383135] perf_get_page_size.part.0+0x3c/0x50
> [ 167.383135] perf_prepare_sample+0x1cc/0x740
> [ 167.383135] perf_event_output_forward+0x30/0x90
> [ 167.383135] ? sched_clock+0x9/0x10
> [ 167.383135] ? __perf_event_account_interrupt+0xcc/0xe0
> [ 167.383136] __perf_event_overflow+0x57/0xf0
> [ 167.383136] perf_event_overflow+0x14/0x20

The faulting instruction decodes to:

48 8b 07 mov (%rdi),%rax

And the RDI value is indeed fffffca803fb8000 as per the first BUG line,
which, afaict, is in a hole per x86_64/mm.rst

Trying to match the Code: bytes against PageHuge() as generated here puts
the fault in the PageCompound() test; it's not even compound_head() going bad.

must ponder more...

2020-11-11 12:46:10

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 12:22:46PM +0100, Peter Zijlstra wrote:

> Trying to match the Code: bytes against PageHuge() as generated here puts
> the fault in the PageCompound() test; it's not even compound_head() going bad.
>
> must ponder more...

Oooh.. does this help?
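
The gist of it: read each page-table level exactly once with READ_ONCE()
into a local copy and do every subsequent test on that copy, so a
concurrent teardown can race us into seeing a stale or 'none' entry, but
never into testing two different values of the same entry. And don't do
pte_page() on devmap/special PTEs; special mappings can have no struct
page behind the PFN at all.

In isolation, the snapshot pattern is just this (stand-alone user-space
sketch; entry_t and the one-entry 'table' are made up for illustration,
READ_ONCE open-coded):

	#include <stdint.h>
	#include <stdio.h>

	#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

	typedef uint64_t entry_t;	/* stand-in for pgd_t..pte_t */
	static entry_t table[1];	/* one fake page-table level */

	static uint64_t walk_one_level(void)
	{
		entry_t e = READ_ONCE(table[0]);	/* single snapshot */

		if (e == 0)		/* 'none' test on the local copy */
			return 0;
		if (e & 1)		/* 'leaf' test on the same copy, */
			return 1ULL << 21;	/* so the two agree */
		return 1ULL << 12;
	}

	int main(void)
	{
		table[0] = 2;		/* present, not a leaf */
		printf("%llu\n", (unsigned long long)walk_one_level());
		return 0;
	}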

---
kernel/events/core.c | 81 +++++++++++++++++++++++++++++++---------------------
1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d2f3ca792936..3b42576c99f1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7015,65 +7015,82 @@ static u64 perf_virt_to_phys(u64 virt)
*/
__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
struct page *page;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
return 0;

- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
+ p4dp = p4d_offset(&pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
return 0;

- if (p4d_leaf(*p4d))
+ if (p4d_leaf(p4d))
return 1ULL << P4D_SHIFT;

- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ pudp = pud_offset(&p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
return 0;

- if (pud_leaf(*pud)) {
+ if (pud_leaf(pud)) {
#ifdef pud_page
- page = pud_page(*pud);
- if (PageHuge(page))
- return page_size(compound_head(page));
+ if (!pud_devmap(pud)) {
+ page = pud_page(pud);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+ }
#endif
return 1ULL << PUD_SHIFT;
}

- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
+ pmdp = pmd_offset(&pud, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (!pmd_present(pmd))
return 0;

- if (pmd_leaf(*pmd)) {
+ if (pmd_leaf(pmd)) {
#ifdef pmd_page
- page = pmd_page(*pmd);
- if (PageHuge(page))
- return page_size(compound_head(page));
+ if (!pmd_devmap(pmd)) {
+ page = pmd_page(pmd);
+ if (PageHuge(page))
+ return page_size(compound_head(page));
+ }
#endif
return 1ULL << PMD_SHIFT;
}

- pte = pte_offset_map(pmd, addr);
- if (!pte_present(*pte)) {
- pte_unmap(pte);
+ ptep = pte_offset_map(&pmd, addr);
+ pte = READ_ONCE(*ptep); // gup_get_pte()
+ if (!pte_present(pte)) {
+ pte_unmap(ptep);
return 0;
}

- page = pte_page(*pte);
- if (PageHuge(page)) {
- u64 size = page_size(compound_head(page));
- pte_unmap(pte);
- return size;
+ if (!pte_devmap(pte) && !pte_special(pte)) {
+ page = pte_page(pte);
+ if (PageHuge(page)) {
+ u64 size = page_size(compound_head(page));
+ pte_unmap(ptep);
+ return size;
+ }
}

- pte_unmap(pte);
+ pte_unmap(ptep);
return PAGE_SIZE;
+
+#else /* CONFIG_ARCH_HAS_PTE_SPECIAL */
+
+ return 0;
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
}

#else

2020-11-11 15:33:13

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 01:43:57PM +0100, Peter Zijlstra wrote:
> + if (pud_leaf(pud)) {
> #ifdef pud_page
> - page = pud_page(*pud);
> - if (PageHuge(page))
> - return page_size(compound_head(page));
> + if (!pud_devmap(pud)) {
> + page = pud_page(pud);
> + if (PageHuge(page))
> + return page_size(compound_head(page));
> + }
> #endif
> return 1ULL << PUD_SHIFT;

This confuses me. Why only special-case hugetlbfs pages here? Should
they really be treated differently from THP? If you want to consider
that we might be mapping a page that's twice as big as a PUD entry and
this is only half of it, then the simple way is:

if (pud_leaf(pud)) {
#ifdef pud_page
page = compound_head(pud_page(*pud));
return page_size(page);
#else
return 1ULL << PUD_SHIFT;
#endif
}

Also, what's up with the special-casing of devmap pages here? Did the
devmap people fuck up their compound pages? If so, they should fix their
shit, not expect the rest of the kernel to work around this brokenness.

2020-11-11 15:55:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 03:30:22PM +0000, Matthew Wilcox wrote:
> On Wed, Nov 11, 2020 at 01:43:57PM +0100, Peter Zijlstra wrote:
> > + if (pud_leaf(pud)) {
> > #ifdef pud_page
> > - page = pud_page(*pud);
> > - if (PageHuge(page))
> > - return page_size(compound_head(page));
> > + if (!pud_devmap(pud)) {
> > + page = pud_page(pud);
> > + if (PageHuge(page))
> > + return page_size(compound_head(page));
> > + }
> > #endif
> > return 1ULL << PUD_SHIFT;
>
> This confuses me. Why only special-case hugetlbfs pages here? Should
> they really be treated differently from THP?

Do we have non-page-table-aligned THP? I thought THP was always PUD
sized.

> If you want to consider that we might be mapping a page that's twice
> as big as a PUD entry and this is only half of it, then the simple way
> is:
>
> if (pud_leaf(pud)) {
> #ifdef pud_page
> page = compound_head(pud_page(*pud));
> return page_size(page);
> #else
> return 1ULL << PUD_SHIFT;
> #endif
> }
>
> Also, what's up with the special-casing of devmap pages here? Did the
> devmap people fuck up their compound pages? If so, they should fix their
> shit, not expect the rest of the kernel to work around this brokenness.

Well, the PTE code we have today (in tip/perf/core) is:

pte = pte_offset_map(pmd, addr);
if (!pte_present(*pte)) {
pte_unmap(pte);
return 0;
}

page = pte_page(*pte);
if (PageHuge(page)) {
u64 size = page_size(compound_head(page));
pte_unmap(pte);
return size;
}

pte_unmap(pte);
return PAGE_SIZE;

and that's crashing in PageHuge()'s PageCompound() test. Clearly I
should be checking pte_special() here (as well as all the READ_ONCE()s I
added in the patch you just commented on). But I wasn't quite sure about
devmap and paranoia won.

You're saying devmap pages should be valid compound pages? Then I can
remove all that and keep only pte_special().
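
The PTE tail would then end up something like this (untested sketch,
same locals as in the patch above):

	/* pte_special() => possibly no struct page behind the PFN */
	if (!pte_special(pte)) {
		page = pte_page(pte);
		if (PageHuge(page)) {
			u64 size = page_size(compound_head(page));
			pte_unmap(ptep);
			return size;
		}
	}

	pte_unmap(ptep);
	return PAGE_SIZE;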

2020-11-11 16:01:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 03:30:22PM +0000, Matthew Wilcox wrote:
> This confuses me. Why only special-case hugetlbfs pages here? Should
> they really be treated differently from THP? If you want to consider
> that we might be mapping a page that's twice as big as a PUD entry and
> this is only half of it, then the simple way is:
>
> if (pud_leaf(pud)) {
> #ifdef pud_page
> page = compound_head(pud_page(*pud));
> return page_size(page);

Also, this is 'wrong'. The purpose of this function is to return the
hardware TLB size of a given address. The above will return the compound
size for any random compound page, which can exist for myriad reasons.

So the PageHuge() thing tells us it is a HugeTLB page and those are
(barring hardware TLB promotion/demotion) guaranteed to reflect the
actual TLB size.

> #else
> return 1ULL << PUD_SHIFT;
> #endif
> }

2020-11-11 16:41:38

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 04:57:24PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 11, 2020 at 03:30:22PM +0000, Matthew Wilcox wrote:
> > This confuses me. Why only special-case hugetlbfs pages here? Should
> > they really be treated differently from THP? If you want to consider
> > that we might be mapping a page that's twice as big as a PUD entry and
> > this is only half of it, then the simple way is:
> >
> > if (pud_leaf(pud)) {
> > #ifdef pud_page
> > page = compound_head(pud_page(*pud));
> > return page_size(page);
>
> Also, this is 'wrong'. The purpose of this function is to return the
> hardware TLB size of a given address. The above will return the compound
> size for any random compound page, which can exist for myriad reasons.

Oh, then the whole thing is overly complicated. This should just be

if (pud_leaf(pud))
return PUD_SIZE;

2020-11-11 17:26:44

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 04:38:48PM +0000, Matthew Wilcox wrote:
> On Wed, Nov 11, 2020 at 04:57:24PM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 11, 2020 at 03:30:22PM +0000, Matthew Wilcox wrote:
> > > This confuses me. Why only special-case hugetlbfs pages here? Should
> > > they really be treated differently from THP? If you want to consider
> > > that we might be mapping a page that's twice as big as a PUD entry and
> > > this is only half of it, then the simple way is:
> > >
> > > if (pud_leaf(pud)) {
> > > #ifdef pud_page
> > > page = compound_head(pud_page(*pud));
> > > return page_size(page);
> >
> > Also, this is 'wrong'. The purpose of this function is to return the
> > hardware TLB size of a given address. The above will return the compound
> > size for any random compound page, which can exist for myriad reasons.
>
> Oh, then the whole thing is overly complicated. This should just be
>
> if (pud_leaf(pud))
> return PUD_SIZE;

But that doesn't handle non-pagetable aligned hugetlb sizes. Granted,
that's unlikely at the PUD level, but why be inconsistent..

So we really want:

if (p*d_leaf(p*d)) {
if (!'special') {
page = p*d_page(p*d);
if (PageHuge(page))
return page_size(compound_head(page));
}
return P*D_SIZE;
}

That gets us:

- regular page-table aligned large-pages
- 'funny' hugetlb sizes

The only thing it doesn't get us is kernel usage of 'funny' sizes,
which is why that function is weak (arm64, powerpc32 and sparc64 have
funny sizes, and at the very least arm64 uses them for kernel mappings
too).

Now, when you add !PMD THP sizes (presumably for architectures that have
'funny' sizes, otherwise what's the point), then you get to add '||
PageTransHuge()' to the above PageHuge() (and fix PageTransHuge() to
actually do what it claims it does).
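
Concretely, at the PMD level the above pattern would look something like
this (sketch only, untested; pmd_devmap() standing in for the 'special'
test):

	if (pmd_leaf(pmd)) {
		if (!pmd_devmap(pmd)) {
			page = pmd_page(pmd);
			/* HugeTLB pages reflect the actual TLB size */
			if (PageHuge(page))
				return page_size(compound_head(page));
		}
		return 1ULL << PMD_SHIFT;
	}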

Arguably we could fix arm64 with something like the below, but then, I'd
have to audit powerpc32 and sparc64 again to see if I can make that work
for them too -- not today.

---

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7003,6 +7003,10 @@ static u64 perf_virt_to_phys(u64 virt)

#ifdef CONFIG_MMU

+#ifndef pte_cont
+#define pte_cont(pte) (false)
+#endif
+
/*
* Return the MMU page size of a given virtual address.
*
@@ -7077,7 +7081,7 @@ __weak u64 arch_perf_get_page_size(struc

if (!pte_devmap(pte) && !pte_special(pte)) {
page = pte_page(pte);
- if (PageHuge(page)) {
+ if (PageHuge(page) || pte_cont(pte)) {
u64 size = page_size(compound_head(page));
pte_unmap(ptep);
return size;

2020-11-11 18:30:55

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 06:22:53PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 11, 2020 at 04:38:48PM +0000, Matthew Wilcox wrote:
> > if (pud_leaf(pud))
> > return PUD_SIZE;
>
> But that doesn't handle non-pagetable aligned hugetlb sizes. Granted,
> that's unlikely at the PUD level, but why be inconsistent..
>
> So we really want:
>
> if (p*d_leaf(p*d)) {
> if (!'special') {
> page = p*d_page(p*d);
> if (PageHuge(page))
> return page_size(compound_head(page));
> }
> return P*D_SIZE;
> }

Still doesn't work because pages can be mapped at funny offsets.

What we really want is for a weak definition of

unsigned long tlb_size(struct mm_struct *mm, unsigned long addr)
{
if (p*d_leaf(p*d))
return p*d_size(p*d);
}

then ARM can look at its special bit in the page table to determine
whether this is a singleton or part of a brace of pages.
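
where, for example, ARM's pmd_size() could be as simple as (sketch;
pmd_size() is a hypothetical helper here, the other names are arm64's):

	static inline unsigned long pmd_size(pmd_t pmd)
	{
		/* contiguous-hint bit => a block of CONT_PMDS entries */
		return pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE;
	}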

> Now, when you add !PMD THP sizes (presumably for architectures that have
> 'funny' sizes, otherwise what's the point), then you get to add '||

This is the problem with all the huge page support in Linux today.
It's written by people who work for hardware companies who think only
about exploiting the hardware features they sell. You all ignore the
very real software overheads of trying to manage millions of pages.
I see a 6% reduction in kernel overhead when running kernbench using
THPs that may go as large as 256kB. On x86. Intel x86, at that.

2020-11-11 20:05:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 06:26:20PM +0000, Matthew Wilcox wrote:
> On Wed, Nov 11, 2020 at 06:22:53PM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 11, 2020 at 04:38:48PM +0000, Matthew Wilcox wrote:
> > > if (pud_leaf(pud))
> > > return PUD_SIZE;
> >
> > But that doesn't handle non-pagetable aligned hugetlb sizes. Granted,
> > that's unlikely at the PUD level, but why be inconsistent..
> >
> > So we really want:
> >
> > if (p*d_leaf(p*d)) {
> > if (!'special') {
> > page = p*d_page(p*d);
> > if (PageHuge(page))
> > return page_size(compound_head(page));
> > }
> > return P*D_SIZE;
> > }
>
> Still doesn't work because pages can be mapped at funny offsets.

Wait, what?! Is there hardware that has unaligned TLB page-sizes?

Can you start a 64K page at an 8k offset? I don't think I've ever seen
that. Still, even with that, how would the above go wrong there? It would
find the compound page covering @addr, PageHuge() (and possibly some
additional arch-specific condition) would return true, and we'd get the
compound size to find the hardware page size used.

> What we really want is for a weak definition of
>
> unsigned long tlb_size(struct mm_struct *mm, unsigned long addr)
> {
> if (p*d_leaf(p*d))
> return p*d_size(p*d);
> }
>
> then ARM can look at its special bit in the page table to determine
> whether this is a singleton or part of a brace of pages.

That's basically what we provide, but really the only thing that's
missing from this generic page walker is the ability to detect whether a
!PageHuge compound page is actually still a hardware page.

> > Now, when you add !PMD THP sizes (presumably for architectures that have
> > 'funny' sizes, otherwise what's the point), then you get to add '||
>
> This is the problem with all the huge page support in Linux today.
> It's written by people who work for hardware companies who think only
> about exploiting the hardware features they sell. You all ignore the
> very real software overheads of trying to manage millions of pages.
> I see a 6% reduction in kernel overhead when running kernbench using
> THPs that may go as large as 256kB. On x86. Intel x86, at that.

That's a really nice improvement. However, this code then doesn't care
about it. Please make it possible to distinguish between THP on hardware
pages vs. software pages.

2020-11-12 01:57:55

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 09:00:00PM +0100, Peter Zijlstra wrote:
> On Wed, Nov 11, 2020 at 06:26:20PM +0000, Matthew Wilcox wrote:
> > On Wed, Nov 11, 2020 at 06:22:53PM +0100, Peter Zijlstra wrote:
> > > On Wed, Nov 11, 2020 at 04:38:48PM +0000, Matthew Wilcox wrote:
> > > > if (pud_leaf(pud))
> > > > return PUD_SIZE;
> > >
> > > But that doesn't handle non-pagetable aligned hugetlb sizes. Granted,
> > > that's unlikely at the PUD level, but why be inconsistent..
> > >
> > > So we really want:
> > >
> > > if (p*d_leaf(p*d)) {
> > > if (!'special') {
> > > page = p*d_page(p*d);
> > > if (PageHuge(page))
> > > return page_size(compound_head(page));
> > > }
> > > return P*D_SIZE;
> > > }
> >
> > Still doesn't work because pages can be mapped at funny offsets.
>
> Wait, what?! Is there hardware that has unaligned TLB page-sizes?

No, you can force a 2MB page to be mapped at an address which isn't
2MB aligned.

> Can you start a 64K page at an 8k offset? I don't think I've ever seen
> that. Still, even with that, how would the above go wrong there? It would
> find the compound page covering @addr, PageHuge() (and possibly some
> additional arch-specific condition) would return true, and we'd get the
> compound size to find the hardware page size used.

On any architecture I can think of, that 2MB page will be mapped with 4kB
TLB entries.

> > What we really want is for a weak definition of
> >
> > unsigned long tlb_size(struct mm_struct *mm, unsigned long addr)
> > {
> > if (p*d_leaf(p*d))
> > return p*d_size(p*d);
> > }
> >
> > then ARM can look at its special bit in the page table to determine
> > whether this is a singleton or part of a brace of pages.
>
> That's basically what we provide, but really the only thing that's
> missing from this generic page walker is the ability to detect whether a
> !PageHuge compound page is actually still a hardware page.
>
> > > Now, when you add !PMD THP sizes (presumably for architectures that have
> > > 'funny' sizes, otherwise what's the point), then you get to add '||
> >
> > This is the problem with all the huge page support in Linux today.
> > It's written by people who work for hardware companies who think only
> > about exploiting the hardware features they sell. You all ignore the
> > very real software overheads of trying to manage millions of pages.
> > I see a 6% reduction in kernel overhead when running kernbench using
> > THPs that may go as large as 256kB. On x86. Intel x86, at that.
>
> That's a really nice improvement. However, this code then doesn't care
> about it. Please make it possible to distinguish between THP on hardware
> pages vs. software pages.

That can and should be done just by looking at the page table entries.
There's no need to convert it into a struct page. The CPU obviously
decides what TLB entry size to use based solely on the page tables,
so we can too.

2020-11-12 09:58:04

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Wed, Nov 11, 2020 at 10:33:44PM +0000, Matthew Wilcox wrote:
> On Wed, Nov 11, 2020 at 09:00:00PM +0100, Peter Zijlstra wrote:
> > On Wed, Nov 11, 2020 at 06:26:20PM +0000, Matthew Wilcox wrote:
> > > On Wed, Nov 11, 2020 at 06:22:53PM +0100, Peter Zijlstra wrote:
> > > > On Wed, Nov 11, 2020 at 04:38:48PM +0000, Matthew Wilcox wrote:
> > > > > if (pud_leaf(pud))
> > > > > return PUD_SIZE;
> > > >
> > > > But that doesn't handle non-pagetable aligned hugetlb sizes. Granted,
> > > > that's unlikely at the PUD level, but why be inconsistent..
> > > >
> > > > So we really want:
> > > >
> > > > if (p*d_leaf(p*d)) {
> > > > if (!'special') {
> > > > page = p*d_page(p*d);
> > > > if (PageHuge(page))
> > > > return page_size(compound_head(page));
> > > > }
> > > > return P*D_SIZE;
> > > > }
> > >
> > > Still doesn't work because pages can be mapped at funny offsets.
> >
> > Wait, what?! Is there hardware that has unaligned TLB page-sizes?
>
> No, you can force a 2MB page to be mapped at an address which isn't
> 2MB aligned.

But not a HugeTLB page; AFAICT mmap() will reject it if you try to mmap
a hugetlb page out of alignment. So what I wrote above is still valid:
if PageHuge() is true, we can be certain it is properly aligned and uses
a matching hardware page size.
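
(A quick user-space check of that -- sketch, assuming x86-64 with a 2M
default huge page size: a MAP_FIXED hugetlb mapping requested 4k off a
2M boundary should come back EINVAL before anything else is looked at.)

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		/* 2M huge page, deliberately 4k off a 2M boundary */
		void *hint = (void *)(0x200000000000UL + 0x1000);
		void *p = mmap(hint, 2UL << 20, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
			       MAP_FIXED, -1, 0);

		if (p == MAP_FAILED)	/* expect: Invalid argument */
			printf("mmap: %s\n", strerror(errno));
		else
			printf("mapped at %p\n", p);
		return 0;
	}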

You just don't like it because you want me to be purely page-table
based.

2020-11-12 11:41:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Thu, Nov 12, 2020 at 10:53:58AM +0100, Peter Zijlstra wrote:
> You just don't like it because you want me to be purely page-table
> based.

How's something like this then? I failed to untangle Power's many MMUs
though :/

---
arch/arm64/include/asm/pgtable.h | 3 ++
arch/sparc/include/asm/pgtable_64.h | 14 +++++
arch/sparc/mm/hugetlbpage.c | 19 ++++---
include/linux/pgtable.h | 16 ++++++
kernel/events/core.c | 100 +++++++++++++-----------------------
5 files changed, 82 insertions(+), 70 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 4ff12a7adcfd..1cd2d986b0ca 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -503,6 +503,9 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
PMD_TYPE_SECT)
#define pmd_leaf(pmd) pmd_sect(pmd)

+#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
+#define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)
+
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 7ef6affa105e..1723e18ba89f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1121,6 +1121,20 @@ extern unsigned long cmdline_memory_size;

asmlinkage void do_sparc64_fault(struct pt_regs *regs);

+#ifdef CONFIG_HUGETLB_PAGE
+
+#define pud_leaf_size pud_leaf_size
+extern unsigned long pud_leaf_size(pud_t pud);
+
+#define pmd_leaf_size pmd_leaf_size
+extern unsigned long pmd_leaf_size(pmd_t pmd);
+
+#define pte_leaf_size pte_leaf_size
+extern unsigned long pte_leaf_size(pte_t pte);
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
#endif /* !(__ASSEMBLY__) */

#endif /* !(_SPARC64_PGTABLE_H) */
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index ec423b5f17dd..3e806b87ec19 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -247,14 +247,17 @@ static unsigned int sun4u_huge_tte_to_shift(pte_t entry)
return shift;
}

-static unsigned int huge_tte_to_shift(pte_t entry)
+static unsigned long __tte_to_shift(pte_t entry)
{
- unsigned long shift;
-
if (tlb_type == hypervisor)
- shift = sun4v_huge_tte_to_shift(entry);
- else
- shift = sun4u_huge_tte_to_shift(entry);
+ return sun4v_huge_tte_to_shift(entry);
+
+ return sun4u_huge_tte_to_shift(entry);
+}
+
+static unsigned int huge_tte_to_shift(pte_t entry)
+{
+ unsigned long shift = __tte_to_shift(entry);

if (shift == PAGE_SHIFT)
WARN_ONCE(1, "tto_to_shift: invalid hugepage tte=0x%lx\n",
@@ -272,6 +275,10 @@ static unsigned long huge_tte_to_size(pte_t pte)
return size;
}

+unsigned long pud_leaf_size(pud_t pud) { return 1UL << __tte_to_shift((pte_t)pud); }
+unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << __tte_to_shift((pte_t)pmd); }
+unsigned long pte_leaf_size(pte_t pte) { return 1UL << __tte_to_shift((pte_t)pte); }
+
pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 71125a4676c4..35b9da397e55 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1481,4 +1481,20 @@ typedef unsigned int pgtbl_mod_mask;
#define pmd_leaf(x) 0
#endif

+#ifndef pgd_leaf_size
+#define pgd_leaf_size(x) PGD_SIZE
+#endif
+#ifndef p4d_leaf_size
+#define p4d_leaf_size(x) P4D_SIZE
+#endif
+#ifndef pud_leaf_size
+#define pud_leaf_size(x) PUD_SIZE
+#endif
+#ifndef pmd_leaf_size
+#define pmd_leaf_size(x) PMD_SIZE
+#endif
+#ifndef pte_leaf_size
+#define pte_leaf_size(x) PAGE_SIZE
+#endif
+
#endif /* _LINUX_PGTABLE_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d2f3ca792936..fca04fcfc9ea 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7001,90 +7001,62 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}

-#ifdef CONFIG_MMU
-
/*
* Return the MMU page size of a given virtual address.
- *
- * This generic implementation handles page-table aligned huge pages, as well
- * as non-page-table aligned hugetlbfs compound pages.
- *
- * If an architecture supports and uses non-page-table aligned pages in their
- * kernel mapping it will need to provide it's own implementation of this
- * function.
*/
-__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
{
+ u64 size = 0;
+#ifdef CONFIG_HAVE_FAST_GUP
struct page *page;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
return 0;

- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
- return 0;
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);

- if (p4d_leaf(*p4d))
- return 1ULL << P4D_SHIFT;
-
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
return 0;

- if (pud_leaf(*pud)) {
-#ifdef pud_page
- page = pud_page(*pud);
- if (PageHuge(page))
- return page_size(compound_head(page));
-#endif
- return 1ULL << PUD_SHIFT;
- }
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);

- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
return 0;

- if (pmd_leaf(*pmd)) {
-#ifdef pmd_page
- page = pmd_page(*pmd);
- if (PageHuge(page))
- return page_size(compound_head(page));
-#endif
- return 1ULL << PMD_SHIFT;
- }
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);

- pte = pte_offset_map(pmd, addr);
- if (!pte_present(*pte)) {
- pte_unmap(pte);
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (!pmd_present(pmd))
return 0;
- }
-
- page = pte_page(*pte);
- if (PageHuge(page)) {
- u64 size = page_size(compound_head(page));
- pte_unmap(pte);
- return size;
- }

- pte_unmap(pte);
- return PAGE_SIZE;
-}
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);

-#else
+ ptep = pte_offset_map(&pmd, addr);
+ pte = READ_ONCE(*ptep); // gup_get_pte()
+ if (pte_present(pte))
+ size = pte_leaf_size(pte);
+ pte_unmap(ptep);

-static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
+#endif /* CONFIG_HAVE_FAST_GUP */
+ return size;
}

-#endif
-
static u64 perf_get_page_size(unsigned long addr)
{
struct mm_struct *mm;

2020-11-12 14:06:17

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH V9 1/4] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

On Thu, Nov 12, 2020 at 12:36:45PM +0100, Peter Zijlstra wrote:
> On Thu, Nov 12, 2020 at 10:53:58AM +0100, Peter Zijlstra wrote:
> > You just don't like it because you want me to be purely page-table
> > based.
>
> How's something like this then? I failed to untangle Power's many MMUs
> though :/

Looks good to me. Might want to rename

> -__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)
> +static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr)

to perf_get_tlb_entry_size() or some such.