2020-04-16 21:34:12

by Kirill A. Shutemov

Subject: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

Changing page attributes may fragment the direct mapping over time and
degrade performance as a result.

With the current code it is a one-way road: the kernel tries to avoid
splitting large pages, but it never restores them, even if the page
attributes become compatible again.

Any change to the mapping may potentially allow a large page to be
restored.

Hook into the cpa_flush() path to check whether any pages can be
recovered in the PUD_SIZE range around the pages we have just touched.

CPUs don't like[1] having TLB entries of different sizes for the same
memory, but it looks like that is okay as long as these entries have
matching attributes[2]. It is therefore critical to flush the TLB before
any subsequent changes to the mapping.

Note that we already allow multiple TLB entries of different sizes for
the same memory in the split_large_page() path, so this is not a new
situation.

set_memory_4k() provides a way to use 4k pages on purpose. The kernel
must not remap such pages as large. Re-use one of the software PTE bits
to indicate such pages.

[1] See Erratum 383 of AMD Family 10h Processors
[2] https://lore.kernel.org/linux-mm/[email protected]/

Signed-off-by: Kirill A. Shutemov <[email protected]>
---
arch/x86/include/asm/pgtable_types.h | 2 +
arch/x86/mm/pat/set_memory.c | 191 ++++++++++++++++++++++++++-
2 files changed, 188 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b6606fe6cfdf..11ed34804343 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -34,6 +34,7 @@
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4

/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -56,6 +57,7 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 5414fabad1ae..7cb04a436d86 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
__flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}

+static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_restore_large_pages(struct cpa_data *cpa,
+ struct list_head *pgtables)
+{
+ unsigned long start, addr, end;
+ int i;
+
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ for (i = 0; i < cpa->numpages; i++)
+ restore_large_pages(__cpa_addr(cpa, i), pgtables);
+ return;
+ }
+
+ start = __cpa_addr(cpa, 0);
+ end = start + PAGE_SIZE * cpa->numpages;
+
+ for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
+ restore_large_pages(addr, pgtables);
+}
+
static void cpa_flush(struct cpa_data *data, int cache)
{
+ LIST_HEAD(pgtables);
+ struct page *page, *tmp;
struct cpa_data *cpa = data;
unsigned int i;

BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

+ cpa_restore_large_pages(data, &pgtables);
+
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
+ list_for_each_entry_safe(page, tmp, &pgtables, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ }
return;
}

- if (cpa->numpages <= tlb_single_page_flush_ceiling)
- on_each_cpu(__cpa_flush_tlb, cpa, 1);
- else
+ if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
flush_tlb_all();
+ else
+ on_each_cpu(__cpa_flush_tlb, cpa, 1);
+
+ list_for_each_entry_safe(page, tmp, &pgtables, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ }

if (!cache)
return;
@@ -1075,6 +1109,153 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}

+static void restore_pmd_page(pmd_t *pmd, unsigned long addr,
+ struct list_head *pgtables)
+{
+ pgprot_t pgprot;
+ pmd_t _pmd, old_pmd;
+ pte_t *pte, first;
+ int i = 0;
+
+ pte = pte_offset_kernel(pmd, addr);
+ first = *pte;
+
+ /* Make sure alignment is suitable */
+ if (PFN_PHYS(pte_pfn(first)) & ~PMD_MASK)
+ return;
+
+ /* The page is 4k intentionally */
+ if (pte_flags(first) & _PAGE_KERNEL_4K)
+ return;
+
+ /* Check that the rest of PTEs are compatible with the first one */
+ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+ pte_t entry = *pte;
+ if (!pte_present(entry))
+ return;
+ if (pte_flags(entry) != pte_flags(first))
+ return;
+ if (pte_pfn(entry) - pte_pfn(first) != i)
+ return;
+ }
+
+ old_pmd = *pmd;
+
+ /* Success: set up a large page */
+ pgprot = pgprot_4k_2_large(pte_pgprot(first));
+ pgprot_val(pgprot) |= _PAGE_PSE;
+ _pmd = pfn_pmd(pte_pfn(first), pgprot);
+ set_pmd(pmd, _pmd);
+
+ /* Queue the page table to be freed after TLB flush */
+ list_add(&pmd_page(old_pmd)->lru, pgtables);
+
+ if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ /* Update all PGD tables to use the same large page */
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+ /* Something is wrong if entries doesn't match */
+ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+ continue;
+ set_pmd(pmd, _pmd);
+ }
+ }
+ pr_debug("2M restored at %#lx\n", addr);
+}
+
+static void restore_pud_page(pud_t *pud, unsigned long addr,
+ struct list_head *pgtables)
+{
+ bool restore_pud = direct_gbpages;
+ pmd_t *pmd, first;
+ int i;
+
+ pmd = pmd_offset(pud, addr);
+ first = *pmd;
+
+ /* Try to restore large page if possible */
+ if (pmd_present(first) && !pmd_large(first)) {
+ restore_pmd_page(pmd, addr, pgtables);
+ first = *pmd;
+ }
+
+ /*
+ * To restore PUD page all PMD entries must be large and
+ * have suitable alignment
+ */
+ if (!pmd_large(first) || (PFN_PHYS(pmd_pfn(first)) & ~PUD_MASK))
+ restore_pud = false;
+
+ /*
+ * Restore all PMD large pages when possible and track if we can
+ * restore PUD page.
+ *
+ * To restore PUD page, all following PMDs must be compatible with the
+ * first one.
+ */
+ for (i = 1, pmd++, addr += PMD_SIZE; i < PTRS_PER_PMD; i++, pmd++, addr += PMD_SIZE) {
+ pmd_t entry = *pmd;
+ if (!pmd_present(entry)) {
+ restore_pud = false;
+ continue;
+ }
+ if (!pmd_large(entry)) {
+ restore_pmd_page(pmd, addr, pgtables);
+ entry = *pmd;
+ }
+ if (!pmd_large(entry))
+ restore_pud = false;
+ if (pmd_flags(entry) != pmd_flags(first))
+ restore_pud = false;
+ if (pmd_pfn(entry) - pmd_pfn(first) != i * PTRS_PER_PTE)
+ restore_pud = false;
+ }
+
+ /* Restore PUD page and queue page table to be freed after TLB flush */
+ if (restore_pud) {
+ list_add(&pud_page(*pud)->lru, pgtables);
+ set_pud(pud, pfn_pud(pmd_pfn(first), pmd_pgprot(first)));
+ pr_debug("1G restored at %#lx\n", addr - PUD_SIZE);
+ }
+}
+
+/*
+ * Restore PMD and PUD pages in the kernel mapping around the address where
+ * possible.
+ *
+ * Caller must flush TLB and free page tables queued on the list before
+ * touching the new entries. CPU must not see TLB entries of different size
+ * with different attributes.
+ */
+static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ addr &= PUD_MASK;
+
+ spin_lock(&pgd_lock);
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ goto out;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ goto out;
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud) || pud_large(*pud))
+ goto out;
+
+ restore_pud_page(pud, addr, pgtables);
+out:
+ spin_unlock(&pgd_lock);
+}
+
static bool try_to_free_pte_page(pte_t *pte)
{
int i;
@@ -1948,8 +2129,8 @@ int set_memory_np_noalias(unsigned long addr, int numpages)

int set_memory_4k(unsigned long addr, int numpages)
{
- return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
- __pgprot(0), 1, 0, NULL);
+ return change_page_attr_set_clr(&addr, numpages,
+ __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL);
}

int set_memory_nonglobal(unsigned long addr, int numpages)
--
2.26.1


2020-04-16 22:07:30

by Dave Hansen

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On 4/16/20 2:32 PM, Kirill A. Shutemov wrote:
> With current code it's one way road: kernel tries to avoid splitting
> large pages, but it doesn't restore them back even if page attributes
> got compatible again.

Looks pretty sane to me, and sounds like something we've needed for a
long time.

I have doubts about my ability to find nasty corner cases in this code,
though. Could you rig up some tests to poke at this thing further? Maybe:

Record what the direct map looks like (even from userspace). Then,
allocate some memory, including odd-sized and aligned ranges. Try to do
things >4M like the hugetlbfs code does. Make the allocation (or a
piece of it) not-present (or whatever), which usually fractures some
large pages. Then put it back the way it was. All the large pages
should come back.

If it survives that for an hour or two, it should be pretty good to go.
Basically, fuzz it.
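
As a concrete handle for the "record what the direct map looks like from
userspace" step: x86 exports DirectMap4k/DirectMap2M/DirectMap1G counters
in /proc/meminfo, so a minimal snapshot tool (just a sketch, not part of
this patch) can be as small as:

#include <stdio.h>
#include <string.h>

/* Dump the DirectMap* breakdown of the kernel direct mapping (x86). */
int main(void)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[128];

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		/* Lines look like "DirectMap2M:     1042432 kB" */
		if (!strncmp(line, "DirectMap", 9))
			fputs(line, stdout);
	}

	fclose(f);
	return 0;
}

Running it before fragmenting the mapping and again after the attributes
are restored shows whether the 2M/1G counters return to their original
values.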

2020-04-16 22:15:00

by Kirill A. Shutemov

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Thu, Apr 16, 2020 at 03:03:12PM -0700, Dave Hansen wrote:
> On 4/16/20 2:32 PM, Kirill A. Shutemov wrote:
> > With current code it's one way road: kernel tries to avoid splitting
> > large pages, but it doesn't restore them back even if page attributes
> > got compatible again.
>
> Looks pretty sane to me, and sounds like something we've needed for a
> long time.
>
> I'd having doubts in my ability to find nasty corner cases in this code,
> though. Could you rig up some tests to poke at this thing further? Maybe:
>
> Record what the direct map looks like (even from userspace). Then,
> allocate some memory, including odd-sized and aligned ranges. Try to do
> things >4M like the hugetlbfs code does. Make the allocation (or a
> piece of it) not-present (or whatever), which usually fractures some
> large pages. Then put it back the way it was. All the large pages
> should come back.
>
> If it survives that for an hour or two, it should be pretty good to go.
> Basically, fuzz it.

We already have it in the kernel: CONFIG_CPA_DEBUG. It messes with the
mapping every 30 seconds. It is a pretty good test for this change too: it
produces a lot of 2M/1G pages to be restored. I ran it overnight on my
setup and it survives.
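
(For anyone who wants to reproduce this, the self-test is just a config
switch, roughly:

	CONFIG_CPA_DEBUG=y

It periodically changes page attributes on kernel mappings, which after
this patch also exercises the restore path.)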

--
Kirill A. Shutemov

2020-04-16 22:40:34

by Dave Hansen

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On 4/16/20 3:12 PM, Kirill A. Shutemov wrote:
> We already have it in kernel: CONFIG_CPA_DEBUG. It messes up with the
> mapping every 30 seconds. It is pretty good for the change too. It
> produces a lot of 2M/1G pages to be restored. I run it over night in my
> setup and it survives.

That's good for stability, and thanks for running it! (and please add
that nugget to the changelog)

It's good that you see it restoring some mappings, but does it restore
*all* the 1G/2M pages that it started with (minus the ones that were
fractured for other reasons)? That should be pretty easy to check for.

2020-04-17 00:56:01

by Mika Penttilä

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

Hi!

On 17.4.2020 0.32, Kirill A. Shutemov wrote:
> Change of attributes of the pages may lead to fragmentation of direct
> mapping over time and performance degradation as result.
>
> With current code it's one way road: kernel tries to avoid splitting
> large pages, but it doesn't restore them back even if page attributes
> got compatible again.
>
> Any change to the mapping may potentially allow to restore large page.
>
> Hook up into cpa_flush() path to check if there's any pages to be
> recovered in PUD_SIZE range around pages we've just touched.
>
> CPUs don't like[1] to have to have TLB entries of different size for the
> same memory, but looks like it's okay as long as these entries have
> matching attributes[2]. Therefore it's critical to flush TLB before any
> following changes to the mapping.
>
> Note that we already allow for multiple TLB entries of different sizes
> for the same memory now in split_large_page() path. It's not a new
> situation.
>
> set_memory_4k() provides a way to use 4k pages on purpose. Kernel must
> not remap such pages as large. Re-use one of software PTE bits to
> indicate such pages.
>
> [1] See Erratum 383 of AMD Family 10h Processors
> [2] https://lore.kernel.org/linux-mm/[email protected]/
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> arch/x86/include/asm/pgtable_types.h | 2 +
> arch/x86/mm/pat/set_memory.c | 191 ++++++++++++++++++++++++++-
> 2 files changed, 188 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index b6606fe6cfdf..11ed34804343 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -34,6 +34,7 @@
> #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
> #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
> #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
> +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
> #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
>
> /* If _PAGE_BIT_PRESENT is clear, we use these: */
> @@ -56,6 +57,7 @@
> #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
> #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
> #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
> +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
> #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
> #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
> #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 5414fabad1ae..7cb04a436d86 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
> __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
> }
>
> +static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
> +
> +static void cpa_restore_large_pages(struct cpa_data *cpa,
> + struct list_head *pgtables)
> +{
> + unsigned long start, addr, end;
> + int i;
> +
> + if (cpa->flags & CPA_PAGES_ARRAY) {
> + for (i = 0; i < cpa->numpages; i++)
> + restore_large_pages(__cpa_addr(cpa, i), pgtables);
> + return;
> + }
> +
> + start = __cpa_addr(cpa, 0);
> + end = start + PAGE_SIZE * cpa->numpages;
> +
> + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
> + restore_large_pages(addr, pgtables);
> +}
> +
> static void cpa_flush(struct cpa_data *data, int cache)
> {
> + LIST_HEAD(pgtables);
> + struct page *page, *tmp;
> struct cpa_data *cpa = data;
> unsigned int i;
>
> BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
>
> + cpa_restore_large_pages(data, &pgtables);
> +
> if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> cpa_flush_all(cache);
> + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> + list_del(&page->lru);
> + __free_page(page);
> + }
> return;
> }
>
> - if (cpa->numpages <= tlb_single_page_flush_ceiling)
> - on_each_cpu(__cpa_flush_tlb, cpa, 1);
> - else
> + if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
> flush_tlb_all();
> + else
> + on_each_cpu(__cpa_flush_tlb, cpa, 1);
> +
> + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> + list_del(&page->lru);
> + __free_page(page);
> + }


The page table pages are freed here, but aren't you leaking the leaf 4K
pages (except the first one, which is made large)?


>
> if (!cache)
> return;
> @@ -1075,6 +1109,153 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
> return 0;
> }
>
> +static void restore_pmd_page(pmd_t *pmd, unsigned long addr,
> + struct list_head *pgtables)
> +{
> + pgprot_t pgprot;
> + pmd_t _pmd, old_pmd;
> + pte_t *pte, first;
> + int i = 0;
> +
> + pte = pte_offset_kernel(pmd, addr);
> + first = *pte;
> +
> + /* Make sure alignment is suitable */
> + if (PFN_PHYS(pte_pfn(first)) & ~PMD_MASK)
> + return;
> +
> + /* The page is 4k intentionally */
> + if (pte_flags(first) & _PAGE_KERNEL_4K)
> + return;
> +
> + /* Check that the rest of PTEs are compatible with the first one */
> + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
> + pte_t entry = *pte;
> + if (!pte_present(entry))
> + return;
> + if (pte_flags(entry) != pte_flags(first))
> + return;
> + if (pte_pfn(entry) - pte_pfn(first) != i)
> + return;
> + }
> +
> + old_pmd = *pmd;
> +
> + /* Success: set up a large page */
> + pgprot = pgprot_4k_2_large(pte_pgprot(first));
> + pgprot_val(pgprot) |= _PAGE_PSE;
> + _pmd = pfn_pmd(pte_pfn(first), pgprot);
> + set_pmd(pmd, _pmd);
> +
> + /* Queue the page table to be freed after TLB flush */
> + list_add(&pmd_page(old_pmd)->lru, pgtables);
> +
> + if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
> + struct page *page;
> +
> + /* Update all PGD tables to use the same large page */
> + list_for_each_entry(page, &pgd_list, lru) {
> + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
> + p4d_t *p4d = p4d_offset(pgd, addr);
> + pud_t *pud = pud_offset(p4d, addr);
> + pmd_t *pmd = pmd_offset(pud, addr);
> + /* Something is wrong if entries doesn't match */
> + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
> + continue;
> + set_pmd(pmd, _pmd);
> + }
> + }
> + pr_debug("2M restored at %#lx\n", addr);
> +}
> +
> +static void restore_pud_page(pud_t *pud, unsigned long addr,
> + struct list_head *pgtables)
> +{
> + bool restore_pud = direct_gbpages;
> + pmd_t *pmd, first;
> + int i;
> +
> + pmd = pmd_offset(pud, addr);
> + first = *pmd;
> +
> + /* Try to restore large page if possible */
> + if (pmd_present(first) && !pmd_large(first)) {
> + restore_pmd_page(pmd, addr, pgtables);
> + first = *pmd;
> + }
> +
> + /*
> + * To restore PUD page all PMD entries must be large and
> + * have suitable alignment
> + */
> + if (!pmd_large(first) || (PFN_PHYS(pmd_pfn(first)) & ~PUD_MASK))
> + restore_pud = false;
> +
> + /*
> + * Restore all PMD large pages when possible and track if we can
> + * restore PUD page.
> + *
> + * To restore PUD page, all following PMDs must be compatible with the
> + * first one.
> + */
> + for (i = 1, pmd++, addr += PMD_SIZE; i < PTRS_PER_PMD; i++, pmd++, addr += PMD_SIZE) {
> + pmd_t entry = *pmd;
> + if (!pmd_present(entry)) {
> + restore_pud = false;
> + continue;
> + }
> + if (!pmd_large(entry)) {
> + restore_pmd_page(pmd, addr, pgtables);
> + entry = *pmd;
> + }
> + if (!pmd_large(entry))
> + restore_pud = false;
> + if (pmd_flags(entry) != pmd_flags(first))
> + restore_pud = false;
> + if (pmd_pfn(entry) - pmd_pfn(first) != i * PTRS_PER_PTE)
> + restore_pud = false;
> + }
> +
> + /* Restore PUD page and queue page table to be freed after TLB flush */
> + if (restore_pud) {
> + list_add(&pud_page(*pud)->lru, pgtables);
> + set_pud(pud, pfn_pud(pmd_pfn(first), pmd_pgprot(first)));
> + pr_debug("1G restored at %#lx\n", addr - PUD_SIZE);
> + }
> +}
> +
> +/*
> + * Restore PMD and PUD pages in the kernel mapping around the address where
> + * possible.
> + *
> + * Caller must flush TLB and free page tables queued on the list before
> + * touching the new entries. CPU must not see TLB entries of different size
> + * with different attributes.
> + */
> +static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
> +{
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> +
> + addr &= PUD_MASK;
> +
> + spin_lock(&pgd_lock);
> + pgd = pgd_offset_k(addr);
> + if (pgd_none(*pgd))
> + goto out;
> + p4d = p4d_offset(pgd, addr);
> + if (p4d_none(*p4d))
> + goto out;
> + pud = pud_offset(p4d, addr);
> + if (!pud_present(*pud) || pud_large(*pud))
> + goto out;
> +
> + restore_pud_page(pud, addr, pgtables);
> +out:
> + spin_unlock(&pgd_lock);
> +}
> +
> static bool try_to_free_pte_page(pte_t *pte)
> {
> int i;
> @@ -1948,8 +2129,8 @@ int set_memory_np_noalias(unsigned long addr, int numpages)
>
> int set_memory_4k(unsigned long addr, int numpages)
> {
> - return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
> - __pgprot(0), 1, 0, NULL);
> + return change_page_attr_set_clr(&addr, numpages,
> + __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL);
> }
>
> int set_memory_nonglobal(unsigned long addr, int numpages)

--Mika



2020-04-17 11:44:36

by Kirill A. Shutemov

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Fri, Apr 17, 2020 at 03:52:28AM +0300, Mika Penttilä wrote:
> Hi!
>
> On 17.4.2020 0.32, Kirill A. Shutemov wrote:
> > Change of attributes of the pages may lead to fragmentation of direct
> > mapping over time and performance degradation as result.
> >
> > With current code it's one way road: kernel tries to avoid splitting
> > large pages, but it doesn't restore them back even if page attributes
> > got compatible again.
> >
> > Any change to the mapping may potentially allow to restore large page.
> >
> > Hook up into cpa_flush() path to check if there's any pages to be
> > recovered in PUD_SIZE range around pages we've just touched.
> >
> > CPUs don't like[1] to have to have TLB entries of different size for the
> > same memory, but looks like it's okay as long as these entries have
> > matching attributes[2]. Therefore it's critical to flush TLB before any
> > following changes to the mapping.
> >
> > Note that we already allow for multiple TLB entries of different sizes
> > for the same memory now in split_large_page() path. It's not a new
> > situation.
> >
> > set_memory_4k() provides a way to use 4k pages on purpose. Kernel must
> > not remap such pages as large. Re-use one of software PTE bits to
> > indicate such pages.
> >
> > [1] See Erratum 383 of AMD Family 10h Processors
> > [2] https://lore.kernel.org/linux-mm/[email protected]/
> >
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > ---
> > arch/x86/include/asm/pgtable_types.h | 2 +
> > arch/x86/mm/pat/set_memory.c | 191 ++++++++++++++++++++++++++-
> > 2 files changed, 188 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> > index b6606fe6cfdf..11ed34804343 100644
> > --- a/arch/x86/include/asm/pgtable_types.h
> > +++ b/arch/x86/include/asm/pgtable_types.h
> > @@ -34,6 +34,7 @@
> > #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
> > #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
> > #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
> > +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
> > #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
> > /* If _PAGE_BIT_PRESENT is clear, we use these: */
> > @@ -56,6 +57,7 @@
> > #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
> > #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
> > #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
> > +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
> > #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
> > #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
> > #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
> > diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> > index 5414fabad1ae..7cb04a436d86 100644
> > --- a/arch/x86/mm/pat/set_memory.c
> > +++ b/arch/x86/mm/pat/set_memory.c
> > @@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
> > __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
> > }
> > +static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
> > +
> > +static void cpa_restore_large_pages(struct cpa_data *cpa,
> > + struct list_head *pgtables)
> > +{
> > + unsigned long start, addr, end;
> > + int i;
> > +
> > + if (cpa->flags & CPA_PAGES_ARRAY) {
> > + for (i = 0; i < cpa->numpages; i++)
> > + restore_large_pages(__cpa_addr(cpa, i), pgtables);
> > + return;
> > + }
> > +
> > + start = __cpa_addr(cpa, 0);
> > + end = start + PAGE_SIZE * cpa->numpages;
> > +
> > + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
> > + restore_large_pages(addr, pgtables);
> > +}
> > +
> > static void cpa_flush(struct cpa_data *data, int cache)
> > {
> > + LIST_HEAD(pgtables);
> > + struct page *page, *tmp;
> > struct cpa_data *cpa = data;
> > unsigned int i;
> > BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
> > + cpa_restore_large_pages(data, &pgtables);
> > +
> > if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> > cpa_flush_all(cache);
> > + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> > + list_del(&page->lru);
> > + __free_page(page);
> > + }
> > return;
> > }
> > - if (cpa->numpages <= tlb_single_page_flush_ceiling)
> > - on_each_cpu(__cpa_flush_tlb, cpa, 1);
> > - else
> > + if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
> > flush_tlb_all();
> > + else
> > + on_each_cpu(__cpa_flush_tlb, cpa, 1);
> > +
> > + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> > + list_del(&page->lru);
> > + __free_page(page);
> > + }
>
>
> The pagetable pages are freed here but ren't you leaking the leaf 4K pages
> (except the first which is made large)?

Huh? There's no way to convert a single 4k page to a large page. We
convert all of the pages in the large page region to the large page.

--
Kirill A. Shutemov

2020-04-17 12:08:26

by Mika Penttilä

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation



On 17.4.2020 14.42, Kirill A. Shutemov wrote:
> On Fri, Apr 17, 2020 at 03:52:28AM +0300, Mika Penttilä wrote:
>> Hi!
>>
>> On 17.4.2020 0.32, Kirill A. Shutemov wrote:
>>> Change of attributes of the pages may lead to fragmentation of direct
>>> mapping over time and performance degradation as result.
>>>
>>> With current code it's one way road: kernel tries to avoid splitting
>>> large pages, but it doesn't restore them back even if page attributes
>>> got compatible again.
>>>
>>> Any change to the mapping may potentially allow to restore large page.
>>>
>>> Hook up into cpa_flush() path to check if there's any pages to be
>>> recovered in PUD_SIZE range around pages we've just touched.
>>>
>>> CPUs don't like[1] to have to have TLB entries of different size for the
>>> same memory, but looks like it's okay as long as these entries have
>>> matching attributes[2]. Therefore it's critical to flush TLB before any
>>> following changes to the mapping.
>>>
>>> Note that we already allow for multiple TLB entries of different sizes
>>> for the same memory now in split_large_page() path. It's not a new
>>> situation.
>>>
>>> set_memory_4k() provides a way to use 4k pages on purpose. Kernel must
>>> not remap such pages as large. Re-use one of software PTE bits to
>>> indicate such pages.
>>>
>>> [1] See Erratum 383 of AMD Family 10h Processors
>>> [2] https://lore.kernel.org/linux-mm/[email protected]/
>>>
>>> Signed-off-by: Kirill A. Shutemov <[email protected]>
>>> ---
>>> arch/x86/include/asm/pgtable_types.h | 2 +
>>> arch/x86/mm/pat/set_memory.c | 191 ++++++++++++++++++++++++++-
>>> 2 files changed, 188 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
>>> index b6606fe6cfdf..11ed34804343 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -34,6 +34,7 @@
>>> #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
>>> #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
>>> #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
>>> +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
>>> #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
>>> /* If _PAGE_BIT_PRESENT is clear, we use these: */
>>> @@ -56,6 +57,7 @@
>>> #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
>>> #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
>>> #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
>>> +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
>>> #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
>>> #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
>>> #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
>>> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
>>> index 5414fabad1ae..7cb04a436d86 100644
>>> --- a/arch/x86/mm/pat/set_memory.c
>>> +++ b/arch/x86/mm/pat/set_memory.c
>>> @@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
>>> __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
>>> }
>>> +static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
>>> +
>>> +static void cpa_restore_large_pages(struct cpa_data *cpa,
>>> + struct list_head *pgtables)
>>> +{
>>> + unsigned long start, addr, end;
>>> + int i;
>>> +
>>> + if (cpa->flags & CPA_PAGES_ARRAY) {
>>> + for (i = 0; i < cpa->numpages; i++)
>>> + restore_large_pages(__cpa_addr(cpa, i), pgtables);
>>> + return;
>>> + }
>>> +
>>> + start = __cpa_addr(cpa, 0);
>>> + end = start + PAGE_SIZE * cpa->numpages;
>>> +
>>> + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
>>> + restore_large_pages(addr, pgtables);
>>> +}
>>> +
>>> static void cpa_flush(struct cpa_data *data, int cache)
>>> {
>>> + LIST_HEAD(pgtables);
>>> + struct page *page, *tmp;
>>> struct cpa_data *cpa = data;
>>> unsigned int i;
>>> BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
>>> + cpa_restore_large_pages(data, &pgtables);
>>> +
>>> if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
>>> cpa_flush_all(cache);
>>> + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
>>> + list_del(&page->lru);
>>> + __free_page(page);
>>> + }
>>> return;
>>> }
>>> - if (cpa->numpages <= tlb_single_page_flush_ceiling)
>>> - on_each_cpu(__cpa_flush_tlb, cpa, 1);
>>> - else
>>> + if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
>>> flush_tlb_all();
>>> + else
>>> + on_each_cpu(__cpa_flush_tlb, cpa, 1);
>>> +
>>> + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
>>> + list_del(&page->lru);
>>> + __free_page(page);
>>> + }
>>
>> The pagetable pages are freed here but ren't you leaking the leaf 4K pages
>> (except the first which is made large)?
> Huh? There's no way to convert one 4k to large page. We convert all pages
> in large page region to the large page.
>

Doh, sure, and sorry, was short of coffee...

--Mika

2020-04-17 12:22:50

by Peter Zijlstra

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Fri, Apr 17, 2020 at 12:32:29AM +0300, Kirill A. Shutemov wrote:
> Change of attributes of the pages may lead to fragmentation of direct
> mapping over time and performance degradation as result.
>
> With current code it's one way road: kernel tries to avoid splitting
> large pages, but it doesn't restore them back even if page attributes
> got compatible again.
>
> Any change to the mapping may potentially allow to restore large page.
>
> Hook up into cpa_flush() path to check if there's any pages to be
> recovered in PUD_SIZE range around pages we've just touched.

What does this do to the cost of the various set_memory_*() calls? ISTR
there were performance concerns at some point, graphics drivers doing
lots of it, or something like that.

> CPUs don't like[1] to have to have TLB entries of different size for the
> same memory, but looks like it's okay as long as these entries have
> matching attributes[2]. Therefore it's critical to flush TLB before any
> following changes to the mapping.
>
> Note that we already allow for multiple TLB entries of different sizes
> for the same memory now in split_large_page() path. It's not a new
> situation.
>
> set_memory_4k() provides a way to use 4k pages on purpose. Kernel must
> not remap such pages as large. Re-use one of software PTE bits to
> indicate such pages.
>
> [1] See Erratum 383 of AMD Family 10h Processors
> [2] https://lore.kernel.org/linux-mm/[email protected]/

Also, we can revert:

7af0145067bc ("x86/mm/cpa: Prevent large page split when ftrace flips RW on kernel text")

now. ftrace no longer does crazy things like that.

> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> arch/x86/include/asm/pgtable_types.h | 2 +
> arch/x86/mm/pat/set_memory.c | 191 ++++++++++++++++++++++++++-
> 2 files changed, 188 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index b6606fe6cfdf..11ed34804343 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -34,6 +34,7 @@
> #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
> #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
> #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
> +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */
> #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
>
> /* If _PAGE_BIT_PRESENT is clear, we use these: */
> @@ -56,6 +57,7 @@
> #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
> #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
> #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
> +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
> #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
> #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
> #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 5414fabad1ae..7cb04a436d86 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
> __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
> }
>
> +static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
> +
> +static void cpa_restore_large_pages(struct cpa_data *cpa,
> + struct list_head *pgtables)

indent fail

> +{
> + unsigned long start, addr, end;
> + int i;
> +
> + if (cpa->flags & CPA_PAGES_ARRAY) {
> + for (i = 0; i < cpa->numpages; i++)
> + restore_large_pages(__cpa_addr(cpa, i), pgtables);
> + return;
> + }
> +
> + start = __cpa_addr(cpa, 0);
> + end = start + PAGE_SIZE * cpa->numpages;
> +
> + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)

we have within()
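
For reference, within(addr, start, end) in set_memory.c is simply
addr >= start && addr < end, so the loop above would presumably become:

	for (addr = start; within(addr, start, end); addr += PUD_SIZE)
		restore_large_pages(addr, pgtables);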

> + restore_large_pages(addr, pgtables);
> +}
> +
> static void cpa_flush(struct cpa_data *data, int cache)
> {
> + LIST_HEAD(pgtables);
> + struct page *page, *tmp;

xmas fail

> struct cpa_data *cpa = data;
> unsigned int i;
>
> BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
>
> + cpa_restore_large_pages(data, &pgtables);
> +
> if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> cpa_flush_all(cache);
> + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> + list_del(&page->lru);
> + __free_page(page);
> + }
> return;
> }
>
> - if (cpa->numpages <= tlb_single_page_flush_ceiling)
> - on_each_cpu(__cpa_flush_tlb, cpa, 1);
> - else
> + if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
> flush_tlb_all();
> + else
> + on_each_cpu(__cpa_flush_tlb, cpa, 1);
> +
> + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> + list_del(&page->lru);
> + __free_page(page);
> + }
>
> if (!cache)
> return;

That's a bit of a mess there, I'm thinking you can fix that with a goto.

> @@ -1075,6 +1109,153 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
> return 0;
> }
>
> +static void restore_pmd_page(pmd_t *pmd, unsigned long addr,
> + struct list_head *pgtables)

indent fail

> +{
> + pgprot_t pgprot;
> + pmd_t _pmd, old_pmd;
> + pte_t *pte, first;
> + int i = 0;

xmas failed again

> +
> + pte = pte_offset_kernel(pmd, addr);
> + first = *pte;
> +
> + /* Make sure alignment is suitable */
> + if (PFN_PHYS(pte_pfn(first)) & ~PMD_MASK)
> + return;
> +
> + /* The page is 4k intentionally */
> + if (pte_flags(first) & _PAGE_KERNEL_4K)
> + return;
> +
> + /* Check that the rest of PTEs are compatible with the first one */
> + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
> + pte_t entry = *pte;
> + if (!pte_present(entry))
> + return;
> + if (pte_flags(entry) != pte_flags(first))
> + return;
> + if (pte_pfn(entry) - pte_pfn(first) != i)

I think I'd prefer: pte_pfn(entry) != pte_pfn(first) + i

> + return;
> + }
> +
> + old_pmd = *pmd;
> +
> + /* Success: set up a large page */
> + pgprot = pgprot_4k_2_large(pte_pgprot(first));
> + pgprot_val(pgprot) |= _PAGE_PSE;
> + _pmd = pfn_pmd(pte_pfn(first), pgprot);
> + set_pmd(pmd, _pmd);
> +
> + /* Queue the page table to be freed after TLB flush */
> + list_add(&pmd_page(old_pmd)->lru, pgtables);
> +
> + if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
> + struct page *page;
> +
> + /* Update all PGD tables to use the same large page */
> + list_for_each_entry(page, &pgd_list, lru) {
> + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
> + p4d_t *p4d = p4d_offset(pgd, addr);
> + pud_t *pud = pud_offset(p4d, addr);
> + pmd_t *pmd = pmd_offset(pud, addr);
> + /* Something is wrong if entries doesn't match */
> + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
> + continue;
> + set_pmd(pmd, _pmd);
> + }
> + }
> + pr_debug("2M restored at %#lx\n", addr);

While I appreciate its usefulness while writing this, I do think we can
do without that print once we know it works.

> +}
> +
> +static void restore_pud_page(pud_t *pud, unsigned long addr,
> + struct list_head *pgtables)

indent fail

> +{
> + bool restore_pud = direct_gbpages;
> + pmd_t *pmd, first;
> + int i;
> +
> + pmd = pmd_offset(pud, addr);
> + first = *pmd;
> +
> + /* Try to restore large page if possible */
> + if (pmd_present(first) && !pmd_large(first)) {
> + restore_pmd_page(pmd, addr, pgtables);
> + first = *pmd;
> + }
> +
> + /*
> + * To restore PUD page all PMD entries must be large and
> + * have suitable alignment
> + */
> + if (!pmd_large(first) || (PFN_PHYS(pmd_pfn(first)) & ~PUD_MASK))
> + restore_pud = false;
> +
> + /*
> + * Restore all PMD large pages when possible and track if we can
> + * restore PUD page.
> + *
> + * To restore PUD page, all following PMDs must be compatible with the
> + * first one.
> + */
> + for (i = 1, pmd++, addr += PMD_SIZE; i < PTRS_PER_PMD; i++, pmd++, addr += PMD_SIZE) {
> + pmd_t entry = *pmd;
> + if (!pmd_present(entry)) {
> + restore_pud = false;
> + continue;
> + }
> + if (!pmd_large(entry)) {
> + restore_pmd_page(pmd, addr, pgtables);
> + entry = *pmd;
> + }
> + if (!pmd_large(entry))
> + restore_pud = false;
> + if (pmd_flags(entry) != pmd_flags(first))
> + restore_pud = false;
> + if (pmd_pfn(entry) - pmd_pfn(first) != i * PTRS_PER_PTE)

idem as above.

> + restore_pud = false;
> + }
> +
> + /* Restore PUD page and queue page table to be freed after TLB flush */
> + if (restore_pud) {
> + list_add(&pud_page(*pud)->lru, pgtables);
> + set_pud(pud, pfn_pud(pmd_pfn(first), pmd_pgprot(first)));
> + pr_debug("1G restored at %#lx\n", addr - PUD_SIZE);
> + }
> +}
> +
> +/*
> + * Restore PMD and PUD pages in the kernel mapping around the address where
> + * possible.
> + *
> + * Caller must flush TLB and free page tables queued on the list before
> + * touching the new entries. CPU must not see TLB entries of different size
> + * with different attributes.
> + */
> +static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
> +{
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> +
> + addr &= PUD_MASK;
> +
> + spin_lock(&pgd_lock);
> + pgd = pgd_offset_k(addr);
> + if (pgd_none(*pgd))
> + goto out;
> + p4d = p4d_offset(pgd, addr);
> + if (p4d_none(*p4d))
> + goto out;
> + pud = pud_offset(p4d, addr);
> + if (!pud_present(*pud) || pud_large(*pud))
> + goto out;
> +
> + restore_pud_page(pud, addr, pgtables);
> +out:
> + spin_unlock(&pgd_lock);
> +}

I find this odd, at best. AFAICT this does not attempt to reconstruct a
PMD around @addr when possible. When the first PMD of the PUD can't be
reconstructed, we give up entirely.

Why not something like:

static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	spin_lock(&pgd_lock);
	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		goto out;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		goto out;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (pmd_present(*pmd) && !pmd_large(*pmd)) {
		if (!restore_pmd_page(pmd, addr & PMD_MASK, pgtables))
			goto out;
	}

	restore_pud_page(pud, addr & PUD_MASK, pgtables);
out:
	spin_unlock(&pgd_lock);
}

That would also much simplify restore_pud_page(); it would not have to
call restore_pmd_page().

> +
> static bool try_to_free_pte_page(pte_t *pte)
> {
> int i;
> @@ -1948,8 +2129,8 @@ int set_memory_np_noalias(unsigned long addr, int numpages)
>
> int set_memory_4k(unsigned long addr, int numpages)
> {
> - return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
> - __pgprot(0), 1, 0, NULL);
> + return change_page_attr_set_clr(&addr, numpages,
> + __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL);

indent fail

> }
>
> int set_memory_nonglobal(unsigned long addr, int numpages)
> --
> 2.26.1
>

2020-04-17 14:45:18

by Kirill A. Shutemov

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Fri, Apr 17, 2020 at 02:18:28PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 17, 2020 at 12:32:29AM +0300, Kirill A. Shutemov wrote:
> > Change of attributes of the pages may lead to fragmentation of direct
> > mapping over time and performance degradation as result.
> >
> > With current code it's one way road: kernel tries to avoid splitting
> > large pages, but it doesn't restore them back even if page attributes
> > got compatible again.
> >
> > Any change to the mapping may potentially allow to restore large page.
> >
> > Hook up into cpa_flush() path to check if there's any pages to be
> > recovered in PUD_SIZE range around pages we've just touched.
>
> What does this do to the cost of the various set_memory_*() calls? ISTR
> there were performance concerns at some point, graphics drivers doing
> lots of it, or something like that.

Okay, I'll come up with some numbers.
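
A rough way to get such numbers (an illustrative sketch only; the buffer
size and iteration count below are arbitrary) is a throwaway module that
flips protections on a buffer in a loop and times it:

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/ktime.h>

static int __init cpa_bench_init(void)
{
	void *buf = vmalloc(512 * PAGE_SIZE);	/* 2M worth of 4k pages */
	ktime_t t0, t1;
	int i;

	if (!buf)
		return -ENOMEM;

	t0 = ktime_get();
	for (i = 0; i < 1000; i++) {
		set_memory_ro((unsigned long)buf, 512);
		set_memory_rw((unsigned long)buf, 512);
	}
	t1 = ktime_get();

	pr_info("cpa_bench: 1000 ro/rw flips of 2M took %lld us\n",
		ktime_us_delta(t1, t0));

	vfree(buf);
	return 0;
}

static void __exit cpa_bench_exit(void)
{
}

module_init(cpa_bench_init);
module_exit(cpa_bench_exit);
MODULE_LICENSE("GPL");

Comparing the reported time before and after the patch should show what
the reconstruction pass adds to each set_memory_*() call.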

> > CPUs don't like[1] to have to have TLB entries of different size for the
> > same memory, but looks like it's okay as long as these entries have
> > matching attributes[2]. Therefore it's critical to flush TLB before any
> > following changes to the mapping.
> >
> > Note that we already allow for multiple TLB entries of different sizes
> > for the same memory now in split_large_page() path. It's not a new
> > situation.
> >
> > set_memory_4k() provides a way to use 4k pages on purpose. Kernel must
> > not remap such pages as large. Re-use one of software PTE bits to
> > indicate such pages.
> >
> > [1] See Erratum 383 of AMD Family 10h Processors
> > [2] https://lore.kernel.org/linux-mm/[email protected]/
>
> Also, we can revert:
>
> 7af0145067bc ("x86/mm/cpa: Prevent large page split when ftrace flips RW on kernel text")
>
> now. ftrace no longer does crazy things like that.

Yeah, good point.

> > @@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
> > __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
> > }
> >
> > +static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
> > +
> > +static void cpa_restore_large_pages(struct cpa_data *cpa,
> > + struct list_head *pgtables)
>
> indent fail

Do you mean that it is not aligned to '(' on the previous line?

Okay, I'll fix it up (and change my vim setup). But I find indenting with
spaces disgusting.

> > +{
> > + unsigned long start, addr, end;
> > + int i;
> > +
> > + if (cpa->flags & CPA_PAGES_ARRAY) {
> > + for (i = 0; i < cpa->numpages; i++)
> > + restore_large_pages(__cpa_addr(cpa, i), pgtables);
> > + return;
> > + }
> > +
> > + start = __cpa_addr(cpa, 0);
> > + end = start + PAGE_SIZE * cpa->numpages;
> > +
> > + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
>
> we have within()

Neat. Thanks.
>
> > + restore_large_pages(addr, pgtables);
> > +}
> > +
> > static void cpa_flush(struct cpa_data *data, int cache)
> > {
> > + LIST_HEAD(pgtables);
> > + struct page *page, *tmp;
>
> xmas fail

Emm. What are the rules here?

> > struct cpa_data *cpa = data;
> > unsigned int i;
> >
> > BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
> >
> > + cpa_restore_large_pages(data, &pgtables);
> > +
> > if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
> > cpa_flush_all(cache);
> > + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> > + list_del(&page->lru);
> > + __free_page(page);
> > + }
> > return;
> > }
> >
> > - if (cpa->numpages <= tlb_single_page_flush_ceiling)
> > - on_each_cpu(__cpa_flush_tlb, cpa, 1);
> > - else
> > + if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
> > flush_tlb_all();
> > + else
> > + on_each_cpu(__cpa_flush_tlb, cpa, 1);
> > +
> > + list_for_each_entry_safe(page, tmp, &pgtables, lru) {
> > + list_del(&page->lru);
> > + __free_page(page);
> > + }
> >
> > if (!cache)
> > return;
>
> That's a bit of a mess there, I'm thinking you can fix that with a goto.

Will look into this.

> > + /* Check that the rest of PTEs are compatible with the first one */
> > + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
> > + pte_t entry = *pte;
> > + if (!pte_present(entry))
> > + return;
> > + if (pte_flags(entry) != pte_flags(first))
> > + return;
> > + if (pte_pfn(entry) - pte_pfn(first) != i)
>
> I think I'd perfer: pte_pfn(entry) != pte_pfn(first) + i

Sure.

> > + pr_debug("2M restored at %#lx\n", addr);
>
> While I appreciate it's usefulness while writing this, I do think we can
> do without that print once we know it works.

Even with pr_debug()? It shouldn't be noisy for most users.

> > +/*
> > + * Restore PMD and PUD pages in the kernel mapping around the address where
> > + * possible.
> > + *
> > + * Caller must flush TLB and free page tables queued on the list before
> > + * touching the new entries. CPU must not see TLB entries of different size
> > + * with different attributes.
> > + */
> > +static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
> > +{
> > + pgd_t *pgd;
> > + p4d_t *p4d;
> > + pud_t *pud;
> > +
> > + addr &= PUD_MASK;
> > +
> > + spin_lock(&pgd_lock);
> > + pgd = pgd_offset_k(addr);
> > + if (pgd_none(*pgd))
> > + goto out;
> > + p4d = p4d_offset(pgd, addr);
> > + if (p4d_none(*p4d))
> > + goto out;
> > + pud = pud_offset(p4d, addr);
> > + if (!pud_present(*pud) || pud_large(*pud))
> > + goto out;
> > +
> > + restore_pud_page(pud, addr, pgtables);
> > +out:
> > + spin_unlock(&pgd_lock);
> > +}
>
> I find this odd, at best. AFAICT this does not attempt to reconstruct a
> PMD around @addr when possible. When the first PMD of the PUD can't be
> reconstructed, we give up entirely.

No, you misread. If the first PMD is not suitable, we give up on
reconstructing the PUD page, but we still look at all PMDs of the PUD range.

But this might be excessive. What you are proposing below should be fine
and more efficient. If we do everything right, the rest of the PMDs in the
PUD have to be either already large or not suitable for reconstruction.

We might still look at the rest of the PMDs for CONFIG_CPA_DEBUG, just to
make sure we don't miss some corner case where we didn't restore a PMD.

(Also, I'm thinking about s/restore/reconstruct/g)

> Why not something like:
>
> static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
> {
> pgd_t *pgd;
> p4d_t *p4d;
> pud_t *pud;
> pmd_t *pmd;
>
> spin_lock(&pgd_lock);
> pgd = pgd_offset_k(addr);
> if (pgd_none(*pgd))
> goto out;
>
> p4d = p4d_offset(pgd, addr);
> if (p4d_none(*p4d))
> goto out;
>
> pud = pud_offset(p4d, addr);
> if (!pud_present(*pud) || pud_large(*pud))
> goto out;
>
> pmd = pmd_offset(pud, addr);
> if (pmd_present(*pmd) && !pmd_large(*pmd)) {
> if (!restore_pmd_page(pmd, addr & PMD_MASK, pgtables))
> goto out;
> }
>
> restore_pud_page(pud, addr & PUD_MASK, pgtables);
> out:
> spin_unlock(&pgd_lock);
> }
>
> That would also much simplify restore_pud_page(), it would not have to
> call restore_pmd_page().

--
Kirill A. Shutemov

2020-04-17 15:21:06

by Peter Zijlstra

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Fri, Apr 17, 2020 at 05:42:44PM +0300, Kirill A. Shutemov wrote:
> On Fri, Apr 17, 2020 at 02:18:28PM +0200, Peter Zijlstra wrote:
> Do you mean that it is not aligned to '(' on the previous line?
>
> Okay, I'll fix it up (and change my vim setup). But I find indenting with
> spaces disgusting.

set cino=:0(0


> > > static void cpa_flush(struct cpa_data *data, int cache)
> > > {
> > > + LIST_HEAD(pgtables);
> > > + struct page *page, *tmp;
> >
> > xmas fail
>
> Emm. What are rules here?
>
> > > struct cpa_data *cpa = data;
> > > unsigned int i;

Basically we (tip) strive for the inverse xmas tree thing, so here that
would be:

struct cpa_data *cpa = data;
+ struct page *page, *tmp;
+ LIST_HEAD(pgtables);
unsigned int i;


> > > + pr_debug("2M restored at %#lx\n", addr);
> >
> > While I appreciate it's usefulness while writing this, I do think we can
> > do without that print once we know it works.
>
> Even with pr_debug()? I shouldn't be noisy for most users.

I always have debug on; also there is no counterpart on split either.

> > > +/*
> > > + * Restore PMD and PUD pages in the kernel mapping around the address where
> > > + * possible.
> > > + *
> > > + * Caller must flush TLB and free page tables queued on the list before
> > > + * touching the new entries. CPU must not see TLB entries of different size
> > > + * with different attributes.
> > > + */
> > > +static void restore_large_pages(unsigned long addr, struct list_head *pgtables)
> > > +{
> > > + pgd_t *pgd;
> > > + p4d_t *p4d;
> > > + pud_t *pud;
> > > +
> > > + addr &= PUD_MASK;
> > > +
> > > + spin_lock(&pgd_lock);
> > > + pgd = pgd_offset_k(addr);
> > > + if (pgd_none(*pgd))
> > > + goto out;
> > > + p4d = p4d_offset(pgd, addr);
> > > + if (p4d_none(*p4d))
> > > + goto out;
> > > + pud = pud_offset(p4d, addr);
> > > + if (!pud_present(*pud) || pud_large(*pud))
> > > + goto out;
> > > +
> > > + restore_pud_page(pud, addr, pgtables);
> > > +out:
> > > + spin_unlock(&pgd_lock);
> > > +}
> >
> > I find this odd, at best. AFAICT this does not attempt to reconstruct a
> > PMD around @addr when possible. When the first PMD of the PUD can't be
> > reconstructed, we give up entirely.
>
> No, you misread. If the first PMD is not suitable, we give up
> reconstructing PUD page, but we still look at all PMD of the PUD range.

Aah, indeed. I got my brain in a twist reading that pud function.

> But this might be excessive. What you are proposing below should be fine
> and more efficient. If we do everything right the rest of PMDs in the PUD
> have to already large or not suitable for reconstructing.

Just so.

> We might still look at the rest of PMDs for CONFIG_CPA_DEBUG, just to make
> sure we don't miss some corner case where we didn't restore a PMD.
>
> (Also I think about s/restore/reconstruct/g)

Right, and WARN if they do succeed collapsing ;-)

2020-04-17 15:49:10

by Peter Zijlstra

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Fri, Apr 17, 2020 at 12:32:29AM +0300, Kirill A. Shutemov wrote:
> +static void cpa_restore_large_pages(struct cpa_data *cpa,
> + struct list_head *pgtables)
> +{
> + unsigned long start, addr, end;
> + int i;
> +

> + start = __cpa_addr(cpa, 0);
> + end = start + PAGE_SIZE * cpa->numpages;
> +
> + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
> + restore_large_pages(addr, pgtables);

Isn't that loop slightly broken?

Consider:

          s                        e
  |---------|---------|---------|---------|
          a0        a1        a2        a3

Where s,e are @start,@end resp. and a# are the consecutive values of
@addr with PUD sized steps.

Then, since a3 is >= @end, we'll not take that iteration and we'll not
try and merge that last PUD, even though we possibly could. One fix is
to truncate @start (and with that @addr) to the beginning of the PUD.

Also, I'm afraid that with my proposal this loop needs to do PMD size
steps. In that regard your version does make some sense. But it is
indeed less efficient for small ranges.

One possible fix is to pass @start,@end into the
restore/reconstruct/collapse such that we can iterate the minimal set of
page-tables for each level.



2020-04-17 16:57:01

by Kirill A. Shutemov

Subject: Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

On Fri, Apr 17, 2020 at 05:47:14PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 17, 2020 at 12:32:29AM +0300, Kirill A. Shutemov wrote:
> > +static void cpa_restore_large_pages(struct cpa_data *cpa,
> > + struct list_head *pgtables)
> > +{
> > + unsigned long start, addr, end;
> > + int i;
> > +
>
> > + start = __cpa_addr(cpa, 0);
> > + end = start + PAGE_SIZE * cpa->numpages;
> > +
> > + for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
> > + restore_large_pages(addr, pgtables);
>
> Isn't that loop slightly broken?
>
> Consider:
>
>           s                        e
>   |---------|---------|---------|---------|
>           a0        a1        a2        a3
>
> Where s,e are @start,@end resp. and a# are the consecutive values of
> @addr with PUD sized steps.
>
> Then, since a3 is >= @end, we'll not take that iteration and we'll not
> try and merge that last PUD, even though we possibly could. One fix is
> to truncate @start (and with that @addr) to the beginning of the PUD.

... or round_up() end. I'll fix it.
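
Something like this untested sketch:

	start = __cpa_addr(cpa, 0);
	end   = round_up(start + PAGE_SIZE * cpa->numpages, PUD_SIZE);

	for (addr = start; addr < end; addr += PUD_SIZE)
		restore_large_pages(addr, pgtables);

so the PUD that only the tail of the range reaches into is visited too.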

> Also, I'm afraid that with my proposal this loop needs to do PMD size
> steps. In that regard your version does make some sense. But it is
> indeed less efficient for small ranges.
>
> One possible fix is to pass @start,@end into the
> restore/reconstruct/collapse such that we can iterate the minimal set of
> page-tables for each level.

Yeah, I'll rework it.

I just realized I missed a TLB flush: we need to flush the TLB twice here.
First to get rid of all TLB entries for the change we've made (before
reconstruction), and then a second time to get rid of the small-page TLB
entries. That's unfortunate.
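
So the flow in cpa_flush() would roughly become (names as in this RFC,
with free_queued_page_tables() standing in for the existing
list_for_each_entry_safe() loop):

	/* 1st flush: drop TLB entries for the attribute change we just made */
	flush_tlb_all();

	/* reconstruct PMD/PUD pages where attributes allow it again */
	cpa_restore_large_pages(cpa, &pgtables);

	/* 2nd flush: drop the now-stale small-page TLB entries */
	flush_tlb_all();

	/* only then free the detached page tables */
	free_queued_page_tables(&pgtables);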

--
Kirill A. Shutemov

2020-04-25 11:49:01

by Chen, Rong A

Subject: [x86/mm/pat] ae64ac1a83: BUG:Bad_page_state_in_process

Greeting,

FYI, we noticed the following commit (built with gcc-7):

commit: ae64ac1a83b2e4c1978f99f3c077c011a5e97350 ("[PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation")
url: https://github.com/0day-ci/linux/commits/Kirill-A-Shutemov/x86-mm-pat-Restore-large-pages-after-fragmentation/20200417-053430


in testcase: rcuperf
with following parameters:

runtime: 300s
perf_type: srcu



on test machine: qemu-system-i386 -enable-kvm -cpu SandyBridge -smp 2 -m 8G

caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):


+--------------------------------------------------+------------+------------+
| | 9786cab674 | ae64ac1a83 |
+--------------------------------------------------+------------+------------+
| boot_successes | 0 | 0 |
| boot_failures | 8 | 8 |
| BUG:soft_lockup-CPU##stuck_for#s![dma-fence:#:#] | 7 | |
| EIP:kthread_should_stop | 1 | |
| Kernel_panic-not_syncing:softlockup:hung_tasks | 7 | |
| BUG:kernel_hang_in_boot_stage | 1 | |
| EIP:thread_signal_callback | 6 | |
| BUG:Bad_page_state_in_process | 0 | 8 |
| BUG:kernel_NULL_pointer_dereference,address | 0 | 1 |
| Oops:#[##] | 0 | 1 |
| EIP:__rb_erase_color | 0 | 1 |
| Kernel_panic-not_syncing:Fatal_exception | 0 | 1 |
+--------------------------------------------------+------------+------------+


If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>


[ 44.684096] BUG: Bad page state in process swapper pfn:04f2f
[ 44.684814] page:ed23c5e0 refcount:0 mapcount:0 mapping:(ptrval) index:0x0
[ 44.685602] flags: 0x1000(reserved)
[ 44.686023] raw: 00001000 00000100 00000122 00000000 00000000 00000000 ffffffff 00000000
[ 44.687005] page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
[ 44.687311] bad because of flags: 0x1000(reserved)
[ 44.687878] Modules linked in:
[ 44.688267] CPU: 0 PID: 1 Comm: swapper Not tainted 5.7.0-rc1-00047-gae64ac1a83b2e #2
[ 44.689200] Call Trace:
[ 44.689520] dump_stack+0x16/0x26
[ 44.689926] bad_page+0xa5/0x110
[ 44.690650] free_pages_check_bad+0x57/0x60
[ 44.691192] free_pcp_prepare+0x187/0x190
[ 44.691684] free_unref_page+0x13/0x40
[ 44.692161] __free_pages+0x25/0x40
[ 44.692602] change_page_attr_set_clr+0x1bd/0x3d0
[ 44.693208] set_memory_ro+0x1f/0x30
[ 44.693984] init_real_mode+0xf3/0x10a
[ 44.694425] ? amd_uncore_init+0x285/0x285
[ 44.694925] do_one_initcall+0x81/0x1b0
[ 44.695390] ? _raw_write_lock_irqsave+0x10/0x40
[ 44.697316] ? proc_register+0xa3/0x120
[ 44.697821] ? proc_create_seq_private+0x39/0x40
[ 44.698407] ? init_mm_internals+0x81/0x86
[ 44.698933] kernel_init_freeable+0x58/0x180
[ 44.699475] ? rest_init+0xf0/0xf0
[ 44.699918] kernel_init+0x8/0xe0
[ 44.700646] ret_from_fork+0x19/0x24
[ 44.700944] Disabling lock debugging due to kernel taint
[ ... the same "BUG: Bad page state in process swapper" report, with identical
flags (0x1000, reserved) and the same call trace through set_memory_ro ->
change_page_attr_set_clr -> __free_pages, repeats (now Tainted: G B) for
pfn:04f2e, 04f2d, 04f2b, 04f2a, 04f29, 04f28, 04f27, 04f26, 04f25 and 04f24 ... ]
[ 44.812584] NMI watchdog: Perf NMI watchdog permanently disabled
[ 44.814149] devtmpfs: initialized
[ 44.817084] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 6370867519511994 ns
[ 44.817336] futex hash table entries: 16 (order: -3, 768 bytes, linear)
[ 44.820746] regulator-dummy: no parameters
[ 44.821294] thermal_sys: Registered thermal governor 'step_wise'
[ 44.821421] NET: Registered protocol family 16
[ 44.824061] audit: initializing netlink subsys (disabled)
[ 44.824998] EISA bus registered
[ 44.825378] cpuidle: using governor menu
[ 44.827062] ACPI: bus type PCI registered
[ 44.827627] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.
[ 44.828476] PCI: PCI BIOS revision 2.10 entry at 0xfd1bc, last bus=0
[ 44.847580] audit: type=2000 audit(1587758258.345:1): state=initialized audit_enabled=0 res=1
[ 44.848798] HugeTLB registered 2.00 MiB page size, pre-allocated 0 pages
[ 44.850363] workqueue: round-robin CPU selection forced, expect performance impact
[ 44.856267] gpio-f7188x: Not a Fintek device at 0x0000002e
[ 44.856933] gpio-f7188x: Not a Fintek device at 0x0000004e
[ 44.857444] ACPI: Added _OSI(Module Device)
[ 44.857936] ACPI: Added _OSI(Processor Device)
[ 44.858438] ACPI: Added _OSI(3.0 _SCP Extensions)
[ 44.859006] ACPI: Added _OSI(Processor Aggregator Device)
[ 44.859528] ACPI: Added _OSI(Linux-Dell-Video)
[ 44.859954] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
[ 44.860646] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
[ 44.862719] ACPI: 1 ACPI AML tables successfully acquired and loaded
[ 44.866154] ACPI: Interpreter enabled
[ 44.866606] ACPI: (supports S0 S5)
[ 44.867003] ACPI: Using PIC for interrupt routing
[ 44.867338] PCI: Using host bridge windows from ACPI; if necessary, use "pci=nocrs" and report a bug
[ 44.868124] ACPI: Enabled 2 GPEs in block 00 to 0F
[ 44.871856] ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
[ 44.872315] acpi PNP0A03:00: _OSC: OS supports [ASPM ClockPM Segments HPX-Type3]
[ 44.872889] PCI host bridge to bus 0000:00
[ 44.873329] pci_bus 0000:00: root bus resource [io 0x0000-0x0cf7 window]
[ 44.873979] pci_bus 0000:00: root bus resource [io 0x0d00-0xffff window]
[ 44.874763] pci_bus 0000:00: root bus resource [mem 0x000a0000-0x000bffff window]
[ 44.875638] pci_bus 0000:00: root bus resource [mem 0xc0000000-0xfebfffff window]
[ 44.876498] pci_bus 0000:00: root bus resource [mem 0x240000000-0x2bfffffff window]
[ 44.877313] pci_bus 0000:00: root bus resource [bus 00-ff]
[ 44.878007] pci 0000:00:00.0: [8086:1237] type 00 class 0x060000
[ 44.879161] pci 0000:00:01.0: [8086:7000] type 00 class 0x060100
[ 44.880662] pci 0000:00:01.1: [8086:7010] type 00 class 0x010180
[ 44.889122] pci 0000:00:01.1: reg 0x20: [io 0xc200-0xc20f]
[ 44.892474] pci 0000:00:01.1: legacy IDE quirk: reg 0x10: [io 0x01f0-0x01f7]
[ 44.893295] pci 0000:00:01.1: legacy IDE quirk: reg 0x14: [io 0x03f6]
[ 44.893973] pci 0000:00:01.1: legacy IDE quirk: reg 0x18: [io 0x0170-0x0177]
[ 44.894774] pci 0000:00:01.1: legacy IDE quirk: reg 0x1c: [io 0x0376]
[ 44.895852] pci 0000:00:01.3: [8086:7113] type 00 class 0x068000
[ 44.896877] pci 0000:00:01.3: quirk: [io 0x0600-0x063f] claimed by PIIX4 ACPI
[ 44.897345] pci 0000:00:01.3: quirk: [io 0x0700-0x070f] claimed by PIIX4 SMB
[ 44.900994] pci 0000:00:02.0: [1234:1111] type 00 class 0x030000
[ 44.907328] pci 0000:00:02.0: reg 0x10: [mem 0xfd000000-0xfdffffff pref]
[ 44.914000] pci 0000:00:02.0: reg 0x18: [mem 0xfebf0000-0xfebf0fff]
[ 44.924648] pci 0000:00:02.0: reg 0x30: [mem 0xfebe0000-0xfebeffff pref]
[ 44.925417] pci 0000:00:03.0: [8086:100e] type 00 class 0x020000
[ 44.927035] pci 0000:00:03.0: reg 0x10: [mem 0xfebc0000-0xfebdffff]
[ 44.928866] pci 0000:00:03.0: reg 0x14: [io 0xc000-0xc03f]
[ 44.936749] pci 0000:00:03.0: reg 0x30: [mem 0xfeb80000-0xfebbffff pref]
[ 44.937531] pci 0000:00:04.0: [1af4:1001] type 00 class 0x010000
[ 44.939650] pci 0000:00:04.0: reg 0x10: [io 0xc040-0xc07f]
[ 44.942024] pci 0000:00:04.0: reg 0x14: [mem 0xfebf1000-0xfebf1fff]
[ 44.953966] pci 0000:00:04.0: reg 0x20: [mem 0xfe000000-0xfe003fff 64bit pref]
[ 44.959277] pci 0000:00:05.0: [1af4:1001] type 00 class 0x010000
[ 44.963446] pci 0000:00:05.0: reg 0x10: [io 0xc080-0xc0bf]
[ 44.966776] pci 0000:00:05.0: reg 0x14: [mem 0xfebf2000-0xfebf2fff]
[ 44.976770] pci 0000:00:05.0: reg 0x20: [mem 0xfe004000-0xfe007fff 64bit pref]
[ 44.981292] pci 0000:00:06.0: [1af4:1001] type 00 class 0x010000
[ 44.986773] pci 0000:00:06.0: reg 0x10: [io 0xc0c0-0xc0ff]
[ 44.989996] pci 0000:00:06.0: reg 0x14: [mem 0xfebf3000-0xfebf3fff]
[ 44.998150] pci 0000:00:06.0: reg 0x20: [mem 0xfe008000-0xfe00bfff 64bit pref]
[ 45.001040] pci 0000:00:07.0: [1af4:1001] type 00 class 0x010000
[ 45.003204] pci 0000:00:07.0: reg 0x10: [io 0xc100-0xc13f]


To reproduce:

# build kernel
cd linux
cp config-5.7.0-rc1-00047-gae64ac1a83b2e .config
make HOSTCC=gcc-7 CC=gcc-7 ARCH=i386 olddefconfig prepare modules_prepare bzImage

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> job-script # job-script is attached in this email



Thanks,
Rong Chen


Attachments:
(No filename) (22.53 kB)
config-5.7.0-rc1-00047-gae64ac1a83b2e (146.42 kB)
job-script (4.49 kB)
dmesg.xz (14.45 kB)