2021-06-03 14:25:16

by Mel Gorman

Subject: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

The per-cpu page allocator (PCP) only stores order-0 pages. This means
that all THP and "cheap" high-order allocations, including SLUB's, contend
on the zone->lock. This patch extends the PCP allocator to store THP and
"cheap" high-order pages. Note that struct per_cpu_pages increases in
size to 256 bytes (4 cache lines) on x86-64.
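
As a back-of-the-envelope check of that size claim (a standalone sketch,
not kernel code; the only assumptions are that struct list_head is two
8-byte pointers on x86-64 and that the patch grows the array from 3 to
15 lists, per the NR_PCP_LISTS definition below):

#include <stdio.h>

struct list_head { void *next, *prev; };	/* 16 bytes on x86-64 */

int main(void)
{
        /* 3 lists before the patch vs NR_PCP_LISTS == 15 after */
        printf("list heads before: %zu bytes\n",  3 * sizeof(struct list_head));
        printf("list heads after:  %zu bytes\n", 15 * sizeof(struct list_head));
        return 0;
}

The counters (count, high, batch, free_factor) account for the rest of
the 256 bytes.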

Note that this is not necessarily a universal performance win because of
how it is implemented. High-order pages can cause pcp->high to be exceeded
prematurely for lower orders, so, for example, a large number of THP pages
being freed could release order-0 pages from the PCP lists. Hence, much
depends on the allocation/free pattern as observed by a single CPU to
determine whether caching helps or hurts a particular workload.
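
As a rough illustration of that accounting (a minimal sketch with assumed
pcp->high and pcp->count values, not kernel code): pcp->count is tracked
in base pages, so a single order-9 (THP) free can push the count past
pcp->high on its own and trigger a bulk free that also drains cached
order-0 pages.

#include <stdio.h>

int main(void)
{
        int high = 600;		/* assumed pcp->high */
        int count = 400;	/* base pages already on the PCP lists */

        count += 1 << 9;	/* freeing one THP adds 512 base pages */
        if (count >= high)
                printf("count %d >= high %d: free_pcppages_bulk() would run\n",
                       count, high);
        return 0;
}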

That said, basic performance testing passed. The following is a netperf
UDP_STREAM test which hits the relevant patches as some of the network
allocations are high-order.

netperf-udp
                                 5.13.0-rc2             5.13.0-rc2
                           mm-pcpburst-v3r4   mm-pcphighorder-v1r7
Hmean     send-64         261.46 (   0.00%)      266.30 *   1.85%*
Hmean     send-128        516.35 (   0.00%)      536.78 *   3.96%*
Hmean     send-256       1014.13 (   0.00%)     1034.63 *   2.02%*
Hmean     send-1024      3907.65 (   0.00%)     4046.11 *   3.54%*
Hmean     send-2048      7492.93 (   0.00%)     7754.85 *   3.50%*
Hmean     send-3312     11410.04 (   0.00%)    11772.32 *   3.18%*
Hmean     send-4096     13521.95 (   0.00%)    13912.34 *   2.89%*
Hmean     send-8192     21660.50 (   0.00%)    22730.72 *   4.94%*
Hmean     send-16384    31902.32 (   0.00%)    32637.50 *   2.30%*

From a functional point of view, a patch like this is necessary to
make bulk allocation of high-order pages work with performance similar
to order-0 bulk allocations. The bulk allocator is not updated in this
series, as bulk allocation users would first have to determine how they
want to track the order of pages allocated through it.
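
For reference, the list indexing introduced below can be sanity-checked in
isolation. A minimal standalone sketch, assuming x86-64 defaults
(MIGRATE_PCPTYPES == 3, PAGE_ALLOC_COSTLY_ORDER == 3, pageblock_order == 9,
THP enabled):

#include <stdio.h>

#define MIGRATE_PCPTYPES	3
#define PAGE_ALLOC_COSTLY_ORDER	3
#define NR_PCP_THP		1
#define NR_PCP_LISTS	(MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))

/* Mirrors order_to_pindex() from the diff below */
static int order_to_pindex(int migratetype, int order)
{
        int base = order;

        if (order > PAGE_ALLOC_COSTLY_ORDER)
                base = PAGE_ALLOC_COSTLY_ORDER + 1;	/* the THP slot */

        return (MIGRATE_PCPTYPES * base) + migratetype;
}

int main(void)
{
        printf("NR_PCP_LISTS = %d\n", NR_PCP_LISTS);			/* 15 */
        printf("order 0, mt 1 -> pindex %d\n", order_to_pindex(1, 0));	/* 1 */
        printf("order 9, mt 2 -> pindex %d\n", order_to_pindex(2, 9));	/* 14 */
        return 0;
}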

Signed-off-by: Mel Gorman <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
---
 include/linux/mmzone.h |  20 +++++-
 mm/internal.h          |   2 +-
 mm/page_alloc.c        | 159 +++++++++++++++++++++++++++++------------
 mm/swap.c              |   2 +-
 4 files changed, 135 insertions(+), 48 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e20d98c62beb..f1bed5b847ec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -333,6 +333,24 @@ enum zone_watermarks {
         NR_WMARK
 };
 
+/*
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
+ * for pageblock size for THP if configured.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define NR_PCP_THP 1
+#else
+#define NR_PCP_THP 0
+#endif
+#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
+
+/*
+ * Shift to encode migratetype and order in the same integer, with order
+ * in the least significant bits.
+ */
+#define NR_PCP_ORDER_WIDTH 8
+#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)
+
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
@@ -349,7 +367,7 @@ struct per_cpu_pages {
 #endif
 
         /* Lists of pages, one per migrate type stored on the pcp-lists */
-        struct list_head lists[MIGRATE_PCPTYPES];
+        struct list_head lists[NR_PCP_LISTS];
 };
 
 struct per_cpu_zonestat {
diff --git a/mm/internal.h b/mm/internal.h
index 651250e59ef5..fdb0530fa341 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -218,7 +218,7 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
                                         gfp_t gfp_flags);
 extern int user_min_free_kbytes;
 
-extern void free_unref_page(struct page *page);
+extern void free_unref_page(struct page *page, unsigned int order);
 extern void free_unref_page_list(struct list_head *list);
 
 extern void zone_pcp_update(struct zone *zone, int cpu_online);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d45d00e069f9..7be71f9ad9f0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -676,10 +676,53 @@ static void bad_page(struct page *page, const char *reason)
         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 }
 
+static inline unsigned int order_to_pindex(int migratetype, int order)
+{
+        int base = order;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (order > PAGE_ALLOC_COSTLY_ORDER) {
+                VM_BUG_ON(order != pageblock_order);
+                base = PAGE_ALLOC_COSTLY_ORDER + 1;
+        }
+#else
+        VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+        return (MIGRATE_PCPTYPES * base) + migratetype;
+}
+
+static inline int pindex_to_order(unsigned int pindex)
+{
+        int order = pindex / MIGRATE_PCPTYPES;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (order > PAGE_ALLOC_COSTLY_ORDER) {
+                order = pageblock_order;
+                VM_BUG_ON(order != pageblock_order);
+        }
+#else
+        VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+        return order;
+}
+
+static inline bool pcp_allowed_order(unsigned int order)
+{
+        if (order <= PAGE_ALLOC_COSTLY_ORDER)
+                return true;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (order == pageblock_order)
+                return true;
+#endif
+        return false;
+}
+
 static inline void free_the_page(struct page *page, unsigned int order)
 {
-        if (order == 0) /* Via pcp? */
-                free_unref_page(page);
+        if (pcp_allowed_order(order)) /* Via pcp? */
+                free_unref_page(page, order);
         else
                 __free_pages_ok(page, order, FPI_NONE);
 }
@@ -702,7 +745,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
 void free_compound_page(struct page *page)
 {
         mem_cgroup_uncharge(page);
-        __free_pages_ok(page, compound_order(page), FPI_NONE);
+        free_the_page(page, compound_order(page));
 }
 
 void prep_compound_page(struct page *page, unsigned int order)
@@ -1352,9 +1395,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
  * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
  * moved from pcp lists to free lists.
  */
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
 {
-        return free_pages_prepare(page, 0, true, FPI_NONE);
+        return free_pages_prepare(page, order, true, FPI_NONE);
 }
 
 static bool bulkfree_pcp_prepare(struct page *page)
@@ -1371,12 +1414,12 @@ static bool bulkfree_pcp_prepare(struct page *page)
  * debug_pagealloc enabled, they are checked also immediately when being freed
  * to the pcp lists.
  */
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
 {
         if (debug_pagealloc_enabled_static())
-                return free_pages_prepare(page, 0, true, FPI_NONE);
+                return free_pages_prepare(page, order, true, FPI_NONE);
         else
-                return free_pages_prepare(page, 0, false, FPI_NONE);
+                return free_pages_prepare(page, order, false, FPI_NONE);
 }
 
 static bool bulkfree_pcp_prepare(struct page *page)
@@ -1408,8 +1451,10 @@ static inline void prefetch_buddy(struct page *page)
 static void free_pcppages_bulk(struct zone *zone, int count,
                                         struct per_cpu_pages *pcp)
 {
-        int migratetype = 0;
+        int pindex = 0;
         int batch_free = 0;
+        int nr_freed = 0;
+        unsigned int order;
         int prefetch_nr = READ_ONCE(pcp->batch);
         bool isolated_pageblocks;
         struct page *page, *tmp;
@@ -1420,7 +1465,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
          * below while (list_empty(list)) loop.
          */
         count = min(pcp->count, count);
-        while (count) {
+        while (count > 0) {
                 struct list_head *list;
 
                 /*
@@ -1432,24 +1477,31 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                  */
                 do {
                         batch_free++;
-                        if (++migratetype == MIGRATE_PCPTYPES)
-                                migratetype = 0;
-                        list = &pcp->lists[migratetype];
+                        if (++pindex == NR_PCP_LISTS)
+                                pindex = 0;
+                        list = &pcp->lists[pindex];
                 } while (list_empty(list));
 
                 /* This is the only non-empty list. Free them all. */
-                if (batch_free == MIGRATE_PCPTYPES)
+                if (batch_free == NR_PCP_LISTS)
                         batch_free = count;
 
+                order = pindex_to_order(pindex);
+                BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
                 do {
                         page = list_last_entry(list, struct page, lru);
                         /* must delete to avoid corrupting pcp list */
                         list_del(&page->lru);
-                        pcp->count--;
+                        nr_freed += 1 << order;
+                        count -= 1 << order;
 
                         if (bulkfree_pcp_prepare(page))
                                 continue;
 
+                        /* Encode order with the migratetype */
+                        page->index <<= NR_PCP_ORDER_WIDTH;
+                        page->index |= order;
+
                         list_add_tail(&page->lru, &head);
 
                         /*
@@ -1465,8 +1517,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                                 prefetch_buddy(page);
                                 prefetch_nr--;
                         }
-                } while (--count && --batch_free && !list_empty(list));
+                } while (count > 0 && --batch_free && !list_empty(list));
         }
+        pcp->count -= nr_freed;
 
         /*
          * local_lock_irq held so equivalent to spin_lock_irqsave for
@@ -1481,14 +1534,19 @@ static void free_pcppages_bulk(struct zone *zone, int count,
          */
         list_for_each_entry_safe(page, tmp, &head, lru) {
                 int mt = get_pcppage_migratetype(page);
+
+                /* mt has been encoded with the order (see above) */
+                order = mt & NR_PCP_ORDER_MASK;
+                mt >>= NR_PCP_ORDER_WIDTH;
+
                 /* MIGRATE_ISOLATE page should not go to pcplists */
                 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
                 /* Pageblock could have been isolated meanwhile */
                 if (unlikely(isolated_pageblocks))
                         mt = get_pageblock_migratetype(page);
 
-                __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
-                trace_mm_page_pcpu_drain(page, 0, mt);
+                __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+                trace_mm_page_pcpu_drain(page, order, mt);
         }
         spin_unlock(&zone->lock);
 }
@@ -3265,11 +3323,12 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+                                                        unsigned int order)
 {
         int migratetype;
 
-        if (!free_pcp_prepare(page))
+        if (!free_pcp_prepare(page, order))
                 return false;
 
         migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3319,16 +3378,18 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
 }
 
 static void free_unref_page_commit(struct page *page, unsigned long pfn,
-                                   int migratetype)
+                                   int migratetype, unsigned int order)
 {
         struct zone *zone = page_zone(page);
         struct per_cpu_pages *pcp;
         int high;
+        int pindex;
 
         __count_vm_event(PGFREE);
         pcp = this_cpu_ptr(zone->per_cpu_pageset);
-        list_add(&page->lru, &pcp->lists[migratetype]);
-        pcp->count++;
+        pindex = order_to_pindex(migratetype, order);
+        list_add(&page->lru, &pcp->lists[pindex]);
+        pcp->count += 1 << order;
         high = nr_pcp_high(pcp, zone);
         if (pcp->count >= high) {
                 int batch = READ_ONCE(pcp->batch);
@@ -3338,15 +3399,15 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
 }
 
 /*
- * Free a 0-order page
+ * Free a pcp page
  */
-void free_unref_page(struct page *page)
+void free_unref_page(struct page *page, unsigned int order)
 {
         unsigned long flags;
         unsigned long pfn = page_to_pfn(page);
         int migratetype;
 
-        if (!free_unref_page_prepare(page, pfn))
+        if (!free_unref_page_prepare(page, pfn, order))
                 return;
 
         /*
@@ -3359,14 +3420,14 @@ void free_unref_page(struct page *page)
         migratetype = get_pcppage_migratetype(page);
         if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
                 if (unlikely(is_migrate_isolate(migratetype))) {
-                        free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+                        free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
                         return;
                 }
                 migratetype = MIGRATE_MOVABLE;
         }
 
         local_lock_irqsave(&pagesets.lock, flags);
-        free_unref_page_commit(page, pfn, migratetype);
+        free_unref_page_commit(page, pfn, migratetype, order);
         local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3383,7 +3444,7 @@ void free_unref_page_list(struct list_head *list)
         /* Prepare pages for freeing */
         list_for_each_entry_safe(page, next, list, lru) {
                 pfn = page_to_pfn(page);
-                if (!free_unref_page_prepare(page, pfn))
+                if (!free_unref_page_prepare(page, pfn, 0))
                         list_del(&page->lru);
 
                 /*
@@ -3415,7 +3476,7 @@ void free_unref_page_list(struct list_head *list)
                 set_page_private(page, 0);
                 migratetype = get_pcppage_migratetype(page);
                 trace_mm_page_free_batched(page);
-                free_unref_page_commit(page, pfn, migratetype);
+                free_unref_page_commit(page, pfn, migratetype, 0);
 
                 /*
                  * Guard against excessive IRQ disabled times when we get
@@ -3551,7 +3612,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
-struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+                        int migratetype,
                         unsigned int alloc_flags,
                         struct per_cpu_pages *pcp,
                         struct list_head *list)
@@ -3560,16 +3622,22 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 
         do {
                 if (list_empty(list)) {
-                        pcp->count += rmqueue_bulk(zone, 0,
-                                        READ_ONCE(pcp->batch), list,
+                        int batch = READ_ONCE(pcp->batch);
+                        int alloced;
+
+                        batch = max(batch >> order, 2);
+                        alloced = rmqueue_bulk(zone, order,
+                                        batch, list,
                                         migratetype, alloc_flags);
+
+                        pcp->count += alloced << order;
                         if (unlikely(list_empty(list)))
                                 return NULL;
                 }
 
                 page = list_first_entry(list, struct page, lru);
                 list_del(&page->lru);
-                pcp->count--;
+                pcp->count -= 1 << order;
         } while (check_new_pcp(page));
 
         return page;
@@ -3577,8 +3645,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 
 /* Lock and remove page from the per-cpu list */
 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
-                        struct zone *zone, gfp_t gfp_flags,
-                        int migratetype, unsigned int alloc_flags)
+                        struct zone *zone, unsigned int order,
+                        gfp_t gfp_flags, int migratetype,
+                        unsigned int alloc_flags)
 {
         struct per_cpu_pages *pcp;
         struct list_head *list;
@@ -3594,8 +3663,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
          */
         pcp = this_cpu_ptr(zone->per_cpu_pageset);
         pcp->free_factor >>= 1;
-        list = &pcp->lists[migratetype];
-        page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
+        list = &pcp->lists[order_to_pindex(migratetype, order)];
+        page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
         local_unlock_irqrestore(&pagesets.lock, flags);
         if (page) {
                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3616,15 +3685,15 @@ struct page *rmqueue(struct zone *preferred_zone,
         unsigned long flags;
         struct page *page;
 
-        if (likely(order == 0)) {
+        if (likely(pcp_allowed_order(order))) {
                 /*
                  * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
                  * we need to skip it when CMA area isn't allowed.
                  */
                 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
                                 migratetype != MIGRATE_MOVABLE) {
-                        page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
-                                        migratetype, alloc_flags);
+                        page = rmqueue_pcplist(preferred_zone, zone, order,
                                        gfp_flags, migratetype, alloc_flags);
                        goto out;
                }
        }
@@ -5206,7 +5275,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                         continue;
                 }
 
-                page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+                page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
                                                                 pcp, pcp_list);
                 if (unlikely(!page)) {
                         /* Try and get at least one page */
@@ -6756,13 +6825,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
 
 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
 {
-        int migratetype;
+        int pindex;
 
         memset(pcp, 0, sizeof(*pcp));
         memset(pzstats, 0, sizeof(*pzstats));
 
-        for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
-                INIT_LIST_HEAD(&pcp->lists[migratetype]);
+        for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+                INIT_LIST_HEAD(&pcp->lists[pindex]);
 
         /*
          * Set batch and high values safe for a boot pageset. A true percpu
diff --git a/mm/swap.c b/mm/swap.c
index dfb48cf9c2c9..b953039e087b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -95,7 +95,7 @@ static void __put_single_page(struct page *page)
 {
         __page_cache_release(page);
         mem_cgroup_uncharge(page);
-        free_unref_page(page);
+        free_unref_page(page, 0);
 }
 
 static void __put_compound_page(struct page *page)
--
2.26.2
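
One further note on the free path above: free_pcppages_bulk() temporarily
packs the order into the low NR_PCP_ORDER_WIDTH bits of page->index, next
to the migratetype. A standalone sketch of that round trip (the
migratetype value is an arbitrary assumption for illustration):

#include <stdio.h>

#define NR_PCP_ORDER_WIDTH	8
#define NR_PCP_ORDER_MASK	((1 << NR_PCP_ORDER_WIDTH) - 1)

int main(void)
{
        unsigned long index = 1;	/* stand-in for a migratetype */
        unsigned int order = 9;

        /* Encode, as done while pages sit on the temporary free list */
        index <<= NR_PCP_ORDER_WIDTH;
        index |= order;

        /* Decode, as done under zone->lock before __free_one_page() */
        printf("order=%lu mt=%lu\n",
               index & NR_PCP_ORDER_MASK, index >> NR_PCP_ORDER_WIDTH);
        return 0;
}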


2021-06-09 18:34:45

by Zi Yan

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On 3 Jun 2021, at 10:22, Mel Gorman wrote:

> The per-cpu page allocator (PCP) only stores order-0 pages. This means
> that all THP and "cheap" high-order allocations, including SLUB's, contend
> on the zone->lock. This patch extends the PCP allocator to store THP and
> "cheap" high-order pages. Note that struct per_cpu_pages increases in
> size to 256 bytes (4 cache lines) on x86-64.
>
> Note that this is not necessarily a universal performance win because of
> how it is implemented. High-order pages can cause pcp->high to be exceeded
> prematurely for lower orders, so, for example, a large number of THP pages
> being freed could release order-0 pages from the PCP lists. Hence, much
> depends on the allocation/free pattern as observed by a single CPU to
> determine whether caching helps or hurts a particular workload.
>
> That said, basic performance testing passed. The following is a netperf
> UDP_STREAM test which hits the relevant patches as some of the network
> allocations are high-order.
>
> netperf-udp
>                                  5.13.0-rc2             5.13.0-rc2
>                            mm-pcpburst-v3r4   mm-pcphighorder-v1r7
> Hmean     send-64         261.46 (   0.00%)      266.30 *   1.85%*
> Hmean     send-128        516.35 (   0.00%)      536.78 *   3.96%*
> Hmean     send-256       1014.13 (   0.00%)     1034.63 *   2.02%*
> Hmean     send-1024      3907.65 (   0.00%)     4046.11 *   3.54%*
> Hmean     send-2048      7492.93 (   0.00%)     7754.85 *   3.50%*
> Hmean     send-3312     11410.04 (   0.00%)    11772.32 *   3.18%*
> Hmean     send-4096     13521.95 (   0.00%)    13912.34 *   2.89%*
> Hmean     send-8192     21660.50 (   0.00%)    22730.72 *   4.94%*
> Hmean     send-16384    31902.32 (   0.00%)    32637.50 *   2.30%*
>
> From a functional point of view, a patch like this is necessary to
> make bulk allocation of high-order pages work with performance similar
> to order-0 bulk allocations. The bulk allocator is not updated in this
> series, as bulk allocation users would first have to determine how they
> want to track the order of pages allocated through it.
>
> Signed-off-by: Mel Gorman <[email protected]>
> Acked-by: Vlastimil Babka <[email protected]>
> ---
> include/linux/mmzone.h | 20 +++++-
> mm/internal.h | 2 +-
> mm/page_alloc.c | 159 +++++++++++++++++++++++++++++------------
> mm/swap.c | 2 +-
> 4 files changed, 135 insertions(+), 48 deletions(-)
>

Hi Mel,

I am not able to boot my QEMU VM with v5.13-rc5-mmotm-2021-06-07-18-33.
git bisect points to this patch. The VM got stuck at “Booting from ROM…”.

My kernel config is attached and my qemu command is:

qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
-drive file=~/qemu-image/vm.qcow2,if=virtio \
-append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
-pidfile vm.pid \
-netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
-device virtio-net-pci,netdev=mynet0 \
-m 16g -smp 6 -cpu host -enable-kvm -nographic \
-machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
-object memory-backend-ram,size=8g,id=m1 \
-numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1

The attached config has THP disabled. The VM cannot boot with THP enabled,
either.


Best Regards,
Yan, Zi


Attachments:
.config (136.50 kB)

2021-06-10 11:21:44

by Mel Gorman

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On Wed, Jun 09, 2021 at 02:30:18PM -0400, Zi Yan wrote:
> On 3 Jun 2021, at 10:22, Mel Gorman wrote:
> > Signed-off-by: Mel Gorman <[email protected]>
> > Acked-by: Vlastimil Babka <[email protected]>
> > ---
> > include/linux/mmzone.h | 20 +++++-
> > mm/internal.h | 2 +-
> > mm/page_alloc.c | 159 +++++++++++++++++++++++++++++------------
> > mm/swap.c | 2 +-
> > 4 files changed, 135 insertions(+), 48 deletions(-)
> >
>
> Hi Mel,
>
> I am not able to boot my QEMU VM with v5.13-rc5-mmotm-2021-06-07-18-33.
> git bisect points to this patch. The VM got stuck at “Booting from ROM…”.
>
> My kernel config is attached and my qemu command is:
>
> qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
> -drive file=~/qemu-image/vm.qcow2,if=virtio \
> -append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
> -pidfile vm.pid \
> -netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
> -device virtio-net-pci,netdev=mynet0 \
> -m 16g -smp 6 -cpu host -enable-kvm -nographic \
> -machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
> -object memory-backend-ram,size=8g,id=m1 \
> -numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1
>
> The attached config has THP disabled. The VM cannot boot with THP enabled,
> either.
>

There is not a lot of information to go on here. Can you confirm that a
revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
sounds like your console log is empty, does anything useful appear if
you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?

--
Mel Gorman
SUSE Labs

2021-06-10 11:44:47

by Zi Yan

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On 10 Jun 2021, at 7:18, Mel Gorman wrote:

> On Wed, Jun 09, 2021 at 02:30:18PM -0400, Zi Yan wrote:
>> On 3 Jun 2021, at 10:22, Mel Gorman wrote:
>>> Signed-off-by: Mel Gorman <[email protected]>
>>> Acked-by: Vlastimil Babka <[email protected]>
>>> ---
>>> include/linux/mmzone.h | 20 +++++-
>>> mm/internal.h | 2 +-
>>> mm/page_alloc.c | 159 +++++++++++++++++++++++++++++------------
>>> mm/swap.c | 2 +-
>>> 4 files changed, 135 insertions(+), 48 deletions(-)
>>>
>>
>> Hi Mel,
>>
>> I am not able to boot my QEMU VM with v5.13-rc5-mmotm-2021-06-07-18-33.
>> git bisect points to this patch. The VM got stuck at “Booting from ROM…”.
>>
>> My kernel config is attached and my qemu command is:
>>
>> qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
>> -drive file=~/qemu-image/vm.qcow2,if=virtio \
>> -append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
>> -pidfile vm.pid \
>> -netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
>> -device virtio-net-pci,netdev=mynet0 \
>> -m 16g -smp 6 -cpu host -enable-kvm -nographic \
>> -machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
>> -object memory-backend-ram,size=8g,id=m1 \
>> -numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1
>>
>> The attached config has THP disabled. The VM cannot boot with THP enabled,
>> either.
>>
>
> There is not a lot of information to go on here. Can you confirm that a
> revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
> sounds like your console log is empty, does anything useful appear if
> you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?

Sure. I can confirm that reverting the patch makes the VM boot.
The important information I forgot to mention is that after I remove
the NUMA setting in the QEMU, the VM can boot too.

earlyprintk gave the error message (page out of zone boundary) when the VM could not boot:

[ 0.120569] mem auto-init: stack:off, heap alloc:off, heap free:off
[ 0.161237] Memory: 16396772K/16776684K available (18452K kernel code, 3336K rwdata, 8000K rodata, 1852K init, 1444K bss, 379656K reserved, 0K cma-reserve)
[ 0.162451] page 0x100041 outside node 1 zone Normal [ 0x240000 - 0x440000 ]
[ 0.163057] page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x100041
[ 0.163863] flags: 0x200000000000000(node=0|zone=2)
[ 0.164283] raw: 0200000000000000 dead000000000100 dead000000000122 0000000000000000
[ 0.164950] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
[ 0.165621] page dumped because: VM_BUG_ON_PAGE(page && bad_range(zone, page))
[ 0.166617] ------------[ cut here ]------------
[ 0.167355] kernel BUG at mm/page_alloc.c:3764!
[ 0.168079] invalid opcode: 0000 [#1] SMP NOPTI
[ 0.168801] CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.0-rc5-mm1+ #381
[ 0.169870] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
[ 0.171057] RIP: 0010:get_page_from_freelist+0x10e5/0x1410
[ 0.171763] Code: 4c 24 18 4c 89 ee 8b 14 24 e8 67 d8 fc ff e9 af f6 ff ff 45 31 ff e9 41 fe ff ff 48 c7 c6 60 51 a3 82 4c 89 ff e8 db 82 fd ff <0f> 0b 658
[ 0.174173] RSP: 0000:ffffffff82c03d08 EFLAGS: 00010046
[ 0.174767] RAX: 0000000000000042 RBX: ffff88843fffad00 RCX: 00000000ffffdfff
[ 0.175381] RDX: 0000000000000000 RSI: 00000000ffffffea RDI: 0000000000000000
[ 0.175998] RBP: ffff888237c28300 R08: ffffffff82d509c8 R09: 0000000000009ffb
[ 0.176613] R10: 00000000ffffe000 R11: 3fffffffffffffff R12: ffff888237c28408
[ 0.177229] R13: 0000000000000000 R14: 0000000000000001 R15: ffffea0004001040
[ 0.177852] FS: 0000000000000000(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000
[ 0.178550] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.179045] CR2: ffff88843ffff000 CR3: 0000000002c0c000 CR4: 00000000000406b0
[ 0.179664] Call Trace:
[ 0.179876] ? should_fail_alloc_page+0x5/0x10
[ 0.180260] ? prepare_alloc_pages+0xb4/0x190
[ 0.180637] __alloc_pages+0xe6/0x210
[ 0.180952] allocate_slab+0x9a/0x470
[ 0.181275] __kmem_cache_create+0x259/0x510
[ 0.181645] create_boot_cache+0x72/0x96
[ 0.181998] kmem_cache_init+0xb4/0x17b
[ 0.182332] start_kernel+0x3d5/0x695
[ 0.182649] ? x86_family+0x5/0x20
[ 0.182945] secondary_startup_64_no_verify+0xb0/0xbb
[ 0.183383] Modules linked in:
[ 0.183649] random: get_random_bytes called from oops_exit+0x35/0x60 with crng_init=0
[ 0.183655] ---[ end trace 5a27ab5b99c01a0e ]---
[ 0.184731] RIP: 0010:get_page_from_freelist+0x10e5/0x1410
[ 0.185203] Code: 4c 24 18 4c 89 ee 8b 14 24 e8 67 d8 fc ff e9 af f6 ff ff 45 31 ff e9 41 fe ff ff 48 c7 c6 60 51 a3 82 4c 89 ff e8 db 82 fd ff <0f> 0b 658
[ 0.186828] RSP: 0000:ffffffff82c03d08 EFLAGS: 00010046
[ 0.187283] RAX: 0000000000000042 RBX: ffff88843fffad00 RCX: 00000000ffffdfff
[ 0.187900] RDX: 0000000000000000 RSI: 00000000ffffffea RDI: 0000000000000000
[ 0.188522] RBP: ffff888237c28300 R08: ffffffff82d509c8 R09: 0000000000009ffb
[ 0.189134] R10: 00000000ffffe000 R11: 3fffffffffffffff R12: ffff888237c28408
[ 0.189752] R13: 0000000000000000 R14: 0000000000000001 R15: ffffea0004001040
[ 0.190374] FS: 0000000000000000(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000
[ 0.191074] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.191572] CR2: ffff88843ffff000 CR3: 0000000002c0c000 CR4: 00000000000406b0
[ 0.192188] Kernel panic - not syncing: Attempted to kill the idle task!
[ 0.192791] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---


Best Regards,
Yan, Zi



2021-06-10 23:02:24

by Andrew Morton

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On Thu, 10 Jun 2021 07:40:47 -0400 Zi Yan <[email protected]> wrote:

> >> The attached config has THP disabled. The VM cannot boot with THP enabled,
> >> either.
> >>
> >
> > There is not a lot of information to go on here. Can you confirm that a
> > revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
> > sounds like your console log is empty, does anything useful appear if
> > you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?
>
> Sure. I can confirm that reverting the patch makes the VM boot.
> The important information I forgot to mention is that after I remove
> the NUMA setting in the QEMU, the VM can boot too.

Thanks, I'll drop
mm-page_alloc-allow-high-order-pages-to-be-stored-on-the-per-cpu-lists.patch
for now.

2021-06-11 00:40:26

by Stephen Rothwell

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

Hi Andrew,

On Thu, 10 Jun 2021 15:59:40 -0700 Andrew Morton <[email protected]> wrote:
>
> On Thu, 10 Jun 2021 07:40:47 -0400 Zi Yan <[email protected]> wrote:
>
> > >> The attached config has THP disabled. The VM cannot boot with THP enabled,
> > >> either.
> > >>
> > >
> > > There is not a lot of information to go on here. Can you confirm that a
> > > revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
> > > sounds like your console log is empty, does anything useful appear if
> > > you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?
> >
> > Sure. I can confirm that reverting the patch makes the VM boot.
> > The important information I forgot to mention is that after I remove
> > the NUMA setting in the QEMU, the VM can boot too.
>
> Thanks, I'll drop
> mm-page_alloc-allow-high-order-pages-to-be-stored-on-the-per-cpu-lists.patch
> for now.
>

Dropped from linux-next today.

--
Cheers,
Stephen Rothwell



2021-06-11 08:12:53

by Mel Gorman

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On Thu, Jun 10, 2021 at 07:40:47AM -0400, Zi Yan wrote:
> >> My kernel config is attached and my qemu command is:
> >>
> >> qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
> >> -drive file=~/qemu-image/vm.qcow2,if=virtio \
> >> -append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
> >> -pidfile vm.pid \
> >> -netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
> >> -device virtio-net-pci,netdev=mynet0 \
> >> -m 16g -smp 6 -cpu host -enable-kvm -nographic \
> >> -machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
> >> -object memory-backend-ram,size=8g,id=m1 \
> >> -numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1
> >>
> >> The attached config has THP disabled. The VM cannot boot with THP enabled,
> >> either.
> >>
> >
> > There is not a lot of information to go on here. Can you confirm that a
> > revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
> > sounds like your console log is empty, does anything useful appear if
> > you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?
>
> Sure. I can confirm that reverting the patch makes the VM boot.
> The important information I forgot to mention is that after I remove
> the NUMA setting in the QEMU, the VM can boot too.
>
> earlyprintk gave the error message (page out of zone boundary) when the VM could not boot:
>

Ok, thanks, that helps. For a page to be outside its zone boundary, either
I have completely screwed up the zone handling for PCP or, more likely,
pages are leaking onto the boot pagesets because of the batch count
handling. It's weird that I did not see this on NUMA machines but,
nevertheless, I'll go find it. It should not take long.
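
The suspect arithmetic is easy to see in isolation. A minimal sketch (with
assumed values; this is the pre-fix logic from the patch above): a boot
pageset has pcp->batch == 1, but max(batch >> order, 2) floors the request
at two pages, so a page can be left cached on a pageset that is shared
across zones.

#include <stdio.h>

#define max(a, b)	((a) > (b) ? (a) : (b))

int main(void)
{
        int batch = 1;	/* boot pageset batch */
        int order = 0;

        /* Floors at 2 even for boot pagesets: one page goes to the
         * caller, one is left "free" on the PCP list. */
        printf("pages requested: %d\n", max(batch >> order, 2));
        return 0;
}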

--
Mel Gorman
SUSE Labs

2021-06-11 08:36:47

by Mel Gorman

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On Thu, Jun 10, 2021 at 07:40:47AM -0400, Zi Yan wrote:
> >> qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
> >> -drive file=~/qemu-image/vm.qcow2,if=virtio \
> >> -append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
> >> -pidfile vm.pid \
> >> -netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
> >> -device virtio-net-pci,netdev=mynet0 \
> >> -m 16g -smp 6 -cpu host -enable-kvm -nographic \
> >> -machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
> >> -object memory-backend-ram,size=8g,id=m1 \
> >> -numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1
> >>
> >> The attached config has THP disabled. The VM cannot boot with THP enabled,
> >> either.
> >>
> >
> > There is not a lot of information to go on here. Can you confirm that a
> > revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
> > sounds like your console log is empty, does anything useful appear if
> > you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?
>
> Sure. I can confirm that reverting the patch makes the VM boot.
> The important information I forgot to mention is that after I remove
> the NUMA setting in the QEMU, the VM can boot too.
>
> earlyprintk gave the error message (page out of zone boundary) when the VM could not boot:
>

Can you test with the following patch please?

--8<---
mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists -fix

Zi Ya reported the following problem

I am not able to boot my QEMU VM with v5.13-rc5-mmotm-2021-06-07-18-33.
git bisect points to this patch. The VM got stuck at "Booting from ROM"

"This patch" is "mm/page_alloc: Allow high-order pages to be stored on
the per-cpu lists" and earlyprintk showed the following

[ 0.161237] Memory: 16396772K/16776684K available (18452K kernel code, 3336K rwdata, 8000K rodata, 1852K init, 1444K bss, 379656K reserved, 0K cma-reserve)
[ 0.162451] page 0x100041 outside node 1 zone Normal [ 0x240000 - 0x440000 ]
[ 0.163057] page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x100041

The patch is allowing pages from different zones to exist on the PCP
lists which is not allowed. Review found two problems -- first, the
bulk allocator is not using the correct PCP lists. It happens to work
because it's order-0 only but it's wrong. The real problem is that the
boot pagesets can store free pages which is not allowed.

Signed-off-by: Mel Gorman <[email protected]>
---
mm/page_alloc.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d6d90f046c94..8472bae567f0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3625,7 +3625,15 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
                         int batch = READ_ONCE(pcp->batch);
                         int alloced;
 
-                        batch = max(batch >> order, 2);
+                        /*
+                         * Scale batch relative to order if batch implies
+                         * free pages can be stored on the PCP. Batch can
+                         * be 1 for small zones or for boot pagesets which
+                         * should never store free pages as the pages may
+                         * belong to arbitrary zones.
+                         */
+                        if (batch > 1)
+                                batch = max(batch >> order, 2);
                         alloced = rmqueue_bulk(zone, order,
                                         batch, list,
                                         migratetype, alloc_flags);
@@ -5265,7 +5273,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
         /* Attempt the batch allocation */
         local_lock_irqsave(&pagesets.lock, flags);
         pcp = this_cpu_ptr(zone->per_cpu_pageset);
-        pcp_list = &pcp->lists[ac.migratetype];
+        pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
 
         while (nr_populated < nr_pages) {
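
A quick check of the second hunk: with order 0, order_to_pindex(mt, 0)
evaluates to (MIGRATE_PCPTYPES * 0) + mt == mt, which is why the old
lists[ac.migratetype] indexing "happened to work" for the order-0 bulk
allocator. A standalone sketch under the same assumptions as the original
patch (MIGRATE_PCPTYPES == 3, order below the costly threshold):

#include <assert.h>

#define MIGRATE_PCPTYPES	3

/* Mirrors order_to_pindex() for non-costly orders */
static int order_to_pindex(int migratetype, int order)
{
        return (MIGRATE_PCPTYPES * order) + migratetype;
}

int main(void)
{
        int mt;

        /* The fix changes the indexing, not the order-0 result */
        for (mt = 0; mt < MIGRATE_PCPTYPES; mt++)
                assert(order_to_pindex(mt, 0) == mt);
        return 0;
}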

2021-06-11 12:19:42

by Zi Yan

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On 11 Jun 2021, at 4:34, Mel Gorman wrote:

> On Thu, Jun 10, 2021 at 07:40:47AM -0400, Zi Yan wrote:
>>>> qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
>>>> -drive file=~/qemu-image/vm.qcow2,if=virtio \
>>>> -append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
>>>> -pidfile vm.pid \
>>>> -netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
>>>> -device virtio-net-pci,netdev=mynet0 \
>>>> -m 16g -smp 6 -cpu host -enable-kvm -nographic \
>>>> -machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
>>>> -object memory-backend-ram,size=8g,id=m1 \
>>>> -numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1
>>>>
>>>> The attached config has THP disabled. The VM cannot boot with THP enabled,
>>>> either.
>>>>
>>>
>>> There is not a lot of information to go on here. Can you confirm that a
>>> revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
>>> sounds like your console log is empty, does anything useful appear if
>>> you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?
>>
>> Sure. I can confirm that reverting the patch makes the VM boot.
>> The important information I forgot to mention is that after I remove
>> the NUMA setting in the QEMU, the VM can boot too.
>>
>> earlyprintk gave the error message (page out of zone boundary) when the VM could not boot:
>>
>
> Can you test with the following patch please?
>
> --8<---
> mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists -fix
>
> Zi Ya reported the following problem
s/Zi Ya/Zi Yan/
>
> I am not able to boot my QEMU VM with v5.13-rc5-mmotm-2021-06-07-18-33.
> git bisect points to this patch. The VM got stuck at "Booting from ROM"
>
> "This patch" is "mm/page_alloc: Allow high-order pages to be stored on
> the per-cpu lists" and earlyprintk showed the following
>
> [ 0.161237] Memory: 16396772K/16776684K available (18452K kernel code, 3336K rwdata, 8000K rodata, 1852K init, 1444K bss, 379656K reserved, 0K cma-reserve)
> [ 0.162451] page 0x100041 outside node 1 zone Normal [ 0x240000 - 0x440000 ]
> [ 0.163057] page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x100041
>
> The patch is allowing pages from different zones to exist on the PCP
> lists which is not allowed. Review found two problems -- first, the
> bulk allocator is not using the correct PCP lists. It happens to work
> because it's order-0 only but it's wrong. The real problem is that the
> boot pagesets can store free pages which is not allowed.
>
> Signed-off-by: Mel Gorman <[email protected]>
> ---
> mm/page_alloc.c | 12 ++++++++++--
> 1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d6d90f046c94..8472bae567f0 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3625,7 +3625,15 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
> int batch = READ_ONCE(pcp->batch);
> int alloced;
>
> - batch = max(batch >> order, 2);
> + /*
> + * Scale batch relative to order if batch implies
> + * free pages can be stored on the PCP. Batch can
> + * be 1 for small zones or for boot pagesets which
> + * should never store free pages as the pages may
> + * belong to arbitrary zones.
> + */
> + if (batch > 1)
> + batch = max(batch >> order, 2);
> alloced = rmqueue_bulk(zone, order,
> batch, list,
> migratetype, alloc_flags);
> @@ -5265,7 +5273,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> /* Attempt the batch allocation */
> local_lock_irqsave(&pagesets.lock, flags);
> pcp = this_cpu_ptr(zone->per_cpu_pageset);
> - pcp_list = &pcp->lists[ac.migratetype];
> + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
>
> while (nr_populated < nr_pages) {

Yes. This patch solves the issue. Thanks.


Best Regards,
Yan, Zi



2021-06-11 13:59:58

by Mel Gorman

Subject: Re: [PATCH 2/2] mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists

On Fri, Jun 11, 2021 at 08:17:02AM -0400, Zi Yan wrote:
> On 11 Jun 2021, at 4:34, Mel Gorman wrote:
>
> > On Thu, Jun 10, 2021 at 07:40:47AM -0400, Zi Yan wrote:
> >>>> qemu-system-x86_64 -kernel ~/repos/linux-1gb-thp/arch/x86/boot/bzImage \
> >>>> -drive file=~/qemu-image/vm.qcow2,if=virtio \
> >>>> -append "nokaslr root=/dev/vda1 rw console=ttyS0 " \
> >>>> -pidfile vm.pid \
> >>>> -netdev user,id=mynet0,hostfwd=tcp::11022-:22 \
> >>>> -device virtio-net-pci,netdev=mynet0 \
> >>>> -m 16g -smp 6 -cpu host -enable-kvm -nographic \
> >>>> -machine hmat=on -object memory-backend-ram,size=8g,id=m0 \
> >>>> -object memory-backend-ram,size=8g,id=m1 \
> >>>> -numa node,memdev=m0,nodeid=0 -numa node,memdev=m1,nodeid=1
> >>>>
> >>>> The attached config has THP disabled. The VM cannot boot with THP enabled,
> >>>> either.
> >>>>
> >>>
> >>> There is not a lot of information to go on here. Can you confirm that a
> >>> revert of that specific patch from mmotm-2021-06-07-18-33 also boots? It
> >>> sounds like your console log is empty, does anything useful appear if
> >>> you add "earlyprintk=serial,ttyS0,115200" to the kernel command line?
> >>
> >> Sure. I can confirm that reverting the patch makes the VM boot.
> >> The important information I forgot to mention is that after I remove
> >> the NUMA setting in the QEMU, the VM can boot too.
> >>
> >> earlyprintk gave the error message (page out of zone boundary) when the VM could not boot:
> >>
> >
> > Can you test with the following patch please?
> >
> > --8<---
> > mm/page_alloc: Allow high-order pages to be stored on the per-cpu lists -fix
> >
> > Zi Ya reported the following problem
> s/Zi Ya/Zi Yan/

Sorry about that typo.

> >
> > I am not able to boot my QEMU VM with v5.13-rc5-mmotm-2021-06-07-18-33.
> > git bisect points to this patch. The VM got stuck at "Booting from ROM"
> >
> > "This patch" is "mm/page_alloc: Allow high-order pages to be stored on
> > the per-cpu lists" and earlyprintk showed the following
> >
> > [ 0.161237] Memory: 16396772K/16776684K available (18452K kernel code, 3336K rwdata, 8000K rodata, 1852K init, 1444K bss, 379656K reserved, 0K cma-reserve)
> > [ 0.162451] page 0x100041 outside node 1 zone Normal [ 0x240000 - 0x440000 ]
> > [ 0.163057] page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x100041
> >
> > The patch is allowing pages from different zones to exist on the PCP
> > lists which is not allowed. Review found two problems -- first, the
> > bulk allocator is not using the correct PCP lists. It happens to work
> > because it's order-0 only but it's wrong. The real problem is that the
> > boot pagesets can store free pages which is not allowed.
> >
> > Signed-off-by: Mel Gorman <[email protected]>
> > ---
> > mm/page_alloc.c | 12 ++++++++++--
> > 1 file changed, 10 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index d6d90f046c94..8472bae567f0 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -3625,7 +3625,15 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
> > int batch = READ_ONCE(pcp->batch);
> > int alloced;
> >
> > - batch = max(batch >> order, 2);
> > + /*
> > + * Scale batch relative to order if batch implies
> > + * free pages can be stored on the PCP. Batch can
> > + * be 1 for small zones or for boot pagesets which
> > + * should never store free pages as the pages may
> > + * belong to arbitrary zones.
> > + */
> > + if (batch > 1)
> > + batch = max(batch >> order, 2);
> > alloced = rmqueue_bulk(zone, order,
> > batch, list,
> > migratetype, alloc_flags);
> > @@ -5265,7 +5273,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
> > /* Attempt the batch allocation */
> > local_lock_irqsave(&pagesets.lock, flags);
> > pcp = this_cpu_ptr(zone->per_cpu_pageset);
> > - pcp_list = &pcp->lists[ac.migratetype];
> > + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
> >
> > while (nr_populated < nr_pages) {
>
> Yes. This patch solves the issue. Thanks.
>

Thanks. As Andrew dropped the patch from mmotm, I've sent a v2 with the
fix included. Thanks for reporting and testing!

--
Mel Gorman
SUSE Labs