Playing with different memory sizes for a x86-64 guest, I discovered that
some memmaps (highest section if max_mem does not fall on the section
boundary) are marked as being valid and online, but contain garbage. We
have to properly initialize these memmaps.
Looking at /proc/kpageflags and friends, I found some more issues,
partially related to this.
David Hildenbrand (3):
mm: fix uninitialized memmaps on a partially populated last section
fs/proc/page.c: allow inspection of last section and fix end detection
mm: initialize memmap of unavailable memory directly
fs/proc/page.c | 15 ++++++++++++---
include/linux/mm.h | 6 ------
mm/page_alloc.c | 43 ++++++++++++++++++++++++++++++++-----------
3 files changed, 44 insertions(+), 20 deletions(-)
--
2.21.0
If max_pfn does not fall onto a section boundary, it is possible to inspect
PFNs up to max_pfn, and PFNs above max_pfn; however, max_pfn itself can't
be inspected. We can have a valid (and online) memmap at and above max_pfn
if max_pfn is not aligned to a section boundary. The whole early section
has a memmap and is marked online. Being able to inspect the state of these
PFNs is valuable for debugging, especially because max_pfn can change on
memory hotplug and expose these memmaps.
Also, querying page flags via "./page-types -r -a 0x144001,"
(tools/vm/page-types.c) inside a x86-64 guest with 4160MB under QEMU
results in an (almost) endless loop in user space, because the end is
not detected properly when starting after max_pfn.
Instead, let's allow to inspect all pages in the highest section and
return 0 directly if we try to access pages above that section.
While at it, check the count before adjusting it, to avoid masking user
errors.
Cc: Alexey Dobriyan <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Oscar Salvador <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Stephen Rothwell <[email protected]>
Cc: [email protected]
Signed-off-by: David Hildenbrand <[email protected]>
---
fs/proc/page.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e40dbfe1168e..da01d3d9999a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -29,6 +29,7 @@
static ssize_t kpagecount_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -37,9 +38,11 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
u64 pcount;
pfn = src / KPMSIZE;
- count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -208,6 +211,7 @@ u64 stable_page_flags(struct page *page)
static ssize_t kpageflags_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -215,9 +219,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
ssize_t ret = 0;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -253,6 +259,7 @@ static const struct file_operations proc_kpageflags_operations = {
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -261,9 +268,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
u64 ino;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
--
2.21.0
Let's make sure that all memory holes are actually marked
PageReserved(), that page_to_pfn() produces reliable results, and that
these pages are not detected as "mmap" pages due to the mapcount.
E.g., booting a x86-64 QEMU guest with 4160 MB:
[ 0.010585] Early memory node ranges
[ 0.010586] node 0: [mem 0x0000000000001000-0x000000000009efff]
[ 0.010588] node 0: [mem 0x0000000000100000-0x00000000bffdefff]
[ 0.010589] node 0: [mem 0x0000000100000000-0x0000000143ffffff]
max_pfn is 0x144000.
Before this change:
[root@localhost ~]# ./page-types -r -a 0x144000,
flags page-count MB symbolic-flags long-symbolic-flags
0x0000000000000800 16384 64 ___________M_______________________________ mmap
total 16384 64
After this change:
[root@localhost ~]# ./page-types -r -a 0x144000,
flags page-count MB symbolic-flags long-symbolic-flags
0x0000000100000000 16384 64 ___________________________r_______________ reserved
total 16384 64
IOW, especially the unavailable physical memory ("memory hole") in the last
section would not get properly marked PageReserved() and is indicated to be
"mmap" memory.
Drop the declaration of that function from include/linux/mm.h - nobody else
needs it, and rename it accordingly.
Note: The fake zone/node might not be covered by the zone/node span. This
is not an urgent issue (for now, we had the same node/zone due to the
zeroing). We'll need a clean way to mark memory holes (e.g., using a page
type PageHole() if possible or a fake ZONE_INVALID) and eventually stop
marking these memory holes PageReserved().
Cc: Andrew Morton <[email protected]>
Cc: Oscar Salvador <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
---
include/linux/mm.h | 6 ------
mm/page_alloc.c | 33 ++++++++++++++++++++++-----------
2 files changed, 22 insertions(+), 17 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5dfbc0e56e67..93ee776c2a1e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2176,12 +2176,6 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
struct mminit_pfnnid_cache *state);
#endif
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
-void zero_resv_unavail(void);
-#else
-static inline void zero_resv_unavail(void) {}
-#endif
-
extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
enum memmap_context, struct vmem_altmap *);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1eb2ce7c79e4..85064abafcc3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6901,10 +6901,10 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
- * pages zeroed
+ * Initialize all valid struct pages in the range [spfn, epfn) and mark them
+ * PageReserved(). Return the number of struct pages that were initialized.
*/
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
{
unsigned long pfn;
u64 pgcnt = 0;
@@ -6915,7 +6915,13 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+ pageblock_nr_pages - 1;
continue;
}
- mm_zero_struct_page(pfn_to_page(pfn));
+ /*
+ * Use a fake node/zone (0) for now. Some of these pages
+ * (in memblock.reserved but not in memblock.memory) will
+ * get re-initialized via reserve_bootmem_region() later.
+ */
+ __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
+ __SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
@@ -6927,7 +6933,7 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly zero those struct pages.
+ * flags). We must explicitly initialize those struct pages.
*
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
@@ -6935,7 +6941,7 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* layout is manually configured via memmap=, or when the highest physical
* address (max_pfn) does not end on a section boundary.
*/
-void __init zero_resv_unavail(void)
+static void __init init_unavailable_mem(void)
{
phys_addr_t start, end;
u64 i, pgcnt;
@@ -6948,7 +6954,8 @@ void __init zero_resv_unavail(void)
for_each_mem_range(i, &memblock.memory, NULL,
NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
if (next < start)
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ PFN_UP(start));
next = end;
}
@@ -6959,8 +6966,8 @@ void __init zero_resv_unavail(void)
* considered initialized. Make sure that memmap has a well defined
* state.
*/
- pgcnt += zero_pfn_range(PFN_DOWN(next),
- round_up(max_pfn, PAGES_PER_SECTION));
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
@@ -6969,6 +6976,10 @@ void __init zero_resv_unavail(void)
if (pgcnt)
pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
+#else
+static inline void __init init_unavailable_mem(void)
+{
+}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -7398,7 +7409,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- zero_resv_unavail();
+ init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
@@ -7593,7 +7604,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init free_area_init(unsigned long *zones_size)
{
- zero_resv_unavail();
+ init_unavailable_mem();
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
--
2.21.0
If max_pfn is not aligned to a section boundary, we can easily run into
BUGs. This can e.g., be triggered on x86-64 under QEMU by specifying a
memory size that is not a multiple of 128MB (e.g., 4097MB, but also
4160MB). I was told that on real HW, we can easily have this scenario
(esp., one of the main reasons sub-section hotadd of devmem was added).
The issue is, that we have a valid memmap (pfn_valid()) for the
whole section, and the whole section will be marked "online".
pfn_to_online_page() will succeed, but the memmap contains garbage.
E.g., doing a "cat /proc/kpageflags > /dev/null" results in
[ 303.218313] BUG: unable to handle page fault for address: fffffffffffffffe
[ 303.218899] #PF: supervisor read access in kernel mode
[ 303.219344] #PF: error_code(0x0000) - not-present page
[ 303.219787] PGD 12614067 P4D 12614067 PUD 12616067 PMD 0
[ 303.220266] Oops: 0000 [#1] SMP NOPTI
[ 303.220587] CPU: 0 PID: 424 Comm: cat Not tainted 5.4.0-next-20191128+ #17
[ 303.221169] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
[ 303.222140] RIP: 0010:stable_page_flags+0x4d/0x410
[ 303.222554] Code: f3 ff 41 89 c0 48 b8 00 00 00 00 01 00 00 00 45 84 c0 0f 85 cd 02 00 00 48 8b 53 08 48 8b 2b 48f
[ 303.224135] RSP: 0018:ffff9f5980187e58 EFLAGS: 00010202
[ 303.224576] RAX: fffffffffffffffe RBX: ffffda1285004000 RCX: ffff9f5980187dd4
[ 303.225178] RDX: 0000000000000001 RSI: ffffffff92662420 RDI: 0000000000000246
[ 303.225789] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000
[ 303.226405] R10: 0000000000000000 R11: 0000000000000000 R12: 00007f31d070e000
[ 303.227012] R13: 0000000000140100 R14: 00007f31d070e800 R15: ffffda1285004000
[ 303.227629] FS: 00007f31d08f6580(0000) GS:ffff90a6bba00000(0000) knlGS:0000000000000000
[ 303.228329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 303.228820] CR2: fffffffffffffffe CR3: 00000001332a2000 CR4: 00000000000006f0
[ 303.229438] Call Trace:
[ 303.229654] kpageflags_read.cold+0x57/0xf0
[ 303.230016] proc_reg_read+0x3c/0x60
[ 303.230332] vfs_read+0xc2/0x170
[ 303.230614] ksys_read+0x65/0xe0
[ 303.230898] do_syscall_64+0x5c/0xa0
[ 303.231216] entry_SYSCALL_64_after_hwframe+0x49/0xbe
This patch fixes that by at least zero-ing out that memmap (so e.g.,
page_to_pfn() will not crash). Commit 907ec5fca3dc ("mm: zero remaining
unavailable struct pages") tried to fix a similar issue, but forgot to
consider this special case.
After this patch, there are still problems to solve. E.g., not all of these
pages falling into a memory hole will actually get initialized later
and set PageReserved - they are only zeroed out - but at least the
immediate crashes are gone. A follow-up patch will take care of this.
Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
Cc: <[email protected]> # v4.15+
Cc: Naoya Horiguchi <[email protected]>
Cc: Pavel Tatashin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Steven Sistare <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Daniel Jordan <[email protected]>
Cc: Bob Picco <[email protected]>
Cc: Oscar Salvador <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
---
mm/page_alloc.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 62dcd6b76c80..1eb2ce7c79e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6932,7 +6932,8 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=.
+ * layout is manually configured via memmap=, or when the highest physical
+ * address (max_pfn) does not end on a section boundary.
*/
void __init zero_resv_unavail(void)
{
@@ -6950,7 +6951,16 @@ void __init zero_resv_unavail(void)
pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
next = end;
}
- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
+
+ /*
+ * Early sections always have a fully populated memmap for the whole
+ * section - see pfn_valid(). If the last section has holes at the
+ * end and that section is marked "online", the memmap will be
+ * considered initialized. Make sure that memmap has a well defined
+ * state.
+ */
+ pgcnt += zero_pfn_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
--
2.21.0
Hi David,
On Mon, Dec 09, 2019 at 06:48:34PM +0100, David Hildenbrand wrote:
> If max_pfn is not aligned to a section boundary, we can easily run into
> BUGs. This can e.g., be triggered on x86-64 under QEMU by specifying a
> memory size that is not a multiple of 128MB (e.g., 4097MB, but also
> 4160MB). I was told that on real HW, we can easily have this scenario
> (esp., one of the main reasons sub-section hotadd of devmem was added).
>
> The issue is, that we have a valid memmap (pfn_valid()) for the
> whole section, and the whole section will be marked "online".
> pfn_to_online_page() will succeed, but the memmap contains garbage.
>
> E.g., doing a "cat /proc/kpageflags > /dev/null" results in
>
> [ 303.218313] BUG: unable to handle page fault for address: fffffffffffffffe
> [ 303.218899] #PF: supervisor read access in kernel mode
> [ 303.219344] #PF: error_code(0x0000) - not-present page
> [ 303.219787] PGD 12614067 P4D 12614067 PUD 12616067 PMD 0
> [ 303.220266] Oops: 0000 [#1] SMP NOPTI
> [ 303.220587] CPU: 0 PID: 424 Comm: cat Not tainted 5.4.0-next-20191128+ #17
I can't reproduce this on x86-64 qemu, next-20191128 or mainline, with either
memory size. What config are you using? How often are you hitting it?
It may not have anything to do with the config, and I may be getting lucky with
the garbage in my memory.
Hi David,
I love your patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on linux/master v5.5-rc1 next-20191209]
[cannot apply to mmotm/master]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
url: https://github.com/0day-ci/linux/commits/David-Hildenbrand/mm-fix-max_pfn-not-falling-on-section-boundary/20191210-071011
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 6794862a16ef41f753abd75c03a152836e4c8028
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-7 (Debian 7.5.0-1) 7.5.0
reproduce:
# save the attached .config to linux build tree
make ARCH=um SUBARCH=x86_64
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
All error/warnings (new ones prefixed by >>):
In file included from include/asm-generic/bug.h:19:0,
from ./arch/um/include/generated/asm/bug.h:1,
from include/linux/bug.h:5,
from include/linux/mmdebug.h:5,
from include/linux/mm.h:9,
from include/linux/memblock.h:13,
from fs/proc/page.c:2:
fs/proc/page.c: In function 'kpagecount_read':
>> fs/proc/page.c:32:55: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'USEC_PER_SEC'?
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^
include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
^
>> fs/proc/page.c:32:37: note: in expansion of macro 'round_up'
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^~~~~~~~
fs/proc/page.c:32:55: note: each undeclared identifier is reported only once for each function it appears in
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^
include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
^
>> fs/proc/page.c:32:37: note: in expansion of macro 'round_up'
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^~~~~~~~
fs/proc/page.c: In function 'kpageflags_read':
fs/proc/page.c:212:55: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'USEC_PER_SEC'?
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^
include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
^
fs/proc/page.c:212:37: note: in expansion of macro 'round_up'
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^~~~~~~~
vim +32 fs/proc/page.c
> 2 #include <linux/memblock.h>
3 #include <linux/compiler.h>
4 #include <linux/fs.h>
5 #include <linux/init.h>
6 #include <linux/ksm.h>
7 #include <linux/mm.h>
8 #include <linux/mmzone.h>
9 #include <linux/huge_mm.h>
10 #include <linux/proc_fs.h>
11 #include <linux/seq_file.h>
12 #include <linux/hugetlb.h>
13 #include <linux/memcontrol.h>
14 #include <linux/mmu_notifier.h>
15 #include <linux/page_idle.h>
16 #include <linux/kernel-page-flags.h>
17 #include <linux/uaccess.h>
18 #include "internal.h"
19
20 #define KPMSIZE sizeof(u64)
21 #define KPMMASK (KPMSIZE - 1)
22 #define KPMBITS (KPMSIZE * BITS_PER_BYTE)
23
24 /* /proc/kpagecount - an array exposing page counts
25 *
26 * Each entry is a u64 representing the corresponding
27 * physical page count.
28 */
29 static ssize_t kpagecount_read(struct file *file, char __user *buf,
30 size_t count, loff_t *ppos)
31 {
> 32 const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
33 u64 __user *out = (u64 __user *)buf;
34 struct page *ppage;
35 unsigned long src = *ppos;
36 unsigned long pfn;
37 ssize_t ret = 0;
38 u64 pcount;
39
40 pfn = src / KPMSIZE;
41 if (src & KPMMASK || count & KPMMASK)
42 return -EINVAL;
43 if (src >= max_dump_pfn * KPMSIZE)
44 return 0;
45 count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
46
47 while (count > 0) {
48 /*
49 * TODO: ZONE_DEVICE support requires to identify
50 * memmaps that were actually initialized.
51 */
52 ppage = pfn_to_online_page(pfn);
53
54 if (!ppage || PageSlab(ppage) || page_has_type(ppage))
55 pcount = 0;
56 else
57 pcount = page_mapcount(ppage);
58
59 if (put_user(pcount, out)) {
60 ret = -EFAULT;
61 break;
62 }
63
64 pfn++;
65 out++;
66 count -= KPMSIZE;
67
68 cond_resched();
69 }
70
71 *ppos += (char __user *)out - buf;
72 if (!ret)
73 ret = (char __user *)out - buf;
74 return ret;
75 }
76
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
Hi David,
I love your patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on linux/master v5.5-rc1 next-20191209]
[cannot apply to mmotm/master]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
url: https://github.com/0day-ci/linux/commits/David-Hildenbrand/mm-fix-max_pfn-not-falling-on-section-boundary/20191210-071011
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 6794862a16ef41f753abd75c03a152836e4c8028
config: i386-defconfig (attached as .config)
compiler: gcc-7 (Debian 7.5.0-1) 7.5.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
All errors (new ones prefixed by >>):
In file included from include/asm-generic/bug.h:19:0,
from arch/x86/include/asm/bug.h:83,
from include/linux/bug.h:5,
from include/linux/mmdebug.h:5,
from include/linux/mm.h:9,
from include/linux/memblock.h:13,
from fs/proc/page.c:2:
fs/proc/page.c: In function 'kpagecount_read':
>> fs/proc/page.c:32:55: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'PAGE_KERNEL_IO'?
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^
include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
^
fs/proc/page.c:32:37: note: in expansion of macro 'round_up'
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^~~~~~~~
fs/proc/page.c:32:55: note: each undeclared identifier is reported only once for each function it appears in
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^
include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
^
fs/proc/page.c:32:37: note: in expansion of macro 'round_up'
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^~~~~~~~
fs/proc/page.c: In function 'kpageflags_read':
fs/proc/page.c:212:55: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'PAGE_KERNEL_IO'?
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^
include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
^
fs/proc/page.c:212:37: note: in expansion of macro 'round_up'
const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
^~~~~~~~
vim +32 fs/proc/page.c
> 2 #include <linux/memblock.h>
3 #include <linux/compiler.h>
4 #include <linux/fs.h>
5 #include <linux/init.h>
6 #include <linux/ksm.h>
7 #include <linux/mm.h>
8 #include <linux/mmzone.h>
9 #include <linux/huge_mm.h>
10 #include <linux/proc_fs.h>
11 #include <linux/seq_file.h>
12 #include <linux/hugetlb.h>
13 #include <linux/memcontrol.h>
14 #include <linux/mmu_notifier.h>
15 #include <linux/page_idle.h>
16 #include <linux/kernel-page-flags.h>
17 #include <linux/uaccess.h>
18 #include "internal.h"
19
20 #define KPMSIZE sizeof(u64)
21 #define KPMMASK (KPMSIZE - 1)
22 #define KPMBITS (KPMSIZE * BITS_PER_BYTE)
23
24 /* /proc/kpagecount - an array exposing page counts
25 *
26 * Each entry is a u64 representing the corresponding
27 * physical page count.
28 */
29 static ssize_t kpagecount_read(struct file *file, char __user *buf,
30 size_t count, loff_t *ppos)
31 {
> 32 const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
33 u64 __user *out = (u64 __user *)buf;
34 struct page *ppage;
35 unsigned long src = *ppos;
36 unsigned long pfn;
37 ssize_t ret = 0;
38 u64 pcount;
39
40 pfn = src / KPMSIZE;
41 if (src & KPMMASK || count & KPMMASK)
42 return -EINVAL;
43 if (src >= max_dump_pfn * KPMSIZE)
44 return 0;
45 count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
46
47 while (count > 0) {
48 /*
49 * TODO: ZONE_DEVICE support requires to identify
50 * memmaps that were actually initialized.
51 */
52 ppage = pfn_to_online_page(pfn);
53
54 if (!ppage || PageSlab(ppage) || page_has_type(ppage))
55 pcount = 0;
56 else
57 pcount = page_mapcount(ppage);
58
59 if (put_user(pcount, out)) {
60 ret = -EFAULT;
61 break;
62 }
63
64 pfn++;
65 out++;
66 count -= KPMSIZE;
67
68 cond_resched();
69 }
70
71 *ppos += (char __user *)out - buf;
72 if (!ret)
73 ret = (char __user *)out - buf;
74 return ret;
75 }
76
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
On 09.12.19 22:15, Daniel Jordan wrote:
> Hi David,
>
> On Mon, Dec 09, 2019 at 06:48:34PM +0100, David Hildenbrand wrote:
>> If max_pfn is not aligned to a section boundary, we can easily run into
>> BUGs. This can e.g., be triggered on x86-64 under QEMU by specifying a
>> memory size that is not a multiple of 128MB (e.g., 4097MB, but also
>> 4160MB). I was told that on real HW, we can easily have this scenario
>> (esp., one of the main reasons sub-section hotadd of devmem was added).
>>
>> The issue is, that we have a valid memmap (pfn_valid()) for the
>> whole section, and the whole section will be marked "online".
>> pfn_to_online_page() will succeed, but the memmap contains garbage.
>>
>> E.g., doing a "cat /proc/kpageflags > /dev/null" results in
>>
>> [ 303.218313] BUG: unable to handle page fault for address: fffffffffffffffe
>> [ 303.218899] #PF: supervisor read access in kernel mode
>> [ 303.219344] #PF: error_code(0x0000) - not-present page
>> [ 303.219787] PGD 12614067 P4D 12614067 PUD 12616067 PMD 0
>> [ 303.220266] Oops: 0000 [#1] SMP NOPTI
>> [ 303.220587] CPU: 0 PID: 424 Comm: cat Not tainted 5.4.0-next-20191128+ #17
>
Hi Daniel,
> I can't reproduce this on x86-64 qemu, next-20191128 or mainline, with either
> memory size. What config are you using? How often are you hitting it?
Thanks for verifying! Hah, there is one piece missing to reproduce via
"cat /proc/kpageflags > /dev/null" that I ignored on my QEMU cmdline (see below).
I can reproduce it reliably (QEMU with "-m 4160M") via
[root@localhost ~]# uname -a
Linux localhost 5.5.0-rc1-next-20191209 #93 SMP Tue Dec 10 10:46:19 CET 2019 x86_64 x86_64 x86_64 GNU/Linux
[root@localhost ~]# ./page-types -r -a 0x144001
[ 200.476376] BUG: unable to handle page fault for address: fffffffffffffffe
[ 200.477500] #PF: supervisor read access in kernel mode
[ 200.478334] #PF: error_code(0x0000) - not-present page
[ 200.479076] PGD 59614067 P4D 59614067 PUD 59616067 PMD 0
[ 200.479557] Oops: 0000 [#4] SMP NOPTI
[ 200.479875] CPU: 0 PID: 603 Comm: page-types Tainted: G D W 5.5.0-rc1-next-20191209 #93
[ 200.480646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4
[ 200.481648] RIP: 0010:stable_page_flags+0x4d/0x410
[ 200.482061] Code: f3 ff 41 89 c0 48 b8 00 00 00 00 01 00 00 00 45 84 c0 0f 85 cd 02 00 00 48 8b 53 08 48 8b 2b 48f
[ 200.483644] RSP: 0018:ffffb139401cbe60 EFLAGS: 00010202
[ 200.484091] RAX: fffffffffffffffe RBX: fffffbeec5100040 RCX: 0000000000000000
[ 200.484697] RDX: 0000000000000001 RSI: ffffffff9535c7cd RDI: 0000000000000246
[ 200.485313] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000
[ 200.485917] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000144001
[ 200.486523] R13: 00007ffd6ba55f48 R14: 00007ffd6ba55f40 R15: ffffb139401cbf08
[ 200.487130] FS: 00007f68df717580(0000) GS:ffff9ec77fa00000(0000) knlGS:0000000000000000
[ 200.487804] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 200.488295] CR2: fffffffffffffffe CR3: 0000000135d48000 CR4: 00000000000006f0
[ 200.488897] Call Trace:
[ 200.489115] kpageflags_read+0xe9/0x140
[ 200.489447] proc_reg_read+0x3c/0x60
[ 200.489755] vfs_read+0xc2/0x170
[ 200.490037] ksys_pread64+0x65/0xa0
[ 200.490352] do_syscall_64+0x5c/0xa0
[ 200.490665] entry_SYSCALL_64_after_hwframe+0x49/0xbe
(tool located in tools/vm/page-types.c, see also patch #2)
To reproduce via "cat /proc/kpageflags > /dev/null", you have to
hot/coldplug one DIMM, to move max_pfn beyond the garbage memmap
(see also patch #2). My QEMU cmdline with Fedora 31:
qemu-system-x86_64 \
--enable-kvm \
-m 4160M,slots=4,maxmem=8G \
-hda Fedora-Cloud-Base-31-1.9.x86_64.qcow2 \
-machine pc \
-nographic \
-nodefaults \
-chardev stdio,id=serial,signal=off \
-device isa-serial,chardev=serial \
-object memory-backend-ram,id=mem0,size=1024M \
-device pc-dimm,id=dimm0,memdev=mem0
[root@localhost ~]# uname -a
Linux localhost 5.3.7-301.fc31.x86_64 #1 SMP Mon Oct 21 19:18:58 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux
[root@localhost ~]# cat /proc/kpageflags > /dev/null
[ 111.517275] BUG: unable to handle page fault for address: fffffffffffffffe
[ 111.517907] #PF: supervisor read access in kernel mode
[ 111.518333] #PF: error_code(0x0000) - not-present page
[ 111.518771] PGD a240e067 P4D a240e067 PUD a2410067 PMD 0
>
> It may not have anything to do with the config, and I may be getting lucky with
> the garbage in my memory.
>
Some things that might be relevant from my config.
# CONFIG_PAGE_POISONING is not set
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_SPARSEMEM_EXTREME=y
CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y
CONFIG_HAVE_MEMBLOCK_NODE_MAP=y
CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTPLUG_SPARSE=y
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
The F31 default config should make it trigger.
Will update this patch description - thanks!
...
--
Thanks,
David / dhildenb
On 10.12.19 02:04, kbuild test robot wrote:
> Hi David,
>
> I love your patch! Yet something to improve:
>
> [auto build test ERROR on linus/master]
> [also build test ERROR on linux/master v5.5-rc1 next-20191209]
> [cannot apply to mmotm/master]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
>
> url: https://github.com/0day-ci/linux/commits/David-Hildenbrand/mm-fix-max_pfn-not-falling-on-section-boundary/20191210-071011
> base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 6794862a16ef41f753abd75c03a152836e4c8028
> config: i386-defconfig (attached as .config)
> compiler: gcc-7 (Debian 7.5.0-1) 7.5.0
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=i386
>
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot <[email protected]>
>
> All errors (new ones prefixed by >>):
>
> In file included from include/asm-generic/bug.h:19:0,
> from arch/x86/include/asm/bug.h:83,
> from include/linux/bug.h:5,
> from include/linux/mmdebug.h:5,
> from include/linux/mm.h:9,
> from include/linux/memblock.h:13,
> from fs/proc/page.c:2:
> fs/proc/page.c: In function 'kpagecount_read':
>>> fs/proc/page.c:32:55: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'PAGE_KERNEL_IO'?
> const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
> ^
> include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
> #define __round_mask(x, y) ((__typeof__(x))((y)-1))
> ^
> fs/proc/page.c:32:37: note: in expansion of macro 'round_up'
> const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
> ^~~~~~~~
> fs/proc/page.c:32:55: note: each undeclared identifier is reported only once for each function it appears in
> const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
> ^
> include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
> #define __round_mask(x, y) ((__typeof__(x))((y)-1))
> ^
> fs/proc/page.c:32:37: note: in expansion of macro 'round_up'
> const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
> ^~~~~~~~
> fs/proc/page.c: In function 'kpageflags_read':
> fs/proc/page.c:212:55: error: 'PAGES_PER_SECTION' undeclared (first use in this function); did you mean 'PAGE_KERNEL_IO'?
> const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
> ^
> include/linux/kernel.h:62:46: note: in definition of macro '__round_mask'
> #define __round_mask(x, y) ((__typeof__(x))((y)-1))
> ^
> fs/proc/page.c:212:37: note: in expansion of macro 'round_up'
> const unsigned long max_dump_pfn = round_up(max_pfn, PAGES_PER_SECTION);
> ^~~~~~~~
>
So PAGES_PER_SECTION only applies to CONFIG_SPARSEMEM. Some local function
that messes with that should do the trick. Allows us to add a comment as
well :)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e40dbfe1168e..2984df28ccea 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -21,6 +21,21 @@
#define KPMMASK (KPMSIZE - 1)
#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+static inline unsigned long get_max_dump_pfn(void)
+{
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * The memmap of early sections is completely populated and marked
+ * online even if max_pfn does not fall on a section boundary -
+ * pfn_to_online_page() will succeed on all pages. Allow inspecting
+ * these memmaps.
+ */
+ return round_up(max_pfn, PAGES_PER_SECTION);
+#else
+ return max_pfn;
+#endif
+}
+
/* /proc/kpagecount - an array exposing page counts
*
* Each entry is a u64 representing the corresponding
@@ -29,6 +44,7 @@
static ssize_t kpagecount_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -37,9 +53,11 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
u64 pcount;
pfn = src / KPMSIZE;
- count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -208,6 +226,7 @@ u64 stable_page_flags(struct page *page)
static ssize_t kpageflags_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -215,9 +234,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
ssize_t ret = 0;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
@@ -253,6 +274,7 @@ static const struct file_operations proc_kpageflags_operations = {
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
+ const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
struct page *ppage;
unsigned long src = *ppos;
@@ -261,9 +283,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
u64 ino;
pfn = src / KPMSIZE;
- count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
return -EINVAL;
+ if (src >= max_dump_pfn * KPMSIZE)
+ return 0;
+ count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
/*
--
2.21.0
--
Thanks,
David / dhildenb
On Tue, Dec 10, 2019 at 11:11:03AM +0100, David Hildenbrand wrote:
> Some things that might be relevant from my config.
>
> # CONFIG_PAGE_POISONING is not set
> CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
> CONFIG_SPARSEMEM_EXTREME=y
> CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
> CONFIG_SPARSEMEM_VMEMMAP=y
> CONFIG_HAVE_MEMBLOCK_NODE_MAP=y
> CONFIG_MEMORY_HOTPLUG=y
> CONFIG_MEMORY_HOTPLUG_SPARSE=y
> CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
Thanks for all that. After some poking around, turns out enabling DEBUG_VM
with its page poisoning let me hit it right away, which makes me wonder how
often someone would see this without it.
Anyway, fix looks good to me.
Tested-by: Daniel Jordan <[email protected]>