2014-01-11 20:49:42

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 0/4] EFI memmap fix v2

From: Borislav Petkov <[email protected]>

Ok, here's v2 rebased and rediffed against tip (which has the relevant
efi branches).

Toshi, I'm pushing it to:

git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git#efi-fixes

so that you can give it another run and so that the build robot can poke
at it.

Thanks.

Changelog:

* v1:

this is the result of Toshi and me debugging a #GP on one of his big HP
boxes sporting UEFI. Each commit message should be self-explanatory so
please look there.

This has more or less an RFC nature thus I'm sending it now to collect
feedback. It is going to wait in the EFI queue anyway after the kexec
stuff gets sorted out first.

Comments and suggestions as always are very much appreciated.

Borislav Petkov (4):
x86, ptdump: Add the functionality to dump an arbitrary pagetable
efi: Dump the EFI page table
x86, pageattr: Export page unmapping interface
efi: Make efi virtual runtime map passing more robust

arch/x86/include/asm/efi.h | 3 +-
arch/x86/include/asm/pgtable.h | 3 +-
arch/x86/include/asm/pgtable_types.h | 2 +
arch/x86/mm/dump_pagetables.c | 77 ++++++++++++++++++++++++------------
arch/x86/mm/pageattr.c | 44 +++++++++++++++------
arch/x86/platform/efi/efi.c | 73 ++++++++++++++++++++++++++--------
arch/x86/platform/efi/efi_32.c | 6 ++-
arch/x86/platform/efi/efi_64.c | 31 +++++++++++++--
8 files changed, 177 insertions(+), 62 deletions(-)

--
1.8.5.2.192.g7794a68


2014-01-11 20:49:48

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 3/4] x86, pageattr: Export page unmapping interface

From: Borislav Petkov <[email protected]>

We will use it in efi so expose it.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/pgtable_types.h | 2 ++
arch/x86/mm/pageattr.c | 44 +++++++++++++++++++++++++-----------
2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a83aa44bb1fb..765a4f52d6cd 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -384,6 +384,8 @@ extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags);
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages);
#endif /* !__ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b3b19f46c016..a3488689e301 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -692,6 +692,18 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
return true;
}

+static bool try_to_free_pud_page(pud_t *pud)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud_none(pud[i]))
+ return false;
+
+ free_page((unsigned long)pud);
+ return true;
+}
+
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -805,6 +817,16 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}

+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd_entry = root + pgd_index(addr);
+
+ unmap_pud_range(pgd_entry, addr, end);
+
+ if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+ pgd_clear(pgd_entry);
+}
+
static int alloc_pte_page(pmd_t *pmd)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -999,9 +1021,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
- bool allocd_pgd = false;
- pgd_t *pgd_entry;
pud_t *pud = NULL; /* shut up gcc */
+ pgd_t *pgd_entry;
int ret;

pgd_entry = cpa->pgd + pgd_index(addr);
@@ -1015,7 +1036,6 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
return -1;

set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
- allocd_pgd = true;
}

pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
@@ -1023,19 +1043,11 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)

ret = populate_pud(cpa, addr, pgd_entry, pgprot);
if (ret < 0) {
- unmap_pud_range(pgd_entry, addr,
+ unmap_pgd_range(cpa->pgd, addr,
addr + (cpa->numpages << PAGE_SHIFT));
-
- if (allocd_pgd) {
- /*
- * If I allocated this PUD page, I can just as well
- * free it in this error path.
- */
- pgd_clear(pgd_entry);
- free_page((unsigned long)pud);
- }
return ret;
}
+
cpa->numpages = ret;
return 0;
}
@@ -1861,6 +1873,12 @@ out:
return retval;
}

+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages)
+{
+ unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
--
1.8.5.2.192.g7794a68

2014-01-11 20:49:44

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 2/4] efi: Dump the EFI page table

From: Borislav Petkov <[email protected]>

This is very useful for debugging issues with the recently added
pagetable switching code for EFI virtual mode.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/platform/efi/efi.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index d62ec87a2b26..c34be4ce94c9 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -54,6 +54,7 @@
#include <asm/rtc.h>

#define EFI_DEBUG
+#undef EFI_PGT_DBG

#define EFI_MIN_RESERVE 5120

@@ -818,6 +819,15 @@ void efi_memory_uc(u64 addr, unsigned long size)
set_memory_uc(addr, npages);
}

+static void dump_pagetable(void)
+{
+#if defined(EFI_PGT_DBG) && defined(CONFIG_X86_PTDUMP)
+ pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+
+ walk_pgd_level(NULL, pgd);
+#endif
+}
+
void __init old_map_region(efi_memory_desc_t *md)
{
u64 start_pfn, end_pfn, end;
@@ -1033,6 +1043,7 @@ void __init efi_enter_virtual_mode(void)

efi_setup_page_tables();
efi_sync_low_kernel_mappings();
+ dump_pagetable();

if (!efi_setup) {
status = phys_efi_set_virtual_address_map(
--
1.8.5.2.192.g7794a68

2014-01-11 20:50:29

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

From: Borislav Petkov <[email protected]>

With reusing the ->trampoline_pgd page table for mapping EFI regions in
order to use them after having switched to EFI virtual mode, it is very
useful to be able to dump aforementioned page table in dmesg. This adds
that functionality through the walk_pgd_level() interface which can be
called from somewhere else.

The original functionality of dumping to debugfs remains untouched.

Cc: Arjan van de Ven <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/pgtable.h | 3 +-
arch/x86/mm/dump_pagetables.c | 77 ++++++++++++++++++++++++++++--------------
2 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bbc8b12fa443..0001851fa785 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -15,9 +15,10 @@
: (prot))

#ifndef __ASSEMBLY__
-
#include <asm/x86_init.h>

+void walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0002a3a33081..f987ecff9226 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -19,6 +19,8 @@

#include <asm/pgtable.h>

+static bool dump_to_dmesg;
+
/*
* The dumper groups pagetable entries of the same type into one, and for
* that it needs to keep some state when walking, and flush this state
@@ -88,6 +90,24 @@ static struct addr_marker address_markers[] = {
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)

+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ if (dump_to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, fmt, args...) \
+({ \
+ if (dump_to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
/*
* Print a readable form of a pgprot_t to the seq_file
*/
@@ -99,47 +119,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)

if (!pgprot_val(prot)) {
/* Not present */
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");
} else {
if (pr & _PAGE_USER)
- seq_printf(m, "USR ");
+ pt_dump_cont_printf(m, "USR ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");
if (pr & _PAGE_RW)
- seq_printf(m, "RW ");
+ pt_dump_cont_printf(m, "RW ");
else
- seq_printf(m, "ro ");
+ pt_dump_cont_printf(m, "ro ");
if (pr & _PAGE_PWT)
- seq_printf(m, "PWT ");
+ pt_dump_cont_printf(m, "PWT ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");
if (pr & _PAGE_PCD)
- seq_printf(m, "PCD ");
+ pt_dump_cont_printf(m, "PCD ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");

/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
- seq_printf(m, "PSE ");
+ pt_dump_cont_printf(m, "PSE ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");
} else {
if (pr & _PAGE_PAT)
- seq_printf(m, "pat ");
+ pt_dump_cont_printf(m, "pat ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");
}
if (pr & _PAGE_GLOBAL)
- seq_printf(m, "GLB ");
+ pt_dump_cont_printf(m, "GLB ");
else
- seq_printf(m, " ");
+ pt_dump_cont_printf(m, " ");
if (pr & _PAGE_NX)
- seq_printf(m, "NX ");
+ pt_dump_cont_printf(m, "NX ");
else
- seq_printf(m, "x ");
+ pt_dump_cont_printf(m, "x ");
}
- seq_printf(m, "%s\n", level_name[level]);
+ pt_dump_cont_printf(m, "%s\n", level_name[level]);
}

/*
@@ -178,7 +198,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
@@ -188,16 +208,16 @@ static void note_page(struct seq_file *m, struct pg_state *st,
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
+ pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);

delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(m, "%9lu%c ", delta, *unit);
+ pt_dump_cont_printf(m, "%9lu%c ", delta, *unit);
printk_prot(m, st->current_prot, st->level);

/*
@@ -207,7 +227,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
*/
if (st->current_address >= st->marker[1].start_address) {
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
}

st->start_address = st->current_address;
@@ -296,7 +316,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif

-static void walk_pgd_level(struct seq_file *m)
+void walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -306,6 +326,11 @@ static void walk_pgd_level(struct seq_file *m)
int i;
struct pg_state st;

+ if (pgd) {
+ start = pgd;
+ dump_to_dmesg = true;
+ }
+
memset(&st, 0, sizeof(st));

for (i = 0; i < PTRS_PER_PGD; i++) {
@@ -331,7 +356,7 @@ static void walk_pgd_level(struct seq_file *m)

static int ptdump_show(struct seq_file *m, void *v)
{
- walk_pgd_level(m);
+ walk_pgd_level(m, NULL);
return 0;
}

--
1.8.5.2.192.g7794a68

2014-01-11 20:50:27

by Borislav Petkov

[permalink] [raw]
Subject: [PATCH 4/4] efi: Make efi virtual runtime map passing more robust

From: Borislav Petkov <[email protected]>

Currently, running SetVirtualAddressMap() and passing the physical
address of the virtual map array was working only by a lucky coincidence
because the memory was present in the EFI page table too. Until Toshi
went and booted this on a big HP box - the krealloc() manner of resizing
the memmap we're doing did allocate from such physical addresses which
were not mapped anymore and boom:

http://lkml.kernel.org/r/[email protected]

One way to take care of that issue is to reimplement the krealloc thing
but with pages. We start with contiguous pages of order 1, i.e. 2 pages,
and when we deplete that memory (shouldn't happen all that often but you
know firmware) we realloc the next power-of-two pages.

Having the pages, it is much more handy and easy to map them into the
EFI page table with the already existing mapping code which we're using
for building the virtual mappings.

And, it doesn't matter all that much how much pages we've used as we're
freeing them right after they've fulfilled their purpose at the end of
the function anyway.

Reported-by: Toshi Kani <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/efi.h | 3 +-
arch/x86/platform/efi/efi.c | 62 ++++++++++++++++++++++++++++++------------
arch/x86/platform/efi/efi_32.c | 6 +++-
arch/x86/platform/efi/efi_64.c | 31 +++++++++++++++++++--
4 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 3b978c472d08..0e7973f7492e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -130,7 +130,8 @@ extern void efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
extern void efi_sync_low_kernel_mappings(void);
-extern void efi_setup_page_tables(void);
+extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
+extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
extern void __init old_map_region(efi_memory_desc_t *md);

struct efi_setup_data {
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index c34be4ce94c9..65a8c969db87 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -948,14 +948,36 @@ static void __init efi_map_regions_fixed(void)

}

+static void *realloc_pages(void *old_memmap, int old_shift)
+{
+ void *ret;
+
+ ret = (void *)__get_free_pages(GFP_KERNEL, old_shift + 1);
+ if (!ret)
+ goto out;
+
+ /*
+ * A first-time allocation doesn't have anything to copy.
+ */
+ if (!old_memmap)
+ return ret;
+
+ memcpy(ret, old_memmap, PAGE_SIZE << old_shift);
+
+out:
+ __free_pages(old_memmap, old_shift);
+ return ret;
+}
+
/*
- * Map efi memory ranges for runtime serivce and update new_memmap with virtual
- * addresses.
+ * Map the efi memory ranges of the runtime services and update new_mmap with
+ * virtual addresses.
*/
-static void * __init efi_map_regions(int *count)
+static void * __init efi_map_regions(int *count, int *pg_shift)
{
+ void *p, *new_memmap = NULL;
+ unsigned long left = 0;
efi_memory_desc_t *md;
- void *p, *tmp, *new_memmap = NULL;

for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
@@ -970,20 +992,23 @@ static void * __init efi_map_regions(int *count)
efi_map_region(md);
get_systab_virt_addr(md);

- tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size,
- GFP_KERNEL);
- if (!tmp)
- goto out;
- new_memmap = tmp;
+ if (left < memmap.desc_size) {
+ new_memmap = realloc_pages(new_memmap, *pg_shift);
+ if (!new_memmap)
+ return NULL;
+
+ left += PAGE_SIZE << *pg_shift;
+ (*pg_shift)++;
+ }
+
memcpy(new_memmap + (*count * memmap.desc_size), md,
memmap.desc_size);
+
+ left -= memmap.desc_size;
(*count)++;
}

return new_memmap;
-out:
- kfree(new_memmap);
- return NULL;
}

/*
@@ -1009,9 +1034,9 @@ out:
*/
void __init efi_enter_virtual_mode(void)
{
- efi_status_t status;
+ int err, count = 0, pg_shift = 0;
void *new_memmap = NULL;
- int err, count = 0;
+ efi_status_t status;

efi.systab = NULL;

@@ -1028,7 +1053,7 @@ void __init efi_enter_virtual_mode(void)
efi_map_regions_fixed();
} else {
efi_merge_regions();
- new_memmap = efi_map_regions(&count);
+ new_memmap = efi_map_regions(&count, &pg_shift);
if (!new_memmap) {
pr_err("Error reallocating memory, EFI runtime non-functional!\n");
return;
@@ -1041,7 +1066,9 @@ void __init efi_enter_virtual_mode(void)

BUG_ON(!efi.systab);

- efi_setup_page_tables();
+ if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift))
+ return;
+
efi_sync_low_kernel_mappings();
dump_pagetable();

@@ -1083,7 +1110,8 @@ void __init efi_enter_virtual_mode(void)
if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
runtime_code_page_mkexec();

- kfree(new_memmap);
+ efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
+ __free_pages(new_memmap, pg_shift);

/* clean DUMMY object */
efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 249b183cf417..d58a2015e22d 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -40,7 +40,11 @@
static unsigned long efi_rt_eflags;

void efi_sync_low_kernel_mappings(void) {}
-void efi_setup_page_tables(void) {}
+int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+ return 0;
+}
+void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {}

void __init efi_map_region(efi_memory_desc_t *md)
{
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6284f158a47d..3d66844ea156 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -137,12 +137,37 @@ void efi_sync_low_kernel_mappings(void)
sizeof(pgd_t) * num_pgds);
}

-void efi_setup_page_tables(void)
+int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
+ pgd_t *pgd;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ /*
+ * It can happen that the physical address of new_memmap lands in memory
+ * which is not mapped in the EFI page table. Therefore we need to go
+ * and ident-map those pages containing the map before calling
+ * phys_efi_set_virtual_address_map().
+ */
+ if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
+ pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
+ return 1;
+ }
+
efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
+ efi_scratch.use_pgd = true;

- if (!efi_enabled(EFI_OLD_MEMMAP))
- efi_scratch.use_pgd = true;
+ pgd = __va(efi_scratch.efi_pgt);
+
+ return 0;
+}
+
+void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+ pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+
+ kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
}

static void __init __map_region(efi_memory_desc_t *md, u64 va)
--
1.8.5.2.192.g7794a68

2014-01-11 20:59:50

by Joe Perches

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On Sat, 2014-01-11 at 21:49 +0100, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> With reusing the ->trampoline_pgd page table for mapping EFI regions in
> order to use them after having switched to EFI virtual mode, it is very
> useful to be able to dump aforementioned page table in dmesg. This adds
> that functionality through the walk_pgd_level() interface which can be
> called from somewhere else.
[]
> diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
[]
> @@ -306,6 +326,11 @@ static void walk_pgd_level(struct seq_file *m)
> int i;
> struct pg_state st;
>
> + if (pgd) {
> + start = pgd;
> + dump_to_dmesg = true;
> + }

else
dump_to_dmesg = false;

?

2014-01-13 05:32:52

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/11/14 at 09:49pm, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> With reusing the ->trampoline_pgd page table for mapping EFI regions in
> order to use them after having switched to EFI virtual mode, it is very
> useful to be able to dump aforementioned page table in dmesg. This adds
> that functionality through the walk_pgd_level() interface which can be
> called from somewhere else.
>
> The original functionality of dumping to debugfs remains untouched.
>
> Cc: Arjan van de Ven <[email protected]>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> arch/x86/include/asm/pgtable.h | 3 +-
> arch/x86/mm/dump_pagetables.c | 77 ++++++++++++++++++++++++++++--------------
> 2 files changed, 53 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index bbc8b12fa443..0001851fa785 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -15,9 +15,10 @@
> : (prot))
>
> #ifndef __ASSEMBLY__
> -
> #include <asm/x86_init.h>
>
> +void walk_pgd_level(struct seq_file *m, pgd_t *pgd);
> +
> /*
> * ZERO_PAGE is a global shared page that is always zero: used
> * for zero-mapped memory areas etc..
> diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
> index 0002a3a33081..f987ecff9226 100644
> --- a/arch/x86/mm/dump_pagetables.c
> +++ b/arch/x86/mm/dump_pagetables.c
> @@ -19,6 +19,8 @@
>
> #include <asm/pgtable.h>
>
> +static bool dump_to_dmesg;
> +
> /*
> * The dumper groups pagetable entries of the same type into one, and for
> * that it needs to keep some state when walking, and flush this state
> @@ -88,6 +90,24 @@ static struct addr_marker address_markers[] = {
> #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
> #define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
>
> +#define pt_dump_seq_printf(m, fmt, args...) \
> +({ \
> + if (dump_to_dmesg) \
> + printk(KERN_INFO fmt, ##args); \

pr_info? This is for debug purpose, maybe pr_debug is better?
Ditto to other printk callbacks.

> + else \
> + if (m) \
> + seq_printf(m, fmt, ##args); \
> +})
> +
> +#define pt_dump_cont_printf(m, fmt, args...) \
> +({ \
> + if (dump_to_dmesg) \
> + printk(KERN_CONT fmt, ##args); \
> + else \
> + if (m) \
> + seq_printf(m, fmt, ##args); \
> +})
> +
> /*
> * Print a readable form of a pgprot_t to the seq_file
> */
> @@ -99,47 +119,47 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
>
> if (!pgprot_val(prot)) {
> /* Not present */
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
> } else {
> if (pr & _PAGE_USER)
> - seq_printf(m, "USR ");
> + pt_dump_cont_printf(m, "USR ");
> else
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
> if (pr & _PAGE_RW)
> - seq_printf(m, "RW ");
> + pt_dump_cont_printf(m, "RW ");
> else
> - seq_printf(m, "ro ");
> + pt_dump_cont_printf(m, "ro ");
> if (pr & _PAGE_PWT)
> - seq_printf(m, "PWT ");
> + pt_dump_cont_printf(m, "PWT ");
> else
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
> if (pr & _PAGE_PCD)
> - seq_printf(m, "PCD ");
> + pt_dump_cont_printf(m, "PCD ");
> else
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
>
> /* Bit 9 has a different meaning on level 3 vs 4 */
> if (level <= 3) {
> if (pr & _PAGE_PSE)
> - seq_printf(m, "PSE ");
> + pt_dump_cont_printf(m, "PSE ");
> else
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
> } else {
> if (pr & _PAGE_PAT)
> - seq_printf(m, "pat ");
> + pt_dump_cont_printf(m, "pat ");
> else
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
> }
> if (pr & _PAGE_GLOBAL)
> - seq_printf(m, "GLB ");
> + pt_dump_cont_printf(m, "GLB ");
> else
> - seq_printf(m, " ");
> + pt_dump_cont_printf(m, " ");
> if (pr & _PAGE_NX)
> - seq_printf(m, "NX ");
> + pt_dump_cont_printf(m, "NX ");
> else
> - seq_printf(m, "x ");
> + pt_dump_cont_printf(m, "x ");
> }
> - seq_printf(m, "%s\n", level_name[level]);
> + pt_dump_cont_printf(m, "%s\n", level_name[level]);
> }
>
> /*
> @@ -178,7 +198,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
> st->current_prot = new_prot;
> st->level = level;
> st->marker = address_markers;
> - seq_printf(m, "---[ %s ]---\n", st->marker->name);
> + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
> } else if (prot != cur || level != st->level ||
> st->current_address >= st->marker[1].start_address) {
> const char *unit = units;
> @@ -188,16 +208,16 @@ static void note_page(struct seq_file *m, struct pg_state *st,
> /*
> * Now print the actual finished series
> */
> - seq_printf(m, "0x%0*lx-0x%0*lx ",
> - width, st->start_address,
> - width, st->current_address);
> + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ",
> + width, st->start_address,
> + width, st->current_address);
>
> delta = (st->current_address - st->start_address) >> 10;
> while (!(delta & 1023) && unit[1]) {
> delta >>= 10;
> unit++;
> }
> - seq_printf(m, "%9lu%c ", delta, *unit);
> + pt_dump_cont_printf(m, "%9lu%c ", delta, *unit);
> printk_prot(m, st->current_prot, st->level);
>
> /*
> @@ -207,7 +227,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
> */
> if (st->current_address >= st->marker[1].start_address) {
> st->marker++;
> - seq_printf(m, "---[ %s ]---\n", st->marker->name);
> + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
> }
>
> st->start_address = st->current_address;
> @@ -296,7 +316,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
> #define pgd_none(a) pud_none(__pud(pgd_val(a)))
> #endif
>
> -static void walk_pgd_level(struct seq_file *m)
> +void walk_pgd_level(struct seq_file *m, pgd_t *pgd)

How about do not limit to only if (pgd) case, instead do something like below:
set dump_to_dmesg as a module parameter, so user can enable it via kernel cmdline
or set it while insmod.

> {
> #ifdef CONFIG_X86_64
> pgd_t *start = (pgd_t *) &init_level4_pgt;
> @@ -306,6 +326,11 @@ static void walk_pgd_level(struct seq_file *m)
> int i;
> struct pg_state st;
>
> + if (pgd) {
> + start = pgd;
> + dump_to_dmesg = true;
> + }
> +
> memset(&st, 0, sizeof(st));
>
> for (i = 0; i < PTRS_PER_PGD; i++) {
> @@ -331,7 +356,7 @@ static void walk_pgd_level(struct seq_file *m)
>
> static int ptdump_show(struct seq_file *m, void *v)
> {
> - walk_pgd_level(m);
> + walk_pgd_level(m, NULL);
> return 0;
> }
>
> --
> 1.8.5.2.192.g7794a68
>

2014-01-13 05:40:39

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 4/4] efi: Make efi virtual runtime map passing more robust

On 01/11/14 at 09:49pm, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> Currently, running SetVirtualAddressMap() and passing the physical
> address of the virtual map array was working only by a lucky coincidence
> because the memory was present in the EFI page table too. Until Toshi
> went and booted this on a big HP box - the krealloc() manner of resizing
> the memmap we're doing did allocate from such physical addresses which
> were not mapped anymore and boom:
>
> http://lkml.kernel.org/r/[email protected]
>
> One way to take care of that issue is to reimplement the krealloc thing
> but with pages. We start with contiguous pages of order 1, i.e. 2 pages,
> and when we deplete that memory (shouldn't happen all that often but you
> know firmware) we realloc the next power-of-two pages.
>
> Having the pages, it is much more handy and easy to map them into the
> EFI page table with the already existing mapping code which we're using
> for building the virtual mappings.
>
> And, it doesn't matter all that much how much pages we've used as we're
> freeing them right after they've fulfilled their purpose at the end of
> the function anyway.
>
> Reported-by: Toshi Kani <[email protected]>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> arch/x86/include/asm/efi.h | 3 +-
> arch/x86/platform/efi/efi.c | 62 ++++++++++++++++++++++++++++++------------
> arch/x86/platform/efi/efi_32.c | 6 +++-
> arch/x86/platform/efi/efi_64.c | 31 +++++++++++++++++++--
> 4 files changed, 80 insertions(+), 22 deletions(-)
>
> diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
> index 3b978c472d08..0e7973f7492e 100644
> --- a/arch/x86/include/asm/efi.h
> +++ b/arch/x86/include/asm/efi.h
> @@ -130,7 +130,8 @@ extern void efi_memory_uc(u64 addr, unsigned long size);
> extern void __init efi_map_region(efi_memory_desc_t *md);
> extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
> extern void efi_sync_low_kernel_mappings(void);
> -extern void efi_setup_page_tables(void);
> +extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
> +extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
> extern void __init old_map_region(efi_memory_desc_t *md);
>
> struct efi_setup_data {
> diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
> index c34be4ce94c9..65a8c969db87 100644
> --- a/arch/x86/platform/efi/efi.c
> +++ b/arch/x86/platform/efi/efi.c
> @@ -948,14 +948,36 @@ static void __init efi_map_regions_fixed(void)
>
> }
>
> +static void *realloc_pages(void *old_memmap, int old_shift)
> +{
> + void *ret;
> +
> + ret = (void *)__get_free_pages(GFP_KERNEL, old_shift + 1);
> + if (!ret)
> + goto out;
> +
> + /*
> + * A first-time allocation doesn't have anything to copy.
> + */
> + if (!old_memmap)
> + return ret;
> +
> + memcpy(ret, old_memmap, PAGE_SIZE << old_shift);
> +
> +out:
> + __free_pages(old_memmap, old_shift);
> + return ret;
> +}
> +
> /*
> - * Map efi memory ranges for runtime serivce and update new_memmap with virtual
> - * addresses.
> + * Map the efi memory ranges of the runtime services and update new_mmap with
> + * virtual addresses.
> */
> -static void * __init efi_map_regions(int *count)
> +static void * __init efi_map_regions(int *count, int *pg_shift)
> {
> + void *p, *new_memmap = NULL;
> + unsigned long left = 0;
> efi_memory_desc_t *md;
> - void *p, *tmp, *new_memmap = NULL;
>
> for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
> md = p;
> @@ -970,20 +992,23 @@ static void * __init efi_map_regions(int *count)
> efi_map_region(md);
> get_systab_virt_addr(md);
>
> - tmp = krealloc(new_memmap, (*count + 1) * memmap.desc_size,
> - GFP_KERNEL);
> - if (!tmp)
> - goto out;
> - new_memmap = tmp;
> + if (left < memmap.desc_size) {
> + new_memmap = realloc_pages(new_memmap, *pg_shift);
> + if (!new_memmap)
> + return NULL;
> +
> + left += PAGE_SIZE << *pg_shift;
> + (*pg_shift)++;
> + }
> +
> memcpy(new_memmap + (*count * memmap.desc_size), md,
> memmap.desc_size);
> +
> + left -= memmap.desc_size;

Adding a safeguard check for desc_size is better though currently it's impossible
for the desc_size > PAGE_SIZE?

> (*count)++;
> }
>
> return new_memmap;
> -out:
> - kfree(new_memmap);
> - return NULL;
> }
>
> /*
> @@ -1009,9 +1034,9 @@ out:
> */
> void __init efi_enter_virtual_mode(void)
> {
> - efi_status_t status;
> + int err, count = 0, pg_shift = 0;
> void *new_memmap = NULL;
> - int err, count = 0;
> + efi_status_t status;
>
> efi.systab = NULL;
>
> @@ -1028,7 +1053,7 @@ void __init efi_enter_virtual_mode(void)
> efi_map_regions_fixed();
> } else {
> efi_merge_regions();
> - new_memmap = efi_map_regions(&count);
> + new_memmap = efi_map_regions(&count, &pg_shift);
> if (!new_memmap) {
> pr_err("Error reallocating memory, EFI runtime non-functional!\n");
> return;
> @@ -1041,7 +1066,9 @@ void __init efi_enter_virtual_mode(void)
>
> BUG_ON(!efi.systab);
>
> - efi_setup_page_tables();
> + if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift))
> + return;
> +
> efi_sync_low_kernel_mappings();
> dump_pagetable();
>
> @@ -1083,7 +1110,8 @@ void __init efi_enter_virtual_mode(void)
> if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
> runtime_code_page_mkexec();
>
> - kfree(new_memmap);
> + efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
> + __free_pages(new_memmap, pg_shift);
>
> /* clean DUMMY object */
> efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
> diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
> index 249b183cf417..d58a2015e22d 100644
> --- a/arch/x86/platform/efi/efi_32.c
> +++ b/arch/x86/platform/efi/efi_32.c
> @@ -40,7 +40,11 @@
> static unsigned long efi_rt_eflags;
>
> void efi_sync_low_kernel_mappings(void) {}
> -void efi_setup_page_tables(void) {}
> +int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
> +{
> + return 0;
> +}
> +void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {}
>
> void __init efi_map_region(efi_memory_desc_t *md)
> {
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index 6284f158a47d..3d66844ea156 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -137,12 +137,37 @@ void efi_sync_low_kernel_mappings(void)
> sizeof(pgd_t) * num_pgds);
> }
>
> -void efi_setup_page_tables(void)
> +int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
> {
> + pgd_t *pgd;
> +
> + if (efi_enabled(EFI_OLD_MEMMAP))
> + return 0;
> +
> + /*
> + * It can happen that the physical address of new_memmap lands in memory
> + * which is not mapped in the EFI page table. Therefore we need to go
> + * and ident-map those pages containing the map before calling
> + * phys_efi_set_virtual_address_map().
> + */
> + if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
> + pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
> + return 1;
> + }
> +
> efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
> + efi_scratch.use_pgd = true;
>
> - if (!efi_enabled(EFI_OLD_MEMMAP))
> - efi_scratch.use_pgd = true;
> + pgd = __va(efi_scratch.efi_pgt);
> +
> + return 0;
> +}
> +
> +void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
> +{
> + pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
> +
> + kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
> }
>
> static void __init __map_region(efi_memory_desc_t *md, u64 va)
> --
> 1.8.5.2.192.g7794a68
>

2014-01-13 10:18:28

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On Mon, Jan 13, 2014 at 01:32:40PM +0800, Dave Young wrote:
> pr_info? This is for debug purpose, maybe pr_debug is better?

I hate pr_debug as it depends on other defines I have to have defined
properly in order just so I get output.

> How about do not limit to only if (pgd) case, instead do something
> like below: set dump_to_dmesg as a module parameter

X86_PTDUMP is not a module.

And even though the idea to make the efi pagetable dumpable with a
kernel cmdline parameter doesn't sound all that bad, I'd like to hold
off on adding yet another efi= cmdline option and only do so until it is
really needed.

Thanks.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2014-01-13 10:27:40

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 4/4] efi: Make efi virtual runtime map passing more robust

On Mon, Jan 13, 2014 at 01:40:39PM +0800, Dave Young wrote:
> Adding a safeguard check for desc_size is better though currently it's
> impossible for the desc_size > PAGE_SIZE?

Huh, what?

sizeof(efi_memory_desc_t) is only 40 bytes AFAICT.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2014-01-13 12:23:45

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

>
> > How about do not limit to only if (pgd) case, instead do something
> > like below: set dump_to_dmesg as a module parameter
>
> X86_PTDUMP is not a module.

Hmm, I just see the module macros in the code, since it's a bool Kconfig
I think the dump_pagetables.c need a cleanup,
remove the #include <linux/module.h> and below lines:
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <[email protected]>");
MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");

>
> And even though the idea to make the efi pagetable dumpable with a
> kernel cmdline parameter doesn't sound all that bad, I'd like to hold
> off on adding yet another efi= cmdline option and only do so until it is
> really needed.

I think not necessary limit to efi=? Probably something like ptdump_dmesg=1|0

Thanks
Dave

2014-01-13 13:18:07

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 4/4] efi: Make efi virtual runtime map passing more robust

On Mon, Jan 13, 2014 at 11:27:34AM +0100, Borislav Petkov wrote:
> On Mon, Jan 13, 2014 at 01:40:39PM +0800, Dave Young wrote:
> > Adding a safeguard check for desc_size is better though currently it's
> > impossible for the desc_size > PAGE_SIZE?
>
> Huh, what?
>
> sizeof(efi_memory_desc_t) is only 40 bytes AFAICT.

I know this is nearly impossible, but is there possible for some corrupted
desc_size value or firmware bug? who knows...

Thanks
Dave

2014-01-13 14:48:16

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 1/13/2014 4:23 AM, Dave Young wrote:
>>
>>> How about do not limit to only if (pgd) case, instead do something
>>> like below: set dump_to_dmesg as a module parameter
>>
>> X86_PTDUMP is not a module.
>
> Hmm, I just see the module macros in the code, since it's a bool Kconfig
> I think the dump_pagetables.c need a cleanup,
> remove the #include <linux/module.h> and below lines:
> MODULE_LICENSE("GPL");
> MODULE_AUTHOR("Arjan van de Ven <[email protected]>");
> MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");

personally I consider it good form to always have this kind of information in .c files,
regardless of the KConfig side of thing...

2014-01-13 14:57:39

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On Mon, Jan 13, 2014 at 06:48:10AM -0800, Arjan van de Ven wrote:
> >MODULE_LICENSE("GPL");
> >MODULE_AUTHOR("Arjan van de Ven <[email protected]>");
> >MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
>
> personally I consider it good form to always have this kind of
> information in .c files, regardless of the KConfig side of thing...

Agreed.

Let's not touch this unless it is absolutely necessary and until there's
a real reason why it should be touched and not for the sake of the
cleanup itself only.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2014-01-13 15:49:19

by Matt Fleming

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On Sat, 11 Jan, at 09:49:27PM, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> With reusing the ->trampoline_pgd page table for mapping EFI regions in
> order to use them after having switched to EFI virtual mode, it is very
> useful to be able to dump aforementioned page table in dmesg. This adds
> that functionality through the walk_pgd_level() interface which can be
> called from somewhere else.
>
> The original functionality of dumping to debugfs remains untouched.
>
> Cc: Arjan van de Ven <[email protected]>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> arch/x86/include/asm/pgtable.h | 3 +-
> arch/x86/mm/dump_pagetables.c | 77 ++++++++++++++++++++++++++++--------------
> 2 files changed, 53 insertions(+), 27 deletions(-)

[...]

> @@ -296,7 +316,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
> #define pgd_none(a) pud_none(__pud(pgd_val(a)))
> #endif
>
> -static void walk_pgd_level(struct seq_file *m)
> +void walk_pgd_level(struct seq_file *m, pgd_t *pgd)
> {

If you're going to start exporting this function, might I suggest
renaming it something like ptdump_walk_pgd_level()?

--
Matt Fleming, Intel Open Source Technology Center

2014-01-13 16:00:26

by Matt Fleming

[permalink] [raw]
Subject: Re: [PATCH 2/4] efi: Dump the EFI page table

On Sat, 11 Jan, at 09:49:28PM, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> This is very useful for debugging issues with the recently added
> pagetable switching code for EFI virtual mode.
>
> Signed-off-by: Borislav Petkov <[email protected]>
> ---
> arch/x86/platform/efi/efi.c | 11 +++++++++++
> 1 file changed, 11 insertions(+)
>
> diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
> index d62ec87a2b26..c34be4ce94c9 100644
> --- a/arch/x86/platform/efi/efi.c
> +++ b/arch/x86/platform/efi/efi.c
> @@ -54,6 +54,7 @@
> #include <asm/rtc.h>
>
> #define EFI_DEBUG
> +#undef EFI_PGT_DBG

Should this be a Kconfig option? Hacking source files to enable debug
isn't the best idea. Plus you could make it depend on X86_PTDUMP.

--
Matt Fleming, Intel Open Source Technology Center

2014-01-13 16:28:49

by Toshi Kani

[permalink] [raw]
Subject: Re: [PATCH 0/4] EFI memmap fix v2

On Sat, 2014-01-11 at 21:49 +0100, Borislav Petkov wrote:
> From: Borislav Petkov <[email protected]>
>
> Ok, here's v2 rebased and rediffed against tip (which has the relevant
> efi branches).
>
> Toshi, I'm pushing it to:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git#efi-fixes
>
> so that you can give it another run and so that the build robot can poke
> at it.

I tested the 4 patches on top of Matt's efi tree under kexec branch.
Unfortunately, it hit the same panic again... I will send you the full
dmesg with EFI_PGT_DBG enabled in a separate email. I will also test
your git tree as well.

Thanks,
-Toshi

BUG: unable to handle kernel paging request at 00000207fec34020
IP: [<0000000072dcda76>] 0x72dcda76
PGD 0
Oops: 0000 [#1] SMP
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.13.0-rc1+ #82
Hardware name: HP CB920s x1, BIOS Bundle: 005.028.018 SFW: 012.124.000
10/28/2013
task: ffffffff81a10480 ti: ffffffff81a00000 task.ti: ffffffff81a00000
RIP: 0010:[<0000000072dcda76>] [<0000000072dcda76>] 0x72dcda76
RSP: 0000:ffffffff81a01df8 EFLAGS: 00010206
RAX: 00000000725fee18 RBX: 00000000725feda0 RCX: 00000207fec34000
RDX: 0000000072dfe070 RSI: 0000000060000202 RDI: 0000000072dcdac8
RBP: 0000000072dd0560 R08: 0000000000000000 R09: 000000000000001d
R10: 0000000000000030 R11: 8000000000000000 R12: ffff8a07fec34000
R13: 0000000000000001 R14: 0000000000000570 R15: 000000000009c000
FS: 0000000000000000(0000) GS:ffff88087fa00000(0000)
knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000207fec34020 CR3: 000000000009c000 CR4: 00000000000406b0
Stack:
0000000072dfb88f 0000000000000001 0000000000002643 0000000000000000
ffffffff81ace2a8 0000000072dcdac8 0000000072dcdb93 0000000000000001
ffffffff81a01f80 0000000000000570 ffff8a07fffbc000 0000000000000000
Call Trace:
[<ffffffff81046a4c>] ? efi_call4+0x6c/0xf0
[<ffffffff81b05a7a>] ? efi_enter_virtual_mode+0x35d/0x524
[<ffffffff81aede1d>] ? start_kernel+0x397/0x431
[<ffffffff81aed88f>] ? repair_env_string+0x5c/0x5c
[<ffffffff81aed5a3>] ? x86_64_start_reservations+0x2a/0x2c
[<ffffffff81aed696>] ? x86_64_start_kernel+0xf1/0xf4
Code: Bad RIP value.
RIP [<0000000072dcda76>] 0x72dcda76
RSP <ffffffff81a01df8>
CR2: 00000207fec34020
---[ end trace b50fb077cda23ee8 ]---


2014-01-13 17:00:35

by Toshi Kani

[permalink] [raw]
Subject: Re: [PATCH 0/4] EFI memmap fix v2

On Mon, 2014-01-13 at 09:22 -0700, Toshi Kani wrote:
> On Sat, 2014-01-11 at 21:49 +0100, Borislav Petkov wrote:
> > From: Borislav Petkov <[email protected]>
> >
> > Ok, here's v2 rebased and rediffed against tip (which has the relevant
> > efi branches).
> >
> > Toshi, I'm pushing it to:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git#efi-fixes
> >
> > so that you can give it another run and so that the build robot can poke
> > at it.
>
> I tested the 4 patches on top of Matt's efi tree under kexec branch.
> Unfortunately, it hit the same panic again... I will send you the full
> dmesg with EFI_PGT_DBG enabled in a separate email. I will also test
> your git tree as well.

The git tree kernel also hit the same panic. Will send you the dmesg
output in a separate email as well.

Thanks,
-Toshi

2014-01-14 01:36:46

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 2/4] efi: Dump the EFI page table

On 01/13/14 at 04:00pm, Matt Fleming wrote:
> On Sat, 11 Jan, at 09:49:28PM, Borislav Petkov wrote:
> > From: Borislav Petkov <[email protected]>
> >
> > This is very useful for debugging issues with the recently added
> > pagetable switching code for EFI virtual mode.
> >
> > Signed-off-by: Borislav Petkov <[email protected]>
> > ---
> > arch/x86/platform/efi/efi.c | 11 +++++++++++
> > 1 file changed, 11 insertions(+)
> >
> > diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
> > index d62ec87a2b26..c34be4ce94c9 100644
> > --- a/arch/x86/platform/efi/efi.c
> > +++ b/arch/x86/platform/efi/efi.c
> > @@ -54,6 +54,7 @@
> > #include <asm/rtc.h>
> >
> > #define EFI_DEBUG
> > +#undef EFI_PGT_DBG
>
> Should this be a Kconfig option? Hacking source files to enable debug
> isn't the best idea. Plus you could make it depend on X86_PTDUMP.

I have same feeling, either a new Kconfig option or just print it while
ptdump_to_dmesg is true by default without extra Kconfig.

Thanks
Dave

2014-01-14 01:40:01

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/13/14 at 06:48am, Arjan van de Ven wrote:
> On 1/13/2014 4:23 AM, Dave Young wrote:
> >>
> >>>How about do not limit to only if (pgd) case, instead do something
> >>>like below: set dump_to_dmesg as a module parameter
> >>
> >>X86_PTDUMP is not a module.
> >
> >Hmm, I just see the module macros in the code, since it's a bool Kconfig
> >I think the dump_pagetables.c need a cleanup,
> >remove the #include <linux/module.h> and below lines:
> >MODULE_LICENSE("GPL");
> >MODULE_AUTHOR("Arjan van de Ven <[email protected]>");
> >MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
>
> personally I consider it good form to always have this kind of information in .c files,
> regardless of the KConfig side of thing...

I agree it's good to add these infomation, but they can be added in comment.
IMHO it will be better in that way.

Thanks
Dave

2014-01-14 18:19:03

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/13/2014 05:40 PM, Dave Young wrote:
> On 01/13/14 at 06:48am, Arjan van de Ven wrote:
>> On 1/13/2014 4:23 AM, Dave Young wrote:
>>>>
>>>>> How about do not limit to only if (pgd) case, instead do something
>>>>> like below: set dump_to_dmesg as a module parameter
>>>>
>>>> X86_PTDUMP is not a module.
>>>
>>> Hmm, I just see the module macros in the code, since it's a bool Kconfig
>>> I think the dump_pagetables.c need a cleanup,
>>> remove the #include <linux/module.h> and below lines:
>>> MODULE_LICENSE("GPL");
>>> MODULE_AUTHOR("Arjan van de Ven <[email protected]>");
>>> MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
>>
>> personally I consider it good form to always have this kind of information in .c files,
>> regardless of the KConfig side of thing...
>
> I agree it's good to add these infomation, but they can be added in comment.
> IMHO it will be better in that way.
>

Why the [Finnish] do you feel that information needs to be in a
different form because it is (currently!) not available as a module?

-hpa

2014-01-15 01:16:11

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/14/14 at 10:18am, H. Peter Anvin wrote:
> On 01/13/2014 05:40 PM, Dave Young wrote:
> > On 01/13/14 at 06:48am, Arjan van de Ven wrote:
> >> On 1/13/2014 4:23 AM, Dave Young wrote:
> >>>>
> >>>>> How about do not limit to only if (pgd) case, instead do something
> >>>>> like below: set dump_to_dmesg as a module parameter
> >>>>
> >>>> X86_PTDUMP is not a module.
> >>>
> >>> Hmm, I just see the module macros in the code, since it's a bool Kconfig
> >>> I think the dump_pagetables.c need a cleanup,
> >>> remove the #include <linux/module.h> and below lines:
> >>> MODULE_LICENSE("GPL");
> >>> MODULE_AUTHOR("Arjan van de Ven <[email protected]>");
> >>> MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
> >>
> >> personally I consider it good form to always have this kind of information in .c files,
> >> regardless of the KConfig side of thing...
> >
> > I agree it's good to add these infomation, but they can be added in comment.
> > IMHO it will be better in that way.
> >
>
> Why the [Finnish] do you feel that information needs to be in a
> different form because it is (currently!) not available as a module?

I think moving them to comment can avoid including extra linux/module.h

If a module specific parameter is needed maybe it's not bad to add dummy
module_init/module_exit

Thanks
Dave

2014-01-15 14:12:11

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/14/2014 05:16 PM, Dave Young wrote:
>>
>> Why the [Finnish] do you feel that information needs to be in a
>> different form because it is (currently!) not available as a module?
>
> I think moving them to comment can avoid including extra linux/module.h
>

And that matters, why?

-hpa

2014-01-16 01:44:51

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/15/14 at 06:11am, H. Peter Anvin wrote:
> On 01/14/2014 05:16 PM, Dave Young wrote:
> >>
> >> Why the [Finnish] do you feel that information needs to be in a
> >> different form because it is (currently!) not available as a module?
> >
> > I think moving them to comment can avoid including extra linux/module.h
> >
>
> And that matters, why?

At least the code will be cleaner and kernel size will be a bit smaller?

Thanks
Dave

2014-01-16 02:41:33

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 1/15/2014 5:44 PM, Dave Young wrote:
> On 01/15/14 at 06:11am, H. Peter Anvin wrote:
>> On 01/14/2014 05:16 PM, Dave Young wrote:
>>>>
>>>> Why the [Finnish] do you feel that information needs to be in a
>>>> different form because it is (currently!) not available as a module?
>>>
>>> I think moving them to comment can avoid including extra linux/module.h
>>>
>>
>> And that matters, why?
>
> At least the code will be cleaner and kernel size will be a bit smaller?
>

making something harder to grep and less standardized is hardly cleaner
and these things compile to nothing for non-modules.

2014-01-16 03:03:40

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/15/14 at 06:41pm, Arjan van de Ven wrote:
> On 1/15/2014 5:44 PM, Dave Young wrote:
> >On 01/15/14 at 06:11am, H. Peter Anvin wrote:
> >>On 01/14/2014 05:16 PM, Dave Young wrote:
> >>>>
> >>>>Why the [Finnish] do you feel that information needs to be in a
> >>>>different form because it is (currently!) not available as a module?
> >>>
> >>>I think moving them to comment can avoid including extra linux/module.h
> >>>
> >>
> >>And that matters, why?
> >
> >At least the code will be cleaner and kernel size will be a bit smaller?
> >
>
> making something harder to grep and less standardized is hardly cleaner
> and these things compile to nothing for non-modules.

It's not nothing, just very small increasement:
text data bss dec hex filename
7636121 1391824 9355264 18383209 1188169 vmlinux

text data bss dec hex filename
7636113 1391824 9355264 18383201 1188161 vmlinux

I do not want to insist on this minor problem, if you want please keep it.

Thanks
Dave

2014-01-16 04:05:17

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/15/2014 07:03 PM, Dave Young wrote:
>>
>> making something harder to grep and less standardized is hardly cleaner
>> and these things compile to nothing for non-modules.
>
> It's not nothing, just very small increasement:
> text data bss dec hex filename
> 7636121 1391824 9355264 18383209 1188169 vmlinux
>
> text data bss dec hex filename
> 7636113 1391824 9355264 18383201 1188161 vmlinux
>
> I do not want to insist on this minor problem, if you want please keep it.
>

Now *that* we can fix by making these macros expand to nothing if you
are compiling for a nonmodule, right? That will benefit code that
legitimately can compile as a module but is configured "y", too.

-hpa



2014-01-16 06:46:32

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/15/14 at 08:04pm, H. Peter Anvin wrote:
> On 01/15/2014 07:03 PM, Dave Young wrote:
> >>
> >> making something harder to grep and less standardized is hardly cleaner
> >> and these things compile to nothing for non-modules.
> >
> > It's not nothing, just very small increasement:
> > text data bss dec hex filename
> > 7636121 1391824 9355264 18383209 1188169 vmlinux
> >
> > text data bss dec hex filename
> > 7636113 1391824 9355264 18383201 1188161 vmlinux
> >
> > I do not want to insist on this minor problem, if you want please keep it.
> >
>
> Now *that* we can fix by making these macros expand to nothing if you
> are compiling for a nonmodule, right? That will benefit code that
> legitimately can compile as a module but is configured "y", too.

I'm not sure expanding to nothing is a good fix.

For built-in module we still can retrieve the module info in userspace via
/usr/sbin/modinfo, if we expand the Macros to nothing then userspace will
not see thus infomation anymore.

Another advantage of built-in module is it has module specific parameters which
should not necessary a generic kernel parameter.

If the *module* is really a nonmodule, IMHO it should add the information to
or add other macros macros instead of reuse the MODULE_*.

BTW, for macros in this file, for a nonmodule "license GPL" is unnecessary at all.

Thanks
Dave

2014-01-16 06:53:59

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

On 01/16/14 at 02:46pm, Dave Young wrote:
> On 01/15/14 at 08:04pm, H. Peter Anvin wrote:
> > On 01/15/2014 07:03 PM, Dave Young wrote:
> > >>
> > >> making something harder to grep and less standardized is hardly cleaner
> > >> and these things compile to nothing for non-modules.
> > >
> > > It's not nothing, just very small increasement:
> > > text data bss dec hex filename
> > > 7636121 1391824 9355264 18383209 1188169 vmlinux
> > >
> > > text data bss dec hex filename
> > > 7636113 1391824 9355264 18383201 1188161 vmlinux
> > >
> > > I do not want to insist on this minor problem, if you want please keep it.
> > >
> >
> > Now *that* we can fix by making these macros expand to nothing if you
> > are compiling for a nonmodule, right? That will benefit code that
> > legitimately can compile as a module but is configured "y", too.
>
> I'm not sure expanding to nothing is a good fix.
>
> For built-in module we still can retrieve the module info in userspace via
> /usr/sbin/modinfo, if we expand the Macros to nothing then userspace will
> not see thus infomation anymore.
>
> Another advantage of built-in module is it has module specific parameters which
> should not necessary a generic kernel parameter.
>
> If the *module* is really a nonmodule, IMHO it should add the information to

Above I means add infomation to file comment section.

> or add other macros macros instead of reuse the MODULE_*.
>
> BTW, for macros in this file, for a nonmodule "license GPL" is unnecessary at all.
>
> Thanks
> Dave

2014-01-16 14:52:27

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86, ptdump: Add the functionality to dump an arbitrary pagetable

>
> If the *module* is really a nonmodule, IMHO it should add the information to
> or add other macros macros instead of reuse the MODULE_*.
>
> BTW, for macros in this file, for a nonmodule "license GPL" is unnecessary at all.

I really fail to see what you have against machine-and-human readable data