2012-10-03 12:59:32

by Matt Fleming

Subject: [PATCH 0/3] x86/efi: Identity mapping pagetable

From: Matt Fleming <[email protected]>

This series upgrades real_mode_header->trampoline_pgd from a
pagetable that maps only the kernel text and module space into a
proper kernel pagetable. It also inserts the physical mappings for
anything we ioremap(), so that I/O regions are always accessible via
their physical addresses whenever this pagetable is loaded, making it
a true identity mapping.

These changes make it suitable for loading when calling virtual EFI
runtime functions on x86-64. The main benefit is to fix the ASUS
firmware bug documented here,

https://lkml.org/lkml/2012/8/7/108

but having our own EFI pagetable makes sense anyway. The memory map
code is easily the most complicated part of the EFI infrastructure,
and it's likely that there will be other funky stuff we have to do
with our memory map as more and more machines ship with various
implementations of EFI firmware.

Note that we only switch to the identity pagetable for x86-64. I'm
unaware of any bugs like the above on 32-bit EFI platforms, but we can
easily adopt the 64-bit scheme if one is discovered.
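
In outline, the x86-64 scheme is just a CR3 swap around each runtime
call. A minimal sketch (the function name is made up for illustration;
error handling and locking omitted):

	/* Sketch only: run one EFI runtime call with the identity
	 * pagetable loaded, then restore the previous pagetable. */
	static efi_status_t efi_get_time_example(efi_time_t *tm,
						 efi_time_cap_t *tc)
	{
		unsigned long saved_cr3 = read_cr3();
		efi_status_t status;

		write_cr3(real_mode_header->trampoline_pgd); /* 1:1 map live */
		status = efi_call2((void *)efi.systab->runtime->get_time,
				   (u64)tm, (u64)tc);
		write_cr3(saved_cr3);			     /* back to normal */

		return status;
	}

Patch 2 wraps exactly this save/switch/restore pattern into
efi_call_virt_prelog()/efi_call_virt_epilog().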

Matt Fleming (2):
x86, mm: Include the entire kernel memory map in trampoline_pgd
x86, efi: 1:1 pagetable mapping for virtual EFI calls

Xiaoyan Zhang (1):
x86/kernel: remove tboot 1:1 page table creation code

arch/x86/include/asm/efi.h | 28 +++++++++---
arch/x86/kernel/tboot.c | 78 +++------------------------------
arch/x86/mm/init_64.c | 9 +++-
arch/x86/mm/ioremap.c | 99 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/platform/efi/efi_64.c | 15 +++++++
arch/x86/realmode/init.c | 17 +++++++-
6 files changed, 163 insertions(+), 83 deletions(-)

--
1.7.11.4


2012-10-03 12:59:46

by Matt Fleming

Subject: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

From: Matt Fleming <[email protected]>

There are various pieces of code in arch/x86 that require a page table
with an identity mapping. Make trampoline_pgd a proper kernel page
table; currently it includes only the kernel text and module space
mapping.

One new feature of trampoline_pgd is that it now has mappings for the
physical I/O device addresses, which are inserted at ioremap()
time. Some broken implementations of EFI firmware require these
mappings to always be around.
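
To illustrate what this buys us (the device address below is
hypothetical, not taken from the patch):

	/* Illustration: map a fixed MMIO region as usual. */
	void __iomem *regs = ioremap(0xfed00000, PAGE_SIZE);

	/* Normal case: access through the returned virtual address. */
	(void)readl(regs);

	/*
	 * New with this patch: while trampoline_pgd is loaded in CR3,
	 * physical address 0xfed00000 is *also* mapped 1:1 to the same
	 * registers, which is what broken EFI runtime code dereferences
	 * behind our back.
	 */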

Signed-off-by: Matt Fleming <[email protected]>
---
arch/x86/mm/init_64.c | 9 ++++-
arch/x86/mm/ioremap.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++
arch/x86/realmode/init.c | 17 ++++++++-
3 files changed, 122 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 2b6b4a3..fd4404f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -108,13 +108,13 @@ void sync_global_pgds(unsigned long start, unsigned long end)
for (address = start; address <= end; address += PGDIR_SIZE) {
const pgd_t *pgd_ref = pgd_offset_k(address);
struct page *page;
+ pgd_t *pgd;

if (pgd_none(*pgd_ref))
continue;

spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
spinlock_t *pgt_lock;

pgd = (pgd_t *)page_address(page) + pgd_index(address);
@@ -130,6 +130,13 @@ void sync_global_pgds(unsigned long start, unsigned long end)

spin_unlock(pgt_lock);
}
+
+ pgd = __va(real_mode_header->trampoline_pgd);
+ pgd += pgd_index(address);
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+
spin_unlock(&pgd_lock);
}
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78fe3f1..fd3129a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -50,6 +50,101 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}

+static void ident_pte_range(unsigned long paddr, unsigned long vaddr,
+ pmd_t *ppmd, pmd_t *vpmd, unsigned long end)
+{
+ pte_t *ppte = pte_offset_kernel(ppmd, paddr);
+ pte_t *vpte = pte_offset_kernel(vpmd, vaddr);
+
+ do {
+ set_pte(ppte, *vpte);
+ } while (ppte++, vpte++, vaddr += PAGE_SIZE, vaddr != end);
+}
+
+static int ident_pmd_range(unsigned long paddr, unsigned long vaddr,
+ pud_t *ppud, pud_t *vpud, unsigned long end)
+{
+ pmd_t *ppmd = pmd_offset(ppud, paddr);
+ pmd_t *vpmd = pmd_offset(vpud, vaddr);
+ unsigned long next;
+
+ do {
+ next = pmd_addr_end(vaddr, end);
+
+ if (!pmd_present(*ppmd)) {
+ pte_t *ppte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!ppte)
+ return 1;
+
+ set_pmd(ppmd, __pmd(_KERNPG_TABLE | __pa(ppte)));
+ }
+
+ ident_pte_range(paddr, vaddr, ppmd, vpmd, next);
+ } while (ppmd++, vpmd++, vaddr = next, vaddr != end);
+
+ return 0;
+}
+
+static int ident_pud_range(unsigned long paddr, unsigned long vaddr,
+ pgd_t *ppgd, pgd_t *vpgd, unsigned long end)
+{
+ pud_t *ppud = pud_offset(ppgd, paddr);
+ pud_t *vpud = pud_offset(vpgd, vaddr);
+ unsigned long next;
+
+ do {
+ next = pud_addr_end(vaddr, end);
+
+ if (!pud_present(*ppud)) {
+ pmd_t *ppmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!ppmd)
+ return 1;
+
+ set_pud(ppud, __pud(_KERNPG_TABLE | __pa(ppmd)));
+ }
+
+ if (ident_pmd_range(paddr, vaddr, ppud, vpud, next))
+ return 1;
+ } while (ppud++, vpud++, vaddr = next, vaddr != end);
+
+ return 0;
+}
+
+static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
+ unsigned long size)
+{
+ unsigned long end = vaddr + size;
+ unsigned long next;
+ pgd_t *vpgd, *ppgd;
+
+#ifdef CONFIG_X86_32
+ ppgd = initial_page_table + pgd_index(paddr);
+
+ if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
+ return 1;
+#else
+ ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);
+#endif
+
+ vpgd = pgd_offset_k(vaddr);
+ do {
+ next = pgd_addr_end(vaddr, end);
+
+ if (!pgd_present(*ppgd)) {
+ pud_t *ppud = (pud_t *)get_zeroed_page(GFP_KERNEL);
+ if (!ppud)
+ return 1;
+
+ set_pgd(ppgd, __pgd(_KERNPG_TABLE | __pa(ppud)));
+ }
+
+ if (ident_pud_range(paddr, vaddr, ppgd, vpgd, next))
+ return 1;
+ } while (ppgd++, vpgd++, vaddr = next, vaddr != end);
+
+ return 0;
+}
+
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
@@ -163,6 +258,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);

+ if (insert_identity_mapping(phys_addr, vaddr, size))
+ printk(KERN_WARNING "ioremap: unable to map 0x%llx in identity pagetable\n",
+ (unsigned long long)phys_addr);
+
/*
* Check if the request spans more than any BAR in the iomem resource
* tree.
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cbca565..8e6ab61 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -78,8 +78,21 @@ void __init setup_real_mode(void)
*trampoline_cr4_features = read_cr4();

trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
- trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
- trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+
+ /*
+ * Create an identity mapping for all of physical memory.
+ */
+ for (i = 0; i <= pgd_index(max_pfn << PAGE_SHIFT); i++) {
+ int index = pgd_index(PAGE_OFFSET) + i;
+
+ trampoline_pgd[i] = (u64)pgd_val(swapper_pg_dir[index]);
+ }
+
+ /*
+ * Copy the upper-half of the kernel pages tables.
+ */
+ for (i = pgd_index(PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = (u64)pgd_val(swapper_pg_dir[i]);
#endif
}

--
1.7.11.4

2012-10-03 12:59:49

by Matt Fleming

Subject: [PATCH 2/3] x86, efi: 1:1 pagetable mapping for virtual EFI calls

From: Matt Fleming <[email protected]>

Some firmware still needs a 1:1 (virt->phys) mapping even after we've
called SetVirtualAddressMap(). So install the mapping alongside our
existing kernel mapping whenever we make EFI calls in virtual mode.

This bug was discovered on ASUS machines where the firmware
implementation of GetTime() accesses the RTC device via physical
addresses, even though that's bogus per the UEFI spec since we've
informed the firmware via SetVirtualAddressMap() that the boottime
memory map is no longer valid.

This bug seems to be present in a lot of consumer devices, so there's
not a lot we can do about this spec violation apart from working
around it.
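
For clarity, here is roughly what the efi_callx() wrapper below
amounts to for a two-argument call such as GetTime() (a simplified
expansion, not literal preprocessor output):

	/* Simplified expansion of efi_call_virt2(get_time, tm, tc): */
	efi_status_t status;
	unsigned long pgd;

	pgd = efi_call_virt_prelog();	/* load trampoline_pgd into CR3 */
	status = efi_call2((void *)efi.systab->runtime->get_time,
			   (u64)tm, (u64)tc);
	efi_call_virt_epilog(pgd);	/* restore the saved CR3 */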

Cc: Jérôme Carretero <[email protected]>
Cc: Vasco Dias <[email protected]>
Signed-off-by: Matt Fleming <[email protected]>
---
arch/x86/include/asm/efi.h | 28 +++++++++++++++++++++-------
arch/x86/platform/efi/efi_64.c | 15 +++++++++++++++
2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c9dcc18..ae3bf3b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -69,23 +69,37 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
(u64)(a4), (u64)(a5), (u64)(a6))

+extern unsigned long efi_call_virt_prelog(void);
+extern void efi_call_virt_epilog(unsigned long);
+
+#define efi_callx(x, func, ...) \
+ ({ \
+ efi_status_t __status; \
+ unsigned long __pgd; \
+ \
+ __pgd = efi_call_virt_prelog(); \
+ __status = efi_call##x(func, __VA_ARGS__); \
+ efi_call_virt_epilog(__pgd); \
+ __status; \
+ })
+
#define efi_call_virt0(f) \
- efi_call0((void *)(efi.systab->runtime->f))
+ efi_callx(0, (void *)(efi.systab->runtime->f))
#define efi_call_virt1(f, a1) \
- efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
+ efi_callx(1, (void *)(efi.systab->runtime->f), (u64)(a1))
#define efi_call_virt2(f, a1, a2) \
- efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
+ efi_callx(2, (void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
#define efi_call_virt3(f, a1, a2, a3) \
- efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ efi_callx(3, (void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3))
#define efi_call_virt4(f, a1, a2, a3, a4) \
- efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ efi_callx(4, (void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4))
#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
- efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ efi_callx(5, (void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4), (u64)(a5))
#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
- efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+ efi_callx(6, (void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
(u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))

extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac3aa54..ddb0174 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -58,6 +58,21 @@ static void __init early_code_mapping_set_exec(int executable)
}
}

+unsigned long efi_call_virt_prelog(void)
+{
+ unsigned long saved;
+
+ saved = read_cr3();
+ write_cr3(real_mode_header->trampoline_pgd);
+
+ return saved;
+}
+
+void efi_call_virt_epilog(unsigned long saved)
+{
+ write_cr3(saved);
+}
+
void __init efi_call_phys_prelog(void)
{
unsigned long vaddress;
--
1.7.11.4

2012-10-03 12:59:53

by Matt Fleming

Subject: [PATCH 3/3] x86/kernel: remove tboot 1:1 page table creation code

From: Xiaoyan Zhang <[email protected]>

For TXT boot, when the Linux kernel tries to shutdown/S3/S4/reboot,
it needs to jump back to the tboot code to do the TXT teardown
work. Previously the kernel zapped all identity (va=pa) memory
mappings after booting, so the tboot code's memory had to be mapped
again with an identity mapping. Now that the kernel no longer zaps
the identity-mapping page table, the tboot-related code can drop its
own remapping code before trapping back.

Signed-off-by: Xiaoyan Zhang <[email protected]>
Acked-by: Gang Wei <[email protected]>
Signed-off-by: Matt Fleming <[email protected]>
---
arch/x86/kernel/tboot.c | 78 ++++---------------------------------------------
1 file changed, 5 insertions(+), 73 deletions(-)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index f84fe00..d4f460f 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -103,71 +103,13 @@ void __init tboot_probe(void)
pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
}

-static pgd_t *tboot_pg_dir;
-static struct mm_struct tboot_mm = {
- .mm_rb = RB_ROOT,
- .pgd = swapper_pg_dir,
- .mm_users = ATOMIC_INIT(2),
- .mm_count = ATOMIC_INIT(1),
- .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
- .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
- .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
-};
-
static inline void switch_to_tboot_pt(void)
{
- write_cr3(virt_to_phys(tboot_pg_dir));
-}
-
-static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
- pgprot_t prot)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(&tboot_mm, vaddr);
- pud = pud_alloc(&tboot_mm, pgd, vaddr);
- if (!pud)
- return -1;
- pmd = pmd_alloc(&tboot_mm, pud, vaddr);
- if (!pmd)
- return -1;
- pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
- if (!pte)
- return -1;
- set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
- pte_unmap(pte);
- return 0;
-}
-
-static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
- unsigned long nr)
-{
- /* Reuse the original kernel mapping */
- tboot_pg_dir = pgd_alloc(&tboot_mm);
- if (!tboot_pg_dir)
- return -1;
-
- for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
- if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
- return -1;
- }
-
- return 0;
-}
-
-static void tboot_create_trampoline(void)
-{
- u32 map_base, map_size;
-
- /* Create identity map for tboot shutdown code. */
- map_base = PFN_DOWN(tboot->tboot_base);
- map_size = PFN_UP(tboot->tboot_size);
- if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
- panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
- map_base, map_size);
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
}

#ifdef CONFIG_ACPI_SLEEP
@@ -225,14 +167,6 @@ void tboot_shutdown(u32 shutdown_type)
if (!tboot_enabled())
return;

- /*
- * if we're being called before the 1:1 mapping is set up then just
- * return and let the normal shutdown happen; this should only be
- * due to very early panic()
- */
- if (!tboot_pg_dir)
- return;
-
/* if this is S3 then set regions to MAC */
if (shutdown_type == TB_SHUTDOWN_S3)
if (tboot_setup_sleep())
@@ -343,8 +277,6 @@ static __init int tboot_late_init(void)
if (!tboot_enabled())
return 0;

- tboot_create_trampoline();
-
atomic_set(&ap_wfs_count, 0);
register_hotcpu_notifier(&tboot_cpu_notifier);

--
1.7.11.4

2012-10-03 13:31:30

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
>+static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
>+ unsigned long size)
>+{
>+ unsigned long end = vaddr + size;
>+ unsigned long next;
>+ pgd_t *vpgd, *ppgd;
>+
>+#ifdef CONFIG_X86_32
>+ ppgd = initial_page_table + pgd_index(paddr);
>+
>+ if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
>+ return 1;
>+#else
>+ ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);

Missing equivalent code (to the 32-bit one above) here - after all, you're trying
to potentially insert a 52-bit physical address into 48-bit virtual space.

>+#endif
>+
>+ vpgd = pgd_offset_k(vaddr);
>+ do {
>+ next = pgd_addr_end(vaddr, end);
>+
>+ if (!pgd_present(*ppgd)) {
>+ pud_t *ppud = (pud_t *)get_zeroed_page(GFP_KERNEL);
>+ if (!ppud)
>+ return 1;
>+
>+ set_pgd(ppgd, __pgd(_KERNPG_TABLE | __pa(ppud)));
>+ }
>+
>+ if (ident_pud_range(paddr, vaddr, ppgd, vpgd, next))
>+ return 1;
>+ } while (ppgd++, vpgd++, vaddr = next, vaddr != end);
>+
>+ return 0;
>+}
>+
>/*
>* Remap an arbitrary physical address space into the kernel virtual
>* address space. Needed when the kernel wants to access high addresses
>@@ -163,6 +258,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
> ret_addr = (void __iomem *) (vaddr + offset);
> mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
>
>+ if (insert_identity_mapping(phys_addr, vaddr, size))
>+ printk(KERN_WARNING "ioremap: unable to map 0x%llx in identity pagetable\n",
>+ (unsigned long long)phys_addr);

Isn't that going to trigger quite frequently on 32-bit kernels?

Jan

2012-10-03 14:03:56

by Matt Fleming

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

On Wed, 2012-10-03 at 14:31 +0100, Jan Beulich wrote:
> >>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
> >+static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
> >+ unsigned long size)
> >+{
> >+ unsigned long end = vaddr + size;
> >+ unsigned long next;
> >+ pgd_t *vpgd, *ppgd;
> >+
> >+#ifdef CONFIG_X86_32
> >+ ppgd = initial_page_table + pgd_index(paddr);
> >+
> >+ if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
> >+ return 1;
> >+#else
> >+ ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);
>
> Missing equivalent code (to the 32-bit one above) here - after all, you're trying
> to potentially insert a 52-bit physical address into 48-bit virtual space.

Ah, true.

> >+#endif
> >+
> >+ vpgd = pgd_offset_k(vaddr);
> >+ do {
> >+ next = pgd_addr_end(vaddr, end);
> >+
> >+ if (!pgd_present(*ppgd)) {
> >+ pud_t *ppud = (pud_t *)get_zeroed_page(GFP_KERNEL);
> >+ if (!ppud)
> >+ return 1;
> >+
> >+ set_pgd(ppgd, __pgd(_KERNPG_TABLE | __pa(ppud)));
> >+ }
> >+
> >+ if (ident_pud_range(paddr, vaddr, ppgd, vpgd, next))
> >+ return 1;
> >+ } while (ppgd++, vpgd++, vaddr = next, vaddr != end);
> >+
> >+ return 0;
> >+}
> >+
> >/*
> >* Remap an arbitrary physical address space into the kernel virtual
> >* address space. Needed when the kernel wants to access high addresses
> >@@ -163,6 +258,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
> > ret_addr = (void __iomem *) (vaddr + offset);
> > mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
> >
> >+ if (insert_identity_mapping(phys_addr, vaddr, size))
> >+ printk(KERN_WARNING "ioremap: unable to map 0x%llx in identity pagetable\n",
> >+ (unsigned long long)phys_addr);
>
> Isn't that going to trigger quite frequently on 32-bit kernels?

Hmmm... yeah, probably, though it didn't trigger during my testing. If it is
likely to trigger a lot then we might be best only inserting the
identity mmio mapping for 64-bit, and addressing the 32-bit case if we
ever actually need the identity pagetable.

2012-10-04 06:32:12

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> On 03.10.12 at 16:03, Matt Fleming <[email protected]> wrote:
> On Wed, 2012-10-03 at 14:31 +0100, Jan Beulich wrote:
>> >>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
>> >@@ -163,6 +258,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
>> > ret_addr = (void __iomem *) (vaddr + offset);
>> > mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
>> >
>> >+ if (insert_identity_mapping(phys_addr, vaddr, size))
>> >+ printk(KERN_WARNING "ioremap: unable to map 0x%llx in identity pagetable\n",
>> >+ (unsigned long long)phys_addr);
>>
>> Isn't that going to trigger quite frequently on 32-bit kernels?
>
> Hmmm... yeah, probably, though it didn't trigger during my testing. If it is

That's suspicious, isn't it? In general, on any machine with more
than 3Gb of memory below the 4Gb boundary this ought to
trigger for _all_ mappings of MMIO space, and that's already only
considering the default of VMSPLIT_3G.

> likely to trigger a lot then we might be best only inserting the
> identity mmio mapping for 64-bit, and addressing the 32-bit case if we
> ever actually need the identity pagetable.

I think that would be the best choice for the moment.

Btw., once this set of yours is in - will I need to resubmit the
time handling patch that actually triggered this work, or will
you just reinstate it without further action on my part?

Jan
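
To make the arithmetic above concrete (illustration only; the BAR
address is hypothetical): with the default VMSPLIT_3G, PAGE_OFFSET is
0xc0000000, so the 32-bit path in insert_identity_mapping() rejects
any MMIO region at or above 3GiB physical, which is where most PCI
BARs sit on such machines:

	resource_size_t paddr = 0xfebf0000;	/* hypothetical PCI BAR */
	unsigned long size = 0x1000;

	if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
		return 1;	/* 0xfebf0000 >= 0xc0000000: always warns */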

2012-10-04 09:18:55

by Matt Fleming

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

On Thu, 2012-10-04 at 07:32 +0100, Jan Beulich wrote:
> >>> On 03.10.12 at 16:03, Matt Fleming <[email protected]> wrote:
> > On Wed, 2012-10-03 at 14:31 +0100, Jan Beulich wrote:
> >> >>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
> >> >@@ -163,6 +258,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
> >> > ret_addr = (void __iomem *) (vaddr + offset);
> >> > mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
> >> >
> >> >+ if (insert_identity_mapping(phys_addr, vaddr, size))
> >> >+ printk(KERN_WARNING "ioremap: unable to map 0x%llx in identity pagetable\n",
> >> >+ (unsigned long long)phys_addr);
> >>
> >> Isn't that going to trigger quite frequently on 32-bit kernels?
> >
> > Hmmm... yeah, probably, though it didn't during my testing. If it is
>
> That's suspicious, isn't it? In general, on any machine with more
> than 3Gb of memory below the 4Gb boundary this ought to
> trigger for _all_ mappings of MMIO space, and that's already only
> considering the default of VMSPLIT_3G.

I don't know about it being suspicious - I don't have any x86 machines
here with gigabytes of memory. But your point is still valid.

> > likely to trigger a lot then we might be best only inserting the
> > identity mmio mapping for 64-bit, and addressing the 32-bit case if we
> > ever actually need the identity pagetable.
>
> I think that would be the best choice for the moment.

OK, cool.

> Btw., once this set of yours is in - will I need to resubmit the
> time handling patch that actually triggered this work, or will
> you just reinstate it without further action on my part?

It's up to you. If you don't want to make any changes to your original
patch then I'll just re-apply it on top of this series, updating the
commit log to note why it got reverted and why it's now OK to re-apply.


2012-10-04 10:01:26

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> On 04.10.12 at 11:18, Matt Fleming <[email protected]> wrote:
> On Thu, 2012-10-04 at 07:32 +0100, Jan Beulich wrote:
>> Btw., once this set of yours is in - will I need to resubmit the
>> time handling patch that actually triggered this work, or will
>> you just reinstate it without further action on my part?
>
> It's up to you. If you don't want to make any changes to your original
> patch then I'll just re-apply it on top of this series, updating the
> commit log to note why it got reverted and why it's now OK to re-apply.

Afaict no changes should be necessary (so long as the patch
still applies).

Jan

2012-10-04 21:08:51

by H. Peter Anvin

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

On 10/03/2012 06:31 AM, Jan Beulich wrote:
>>>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
>> +static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
>> + unsigned long size)
>> +{
>> + unsigned long end = vaddr + size;
>> + unsigned long next;
>> + pgd_t *vpgd, *ppgd;
>> +
>> +#ifdef CONFIG_X86_32
>> + ppgd = initial_page_table + pgd_index(paddr);
>> +
>> + if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
>> + return 1;
>> +#else
>> + ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);
>
> Missing equivalent code (to the 32-bit one above) here - after all, you're trying
> to potentially insert a 52-bit physical address into 48-bit virtual space.
>

We should have the check, but at least for Linux support we require
P <= V-2.

-hpa


--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

2012-10-05 06:40:01

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> On 04.10.12 at 23:08, "H. Peter Anvin" <[email protected]> wrote:
> On 10/03/2012 06:31 AM, Jan Beulich wrote:
>>>>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
>>> +static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
>>> + unsigned long size)
>>> +{
>>> + unsigned long end = vaddr + size;
>>> + unsigned long next;
>>> + pgd_t *vpgd, *ppgd;
>>> +
>>> +#ifdef CONFIG_X86_32
>>> + ppgd = initial_page_table + pgd_index(paddr);
>>> +
>>> + if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
>>> + return 1;
>>> +#else
>>> + ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);
>>
>> Missing equivalent code (to the 32-bit one above) here - after all, you're trying
>> to potentially insert a 52-bit physical address into 48-bit virtual space.
>>
>
> We should have the check, but at least for Linux support we require
> P <= V-2.

Not really imo - P <= V - 1 should be sufficient here, as all that is
necessary is that the result represents a 1:1 mapping. Specifically,
there's no constraint to the virtual space limitation of the direct
mapping of RAM.

Jan

2012-10-05 06:48:07

by Matt Fleming

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

On Fri, 2012-10-05 at 07:39 +0100, Jan Beulich wrote:
> >>> On 04.10.12 at 23:08, "H. Peter Anvin" <[email protected]> wrote:
> > On 10/03/2012 06:31 AM, Jan Beulich wrote:
> >>>>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
> >>> +static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
> >>> + unsigned long size)
> >>> +{
> >>> + unsigned long end = vaddr + size;
> >>> + unsigned long next;
> >>> + pgd_t *vpgd, *ppgd;
> >>> +
> >>> +#ifdef CONFIG_X86_32
> >>> + ppgd = initial_page_table + pgd_index(paddr);
> >>> +
> >>> + if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
> >>> + return 1;
> >>> +#else
> >>> + ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);
> >>
> >> Missing equivalent code (to the 32-bit one above) here - after all, you're trying
> >> to potentially insert a 52-bit physical address into 48-bit virtual space.
> >>
> >
> > We should have the check, but at least for Linux support we require
> > P <= V-2.
>
> Not really imo - P <= V - 1 should be sufficient here, as all that is
> necessary is that the result represents a 1:1 mapping. Specifically,
> there's no constraint to the virtual space limitation of the direct
> mapping of RAM.

Just to be clear, I was going to add this check,

/* Don't map over the guard hole. */
if (paddr >= 0x7fffffffffff || paddr + size > 0x7fffffffffff)
return 1;

Since I'm guessing mapping over the guard hole would be bad.

--
Matt Fleming, Intel Open Source Technology Center

2012-10-05 08:19:48

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> On 05.10.12 at 08:48, Matt Fleming <[email protected]> wrote:
> On Fri, 2012-10-05 at 07:39 +0100, Jan Beulich wrote:
>> >>> On 04.10.12 at 23:08, "H. Peter Anvin" <[email protected]> wrote:
>> > On 10/03/2012 06:31 AM, Jan Beulich wrote:
>> >>>>> Matt Fleming <[email protected]> 10/03/12 2:59 PM >>>
>> >>> +static int insert_identity_mapping(resource_size_t paddr, unsigned long vaddr,
>> >>> + unsigned long size)
>> >>> +{
>> >>> + unsigned long end = vaddr + size;
>> >>> + unsigned long next;
>> >>> + pgd_t *vpgd, *ppgd;
>> >>> +
>> >>> +#ifdef CONFIG_X86_32
>> >>> + ppgd = initial_page_table + pgd_index(paddr);
>> >>> +
>> >>> + if (paddr >= PAGE_OFFSET || paddr + size > PAGE_OFFSET)
>> >>> + return 1;
>> >>> +#else
>> >>> + ppgd = __va(real_mode_header->trampoline_pgd) + pgd_index(paddr);
>> >>
>> >> Missing equivalent code (to the 32-bit one above) here - after all, you're trying
>> >> to potentially insert a 52-bit physical address into 48-bit virtual space.
>> >>
>> >
>> > We should have the check, but at least for Linux support we require
>> > P <= V-2.
>>
>> Not really imo - P <= V - 1 should be sufficient here, as all that is
>> necessary is that the result represents a 1:1 mapping. Specifically,
>> there's no constraint to the virtual space limitation of the direct
>> mapping of RAM.
>
> Just to be clear, I was going to add this check,
>
> /* Don't map over the guard hole. */
> if (paddr >= 0x7fffffffffff || paddr + size > 0x7fffffffffff)
> return 1;

0x800000000000 would be the right number in both cases.

> Since I'm guessing mapping over the guard hole would be bad.

Really, you just can't map anything there (you'd most likely end
up mapping something at 0xffff800000000000, and that would
indeed be bad).

Jan
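
Putting Matt's proposal and Jan's correction together, the 64-bit
check would look like this (sketch only; 0x800000000000 is 1UL << 47,
the first address above the lower half of the 48-bit virtual address
space):

	/* Don't create identity mappings at or above the lower-half
	 * boundary, i.e. into the guard hole / sign-extension area. */
	if (paddr >= 0x800000000000UL || paddr + size > 0x800000000000UL)
		return 1;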

2012-10-05 16:28:16

by H. Peter Anvin

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

On 10/04/2012 11:39 PM, Jan Beulich wrote:
>>
>> We should have the check, but at least for Linux support we require
>> P <= V-2.
>
> Not really imo - P <= V - 1 should be sufficient here, as all that is
> necessary is that the result represents a 1:1 mapping. Specifically,
> there's no constraint to the virtual space limitation of the direct
> mapping of RAM.
>

The P <= V-2 limitation doesn't come from this, it comes from the fact
that we need to have the regular kernel 1:1 map and still have space for
the kernel, vmalloc, ioremap and so on in the kernel part of the address
space; in theory it *could* be some fraction between 1 and 2, but since
hardware doesn't do fractional bits very well the above is what we have
been telling the hardware folks. :)

-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
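
Worked out in whole bits (illustration): with V = 48 the kernel half
of the virtual address space is 2^47 bytes. The direct map of up to
2^P bytes of RAM has to fit in that half together with the kernel
image, vmalloc and ioremap areas, so the largest whole-bit physical
width that still leaves room is P = 46, i.e. P <= V - 2.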

2012-10-07 08:16:55

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> "H. Peter Anvin" <[email protected]> 10/05/12 6:28 PM >>>
>On 10/04/2012 11:39 PM, Jan Beulich wrote:
>>>
>>> We should have the check, but at least for Linux support we require
>>> P <= V-2.
>>
>> Not really imo - P <= V - 1 should be sufficient here, as all that is
>> necessary is that the result represents a 1:1 mapping. Specifically,
>> there's no constraint to the virtual space limitation of the direct
>> mapping of RAM.
>>
>
>The P <= V-2 limitation doesn't come from this, it comes from the fact
>that we need to have the regular kernel 1:1 map and still have space for
>the kernel, vmalloc, ioremap and so on in the kernel part of the address
>space; in theory it *could* be some fraction between 1 and 2, but since
>hardware doesn't do fractional bits very well the above is what we have
>been telling the hardware folks. :)

I understand all that, but here we're talking about a true 1:1 mapping
(i.e. in the lower half of the virtual address space), as opposed to the
direct mapping (in the upper half). We can obviously use all of the lower
half for this purpose.

Jan

2012-10-07 10:27:50

by H. Peter Anvin

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

Actually, no, because we may need to double-map in the low address space per discussion at LPC.

Jan Beulich <[email protected]> wrote:

>>>> "H. Peter Anvin" <[email protected]> 10/05/12 6:28 PM >>>
>>On 10/04/2012 11:39 PM, Jan Beulich wrote:
>>>>
>>>> We should have the check, but at least for Linux support we require
>>>> P <= V-2.
>>>
>>> Not really imo - P <= V - 1 should be sufficient here, as all that
>is
>>> necessary is that the result represents a 1:1 mapping. Specifically,
>>> there's no constraint to the virtual space limitation of the direct
>>> mapping of RAM.
>>>
>>
>>The P <= V-2 limitation doesn't come from this, it comes from the fact
>
>>that we need to have the regular kernel 1:1 map and still have space
>for
>>the kernel, vmalloc, ioremap and so on in the kernel part of the
>address
>>space; in theory it *could* be some fraction between 1 and 2, but
>since
>>hardware doesn't do fractional bits very well the above is what we
>have
>>been telling the hardware folks. :)
>
>I understand all that, but here we're talking about a true 1:1 mapping
>(i.e. in the lower half of the virtual address space), as opposed to
>the
>direct mapping (in the upper half). We can obviously use all of the
>lower
>half for this purpose.
>
>Jan

--
Sent from my mobile phone. Please excuse brevity and lack of formatting.

2012-10-08 06:43:46

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> "H. Peter Anvin" <[email protected]> 10/07/12 12:27 PM >>>
>Actually, no, because we may need to double-map in the low address space per discussion at LPC.

Now this I don't follow (due to lack of context), but it would of course be
relevant for determining the need for eventual further adjustments to the
patch we're discussing here.

Jan

>Jan Beulich <[email protected]> wrote:
>
>>>On 10/04/2012 11:39 PM, Jan Beulich wrote:
>>I understand all that, but here we're talking about a true 1:1 mapping
>>(i.e. in the lower half of the virtual address space), as opposed to
>>the
>>direct mapping (in the upper half). We can obviously use all of the
>>lower
>>half for this purpose.

2012-10-10 10:45:30

by Matt Fleming

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

On Thu, 2012-10-04 at 11:01 +0100, Jan Beulich wrote:
> >>> On 04.10.12 at 11:18, Matt Fleming <[email protected]> wrote:
> > On Thu, 2012-10-04 at 07:32 +0100, Jan Beulich wrote:
> >> Btw., once this set of yours is in - will I need to resubmit the
> >> time handling patch that actually triggered this work, or will
> >> you just reinstate it without further action on my part?
> >
> > It's up to you. If you don't want to make any changes to your original
> > patch then I'll just re-apply it on top of this series, updating the
> > commit log to note why it got reverted and why it's now OK to re-apply.
>
> Afaict no changes should be necessary (so long as the patch
> still applies).

How does this look? Matthew, do you still want your Acked-by to be
applied?

---

From 194b1acd3f39e479c484a4aba50e034a2b58e0d6 Mon Sep 17 00:00:00 2001
From: Jan Beulich <[email protected]>
Date: Fri, 25 May 2012 16:20:31 +0100
Subject: [PATCH] x86-64/efi: Use EFI to deal with platform wall clock (again)

Unlike ix86, x86-64 on EFI so far didn't set the
{g,s}et_wallclock accessors to the EFI routines, and thus
incorrectly used raw RTC accesses instead.

Simply removing the #ifdef around the respective code isn't
enough, however: While so far early get-time calls were done in
physical mode, this doesn't work properly for x86-64, as virtual
addresses would still need to be set up for all runtime regions
(which wasn't the case on the system I have access to), so
instead the patch moves the call to efi_enter_virtual_mode()
ahead (which in turn allows to drop all code related to calling
efi-get-time in physical mode).

Additionally, the earlier calling of efi_set_executable()
requires the CPA code to cope, i.e. calling cpa_flush_array()
must be avoided during early boot, as the first thing that
function does is a BUG_ON(irqs_disabled()).

Also make the two EFI functions in question here static -
they're not being referenced elsewhere.

History:

This commit was originally merged as bacef661acdb ("x86-64/efi:
Use EFI to deal with platform wall clock") but it resulted in some
ASUS machines no longer booting due to a firmware bug, and so was
reverted in f026cfa82f62. A pre-emptive fix for the buggy ASUS
firmware was merged in 03a1c254975e ("x86, efi: 1:1 pagetable
mapping for virtual EFI calls") so now this patch can be
reapplied.

Signed-off-by: Jan Beulich <[email protected]>
Tested-by: Matt Fleming <[email protected]>
Acked-by: Matthew Garrett <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Signed-off-by: Matt Fleming <[email protected]> [added commit history]
---
arch/x86/mm/pageattr.c | 10 ++++++----
arch/x86/platform/efi/efi.c | 30 ++++--------------------------
include/linux/efi.h | 2 --
init/main.c | 8 ++++----
4 files changed, 14 insertions(+), 36 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d..931930a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -919,11 +919,13 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,

/*
* On success we use clflush, when the CPU supports it to
- * avoid the wbindv. If the CPU does not support it and in the
- * error case we fall back to cpa_flush_all (which uses
- * wbindv):
+ * avoid the wbindv. If the CPU does not support it, in the
+ * error case, and during early boot (for EFI) we fall back
+ * to cpa_flush_all (which uses wbinvd):
*/
- if (!ret && cpu_has_clflush) {
+ if (early_boot_irqs_disabled)
+ __cpa_flush_all((void *)(long)cache);
+ else if (!ret && cpu_has_clflush) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index aded2a9..7578344 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -235,22 +235,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
return status;
}

-static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
- efi_time_cap_t *tc)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- efi_call_phys_prelog();
- status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
- virt_to_phys(tc));
- efi_call_phys_epilog();
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-int efi_set_rtc_mmss(unsigned long nowtime)
+static int efi_set_rtc_mmss(unsigned long nowtime)
{
int real_seconds, real_minutes;
efi_status_t status;
@@ -279,7 +264,7 @@ int efi_set_rtc_mmss(unsigned long nowtime)
return 0;
}

-unsigned long efi_get_time(void)
+static unsigned long efi_get_time(void)
{
efi_status_t status;
efi_time_t eft;
@@ -635,18 +620,13 @@ static int __init efi_runtime_init(void)
}
/*
* We will only need *early* access to the following
- * two EFI runtime services before set_virtual_address_map
+ * EFI runtime service before set_virtual_address_map
* is invoked.
*/
- efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
runtime->set_virtual_address_map;
- /*
- * Make efi_get_time can be called before entering
- * virtual mode.
- */
- efi.get_time = phys_efi_get_time;
+
early_iounmap(runtime, sizeof(efi_runtime_services_t));

return 0;
@@ -734,12 +714,10 @@ void __init efi_init(void)
efi_enabled = 0;
return;
}
-#ifdef CONFIG_X86_32
if (efi_native) {
x86_platform.get_wallclock = efi_get_time;
x86_platform.set_wallclock = efi_set_rtc_mmss;
}
-#endif

#if EFI_DEBUG
print_efi_memmap();
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 337aefb..5e2308d 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -516,8 +516,6 @@ extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
extern int __init efi_uart_console_only (void);
extern void efi_initialize_iomem_resources(struct resource *code_resource,
struct resource *data_resource, struct resource *bss_resource);
-extern unsigned long efi_get_time(void);
-extern int efi_set_rtc_mmss(unsigned long nowtime);
extern void efi_reserve_boot_services(void);
extern struct efi_memory_map memmap;

diff --git a/init/main.c b/init/main.c
index db34c0e..5297109 100644
--- a/init/main.c
+++ b/init/main.c
@@ -461,6 +461,10 @@ static void __init mm_init(void)
percpu_init_late();
pgtable_cache_init();
vmalloc_init();
+#ifdef CONFIG_X86
+ if (efi_enabled)
+ efi_enter_virtual_mode();
+#endif
}

asmlinkage void __init start_kernel(void)
@@ -602,10 +606,6 @@ asmlinkage void __init start_kernel(void)
calibrate_delay();
pidmap_init();
anon_vma_init();
-#ifdef CONFIG_X86
- if (efi_enabled)
- efi_enter_virtual_mode();
-#endif
thread_info_cache_init();
cred_init();
fork_init(totalram_pages);
--
1.7.11.4




2012-10-15 07:22:48

by Jan Beulich

Subject: Re: [PATCH 1/3] x86, mm: Include the entire kernel memory map in trampoline_pgd

>>> On 10.10.12 at 12:45, Matt Fleming <[email protected]> wrote:
> On Thu, 2012-10-04 at 11:01 +0100, Jan Beulich wrote:
>> >>> On 04.10.12 at 11:18, Matt Fleming <[email protected]> wrote:
>> > On Thu, 2012-10-04 at 07:32 +0100, Jan Beulich wrote:
>> >> Btw., once this set of yours is in - will I need to resubmit the
>> >> time handling patch that actually triggered this work, or will
>> >> you just reinstate it without further action on my part?
>> >
>> > It's up to you. If you don't want to make any changes to your original
>> > patch then I'll just re-apply it on top of this series, updating the
>> > commit log to note why it got reverted and why it's now OK to re-apply.
>>
>> Afaict no changes should be necessary (so long as the patch
>> still applies).
>
> How does this look?

Perfectly fine - thanks!

Jan
