2008-01-15 05:50:26

by Huang, Ying

[permalink] [raw]
Subject: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap

This patch makes bt_ioremap can be used before paging_init via
providing an early implementation of set_fixmap that can be used
before paging_init. This makes boot_ioremap can be replaced by
bt_ioremap.

Signed-off-by: Huang Ying <[email protected]>

---
arch/x86/kernel/setup_32.c | 1
arch/x86/mm/init_32.c | 2 +
arch/x86/mm/ioremap_32.c | 87 +++++++++++++++++++++++++++++++++++++++++++--
include/asm-x86/io_32.h | 3 +
4 files changed, 91 insertions(+), 2 deletions(-)

--- a/arch/x86/mm/ioremap_32.c
+++ b/arch/x86/mm/ioremap_32.c
@@ -208,6 +208,89 @@ void iounmap(volatile void __iomem *addr
}
EXPORT_SYMBOL(iounmap);

+static __initdata int after_paging_init;
+static __initdata unsigned long bm_pte[1024]
+ __attribute__((aligned(PAGE_SIZE)));
+
+static inline unsigned long * __init bt_ioremap_pgd(unsigned long addr)
+{
+ return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+}
+
+static inline unsigned long * __init bt_ioremap_pte(unsigned long addr)
+{
+ return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+}
+
+void __init bt_ioremap_init(void)
+{
+ unsigned long *pgd;
+
+ pgd = bt_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+ *pgd = __pa(bm_pte) | _PAGE_TABLE;
+ memset(bm_pte, 0, sizeof(bm_pte));
+ BUG_ON(pgd != bt_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+}
+
+void __init bt_ioremap_clear(void)
+{
+ unsigned long *pgd;
+
+ pgd = bt_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+ *pgd = 0;
+ __flush_tlb_all();
+}
+
+void __init bt_ioremap_reset(void)
+{
+ enum fixed_addresses idx;
+ unsigned long *pte, phys, addr;
+
+ after_paging_init = 1;
+ for (idx = FIX_BTMAP_BEGIN; idx <= FIX_BTMAP_END; idx--) {
+ addr = fix_to_virt(idx);
+ pte = bt_ioremap_pte(addr);
+ if (!*pte & _PAGE_PRESENT) {
+ phys = *pte & PAGE_MASK;
+ set_fixmap(idx, phys);
+ }
+ }
+}
+
+static void __init __bt_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags)
+{
+ unsigned long *pte, addr = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ pte = bt_ioremap_pte(addr);
+ if (pgprot_val(flags))
+ *pte = (phys & PAGE_MASK) | pgprot_val(flags);
+ else
+ *pte = 0;
+ __flush_tlb_one(addr);
+}
+
+static inline void __init bt_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys)
+{
+ if (after_paging_init)
+ set_fixmap(idx, phys);
+ else
+ __bt_set_fixmap(idx, phys, PAGE_KERNEL);
+}
+
+static inline void __init bt_clear_fixmap(enum fixed_addresses idx)
+{
+ if (after_paging_init)
+ clear_fixmap(idx);
+ else
+ __bt_set_fixmap(idx, 0, __pgprot(0));
+}
+
void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
{
unsigned long offset, last_addr;
@@ -244,7 +327,7 @@ void __init *bt_ioremap(unsigned long ph
*/
idx = FIX_BTMAP_BEGIN;
while (nrpages > 0) {
- set_fixmap(idx, phys_addr);
+ bt_set_fixmap(idx, phys_addr);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
@@ -267,7 +350,7 @@ void __init bt_iounmap(void *addr, unsig

idx = FIX_BTMAP_BEGIN;
while (nrpages > 0) {
- clear_fixmap(idx);
+ bt_clear_fixmap(idx);
--idx;
--nrpages;
}
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -423,9 +423,11 @@ static void __init pagetable_init (void)
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
+ bt_ioremap_clear();
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
+ bt_ioremap_reset();

permanent_kmaps_init(pgd_base);

--- a/include/asm-x86/io_32.h
+++ b/include/asm-x86/io_32.h
@@ -130,6 +130,9 @@ extern void iounmap(volatile void __iome
* mappings, before the real ioremap() is functional.
* A boot-time mapping is currently limited to at most 16 pages.
*/
+extern void bt_ioremap_init(void);
+extern void bt_ioremap_clear(void);
+extern void bt_ioremap_reset(void);
extern void *bt_ioremap(unsigned long offset, unsigned long size);
extern void bt_iounmap(void *addr, unsigned long size);
extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -624,6 +624,7 @@ void __init setup_arch(char **cmdline_p)
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
pre_setup_arch_hook();
early_cpu_init();
+ bt_ioremap_init();

efi_check_bios_type();
#ifdef CONFIG_EFI


2008-01-18 08:50:50

by Ian Campbell

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap


On Tue, 2008-01-15 at 13:45 +0800, Huang, Ying wrote:
> +void __init bt_ioremap_init(void)
> +{
> [...]
> + *pgd = __pa(bm_pte) | _PAGE_TABLE;
> +}
> [...]
> +static void __init __bt_set_fixmap(enum fixed_addresses idx,
> + unsigned long phys, pgprot_t flags)
> +{
> [...]
> + if (pgprot_val(flags))
> + *pte = (phys & PAGE_MASK) | pgprot_val(flags);
> + else
> + *pte = 0;
[...]

Shouldn't these, and the rest of the file, be using the PTE accessor
macros set_pte,clear_pte etc? The boot_ioremap it replaces seems to have
done. Otherwise these patches don't appear to be paravirt_ops clean. I
haven't had a chance to investigate fully so this might be a Xen bug or
unrelated to this patch but running current x86.git#mm as a Xen guest
ends up with it being shot in the head by the hypervisor with:

(XEN) Unhandled page fault in domain 16 on VCPU 0 (ec=0000)
(XEN) Pagetable walk from 00000000f54fe40e:
(XEN) L4[0x000] = 000000010c187027 00000000000003b2
(XEN) L3[0x003] = 000000010c186027 00000000000003b3
(XEN) L2[0x1aa] = 0000000000000000 ffffffffffffffff
(XEN) domain_crash_sync called from entry.S
(XEN) Domain 16 (vcpu#0) crashed on cpu#1:
(XEN) ----[ Xen-3.2.0-rc5 x86_64 debug=y Not tainted ]----
(XEN) CPU: 1
(XEN) RIP: e019:[<00000000c02601b1>]
(XEN) RFLAGS: 0000000000000282 CONTEXT: guest
(XEN) rax: 00000000f54fe40e rbx: 0000000000005b00 rcx: 0000000000000000
(XEN) rdx: 0000000000000000 rsi: 0000000000000000 rdi: 00000000c0255b00
(XEN) rbp: 0000000000000000 rsp: 00000000c0257f68 r8: 0000000000000000
(XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 0000000000000000
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr4: 00000000000006f0
(XEN) cr3: 000000011b240000 cr2: 00000000f54fe40e
(XEN) ds: e021 es: e021 fs: e021 gs: e021 ss: e021 cs: e019
(XEN) Guest stack trace from esp=c0257f68:
(XEN) 00000000 c02601b1 0001e019 00010082 c0223566 c0257fc8 00000000 c0260675
(XEN) c0223566 000002d7 c0257fec c02230fe 00000000 c03af000 c0257fec c02230fe
(XEN) 00000000 c025ba84 c0203000 c026317f c0257fe4 c0257fe8 c0257fec bfebcbf5
(XEN) c02766a0 c03af000 c0257fec c02230fe 00000000 c025f151 9fabc9f5 0000649d
(XEN) 01020800 00000f44 f5800000 00000000 c03af000 00000000

This corresponds to the dereference of *bp in get_bios_ebda():
static inline unsigned long get_bios_ebda(void)
{
unsigned short *bp;
unsigned long address;

bp = early_ioremap(0x40EUL, 2);
if (!bp)
return 0;

address = *bp;


Ian.

--
Ian Campbell

Most people can't understand how others can blow their noses differently
than they do.
-- Turgenev

2008-01-18 12:50:38

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap


* Ian Campbell <[email protected]> wrote:

> Shouldn't these, and the rest of the file, be using the PTE accessor
> macros set_pte,clear_pte etc? The boot_ioremap it replaces seems to
> have done. Otherwise these patches don't appear to be paravirt_ops
> clean. I haven't had a chance to investigate fully so this might be a
> Xen bug or unrelated to this patch but running current x86.git#mm as a
> Xen guest ends up with it being shot in the head by the hypervisor
> with:

willing to apply patches.

Ingo

2008-01-18 14:54:46

by huang ying

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap

On Jan 18, 2008 4:48 PM, Ian Campbell <[email protected]> wrote:
> On Tue, 2008-01-15 at 13:45 +0800, Huang, Ying wrote:
> > +void __init bt_ioremap_init(void)
> > +{
> > [...]
> > + *pgd = __pa(bm_pte) | _PAGE_TABLE;
> > +}
> > [...]
> > +static void __init __bt_set_fixmap(enum fixed_addresses idx,
> > + unsigned long phys, pgprot_t flags)
> > +{
> > [...]
> > + if (pgprot_val(flags))
> > + *pte = (phys & PAGE_MASK) | pgprot_val(flags);
> > + else
> > + *pte = 0;
> [...]
>
> Shouldn't these, and the rest of the file, be using the PTE accessor
> macros set_pte,clear_pte etc? The boot_ioremap it replaces seems to have
> done. Otherwise these patches don't appear to be paravirt_ops clean. I

If CONFIG_X86_PAE is defined, the set_pte, clear_pte etc will operate
3-level page tables, while on i386, the early page table is always
2-level, so set_pte, clear_pte etc functions can not be used here. The
boot_ioremap use a trick to deal with this problem. The CONFIG_X86_PAE
is undefined in arch/x86/mm/boot_ioremap_32.c unconditionally, so the
2-level page table handling function is always used.

Is the method used by boot_ioremap better for Xen?

> haven't had a chance to investigate fully so this might be a Xen bug or
> unrelated to this patch but running current x86.git#mm as a Xen guest
> ends up with it being shot in the head by the hypervisor with:
>
> (XEN) Unhandled page fault in domain 16 on VCPU 0 (ec=0000)
> (XEN) Pagetable walk from 00000000f54fe40e:
> (XEN) L4[0x000] = 000000010c187027 00000000000003b2
> (XEN) L3[0x003] = 000000010c186027 00000000000003b3
> (XEN) L2[0x1aa] = 0000000000000000 ffffffffffffffff
> (XEN) domain_crash_sync called from entry.S
> (XEN) Domain 16 (vcpu#0) crashed on cpu#1:
> (XEN) ----[ Xen-3.2.0-rc5 x86_64 debug=y Not tainted ]----
> (XEN) CPU: 1
> (XEN) RIP: e019:[<00000000c02601b1>]
> (XEN) RFLAGS: 0000000000000282 CONTEXT: guest
> (XEN) rax: 00000000f54fe40e rbx: 0000000000005b00 rcx: 0000000000000000
> (XEN) rdx: 0000000000000000 rsi: 0000000000000000 rdi: 00000000c0255b00
> (XEN) rbp: 0000000000000000 rsp: 00000000c0257f68 r8: 0000000000000000
> (XEN) r9: 0000000000000000 r10: 0000000000000000 r11: 0000000000000000
> (XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
> (XEN) r15: 0000000000000000 cr0: 000000008005003b cr4: 00000000000006f0
> (XEN) cr3: 000000011b240000 cr2: 00000000f54fe40e
> (XEN) ds: e021 es: e021 fs: e021 gs: e021 ss: e021 cs: e019
> (XEN) Guest stack trace from esp=c0257f68:
> (XEN) 00000000 c02601b1 0001e019 00010082 c0223566 c0257fc8 00000000 c0260675
> (XEN) c0223566 000002d7 c0257fec c02230fe 00000000 c03af000 c0257fec c02230fe
> (XEN) 00000000 c025ba84 c0203000 c026317f c0257fe4 c0257fe8 c0257fec bfebcbf5
> (XEN) c02766a0 c03af000 c0257fec c02230fe 00000000 c025f151 9fabc9f5 0000649d
> (XEN) 01020800 00000f44 f5800000 00000000 c03af000 00000000
>
> This corresponds to the dereference of *bp in get_bios_ebda():
> static inline unsigned long get_bios_ebda(void)
> {
> unsigned short *bp;
> unsigned long address;
>
> bp = early_ioremap(0x40EUL, 2);
> if (!bp)
> return 0;
>
> address = *bp;

This crash has nothing to do with this patch. Because this patch is
for i386 only, x86_64 has its own early_ioremap implementation.

Best Regards,
Huang Ying

2008-01-18 15:03:54

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap

huang ying wrote:
>
> If CONFIG_X86_PAE is defined, the set_pte, clear_pte etc will operate
> 3-level page tables, while on i386, the early page table is always
> 2-level, so set_pte, clear_pte etc functions can not be used here. The
> boot_ioremap use a trick to deal with this problem. The CONFIG_X86_PAE
> is undefined in arch/x86/mm/boot_ioremap_32.c unconditionally, so the
> 2-level page table handling function is always used.
>
> Is the method used by boot_ioremap better for Xen?
>

Eric Biederman had a patchset that makes a PAE kernel use PAE page
tables from the start. That is really The Right Thing[TM].

-hpa

2008-01-18 15:24:28

by Ian Campbell

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap


On Fri, 2008-01-18 at 22:54 +0800, huang ying wrote:
> On Jan 18, 2008 4:48 PM, Ian Campbell <[email protected]> wrote:
> > On Tue, 2008-01-15 at 13:45 +0800, Huang, Ying wrote:
> > > +void __init bt_ioremap_init(void)
> > > +{
> > > [...]
> > > + *pgd = __pa(bm_pte) | _PAGE_TABLE;
> > > +}
> > > [...]
> > > +static void __init __bt_set_fixmap(enum fixed_addresses idx,
> > > + unsigned long phys, pgprot_t flags)
> > > +{
> > > [...]
> > > + if (pgprot_val(flags))
> > > + *pte = (phys & PAGE_MASK) | pgprot_val(flags);
> > > + else
> > > + *pte = 0;
> > [...]
> >
> > Shouldn't these, and the rest of the file, be using the PTE accessor
> > macros set_pte,clear_pte etc? The boot_ioremap it replaces seems to have
> > done. Otherwise these patches don't appear to be paravirt_ops clean. I
>
> If CONFIG_X86_PAE is defined, the set_pte, clear_pte etc will operate
> 3-level page tables, while on i386, the early page table is always
> 2-level, so set_pte, clear_pte etc functions can not be used here. The
> boot_ioremap use a trick to deal with this problem. The CONFIG_X86_PAE
> is undefined in arch/x86/mm/boot_ioremap_32.c unconditionally, so the
> 2-level page table handling function is always used.

Ah, when booting a Xen guest you get 3 level page tables right from the
start.

I'm hacking on a patch right now but I hadn't appreciated/noticed that
this was a two level page table even on a PAE kernel which would explain
why it isn't exactly working as planned...

> Is the method used by boot_ioremap better for Xen?

Well, it worked ;-) I haven't looked closely enough at the two ways of
doing things to comment further but I'm sure the new way can be made to
work for Xen too somehow.

> [snip]

> This crash has nothing to do with this patch. Because this patch is
> for i386 only, x86_64 has its own early_ioremap implementation.

It's a 64 bit hypervisor but the guest in question is a 32 bit (PAE)
guest.

Ian
--
Ian Campbell
Current Noise: Mistress - Cheyne Stoking

You can't kiss a girl unexpectedly -- only sooner than she thought you would.

2008-01-18 15:55:06

by Ian Campbell

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap


On Fri, 2008-01-18 at 09:54 -0500, H. Peter Anvin wrote:
> huang ying wrote:
> >
> > If CONFIG_X86_PAE is defined, the set_pte, clear_pte etc will operate
> > 3-level page tables, while on i386, the early page table is always
> > 2-level, so set_pte, clear_pte etc functions can not be used here. The
> > boot_ioremap use a trick to deal with this problem. The CONFIG_X86_PAE
> > is undefined in arch/x86/mm/boot_ioremap_32.c unconditionally, so the
> > 2-level page table handling function is always used.
> >
> > Is the method used by boot_ioremap better for Xen?
> >
>
> Eric Biederman had a patchset that makes a PAE kernel use PAE page
> tables from the start. That is really The Right Thing[TM].

That's much saner than dup'ing up the early ioremap stuff to support
both PAE and non-PAE at runtime, which is about the only idea I've got
for fixing this right now...

I think I'll just back out the early_ioremap patches locally for now and
wait for Eric's patches which should cause the fix for this issue to
just fall out in the wash.

Ian.
--
Ian Campbell
Current Noise: Mistress - Mistress

Everything should be built top-down, except this time.

2008-01-18 16:23:47

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap


* Ian Campbell <[email protected]> wrote:

> > Eric Biederman had a patchset that makes a PAE kernel use PAE page
> > tables from the start. That is really The Right Thing[TM].
>
> That's much saner than dup'ing up the early ioremap stuff to support
> both PAE and non-PAE at runtime, which is about the only idea I've got
> for fixing this right now...
>
> I think I'll just back out the early_ioremap patches locally for now
> and wait for Eric's patches which should cause the fix for this issue
> to just fall out in the wash.

Eric's patchset is nowhere near being submitted - he _had_ a patchset.
So this needs to be fixed by the Xen-guest folks. (if the easiest/best
fix is to pick up Eric's patchset, then plese do it so)

Ingo

2008-01-18 17:30:31

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap

Ingo Molnar wrote:
> * Ian Campbell <[email protected]> wrote:
>
>
>>> Eric Biederman had a patchset that makes a PAE kernel use PAE page
>>> tables from the start. That is really The Right Thing[TM].
>>>
>> That's much saner than dup'ing up the early ioremap stuff to support
>> both PAE and non-PAE at runtime, which is about the only idea I've got
>> for fixing this right now...
>>
>> I think I'll just back out the early_ioremap patches locally for now
>> and wait for Eric's patches which should cause the fix for this issue
>> to just fall out in the wash.
>>
>
> Eric's patchset is nowhere near being submitted - he _had_ a patchset.
> So this needs to be fixed by the Xen-guest folks. (if the easiest/best
> fix is to pick up Eric's patchset, then plese do it so)
>

This is only relevent to dom0 isn't it? sct is looking at that at the
moment.

J

2008-01-18 17:46:18

by Ian Campbell

[permalink] [raw]
Subject: Re: [PATCH -mm 1/3] i386 boot: replace boot_ioremap with enhanced bt_ioremap - enhance bt_ioremap


On Fri, 2008-01-18 at 09:27 -0800, Jeremy Fitzhardinge wrote:
> Ingo Molnar wrote:
> > * Ian Campbell <[email protected]> wrote:
> >
> >
> >>> Eric Biederman had a patchset that makes a PAE kernel use PAE page
> >>> tables from the start. That is really The Right Thing[TM].
> >>>
> >> That's much saner than dup'ing up the early ioremap stuff to support
> >> both PAE and non-PAE at runtime, which is about the only idea I've got
> >> for fixing this right now...
> >>
> >> I think I'll just back out the early_ioremap patches locally for now
> >> and wait for Eric's patches which should cause the fix for this issue
> >> to just fall out in the wash.
> >>
> >
> > Eric's patchset is nowhere near being submitted - he _had_ a patchset.
> > So this needs to be fixed by the Xen-guest folks. (if the easiest/best
> > fix is to pick up Eric's patchset, then plese do it so)
> >
>
> This is only relevent to dom0 isn't it? sct is looking at that at the
> moment.

The early_ioremap stuff gets called on domU too, for example
get_bios_ebda(). Or was the intention that these should be gated at a
higher level?

Ian.
--
Ian Campbell

Pie are not square. Pie are round. Cornbread are square.