Hi all,
this is a small collection of critical xen bug fixes for 2.6.39-rc3:
the recent changes to the initial kernel pagetable allocation mechanism
(4b239f458c229de044d6905c2b0f9fe16ed9e01e in particular) caused a number
of issues on Xen.
This patch series fixes those issues and is required just to boot a
2.6.39 Linux kernel as a regular xen guest.
The list of patches with a diffstat follows:
Stefano Stabellini (4):
xen: mask_rw_pte mark RO all pagetable pages up to pgt_buf_top
x86,xen: introduce x86_init.mapping.pagetable_reserve
xen: more debugging in the e820 parsing
xen: do not create the extra e820 region at an addr lower than 4G
arch/x86/include/asm/pgtable_types.h | 1 +
arch/x86/include/asm/x86_init.h | 9 +++++++++
arch/x86/kernel/x86_init.c | 4 ++++
arch/x86/mm/init.c | 9 +++++++--
arch/x86/xen/mmu.c | 17 ++++++++++++++++-
arch/x86/xen/setup.c | 6 +++++-
6 files changed, 42 insertions(+), 4 deletions(-)
The first two commits make sure pagetable pages are marked RO while
other pages are marked RW.
The third commit adds a couple of useful debugging statements.
The fourth commit fixes a boot crash on xen when booting as initial
domain: the xen extra memory region shouldn't start below 4G, otherwise
e820_end_of_low_ram_pfn() could return an address above 4G. As a
consequence init_memory_mapping would end up mapping MMIO regions
without going through the fixmap.
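To make the fix concrete, here is a minimal user-space model of the clamp
applied by the fourth patch (clamp_extra_start is an illustrative name; the
actual change is a one-line max() in xen_memory_setup):

#include <stdint.h>
#include <stdio.h>

static uint64_t clamp_extra_start(uint64_t mem_end)
{
	/* never let the extra region start below 4G, so that
	 * e820_end_of_low_ram_pfn() cannot be pushed above 4G */
	return mem_end > (1ULL << 32) ? mem_end : (1ULL << 32);
}

int main(void)
{
	/* a host whose RAM ends below 4G still gets the region at 4G */
	printf("%llx\n", (unsigned long long)clamp_extra_start(0xcf600000ULL));
	/* a host whose RAM already ends above 4G keeps its own end */
	printf("%llx\n", (unsigned long long)clamp_extra_start(0x130000000ULL));
	return 0;
}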
A git branch with this series is available here:
git://xenbits.xen.org/people/sstabellini/linux-pvhvm.git 2.6.39-rc3-fixes
Comments are welcome.
- Stefano
From: Stefano Stabellini <[email protected]>
mask_rw_pte currently considers a pfn a pagetable page if it falls in
the range pgt_buf_start - pgt_buf_end, but that is incorrect
because pgt_buf_end is a moving target: pgt_buf_top is the real
boundary.
Signed-off-by: Stefano Stabellini <[email protected]>
---
arch/x86/xen/mmu.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c82df6c..6b833db 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1491,7 +1491,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
* it is RO.
*/
if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+ pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
pte = pte_wrprotect(pte);
--
1.7.2.3
From: Stefano Stabellini <[email protected]>
Introduce a new x86_init hook called pagetable_reserve that is used
during the initial memory mapping to reserve a range of memory
addresses for kernel pagetable usage.
On native it just calls memblock_x86_reserve_range while on xen it also
takes care of setting the spare memory previously allocated
for kernel pagetable pages from RO to RW, so that it can be used for
other purposes.
Signed-off-by: Stefano Stabellini <[email protected]>
Cc: Yinghai Lu <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/pgtable_types.h | 1 +
arch/x86/include/asm/x86_init.h | 9 +++++++++
arch/x86/kernel/x86_init.c | 4 ++++
arch/x86/mm/init.c | 9 +++++++--
arch/x86/xen/mmu.c | 15 +++++++++++++++
5 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723..d56187c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+extern void native_pagetable_reserve(u64 start, u64 end);
#ifdef CONFIG_X86_32
extern void native_pagetable_setup_start(pgd_t *base);
extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2..d66b3a2 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,14 @@ struct x86_init_oem {
};
/**
+ * struct x86_init_mapping - platform specific initial kernel pagetable setup
+ * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
+ */
+struct x86_init_mapping {
+ void (*pagetable_reserve)(u64 start, u64 end);
+};
+
+/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_setup_start: platform specific pre paging_init() call
* @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +131,7 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
+ struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e..75ef4b1 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
+ .mapping = {
+ .pagetable_reserve = native_pagetable_reserve,
+ },
+
.paging = {
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289..ed0650b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
}
+void native_pagetable_reserve(u64 start, u64 end)
+{
+ memblock_x86_reserve_range(start, end, "PGTABLE");
+}
+
struct map_range {
unsigned long start;
unsigned long end;
@@ -273,8 +278,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
__flush_tlb_all();
if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
- pgt_buf_end << PAGE_SHIFT, "PGTABLE");
+ x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+ PFN_PHYS(pgt_buf_end));
if (!after_bootmem)
early_memtest(start, end);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6b833db..fec8680 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
{
}
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+ /* reserve the range used */
+ memblock_x86_reserve_range(start, end, "PGTABLE");
+
+ /* set as RW the rest */
+ printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+ PFN_PHYS(pgt_buf_top));
+ while (end < PFN_PHYS(pgt_buf_top)) {
+ make_lowmem_page_readwrite(__va(end));
+ end += PAGE_SIZE;
+ }
+}
+
static void xen_post_allocator_init(void);
static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -2100,6 +2114,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
void __init xen_init_mmu_ops(void)
{
+ x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
--
1.7.2.3
From: Stefano Stabellini <[email protected]>
Signed-off-by: Stefano Stabellini <[email protected]>
---
arch/x86/xen/setup.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index fa0269a..9c38bd1 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,6 +61,8 @@ static __init void xen_add_extra_mem(unsigned long pages)
return;
e820_add_region(extra_start, size, E820_RAM);
+ printk(KERN_DEBUG "extra e820 region: start=%016Lx end=%016Lx\n",
+ extra_start, extra_start + size);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
@@ -231,6 +233,8 @@ char * __init xen_memory_setup(void)
for (i = 0; i < memmap.nr_entries; i++) {
unsigned long long end;
+ printk(KERN_DEBUG "e820_region: type=%d start=%016Lx end=%016Lx",
+ map[i].type, map[i].addr, map[i].size + map[i].addr);
/* Guard against non-page aligned E820 entries. */
if (map[i].type == E820_RAM)
map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
--
1.7.2.3
From: Stefano Stabellini <[email protected]>
Do not add the extra e820 region at a physical address lower than 4G
because it breaks e820_end_of_low_ram_pfn().
It is OK for us to move xen_extra_mem_start up and down because this
is the index of the memory that can be ballooned in/out - it is memory
not available to the kernel during bootup.
Signed-off-by: Stefano Stabellini <[email protected]>
---
arch/x86/xen/setup.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 9c38bd1..a51e010 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -229,7 +229,7 @@ char * __init xen_memory_setup(void)
memcpy(map_raw, map, sizeof(map));
e820.nr_map = 0;
- xen_extra_mem_start = mem_end;
+ xen_extra_mem_start = max((1ULL << 32), mem_end);
for (i = 0; i < memmap.nr_entries; i++) {
unsigned long long end;
--
1.7.2.3
>>> On 12.04.11 at 13:19, <[email protected]> wrote:
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 6b833db..fec8680 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t
> *base)
> {
> }
>
> +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
> +{
> + /* reserve the range used */
> + memblock_x86_reserve_range(start, end, "PGTABLE");
Wouldn't it be more natural (and involving less future changes) if
you called native_pagetable_reserve() here?
Jan
> +
> + /* set as RW the rest */
> + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
> + PFN_PHYS(pgt_buf_top));
> + while (end < PFN_PHYS(pgt_buf_top)) {
> + make_lowmem_page_readwrite(__va(end));
> + end += PAGE_SIZE;
> + }
> +}
> +
> static void xen_post_allocator_init(void);
>
> static __init void xen_pagetable_setup_done(pgd_t *base)
On Tue, Apr 12, 2011 at 12:19:51PM +0100, [email protected] wrote:
> From: Stefano Stabellini <[email protected]>
>
> Signed-off-by: Stefano Stabellini <[email protected]>
I am not entirely sure if we need these. You get all of this data by looking
at the Xen E820 and the guest E820 (to see the xen_extra_mem):
(XEN) Xen-e820 RAM map:
(XEN) 0000000000000000 - 000000000009f800 (usable)
(XEN) 000000000009f800 - 00000000000a0000 (reserved)
(XEN) 00000000000f0000 - 0000000000100000 (reserved)
(XEN) 0000000000100000 - 00000000cf5e0000 (usable)
(XEN) 00000000cf5e0000 - 00000000cf5e3000 (ACPI NVS)
(XEN) 00000000cf5e3000 - 00000000cf5f0000 (ACPI data)
(XEN) 00000000cf5f0000 - 00000000cf600000 (reserved)
(XEN) 00000000e0000000 - 00000000f0000000 (reserved)
(XEN) 00000000fec00000 - 0000000100000000 (reserved)
(XEN) 0000000100000000 - 0000000130000000 (usable)
..
[ 0.000000] BIOS-provided physical RAM map:
.. snip..
[ 0.000000] Xen: 0000000100000000 - 00000001a19e0000 (usable)
And your patch adds this:
[ 0.000000] e820_region: type=1 start=0000000000000000 end=000000000009f800
[ 0.000000] e820_region: type=2 start=000000000009f800 end=00000000000a0000
[ 0.000000] e820_region: type=2 start=00000000000f0000 end=0000000000100000
[ 0.000000] e820_region: type=1 start=0000000000100000 end=00000000cf5e0000
[ 0.000000] e820_region: type=4 start=00000000cf5e0000 end=00000000cf5e3000
[ 0.000000] e820_region: type=3 start=00000000cf5e3000 end=00000000cf5f0000
[ 0.000000] e820_region: type=2 start=00000000cf5f0000 end=00000000cf600000
[ 0.000000] e820_region: type=2 start=00000000e0000000 end=00000000f0000000
[ 0.000000] e820_region: type=2 start=00000000fec00000 end=0000000100000000
[ 0.000000] e820_region: type=1 start=0000000100000000 end=0000000130000000
[ 0.000000] released 0 pages of unused memory
[ 0.000000] extra e820 region: start=0000000100000000 end=00000001a19e0000
> ---
> arch/x86/xen/setup.c | 4 ++++
> 1 files changed, 4 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index fa0269a..9c38bd1 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -61,6 +61,8 @@ static __init void xen_add_extra_mem(unsigned long pages)
> return;
>
> e820_add_region(extra_start, size, E820_RAM);
> + printk(KERN_DEBUG "extra e820 region: start=%016Lx end=%016Lx\n",
> + extra_start, extra_start + size);
> sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
>
> memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
> @@ -231,6 +233,8 @@ char * __init xen_memory_setup(void)
> for (i = 0; i < memmap.nr_entries; i++) {
> unsigned long long end;
>
> + printk(KERN_DEBUG "e820_region: type=%d start=%016Lx end=%016Lx",
> + map[i].type, map[i].addr, map[i].size + map[i].addr);
> /* Guard against non-page aligned E820 entries. */
> if (map[i].type == E820_RAM)
> map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
> --
> 1.7.2.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Tue, Apr 12, 2011 at 12:19:49PM +0100, [email protected] wrote:
> From: Stefano Stabellini <[email protected]>
>
> mask_rw_pte is currently checking if a pfn is a pagetable page if it
> falls in the range pgt_buf_start - pgt_buf_end but that is incorrect
> because pgt_buf_end is a moving target: pgt_buf_top is the real
> boundary.
OK. Stuck it on the queue for rc3.
>
> Signed-off-by: Stefano Stabellini <[email protected]>
> ---
> arch/x86/xen/mmu.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index c82df6c..6b833db 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1491,7 +1491,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
> * it is RO.
> */
> if (((!is_early_ioremap_ptep(ptep) &&
> - pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
> + pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
> (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
> pte = pte_wrprotect(pte);
>
> --
> 1.7.2.3
On Tue, Apr 12, 2011 at 12:19:52PM +0100, [email protected] wrote:
> From: Stefano Stabellini <[email protected]>
>
> Do not add the extra e820 region at a physical address lower than 4G
> because it breaks e820_end_of_low_ram_pfn().
>
> It is OK for us to move the xen_extra_mem_start up and down because this
> is the index of the memory that can be ballooned in/out - it is memory
> not available to the kernel during bootup.
OK. Stuck it on rc3 queue.
>
> Signed-off-by: Stefano Stabellini <[email protected]>
> ---
> arch/x86/xen/setup.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 9c38bd1..a51e010 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -229,7 +229,7 @@ char * __init xen_memory_setup(void)
>
> memcpy(map_raw, map, sizeof(map));
> e820.nr_map = 0;
> - xen_extra_mem_start = mem_end;
> + xen_extra_mem_start = max((1ULL << 32), mem_end);
> for (i = 0; i < memmap.nr_entries; i++) {
> unsigned long long end;
>
> --
> 1.7.2.3
On Tue, 12 Apr 2011, Jan Beulich wrote:
> >>> On 12.04.11 at 13:19, <[email protected]> wrote:
> > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> > index 6b833db..fec8680 100644
> > --- a/arch/x86/xen/mmu.c
> > +++ b/arch/x86/xen/mmu.c
> > @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t
> > *base)
> > {
> > }
> >
> > +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
> > +{
> > + /* reserve the range used */
> > + memblock_x86_reserve_range(start, end, "PGTABLE");
>
> Wouldn't it be more natural (and involving less future changes) if
> you called native_pagetable_reserve() here?
Good point.
Patch update below:
---
commit fa4eeb1d4213fee248e8d141bb8d1504aae457ab
Author: Stefano Stabellini <[email protected]>
Date: Wed Mar 30 16:17:33 2011 +0000
x86,xen: introduce x86_init.mapping.pagetable_reserve
Introduce a new x86_init hook called pagetable_reserve that is used
during the initial memory mapping to reserve a range of memory
addresses for kernel pagetable usage.
On native it just calls memblock_x86_reserve_range while on xen it also
takes care of setting the spare memory previously allocated
for kernel pagetable pages from RO to RW, so that it can be used for
other purposes.
Signed-off-by: Stefano Stabellini <[email protected]>
Cc: Yinghai Lu <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Ingo Molnar <[email protected]>
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723..d56187c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+extern void native_pagetable_reserve(u64 start, u64 end);
#ifdef CONFIG_X86_32
extern void native_pagetable_setup_start(pgd_t *base);
extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2..d66b3a2 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,14 @@ struct x86_init_oem {
};
/**
+ * struct x86_init_mapping - platform specific initial kernel pagetable setup
+ * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
+ */
+struct x86_init_mapping {
+ void (*pagetable_reserve)(u64 start, u64 end);
+};
+
+/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_setup_start: platform specific pre paging_init() call
* @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +131,7 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
+ struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e..75ef4b1 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
+ .mapping = {
+ .pagetable_reserve = native_pagetable_reserve,
+ },
+
.paging = {
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289..ed0650b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
}
+void native_pagetable_reserve(u64 start, u64 end)
+{
+ memblock_x86_reserve_range(start, end, "PGTABLE");
+}
+
struct map_range {
unsigned long start;
unsigned long end;
@@ -273,8 +278,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
__flush_tlb_all();
if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
- pgt_buf_end << PAGE_SHIFT, "PGTABLE");
+ x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+ PFN_PHYS(pgt_buf_end));
if (!after_bootmem)
early_memtest(start, end);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6b833db..7ad0292 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
{
}
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+ /* reserve the range used */
+ native_pagetable_reserve(start, end);
+
+ /* set as RW the rest */
+ printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+ PFN_PHYS(pgt_buf_top));
+ while (end < PFN_PHYS(pgt_buf_top)) {
+ make_lowmem_page_readwrite(__va(end));
+ end += PAGE_SIZE;
+ }
+}
+
static void xen_post_allocator_init(void);
static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -2100,6 +2114,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
void __init xen_init_mmu_ops(void)
{
+ x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
On 04/12/2011 04:19 AM, [email protected] wrote:
> From: Stefano Stabellini <[email protected]>
>
> Introduce a new x86_init hook called pagetable_reserve that during the
> initial memory mapping is used to reserve a range of memory addresses for
> kernel pagetable usage.
>
> On native it just calls memblock_x86_reserve_range while on xen it also
> takes care of setting the spare memory previously allocated
> for kernel pagetable pages from RO to RW, so that it can be used for
> other purposes.
>
> Signed-off-by: Stefano Stabellini <[email protected]>
> Cc: Yinghai Lu <[email protected]>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Ingo Molnar <[email protected]>
Acked-by: Yinghai Lu <[email protected]>
> ---
> arch/x86/include/asm/pgtable_types.h | 1 +
> arch/x86/include/asm/x86_init.h | 9 +++++++++
> arch/x86/kernel/x86_init.c | 4 ++++
> arch/x86/mm/init.c | 9 +++++++--
> arch/x86/xen/mmu.c | 15 +++++++++++++++
> 5 files changed, 36 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 7db7723..d56187c 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
> /* Install a pte for a particular vaddr in kernel space. */
> void set_pte_vaddr(unsigned long vaddr, pte_t pte);
>
> +extern void native_pagetable_reserve(u64 start, u64 end);
> #ifdef CONFIG_X86_32
> extern void native_pagetable_setup_start(pgd_t *base);
> extern void native_pagetable_setup_done(pgd_t *base);
> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
> index 643ebf2..d66b3a2 100644
> --- a/arch/x86/include/asm/x86_init.h
> +++ b/arch/x86/include/asm/x86_init.h
> @@ -68,6 +68,14 @@ struct x86_init_oem {
> };
>
> /**
> + * struct x86_init_mapping - platform specific initial kernel pagetable setup
> + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
> + */
> +struct x86_init_mapping {
> + void (*pagetable_reserve)(u64 start, u64 end);
> +};
> +
> +/**
> * struct x86_init_paging - platform specific paging functions
> * @pagetable_setup_start: platform specific pre paging_init() call
> * @pagetable_setup_done: platform specific post paging_init() call
> @@ -123,6 +131,7 @@ struct x86_init_ops {
> struct x86_init_mpparse mpparse;
> struct x86_init_irqs irqs;
> struct x86_init_oem oem;
> + struct x86_init_mapping mapping;
> struct x86_init_paging paging;
> struct x86_init_timers timers;
> struct x86_init_iommu iommu;
> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
> index c11514e..75ef4b1 100644
> --- a/arch/x86/kernel/x86_init.c
> +++ b/arch/x86/kernel/x86_init.c
> @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
> .banner = default_banner,
> },
>
> + .mapping = {
> + .pagetable_reserve = native_pagetable_reserve,
> + },
> +
> .paging = {
> .pagetable_setup_start = native_pagetable_setup_start,
> .pagetable_setup_done = native_pagetable_setup_done,
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 286d289..ed0650b 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
> end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
> }
>
> +void native_pagetable_reserve(u64 start, u64 end)
> +{
> + memblock_x86_reserve_range(start, end, "PGTABLE");
> +}
> +
> struct map_range {
> unsigned long start;
> unsigned long end;
> @@ -273,8 +278,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
> __flush_tlb_all();
>
> if (!after_bootmem && pgt_buf_end > pgt_buf_start)
> - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
> - pgt_buf_end << PAGE_SHIFT, "PGTABLE");
> + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
> + PFN_PHYS(pgt_buf_end));
>
> if (!after_bootmem)
> early_memtest(start, end);
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 6b833db..fec8680 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
> {
> }
>
> +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
> +{
> + /* reserve the range used */
> + memblock_x86_reserve_range(start, end, "PGTABLE");
> +
> + /* set as RW the rest */
> + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
> + PFN_PHYS(pgt_buf_top));
> + while (end < PFN_PHYS(pgt_buf_top)) {
> + make_lowmem_page_readwrite(__va(end));
> + end += PAGE_SIZE;
> + }
> +}
> +
it would be cleaner for the xen code if you marked just start..end RO, rather than marking the whole range RO beforehand.
Thanks
Yinghai Lu
On Tue, 12 Apr 2011, Konrad Rzeszutek Wilk wrote:
> On Tue, Apr 12, 2011 at 12:19:51PM +0100, [email protected] wrote:
> > From: Stefano Stabellini <[email protected]>
> >
> > Signed-off-by: Stefano Stabellini <[email protected]>
>
> I am not entirely sure if we need these. You get all of this data by looking
> at the Xen E820 and the guest E820 (to see the xen_extra_mem):
>
> (XEN) Xen-e820 RAM map:
> (XEN) 0000000000000000 - 000000000009f800 (usable)
> (XEN) 000000000009f800 - 00000000000a0000 (reserved)
> (XEN) 00000000000f0000 - 0000000000100000 (reserved)
> (XEN) 0000000000100000 - 00000000cf5e0000 (usable)
> (XEN) 00000000cf5e0000 - 00000000cf5e3000 (ACPI NVS)
> (XEN) 00000000cf5e3000 - 00000000cf5f0000 (ACPI data)
> (XEN) 00000000cf5f0000 - 00000000cf600000 (reserved)
> (XEN) 00000000e0000000 - 00000000f0000000 (reserved)
> (XEN) 00000000fec00000 - 0000000100000000 (reserved)
> (XEN) 0000000100000000 - 0000000130000000 (usable)
> ..
>
> [ 0.000000] BIOS-provided physical RAM map:
> .. snip..
> [ 0.000000] Xen: 0000000100000000 - 00000001a19e0000 (usable)
>
> And your patch adds this:
>
> [ 0.000000] e820_region: type=1 start=0000000000000000 end=000000000009f800
> [ 0.000000] e820_region: type=2 start=000000000009f800 end=00000000000a0000
> [ 0.000000] e820_region: type=2 start=00000000000f0000 end=0000000000100000
> [ 0.000000] e820_region: type=1 start=0000000000100000 end=00000000cf5e0000
> [ 0.000000] e820_region: type=4 start=00000000cf5e0000 end=00000000cf5e3000
> [ 0.000000] e820_region: type=3 start=00000000cf5e3000 end=00000000cf5f0000
> [ 0.000000] e820_region: type=2 start=00000000cf5f0000 end=00000000cf600000
> [ 0.000000] e820_region: type=2 start=00000000e0000000 end=00000000f0000000
> [ 0.000000] e820_region: type=2 start=00000000fec00000 end=0000000100000000
> [ 0.000000] e820_region: type=1 start=0000000100000000 end=0000000130000000
> [ 0.000000] released 0 pages of unused memory
> [ 0.000000] extra e820 region: start=0000000100000000 end=00000001a19e0000
I have a machine here in which the e820 printed by Xen and the e820
in dom0 *before* any modifications differs:
(XEN) Xen-e820 RAM map:
(XEN) 0000000000000000 - 000000000009fc00 (usable)
(XEN) 000000000009fc00 - 00000000000a0000 (reserved)
(XEN) 00000000000e0000 - 0000000000100000 (reserved)
(XEN) 0000000000100000 - 00000000beb96000 (usable)
(XEN) 00000000beb96000 - 00000000bed97000 (ACPI NVS)
(XEN) 00000000bed97000 - 00000000bf651000 (usable)
(XEN) 00000000bf651000 - 00000000bf6e9000 (ACPI NVS)
(XEN) 00000000bf6e9000 - 00000000bf6ec000 (usable)
(XEN) 00000000bf6ec000 - 00000000bf6ff000 (ACPI data)
(XEN) 00000000bf6ff000 - 00000000bf700000 (usable)
[ 0.000000] e820_region: type=1 start=0000000000000000 end=000000000009fc00
[ 0.000000] e820_region: type=2 start=000000000009fc00 end=00000000000a0000
[ 0.000000] e820_region: type=2 start=00000000000e0000 end=0000000000100000
[ 0.000000] e820_region: type=1 start=0000000000100000 end=00000000beb96000
[ 0.000000] e820_region: type=4 start=00000000beb96000 end=00000000bed97000
[ 0.000000] e820_region: type=1 start=00000000bed97000 end=00000000bf651000
[ 0.000000] e820_region: type=4 start=00000000bf651000 end=00000000bf6e9000
[ 0.000000] e820_region: type=1 start=00000000bf6e9000 end=00000000bf6ec000
[ 0.000000] e820_region: type=3 start=00000000bf6ec000 end=00000000bf6ff000
[ 0.000000] e820_region: type=1 start=00000000bf6ff000 end=00000000bf700000
[ 0.000000] e820_region: type=2 start=00000000fec00000 end=00000000fec01000
[ 0.000000] e820_region: type=2 start=00000000fee00000 end=00000000fee01000
On Tue, 12 Apr 2011, Konrad Rzeszutek Wilk wrote:
> On Tue, Apr 12, 2011 at 12:19:49PM +0100, [email protected] wrote:
> > From: Stefano Stabellini <[email protected]>
> >
> > mask_rw_pte is currently checking if a pfn is a pagetable page if it
> > falls in the range pgt_buf_start - pgt_buf_end but that is incorrect
> > because pgt_buf_end is a moving target: pgt_buf_top is the real
> > boundary.
>
> OK. Stuck it on the queue for rc3.
great, thanks.
On Tue, 12 Apr 2011, Yinghai Lu wrote:
> > diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> > index 6b833db..fec8680 100644
> > --- a/arch/x86/xen/mmu.c
> > +++ b/arch/x86/xen/mmu.c
> > @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
> > {
> > }
> >
> > +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
> > +{
> > + /* reserve the range used */
> > + memblock_x86_reserve_range(start, end, "PGTABLE");
> > +
> > + /* set as RW the rest */
> > + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
> > + PFN_PHYS(pgt_buf_top));
> > + while (end < PFN_PHYS(pgt_buf_top)) {
> > + make_lowmem_page_readwrite(__va(end));
> > + end += PAGE_SIZE;
> > + }
> > +}
> > +
>
> it would be more cleaner to xen code if you make mark start/end to RO, and not make them all before as RO.
Yes, that would be ideal, but we cannot do that because we don't know
exactly where pgt_buf_end is before allocating the pagetable pages, and
the pagetable pages need to be marked RO before being hooked into the
pagetable. This is why we mark the whole range RO; after the
pagetable allocation, when we know for sure where pgt_buf_end is, we
modify the range pgt_buf_end-pgt_buf_top to RW.
> I have a machine here in which the e820 printed by Xen and the e820
> in dom0 *before* any modifications differs:
>
> (XEN) Xen-e820 RAM map:
> (XEN) 0000000000000000 - 000000000009fc00 (usable)
> (XEN) 000000000009fc00 - 00000000000a0000 (reserved)
> (XEN) 00000000000e0000 - 0000000000100000 (reserved)
> (XEN) 0000000000100000 - 00000000beb96000 (usable)
> (XEN) 00000000beb96000 - 00000000bed97000 (ACPI NVS)
> (XEN) 00000000bed97000 - 00000000bf651000 (usable)
> (XEN) 00000000bf651000 - 00000000bf6e9000 (ACPI NVS)
> (XEN) 00000000bf6e9000 - 00000000bf6ec000 (usable)
> (XEN) 00000000bf6ec000 - 00000000bf6ff000 (ACPI data)
> (XEN) 00000000bf6ff000 - 00000000bf700000 (usable)
>
> [ 0.000000] e820_region: type=1 start=0000000000000000 end=000000000009fc00
> [ 0.000000] e820_region: type=2 start=000000000009fc00 end=00000000000a0000
> [ 0.000000] e820_region: type=2 start=00000000000e0000 end=0000000000100000
> [ 0.000000] e820_region: type=1 start=0000000000100000 end=00000000beb96000
> [ 0.000000] e820_region: type=4 start=00000000beb96000 end=00000000bed97000
> [ 0.000000] e820_region: type=1 start=00000000bed97000 end=00000000bf651000
> [ 0.000000] e820_region: type=4 start=00000000bf651000 end=00000000bf6e9000
> [ 0.000000] e820_region: type=1 start=00000000bf6e9000 end=00000000bf6ec000
> [ 0.000000] e820_region: type=3 start=00000000bf6ec000 end=00000000bf6ff000
> [ 0.000000] e820_region: type=1 start=00000000bf6ff000 end=00000000bf700000
> [ 0.000000] e820_region: type=2 start=00000000fec00000 end=00000000fec01000
> [ 0.000000] e820_region: type=2 start=00000000fee00000 end=00000000fee01000
Ah, so the IOAPIC regions don't show up in the E820. Do they show up in the
E820 printed by the Linux kernel? If I use the Xen E820 output and what the
guest prints for its E820 I seem to get even this "hidden" area.
On Tue, Apr 12, 2011 at 10:40:57AM -0700, Yinghai Lu wrote:
> On 04/12/2011 04:19 AM, [email protected] wrote:
> >From: Stefano Stabellini <[email protected]>
> >
> >Introduce a new x86_init hook called pagetable_reserve that during the
> >initial memory mapping is used to reserve a range of memory addresses for
> >kernel pagetable usage.
> >
> >On native it just calls memblock_x86_reserve_range while on xen it also
> >takes care of setting the spare memory previously allocated
> >for kernel pagetable pages from RO to RW, so that it can be used for
> >other purposes.
> >
> >Signed-off-by: Stefano Stabellini <[email protected]>
> >Cc: Yinghai Lu <[email protected]>
> >Cc: H. Peter Anvin <[email protected]>
> >Cc: Ingo Molnar <[email protected]>
>
> Acked-by: Yinghai Lu <[email protected]>
Peter,
I stuck this and some of the others in this thread on 'stable/bug-fixes-rc3'
(git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git). But before
I send them to Linus to pull I need your OK. Would it be OK if I stuck
your Ack on the patch?
Or you could take this patch (the one I stuck in my tree has the description
a bit modified so I recommend that one).
Thanks,
Konrad
On 04/12/2011 04:19 AM, [email protected] wrote:
> From: Stefano Stabellini <[email protected]>
>
> Introduce a new x86_init hook called pagetable_reserve that during the
> initial memory mapping is used to reserve a range of memory addresses for
> kernel pagetable usage.
>
> On native it just calls memblock_x86_reserve_range while on xen it also
> takes care of setting the spare memory previously allocated
> for kernel pagetable pages from RO to RW, so that it can be used for
> other purposes.
>
What are the *semantics* of this hook?
Hooks are insanely nasty if they are just defined by a particular code
flow, as evidenced by the royal mess called paravirt_ops.
-hpa
On 04/13/2011 03:35 AM, Stefano Stabellini wrote:
> Yes, that would be ideal, but we cannot do that because we don't know
> exactly where is pgt_buf_end before allocating the pagetable pages and
> the pagetable pages need to be marked RO before being hooked into the
> pagetable. This is why we mark the whole range RO and after the
> pagetable allocation when we know for sure where is pgt_buf_end we
> modify the range pgt_buf_end-pgt_buf_top to RW.
The hell? You have to fill the pages before you hook them into the page
tables anyway (this means writing!) and then you have to mark them RO as
you add them to the page tables... anything else doesn't make any sense
at all.
-hpa
On 04/13/2011 11:03 AM, Konrad Rzeszutek Wilk wrote:
>
> Peter,
>
> I stuck this and some of the other in this thread on 'stable/bug-fixes-rc3'
> (git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git). But before
> I send them to Linus to pull I need your OK? Would it be OK if I stuck
> your Ack on the patch?
>
> Or you could take this patch (the one I stuck in my tree has the description
> a bit modified so I recommend that one).
>
You have my extremely firm NAK until the intended semantics of the new
hook is explained in detail.
-hpa
On Wed, Apr 13, 2011 at 11:35:02AM -0700, H. Peter Anvin wrote:
> On 04/13/2011 11:03 AM, Konrad Rzeszutek Wilk wrote:
> >
> > Peter,
> >
> > I stuck this and some of the other in this thread on 'stable/bug-fixes-rc3'
> > (git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git). But before
> > I send them to Linus to pull I need your OK? Would it be OK if I stuck
> > your Ack on the patch?
> >
> > Or you could take this patch (the one I stuck in my tree has the description
> > a bit modified so I recommend that one).
> >
>
> You have my extremely firm NAK until the intended semantics of the new
> hook is explained in detail.
<nods>I will let Stefano answer your questions as he has been battling
this particular regression and trying to find an acceptable solution
to the problem.
On Wed, 13 Apr 2011, Konrad Rzeszutek Wilk wrote:
> > I have a machine here in which the e820 printed by Xen and the e820
> > in dom0 *before* any modifications differs:
> >
> > (XEN) Xen-e820 RAM map:
> > (XEN) 0000000000000000 - 000000000009fc00 (usable)
> > (XEN) 000000000009fc00 - 00000000000a0000 (reserved)
> > (XEN) 00000000000e0000 - 0000000000100000 (reserved)
> > (XEN) 0000000000100000 - 00000000beb96000 (usable)
> > (XEN) 00000000beb96000 - 00000000bed97000 (ACPI NVS)
> > (XEN) 00000000bed97000 - 00000000bf651000 (usable)
> > (XEN) 00000000bf651000 - 00000000bf6e9000 (ACPI NVS)
> > (XEN) 00000000bf6e9000 - 00000000bf6ec000 (usable)
> > (XEN) 00000000bf6ec000 - 00000000bf6ff000 (ACPI data)
> > (XEN) 00000000bf6ff000 - 00000000bf700000 (usable)
> >
> > [ 0.000000] e820_region: type=1 start=0000000000000000 end=000000000009fc00
> > [ 0.000000] e820_region: type=2 start=000000000009fc00 end=00000000000a0000
> > [ 0.000000] e820_region: type=2 start=00000000000e0000 end=0000000000100000
> > [ 0.000000] e820_region: type=1 start=0000000000100000 end=00000000beb96000
> > [ 0.000000] e820_region: type=4 start=00000000beb96000 end=00000000bed97000
> > [ 0.000000] e820_region: type=1 start=00000000bed97000 end=00000000bf651000
> > [ 0.000000] e820_region: type=4 start=00000000bf651000 end=00000000bf6e9000
> > [ 0.000000] e820_region: type=1 start=00000000bf6e9000 end=00000000bf6ec000
> > [ 0.000000] e820_region: type=3 start=00000000bf6ec000 end=00000000bf6ff000
> > [ 0.000000] e820_region: type=1 start=00000000bf6ff000 end=00000000bf700000
>
>
> > [ 0.000000] e820_region: type=2 start=00000000fec00000 end=00000000fec01000
> > [ 0.000000] e820_region: type=2 start=00000000fee00000 end=00000000fee01000
>
> Ah, so the IOAPIC regions don't show up in the E820. Do they show up in the
> E820 printed by the Linux kernel? If I use the Xen E820 output and what the
> guest prints for its E820 I seem to get even this "hidden" area.
Yes, that's right. However it might be confusing if you are trying to
debug the transformations made by xen/setup.c to the e820...
In any case I don't feel strongly about this, it is just that I found
myself adding these printk's more than once so I thought that it might
be good to have them in any case for debugging. On the other hand I add
printk's all the time so surely it won't kill me to add these ones too.
On Wed, 13 Apr 2011, H. Peter Anvin wrote:
> On 04/13/2011 03:35 AM, Stefano Stabellini wrote:
> > Yes, that would be ideal, but we cannot do that because we don't know
> > exactly where is pgt_buf_end before allocating the pagetable pages and
> > the pagetable pages need to be marked RO before being hooked into the
> > pagetable. This is why we mark the whole range RO and after the
> > pagetable allocation when we know for sure where is pgt_buf_end we
> > modify the range pgt_buf_end-pgt_buf_top to RW.
>
> The hell? You have to fill the pages before you hook them into the page
> tables anyway (this means writing!) and then you have to mark them RO as
> you add them to the page tables... anything else doesn't make any sense
> at all.
Right.
The problem is that at some point init_memory_mapping is going to reach
the pagetable pages area and map those pages too (I don't mean hooking
the pagetable pages into the pagetable, I mean mapping them as normal
memory that falls in the range of addresses passed to init_memory_mapping
as argument).
Some of those pages are already pagetable pages (they are in the range
pgt_buf_start-pgt_buf_end), therefore they are going to be mapped RO and
everything is fine.
Some of these pages are not pagetable pages yet (they fall in the range
pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
are going to be mapped RW. When these pages become pagetable pages and
are hooked into the pagetable, xen will find that the guest already has
a RW mapping of them somewhere and fail the operation.
In order to fix the issue I could mark all the pages in the entire range
pgt_buf_start-pgt_buf_top as RO, but then once the pagetable allocation
is completed only the range pgt_buf_start-pgt_buf_end is reserved by
init_memory_mapping; therefore the kernel is going to crash as soon as
one of the pages in the range pgt_buf_end-pgt_buf_top is reused.
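To make the sequence concrete, here is a minimal user-space model of the
two phases (the 16-pfn buffer, the page[] array and the hardcoded
pgt_buf_end value are purely illustrative, not kernel code or kernel APIs):

#include <stdio.h>

enum prot { RW, RO };

#define PGT_BUF_START 0
#define PGT_BUF_TOP   16		/* overestimate of the pages needed */

static enum prot page[PGT_BUF_TOP];

int main(void)
{
	int pfn, pgt_buf_end;

	/* phase 1: while the initial mapping is built, map the whole
	 * candidate buffer RO, because pgt_buf_end is a moving target */
	for (pfn = PGT_BUF_START; pfn < PGT_BUF_TOP; pfn++)
		page[pfn] = RO;

	/* the allocator turns out to use only part of the buffer */
	pgt_buf_end = 10;

	/* phase 2: pagetable_reserve keeps the used range RO/reserved
	 * and flips the unused tail back to RW so it can be reused */
	for (pfn = pgt_buf_end; pfn < PGT_BUF_TOP; pfn++)
		page[pfn] = RW;

	for (pfn = PGT_BUF_START; pfn < PGT_BUF_TOP; pfn++)
		printf("pfn %2d: %s\n", pfn, page[pfn] == RO ? "RO" : "RW");
	return 0;
}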
Initially I suggested to add two hooks: one to allocate the pagetable
pages memory and one to reserve the pagetable pages memory after the
allocation:
http://marc.info/?l=linux-kernel&m=130141955626268
Following Yinghai's suggestion I removed the first hook (currently
unnecessary because we would use the same implementation on native and
on xen) and modified the second one, which became
x86_init.mapping.pagetable_reserve.
On Wed, 13 Apr 2011, H. Peter Anvin wrote:
> On 04/12/2011 04:19 AM, [email protected] wrote:
> > From: Stefano Stabellini <[email protected]>
> >
> > Introduce a new x86_init hook called pagetable_reserve that during the
> > initial memory mapping is used to reserve a range of memory addresses for
> > kernel pagetable usage.
> >
> > On native it just calls memblock_x86_reserve_range while on xen it also
> > takes care of setting the spare memory previously allocated
> > for kernel pagetable pages from RO to RW, so that it can be used for
> > other purposes.
> >
>
> What are the *semantics* of this hook?
>
> Hooks are insanely nasty if they are just defined by a particular code
> flow, as evidenced by the royal mess called paravirt_ops.
I hope that the other email I have just sent clarifies the purpose of
the hook.
I admit that as it is it wouldn't find much usage outside
init_memory_mapping.
Maybe adding the corresponding hook to allocate the initial kernel
pagetable pages would help generalize it. Or maybe we just need a
better comment in the code:
/* Reserve the kernel pagetable pages we used and free the other ones so
* that they can be reused for other purposes.
*
* On native it just means calling memblock_x86_reserve_range, on Xen it
* also means marking RW the pagetable pages that we allocated before
* but that haven't been used here.
*/
if (!after_bootmem && pgt_buf_end > pgt_buf_start)
x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
PFN_PHYS(pgt_buf_end));
On Thu, 14 Apr 2011, Stefano Stabellini wrote:
> On Wed, 13 Apr 2011, H. Peter Anvin wrote:
> > On 04/12/2011 04:19 AM, [email protected] wrote:
> > > From: Stefano Stabellini <[email protected]>
> > >
> > > Introduce a new x86_init hook called pagetable_reserve that during the
> > > initial memory mapping is used to reserve a range of memory addresses for
> > > kernel pagetable usage.
> > >
> > > On native it just calls memblock_x86_reserve_range while on xen it also
> > > takes care of setting the spare memory previously allocated
> > > for kernel pagetable pages from RO to RW, so that it can be used for
> > > other purposes.
> > >
> >
> > What are the *semantics* of this hook?
> >
> > Hooks are insanely nasty if they are just defined by a particular code
> > flow, as evidenced by the royal mess called paravirt_ops.
>
> I hope that the other email I have just sent clarifies the purpose of
> the hook.
> I admit that as it is it wouldn't find much usage outside
> init_memory_mapping.
> Maybe adding the corresponding hook to allocate the initial kernel
> pagetable pages would help generalizing it. Or maybe we just need a
> better comment in the code:
>
>
> /* Reserve the kernel pagetable pages we used and free the other ones so
> * that they can be reused for other purposes.
> *
> * On native it just means calling memblock_x86_reserve_range, on Xen it
> * also means marking RW the pagetable pages that we allocated before
> * but that haven't been used here.
> */
> if (!after_bootmem && pgt_buf_end > pgt_buf_start)
> x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
> PFN_PHYS(pgt_buf_end));
I added a detailed explanation to the commit message and in the code, I
hope this version is better:
commit 6f97ad736f304d600669cba1498e788099cea2cd
Author: Stefano Stabellini <[email protected]>
Date: Wed Mar 30 16:17:33 2011 +0000
x86,xen: introduce x86_init.mapping.pagetable_reserve
Introduce a new x86_init hook called pagetable_reserve that is used at
the end of init_memory_mapping to reserve the range of memory addresses
actually used for kernel pagetable pages and to free the other ones.
On native it just calls memblock_x86_reserve_range while on xen it also
takes care of setting the spare memory previously allocated
for kernel pagetable pages from RO to RW, so that it can be used for
other purposes.
A detailed explanation of the reason why this hook is needed follows.
As a consequence of the commit:
commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
Author: Yinghai Lu <[email protected]>
Date: Fri Dec 17 16:58:28 2010 -0800
x86-64, mm: Put early page table high
at some point init_memory_mapping is going to reach the pagetable pages
area and map those pages too (mapping them as normal memory that falls
in the range of addresses passed to init_memory_mapping as argument).
Some of those pages are already pagetable pages (they are in the range
pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
everything is fine.
Some of these pages are not pagetable pages yet (they fall in the range
pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
are going to be mapped RW. When these pages become pagetable pages and
are hooked into the pagetable, xen will find that the guest already has
a RW mapping of them somewhere and fail the operation.
The reason Xen requires pagetables to be RO is that the hypervisor needs
to verify that the pagetables are valid before using them. The validation
operations are called "pinning" (more details in arch/x86/xen/mmu.c).
In order to fix the issue we mark all the pages in the entire range
pgt_buf_start-pgt_buf_top as RO. However, when the pagetable allocation
is completed, only the range pgt_buf_start-pgt_buf_end is reserved by
init_memory_mapping. Hence the kernel is going to crash as soon as one
of the pages in the range pgt_buf_end-pgt_buf_top is reused (because
those pages are still RO).
For this reason we need a hook to reserve the kernel pagetable pages we
used and free the other ones so that they can be reused for other
purposes.
On native it just means calling memblock_x86_reserve_range, on Xen it
also means marking RW the pagetable pages that we allocated before but
that haven't been used.
Another way to fix this without using the hook is by adding an 'if
(xen_pv_domain())' check in the 'init_memory_mapping' code and calling
the Xen counterpart, but that is just nasty.
Signed-off-by: Stefano Stabellini <[email protected]>
Acked-by: Yinghai Lu <[email protected]>
Acked-by: Konrad Rzeszutek Wilk <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Ingo Molnar <[email protected]>
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7db7723..d56187c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+extern void native_pagetable_reserve(u64 start, u64 end);
#ifdef CONFIG_X86_32
extern void native_pagetable_setup_start(pgd_t *base);
extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 643ebf2..d3d8590 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,17 @@ struct x86_init_oem {
};
/**
+ * struct x86_init_mapping - platform specific initial kernel pagetable setup
+ * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
+ *
+ * For more details on the purpose of this hook, look in
+ * init_memory_mapping and the commit that added it.
+ */
+struct x86_init_mapping {
+ void (*pagetable_reserve)(u64 start, u64 end);
+};
+
+/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_setup_start: platform specific pre paging_init() call
* @pagetable_setup_done: platform specific post paging_init() call
@@ -123,6 +134,7 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
+ struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e..75ef4b1 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
+ .mapping = {
+ .pagetable_reserve = native_pagetable_reserve,
+ },
+
.paging = {
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 286d289..08fee27 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
}
+void native_pagetable_reserve(u64 start, u64 end)
+{
+ memblock_x86_reserve_range(start, end, "PGTABLE");
+}
+
struct map_range {
unsigned long start;
unsigned long end;
@@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
__flush_tlb_all();
+ /*
+ * Reserve the kernel pagetable pages we used (pgt_buf_start -
+ * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
+ * so that they can be reused for other purposes.
+ *
+ * On native it just means calling memblock_x86_reserve_range, on Xen it
+ * also means marking RW the pagetable pages that we allocated before
+ * but that haven't been used.
+ *
+ * In fact on xen we mark RO the whole range pgt_buf_start -
+ * pgt_buf_top, because we have to make sure that when
+ * init_memory_mapping reaches the pagetable pages area, it maps
+ * RO all the pagetable pages, including the ones that are beyond
+ * pgt_buf_end at that time.
+ */
if (!after_bootmem && pgt_buf_end > pgt_buf_start)
- memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
- pgt_buf_end << PAGE_SHIFT, "PGTABLE");
+ x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+ PFN_PHYS(pgt_buf_end));
if (!after_bootmem)
early_memtest(start, end);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6b833db..7ad0292 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
{
}
+static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
+{
+ /* reserve the range used */
+ native_pagetable_reserve(start, end);
+
+ /* set as RW the rest */
+ printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
+ PFN_PHYS(pgt_buf_top));
+ while (end < PFN_PHYS(pgt_buf_top)) {
+ make_lowmem_page_readwrite(__va(end));
+ end += PAGE_SIZE;
+ }
+}
+
static void xen_post_allocator_init(void);
static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -2100,6 +2114,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
void __init xen_init_mmu_ops(void)
{
+ x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
On 04/14/2011 04:30 AM, Stefano Stabellini wrote:
>
> I hope that the other email I have just sent clarifies the purpose of
> the hook.
>
You're commingling rationale and (proposed) semantics. BOTH need to be
stated clearly.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
On Thu, 14 Apr 2011, H. Peter Anvin wrote:
> On 04/14/2011 04:30 AM, Stefano Stabellini wrote:
> >
> > I hope that the other email I have just sent clarifies the purpose of
> > the hook.
> >
>
> You're commingling rationale and (proposed) semantics. BOTH need to be
> stated clearly.
Sorry I didn't answer the right question straight away.
The semantics follow:
It is as if there is an implicit pvop after find_early_table_space
that exports the range pgt_buf_start - pgt_buf_top.
The range indicated is used, and it is the only range used, for the initial
kernel pagetable pages.
The range indicated is an overestimate.
It is implicit because we use the pgt_buf_* variables directly, but
nonetheless it was an explicit pvop in the first version of the patch.
After the pagetable setup done by kernel_physical_mapping_init, we know
exactly how many pages we actually used for the initial kernel
pagetables and how many are left unused.
The purpose of the second pvop (x86_init.mapping.pagetable_reserve) is
to notify sub-architectures of the actual range used for initial kernel
pagetable pages and partially revoke the previous indication.
If you agree with the proposed semantics above, I'll add the description
to the commit message as well as the code. I could also modify the
syntax of the new pvop call to better suit the semantics and/or restore
the first pvop call.
On Thu, 14 Apr 2011, Stefano Stabellini wrote:
> On Thu, 14 Apr 2011, H. Peter Anvin wrote:
> > On 04/14/2011 04:30 AM, Stefano Stabellini wrote:
> > >
> > > I hope that the other email I have just sent clarifies the purpose of
> > > the hook.
> > >
> >
> > You're commingling rationale and (proposed) semantics. BOTH need to be
> > stated clearly.
>
> Sorry I didn't answer the right question straight away.
> The semantics follow:
>
>
>
> It is as if there is an implicit pvop after find_early_table_space
> that exports the range pgt_buf_start - pgt_buf_top.
> The range indicated is used, and it is the only range used, for the initial
> kernel pagetable pages.
> The range indicated is an overestimate.
> It is implicit because we use the pgt_buf_* variables directly but
> nonetheless it was in the first version of the patch.
>
> After the pagetable setup done by kernel_physical_mapping_init, we know
> exactly how many pages we actually used for the initial kernel
> pagetables and how many are left unused.
>
> The purpose of the second pvop (x86_init.mapping.pagetable_reserve) is
> to notify sub-architectures of the actual range used for initial kernel
> pagetable pages and partially revoke the previous indication.
>
>
>
> If you agree with the proposed semantics above, I'll add the description
> to the commit message as well as the code. I could also modify the
> syntax of the new pvop call to better suit the semantics and/or restore
> the first pvop call.
>
ping?
Sorry, I don't mean to be pushy, it is just that I'll be AFK for 12 days
starting from next Friday and I think that this issue should really be
fixed in time for the 2.6.39 release, otherwise no 2.6.39 kernels will
be able to boot on any xen system.
On 04/18/2011 07:09 AM, Stefano Stabellini wrote:
>
> Sorry, I don't mean to be pushy, it is just that I'll be AFK for 12 days
> starting from next Friday and I think that this issue should really be
> fixed in time for the 2.6.39 release, otherwise no 2.6.39 kernels will
> be able to boot on any xen system.
YOU STILL HAVEN'T PROPOSED ANY SEMANTICS.
The semantics of a hook is a description of what the preconditions are,
what the postconditions are, and what exactly they are allowed or not
allowed to do.
This is a real pain to do, *exactly because hooks are a real pain*.
Most hooks that have been put in has been "oh, just do something at
point X in the code", which is a case of definition by implementation,
which is exactly how we ended up with the current mess.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
On Mon, 18 Apr 2011, H. Peter Anvin wrote:
> On 04/18/2011 07:09 AM, Stefano Stabellini wrote:
> >
> > Sorry, I don't mean to be pushy, it is just that I'll be AFK for 12 days
> > starting from next Friday and I think that this issue should really be
> > fixed in time for the 2.6.39 release, otherwise no 2.6.39 kernels will
> > be able to boot on any xen system.
>
> YOU STILL HAVEN'T PROPOSED ANY SEMANTICS.
>
> The semantics of a hook is a description of what the preconditions are,
> what the postconditions are, and what exactly they are allowed or not
> allowed to do.
>
> This is a real pain to do, *exactly because hooks are a real pain*.
> Most hooks that have been put in has been "oh, just do something at
> point X in the code", which is a case of definition by implementation,
> which is exactly how we ended up with the current mess.
>
x86_init.mapping.pagetable_setup_start(start, max_end)
Notifies the subarch that the kernel early startup intends to
use pages as page table pages.
These pages will have physical addresses in the range
start..max_end.
Can be called multiple times to set up kernel page table pages,
always in a pair with x86_init.mapping.pagetable_setup_done.
It must be called before x86_init.paging.pagetable_setup_start().
Preconditions: no kernel code after the 32-bit kernel entry point
has hooked any new page table pages into the active page table
except between a pair of
x86_init.mapping.pagetable_setup_start and
x86_init.mapping.pagetable_setup_done.
Postcondition: pages in the range start..max_end may be used as
page table pages and hooked into active page tables.
Pages in this range may NOT be used for any other purpose.
x86_init.mapping.pagetable_setup_done(real_end)
Notifies the subarch that the initial memory mapping is complete,
and of the actual range of pages used.
Preconditions:
x86_init.mapping.pagetable_setup_start was previously called; the
arguments to the calls must satisfy start <= real_end <= max_end
The pages in the range start..real_end have been hooked into the
active page table.
The pages in the range real_end..max_end are not currently in use.
Postconditions:
Pages in the range real_end..max_end may be used as normal
memory.
Pages in the range start..real_end might be hooked into the active
page table and cannot be used for other purposes.
New page table pages must be allocated using
pv_mmu_ops.alloc_pte/pmd/pud.
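Expressed as a header sketch, the two hooks described above might look like
this (hypothetical: the posted patch only adds pagetable_reserve, and the
exact signatures here are inferred from the description, not taken from any
tree):

/* sketch only - mirrors the proposed semantics, not the posted patch */
#include <linux/types.h>	/* u64 */

struct x86_init_mapping {
	/*
	 * Early startup intends to use pages in start..max_end as page
	 * table pages; until _done is called they may be hooked into the
	 * active page table but used for nothing else.
	 */
	void (*pagetable_setup_start)(u64 start, u64 max_end);
	/*
	 * The initial mapping is complete: start..real_end may be hooked
	 * into the active page table; real_end..max_end is released and
	 * may be reused as normal memory.
	 */
	void (*pagetable_setup_done)(u64 real_end);
};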
On 04/18/2011 07:42 AM, H. Peter Anvin wrote:
> On 04/18/2011 07:09 AM, Stefano Stabellini wrote:
>> Sorry, I don't mean to be pushy, it is just that I'll be AFK for 12 days
>> starting from next Friday and I think that this issue should really be
>> fixed in time for the 2.6.39 release, otherwise no 2.6.39 kernels will
>> be able to boot on any xen system.
> YOU STILL HAVEN'T PROPOSED ANY SEMANTICS.
>
> The semantics of a hook is a description of what the preconditions are,
> what the postconditions are, and what exactly they are allowed or not
> allowed to do.
>
> This is a real pain to do, *exactly because hooks are a real pain*.
> Most hooks that have been put in has been "oh, just do something at
> point X in the code", which is a case of definition by implementation,
> which is exactly how we ended up with the current mess.
Yeah. The basic problem is that the change to pagetable setup
introduced a regression when running under Xen. The foremost
consideration is to fix that regression.
I think the most pragmatic thing to do at this point - as much as it
pains me to say so - is just put in an explicit Xen hook which does
the right thing, rather than try to prettify it as a general hook, since
it only has one user anyway.
If a second user comes along, then we can talk about generalizing it.
But it would be nice if we could come up with an initial pagetable
construction algorithm that follows the same rules as normal pagetable
management and uses the normal pvops hooks in the normal way so that we
don't need to special case it at all.
J