kexec: Avoid overwriting the current pgd (i386)
This patch upgrades the i386-specific kexec code to avoid overwriting the
current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
to start a secondary kernel that dumps the memory of the previous kernel.
The code introduces a new set of page tables called "page_table_a". These
tables are used to provide an executable identity mapping without overwriting
the current pgd.
Signed-off-by: Magnus Damm <[email protected]>
---
The patch has been tested with regular kexec and CONFIG_CRASH_DUMP.
Both PAE and non-PAE configurations work well. Apply on top of 2.6.17-rc3.
arch/i386/kernel/machine_kexec.c | 174 +++++++++++++++++++-----------------
arch/i386/kernel/relocate_kernel.S | 19 +++
include/linux/kexec.h | 11 ++
3 files changed, 121 insertions(+), 83 deletions(-)
--- 0001/arch/i386/kernel/machine_kexec.c
+++ work/arch/i386/kernel/machine_kexec.c 2006-05-01 11:38:13.000000000 +0900
@@ -2,6 +2,9 @@
* machine_kexec.c - handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <[email protected]>
*
+ * 2006-04-27 Magnus Damm <[email protected]>:
+ * - rewrote identity map code to avoid overwriting current pgd
+ *
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
@@ -19,72 +22,6 @@
#include <asm/desc.h>
#include <asm/system.h>
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L2_ATTR (_PAGE_PRESENT)
-
-#define LEVEL0_SIZE (1UL << 12UL)
-
-#ifndef CONFIG_X86_PAE
-#define LEVEL1_SIZE (1UL << 22UL)
-static u32 pgtable_level1[1024] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
- unsigned long level1_index, level2_index;
- u32 *pgtable_level2;
-
- /* Find the current page table */
- pgtable_level2 = __va(read_cr3());
-
- /* Find the indexes of the physical address to identity map */
- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
- level2_index = address / LEVEL1_SIZE;
-
- /* Identity map the page table entry */
- pgtable_level1[level1_index] = address | L0_ATTR;
- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-
- /* Flush the tlb so the new mapping takes effect.
- * Global tlb entries are not flushed but that is not an issue.
- */
- load_cr3(pgtable_level2);
-}
-
-#else
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-static u64 pgtable_level1[512] PAGE_ALIGNED;
-static u64 pgtable_level2[512] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
- unsigned long level1_index, level2_index, level3_index;
- u64 *pgtable_level3;
-
- /* Find the current page table */
- pgtable_level3 = __va(read_cr3());
-
- /* Find the indexes of the physical address to identity map */
- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
- level3_index = address / LEVEL2_SIZE;
-
- /* Identity map the page table entry */
- pgtable_level1[level1_index] = address | L0_ATTR;
- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
- set_64bit(&pgtable_level3[level3_index],
- __pa(pgtable_level2) | L2_ATTR);
-
- /* Flush the tlb so the new mapping takes effect.
- * Global tlb entries are not flushed but that is not an issue.
- */
- load_cr3(pgtable_level3);
-}
-#endif
-
static void set_idt(void *newidt, __u16 limit)
{
struct Xgt_desc_struct curidt;
@@ -96,7 +33,6 @@ static void set_idt(void *newidt, __u16
load_idt(&curidt);
};
-
static void set_gdt(void *newgdt, __u16 limit)
{
struct Xgt_desc_struct curgdt;
@@ -131,12 +67,66 @@ typedef asmlinkage NORET_TYPE void (*rel
unsigned long indirection_page,
unsigned long reboot_code_buffer,
unsigned long start_address,
- unsigned int has_pae) ATTRIB_NORET;
+ unsigned long page_table_a,
+ unsigned long has_pae) ATTRIB_NORET;
const extern unsigned char relocate_new_kernel[];
extern void relocate_new_kernel_end(void);
const extern unsigned int relocate_new_kernel_size;
+static int allocate_page_table_a(struct kimage *image)
+{
+ struct page *page;
+ int k = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
+
+ for (; k > 0; k--) {
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page)
+ return -ENOMEM;
+
+ clear_page(page_address(page));
+ image->page_table_a[k - 1] = page;
+ }
+
+ return 0;
+}
+
+/* workaround for include/asm-i386/pgtable-3level.h */
+
+#ifdef CONFIG_X86_PAE
+#undef pud_present
+#define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
+#endif
+
+#define pa_page(page) __pa(page_address(page))
+
+static int create_mapping(struct page *root, struct page **pages,
+ unsigned long va, unsigned long pa)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int k = 0;
+
+ pgd = (pgd_t *)page_address(root) + pgd_index(va);
+ if (!pgd_present(*pgd))
+ set_pgd(pgd, __pgd(pa_page(pages[k++]) | _KERNPG_TABLE));
+
+ pud = pud_offset(pgd, va);
+ if (!pud_present(*pud))
+ set_pud(pud, __pud(pa_page(pages[k++]) | _KERNPG_TABLE));
+
+ pmd = pmd_offset(pud, va);
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(pa_page(pages[k++]) | _KERNPG_TABLE));
+
+ pte = (pte_t *)page_address(pmd_page(*pmd)) + pte_index(va);
+ set_pte(pte, __pte(pa | _PAGE_KERNEL_EXEC));
+
+ return k;
+}
+
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
@@ -152,6 +142,33 @@ const extern unsigned int relocate_new_k
*/
int machine_kexec_prepare(struct kimage *image)
{
+ void *control_page;
+ unsigned long pa;
+ int k;
+
+ k = allocate_page_table_a(image);
+ if (k)
+ return k;
+
+ /* fill in control_page with assembly code */
+
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_new_kernel, relocate_new_kernel_size);
+
+ /* map the control_page at the virtual address of relocate_kernel.S */
+
+ pa = __pa(control_page);
+
+ k = create_mapping(image->page_table_a[0],
+ &image->page_table_a[1],
+ (unsigned long)relocate_new_kernel, pa);
+
+ /* identity map the control_page */
+
+ create_mapping(image->page_table_a[0],
+ &image->page_table_a[k + 1],
+ pa, pa);
+
return 0;
}
@@ -170,24 +187,16 @@ void machine_kexec_cleanup(struct kimage
NORET_TYPE void machine_kexec(struct kimage *image)
{
unsigned long page_list;
- unsigned long reboot_code_buffer;
-
+ unsigned long control_code;
+ unsigned long page_table_a;
relocate_new_kernel_t rnk;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
- /* Compute some offsets */
- reboot_code_buffer = page_to_pfn(image->control_code_page)
- << PAGE_SHIFT;
page_list = image->head;
-
- /* Set up an identity mapping for the reboot_code_buffer */
- identity_map_page(reboot_code_buffer);
-
- /* copy it out */
- memcpy((void *)reboot_code_buffer, relocate_new_kernel,
- relocate_new_kernel_size);
+ control_code = __pa(page_address(image->control_code_page));
+ page_table_a = __pa(page_address(image->page_table_a[0]));
/* The segment registers are funny things, they are
* automatically loaded from a table, in memory wherever you
@@ -209,6 +218,7 @@ NORET_TYPE void machine_kexec(struct kim
set_idt(phys_to_virt(0),0);
/* now call it */
- rnk = (relocate_new_kernel_t) reboot_code_buffer;
- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
+ rnk = (relocate_new_kernel_t) relocate_new_kernel;
+ (*rnk)(page_list, control_code, image->start,
+ page_table_a, (unsigned long)cpu_has_pae);
}
--- 0001/arch/i386/kernel/relocate_kernel.S
+++ work/arch/i386/kernel/relocate_kernel.S 2006-05-01 11:13:14.000000000 +0900
@@ -7,7 +7,11 @@
*/
#include <linux/linkage.h>
+#include <asm/page.h>
+.text
+.align (1 << PAGE_SHIFT)
+
/*
* Must be relocatable PIC code callable as a C function, that once
* it starts can not use the previous processes stack.
@@ -18,18 +22,31 @@ relocate_new_kernel:
movl 4(%esp), %ebx /* page_list */
movl 8(%esp), %ebp /* reboot_code_buffer */
movl 12(%esp), %edx /* start address */
- movl 16(%esp), %ecx /* cpu_has_pae */
+ movl 16(%esp), %edi /* page_table_a */
+ movl 20(%esp), %ecx /* cpu_has_pae */
/* zero out flags, and disable interrupts */
pushl $0
popfl
+ /* switch to page_table_a */
+ movl %edi, %eax
+ movl %eax, %cr3
+
/* set a new stack at the bottom of our page... */
lea 4096(%ebp), %esp
/* store the parameters back on the stack */
pushl %edx /* store the start address */
+ /* jump to identity mapped page */
+ movl %ebp, %eax
+ addl $(identity_mapped - relocate_new_kernel), %eax
+ pushl %eax
+ ret
+
+identity_mapped:
+
/* Set cr0 to a known state:
* 31 0 == Paging disabled
* 18 0 == Alignment check disabled
--- 0001/include/linux/kexec.h
+++ work/include/linux/kexec.h 2006-05-01 11:13:14.000000000 +0900
@@ -69,6 +69,17 @@ struct kimage {
unsigned long start;
struct page *control_code_page;
+ /* page_table_a[] holds enough pages to create a new page table
+ * that maps the control page twice..
+ */
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+ struct page *page_table_a[3]; /* (2 * pte) + pgd */
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
+ struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
+#endif
+
unsigned long nr_segments;
struct kexec_segment segment[KEXEC_SEGMENT_MAX];
On Mon, May 01, 2006 at 06:49:16PM +0900, Magnus Damm wrote:
> kexec: Avoid overwriting the current pgd (i386)
>
> This patch upgrades the i386-specific kexec code to avoid overwriting the
> current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
> to start a secondary kernel that dumps the memory of the previous kernel.
>
> The code introduces a new set of page tables called "page_table_a". These
> tables are used to provide an executable identity mapping without overwriting
> the current pgd.
True, current pgd is overwritten but that effects only user space mappings
and currently "crash" supports only backtracing kernel space code. But at
the same time probably it is not a bad idea to maintain a separate page
table and switch to that instead of overwriting the existing pgd. This
shall help if in future user space backtracing is also supported.
[..]
>
> +static int allocate_page_table_a(struct kimage *image)
> +{
> + struct page *page;
> + int k = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
> +
> + for (; k > 0; k--) {
> + page = kimage_alloc_control_pages(image, 0);
> + if (!page)
> + return -ENOMEM;
> +
> + clear_page(page_address(page));
> + image->page_table_a[k - 1] = page;
I think you also need to write the logic to free those pages if somebody
chooses to unload the pre-loaded kernel.
[..]
> --- 0001/include/linux/kexec.h
> +++ work/include/linux/kexec.h 2006-05-01 11:13:14.000000000 +0900
> @@ -69,6 +69,17 @@ struct kimage {
> unsigned long start;
> struct page *control_code_page;
>
> + /* page_table_a[] holds enough pages to create a new page table
> + * that maps the control page twice..
> + */
> +
> +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
> + struct page *page_table_a[3]; /* (2 * pte) + pgd */
> +#endif
> +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
> + struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
> +#endif
> +
Would it make a cleaner interface if these array of pointers are maintained
in arch dependent code as global variables instead of putting in arch
independent code. Existing code does something simlar. This becomes further
ugly when another array comes into the picture for x86_64 in next patch.
(page_table_b)
Thanks
Vivek
Vivek Goyal <[email protected]> writes:
> On Mon, May 01, 2006 at 06:49:16PM +0900, Magnus Damm wrote:
>> kexec: Avoid overwriting the current pgd (i386)
>>
>> This patch upgrades the i386-specific kexec code to avoid overwriting the
>> current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
>> to start a secondary kernel that dumps the memory of the previous kernel.
>>
>> The code introduces a new set of page tables called "page_table_a". These
>> tables are used to provide an executable identity mapping without overwriting
>> the current pgd.
>
> True, current pgd is overwritten but that effects only user space mappings
> and currently "crash" supports only backtracing kernel space code. But at
> the same time probably it is not a bad idea to maintain a separate page
> table and switch to that instead of overwriting the existing pgd. This
> shall help if in future user space backtracing is also supported.
>
> [..]
>>
>> +static int allocate_page_table_a(struct kimage *image)
>> +{
>> + struct page *page;
>> + int k = sizeof(image->page_table_a) / sizeof(image->page_table_a[0]);
>> +
>> + for (; k > 0; k--) {
>> + page = kimage_alloc_control_pages(image, 0);
>> + if (!page)
>> + return -ENOMEM;
>> +
>> + clear_page(page_address(page));
>> + image->page_table_a[k - 1] = page;
>
> I think you also need to write the logic to free those pages if somebody
> chooses to unload the pre-loaded kernel.
Because these are control pages we already keep track of them,
so we can free them along with everything else.
> [..]
>> --- 0001/include/linux/kexec.h
>> +++ work/include/linux/kexec.h 2006-05-01 11:13:14.000000000 +0900
>> @@ -69,6 +69,17 @@ struct kimage {
>> unsigned long start;
>> struct page *control_code_page;
>>
>> + /* page_table_a[] holds enough pages to create a new page table
>> + * that maps the control page twice..
>> + */
>> +
>> +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
>> + struct page *page_table_a[3]; /* (2 * pte) + pgd */
>> +#endif
>> +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
>> + struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
>> +#endif
>> +
>
> Would it make a cleaner interface if these array of pointers are maintained
> in arch dependent code as global variables instead of putting in arch
> independent code. Existing code does something simlar. This becomes further
> ugly when another array comes into the picture for x86_64 in next patch.
> (page_table_b)
Well global variables don't quite work in the normal case.
However it probably makes most sense to maintain the needed variables
in a structure on the control page. Which will keep them out of harms way,
and won't require patches to the generic code.
Eric
On 5/1/06, Vivek Goyal <[email protected]> wrote:
> On Mon, May 01, 2006 at 06:49:16PM +0900, Magnus Damm wrote:
> > kexec: Avoid overwriting the current pgd (i386)
> >
> > This patch upgrades the i386-specific kexec code to avoid overwriting the
> > current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used
> > to start a secondary kernel that dumps the memory of the previous kernel.
> >
> > The code introduces a new set of page tables called "page_table_a". These
> > tables are used to provide an executable identity mapping without overwriting
> > the current pgd.
>
> True, current pgd is overwritten but that effects only user space mappings
> and currently "crash" supports only backtracing kernel space code. But at
> the same time probably it is not a bad idea to maintain a separate page
> table and switch to that instead of overwriting the existing pgd. This
> shall help if in future user space backtracing is also supported.
I agree, but I also think that overwriting user space mappings is bad
from the trace perspective too, especially if you look at kernel data.
I would say that user space page tables are just another kernel data
structure, and overwriting them may result in things like inconsistent
page->mapcount values.
Thanks for the comments,
/ magnus
On 5/2/06, Eric W. Biederman <[email protected]> wrote:
> Vivek Goyal <[email protected]> writes:
> > On Mon, May 01, 2006 at 06:49:16PM +0900, Magnus Damm wrote:
> >> --- 0001/include/linux/kexec.h
> >> +++ work/include/linux/kexec.h 2006-05-01 11:13:14.000000000 +0900
> >> @@ -69,6 +69,17 @@ struct kimage {
> >> unsigned long start;
> >> struct page *control_code_page;
> >>
> >> + /* page_table_a[] holds enough pages to create a new page table
> >> + * that maps the control page twice..
> >> + */
> >> +
> >> +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
> >> + struct page *page_table_a[3]; /* (2 * pte) + pgd */
> >> +#endif
> >> +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_PAE)
> >> + struct page *page_table_a[5]; /* (2 * pte) + (2 * pmd) + pgd */
> >> +#endif
> >> +
> >
> > Would it make a cleaner interface if these array of pointers are maintained
> > in arch dependent code as global variables instead of putting in arch
> > independent code. Existing code does something simlar. This becomes further
> > ugly when another array comes into the picture for x86_64 in next patch.
> > (page_table_b)
>
> Well global variables don't quite work in the normal case.
>
> However it probably makes most sense to maintain the needed variables
> in a structure on the control page. Which will keep them out of harms way,
> and won't require patches to the generic code.
I agree with both of you that the #ifdefs in struct kimage should be
avoided, but I wonder if adding variables in a structure on the
control page is the easiest and cleanest way.
I think that defining a structure for each architecture in
include/asm/kexec.h that is included in struct kimage is the best way
to go. Then each architecture can have whatever data it wants there,
and we both avoid #ifdefs in struct kimage _and_ we stay away from
overly complicated code that just allocates, frees and parses
architecture-dependent data.
Thanks for the input,
/ magnus
"Magnus Damm" <[email protected]> writes:
> On 5/2/06, Eric W. Biederman <[email protected]> wrote:
>>
>> Well global variables don't quite work in the normal case.
>>
>> However it probably makes most sense to maintain the needed variables
>> in a structure on the control page. Which will keep them out of harms way,
>> and won't require patches to the generic code.
>
> I agree with both of you that the #ifdefs in struct kimage should be
> avoided, but I wonder if adding variables in a structure on the
> control page is the easiest and cleanest way.
>
> I think that defining a structure for each architecture in
> include/asm/kexec.h that is included in struct kimage is the best way
> to go. Then each architecture can have whatever data it wants there,
> and we both avoid #ifdefs in struct kimage _and_ we stay away from
> overly complicated code that just allocates, frees and parses
> architecture-dependent data.
Well I think it would be fairly simple to have a structure:
struct control_page {
type variabe;
...
code[0];
};
Or something like that we can work with.
The big reason for doing this is that I believe control pages
have additional protection that struct kimage does, being allocated
in areas where the kernel never sets up DMA transfers. Possibly
that needs to be fixed, but this is something we need to be very
careful with.
For a page table all we need to store is the physical address of the
first page. Storing and working with a struct page entry is the wrong
thing to do. I would prefer to stomp the kernel data structures than
to add an extra dependencies on the original panic'd kernel. At first
glance I am afraid that you current code introduces extra
dependencies.
You don't need two x86_64 page tables as you can easily map
all of the kernel virtual address, and the identity mapped physical
address until the x86_64 kernel stops using an 8TB/8TB split.
Eric
On 5/2/06, Eric W. Biederman <[email protected]> wrote:
> "Magnus Damm" <[email protected]> writes:
>
> > On 5/2/06, Eric W. Biederman <[email protected]> wrote:
> >>
> >> Well global variables don't quite work in the normal case.
> >>
> >> However it probably makes most sense to maintain the needed variables
> >> in a structure on the control page. Which will keep them out of harms way,
> >> and won't require patches to the generic code.
> >
> > I agree with both of you that the #ifdefs in struct kimage should be
> > avoided, but I wonder if adding variables in a structure on the
> > control page is the easiest and cleanest way.
> >
> > I think that defining a structure for each architecture in
> > include/asm/kexec.h that is included in struct kimage is the best way
> > to go. Then each architecture can have whatever data it wants there,
> > and we both avoid #ifdefs in struct kimage _and_ we stay away from
> > overly complicated code that just allocates, frees and parses
> > architecture-dependent data.
>
> Well I think it would be fairly simple to have a structure:
> struct control_page {
> type variabe;
> ...
> code[0];
> };
>
> Or something like that we can work with.
>
> The big reason for doing this is that I believe control pages
> have additional protection that struct kimage does, being allocated
> in areas where the kernel never sets up DMA transfers. Possibly
> that needs to be fixed, but this is something we need to be very
> careful with.
I suppose you mean that control pages have additional protection that
struct kimage does _not_ have. Protection provided by
kimage_alloc_control_pages(), right?
I agree with you that this protection is good. But I do not see how
that applies to my patch, because the page_table_a[] pages pointed out
by struct kimage are read out by machine_kexec() and passed as
arguments to the assembly code. So the assembly code itself never
tries to access struct kimage. All data accessed by the assembly code
is allocated with kimage_alloc_control_pages(), isn't that good
enough? Or maybe I'm misunderstanding?
On top of that I fear that my patch likes the fact that the assembly
code in the control page is page aligned. But it is probably no biggie
to change. I'm just lazy. =)
> For a page table all we need to store is the physical address of the
> first page. Storing and working with a struct page entry is the wrong
> thing to do. I would prefer to stomp the kernel data structures than
> to add an extra dependencies on the original panic'd kernel. At first
> glance I am afraid that you current code introduces extra
> dependencies.
I used struct page *[] because the control page was a struct page *. I
never use the contents of what the struct page points to, I only use
them to convert to physical/virtual addresses or pfns. So I would say
that no extra dependencies are introduced at all actually. But I may
be wrong.
I would be happy to change the struct page *[] to unsigned long [] or
something else, but I must say I like the typing that struct page
provides.
Regarding storing the just root page or all pages - I stored all pages
because I need to pass them to the Xen hypervisor which will fill in
new values in page_table_a. page_table_b OTOH never gets modified by
the hypervisor, which is why page_table_a is an array of pointers and
page_table_b is just a root pointer.
> You don't need two x86_64 page tables as you can easily map
> all of the kernel virtual address, and the identity mapped physical
> address until the x86_64 kernel stops using an 8TB/8TB split.
Sure. I just thought that one page table with a mix of 4K pages and
huge pages would result in difficult code. The current page_table_a
code is actually more or less the same on x86 and x86_64, with the
exception of some macro magic.
Thanks for the detailed reply!
/ magnus