Hi,
The following two patches make it possible to restore the memory state from a
hibernation image with the help of a kernel different from the image one.
The first patch adds the generic, platform independent code needed for that.
The second patch implements the idea on x86_64.
Greetings,
Rafael
From: Rafael J. Wysocki <[email protected]>
Add the bits needed for supporting arbitrary boot kernels to the common
hibernation code.
To support arbitrary boot kernels, make it possible to replace the 'struct
new_utsname' and the kernel version in the hibernation image header by some
architecture specific data that will be used to verify if the image is valid
and to restore the image.
Signed-off-by: Rafael J. Wysocki <[email protected]>
Acked-by: Pavel Machek <[email protected]>
---
kernel/power/power.h | 6 +++-
kernel/power/snapshot.c | 60 ++++++++++++++++++++++++++++++++++++------------
2 files changed, 49 insertions(+), 17 deletions(-)
Index: linux-2.6.23-rc3/kernel/power/power.h
===================================================================
--- linux-2.6.23-rc3.orig/kernel/power/power.h
+++ linux-2.6.23-rc3/kernel/power/power.h
@@ -11,14 +11,16 @@ struct swsusp_info {
unsigned long size;
} __attribute__((aligned(PAGE_SIZE)));
-
-
#ifdef CONFIG_HIBERNATION
+/* Maximum size of architecture specific data in a hibernation header */
+#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
+
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
*/
#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+
/*
* Keep 1 MB of memory free so that device drivers can allocate some pages in
* their .suspend() routines without breaking the suspend to disk.
Index: linux-2.6.23-rc3/kernel/power/snapshot.c
===================================================================
--- linux-2.6.23-rc3.orig/kernel/power/snapshot.c
+++ linux-2.6.23-rc3/kernel/power/snapshot.c
@@ -1239,17 +1239,29 @@ asmlinkage int swsusp_save(void)
return 0;
}
-static void init_header(struct swsusp_info *info)
+#ifdef ARCH_HAS_HIBERNATION_HEADER
+static int init_header_complete(struct swsusp_info *info)
{
- memset(info, 0, sizeof(struct swsusp_info));
+ return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
+}
+#else /* !ARCH_HAS_HIBERNATION_HEADER */
+static int init_header_complete(struct swsusp_info *info)
+{
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
info->version_code = LINUX_VERSION_CODE;
+ return 0;
+}
+#endif /* !ARCH_HAS_HIBERNATION_HEADER */
+
+static int init_header(struct swsusp_info *info)
+{
+ memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = num_physpages;
- memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
- info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
info->size = info->pages;
info->size <<= PAGE_SHIFT;
+ return init_header_complete(info);
}
/**
@@ -1303,7 +1315,11 @@ int snapshot_read_next(struct snapshot_h
return -ENOMEM;
}
if (!handle->offset) {
- init_header((struct swsusp_info *)buffer);
+ int error;
+
+ error = init_header((struct swsusp_info *)buffer);
+ if (error)
+ return error;
handle->buffer = buffer;
memory_bm_position_reset(&orig_bm);
memory_bm_position_reset(&copy_bm);
@@ -1394,22 +1410,36 @@ duplicate_memory_bitmap(struct memory_bi
}
}
-static inline int check_header(struct swsusp_info *info)
+#ifdef ARCH_HAS_HIBERNATION_HEADER
+static char *check_image_kernel(struct swsusp_info *info)
+{
+ return arch_hibernation_header_restore(info) ?
+ "architecture specific data" : NULL;
+}
+#else /* !ARCH_HAS_HIBERNATION_HEADER */
+static char *check_image_kernel(struct swsusp_info *info)
{
- char *reason = NULL;
-
if (info->version_code != LINUX_VERSION_CODE)
- reason = "kernel version";
- if (info->num_physpages != num_physpages)
- reason = "memory size";
+ return "kernel version";
if (strcmp(info->uts.sysname,init_utsname()->sysname))
- reason = "system type";
+ return "system type";
if (strcmp(info->uts.release,init_utsname()->release))
- reason = "kernel release";
+ return "kernel release";
if (strcmp(info->uts.version,init_utsname()->version))
- reason = "version";
+ return "version";
if (strcmp(info->uts.machine,init_utsname()->machine))
- reason = "machine";
+ return "machine";
+ return NULL;
+}
+#endif /* !ARCH_HAS_HIBERNATION_HEADER */
+
+static int check_header(struct swsusp_info *info)
+{
+ char *reason;
+
+ reason = check_image_kernel(info);
+ if (!reason && info->num_physpages != num_physpages)
+ reason = "memory size";
if (reason) {
printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
return -EPERM;
From: Rafael J. Wysocki <[email protected]>
Make it possible to restore a hibernation image on x86_64 with the help of a
kernel different from the one in the image.
The idea is to split the core restoration code into two separate parts and to
place each of them in a different page.  The first part belongs to the boot
kernel and is executed as the last step of the image kernel's memory restoration
procedure.  It restores all of the image kernel's memory that has not been
restored yet except for the one page containing the very code that is being
executed at that time.  The final operation performed by it is a jump to the
second part of the core restoration code that belongs to the image kernel and
has just been restored.  This code restores the last remaining page of the image
kernel's memory containing the first, already executed, part of the core
restoration code (temporary page tables created by the boot kernel are used at
this stage).  It also makes the CPU switch to the image kernel's page tables and
restores the state of general purpose registers (including the stack pointer)
from before the hibernation.
The main issue with this idea is that in order to jump to the second part of the
restoration code the boot kernel needs to know its address.  However, this
address may be passed to it in the image header.  Namely, the part of the image
header previously used for checking if the version of the image kernel is
correct can be replaced with some architecture specific data that will allow
the boot kernel to jump to the right address within the image kernel.  These
data should also be used for checking if the image kernel is compatible with
the boot kernel (as far as the memory restoration procedure is concerned).
It can be done, for example, with the help of a "magic" value that has to be
equal in both kernels, so that they can be regarded as compatible.
Signed-off-by: Rafael J. Wysocki <[email protected]>
Acked-by: Pavel Machek <[email protected]>
---
arch/x86_64/kernel/suspend.c | 43 ++++++++++++++++++++++++++
arch/x86_64/kernel/suspend_asm.S | 63 +++++++++++++++++++++++++++++++++------
include/asm-x86_64/suspend.h | 6 +++
3 files changed, 103 insertions(+), 9 deletions(-)
Index: linux-2.6.23-rc3/arch/x86_64/kernel/suspend_asm.S
===================================================================
--- linux-2.6.23-rc3.orig/arch/x86_64/kernel/suspend_asm.S 2007-08-21 20:36:49.000000000 +0200
+++ linux-2.6.23-rc3/arch/x86_64/kernel/suspend_asm.S 2007-08-21 21:16:01.000000000 +0200
@@ -2,8 +2,8 @@
*
* Distribute under GPLv2.
*
- * swsusp_arch_resume may not use any stack, nor any variable that is
- * not "NoSave" during copying pages:
+ * swsusp_arch_resume must not use any stack or any nonlocal variables while
+ * copying pages:
*
* Its rewriting one kernel image with another. What is stack in "old"
* image could very well be data page in "new" image, and overwriting
@@ -36,10 +36,20 @@ ENTRY(swsusp_arch_suspend)
pushfq
popq pt_regs_eflags(%rax)
+ /* save the address of restore_registers */
+ movq $restore_registers, %rax
+ movq %rax, restore_jump_address(%rip)
+
call swsusp_save
ret
ENTRY(restore_image)
+ /* compute the address of the page we are at and store it in R9 */
+ movq $(restore_image - __START_KERNEL_map), %rax
+ movq $__PAGE_OFFSET, %r9
+ addq %rax, %r9
+ andq $PAGE_MASK, %r9
+
/* switch to temporary page tables */
movq $__PAGE_OFFSET, %rdx
movq temp_level4_pgt(%rip), %rax
@@ -54,6 +64,11 @@ ENTRY(restore_image)
movq %rcx, %cr3;
movq %rax, %cr4; # turn PGE back on
+ /* prepare to jump to the image kernel */
+ movq restore_jump_address(%rip), %rax
+
+ /* copy image data to their original locations */
+ xorq %r10, %r10
movq restore_pblist(%rip), %rdx
loop:
testq %rdx, %rdx
@@ -62,16 +77,46 @@ loop:
/* get addresses from the pbe and copy the page */
movq pbe_address(%rdx), %rsi
movq pbe_orig_address(%rdx), %rdi
- movq $512, %rcx
+ /* skip the page we are at (address stored in R9) */
+ cmpq %rdi, %r9
+ jne 1f
+ /* save the address of the data to be copied to the skipped page */
+ movq %rsi, %r10
+ jmp 2f
+1: movq $(PAGE_SIZE >> 3), %rcx
rep
movsq
/* progress to the next pbe */
- movq pbe_next(%rdx), %rdx
+2: movq pbe_next(%rdx), %rdx
jmp loop
done:
+ /* jump to the restore_registers address from the image header */
+ jmpq *%rax
+ /*
+ * NOTE: This assumes that the boot kernel's text mapping covers the
+ * image kernel's page containing restore_registers and the address of
+ * this page is the same as in the image kernel's text mapping (it
+ * should always be true, because the text mapping is linear, starting
+ * from 0, and is supposed to cover the entire kernel text for every
+ * kernel).
+ */
+
+.balign PAGE_SIZE
+ /* code below belongs to the image kernel */
+ENTRY(restore_registers)
+ /* check if we have one more image page to copy */
+ testq %r10, %r10
+ jz 1f
+ /* copy the skipped page */
+ movq %r10, %rsi
+ movq %r9, %rdi
+ movq $(PAGE_SIZE >> 3), %rcx
+ rep
+ movsq
+
/* go back to the original page tables */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
+1: movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
movq %rax, %cr3
@@ -84,10 +129,7 @@ done:
movq %rcx, %cr3
movq %rax, %cr4; # turn PGE back on
- movl $24, %eax
- movl %eax, %ds
-
- /* We don't restore %rax, it must be 0 anyway */
+ /* restore GPRs (we don't restore %rax, it must be 0 anyway) */
movq $saved_context, %rax
movq pt_regs_rsp(%rax), %rsp
movq pt_regs_rbp(%rax), %rbp
@@ -109,4 +151,7 @@ done:
xorq %rax, %rax
+ /* tell the hibernation core that we've just restored the memory */
+ movq %rax, in_suspend(%rip)
+
ret
Index: linux-2.6.23-rc3/arch/x86_64/kernel/suspend.c
===================================================================
--- linux-2.6.23-rc3.orig/arch/x86_64/kernel/suspend.c 2007-08-21 20:36:49.000000000 +0200
+++ linux-2.6.23-rc3/arch/x86_64/kernel/suspend.c 2007-08-21 21:07:45.000000000 +0200
@@ -144,6 +144,12 @@ void fix_processor_context(void)
/* Defined in arch/x86_64/kernel/suspend_asm.S */
extern int restore_image(void);
+/*
+ * Address to jump to in the last phase of restore in order to get to the image
+ * kernel's text (this value is passed in the image header).
+ */
+unsigned long restore_jump_address;
+
pgd_t *temp_level4_pgt;
static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
@@ -230,4 +236,41 @@ int pfn_is_nosave(unsigned long pfn)
unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
+
+struct restore_data_record {
+ unsigned long jump_address;
+ unsigned long control;
+};
+
+#define RESTORE_MAGIC 0x0123456789ABCDEFUL
+
+/**
+ * arch_hibernation_header_save - populate the architecture specific part
+ * of a hibernation image header
+ * @addr: address to save the data at
+ */
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+ struct restore_data_record *rdr = addr;
+
+ if (max_size < sizeof(struct restore_data_record))
+ return -EOVERFLOW;
+ rdr->jump_address = restore_jump_address;
+ rdr->control = (restore_jump_address ^ RESTORE_MAGIC);
+ return 0;
+}
+
+/**
+ * arch_hibernation_header_restore - read the architecture specific data
+ * from the hibernation image header
+ * @addr: address to read the data from
+ */
+int arch_hibernation_header_restore(void *addr)
+{
+ struct restore_data_record *rdr = addr;
+
+ restore_jump_address = rdr->jump_address;
+ return (rdr->control == (restore_jump_address ^ RESTORE_MAGIC)) ?
+ 0 : -EINVAL;
+}
#endif /* CONFIG_HIBERNATION */
Index: linux-2.6.23-rc3/include/asm-x86_64/suspend.h
===================================================================
--- linux-2.6.23-rc3.orig/include/asm-x86_64/suspend.h 2007-08-21 20:36:49.000000000 +0200
+++ linux-2.6.23-rc3/include/asm-x86_64/suspend.h 2007-08-21 20:37:47.000000000 +0200
@@ -43,4 +43,10 @@ extern void fix_processor_context(void);
/* routines for saving/restoring kernel state */
extern int acpi_save_state_mem(void);
+#define ARCH_HAS_HIBERNATION_HEADER
+
+/* arch/x86_64/kernel/suspend.c */
+extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
+extern int arch_hibernation_header_restore(void *addr);
+
#endif /* __ASM_X86_64_SUSPEND_H */
On Fri, 2007-08-24 at 12:11 +0200, Rafael J. Wysocki wrote:
> The idea is [...]
Shouldn't that actually be added into some generic (non-x86-64) doc
file?
johannes
On Friday, 24 August 2007 12:59, Johannes Berg wrote:
> On Fri, 2007-08-24 at 12:11 +0200, Rafael J. Wysocki wrote:
>
> > The idea is [...]
>
> Shouldn't that actually be added into some generic (non-x86-64) doc
> file?
Yes, I will update the documentation in the future.
Greetings,
Rafael
On Fri, 24 Aug 2007 12:11:54 +0200
"Rafael J. Wysocki" <[email protected]> wrote:
> Index: linux-2.6.23-rc3/include/asm-x86_64/suspend.h
> ===================================================================
> --- linux-2.6.23-rc3.orig/include/asm-x86_64/suspend.h 2007-08-21 20:36:49.000000000 +0200
> +++ linux-2.6.23-rc3/include/asm-x86_64/suspend.h 2007-08-21 20:37:47.000000000 +0200
> @@ -43,4 +43,10 @@ extern void fix_processor_context(void);
> /* routines for saving/restoring kernel state */
> extern int acpi_save_state_mem(void);
>
> +#define ARCH_HAS_HIBERNATION_HEADER
The preferred way of doing this is via Kconfig, please. ie: add a
CONFIG_HIBERNATION_HEADER to arch/x86_64/Kconfig.
> +
> +/* arch/x86_64/kernel/suspend.c */
> +extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
> +extern int arch_hibernation_header_restore(void *addr);
Given that these are called from non-arch-specific code, they must have the
same signature across all architectures. So there's no point in putting
the prototypes into an arch-specific header file.
It would be better to do something like this in (say) suspend.h:
#ifdef CONFIG_HIBERNATION_HEADER
extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
extern int arch_hibernation_header_restore(void *addr);
#else
static inline int arch_hibernation_header_save(void *addr,
unsigned int max_size)
{
return 0;
}
static inline int arch_hibernation_header_restore(void *addr)
{
return 0;
}
#endif
then go nuke some ifdefs from the .c files.
Hi!
> From: Rafael J. Wysocki <[email protected]>
>
> Make it possible to restore a hibernation image on x86_64 with the help of a
> kernel different from the one in the image.
>
> The idea is to split the core restoration code into two separate parts and to
> place each of them in a different page.  The first part belongs to the boot
What happens in case where both parts want to be
at the same place? (Like kernel being restored is 4KB smaller, so that
routines now collide?)
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Friday, 24 August 2007 22:46, Pavel Machek wrote:
> Hi!
>
> > From: Rafael J. Wysocki <[email protected]>
> >
> > Make it possible to restore a hibernation image on x86_64 with the help of a
> > kernel different from the one in the image.
> >
> > The idea is to split the core restoration code into two separate parts and to
> > place each of them in a different page.  The first part belongs to the boot
>
> What happens in case where both parts want to be
> at the same place? (Like kernel being restored is 4KB smaller, so that
> routines now collide?)
Bad things, but I can't see how to avoid that reliably.
Greetings,
Rafael
On Sat, 25 Aug 2007, Rafael J. Wysocki wrote:
> On Friday, 24 August 2007 22:46, Pavel Machek wrote:
>> Hi!
>>
>>> From: Rafael J. Wysocki <[email protected]>
>>>
>>> Make it possible to restore a hibernation image on x86_64 with the help of a
>>> kernel different from the one in the image.
>>>
>>> The idea is to split the core restoration code into two separate parts and to
>>> place each of them in a different page.  The first part belongs to the boot
>>
>> What happens in case where both parts want to be
>> at the same place? (Like kernel being restored is 4KB smaller, so that
>> routines now collide?)
>
> Bad things, but I can't see how to avoid that reliably.
can you at least detect it reliably? (feed a program both kernel images
and have it tell you 'yes/no')
David Lang
On Saturday, 25 August 2007 20:32, [email protected] wrote:
> On Sat, 25 Aug 2007, Rafael J. Wysocki wrote:
>
> > On Friday, 24 August 2007 22:46, Pavel Machek wrote:
> >> Hi!
> >>
> >>> From: Rafael J. Wysocki <[email protected]>
> >>>
> >>> Make it possible to restore a hibernation image on x86_64 with the help of a
> >>> kernel different from the one in the image.
> >>>
> >>> The idea is to split the core restoration code into two separate parts and to
> >>> place each of them in a different page.  The first part belongs to the boot
> >>
> >> What happens in case where both parts want to be
> >> at the same place? (Like kernel being restored is 4KB smaller, so that
> >> routines now collide?)
> >
> > Bad things, but I can't see how to avoid that reliably.
>
> can you at least detect it reliably? (feed a program both kernel images
> and have it tell you 'yes/no')
Well, I have an idea how to handle that, but I need to test it. Stay tuned. :-)
Greetings,
Rafael
On Saturday, 25 August 2007 01:23, Andrew Morton wrote:
> On Fri, 24 Aug 2007 12:11:54 +0200
> "Rafael J. Wysocki" <[email protected]> wrote:
>
> > Index: linux-2.6.23-rc3/include/asm-x86_64/suspend.h
> > ===================================================================
> > --- linux-2.6.23-rc3.orig/include/asm-x86_64/suspend.h 2007-08-21 20:36:49.000000000 +0200
> > +++ linux-2.6.23-rc3/include/asm-x86_64/suspend.h 2007-08-21 20:37:47.000000000 +0200
> > @@ -43,4 +43,10 @@ extern void fix_processor_context(void);
> > /* routines for saving/restoring kernel state */
> > extern int acpi_save_state_mem(void);
> >
> > +#define ARCH_HAS_HIBERNATION_HEADER
>
> The preferred way of doing this is via Kconfig, please. ie: add a
> CONFIG_HIBERNATION_HEADER to arch/x86_64/Kconfig.
OK
> > +
> > +/* arch/x86_64/kernel/suspend.c */
> > +extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
> > +extern int arch_hibernation_header_restore(void *addr);
>
> Given that these are called from non-arch-specific code, they must have the
> same signature across all architectures. So there's no point in putting
> the prototypes into an arch-specific header file.
>
> It would be better to do something like this in (say) suspend.h:
>
> #ifdef CONFIG_HIBERNATION_HEADER
> extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
> extern int arch_hibernation_header_restore(void *addr);
> #else
> static inline int arch_hibernation_header_save(void *addr,
> unsigned int max_size)
> {
> return 0;
> }
>
> static inline int arch_hibernation_header_restore(void *addr)
> {
> return 0;
> }
> #endif
>
> then go nuke some ifdefs from the .c files.
The ifdefs in snapshot.c are necessary anyway, because they are around some
code that is only compiled when the CONFIG_HIBERNATION_HEADER is undefined.
I'll post the reworked patches in a new thread once again after the other issue
raised by Pavel gets settled.
On Saturday, 25 August 2007 20:27, Rafael J. Wysocki wrote:
> On Friday, 24 August 2007 22:46, Pavel Machek wrote:
> > Hi!
> >
> > > From: Rafael J. Wysocki <[email protected]>
> > >
> > > Make it possible to restore a hibernation image on x86_64 with the help of a
> > > kernel different from the one in the image.
> > >
> > > The idea is to split the core restoration code into two separate parts and to
> > > place each of them in a different page.  The first part belongs to the boot
> >
> > What happens in case where both parts want to be
> > at the same place? (Like kernel being restored is 4KB smaller, so that
> > routines now collide?)
>
> Bad things, but I can't see how to avoid that reliably.
Below is an analogous patch without this problem.  The slightly ugly thing
about it is that all pages in the temporary mapping have the NX bit cleared
now, so that we can run some code out of one of them.  Still, IMO, that isn't
really important, because the temporary page tables are dropped as soon as
we jump to restore_registers.
Greetings,
Rafael
---
From: Rafael J. Wysocki <[email protected]>
Make it possible to restore a hibernation image on x86_64 with the help of a
kernel different from the one in the image.
The idea is to split the core restoration code into two separate parts and to
place each of them in a different page.  The first part belongs to the boot
kernel and is executed as the last step of the image kernel's memory restoration
procedure.  Before being executed, it is relocated to a safe page that won't be
overwritten while copying the image kernel pages.
The final operation performed by it is a jump to the second part of the core
restoration code that belongs to the image kernel and has just been restored.
This code makes the CPU switch to the image kernel's page tables and
restores the state of general purpose registers (including the stack pointer)
from before the hibernation.
The main issue with this idea is that in order to jump to the second part of the
core restoration code the boot kernel needs to know its address.  However, this
address may be passed to it in the image header.  Namely, the part of the image
header previously used for checking if the version of the image kernel is
correct can be replaced with some architecture specific data that will allow
the boot kernel to jump to the right address within the image kernel.  These
data should also be used for checking if the image kernel is compatible with
the boot kernel (as far as the memory restoration procedure is concerned).
It can be done, for example, with the help of a "magic" value that has to be
equal in both kernels, so that they can be regarded as compatible.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
arch/x86_64/Kconfig | 5 +++
arch/x86_64/kernel/suspend.c | 54 ++++++++++++++++++++++++++++++++++++++-
arch/x86_64/kernel/suspend_asm.S | 41 ++++++++++++++++++++++++-----
include/asm-x86_64/suspend.h | 3 ++
4 files changed, 95 insertions(+), 8 deletions(-)
Index: linux-2.6.23-rc3/arch/x86_64/kernel/suspend_asm.S
===================================================================
--- linux-2.6.23-rc3.orig/arch/x86_64/kernel/suspend_asm.S 2007-08-25 22:09:53.000000000 +0200
+++ linux-2.6.23-rc3/arch/x86_64/kernel/suspend_asm.S 2007-08-25 22:10:25.000000000 +0200
@@ -2,8 +2,8 @@
*
* Distribute under GPLv2.
*
- * swsusp_arch_resume may not use any stack, nor any variable that is
- * not "NoSave" during copying pages:
+ * swsusp_arch_resume must not use any stack or any nonlocal variables while
+ * copying pages:
*
* Its rewriting one kernel image with another. What is stack in "old"
* image could very well be data page in "new" image, and overwriting
@@ -36,6 +36,10 @@ ENTRY(swsusp_arch_suspend)
pushfq
popq pt_regs_eflags(%rax)
+ /* save the address of restore_registers */
+ movq $restore_registers, %rax
+ movq %rax, restore_jump_address(%rip)
+
call swsusp_save
ret
@@ -54,7 +58,16 @@ ENTRY(restore_image)
movq %rcx, %cr3;
movq %rax, %cr4; # turn PGE back on
+ /* prepare to jump to the image kernel */
+ movq restore_jump_address(%rip), %rax
+
+ /* prepare to copy image data to their original locations */
movq restore_pblist(%rip), %rdx
+ movq relocated_restore_code(%rip), %rcx
+ jmpq *%rcx
+
+ /* code below has been relocated to a safe page */
+ENTRY(core_restore_code)
loop:
testq %rdx, %rdx
jz done
@@ -62,7 +75,7 @@ loop:
/* get addresses from the pbe and copy the page */
movq pbe_address(%rdx), %rsi
movq pbe_orig_address(%rdx), %rdi
- movq $512, %rcx
+ movq $(PAGE_SIZE >> 3), %rcx
rep
movsq
@@ -70,6 +83,20 @@ loop:
movq pbe_next(%rdx), %rdx
jmp loop
done:
+ /* jump to the restore_registers address from the image header */
+ jmpq *%rax
+ /*
+ * NOTE: This assumes that the boot kernel's text mapping covers the
+ * image kernel's page containing restore_registers and the address of
+ * this page is the same as in the image kernel's text mapping (it
+ * should always be true, because the text mapping is linear, starting
+ * from 0, and is supposed to cover the entire kernel text for every
+ * kernel).
+ *
+ * code below belongs to the image kernel
+ */
+
+ENTRY(restore_registers)
/* go back to the original page tables */
movq $(init_level4_pgt - __START_KERNEL_map), %rax
addq phys_base(%rip), %rax
@@ -84,10 +111,7 @@ done:
movq %rcx, %cr3
movq %rax, %cr4; # turn PGE back on
- movl $24, %eax
- movl %eax, %ds
-
- /* We don't restore %rax, it must be 0 anyway */
+ /* restore GPRs (we don't restore %rax, it must be 0 anyway) */
movq $saved_context, %rax
movq pt_regs_rsp(%rax), %rsp
movq pt_regs_rbp(%rax), %rbp
@@ -109,4 +133,7 @@ done:
xorq %rax, %rax
+ /* tell the hibernation core that we've just restored the memory */
+ movq %rax, in_suspend(%rip)
+
ret
Index: linux-2.6.23-rc3/arch/x86_64/kernel/suspend.c
===================================================================
--- linux-2.6.23-rc3.orig/arch/x86_64/kernel/suspend.c 2007-08-25 22:09:53.000000000 +0200
+++ linux-2.6.23-rc3/arch/x86_64/kernel/suspend.c 2007-08-25 22:11:43.000000000 +0200
@@ -144,8 +144,16 @@ void fix_processor_context(void)
/* Defined in arch/x86_64/kernel/suspend_asm.S */
extern int restore_image(void);
+/*
+ * Address to jump to in the last phase of restore in order to get to the image
+ * kernel's text (this value is passed in the image header).
+ */
+unsigned long restore_jump_address;
+
pgd_t *temp_level4_pgt;
+void *relocated_restore_code;
+
static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
long i, j;
@@ -169,7 +177,7 @@ static int res_phys_pud_init(pud_t *pud,
if (paddr >= end)
break;
- pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+ pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
pe &= __supported_pte_mask;
set_pmd(pmd, __pmd(pe));
}
@@ -216,6 +224,13 @@ int swsusp_arch_resume(void)
/* We have got enough memory and from now on we cannot recover */
if ((error = set_up_temporary_mappings()))
return error;
+
+ relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC);
+ if (!relocated_restore_code)
+ return -ENOMEM;
+ memcpy(relocated_restore_code, &core_restore_code,
+ &restore_registers - &core_restore_code);
+
restore_image();
return 0;
}
@@ -230,4 +245,41 @@ int pfn_is_nosave(unsigned long pfn)
unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
+
+struct restore_data_record {
+ unsigned long jump_address;
+ unsigned long control;
+};
+
+#define RESTORE_MAGIC 0x0123456789ABCDEFUL
+
+/**
+ * arch_hibernation_header_save - populate the architecture specific part
+ * of a hibernation image header
+ * @addr: address to save the data at
+ */
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+ struct restore_data_record *rdr = addr;
+
+ if (max_size < sizeof(struct restore_data_record))
+ return -EOVERFLOW;
+ rdr->jump_address = restore_jump_address;
+ rdr->control = (restore_jump_address ^ RESTORE_MAGIC);
+ return 0;
+}
+
+/**
+ * arch_hibernation_header_restore - read the architecture specific data
+ * from the hibernation image header
+ * @addr: address to read the data from
+ */
+int arch_hibernation_header_restore(void *addr)
+{
+ struct restore_data_record *rdr = addr;
+
+ restore_jump_address = rdr->jump_address;
+ return (rdr->control == (restore_jump_address ^ RESTORE_MAGIC)) ?
+ 0 : -EINVAL;
+}
#endif /* CONFIG_HIBERNATION */
Index: linux-2.6.23-rc3/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.23-rc3.orig/arch/x86_64/Kconfig 2007-08-25 22:09:53.000000000 +0200
+++ linux-2.6.23-rc3/arch/x86_64/Kconfig 2007-08-25 22:10:25.000000000 +0200
@@ -710,6 +710,11 @@ menu "Power management options"
source kernel/power/Kconfig
+config ARCH_HIBERNATION_HEADER
+ bool
+ depends on HIBERNATION
+ default y
+
source "drivers/acpi/Kconfig"
source "arch/x86_64/kernel/cpufreq/Kconfig"
Index: linux-2.6.23-rc3/include/asm-x86_64/suspend.h
===================================================================
--- linux-2.6.23-rc3.orig/include/asm-x86_64/suspend.h 2007-08-25 22:09:53.000000000 +0200
+++ linux-2.6.23-rc3/include/asm-x86_64/suspend.h 2007-08-25 22:10:25.000000000 +0200
@@ -43,4 +43,7 @@ extern void fix_processor_context(void);
/* routines for saving/restoring kernel state */
extern int acpi_save_state_mem(void);
+extern char core_restore_code;
+extern char restore_registers;
+
#endif /* __ASM_X86_64_SUSPEND_H */
On Fri, 2007-08-24 at 16:23 -0700, Andrew Morton wrote:
> The preferred way of doing this is via Kconfig, please. ie: add a
> CONFIG_HIBERNATION_HEADER to arch/x86_64/Kconfig.
> It would be better to do something like this in (say) suspend.h:
>
> #ifdef CONFIG_HIBERNATION_HEADER
> extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
> extern int arch_hibernation_header_restore(void *addr);
> #else
> static inline int arch_hibernation_header_save(void *addr,
In fact, I guess we don't need to bother with this at all. The generic
code for doing this (via the utsname based header) is tiny, so as far as
I can tell it could just be made weak symbols (by this I mean
init_header_complete() and check_image_kernel()), and then all the
#ifdefs can just go.
johannes
On Saturday, 25 August 2007 21:13, Johannes Berg wrote:
> On Fri, 2007-08-24 at 16:23 -0700, Andrew Morton wrote:
>
> > The preferred way of doing this is via Kconfig, please. ie: add a
> > CONFIG_HIBERNATION_HEADER to arch/x86_64/Kconfig.
>
> > It would be better to do something like this in (say) suspend.h:
> >
> > #ifdef CONFIG_HIBERNATION_HEADER
> > extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
> > extern int arch_hibernation_header_restore(void *addr);
> > #else
> > static inline int arch_hibernation_header_save(void *addr,
>
> In fact, I guess we don't need to bother with this at all. The generic
> code for doing this (via the utsname based header) is tiny, so as far as
> I can tell it could just be made weak symbols (by this I mean
> init_header_complete() and check_image_kernel()), and then all the
> #ifdefs can just go.
Well, I don't like the "weak symbols" stuff, but I have managed to limit the
number of additional #ifdefs in snapshot.c to just one.
The "generic" patch is now the following:
---
From: Rafael J. Wysocki <[email protected]>
Add the bits needed for supporting arbitrary boot kernels to the common
hibernation code.
To support arbitrary boot kernels, make it possible to replace the 'struct
new_utsname' and the kernel version in the hibernation image header by some
architecture specific data that will be used to verify if the image is valid
and to restore the image.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
kernel/power/power.h | 20 +++++++++++++++++-
kernel/power/snapshot.c | 53 +++++++++++++++++++++++++++++++-----------------
2 files changed, 54 insertions(+), 19 deletions(-)
Index: linux-2.6.23-rc3/kernel/power/power.h
===================================================================
--- linux-2.6.23-rc3.orig/kernel/power/power.h 2007-08-23 23:13:34.000000000 +0200
+++ linux-2.6.23-rc3/kernel/power/power.h 2007-08-25 21:18:59.000000000 +0200
@@ -11,14 +11,32 @@ struct swsusp_info {
unsigned long size;
} __attribute__((aligned(PAGE_SIZE)));
+#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ARCH_HIBERNATION_HEADER
+/* Maximum size of architecture specific data in a hibernation header */
+#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
+extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
+extern int arch_hibernation_header_restore(void *addr);
+
+static inline int init_header_complete(struct swsusp_info *info)
+{
+ return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
+}
+
+static inline char *check_image_kernel(struct swsusp_info *info)
+{
+ return arch_hibernation_header_restore(info) ?
+ "architecture specific data" : NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-#ifdef CONFIG_HIBERNATION
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
*/
#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+
/*
* Keep 1 MB of memory free so that device drivers can allocate some pages in
* their .suspend() routines without breaking the suspend to disk.
Index: linux-2.6.23-rc3/kernel/power/snapshot.c
===================================================================
--- linux-2.6.23-rc3.orig/kernel/power/snapshot.c 2007-08-23 23:13:34.000000000 +0200
+++ linux-2.6.23-rc3/kernel/power/snapshot.c 2007-08-25 21:21:55.000000000 +0200
@@ -1239,17 +1239,39 @@ asmlinkage int swsusp_save(void)
return 0;
}
-static void init_header(struct swsusp_info *info)
+#ifndef CONFIG_ARCH_HIBERNATION_HEADER
+static int init_header_complete(struct swsusp_info *info)
{
- memset(info, 0, sizeof(struct swsusp_info));
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
info->version_code = LINUX_VERSION_CODE;
+ return 0;
+}
+
+static char *check_image_kernel(struct swsusp_info *info)
+{
+ if (info->version_code != LINUX_VERSION_CODE)
+ return "kernel version";
+ if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ return "system type";
+ if (strcmp(info->uts.release,init_utsname()->release))
+ return "kernel release";
+ if (strcmp(info->uts.version,init_utsname()->version))
+ return "version";
+ if (strcmp(info->uts.machine,init_utsname()->machine))
+ return "machine";
+ return NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+
+static int init_header(struct swsusp_info *info)
+{
+ memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = num_physpages;
- memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
- info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
info->size = info->pages;
info->size <<= PAGE_SHIFT;
+ return init_header_complete(info);
}
/**
@@ -1303,7 +1325,11 @@ int snapshot_read_next(struct snapshot_h
return -ENOMEM;
}
if (!handle->offset) {
- init_header((struct swsusp_info *)buffer);
+ int error;
+
+ error = init_header((struct swsusp_info *)buffer);
+ if (error)
+ return error;
handle->buffer = buffer;
memory_bm_position_reset(&orig_bm);
memory_bm_position_reset(&copy_bm);
@@ -1394,22 +1420,13 @@ duplicate_memory_bitmap(struct memory_bi
}
}
-static inline int check_header(struct swsusp_info *info)
+static int check_header(struct swsusp_info *info)
{
- char *reason = NULL;
+ char *reason;
- if (info->version_code != LINUX_VERSION_CODE)
- reason = "kernel version";
- if (info->num_physpages != num_physpages)
+ reason = check_image_kernel(info);
+ if (!reason && info->num_physpages != num_physpages)
reason = "memory size";
- if (strcmp(info->uts.sysname,init_utsname()->sysname))
- reason = "system type";
- if (strcmp(info->uts.release,init_utsname()->release))
- reason = "kernel release";
- if (strcmp(info->uts.version,init_utsname()->version))
- reason = "version";
- if (strcmp(info->uts.machine,init_utsname()->machine))
- reason = "machine";
if (reason) {
printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
return -EPERM;
On Mon, 2007-08-27 at 13:06 +0200, Rafael J. Wysocki wrote:
> Well, I don't like the "weak symbols" stuff, but I have managed to limit the
> number of additional #ifdefs in snapshot.c to just one.
>
> The "generic" patch is now the following:
Fine with me, I was just throwing out ideas anyway :)
johannes
On Sat 2007-08-25 22:42:05, Rafael J. Wysocki wrote:
> On Saturday, 25 August 2007 20:27, Rafael J. Wysocki wrote:
> > On Friday, 24 August 2007 22:46, Pavel Machek wrote:
> > > Hi!
> > >
> > > > From: Rafael J. Wysocki <[email protected]>
> > > >
> > > > Make it possible to restore a hibernation image on x86_64 with the help of a
> > > > kernel different from the one in the image.
> > > >
> > > > The idea is to split the core restoration code into two separate parts and to
> > > > place each of them in a different page. The first part belongs to the boot
> > >
> > > What happens in case where both parts want to be
> > > at the same place? (Like kernel being restored is 4KB smaller, so that
> > > routines now collide?)
> >
> > Bad things, but I can't see how to avoid that reliably.
>
> Below is an analogous patch without this problem. The slightly ugly thing
> about it is that all pages in the temporary mapping have the NX bit cleared
> now, so that we can run some code out of one of them. Still, IMO, that isn't
> really important, because the temporary page tables are dropped as soon as
> we jump to restore_registers.
>
> Greetings,
> Rafael
>
> ---
> From: Rafael J. Wysocki <[email protected]>
>
> Make it possible to restore a hibernation image on x86_64 with the help of a
> kernel different from the one in the image.
>
> The idea is to split the core restoration code into two separate parts and to
> place each of them in a different page. The first part belongs to the boot
> kernel and is executed as the last step of the image kernel's memory restoration
> procedure. Before being executed, it is relocated to a safe page that won't be
> overwritten while copying the image kernel pages.
>
> The final operation performed by it is a jump to the second part of the core
> restoration code that belongs to the image kernel and has just been restored.
> This code makes the CPU switch to the image kernel's page tables and
> restores the state of general purpose registers (including the stack pointer)
> from before the hibernation.
>
> The main issue with this idea is that in order to jump to the second part of the
> core restoration code the boot kernel needs to know its address. However, this
> address may be passed to it in the image header. Namely, the part of the image
> header previously used for checking if the version of the image kernel is
> correct can be replaced with some architecture specific data that will allow
> the boot kernel to jump to the right address within the image kernel. These
> data should also be used for checking if the image kernel is compatible with
> the boot kernel (as far as the memory restoration procedure is concerned).
> It can be done, for example, with the help of a "magic" value that has to be
> equal in both kernels, so that they can be regarded as compatible.
>
> Signed-off-by: Rafael J. Wysocki <[email protected]>
ACK.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
[Sorry for the duplicate, but my mailer decided to mess up the CC list for an
unknown reason.]
On Saturday, 25 August 2007 21:13, Johannes Berg wrote:
> On Fri, 2007-08-24 at 16:23 -0700, Andrew Morton wrote:
>
> > The preferred way of doing this is via Kconfig, please. ie: add a
> > CONFIG_HIBERNATION_HEADER to arch/x86_64/Kconfig.
>
> > It would be better to do something like this in (say) suspend.h:
> >
> > #ifdef CONFIG_HIBERNATION_HEADER
> > extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
> > extern int arch_hibernation_header_restore(void *addr);
> > #else
> > static inline int arch_hibernation_header_save(void *addr,
>
> In fact, I guess we don't need to bother with this at all. The generic
> code for doing this (via the utsname based header) is tiny, so as far as
> I can tell it could just be made weak symbols (by this I mean
> init_header_complete() and check_image_kernel()), and then all the
> #ifdefs can just go.
Well, I don't like the "weak symbols" stuff, but I have managed to limit the
number of additional #ifdefs in snapshot.c to just one.
The "generic" patch is now the following:
---
From: Rafael J. Wysocki <[email protected]>
Add the bits needed for supporting arbitrary boot kernels to the common
hibernation code.
To support arbitrary boot kernels, make it possible to replace the 'struct
new_utsname' and the kernel version in the hibernation image header by some
architecture specific data that will be used to verify if the image is valid
and to restore the image.
Signed-off-by: Rafael J. Wysocki <[email protected]>
---
kernel/power/power.h | 20 +++++++++++++++++-
kernel/power/snapshot.c | 53 +++++++++++++++++++++++++++++++-----------------
2 files changed, 54 insertions(+), 19 deletions(-)
Index: linux-2.6.23-rc3/kernel/power/power.h
===================================================================
--- linux-2.6.23-rc3.orig/kernel/power/power.h 2007-08-23 23:13:34.000000000 +0200
+++ linux-2.6.23-rc3/kernel/power/power.h 2007-08-25 21:18:59.000000000 +0200
@@ -11,14 +11,32 @@ struct swsusp_info {
unsigned long size;
} __attribute__((aligned(PAGE_SIZE)));
+#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ARCH_HIBERNATION_HEADER
+/* Maximum size of architecture specific data in a hibernation header */
+#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
+extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
+extern int arch_hibernation_header_restore(void *addr);
+
+static inline int init_header_complete(struct swsusp_info *info)
+{
+ return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
+}
+
+static inline char *check_image_kernel(struct swsusp_info *info)
+{
+ return arch_hibernation_header_restore(info) ?
+ "architecture specific data" : NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
-#ifdef CONFIG_HIBERNATION
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
*/
#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+
/*
* Keep 1 MB of memory free so that device drivers can allocate some pages in
* their .suspend() routines without breaking the suspend to disk.
Index: linux-2.6.23-rc3/kernel/power/snapshot.c
===================================================================
--- linux-2.6.23-rc3.orig/kernel/power/snapshot.c 2007-08-23 23:13:34.000000000 +0200
+++ linux-2.6.23-rc3/kernel/power/snapshot.c 2007-08-25 21:21:55.000000000 +0200
@@ -1239,17 +1239,39 @@ asmlinkage int swsusp_save(void)
return 0;
}
-static void init_header(struct swsusp_info *info)
+#ifndef CONFIG_ARCH_HIBERNATION_HEADER
+static int init_header_complete(struct swsusp_info *info)
{
- memset(info, 0, sizeof(struct swsusp_info));
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
info->version_code = LINUX_VERSION_CODE;
+ return 0;
+}
+
+static char *check_image_kernel(struct swsusp_info *info)
+{
+ if (info->version_code != LINUX_VERSION_CODE)
+ return "kernel version";
+ if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ return "system type";
+ if (strcmp(info->uts.release,init_utsname()->release))
+ return "kernel release";
+ if (strcmp(info->uts.version,init_utsname()->version))
+ return "version";
+ if (strcmp(info->uts.machine,init_utsname()->machine))
+ return "machine";
+ return NULL;
+}
+#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+
+static int init_header(struct swsusp_info *info)
+{
+ memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = num_physpages;
- memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
- info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
info->size = info->pages;
info->size <<= PAGE_SHIFT;
+ return init_header_complete(info);
}
/**
@@ -1303,7 +1325,11 @@ int snapshot_read_next(struct snapshot_h
return -ENOMEM;
}
if (!handle->offset) {
- init_header((struct swsusp_info *)buffer);
+ int error;
+
+ error = init_header((struct swsusp_info *)buffer);
+ if (error)
+ return error;
handle->buffer = buffer;
memory_bm_position_reset(&orig_bm);
memory_bm_position_reset(&copy_bm);
@@ -1394,22 +1420,13 @@ duplicate_memory_bitmap(struct memory_bi
}
}
-static inline int check_header(struct swsusp_info *info)
+static int check_header(struct swsusp_info *info)
{
- char *reason = NULL;
+ char *reason;
- if (info->version_code != LINUX_VERSION_CODE)
- reason = "kernel version";
- if (info->num_physpages != num_physpages)
+ reason = check_image_kernel(info);
+ if (!reason && info->num_physpages != num_physpages)
reason = "memory size";
- if (strcmp(info->uts.sysname,init_utsname()->sysname))
- reason = "system type";
- if (strcmp(info->uts.release,init_utsname()->release))
- reason = "kernel release";
- if (strcmp(info->uts.version,init_utsname()->version))
- reason = "version";
- if (strcmp(info->uts.machine,init_utsname()->machine))
- reason = "machine";
if (reason) {
printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
return -EPERM;