2022-07-25 09:15:45

by 黄杰

[permalink] [raw]
Subject: [PATCH 0/4] faster kexec reboot

From: "huangjie.albert" <[email protected]>

In many time-sensitive scenarios, we need a shorter time to restart
the kernel. However, the current kexec fast restart path performs
many memory copy, verification and decompression operations, which
take more than 500ms. With the following patch series,
machine_kexec-->start_kernel takes only 15ms.

How to measure time:

c code:
uint64_t current_cycles(void)
{
uint32_t low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return ((uint64_t)low) | ((uint64_t)high << 32);
}
assembly code:
pushq %rax
pushq %rdx
rdtsc
mov %eax,%eax
shl $0x20,%rdx
or %rax,%rdx
movq %rdx,0x840(%r14)
popq %rdx
popq %rax
The timestamps can be stored in boot_params or the kexec control page,
so we can read all of them back after the kernel boots up.

huangjie.albert (4):
kexec: reuse crash kernel reserved memory for normal kexec
kexec: add CONFIG_KEXEC_PURGATORY_SKIP_SIG
x86: Support the uncompressed kernel to speed up booting
x86: boot: avoid memory copy if kernel is uncompressed

arch/x86/Kconfig | 10 +++++++++
arch/x86/boot/compressed/Makefile | 5 ++++-
arch/x86/boot/compressed/head_64.S | 8 +++++--
arch/x86/boot/compressed/misc.c | 35 +++++++++++++++++++++++++-----
arch/x86/purgatory/purgatory.c | 7 ++++++
include/linux/kexec.h | 9 ++++----
include/uapi/linux/kexec.h | 2 ++
kernel/kexec.c | 19 +++++++++++++++-
kernel/kexec_core.c | 16 ++++++++------
kernel/kexec_file.c | 20 +++++++++++++++--
scripts/Makefile.lib | 5 +++++
11 files changed, 114 insertions(+), 22 deletions(-)

--
2.31.1


2022-07-25 09:33:04

by 黄杰

[permalink] [raw]
Subject: [PATCH 4/4] x86: boot: avoid memory copy if kernel is uncompressed

From: "huangjie.albert" <[email protected]>

1. If the kernel is uncompressed, we do not need to relocate the
kernel image for decompression.

2. If KASLR is disabled, we do not need to do a memory copy
before parse_elf().

Two memory copies can be skipped with this patch. This can
save about 20ms during booting.

Signed-off-by: huangjie.albert <[email protected]>
---
arch/x86/boot/compressed/head_64.S | 8 ++++++--
arch/x86/boot/compressed/misc.c | 22 +++++++++++++++++-----
2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d33f060900d2..9e7770c7047b 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -398,10 +398,13 @@ SYM_CODE_START(startup_64)
1:

/* Target address to relocate to for decompression */
+#ifdef CONFIG_KERNEL_UNCOMPRESSED
+ movq %rbp, %rbx
+#else
movl BP_init_size(%rsi), %ebx
subl $ rva(_end), %ebx
addq %rbp, %rbx
-
+#endif
/* Set up the stack */
leaq rva(boot_stack_end)(%rbx), %rsp

@@ -522,6 +525,7 @@ trampoline_return:
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
+#ifndef CONFIG_KERNEL_UNCOMPRESSED
pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq rva(_bss-8)(%rbx), %rdi
@@ -531,7 +535,7 @@ trampoline_return:
rep movsq
cld
popq %rsi
-
+#endif
/*
* The GDT may get overwritten either during the copy we just did or
* during extract_kernel below. To avoid any issues, repoint the GDTR
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index c23c0f525d93..d8445562d4e9 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -290,7 +290,7 @@ static inline void handle_relocations(void *output, unsigned long output_len,
{ }
#endif

-static void parse_elf(void *output)
+static void parse_elf(void *output, void *input)
{
#ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -302,7 +302,7 @@ static void parse_elf(void *output)
void *dest;
int i;

- memcpy(&ehdr, output, sizeof(ehdr));
+ memcpy(&ehdr, input, sizeof(ehdr));
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
@@ -317,7 +317,7 @@ static void parse_elf(void *output)
if (!phdrs)
error("Failed to allocate space for phdrs");

- memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
+ memcpy(phdrs, input + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);

for (i = 0; i < ehdr.e_phnum; i++) {
phdr = &phdrs[i];
@@ -334,7 +334,7 @@ static void parse_elf(void *output)
#else
dest = (void *)(phdr->p_paddr);
#endif
- memmove(dest, output + phdr->p_offset, phdr->p_filesz);
+ memmove(dest, input + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
@@ -467,9 +467,21 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#endif

debug_putstr("\nDecompressing Linux... ");
+
+#ifdef CONFIG_KERNEL_UNCOMPRESSED
+ if (cmdline_find_option_bool("nokaslr")) {
+ parse_elf(output, input_data);
+ } else {
+ __decompress(input_data, input_len, NULL, NULL, output, output_len,
+ NULL, error);
+ parse_elf(output, output);
+ }
+#else
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
- parse_elf(output);
+ parse_elf(output, output);
+#endif
+
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");

--
2.31.1

2022-07-25 13:10:38

by 黄杰

[permalink] [raw]
Subject: Fwd: [PATCH 4/4] x86: boot: avoid memory copy if kernel is uncompressed

---------- Forwarded message ---------
发件人: Albert Huang <[email protected]>
Date: 2022年7月25日周一 16:40
Subject: [PATCH 4/4] x86: boot: avoid memory copy if kernel is uncompressed
To:
Cc: huangjie.albert <[email protected]>, Thomas Gleixner
<[email protected]>, Ingo Molnar <[email protected]>, Borislav Petkov
<[email protected]>, Dave Hansen <[email protected]>,
<[email protected]>, H. Peter Anvin <[email protected]>, Eric Biederman
<[email protected]>, Masahiro Yamada <[email protected]>,
Michal Marek <[email protected]>, Nick Desaulniers
<[email protected]>, Kirill A. Shutemov
<[email protected]>, Kuppuswamy Sathyanarayanan
<[email protected]>, Michael Roth
<[email protected]>, Nathan Chancellor <[email protected]>, Ard
Biesheuvel <[email protected]>, Mark Rutland <[email protected]>,
Sean Christopherson <[email protected]>, Peter Zijlstra
<[email protected]>, Kees Cook <[email protected]>, Tony Luck
<[email protected]>, <[email protected]>,
<[email protected]>, <[email protected]>


From: "huangjie.albert" <[email protected]>

1. If the kernel is uncompressed, we do not need to relocate the
kernel image for decompression.

2. If KASLR is disabled, we do not need to do a memory copy
before parse_elf().

Two memory copies can be skipped with this patch. This can
save about 20ms during booting.

Signed-off-by: huangjie.albert <[email protected]>
---
arch/x86/boot/compressed/head_64.S | 8 ++++++--
arch/x86/boot/compressed/misc.c | 22 +++++++++++++++++-----
2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S
b/arch/x86/boot/compressed/head_64.S
index d33f060900d2..9e7770c7047b 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -398,10 +398,13 @@ SYM_CODE_START(startup_64)
1:

/* Target address to relocate to for decompression */
+#ifdef CONFIG_KERNEL_UNCOMPRESSED
+ movq %rbp, %rbx
+#else
movl BP_init_size(%rsi), %ebx
subl $ rva(_end), %ebx
addq %rbp, %rbx
-
+#endif
/* Set up the stack */
leaq rva(boot_stack_end)(%rbx), %rsp

@@ -522,6 +525,7 @@ trampoline_return:
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
+#ifndef CONFIG_KERNEL_UNCOMPRESSED
pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq rva(_bss-8)(%rbx), %rdi
@@ -531,7 +535,7 @@ trampoline_return:
rep movsq
cld
popq %rsi
-
+#endif
/*
* The GDT may get overwritten either during the copy we just did or
* during extract_kernel below. To avoid any issues, repoint the GDTR
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index c23c0f525d93..d8445562d4e9 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -290,7 +290,7 @@ static inline void handle_relocations(void
*output, unsigned long output_len,
{ }
#endif

-static void parse_elf(void *output)
+static void parse_elf(void *output, void *input)
{
#ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -302,7 +302,7 @@ static void parse_elf(void *output)
void *dest;
int i;

- memcpy(&ehdr, output, sizeof(ehdr));
+ memcpy(&ehdr, input, sizeof(ehdr));
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
@@ -317,7 +317,7 @@ static void parse_elf(void *output)
if (!phdrs)
error("Failed to allocate space for phdrs");

- memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
+ memcpy(phdrs, input + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);

for (i = 0; i < ehdr.e_phnum; i++) {
phdr = &phdrs[i];
@@ -334,7 +334,7 @@ static void parse_elf(void *output)
#else
dest = (void *)(phdr->p_paddr);
#endif
- memmove(dest, output + phdr->p_offset, phdr->p_filesz);
+ memmove(dest, input + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
@@ -467,9 +467,21 @@ asmlinkage __visible void *extract_kernel(void
*rmode, memptr heap,
#endif

debug_putstr("\nDecompressing Linux... ");
+
+#ifdef CONFIG_KERNEL_UNCOMPRESSED
+ if (cmdline_find_option_bool("nokaslr")) {
+ parse_elf(output, input_data);
+ } else {
+ __decompress(input_data, input_len, NULL, NULL,
output, output_len,
+ NULL, error);
+ parse_elf(output, output);
+ }
+#else
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
- parse_elf(output);
+ parse_elf(output, output);
+#endif
+
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");

--
2.31.1

2022-07-25 13:41:38

by 黄杰

[permalink] [raw]
Subject: Fwd: [PATCH 0/4] faster kexec reboot

---------- Forwarded message ---------
发件人: Albert Huang <[email protected]>
Date: 2022年7月25日周一 16:39
Subject: [PATCH 0/4] faster kexec reboot
To:
Cc: huangjie.albert <[email protected]>, Thomas Gleixner
<[email protected]>, Ingo Molnar <[email protected]>, Borislav Petkov
<[email protected]>, Dave Hansen <[email protected]>,
<[email protected]>, H. Peter Anvin <[email protected]>, Eric Biederman
<[email protected]>, Masahiro Yamada <[email protected]>,
Michal Marek <[email protected]>, Nick Desaulniers
<[email protected]>, Kirill A. Shutemov
<[email protected]>, Michael Roth
<[email protected]>, Kuppuswamy Sathyanarayanan
<[email protected]>, Nathan Chancellor
<[email protected]>, Peter Zijlstra <[email protected]>, Sean
Christopherson <[email protected]>, Joerg Roedel <[email protected]>,
Mark Rutland <[email protected]>, Kees Cook
<[email protected]>, <[email protected]>,
<[email protected]>, <[email protected]>


From: "huangjie.albert" <[email protected]>

In many time-sensitive scenarios, we need a shorter time to restart
the kernel. However, the current kexec fast restart path performs
many memory copy, verification and decompression operations, which
take more than 500ms. With the following patch series,
machine_kexec-->start_kernel takes only 15ms.

How to measure time:

c code:
uint64_t current_cycles(void)
{
uint32_t low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return ((uint64_t)low) | ((uint64_t)high << 32);
}
assembly code:
pushq %rax
pushq %rdx
rdtsc
mov %eax,%eax
shl $0x20,%rdx
or %rax,%rdx
movq %rdx,0x840(%r14)
popq %rdx
popq %rax
The timestamps can be stored in boot_params or the kexec control page,
so we can read all of them back after the kernel boots up.

huangjie.albert (4):
kexec: reuse crash kernel reserved memory for normal kexec
kexec: add CONFIG_KEXEC_PURGATORY_SKIP_SIG
x86: Support the uncompressed kernel to speed up booting
x86: boot: avoid memory copy if kernel is uncompressed

arch/x86/Kconfig | 10 +++++++++
arch/x86/boot/compressed/Makefile | 5 ++++-
arch/x86/boot/compressed/head_64.S | 8 +++++--
arch/x86/boot/compressed/misc.c | 35 +++++++++++++++++++++++++-----
arch/x86/purgatory/purgatory.c | 7 ++++++
include/linux/kexec.h | 9 ++++----
include/uapi/linux/kexec.h | 2 ++
kernel/kexec.c | 19 +++++++++++++++-
kernel/kexec_core.c | 16 ++++++++------
kernel/kexec_file.c | 20 +++++++++++++++--
scripts/Makefile.lib | 5 +++++
11 files changed, 114 insertions(+), 22 deletions(-)

--
2.31.1

2022-07-25 17:11:01

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/4] faster kexec reboot

Albert Huang <[email protected]> writes:

> From: "huangjie.albert" <[email protected]>
>
> In many time-sensitive scenarios, we need a shorter time to restart
> the kernel. However, in the current kexec fast restart code, there
> are many places in the memory copy operation, verification operation
> and decompression operation, which take more time than 500ms. Through
> the following patch series. machine_kexec-->start_kernel only takes
> 15ms

Is this a tiny embedded device you are taking the timings of?

How are you handling driver shutdown and restart? I would expect those
to be a larger piece of the puzzle than memory.

My desktop can do something like 128GiB/s. Which would suggest that
copying 128MiB of kernel+initrd would take perhaps 10ms. The SHA256
implementation may not be tuned so that could be part of the performance
issue. The SHA256 hash has a reputation for having fast
implementations. I chose SHA256 originally simply because it has more
bits so it makes the odds of detecting an error higher.


If all you care about is booting a kernel as fast as possible it may
make sense to have a large reserved region of memory like we have for
the kexec on panic kernel. If that really makes sense I recommend
adding a second kernel command line option and reserving a second
region of reserved memory. That makes telling if there are any
conflicts simple.


I am having a hard time seeing how anyone else would want these options.
Losing megabytes of memory simply because you might reboot using kexec
seems like the wrong side of a trade-off.

The CONFIG_KEXEC_PURGATORY_SKIP_SIG option is very misnamed. It is not
signature verification that is happening; it is hash verification.
There are no encrypted bits at play. Instead there is a check to
ensure that the kernel has not been corrupted by in-flight DMA that some
driver forgot to shut down.

So you are building a version of kexec that if something goes wrong it
could very easily eat your data, or otherwise do some very bad things
that are absolutely non-trivial to debug.

That the decision to skip the sha256 hash that prevents corruption is
happening at compile time, instead of at run-time, will guarantee the
option is simply not available on any general purpose kernel
configuration. Given how dangerous it is to skip the hash verification
it is probably not a bad thing overall, but it is most definitely
something that will make maintenance more difficult.


If done well I don't see why anyone would mind an uncompressed kernel,
but I don't see what the advantage of what you are doing is over using
the vmlinux in the build directory. It isn't a bzImage but it is the
uncompressed kernel.

As a proof of concept I think what you are doing goes a way to showing
that things can be improved. My overall sense is that improving things
the way you are proposing does not help the general case and simply adds
to the maintenance burden.

Eric

>
> How to measure time:
>
> c code:
> uint64_t current_cycles(void)
> {
> uint32_t low, high;
> asm volatile("rdtsc" : "=a"(low), "=d"(high));
> return ((uint64_t)low) | ((uint64_t)high << 32);
> }
> assembly code:
> pushq %rax
> pushq %rdx
> rdtsc
> mov %eax,%eax
> shl $0x20,%rdx
> or %rax,%rdx
> movq %rdx,0x840(%r14)
> popq %rdx
> popq %rax
> the timestamp may store in boot_params or kexec control page, so we can
> get the all timestamp after kernel boot up.
>
> huangjie.albert (4):
> kexec: reuse crash kernel reserved memory for normal kexec
> kexec: add CONFING_KEXEC_PURGATORY_SKIP_SIG
> x86: Support the uncompressed kernel to speed up booting
> x86: boot: avoid memory copy if kernel is uncompressed
>
> arch/x86/Kconfig | 10 +++++++++
> arch/x86/boot/compressed/Makefile | 5 ++++-
> arch/x86/boot/compressed/head_64.S | 8 +++++--
> arch/x86/boot/compressed/misc.c | 35 +++++++++++++++++++++++++-----
> arch/x86/purgatory/purgatory.c | 7 ++++++
> include/linux/kexec.h | 9 ++++----
> include/uapi/linux/kexec.h | 2 ++
> kernel/kexec.c | 19 +++++++++++++++-
> kernel/kexec_core.c | 16 ++++++++------
> kernel/kexec_file.c | 20 +++++++++++++++--
> scripts/Makefile.lib | 5 +++++
> 11 files changed, 114 insertions(+), 22 deletions(-)

2022-07-26 06:08:48

by 黄杰

[permalink] [raw]
Subject: Re: [External] Re: [PATCH 0/4] faster kexec reboot

Hi
Eric W. Biederman
Thank you for your advice and opinion, I am very honored

Eric W. Biederman <[email protected]> 于2022年7月26日周二 01:04写道:
>
> Albert Huang <[email protected]> writes:
>
> > From: "huangjie.albert" <[email protected]>
> >
> > In many time-sensitive scenarios, we need a shorter time to restart
> > the kernel. However, in the current kexec fast restart code, there
> > are many places in the memory copy operation, verification operation
> > and decompression operation, which take more time than 500ms. Through
> > the following patch series. machine_kexec-->start_kernel only takes
> > 15ms
>
> Is this a tiny embedded device you are taking the timings of?
>
> How are you handling driver shutdown and restart? I would expect those
> to be a larger piece of the puzzle than memory.

There is no way to make the code universal in the time optimization here;
various devices need to be customized. However, we have some solutions to
handle the maintenance and recovery of these devices,
especially the scanning and initialization of PCI devices.

>
> My desktop can do something like 128GiB/s. Which would suggest that
> copying 128MiB of kernel+initrd would take perhaps 10ms. The SHA256
> implementation may not be tuned so that could be part of the performance
> issue. The SHA256 hash has a reputation for having fast
> implementations. I chose SHA256 originally simply because it has more
> bits so it makes the odds of detecting an error higher.
>

Yes, sha256 is a better choice, but if there is no memory copy between
kexec load and kexec -e, and this part of the memory is reserved, I
don't think this part of memory will be changed, especially in
virtual machine scenarios.

>
> If all you care about is booting a kernel as fast as possible it make
> make sense to have a large reserved region of memory like we have for
> the kexec on panic kernel. If that really makes sense I recommend
> adding a second kernel command line option and a reserving second region
> of reserved memory. That makes telling if the are any conflicts simple.
>

I initially implemented re-adding a parameter and region, but I
figured out later
that it doesn't really make sense and would waste extra memory.

>
> I am having a hard time seeing how anyone else would want these options.
> Losing megabytes of memory simply because you might reboot using kexec
> seems like the wrong side of a trade-off.

Reuse the memory reserved by the crash kernel? Why does it increase
memory consumption?

>
> The CONFIG_KEXEC_PURGATORY_SKIP_SIG option is very misnamed. It is not
> signature verification that is happening it is a hash verification.
> There are not encrypted bits at play. Instead there is a check to
> ensure that the kernel has not been corrupted by in-flight DMA that some
> driver forgot to shut down.
>
Thanks for pointing that out.
But even if the data is detected to have been changed, there is
currently no way to recover it.
I don't have a good understanding of this part yet. Maybe it is there
for security reasons?


> So you are building a version of kexec that if something goes wrong it
> could very easily eat your data, or otherwise do some very bad things
> that are absolutely non-trivial to debug.
>
> That the decision to skip the sha256 hash that prevents corruption is
> happening at compile time, instead of at run-time, will guarantee the
> option is simply not available on any general purpose kernel
> configuration. Given how dangerous it is to skip the hash verification
> it is probably not a bad thing overall, but it is most definitely
> something that will make maintenance more difficult.
>

Maybe parameters will be a better choice. What do you think ?

>
> If done well I don't see why anyone would mind a uncompressed kernel
> but I don't see what the advantage of what you are doing is over using
> vmlinux is the build directory. It isn't a bzImage but it is the
> uncompressed kernel.
>


> As I proof of concept I think what you are doing goes a way to showing
> that things can be improved. My overall sense is that improving things
> the way you are proposing does not help the general case and simply adds
> to the maintenance burden.

I don't think so. The kernel startup time of some lightweight virtual
machines may be 100-200ms (start_kernel->init), but kexec->start_kernel
takes more than 500ms.
This is still valuable, and the overall code size is also very small.

> Eric
>
> >
> > How to measure time:
> >
> > c code:
> > uint64_t current_cycles(void)
> > {
> > uint32_t low, high;
> > asm volatile("rdtsc" : "=a"(low), "=d"(high));
> > return ((uint64_t)low) | ((uint64_t)high << 32);
> > }
> > assembly code:
> > pushq %rax
> > pushq %rdx
> > rdtsc
> > mov %eax,%eax
> > shl $0x20,%rdx
> > or %rax,%rdx
> > movq %rdx,0x840(%r14)
> > popq %rdx
> > popq %rax
> > the timestamp may store in boot_params or kexec control page, so we can
> > get the all timestamp after kernel boot up.
> >
> > huangjie.albert (4):
> > kexec: reuse crash kernel reserved memory for normal kexec
> > kexec: add CONFING_KEXEC_PURGATORY_SKIP_SIG
> > x86: Support the uncompressed kernel to speed up booting
> > x86: boot: avoid memory copy if kernel is uncompressed
> >
> > arch/x86/Kconfig | 10 +++++++++
> > arch/x86/boot/compressed/Makefile | 5 ++++-
> > arch/x86/boot/compressed/head_64.S | 8 +++++--
> > arch/x86/boot/compressed/misc.c | 35 +++++++++++++++++++++++++-----
> > arch/x86/purgatory/purgatory.c | 7 ++++++
> > include/linux/kexec.h | 9 ++++----
> > include/uapi/linux/kexec.h | 2 ++
> > kernel/kexec.c | 19 +++++++++++++++-
> > kernel/kexec_core.c | 16 ++++++++------
> > kernel/kexec_file.c | 20 +++++++++++++++--
> > scripts/Makefile.lib | 5 +++++
> > 11 files changed, 114 insertions(+), 22 deletions(-)

2022-07-28 02:07:01

by 黄杰

[permalink] [raw]
Subject: Re: [External] Re: [PATCH 0/4] faster kexec reboot

黄杰 <[email protected]> 于2022年7月26日周二 13:53写道:
>
> Hi
> Eric W. Biederman
> Thank you for your advice and opinion, I am very honored
>
> Eric W. Biederman <[email protected]> 于2022年7月26日周二 01:04写道:
> >
> > Albert Huang <[email protected]> writes:
> >
> > > From: "huangjie.albert" <[email protected]>
> > >
> > > In many time-sensitive scenarios, we need a shorter time to restart
> > > the kernel. However, in the current kexec fast restart code, there
> > > are many places in the memory copy operation, verification operation
> > > and decompression operation, which take more time than 500ms. Through
> > > the following patch series. machine_kexec-->start_kernel only takes
> > > 15ms
> >
> > Is this a tiny embedded device you are taking the timings of?
> >
> > How are you handling driver shutdown and restart? I would expect those
> > to be a larger piece of the puzzle than memory.
>
> There is no way to make the code universal in the time optimization here,
> and various devices need to be customized, but we have some solutions to
> achieve the maintenance and recovery of these devices,
> especially the scanning and initialization of pci devices
>
> >
> > My desktop can do something like 128GiB/s. Which would suggest that
> > copying 128MiB of kernel+initrd would take perhaps 10ms. The SHA256
> > implementation may not be tuned so that could be part of the performance
> > issue. The SHA256 hash has a reputation for having fast
> > implementations. I chose SHA256 originally simply because it has more
> > bits so it makes the odds of detecting an error higher.
> >
>
> Yes, sha256 is a better choice, but if there is no memory copy between
> kexec load
> and kexec -e, and this part of the memory is reserved. Don't think
> this part of memory will be changed.
> Especially in virtual machine scenarios
>

hi Eric :

Do you know why this sha256 check is put here? I feel that it is
better to put it in the system call of kexec -e.
If the verification does not pass, the second kernel will not be
started, and some prompt information will be printed at the same
time. That seems better than doing the verification operation when
the second kernel is started: it will be more friendly, and it can
also reduce downtime.

BR
albert.

> >
> > If all you care about is booting a kernel as fast as possible it make
> > make sense to have a large reserved region of memory like we have for
> > the kexec on panic kernel. If that really makes sense I recommend
> > adding a second kernel command line option and a reserving second region
> > of reserved memory. That makes telling if the are any conflicts simple.
> >
>
> I initially implemented re-adding a parameter and region, but I
> figured out later
> that it doesn't really make sense and would waste extra memory.
>
> >
> > I am having a hard time seeing how anyone else would want these options.
> > Losing megabytes of memory simply because you might reboot using kexec
> > seems like the wrong side of a trade-off.
>
> Reuse the memory reserved by the crash kernel? Why does it increase
> memory consumption?
>
> >
> > The CONFIG_KEXEC_PURGATORY_SKIP_SIG option is very misnamed. It is not
> > signature verification that is happening it is a hash verification.
> > There are not encrypted bits at play. Instead there is a check to
> > ensure that the kernel has not been corrupted by in-flight DMA that some
> > driver forgot to shut down.
> >
> Thanks for pointing that out.
> but Even if the data is detected to have been changed, there is
> currently no way to recover it.
> I don't have a good understanding of this place yet. maybe for security reasons?
>
>
> > So you are building a version of kexec that if something goes wrong it
> > could very easily eat your data, or otherwise do some very bad things
> > that are absolutely non-trivial to debug.
> >
> > That the decision to skip the sha256 hash that prevents corruption is
> > happening at compile time, instead of at run-time, will guarantee the
> > option is simply not available on any general purpose kernel
> > configuration. Given how dangerous it is to skip the hash verification
> > it is probably not a bad thing overall, but it is most definitely
> > something that will make maintenance more difficult.
> >
>
> Maybe parameters will be a better choice. What do you think ?
>
> >
> > If done well I don't see why anyone would mind a uncompressed kernel
> > but I don't see what the advantage of what you are doing is over using
> > vmlinux is the build directory. It isn't a bzImage but it is the
> > uncompressed kernel.
> >
>
>
> > As I proof of concept I think what you are doing goes a way to showing
> > that things can be improved. My overall sense is that improving things
> > the way you are proposing does not help the general case and simply adds
> > to the maintenance burden.
>
> I don't think so. The kernel startup time of some lightweight virtual
> machines maybe
> 100-200ms (start_kernel->init). But this kexec->start_kernel took more
> than 500ms.
> This is still valuable, and the overall code size is also very small.
>
> > Eric
> >
> > >
> > > How to measure time:
> > >
> > > c code:
> > > uint64_t current_cycles(void)
> > > {
> > > uint32_t low, high;
> > > asm volatile("rdtsc" : "=a"(low), "=d"(high));
> > > return ((uint64_t)low) | ((uint64_t)high << 32);
> > > }
> > > assembly code:
> > > pushq %rax
> > > pushq %rdx
> > > rdtsc
> > > mov %eax,%eax
> > > shl $0x20,%rdx
> > > or %rax,%rdx
> > > movq %rdx,0x840(%r14)
> > > popq %rdx
> > > popq %rax
> > > the timestamp may store in boot_params or kexec control page, so we can
> > > get the all timestamp after kernel boot up.
> > >
> > > huangjie.albert (4):
> > > kexec: reuse crash kernel reserved memory for normal kexec
> > > kexec: add CONFING_KEXEC_PURGATORY_SKIP_SIG
> > > x86: Support the uncompressed kernel to speed up booting
> > > x86: boot: avoid memory copy if kernel is uncompressed
> > >
> > > arch/x86/Kconfig | 10 +++++++++
> > > arch/x86/boot/compressed/Makefile | 5 ++++-
> > > arch/x86/boot/compressed/head_64.S | 8 +++++--
> > > arch/x86/boot/compressed/misc.c | 35 +++++++++++++++++++++++++-----
> > > arch/x86/purgatory/purgatory.c | 7 ++++++
> > > include/linux/kexec.h | 9 ++++----
> > > include/uapi/linux/kexec.h | 2 ++
> > > kernel/kexec.c | 19 +++++++++++++++-
> > > kernel/kexec_core.c | 16 ++++++++------
> > > kernel/kexec_file.c | 20 +++++++++++++++--
> > > scripts/Makefile.lib | 5 +++++
> > > 11 files changed, 114 insertions(+), 22 deletions(-)