2015-12-23 11:12:43

by Xunlei Pang

[permalink] [raw]
Subject: [PATCH 1/2] kexec: Introduce a protection mechanism for the crashkernel reserved memory

For the cases where some kernel (module) path stamps the crash
reserved memory (already mapped by the kernel) into which the
second kernel data has been loaded, the kdump kernel will probably
fail to boot when a panic happens (or even before one happens),
leaving the culprit at large; this is unacceptable.

The patch introduces a mechanism for detecting such cases:
1) After each crash kexec loading, it simply marks the reserved
memory regions readonly since we no longer access it after that.
When someone stamps the region, the first kernel will panic and
trigger the kdump. The weak arch_kexec_protect_crashkres() is
introduced to do the actual protection.

2) To allow multiple loading, once 1) was done we also need to
remark the reserved memory to readwrite each time a system call
related to kdump is made. The weak arch_kexec_unprotect_crashkres()
is introduced to do the actual unprotection.

The architecture can make its specific implementation by overriding
arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().

Signed-off-by: Xunlei Pang <[email protected]>
---
include/linux/kexec.h | 2 ++
kernel/kexec.c | 9 ++++++++-
kernel/kexec_core.c | 6 ++++++
kernel/kexec_file.c | 8 +++++++-
4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 7b68d27..ebd6950 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -329,6 +329,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
Elf_Shdr *sechdrs, unsigned int relsec);
int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
unsigned int relsec);
+void arch_kexec_protect_crashkres(void);
+void arch_kexec_unprotect_crashkres(void);

#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index d873b64..3680f9c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
return -EBUSY;

dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH)
+ if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }
+
if (nr_segments > 0) {
unsigned long i;

@@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
image = xchg(dest_image, image);

out:
+ if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c823f30..de4dd80 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1560,3 +1560,9 @@ void __weak crash_map_reserved_pages(void)

void __weak crash_unmap_reserved_pages(void)
{}
+
+void __weak arch_kexec_protect_crashkres(void)
+{}
+
+void __weak arch_kexec_unprotect_crashkres(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b70ada0..211eb97 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -327,8 +327,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
return -EBUSY;

dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
+ if (flags & KEXEC_FILE_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }

if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -377,6 +380,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
exchange:
image = xchg(dest_image, image);
out:
+ if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);
return ret;
--
2.5.0


2015-12-23 11:12:47

by Xunlei Pang

[permalink] [raw]
Subject: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

Implement the protection method for the crash kernel memory
reservation for the 64-bit x86 kdump.

Signed-off-by: Xunlei Pang <[email protected]>
---
Only provided x86_64 implementation, as I've only tested on x86_64 so far.

arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f..a3d289c 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -536,3 +536,46 @@ overflow:
return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */
+
+#ifdef CONFIG_KEXEC_CORE
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ if (!start || !end || start >= end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end + 1 - start) >> PAGE_SHIFT;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
+
+ /* Don't touch the control code page used in crash_kexec().*/
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ /* Control code page is located in the 2nd page. */
+ control = control + PAGE_SIZE;
+ kexec_mark_range(crashk_res.start, control - 1, protect);
+ kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_crashkres(false);
+}
+#endif
--
2.5.0

2015-12-24 05:54:33

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

Ccing Vivek

On 12/23/15 at 07:12pm, Xunlei Pang wrote:
> Implement the protection method for the crash kernel memory
> reservation for the 64-bit x86 kdump.
>
> Signed-off-by: Xunlei Pang <[email protected]>
> ---
> Only provided x86_64 implementation, as I've only tested on x86_64 so far.
>
> arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 43 insertions(+)
>
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index 819ab3f..a3d289c 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -536,3 +536,46 @@ overflow:
> return -ENOEXEC;
> }
> #endif /* CONFIG_KEXEC_FILE */
> +
> +#ifdef CONFIG_KEXEC_CORE

The file is only compiled when CONFIG_KEXEC_CORE=y so #ifdef is not necessary

> +static int
> +kexec_mark_range(unsigned long start, unsigned long end, bool protect)
> +{
> + struct page *page;
> + unsigned int nr_pages;
> +
> + if (!start || !end || start >= end)
> + return 0;
> +
> + page = pfn_to_page(start >> PAGE_SHIFT);
> + nr_pages = (end + 1 - start) >> PAGE_SHIFT;
> + if (protect)
> + return set_pages_ro(page, nr_pages);
> + else
> + return set_pages_rw(page, nr_pages);

May use set_memory_ro/rw to avoid converting to *page?

> +}
> +
> +static void kexec_mark_crashkres(bool protect)
> +{
> + unsigned long control;
> +
> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
> +
> + /* Don't touch the control code page used in crash_kexec().*/
> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
> + /* Control code page is located in the 2nd page. */
> + control = control + PAGE_SIZE;
> + kexec_mark_range(crashk_res.start, control - 1, protect);
> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);

X86 kexec will copy the page while kexecing, could you check if we can move
that copying to earlier while kexec loading, maybe machine_kexec_prepare so
that we can make an arch-independent implementation.

> +}
> +
> +void arch_kexec_protect_crashkres(void)
> +{
> + kexec_mark_crashkres(true);
> +}
> +
> +void arch_kexec_unprotect_crashkres(void)
> +{
> + kexec_mark_crashkres(false);
> +}
> +#endif
> --
> 2.5.0
>
>
> _______________________________________________
> kexec mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/kexec

2015-12-24 06:06:03

by Xunlei Pang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/24/2015 at 01:54 PM, Dave Young wrote:
> Ccing Vivek
>
> On 12/23/15 at 07:12pm, Xunlei Pang wrote:
>> Implement the protection method for the crash kernel memory
>> reservation for the 64-bit x86 kdump.
>>
>> Signed-off-by: Xunlei Pang <[email protected]>
>> ---
>> Only provided x86_64 implementation, as I've only tested on x86_64 so far.
>>
>> arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 43 insertions(+)
>>
>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>> index 819ab3f..a3d289c 100644
>> --- a/arch/x86/kernel/machine_kexec_64.c
>> +++ b/arch/x86/kernel/machine_kexec_64.c
>> @@ -536,3 +536,46 @@ overflow:
>> return -ENOEXEC;
>> }
>> #endif /* CONFIG_KEXEC_FILE */
>> +
>> +#ifdef CONFIG_KEXEC_CORE
> The file is only compiled when CONFIG_KEXEC_CORE=y so #ifdef is not necessary

Yes, indeed. I'll remove this macro and send v2 later.

>
>> +static int
>> +kexec_mark_range(unsigned long start, unsigned long end, bool protect)
>> +{
>> + struct page *page;
>> + unsigned int nr_pages;
>> +
>> + if (!start || !end || start >= end)
>> + return 0;
>> +
>> + page = pfn_to_page(start >> PAGE_SHIFT);
>> + nr_pages = (end + 1 - start) >> PAGE_SHIFT;
>> + if (protect)
>> + return set_pages_ro(page, nr_pages);
>> + else
>> + return set_pages_rw(page, nr_pages);
> May use set_memory_ro/rw to avoid converting to *page?

on x86 it just a wrapper of set_memory_ro/rw, I think both are ok.

>
>> +}
>> +
>> +static void kexec_mark_crashkres(bool protect)
>> +{
>> + unsigned long control;
>> +
>> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
>> +
>> + /* Don't touch the control code page used in crash_kexec().*/
>> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
>> + /* Control code page is located in the 2nd page. */
>> + control = control + PAGE_SIZE;
>> + kexec_mark_range(crashk_res.start, control - 1, protect);
>> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
> X86 kexec will copy the page while kexecing, could you check if we can move
> that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
> that we can make a arch-independent implementation.

For some arch, may use huge tlb directly to do the kernel mapping,
in such cases, we can't implement this function. So I think it should
be arch-dependent.

Regards,
Xunlei

>
>> +}
>> +
>> +void arch_kexec_protect_crashkres(void)
>> +{
>> + kexec_mark_crashkres(true);
>> +}
>> +
>> +void arch_kexec_unprotect_crashkres(void)
>> +{
>> + kexec_mark_crashkres(false);
>> +}
>> +#endif
>> --
>> 2.5.0
>>
>>
>> _______________________________________________
>> kexec mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/kexec

2015-12-24 06:16:24

by Dave Young

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

Hi, Xunlei

On 12/24/15 at 02:05pm, Xunlei Pang wrote:
> On 12/24/2015 at 01:54 PM, Dave Young wrote:
> > Ccing Vivek
> >
> > On 12/23/15 at 07:12pm, Xunlei Pang wrote:
> >> Implement the protection method for the crash kernel memory
> >> reservation for the 64-bit x86 kdump.
> >>
> >> Signed-off-by: Xunlei Pang <[email protected]>
> >> ---
> >> Only provided x86_64 implementation, as I've only tested on x86_64 so far.
> >>
> >> arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
> >> 1 file changed, 43 insertions(+)
> >>
> >> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> >> index 819ab3f..a3d289c 100644
> >> --- a/arch/x86/kernel/machine_kexec_64.c
> >> +++ b/arch/x86/kernel/machine_kexec_64.c
> >> @@ -536,3 +536,46 @@ overflow:
> >> return -ENOEXEC;
> >> }
> >> #endif /* CONFIG_KEXEC_FILE */
> >> +
> >> +#ifdef CONFIG_KEXEC_CORE
> > The file is only compiled when CONFIG_KEXEC_CORE=y so #ifdef is not necessary
>
> Yes, indeed. I'll remove this macro and send v2 later.
>
> >
> >> +static int
> >> +kexec_mark_range(unsigned long start, unsigned long end, bool protect)
> >> +{
> >> + struct page *page;
> >> + unsigned int nr_pages;
> >> +
> >> + if (!start || !end || start >= end)
> >> + return 0;
> >> +
> >> + page = pfn_to_page(start >> PAGE_SHIFT);
> >> + nr_pages = (end + 1 - start) >> PAGE_SHIFT;
> >> + if (protect)
> >> + return set_pages_ro(page, nr_pages);
> >> + else
> >> + return set_pages_rw(page, nr_pages);
> > May use set_memory_ro/rw to avoid converting to *page?
>
> on x86 it just a wrapper of set_memory_ro/rw, I think both are ok.

Ok, I have no strong opinion on that..

>
> >
> >> +}
> >> +
> >> +static void kexec_mark_crashkres(bool protect)
> >> +{
> >> + unsigned long control;
> >> +
> >> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
> >> +
> >> + /* Don't touch the control code page used in crash_kexec().*/
> >> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
> >> + /* Control code page is located in the 2nd page. */
> >> + control = control + PAGE_SIZE;

Though it works because the control code is less than 1 page, but use the macro
of KEXEC_CONTROL_PAGE_SIZE looks better..

> >> + kexec_mark_range(crashk_res.start, control - 1, protect);
> >> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
> > X86 kexec will copy the page while kexecing, could you check if we can move
> > that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
> > that we can make a arch-independent implementation.
>
> For some arch, may use huge tlb directly to do the kernel mapping,
> in such cases, we can't implement this function. So I think it should
> be arch-dependent.

Ok, that's fine.

>
> Regards,
> Xunlei
>
> >
> >> +}
> >> +
> >> +void arch_kexec_protect_crashkres(void)
> >> +{
> >> + kexec_mark_crashkres(true);
> >> +}
> >> +
> >> +void arch_kexec_unprotect_crashkres(void)
> >> +{
> >> + kexec_mark_crashkres(false);
> >> +}
> >> +#endif
> >> --
> >> 2.5.0
> >>
> >>
> >> _______________________________________________
> >> kexec mailing list
> >> [email protected]
> >> http://lists.infradead.org/mailman/listinfo/kexec
>
>
> _______________________________________________
> kexec mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/kexec

Thanks
Dave

2015-12-24 06:44:40

by Xunlei Pang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/24/2015 at 02:16 PM, Dave Young wrote:
> Hi, Xunlei
>
> On 12/24/15 at 02:05pm, Xunlei Pang wrote:
>> On 12/24/2015 at 01:54 PM, Dave Young wrote:
>>> Ccing Vivek
>>>
>>> On 12/23/15 at 07:12pm, Xunlei Pang wrote:
>>>> Implement the protection method for the crash kernel memory
>>>> reservation for the 64-bit x86 kdump.
>>>>
>>>> Signed-off-by: Xunlei Pang <[email protected]>
>>>> ---
>>>> Only provided x86_64 implementation, as I've only tested on x86_64 so far.
>>>>
>>>> arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 43 insertions(+)
>>>>
>>>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>>>> index 819ab3f..a3d289c 100644
>>>> --- a/arch/x86/kernel/machine_kexec_64.c
>>>> +++ b/arch/x86/kernel/machine_kexec_64.c
>>>> @@ -536,3 +536,46 @@ overflow:
>>>> return -ENOEXEC;
>>>> }
>>>> #endif /* CONFIG_KEXEC_FILE */
>>>> +
>>>> +#ifdef CONFIG_KEXEC_CORE
>>> The file is only compiled when CONFIG_KEXEC_CORE=y so #ifdef is not necessary
>> Yes, indeed. I'll remove this macro and send v2 later.
>>
>>>> +static int
>>>> +kexec_mark_range(unsigned long start, unsigned long end, bool protect)
>>>> +{
>>>> + struct page *page;
>>>> + unsigned int nr_pages;
>>>> +
>>>> + if (!start || !end || start >= end)
>>>> + return 0;
>>>> +
>>>> + page = pfn_to_page(start >> PAGE_SHIFT);
>>>> + nr_pages = (end + 1 - start) >> PAGE_SHIFT;
>>>> + if (protect)
>>>> + return set_pages_ro(page, nr_pages);
>>>> + else
>>>> + return set_pages_rw(page, nr_pages);
>>> May use set_memory_ro/rw to avoid converting to *page?
>> on x86 it just a wrapper of set_memory_ro/rw, I think both are ok.
> Ok, I have no strong opinion on that..
>
>>>> +}
>>>> +
>>>> +static void kexec_mark_crashkres(bool protect)
>>>> +{
>>>> + unsigned long control;
>>>> +
>>>> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
>>>> +
>>>> + /* Don't touch the control code page used in crash_kexec().*/
>>>> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
>>>> + /* Control code page is located in the 2nd page. */
>>>> + control = control + PAGE_SIZE;
> Though it works because the control code is less than 1 page, but use the macro
> of KEXEC_CONTROL_PAGE_SIZE looks better..
>
>>>> + kexec_mark_range(crashk_res.start, control - 1, protect);
>>>> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
>>> X86 kexec will copy the page while kexecing, could you check if we can move
>>> that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
>>> that we can make a arch-independent implementation.
>> For some arch, may use huge tlb directly to do the kernel mapping,
>> in such cases, we can't implement this function. So I think it should
>> be arch-dependent.
> Ok, that's fine.

At least moving the x86 control-copying code into arch-related
machine_kexec_prepare() should work, and this can omit the
special treatment of the control code page.

Regards,
Xunlei

>
>> Regards,
>> Xunlei
>>
>>>> +}
>>>> +
>>>> +void arch_kexec_protect_crashkres(void)
>>>> +{
>>>> + kexec_mark_crashkres(true);
>>>> +}
>>>> +
>>>> +void arch_kexec_unprotect_crashkres(void)
>>>> +{
>>>> + kexec_mark_crashkres(false);
>>>> +}
>>>> +#endif
>>>> --
>>>> 2.5.0
>>>>
>>>>
>>>> _______________________________________________
>>>> kexec mailing list
>>>> [email protected]
>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>
>> _______________________________________________
>> kexec mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/kexec
> Thanks
> Dave
>
> _______________________________________________
> kexec mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/kexec

2015-12-26 15:18:33

by Minfei Huang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/23/15 at 07:12pm, Xunlei Pang wrote:
> Implement the protection method for the crash kernel memory
> reservation for the 64-bit x86 kdump.
>
> Signed-off-by: Xunlei Pang <[email protected]>
> ---
> Only provided x86_64 implementation, as I've only tested on x86_64 so far.
>
> arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 43 insertions(+)
>
> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
> index 819ab3f..a3d289c 100644
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -536,3 +536,46 @@ overflow:
> return -ENOEXEC;
> }
> #endif /* CONFIG_KEXEC_FILE */
> +
> +#ifdef CONFIG_KEXEC_CORE
> +static int
> +kexec_mark_range(unsigned long start, unsigned long end, bool protect)
> +{
> + struct page *page;
> + unsigned int nr_pages;
> +
> + if (!start || !end || start >= end)
> + return 0;
> +
> + page = pfn_to_page(start >> PAGE_SHIFT);
> + nr_pages = (end + 1 - start) >> PAGE_SHIFT;

The start and end may across two pages, although the range is small than
PAGE_SIZE. You can use following to calculate count of page.

nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;

Thanks
Minfei

> + if (protect)
> + return set_pages_ro(page, nr_pages);
> + else
> + return set_pages_rw(page, nr_pages);
> +}

2015-12-28 06:32:41

by Xunlei Pang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/24/2015 at 02:44 PM, Xunlei Pang wrote:
> On 12/24/2015 at 02:16 PM, Dave Young wrote:
>> Hi, Xunlei
>>
>> On 12/24/15 at 02:05pm, Xunlei Pang wrote:
>>> On 12/24/2015 at 01:54 PM, Dave Young wrote:
>>>> Ccing Vivek
>>>>
>>>> On 12/23/15 at 07:12pm, Xunlei Pang wrote:
>>>>> Implement the protection method for the crash kernel memory
>>>>> reservation for the 64-bit x86 kdump.
>>>>>
>>>>> Signed-off-by: Xunlei Pang <[email protected]>
>>>>> ---
>>>>> Only provided x86_64 implementation, as I've only tested on x86_64 so far.
>>>>>
>>>>> arch/x86/kernel/machine_kexec_64.c | 43 ++++++++++++++++++++++++++++++++++++++
>>>>> 1 file changed, 43 insertions(+)
>>>>>
>>>>> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
>>>>> index 819ab3f..a3d289c 100644
>>>>> --- a/arch/x86/kernel/machine_kexec_64.c
>>>>> +++ b/arch/x86/kernel/machine_kexec_64.c
>>>>> @@ -536,3 +536,46 @@ overflow:
>>>>> return -ENOEXEC;
>>>>> }
>>>>> #endif /* CONFIG_KEXEC_FILE */
>>>>> +
>>>>> +#ifdef CONFIG_KEXEC_CORE
>>>> The file is only compiled when CONFIG_KEXEC_CORE=y so #ifdef is not necessary
>>> Yes, indeed. I'll remove this macro and send v2 later.
>>>
>>>>> +static int
>>>>> +kexec_mark_range(unsigned long start, unsigned long end, bool protect)
>>>>> +{
>>>>> + struct page *page;
>>>>> + unsigned int nr_pages;
>>>>> +
>>>>> + if (!start || !end || start >= end)
>>>>> + return 0;
>>>>> +
>>>>> + page = pfn_to_page(start >> PAGE_SHIFT);
>>>>> + nr_pages = (end + 1 - start) >> PAGE_SHIFT;
>>>>> + if (protect)
>>>>> + return set_pages_ro(page, nr_pages);
>>>>> + else
>>>>> + return set_pages_rw(page, nr_pages);
>>>> May use set_memory_ro/rw to avoid converting to *page?
>>> on x86 it just a wrapper of set_memory_ro/rw, I think both are ok.
>> Ok, I have no strong opinion on that..
>>
>>>>> +}
>>>>> +
>>>>> +static void kexec_mark_crashkres(bool protect)
>>>>> +{
>>>>> + unsigned long control;
>>>>> +
>>>>> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
>>>>> +
>>>>> + /* Don't touch the control code page used in crash_kexec().*/
>>>>> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
>>>>> + /* Control code page is located in the 2nd page. */
>>>>> + control = control + PAGE_SIZE;
>> Though it works because the control code is less than 1 page, but use the macro
>> of KEXEC_CONTROL_PAGE_SIZE looks better..

The 1st page is pagetable, control code page locates at the 2nd page.
The following kexec_mark_range() wants to mark ro from crashk_res.start
to the 1st page(included), so here we must use PAGE_SIZE.

>>
>>>>> + kexec_mark_range(crashk_res.start, control - 1, protect);
>>>>> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
>>>> X86 kexec will copy the page while kexecing, could you check if we can move
>>>> that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
>>>> that we can make a arch-independent implementation.
>>> For some arch, may use huge tlb directly to do the kernel mapping,
>>> in such cases, we can't implement this function. So I think it should
>>> be arch-dependent.
>> Ok, that's fine.
> At least moving the x86 control-copying code into arch-related
> machine_kexec_prepare() should work, and this can omit the
> special treatment of the control code page.

The "relocate_kernel" routine in "relocate_kernel_64.S" will use it as
a temp storage "for jumping back"(as its comment), so we can't mark
it readonly.

>
> Regards,
> Xunlei
>
>>> Regards,
>>> Xunlei
>>>
>>>>> +}
>>>>> +
>>>>> +void arch_kexec_protect_crashkres(void)
>>>>> +{
>>>>> + kexec_mark_crashkres(true);
>>>>> +}
>>>>> +
>>>>> +void arch_kexec_unprotect_crashkres(void)
>>>>> +{
>>>>> + kexec_mark_crashkres(false);
>>>>> +}
>>>>> +#endif
>>>>> --
>>>>> 2.5.0
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> kexec mailing list
>>>>> [email protected]
>>>>> http://lists.infradead.org/mailman/listinfo/kexec
>>> _______________________________________________
>>> kexec mailing list
>>> [email protected]
>>> http://lists.infradead.org/mailman/listinfo/kexec
>> Thanks
>> Dave
>>
>> _______________________________________________
>> kexec mailing list
>> [email protected]
>> http://lists.infradead.org/mailman/listinfo/kexec

2015-12-28 12:12:11

by Minfei Huang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/28/15 at 02:32pm, Xunlei Pang wrote:
> On 12/24/2015 at 02:44 PM, Xunlei Pang wrote:
> >>>>> +static void kexec_mark_crashkres(bool protect)
> >>>>> +{
> >>>>> + unsigned long control;
> >>>>> +
> >>>>> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
> >>>>> +
> >>>>> + /* Don't touch the control code page used in crash_kexec().*/
> >>>>> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
> >>>>> + /* Control code page is located in the 2nd page. */
> >>>>> + control = control + PAGE_SIZE;
> >> Though it works because the control code is less than 1 page, but use the macro
> >> of KEXEC_CONTROL_PAGE_SIZE looks better..
>
> The 1st page is pagetable, control code page locates at the 2nd page.
> The following kexec_mark_range() wants to mark ro from crashk_res.start
> to the 1st page(included), so here we must use PAGE_SIZE.
>
> >>
> >>>>> + kexec_mark_range(crashk_res.start, control - 1, protect);
> >>>>> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
> >>>> X86 kexec will copy the page while kexecing, could you check if we can move
> >>>> that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
> >>>> that we can make a arch-independent implementation.
> >>> For some arch, may use huge tlb directly to do the kernel mapping,
> >>> in such cases, we can't implement this function. So I think it should
> >>> be arch-dependent.
> >> Ok, that's fine.
> > At least moving the x86 control-copying code into arch-related
> > machine_kexec_prepare() should work, and this can omit the
> > special treatment of the control code page.
>
> The "relocate_kernel" routine in "relocate_kernel_64.S" will use it as
> a temp storage "for jumping back"(as its comment), so we can't mark
> it readonly.

kexec will copy the relocate_kernel binary to control_code_page in
function machine_kexec. This is a major reason to set the region
control_code_page to control_code_page + PAGE_SIZE with mode read/write.

Thanks
Minfei

2015-12-28 12:18:03

by Minfei Huang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/28/15 at 08:14pm, Minfei Huang wrote:
> On 12/28/15 at 02:32pm, Xunlei Pang wrote:
> > On 12/24/2015 at 02:44 PM, Xunlei Pang wrote:
> > >>>>> +static void kexec_mark_crashkres(bool protect)
> > >>>>> +{
> > >>>>> + unsigned long control;
> > >>>>> +
> > >>>>> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
> > >>>>> +
> > >>>>> + /* Don't touch the control code page used in crash_kexec().*/
> > >>>>> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
> > >>>>> + /* Control code page is located in the 2nd page. */
> > >>>>> + control = control + PAGE_SIZE;
> > >> Though it works because the control code is less than 1 page, but use the macro
> > >> of KEXEC_CONTROL_PAGE_SIZE looks better..
> >
> > The 1st page is pagetable, control code page locates at the 2nd page.
> > The following kexec_mark_range() wants to mark ro from crashk_res.start
> > to the 1st page(included), so here we must use PAGE_SIZE.
> >
> > >>
> > >>>>> + kexec_mark_range(crashk_res.start, control - 1, protect);
> > >>>>> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
> > >>>> X86 kexec will copy the page while kexecing, could you check if we can move
> > >>>> that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
> > >>>> that we can make a arch-independent implementation.
> > >>> For some arch, may use huge tlb directly to do the kernel mapping,
> > >>> in such cases, we can't implement this function. So I think it should
> > >>> be arch-dependent.
> > >> Ok, that's fine.
> > > At least moving the x86 control-copying code into arch-related
> > > machine_kexec_prepare() should work, and this can omit the
> > > special treatment of the control code page.
> >
> > The "relocate_kernel" routine in "relocate_kernel_64.S" will use it as
> > a temp storage "for jumping back"(as its comment), so we can't mark
> > it readonly.
>
> kexec will copy the relocate_kernel binary to control_code_page in
> function machine_kexec. This is a major reason to set the region
> control_code_page to control_code_page + PAGE_SIZE with mode read/write.

Fix it.
The region is from control_code_page + PAGE_SIZE to
control_code_page + PAGE_SIZE + PAGE_SIZE.

Thanks
Minfei

2015-12-29 11:02:11

by Xunlei Pang

[permalink] [raw]
Subject: Re: [PATCH 2/2] kexec: Provide arch_kexec_protect(unprotect)_crashkres()

On 12/28/2015 at 08:14 PM, Minfei Huang wrote:
> On 12/28/15 at 02:32pm, Xunlei Pang wrote:
>> On 12/24/2015 at 02:44 PM, Xunlei Pang wrote:
>>>>>>> +static void kexec_mark_crashkres(bool protect)
>>>>>>> +{
>>>>>>> + unsigned long control;
>>>>>>> +
>>>>>>> + kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
>>>>>>> +
>>>>>>> + /* Don't touch the control code page used in crash_kexec().*/
>>>>>>> + control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
>>>>>>> + /* Control code page is located in the 2nd page. */
>>>>>>> + control = control + PAGE_SIZE;
>>>> Though it works because the control code is less than 1 page, but use the macro
>>>> of KEXEC_CONTROL_PAGE_SIZE looks better..
>> The 1st page is pagetable, control code page locates at the 2nd page.
>> The following kexec_mark_range() wants to mark ro from crashk_res.start
>> to the 1st page(included), so here we must use PAGE_SIZE.
>>
>>>>>>> + kexec_mark_range(crashk_res.start, control - 1, protect);
>>>>>>> + kexec_mark_range(control + PAGE_SIZE, crashk_res.end, protect);
>>>>>> X86 kexec will copy the page while kexecing, could you check if we can move
>>>>>> that copying to earliyer while kexec loading, maybe machine_kexec_prepare so
>>>>>> that we can make a arch-independent implementation.
>>>>> For some arch, may use huge tlb directly to do the kernel mapping,
>>>>> in such cases, we can't implement this function. So I think it should
>>>>> be arch-dependent.
>>>> Ok, that's fine.
>>> At least moving the x86 control-copying code into arch-related
>>> machine_kexec_prepare() should work, and this can omit the
>>> special treatment of the control code page.
>> The "relocate_kernel" routine in "relocate_kernel_64.S" will use it as
>> a temp storage "for jumping back"(as its comment), so we can't mark
>> it readonly.
> kexec will copy the relocate_kernel binary to control_code_page in
> function machine_kexec. This is a major reason to set the region
> control_code_page to control_code_page + PAGE_SIZE with mode read/write.

Yes, I mean after avoiding this copy by moving the x86 control-copying
code into arch-related machine_kexec_prepare(), we still can't mark it
readonly because of its temp storage role.

Of course we can still do that by setting its kernel pte to rwx and do a
local tlb invalidation when crash_kexec(), but I think we can simply leave
that alone, since its content is meaningless before crash happens where
the code is copied into it. Unless you want to capture the wrong kernel
operations that only write to this reserved page.

Regards,
Xunlei

>
> Thanks
> Minfei
>
> _______________________________________________
> kexec mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/kexec

2016-03-22 18:39:34

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] kexec: Introduce a protection mechanism for the crashkernel reserved memory

On Wed, 23 Dec 2015 19:12:25 +0800 Xunlei Pang <[email protected]> wrote:

> For the cases that some kernel (module) path stamps the crash
> reserved memory(already mapped by the kernel) where has been
> loaded the second kernel data, the kdump kernel will probably
> fail to boot when panic happens (or even not happens) leaving
> the culprit at large, this is unacceptable.
>
> The patch introduces a mechanism for detecting such cases:
> 1) After each crash kexec loading, it simply marks the reserved
> memory regions readonly since we no longer access it after that.
> When someone stamps the region, the first kernel will panic and
> trigger the kdump. The weak arch_kexec_protect_crashkres() is
> introduced to do the actual protection.
>
> 2) To allow multiple loading, once 1) was done we also need to
> remark the reserved memory to readwrite each time a system call
> related to kdump is made. The weak arch_kexec_unprotect_crashkres()
> is introduced to do the actual protection.
>
> The architecture can make its specific implementation by overriding
> arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().

I'd like to see a bit more review activity on these patches from the
other kexec developers, please. I'll include both the patches below.


Also I don't have any record of reviews of these two:

http://ozlabs.org/~akpm/mmotm/broken-out/kexec-make-a-pair-of-map-unmap-reserved-pages-in-error-path.patch
http://ozlabs.org/~akpm/mmotm/broken-out/kexec-do-a-cleanup-for-function-kexec_load.patch

Thanks.


From: Xunlei Pang <[email protected]>
Subject: kexec: introduce a protection mechanism for the crashkernel reserved memory

For the cases that some kernel (module) path stamps the crash reserved
memory(already mapped by the kernel) where has been loaded the second
kernel data, the kdump kernel will probably fail to boot when panic
happens (or even not happens) leaving the culprit at large, this is
unacceptable.

The patch introduces a mechanism for detecting such cases:

1) After each crash kexec loading, it simply marks the reserved memory
regions readonly since we no longer access it after that. When someone
stamps the region, the first kernel will panic and trigger the kdump.
The weak arch_kexec_protect_crashkres() is introduced to do the actual
protection.

2) To allow multiple loading, once 1) was done we also need to remark
the reserved memory to readwrite each time a system call related to
kdump is made. The weak arch_kexec_unprotect_crashkres() is introduced
to do the actual unprotection.

The architecture can make its specific implementation by overriding
arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().

Signed-off-by: Xunlei Pang <[email protected]>
Cc: Eric Biederman <[email protected]>
Cc: Dave Young <[email protected]>
Cc: Minfei Huang <[email protected]>
Cc: Vivek Goyal <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

include/linux/kexec.h | 2 ++
kernel/kexec.c | 9 ++++++++-
kernel/kexec_core.c | 6 ++++++
kernel/kexec_file.c | 8 +++++++-
4 files changed, 23 insertions(+), 2 deletions(-)

diff -puN include/linux/kexec.h~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory include/linux/kexec.h
--- a/include/linux/kexec.h~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory
+++ a/include/linux/kexec.h
@@ -317,6 +317,8 @@ int __weak arch_kexec_apply_relocations_
Elf_Shdr *sechdrs, unsigned int relsec);
int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
unsigned int relsec);
+void arch_kexec_protect_crashkres(void);
+void arch_kexec_unprotect_crashkres(void);

#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
diff -puN kernel/kexec.c~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory kernel/kexec.c
--- a/kernel/kexec.c~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory
+++ a/kernel/kexec.c
@@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned lon
return -EBUSY;

dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH)
+ if (flags & KEXEC_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }
+
if (nr_segments > 0) {
unsigned long i;

@@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned lon
image = xchg(dest_image, image);

out:
+ if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);

diff -puN kernel/kexec_core.c~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory kernel/kexec_core.c
--- a/kernel/kexec_core.c~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory
+++ a/kernel/kexec_core.c
@@ -1559,3 +1559,9 @@ void __weak crash_map_reserved_pages(voi

void __weak crash_unmap_reserved_pages(void)
{}
+
+void __weak arch_kexec_protect_crashkres(void)
+{}
+
+void __weak arch_kexec_unprotect_crashkres(void)
+{}
diff -puN kernel/kexec_file.c~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory kernel/kexec_file.c
--- a/kernel/kexec_file.c~kexec-introduce-a-protection-mechanism-for-the-crashkernel-reserved-memory
+++ a/kernel/kexec_file.c
@@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, ke
return -EBUSY;

dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
+ if (flags & KEXEC_FILE_ON_CRASH) {
dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ }

if (flags & KEXEC_FILE_UNLOAD)
goto exchange;
@@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, ke
exchange:
image = xchg(dest_image, image);
out:
+ if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+
mutex_unlock(&kexec_mutex);
kimage_free(image);
return ret;
_


From: Xunlei Pang <[email protected]>
Subject: kexec: provide arch_kexec_protect(unprotect)_crashkres()

Implement the protection method for the crash kernel memory reservation
for the 64-bit x86 kdump.

Signed-off-by: Xunlei Pang <[email protected]>
Cc: Eric Biederman <[email protected]>
Cc: Dave Young <[email protected]>
Cc: Minfei Huang <[email protected]>
Cc: Vivek Goyal <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

arch/x86/kernel/machine_kexec_64.c | 45 +++++++++++++++++++++++++++
1 file changed, 45 insertions(+)

diff -puN arch/x86/kernel/machine_kexec_64.c~kexec-provide-arch_kexec_protectunprotect_crashkres arch/x86/kernel/machine_kexec_64.c
--- a/arch/x86/kernel/machine_kexec_64.c~kexec-provide-arch_kexec_protectunprotect_crashkres
+++ a/arch/x86/kernel/machine_kexec_64.c
@@ -538,3 +538,48 @@ overflow:
return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */
+
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ /*
+ * For physical range: [start, end]. We must skip the unassigned
+ * crashk resource with zero-valued "end" member.
+ */
+ if (!end || start > end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
+
+ /* Don't touch the control code page used in crash_kexec().*/
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ /* Control code page is located in the 2nd page. */
+ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ control += KEXEC_CONTROL_PAGE_SIZE;
+ kexec_mark_range(control, crashk_res.end, protect);
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_crashkres(false);
+}
_