Hi,
I've been working on a patch that turns the kernel's .rodata section to be
actually read only, eg any write attempts to it cause a segmentation fault.
During this work I found a bug in the change_page_attr() code on x86-64, and
this patch 1 is the bugfix for that. Patch 2 will be the actual introduction
of the readonly option.
The bug fixed here is the following: when splitting a 2Mb PSE page that also
happened to contain kernel text, the split PMD would get the NX bit set
eventhough the individual pte entries for the 4Kb pages would not. This
causes problems still since this NX bit overrides in practice the sub bits.
The solution I've chosen is I think the right one: On splitting a 2Mb page
in the change_page_attr() code, inherit the existing pagetable permissions
exactly, for both the PMD page and all the 4Kb pte entries (with the
exception of the one that you wanted to change, but the code already did
that part correct).
Signed-off-by: Arjan van de Ven <[email protected]>
diff -purN linux-2.6.14/arch/x86_64/mm/pageattr.c linux-2.6.14-fordiff/arch/x86_64/mm/pageattr.c
--- linux-2.6.14/arch/x86_64/mm/pageattr.c 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6.14-fordiff/arch/x86_64/mm/pageattr.c 2005-11-05 18:27:07.000000000 +0100
@@ -128,6 +131,7 @@ __change_page_attr(unsigned long address
pte_t *kpte;
struct page *kpte_page;
unsigned kpte_flags;
+ pgprot_t ref_prot2;
kpte = lookup_address(address);
if (!kpte) return 0;
kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -140,10 +144,14 @@ __change_page_attr(unsigned long address
* split_large_page will take the reference for this change_page_attr
* on the split page.
*/
- struct page *split = split_large_page(address, prot, ref_prot);
+
+ struct page *split;
+ ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
+
+ split = split_large_page(address, prot, ref_prot2);
if (!split)
return -ENOMEM;
- set_pte(kpte,mk_pte(split, ref_prot));
+ set_pte(kpte,mk_pte(split, ref_prot2));
kpte_page = split;
}
get_page(kpte_page);
diff -purN linux-2.6.14/include/asm-x86_64/pgtable.h linux-2.6.14-fordiff/include/asm-x86_64/pgtable.h
--- linux-2.6.14/include/asm-x86_64/pgtable.h 2005-11-03 18:49:01.000000000 +0100
+++ linux-2.6.14-fordiff/include/asm-x86_64/pgtable.h 2005-11-05 13:20:44.000000000 +0100
@@ -119,6 +119,8 @@ static inline pte_t ptep_get_and_clear_f
#define pte_same(a, b) ((a).pte == (b).pte)
+#define pte_pgprot(a) (__pgprot((a).pte & ~(PHYSICAL_PAGE_MASK)))
+
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
#define PUD_SIZE (1UL << PUD_SHIFT)
Hi,
I've been working on a patch that turns the kernel's .rodata section to be
actually read only, eg any write attempts to it cause a segmentation fault.
This patch introduces the actual debug option to catch any writes to rodata
Signed-off-by: Arjan van de Ven <[email protected]>
diff -purN linux-2.6.14/arch/x86_64/Kconfig.debug linux-2.6.14-fordiff/arch/x86_64/Kconfig.debug
--- linux-2.6.14/arch/x86_64/Kconfig.debug 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6.14-fordiff/arch/x86_64/Kconfig.debug 2005-11-07 10:57:03.000000000 +0100
@@ -51,6 +51,18 @@ config IOMMU_LEAK
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
+config DEBUG_RODATA
+ bool "Write protect kernel read-only data structures"
+ depends on DEBUG_KERNEL
+ help
+ Mark the kernel read-only data as write-protected in the pagetables,
+ in order to catch accidental (and incorrect) writes to such const data.
+ This option may have a slight performance impact because a portion
+ of the kernel code won't be covered by a 2Mb TLB anymore.
+ If in doubt, say "N".
+
+
+
#config X86_REMOTE_DEBUG
# bool "kgdb debugging stub"
diff -purN linux-2.6.14/arch/x86_64/mm/init.c linux-2.6.14-fordiff/arch/x86_64/mm/init.c
--- linux-2.6.14/arch/x86_64/mm/init.c 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6.14-fordiff/arch/x86_64/mm/init.c 2005-11-07 10:52:17.000000000 +0100
@@ -87,7 +87,7 @@ void show_mem(void)
/* References to section boundaries */
extern char _text, _etext, _edata, __bss_start, _end[];
-extern char __init_begin, __init_end;
+extern char __init_begin, __init_end, __start_rodata, __end_rodata;
int after_bootmem;
@@ -449,6 +449,27 @@ void __init mem_init(void)
#endif
}
+
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void)
+{
+ unsigned long addr;
+
+ addr = (unsigned long)(&__start_rodata);
+ for (; addr < (unsigned long)(&__end_rodata); addr += PAGE_SIZE)
+ change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
+
+ printk ("Write protecting the kernel read-only data: %luk \n", (&__end_rodata - &__start_rodata) >> 10);
+
+ /*
+ * change_page_attr_addr() requires a global_flush_tlb() call after it. We do this after the printk
+ * so that if something went wrong in the change, the printk gets out at least to give a better debug hint
+ * of who is the culprit.
+ */
+ global_flush_tlb();
+}
+#endif
+
extern char __initdata_begin[], __initdata_end[];
void free_initmem(void)
diff -purN linux-2.6.14/include/asm-generic/vmlinux.lds.h linux-2.6.14-fordiff/include/asm-generic/vmlinux.lds.h
--- linux-2.6.14/include/asm-generic/vmlinux.lds.h 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6.14-fordiff/include/asm-generic/vmlinux.lds.h 2005-11-05 13:08:52.000000000 +0100
@@ -10,6 +10,8 @@
#define ALIGN_FUNCTION() . = ALIGN(8)
#define RODATA \
+ . = ALIGN(4096); \
+ __start_rodata = .; \
.rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \
*(.rodata) *(.rodata.*) \
*(__vermagic) /* Kernel version magic */ \
@@ -67,6 +69,8 @@
__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \
*(__ksymtab_strings) \
} \
+ __end_rodata = .; \
+ . = ALIGN(4096); \
\
/* Built-in module parameters. */ \
__param : AT(ADDR(__param) - LOAD_OFFSET) { \
diff -purN linux-2.6.14/init/main.c linux-2.6.14-fordiff/init/main.c
--- linux-2.6.14/init/main.c 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6.14-fordiff/init/main.c 2005-11-07 11:03:26.000000000 +0100
@@ -100,6 +100,11 @@ extern void acpi_early_init(void);
#else
static inline void acpi_early_init(void) { }
#endif
+#ifdef CONFIG_DEBUG_RODATA
+extern void mark_rodata_ro(void);
+#else
+static inline void mark_rodata_ro(void) { }
+#endif
#ifdef CONFIG_TC
extern void tc_init(void);
@@ -710,6 +715,7 @@ static int init(void * unused)
*/
free_initmem();
unlock_kernel();
+ mark_rodata_ro();
system_state = SYSTEM_RUNNING;
numa_default_policy();
On Mon, 2005-11-07 at 10:58 +0000, [email protected] wrote:
> Hi,
>
> I've been working on a patch that turns the kernel's .rodata section to be
> actually read only, eg any write attempts to it cause a segmentation fault.
>
> This patch introduces the actual debug option to catch any writes to rodata
Why a debug option? From what I can tell, it doesn't impact runtime
performance much and it provides good protection. Any reason not to
make it an always-on feature?
josh
On Mon, 2005-11-07 at 08:06 -0600, Josh Boyer wrote:
> On Mon, 2005-11-07 at 10:58 +0000, [email protected] wrote:
> > Hi,
> >
> > I've been working on a patch that turns the kernel's .rodata section to be
> > actually read only, eg any write attempts to it cause a segmentation fault.
> >
> > This patch introduces the actual debug option to catch any writes to rodata
>
> Why a debug option? From what I can tell, it doesn't impact runtime
> performance much and it provides good protection. Any reason not to
> make it an always-on feature?
personally I'd like that but there is a chance of a tiny perf regression
and usually there are people objecting to that.
(It's not clear cut: while the last bit of the kernel no longer is
covered by a 2Mb tlb, most intel cpus have very few of such tlbs in the
first place and this would free up one such tlb for other things (say
the stack data) or even the userspace database), so it's not all that
clear cut what the cost of this is)
2005/11/7, Arjan van de Ven <[email protected]>:
> On Mon, 2005-11-07 at 08:06 -0600, Josh Boyer wrote:
> > On Mon, 2005-11-07 at 10:58 +0000, [email protected] wrote:
> > > Hi,
> > >
> > > I've been working on a patch that turns the kernel's .rodata section to be
> > > actually read only, eg any write attempts to it cause a segmentation fault.
> > >
> > > This patch introduces the actual debug option to catch any writes to rodata
> >
> > Why a debug option? From what I can tell, it doesn't impact runtime
> > performance much and it provides good protection. Any reason not to
> > make it an always-on feature?
>
> personally I'd like that but there is a chance of a tiny perf regression
> and usually there are people objecting to that.
>
> (It's not clear cut: while the last bit of the kernel no longer is
> covered by a 2Mb tlb, most intel cpus have very few of such tlbs in the
> first place and this would free up one such tlb for other things (say
> the stack data) or even the userspace database), so it's not all that
> clear cut what the cost of this is)
I'm dumb. But how is "the last bit of the kernel no longer is covered
by a 2Mb tlb"? Could you explain a bit more?
--
Coywolf Qi Hunt
http://sosdg.org/~coywolf/
> people objecting to that.
> >
> > (It's not clear cut: while the last bit of the kernel no longer is
> > covered by a 2Mb tlb, most intel cpus have very few of such tlbs in the
> > first place and this would free up one such tlb for other things (say
> > the stack data) or even the userspace database), so it's not all that
> > clear cut what the cost of this is)
>
> I'm dumb. But how is "the last bit of the kernel no longer is covered
> by a 2Mb tlb"? Could you explain a bit more?
in memory it'll look something like this
0 2 4 6
-- kernel text -- + -- kernel text -- + --- k. text-- rodata -- + --
normally the range from 0 to 6 is covered with 2Mb tlb's.
Now to make rodata read only, the hugetlb entry covering 4-6 Mb range
needs to be split into 4Kb entries, so that the rodata portion can have
different permissions than the rest of that range.
2005/11/11, Arjan van de Ven <[email protected]>:
> > people objecting to that.
> > >
> > > (It's not clear cut: while the last bit of the kernel no longer is
> > > covered by a 2Mb tlb, most intel cpus have very few of such tlbs in the
> > > first place and this would free up one such tlb for other things (say
> > > the stack data) or even the userspace database), so it's not all that
> > > clear cut what the cost of this is)
> >
> > I'm dumb. But how is "the last bit of the kernel no longer is covered
> > by a 2Mb tlb"? Could you explain a bit more?
>
> in memory it'll look something like this
>
> 0 2 4 6
> -- kernel text -- + -- kernel text -- + --- k. text-- rodata -- + --
>
> normally the range from 0 to 6 is covered with 2Mb tlb's.
> Now to make rodata read only, the hugetlb entry covering 4-6 Mb range
> needs to be split into 4Kb entries, so that the rodata portion can have
> different permissions than the rest of that range.
Indeed. Thanks.
And we could also mark text section read-only and data/stack section
noexec if NX is supported. But I doubt the whole thing would really
help much. Kill the kernel thread? We can't. We only run into a panic.
Anyway I'd attach a quick patch to mark text section read only in the
next mail.
If it's ok, I'd add Kconfig support. Comments?
--
Coywolf Qi Hunt
http://sosdg.org/~coywolf/
On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
> And we could also mark text section read-only and data/stack section
> noexec if NX is supported. But I doubt the whole thing would really
> help much. Kill the kernel thread? We can't. We only run into a panic.
> Anyway I'd attach a quick patch to mark text section read only in the
> next mail.
>
> If it's ok, I'd add Kconfig support. Comments?
Signed-off-by: Coywolf Qi Hunt <[email protected]>
---
diff -pruN 2.6.14-mm2/init/main.c 2.6.14-mm2-cy/init/main.c
--- 2.6.14-mm2/init/main.c 2005-11-11 22:34:21.000000000 +0800
+++ 2.6.14-mm2-cy/init/main.c 2005-11-12 02:50:45.000000000 +0800
@@ -660,6 +660,18 @@ static inline void fixup_cpu_present_map
#endif
}
+void mark_text_ro(void)
+{
+ unsigned long addr = (unsigned long)&_text;
+
+ for (; addr < (unsigned long)&_etext; addr += PAGE_SIZE)
+ change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+
+ printk ("Write protecting the kernel text data: %luk\n",
+ (unsigned long)(_etext - _text) >> 10);
+ global_flush_tlb();
+}
+
static int init(void * unused)
{
lock_kernel();
@@ -716,6 +728,7 @@ static int init(void * unused)
*/
free_initmem();
unlock_kernel();
+ mark_text_ro();
mark_rodata_ro();
system_state = SYSTEM_RUNNING;
numa_default_policy();
On Fri, 2005-11-11 at 14:04 -0500, Coywolf Qi Hunt wrote:
> On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
> > And we could also mark text section read-only and data/stack section
> > noexec if NX is supported. But I doubt the whole thing would really
> > help much. Kill the kernel thread? We can't. We only run into a panic.
> > Anyway I'd attach a quick patch to mark text section read only in the
> > next mail.
> >
> > If it's ok, I'd add Kconfig support. Comments?
>
change_page_attr() is only available on x86 and x86-64 so it needs to be
in arch/ somewhere...
On Fri, 11 Nov 2005, Coywolf Qi Hunt wrote:
> On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
>> And we could also mark text section read-only and data/stack section
>> noexec if NX is supported. But I doubt the whole thing would really
>> help much. Kill the kernel thread? We can't. We only run into a panic.
>> Anyway I'd attach a quick patch to mark text section read only in the
>> next mail.
>>
>> If it's ok, I'd add Kconfig support. Comments?
>
>
> Signed-off-by: Coywolf Qi Hunt <[email protected]>
> ---
>
> diff -pruN 2.6.14-mm2/init/main.c 2.6.14-mm2-cy/init/main.c
> --- 2.6.14-mm2/init/main.c 2005-11-11 22:34:21.000000000 +0800
> +++ 2.6.14-mm2-cy/init/main.c 2005-11-12 02:50:45.000000000 +0800
> @@ -660,6 +660,18 @@ static inline void fixup_cpu_present_map
> #endif
> }
>
> +void mark_text_ro(void)
> +{
> + unsigned long addr = (unsigned long)&_text;
> +
> + for (; addr < (unsigned long)&_etext; addr += PAGE_SIZE)
> + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
> +
> + printk ("Write protecting the kernel text data: %luk\n",
> + (unsigned long)(_etext - _text) >> 10);
> + global_flush_tlb();
> +}
> +
> static int init(void * unused)
> {
> lock_kernel();
> @@ -716,6 +728,7 @@ static int init(void * unused)
> */
> free_initmem();
> unlock_kernel();
> + mark_text_ro();
> mark_rodata_ro();
> system_state = SYSTEM_RUNNING;
> numa_default_policy();
> -
Assuming ix86, what is read-only? Certainly the text section needs
to be marked execute!
So is it:
Execute-Only
Execute-Only, accessed
Execute/Read
Execute/Read, accessed
Execute-only, conforming
Execute-only, conforming, accessed
Execute/Read-Only, conforming
Execute/Read-Only, conforming, accessed.
(all from page 5-12 of ix484 programmer's reference)
????
You need to WRITE to the text segment to be able to load executables
(modules) and kernel threads. The user-mode code, children of `init`
need to have writable .text segments to be able to exec().
Cheers,
Dick Johnson
Penguin : Linux version 2.6.13.4 on an i686 machine (5589.48 BogoMips).
Warning : 98.36% of all statistics are fiction.
.
****************************************************************
The information transmitted in this message is confidential and may be privileged. Any review, retransmission, dissemination, or other use of this information by persons or entities other than the intended recipient is prohibited. If you are not the intended recipient, please notify Analogic Corporation immediately - by replying to this message or by sending an email to [email protected] - and destroy all copies of this information, including any attachments, without reading or disclosing them.
Thank you.
On Friday 11 November 2005 20:04, Coywolf Qi Hunt wrote:
> On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
> > And we could also mark text section read-only and data/stack section
> > noexec if NX is supported. But I doubt the whole thing would really
> > help much. Kill the kernel thread? We can't. We only run into a panic.
> > Anyway I'd attach a quick patch to mark text section read only in the
> > next mail.
I think this whole thing is only usable as a debugging option. It shouldn't
be used by default on production systems because it will increase TLB
pressure by splitting up the large pages used by kernel. And TLB pressure
is critical in many workloads.
It definitely shouldn't be on by default.
Then the text section will likely not be page aligned, so it would be
surprising if it even worked.
At least on x86-64 it is pretty useless too because the .text section can
be accessed over its alias in the direct mapping.
Overall I doubt it is worth it even as a debugging option. I so far cannot
remember a single bug that was caused by overwriting kernel text.
-Andi
Andi Kleen writes:
[...]
>
> Overall I doubt it is worth it even as a debugging option. I so far cannot
> remember a single bug that was caused by overwriting kernel text.
I wouldn't forget that one for a long time:
http://marc.theaimsgroup.com/?l=linux-kernel&m=106503116306729&w=2
>
> -Andi
>
Nikita.
Coywolf Qi Hunt <[email protected]> wrote:
> +++ 2.6.14-mm2-cy/init/main.c????????2005-11-12 02:50:45.000000000 +0800
...
> +void mark_text_ro(void)
...
> + printk ("Write protecting the kernel text data: %luk\n",
> + (unsigned long)(_etext - _text) >> 10);
This message should have a priority level, shouldn't it?
--
Ich danke GMX daf?r, die Verwendung meiner Adressen mittels per SPF
verbreiteten L?gen zu sabotieren.
On Sat, Nov 12, 2005 at 12:03:51AM +0100, Bodo Eggert wrote:
> Coywolf Qi Hunt <[email protected]> wrote:
>
> > +++ 2.6.14-mm2-cy/init/main.c????????2005-11-12 02:50:45.000000000 +0800
> ...
> > +void mark_text_ro(void)
> ...
> > + printk ("Write protecting the kernel text data: %luk\n",
> > + (unsigned long)(_etext - _text) >> 10);
>
> This message should have a priority level, shouldn't it?
It doesn't matter. It will fall back into the default level.
Coywolf
2005/11/12, linux-os (Dick Johnson) <[email protected]>:
>
> On Fri, 11 Nov 2005, Coywolf Qi Hunt wrote:
>
> > On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
> >> And we could also mark text section read-only and data/stack section
> >> noexec if NX is supported. But I doubt the whole thing would really
> >> help much. Kill the kernel thread? We can't. We only run into a panic.
> >> Anyway I'd attach a quick patch to mark text section read only in the
> >> next mail.
> >>
> >> If it's ok, I'd add Kconfig support. Comments?
> >
> >
> > Signed-off-by: Coywolf Qi Hunt <[email protected]>
> > ---
> >
> > diff -pruN 2.6.14-mm2/init/main.c 2.6.14-mm2-cy/init/main.c
> > --- 2.6.14-mm2/init/main.c 2005-11-11 22:34:21.000000000 +0800
> > +++ 2.6.14-mm2-cy/init/main.c 2005-11-12 02:50:45.000000000 +0800
> > @@ -660,6 +660,18 @@ static inline void fixup_cpu_present_map
> > #endif
> > }
> >
> > +void mark_text_ro(void)
> > +{
> > + unsigned long addr = (unsigned long)&_text;
> > +
> > + for (; addr < (unsigned long)&_etext; addr += PAGE_SIZE)
> > + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
> > +
> > + printk ("Write protecting the kernel text data: %luk\n",
> > + (unsigned long)(_etext - _text) >> 10);
> > + global_flush_tlb();
> > +}
> > +
> > static int init(void * unused)
> > {
> > lock_kernel();
> > @@ -716,6 +728,7 @@ static int init(void * unused)
> > */
> > free_initmem();
> > unlock_kernel();
> > + mark_text_ro();
> > mark_rodata_ro();
> > system_state = SYSTEM_RUNNING;
> > numa_default_policy();
> > -
>
>
> Assuming ix86, what is read-only? Certainly the text section needs
> to be marked execute!
>
> So is it:
> Execute-Only
> Execute-Only, accessed
> Execute/Read
> Execute/Read, accessed
> Execute-only, conforming
> Execute-only, conforming, accessed
> Execute/Read-Only, conforming
> Execute/Read-Only, conforming, accessed.
> (all from page 5-12 of ix484 programmer's reference)
>
> ????
>
> You need to WRITE to the text segment to be able to load executables
> (modules) and kernel threads. The user-mode code, children of `init`
> need to have writable .text segments to be able to exec().
Ah, thanks, Mr Wrong.
>
> Cheers,
> Dick Johnson
> Penguin : Linux version 2.6.13.4 on an i686 machine (5589.48 BogoMips).
> Warning : 98.36% of all statistics are fiction.
> .
>
> ****************************************************************
> The information transmitted in this message is confidential and may be privileged. Any review, retransmission, dissemination, or other use of this information by persons or entities other than the intended recipient is prohibited. If you are not the intended recipient, please notify Analogic Corporation immediately - by replying to this message or by sending an email to [email protected] - and destroy all copies of this information, including any attachments, without reading or disclosing them.
>
> Thank you.
>
--
Coywolf Qi Hunt
http://sosdg.org/~coywolf/
2005/11/12, Andi Kleen <[email protected]>:
> On Friday 11 November 2005 20:04, Coywolf Qi Hunt wrote:
> > On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
> > > And we could also mark text section read-only and data/stack section
> > > noexec if NX is supported. But I doubt the whole thing would really
> > > help much. Kill the kernel thread? We can't. We only run into a panic.
> > > Anyway I'd attach a quick patch to mark text section read only in the
> > > next mail.
>
>
> I think this whole thing is only usable as a debugging option. It shouldn't
> be used by default on production systems because it will increase TLB
> pressure by splitting up the large pages used by kernel. And TLB pressure
> is critical in many workloads.
>
> It definitely shouldn't be on by default.
>
> Then the text section will likely not be page aligned, so it would be
> surprising if it even worked.
It works. I have tested it with { c=_stext[0]; _stext[0]=c;}. No
effect when it's disabled; panic when it's enabled.
The symbol `_text' is always page aligned. `_etext' is not, but we don't care.
(Bugs: It would conflict with kprobes.)
>
> At least on x86-64 it is pretty useless too because the .text section can
> be accessed over its alias in the direct mapping.
OK, for x86 only then.
>
> Overall I doubt it is worth it even as a debugging option. I so far cannot
> remember a single bug that was caused by overwriting kernel text.
I had the same concern basically. But I am convinced after seeing the
bug Nikita Danilov points out.
--
Coywolf Qi Hunt
http://sosdg.org/~coywolf/
On Sat, Nov 12, 2005 at 10:32:34PM +0800, Coywolf Qi Hunt wrote:
> 2005/11/12, Andi Kleen <[email protected]>:
> > On Friday 11 November 2005 20:04, Coywolf Qi Hunt wrote:
> > > On Sat, Nov 12, 2005 at 02:57:02AM +0800, Coywolf Qi Hunt wrote:
> > > > And we could also mark text section read-only and data/stack section
> > > > noexec if NX is supported. But I doubt the whole thing would really
> > > > help much. Kill the kernel thread? We can't. We only run into a panic.
> > > > Anyway I'd attach a quick patch to mark text section read only in the
> > > > next mail.
> >
> >
> > I think this whole thing is only usable as a debugging option. It shouldn't
> > be used by default on production systems because it will increase TLB
> > pressure by splitting up the large pages used by kernel. And TLB pressure
> > is critical in many workloads.
> >
> > It definitely shouldn't be on by default.
> >
> > Then the text section will likely not be page aligned, so it would be
> > surprising if it even worked.
>
> It works. I have tested it with { c=_stext[0]; _stext[0]=c;}. No
> effect when it's disabled; panic when it's enabled.
>
> The symbol `_text' is always page aligned. `_etext' is not, but we don't care.
>
> (Bugs: It would conflict with kprobes.)
>
> >
> > At least on x86-64 it is pretty useless too because the .text section can
> > be accessed over its alias in the direct mapping.
>
> OK, for x86 only then.
>
> >
> > Overall I doubt it is worth it even as a debugging option. I so far cannot
> > remember a single bug that was caused by overwriting kernel text.
>
> I had the same concern basically. But I am convinced after seeing the
> bug Nikita Danilov points out.
Mark text section read-only for debug purpose. This is paranoid, but useful to
guard on some bugs.
Signed-off-by: Coywolf Qi Hunt <[email protected]>
---
arch/i386/Kconfig.debug | 9 +++++++++
arch/i386/mm/init.c | 15 ++++++++++++++-
init/main.c | 10 +++++++++-
3 files changed, 32 insertions(+), 2 deletions(-)
diff -pruN 2.6.14-mm2/arch/i386/Kconfig.debug 2.6.14-mm2-cy/arch/i386/Kconfig.debug
--- 2.6.14-mm2/arch/i386/Kconfig.debug 2005-11-11 22:33:28.000000000 +0800
+++ 2.6.14-mm2-cy/arch/i386/Kconfig.debug 2005-11-12 23:10:23.000000000 +0800
@@ -42,6 +42,15 @@ config DEBUG_PAGEALLOC
This results in a large slowdown, but helps to find certain types
of memory corruptions.
+config DEBUG_ROTEXT
+ bool "Write protect kernel text"
+ depends on DEBUG_KERNEL
+ help
+ Mark the kernel text as write-protected in the pagetables,
+ in order to catch accidental (and incorrect) writes to kernel text
+ area. This option will increase TLB pressure thus impact performance.
+ Note this may conflict with kprobes. If in doubt, say "N".
+
config DEBUG_RODATA
bool "Write protect kernel read-only data structures"
depends on DEBUG_KERNEL
diff -pruN 2.6.14-mm2/arch/i386/mm/init.c 2.6.14-mm2-cy/arch/i386/mm/init.c
--- 2.6.14-mm2/arch/i386/mm/init.c 2005-11-11 22:33:29.000000000 +0800
+++ 2.6.14-mm2-cy/arch/i386/mm/init.c 2005-11-12 23:07:12.000000000 +0800
@@ -735,8 +735,21 @@ void free_initmem(void)
printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
}
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_DEBUG_ROTEXT
+void mark_text_ro(void)
+{
+ unsigned long addr = (unsigned long)&_text;
+
+ for (; addr < (unsigned long)&_etext; addr += PAGE_SIZE)
+ change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+
+ printk ("Write protecting the kernel text data: %luk\n",
+ (unsigned long)(_etext - _text) >> 10);
+ global_flush_tlb();
+}
+#endif
+#ifdef CONFIG_DEBUG_RODATA
extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
diff -pruN 2.6.14-mm2/init/main.c 2.6.14-mm2-cy/init/main.c
--- 2.6.14-mm2/init/main.c 2005-11-11 22:34:21.000000000 +0800
+++ 2.6.14-mm2-cy/init/main.c 2005-11-12 23:07:12.000000000 +0800
@@ -100,8 +100,15 @@ extern void acpi_early_init(void);
#else
static inline void acpi_early_init(void) { }
#endif
+
+#ifdef CONFIG_DEBUG_ROTEXT
+ extern void mark_text_ro(void);
+#else
+ static inline void mark_text_ro(void) { }
+#endif
+
#ifndef CONFIG_DEBUG_RODATA
-inline void mark_rodata_ro(void) { }
+static inline void mark_rodata_ro(void) { }
#endif
#ifdef CONFIG_TC
@@ -716,6 +723,7 @@ static int init(void * unused)
*/
free_initmem();
unlock_kernel();
+ mark_text_ro();
mark_rodata_ro();
system_state = SYSTEM_RUNNING;
numa_default_policy();
On Saturday 12 November 2005 00:30, Nikita Danilov wrote:
> Andi Kleen writes:
>
> [...]
>
> >
> > Overall I doubt it is worth it even as a debugging option. I so far cannot
> > remember a single bug that was caused by overwriting kernel text.
>
> I wouldn't forget that one for a long time:
>
> http://marc.theaimsgroup.com/?l=linux-kernel&m=106503116306729&w=2
Ok then maybe as a debug CONFIG, but definitely not default.
-Andi
On Sat, 12 Nov 2005 11:34:55 -0500,
Coywolf Qi Hunt <[email protected]> wrote:
>+config DEBUG_ROTEXT
>+ bool "Write protect kernel text"
>+ depends on DEBUG_KERNEL
>+ help
>+ Mark the kernel text as write-protected in the pagetables,
>+ in order to catch accidental (and incorrect) writes to kernel text
>+ area. This option will increase TLB pressure thus impact performance.
>+ Note this may conflict with kprobes. If in doubt, say "N".
Also conflicts with kdb, kgdb, and any other kernel debugger that uses
software breakpoints.
on PPC32, the kernel uses memory thru BAT registers. How would this
works?
--
Linh Dang