2010-06-07 01:01:06

by Siarhei Liakh

[permalink] [raw]
Subject: [PATCH 2/4 V4] [tip:x86/mm] Set first MB as RW+NX

This patch ensures that there are no mixed (RW+X) pages mapped within
first megabyte of kernel space (from PAGE_OFFSET up to beginning of .text).
This is achieved through introduction of new compile-time option
CONFIG_DEBUG_SET_1ST_MB_RWNX which controls how is_kernel_text()
and static_protections() view the memory area in question.

Special accommodations have been made for BIOS32/PCI BIOS services:
according to BIOS32 specification
(http://members.datafast.net.au/dft0802/specs/bios32.pdf), at most two
pages per BIOS32 service should be set executable and no pages need to
be writeable. This patch modifies bios32_service() to set proper page
access permissions at time of service discovery, as described in the
specification. Further, hardcoded protection of memory area between 640k
to 1Mb have been removed from static_protections(), since only pages
mentioned above need to be executable, not whole BIOS region.

The net result of the patch application is as follows:
========== BEFORE ==========
0xc0000000-0xc0100000 1M RW GLB x pte
0xc0100000-0xc04b6000 3800K ro GLB x pte
...

=========== AFTER ===========
0xc0000000-0xc00fb000 1004K RW GLB NX pte
0xc00fb000-0xc00fd000 8K ro GLB x pte
0xc00fd000-0xc0100000 12K RW GLB NX pte
0xc0100000-0xc04b6000 3800K ro GLB x pte
...

Note that first MB is now RW+NX, except for 8K of RO+X used by BIOS32.

The patch have been developed for Linux 2.6.33-rc5 x86 by Siarhei Liakh
<[email protected]> and Xuxian Jiang <[email protected]>.

V1: original patch "Reducing footprint of BIOS32 service mappings"
V2: Set first MB as RW+NX for linux 2.6.33-rc5
V3: minor adjustments for -tip
V4: Replaced "1st" with "first"

Signed-off-by: Siarhei Liakh <[email protected]>
Signed-off-by: Xuxian Jiang <[email protected]>
---

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 8b3c711..adeaa39 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -121,6 +121,17 @@ config DEBUG_NX_TEST
and the software setup of this feature.
If in doubt, say "N"

+config DEBUG_SET_1ST_MB_RWNX
+ bool "Set first MB of kernel space mapping as RW+NX"
+ default n
+ depends on X86
+ ---help---
+ This option helps to catch unintended modifications/execution of
+ 1st megabyte of kernel address space. Such protection may interfere
+ with run-time code patching and legacy BIOS services residing within
+ the first MB of physical address space.
+ If in doubt, say "N".
+
config 4KSTACKS
bool "Use 4Kb for kernel stacks instead of 8Kb"
depends on X86_32
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca7909..8fb9c3d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -52,6 +52,12 @@
#include <asm/page_types.h>
#include <asm/init.h>

+#ifdef CONFIG_DEBUG_SET_1ST_MB_RWNX
+#define TEXT_START _text
+#else
+#define TEXT_START PAGE_OFFSET
+#endif
+
unsigned long highstart_pfn, highend_pfn;

static noinline int do_test_wp_bit(void);
@@ -225,7 +231,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)

static inline int is_kernel_text(unsigned long addr)
{
- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ if (addr >= (unsigned long)TEXT_START
+ && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e793..93ac4e6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -256,13 +256,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
{
pgprot_t forbidden = __pgprot(0);

+#ifndef CONFIG_DEBUG_SET_1ST_MB_RWNX
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
-
+#endif
/*
* The kernel text needs to be executable for obvious reasons
* Does not cover __inittext since that is gone later on. On
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d16..1102096 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
#include <linux/uaccess.h>
#include <asm/pci_x86.h>
#include <asm/pci-functions.h>
+#include <asm/cacheflush.h>

/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -59,6 +60,16 @@ static struct {
unsigned short segment;
} bios32_indirect = { 0, __KERNEL_CS };

+/* Set two consecutive pages as read-only and executable */
+void set_2_pages_rox(unsigned long addr)
+{
+#ifdef CONFIG_DEBUG_SET_1ST_MB_RWNX
+ unsigned long pg_address = PFN_DOWN(addr + PAGE_OFFSET) << PAGE_SHIFT;
+ set_memory_ro(pg_address, 2);
+ set_memory_x(pg_address, 2);
+#endif
+}
+
/*
* Returns the entry point for the given service, NULL on error
*/
@@ -83,15 +94,19 @@ static unsigned long bios32_service(unsigned long service)
local_irq_restore(flags);

switch (return_code) {
- case 0:
- return address + entry;
- case 0x80: /* Not present */
- printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
- return 0;
- default: /* Shouldn't happen */
- printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
- service, return_code);
- return 0;
+ case 0: /* Service present, set proper page access */
+ address += entry;
+ set_2_pages_rox(address);
+ return address;
+ case 0x80: /* Not present */
+ printk(KERN_WARNING "bios32_service(0x%lx): not present\n",
+ service);
+ return 0;
+ default: /* Shouldn't happen */
+ printk(KERN_WARNING "bios32_service(0x%lx): "
+ "returned 0x%x -- BIOS bug!\n",
+ service, return_code);
+ return 0;
}
}

@@ -332,6 +347,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
bios32_entry);
bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+ set_2_pages_rox(bios32_entry);
if (check_pcibios())
return &pci_bios_access;
}


2010-06-11 22:55:34

by Siarhei Liakh

[permalink] [raw]
Subject: [tip:x86/mm] x86, mm: Set first MB as RW+NX

Commit-ID: c226a2feba210f0ee4a7c5afe08b3ddbcc6990e2
Gitweb: http://git.kernel.org/tip/c226a2feba210f0ee4a7c5afe08b3ddbcc6990e2
Author: Siarhei Liakh <[email protected]>
AuthorDate: Sun, 6 Jun 2010 21:00:57 -0400
Committer: H. Peter Anvin <[email protected]>
CommitDate: Fri, 11 Jun 2010 15:12:52 -0700

x86, mm: Set first MB as RW+NX

This patch ensures that there are no mixed (RW+X) pages mapped within
first megabyte of kernel space (from PAGE_OFFSET up to beginning of .text).
This is achieved through introduction of new compile-time option
CONFIG_DEBUG_SET_1ST_MB_RWNX which controls how is_kernel_text()
and static_protections() view the memory area in question.

Special accommodations have been made for BIOS32/PCI BIOS services:
according to BIOS32 specification
(http://members.datafast.net.au/dft0802/specs/bios32.pdf), at most two
pages per BIOS32 service should be set executable and no pages need to
be writeable. This patch modifies bios32_service() to set proper page
access permissions at time of service discovery, as described in the
specification. Further, hardcoded protection of memory area between 640k
to 1Mb have been removed from static_protections(), since only pages
mentioned above need to be executable, not whole BIOS region.

The net result of the patch application is as follows:
========== BEFORE ==========
0xc0000000-0xc0100000 1M RW GLB x pte
0xc0100000-0xc04b6000 3800K ro GLB x pte
...

=========== AFTER ===========
0xc0000000-0xc00fb000 1004K RW GLB NX pte
0xc00fb000-0xc00fd000 8K ro GLB x pte
0xc00fd000-0xc0100000 12K RW GLB NX pte
0xc0100000-0xc04b6000 3800K ro GLB x pte
...

Note that first MB is now RW+NX, except for 8K of RO+X used by BIOS32.

The patch have been developed for Linux 2.6.33-rc5 x86 by Siarhei Liakh
<[email protected]> and Xuxian Jiang <[email protected]>.

V1: original patch "Reducing footprint of BIOS32 service mappings"
V2: Set first MB as RW+NX for linux 2.6.33-rc5
V3: minor adjustments for -tip
V4: Replaced "1st" with "first"

Signed-off-by: Siarhei Liakh <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Xuxian Jiang <[email protected]>
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/Kconfig.debug | 11 +++++++++++
arch/x86/mm/init_32.c | 9 ++++++++-
arch/x86/mm/pageattr.c | 3 ++-
arch/x86/pci/pcbios.c | 34 +++++++++++++++++++++++++---------
4 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 7508508..4567981 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -121,6 +121,17 @@ config DEBUG_NX_TEST
and the software setup of this feature.
If in doubt, say "N"

+config DEBUG_SET_1ST_MB_RWNX
+ bool "Set first MB of kernel space mapping as RW+NX"
+ default n
+ depends on X86
+ ---help---
+ This option helps to catch unintended modifications/execution of
+ 1st megabyte of kernel address space. Such protection may interfere
+ with run-time code patching and legacy BIOS services residing within
+ the first MB of physical address space.
+ If in doubt, say "N".
+
config 4KSTACKS
bool "Use 4Kb for kernel stacks instead of 8Kb"
depends on X86_32
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca7909..8fb9c3d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -52,6 +52,12 @@
#include <asm/page_types.h>
#include <asm/init.h>

+#ifdef CONFIG_DEBUG_SET_1ST_MB_RWNX
+#define TEXT_START _text
+#else
+#define TEXT_START PAGE_OFFSET
+#endif
+
unsigned long highstart_pfn, highend_pfn;

static noinline int do_test_wp_bit(void);
@@ -225,7 +231,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)

static inline int is_kernel_text(unsigned long addr)
{
- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ if (addr >= (unsigned long)TEXT_START
+ && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 91ce756..fbae8c1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -257,13 +257,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
pgprot_t forbidden = __pgprot(0);
pgprot_t required = __pgprot(0);

+#ifndef CONFIG_DEBUG_SET_1ST_MB_RWNX
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
-
+#endif
/*
* The kernel text needs to be executable for obvious reasons
* Does not cover __inittext since that is gone later on. On
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d16..1102096 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
#include <linux/uaccess.h>
#include <asm/pci_x86.h>
#include <asm/pci-functions.h>
+#include <asm/cacheflush.h>

/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -59,6 +60,16 @@ static struct {
unsigned short segment;
} bios32_indirect = { 0, __KERNEL_CS };

+/* Set two consecutive pages as read-only and executable */
+void set_2_pages_rox(unsigned long addr)
+{
+#ifdef CONFIG_DEBUG_SET_1ST_MB_RWNX
+ unsigned long pg_address = PFN_DOWN(addr + PAGE_OFFSET) << PAGE_SHIFT;
+ set_memory_ro(pg_address, 2);
+ set_memory_x(pg_address, 2);
+#endif
+}
+
/*
* Returns the entry point for the given service, NULL on error
*/
@@ -83,15 +94,19 @@ static unsigned long bios32_service(unsigned long service)
local_irq_restore(flags);

switch (return_code) {
- case 0:
- return address + entry;
- case 0x80: /* Not present */
- printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
- return 0;
- default: /* Shouldn't happen */
- printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
- service, return_code);
- return 0;
+ case 0: /* Service present, set proper page access */
+ address += entry;
+ set_2_pages_rox(address);
+ return address;
+ case 0x80: /* Not present */
+ printk(KERN_WARNING "bios32_service(0x%lx): not present\n",
+ service);
+ return 0;
+ default: /* Shouldn't happen */
+ printk(KERN_WARNING "bios32_service(0x%lx): "
+ "returned 0x%x -- BIOS bug!\n",
+ service, return_code);
+ return 0;
}
}

@@ -332,6 +347,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
bios32_entry);
bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+ set_2_pages_rox(bios32_entry);
if (check_pcibios())
return &pci_bios_access;
}

2010-06-12 13:01:04

by Ingo Molnar

[permalink] [raw]
Subject: Re: [tip:x86/mm] x86, mm: Set first MB as RW+NX


FYI, the NX commits in tip:x86/mm have triggered a new boot crash in -tip
testing (x86, 32-bit):

[ 1.176004] calling pci_arch_init+0x0/0x54 @ 1
[ 1.185812] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[ 1.188000] BUG: unable to handle kernel paging request at c00fbfcb
[ 1.188000] IP: [<c00fbfcb>] 0xc00fbfcb
[ 1.188000] *pdpt = 0000000001d99001 *pde = 000000000240a067 *pte = 80000000000fb163
[ 1.188000] Oops: 0011 [#1] SMP
[ 1.188000] last sysfs file:
[ 1.188000] Modules linked in:
[ 1.188000]
[ 1.188000] Pid: 1, comm: swapper Not tainted 2.6.35-rc3-tip-00975-gb7201cb-dirty #8395 A8N-E/System Product Name
[ 1.188000] EIP: 0060:[<c00fbfcb>] EFLAGS: 00010046 CPU: 1
[ 1.188000] EIP is at 0xc00fbfcb
[ 1.188000] EAX: 0000b101 EBX: 000f0000 ECX: f6838000 EDX: 00000001
[ 1.188000] ESI: 000f21d0 EDI: c1ceddd4 EBP: f6821fa8 ESP: f6821f7a
[ 1.188000] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
[ 1.188000] Process swapper (pid: 1, ti=f6821000 task=f6838000 task.ti=f6821000)
[ 1.188000] Stack:
[ 1.188000] c00f2411 c00f21e1 c2210006 0060c1d4 3fab0000 0202c10a b1010000 335f0000
[ 1.188000] <0> 00015f32 00000000 00020000 1fb40000 c055f682 5ffcc1d4 1fd04655 103af682
[ 1.188000] <0> c036c100 0000c1d4 f6940000 0002c1d6 00020000 1fe40000 73a1f682 0000c1d1
[ 1.188000] Call Trace:
[ 1.188000] Code: ef 66 58 66 5a c3 e8 42 ff cb 00 00 00 00 00 00 66 51 0a ff 75 11 c0 ed 03 80 fd 1f 77 05 80 fd 00 73 04 66 59 f9 c3 66 59 f8 c3 <b1> 05 90 90 c3 b3 28 b7 08 c3 00 00 00 00 00 00 00 00 00 00 00
[ 1.188000] EIP: [<c00fbfcb>] 0xc00fbfcb SS:ESP 0068:f6821f7a
[ 1.188000] CR2: 00000000c00fbfcb
[ 1.188000] ---[ end trace 5a5d197966b56a2e ]---

Config and full bootlog attached. I've excluded them from tip:master for now.

Thanks,

Ingo


Attachments:
(No filename) (1.80 kB)
config (76.45 kB)
crash.log (30.43 kB)
Download all attachments

2010-06-12 16:38:39

by matthieu castet

[permalink] [raw]
Subject: Re: [tip:x86/mm] x86, mm: Set first MB as RW+NX

hi,

> Special accommodations have been made for BIOS32/PCI BIOS services:
> according to BIOS32 specification
> (http://members.datafast.net.au/dft0802/specs/bios32.pdf), at most two
> pages per BIOS32 service should be set executable and no pages need to
> be writeable.
>From my understanding only the service directory take 2 pages.

The no info for the pci service :
- the length field could be used to find the number of page it takes.
- could we assume data ro ?

Also for easier debugging, set_2_pages_rox should print some info.


Matthieu

2010-06-12 20:04:52

by matthieu castet

[permalink] [raw]
Subject: Re: [tip:x86/mm] x86, mm: Set first MB as RW+NX

Selon [email protected]:

> hi,
>
> > Special accommodations have been made for BIOS32/PCI BIOS services:
> > according to BIOS32 specification
> > (http://members.datafast.net.au/dft0802/specs/bios32.pdf), at most two
> > pages per BIOS32 service should be set executable and no pages need to
> > be writeable.
> From my understanding only the service directory take 2 pages.
>
> The no info for the pci service :
> - the length field could be used to find the number of page it takes.
> - could we assume data ro ?
>

http://members.datafast.net.au/dft0802/specs/bios21.pdf got all info for pci
bios [1].
So I was right : we shouldn't assume pci bios routine take only 2 pages, but use
the length parameter.


Matthieu



[1]
The 32-bit PCI BIOS functions must be accessed using CALL FAR. The CS and DS
descriptors must be setup to encompass the physical addresses specified by the
Base and
Length parameters returned by the BIOS32 Service Directory. The CS and DS
descriptors must have the same base. The calling environment must allow access
to IO
space and provide at least 1K of stack space. Platform BIOS writers must assume
that CS
is execute-only and DS is read-only.

2010-06-17 00:36:29

by Siarhei Liakh

[permalink] [raw]
Subject: Re: [tip:x86/mm] x86, mm: Set first MB as RW+NX

>> > Special accommodations have been made for BIOS32/PCI BIOS services:
>> > according to BIOS32 specification
>> > (http://members.datafast.net.au/dft0802/specs/bios32.pdf), at most two
>> > pages per BIOS32 service should be set executable and no pages need to
>> > be writeable.
>> From my understanding only the service directory take 2 pages.
>>
>> The no info for the pci service :
>> - the length field could be used to find the number of page it takes.
>> - could we assume data ro ?
>>
>
> http://members.datafast.net.au/dft0802/specs/bios21.pdf got all info for pci
> bios [1].
> So I was right : we shouldn't assume pci bios routine take only 2 pages, but use
> the length parameter.

Thanks for looking into this. I will be posting another patch shortly.