2022-08-29 10:42:46

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH v2] x86/mm: Refuse W^X violations


x86 has STRICT_*_RWX, but not even a warning when someone violates it.

Add this warning and fully refuse the transition.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/mm/pat/set_memory.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)

--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -580,6 +580,33 @@ static inline pgprot_t static_protection
}

/*
+ * Validate and enforce strict W^X semantics.
+ */
+static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
+ unsigned long pfn, unsigned long npg)
+{
+ unsigned long end;
+
+ if (!cpu_feature_enabled(X86_FEATURE_NX))
+ return new;
+
+ if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
+ return new;
+
+ if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
+ return new;
+
+ end = start + npg * PAGE_SIZE - 1;
+ WARN_ONCE(1, "CPA refuse W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
+ (unsigned long long)pgprot_val(old),
+ (unsigned long long)pgprot_val(new),
+ start, end, pfn);
+
+ /* refuse the transition into WX */
+ return old;
+}
+
+/*
* Lookup the page table entry for a virtual address in a specific pgd.
* Return a pointer to the entry and the level of the mapping.
*/
@@ -885,6 +912,8 @@ static int __should_split_large_page(pte
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);

+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+
/*
* If there is a conflict, split the large page.
*
@@ -1525,6 +1554,7 @@ static int __change_page_attr(struct cpa

if (level == PG_LEVEL_4K) {
pte_t new_pte;
+ pgprot_t old_prot = pte_pgprot(old_pte);
pgprot_t new_prot = pte_pgprot(old_pte);
unsigned long pfn = pte_pfn(old_pte);

@@ -1536,6 +1566,8 @@ static int __change_page_attr(struct cpa
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);

+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+
new_prot = pgprot_clear_protnone_bits(new_prot);

/*


2022-08-29 19:25:06

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On Mon, Aug 29, 2022 at 12:18:03PM +0200, Peter Zijlstra wrote:
>
> x86 has STRICT_*_RWX, but not even a warning when someone violates it.
>
> Add this warning and fully refuse the transition.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

Reviewed-by: Kees Cook <[email protected]>

--
Kees Cook

Subject: [tip: x86/mm] x86/mm: Refuse W^X violations

The following commit has been merged into the x86/mm branch of tip:

Commit-ID: 652c5bf380ad018e15006a7f8349800245ddbbad
Gitweb: https://git.kernel.org/tip/652c5bf380ad018e15006a7f8349800245ddbbad
Author: Peter Zijlstra <[email protected]>
AuthorDate: Mon, 29 Aug 2022 12:18:03 +02:00
Committer: Dave Hansen <[email protected]>
CommitterDate: Thu, 01 Sep 2022 11:10:19 -07:00

x86/mm: Refuse W^X violations

x86 has STRICT_*_RWX, but not even a warning when someone violates it.

Add this warning and fully refuse the transition.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Dave Hansen <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/mm/pat/set_memory.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 6a9043b..1a2d637 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -580,6 +580,33 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
}

/*
+ * Validate and enforce strict W^X semantics.
+ */
+static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
+ unsigned long pfn, unsigned long npg)
+{
+ unsigned long end;
+
+ if (!cpu_feature_enabled(X86_FEATURE_NX))
+ return new;
+
+ if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
+ return new;
+
+ if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
+ return new;
+
+ end = start + npg * PAGE_SIZE - 1;
+ WARN_ONCE(1, "CPA refuse W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
+ (unsigned long long)pgprot_val(old),
+ (unsigned long long)pgprot_val(new),
+ start, end, pfn);
+
+ /* refuse the transition into WX */
+ return old;
+}
+
+/*
* Lookup the page table entry for a virtual address in a specific pgd.
* Return a pointer to the entry and the level of the mapping.
*/
@@ -885,6 +912,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);

+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+
/*
* If there is a conflict, split the large page.
*
@@ -1525,6 +1554,7 @@ repeat:

if (level == PG_LEVEL_4K) {
pte_t new_pte;
+ pgprot_t old_prot = pte_pgprot(old_pte);
pgprot_t new_prot = pte_pgprot(old_pte);
unsigned long pfn = pte_pfn(old_pte);

@@ -1536,6 +1566,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);

+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+
new_prot = pgprot_clear_protnone_bits(new_prot);

/*

2022-09-21 20:13:25

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

Hi,

On Mon, Aug 29, 2022 at 12:18:03PM +0200, Peter Zijlstra wrote:
>
> x86 has STRICT_*_RWX, but not even a warning when someone violates it.
>
> Add this warning and fully refuse the transition.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

I see the following crash when trying to boot qemu using images with
PAE enabled. I checked again after applying "x86/mm/32: Fix W^X detection
when page tables do not support NX", but that did not fix the problem.

Guenter

---
[ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0
ILLOPC: cbc65efa: 0f 0b
[ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0
[ 2.043743] Modules linked in:
[ 2.043978] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc6-next-20220921 #1
[ 2.044277] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 2.044572] EIP: __change_page_attr_set_clr+0xdca/0xdd0
[ 2.044751] Code: 10 8b 45 ac 89 7c 24 04 89 74 24 14 89 4c 24 1c 8d 8e ff 0f 00 00 89 4c 24 18 89 44 24 08 c7 04 24 44 67 08 cd e8 56 38 fb 00 <0f> 0b eb 83 66 90 55 89 e5 57 56 89 d6 53 89 c3 83 ec 58 31 d2 8b
[ 2.045179] EAX: 00000074 EBX: 000a0063 ECX: 00000000 EDX: 00000002
[ 2.045315] ESI: c00a0000 EDI: 00000063 EBP: c115fe4c ESP: c115fd34
[ 2.045445] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00000282
[ 2.045585] CR0: 80050033 CR2: ffbff000 CR3: 0d57c000 CR4: 000006f0
[ 2.046170] Call Trace:
[ 2.046631] ? __purge_vmap_area_lazy+0x6c/0x640
[ 2.046768] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0
[ 2.046923] ? __mutex_unlock_slowpath+0x2b/0x2b0
[ 2.047035] ? purge_fragmented_blocks_allcpus+0x64/0x2c0
[ 2.047199] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0
[ 2.047315] ? _vm_unmap_aliases.part.0+0x54/0x1f0
[ 2.047496] change_page_attr_set_clr+0x11d/0x2d0
[ 2.047738] set_memory_x+0x56/0x60
[ 2.047863] pci_pcbios_init+0xc8/0x28c
[ 2.047981] ? pcibios_resource_survey+0x63/0x63
[ 2.048152] pci_arch_init+0x3c/0x73
[ 2.048242] ? pcibios_resource_survey+0x63/0x63
[ 2.048340] do_one_initcall+0x4f/0x2e0
[ 2.048442] ? __this_cpu_preempt_check+0xf/0x11
[ 2.048578] ? rcu_read_lock_sched_held+0x41/0x70
[ 2.048684] ? trace_initcall_level+0x65/0xa6
[ 2.048805] kernel_init_freeable+0x210/0x264
[ 2.048908] ? rest_init+0x140/0x140
[ 2.049002] kernel_init+0x15/0x110
[ 2.049211] ? schedule_tail_wrapper+0x9/0xc
[ 2.049312] ret_from_fork+0x1c/0x28
[ 2.049547] irq event stamp: 7715
[ 2.049633] hardirqs last enabled at (7723): [<cbce7119>] __up_console_sem+0x69/0x80
[ 2.049822] hardirqs last disabled at (7730): [<cbce70fd>] __up_console_sem+0x4d/0x80
[ 2.049972] softirqs last enabled at (7176): [<cbc29ac7>] call_on_stack+0x47/0x60
[ 2.050153] softirqs last disabled at (7167): [<cbc29ac7>] call_on_stack+0x47/0x60
[ 2.050307] ---[ end trace 0000000000000000 ]---
[ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.
[ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[ 2.051115] BUG: unable to handle page fault for address: c00fd2bf
[ 2.051115] #PF: supervisor instruction fetch in kernel mode
[ 2.051115] #PF: error_code(0x0011) - permissions violation
[ 2.051115] *pdpt = 000000000d578001 *pde = 000000000dc18063 *pte = 80000000000fd063
[ 2.051115] Oops: 0011 [#1] PREEMPT SMP PTI
[ 2.051115] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 6.0.0-rc6-next-20220921 #1
[ 2.051115] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[ 2.051115] EIP: 0xc00fd2bf
[ 2.051115] Code: 06 1e 8c d0 8e d8 66 89 e3 66 0f b7 e4 66 89 e0 66 e8 43 e8 ff ff 66 89 dc 1f 07 66 5f 66 5e 66 5d 66 5b 66 5a 66 59 66 58 cf <9c> 3d 24 50 43 49 75 13 bb 00 00 0f 00 b9 00 00 01 00 ba 1d d2 00
[ 2.051115] EAX: 49435024 EBX: 00000000 ECX: 00000000 EDX: cd1a027f
[ 2.051115] ESI: 00000200 EDI: cd50e7f4 EBP: c115ff08 ESP: c115fee0
[ 2.051115] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00000046
[ 2.051115] CR0: 80050033 CR2: c00fd2bf CR3: 0d57c000 CR4: 000006f0
[ 2.051115] Call Trace:
[ 2.051115] ? pci_pcbios_init+0xfa/0x28c
[ 2.051115] ? pcibios_resource_survey+0x63/0x63
[ 2.051115] pci_arch_init+0x3c/0x73
[ 2.051115] ? pcibios_resource_survey+0x63/0x63
[ 2.051115] do_one_initcall+0x4f/0x2e0
[ 2.051115] ? __this_cpu_preempt_check+0xf/0x11
[ 2.051115] ? rcu_read_lock_sched_held+0x41/0x70
[ 2.051115] ? trace_initcall_level+0x65/0xa6
[ 2.051115] kernel_init_freeable+0x210/0x264
[ 2.051115] ? rest_init+0x140/0x140
[ 2.051115] kernel_init+0x15/0x110
[ 2.051115] ? schedule_tail_wrapper+0x9/0xc
[ 2.051115] ret_from_fork+0x1c/0x28
[ 2.051115] Modules linked in:
[ 2.051115] CR2: 00000000c00fd2bf
[ 2.051115] ---[ end trace 0000000000000000 ]---
[ 2.051115] EIP: 0xc00fd2bf
[ 2.051115] Code: 06 1e 8c d0 8e d8 66 89 e3 66 0f b7 e4 66 89 e0 66 e8 43 e8 ff ff 66 89 dc 1f 07 66 5f 66 5e 66 5d 66 5b 66 5a 66 59 66 58 cf <9c> 3d 24 50 43 49 75 13 bb 00 00 0f 00 b9 00 00 01 00 ba 1d d2 00
[ 2.051115] EAX: 49435024 EBX: 00000000 ECX: 00000000 EDX: cd1a027f
[ 2.051115] ESI: 00000200 EDI: cd50e7f4 EBP: c115ff08 ESP: c115fee0
[ 2.051115] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00000046
[ 2.051115] CR0: 80050033 CR2: c00fd2bf CR3: 0d57c000 CR4: 000006f0
[ 2.051426] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009

---
# bad: [ef08d387bbbc20df740ced8caee0ffac835869ac] Add linux-next specific files for 20220920
# good: [521a547ced6477c54b4b0cc206000406c221b4d6] Linux 6.0-rc6
git bisect start 'HEAD' 'v6.0-rc6'
# good: [df970c033333b10c728198606fe787535e08ab8a] Merge branch 'drm-next' of git://git.freedesktop.org/git/drm/drm.git
git bisect good df970c033333b10c728198606fe787535e08ab8a
# bad: [c46ae7d9b6ad0283ffd7b40117b52444d68e083e] Merge branch 'usb-next' of git://git.kernel.org/pub/scm/linux/kernel/git/johan/usb-serial.git
git bisect bad c46ae7d9b6ad0283ffd7b40117b52444d68e083e
# good: [6a21588fd7f579342d71f2c543d7dca6fd44ff8a] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git
git bisect good 6a21588fd7f579342d71f2c543d7dca6fd44ff8a
# bad: [9b5a7d7a43dc87c6326a23394f37d0786dc9e712] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
git bisect bad 9b5a7d7a43dc87c6326a23394f37d0786dc9e712
# good: [00a0886a99d2aba28e8c9f1c124d9cbbaadab693] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/audit.git
git bisect good 00a0886a99d2aba28e8c9f1c124d9cbbaadab693
# good: [57b16b0bfae3a029815b845e8e623fb02d255d68] Merge branch into tip/master: 'x86/cache'
git bisect good 57b16b0bfae3a029815b845e8e623fb02d255d68
# good: [2632186d3de796a47b2dc00ac9dc9bbe6e70796b] Merge remote-tracking branch 'spi/for-6.1' into spi-next
git bisect good 2632186d3de796a47b2dc00ac9dc9bbe6e70796b
# good: [65c4764941bb230ef00164771fba0cdad0bfd3e4] dt-bindings: phy: hisilicon,hi3670-usb3: simplify example
git bisect good 65c4764941bb230ef00164771fba0cdad0bfd3e4
# bad: [32aefecc271aa1ca4431e0f9094e5a578922527b] Merge branch into tip/master: 'x86/mm'
git bisect bad 32aefecc271aa1ca4431e0f9094e5a578922527b
# good: [16ac81825892970fbe5f32fb379466d19d3d3134] Merge branch into tip/master: 'x86/cpu'
git bisect good 16ac81825892970fbe5f32fb379466d19d3d3134
# good: [77614503f9f135323315a53d60dc001f1a429f7c] Merge branch into tip/master: 'x86/misc'
git bisect good 77614503f9f135323315a53d60dc001f1a429f7c
# bad: [1043897681808118c0f7e70b210774000fe06621] Merge branch 'linus' into x86/mm, to refresh the branch
git bisect bad 1043897681808118c0f7e70b210774000fe06621
# bad: [652c5bf380ad018e15006a7f8349800245ddbbad] x86/mm: Refuse W^X violations
git bisect bad 652c5bf380ad018e15006a7f8349800245ddbbad
# good: [86af8230ce138e0423f43f6b104f3fa050aced6d] x86/mm: Rename set_memory_present() to set_memory_p()
git bisect good 86af8230ce138e0423f43f6b104f3fa050aced6d
# first bad commit: [652c5bf380ad018e15006a7f8349800245ddbbad] x86/mm: Refuse W^X violations

2022-09-21 21:51:27

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On 9/21/22 13:07, Guenter Roeck wrote:
> [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0
> ILLOPC: cbc65efa: 0f 0b
> [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0
...
> [ 2.050307] ---[ end trace 0000000000000000 ]---
> [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.
> [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
> [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf

This _looks_ like it is working as intended. The PCI BIOS code tried to
make a RWX page. The CPA code refused to do it and presumably returned
an error, leaving a RW, non-executable page. The PCI code didn't
check the set_memory_x() return code and tried to go execute anyway.
That resulted in the oops.

I was able to reproduce this pretty easily. The workaround from dmesg
is pci=nobios. That seems to do the trick for me, although that advice
was sandwiched between a warning and an oops, so not the easiest to find.

I'm a bit torn what to do on this one. Breaking the boot is bad, but so
is leaving RWX memory around.

Thoughts?

2022-09-21 23:03:10

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On 9/21/22 13:59, Dave Hansen wrote:
> On 9/21/22 13:07, Guenter Roeck wrote:
>> [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0
>> ILLOPC: cbc65efa: 0f 0b
>> [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0
> ...
>> [ 2.050307] ---[ end trace 0000000000000000 ]---
>> [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.
>> [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
>> [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf
>
> This _looks_ like it is working as intended. The PCI BIOS code tried to
> make a RWX page. The CPA code refused to do it and presumably returned
> an error, leaving a RW page, non-executable page. The PCI code didn't
> check the set_memory_x() return code and tried to go execute anyway.
> That resulted in the oops.
>
> I was able to reproduce this pretty easily. The workaround from dmesg
> is pci=nobios. That seems to do the trick for me, although that advise
> was sandwiched between a warning and an oops, so not the easiest to find.
>
> I'm a bit torn what to do on this one. Breaking the boot is bad, but so
> is leaving RWX memory around.
>
> Thoughts?

For my part I'll do what the above suggests, ie run tests with PAE enabled
with pci=nobios command line option. AFAICS that hides the problem in my tests.
I am just not sure if that is really appropriate.

Guenter

2022-09-22 03:12:40

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On 9/21/22 15:59, Guenter Roeck wrote:
> On 9/21/22 13:59, Dave Hansen wrote:
>> On 9/21/22 13:07, Guenter Roeck wrote:
>>> [    2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0
>>> ILLOPC: cbc65efa: 0f 0b
>>> [    2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0
>> ...
>>> [    2.050307] ---[ end trace 0000000000000000 ]---
>>> [    2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.
>>> [    2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
>>> [    2.051115] BUG: unable to handle page fault for address: c00fd2bf
>>
>> This _looks_ like it is working as intended.  The PCI BIOS code tried to
>> make a RWX page.  The CPA code refused to do it and presumably returned
>> an error, leaving a RW page, non-executable page.  The PCI code didn't
>> check the set_memory_x() return code and tried to go execute anyway.
>> That resulted in the oops.
>>
>> I was able to reproduce this pretty easily.  The workaround from dmesg
>> is pci=nobios.  That seems to do the trick for me, although that advise
>> was sandwiched between a warning and an oops, so not the easiest to find.
>>
>> I'm a bit torn what to do on this one.  Breaking the boot is bad, but so
>> is leaving RWX memory around.
>>
>> Thoughts?
>
> For my part I'll do what the above suggests, ie run tests with PAE enabled
> with pci=nobios command line option. AFAICS that hides the problem in my tests.
> I am just not sure if that is really appropriate.
>

Oh well, that "helped" to hide one of the crashes. Here is another one.
This is with PAE enabled and booting through efi32.

Guenter

---
[ 1.080779] ------------[ cut here ]------------
[ 1.080959] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000d0770000 - 0x00000000d0770fff PFN edcd
ILLOPC: c7465efa: 0f 0b
[ 1.081467] WARNING: CPU: 0 PID: 0 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0
[ 1.082120] Modules linked in:
[ 1.082476] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 6.0.0-rc6-next-20220921 #1
[ 1.082706] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
[ 1.082988] EIP: __change_page_attr_set_clr+0xdca/0xdd0
[ 1.083187] Code: 10 8b 45 ac 89 7c 24 04 89 74 24 14 89 4c 24 1c 8d 8e ff 0f 00 00 89 4c 24 18 89 44 24 08 c7 04 24 38 67 88 c8 e8 56 38 fb 00 <0f> 0b eb 83 66 90 55 89 e5 57 56 89 d6 53 89 c3 83 ec 58 31 d2 8b
[ 1.083672] EAX: 00000076 EBX: 0edcd063 ECX: 00000000 EDX: 00000003
[ 1.083830] ESI: d0770000 EDI: 00000063 EBP: c8a3dea8 ESP: c8a3dd90
[ 1.083984] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00200296
[ 1.084286] CR0: 80050033 CR2: ffbff000 CR3: 08d7c000 CR4: 000006b0
[ 1.084501] Call Trace:
[ 1.084849] ? __this_cpu_preempt_check+0xf/0x11
[ 1.085053] ? __purge_vmap_area_lazy+0x6c/0x640
[ 1.085269] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0
[ 1.085415] ? __mutex_unlock_slowpath+0x2b/0x2b0
[ 1.085536] ? purge_fragmented_blocks_allcpus+0x64/0x2c0
[ 1.085696] ? _vm_unmap_aliases.part.0+0x1d8/0x1f0
[ 1.085820] ? _vm_unmap_aliases.part.0+0x54/0x1f0
[ 1.086004] change_page_attr_set_clr+0x11d/0x2d0
[ 1.086313] ? __efi_memmap_init+0x70/0xd3
[ 1.086475] set_memory_x+0x56/0x60
[ 1.086592] efi_runtime_update_mappings+0x36/0x42
[ 1.086717] efi_enter_virtual_mode+0x351/0x36e
[ 1.086860] start_kernel+0x57d/0x60f
[ 1.086956] ? set_intr_gate+0x42/0x55
[ 1.087079] i386_start_kernel+0x43/0x45
[ 1.087272] startup_32_smp+0x161/0x164
[ 1.087491] irq event stamp: 6582
[ 1.087593] hardirqs last enabled at (6590): [<c74e7119>] __up_console_sem+0x69/0x80
[ 1.087824] hardirqs last disabled at (6597): [<c74e70fd>] __up_console_sem+0x4d/0x80
[ 1.088010] softirqs last enabled at (6571): [<c7429a94>] call_on_stack+0x14/0x60
[ 1.088278] softirqs last disabled at (6614): [<c7429a94>] call_on_stack+0x14/0x60
[ 1.088466] ---[ end trace 0000000000000000 ]---
[ 1.089237] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[ 1.089237] BUG: unable to handle page fault for address: d0810e2a
[ 1.089237] #PF: supervisor instruction fetch in kernel mode
[ 1.089237] #PF: error_code(0x0011) - permissions violation
[ 1.089237] *pdpt = 0000000008d78001 *pde = 000000000eec6067 *pte = 800000000fe98063
[ 1.089237] Oops: 0011 [#1] PREEMPT SMP PTI
[ 1.089237] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W 6.0.0-rc6-next-20220921 #1
[ 1.089237] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
[ 1.089237] EIP: 0xd0810e2a
[ 1.089237] Code: 75 0c ff 75 08 68 c1 45 81 d0 6a 40 e8 ef ce ff ff 83 c4 20 83 ec 0c 53 e8 d4 cf ff ff 83 c4 10 31 c0 8d 65 f4 5b 5e 5f 5d c3 <55> 89 e5 57 56 53 bb 02 00 00 80 83 ec 5c 8b 7d 08 85 ff 0f 84 ed
[ 1.089237] EAX: d0810e2a EBX: 00200202 ECX: 00000049 EDX: 00000000
[ 1.089237] ESI: c8a3df30 EDI: c84c5000 EBP: c8a3df20 ESP: c8a3def8
[ 1.089237] DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00200202
[ 1.089237] CR0: 80050033 CR2: d0810e2a CR3: 08d7c000 CR4: 000006b0
[ 1.089237] Call Trace:
[ 1.089237] ? virt_efi_set_variable_nonblocking+0x80/0xf0
[ 1.089237] ? virt_efi_reset_system+0xe0/0xe0
[ 1.089237] efi_delete_dummy_variable+0x55/0x70
[ 1.089237] efi_enter_virtual_mode+0x356/0x36e
[ 1.089237] start_kernel+0x57d/0x60f
[ 1.089237] ? set_intr_gate+0x42/0x55
[ 1.089237] i386_start_kernel+0x43/0x45
[ 1.089237] startup_32_smp+0x161/0x164
[ 1.089237] Modules linked in:
[ 1.089237] CR2: 00000000d0810e2a
[ 1.089237] ---[ end trace 0000000000000000 ]---
[ 1.089237] EIP: 0xd0810e2a

2022-09-22 08:19:17

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote:

> Oh well, that "helped" to hide one of the crashes. Here is another one.
> This is with PAE enabled and booting through efi32.

> [ 1.086592] efi_runtime_update_mappings+0x36/0x42
> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e
> [ 1.086860] start_kernel+0x57d/0x60f
> [ 1.086956] ? set_intr_gate+0x42/0x55
> [ 1.087079] i386_start_kernel+0x43/0x45
> [ 1.087272] startup_32_smp+0x161/0x164

Does this help? Dave; perhaps we should just let i386 be i386 and let it
bitrot :/

diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e06a199423c0..d81e379fcd43 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void)
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;

+ set_memory_ro(md->virt_addr, md->num_pages);
set_memory_x(md->virt_addr, md->num_pages);
}
}

2022-09-22 15:37:35

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On 9/22/22 00:46, Peter Zijlstra wrote:
> On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote:
>
>> Oh well, that "helped" to hide one of the crashes. Here is another one.
>> This is with PAE enabled and booting through efi32.
>
>> [ 1.086592] efi_runtime_update_mappings+0x36/0x42
>> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e
>> [ 1.086860] start_kernel+0x57d/0x60f
>> [ 1.086956] ? set_intr_gate+0x42/0x55
>> [ 1.087079] i386_start_kernel+0x43/0x45
>> [ 1.087272] startup_32_smp+0x161/0x164
>
> Does this help? Dave; perhaps we should just let i386 be i386 and let it
> bitrot :/

How about we just turn off enforcement for now so that the poor i386
folks can at least boot? I have the feeling we're going to get bored
with even the warnings if they persist for too long, though.

Untested patch to make i386 violations harmless is attached.


Attachments:
norefuse.patch (1.16 kB)

2022-09-22 16:38:38

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On 9/22/22 00:46, Peter Zijlstra wrote:
> On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote:
>
>> Oh well, that "helped" to hide one of the crashes. Here is another one.
>> This is with PAE enabled and booting through efi32.
>
>> [ 1.086592] efi_runtime_update_mappings+0x36/0x42
>> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e
>> [ 1.086860] start_kernel+0x57d/0x60f
>> [ 1.086956] ? set_intr_gate+0x42/0x55
>> [ 1.087079] i386_start_kernel+0x43/0x45
>> [ 1.087272] startup_32_smp+0x161/0x164
>
> Does this help? Dave; perhaps we should just let i386 be i386 and let it
> bitrot :/
>
> diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
> index e06a199423c0..d81e379fcd43 100644
> --- a/arch/x86/platform/efi/efi_32.c
> +++ b/arch/x86/platform/efi/efi_32.c
> @@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void)
> if (md->type != EFI_RUNTIME_SERVICES_CODE)
> continue;
>
> + set_memory_ro(md->virt_addr, md->num_pages);
> set_memory_x(md->virt_addr, md->num_pages);
> }
> }

Yes, it does.

Tested-by: Guenter Roeck <[email protected]>

Guenter

2022-09-22 16:41:03

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On 9/22/22 08:00, Dave Hansen wrote:
> On 9/22/22 00:46, Peter Zijlstra wrote:
>> On Wed, Sep 21, 2022 at 08:09:13PM -0700, Guenter Roeck wrote:
>>
>>> Oh well, that "helped" to hide one of the crashes. Here is another one.
>>> This is with PAE enabled and booting through efi32.
>>
>>> [ 1.086592] efi_runtime_update_mappings+0x36/0x42
>>> [ 1.086717] efi_enter_virtual_mode+0x351/0x36e
>>> [ 1.086860] start_kernel+0x57d/0x60f
>>> [ 1.086956] ? set_intr_gate+0x42/0x55
>>> [ 1.087079] i386_start_kernel+0x43/0x45
>>> [ 1.087272] startup_32_smp+0x161/0x164
>>
>> Does this help? Dave; perhaps we should just let i386 be i386 and let it
>> bitrot :/
>
> How about we just turn off enforcement for now so that the poor i386
> folks can at least boot? I have the feeling we're going to get bored
> with even the warnings if they persist for too long, though.
>

The problem with unfixed warnings is that, if persistent, they hide other
problems, and they cause all warnings to be seen as just useless noise.

Case in point: In ChromeOS, we get literally hundreds of thousands of warning
reports each day (most from drm and wireless drivers). Those originate from
upstream code. No one really cares, and none ever get fixed. Please don't add
more if you don't plan to fix them.

Thanks,
Guenter

2022-09-22 20:36:07

by Dave Hansen

[permalink] [raw]
Subject: [PATCH] x86/mm+efi: Avoid creating W+X mappings

From: Peter Zijlstra <[email protected]>

I'm planning on sticking this in x86/mm so that it goes upstream
along with the W+X detection code.

--

A recent x86/mm change warns and refuses to create W+X mappings.

The 32-bit EFI code tries to create such a mapping and trips over
the new W+X refusal.

Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.

Reported-by: Guenter Roeck <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Tested-by: Guenter Roeck <[email protected]>
Cc: Ard Biesheuvel <[email protected]>
Cc: Darren Hart <[email protected]>
Cc: Andy Shevchenko <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: "H. Peter Anvin" <[email protected]>
Link: https://lore.kernel.org/all/[email protected]/
---
arch/x86/platform/efi/efi_32.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e06a199423c0..d81e379fcd43 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void)
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;

+ set_memory_ro(md->virt_addr, md->num_pages);
set_memory_x(md->virt_addr, md->num_pages);
}
}
--
2.34.1

2022-09-22 22:46:36

by Ard Biesheuvel

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Thu, 22 Sept 2022 at 21:32, Dave Hansen <[email protected]> wrote:
>
> From: Peter Zijlstra <[email protected]>
>
> I'm planning on sticking this in x86/mm so that it goes upstream
> along with the W+X detection code.
>
> --
>
> A recent x86/mm change warns and refuses to create W+X mappings.
>
> The 32-bit EFI code tries to create such a mapping and trips over
> the new W+X refusal.
>
> Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.
>

This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and
.data sections of the EFI runtime PE/COFF executables in memory, so
you are essentially making .data and .bss read-only. (Whether those
executables actually modify their .data and .bss at runtime is a
different matter, but the point is that it used to be possible)

More recent firmwares may provide a 'memory attributes table'
separately which describes the individual sections, but older 32-bit
firmwares are not even built with 4k section alignment, so code and
data may share a single page. Note that we haven't wired up this
memory attributes table on i386 at the moment, and I seriously doubt
that 32-bit firmware in the field exposes it.

Can we just turn off this feature for 32-bit?

> Reported-by: Guenter Roeck <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Tested-by: Guenter Roeck <[email protected]>
> Cc: Ard Biesheuvel <[email protected]>
> Cc: Darren Hart <[email protected]>
> Cc: Andy Shevchenko <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: "H. Peter Anvin" <[email protected]>
> Link: https://lore.kernel.org/all/[email protected]/
> ---
> arch/x86/platform/efi/efi_32.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
> index e06a199423c0..d81e379fcd43 100644
> --- a/arch/x86/platform/efi/efi_32.c
> +++ b/arch/x86/platform/efi/efi_32.c
> @@ -136,6 +136,7 @@ void __init efi_runtime_update_mappings(void)
> if (md->type != EFI_RUNTIME_SERVICES_CODE)
> continue;
>
> + set_memory_ro(md->virt_addr, md->num_pages);
> set_memory_x(md->virt_addr, md->num_pages);
> }
> }

2022-09-23 07:23:57

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote:
> On Thu, 22 Sept 2022 at 21:32, Dave Hansen <[email protected]> wrote:
> >
> > From: Peter Zijlstra <[email protected]>
> >
> > I'm planning on sticking this in x86/mm so that it goes upstream
> > along with the W+X detection code.
> >
> > --
> >
> > A recent x86/mm change warns and refuses to create W+X mappings.
> >
> > The 32-bit EFI code tries to create such a mapping and trips over
> > the new W+X refusal.
> >
> > Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.
> >
>
> This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and
> .data sections of the EFI runtime PE/COFF executables in memory, so
> you are essentially making .data and .bss read-only. (Whether those
> executables actually modify their .data and .bss at runtime is a
> different matter, but the point is that it used to be possible)
>
> More recent firmwares may provide a 'memory attributes table'
> separately which describes the individual sections, but older 32-bit
> firmwares are not even built with 4k section alignment, so code and
> data may share a single page. Note that we haven't wired up this
> memory attributes table on i386 at the moment, and I seriously doubt
> that 32-bit firmware in the field exposes it.
>
> Can we just turn off this feature for 32-bit?

Goodie; some seriously security minded people who did that EFI turd :/
Let's just heap it on the pile of 32bit sucks and should not be
considered a security target anymore and indeed kill this feature.


2022-09-23 10:00:06

by Ard Biesheuvel

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

(cc Kees)

On Fri, 23 Sept 2022 at 09:00, Peter Zijlstra <[email protected]> wrote:
>
> On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote:
> > On Thu, 22 Sept 2022 at 21:32, Dave Hansen <[email protected]> wrote:
> > >
> > > From: Peter Zijlstra <[email protected]>
> > >
> > > I'm planning on sticking this in x86/mm so that it goes upstream
> > > along with the W+X detection code.
> > >
> > > --
> > >
> > > A recent x86/mm change warns and refuses to create W+X mappings.
> > >
> > > The 32-bit EFI code tries to create such a mapping and trips over
> > > the new W+X refusal.
> > >
> > > Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.
> > >
> >
> > This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and
> > .data sections of the EFI runtime PE/COFF executables in memory, so
> > you are essentially making .data and .bss read-only. (Whether those
> > executables actually modify their .data and .bss at runtime is a
> > different matter, but the point is that it used to be possible)
> >
> > More recent firmwares may provide a 'memory attributes table'
> > separately which describes the individual sections, but older 32-bit
> > firmwares are not even built with 4k section alignment, so code and
> > data may share a single page. Note that we haven't wired up this
> > memory attributes table on i386 at the moment, and I seriously doubt
> > that 32-bit firmware in the field exposes it.
> >
> > Can we just turn off this feature for 32-bit?
>
> Goodie; some seriously security minded people who did that EFI turd :/

To be fair, most people tended to care more about memory footprint
than about security at the time. And I don't recall a lot of
enthusiasm in the Linux community either for rounding up kernel
sections so they could be mapped with W^X permissions. And without
PAE, all memory is executable anyway.

> Let's just heap it on the pile of 32bit sucks and should not be
> considered a security target anymore and indeed kill this feature.
>

I take it this issue is triggered by the fact that i386 maps the EFI
runtime regions into the kernel page tables, and they are therefore always
mapped, right? If anyone cares enough about this to go and fix it, we
could switch to the approach we use everywhere else, i.e., treat EFI
memory as user space mappings, and activate them only while a runtime
service is in progress.

But frankly, why would anyone still be running this? With the EFI
mixed mode support, only systems with CPUs that don't actually
implement long mode still need this, and I am skeptical that such
deployments would use recent kernels.

2022-09-23 14:43:23

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On 9/23/22 02:49, Ard Biesheuvel wrote:
> (cc Kees)
>
> On Fri, 23 Sept 2022 at 09:00, Peter Zijlstra <[email protected]> wrote:
>>
>> On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote:
>>> On Thu, 22 Sept 2022 at 21:32, Dave Hansen <[email protected]> wrote:
>>>>
>>>> From: Peter Zijlstra <[email protected]>
>>>>
>>>> I'm planning on sticking this in x86/mm so that it goes upstream
>>>> along with the W+X detection code.
>>>>
>>>> --
>>>>
>>>> A recent x86/mm change warns and refuses to create W+X mappings.
>>>>
>>>> The 32-bit EFI code tries to create such a mapping and trips over
>>>> the new W+X refusal.
>>>>
>>>> Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.
>>>>
>>>
>>> This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and
>>> .data sections of the EFI runtime PE/COFF executables in memory, so
>>> you are essentially making .data and .bss read-only. (Whether those
>>> executables actually modify their .data and .bss at runtime is a
>>> different matter, but the point is that it used to be possible)
>>>
>>> More recent firmwares may provide a 'memory attributes table'
>>> separately which describes the individual sections, but older 32-bit
>>> firmwares are not even built with 4k section alignment, so code and
>>> data may share a single page. Note that we haven't wired up this
>>> memory attributes table on i386 at the moment, and I seriously doubt
>>> that 32-bit firmware in the field exposes it.
>>>
>>> Can we just turn off this feature for 32-bit?
>>
>> Goodie; some seriously security minded people who did that EFI turd :/
>
> To be fair, most people tended to care more about memory footprint
> than about security at the time. And I don't recall a lot of
> enthusiasm in the Linux community either for rounding up kernel
> sections so they could be mapped with W^X permissions. And without
> PAE, all memory is executable anyway.
>
>> Let's just heap it on the pile of 32bit sucks and should not be
>> considered a security target anymore and indeed kill this feature.
>>
>
> I take it this issue is triggered by the fact that i386 maps the EFI
> runtime regions into the kernel page tables, and are therefore always
> mapped, right? If anyone cares enough about this to go and fix it, we
> could switch to the approach we use everywhere else, i.e., treat EFI
> memory as user space mappings, and activate them only while a runtime
> service is in progress.
>
> But frankly, why would anyone still be running this? With the EFI
> mixed mode support, only systems with CPUs that don't actually
> implement long mode still need this, and I am skeptical that such
> deployments would use recent kernels.

It is supported, thus I run qemu tests for it. That is the whole point
of testing, after all. If PAE (assuming that is what you are talking
about) is no longer supported or supportable, its support should be
removed. If so, I'll be very happy to stop testing it.

Thanks,
Guenter

2022-09-23 14:53:58

by Ard Biesheuvel

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Fri, 23 Sept 2022 at 15:58, Guenter Roeck <[email protected]> wrote:
>
> On 9/23/22 02:49, Ard Biesheuvel wrote:
> > (cc Kees)
> >
> > On Fri, 23 Sept 2022 at 09:00, Peter Zijlstra <[email protected]> wrote:
> >>
> >> On Fri, Sep 23, 2022 at 12:08:57AM +0200, Ard Biesheuvel wrote:
> >>> On Thu, 22 Sept 2022 at 21:32, Dave Hansen <[email protected]> wrote:
> >>>>
> >>>> From: Peter Zijlstra <[email protected]>
> >>>>
> >>>> I'm planning on sticking this in x86/mm so that it goes upstream
> >>>> along with the W+X detection code.
> >>>>
> >>>> --
> >>>>
> >>>> A recent x86/mm change warns and refuses to create W+X mappings.
> >>>>
> >>>> The 32-bit EFI code tries to create such a mapping and trips over
> >>>> the new W+X refusal.
> >>>>
> >>>> Make the EFI_RUNTIME_SERVICES_CODE mapping read-only to fix it.
> >>>>
> >>>
> >>> This is not safe. EFI_RUNTIME_SERVICES_CODE covers both .text and
> >>> .data sections of the EFI runtime PE/COFF executables in memory, so
> >>> you are essentially making .data and .bss read-only. (Whether those
> >>> executables actually modify their .data and .bss at runtime is a
> >>> different matter, but the point is that it used to be possible)
> >>>
> >>> More recent firmwares may provide a 'memory attributes table'
> >>> separately which describes the individual sections, but older 32-bit
> >>> firmwares are not even built with 4k section alignment, so code and
> >>> data may share a single page. Note that we haven't wired up this
> >>> memory attributes table on i386 at the moment, and I seriously doubt
> >>> that 32-bit firmware in the field exposes it.
> >>>
> >>> Can we just turn off this feature for 32-bit?
> >>
> >> Goodie; some seriously security minded people who did that EFI turd :/
> >
> > To be fair, most people tended to care more about memory footprint
> > than about security at the time. And I don't recall a lot of
> > enthusiasm in the Linux community either for rounding up kernel
> > sections so they could be mapped with W^X permissions. And without
> > PAE, all memory is executable anyway.
> >
> >> Let's just heap it on the pile of 32bit sucks and should not be
> >> considered a security target anymore and indeed kill this feature.
> >>
> >
> > I take it this issue is triggered by the fact that i386 maps the EFI
> > runtime regions into the kernel page tables, and are therefore always
> > mapped, right? If anyone cares enough about this to go and fix it, we
> > could switch to the approach we use everywhere else, i.e., treat EFI
> > memory as user space mappings, and activate them only while a runtime
> > service is in progress.
> >
> > But frankly, why would anyone still be running this? With the EFI
> > mixed mode support, only systems with CPUs that don't actually
> > implement long mode still need this, and I am skeptical that such
> > deployments would use recent kernels.
>
> It is supported, thus I run qemu tests for it. That is the whole point
> of testing, after all.

I completely agree with that, and I think all the testing you do is
extremely valuable.

> If PAE (assuming that is what you are talking about)

Not at all - I was referring to i386 support in general.

I was basically making the point that we still support i386 without
PAE (which is a prerequisite for supporting non-executable mappings),
and if we are going to be pedantic about security on this
architecture, we should probably make PAE mandatory as well.

If we are ok with the current state, enabling this permission check on
i386 makes no sense.

> is no longer supported or supportable, its support should be
> removed. If so, I'll be very happy to stop testing it.
>

I'd say there are better ways to spend those cycles, but for the time
being, I think we should continue testing it, as otherwise it will
just bit rot.

2022-09-23 18:45:29

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Fri, Sep 23, 2022 at 04:26:58PM +0200, Ard Biesheuvel wrote:
> I was basically making the point that we still support i386 without
> PAE (which is a prerequisite for supporting non-executable mappings),
> and if we are going to be pedantic about security on this
> architecture, we should probably make PAE mandatory as well.

My expectation would be that if someone is running modern kernels on i386,
they're not using PAE. If they care about PAE, I'd expect them to have
long since moved to x86_64.

> If we are ok with the current state, enabling this permission check on
> i386 makes no sense.

I'd agree. If it's a choice between "spend a lot of time making sure
this works correctly on i386" and "don't do this at all on i386", I
would pick the latter. If someone steps up to do the former, then by
all means take the patches.

--
Kees Cook

2022-09-23 20:05:21

by Ard Biesheuvel

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Fri, 23 Sept 2022 at 20:31, Kees Cook <[email protected]> wrote:
>
> On Fri, Sep 23, 2022 at 04:26:58PM +0200, Ard Biesheuvel wrote:
> > I was basically making the point that we still support i386 without
> > PAE (which is a prerequisite for supporting non-executable mappings),
> > and if we are going to be pedantic about security on this
> > architecture, we should probably make PAE mandatory as well.
>
> My expectation would be that if someone is running modern kernels on i386,
> they're not using PAE. If they care about PAE, I'd expect them to have
> long since moved to x86_64.
>

Not sure I follow. If they care about PAE, they turn it on. Or do you
mean 'if they care about being able to address lots of physical
memory'? Because the *other* reason you might care about PAE is
because it gives you NX support.

But currently, PAE is not even enabled in the i386_defconfig, and
defaults to off. This means people that are unaware of this won't
enable it, and will be running without NX support.

> > If we are ok with the current state, enabling this permission check on
> > i386 makes no sense.
>
> I'd agree. If it's a choice between "spend a lot of time making sure
> this works correctly on i386" and "don't do this at all on i386", I
> would pick the latter. If someone steps up to do the former, then by
> all means take the patches.
>

OK, so it seems we're all in violent agreement here. And if there is
ever a push for enabling security features on 32-bit, we can add this
to the laundry list of things that need to be looked at.

2022-09-23 22:04:50

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Fri, Sep 23, 2022 at 09:53:02PM +0200, Ard Biesheuvel wrote:
> On Fri, 23 Sept 2022 at 20:31, Kees Cook <[email protected]> wrote:
> >
> > On Fri, Sep 23, 2022 at 04:26:58PM +0200, Ard Biesheuvel wrote:
> > > I was basically making the point that we still support i386 without
> > > PAE (which is a prerequisite for supporting non-executable mappings),
> > > and if we are going to be pedantic about security on this
> > > architecture, we should probably make PAE mandatory as well.
> >
> > My expectation would be that if someone is running modern kernels on i386,
> > they're not using PAE. If they care about PAE, I'd expect them to have
> > long since moved to x86_64.
> >
>
> Not sure I follow. If they care about PAE, they turn it on. Or do you
> mean 'if they care about being able to address lots of physical
> memory'? Because the *other* reason you might care about PAE is
> because it gives you NX support.

Right, I meant if they care about NX (and the topic of this thread)
they want PAE, and if they want PAE, they likely moved to x86_64 long
long ago for new kernels.

> But currently, PAE is not even enabled in the i386_defconfig, and
> defaults to off. This means people that are unaware of this won't
> enable it, and will be running without NX support.

And they all make me cry. ;)

> > > If we are ok with the current state, enabling this permission check on
> > > i386 makes no sense.
> >
> > I'd agree. If it's a choice between "spend a lot of time making sure
> > this works correctly on i386" and "don't do this at all on i386", I
> > would pick the latter. If someone steps up to do the former, then by
> > all means take the patches.
> >
>
> OK, so it seems we're all in violent agreement here. And if there is
> ever a push for enabling security features on 32-bit, we can add this
> to the laundry list of things that need to be looked at.

Yup.

--
Kees Cook

2022-09-23 22:46:00

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On 9/23/22 14:19, Kees Cook wrote:
>> But currently, PAE is not even enabled in the i386_defconfig, and
>> defaults to off. This means people that are unaware of this won't
>> enable it, and will be running without NX support.
> And they all make me cry. ;)

It's been like that for a long time, presumably because the defconfig
should *boot* in as many cases as possible. It wouldn't be hard to
change. It also wouldn't be hard to default to HIGHMEM4G (non-PAE) on
targeted builds for CPUs that don't support it. Patch attached to do
that, if anyone else has an opinion.

We should probably just leave i386 alone, but it breaks my heart to see
Kees in tears.


Attachments:
pae.patch (924.00 B)

2022-09-23 22:46:21

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

Dave Hansen <[email protected]> writes:

> On 9/23/22 14:19, Kees Cook wrote:
>>> But currently, PAE is not even enabled in the i386_defconfig, and
>>> defaults to off. This means people that are unaware of this won't
>>> enable it, and will be running without NX support.
>> And they all make me cry. ;)
>
> It's been like that for a long time, presumably because the defconfig
> should *boot* in as many cases as possible. It wouldn't be hard to
> change. It also wouldn't be hard to default to HIGHMEM4G (non-PAE) on
> targeted builds for CPUs that don't support it. Patch attached to do
> that, if anyone else has an opinion.
>
> We should probably just leave i386 alone, but it breaks my heart to see
> Kees in tears.

Is it at all possible to simply drop efi support for 32bit builds?

Last I looked (and it was quite a while ago) efi was only supported on the
same architecture. So we are talking about 32bit efi for 32bit kernels.

I think there were only a handful of systems that ever shipped 32bit
efi, because when 32bit efi came out 64bit processors had been shipping
for several years already.

We still probably need to deal with whatever is needed for the BIOS.

If there are enough interesting systems to care about keeping the few systems
that shipped with 32bit efi support going, it probably does make sense to
change how it is implemented. Using the kernel's page tables has been nasty
and has given kexec all kinds of challenges to support, because not only
does efi have strange mapping attributes, but efi also winds up living at a
fixed virtual address that can't be changed. So if you care about anything
like address space layout randomization, efi provides a well-known fixed
target that defeats all of your work there as well.

Can we do something to isolate 32bit efi so it is not a painpoint?

Given how long 8bit and 16bit systems have lasted I rather suspect 32bit
x86 will last in some embedded form for a very long time. PAE came in
around the first Pentiums, I think, so most embedded i386 processors
should support it.

Eric

2022-09-24 00:07:33

by Kees Cook

[permalink] [raw]
Subject: Re: [PATCH] x86/mm+efi: Avoid creating W+X mappings

On Fri, Sep 23, 2022 at 03:15:15PM -0700, Dave Hansen wrote:
> On 9/23/22 14:19, Kees Cook wrote:
> >> But currently, PAE is not even enabled in the i386_defconfig, and
> >> defaults to off. This means people that are unaware of this won't
> >> enable it, and will be running without NX support.
> > And they all make me cry. ;)
>
> It's been like that for a long time, presumably because the defconfig
> should *boot* in as many cases as possible. It wouldn't be hard to
> change. It also wouldn't be hard to default to HIGHMEM4G (non-PAE) on
> targeted builds for CPUs that don't support it. Patch attached to do
> that, if anyone else has an opinion.
>
> We should probably just leave i386 alone, but it breaks my heart to see
> Kees in tears.

*dabs his eyes with tissue*

> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index f9920f1341c8..fad978c7b7c5 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1363,9 +1363,14 @@ config X86_CPUID
> with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
> /dev/cpu/31/cpuid.
>
> +config CPU_HAS_PAE
> + def_bool y
> + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
> +
> choice
> prompt "High Memory Support"
> default HIGHMEM4G
> + default HIGHMEM64G if CPU_HAS_PAE
> depends on X86_32
>
> config NOHIGHMEM
> @@ -1412,7 +1417,7 @@ config HIGHMEM4G
>
> config HIGHMEM64G
> bool "64GB"
> - depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
> + depends on CPU_HAS_PAE
> select X86_PAE
> help
> Select this if you have a 32-bit processor and more than 4

I feel happy now! :)

--
Kees Cook

2022-10-02 10:39:05

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On Wed 2022-09-21 13:59:06, Dave Hansen wrote:
> On 9/21/22 13:07, Guenter Roeck wrote:
> > [ 2.042861] CPA refuse W^X violation: 8000000000000063 -> 0000000000000063 range: 0x00000000c00a0000 - 0x00000000c00a0fff PFN a0
> > ILLOPC: cbc65efa: 0f 0b
> > [ 2.043267] WARNING: CPU: 0 PID: 1 at arch/x86/mm/pat/set_memory.c:600 __change_page_attr_set_clr+0xdca/0xdd0
> ...
> > [ 2.050307] ---[ end trace 0000000000000000 ]---
> > [ 2.050762] PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.
> > [ 2.051115] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
> > [ 2.051115] BUG: unable to handle page fault for address: c00fd2bf
>
> This _looks_ like it is working as intended. The PCI BIOS code tried to
> make a RWX page. The CPA code refused to do it and presumably returned
> an error, leaving a RW, non-executable page. The PCI code didn't
> check the set_memory_x() return code and tried to go execute anyway.
> That resulted in the oops.
>
> I was able to reproduce this pretty easily. The workaround from dmesg
> is pci=nobios. That seems to do the trick for me, although that advice
> was sandwiched between a warning and an oops, so not the easiest to find.
>
> I'm a bit torn what to do on this one. Breaking the boot is bad, but so
> is leaving RWX memory around.

Well, the original patch is bad. Boot regressions are not acceptable.

We should first add a WARN_ON(), debug and fix the failures, then we
can start refusing the transitions.

Best regards,
Pavel

--
People of Russia, stop Putin before his war on Ukraine escalates.


Attachments:
(No filename) (1.61 kB)
signature.asc (201.00 B)
Download all attachments

2022-10-24 17:20:55

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mm: Refuse W^X violations

On Mon, 29 Aug 2022 12:18:03 +0200
Peter Zijlstra <[email protected]> wrote:

> x86 has STRICT_*_RWX, but not even a warning when someone violates it.
>
> Add this warning and fully refuse the transition.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

So, this now triggers on enabling function tracing at boot up:

"ftrace=function"

This:

SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=4, Nodes=1
Kernel/User page tables isolation: enabled
ftrace: allocating 68738 entries in 269 pages
ftrace: allocated 269 pages with 4 groups
Starting tracer 'function'
------------[ cut here ]------------
CPA detected W^X violation: 8000000000000063 -> 0000000000000063 range: 0xffffffffc0013000 - 0xffffffffc0013fff PFN 10031b
WARNING: CPU: 0 PID: 0 at arch/x86/mm/pat/set_memory.c:609 verify_rwx+0x61/0x6d
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 6.1.0-rc1-test+ #3
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
RIP: 0010:verify_rwx+0x61/0x6d
Code: e5 01 00 75 27 49 c1 e0 0c 48 89 d1 48 89 fe 48 c7 c7 5b b3 92 84 4e 8d 44 02 ff 48 89 da c6 05 71 29 e5 01 01 e8 35 90 e2 00 <0f> 0b 48 89 d8 5b 5d e9 6f 95 1a 01 0f 1f 44 00 00 55 48 89 e5 53
RSP: 0000:ffffffff84c03b08 EFLAGS: 00010086
RAX: 0000000000000000 RBX: 0000000000000063 RCX: 0000000000000003
RDX: 0000000000000003 RSI: ffffffff84c039b0 RDI: 0000000000000001
RBP: ffffffff84c03b10 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000025 R12: ffff8e730031c098
R13: 000000000010031b R14: 800000010031b063 R15: 8000000000000063
FS: 0000000000000000(0000) GS:ffff8e7416a00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffff8e73fd801000 CR3: 00000001fcc22001 CR4: 00000000000606f0
Call Trace:
<TASK>
__change_page_attr_set_clr+0x146/0x8a6
? __mutex_unlock_slowpath+0x41/0x213
? mutex_unlock+0x12/0x18
? _vm_unmap_aliases+0x126/0x136
change_page_attr_set_clr+0x135/0x268
? find_vmap_area+0x32/0x3e
? __fentry__+0x10/0x10
change_page_attr_clear.constprop.0+0x16/0x1c
set_memory_x+0x2c/0x32
arch_ftrace_update_trampoline+0x218/0x2db
? ftrace_caller_op_ptr+0x17/0x17
ftrace_update_trampoline+0x16/0xa1
? tracing_gen_ctx+0x1c/0x1c
__register_ftrace_function+0x93/0xb2
ftrace_startup+0x21/0xf0
? tracing_gen_ctx+0x1c/0x1c
register_ftrace_function_nolock+0x26/0x40
register_ftrace_function+0x4e/0x143
? mutex_unlock+0x12/0x18
? tracing_gen_ctx+0x1c/0x1c
function_trace_init+0x7d/0xc3
tracer_init+0x23/0x2c
tracing_set_tracer+0x1d5/0x206
register_tracer+0x1c0/0x1e4
init_function_trace+0x90/0x96
early_trace_init+0x25c/0x352
start_kernel+0x424/0x6e4
x86_64_start_reservations+0x24/0x2a
x86_64_start_kernel+0x8c/0x95
secondary_startup_64_no_verify+0xe0/0xeb
</TASK>
---[ end trace 0000000000000000 ]---


-- Steve