This series is a whole bunch of page fault cleanups, plus a couple
of OOPS diagnostic improvements. The overall goals are to clean up
handling of the faulting CPL, the USER bit in the error_code, and
the log messages generated by #PF OOPSes.
This series can also be seen as CET preparation. CET introduces the
WRUSS instruction, which is the very first way for CPL 0 code to
cause a #PF fault with the USER bit set. Let's get the page fault
code into shape before we start using WRUSS :)
Changes from v1:
- Rebase on top of tip:x86/mm, now that a bunch of v1 was applied.
The only material changes are that 'x86/fault: Check
user_mode(regs) when validating a stack extension' is gone
because the code it fixed has been deleted and that 'x86/fault:
Remove sw_error_code' lost the hunk that changed the same code.
Andy Lutomirski (5):
x86/fault: Remove sw_error_code
x86/fault: Don't try to recover from an implicit supervisor access
x86/oops: Show the correct CS value in show_regs()
x86/fault: Decode page fault OOPSes better
x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code
arch/x86/entry/vsyscall/vsyscall_64.c | 2 +-
arch/x86/kernel/process_64.c | 5 +-
arch/x86/mm/fault.c | 144 +++++++++++++++++++-------
3 files changed, 108 insertions(+), 43 deletions(-)
--
2.17.2
One of Linus' favorite hobbies seems to be looking at OOPSes and
decoding the error code in his head. This is not one of my favorite
hobbies :)
Teach the page fault OOPS hander to decode the error code. If it's
a !USER fault from user mode, print an explicit note to that effect
and print out the addresses of various tables that might cause such
an error.
With this patch applied, if I intentionally point the LDT at 0x0 and
run the x86 selftests, I get:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
HW error: normal kernel read fault
This was a system access from user code
IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
LDTR: 0x50 -- base=0x0 limit=0xfff7
TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
SMP PTI
CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
Hardware name: ...
RIP: 0033:0x401454
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/mm/fault.c | 84 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ca38bd0472f2..f5efbdba2b6d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/desc.h> /* store_idt(), ... */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -571,10 +572,53 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
return 0;
}
+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+{
+ u32 offset = (index >> 3) * sizeof(struct desc_struct);
+ unsigned long addr;
+ struct ldttss_desc desc;
+
+ if (index == 0) {
+ pr_alert("%s: NULL\n", name);
+ return;
+ }
+
+ if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+ pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+ return;
+ }
+
+ if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+ sizeof(struct ldttss_desc))) {
+ pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+ name, index);
+ return;
+ }
+
+ addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+#ifdef CONFIG_X86_64
+ addr |= ((u64)desc.base3 << 32);
+#endif
+ pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+ name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+}
+
+static void errstr(unsigned long ec, char *buf, unsigned long mask,
+ const char *txt)
+{
+ if (ec & mask) {
+ if (buf[0])
+ strcat(buf, " ");
+ strcat(buf, txt);
+ }
+}
+
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
+ char errtxt[64];
+
if (!oops_may_print())
return;
@@ -602,6 +646,46 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
(void *)address);
+ errtxt[0] = 0;
+ errstr(error_code, errtxt, X86_PF_PROT, "PROT");
+ errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
+ errstr(error_code, errtxt, X86_PF_USER, "USER");
+ errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
+ errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
+ errstr(error_code, errtxt, X86_PF_PK, "PK");
+ pr_alert("HW error: %s\n", error_code ? errtxt :
+ "normal kernel read fault");
+ if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+ struct desc_ptr idt, gdt;
+ u16 ldtr, tr;
+
+ pr_alert("This was a system access from user code\n");
+
+ /*
+ * This can happen for quite a few reasons. The more obvious
+ * ones are faults accessing the GDT, or LDT. Perhaps
+ * surprisingly, if the CPU tries to deliver a benign or
+ * contributory exception from user code and gets a page fault
+ * during delivery, the page fault can be delivered as though
+ * it originated directly from user code. This could happen
+ * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+ * kernel or IST stack.
+ */
+ store_idt(&idt);
+
+ /* Usable even on Xen PV -- it's just slow. */
+ native_store_gdt(&gdt);
+
+ pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+ idt.address, idt.size, gdt.address, gdt.size);
+
+ store_ldt(ldtr);
+ show_ldttss(&gdt, "LDTR", ldtr);
+
+ store_tr(tr);
+ show_ldttss(&gdt, "TR", tr);
+ }
+
dump_pagetable(address);
}
--
2.17.2
This avoids a situation in which we attempt to apply various fixups
that are not intended to handle implicit supervisor accesses from
user mode if we screw up in away that causes this type of fault.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/mm/fault.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82881bc5feef..ca38bd0472f2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -653,6 +653,15 @@ no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long flags;
int sig;
+ if (user_mode(regs)) {
+ /*
+ * This is an implicit supervisor-mode access from user
+ * mode. Bypass all the kernel-mode recovery code and just
+ * OOPS.
+ */
+ goto oops;
+ }
+
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
@@ -738,6 +747,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (IS_ENABLED(CONFIG_EFI))
efi_recover_from_page_fault(address);
+oops:
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice:
--
2.17.2
All of the fault handling code now corrently checks user_mode(regs)
as needed, and nothing depends on the X86_PF_USER bit being munged.
Get rid of the sw_error code and use hw_error_code everywhere.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/mm/fault.c | 50 ++++++++++-----------------------------------
1 file changed, 11 insertions(+), 39 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b898a38093a3..82881bc5feef 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1217,7 +1217,6 @@ void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
- unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
@@ -1262,13 +1261,6 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
}
- /*
- * hw_error_code is literally the "page fault error code" passed to
- * the kernel directly from the hardware. But, we will shortly be
- * modifying it in software, so give it a new name.
- */
- sw_error_code = hw_error_code;
-
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1278,26 +1270,6 @@ void do_user_addr_fault(struct pt_regs *regs,
*/
if (user_mode(regs)) {
local_irq_enable();
- /*
- * Up to this point, X86_PF_USER set in hw_error_code
- * indicated a user-mode access. But, after this,
- * X86_PF_USER in sw_error_code will indicate either
- * that, *or* an implicit kernel(supervisor)-mode access
- * which originated from user mode.
- */
- if (!(hw_error_code & X86_PF_USER)) {
- /*
- * The CPU was in user mode, but the CPU says
- * the fault was not a user-mode access.
- * Must be an implicit kernel-mode access,
- * which we do not expect to happen in the
- * user address space.
- */
- pr_warn_once("kernel-mode error from user-mode: %lx\n",
- hw_error_code);
-
- sw_error_code |= X86_PF_USER;
- }
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1306,9 +1278,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (sw_error_code & X86_PF_WRITE)
+ if (hw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (sw_error_code & X86_PF_INSTR)
+ if (hw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1321,7 +1293,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
- if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+ if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
@@ -1345,7 +1317,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, sw_error_code, address);
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
retry:
@@ -1361,17 +1333,17 @@ void do_user_addr_fault(struct pt_regs *regs,
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
@@ -1380,8 +1352,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* we can handle it..
*/
good_area:
- if (unlikely(access_error(sw_error_code, vma))) {
- bad_area_access_error(regs, sw_error_code, address, vma);
+ if (unlikely(access_error(hw_error_code, vma))) {
+ bad_area_access_error(regs, hw_error_code, address, vma);
return;
}
@@ -1420,13 +1392,13 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
/* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
+ no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, sw_error_code, address, fault);
+ mm_fault_error(regs, hw_error_code, address, fault);
return;
}
--
2.17.2
show_regs() shows the CS in the CPU register instead of the value in
regs. This means that we'll probably print "CS: 0010" almost all
the time regardless of what was actually in CS when the kernel
malfunctioned. This gives a particularly confusing result if we
OOPSed due to an implicit supervisor access from user mode.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/kernel/process_64.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0e0b4288a4b2..2b8e6324fa20 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -66,7 +66,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex, gsindex;
- unsigned int ds, cs, es;
+ unsigned int ds, es;
show_iret_regs(regs);
@@ -98,7 +98,6 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
}
asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
@@ -114,7 +113,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
es, cr0);
printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
cr4);
--
2.17.2
Rather than hardcoding 6 with a comment, use the defined constants.
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/vsyscall/vsyscall_64.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 85fd85d52ffd..d78bcc03e60e 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -102,7 +102,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
struct thread_struct *thread = ¤t->thread;
- thread->error_code = 6; /* user fault, no page, write */
+ thread->error_code = X86_PF_USER | X86_PF_WRITE;
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
--
2.17.2
* Andy Lutomirski <[email protected]> wrote:
> One of Linus' favorite hobbies seems to be looking at OOPSes and
> decoding the error code in his head. This is not one of my favorite
> hobbies :)
>
> Teach the page fault OOPS hander to decode the error code. If it's
> a !USER fault from user mode, print an explicit note to that effect
> and print out the addresses of various tables that might cause such
> an error.
>
> With this patch applied, if I intentionally point the LDT at 0x0 and
> run the x86 selftests, I get:
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
> HW error: normal kernel read fault
> This was a system access from user code
> IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
> LDTR: 0x50 -- base=0x0 limit=0xfff7
> TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
> PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
> SMP PTI
> CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
> Hardware name: ...
> RIP: 0033:0x401454
I've applied your series, with one small edit, the following message:
> HW error: normal kernel read fault
will IMHO confuse the heck out of users, thinking that their hardware is
broken...
Yes, the message is accurate, in MM pagefault language it's indeed the HW
error code, but it's a language very few people speak.
So I edited it over to say '#PF error code'. I also applied a few other
minor cleanups - see the changelog below.
Let me know if you have any objections.
Thanks,
Ingo
===============>
From a2aa52ab16efbee40ad118ebac4a5e438f5b43ee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <[email protected]>
Date: Thu, 22 Nov 2018 09:34:03 +0100
Subject: [PATCH] x86/fault: Clean up the page fault oops decoder a bit
- Make the oops messages a bit less scary (don't mention 'HW errors')
- Turn 'PROT USER' (which is visually easily confused with PROT_USER)
into individual bit descriptors: "[PROT] [USER]".
This also makes "[normal kernel read fault]" more apparent.
- De-abbreviate variables to make the code easier to read
- Use vertical alignment where appropriate.
- Add comment about string size limits and the helper function.
- Remove unnecessary line breaks.
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Cc: [email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/mm/fault.c | 38 +++++++++++++++++++++++---------------
1 file changed, 23 insertions(+), 15 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f5efbdba2b6d..2ff25ad33233 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -603,10 +603,13 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
}
-static void errstr(unsigned long ec, char *buf, unsigned long mask,
- const char *txt)
+/*
+ * This helper function transforms the #PF error_code bits into
+ * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
+ */
+static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
{
- if (ec & mask) {
+ if (error_code & mask) {
if (buf[0])
strcat(buf, " ");
strcat(buf, txt);
@@ -614,10 +617,9 @@ static void errstr(unsigned long ec, char *buf, unsigned long mask,
}
static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- char errtxt[64];
+ char err_txt[64];
if (!oops_may_print())
return;
@@ -646,15 +648,21 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
(void *)address);
- errtxt[0] = 0;
- errstr(error_code, errtxt, X86_PF_PROT, "PROT");
- errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
- errstr(error_code, errtxt, X86_PF_USER, "USER");
- errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
- errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
- errstr(error_code, errtxt, X86_PF_PK, "PK");
- pr_alert("HW error: %s\n", error_code ? errtxt :
- "normal kernel read fault");
+ err_txt[0] = 0;
+
+ /*
+ * Note: length of these appended strings including the separation space and the
+ * zero delimiter must fit into err_txt[].
+ */
+ err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
+ err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
+ err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
+ err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
+ err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
+ err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
+
+ pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
u16 ldtr, tr;
On Wed, Nov 21, 2018 at 03:11:21PM -0800, Andy Lutomirski wrote:
> This series is a whole bunch of page fault cleanups, plus a couple
> of OOPS diagnostic improvements. The overall goals are to clean up
> handling of the faulting CPL, the USER bit in the error_code, and
> the log messages generated by #PF OOPSes.
>
> This series can also be seen as CET preparation. CET introduces the
> WRUSS instruction, which is the very first way for CPL 0 code to
> cause a #PF fault with the USER bit set. Let's get the page fault
> code into shape before we start using WRUSS :)
>
> Changes from v1:
> - Rebase on top of tip:x86/mm, now that a bunch of v1 was applied.
> The only material changes are that 'x86/fault: Check
> user_mode(regs) when validating a stack extension' is gone
> because the code it fixed has been deleted and that 'x86/fault:
> Remove sw_error_code' lost the hunk that changed the same code.
>
> Andy Lutomirski (5):
> x86/fault: Remove sw_error_code
> x86/fault: Don't try to recover from an implicit supervisor access
> x86/oops: Show the correct CS value in show_regs()
> x86/fault: Decode page fault OOPSes better
> x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code
>
> arch/x86/entry/vsyscall/vsyscall_64.c | 2 +-
> arch/x86/kernel/process_64.c | 5 +-
> arch/x86/mm/fault.c | 144 +++++++++++++++++++-------
> 3 files changed, 108 insertions(+), 43 deletions(-)
All looks good to me,
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Commit-ID: ebb53e2597e2dc7637ab213df006e99681b6ee25
Gitweb: https://git.kernel.org/tip/ebb53e2597e2dc7637ab213df006e99681b6ee25
Author: Andy Lutomirski <[email protected]>
AuthorDate: Wed, 21 Nov 2018 15:11:23 -0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 22 Nov 2018 09:23:00 +0100
x86/fault: Don't try to recover from an implicit supervisor access
This avoids a situation in which we attempt to apply various fixups
that are not intended to handle implicit supervisor accesses from
user mode if we screw up in a way that causes this type of fault.
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Link: http://lkml.kernel.org/r/9999f151d72ff352265f3274c5ab3a4105090f49.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/mm/fault.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 82881bc5feef..ca38bd0472f2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -653,6 +653,15 @@ no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long flags;
int sig;
+ if (user_mode(regs)) {
+ /*
+ * This is an implicit supervisor-mode access from user
+ * mode. Bypass all the kernel-mode recovery code and just
+ * OOPS.
+ */
+ goto oops;
+ }
+
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
@@ -738,6 +747,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (IS_ENABLED(CONFIG_EFI))
efi_recover_from_page_fault(address);
+oops:
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice:
Commit-ID: a1a371c468f7238b7826fde55786b02377faf8e2
Gitweb: https://git.kernel.org/tip/a1a371c468f7238b7826fde55786b02377faf8e2
Author: Andy Lutomirski <[email protected]>
AuthorDate: Wed, 21 Nov 2018 15:11:25 -0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 22 Nov 2018 09:24:28 +0100
x86/fault: Decode page fault OOPSes better
One of Linus' favorite hobbies seems to be looking at OOPSes and
decoding the error code in his head. This is not one of my favorite
hobbies :)
Teach the page fault OOPS hander to decode the error code. If it's
a !USER fault from user mode, print an explicit note to that effect
and print out the addresses of various tables that might cause such
an error.
With this patch applied, if I intentionally point the LDT at 0x0 and
run the x86 selftests, I get:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
HW error: normal kernel read fault
This was a system access from user code
IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
LDTR: 0x50 -- base=0x0 limit=0xfff7
TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
SMP PTI
CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
Hardware name: ...
RIP: 0033:0x401454
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Link: http://lkml.kernel.org/r/11212acb25980cd1b3030875cd9502414fbb214d.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/mm/fault.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ca38bd0472f2..f5efbdba2b6d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/desc.h> /* store_idt(), ... */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -571,10 +572,53 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
return 0;
}
+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+{
+ u32 offset = (index >> 3) * sizeof(struct desc_struct);
+ unsigned long addr;
+ struct ldttss_desc desc;
+
+ if (index == 0) {
+ pr_alert("%s: NULL\n", name);
+ return;
+ }
+
+ if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+ pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+ return;
+ }
+
+ if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+ sizeof(struct ldttss_desc))) {
+ pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+ name, index);
+ return;
+ }
+
+ addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+#ifdef CONFIG_X86_64
+ addr |= ((u64)desc.base3 << 32);
+#endif
+ pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+ name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+}
+
+static void errstr(unsigned long ec, char *buf, unsigned long mask,
+ const char *txt)
+{
+ if (ec & mask) {
+ if (buf[0])
+ strcat(buf, " ");
+ strcat(buf, txt);
+ }
+}
+
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
{
+ char errtxt[64];
+
if (!oops_may_print())
return;
@@ -602,6 +646,46 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
(void *)address);
+ errtxt[0] = 0;
+ errstr(error_code, errtxt, X86_PF_PROT, "PROT");
+ errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
+ errstr(error_code, errtxt, X86_PF_USER, "USER");
+ errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
+ errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
+ errstr(error_code, errtxt, X86_PF_PK, "PK");
+ pr_alert("HW error: %s\n", error_code ? errtxt :
+ "normal kernel read fault");
+ if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+ struct desc_ptr idt, gdt;
+ u16 ldtr, tr;
+
+ pr_alert("This was a system access from user code\n");
+
+ /*
+ * This can happen for quite a few reasons. The more obvious
+ * ones are faults accessing the GDT, or LDT. Perhaps
+ * surprisingly, if the CPU tries to deliver a benign or
+ * contributory exception from user code and gets a page fault
+ * during delivery, the page fault can be delivered as though
+ * it originated directly from user code. This could happen
+ * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+ * kernel or IST stack.
+ */
+ store_idt(&idt);
+
+ /* Usable even on Xen PV -- it's just slow. */
+ native_store_gdt(&gdt);
+
+ pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+ idt.address, idt.size, gdt.address, gdt.size);
+
+ store_ldt(ldtr);
+ show_ldttss(&gdt, "LDTR", ldtr);
+
+ store_tr(tr);
+ show_ldttss(&gdt, "TR", tr);
+ }
+
dump_pagetable(address);
}
Commit-ID: 0ed32f1aa66ee758e6c8164f549f7ff9d399a20e
Gitweb: https://git.kernel.org/tip/0ed32f1aa66ee758e6c8164f549f7ff9d399a20e
Author: Andy Lutomirski <[email protected]>
AuthorDate: Wed, 21 Nov 2018 15:11:22 -0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 22 Nov 2018 09:22:59 +0100
x86/fault: Remove sw_error_code
All of the fault handling code now corrently checks user_mode(regs)
as needed, and nothing depends on the X86_PF_USER bit being munged.
Get rid of the sw_error code and use hw_error_code everywhere.
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Link: http://lkml.kernel.org/r/078f5b8ae6e8c79ff8ee7345b5c476c45003e5ac.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/mm/fault.c | 50 +++++++++++---------------------------------------
1 file changed, 11 insertions(+), 39 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b898a38093a3..82881bc5feef 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1217,7 +1217,6 @@ void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
- unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
@@ -1262,13 +1261,6 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
}
- /*
- * hw_error_code is literally the "page fault error code" passed to
- * the kernel directly from the hardware. But, we will shortly be
- * modifying it in software, so give it a new name.
- */
- sw_error_code = hw_error_code;
-
/*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
@@ -1278,26 +1270,6 @@ void do_user_addr_fault(struct pt_regs *regs,
*/
if (user_mode(regs)) {
local_irq_enable();
- /*
- * Up to this point, X86_PF_USER set in hw_error_code
- * indicated a user-mode access. But, after this,
- * X86_PF_USER in sw_error_code will indicate either
- * that, *or* an implicit kernel(supervisor)-mode access
- * which originated from user mode.
- */
- if (!(hw_error_code & X86_PF_USER)) {
- /*
- * The CPU was in user mode, but the CPU says
- * the fault was not a user-mode access.
- * Must be an implicit kernel-mode access,
- * which we do not expect to happen in the
- * user address space.
- */
- pr_warn_once("kernel-mode error from user-mode: %lx\n",
- hw_error_code);
-
- sw_error_code |= X86_PF_USER;
- }
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1306,9 +1278,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (sw_error_code & X86_PF_WRITE)
+ if (hw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (sw_error_code & X86_PF_INSTR)
+ if (hw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1321,7 +1293,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
- if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+ if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
@@ -1345,7 +1317,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, sw_error_code, address);
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
retry:
@@ -1361,17 +1333,17 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
@@ -1380,8 +1352,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(sw_error_code, vma))) {
- bad_area_access_error(regs, sw_error_code, address, vma);
+ if (unlikely(access_error(hw_error_code, vma))) {
+ bad_area_access_error(regs, hw_error_code, address, vma);
return;
}
@@ -1420,13 +1392,13 @@ good_area:
return;
/* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
+ no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, sw_error_code, address, fault);
+ mm_fault_error(regs, hw_error_code, address, fault);
return;
}
Commit-ID: af2ebdcf044039e89da3cd44c0f04dea317020c5
Gitweb: https://git.kernel.org/tip/af2ebdcf044039e89da3cd44c0f04dea317020c5
Author: Andy Lutomirski <[email protected]>
AuthorDate: Wed, 21 Nov 2018 15:11:26 -0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 22 Nov 2018 09:24:27 +0100
x86/vsyscall/64: Use X86_PF constants in the simulated #PF error code
Rather than hardcoding 6 with a comment, use the defined constants.
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Link: http://lkml.kernel.org/r/e023f20352b0d05a8b0205629897917262d2ad68.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/entry/vsyscall/vsyscall_64.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 85fd85d52ffd..d78bcc03e60e 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -102,7 +102,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
struct thread_struct *thread = ¤t->thread;
- thread->error_code = 6; /* user fault, no page, write */
+ thread->error_code = X86_PF_USER | X86_PF_WRITE;
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
Commit-ID: d38bc89c72e7235ac889ae64fe7828e2e61a18af
Gitweb: https://git.kernel.org/tip/d38bc89c72e7235ac889ae64fe7828e2e61a18af
Author: Andy Lutomirski <[email protected]>
AuthorDate: Wed, 21 Nov 2018 15:11:24 -0800
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 22 Nov 2018 09:23:01 +0100
x86/oops: Show the correct CS value in show_regs()
show_regs() shows the CS in the CPU register instead of the value in
regs. This means that we'll probably print "CS: 0010" almost all
the time regardless of what was actually in CS when the kernel
malfunctioned. This gives a particularly confusing result if we
OOPSed due to an implicit supervisor access from user mode.
Signed-off-by: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Link: http://lkml.kernel.org/r/4e36812b6e1e95236a812021d35cbf22746b5af6.1542841400.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/process_64.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0e0b4288a4b2..2b8e6324fa20 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -66,7 +66,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex, gsindex;
- unsigned int ds, cs, es;
+ unsigned int ds, es;
show_iret_regs(regs);
@@ -98,7 +98,6 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
}
asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
@@ -114,7 +113,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
es, cr0);
printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
cr4);
On Thu, Nov 22, 2018 at 09:41:19AM +0100, Ingo Molnar wrote:
>
> * Andy Lutomirski <[email protected]> wrote:
>
> > One of Linus' favorite hobbies seems to be looking at OOPSes and
> > decoding the error code in his head. This is not one of my favorite
> > hobbies :)
> >
> > Teach the page fault OOPS hander to decode the error code. If it's
> > a !USER fault from user mode, print an explicit note to that effect
> > and print out the addresses of various tables that might cause such
> > an error.
> >
> > With this patch applied, if I intentionally point the LDT at 0x0 and
> > run the x86 selftests, I get:
> >
> > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
> > HW error: normal kernel read fault
> > This was a system access from user code
> > IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
> > LDTR: 0x50 -- base=0x0 limit=0xfff7
> > TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
> > PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
> > SMP PTI
> > CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
> > Hardware name: ...
> > RIP: 0033:0x401454
>
> I've applied your series, with one small edit, the following message:
>
> > HW error: normal kernel read fault
>
> will IMHO confuse the heck out of users, thinking that their hardware is
> broken...
>
> Yes, the message is accurate, in MM pagefault language it's indeed the HW
> error code, but it's a language very few people speak.
>
> So I edited it over to say '#PF error code'. I also applied a few other
> minor cleanups - see the changelog below.
I responded to the original thread a hair too late...
What about something like this instead of manually handling the case
where error_code==0 so that we get e.g. "[KERNEL] [READ]" instead of
"normal kernel read fault"? Getting "[PROT] [KERNEL] [READ]" seems
useful.
IMO "[normal kernel read fault]" followed by "This was a system access
from user code" is still confusing.
---
8b29ee4351d5c625aa9ca2765f8da5e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <[email protected]>
Date: Tue, 27 Nov 2018 07:09:57 -0800
Subject: [PATCH] x86/fault: Print "KERNEL" and "READ" for #PF error codes
...and explicitly state that it's a "code" that's being printed.
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Yu-cheng Yu <[email protected]>
Cc: [email protected]
Cc: Ingo Molnar <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
arch/x86/mm/fault.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2ff25ad33233..510e263c256b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
-
- pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+ err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
+ err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
+ "[READ]");
+ pr_alert("#PF error code: %s\n", err_txt);
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
--
2.19.2
>
> Let me know if you have any objections.
>
> Thanks,
>
> Ingo
>
> ===============>
> From a2aa52ab16efbee40ad118ebac4a5e438f5b43ee Mon Sep 17 00:00:00 2001
> From: Ingo Molnar <[email protected]>
> Date: Thu, 22 Nov 2018 09:34:03 +0100
> Subject: [PATCH] x86/fault: Clean up the page fault oops decoder a bit
>
> - Make the oops messages a bit less scary (don't mention 'HW errors')
>
> - Turn 'PROT USER' (which is visually easily confused with PROT_USER)
> into individual bit descriptors: "[PROT] [USER]".
> This also makes "[normal kernel read fault]" more apparent.
>
> - De-abbreviate variables to make the code easier to read
>
> - Use vertical alignment where appropriate.
>
> - Add comment about string size limits and the helper function.
>
> - Remove unnecessary line breaks.
>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Yu-cheng Yu <[email protected]>
> Cc: [email protected]
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> arch/x86/mm/fault.c | 38 +++++++++++++++++++++++---------------
> 1 file changed, 23 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index f5efbdba2b6d..2ff25ad33233 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -603,10 +603,13 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
> name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
> }
>
> -static void errstr(unsigned long ec, char *buf, unsigned long mask,
> - const char *txt)
> +/*
> + * This helper function transforms the #PF error_code bits into
> + * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
> + */
> +static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
> {
> - if (ec & mask) {
> + if (error_code & mask) {
> if (buf[0])
> strcat(buf, " ");
> strcat(buf, txt);
> @@ -614,10 +617,9 @@ static void errstr(unsigned long ec, char *buf, unsigned long mask,
> }
>
> static void
> -show_fault_oops(struct pt_regs *regs, unsigned long error_code,
> - unsigned long address)
> +show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
> {
> - char errtxt[64];
> + char err_txt[64];
>
> if (!oops_may_print())
> return;
> @@ -646,15 +648,21 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
> address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
> (void *)address);
>
> - errtxt[0] = 0;
> - errstr(error_code, errtxt, X86_PF_PROT, "PROT");
> - errstr(error_code, errtxt, X86_PF_WRITE, "WRITE");
> - errstr(error_code, errtxt, X86_PF_USER, "USER");
> - errstr(error_code, errtxt, X86_PF_RSVD, "RSVD");
> - errstr(error_code, errtxt, X86_PF_INSTR, "INSTR");
> - errstr(error_code, errtxt, X86_PF_PK, "PK");
> - pr_alert("HW error: %s\n", error_code ? errtxt :
> - "normal kernel read fault");
> + err_txt[0] = 0;
> +
> + /*
> + * Note: length of these appended strings including the separation space and the
> + * zero delimiter must fit into err_txt[].
> + */
> + err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
> + err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
> + err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
> + err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> + err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> + err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> +
> + pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> +
> if (!(error_code & X86_PF_USER) && user_mode(regs)) {
> struct desc_ptr idt, gdt;
> u16 ldtr, tr;
On Tue, Nov 27, 2018 at 7:32 AM Sean Christopherson
<[email protected]> wrote:
>
> On Thu, Nov 22, 2018 at 09:41:19AM +0100, Ingo Molnar wrote:
> >
> > * Andy Lutomirski <[email protected]> wrote:
> >
> > > One of Linus' favorite hobbies seems to be looking at OOPSes and
> > > decoding the error code in his head. This is not one of my favorite
> > > hobbies :)
> > >
> > > Teach the page fault OOPS hander to decode the error code. If it's
> > > a !USER fault from user mode, print an explicit note to that effect
> > > and print out the addresses of various tables that might cause such
> > > an error.
> > >
> > > With this patch applied, if I intentionally point the LDT at 0x0 and
> > > run the x86 selftests, I get:
> > >
> > > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
> > > HW error: normal kernel read fault
> > > This was a system access from user code
> > > IDT: 0xfffffe0000000000 (limit=0xfff) GDT: 0xfffffe0000001000 (limit=0x7f)
> > > LDTR: 0x50 -- base=0x0 limit=0xfff7
> > > TR: 0x40 -- base=0xfffffe0000003000 limit=0x206f
> > > PGD 800000000456e067 P4D 800000000456e067 PUD 4623067 PMD 0
> > > SMP PTI
> > > CPU: 0 PID: 153 Comm: ldt_gdt_64 Not tainted 4.19.0+ #1317
> > > Hardware name: ...
> > > RIP: 0033:0x401454
> >
> > I've applied your series, with one small edit, the following message:
> >
> > > HW error: normal kernel read fault
> >
> > will IMHO confuse the heck out of users, thinking that their hardware is
> > broken...
> >
> > Yes, the message is accurate, in MM pagefault language it's indeed the HW
> > error code, but it's a language very few people speak.
> >
> > So I edited it over to say '#PF error code'. I also applied a few other
> > minor cleanups - see the changelog below.
>
> I responded to the original thread a hair too late...
>
> What about something like this instead of manually handling the case
> where error_code==0 so that we get e.g. "[KERNEL] [READ]" instead of
> "normal kernel read fault"? Getting "[PROT] [KERNEL] [READ]" seems
> useful.
>
> IMO "[normal kernel read fault]" followed by "This was a system access
> from user code" is still confusing.
>
> ---
> 8b29ee4351d5c625aa9ca2765f8da5e Mon Sep 17 00:00:00 2001
> From: Sean Christopherson <[email protected]>
> Date: Tue, 27 Nov 2018 07:09:57 -0800
> Subject: [PATCH] x86/fault: Print "KERNEL" and "READ" for #PF error codes
>
> ...and explicitly state that it's a "code" that's being printed.
>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Yu-cheng Yu <[email protected]>
> Cc: [email protected]
> Cc: Ingo Molnar <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/mm/fault.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 2ff25ad33233..510e263c256b 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
> err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> -
> - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> + err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
> + err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
> + "[READ]");
> + pr_alert("#PF error code: %s\n", err_txt);
>
Seems generally nice, but I would suggest making the bit-not-set name
be another parameter to err_str_append(). I'm also slightly uneasy
about making "KERNEL" look like a bit, but I guess it doesn't bother
me too much.
Want to send a real patch?
On Tue, Dec 04, 2018 at 11:22:25AM -0800, Andy Lutomirski wrote:
> On Tue, Nov 27, 2018 at 7:32 AM Sean Christopherson
> <[email protected]> wrote:
> > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > index 2ff25ad33233..510e263c256b 100644
> > --- a/arch/x86/mm/fault.c
> > +++ b/arch/x86/mm/fault.c
> > @@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
> > err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> > err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> > err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> > -
> > - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> > + err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
> > + err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
> > + "[READ]");
> > + pr_alert("#PF error code: %s\n", err_txt);
> >
>
> Seems generally nice, but I would suggest making the bit-not-set name
> be another parameter to err_str_append(). I'm also slightly uneasy
> about making "KERNEL" look like a bit, but I guess it doesn't bother
> me too much.
What about "SUPERVISOR" instead of "KERNEL"? It'd be consistent with
the SDM and hopefully less likely to be misconstrued as something else.
> Want to send a real patch?
Will do.
On Tue, Dec 4, 2018 at 11:34 AM Sean Christopherson
<[email protected]> wrote:
>
> On Tue, Dec 04, 2018 at 11:22:25AM -0800, Andy Lutomirski wrote:
> > On Tue, Nov 27, 2018 at 7:32 AM Sean Christopherson
> > <[email protected]> wrote:
> > > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > > index 2ff25ad33233..510e263c256b 100644
> > > --- a/arch/x86/mm/fault.c
> > > +++ b/arch/x86/mm/fault.c
> > > @@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
> > > err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> > > err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> > > err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> > > -
> > > - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> > > + err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
> > > + err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
> > > + "[READ]");
> > > + pr_alert("#PF error code: %s\n", err_txt);
> > >
> >
> > Seems generally nice, but I would suggest making the bit-not-set name
> > be another parameter to err_str_append(). I'm also slightly uneasy
> > about making "KERNEL" look like a bit, but I guess it doesn't bother
> > me too much.
>
> What about "SUPERVISOR" instead of "KERNEL"? It'd be consistent with
> the SDM and hopefully less likely to be misconstrued as something else.
Or even just [!USER], perhaps.
On Tue, Dec 04, 2018 at 11:47:10AM -0800, Andy Lutomirski wrote:
> On Tue, Dec 4, 2018 at 11:34 AM Sean Christopherson
> <[email protected]> wrote:
> >
> > On Tue, Dec 04, 2018 at 11:22:25AM -0800, Andy Lutomirski wrote:
> > > On Tue, Nov 27, 2018 at 7:32 AM Sean Christopherson
> > > <[email protected]> wrote:
> > > > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > > > index 2ff25ad33233..510e263c256b 100644
> > > > --- a/arch/x86/mm/fault.c
> > > > +++ b/arch/x86/mm/fault.c
> > > > @@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
> > > > err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> > > > err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> > > > err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> > > > -
> > > > - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> > > > + err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
> > > > + err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
> > > > + "[READ]");
> > > > + pr_alert("#PF error code: %s\n", err_txt);
> > > >
> > >
> > > Seems generally nice, but I would suggest making the bit-not-set name
> > > be another parameter to err_str_append(). I'm also slightly uneasy
> > > about making "KERNEL" look like a bit, but I guess it doesn't bother
> > > me too much.
> >
> > What about "SUPERVISOR" instead of "KERNEL"? It'd be consistent with
> > the SDM and hopefully less likely to be misconstrued as something else.
>
> Or even just [!USER], perhaps.
I thought about that too, but the pedant in me didn't like the inconsistency
of doing "READ" instead of "[!WRITE] [!INSTR]", and IMO "READ" is a lot more
readable (no pun intended). I also like having completely different text,
makes it harder to miss a single "!" and go down the wrong path.
On Tue, Dec 4, 2018 at 11:52 AM Sean Christopherson
<[email protected]> wrote:
>
> On Tue, Dec 04, 2018 at 11:47:10AM -0800, Andy Lutomirski wrote:
> > On Tue, Dec 4, 2018 at 11:34 AM Sean Christopherson
> > <[email protected]> wrote:
> > >
> > > On Tue, Dec 04, 2018 at 11:22:25AM -0800, Andy Lutomirski wrote:
> > > > On Tue, Nov 27, 2018 at 7:32 AM Sean Christopherson
> > > > <[email protected]> wrote:
> > > > > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > > > > index 2ff25ad33233..510e263c256b 100644
> > > > > --- a/arch/x86/mm/fault.c
> > > > > +++ b/arch/x86/mm/fault.c
> > > > > @@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
> > > > > err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> > > > > err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> > > > > err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> > > > > -
> > > > > - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> > > > > + err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
> > > > > + err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
> > > > > + "[READ]");
> > > > > + pr_alert("#PF error code: %s\n", err_txt);
> > > > >
> > > >
> > > > Seems generally nice, but I would suggest making the bit-not-set name
> > > > be another parameter to err_str_append(). I'm also slightly uneasy
> > > > about making "KERNEL" look like a bit, but I guess it doesn't bother
> > > > me too much.
> > >
> > > What about "SUPERVISOR" instead of "KERNEL"? It'd be consistent with
> > > the SDM and hopefully less likely to be misconstrued as something else.
> >
> > Or even just [!USER], perhaps.
>
> I thought about that too, but the pedant in me didn't like the inconsistency
> of doing "READ" instead of "[!WRITE] [!INSTR]", and IMO "READ" is a lot more
> readable (no pun intended). I also like having completely different text,
> makes it harder to miss a single "!" and go down the wrong path.
Fair enough. I'm sold.
On Tue, Dec 04, 2018 at 11:22:25AM -0800, Andy Lutomirski wrote:
> On Tue, Nov 27, 2018 at 7:32 AM Sean Christopherson
> <[email protected]> wrote:
> > arch/x86/mm/fault.c | 6 ++++--
> > 1 file changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> > index 2ff25ad33233..510e263c256b 100644
> > --- a/arch/x86/mm/fault.c
> > +++ b/arch/x86/mm/fault.c
> > @@ -660,8 +660,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
> > err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
> > err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
> > err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
> > -
> > - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
> > + err_str_append(~error_code, err_txt, X86_PF_USER, "[KERNEL]");
> > + err_str_append(~error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR,
> > + "[READ]");
> > + pr_alert("#PF error code: %s\n", err_txt);
> >
>
> Seems generally nice, but I would suggest making the bit-not-set name
> be another parameter to err_str_append().
I didn't recall why I chose to negate error_code until I revisited the
actual code. The "READ" case is a combination of !WRITE && !USER, i.e.
doesn't fit into an existing err_str_append() call. So we'd end up with
an extra err_str_append() call that would also have a null message for
the positive test, which seemed unnecessarily complex and more convoluted
than simply negating error_code.
E.g.:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2ff25ad33233..48b420621825 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -607,12 +607,17 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
* This helper function transforms the #PF error_code bits into
* "[PROT] [USER]" type of descriptive, almost human-readable error strings:
*/
-static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
+static void err_str_append(unsigned long error_code, char *buf, unsigned long mask,
+ const char *pos, const char *neg)
{
- if (error_code & mask) {
+ if ((error_code & mask) == mask && pos) {
if (buf[0])
strcat(buf, " ");
- strcat(buf, txt);
+ strcat(buf, pos);
+ } else if (!(error_code & mask) && neg) {
+ if (buf[0])
+ strcat(buf, " ");
+ strcat(buf, neg);
}
}
@@ -654,14 +659,15 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
* Note: length of these appended strings including the separation space and the
* zero delimiter must fit into err_txt[].
*/
- err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
- err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
- err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
- err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
- err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
- err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
-
- pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+ err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" , NULL);
+ err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]", NULL);
+ err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" , "[SUPERVISOR]");
+ err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" , NULL);
+ err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]", NULL);
+ err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" , NULL);
+ err_str_append(error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR, NULL,
+ "[READ]");
+ pr_alert("#PF error code: %s\n", err_txt);
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
> + err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]", NULL);
> + err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" , NULL);
> + err_str_append(error_code, err_txt, X86_PF_WRITE | X86_PF_INSTR, NULL,
> + "[READ]");
Ah, I see the issue. OTOH, printing [READ] [INSTR] wouldn’t be so bad, either.