Baoquan He reported lots of false positive KFENCE reports that show up whenever
/proc/kcore is read, e.g. with crash or even more simply with dd:
BUG: KFENCE: invalid read in copy_from_kernel_nofault+0x5e/0x120
Invalid read at 0x00000000f4f5149f:
copy_from_kernel_nofault+0x5e/0x120
read_kcore+0x6b2/0x870
proc_reg_read+0x9a/0xf0
vfs_read+0x94/0x270
ksys_read+0x70/0x100
__do_syscall+0x1d0/0x200
system_call+0x82/0xb0
The reason for this is that read_kcore() uses copy_from_kernel_nofault() to
read memory that might have been unmapped by KFENCE. Any fault caused by
accessing such unmapped pages is supposed to be handled gracefully by the
fault handler via an exception table fixup.
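To illustrate the mechanism (a simplified sketch, not the actual read_kcore()
code; dump_kernel_page() is a made-up helper), such a nofault read looks
roughly like this:

#include <linux/mm.h>           /* PAGE_SIZE */
#include <linux/string.h>       /* memset() */
#include <linux/uaccess.h>      /* copy_from_kernel_nofault() */

/*
 * Copy one page of kernel memory into a bounce buffer. If the source
 * page is unmapped (for example a page KFENCE has unmapped),
 * copy_from_kernel_nofault() returns an error instead of oopsing,
 * because the resulting fault is resolved via the exception table fixup.
 */
static void dump_kernel_page(void *buf, const void *src)
{
        if (copy_from_kernel_nofault(buf, src, PAGE_SIZE))
                memset(buf, 0, PAGE_SIZE);      /* unreadable: zero-fill */
}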
However, the s390 fault handler first reports the fault and only afterwards
performs the exception table fixup. Most architectures do this in the reverse
order, which also avoids false positive KFENCE reports when an unmapped page
is accessed.
Therefore change the s390 fault handler so it handles exception table
fixups before KFENCE page faults are reported.
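For reference, the resulting order in do_no_context() boils down to the
following (a simplified view of the diff below, not the literal code):

static noinline void do_no_context(struct pt_regs *regs, vm_fault_t fault)
{
        /*
         * 1. Exception table fixup first: nofault accessors such as
         *    copy_from_kernel_nofault() recover here without any report.
         */
        if (fixup_exception(regs))
                return;
        /*
         * 2. Only afterwards give KFENCE a chance to report kernel
         *    faults that no fixup handled.
         */
        if (get_fault_type(regs) == KERNEL_FAULT &&
            fault == VM_FAULT_BADCONTEXT &&
            kfence_handle_page_fault(get_fault_address(regs),
                                     fault_is_write(regs), regs))
                return;
        /*
         * 3. Anything left is a genuine bad access: report it and die
         *    (the real code also handles user address space faults and
         *    dumps more fault information).
         */
        printk(KERN_ALERT "Unable to handle kernel pointer dereference"
               " in virtual kernel address space\n");
        die(regs, "Fatal exception");
}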
Reported-by: Baoquan He <[email protected]>
Signed-off-by: Heiko Carstens <[email protected]>
---
arch/s390/mm/fault.c | 49 +++++++++++++++++++++++++++++++-------------
1 file changed, 35 insertions(+), 14 deletions(-)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 9649d9382e0a..8e84ed2bb944 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -96,6 +96,20 @@ static enum fault_type get_fault_type(struct pt_regs *regs)
return KERNEL_FAULT;
}
+static unsigned long get_fault_address(struct pt_regs *regs)
+{
+ unsigned long trans_exc_code = regs->int_parm_long;
+
+ return trans_exc_code & __FAIL_ADDR_MASK;
+}
+
+static bool fault_is_write(struct pt_regs *regs)
+{
+ unsigned long trans_exc_code = regs->int_parm_long;
+
+ return (trans_exc_code & store_indication) == 0x400;
+}
+
static int bad_address(void *p)
{
unsigned long dummy;
@@ -228,15 +242,26 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-static noinline void do_no_context(struct pt_regs *regs)
+static noinline void do_no_context(struct pt_regs *regs, vm_fault_t fault)
{
+ enum fault_type fault_type;
+ unsigned long address;
+ bool is_write;
+
if (fixup_exception(regs))
return;
+ fault_type = get_fault_type(regs);
+ if ((fault_type == KERNEL_FAULT) && (fault == VM_FAULT_BADCONTEXT)) {
+ address = get_fault_address(regs);
+ is_write = fault_is_write(regs);
+ if (kfence_handle_page_fault(address, is_write, regs))
+ return;
+ }
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
*/
- if (get_fault_type(regs) == KERNEL_FAULT)
+ if (fault_type == KERNEL_FAULT)
printk(KERN_ALERT "Unable to handle kernel pointer dereference"
" in virtual kernel address space\n");
else
@@ -255,7 +280,7 @@ static noinline void do_low_address(struct pt_regs *regs)
die (regs, "Low-address protection");
}
- do_no_context(regs);
+ do_no_context(regs, VM_FAULT_BADACCESS);
}
static noinline void do_sigbus(struct pt_regs *regs)
@@ -286,28 +311,28 @@ static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
fallthrough;
case VM_FAULT_BADCONTEXT:
case VM_FAULT_PFAULT:
- do_no_context(regs);
+ do_no_context(regs, fault);
break;
case VM_FAULT_SIGNAL:
if (!user_mode(regs))
- do_no_context(regs);
+ do_no_context(regs, fault);
break;
default: /* fault & VM_FAULT_ERROR */
if (fault & VM_FAULT_OOM) {
if (!user_mode(regs))
- do_no_context(regs);
+ do_no_context(regs, fault);
else
pagefault_out_of_memory();
} else if (fault & VM_FAULT_SIGSEGV) {
/* Kernel mode? Handle exceptions or die */
if (!user_mode(regs))
- do_no_context(regs);
+ do_no_context(regs, fault);
else
do_sigsegv(regs, SEGV_MAPERR);
} else if (fault & VM_FAULT_SIGBUS) {
/* Kernel mode? Handle exceptions or die */
if (!user_mode(regs))
- do_no_context(regs);
+ do_no_context(regs, fault);
else
do_sigbus(regs);
} else
@@ -334,7 +359,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
struct mm_struct *mm;
struct vm_area_struct *vma;
enum fault_type type;
- unsigned long trans_exc_code;
unsigned long address;
unsigned int flags;
vm_fault_t fault;
@@ -351,9 +375,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
return 0;
mm = tsk->mm;
- trans_exc_code = regs->int_parm_long;
- address = trans_exc_code & __FAIL_ADDR_MASK;
- is_write = (trans_exc_code & store_indication) == 0x400;
+ address = get_fault_address(regs);
+ is_write = fault_is_write(regs);
/*
* Verify that the fault happened in user space, that
@@ -364,8 +387,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
type = get_fault_type(regs);
switch (type) {
case KERNEL_FAULT:
- if (kfence_handle_page_fault(address, is_write, regs))
- return 0;
goto out;
case USER_FAULT:
case GMAP_FAULT:
--
2.37.2
On 02/13/23 at 07:38pm, Heiko Carstens wrote:
> [...]
> Reported-by: Baoquan He <[email protected]>
> Signed-off-by: Heiko Carstens <[email protected]>
I applied this patch on top of the latest kernel 6.2-rc8 and tested it: it
fixes the problem perfectly. Thanks a lot. Please feel free to add:
Tested-by: Baoquan He <[email protected]>
On Mon, Feb 13, 2023 at 7:39 PM Heiko Carstens <[email protected]> wrote:
>
> [...]
> Reported-by: Baoquan He <[email protected]>
> Signed-off-by: Heiko Carstens <[email protected]>
Acked-by: Alexander Potapenko <[email protected]>