kmap_local() and related interfaces are NOOPs on 64bit and only create
temporary fixmaps for highmem pages on 32bit. That means the test coverage
for this code is pretty small.
CONFIG_KMAP_LOCAL can be enabled independently of CONFIG_HIGHMEM, which
makes it possible to provide support for enforced kmap_local() debugging
even on 64bit.
For 32bit the support is unconditional, for 64bit it's only supported when
CONFIG_NR_CPUS <= 4096 as supporting it for 8192 CPUs would require setting
up yet another fixmap PGT.
If CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is enabled then kmap_local()/kmap_atomic()
will use the temporary fixmap mapping path.
Signed-off-by: Thomas Gleixner <[email protected]>
---
V4: New patch
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/fixmap.h | 12 +++++++++---
arch/x86/include/asm/pgtable_64_types.h | 6 +++++-
3 files changed, 15 insertions(+), 4 deletions(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
+ select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,13 +14,20 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+#include <asm/kmap_size.h>
+
/*
* Exposed to assembly code for setting up initial page tables. Cannot be
* calculated in assembly code (fixmap entries are an enum), but is sanity
* checked in the actual fixmap C code to make sure that the fixmap is
* covered fully.
*/
-#define FIXMAP_PMD_NUM 2
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define FIXMAP_PMD_NUM 2
+#else
+# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
+# define FIXMAP_PMD_NUM (KM_PMDS + 2)
+#endif
/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
#define FIXMAP_PMD_TOP 507
@@ -31,7 +38,6 @@
#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
-#include <asm/kmap_size.h>
#else
#include <uapi/asm/vsyscall.h>
#endif
@@ -92,7 +98,7 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_KMAP_LOCAL
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
#ifdef CONFIG_PCI_MMCONFIG
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d;
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
-#define MODULES_END _AC(0xffffffffff000000, UL)
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define MODULES_END _AC(0xffffffffff000000, UL)
+#else
+# define MODULES_END _AC(0xfffffffffe000000, UL)
+#endif
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
The following commit has been merged into the core/mm branch of tip:
Commit-ID: 14df32670291588036a498051a54cd8462d7f611
Gitweb: https://git.kernel.org/tip/14df32670291588036a498051a54cd8462d7f611
Author: Thomas Gleixner <[email protected]>
AuthorDate: Wed, 18 Nov 2020 20:48:41 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 24 Nov 2020 14:42:09 +01:00
x86: Support kmap_local() forced debugging
kmap_local() and related interfaces are NOOPs on 64bit and only create
temporary fixmaps for highmem pages on 32bit. That means the test coverage
for this code is pretty small.
CONFIG_KMAP_LOCAL can be enabled independent from CONFIG_HIGHMEM, which
allows to provide support for enforced kmap_local() debugging even on
64bit.
For 32bit the support is unconditional, for 64bit it's only supported when
CONFIG_NR_CPUS <= 4096 as supporting it for 8192 CPUs would require to set
up yet another fixmap PGT.
If CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is enabled then kmap_local()/kmap_atomic()
will use the temporary fixmap mapping path.
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/fixmap.h | 12 +++++++++---
arch/x86/include/asm/pgtable_64_types.h | 6 +++++-
3 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 33c273c..b5137cc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,6 +93,7 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
+ select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8eba66a..9f1a0a9 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,13 +14,20 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+#include <asm/kmap_size.h>
+
/*
* Exposed to assembly code for setting up initial page tables. Cannot be
* calculated in assembly code (fixmap entries are an enum), but is sanity
* checked in the actual fixmap C code to make sure that the fixmap is
* covered fully.
*/
-#define FIXMAP_PMD_NUM 2
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define FIXMAP_PMD_NUM 2
+#else
+# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
+# define FIXMAP_PMD_NUM (KM_PMDS + 2)
+#endif
/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
#define FIXMAP_PMD_TOP 507
@@ -31,7 +38,6 @@
#include <asm/pgtable_types.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
-#include <asm/kmap_size.h>
#else
#include <uapi/asm/vsyscall.h>
#endif
@@ -92,7 +98,7 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_KMAP_LOCAL
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
#ifdef CONFIG_PCI_MMCONFIG
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 52e5f5f..91ac106 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d;
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
-#define MODULES_END _AC(0xffffffffff000000, UL)
+#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+# define MODULES_END _AC(0xffffffffff000000, UL)
+#else
+# define MODULES_END _AC(0xfffffffffe000000, UL)
+#endif
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
On Wed, 18 Nov 2020 20:48:41 +0100
Thomas Gleixner <[email protected]> wrote:
> kmap_local() and related interfaces are NOOPs on 64bit and only create
> temporary fixmaps for highmem pages on 32bit. That means the test coverage
> for this code is pretty small.
>
> CONFIG_KMAP_LOCAL can be enabled independent from CONFIG_HIGHMEM, which
> allows to provide support for enforced kmap_local() debugging even on
> 64bit.
>
> For 32bit the support is unconditional, for 64bit it's only supported when
> CONFIG_NR_CPUS <= 4096 as supporting it for 8192 CPUs would require to set
> up yet another fixmap PGT.
>
> If CONFIG_KMAP_LOCAL_FORCE_DEBUG is enabled then kmap_local()/kmap_atomic()
> will use the temporary fixmap mapping path.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> V4: New patch
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/include/asm/fixmap.h | 12 +++++++++---
> arch/x86/include/asm/pgtable_64_types.h | 6 +++++-
> 3 files changed, 15 insertions(+), 4 deletions(-)
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -93,6 +93,7 @@ config X86
> select ARCH_SUPPORTS_ACPI
> select ARCH_SUPPORTS_ATOMIC_RMW
> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
I triggered the following crash on x86_32 by simply doing a:
(ssh'ing into the box)
# head -100 /tmp/output-file
Where the /tmp/output-file was the output of a trace-cmd report.
Even after rebooting and not running the tracing code, simply doing the
head command still crashed.
BUG: unable to handle page fault for address: fff58000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
*pdpt = 0000000006de9001 *pde = 0000000001968063 *pte = 0000000000000000
Oops: 0000 [#1] SMP PTI
CPU: 3 PID: 3935 Comm: sshd Not tainted 5.11.0-rc2-test+ #2
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
EIP: skb_copy_bits+0x10c/0x1b9
Code: 3b 5d e8 0f 47 5d e8 c7 45 e0 00 00 00 00 8b 7d e0 39 7d e8 76 3a 8b 45 d4 e8 a4 e4 ff ff 8b 55 e4 03 55 e0 89 d9 01 c6 89 d7 <f3> a4 e8 c9 e4 ff ff 01 5d e0 8b 5d e8 b8 00 10 00 00 2b 5d e0 83
EAX: fff57000 EBX: 000005a8 ECX: 000000f8 EDX: c77b9900
ESI: fff58000 EDI: c77b9db0 EBP: c6de39ec ESP: c6de39c0
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210286
CR0: 80050033 CR2: fff58000 CR3: 06de6000 CR4: 001506f0
Call Trace:
skb_segment+0x4a3/0x828
? __tcp_mtu_to_mss+0x2d/0x6b
tcp_gso_segment+0xf6/0x336
? list_add+0x26/0x26
tcp4_gso_segment+0x77/0x7c
? tcp_gso_segment+0x336/0x336
inet_gso_segment+0x1a1/0x2df
? inet_unregister_protosw+0x5e/0x5e
skb_mac_gso_segment+0xb9/0x107
__skb_gso_segment+0xdf/0x10f
? netif_skb_features+0x1ca/0x24a
? __qdisc_run+0x1e4/0x418
validate_xmit_skb.constprop.0+0x10f/0x1ad
validate_xmit_skb_list+0x25/0x45
sch_direct_xmit+0x5c/0x19d
__qdisc_run+0x3e3/0x418
? qdisc_run_begin+0x53/0x5d
qdisc_run+0x26/0x30
__dev_queue_xmit+0x2bd/0x524
? mark_held_locks+0x40/0x51
dev_queue_xmit+0xf/0x11
ip_finish_output2+0x378/0x3d7
__ip_finish_output+0xd6/0xe2
ip_output+0x8c/0xbb
? ip_mc_output+0x18d/0x18d
dst_output+0x27/0x2d
ip_local_out+0x2b/0x30
__ip_queue_xmit+0x32e/0x38e
? __copy_skb_header+0x4b/0x98
? __ip_queue_xmit+0x38e/0x38e
ip_queue_xmit+0x16/0x1b
__tcp_transmit_skb+0x731/0x794
tcp_transmit_skb+0x16/0x18
tcp_write_xmit+0x7b4/0xa90
__tcp_push_pending_frames+0x2c/0x6b
tcp_push+0x8c/0xf1
tcp_sendmsg_locked+0x74a/0x7f2
? tcp_sendmsg_locked+0x7f2/0x7f2
tcp_sendmsg+0x27/0x38
? tcp_sendmsg_locked+0x7f2/0x7f2
inet_sendmsg+0x3c/0x5f
? inet_send_prepare+0x3b/0x3b
sock_sendmsg_nosec+0x1a/0x2d
sock_sendmsg+0x25/0x29
sock_write_iter+0x84/0xa7
vfs_write+0xf5/0x19b
ksys_write+0x68/0xaa
__ia32_sys_write+0x15/0x17
__do_fast_syscall_32+0x66/0x76
do_fast_syscall_32+0x29/0x5b
do_SYSENTER_32+0x15/0x17
entry_SYSENTER_32+0x9f/0xf2
EIP: 0xb7ee3545
Code: c4 01 10 03 03 74 c0 01 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
EAX: ffffffda EBX: 00000003 ECX: 01d12448 EDX: 00002028
ESI: 00002028 EDI: 01d12448 EBP: bff4e388 ESP: bff4e328
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00200246
? asm_exc_nmi+0xc5/0x2ab
Modules linked in: ppdev parport_pc parport
CR2: 00000000fff58000
---[ end trace 3d4582614c9c2a0e ]---
EIP: skb_copy_bits+0x10c/0x1b9
Code: 3b 5d e8 0f 47 5d e8 c7 45 e0 00 00 00 00 8b 7d e0 39 7d e8 76 3a 8b 45 d4 e8 a4 e4 ff ff 8b 55 e4 03 55 e0 89 d9 01 c6 89 d7 <f3> a4 e8 c9 e4 ff ff 01 5d e0 8b 5d e8 b8 00 10 00 00 2b 5d e0 83
EAX: fff57000 EBX: 000005a8 ECX: 000000f8 EDX: c77b9900
ESI: fff58000 EDI: c77b9db0 EBP: c6de39ec ESP: c6de39c0
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00210286
CR0: 80050033 CR2: fff58000 CR3: 06de6000 CR4: 001506f0
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: disabled
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---
This was against 5.11-rc2.
I bisected it down to the commit that added this patch.
> + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
If I remove the above line, it works fine.
Attached is the config file.
-- Steve
> select ARCH_USE_BUILTIN_BSWAP
> select ARCH_USE_QUEUED_RWLOCKS
> select ARCH_USE_QUEUED_SPINLOCKS
> --- a/arch/x86/include/asm/fixmap.h
> +++ b/arch/x86/include/asm/fixmap.h
> @@ -14,13 +14,20 @@
> #ifndef _ASM_X86_FIXMAP_H
> #define _ASM_X86_FIXMAP_H
>
> +#include <asm/kmap_size.h>
> +
> /*
> * Exposed to assembly code for setting up initial page tables. Cannot be
> * calculated in assembly code (fixmap entries are an enum), but is sanity
> * checked in the actual fixmap C code to make sure that the fixmap is
> * covered fully.
> */
> -#define FIXMAP_PMD_NUM 2
> +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
> +# define FIXMAP_PMD_NUM 2
> +#else
> +# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512))
> +# define FIXMAP_PMD_NUM (KM_PMDS + 2)
> +#endif
> /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
> #define FIXMAP_PMD_TOP 507
>
> @@ -31,7 +38,6 @@
> #include <asm/pgtable_types.h>
> #ifdef CONFIG_X86_32
> #include <linux/threads.h>
> -#include <asm/kmap_size.h>
> #else
> #include <uapi/asm/vsyscall.h>
> #endif
> @@ -92,7 +98,7 @@ enum fixed_addresses {
> FIX_IO_APIC_BASE_0,
> FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
> #endif
> -#ifdef CONFIG_X86_32
> +#ifdef CONFIG_KMAP_LOCAL
> FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
> FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
> #ifdef CONFIG_PCI_MMCONFIG
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d;
>
> #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
> /* The module sections ends with the start of the fixmap */
> -#define MODULES_END _AC(0xffffffffff000000, UL)
> +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
> +# define MODULES_END _AC(0xffffffffff000000, UL)
> +#else
> +# define MODULES_END _AC(0xfffffffffe000000, UL)
> +#endif
> #define MODULES_LEN (MODULES_END - MODULES_VADDR)
>
> #define ESPFIX_PGD_ENTRY _AC(-2, UL)
>
>
On Wed, Jan 6, 2021 at 3:01 PM Steven Rostedt <[email protected]> wrote:
>
> I triggered the following crash on x86_32 by simply doing a:
>
> (ssh'ing into the box)
>
> # head -100 /tmp/output-file
>
> Where the /tmp/output-file was the output of a trace-cmd report.
> Even after rebooting and not running the tracing code, simply doing the
> head command still crashed.
The code decodes to
0: 3b 5d e8 cmp -0x18(%ebp),%ebx
3: 0f 47 5d e8 cmova -0x18(%ebp),%ebx
7: c7 45 e0 00 00 00 00 movl $0x0,-0x20(%ebp)
e: 8b 7d e0 mov -0x20(%ebp),%edi
11: 39 7d e8 cmp %edi,-0x18(%ebp)
14: 76 3a jbe 0x50
16: 8b 45 d4 mov -0x2c(%ebp),%eax
19: e8 a4 e4 ff ff call 0xffffe4c2
1e: 8b 55 e4 mov -0x1c(%ebp),%edx
21: 03 55 e0 add -0x20(%ebp),%edx
24: 89 d9 mov %ebx,%ecx
26: 01 c6 add %eax,%esi
28: 89 d7 mov %edx,%edi
2a:* f3 a4 rep movsb %ds:(%esi),%es:(%edi)
<-- trapping instruction
2c: e8 c9 e4 ff ff call 0xffffe4fa
31: 01 5d e0 add %ebx,-0x20(%ebp)
34: 8b 5d e8 mov -0x18(%ebp),%ebx
37: b8 00 10 00 00 mov $0x1000,%eax
3c: 2b 5d e0 sub -0x20(%ebp),%ebx
and while it would be good to see the output of
scripts/decode_stacktrace.sh, I strongly suspect that the above is
vaddr = kmap_atomic(p);
memcpy(to + copied, vaddr + p_off, p_len);
kunmap_atomic(vaddr);
(although I wonder how/why the heck you've enabled
CC_OPTIMIZE_FOR_SIZE=y, which is what causes "memcpy()" to be done as
that "rep movsb". I thought we disabled it because it's so bad on most
cpus).
So that first "call" instruction is the kmap_atomic(), the "rep movs"
is the memcpy(), and the "call" instruction immediately after is the
kunmap_atomic().
Anyway, you can see vaddr in register state:
EAX: fff57000
so we've kmapped that one page at fff57000, but we're accessing past
it into the next page:
> BUG: unable to handle page fault for address: fff58000
with the current source address being (ESI: fff58000) and we still
have 248 bytes to go (ECX: 000000f8) even though we've already
overflowed into the next page.
You can see the original count still (EBX: 000005a8), so it really
looks like that skb_frag_foreach_page() logic
skb_frag_foreach_page(f,
skb_frag_off(f) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(to + copied, vaddr + p_off, p_len);
kunmap_atomic(vaddr);
}
must be wrong, and doesn't handle the "each page" part properly. It
must have started in the middle of the page, and p_len (that 0x5a8)
was wrong.
IOW, it really looks like p_off + p_len had the value 0x10f8, which is
larger than one page. And looking at the code, in
skb_frag_foreach_page(), I see:
p_off = (f_off) & (PAGE_SIZE - 1), \
p_len = skb_frag_must_loop(p) ? \
min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \
where that "min_t(u32, f_len, PAGE_SIZE - p_off)" looks correct, but
then presumably skb_frag_must_loop() must be wrong.
Oh, and when I look at that, I see
static inline bool skb_frag_must_loop(struct page *p)
{
#if defined(CONFIG_HIGHMEM)
if (PageHighMem(p))
return true;
#endif
return false;
}
and that is no longer true. With the kmap debugging, even non-highmem
pages need that "do one page at a time" code, because even non-highmem
pages get remapped by kmap().
IOW, I think the patch to fix this might be something like the attached.
I wonder whether there is other code that "knows" about kmap() only
affecting PageHighmem() pages thing that is no longer true.
Looking at some other code, skb_gro_reset_offset() looks suspiciously
like it also thinks highmem pages are special.
Adding the networking people involved in this area to the cc too.
Linus
On Wed, 6 Jan 2021 17:03:48 -0800
Linus Torvalds <[email protected]> wrote:
> (although I wonder how/why the heck you've enabled
> CC_OPTIMIZE_FOR_SIZE=y, which is what causes "memcpy()" to be done as
> that "rep movsb". I thought we disabled it because it's so bad on most
> cpus).
Why?
Because to test x86_32, I have a Fedora Core 13 (yes 13!) partition
(baremetal) that I use. And the .config I use for it hasn't changed
since that time ;-) (except to add new features that I want to test on
x86_32).
Anyway, I'll test out your patch. Thanks for investigating this.
-- Steve
On Wed, 6 Jan 2021 17:03:48 -0800 Linus Torvalds wrote:
> I wonder whether there is other code that "knows" about kmap() only
> affecting PageHighmem() pages thing that is no longer true.
>
> Looking at some other code, skb_gro_reset_offset() looks suspiciously
> like it also thinks highmem pages are special.
>
> Adding the networking people involved in this area to the cc too.
Thanks for the detailed analysis! skb_gro_reset_offset() checks if the
kernel can read data in the fragments directly as an optimization,
in case the entire header is in a fragment.
IIUC DEBUG_KMAP_LOCAL_FORCE_MAP only affects the mappings from
explicit kmap calls, which GRO won't make - it will fall back to
pulling the header out of the fragment and end up in skb_copy_bits(),
i.e. the loop you fixed. So GRO should be good. I think..
On Wed, 6 Jan 2021 17:03:48 -0800
Linus Torvalds <[email protected]> wrote:
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -366,7 +366,7 @@ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
> static inline bool skb_frag_must_loop(struct page *p)
> {
> #if defined(CONFIG_HIGHMEM)
> - if (PageHighMem(p))
> + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p))
> return true;
> #endif
> return false;
I applied this and I haven't been able to crash it again.
Thanks,
-- Steve
On Wed, Jan 6, 2021 at 8:49 PM Jakub Kicinski <[email protected]> wrote:
>
> On Wed, 6 Jan 2021 17:03:48 -0800 Linus Torvalds wrote:
> > I wonder whether there is other code that "knows" about kmap() only
> > affecting PageHighmem() pages thing that is no longer true.
> >
> > Looking at some other code, skb_gro_reset_offset() looks suspiciously
> > like it also thinks highmem pages are special.
> >
> > Adding the networking people involved in this area to the cc too.
>
> Thanks for the detailed analysis! skb_gro_reset_offset() checks if
> kernel can read data in the fragments directly as an optimization,
> in case the entire header is in a fragment.
>
> IIUC DEBUG_KMAP_LOCAL_FORCE_MAP only affects the mappings from
> explicit kmap calls, which GRO won't make - it will fall back to
> pulling the header out of the fragment and end up in skb_copy_bits(),
> i.e. the loop you fixed. So GRO should be good. I think..
Agreed. That code in skb_gro_reset_offset skips the GRO frag0
optimization in various cases, including if the first fragment is in
high mem.
That specific check goes back to the introduction of the frag0
optimization in commit 86911732d399 ("gro: Avoid copying headers of
unmerged packets"), at the time in helper skb_gro_header().
Very glad to hear that the fix addresses the crash in
skb_frag_foreach_page. Thanks!
On Wed, Jan 6, 2021 at 9:11 PM Willem de Bruijn <[email protected]> wrote:
>
> On Wed, Jan 6, 2021 at 8:49 PM Jakub Kicinski <[email protected]> wrote:
> >
> > On Wed, 6 Jan 2021 17:03:48 -0800 Linus Torvalds wrote:
> > > I wonder whether there is other code that "knows" about kmap() only
> > > affecting PageHighmem() pages thing that is no longer true.
> > >
> > > Looking at some other code, skb_gro_reset_offset() looks suspiciously
> > > like it also thinks highmem pages are special.
> > >
> > > Adding the networking people involved in this area to the cc too.
But there are three other kmap_atomic callers under net/ that do not
loop at all, so assume non-compound pages. In esp_output_head,
esp6_output_head and skb_seq_read. The first two directly use
skb_page_frag_refill, which can allocate compound (but not
__GFP_HIGHMEM) pages, and the third can be inserted with
netfilter xt_string in the path of tcp transmit skbs, which can also
have compound pages. I think that these could similarly access
data beyond the end of the kmap_atomic mapped page. I'll take
a closer look.
On Wed, Jan 6, 2021 at 8:45 PM Willem de Bruijn <[email protected]> wrote:
>
> But there are three other kmap_atomic callers under net/ that do not
> loop at all, so assume non-compound pages. In esp_output_head,
> esp6_output_head and skb_seq_read. The first two directly use
> skb_page_frag_refill, which can allocate compound (but not
> __GFP_HIGHMEM) pages, and the third can be inserted with
> netfilter xt_string in the path of tcp transmit skbs, which can also
> have compound pages. I think that these could similarly access
> data beyond the end of the kmap_atomic mapped page. I'll take
> a closer look.
Thanks.
Note that I have flushed my random one-liner patch from my system, and
expect to get a proper fix through the normal networking pulls.
And _if_ the networking people feel that my one-liner was the proper
fix, you can use it and add my sign-off if you want to, but it really
was more of a "this is the quick ugly fix for testing" rather than
anything else.
Linus
On Thu, 7 Jan 2021 11:47:02 -0800
Linus Torvalds <[email protected]> wrote:
> On Wed, Jan 6, 2021 at 8:45 PM Willem de Bruijn <[email protected]> wrote:
> >
> > But there are three other kmap_atomic callers under net/ that do not
> > loop at all, so assume non-compound pages. In esp_output_head,
> > esp6_output_head and skb_seq_read. The first two directly use
> > skb_page_frag_refill, which can allocate compound (but not
> > __GFP_HIGHMEM) pages, and the third can be inserted with
> > netfilter xt_string in the path of tcp transmit skbs, which can also
> > have compound pages. I think that these could similarly access
> > data beyond the end of the kmap_atomic mapped page. I'll take
> > a closer look.
>
> Thanks.
>
> Note that I have flushed my random one-liner patch from my system, and
> expect to get a proper fix through the normal networking pulls.
>
> And _if_ the networking people feel that my one-liner was the proper
> fix, you can use it and add my sign-off if you want to, but it really
> was more of a "this is the quick ugly fix for testing" rather than
> anything else.
>
Please add:
Link: https://lore.kernel.org/linux-mm/[email protected]/
Reported-by: Steven Rostedt (VMware) <[email protected]>
And if you take Linus's patch, please add my:
Tested-by: Steven Rostedt (VMware) <[email protected]>
and if you come up with another patch, please send it to me for testing.
Thanks!
-- Steve
On Thu, Jan 7, 2021 at 3:53 PM Steven Rostedt <[email protected]> wrote:
>
> On Thu, 7 Jan 2021 11:47:02 -0800
> Linus Torvalds <[email protected]> wrote:
>
> > On Wed, Jan 6, 2021 at 8:45 PM Willem de Bruijn <[email protected]> wrote:
> > >
> > > But there are three other kmap_atomic callers under net/ that do not
> > > loop at all, so assume non-compound pages. In esp_output_head,
> > > esp6_output_head and skb_seq_read. The first two directly use
> > > skb_page_frag_refill, which can allocate compound (but not
> > > __GFP_HIGHMEM) pages, and the third can be inserted with
> > > netfilter xt_string in the path of tcp transmit skbs, which can also
> > > have compound pages. I think that these could similarly access
> > > data beyond the end of the kmap_atomic mapped page. I'll take
> > > a closer look.
> >
> > Thanks.
> >
> > Note that I have flushed my random one-liner patch from my system, and
> > expect to get a proper fix through the normal networking pulls.
> >
> > And _if_ the networking people feel that my one-liner was the proper
> > fix, you can use it and add my sign-off if you want to, but it really
> > was more of a "this is the quick ugly fix for testing" rather than
> > anything else.
I do think it is the proper fix as is. If no one else has comments, I
can submit it through the net tree.
It won't address the other issues that became apparent only as a
result of this. I'm preparing separate patches for those.
> Please add:
>
> Link: https://lore.kernel.org/linux-mm/[email protected]/
> Reported-by: Steven Rostedt (VMware) <[email protected]>
>
> And if you take Linus's patch, please add my:
>
> Tested-by: Steven Rostedt (VMware) <[email protected]>
>
> and if you come up with another patch, please send it to me for testing.
>
> Thanks!
Will do, thanks.