Hello,
today I encountered a crash with a kernel built from a master branch
snapshot (commit e1ef035d272e):
CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G E 4.21.0-rc0-1.gd3c9245-default #1 openSUSE Tumbleweed (unreleased)
Hardware name: MICRO-STAR INTERANTIONAL CO.,LTD MS-7376/MS-7376, BIOS V1.2 12/21/2007
RIP: 0010:gart_unmap_page+0x69/0xc0
Code: 29 c5 48 c1 eb 0c 48 c1 ed 0c 85 db 7e 27 48 8b 35 dc 9b 8b 01 8d 53 ff 8b 0d bb 9b 8b 01 48 01 ea 48 8d 04 ae 48 8d 54 96 04 <89> 08 48 83 c0 04 48 39 c2 75 f5 48 c7 c7 88 b2 92 89 e8 00 23 81
RSP: 0018:ffff98a4e7a83dd8 EFLAGS: 00010002
RAX: 003f98a4e750a438 RBX: 0000000000000001 RCX: 0000000027788023
RDX: 003f98a4e750a43c RSI: ffff98a4e7750000 RDI: ffff98a4e74020b0
RBP: 000ffffffff6e90e R08: 0000000000000000 R09: ffffffff88071720
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001
R13: ffff98a4e74020b0 R14: 0000000000000001 R15: ffff98a4e2aea418
FS: 0000000000000000(0000) GS:ffff98a4e7a80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00002743b1f4f890 CR3: 00000001f5a70000 CR4: 00000000000006e0
Call Trace:
<IRQ>
gart_unmap_sg+0x52/0x70
__ata_qc_complete+0xfb/0x160
ata_qc_complete_multiple+0xbd/0xe0
ahci_handle_port_interrupt+0xcd/0x5a0
? blk_mq_run_hw_queue+0x38/0xc0
ahci_handle_port_intr+0x54/0xb0
ahci_single_level_irq_intr+0x3b/0x60
__handle_irq_event_percpu+0x46/0x1c0
handle_irq_event_percpu+0x20/0x60
handle_irq_event+0x3a/0x5a
handle_fasteoi_irq+0x9c/0x160
handle_irq+0x1f/0x30
do_IRQ+0x49/0xd0
common_interrupt+0xf/0xf
</IRQ>
This is the code of gart_unmap_page():
254 in ../arch/x86/kernel/amd_gart_64.c
0xffffffff81071660 <+0>: callq 0xffffffff81a01990 <__fentry__>
255 in ../arch/x86/kernel/amd_gart_64.c
256 in ../arch/x86/kernel/amd_gart_64.c
257 in ../arch/x86/kernel/amd_gart_64.c
258 in ../arch/x86/kernel/amd_gart_64.c
259 in ../arch/x86/kernel/amd_gart_64.c
0xffffffff81071665 <+5>: cmp $0xffffffffffffffff,%rsi
0xffffffff81071669 <+9>: je 0xffffffff8107171c <gart_unmap_page+188>
0xffffffff8107166f <+15>: mov 0x18b9c32(%rip),%rax # 0xffffffff8292b2a8 <iommu_bus_base>
0xffffffff81071676 <+22>: mov 0x18b9c23(%rip),%rcx # 0xffffffff8292b2a0 <iommu_size>
0xffffffff8107167d <+29>: add %rax,%rcx
0xffffffff81071680 <+32>: cmp %rsi,%rcx
0xffffffff81071683 <+35>: jbe 0xffffffff8107171c <gart_unmap_page+188>
260 in ../arch/x86/kernel/amd_gart_64.c
261 in ../arch/x86/kernel/amd_gart_64.c
262 in ../arch/x86/kernel/amd_gart_64.c
263 in ../arch/x86/kernel/amd_gart_64.c
0xffffffff81071689 <+41>: push %r12
0xffffffff8107168b <+43>: push %rbp
0xffffffff8107168c <+44>: mov %rsi,%rbp
0xffffffff8107168f <+47>: and $0xfff,%esi
0xffffffff81071695 <+53>: push %rbx
0xffffffff81071696 <+54>: lea 0xfff(%rdx,%rsi,1),%rbx
0xffffffff8107169e <+62>: sub %rax,%rbp
0xffffffff810716a1 <+65>: shr $0xc,%rbx
0xffffffff810716a5 <+69>: shr $0xc,%rbp
264 in ../arch/x86/kernel/amd_gart_64.c
265 in ../arch/x86/kernel/amd_gart_64.c
0xffffffff810716a9 <+73>: test %ebx,%ebx
0xffffffff810716ab <+75>: jle 0xffffffff810716d4 <gart_unmap_page+116>
0xffffffff810716ad <+77>: mov 0x18b9bdc(%rip),%rsi # 0xffffffff8292b290 <iommu_gatt_base>
0xffffffff810716b4 <+84>: lea -0x1(%rbx),%edx
0xffffffff810716b7 <+87>: mov 0x18b9bbb(%rip),%ecx # 0xffffffff8292b278 <gart_unmapped_entry>
0xffffffff810716bd <+93>: add %rbp,%rdx
0xffffffff810716c0 <+96>: lea (%rsi,%rbp,4),%rax
0xffffffff810716c4 <+100>: lea 0x4(%rsi,%rdx,4),%rdx
266 in ../arch/x86/kernel/amd_gart_64.c
0xffffffff810716c9 <+105>: mov %ecx,(%rax) <-------- crashed here
0xffffffff810716cb <+107>: add $0x4,%rax
0xffffffff810716cf <+111>: cmp %rax,%rdx
0xffffffff810716d2 <+114>: jne 0xffffffff810716c9 <gart_unmap_page+105>
...
According to the register values, i = 0 and npages = 1, but the problem
is iommu_page in %rbp, which is 0x000ffffffff6e90e. Judging by the way it
is calculated, it looks as if a kernel pointer was passed as dma_addr
rather than a DMA address.
There is a recent change in gart_unmap_page() from commit 9e8aa6b5461b
("x86/amd_gart: remove the mapping_error dma_map_ops method"):
------------------------------------------------------------------------
@@ -271,7 +259,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
int npages;
int i;
- if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+ if (dma_addr == DMA_MAPPING_ERROR ||
dma_addr >= iommu_bus_base + iommu_size)
return;
------------------------------------------------------------------------
It seems the condition removed by this commit would have caught such an
invalid value of dma_addr, so it's possible that this is an older problem
which was masked before and which commit 9e8aa6b5461b uncovered.
Michal Kubecek
Hi Michal,
can you try the patch below?
---
From 6b22ae23a1971646dacc8a0ad313a6329a04cf98 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <[email protected]>
Date: Fri, 4 Jan 2019 09:50:33 +0100
Subject: x86/amd_gart: fix unmapping of non-GART mappings
In many cases we don't have to create a GART mapping at all, which
also means there is nothing to unmap. Fix the range check that was
incorrectly modified when removing the mapping_error method.
Fixes: 9e8aa6b546 ("x86/amd_gart: remove the mapping_error dma_map_ops method")
Reported-by: Michal Kubecek <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
---
arch/x86/kernel/amd_gart_64.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e0ff3ac8c127..2c0aa34af69c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -256,7 +256,15 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
int npages;
int i;
- if (dma_addr == DMA_MAPPING_ERROR ||
+ if (WARN_ON_ONCE(dma_addr == DMA_MAPPING_ERROR))
+ return;
+
+ /*
+ * This driver will not always use a GART mapping, but might have
+ * created a direct mapping instead. If that is the case there is
+ * nothing to unmap here.
+ */
+ if (dma_addr < iommu_bus_base ||
dma_addr >= iommu_bus_base + iommu_size)
return;
--
2.20.1
On Fri, Jan 04, 2019 at 09:53:18AM +0100, Christoph Hellwig wrote:
> Hi Michal,
>
> can you try the patch below?
The machine has been running with it for 12 hours now without any
apparent problem. Without the patch it crashed once after ~10 minutes
and once after ~3 hours (then I switched back to 4.20). Thanks a lot for
the quick help.
Tested-by: Michal Kubecek <[email protected]>
>
> ---
> From 6b22ae23a1971646dacc8a0ad313a6329a04cf98 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <[email protected]>
> Date: Fri, 4 Jan 2019 09:50:33 +0100
> Subject: x86/amd_gart: fix unmapping of non-GART mappings
>
> In many cases we don't have to create a GART mapping at all, which
> also means there is nothing to unmap. Fix the range check that was
> incorrectly modified when removing the mapping_error method.
>
> Fixes: 9e8aa6b546 ("x86/amd_gart: remove the mapping_error dma_map_ops method")
> Reported-by: Michal Kubecek <[email protected]>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> arch/x86/kernel/amd_gart_64.c | 10 +++++++++-
> 1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
> index e0ff3ac8c127..2c0aa34af69c 100644
> --- a/arch/x86/kernel/amd_gart_64.c
> +++ b/arch/x86/kernel/amd_gart_64.c
> @@ -256,7 +256,15 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
> int npages;
> int i;
>
> - if (dma_addr == DMA_MAPPING_ERROR ||
> + if (WARN_ON_ONCE(dma_addr == DMA_MAPPING_ERROR))
> + return;
> +
> + /*
> + * This driver will not always use a GART mapping, but might have
> + * created a direct mapping instead. If that is the case there is
> + * nothing to unmap here.
> + */
> + if (dma_addr < iommu_bus_base ||
> dma_addr >= iommu_bus_base + iommu_size)
> return;
>
> --
> 2.20.1
>