The NVMe driver and other applications depend on the data offset
to operate correctly. Currently, when unaligned data is mapped via
SWIOTLB, the data is mapped slab-aligned within the SWIOTLB. When
booting with the --swiotlb=force option and using NVMe as the interface,
running mkfs.xfs on RHEL fails because of this misalignment.
This patch makes sure the mapped data preserves
the offset of the original address. Tested on the latest kernel to
confirm that this patch fixes the issue.
Signed-off-by: Jianxiong Gao <[email protected]>
Acked-by: David Rientjes <[email protected]>
---
kernel/dma/swiotlb.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 781b9dca197c..56a35e71b3fd 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -483,6 +483,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
max_slots = mask + 1
? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+
+ /*
+ * We need to keep the offset when mapping, so add the offset
+ * to the total size we need to allocate in SWIOTLB.
+ */
+ alloc_size += offset_in_page(orig_addr);
/*
* For mappings greater than or equal to a page, we limit the stride
@@ -567,6 +573,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
*/
for (i = 0; i < nslots; i++)
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
+ /*
+ * When keeping the offset of the original data, we need to advance
+ * the tlb_addr by the offset of orig_addr.
+ */
+ tlb_addr += orig_addr & (PAGE_SIZE - 1);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
--
2.27.0
On Mon, Dec 07, 2020 at 01:42:04PM -0800, Jianxiong Gao wrote:
> NVMe driver and other applications depend on the data offset
> to operate correctly. Currently when unaligned data is mapped via
> SWIOTLB, the data is mapped as slab aligned with the SWIOTLB. When
> booting with --swiotlb=force option and using NVMe as interface,
> running mkfs.xfs on Rhel fails because of the unalignment issue.
> This patch makes sure the mapped data preserves
> its offset of the orginal address. Tested on latest kernel that
> this patch fixes the issue.
Let's reword this commit message a bit more, since you are not providing
the RHEL bug and are instead focusing on the upstream kernel.
I can do that for you.
>
> Signed-off-by: Jianxiong Gao <[email protected]>
> Acked-by: David Rientjes <[email protected]>
> ---
> kernel/dma/swiotlb.c | 11 +++++++++++
> 1 file changed, 11 insertions(+)
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 781b9dca197c..56a35e71b3fd 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -483,6 +483,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
> max_slots = mask + 1
> ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
> : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
> +
> + /*
> + * We need to keep the offset when mapping, so adding the offset
> + * to the total set we need to allocate in SWIOTLB
> + */
> + alloc_size += offset_in_page(orig_addr);
>
> /*
> * For mappings greater than or equal to a page, we limit the stride
> @@ -567,6 +573,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
> */
> for (i = 0; i < nslots; i++)
> io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
> + /*
> + * When keeping the offset of the original data, we need to advance
> + * the tlb_addr by the offset of orig_addr.
> + */
> + tlb_addr += orig_addr & (PAGE_SIZE - 1);
> if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
> (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
> swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
> --
> 2.27.0
>
>
On Mon, Dec 07, 2020 at 01:42:04PM -0800, Jianxiong Gao wrote:
> NVMe driver and other applications depend on the data offset
> to operate correctly. Currently when unaligned data is mapped via
> SWIOTLB, the data is mapped as slab aligned with the SWIOTLB. When
> booting with --swiotlb=force option and using NVMe as interface,
> running mkfs.xfs on Rhel fails because of the unalignment issue.
> This patch makes sure the mapped data preserves
> its offset of the orginal address. Tested on latest kernel that
> this patch fixes the issue.
>
> Signed-off-by: Jianxiong Gao <[email protected]>
> Acked-by: David Rientjes <[email protected]>
This breaks DHCP with the upstream kernel: I applied this on top of v5.10-rc7
and booted with swiotlb=262144,force, and now dhclient is not working:
[ 119.300502] bnxt_en 0000:3b:00.0 eno2np0: NIC Link is Up, 25000 Mbps full duplex, Flow control: ON - receive & transmit
[ 119.437573] bnxt_en 0000:3b:00.0 eno2np0: FEC autoneg off encoding: None
[ 90.064220] dracut-initqueue[1477]: Warning: dhcp for interface eno2np0 failed
[ 101.155295] dracut-initqueue[1477]: Warning: dhcp for interface eno2np0 failed
[ 142.361359] bnxt_en 0000:3b:00.1 eno3np1: NIC Link is Up, 25000 Mbps full duplex, Flow control: ON - receive & transmit
[ 142.501860] bnxt_en 0000:3b:00.1 eno3np1: FEC autoneg off encoding: None
[ 113.054108] dracut-initqueue[1477]: Warning: dhcp for interface eno3np1 failed
[ 123.867108] dracut-initqueue[1477]: Warning: dhcp for interface eno3np1 failed
[ 251.888002] dracut-initqueue[1477]: Warning: dracut-initqueue timeout - starting timeout scripts
Dropping from linux-next.
> ---
> kernel/dma/swiotlb.c | 11 +++++++++++
> 1 file changed, 11 insertions(+)
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 781b9dca197c..56a35e71b3fd 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -483,6 +483,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
> max_slots = mask + 1
> ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
> : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
> +
> + /*
> + * We need to keep the offset when mapping, so adding the offset
> + * to the total set we need to allocate in SWIOTLB
> + */
> + alloc_size += offset_in_page(orig_addr);
>
> /*
> * For mappings greater than or equal to a page, we limit the stride
> @@ -567,6 +573,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
> */
> for (i = 0; i < nslots; i++)
> io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
> + /*
> + * When keeping the offset of the original data, we need to advance
> + * the tlb_addr by the offset of orig_addr.
> + */
> + tlb_addr += orig_addr & (PAGE_SIZE - 1);
> if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
> (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
> swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
> --
> 2.27.0
>
>
Greetings,
FYI, we noticed the following commit (built with gcc-9):
commit: c2f4ca83b5ff95fd02b404b38072f4b92adf68dd ("[PATCH] [PATCH] Keep offset when mapping data via SWIOTLB.")
url: https://github.com/0day-ci/linux/commits/Jianxiong-Gao/Keep-offset-when-mapping-data-via-SWIOTLB/20201208-054854
base: https://git.kernel.org/cgit/linux/kernel/git/konrad/swiotlb.git linux-next
in testcase: locktorture
version:
with following parameters:
runtime: 300s
test: default
test-description: This torture test consists of creating a number of kernel threads which acquire the lock and hold it for specific amount of time, thus simulating different critical region behaviors.
test-url: https://www.kernel.org/doc/Documentation/locking/locktorture.txt
on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 8G
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+----------------------------------------------------------------------------+------------+------------+
| | fc0021aa34 | c2f4ca83b5 |
+----------------------------------------------------------------------------+------------+------------+
| boot_successes | 0 | 0 |
| boot_failures | 4 | 12 |
| Kernel_panic-not_syncing:VFS:Unable_to_mount_root_fs_on_unknown-block(#,#) | 4 | 1 |
| BUG:KASAN:slab-out-of-bounds_in_e1000_clean_rx_irq | 0 | 1 |
| IP-Config:Auto-configuration_of_network_failed | 0 | 6 |
| BUG:KASAN:use-after-free_in_dma_unmap_page_attrs | 0 | 5 |
| BUG:KASAN:use-after-free_in_e1000_clean_rx_irq | 0 | 1 |
+----------------------------------------------------------------------------+------------+------------+
If you fix the issue, kindly add the following tag:
Reported-by: kernel test robot <[email protected]>
[ 21.445939] BUG: KASAN: use-after-free in dma_unmap_page_attrs+0xce/0x158
[ 21.446537] Write of size 1522 at addr ffff8881539a0180 by task swapper/1
[ 21.447132]
[ 21.447282] CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc1-00003-gc2f4ca83b5ff #1
[ 21.447966] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 21.448694] Call Trace:
[ 21.448922] <IRQ>
[ 21.449119] print_address_description+0x1c/0x413
[ 21.449524] ? lock_acquire+0x124/0x153
[ 21.449764] ? kasan_report+0xc0/0x190
[ 21.450007] ? dma_unmap_page_attrs+0xce/0x158
[ 21.450284] kasan_report+0x157/0x190
[ 21.450517] ? dma_unmap_page_attrs+0xce/0x158
[ 21.450798] check_memory_region+0x13d/0x145
[ 21.451066] memcpy+0x39/0x58
[ 21.451257] dma_unmap_page_attrs+0xce/0x158
[ 21.451528] e1000_clean_rx_irq+0x41e/0xb8e
[ 21.451802] ? e1000_clean_jumbo_rx_irq+0x135c/0x135c
[ 21.452115] e1000_clean+0xef/0x4f3
[ 21.452336] ? trace_hardirqs_on+0x2d/0x38
[ 21.452592] ? _raw_spin_unlock_irq+0x24/0x2d
[ 21.452865] ? e1000_clean_tx_irq+0x125b/0x125b
[ 21.453151] ? tracer_hardirqs_on+0x16/0x172
[ 21.453432] net_rx_action+0x257/0x5ec
[ 21.453670] ? napi_busy_loop+0x20d/0x20d
[ 21.453926] ? tracer_hardirqs_on+0x16/0x172
[ 21.454196] __do_softirq+0x24d/0x545
[ 21.454433] asm_call_irq_on_stack+0xf/0x20
[ 21.454692] </IRQ>
[ 21.454832] do_softirq_own_stack+0x2e/0x3a
[ 21.455093] do_softirq+0x44/0x54
[ 21.455303] __local_bh_enable_ip+0x4f/0x5d
[ 21.455563] __dev_queue_xmit+0xce4/0xd30
[ 21.455818] ? netdev_core_pick_tx+0x1b9/0x1b9
[ 21.456094] ? memset+0x22/0x42
[ 21.456294] ? __alloc_skb+0x345/0x47d
[ 21.456531] ? skb_scrub_packet+0x170/0x170
[ 21.456792] ? __x64_sys_getrandom+0xe5/0xe5
[ 21.457058] ? memcpy+0x39/0x58
[ 21.457269] ic_bootp_send_if+0x11a1/0x11ce
[ 21.457537] ip_auto_config+0x476/0xb7e
[ 21.457780] ? root_nfs_parse_addr+0xfc/0xfc
[ 21.458047] ? lock_downgrade+0x4aa/0x4aa
[ 21.458299] ? lock_acquire+0x124/0x153
[ 21.458546] ? root_nfs_parse_addr+0xfc/0xfc
[ 21.458810] ? do_one_initcall+0xf2/0x25a
[ 21.459059] do_one_initcall+0xf2/0x25a
[ 21.459298] ? perf_trace_initcall_level+0x2eb/0x2eb
[ 21.459604] ? parameq+0x2d/0x2d
[ 21.459807] ? kasan_save_stack+0x25/0x3c
[ 21.460056] ? __kasan_kmalloc+0x70/0x7e
[ 21.460363] ? trace_kmalloc+0x44/0x54
[ 21.460597] ? __kmalloc+0x13a/0x164
[ 21.460826] do_basic_setup+0x1bb/0x1ea
[ 21.461071] kernel_init_freeable+0x130/0x15a
[ 21.461349] ? rest_init+0x12f/0x12f
[ 21.461575] kernel_init+0xd/0x10c
[ 21.461790] ret_from_fork+0x1f/0x30
[ 21.462026]
[ 21.462127] Allocated by task 1:
[ 21.462333] kasan_save_stack+0x1b/0x3c
[ 21.462574] kasan_set_track+0x1b/0x20
[ 21.462808] __kasan_kmalloc+0x70/0x7e
[ 21.463104] slab_post_alloc_hook+0x3c/0x167
[ 21.463369] kmem_cache_alloc+0xd5/0x150
[ 21.463614] acpi_ut_create_generic_state+0x5d/0x93
[ 21.463914] acpi_ut_create_update_state+0x18/0xac
[ 21.464210] acpi_ut_create_update_state_and_push+0x2a/0x46
[ 21.464552] acpi_ut_update_object_reference+0x331/0x548
[ 21.464879] acpi_ds_do_implicit_return+0x19a/0x1a7
[ 21.465180] acpi_ds_is_result_used+0xa5/0x6e2
[ 21.465466] acpi_ds_delete_result_if_not_used+0xe3/0x14e
[ 21.465799] acpi_ds_exec_end_op+0x127f/0x12f1
[ 21.466075] acpi_ps_parse_loop+0x14ac/0x15c6
[ 21.466345] acpi_ps_parse_aml+0x40c/0xc3b
[ 21.466601] acpi_ps_execute_method+0x672/0x7e6
[ 21.466886] acpi_ns_evaluate+0xa53/0xf5b
[ 21.467137] acpi_ut_evaluate_object+0x11d/0x452
[ 21.467423] acpi_rs_get_prt_method_data+0x93/0x123
[ 21.467725] acpi_get_irq_routing_table+0xbb/0x110
[ 21.468023] acpi_pci_irq_find_prt_entry+0x155/0xa07
[ 21.468329] acpi_pci_irq_lookup+0x80/0x84b
[ 21.468589] acpi_pci_irq_enable+0x281/0x512
[ 21.468857] do_pci_enable_device+0x86/0x157
[ 21.469122] pci_enable_device_flags+0x1d1/0x223
[ 21.469497] e1000_probe+0x117/0x2327
[ 21.469822] pci_device_probe+0x19d/0x32b
[ 21.470179] really_probe+0x321/0x7fd
[ 21.470507] driver_probe_device+0xeb/0x13f
[ 21.470879] device_driver_attach+0xc6/0xfd
[ 21.471251] __driver_attach+0x141/0x148
[ 21.471600] bus_for_each_dev+0xfd/0x149
[ 21.471949] bus_add_driver+0x2bb/0x455
[ 21.472291] driver_register+0x247/0x2c6
[ 21.472642] e1000_init_module+0x42/0x77
[ 21.472995] do_one_initcall+0xf2/0x25a
[ 21.473332] do_basic_setup+0x1bb/0x1ea
[ 21.473674] kernel_init_freeable+0x130/0x15a
[ 21.474053] kernel_init+0xd/0x10c
[ 21.474358] ret_from_fork+0x1f/0x30
[ 21.474677]
To reproduce:
# build kernel
cd linux
cp config-5.10.0-rc1-00003-gc2f4ca83b5ff .config
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 olddefconfig prepare modules_prepare bzImage modules
make HOSTCC=gcc-9 CC=gcc-9 ARCH=x86_64 INSTALL_MOD_PATH=<mod-install-dir> modules_install
cd <mod-install-dir>
find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> -m modules.cgz job-script # job-script is attached in this email
Thanks,
Rong Chen