2024-03-12 06:36:09

by Mirsad Todorovac

[permalink] [raw]
Subject: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

Hi,

(This is verified on the second test box.)

In the most recent 6.8.0 release of torvalds tree kernel with selftest configs on,
process ./iommufd appears to consume 99% of a CPU core for quote a while in an
endless loop:

root 59502 8816 0 Mar11 pts/2 00:00:00 make OUTPUT=/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu -C iommu run_tests O=/home/marvin/linux/kernel/linux_torvalds
root 59503 59502 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59516 59503 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59517 59516 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59518 59517 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59522 59518 0 Mar11 pts/2 00:00:00 /bin/sh -c BASE_DIR="/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests"; . /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/runner.sh; if [ "X" != "X" ]; then per_test_logging=1; fi; run_many /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_fail_nth
root 59523 59522 0 Mar11 pts/2 00:00:00 perl /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/kselftest/prefix.pl
root 59635 2367 99 Mar11 pts/2 11:28:03 ./iommufd

root@stargazer:/home/marvin# strace -p 59635
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
ioctl(5, _IOC(_IOC_NONE, 0x3b, 0xa0, 0), 0x7ffdd9eebc00) = 0
.
.
.

Please find attached config. It is the vanilla kernel, the build suite marked it "dirty"
because of the modifications to the selftests (adding debug option, mostly).

The kseltest output is:

make[3]: Entering directory '/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu'
TAP version 13
1..2
# timeout set to 45
# selftests: iommu: iommufd
# TAP version 13
# 1..180
# # Starting 180 tests from 18 test cases.
# # RUN iommufd.simple_close ...
# # OK iommufd.simple_close
# ok 1 iommufd.simple_close
# # RUN iommufd.cmd_fail ...
# # OK iommufd.cmd_fail
# ok 2 iommufd.cmd_fail
# # RUN iommufd.cmd_length ...
# # OK iommufd.cmd_length
# ok 3 iommufd.cmd_length
# # RUN iommufd.cmd_ex_fail ...
# # OK iommufd.cmd_ex_fail
# ok 4 iommufd.cmd_ex_fail
# # RUN iommufd.global_options ...
# # OK iommufd.global_options
# ok 5 iommufd.global_options
# # RUN iommufd.simple_ioctls ...
# # OK iommufd.simple_ioctls
# ok 6 iommufd.simple_ioctls
# # RUN iommufd.unmap_cmd ...
# # OK iommufd.unmap_cmd
# ok 7 iommufd.unmap_cmd
# # RUN iommufd.map_cmd ...
# # OK iommufd.map_cmd
# ok 8 iommufd.map_cmd
# # RUN iommufd.info_cmd ...
# # OK iommufd.info_cmd
# ok 9 iommufd.info_cmd
# # RUN iommufd.set_iommu_cmd ...
# # OK iommufd.set_iommu_cmd
# ok 10 iommufd.set_iommu_cmd
# # RUN iommufd.vfio_ioas ...
# # OK iommufd.vfio_ioas
# ok 11 iommufd.vfio_ioas
# # RUN iommufd_ioas.no_domain.ioas_auto_destroy ...
# # OK iommufd_ioas.no_domain.ioas_auto_destroy
# ok 12 iommufd_ioas.no_domain.ioas_auto_destroy
# # RUN iommufd_ioas.no_domain.ioas_destroy ...
# # OK iommufd_ioas.no_domain.ioas_destroy
# ok 13 iommufd_ioas.no_domain.ioas_destroy
# # RUN iommufd_ioas.no_domain.alloc_hwpt_nested ...
# # OK iommufd_ioas.no_domain.alloc_hwpt_nested
# ok 14 iommufd_ioas.no_domain.alloc_hwpt_nested
# # RUN iommufd_ioas.no_domain.hwpt_attach ...
# # iommufd.c:541:hwpt_attach:Expected 2 (2) == errno (22)
# # hwpt_attach: Test failed at step #6
# # FAIL iommufd_ioas.no_domain.hwpt_attach
# not ok 15 iommufd_ioas.no_domain.hwpt_attach
# # RUN iommufd_ioas.no_domain.ioas_area_destroy ...
# # OK iommufd_ioas.no_domain.ioas_area_destroy
# ok 16 iommufd_ioas.no_domain.ioas_area_destroy
# # RUN iommufd_ioas.no_domain.ioas_area_auto_destroy ...
# # OK iommufd_ioas.no_domain.ioas_area_auto_destroy
# ok 17 iommufd_ioas.no_domain.ioas_area_auto_destroy
# # RUN iommufd_ioas.no_domain.get_hw_info ...
# # OK iommufd_ioas.no_domain.get_hw_info
# ok 18 iommufd_ioas.no_domain.get_hw_info
# # RUN iommufd_ioas.no_domain.area ...
# # OK iommufd_ioas.no_domain.area
# ok 19 iommufd_ioas.no_domain.area
# # RUN iommufd_ioas.no_domain.unmap_fully_contained_areas ...
# # OK iommufd_ioas.no_domain.unmap_fully_contained_areas
# ok 20 iommufd_ioas.no_domain.unmap_fully_contained_areas
# # RUN iommufd_ioas.no_domain.area_auto_iova ...
# # OK iommufd_ioas.no_domain.area_auto_iova
# ok 21 iommufd_ioas.no_domain.area_auto_iova
# # RUN iommufd_ioas.no_domain.area_allowed ...
# # OK iommufd_ioas.no_domain.area_allowed
# ok 22 iommufd_ioas.no_domain.area_allowed
# # RUN iommufd_ioas.no_domain.copy_area ...
# # OK iommufd_ioas.no_domain.copy_area
# ok 23 iommufd_ioas.no_domain.copy_area
# # RUN iommufd_ioas.no_domain.iova_ranges ...
# # OK iommufd_ioas.no_domain.iova_ranges
# ok 24 iommufd_ioas.no_domain.iova_ranges
# # RUN iommufd_ioas.no_domain.access_domain_destory ...
# # iommufd.c:916:access_domain_destory:Expected MAP_FAILED (18446744073709551615) != buf (18446744073709551615)
# # access_domain_destory: Test terminated by timeout
# # FAIL iommufd_ioas.no_domain.access_domain_destory
# not ok 25 iommufd_ioas.no_domain.access_domain_destory
# # RUN iommufd_ioas.no_domain.access_pin ...
# # iommufd.c:991:access_pin:Expected 0 (0) == _test_cmd_mock_domain(self->fd, self->ioas_id, &mock_stdev_id, &mock_hwpt_id, ((void *)0)) (-1)

The failing assert seems to be here:

987 /* Add/remove a domain with a user */
988 ASSERT_EQ(0, ioctl(self->fd,
989 _IOMMU_TEST_CMD(IOMMU_TEST_OP_ACCESS_PAGES),
990 &access_cmd));
→ 991 test_cmd_mock_domain(self->ioas_id, &mock_stdev_id,
992 &mock_hwpt_id, NULL);
993 check_map_cmd.id = mock_hwpt_id;
994 ASSERT_EQ(0, ioctl(self->fd,
995 _IOMMU_TEST_CMD(IOMMU_TEST_OP_MD_CHECK_MAP),
996 &check_map_cmd));

For those of you who still do not have a clue what went wrong (like myself), I am trying
to generate a reproducer.

attempt to tap gdb on the running ./iommufd gave this:

root@defiant:/home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu# gdb ./iommufd --pid 63963
GNU gdb (Ubuntu 12.1-0ubuntu1~22.04.1) 12.1
Copyright (C) 2022 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./iommufd...
Attaching to program: /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd, process 63963
Reading symbols from /usr/libexec/coreutils/libstdbuf.so...
(No debugging symbols found in /usr/libexec/coreutils/libstdbuf.so)
Reading symbols from /lib/x86_64-linux-gnu/libc.so.6...
Reading symbols from /usr/lib/debug/.build-id/c2/89da5071a3399de893d2af81d6a30c62646e1e.debug...
Reading symbols from /lib64/ld-linux-x86-64.so.2...
Reading symbols from /usr/lib/debug/.build-id/15/921ea631d9f36502d20459c43e5c85b7d6ab76.debug...
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
__GI___ioctl (fd=5, request=request@entry=15264) at ../sysdeps/unix/sysv/linux/ioctl.c:36
36 ../sysdeps/unix/sysv/linux/ioctl.c: No such file or directory.
(gdb) bt
#0 __GI___ioctl (fd=5, request=request@entry=15264) at ../sysdeps/unix/sysv/linux/ioctl.c:36
#1 0x000057ed23d1f1ae in _test_ioctl_set_temp_memory_limit (limit=65536, fd=<optimized out>) at /home/marvin/linux/kernel/linux_torvalds/tools/testing/selftests/iommu/iommufd_utils.h:585
#2 iommufd_ioas_teardown (_metadata=_metadata@entry=0x57ed23d42860 <_iommufd_ioas_access_domain_destory_object>, self=self@entry=0x7ffdcdce5ef0, variant=<optimized out>) at iommufd.c:229
#3 0x000057ed23d23b7f in wrapper_iommufd_ioas_access_domain_destory (_metadata=0x57ed23d42860 <_iommufd_ioas_access_domain_destory_object>, variant=0x57ed23d43860 <_iommufd_ioas_mock_domain_object>) at iommufd.c:902
#4 0x000057ed23d1bfe9 in __run_test (f=f@entry=0x57ed23d438a0 <_iommufd_ioas_fixture_object>, variant=variant@entry=0x57ed23d43860 <_iommufd_ioas_mock_domain_object>,
t=t@entry=0x57ed23d42860 <_iommufd_ioas_access_domain_destory_object>) at ../kselftest_harness.h:1134
#5 0x000057ed23d12146 in test_harness_run (argv=0x7ffdcdce61a8, argc=1) at ../kselftest_harness.h:1199
#6 main (argc=1, argv=0x7ffdcdce61a8) at iommufd.c:2349
(gdb) list iommufd.c:2349
2344 &unmap_cmd));
2345 }
2346 }
2347 }
2348
2349 TEST_HARNESS_MAIN
(gdb)

Hope this helps someone.

Best regards,
Mirsad Todorovac


Attachments:
config-6.8.0-torv-00711-g045395d86acd-dirty.xz (58.01 kB)

2024-03-19 13:59:38

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On Tue, Mar 12, 2024 at 07:35:40AM +0100, Mirsad Todorovac wrote:
> Hi,
>
> (This is verified on the second test box.)
>
> In the most recent 6.8.0 release of torvalds tree kernel with selftest configs on,
> process ./iommufd appears to consume 99% of a CPU core for quote a while in an
> endless loop:

There is a "bug" in the ksefltest framework where if you call a
kselftest assertion from the setup/teardown it infinite loops

The fix I know is to replace kselftest assertions with normal assert()

But I don't see an obvious thing here saying you are hitting that..

Jason

2024-03-23 20:13:43

by Mirsad Todorovac

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()



On 3/19/24 14:58, Jason Gunthorpe wrote:
> On Tue, Mar 12, 2024 at 07:35:40AM +0100, Mirsad Todorovac wrote:
>> Hi,
>>
>> (This is verified on the second test box.)
>>
>> In the most recent 6.8.0 release of torvalds tree kernel with selftest configs on,
>> process ./iommufd appears to consume 99% of a CPU core for quote a while in an
>> endless loop:
>
> There is a "bug" in the ksefltest framework where if you call a
> kselftest assertion from the setup/teardown it infinite loops
>
> The fix I know is to replace kselftest assertions with normal assert()
>
> But I don't see an obvious thing here saying you are hitting that..
>
> Jason

Hi,

I'm not that deep into kselftest for that intervention.

Yet, with the v6.8-11743-ga4145ce1e7bc build, the problem with ./iommufd did not stuck.
Instead I got these 10 failed tests:

# # RUN iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # enforce_dirty: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty
# not ok 156 iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty
# # RUN iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # set_dirty_tracking: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking
# not ok 157 iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking
# # RUN iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # device_dirty_capability: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability
# not ok 158 iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability
# # RUN iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # get_dirty_bitmap: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap
# not ok 159 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap
# # RUN iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # get_dirty_bitmap_no_clear: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear
# not ok 160 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear
.
.
.
# # RUN iommufd_dirty_tracking.domain_dirty256M_huge.enforce_dirty ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # enforce_dirty: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty256M_huge.enforce_dirty
# not ok 166 iommufd_dirty_tracking.domain_dirty256M_huge.enforce_dirty
# # RUN iommufd_dirty_tracking.domain_dirty256M_huge.set_dirty_tracking ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # set_dirty_tracking: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty256M_huge.set_dirty_tracking
# not ok 167 iommufd_dirty_tracking.domain_dirty256M_huge.set_dirty_tracking
# # RUN iommufd_dirty_tracking.domain_dirty256M_huge.device_dirty_capability ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # device_dirty_capability: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty256M_huge.device_dirty_capability
# not ok 168 iommufd_dirty_tracking.domain_dirty256M_huge.device_dirty_capability
# # RUN iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # get_dirty_bitmap: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap
# not ok 169 iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap
# # RUN iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap_no_clear ...
# iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc == self->buffer' failed.
# # get_dirty_bitmap_no_clear: Test terminated by assertion
# # FAIL iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap_no_clear
# not ok 170 iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap_no_clear
.
.
.
# # FAILED: 170 / 180 tests passed.
# # Totals: pass:170 fail:10 xfail:0 xpass:0 skip:0 error:0
not ok 1 selftests: iommu: iommufd # exit=1

It seems like the same assertion failed in all 10 failed tests?

However, I am not smart enough to figure out why ...

Apparently, from the source, mmap() fails to allocate pages on the desired address:

1746 assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
1747 vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
1748 mmap_flags, -1, 0);
→ 1749 assert(vrc == self->buffer);
1750

But I am not that deep into the source to figure our what was intended and what went
wrong :-/

Best regards,
Mirsad Todorovac

2024-03-25 15:06:37

by Joao Martins

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On 23/03/2024 20:13, Mirsad Todorovac wrote:
>
>
> On 3/19/24 14:58, Jason Gunthorpe wrote:
>> On Tue, Mar 12, 2024 at 07:35:40AM +0100, Mirsad Todorovac wrote:
>>> Hi,
>>>
>>> (This is verified on the second test box.)
>>>
>>> In the most recent 6.8.0 release of torvalds tree kernel with selftest
>>> configs on,
>>> process ./iommufd appears to consume 99% of a CPU core for quote a while in an
>>> endless loop:
>>
>> There is a "bug" in the ksefltest framework where if you call a
>> kselftest assertion from the setup/teardown it infinite loops
>>
>> The fix I know is to replace kselftest assertions with normal assert()
>>
>> But I don't see an obvious thing here saying you are hitting that..
>>
>> Jason
>
> Hi,
>
> I'm not that deep into kselftest for that intervention.
>
> Yet, with the v6.8-11743-ga4145ce1e7bc build, the problem with ./iommufd did not
> stuck.
> Instead I got these 10 failed tests:
>
> # #  RUN           iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # enforce_dirty: Test terminated by assertion
> # #          FAIL  iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty
> # not ok 156 iommufd_dirty_tracking.domain_dirty128M_huge.enforce_dirty
> # #  RUN          
> iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # set_dirty_tracking: Test terminated by assertion
> # #          FAIL  iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking
> # not ok 157 iommufd_dirty_tracking.domain_dirty128M_huge.set_dirty_tracking
> # #  RUN          
> iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # device_dirty_capability: Test terminated by assertion
> # #          FAIL 
> iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability
> # not ok 158 iommufd_dirty_tracking.domain_dirty128M_huge.device_dirty_capability
> # #  RUN           iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap
> ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # get_dirty_bitmap: Test terminated by assertion
> # #          FAIL  iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap
> # not ok 159 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap
> # #  RUN          
> iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # get_dirty_bitmap_no_clear: Test terminated by assertion
> # #          FAIL 
> iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear
> # not ok 160 iommufd_dirty_tracking.domain_dirty128M_huge.get_dirty_bitmap_no_clear
> .
> .
> .
> # #  RUN           iommufd_dirty_tracking.domain_dirty256M_huge.enforce_dirty ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # enforce_dirty: Test terminated by assertion
> # #          FAIL  iommufd_dirty_tracking.domain_dirty256M_huge.enforce_dirty
> # not ok 166 iommufd_dirty_tracking.domain_dirty256M_huge.enforce_dirty
> # #  RUN          
> iommufd_dirty_tracking.domain_dirty256M_huge.set_dirty_tracking ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # set_dirty_tracking: Test terminated by assertion
> # #          FAIL  iommufd_dirty_tracking.domain_dirty256M_huge.set_dirty_tracking
> # not ok 167 iommufd_dirty_tracking.domain_dirty256M_huge.set_dirty_tracking
> # #  RUN          
> iommufd_dirty_tracking.domain_dirty256M_huge.device_dirty_capability ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # device_dirty_capability: Test terminated by assertion
> # #          FAIL 
> iommufd_dirty_tracking.domain_dirty256M_huge.device_dirty_capability
> # not ok 168 iommufd_dirty_tracking.domain_dirty256M_huge.device_dirty_capability
> # #  RUN           iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap
> ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # get_dirty_bitmap: Test terminated by assertion
> # #          FAIL  iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap
> # not ok 169 iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap
> # #  RUN          
> iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap_no_clear ...
> # iommufd: iommufd.c:1749: iommufd_dirty_tracking_setup: Assertion `vrc ==
> self->buffer' failed.
> # # get_dirty_bitmap_no_clear: Test terminated by assertion
> # #          FAIL 
> iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap_no_clear
> # not ok 170 iommufd_dirty_tracking.domain_dirty256M_huge.get_dirty_bitmap_no_clear
> .
> .
> .
> # # FAILED: 170 / 180 tests passed.
> # # Totals: pass:170 fail:10 xfail:0 xpass:0 skip:0 error:0
> not ok 1 selftests: iommu: iommufd # exit=1
>
> It seems like the same assertion failed in all 10 failed tests?
>

.. It means that the hugetlb mmap() failed, which is required for this specific
tests. Because we need to allocate a bigger IOVA range, and in hugepages to
exercise the test.


> However, I am not smart enough to figure out why ...
>
> Apparently, from the source, mmap() fails to allocate pages on the desired address:
>
>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
> PROT_WRITE,
>   1748                    mmap_flags, -1, 0);
> → 1749         assert(vrc == self->buffer);
>   1750
>
> But I am not that deep into the source to figure our what was intended and what
> went
> wrong :-/

I can SKIP() the test rather assert() in here if it helps. Though there are
other tests that fail if no hugetlb pages are reserved.

But I am not sure if this is problem here as the initial bug email had an
enterily different set of failures? Maybe all you need is an assert() and it
gets into this state?

Joao

2024-03-25 16:37:01

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
> > However, I am not smart enough to figure out why ...
> >
> > Apparently, from the source, mmap() fails to allocate pages on the desired address:
> >
> >   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
> >   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
> > PROT_WRITE,
> >   1748                    mmap_flags, -1, 0);
> > → 1749         assert(vrc == self->buffer);
> >   1750
> >
> > But I am not that deep into the source to figure our what was intended and what
> > went
> > wrong :-/
>
> I can SKIP() the test rather assert() in here if it helps. Though there are
> other tests that fail if no hugetlb pages are reserved.
>
> But I am not sure if this is problem here as the initial bug email had an
> enterily different set of failures? Maybe all you need is an assert() and it
> gets into this state?

I feel like there is something wrong with the kselftest framework,
there should be some way to fail the setup/teardown operations without
triggering an infinite loop :(

Jason

2024-03-27 10:42:43

by Joao Martins

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On 25/03/2024 13:52, Jason Gunthorpe wrote:
> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>> However, I am not smart enough to figure out why ...
>>>
>>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
>>>
>>>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>> PROT_WRITE,
>>>   1748                    mmap_flags, -1, 0);
>>> → 1749         assert(vrc == self->buffer);
>>>   1750
>>>
>>> But I am not that deep into the source to figure our what was intended and what
>>> went
>>> wrong :-/
>>
>> I can SKIP() the test rather assert() in here if it helps. Though there are
>> other tests that fail if no hugetlb pages are reserved.
>>
>> But I am not sure if this is problem here as the initial bug email had an
>> enterily different set of failures? Maybe all you need is an assert() and it
>> gets into this state?
>
> I feel like there is something wrong with the kselftest framework,
> there should be some way to fail the setup/teardown operations without
> triggering an infinite loop :(

I am now wondering if the problem is the fact that we have an assert() in the
middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
other kselftest macro that). The expect/assert macros from kselftest() don't do
asserts and it looks like we are failing mid tests in the assert().

Maybe it is OK for setup_sizes(), but maybe not OK for the rest (i.e. during the
actual setup / tests). I can throw a patch there to see if this helps Mirsad.


2024-03-27 11:40:43

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On Wed, Mar 27, 2024 at 10:41:52AM +0000, Joao Martins wrote:
> On 25/03/2024 13:52, Jason Gunthorpe wrote:
> > On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
> >>> However, I am not smart enough to figure out why ...
> >>>
> >>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
> >>>
> >>>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
> >>>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
> >>> PROT_WRITE,
> >>>   1748                    mmap_flags, -1, 0);
> >>> → 1749         assert(vrc == self->buffer);
> >>>   1750
> >>>
> >>> But I am not that deep into the source to figure our what was intended and what
> >>> went
> >>> wrong :-/
> >>
> >> I can SKIP() the test rather assert() in here if it helps. Though there are
> >> other tests that fail if no hugetlb pages are reserved.
> >>
> >> But I am not sure if this is problem here as the initial bug email had an
> >> enterily different set of failures? Maybe all you need is an assert() and it
> >> gets into this state?
> >
> > I feel like there is something wrong with the kselftest framework,
> > there should be some way to fail the setup/teardown operations without
> > triggering an infinite loop :(
>
> I am now wondering if the problem is the fact that we have an assert() in the
> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
> other kselftest macro that). The expect/assert macros from kselftest() don't do
> asserts and it looks like we are failing mid tests in the assert().

Those ASSERT_TRUE cause infinite loops when used within the setup
context, I removed them and switched to assert because of this - which
did work OK in my testing at least.

Jason

2024-03-27 15:22:16

by Joao Martins

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On 27/03/2024 11:40, Jason Gunthorpe wrote:
> On Wed, Mar 27, 2024 at 10:41:52AM +0000, Joao Martins wrote:
>> On 25/03/2024 13:52, Jason Gunthorpe wrote:
>>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>>>> However, I am not smart enough to figure out why ...
>>>>>
>>>>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
>>>>>
>>>>>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>>>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>>>> PROT_WRITE,
>>>>>   1748                    mmap_flags, -1, 0);
>>>>> → 1749         assert(vrc == self->buffer);
>>>>>   1750
>>>>>
>>>>> But I am not that deep into the source to figure our what was intended and what
>>>>> went
>>>>> wrong :-/
>>>>
>>>> I can SKIP() the test rather assert() in here if it helps. Though there are
>>>> other tests that fail if no hugetlb pages are reserved.
>>>>
>>>> But I am not sure if this is problem here as the initial bug email had an
>>>> enterily different set of failures? Maybe all you need is an assert() and it
>>>> gets into this state?
>>>
>>> I feel like there is something wrong with the kselftest framework,
>>> there should be some way to fail the setup/teardown operations without
>>> triggering an infinite loop :(
>>
>> I am now wondering if the problem is the fact that we have an assert() in the
>> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
>> other kselftest macro that). The expect/assert macros from kselftest() don't do
>> asserts and it looks like we are failing mid tests in the assert().
>
> Those ASSERT_TRUE cause infinite loops when used within the setup
> context, I removed them and switched to assert because of this - which
> did work OK in my testing at least.

Strange because we make use of ASSERT* widely in our selftests fixture-setup.

setup_sizes() is run before the tests so it can't use ASSERT macros for sure;
maybe that's what you refer?


2024-03-27 16:38:46

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On Wed, Mar 27, 2024 at 03:04:09PM +0000, Joao Martins wrote:
> On 27/03/2024 11:40, Jason Gunthorpe wrote:
> > On Wed, Mar 27, 2024 at 10:41:52AM +0000, Joao Martins wrote:
> >> On 25/03/2024 13:52, Jason Gunthorpe wrote:
> >>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
> >>>>> However, I am not smart enough to figure out why ...
> >>>>>
> >>>>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
> >>>>>
> >>>>>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
> >>>>>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
> >>>>> PROT_WRITE,
> >>>>>   1748                    mmap_flags, -1, 0);
> >>>>> → 1749         assert(vrc == self->buffer);
> >>>>>   1750
> >>>>>
> >>>>> But I am not that deep into the source to figure our what was intended and what
> >>>>> went
> >>>>> wrong :-/
> >>>>
> >>>> I can SKIP() the test rather assert() in here if it helps. Though there are
> >>>> other tests that fail if no hugetlb pages are reserved.
> >>>>
> >>>> But I am not sure if this is problem here as the initial bug email had an
> >>>> enterily different set of failures? Maybe all you need is an assert() and it
> >>>> gets into this state?
> >>>
> >>> I feel like there is something wrong with the kselftest framework,
> >>> there should be some way to fail the setup/teardown operations without
> >>> triggering an infinite loop :(
> >>
> >> I am now wondering if the problem is the fact that we have an assert() in the
> >> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
> >> other kselftest macro that). The expect/assert macros from kselftest() don't do
> >> asserts and it looks like we are failing mid tests in the assert().
> >
> > Those ASSERT_TRUE cause infinite loops when used within the setup
> > context, I removed them and switched to assert because of this - which
> > did work OK in my testing at least.
>
> Strange because we make use of ASSERT* widely in our selftests fixture-setup.
>
> setup_sizes() is run before the tests so it can't use ASSERT macros for sure;
> maybe that's what you refer?

No, it was definately ASSERT/etc if you hit those in the wrong spot
the thing infinite loops. Maybe that was teardown only.

Jason

2024-03-27 20:08:14

by Mirsad Todorovac

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()



On 3/27/24 11:41, Joao Martins wrote:
> On 25/03/2024 13:52, Jason Gunthorpe wrote:
>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>>> However, I am not smart enough to figure out why ...
>>>>
>>>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
>>>>
>>>>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>>> PROT_WRITE,
>>>>   1748                    mmap_flags, -1, 0);
>>>> → 1749         assert(vrc == self->buffer);
>>>>   1750
>>>>
>>>> But I am not that deep into the source to figure our what was intended and what
>>>> went
>>>> wrong :-/
>>>
>>> I can SKIP() the test rather assert() in here if it helps. Though there are
>>> other tests that fail if no hugetlb pages are reserved.
>>>
>>> But I am not sure if this is problem here as the initial bug email had an
>>> enterily different set of failures? Maybe all you need is an assert() and it
>>> gets into this state?
>>
>> I feel like there is something wrong with the kselftest framework,
>> there should be some way to fail the setup/teardown operations without
>> triggering an infinite loop :(
>
> I am now wondering if the problem is the fact that we have an assert() in the
> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
> other kselftest macro that). The expect/assert macros from kselftest() don't do
> asserts and it looks like we are failing mid tests in the assert().
>
> Maybe it is OK for setup_sizes(), but maybe not OK for the rest (i.e. during the
> actual setup / tests). I can throw a patch there to see if this helps Mirsad.

Well, we are in the job of making the kernel better and as bug free as we can.

Maybe we should not delve too much into detail: is this a kernel bug, or the kselftest
program bug?

Some people already mentioned that I might have sysctl variable problems. I don't see
what the mmap() HUGEPAGE allocation at fixed address was meant to prove?

Thanks,
Mirsad

2024-03-28 00:05:58

by Shuah Khan

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On 3/27/24 10:38, Jason Gunthorpe wrote:
> On Wed, Mar 27, 2024 at 03:04:09PM +0000, Joao Martins wrote:
>> On 27/03/2024 11:40, Jason Gunthorpe wrote:
>>> On Wed, Mar 27, 2024 at 10:41:52AM +0000, Joao Martins wrote:
>>>> On 25/03/2024 13:52, Jason Gunthorpe wrote:
>>>>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>>>>>> However, I am not smart enough to figure out why ...
>>>>>>>
>>>>>>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
>>>>>>>
>>>>>>>   1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>>>>>   1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>>>>>> PROT_WRITE,
>>>>>>>   1748                    mmap_flags, -1, 0);
>>>>>>> → 1749         assert(vrc == self->buffer);
>>>>>>>   1750
>>>>>>>
>>>>>>> But I am not that deep into the source to figure our what was intended and what
>>>>>>> went
>>>>>>> wrong :-/
>>>>>>
>>>>>> I can SKIP() the test rather assert() in here if it helps. Though there are
>>>>>> other tests that fail if no hugetlb pages are reserved.
>>>>>>
>>>>>> But I am not sure if this is problem here as the initial bug email had an
>>>>>> enterily different set of failures? Maybe all you need is an assert() and it
>>>>>> gets into this state?
>>>>>
>>>>> I feel like there is something wrong with the kselftest framework,
>>>>> there should be some way to fail the setup/teardown operations without
>>>>> triggering an infinite loop :(
>>>>
>>>> I am now wondering if the problem is the fact that we have an assert() in the
>>>> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
>>>> other kselftest macro that). The expect/assert macros from kselftest() don't do
>>>> asserts and it looks like we are failing mid tests in the assert().
>>>
>>> Those ASSERT_TRUE cause infinite loops when used within the setup
>>> context, I removed them and switched to assert because of this - which
>>> did work OK in my testing at least.
>>
>> Strange because we make use of ASSERT* widely in our selftests fixture-setup.
>>
>> setup_sizes() is run before the tests so it can't use ASSERT macros for sure;
>> maybe that's what you refer?
>
> No, it was definately ASSERT/etc if you hit those in the wrong spot
> the thing infinite loops. Maybe that was teardown only.
>

By adding assert(), you are mixing frameworks and the overall
test behavior will not be consistent.

ASSERT_*() is supposed to exit the test right away. If this
isn't happening it needs to be debugged. There are several
tests that use this framework. Maybe you can refer to another
test for examples of how to use the framework.

thanks,
-- Shuah



2024-03-28 13:55:40

by Joao Martins

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On 27/03/2024 20:04, Mirsad Todorovac wrote:
> On 3/27/24 11:41, Joao Martins wrote:
>> On 25/03/2024 13:52, Jason Gunthorpe wrote:
>>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>>>> However, I am not smart enough to figure out why ...
>>>>>
>>>>> Apparently, from the source, mmap() fails to allocate pages on the desired
>>>>> address:
>>>>>
>>>>>    1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>>>    1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>>>> PROT_WRITE,
>>>>>    1748                    mmap_flags, -1, 0);
>>>>> → 1749         assert(vrc == self->buffer);
>>>>>    1750
>>>>>
>>>>> But I am not that deep into the source to figure our what was intended and
>>>>> what
>>>>> went
>>>>> wrong :-/
>>>>
>>>> I can SKIP() the test rather assert() in here if it helps. Though there are
>>>> other tests that fail if no hugetlb pages are reserved.
>>>>
>>>> But I am not sure if this is problem here as the initial bug email had an
>>>> enterily different set of failures? Maybe all you need is an assert() and it
>>>> gets into this state?
>>>
>>> I feel like there is something wrong with the kselftest framework,
>>> there should be some way to fail the setup/teardown operations without
>>> triggering an infinite loop :(
>>
>> I am now wondering if the problem is the fact that we have an assert() in the
>> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
>> other kselftest macro that). The expect/assert macros from kselftest() don't do
>> asserts and it looks like we are failing mid tests in the assert().
>>
>> Maybe it is OK for setup_sizes(), but maybe not OK for the rest (i.e. during the
>> actual setup / tests). I can throw a patch there to see if this helps Mirsad.
>
> Well, we are in the job of making the kernel better and as bug free as we can.
>
> Maybe we should not delve too much into detail: is this a kernel bug, or the
> kselftest
> program bug?
>

I think the latter thus far. See at the end.

> Some people already mentioned that I might have sysctl variable problems. I
> don't see
> what the mmap() HUGEPAGE allocation at fixed address was meant to prove?

That just sounds like the setup -- you need hugepages to run all iommufd tests.
Most of my comments is about what your first report email in this thread on the
selftest getting stuck at 99%

If the use of assert() within test/setup is the issue then snip below should fix
it. But if Jason is right it won't make a difference. I think this infinite loop
is __bail() where we are doing a longjmp() in a loop once a ASSERT*() fails but
it only happens if we use these ASSERT() functions. Maybe this is because in
some test functions we end up doing ASSERTs within ASSERTs?

--->8---

diff --git a/tools/testing/selftests/iommu/iommufd.c
b/tools/testing/selftests/iommu/iommufd.c
index edf1c99c9936..d2661a13a4f2 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -353,34 +353,34 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Check data_type by passing zero-length array */
num_inv = 0;
test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: Invalid data_type */
num_inv = 1;
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST_INVALID,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: structure size sanity */
num_inv = 1;
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs) + 1, &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

num_inv = 1;
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
1, &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid flag is passed */
num_inv = 1;
@@ -388,7 +388,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid data_uptr when array is not empty */
num_inv = 1;
@@ -396,7 +396,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], NULL,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid entry_len when array is not empty */
num_inv = 1;
@@ -404,7 +404,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
0, &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/* Negative test: invalid iotlb_id */
num_inv = 1;
@@ -413,7 +413,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(!num_inv);
+ ASSERT_TRUE(!num_inv);

/*
* Invalidate the 1st iotlb entry but fail the 2nd request
@@ -427,7 +427,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EOPNOTSUPP, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1,
IOMMU_TEST_IOTLB_DEFAULT);
@@ -448,7 +448,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1,
IOMMU_TEST_IOTLB_DEFAULT);
@@ -464,7 +464,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 0, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 1, 0);
test_cmd_hwpt_check_iotlb(nested_hwpt_id[0], 2,
@@ -481,7 +481,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_cmd_hwpt_invalidate(nested_hwpt_id[0], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 2);
+ ASSERT_TRUE(num_inv == 2);
test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], 0);

/* Invalidate all iotlb entries for nested_hwpt_id[1] and verify */
@@ -490,7 +490,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested)
test_cmd_hwpt_invalidate(nested_hwpt_id[1], inv_reqs,
IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
sizeof(*inv_reqs), &num_inv);
- assert(num_inv == 1);
+ ASSERT_TRUE(num_inv == 1);
test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], 0);

/* Attach device to nested_hwpt_id[0] that then will be busy */
@@ -1743,10 +1743,14 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
*/
mmap_flags |= MAP_HUGETLB | MAP_POPULATE;
}
- assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
+ ASSERT_TRUE((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE,
mmap_flags, -1, 0);
- assert(vrc == self->buffer);
+ if (vrc != self->buffer && variant->hugepages) {
+ SKIP(return, "Skipping buffer_size=%lu due to mmap() errno=%d",
+ variant->buffer_size, errno);
+ }
+ ASSERT_TRUE(vrc == self->buffer);

self->page_size = MOCK_PAGE_SIZE;
self->bitmap_size =
@@ -1755,9 +1759,9 @@ FIXTURE_SETUP(iommufd_dirty_tracking)
/* Provision with an extra (PAGE_SIZE) for the unaligned case */
rc = posix_memalign(&self->bitmap, PAGE_SIZE,
self->bitmap_size + PAGE_SIZE);
- assert(!rc);
- assert(self->bitmap);
- assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
+ ASSERT_TRUE(!rc);
+ ASSERT_TRUE(self->bitmap != NULL);
+ ASSERT_TRUE((uintptr_t)self->bitmap % PAGE_SIZE == 0);

test_ioctl_ioas_alloc(&self->ioas_id);
/* Enable 1M mock IOMMU hugepages */
diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c
b/tools/testing/selftests/iommu/iommufd_fail_nth.c
index f590417cd67a..4a88f9c28fe5 100644
--- a/tools/testing/selftests/iommu/iommufd_fail_nth.c
+++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c
@@ -315,7 +315,8 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)

fail_nth_enable();

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;

if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
@@ -326,7 +327,8 @@ TEST_FAIL_NTH(basic_fail_nth, map_domain)
if (_test_ioctl_destroy(self->fd, stdev_id))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;
return 0;
}
@@ -350,13 +352,14 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;

fail_nth_enable();

if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2,
- NULL))
+ NULL, _metadata))
return -1;

if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, 262144, &iova,
@@ -370,10 +373,11 @@ TEST_FAIL_NTH(basic_fail_nth, map_two_domains)
if (_test_ioctl_destroy(self->fd, stdev_id2))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;
if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id2, &hwpt_id2,
- NULL))
+ NULL, _metadata))
return -1;
return 0;
}
@@ -530,7 +534,8 @@ TEST_FAIL_NTH(basic_fail_nth, access_pin_domain)
if (_test_ioctl_set_temp_memory_limit(self->fd, 32))
return -1;

- if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL))
+ if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, &hwpt_id, NULL,
+ _metadata))
return -1;

if (_test_ioctl_ioas_map(self->fd, ioas_id, buffer, BUFFER_SIZE, &iova,
@@ -609,10 +614,11 @@ TEST_FAIL_NTH(basic_fail_nth, device)
fail_nth_enable();

if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL,
- &idev_id))
+ &idev_id, _metadata))
return -1;

- if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL))
+ if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL,
+ _metadata))
return -1;

if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id,
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h
b/tools/testing/selftests/iommu/iommufd_utils.h
index 8d2b46b2114d..cd8bb14be658 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -64,7 +64,8 @@ static unsigned long PAGE_SIZE;
})

static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id,
- __u32 *hwpt_id, __u32 *idev_id)
+ __u32 *hwpt_id, __u32 *idev_id,
+ struct __test_metadata *_metadata)
{
struct iommu_test_cmd cmd = {
.size = sizeof(cmd),
@@ -79,7 +80,7 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id,
__u32 *stdev_id,
return ret;
if (stdev_id)
*stdev_id = cmd.mock_domain.out_stdev_id;
- assert(cmd.id != 0);
+ ASSERT_TRUE(cmd.id != 0);
if (hwpt_id)
*hwpt_id = cmd.mock_domain.out_hwpt_id;
if (idev_id)
@@ -88,14 +89,16 @@ static int _test_cmd_mock_domain(int fd, unsigned int
ioas_id, __u32 *stdev_id,
}
#define test_cmd_mock_domain(ioas_id, stdev_id, hwpt_id, idev_id) \
ASSERT_EQ(0, _test_cmd_mock_domain(self->fd, ioas_id, stdev_id, \
- hwpt_id, idev_id))
+ hwpt_id, idev_id, _metadata))
#define test_err_mock_domain(_errno, ioas_id, stdev_id, hwpt_id) \
EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \
- stdev_id, hwpt_id, NULL))
+ stdev_id, hwpt_id, NULL, \
+ _metadata))

static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id,
__u32 stdev_flags, __u32 *stdev_id,
- __u32 *hwpt_id, __u32 *idev_id)
+ __u32 *hwpt_id, __u32 *idev_id,
+ struct __test_metadata *_metadata)
{
struct iommu_test_cmd cmd = {
.size = sizeof(cmd),
@@ -110,7 +113,7 @@ static int _test_cmd_mock_domain_flags(int fd, unsigned int
ioas_id,
return ret;
if (stdev_id)
*stdev_id = cmd.mock_domain_flags.out_stdev_id;
- assert(cmd.id != 0);
+ ASSERT_TRUE(cmd.id != 0);
if (hwpt_id)
*hwpt_id = cmd.mock_domain_flags.out_hwpt_id;
if (idev_id)
@@ -119,11 +122,13 @@ static int _test_cmd_mock_domain_flags(int fd, unsigned
int ioas_id,
}
#define test_cmd_mock_domain_flags(ioas_id, flags, stdev_id, hwpt_id, idev_id) \
ASSERT_EQ(0, _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \
- stdev_id, hwpt_id, idev_id))
+ stdev_id, hwpt_id, idev_id, \
+ _metadata))
#define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \
EXPECT_ERRNO(_errno, \
_test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \
- stdev_id, hwpt_id, NULL))
+ stdev_id, hwpt_id, NULL, \
+ _metadata))

static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
__u32 *hwpt_id)
@@ -623,7 +628,8 @@ static void teardown_iommufd(int fd, struct __test_metadata
*_metadata)

/* @data can be NULL */
static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data,
- size_t data_len, uint32_t *capabilities)
+ size_t data_len, uint32_t *capabilities,
+ struct __test_metadata *_metadata)
{
struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data;
struct iommu_hw_info cmd = {
@@ -639,13 +645,13 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
void *data,
if (ret)
return ret;

- assert(cmd.out_data_type == IOMMU_HW_INFO_TYPE_SELFTEST);
+ ASSERT_TRUE(cmd.out_data_type == IOMMU_HW_INFO_TYPE_SELFTEST);

/*
* The struct iommu_test_hw_info should be the one defined
* by the current kernel.
*/
- assert(cmd.data_len == sizeof(struct iommu_test_hw_info));
+ ASSERT_TRUE(cmd.data_len == sizeof(struct iommu_test_hw_info));

/*
* Trailing bytes should be 0 if user buffer is larger than
@@ -656,16 +662,16 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
void *data,
int idx = 0;

while (idx < data_len - cmd.data_len) {
- assert(!*(ptr + idx));
+ ASSERT_TRUE(!*(ptr + idx));
idx++;
}
}

if (info) {
if (data_len >= offsetofend(struct iommu_test_hw_info, test_reg))
- assert(info->test_reg == IOMMU_HW_INFO_SELFTEST_REGVAL);
+ ASSERT_TRUE(info->test_reg == IOMMU_HW_INFO_SELFTEST_REGVAL);
if (data_len >= offsetofend(struct iommu_test_hw_info, flags))
- assert(!info->flags);
+ ASSERT_TRUE(!info->flags);
}

if (capabilities)
@@ -674,13 +680,14 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id,
void *data,
return 0;
}

-#define test_cmd_get_hw_info(device_id, data, data_len) \
+#define test_cmd_get_hw_info(device_id, data, data_len) \
ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \
- data_len, NULL))
+ data_len, NULL, _metadata))

-#define test_err_get_hw_info(_errno, device_id, data, data_len) \
+#define test_err_get_hw_info(_errno, device_id, data, data_len) \
EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \
- data_len, NULL))
+ data_len, NULL, _metadata))

#define test_cmd_get_hw_capabilities(device_id, caps, mask) \
- ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps))
+ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, \
+ &caps, _metadata))

2024-04-02 11:33:36

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On Wed, Mar 27, 2024 at 06:05:46PM -0600, Shuah Khan wrote:

> ASSERT_*() is supposed to exit the test right away. If this
> isn't happening it needs to be debugged.

We know it doesn't work in setup/teardown functions, you can see it in
the code it jumps back and does the teardown again in an infinite
loop.

If mising assert and ASSERT causes loops that is also a bug, we can't
guarentee that no libraries linked into this (like glibc) doesn't call
assert.

Jason

2024-04-04 16:56:17

by Shuah Khan

[permalink] [raw]
Subject: Re: [BUG] seltests/iommu: runaway ./iommufd consuming 99% CPU after a failed assert()

On 3/27/24 14:04, Mirsad Todorovac wrote:
>
>
> On 3/27/24 11:41, Joao Martins wrote:
>> On 25/03/2024 13:52, Jason Gunthorpe wrote:
>>> On Mon, Mar 25, 2024 at 12:17:28PM +0000, Joao Martins wrote:
>>>>> However, I am not smart enough to figure out why ...
>>>>>
>>>>> Apparently, from the source, mmap() fails to allocate pages on the desired address:
>>>>>
>>>>>    1746         assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0);
>>>>>    1747         vrc = mmap(self->buffer, variant->buffer_size, PROT_READ |
>>>>> PROT_WRITE,
>>>>>    1748                    mmap_flags, -1, 0);
>>>>> → 1749         assert(vrc == self->buffer);
>>>>>    1750
>>>>>
>>>>> But I am not that deep into the source to figure our what was intended and what
>>>>> went
>>>>> wrong :-/
>>>>
>>>> I can SKIP() the test rather assert() in here if it helps. Though there are
>>>> other tests that fail if no hugetlb pages are reserved.
>>>>
>>>> But I am not sure if this is problem here as the initial bug email had an
>>>> enterily different set of failures? Maybe all you need is an assert() and it
>>>> gets into this state?
>>>
>>> I feel like there is something wrong with the kselftest framework,
>>> there should be some way to fail the setup/teardown operations without
>>> triggering an infinite loop :(
>>
>> I am now wondering if the problem is the fact that we have an assert() in the
>> middle of FIXTURE_{TEST,SETUP} whereby we should be having ASSERT_TRUE() (or any
>> other kselftest macro that). The expect/assert macros from kselftest() don't do
>> asserts and it looks like we are failing mid tests in the assert().
>>
>> Maybe it is OK for setup_sizes(), but maybe not OK for the rest (i.e. during the
>> actual setup / tests). I can throw a patch there to see if this helps Mirsad.
>
> Well, we are in the job of making the kernel better and as bug free as we can.
>
> Maybe we should not delve too much into detail: is this a kernel bug, or the kselftest
> program bug?
>
> Some people already mentioned that I might have sysctl variable problems. I don't see
> what the mmap() HUGEPAGE allocation at fixed address was meant to prove?
>

I applied fix to this problem to linux-kselftest fixes branch for next rc.
Please give it a try.

thanks,
-- Shuah