2022-11-01 19:33:39

by Felix Kuehling

[permalink] [raw]
Subject: [PATCH] drm/amdkfd: Fix error handling in criu_checkpoint

Checkpoint BOs last. That way we don't need to close dmabuf FDs if
something else fails later. This avoids problematic access to user mode
memory in the error handling code path.

criu_checkpoint_bos has its own error handling and cleanup that does not
depend on access to user memory.

Fixes: be072b06c739 ("drm/amdkfd: CRIU export BOs as prime dmabuf objects")
Reported-by: Jann Horn <[email protected]>
CC: Rajneesh Bhardwaj <[email protected]>
Signed-off-by: Felix Kuehling <[email protected]>
---
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 27 +++++++-----------------
1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 5feaba6a77de..aabab9010812 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1994,38 +1994,27 @@ static int criu_checkpoint(struct file *filep,
if (ret)
goto exit_unlock;

- ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
- (uint8_t __user *)args->priv_data, &priv_offset);
- if (ret)
- goto exit_unlock;
-
if (num_objects) {
ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
&priv_offset);
if (ret)
- goto close_bo_fds;
+ goto exit_unlock;

ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
&priv_offset);
if (ret)
- goto close_bo_fds;
+ goto exit_unlock;

ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
if (ret)
- goto close_bo_fds;
+ goto exit_unlock;
}

-close_bo_fds:
- if (ret) {
- /* If IOCTL returns err, user assumes all FDs opened in criu_dump_bos are closed */
- uint32_t i;
- struct kfd_criu_bo_bucket *bo_buckets = (struct kfd_criu_bo_bucket *) args->bos;
-
- for (i = 0; i < num_bos; i++) {
- if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
- close_fd(bo_buckets[i].dmabuf_fd);
- }
- }
+ /* This must be the last thing in this function that can fail.
+ * Otherwise we leak dmabuf file descriptors.
+ */
+ ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
+ (uint8_t __user *)args->priv_data, &priv_offset);

exit_unlock:
mutex_unlock(&p->mutex);
--
2.32.0



2022-11-02 02:32:53

by Bhardwaj, Rajneesh

[permalink] [raw]
Subject: Re: [PATCH] drm/amdkfd: Fix error handling in criu_checkpoint


On 11/1/2022 3:15 PM, Felix Kuehling wrote:
> Checkpoint BOs last. That way we don't need to close dmabuf FDs if
> something else fails later. This avoids problematic access to user mode
> memory in the error handling code path.
>
> criu_checkpoint_bos has its own error handling and cleanup that does not
> depend on access to user memory.


This seems to be breaking the restore operation. I did a quick pytorch
based test and I can confirm that restore operation fails with this
change applied.

[  +0.000003] CR2: 000055b6726e0020 CR3: 00000001283fe000 CR4:
0000000000350ee0
[  +0.000002] Call Trace:
[  +0.000002]  <TASK>
[  +0.000003]  kfd_ioctl_criu+0xd4c/0x1930 [amdgpu]
[  +0.000185]  ? __might_fault+0x32/0x80
[  +0.000004]  ? lock_release+0x1fd/0x2b0
[  +0.000010]  kfd_ioctl+0x29b/0x600 [amdgpu]
[  +0.000153]  ? kfd_ioctl_get_tile_config+0x130/0x130 [amdgpu]
[  +0.000158]  __x64_sys_ioctl+0x8b/0xd0
[  +0.000003]  ? lockdep_hardirqs_on+0x79/0x100
[  +0.000007]  do_syscall_64+0x34/0x80
[  +0.000004]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000005] RIP: 0033:0x7f1c87e7f317
[  +0.000002] Code: b3 66 90 48 8b 05 71 4b 2d 00 64 c7 00 26 00 00 00
48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 4b 2d 00 f7 d8 64 89 01 48
[  +0.000003] RSP: 002b:00007fff630af518 EFLAGS: 00000202 ORIG_RAX:
0000000000000010
[  +0.000003] RAX: ffffffffffffffda RBX: 00007f1c89351620 RCX:
00007f1c87e7f317
[  +0.000002] RDX: 00007fff630af5c0 RSI: 00000000c0384b22 RDI:
0000000000000005
[  +0.000002] RBP: 00007fff630af550 R08: 0000000000000000 R09:
00007f1c87ed7c10
[  +0.000002] R10: 705f757067646d61 R11: 0000000000000202 R12:
000055a4a0514c60
[  +0.000002] R13: 000055a49eb81540 R14: 000055a49e90eea9 R15:
00007fff630b069c
[  +0.000010]  </TASK>
[  +0.000002] irq event stamp: 50181
[  +0.000002] hardirqs last  enabled at (50187): [<ffffffffb61072a2>]
__up_console_sem+0x52/0x60
[  +0.000003] hardirqs last disabled at (50192): [<ffffffffb6107287>]
__up_console_sem+0x37/0x60
[  +0.000003] softirqs last  enabled at (45940): [<ffffffffb6bd4103>]
sock_setsockopt+0x223/0xfa0
[  +0.000003] softirqs last disabled at (45938): [<ffffffffb6bd3609>]
release_sock+0x19/0xa0
[  +0.000004] ---[ end trace 0000000000000000 ]---
[  +0.000002] amdgpu: Could not allocate idr
[  +0.000245] amdgpu: Failed to restore CRIU ret:-12
[Nov 1 22:11] loop0: detected capacity change from 0 to 8

https://github.com/checkpoint-restore/criu/blob/criu-dev/plugins/amdgpu/amdgpu_plugin.c
:


(00.093977)     11: Added GPU mapping [0xC093 -> 0xC093]
(00.093982)     11: ===Maps===============
(00.093987)     11: GPU: 0xC093 -> 0xC093
(00.093992)     11: CPU: 00 -> 00
(00.093997)     11: ======================
(00.094002)     11: Matched destination node 0xC093
(00.094007)     11: All nodes mapped successfully
(00.094012)     11: Matched nodes 0xC093 and after
(00.094017)     11: Maps after all nodes matched
(00.094022)     11: ===Maps===============
(00.094027)     11: GPU: 0xC093 -> 0xC093
(00.094032)     11: CPU: 00 -> 00
(00.094037)     11: ======================
(00.094041)     11: amdgpu_plugin: Restoring 1 devices
(00.094319)     11: amdgpu_plugin: amdgpu_plugin: passing drm render fd
= 10 to driver
(00.094326)     11: amdgpu_plugin: Restore devices Ok (ret:0)
(00.094331)     11: amdgpu_plugin: Restoring 184 BOs
(00.094349)     11: amdgpu_plugin: Restore BOs Ok
(00.095791)     11: Error (amdgpu_plugin.c:1830): amdgpu_plugin: Restore
ioctl failed: Cannot allocate memory
(00.095916)     11: Error (amdgpu_plugin.c:1850): amdgpu_plugin:
amdgpu_plugin: Failed to restore (ret:-1)
(00.095951)     11: Error (criu/files-ext.c:53): Unable to restore 0x143
(00.095961)     11: Error (criu/files.c:1213): Unable to open fd=4 id=0x143
(00.096078) Unlink remap /dev/shm/fvKoKz.cr.1.ghost
(00.096152) Error (criu/cr-restore.c:2531): Restoring FAILED.
(00.096181) amdgpu_plugin: amdgpu_plugin: finished  amdgpu_plugin
(AMDGPU/KFD)
"restore.log" 4194L, 201090C

>
> Fixes: be072b06c739 ("drm/amdkfd: CRIU export BOs as prime dmabuf objects")
> Reported-by: Jann Horn <[email protected]>
> CC: Rajneesh Bhardwaj <[email protected]>
> Signed-off-by: Felix Kuehling <[email protected]>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 27 +++++++-----------------
> 1 file changed, 8 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 5feaba6a77de..aabab9010812 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1994,38 +1994,27 @@ static int criu_checkpoint(struct file *filep,
> if (ret)
> goto exit_unlock;
>
> - ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> - (uint8_t __user *)args->priv_data, &priv_offset);
> - if (ret)
> - goto exit_unlock;
> -
> if (num_objects) {
> ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
> &priv_offset);
> if (ret)
> - goto close_bo_fds;
> + goto exit_unlock;
>
> ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
> &priv_offset);
> if (ret)
> - goto close_bo_fds;
> + goto exit_unlock;
>
> ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
> if (ret)
> - goto close_bo_fds;
> + goto exit_unlock;
> }
>
> -close_bo_fds:
> - if (ret) {
> - /* If IOCTL returns err, user assumes all FDs opened in criu_dump_bos are closed */
> - uint32_t i;
> - struct kfd_criu_bo_bucket *bo_buckets = (struct kfd_criu_bo_bucket *) args->bos;
> -
> - for (i = 0; i < num_bos; i++) {
> - if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
> - close_fd(bo_buckets[i].dmabuf_fd);
> - }
> - }
> + /* This must be the last thing in this function that can fail.
> + * Otherwise we leak dmabuf file descriptors.
> + */
> + ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
> + (uint8_t __user *)args->priv_data, &priv_offset);
>
> exit_unlock:
> mutex_unlock(&p->mutex);

2022-11-02 14:13:30

by Felix Kuehling

[permalink] [raw]
Subject: Re: [PATCH] drm/amdkfd: Fix error handling in criu_checkpoint

Am 2022-11-01 um 22:19 schrieb Bhardwaj, Rajneesh:
>
> On 11/1/2022 3:15 PM, Felix Kuehling wrote:
>> Checkpoint BOs last. That way we don't need to close dmabuf FDs if
>> something else fails later. This avoids problematic access to user mode
>> memory in the error handling code path.
>>
>> criu_checkpoint_bos has its own error handling and cleanup that does not
>> depend on access to user memory.
>
>
> This seems to be breaking the restore operation. I did a quick pytorch
> based test and I can confirm that restore operation fails with this
> change applied.

Ah yes, we need to restore things from the private data area in the same
order that they were saved. I'll send an updated patch.

What's the cause for the call trace below? Is this a kernel oops or a
warning? If it's an oops, it's concerning because it could be caused by
a corrupted checkpoint as well.

Thanks,
  Felix


>
> [  +0.000003] CR2: 000055b6726e0020 CR3: 00000001283fe000 CR4:
> 0000000000350ee0
> [  +0.000002] Call Trace:
> [  +0.000002]  <TASK>
> [  +0.000003]  kfd_ioctl_criu+0xd4c/0x1930 [amdgpu]
> [  +0.000185]  ? __might_fault+0x32/0x80
> [  +0.000004]  ? lock_release+0x1fd/0x2b0
> [  +0.000010]  kfd_ioctl+0x29b/0x600 [amdgpu]
> [  +0.000153]  ? kfd_ioctl_get_tile_config+0x130/0x130 [amdgpu]
> [  +0.000158]  __x64_sys_ioctl+0x8b/0xd0
> [  +0.000003]  ? lockdep_hardirqs_on+0x79/0x100
> [  +0.000007]  do_syscall_64+0x34/0x80
> [  +0.000004]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  +0.000005] RIP: 0033:0x7f1c87e7f317
> [  +0.000002] Code: b3 66 90 48 8b 05 71 4b 2d 00 64 c7 00 26 00 00 00
> 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00
> 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 41 4b 2d 00 f7 d8 64 89
> 01 48
> [  +0.000003] RSP: 002b:00007fff630af518 EFLAGS: 00000202 ORIG_RAX:
> 0000000000000010
> [  +0.000003] RAX: ffffffffffffffda RBX: 00007f1c89351620 RCX:
> 00007f1c87e7f317
> [  +0.000002] RDX: 00007fff630af5c0 RSI: 00000000c0384b22 RDI:
> 0000000000000005
> [  +0.000002] RBP: 00007fff630af550 R08: 0000000000000000 R09:
> 00007f1c87ed7c10
> [  +0.000002] R10: 705f757067646d61 R11: 0000000000000202 R12:
> 000055a4a0514c60
> [  +0.000002] R13: 000055a49eb81540 R14: 000055a49e90eea9 R15:
> 00007fff630b069c
> [  +0.000010]  </TASK>
> [  +0.000002] irq event stamp: 50181
> [  +0.000002] hardirqs last  enabled at (50187): [<ffffffffb61072a2>]
> __up_console_sem+0x52/0x60
> [  +0.000003] hardirqs last disabled at (50192): [<ffffffffb6107287>]
> __up_console_sem+0x37/0x60
> [  +0.000003] softirqs last  enabled at (45940): [<ffffffffb6bd4103>]
> sock_setsockopt+0x223/0xfa0
> [  +0.000003] softirqs last disabled at (45938): [<ffffffffb6bd3609>]
> release_sock+0x19/0xa0
> [  +0.000004] ---[ end trace 0000000000000000 ]---
> [  +0.000002] amdgpu: Could not allocate idr
> [  +0.000245] amdgpu: Failed to restore CRIU ret:-12
> [Nov 1 22:11] loop0: detected capacity change from 0 to 8
>
> https://github.com/checkpoint-restore/criu/blob/criu-dev/plugins/amdgpu/amdgpu_plugin.c
> :
>
>
> (00.093977)     11: Added GPU mapping [0xC093 -> 0xC093]
> (00.093982)     11: ===Maps===============
> (00.093987)     11: GPU: 0xC093 -> 0xC093
> (00.093992)     11: CPU: 00 -> 00
> (00.093997)     11: ======================
> (00.094002)     11: Matched destination node 0xC093
> (00.094007)     11: All nodes mapped successfully
> (00.094012)     11: Matched nodes 0xC093 and after
> (00.094017)     11: Maps after all nodes matched
> (00.094022)     11: ===Maps===============
> (00.094027)     11: GPU: 0xC093 -> 0xC093
> (00.094032)     11: CPU: 00 -> 00
> (00.094037)     11: ======================
> (00.094041)     11: amdgpu_plugin: Restoring 1 devices
> (00.094319)     11: amdgpu_plugin: amdgpu_plugin: passing drm render
> fd = 10 to driver
> (00.094326)     11: amdgpu_plugin: Restore devices Ok (ret:0)
> (00.094331)     11: amdgpu_plugin: Restoring 184 BOs
> (00.094349)     11: amdgpu_plugin: Restore BOs Ok
> (00.095791)     11: Error (amdgpu_plugin.c:1830): amdgpu_plugin:
> Restore ioctl failed: Cannot allocate memory
> (00.095916)     11: Error (amdgpu_plugin.c:1850): amdgpu_plugin:
> amdgpu_plugin: Failed to restore (ret:-1)
> (00.095951)     11: Error (criu/files-ext.c:53): Unable to restore 0x143
> (00.095961)     11: Error (criu/files.c:1213): Unable to open fd=4
> id=0x143
> (00.096078) Unlink remap /dev/shm/fvKoKz.cr.1.ghost
> (00.096152) Error (criu/cr-restore.c:2531): Restoring FAILED.
> (00.096181) amdgpu_plugin: amdgpu_plugin: finished  amdgpu_plugin
> (AMDGPU/KFD)
> "restore.log" 4194L, 201090C
>
>>
>> Fixes: be072b06c739 ("drm/amdkfd: CRIU export BOs as prime dmabuf
>> objects")
>> Reported-by: Jann Horn <[email protected]>
>> CC: Rajneesh Bhardwaj <[email protected]>
>> Signed-off-by: Felix Kuehling <[email protected]>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 27 +++++++-----------------
>>   1 file changed, 8 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 5feaba6a77de..aabab9010812 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -1994,38 +1994,27 @@ static int criu_checkpoint(struct file *filep,
>>       if (ret)
>>           goto exit_unlock;
>>   -    ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user
>> *)args->bos,
>> -                (uint8_t __user *)args->priv_data, &priv_offset);
>> -    if (ret)
>> -        goto exit_unlock;
>> -
>>       if (num_objects) {
>>           ret = kfd_criu_checkpoint_queues(p, (uint8_t __user
>> *)args->priv_data,
>>                            &priv_offset);
>>           if (ret)
>> -            goto close_bo_fds;
>> +            goto exit_unlock;
>>             ret = kfd_criu_checkpoint_events(p, (uint8_t __user
>> *)args->priv_data,
>>                            &priv_offset);
>>           if (ret)
>> -            goto close_bo_fds;
>> +            goto exit_unlock;
>>             ret = kfd_criu_checkpoint_svm(p, (uint8_t __user
>> *)args->priv_data, &priv_offset);
>>           if (ret)
>> -            goto close_bo_fds;
>> +            goto exit_unlock;
>>       }
>>   -close_bo_fds:
>> -    if (ret) {
>> -        /* If IOCTL returns err, user assumes all FDs opened in
>> criu_dump_bos are closed */
>> -        uint32_t i;
>> -        struct kfd_criu_bo_bucket *bo_buckets = (struct
>> kfd_criu_bo_bucket *) args->bos;
>> -
>> -        for (i = 0; i < num_bos; i++) {
>> -            if (bo_buckets[i].alloc_flags &
>> KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
>> -                close_fd(bo_buckets[i].dmabuf_fd);
>> -        }
>> -    }
>> +    /* This must be the last thing in this function that can fail.
>> +     * Otherwise we leak dmabuf file descriptors.
>> +     */
>> +    ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
>> +               (uint8_t __user *)args->priv_data, &priv_offset);
>>     exit_unlock:
>>       mutex_unlock(&p->mutex);