2024-01-26 18:33:07

by Brett Creeley

[permalink] [raw]
Subject: [PATCH vfio] vfio/pds: Rework and simplify reset flows

The current logic for handling resets based on
whether they were initiated from the DSC or
host/VMM is slightly confusing and incorrect.
The incorrect behavior can cause the VF device
to be unusable on the destination on failed
migrations due to incompatible configurations.
Fix this by setting the state back to
VFIO_DEVICE_STATE_RUNNING when an FLR is
triggered, so the VF device is put back in
an "initial" pre-configured state after failures.

Also, while here clean-up the reset logic to
make the source of the reset more obvious.

Signed-off-by: Brett Creeley <[email protected]>
Reviewed-by: Shannon Nelson <[email protected]>
---
drivers/vfio/pci/pds/pci_drv.c | 2 +-
drivers/vfio/pci/pds/vfio_dev.c | 14 +++++++-------
drivers/vfio/pci/pds/vfio_dev.h | 7 ++++++-
3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index a34dda516629..4ac3da7abd32 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -57,7 +57,7 @@ static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
if (deferred_reset_needed) {
mutex_lock(&pds_vfio->reset_mutex);
pds_vfio->deferred_reset = true;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
+ pds_vfio->deferred_reset_type = PDS_VFIO_DEVICE_RESET;
mutex_unlock(&pds_vfio->reset_mutex);
}
}
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index 4c351c59d05a..3357690344c4 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -32,13 +32,14 @@ void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
mutex_lock(&pds_vfio->reset_mutex);
if (pds_vfio->deferred_reset) {
pds_vfio->deferred_reset = false;
- if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
- pds_vfio_put_restore_file(pds_vfio);
- pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ if (pds_vfio->deferred_reset_type == PDS_VFIO_HOST_RESET) {
+ pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+ } else {
pds_vfio_dirty_disable(pds_vfio, false);
+ pds_vfio->state = VFIO_DEVICE_STATE_ERROR;
}
- pds_vfio->state = pds_vfio->deferred_reset_state;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
mutex_unlock(&pds_vfio->reset_mutex);
goto again;
}
@@ -50,7 +51,7 @@ void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
{
mutex_lock(&pds_vfio->reset_mutex);
pds_vfio->deferred_reset = true;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+ pds_vfio->deferred_reset_type = PDS_VFIO_HOST_RESET;
if (!mutex_trylock(&pds_vfio->state_mutex)) {
mutex_unlock(&pds_vfio->reset_mutex);
return;
@@ -194,7 +195,6 @@ static int pds_vfio_open_device(struct vfio_device *vdev)
return err;

pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
- pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;

vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);

diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
index e7b01080a1ec..19547fd8e956 100644
--- a/drivers/vfio/pci/pds/vfio_dev.h
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -10,6 +10,11 @@
#include "dirty.h"
#include "lm.h"

+enum pds_vfio_reset_type {
+ PDS_VFIO_HOST_RESET = 0,
+ PDS_VFIO_DEVICE_RESET = 1,
+};
+
struct pds_vfio_pci_device {
struct vfio_pci_core_device vfio_coredev;

@@ -20,7 +25,7 @@ struct pds_vfio_pci_device {
enum vfio_device_mig_state state;
struct mutex reset_mutex; /* protect reset_done flow */
u8 deferred_reset;
- enum vfio_device_mig_state deferred_reset_state;
+ enum pds_vfio_reset_type deferred_reset_type;
struct notifier_block nb;

int vf_id;
--
2.17.1



2024-02-05 06:59:49

by Tian, Kevin

[permalink] [raw]
Subject: RE: [PATCH vfio] vfio/pds: Rework and simplify reset flows

> From: Brett Creeley <[email protected]>
> Sent: Saturday, January 27, 2024 2:32 AM
>
> The current logic for handling resets based on
> whether they were initiated from the DSC or
> host/VMM is slightly confusing and incorrect.
> The incorrect behavior can cause the VF device
> to be unusable on the destination on failed
> migrations due to incompatible configurations.
> Fix this by setting the state back to
> VFIO_DEVICE_STATE_RUNNING when an FLR is
> triggered, so the VF device is put back in
> an "initial" pre-configured state after failures.

any reason for putting short lines (<50 chars) in commit msg?

>
> Also, while here clean-up the reset logic to
> make the source of the reset more obvious.

as a fix the a 'Fixed' tag is preferred and CC stable

also separate the real fix from the cleanup so stable kernel doesn't need
to backport unnecessary code.

btw the commit msg is not clear to me. It says fixing the problem
by setting the state to _ERROR for the DSC path and to _RUNNING for
the FLR path.

But looks it's already such case with old code:

pds_vfio_recovery()
pds_vfio->deferred_reset = true;
pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;

pds_vfio_reset()
pds_vfio->deferred_reset = true;
pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;

pds_vfio_state_mutex_unlock()
if (pds_vfio->deferred_reset) {
...
pds_vfio->state = pds_vfio->deferred_reset_state;
...
}

it's same as what this patch does:

pds_vfio_recovery()
pds_vfio->deferred_reset_type = PDS_VFIO_DEVICE_RESET;

pds_vfio_reset()
pds_vfio->deferred_reset_state = PDS_VFIO_HOST_RESET;

pds_vfio_state_mutex_unlock()
if (pds_vfio->deferred_reset) {
...
if (pds_vfio->deferred_reset_type == PDS_VFIO_HOST_RESET)
pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
else
pds_vfio->state = VFIO_DEVICE_STATE_ERROR;
...
}

looks the actual functional difference is from below change:

> @@ -32,13 +32,14 @@ void pds_vfio_state_mutex_unlock(struct
> pds_vfio_pci_device *pds_vfio)
> mutex_lock(&pds_vfio->reset_mutex);
> if (pds_vfio->deferred_reset) {
> pds_vfio->deferred_reset = false;
> - if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
> - pds_vfio_put_restore_file(pds_vfio);
> - pds_vfio_put_save_file(pds_vfio);
> + pds_vfio_put_restore_file(pds_vfio);
> + pds_vfio_put_save_file(pds_vfio);

above two are changed from conditional to always.

> + if (pds_vfio->deferred_reset_type == PDS_VFIO_HOST_RESET)
> {
> + pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
> + } else {
> pds_vfio_dirty_disable(pds_vfio, false);

and this is now only for the DSC path.

need a better explanation here.

2024-02-05 17:35:36

by Brett Creeley

[permalink] [raw]
Subject: Re: [PATCH vfio] vfio/pds: Rework and simplify reset flows

On 2/4/2024 10:58 PM, Tian, Kevin wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
>> From: Brett Creeley <[email protected]>
>> Sent: Saturday, January 27, 2024 2:32 AM
>>
>> The current logic for handling resets based on
>> whether they were initiated from the DSC or
>> host/VMM is slightly confusing and incorrect.
>> The incorrect behavior can cause the VF device
>> to be unusable on the destination on failed
>> migrations due to incompatible configurations.
>> Fix this by setting the state back to
>> VFIO_DEVICE_STATE_RUNNING when an FLR is
>> triggered, so the VF device is put back in
>> an "initial" pre-configured state after failures.
>
> any reason for putting short lines (<50 chars) in commit msg?

No, I will make the lines longer in the next commits.

>
>>
>> Also, while here clean-up the reset logic to
>> make the source of the reset more obvious.
>
> as a fix the a 'Fixed' tag is preferred and CC stable
>
> also separate the real fix from the cleanup so stable kernel doesn't need
> to backport unnecessary code.

Sure, I can split this into 2 changes as you suggested.

>
> btw the commit msg is not clear to me. It says fixing the problem
> by setting the state to _ERROR for the DSC path and to _RUNNING for
> the FLR path.
>
> But looks it's already such case with old code:
>
> pds_vfio_recovery()
> pds_vfio->deferred_reset = true;
> pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
>
> pds_vfio_reset()
> pds_vfio->deferred_reset = true;
> pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
>
> pds_vfio_state_mutex_unlock()
> if (pds_vfio->deferred_reset) {
> ...
> pds_vfio->state = pds_vfio->deferred_reset_state;
> ...
> }
>
> it's same as what this patch does:
>
> pds_vfio_recovery()
> pds_vfio->deferred_reset_type = PDS_VFIO_DEVICE_RESET;
>
> pds_vfio_reset()
> pds_vfio->deferred_reset_state = PDS_VFIO_HOST_RESET;
>
> pds_vfio_state_mutex_unlock()
> if (pds_vfio->deferred_reset) {
> ...
> if (pds_vfio->deferred_reset_type == PDS_VFIO_HOST_RESET)
> pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
> else
> pds_vfio->state = VFIO_DEVICE_STATE_ERROR;
> ...
> }
>
> looks the actual functional difference is from below change:
>
>> @@ -32,13 +32,14 @@ void pds_vfio_state_mutex_unlock(struct
>> pds_vfio_pci_device *pds_vfio)
>> mutex_lock(&pds_vfio->reset_mutex);
>> if (pds_vfio->deferred_reset) {
>> pds_vfio->deferred_reset = false;
>> - if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
>> - pds_vfio_put_restore_file(pds_vfio);
>> - pds_vfio_put_save_file(pds_vfio);
>> + pds_vfio_put_restore_file(pds_vfio);
>> + pds_vfio_put_save_file(pds_vfio);
>
> above two are changed from conditional to always.
>
>> + if (pds_vfio->deferred_reset_type == PDS_VFIO_HOST_RESET)
>> {
>> + pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
>> + } else {
>> pds_vfio_dirty_disable(pds_vfio, false);
>
> and this is now only for the DSC path.
>
> need a better explanation here.

I will clean up this patch by separating it into 2 patches and improving
the commit descriptions before sending a v2.

Thanks for the review,

Brett