2022-08-18 13:32:47

by Anthony Krowiak

[permalink] [raw]
Subject: [PATCH v2 0/2] s390/vfio-ap: fix two problems discovered in the vfio_ap driver

Two problems have been discovered with the vfio_ap device driver since the
hot plug support was recently introduced:

1. Attempting to remove a matrix mdev after assigning a duplicate adapter
or duplicate domain results in a hang.

2. The queues associated with an adapter or domain being unassigned from
the matrix mdev do not get unlinked from it.

Two patches are provided to resolve these problems.

Change log v1 => v2:
====================
* Added Fixes: tags to both patches
* Copying [email protected]

Tony Krowiak (2):
s390/vfio-ap: fix hang during removal of mdev after duplicate
assignment
s390/vfio-ap: fix unlinking of queues from the mdev

drivers/s390/crypto/vfio_ap_ops.c | 36 +++++++++++++++++++++++++++----
1 file changed, 32 insertions(+), 4 deletions(-)

--
2.31.1


2022-08-18 13:34:41

by Anthony Krowiak

[permalink] [raw]
Subject: [PATCH v2 1/2] s390/vfio-ap: fix hang during removal of mdev after duplicate assignment

When the same adapter or domain is assigned more than one time prior to
removing the matrix mdev to which it is assigned, the remove operation
will hang. The reason is that the same vfio_ap_queue objects with an
APQN containing the APID of the adapter or APQI of the domain being
assigned will get added multiple times to the hashtable that holds them.
This results in the pprev and next pointers of the hlist_node (mdev_qnode
field in the vfio_ap_queue object) pointing to the queue object itself.
This causes an infinite loop when the mdev is removed and the queue
table is iterated to reset the queues.

To fix this problem, the assignment operation is bypassed when assigning
an adapter or domain if it is already assigned to the matrix mdev.

Since it is not necessary to assign a resource already assigned or to
unassign a resource that has not been assigned, this patch will bypass
all assignment/unassignment operations for an adapter, domain or
control domain under these circumstances.

Cc: [email protected]
Fixes: 771e387d5e79 ("s390/vfio-ap: manage link between queue struct and matrix mdev")
Reported-by: Matthew Rosato <[email protected]>
Signed-off-by: Tony Krowiak <[email protected]>
---
drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)

diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 6c8c41fac4e1..ee82207b4e60 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -984,6 +984,11 @@ static ssize_t assign_adapter_store(struct device *dev,
goto done;
}

+ if (test_bit_inv(apid, matrix_mdev->matrix.apm)) {
+ ret = count;
+ goto done;
+ }
+
set_bit_inv(apid, matrix_mdev->matrix.apm);

ret = vfio_ap_mdev_validate_masks(matrix_mdev);
@@ -1109,6 +1114,11 @@ static ssize_t unassign_adapter_store(struct device *dev,
goto done;
}

+ if (!test_bit_inv(apid, matrix_mdev->matrix.apm)) {
+ ret = count;
+ goto done;
+ }
+
clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
vfio_ap_mdev_hot_unplug_adapter(matrix_mdev, apid);
ret = count;
@@ -1183,6 +1193,11 @@ static ssize_t assign_domain_store(struct device *dev,
goto done;
}

+ if (test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
+ ret = count;
+ goto done;
+ }
+
set_bit_inv(apqi, matrix_mdev->matrix.aqm);

ret = vfio_ap_mdev_validate_masks(matrix_mdev);
@@ -1286,6 +1301,11 @@ static ssize_t unassign_domain_store(struct device *dev,
goto done;
}

+ if (!test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
+ ret = count;
+ goto done;
+ }
+
clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
vfio_ap_mdev_hot_unplug_domain(matrix_mdev, apqi);
ret = count;
@@ -1329,6 +1349,11 @@ static ssize_t assign_control_domain_store(struct device *dev,
goto done;
}

+ if (test_bit_inv(id, matrix_mdev->matrix.adm)) {
+ ret = count;
+ goto done;
+ }
+
/* Set the bit in the ADM (bitmask) corresponding to the AP control
* domain number (id). The bits in the mask, from most significant to
* least significant, correspond to IDs 0 up to the one less than the
@@ -1378,6 +1403,11 @@ static ssize_t unassign_control_domain_store(struct device *dev,
goto done;
}

+ if (!test_bit_inv(domid, matrix_mdev->matrix.adm)) {
+ ret = count;
+ goto done;
+ }
+
clear_bit_inv(domid, matrix_mdev->matrix.adm);

if (test_bit_inv(domid, matrix_mdev->shadow_apcb.adm)) {
--
2.31.1

2022-08-18 13:55:02

by Anthony Krowiak

[permalink] [raw]
Subject: [PATCH v2 2/2] s390/vfio-ap: fix unlinking of queues from the mdev

The vfio_ap_mdev_unlink_adapter and vfio_ap_mdev_unlink_domain functions
mistakenly add the associated vfio_ap_queue objects to the hashtable that
links them to the matrix mdev to which their APQN is assigned, instead of
removing them. In order to unlink the queues, they must be deleted from
the hashtable; otherwise, they will continue to be reset whenever
userspace closes the mdev fd or removes the mdev. This patch fixes that
issue by calling vfio_ap_unlink_queue_fr_mdev() in place of hash_add().

Cc: [email protected]
Fixes: 2838ba5bdcd6 ("s390/vfio-ap: reset queues after adapter/domain unassignment")
Reported-by: Tony Krowiak <[email protected]>
Signed-off-by: Tony Krowiak <[email protected]>
---
drivers/s390/crypto/vfio_ap_ops.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index ee82207b4e60..2493926b5dfb 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1049,8 +1049,7 @@ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev,
if (q && qtable) {
if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) &&
test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm))
- hash_add(qtable->queues, &q->mdev_qnode,
- q->apqn);
+ vfio_ap_unlink_queue_fr_mdev(q);
}
}
}
@@ -1236,8 +1235,7 @@ static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev,
if (q && qtable) {
if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) &&
test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm))
- hash_add(qtable->queues, &q->mdev_qnode,
- q->apqn);
+ vfio_ap_unlink_queue_fr_mdev(q);
}
}
}
--
2.31.1

2022-08-18 14:44:06

by Halil Pasic

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] s390/vfio-ap: fix hang during removal of mdev after duplicate assignment

On Thu, 18 Aug 2022 09:26:05 -0400
Tony Krowiak <[email protected]> wrote:

Subject: s390/vfio-ap: fix hang during removal of mdev after duplicate
assignment

It would have made sense to do it this way in the first place, even
if the link code were to take care of the duplicates. It did not really
make sense to do the whole filtering biz and everything else. Maybe we
should spin the short description and the rest of the commit message so
it reflects the code more.

> When the same adapter or domain is assigned more than one time prior to
> removing the matrix mdev to which it is assigned, the remove operation
> will hang. The reason is because the same vfio_ap_queue objects with an
> APQN containing the APID of the adapter or APQI of the domain being
> assigned will get added to the hashtable that holds them multiple times.
> This results in the pprev and next pointers of the hlist_node (mdev_qnode
> field in the vfio_ap_queue object) pointing to the queue object itself.
> This causes an interminable loop when the mdev is removed and the queue
> table is iterated to reset the queues.
>
> To fix this problem, the assignment operation is bypassed when assigning
> an adapter or domain if it is already assigned to the matrix mdev.
>
> Since it is not necessary to assign a resource already assigned or to
> unassign a resource that has not been assigned, this patch will bypass
> all assignment/unassignment operations for an adapter, domain or
> control domain under these circumstances.
>
> Cc: [email protected]
> Fixes: 771e387d5e79 ("s390/vfio-ap: manage link between queue struct and matrix mdev")

Shouldn't this be 11cb2419fafe ("s390/vfio-ap: manage link between queue
struct and matrix mdev")?

Is my repo borked?


> Reported-by: Matthew Rosato <[email protected]>
> Signed-off-by: Tony Krowiak <[email protected]>
> ---
> drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++
> 1 file changed, 30 insertions(+)
>
> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
> index 6c8c41fac4e1..ee82207b4e60 100644
> --- a/drivers/s390/crypto/vfio_ap_ops.c
> +++ b/drivers/s390/crypto/vfio_ap_ops.c
> @@ -984,6 +984,11 @@ static ssize_t assign_adapter_store(struct device *dev,
> goto done;
> }
>
> + if (test_bit_inv(apid, matrix_mdev->matrix.apm)) {
> + ret = count;
> + goto done;
> + }
> +
> set_bit_inv(apid, matrix_mdev->matrix.apm);
>
> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
> @@ -1109,6 +1114,11 @@ static ssize_t unassign_adapter_store(struct device *dev,
> goto done;
> }
>
> + if (!test_bit_inv(apid, matrix_mdev->matrix.apm)) {
> + ret = count;
> + goto done;
> + }
> +
> clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
> vfio_ap_mdev_hot_unplug_adapter(matrix_mdev, apid);
> ret = count;
> @@ -1183,6 +1193,11 @@ static ssize_t assign_domain_store(struct device *dev,
> goto done;
> }
>
> + if (test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
> + ret = count;
> + goto done;
> + }
> +
> set_bit_inv(apqi, matrix_mdev->matrix.aqm);
>
> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
> @@ -1286,6 +1301,11 @@ static ssize_t unassign_domain_store(struct device *dev,
> goto done;
> }
>
> + if (!test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
> + ret = count;
> + goto done;
> + }
> +
> clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
> vfio_ap_mdev_hot_unplug_domain(matrix_mdev, apqi);
> ret = count;
> @@ -1329,6 +1349,11 @@ static ssize_t assign_control_domain_store(struct device *dev,
> goto done;
> }
>
> + if (test_bit_inv(id, matrix_mdev->matrix.adm)) {
> + ret = count;
> + goto done;
> + }
> +
> /* Set the bit in the ADM (bitmask) corresponding to the AP control
> * domain number (id). The bits in the mask, from most significant to
> * least significant, correspond to IDs 0 up to the one less than the
> @@ -1378,6 +1403,11 @@ static ssize_t unassign_control_domain_store(struct device *dev,
> goto done;
> }
>
> + if (!test_bit_inv(domid, matrix_mdev->matrix.adm)) {
> + ret = count;
> + goto done;
> + }
> +
> clear_bit_inv(domid, matrix_mdev->matrix.adm);
>
> if (test_bit_inv(domid, matrix_mdev->shadow_apcb.adm)) {

2022-08-18 19:32:18

by Anthony Krowiak

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] s390/vfio-ap: fix hang during removal of mdev after duplicate assignment


On 8/18/22 10:12 AM, Halil Pasic wrote:
> On Thu, 18 Aug 2022 09:26:05 -0400
> Tony Krowiak <[email protected]> wrote:
>
> Subject: s390/vfio-ap: fix hang during removal of mdev after duplicate
> assignment
>
> It would have made sense to do it this way in the first place, even
> if the link code were to take care of the duplicates. It did not really
> make sense to do the whole filtering biz and everything else.


No, it did not; however, nobody caught it in review either. In fact,
this probably should have been done prior to hot plug.


> Maybe we
> should spin the short description and the rest of the commit message so
> it reflects the code more.


I'm not sure what you mean here, are you suggesting the first two
paragraphs should be eliminated?


>
>
>> When the same adapter or domain is assigned more than one time prior to
>> removing the matrix mdev to which it is assigned, the remove operation
>> will hang. The reason is because the same vfio_ap_queue objects with an
>> APQN containing the APID of the adapter or APQI of the domain being
>> assigned will get added to the hashtable that holds them multiple times.
>> This results in the pprev and next pointers of the hlist_node (mdev_qnode
>> field in the vfio_ap_queue object) pointing to the queue object itself.
>> This causes an interminable loop when the mdev is removed and the queue
>> table is iterated to reset the queues.
>>
>> To fix this problem, the assignment operation is bypassed when assigning
>> an adapter or domain if it is already assigned to the matrix mdev.
>>
>> Since it is not necessary to assign a resource already assigned or to
>> unassign a resource that has not been assigned, this patch will bypass
>> all assignment/unassignment operations for an adapter, domain or
>> control domain under these circumstances.
>>
>> Cc: [email protected]
>> Fixes: 771e387d5e79 ("s390/vfio-ap: manage link between queue struct and matrix mdev")
> Not 11cb2419fafe ("s390/vfio-ap: manage link between queue struct and
> matrix mdev")
>
> Is my repo borked?
>
>
>> Reported-by: Matthew Rosato <[email protected]>
>> Signed-off-by: Tony Krowiak <[email protected]>
>> ---
>> drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++
>> 1 file changed, 30 insertions(+)
>>
>> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
>> index 6c8c41fac4e1..ee82207b4e60 100644
>> --- a/drivers/s390/crypto/vfio_ap_ops.c
>> +++ b/drivers/s390/crypto/vfio_ap_ops.c
>> @@ -984,6 +984,11 @@ static ssize_t assign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apid, matrix_mdev->matrix.apm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1109,6 +1114,11 @@ static ssize_t unassign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
>> vfio_ap_mdev_hot_unplug_adapter(matrix_mdev, apid);
>> ret = count;
>> @@ -1183,6 +1193,11 @@ static ssize_t assign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apqi, matrix_mdev->matrix.aqm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1286,6 +1301,11 @@ static ssize_t unassign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
>> vfio_ap_mdev_hot_unplug_domain(matrix_mdev, apqi);
>> ret = count;
>> @@ -1329,6 +1349,11 @@ static ssize_t assign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(id, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> /* Set the bit in the ADM (bitmask) corresponding to the AP control
>> * domain number (id). The bits in the mask, from most significant to
>> * least significant, correspond to IDs 0 up to the one less than the
>> @@ -1378,6 +1403,11 @@ static ssize_t unassign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(domid, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv(domid, matrix_mdev->matrix.adm);
>>
>> if (test_bit_inv(domid, matrix_mdev->shadow_apcb.adm)) {

2022-08-19 14:07:25

by Anthony Krowiak

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] s390/vfio-ap: fix hang during removal of mdev after duplicate assignment


On 8/18/22 10:12 AM, Halil Pasic wrote:
> On Thu, 18 Aug 2022 09:26:05 -0400
> Tony Krowiak <[email protected]> wrote:
>
> Subject: s390/vfio-ap: fix hang during removal of mdev after duplicate
> assignment
>
> It would have made sense to do it this way in the first place, even
> if the link code were to take care of the duplicates. It did not really
> make sense to do the whole filtering biz and everything else. Maybe we
> should spin the short description and the rest of the commit message so
> it reflects the code more.
>
>> When the same adapter or domain is assigned more than one time prior to
>> removing the matrix mdev to which it is assigned, the remove operation
>> will hang. The reason is because the same vfio_ap_queue objects with an
>> APQN containing the APID of the adapter or APQI of the domain being
>> assigned will get added to the hashtable that holds them multiple times.
>> This results in the pprev and next pointers of the hlist_node (mdev_qnode
>> field in the vfio_ap_queue object) pointing to the queue object itself.
>> This causes an interminable loop when the mdev is removed and the queue
>> table is iterated to reset the queues.
>>
>> To fix this problem, the assignment operation is bypassed when assigning
>> an adapter or domain if it is already assigned to the matrix mdev.
>>
>> Since it is not necessary to assign a resource already assigned or to
>> unassign a resource that has not been assigned, this patch will bypass
>> all assignment/unassignment operations for an adapter, domain or
>> control domain under these circumstances.
>>
>> Cc: [email protected]
>> Fixes: 771e387d5e79 ("s390/vfio-ap: manage link between queue struct and matrix mdev")
> Not 11cb2419fafe ("s390/vfio-ap: manage link between queue struct and
> matrix mdev")
>
> Is my repo borked?


I can't speak for your repo, but I was able to successfully execute 'git
show 11cb2419fafe' in both my master and devel branches.


>
>
>> Reported-by: Matthew Rosato <[email protected]>
>> Signed-off-by: Tony Krowiak <[email protected]>
>> ---
>> drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++
>> 1 file changed, 30 insertions(+)
>>
>> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
>> index 6c8c41fac4e1..ee82207b4e60 100644
>> --- a/drivers/s390/crypto/vfio_ap_ops.c
>> +++ b/drivers/s390/crypto/vfio_ap_ops.c
>> @@ -984,6 +984,11 @@ static ssize_t assign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apid, matrix_mdev->matrix.apm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1109,6 +1114,11 @@ static ssize_t unassign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
>> vfio_ap_mdev_hot_unplug_adapter(matrix_mdev, apid);
>> ret = count;
>> @@ -1183,6 +1193,11 @@ static ssize_t assign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apqi, matrix_mdev->matrix.aqm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1286,6 +1301,11 @@ static ssize_t unassign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
>> vfio_ap_mdev_hot_unplug_domain(matrix_mdev, apqi);
>> ret = count;
>> @@ -1329,6 +1349,11 @@ static ssize_t assign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(id, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> /* Set the bit in the ADM (bitmask) corresponding to the AP control
>> * domain number (id). The bits in the mask, from most significant to
>> * least significant, correspond to IDs 0 up to the one less than the
>> @@ -1378,6 +1403,11 @@ static ssize_t unassign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(domid, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv(domid, matrix_mdev->matrix.adm);
>>
>> if (test_bit_inv(domid, matrix_mdev->shadow_apcb.adm)) {

2022-08-22 13:17:55

by Anthony Krowiak

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] s390/vfio-ap: fix hang during removal of mdev after duplicate assignment


On 8/18/22 10:12 AM, Halil Pasic wrote:
> On Thu, 18 Aug 2022 09:26:05 -0400
> Tony Krowiak <[email protected]> wrote:
>
> Subject: s390/vfio-ap: fix hang during removal of mdev after duplicate
> assignment
>
> It would have made sense to do it this way in the first place, even
> if the link code were to take care of the duplicates. It did not really
> make sense to do the whole filtering biz and everything else. Maybe we
> should spin the short description and the rest of the commit message so
> it reflects the code more.
>
>> When the same adapter or domain is assigned more than one time prior to
>> removing the matrix mdev to which it is assigned, the remove operation
>> will hang. The reason is because the same vfio_ap_queue objects with an
>> APQN containing the APID of the adapter or APQI of the domain being
>> assigned will get added to the hashtable that holds them multiple times.
>> This results in the pprev and next pointers of the hlist_node (mdev_qnode
>> field in the vfio_ap_queue object) pointing to the queue object itself.
>> This causes an interminable loop when the mdev is removed and the queue
>> table is iterated to reset the queues.
>>
>> To fix this problem, the assignment operation is bypassed when assigning
>> an adapter or domain if it is already assigned to the matrix mdev.
>>
>> Since it is not necessary to assign a resource already assigned or to
>> unassign a resource that has not been assigned, this patch will bypass
>> all assignment/unassignment operations for an adapter, domain or
>> control domain under these circumstances.
>>
>> Cc: [email protected]
>> Fixes: 771e387d5e79 ("s390/vfio-ap: manage link between queue struct and matrix mdev")
> Not 11cb2419fafe ("s390/vfio-ap: manage link between queue struct and
> matrix mdev")
>
> Is my repo borked?


Alexander pointed out my mistake, your repo is not borked.


>
>
>> Reported-by: Matthew Rosato <[email protected]>
>> Signed-off-by: Tony Krowiak <[email protected]>
>> ---
>> drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++
>> 1 file changed, 30 insertions(+)
>>
>> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
>> index 6c8c41fac4e1..ee82207b4e60 100644
>> --- a/drivers/s390/crypto/vfio_ap_ops.c
>> +++ b/drivers/s390/crypto/vfio_ap_ops.c
>> @@ -984,6 +984,11 @@ static ssize_t assign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apid, matrix_mdev->matrix.apm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1109,6 +1114,11 @@ static ssize_t unassign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
>> vfio_ap_mdev_hot_unplug_adapter(matrix_mdev, apid);
>> ret = count;
>> @@ -1183,6 +1193,11 @@ static ssize_t assign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apqi, matrix_mdev->matrix.aqm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1286,6 +1301,11 @@ static ssize_t unassign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
>> vfio_ap_mdev_hot_unplug_domain(matrix_mdev, apqi);
>> ret = count;
>> @@ -1329,6 +1349,11 @@ static ssize_t assign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(id, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> /* Set the bit in the ADM (bitmask) corresponding to the AP control
>> * domain number (id). The bits in the mask, from most significant to
>> * least significant, correspond to IDs 0 up to the one less than the
>> @@ -1378,6 +1403,11 @@ static ssize_t unassign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(domid, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv(domid, matrix_mdev->matrix.adm);
>>
>> if (test_bit_inv(domid, matrix_mdev->shadow_apcb.adm)) {

2022-08-22 13:43:20

by Anthony Krowiak

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] s390/vfio-ap: fix hang during removal of mdev after duplicate assignment


On 8/18/22 10:12 AM, Halil Pasic wrote:
> On Thu, 18 Aug 2022 09:26:05 -0400
> Tony Krowiak <[email protected]> wrote:
>
> Subject: s390/vfio-ap: fix hang during removal of mdev after duplicate
> assignment
>
> It would have made sense to do it this way in the first place, even
> if the link code were to take care of the duplicates. It did not really
> make sense to do the whole filtering biz and everything else. Maybe we
> should spin the short description and the rest of the commit message so
> it reflects the code more.


Okay, I think I'm picking up what you're laying down here. I believe you
are suggesting I change the subject to indicate that the unnecessary
processing of AP resources should be bypassed and the rest of the
description should expound on that.


>
>
>> When the same adapter or domain is assigned more than one time prior to
>> removing the matrix mdev to which it is assigned, the remove operation
>> will hang. The reason is because the same vfio_ap_queue objects with an
>> APQN containing the APID of the adapter or APQI of the domain being
>> assigned will get added to the hashtable that holds them multiple times.
>> This results in the pprev and next pointers of the hlist_node (mdev_qnode
>> field in the vfio_ap_queue object) pointing to the queue object itself.
>> This causes an interminable loop when the mdev is removed and the queue
>> table is iterated to reset the queues.
>>
>> To fix this problem, the assignment operation is bypassed when assigning
>> an adapter or domain if it is already assigned to the matrix mdev.
>>
>> Since it is not necessary to assign a resource already assigned or to
>> unassign a resource that has not been assigned, this patch will bypass
>> all assignment/unassignment operations for an adapter, domain or
>> control domain under these circumstances.
>>
>> Cc: [email protected]
>> Fixes: 771e387d5e79 ("s390/vfio-ap: manage link between queue struct and matrix mdev")
> Not 11cb2419fafe ("s390/vfio-ap: manage link between queue struct and
> matrix mdev")
>
> Is my repo borked?
>
>
>> Reported-by: Matthew Rosato <[email protected]>
>> Signed-off-by: Tony Krowiak <[email protected]>
>> ---
>> drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++
>> 1 file changed, 30 insertions(+)
>>
>> diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
>> index 6c8c41fac4e1..ee82207b4e60 100644
>> --- a/drivers/s390/crypto/vfio_ap_ops.c
>> +++ b/drivers/s390/crypto/vfio_ap_ops.c
>> @@ -984,6 +984,11 @@ static ssize_t assign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apid, matrix_mdev->matrix.apm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1109,6 +1114,11 @@ static ssize_t unassign_adapter_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apid, matrix_mdev->matrix.apm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apid, matrix_mdev->matrix.apm);
>> vfio_ap_mdev_hot_unplug_adapter(matrix_mdev, apid);
>> ret = count;
>> @@ -1183,6 +1193,11 @@ static ssize_t assign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> set_bit_inv(apqi, matrix_mdev->matrix.aqm);
>>
>> ret = vfio_ap_mdev_validate_masks(matrix_mdev);
>> @@ -1286,6 +1301,11 @@ static ssize_t unassign_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(apqi, matrix_mdev->matrix.aqm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv((unsigned long)apqi, matrix_mdev->matrix.aqm);
>> vfio_ap_mdev_hot_unplug_domain(matrix_mdev, apqi);
>> ret = count;
>> @@ -1329,6 +1349,11 @@ static ssize_t assign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (test_bit_inv(id, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> /* Set the bit in the ADM (bitmask) corresponding to the AP control
>> * domain number (id). The bits in the mask, from most significant to
>> * least significant, correspond to IDs 0 up to the one less than the
>> @@ -1378,6 +1403,11 @@ static ssize_t unassign_control_domain_store(struct device *dev,
>> goto done;
>> }
>>
>> + if (!test_bit_inv(domid, matrix_mdev->matrix.adm)) {
>> + ret = count;
>> + goto done;
>> + }
>> +
>> clear_bit_inv(domid, matrix_mdev->matrix.adm);
>>
>> if (test_bit_inv(domid, matrix_mdev->shadow_apcb.adm)) {