2022-04-22 16:03:48

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH v3 2/5] virtio-crypto: wait ctrl queue instead of busy polling


在 2022/4/21 18:40, zhenwei pi 写道:
> Originally, after submitting request into virtio crypto control
> queue, the guest side polls the result from the virt queue. This
> works like following:
> CPU0 CPU1 ... CPUx CPUy
> | | | |
> \ \ / /
> \--------spin_lock(&vcrypto->ctrl_lock)-------/
> |
> virtqueue add & kick
> |
> busy poll virtqueue
> |
> spin_unlock(&vcrypto->ctrl_lock)
> ...
>
> There are two problems:
> 1, The queue depth is always 1, the performance of a virtio crypto
> device gets limited. Multi user processes share a single control
> queue, and hit spin lock race from control queue. Test on Intel
> Platinum 8260, a single worker gets ~35K/s create/close session
> operations, and 8 workers get ~40K/s operations with 800% CPU
> utilization.
> 2, The control request is supposed to get handled immediately, but
> in the current implementation of QEMU(v6.2), the vCPU thread kicks
> another thread to do this work, the latency also gets unstable.
> Tracking latency of virtio_crypto_alg_akcipher_close_session in 5s:
> usecs : count distribution
> 0 -> 1 : 0 | |
> 2 -> 3 : 7 | |
> 4 -> 7 : 72 | |
> 8 -> 15 : 186485 |************************|
> 16 -> 31 : 687 | |
> 32 -> 63 : 5 | |
> 64 -> 127 : 3 | |
> 128 -> 255 : 1 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 0 | |
> 1024 -> 2047 : 0 | |
> 2048 -> 4095 : 0 | |
> 4096 -> 8191 : 0 | |
> 8192 -> 16383 : 2 | |
> This means that a CPU may hold vcrypto->ctrl_lock as long as 8192~16383us.
>
> To improve the performance of control queue, a request on control queue waits
> completion instead of busy polling to reduce lock racing, and gets completed by
> control queue callback.
> CPU0 CPU1 ... CPUx CPUy
> | | | |
> \ \ / /
> \--------spin_lock(&vcrypto->ctrl_lock)-------/
> |
> virtqueue add & kick
> |
> ---------spin_unlock(&vcrypto->ctrl_lock)------
> / / \ \
> | | | |
> wait wait wait wait
>
> Test this patch, the guest side get ~200K/s operations with 300% CPU
> utilization.
>
> Cc: Michael S. Tsirkin <[email protected]>
> Cc: Jason Wang <[email protected]>
> Cc: Gonglei <[email protected]>
> Signed-off-by: zhenwei pi <[email protected]>
> ---
> drivers/crypto/virtio/virtio_crypto_common.c | 42 +++++++++++++++-----
> drivers/crypto/virtio/virtio_crypto_common.h | 8 ++++
> drivers/crypto/virtio/virtio_crypto_core.c | 2 +-
> 3 files changed, 41 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/crypto/virtio/virtio_crypto_common.c b/drivers/crypto/virtio/virtio_crypto_common.c
> index e65125a74db2..93df73c40dd3 100644
> --- a/drivers/crypto/virtio/virtio_crypto_common.c
> +++ b/drivers/crypto/virtio/virtio_crypto_common.c
> @@ -8,14 +8,21 @@
>
> #include "virtio_crypto_common.h"
>
> +static void virtio_crypto_ctrlq_callback(struct virtio_crypto_ctrl_request *vc_ctrl_req)
> +{
> + complete(&vc_ctrl_req->compl);
> +}
> +
> int virtio_crypto_ctrl_vq_request(struct virtio_crypto *vcrypto, struct scatterlist *sgs[],
> unsigned int out_sgs, unsigned int in_sgs,
> struct virtio_crypto_ctrl_request *vc_ctrl_req)
> {
> int err;
> - unsigned int inlen;
> unsigned long flags;
>
> + init_completion(&vc_ctrl_req->compl);
> + vc_ctrl_req->ctrl_cb = virtio_crypto_ctrlq_callback;


Is there a chance that the cb would not be virtio_crypto_ctrlq_callback()?


> +
> spin_lock_irqsave(&vcrypto->ctrl_lock, flags);
> err = virtqueue_add_sgs(vcrypto->ctrl_vq, sgs, out_sgs, in_sgs, vc_ctrl_req, GFP_ATOMIC);
> if (err < 0) {
> @@ -24,16 +31,31 @@ int virtio_crypto_ctrl_vq_request(struct virtio_crypto *vcrypto, struct scatterl
> }
>
> virtqueue_kick(vcrypto->ctrl_vq);
> -
> - /*
> - * Trapping into the hypervisor, so the request should be
> - * handled immediately.
> - */
> - while (!virtqueue_get_buf(vcrypto->ctrl_vq, &inlen) &&
> - !virtqueue_is_broken(vcrypto->ctrl_vq))
> - cpu_relax();
> -
> spin_unlock_irqrestore(&vcrypto->ctrl_lock, flags);
>
> + wait_for_completion(&vc_ctrl_req->compl);
> +
> return 0;
> }
> +
> +void virtcrypto_ctrlq_callback(struct virtqueue *vq)
> +{
> + struct virtio_crypto *vcrypto = vq->vdev->priv;
> + struct virtio_crypto_ctrl_request *vc_ctrl_req;
> + unsigned long flags;
> + unsigned int len;
> +
> + spin_lock_irqsave(&vcrypto->ctrl_lock, flags);
> + do {
> + virtqueue_disable_cb(vq);
> + while ((vc_ctrl_req = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_unlock_irqrestore(&vcrypto->ctrl_lock, flags);
> + if (vc_ctrl_req->ctrl_cb)
> + vc_ctrl_req->ctrl_cb(vc_ctrl_req);
> + spin_lock_irqsave(&vcrypto->ctrl_lock, flags);
> + }
> + if (unlikely(virtqueue_is_broken(vq)))
> + break;
> + } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vcrypto->ctrl_lock, flags);
> +}
> diff --git a/drivers/crypto/virtio/virtio_crypto_common.h b/drivers/crypto/virtio/virtio_crypto_common.h
> index d2a20fe6e13e..25b4f22e8605 100644
> --- a/drivers/crypto/virtio/virtio_crypto_common.h
> +++ b/drivers/crypto/virtio/virtio_crypto_common.h
> @@ -81,6 +81,10 @@ struct virtio_crypto_sym_session_info {
> __u64 session_id;
> };
>
> +struct virtio_crypto_ctrl_request;
> +typedef void (*virtio_crypto_ctrl_callback)
> + (struct virtio_crypto_ctrl_request *vc_ctrl_req);
> +
> /*
> * Note: there are padding fields in request, clear them to zero before sending to host,
> * Ex, virtio_crypto_ctrl_request::ctrl::u::destroy_session::padding[48]
> @@ -89,6 +93,8 @@ struct virtio_crypto_ctrl_request {
> struct virtio_crypto_op_ctrl_req ctrl;
> struct virtio_crypto_session_input input;
> struct virtio_crypto_inhdr ctrl_status;
> + virtio_crypto_ctrl_callback ctrl_cb;
> + struct completion compl;
> };
>
> struct virtio_crypto_request;
> @@ -141,7 +147,9 @@ void virtio_crypto_skcipher_algs_unregister(struct virtio_crypto *vcrypto);
> int virtio_crypto_akcipher_algs_register(struct virtio_crypto *vcrypto);
> void virtio_crypto_akcipher_algs_unregister(struct virtio_crypto *vcrypto);
>
> +void virtcrypto_ctrlq_callback(struct virtqueue *vq);
> int virtio_crypto_ctrl_vq_request(struct virtio_crypto *vcrypto, struct scatterlist *sgs[],
> unsigned int out_sgs, unsigned int in_sgs,
> struct virtio_crypto_ctrl_request *vc_ctrl_req);
> +


Unnecessary changes.

Thanks


> #endif /* _VIRTIO_CRYPTO_COMMON_H */
> diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
> index c6f482db0bc0..e668d4b1bc6a 100644
> --- a/drivers/crypto/virtio/virtio_crypto_core.c
> +++ b/drivers/crypto/virtio/virtio_crypto_core.c
> @@ -73,7 +73,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
> goto err_names;
>
> /* Parameters for control virtqueue */
> - callbacks[total_vqs - 1] = NULL;
> + callbacks[total_vqs - 1] = virtcrypto_ctrlq_callback;
> names[total_vqs - 1] = "controlq";
>
> /* Allocate/initialize parameters for data virtqueues */


2022-04-22 19:46:15

by zhenwei pi

[permalink] [raw]
Subject: Re: Re: [PATCH v3 2/5] virtio-crypto: wait ctrl queue instead of busy polling

On 4/22/22 15:46, Jason Wang wrote:
>
> 在 2022/4/21 18:40, zhenwei pi 写道:
>> Originally, after submitting request into virtio crypto control
>> queue, the guest side polls the result from the virt queue. This
>> works like following:
>>      CPU0   CPU1               ...             CPUx  CPUy
>>       |      |                                  |     |
>>       \      \                                  /     /
>>        \--------spin_lock(&vcrypto->ctrl_lock)-------/
>>                             |
>>                   virtqueue add & kick
>>                             |
>>                    busy poll virtqueue
>>                             |
>>                spin_unlock(&vcrypto->ctrl_lock)
>>                            ...
>>
>> There are two problems:
>> 1, The queue depth is always 1, the performance of a virtio crypto
>>     device gets limited. Multi user processes share a single control
>>     queue, and hit spin lock race from control queue. Test on Intel
>>     Platinum 8260, a single worker gets ~35K/s create/close session
>>     operations, and 8 workers get ~40K/s operations with 800% CPU
>>     utilization.
>> 2, The control request is supposed to get handled immediately, but
>>     in the current implementation of QEMU(v6.2), the vCPU thread kicks
>>     another thread to do this work, the latency also gets unstable.
>>     Tracking latency of virtio_crypto_alg_akcipher_close_session in 5s:
>>          usecs               : count     distribution
>>           0 -> 1          : 0        |                        |
>>           2 -> 3          : 7        |                        |
>>           4 -> 7          : 72       |                        |
>>           8 -> 15         : 186485   |************************|
>>          16 -> 31         : 687      |                        |
>>          32 -> 63         : 5        |                        |
>>          64 -> 127        : 3        |                        |
>>         128 -> 255        : 1        |                        |
>>         256 -> 511        : 0        |                        |
>>         512 -> 1023       : 0        |                        |
>>        1024 -> 2047       : 0        |                        |
>>        2048 -> 4095       : 0        |                        |
>>        4096 -> 8191       : 0        |                        |
>>        8192 -> 16383      : 2        |                        |
>>     This means that a CPU may hold vcrypto->ctrl_lock as long as
>> 8192~16383us.
>>
>> To improve the performance of control queue, a request on control
>> queue waits
>> completion instead of busy polling to reduce lock racing, and gets
>> completed by
>> control queue callback.
>>      CPU0   CPU1               ...             CPUx  CPUy
>>       |      |                                  |     |
>>       \      \                                  /     /
>>        \--------spin_lock(&vcrypto->ctrl_lock)-------/
>>                             |
>>                   virtqueue add & kick
>>                             |
>>        ---------spin_unlock(&vcrypto->ctrl_lock)------
>>       /      /                                  \     \
>>       |      |                                  |     |
>>      wait   wait                               wait  wait
>>
>> Test this patch, the guest side get ~200K/s operations with 300% CPU
>> utilization.
>>
>> Cc: Michael S. Tsirkin <[email protected]>
>> Cc: Jason Wang <[email protected]>
>> Cc: Gonglei <[email protected]>
>> Signed-off-by: zhenwei pi <[email protected]>
>> ---
>>   drivers/crypto/virtio/virtio_crypto_common.c | 42 +++++++++++++++-----
>>   drivers/crypto/virtio/virtio_crypto_common.h |  8 ++++
>>   drivers/crypto/virtio/virtio_crypto_core.c   |  2 +-
>>   3 files changed, 41 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/crypto/virtio/virtio_crypto_common.c
>> b/drivers/crypto/virtio/virtio_crypto_common.c
>> index e65125a74db2..93df73c40dd3 100644
>> --- a/drivers/crypto/virtio/virtio_crypto_common.c
>> +++ b/drivers/crypto/virtio/virtio_crypto_common.c
>> @@ -8,14 +8,21 @@
>>   #include "virtio_crypto_common.h"
>> +static void virtio_crypto_ctrlq_callback(struct
>> virtio_crypto_ctrl_request *vc_ctrl_req)
>> +{
>> +    complete(&vc_ctrl_req->compl);
>> +}
>> +
>>   int virtio_crypto_ctrl_vq_request(struct virtio_crypto *vcrypto,
>> struct scatterlist *sgs[],
>>                     unsigned int out_sgs, unsigned int in_sgs,
>>                     struct virtio_crypto_ctrl_request *vc_ctrl_req)
>>   {
>>       int err;
>> -    unsigned int inlen;
>>       unsigned long flags;
>> +    init_completion(&vc_ctrl_req->compl);
>> +    vc_ctrl_req->ctrl_cb =  virtio_crypto_ctrlq_callback;
>
>
> Is there a chance that the cb would not be virtio_crypto_ctrlq_callback()?
>

Yes, it's the only callback function used for control queue, removing
this and calling virtio_crypto_ctrlq_callback directly in
virtcrypto_ctrlq_callback seems better. I'll fix this in the next version.
>
>> +
>>       spin_lock_irqsave(&vcrypto->ctrl_lock, flags);
>>       err = virtqueue_add_sgs(vcrypto->ctrl_vq, sgs, out_sgs, in_sgs,
>> vc_ctrl_req, GFP_ATOMIC);
>>       if (err < 0) {
>> @@ -24,16 +31,31 @@ int virtio_crypto_ctrl_vq_request(struct
>> virtio_crypto *vcrypto, struct scatterl
>>       }
>>       virtqueue_kick(vcrypto->ctrl_vq);
>> -
>> -    /*
>> -     * Trapping into the hypervisor, so the request should be
>> -     * handled immediately.
>> -     */
>> -    while (!virtqueue_get_buf(vcrypto->ctrl_vq, &inlen) &&
>> -        !virtqueue_is_broken(vcrypto->ctrl_vq))
>> -        cpu_relax();
>> -
>>       spin_unlock_irqrestore(&vcrypto->ctrl_lock, flags);
>> +    wait_for_completion(&vc_ctrl_req->compl);
>> +
>>       return 0;
>>   }
>> +
>> +void virtcrypto_ctrlq_callback(struct virtqueue *vq)
>> +{
>> +    struct virtio_crypto *vcrypto = vq->vdev->priv;
>> +    struct virtio_crypto_ctrl_request *vc_ctrl_req;
>> +    unsigned long flags;
>> +    unsigned int len;
>> +
>> +    spin_lock_irqsave(&vcrypto->ctrl_lock, flags);
>> +    do {
>> +        virtqueue_disable_cb(vq);
>> +        while ((vc_ctrl_req = virtqueue_get_buf(vq, &len)) != NULL) {
>> +            spin_unlock_irqrestore(&vcrypto->ctrl_lock, flags);
>> +            if (vc_ctrl_req->ctrl_cb)
>> +                vc_ctrl_req->ctrl_cb(vc_ctrl_req);
>> +            spin_lock_irqsave(&vcrypto->ctrl_lock, flags);
>> +        }
>> +        if (unlikely(virtqueue_is_broken(vq)))
>> +            break;
>> +    } while (!virtqueue_enable_cb(vq));
>> +    spin_unlock_irqrestore(&vcrypto->ctrl_lock, flags);
>> +}
>> diff --git a/drivers/crypto/virtio/virtio_crypto_common.h
>> b/drivers/crypto/virtio/virtio_crypto_common.h
>> index d2a20fe6e13e..25b4f22e8605 100644
>> --- a/drivers/crypto/virtio/virtio_crypto_common.h
>> +++ b/drivers/crypto/virtio/virtio_crypto_common.h
>> @@ -81,6 +81,10 @@ struct virtio_crypto_sym_session_info {
>>       __u64 session_id;
>>   };
>> +struct virtio_crypto_ctrl_request;
>> +typedef void (*virtio_crypto_ctrl_callback)
>> +        (struct virtio_crypto_ctrl_request *vc_ctrl_req);
>> +
>>   /*
>>    * Note: there are padding fields in request, clear them to zero
>> before sending to host,
>>    * Ex,
>> virtio_crypto_ctrl_request::ctrl::u::destroy_session::padding[48]
>> @@ -89,6 +93,8 @@ struct virtio_crypto_ctrl_request {
>>       struct virtio_crypto_op_ctrl_req ctrl;
>>       struct virtio_crypto_session_input input;
>>       struct virtio_crypto_inhdr ctrl_status;
>> +    virtio_crypto_ctrl_callback ctrl_cb;
>> +    struct completion compl;
>>   };
>>   struct virtio_crypto_request;
>> @@ -141,7 +147,9 @@ void virtio_crypto_skcipher_algs_unregister(struct
>> virtio_crypto *vcrypto);
>>   int virtio_crypto_akcipher_algs_register(struct virtio_crypto
>> *vcrypto);
>>   void virtio_crypto_akcipher_algs_unregister(struct virtio_crypto
>> *vcrypto);
>> +void virtcrypto_ctrlq_callback(struct virtqueue *vq);
>>   int virtio_crypto_ctrl_vq_request(struct virtio_crypto *vcrypto,
>> struct scatterlist *sgs[],
>>                     unsigned int out_sgs, unsigned int in_sgs,
>>                     struct virtio_crypto_ctrl_request *vc_ctrl_req);
>> +
>
>
> Unnecessary changes.
>
> Thanks
>
>
>>   #endif /* _VIRTIO_CRYPTO_COMMON_H */
>> diff --git a/drivers/crypto/virtio/virtio_crypto_core.c
>> b/drivers/crypto/virtio/virtio_crypto_core.c
>> index c6f482db0bc0..e668d4b1bc6a 100644
>> --- a/drivers/crypto/virtio/virtio_crypto_core.c
>> +++ b/drivers/crypto/virtio/virtio_crypto_core.c
>> @@ -73,7 +73,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto
>> *vi)
>>           goto err_names;
>>       /* Parameters for control virtqueue */
>> -    callbacks[total_vqs - 1] = NULL;
>> +    callbacks[total_vqs - 1] = virtcrypto_ctrlq_callback;
>>       names[total_vqs - 1] = "controlq";
>>       /* Allocate/initialize parameters for data virtqueues */
>

--
zhenwei pi