The usual way to configure a device interrupt from userland is to write
to the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using
vfio to implement a device driver or a virtual machine monitor, this may
not be ideal: the process managing the vfio device interrupts may not be
granted root privilege, for security reasons. Thus it cannot directly
control the interrupt affinity and has to rely on an external command.

This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag
to specify the affinity of interrupts of a vfio pci device.

The CPU affinity mask argument must be a subset of the process cpuset,
otherwise an error -EPERM is returned.

The vfio_irq_set argument shall be set up in the following way:

- the 'flags' field has the new flag VFIO_IRQ_SET_DATA_AFFINITY set
  as well as VFIO_IRQ_SET_ACTION_TRIGGER.

- the variable-length 'data' field is a cpu_set_t structure, as
  for the sched_setaffinity() syscall, the size of which is derived
  from 'argsz'.
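
For illustration, userspace could apply one mask to a range of MSI-X
vectors along these lines (a minimal sketch; the set_irq_affinity()
helper name and the open 'device_fd' are assumptions, not part of this
patch):

#define _GNU_SOURCE
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Set the same affinity mask on 'count' vectors starting at 'start'. */
static int set_irq_affinity(int device_fd, unsigned int start,
                            unsigned int count, const cpu_set_t *set)
{
        size_t argsz = sizeof(struct vfio_irq_set) + sizeof(cpu_set_t);
        struct vfio_irq_set *irq_set = malloc(argsz);
        int ret;

        if (!irq_set)
                return -1;

        irq_set->argsz = argsz;
        irq_set->flags = VFIO_IRQ_SET_DATA_AFFINITY |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = start;
        irq_set->count = count;
        memcpy(irq_set->data, set, sizeof(cpu_set_t));

        ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        free(irq_set);
        return ret;
}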
Signed-off-by: Fred Griffoul <[email protected]>
---
drivers/vfio/pci/vfio_pci_core.c | 27 +++++++++++++++++----
drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++
drivers/vfio/vfio_main.c | 13 +++++++----
include/uapi/linux/vfio.h | 10 +++++++-
4 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 80cae87fff36..2e3419560480 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
{
unsigned long minsz = offsetofend(struct vfio_irq_set, count);
struct vfio_irq_set hdr;
+ cpumask_var_t mask;
u8 *data = NULL;
int max, ret = 0;
size_t data_size = 0;
@@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
return ret;
if (data_size) {
- data = memdup_user(&arg->data, data_size);
- if (IS_ERR(data))
- return PTR_ERR(data);
+ if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) {
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (copy_from_user(mask, &arg->data, data_size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ data = (u8 *)mask;
+
+ } else {
+ data = memdup_user(&arg->data, data_size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }
}
mutex_lock(&vdev->igate);
@@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
hdr.count, data);
mutex_unlock(&vdev->igate);
- kfree(data);
+
+out:
+ if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size)
+ free_cpumask_var(mask);
+ else
+ kfree(data);
return ret;
}
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 8382c5834335..fe01303cf94e 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -19,6 +19,7 @@
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/cpuset.h>
#include "vfio_pci_priv.h"
@@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
return 0;
}
+static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
+ unsigned int start, unsigned int count,
+ struct cpumask *irq_mask)
+{
+ struct vfio_pci_irq_ctx *ctx;
+ cpumask_var_t allowed_mask;
+ unsigned int i;
+ int err = 0;
+
+ if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpuset_cpus_allowed(current, allowed_mask);
+ if (!cpumask_subset(irq_mask, allowed_mask)) {
+ err = -EPERM;
+ goto finish;
+ }
+
+ for (i = start; i < start + count; i++) {
+ ctx = vfio_irq_ctx_get(vdev, i);
+ if (!ctx) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = irq_set_affinity(ctx->producer.irq, irq_mask);
+ if (err)
+ break;
+ }
+
+finish:
+ free_cpumask_var(allowed_mask);
+ return err;
+}
+
static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
@@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
if (!irq_is(vdev, index))
return -EINVAL;
+ if (flags & VFIO_IRQ_SET_DATA_AFFINITY)
+ return vfio_pci_set_msi_affinity(vdev, start, count, data);
+
for (i = start; i < start + count; i++) {
ctx = vfio_irq_ctx_get(vdev, i);
if (!ctx)
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index e97d796a54fb..e75c5d66681c 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
size = 0;
break;
case VFIO_IRQ_SET_DATA_BOOL:
- size = sizeof(uint8_t);
+ size = hdr->count * sizeof(uint8_t);
break;
case VFIO_IRQ_SET_DATA_EVENTFD:
- size = sizeof(int32_t);
+ size = size_mul(hdr->count, sizeof(int32_t));
+ break;
+ case VFIO_IRQ_SET_DATA_AFFINITY:
+ size = hdr->argsz - minsz;
+ if (size > cpumask_size())
+ size = cpumask_size();
break;
default:
return -EINVAL;
}
if (size) {
- if (hdr->argsz - minsz < hdr->count * size)
+ if (hdr->argsz - minsz < size)
return -EINVAL;
if (!data_size)
return -EINVAL;
- *data_size = hdr->count * size;
+ *data_size = size;
}
return 0;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf190..5ba2ca223550 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -580,6 +580,12 @@ struct vfio_irq_info {
*
* Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
* ACTION_TRIGGER specifies kernel->user signaling.
+ *
+ * DATA_AFFINITY specifies the affinity for the range of interrupt vectors.
+ * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data'
+ * array is a CPU affinity mask 'cpu_set_t' structure, as for the
+ * sched_setaffinity() syscall argument: the 'argsz' field is used to check
+ * the actual cpu_set_t size.
*/
struct vfio_irq_set {
__u32 argsz;
@@ -587,6 +593,7 @@ struct vfio_irq_set {
#define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */
#define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */
#define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */
+#define VFIO_IRQ_SET_DATA_AFFINITY (1 << 6) /* Data is cpu_set_t */
#define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */
#define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */
#define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */
@@ -599,7 +606,8 @@ struct vfio_irq_set {
#define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \
VFIO_IRQ_SET_DATA_BOOL | \
- VFIO_IRQ_SET_DATA_EVENTFD)
+ VFIO_IRQ_SET_DATA_EVENTFD | \
+ VFIO_IRQ_SET_DATA_AFFINITY)
#define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \
VFIO_IRQ_SET_ACTION_UNMASK | \
VFIO_IRQ_SET_ACTION_TRIGGER)
--
2.40.1
On Mon, 10 Jun 2024 12:57:08 +0000
Fred Griffoul <[email protected]> wrote:
> The usual way to configure a device interrupt from userland is to write
> to the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using
> vfio to implement a device driver or a virtual machine monitor, this may
> not be ideal: the process managing the vfio device interrupts may not be
> granted root privilege, for security reasons. Thus it cannot directly
> control the interrupt affinity and has to rely on an external command.
>
> This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag
> to specify the affinity of interrupts of a vfio pci device.
>
> The CPU affinity mask argument must be a subset of the process cpuset,
> otherwise an error -EPERM is returned.
>
> The vfio_irq_set argument shall be set up in the following way:
>
> - the 'flags' field has the new flag VFIO_IRQ_SET_DATA_AFFINITY set
> as well as VFIO_IRQ_SET_ACTION_TRIGGER.
>
> - the variable-length 'data' field is a cpu_set_t structure, as
> for the sched_setaffinity() syscall, the size of which is derived
> from 'argsz'.
>
> Signed-off-by: Fred Griffoul <[email protected]>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 27 +++++++++++++++++----
> drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++
> drivers/vfio/vfio_main.c | 13 +++++++----
> include/uapi/linux/vfio.h | 10 +++++++-
> 4 files changed, 80 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 80cae87fff36..2e3419560480 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> {
> unsigned long minsz = offsetofend(struct vfio_irq_set, count);
> struct vfio_irq_set hdr;
> + cpumask_var_t mask;
> u8 *data = NULL;
> int max, ret = 0;
> size_t data_size = 0;
> @@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> return ret;
>
> if (data_size) {
> - data = memdup_user(&arg->data, data_size);
> - if (IS_ERR(data))
> - return PTR_ERR(data);
> + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) {
> + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + if (copy_from_user(mask, &arg->data, data_size)) {
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + data = (u8 *)mask;
Seems like this could just use the memdup_user() path; why do we care
to copy it into a cpumask_var_t here? If we do care, wouldn't we
implement something like get_user_cpu_mask() used by
sched_setaffinity(2)?
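(For reference, the helper used by sched_setaffinity(2) looks roughly
like this; it is a static function in kernel/sched/core.c, so vfio would
need an export or its own copy:

static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
                             struct cpumask *new_mask)
{
        if (len < cpumask_size())
                cpumask_clear(new_mask);
        else if (len > cpumask_size())
                len = cpumask_size();

        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}

Note how it zeroes the kernel mask when the user buffer is shorter and
truncates when it is longer.)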
> +
> + } else {
> + data = memdup_user(&arg->data, data_size);
> + if (IS_ERR(data))
> + return PTR_ERR(data);
> + }
> }
>
> mutex_lock(&vdev->igate);
> @@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> hdr.count, data);
>
> mutex_unlock(&vdev->igate);
> - kfree(data);
> +
> +out:
> + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size)
> + free_cpumask_var(mask);
> + else
> + kfree(data);
>
> return ret;
> }
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 8382c5834335..fe01303cf94e 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -19,6 +19,7 @@
> #include <linux/vfio.h>
> #include <linux/wait.h>
> #include <linux/slab.h>
> +#include <linux/cpuset.h>
>
> #include "vfio_pci_priv.h"
>
> @@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> return 0;
> }
>
> +static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
> + unsigned int start, unsigned int count,
> + struct cpumask *irq_mask)
Aside from the name, what makes this unique to MSI vectors?
> +{
> + struct vfio_pci_irq_ctx *ctx;
> + cpumask_var_t allowed_mask;
> + unsigned int i;
> + int err = 0;
> +
> + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + cpuset_cpus_allowed(current, allowed_mask);
> + if (!cpumask_subset(irq_mask, allowed_mask)) {
> + err = -EPERM;
> + goto finish;
> + }
> +
> + for (i = start; i < start + count; i++) {
> + ctx = vfio_irq_ctx_get(vdev, i);
> + if (!ctx) {
> + err = -EINVAL;
> + break;
> + }
> +
> + err = irq_set_affinity(ctx->producer.irq, irq_mask);
> + if (err)
> + break;
> + }
Is this typical/useful behavior to set a series of vectors to the same
cpu_set_t? It's unusual behavior for this ioctl to apply the same data
across multiple vectors. Should the DATA_AFFINITY case support an
array of cpu_set_t?
> +
> +finish:
> + free_cpumask_var(allowed_mask);
> + return err;
> +}
> +
> static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> unsigned index, unsigned start,
> unsigned count, uint32_t flags, void *data)
> @@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> if (!irq_is(vdev, index))
> return -EINVAL;
>
> + if (flags & VFIO_IRQ_SET_DATA_AFFINITY)
> + return vfio_pci_set_msi_affinity(vdev, start, count, data);
> +
> for (i = start; i < start + count; i++) {
> ctx = vfio_irq_ctx_get(vdev, i);
> if (!ctx)
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index e97d796a54fb..e75c5d66681c 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
> size = 0;
> break;
> case VFIO_IRQ_SET_DATA_BOOL:
> - size = sizeof(uint8_t);
> + size = hdr->count * sizeof(uint8_t);
> break;
> case VFIO_IRQ_SET_DATA_EVENTFD:
> - size = sizeof(int32_t);
> + size = size_mul(hdr->count, sizeof(int32_t));
Why use size_mul() in one place and not the other?
> + break;
> + case VFIO_IRQ_SET_DATA_AFFINITY:
> + size = hdr->argsz - minsz;
> + if (size > cpumask_size())
> + size = cpumask_size();
Or just set size = (hdr->argsz - minsz) / count?
Generate an error if (hdr->argsz - minsz) % count?
It seems like all the cpumask items can be contained in the set-affinity
function.
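Something like the following, i.e. one cpu_set_t per vector (an untested
sketch of the suggestion, not the patch's current single-mask semantics):

	case VFIO_IRQ_SET_DATA_AFFINITY:
		if (!hdr->count || (hdr->argsz - minsz) % hdr->count)
			return -EINVAL;
		size = (hdr->argsz - minsz) / hdr->count;
		break;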
> break;
> default:
> return -EINVAL;
> }
>
> if (size) {
> - if (hdr->argsz - minsz < hdr->count * size)
> + if (hdr->argsz - minsz < size)
> return -EINVAL;
>
> if (!data_size)
> return -EINVAL;
>
> - *data_size = hdr->count * size;
> + *data_size = size;
> }
>
> return 0;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..5ba2ca223550 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -580,6 +580,12 @@ struct vfio_irq_info {
> *
> * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
> * ACTION_TRIGGER specifies kernel->user signaling.
> + *
> + * DATA_AFFINITY specifies the affinity for the range of interrupt vectors.
> + * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data'
> + * array is a CPU affinity mask 'cpu_set_t' structure, as for the
> + * sched_setaffinity() syscall argument: the 'argsz' field is used to check
> + * the actual cpu_set_t size.
> */
DATA_CPUSET?
The IRQ_INFO ioctl should probably also report support for this
feature.
Is there any proposed userspace code that takes advantage of this
interface? Thanks,
Alex
> struct vfio_irq_set {
> __u32 argsz;
> @@ -587,6 +593,7 @@ struct vfio_irq_set {
> #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */
> #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */
> #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */
> +#define VFIO_IRQ_SET_DATA_AFFINITY (1 << 6) /* Data is cpu_set_t */
> #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */
> #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */
> #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */
> @@ -599,7 +606,8 @@ struct vfio_irq_set {
>
> #define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \
> VFIO_IRQ_SET_DATA_BOOL | \
> - VFIO_IRQ_SET_DATA_EVENTFD)
> + VFIO_IRQ_SET_DATA_EVENTFD | \
> + VFIO_IRQ_SET_DATA_AFFINITY)
> #define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \
> VFIO_IRQ_SET_ACTION_UNMASK | \
> VFIO_IRQ_SET_ACTION_TRIGGER)
On Mon, Jun 10, 2024 at 8:32 PM Alex Williamson
<[email protected]> wrote:
>
> On Mon, 10 Jun 2024 12:57:08 +0000
> Fred Griffoul <[email protected]> wrote:
>
> > The usual way to configure a device interrupt from userland is to write
> > to the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using
> > vfio to implement a device driver or a virtual machine monitor, this may
> > not be ideal: the process managing the vfio device interrupts may not be
> > granted root privilege, for security reasons. Thus it cannot directly
> > control the interrupt affinity and has to rely on an external command.
> >
> > This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag
> > to specify the affinity of interrupts of a vfio pci device.
> >
> > The CPU affinity mask argument must be a subset of the process cpuset,
> > otherwise an error -EPERM is returned.
> >
> > The vfio_irq_set argument shall be set up in the following way:
> >
> > - the 'flags' field has the new flag VFIO_IRQ_SET_DATA_AFFINITY set
> > as well as VFIO_IRQ_SET_ACTION_TRIGGER.
> >
> > - the variable-length 'data' field is a cpu_set_t structure, as
> > for the sched_setaffinity() syscall, the size of which is derived
> > from 'argsz'.
> >
> > Signed-off-by: Fred Griffoul <[email protected]>
> > ---
> > drivers/vfio/pci/vfio_pci_core.c | 27 +++++++++++++++++----
> > drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++
> > drivers/vfio/vfio_main.c | 13 +++++++----
> > include/uapi/linux/vfio.h | 10 +++++++-
> > 4 files changed, 80 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 80cae87fff36..2e3419560480 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> > {
> > unsigned long minsz = offsetofend(struct vfio_irq_set, count);
> > struct vfio_irq_set hdr;
> > + cpumask_var_t mask;
> > u8 *data = NULL;
> > int max, ret = 0;
> > size_t data_size = 0;
> > @@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> > return ret;
> >
> > if (data_size) {
> > - data = memdup_user(&arg->data, data_size);
> > - if (IS_ERR(data))
> > - return PTR_ERR(data);
> > + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) {
> > + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> > + return -ENOMEM;
> > +
> > + if (copy_from_user(mask, &arg->data, data_size)) {
> > + ret = -EFAULT;
> > + goto out;
> > + }
> > +
> > + data = (u8 *)mask;
>
> Seems like this could just use the memdup_user() path; why do we care
> to copy it into a cpumask_var_t here? If we do care, wouldn't we
> implement something like get_user_cpu_mask() used by
> sched_setaffinity(2)?
>
A valid cpu_set_t argument could be smaller than a cpumask_var_t, so we
have to allocate a cpumask_var_t and zero it before copying a smaller
argument into it. Moreover, depending on the kernel configuration, the
cpumask_var_t may be allocated on the stack, avoiding an actual memory
allocation.
Exporting get_user_cpu_mask() may be better, although here the size is
checked in a separate function, and there is other explicit user cpumask
handling elsewhere (in io_uring, for instance).
> > +
> > + } else {
> > + data = memdup_user(&arg->data, data_size);
> > + if (IS_ERR(data))
> > + return PTR_ERR(data);
> > + }
> > }
> >
> > mutex_lock(&vdev->igate);
> > @@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> > hdr.count, data);
> >
> > mutex_unlock(&vdev->igate);
> > - kfree(data);
> > +
> > +out:
> > + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size)
> > + free_cpumask_var(mask);
> > + else
> > + kfree(data);
> >
> > return ret;
> > }
> > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> > index 8382c5834335..fe01303cf94e 100644
> > --- a/drivers/vfio/pci/vfio_pci_intrs.c
> > +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> > @@ -19,6 +19,7 @@
> > #include <linux/vfio.h>
> > #include <linux/wait.h>
> > #include <linux/slab.h>
> > +#include <linux/cpuset.h>
> >
> > #include "vfio_pci_priv.h"
> >
> > @@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> > return 0;
> > }
> >
> > +static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
> > + unsigned int start, unsigned int count,
> > + struct cpumask *irq_mask)
>
> Aside from the name, what makes this unique to MSI vectors?
>
Actually nothing; I only had a use case for VFIO MSI and MSI-X based devices.
> > +{
> > + struct vfio_pci_irq_ctx *ctx;
> > + cpumask_var_t allowed_mask;
> > + unsigned int i;
> > + int err = 0;
> > +
> > + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
> > + return -ENOMEM;
> > +
> > + cpuset_cpus_allowed(current, allowed_mask);
> > + if (!cpumask_subset(irq_mask, allowed_mask)) {
> > + err = -EPERM;
> > + goto finish;
> > + }
> > +
> > + for (i = start; i < start + count; i++) {
> > + ctx = vfio_irq_ctx_get(vdev, i);
> > + if (!ctx) {
> > + err = -EINVAL;
> > + break;
> > + }
> > +
> > + err = irq_set_affinity(ctx->producer.irq, irq_mask);
> > + if (err)
> > + break;
> > + }
>
> Is this typical/useful behavior to set a series of vectors to the same
> cpu_set_t? It's unusual behavior for this ioctl to apply the same data
> across multiple vectors. Should the DATA_AFFINITY case support an
> array of cpu_set_t?
>
My main use case is to configure NVMe queues in a virtual machine monitor
to interrupt only the physical CPUs assigned to that vmm. We can then
set the same cpu_set_t on all the admin and I/O queues with a single ioctl().
I reckon another usage would be to assign a specific CPU to each interrupt
vector; with this interface that requires multiple ioctl() calls.
I'm worried about the size of the argument if we allowed an array of
cpu_set_t for a device with many interrupt vectors.
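For that first case, the whole configuration is a single call (a sketch
reusing the hypothetical set_irq_affinity() helper from the patch
description; 'nvectors' is assumed to be the number of configured
vectors):

	cpu_set_t set;

	/* Restrict all vectors to the CPUs this vmm process may run on. */
	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		set_irq_affinity(device_fd, 0, nvectors, &set);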
> > +
> > +finish:
> > + free_cpumask_var(allowed_mask);
> > + return err;
> > +}
> > +
> > static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> > unsigned index, unsigned start,
> > unsigned count, uint32_t flags, void *data)
> > @@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> > if (!irq_is(vdev, index))
> > return -EINVAL;
> >
> > + if (flags & VFIO_IRQ_SET_DATA_AFFINITY)
> > + return vfio_pci_set_msi_affinity(vdev, start, count, data);
> > +
> > for (i = start; i < start + count; i++) {
> > ctx = vfio_irq_ctx_get(vdev, i);
> > if (!ctx)
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index e97d796a54fb..e75c5d66681c 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
> > size = 0;
> > break;
> > case VFIO_IRQ_SET_DATA_BOOL:
> > - size = sizeof(uint8_t);
> > + size = hdr->count * sizeof(uint8_t);
> > break;
> > case VFIO_IRQ_SET_DATA_EVENTFD:
> > - size = sizeof(int32_t);
> > + size = size_mul(hdr->count, sizeof(int32_t));
>
> Why use size_mul() in one place and not the other?
>
Right. The DATA_BOOL case cannot overflow since `hdr->count` has been
checked already, but it would be more consistent to use it there too.
> > + break;
> > + case VFIO_IRQ_SET_DATA_AFFINITY:
> > + size = hdr->argsz - minsz;
> > + if (size > cpumask_size())
> > + size = cpumask_size();
>
> Or just set size = (hdr->argsz - minsz) / count?
>
> Generate an error if (hdr->argsz - minsz) % count?
>
> It seems like all the cpumask items can be contained in the set-affinity
> function.
>
Ok. Indeed we can just copy hdr->argsz - minsz bytes, then allocate and
copy into the cpumask_var_t only in the set-affinity function. It only
costs one extra memory allocation, but the patch will be less intrusive
in the generic ioctl() code.
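A rough sketch of that restructuring (hypothetical, for the next
revision; the generic path keeps plain memdup_user() and passes the raw
buffer and its size down):

static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
                                     unsigned int start, unsigned int count,
                                     void *data, size_t data_size)
{
        cpumask_var_t irq_mask;
        int err;

        if (!zalloc_cpumask_var(&irq_mask, GFP_KERNEL))
                return -ENOMEM;

        /* A valid user cpu_set_t may be shorter than a kernel cpumask:
         * the zeroed allocation covers the tail, and anything beyond
         * cpumask_size() is truncated. */
        memcpy(irq_mask, data, min_t(size_t, data_size, cpumask_size()));

        /* ... cpuset subset check and irq_set_affinity() loop as in the
         * current patch ... */
        err = 0;

        free_cpumask_var(irq_mask);
        return err;
}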
> > break;
> > default:
> > return -EINVAL;
> > }
> >
> > if (size) {
> > - if (hdr->argsz - minsz < hdr->count * size)
> > + if (hdr->argsz - minsz < size)
> > return -EINVAL;
> >
> > if (!data_size)
> > return -EINVAL;
> >
> > - *data_size = hdr->count * size;
> > + *data_size = size;
> > }
> >
> > return 0;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 2b68e6cdf190..5ba2ca223550 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -580,6 +580,12 @@ struct vfio_irq_info {
> > *
> > * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
> > * ACTION_TRIGGER specifies kernel->user signaling.
> > + *
> > + * DATA_AFFINITY specifies the affinity for the range of interrupt vectors.
> > + * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data'
> > + * array is a CPU affinity mask 'cpu_set_t' structure, as for the
> > + * sched_setaffinity() syscall argument: the 'argsz' field is used to check
> > + * the actual cpu_set_t size.
> > */
>
> DATA_CPUSET?
>
> The IRQ_INFO ioctl should probably also report support for this
> feature.
>
Ok, I will do it in the next revision.
> Is there any proposed userspace code that takes advantage of this
> interface? Thanks,
>
Not yet, but I will work on it.
Thanks for your review.
Fred
> Alex
>
> > struct vfio_irq_set {
> > __u32 argsz;
> > @@ -587,6 +593,7 @@ struct vfio_irq_set {
> > #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */
> > #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */
> > #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */
> > +#define VFIO_IRQ_SET_DATA_AFFINITY (1 << 6) /* Data is cpu_set_t */
> > #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */
> > #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */
> > #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */
> > @@ -599,7 +606,8 @@ struct vfio_irq_set {
> >
> > #define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \
> > VFIO_IRQ_SET_DATA_BOOL | \
> > - VFIO_IRQ_SET_DATA_EVENTFD)
> > + VFIO_IRQ_SET_DATA_EVENTFD | \
> > + VFIO_IRQ_SET_DATA_AFFINITY)
> > #define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \
> > VFIO_IRQ_SET_ACTION_UNMASK | \
> > VFIO_IRQ_SET_ACTION_TRIGGER)
>