2023-07-12 09:21:12

by Viresh Kumar

[permalink] [raw]
Subject: [PATCH] xen: privcmd: Add support for irqfd

Xen provides support for injecting interrupts to the guests via the
HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
device backend implementations, in an inefficient manner currently.

Generally, the Virtio backends are implemented to work with the Eventfd
based mechanism. In order to make such backends work with Xen, another
software layer needs to poll the Eventfds and raise an interrupt to the
guest using the Xen based mechanism. This results in an extra context
switch.

This is not a new problem in Linux though. It is present with other
hypervisors like KVM, etc. as well. The generic solution implemented in
the kernel for them is to provide an IOCTL call to pass the interrupt
details and eventfd, which lets the kernel take care of polling the
eventfd and raising of the interrupt, instead of handling this in user
space (which involves an extra context switch).

This patch adds support to inject a specific interrupt to guest using
the eventfd mechanism, by preventing the extra context switch.

Inspired by existing implementations for KVM, etc..

Signed-off-by: Viresh Kumar <[email protected]>
---
drivers/xen/privcmd.c | 285 ++++++++++++++++++++++++++++++++++++-
include/uapi/xen/privcmd.h | 14 ++
2 files changed, 297 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index e2f580e30a86..e8096b09c113 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -9,11 +9,16 @@

#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

+#include <linux/eventfd.h>
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/workqueue.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
@@ -833,6 +838,266 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
return rc;
}

+/* Irqfd support */
+static struct workqueue_struct *irqfd_cleanup_wq;
+static DEFINE_MUTEX(irqfds_lock);
+static LIST_HEAD(irqfds_list);
+
+struct privcmd_kernel_irqfd {
+ domid_t dom;
+ u8 level;
+ u32 irq;
+ struct eventfd_ctx *eventfd;
+ struct work_struct shutdown;
+ wait_queue_entry_t wait;
+ struct list_head list;
+ poll_table pt;
+};
+
+/* From xen/include/public/hvm/dm_op.h */
+#define XEN_DMOP_set_irq_level 19
+
+struct xen_dm_op_set_irq_level {
+ u32 irq;
+ /* IN - Level: 0 -> deasserted, 1 -> asserted */
+ u8 level;
+ u8 pad[3];
+};
+
+struct xen_dm_op {
+ u32 op;
+ u32 pad;
+ union {
+ /*
+ * There are more structures here, we won't be using them, so
+ * can skip adding them here.
+ */
+ struct xen_dm_op_set_irq_level set_irq_level;
+ } u;
+};
+
+static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
+{
+ lockdep_assert_held(&irqfds_lock);
+
+ list_del_init(&kirqfd->list);
+ queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
+}
+
+static void irqfd_shutdown(struct work_struct *work)
+{
+ struct privcmd_kernel_irqfd *kirqfd =
+ container_of(work, struct privcmd_kernel_irqfd, shutdown);
+ u64 cnt;
+
+ eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
+ eventfd_ctx_put(kirqfd->eventfd);
+ kfree(kirqfd);
+}
+
+static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
+{
+ struct xen_dm_op dm_op = {
+ .op = XEN_DMOP_set_irq_level,
+ .u.set_irq_level.irq = kirqfd->irq,
+ .u.set_irq_level.level = kirqfd->level,
+ };
+ struct xen_dm_op_buf xbufs = {
+ .size = sizeof(dm_op),
+ };
+ u64 cnt;
+
+ eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
+ set_xen_guest_handle(xbufs.h, &dm_op);
+
+ xen_preemptible_hcall_begin();
+ HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);
+ xen_preemptible_hcall_end();
+}
+
+static int
+irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+{
+ struct privcmd_kernel_irqfd *kirqfd =
+ container_of(wait, struct privcmd_kernel_irqfd, wait);
+ __poll_t flags = key_to_poll(key);
+
+ if (flags & EPOLLIN)
+ irqfd_inject(kirqfd);
+
+ if (flags & EPOLLHUP) {
+ mutex_lock(&irqfds_lock);
+ irqfd_deactivate(kirqfd);
+ mutex_unlock(&irqfds_lock);
+ }
+
+ return 0;
+}
+
+static void
+irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct privcmd_kernel_irqfd *kirqfd =
+ container_of(pt, struct privcmd_kernel_irqfd, pt);
+
+ add_wait_queue_priority(wqh, &kirqfd->wait);
+}
+
+static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
+{
+ struct privcmd_kernel_irqfd *kirqfd, *tmp;
+ struct eventfd_ctx *eventfd;
+ __poll_t events;
+ struct fd f;
+ int ret;
+
+ kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
+ if (!kirqfd)
+ return -ENOMEM;
+
+ kirqfd->irq = irqfd->irq;
+ kirqfd->dom = irqfd->dom;
+ kirqfd->level = irqfd->level;
+ INIT_LIST_HEAD(&kirqfd->list);
+ INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
+
+ f = fdget(irqfd->fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto error_kfree;
+ }
+
+ eventfd = eventfd_ctx_fileget(f.file);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto error_fd_put;
+ }
+
+ kirqfd->eventfd = eventfd;
+
+ /*
+ * Install our own custom wake-up handling so we are notified via a
+ * callback whenever someone signals the underlying eventfd.
+ */
+ init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
+ init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
+
+ mutex_lock(&irqfds_lock);
+
+ list_for_each_entry(tmp, &irqfds_list, list) {
+ if (kirqfd->eventfd == tmp->eventfd) {
+ ret = -EBUSY;
+ mutex_unlock(&irqfds_lock);
+ goto error_eventfd;
+ }
+ }
+
+ list_add_tail(&kirqfd->list, &irqfds_list);
+ mutex_unlock(&irqfds_lock);
+
+ /*
+ * Check if there was an event already pending on the eventfd before we
+ * registered, and trigger it as if we didn't miss it.
+ */
+ events = vfs_poll(f.file, &kirqfd->pt);
+ if (events & EPOLLIN)
+ irqfd_inject(kirqfd);
+
+ /*
+ * Do not drop the file until the kirqfd is fully initialized, otherwise
+ * we might race against the EPOLLHUP.
+ */
+ fdput(f);
+ return 0;
+
+error_eventfd:
+ eventfd_ctx_put(eventfd);
+
+error_fd_put:
+ fdput(f);
+
+error_kfree:
+ kfree(kirqfd);
+ return ret;
+}
+
+static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
+{
+ struct privcmd_kernel_irqfd *kirqfd, *tmp;
+ struct eventfd_ctx *eventfd;
+
+ eventfd = eventfd_ctx_fdget(irqfd->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&irqfds_lock);
+
+ list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) {
+ if (kirqfd->eventfd == eventfd) {
+ irqfd_deactivate(kirqfd);
+ break;
+ }
+ }
+
+ mutex_unlock(&irqfds_lock);
+
+ eventfd_ctx_put(eventfd);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed so
+ * that we guarantee there will not be any more interrupts once this
+ * deassign function returns.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+
+ return 0;
+}
+
+static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
+{
+ struct privcmd_data *data = file->private_data;
+ struct privcmd_irqfd irqfd;
+
+ if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
+ return -EFAULT;
+
+ /* No other flags should be set */
+ if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
+ return -EINVAL;
+
+ /* If restriction is in place, check the domid matches */
+ if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
+ return -EPERM;
+
+ if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
+ return privcmd_irqfd_deassign(&irqfd);
+
+ return privcmd_irqfd_assign(&irqfd);
+}
+
+static int privcmd_irqfd_init(void)
+{
+ irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
+ if (!irqfd_cleanup_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void privcmd_irqfd_exit(void)
+{
+ struct privcmd_kernel_irqfd *kirqfd, *tmp;
+
+ mutex_lock(&irqfds_lock);
+
+ list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
+ irqfd_deactivate(kirqfd);
+
+ mutex_unlock(&irqfds_lock);
+
+ destroy_workqueue(irqfd_cleanup_wq);
+}
+
static long privcmd_ioctl(struct file *file,
unsigned int cmd, unsigned long data)
{
@@ -868,6 +1133,10 @@ static long privcmd_ioctl(struct file *file,
ret = privcmd_ioctl_mmap_resource(file, udata);
break;

+ case IOCTL_PRIVCMD_IRQFD:
+ ret = privcmd_ioctl_irqfd(file, udata);
+ break;
+
default:
break;
}
@@ -992,15 +1261,27 @@ static int __init privcmd_init(void)
err = misc_register(&xen_privcmdbuf_dev);
if (err != 0) {
pr_err("Could not register Xen hypercall-buf device\n");
- misc_deregister(&privcmd_dev);
- return err;
+ goto err_privcmdbuf;
+ }
+
+ err = privcmd_irqfd_init();
+ if (err != 0) {
+ pr_err("irqfd init failed\n");
+ goto err_irqfd;
}

return 0;
+
+err_irqfd:
+ misc_deregister(&xen_privcmdbuf_dev);
+err_privcmdbuf:
+ misc_deregister(&privcmd_dev);
+ return err;
}

static void __exit privcmd_exit(void)
{
+ privcmd_irqfd_exit();
misc_deregister(&privcmd_dev);
misc_deregister(&xen_privcmdbuf_dev);
}
diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
index d2029556083e..47334bb91a09 100644
--- a/include/uapi/xen/privcmd.h
+++ b/include/uapi/xen/privcmd.h
@@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
__u64 addr;
};

+/* For privcmd_irqfd::flags */
+#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
+
+struct privcmd_irqfd {
+ __u32 fd;
+ __u32 flags;
+ __u32 irq;
+ domid_t dom;
+ __u8 level;
+ __u8 pad;
+};
+
/*
* @cmd: IOCTL_PRIVCMD_HYPERCALL
* @arg: &privcmd_hypercall_t
@@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
#define IOCTL_PRIVCMD_MMAP_RESOURCE \
_IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
+#define IOCTL_PRIVCMD_IRQFD \
+ _IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))

#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
--
2.31.1.272.g89b43f80a514



2023-07-13 08:15:36

by Jürgen Groß

[permalink] [raw]
Subject: Re: [PATCH] xen: privcmd: Add support for irqfd

On 12.07.23 10:48, Viresh Kumar wrote:
> Xen provides support for injecting interrupts to the guests via the
> HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
> device backend implementations, in an inefficient manner currently.
>
> Generally, the Virtio backends are implemented to work with the Eventfd
> based mechanism. In order to make such backends work with Xen, another
> software layer needs to poll the Eventfds and raise an interrupt to the
> guest using the Xen based mechanism. This results in an extra context
> switch.
>
> This is not a new problem in Linux though. It is present with other
> hypervisors like KVM, etc. as well. The generic solution implemented in
> the kernel for them is to provide an IOCTL call to pass the interrupt
> details and eventfd, which lets the kernel take care of polling the
> eventfd and raising of the interrupt, instead of handling this in user
> space (which involves an extra context switch).
>
> This patch adds support to inject a specific interrupt to guest using
> the eventfd mechanism, by preventing the extra context switch.
>
> Inspired by existing implementations for KVM, etc..
>
> Signed-off-by: Viresh Kumar <[email protected]>
> ---
> drivers/xen/privcmd.c | 285 ++++++++++++++++++++++++++++++++++++-
> include/uapi/xen/privcmd.h | 14 ++
> 2 files changed, 297 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index e2f580e30a86..e8096b09c113 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -9,11 +9,16 @@
>
> #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
>
> +#include <linux/eventfd.h>
> +#include <linux/file.h>
> #include <linux/kernel.h>
> #include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/poll.h>
> #include <linux/sched.h>
> #include <linux/slab.h>
> #include <linux/string.h>
> +#include <linux/workqueue.h>
> #include <linux/errno.h>
> #include <linux/mm.h>
> #include <linux/mman.h>
> @@ -833,6 +838,266 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
> return rc;
> }
>
> +/* Irqfd support */
> +static struct workqueue_struct *irqfd_cleanup_wq;
> +static DEFINE_MUTEX(irqfds_lock);
> +static LIST_HEAD(irqfds_list);
> +
> +struct privcmd_kernel_irqfd {
> + domid_t dom;
> + u8 level;
> + u32 irq;
> + struct eventfd_ctx *eventfd;
> + struct work_struct shutdown;
> + wait_queue_entry_t wait;
> + struct list_head list;
> + poll_table pt;
> +};
> +
> +/* From xen/include/public/hvm/dm_op.h */
> +#define XEN_DMOP_set_irq_level 19
> +
> +struct xen_dm_op_set_irq_level {
> + u32 irq;
> + /* IN - Level: 0 -> deasserted, 1 -> asserted */
> + u8 level;
> + u8 pad[3];
> +};
> +
> +struct xen_dm_op {
> + u32 op;
> + u32 pad;
> + union {
> + /*
> + * There are more structures here, we won't be using them, so
> + * can skip adding them here.
> + */
> + struct xen_dm_op_set_irq_level set_irq_level;
> + } u;
> +};

Instead of copying definitions over from Xen into privcmd.c, please just update
the related linux header include/xen/interface/dm_op.h from the Xen public
header.

> +
> +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
> +{
> + lockdep_assert_held(&irqfds_lock);
> +
> + list_del_init(&kirqfd->list);
> + queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
> +}
> +
> +static void irqfd_shutdown(struct work_struct *work)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(work, struct privcmd_kernel_irqfd, shutdown);
> + u64 cnt;
> +
> + eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
> + eventfd_ctx_put(kirqfd->eventfd);
> + kfree(kirqfd);
> +}
> +
> +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
> +{
> + struct xen_dm_op dm_op = {
> + .op = XEN_DMOP_set_irq_level,
> + .u.set_irq_level.irq = kirqfd->irq,
> + .u.set_irq_level.level = kirqfd->level,
> + };
> + struct xen_dm_op_buf xbufs = {
> + .size = sizeof(dm_op),
> + };
> + u64 cnt;
> +
> + eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
> + set_xen_guest_handle(xbufs.h, &dm_op);
> +
> + xen_preemptible_hcall_begin();
> + HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);

Please add some error handling, e.g. by issuing a message in case this hypercall
was failing. Adding a bool "error" to struct privcmd_kernel_irqfd in order to
avoid multiple error messages for the same device might be a good idea.

> + xen_preemptible_hcall_end();
> +}
> +
> +static int
> +irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(wait, struct privcmd_kernel_irqfd, wait);
> + __poll_t flags = key_to_poll(key);
> +
> + if (flags & EPOLLIN)
> + irqfd_inject(kirqfd);
> +
> + if (flags & EPOLLHUP) {
> + mutex_lock(&irqfds_lock);
> + irqfd_deactivate(kirqfd);
> + mutex_unlock(&irqfds_lock);
> + }
> +
> + return 0;
> +}
> +
> +static void
> +irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> + struct privcmd_kernel_irqfd *kirqfd =
> + container_of(pt, struct privcmd_kernel_irqfd, pt);
> +
> + add_wait_queue_priority(wqh, &kirqfd->wait);
> +}
> +
> +static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> + struct eventfd_ctx *eventfd;
> + __poll_t events;
> + struct fd f;
> + int ret;
> +
> + kirqfd = kzalloc(sizeof(*kirqfd), GFP_KERNEL);
> + if (!kirqfd)
> + return -ENOMEM;
> +
> + kirqfd->irq = irqfd->irq;
> + kirqfd->dom = irqfd->dom;
> + kirqfd->level = irqfd->level;
> + INIT_LIST_HEAD(&kirqfd->list);
> + INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
> +
> + f = fdget(irqfd->fd);
> + if (!f.file) {
> + ret = -EBADF;
> + goto error_kfree;
> + }
> +
> + eventfd = eventfd_ctx_fileget(f.file);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto error_fd_put;
> + }
> +
> + kirqfd->eventfd = eventfd;
> +
> + /*
> + * Install our own custom wake-up handling so we are notified via a
> + * callback whenever someone signals the underlying eventfd.
> + */
> + init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
> + init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry(tmp, &irqfds_list, list) {
> + if (kirqfd->eventfd == tmp->eventfd) {
> + ret = -EBUSY;
> + mutex_unlock(&irqfds_lock);
> + goto error_eventfd;
> + }
> + }
> +
> + list_add_tail(&kirqfd->list, &irqfds_list);
> + mutex_unlock(&irqfds_lock);
> +
> + /*
> + * Check if there was an event already pending on the eventfd before we
> + * registered, and trigger it as if we didn't miss it.
> + */
> + events = vfs_poll(f.file, &kirqfd->pt);
> + if (events & EPOLLIN)
> + irqfd_inject(kirqfd);
> +
> + /*
> + * Do not drop the file until the kirqfd is fully initialized, otherwise
> + * we might race against the EPOLLHUP.
> + */
> + fdput(f);
> + return 0;
> +
> +error_eventfd:
> + eventfd_ctx_put(eventfd);
> +
> +error_fd_put:
> + fdput(f);
> +
> +error_kfree:
> + kfree(kirqfd);
> + return ret;
> +}
> +
> +static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> + struct eventfd_ctx *eventfd;
> +
> + eventfd = eventfd_ctx_fdget(irqfd->fd);
> + if (IS_ERR(eventfd))
> + return PTR_ERR(eventfd);
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) {

I don't think you need the safe variant here, as you will stop the loop in
case you are removing an entry.

> + if (kirqfd->eventfd == eventfd) {
> + irqfd_deactivate(kirqfd);
> + break;
> + }
> + }
> +
> + mutex_unlock(&irqfds_lock);
> +
> + eventfd_ctx_put(eventfd);
> +
> + /*
> + * Block until we know all outstanding shutdown jobs have completed so
> + * that we guarantee there will not be any more interrupts once this
> + * deassign function returns.
> + */
> + flush_workqueue(irqfd_cleanup_wq);
> +
> + return 0;
> +}
> +
> +static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
> +{
> + struct privcmd_data *data = file->private_data;
> + struct privcmd_irqfd irqfd;
> +
> + if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
> + return -EFAULT;
> +
> + /* No other flags should be set */
> + if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
> + return -EINVAL;
> +
> + /* If restriction is in place, check the domid matches */
> + if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
> + return -EPERM;
> +
> + if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
> + return privcmd_irqfd_deassign(&irqfd);
> +
> + return privcmd_irqfd_assign(&irqfd);
> +}
> +
> +static int privcmd_irqfd_init(void)
> +{
> + irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
> + if (!irqfd_cleanup_wq)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static void privcmd_irqfd_exit(void)
> +{
> + struct privcmd_kernel_irqfd *kirqfd, *tmp;
> +
> + mutex_lock(&irqfds_lock);
> +
> + list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
> + irqfd_deactivate(kirqfd);
> +
> + mutex_unlock(&irqfds_lock);
> +
> + destroy_workqueue(irqfd_cleanup_wq);
> +}
> +
> static long privcmd_ioctl(struct file *file,
> unsigned int cmd, unsigned long data)
> {
> @@ -868,6 +1133,10 @@ static long privcmd_ioctl(struct file *file,
> ret = privcmd_ioctl_mmap_resource(file, udata);
> break;
>
> + case IOCTL_PRIVCMD_IRQFD:
> + ret = privcmd_ioctl_irqfd(file, udata);
> + break;
> +
> default:
> break;
> }
> @@ -992,15 +1261,27 @@ static int __init privcmd_init(void)
> err = misc_register(&xen_privcmdbuf_dev);
> if (err != 0) {
> pr_err("Could not register Xen hypercall-buf device\n");
> - misc_deregister(&privcmd_dev);
> - return err;
> + goto err_privcmdbuf;
> + }
> +
> + err = privcmd_irqfd_init();
> + if (err != 0) {
> + pr_err("irqfd init failed\n");
> + goto err_irqfd;
> }
>
> return 0;
> +
> +err_irqfd:
> + misc_deregister(&xen_privcmdbuf_dev);
> +err_privcmdbuf:
> + misc_deregister(&privcmd_dev);
> + return err;
> }
>
> static void __exit privcmd_exit(void)
> {
> + privcmd_irqfd_exit();
> misc_deregister(&privcmd_dev);
> misc_deregister(&xen_privcmdbuf_dev);
> }
> diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h
> index d2029556083e..47334bb91a09 100644
> --- a/include/uapi/xen/privcmd.h
> +++ b/include/uapi/xen/privcmd.h
> @@ -98,6 +98,18 @@ struct privcmd_mmap_resource {
> __u64 addr;
> };
>
> +/* For privcmd_irqfd::flags */
> +#define PRIVCMD_IRQFD_FLAG_DEASSIGN (1 << 0)
> +
> +struct privcmd_irqfd {
> + __u32 fd;
> + __u32 flags;
> + __u32 irq;
> + domid_t dom;
> + __u8 level;
> + __u8 pad;
> +};
> +
> /*
> * @cmd: IOCTL_PRIVCMD_HYPERCALL
> * @arg: &privcmd_hypercall_t
> @@ -125,5 +137,7 @@ struct privcmd_mmap_resource {
> _IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))
> #define IOCTL_PRIVCMD_MMAP_RESOURCE \
> _IOC(_IOC_NONE, 'P', 7, sizeof(struct privcmd_mmap_resource))
> +#define IOCTL_PRIVCMD_IRQFD \
> + _IOC(_IOC_NONE, 'P', 8, sizeof(struct privcmd_irqfd))
>
> #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */


Juergen


Attachments:
OpenPGP_0xB0DE9DD628BF132F.asc (3.08 kB)
OpenPGP public key
OpenPGP_signature (505.00 B)
OpenPGP digital signature
Download all attachments

2023-07-13 14:56:21

by Oleksandr Tyshchenko

[permalink] [raw]
Subject: Re: [PATCH] xen: privcmd: Add support for irqfd



On 13.07.23 10:44, Juergen Gross wrote:

Hello all.


> On 12.07.23 10:48, Viresh Kumar wrote:
>> Xen provides support for injecting interrupts to the guests via the
>> HYPERVISOR_dm_op() hypercall. The same is used by the Virtio based
>> device backend implementations, in an inefficient manner currently.
>>
>> Generally, the Virtio backends are implemented to work with the Eventfd
>> based mechanism. In order to make such backends work with Xen, another
>> software layer needs to poll the Eventfds and raise an interrupt to the
>> guest using the Xen based mechanism. This results in an extra context
>> switch.
>>
>> This is not a new problem in Linux though. It is present with other
>> hypervisors like KVM, etc. as well. The generic solution implemented in
>> the kernel for them is to provide an IOCTL call to pass the interrupt
>> details and eventfd, which lets the kernel take care of polling the
>> eventfd and raising of the interrupt, instead of handling this in user
>> space (which involves an extra context switch).
>>
>> This patch adds support to inject a specific interrupt to guest using
>> the eventfd mechanism, by preventing the extra context switch.
>>
>> Inspired by existing implementations for KVM, etc..


Viresh, great work!

Do you perhaps have corresponding users-space (virtio backend) example
adopted for that feature (I would like to take a look at it if possible)?



>>
>> Signed-off-by: Viresh Kumar <[email protected]>
>> ---
>>   drivers/xen/privcmd.c      | 285 ++++++++++++++++++++++++++++++++++++-
>>   include/uapi/xen/privcmd.h |  14 ++
>>   2 files changed, 297 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
>> index e2f580e30a86..e8096b09c113 100644
>> --- a/drivers/xen/privcmd.c
>> +++ b/drivers/xen/privcmd.c
>> @@ -9,11 +9,16 @@
>>   #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
>> +#include <linux/eventfd.h>
>> +#include <linux/file.h>
>>   #include <linux/kernel.h>
>>   #include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/poll.h>
>>   #include <linux/sched.h>
>>   #include <linux/slab.h>
>>   #include <linux/string.h>
>> +#include <linux/workqueue.h>
>>   #include <linux/errno.h>
>>   #include <linux/mm.h>
>>   #include <linux/mman.h>
>> @@ -833,6 +838,266 @@ static long privcmd_ioctl_mmap_resource(struct
>> file *file,
>>       return rc;
>>   }
>> +/* Irqfd support */
>> +static struct workqueue_struct *irqfd_cleanup_wq;
>> +static DEFINE_MUTEX(irqfds_lock);
>> +static LIST_HEAD(irqfds_list);
>> +
>> +struct privcmd_kernel_irqfd {
>> +    domid_t dom;
>> +    u8 level;
>> +    u32 irq;
>> +    struct eventfd_ctx *eventfd;
>> +    struct work_struct shutdown;
>> +    wait_queue_entry_t wait;
>> +    struct list_head list;
>> +    poll_table pt;
>> +};
>> +
>> +/* From xen/include/public/hvm/dm_op.h */
>> +#define XEN_DMOP_set_irq_level 19
>> +
>> +struct xen_dm_op_set_irq_level {
>> +    u32 irq;
>> +    /* IN - Level: 0 -> deasserted, 1 -> asserted */
>> +    u8 level;
>> +    u8 pad[3];
>> +};
>> +
>> +struct xen_dm_op {
>> +    u32 op;
>> +    u32 pad;
>> +    union {
>> +        /*
>> +         * There are more structures here, we won't be using them, so
>> +         * can skip adding them here.
>> +         */
>> +        struct xen_dm_op_set_irq_level set_irq_level;
>> +    } u;
>> +};
>
> Instead of copying definitions over from Xen into privcmd.c, please just
> update
> the related linux header include/xen/interface/dm_op.h from the Xen public
> header.
>
>> +
>> +static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
>> +{
>> +    lockdep_assert_held(&irqfds_lock);
>> +
>> +    list_del_init(&kirqfd->list);
>> +    queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
>> +}
>> +
>> +static void irqfd_shutdown(struct work_struct *work)
>> +{
>> +    struct privcmd_kernel_irqfd *kirqfd =
>> +        container_of(work, struct privcmd_kernel_irqfd, shutdown);
>> +    u64 cnt;
>> +
>> +    eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
>> +    eventfd_ctx_put(kirqfd->eventfd);
>> +    kfree(kirqfd);
>> +}
>> +
>> +static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
>> +{
>> +    struct xen_dm_op dm_op = {
>> +        .op = XEN_DMOP_set_irq_level,
>> +        .u.set_irq_level.irq = kirqfd->irq,
>> +        .u.set_irq_level.level = kirqfd->level,
>> +    };
>> +    struct xen_dm_op_buf xbufs = {
>> +        .size = sizeof(dm_op),
>> +    };
>> +    u64 cnt;
>> +
>> +    eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
>> +    set_xen_guest_handle(xbufs.h, &dm_op);
>> +
>> +    xen_preemptible_hcall_begin();
>> +    HYPERVISOR_dm_op(kirqfd->dom, 1, &xbufs);
>
> Please add some error handling, e.g. by issuing a message in case this
> hypercall
> was failing. Adding a bool "error" to struct privcmd_kernel_irqfd in
> order to
> avoid multiple error messages for the same device might be a good idea.


In addition to provided comments, I would like to mention that this
particular dm_op has Arm implementation only in vanilla hypervisor.

So this feature cannot be immediately reused on x86 because of
XEN_DMOP_set_irq_level at least. As I understand, the x86's variant is
XEN_DMOP_set_isa_irq_level (there is also XEN_DMOP_set_pci_intx_level
for legacy PCI interrupts).

Please note, I am not asking to wire on x86, but maybe it is worth
considering to put this new feature under something like Kconfig's
XEN_PRIVCMD_IRQFD which depends on Arm for now? Or maybe to put dm_op
specific part of irqfd_inject() under CONFIG_ARCH_XXX here or introduce
per-ARCH helpers to form a dm_op (Arm's variant will use
XEN_DMOP_set_irq_level like in current commit). If not, then at least a
sentence in the description mentioning that this works on Arm only needs
to be added, I think.

Also, I am wondering whether we should foresee the IOCTL_PRIVCMD_IRQFD
interface to be suitable for all existing irq related dm_ops (not only
XEN_DMOP_set_irq_level) right now or this can be extended/updated later
on (when there is a real need)?



[snip]

2023-07-20 10:00:50

by Viresh Kumar

[permalink] [raw]
Subject: Re: [PATCH] xen: privcmd: Add support for irqfd

On 13-07-23, 14:40, Oleksandr Tyshchenko wrote:
> Viresh, great work!

Thanks Oleksandr.

> Do you perhaps have corresponding users-space (virtio backend) example
> adopted for that feature (I would like to take a look at it if possible)?

This is taken care by the xen-vhost-frontend Rust crate in our case
(which was initially designed based on virtio-disk but has deviated a
lot from it now). Here is the commit of interest. The backends remain
unmodified though.

https://github.com/vireshk/xen-vhost-frontend/commit/d79c419f14c1f54240b3147c342894998c274364

And I have updated the commit with CONFIG_ARM64 thingy..

--
viresh

2023-07-21 18:52:57

by Oleksandr Tyshchenko

[permalink] [raw]
Subject: Re: [PATCH] xen: privcmd: Add support for irqfd



On 20.07.23 12:41, Viresh Kumar wrote:

Hello Viresh

> On 13-07-23, 14:40, Oleksandr Tyshchenko wrote:
>> Viresh, great work!
>
> Thanks Oleksandr.
>
>> Do you perhaps have corresponding users-space (virtio backend) example
>> adopted for that feature (I would like to take a look at it if possible)?
>
> This is taken care by the xen-vhost-frontend Rust crate in our case
> (which was initially designed based on virtio-disk but has deviated a
> lot from it now).

I see

Here is the commit of interest. The backends remain
> unmodified though.
>
> https://urldefense.com/v3/__https://github.com/vireshk/xen-vhost-frontend/commit/d79c419f14c1f54240b3147c342894998c274364__;!!GF_29dbcQIUBPA!1yHRR11TbPB-cqHmbO9ew0W4GKPfx1y1GXWHj0Q7wIEcom3ZgU28uZcrXEYlnVPl1x47t3ooXECSYer2lClnO3QosqiOmFY$ [github[.]com]
>
> And I have updated the commit with CONFIG_ARM64 thingy..

Thank you for the information!