2022-11-24 23:27:02

by Thomas Gleixner

[permalink] [raw]
Subject: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

Provide two sorts of interfaces to handle the different use cases:

- msi_domain_free_irqs_range():

Handles a caller defined precise range

- msi_domain_free_irqs_all():

Frees all interrupts associated to a domain

The latter is useful for device teardown and to handle the legacy MSI support
which does not have any range information available.

Signed-off-by: Thomas Gleixner <[email protected]>
---
V3: Adopt to the domain/xarray storage change
---
include/linux/msi.h | 9 +++
kernel/irq/msi.c | 142 +++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 129 insertions(+), 22 deletions(-)

--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -498,6 +498,15 @@ int msi_domain_alloc_irqs(struct irq_dom
int nvec);
void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev);
void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
+
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid);
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid);
+
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain);

struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode,
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -546,7 +546,25 @@ static inline void msi_sysfs_remove_desc
#endif /* !CONFIG_SYSFS */

static int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec);
-static void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
+
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}

static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
@@ -707,7 +725,6 @@ static struct msi_domain_ops msi_domain_
.msi_prepare = msi_domain_ops_prepare,
.set_desc = msi_domain_ops_set_desc,
.domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};

static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -721,8 +738,6 @@ static void msi_domain_update_dom_ops(st

if (ops->domain_alloc_irqs == NULL)
ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;

if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -1074,15 +1089,21 @@ int msi_domain_alloc_irqs(struct irq_dom
return ret;
}

-static void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
int i;

- /* Only handle MSI entries which have an interrupt associated */
- msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
@@ -1097,11 +1118,99 @@ static void __msi_domain_free_irqs(struc
}
}

-static void msi_domain_free_msi_descs(struct msi_domain_info *info,
- struct device *dev)
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
- msi_free_msi_descs(dev);
+ msi_domain_free_descs(dev, ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
+ */
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
+{
+ msi_domain_free_irqs_range_locked(dev, domid, 0, MSI_MAX_INDEX);
+}
+
+/**
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ */
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
+ msi_unlock_descs(dev);
}

/**
@@ -1116,18 +1225,7 @@ static void msi_domain_free_msi_descs(st
*/
void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
- return;
-
- ops->domain_free_irqs(domain, dev);
- if (ops->msi_post_free)
- ops->msi_post_free(domain, dev);
- msi_domain_free_msi_descs(info, dev);
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, 0, MSI_MAX_INDEX);
}

/**


Subject: [tip: irq/core] genirq/msi: Provide new domain id based interfaces for freeing interrupts

The following commit has been merged into the irq/core branch of tip:

Commit-ID: 4cd5f4403f283766f73749c4c936801f58ffe77a
Gitweb: https://git.kernel.org/tip/4cd5f4403f283766f73749c4c936801f58ffe77a
Author: Thomas Gleixner <[email protected]>
AuthorDate: Fri, 25 Nov 2022 00:24:33 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Mon, 05 Dec 2022 19:21:00 +01:00

genirq/msi: Provide new domain id based interfaces for freeing interrupts

Provide two sorts of interfaces to handle the different use cases:

- msi_domain_free_irqs_range():

Handles a caller defined precise range

- msi_domain_free_irqs_all():

Frees all interrupts associated to a domain

The latter is useful for device teardown and to handle the legacy MSI support
which does not have any range information available.

Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Kevin Tian <[email protected]>
Acked-by: Marc Zyngier <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
include/linux/msi.h | 9 +++-
kernel/irq/msi.c | 142 ++++++++++++++++++++++++++++++++++++-------
2 files changed, 129 insertions(+), 22 deletions(-)

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 23172d6..74cb0a9 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -498,6 +498,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
int nvec);
void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev);
void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
+
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last);
+
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid);
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid);
+
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain);

struct irq_domain *platform_msi_create_irq_domain(struct fwnode_handle *fwnode,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 64a4cc8..c1ac780 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -545,7 +545,25 @@ static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *de
#endif /* !CONFIG_SYSFS */

static int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec);
-static void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev);
+
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}

static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
@@ -706,7 +724,6 @@ static struct msi_domain_ops msi_domain_ops_default = {
.msi_prepare = msi_domain_ops_prepare,
.set_desc = msi_domain_ops_set_desc,
.domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};

static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -720,8 +737,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)

if (ops->domain_alloc_irqs == NULL)
ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;

if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -1073,15 +1088,21 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nve
return ret;
}

-static void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
int i;

- /* Only handle MSI entries which have an interrupt associated */
- msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
@@ -1096,11 +1117,99 @@ static void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev
}
}

-static void msi_domain_free_msi_descs(struct msi_domain_info *info,
- struct device *dev)
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
- msi_free_msi_descs(dev);
+ msi_domain_free_descs(dev, ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
+ */
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
+{
+ msi_domain_free_irqs_range_locked(dev, domid, 0, MSI_MAX_INDEX);
+}
+
+/**
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ */
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
+ msi_unlock_descs(dev);
}

/**
@@ -1115,18 +1224,7 @@ static void msi_domain_free_msi_descs(struct msi_domain_info *info,
*/
void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
- return;
-
- ops->domain_free_irqs(domain, dev);
- if (ops->msi_post_free)
- ops->msi_post_free(domain, dev);
- msi_domain_free_msi_descs(info, dev);
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, 0, MSI_MAX_INDEX);
}

/**

2023-01-16 11:12:59

by David Woodhouse

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

On Mon, 2023-01-16 at 09:56 +0000, David Woodhouse wrote:
>
>   msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
> - for (i = 0; i < msidesc->nvec_used; i++)
> + for (i = 0; i < msidesc->nvec_used; i++) {
>   xen_destroy_irq(msidesc->irq + i);
> + msidesc->irq = 0;
> + }
>   }
>  }
>  

Der, setting it to zero wants to be in the msi_for_each_desc() loop and
*not* in the 'for i' loop of course.


Attachments:
smime.p7s (5.83 kB)

2023-01-16 11:13:31

by David Woodhouse

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

On Fri, 2022-11-25 at 00:24 +0100, Thomas Gleixner wrote:
> Provide two sorts of interfaces to handle the different use cases:
>
>   - msi_domain_free_irqs_range():
>
>         Handles a caller defined precise range
>
>   - msi_domain_free_irqs_all():
>
>         Frees all interrupts associated to a domain
>
> The latter is useful for device teardown and to handle the legacy MSI support
> which does not have any range information available.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---

...

> +static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
>  {
> +       struct msi_domain_info *info;
> +       struct msi_domain_ops *ops;
> +       struct irq_domain *domain;
> +
> +       if (!msi_ctrl_valid(dev, ctrl))
> +               return;
> +
> +       domain = msi_get_device_domain(dev, ctrl->domid);
> +       if (!domain)
> +               return;
> +
> +       info = domain->host_data;
> +       ops = info->ops;
> +
> +       if (ops->domain_free_irqs)
> +               ops->domain_free_irqs(domain, dev);

Do you want a call to msi_free_dev_descs(dev) here? In the case where
the core code calls ops->domain_alloc_irqs() it *has* allocated the
descriptors first... but it's expecting the irqdomain to free them?

However, it's not quite as simple as adding msi_free_dev_descs()...

> +       else
> +               __msi_domain_free_irqs(dev, domain, ctrl);
> +

The igb driver seems to set up a single MSI-X in its setup, then tear
that down, then try again with more. Thus exercising the teardown path.

In 6.2-rc3 it fails under Xen (emulation in qemu) thus:

[ 1.491207] igb: Intel(R) Gigabit Ethernet Network Driver
[ 1.494003] igb: Copyright (c) 2007-2014 Intel Corporation.
[ 1.664907] ACPI: \_SB_.LNKA: Enabled at IRQ 10
[ 1.670837] ------------[ cut here ]------------
[ 1.672644] WARNING: CPU: 1 PID: 1 at drivers/xen/events/events_base.c:793 xen_free_irq+0x156/0x170
[ 1.676202] Modules linked in:
[ 1.677638] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc3+ #1059
[ 1.680134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.org 04/01/2014
[ 1.684484] RIP: 0010:xen_free_irq+0x156/0x170
[ 1.686240] Code: 5c 41 5d 41 5e 41 5f e9 08 03 95 ff e8 a3 5b 95 ff 48 85 c0 74 14 48 8b 58 30 e9 df fe ff ff 31 f6 89 ef e8 6c 59 95 ff eb 94 <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f 0b eb 86 0f
[ 1.692888] RSP: 0000:ffffc90000013ac8 EFLAGS: 00010246
[ 1.694705] RAX: 0000000000000000 RBX: 0000000000000026 RCX: 0000000000000000
[ 1.697113] RDX: 0000000000000028 RSI: ffff888001400490 RDI: 0000000000000026
[ 1.699498] RBP: 0000000000000026 R08: ffff8880014005e8 R09: ffffffff89c00240
[ 1.701917] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffffe
[ 1.704520] R13: ffffffff89de6880 R14: 0000000000000000 R15: 0000000000000005
[ 1.707202] FS: 0000000000000000(0000) GS:ffff88803ed00000(0000) knlGS:0000000000000000
[ 1.709974] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.711867] CR2: 0000000000000000 CR3: 000000003c812001 CR4: 0000000000170ee0
[ 1.714260] Call Trace:
[ 1.715145] <TASK>
[ 1.715897] xen_destroy_irq+0x64/0x120
[ 1.717181] ? msi_find_desc+0x41/0xb0
[ 1.718552] xen_teardown_msi_irqs+0x3d/0x70
[ 1.720064] msi_domain_free_locked.part.0+0x58/0x1c0
[ 1.721791] msi_domain_free_irqs_all_locked+0x6a/0x90
[ 1.723551] __pci_enable_msix_range+0x353/0x590
[ 1.725159] igb_set_interrupt_capability+0x90/0x1c0
[ 1.726879] igb_init_interrupt_scheme+0x2d/0x230
[ 1.728494] ? rcu_read_lock_sched_held+0x3f/0x80
[ 1.730361] igb_sw_init+0x1b3/0x260
[ 1.731797] igb_probe+0x3b6/0xf00
[ 1.733146] ? _raw_spin_unlock_irqrestore+0x40/0x60
[ 1.734834] local_pci_probe+0x41/0x80
[ 1.736164] pci_call_probe+0x54/0x160
[ 1.737441] pci_device_probe+0x7c/0x100
[ 1.738828] ? driver_sysfs_add+0x71/0xd0
[ 1.740229] really_probe+0xde/0x380
[ 1.741434] ? pm_runtime_barrier+0x50/0x90
[ 1.742873] __driver_probe_device+0x78/0x170
[ 1.744314] driver_probe_device+0x1f/0x90
[ 1.745689] __driver_attach+0xd2/0x1c0
[ 1.747035] ? __pfx___driver_attach+0x10/0x10
[ 1.748518] bus_for_each_dev+0x79/0xc0
[ 1.749859] bus_add_driver+0x1b1/0x200
[ 1.751182] driver_register+0x89/0xe0
[ 1.752472] ? __pfx_igb_init_module+0x10/0x10
[ 1.754054] do_one_initcall+0x5b/0x320
[ 1.755573] ? rcu_read_lock_sched_held+0x3f/0x80
[ 1.757375] kernel_init_freeable+0x1a6/0x1ec
[ 1.759005] ? __pfx_kernel_init+0x10/0x10
[ 1.760375] kernel_init+0x16/0x130
[ 1.761554] ret_from_fork+0x2c/0x50
[ 1.762797] </TASK>
[ 1.763590] irq event stamp: 1798623
[ 1.764869] hardirqs last enabled at (1798633): [<ffffffff8814aa8e>] __up_console_sem+0x5e/0x70
[ 1.767762] hardirqs last disabled at (1798642): [<ffffffff8814aa73>] __up_console_sem+0x43/0x70
[ 1.770715] softirqs last enabled at (1798570): [<ffffffff88d91f76>] __do_softirq+0x356/0x4da
[ 1.773576] softirqs last disabled at (1798565): [<ffffffff880bb83d>] __irq_exit_rcu+0xdd/0x150
[ 1.776492] ---[ end trace 0000000000000000 ]---
[ 1.839462] igb 0000:00:04.0: added PHC on eth0
[ 1.843531] igb 0000:00:04.0: Intel(R) Gigabit Ethernet Network Connection
[ 1.843541] igb 0000:00:04.0: eth0: (PCIe:5.0Gb/s:Width x4) 00:1e:67:cb:7a:93
[ 1.843620] igb 0000:00:04.0: eth0: PBA No: 006000-000
[ 1.849237] igb 0000:00:04.0: Using legacy interrupts. 1 rx queue(s), 1 tx queue(s)


If I add the missing call to msi_free_msi_descs() then it does work,
but complains differently:

[ 1.563055] igb: Intel(R) Gigabit Ethernet Network Driver
[ 1.566442] igb: Copyright (c) 2007-2014 Intel Corporation.
[ 1.737236] ACPI: \_SB_.LNKA: Enabled at IRQ 10
[ 1.742162] ------------[ cut here ]------------
[ 1.744393] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:196 msi_domain_free_descs+0xe1/0x110
[ 1.748248] Modules linked in:
[ 1.749289] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc3+ #1057
[ 1.751466] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.1-0-g3208b098f51a-prebuilt.qemu.org 04/01/2014
[ 1.755187] RIP: 0010:msi_domain_free_descs+0xe1/0x110
[ 1.756875] Code: 00 48 89 e6 4c 89 e7 e8 ed f4 ba 00 48 89 c3 48 85 c0 0f 84 71 ff ff ff 48 8b 34 24 4c 89 e7 e8 a5 01 bb 00 8b 03 85 c0 74 be <0f> 0b eb cb 48 8b 87 70 03 00 00 be ff ff ff ff 48 8d 78 78 e8 26
[ 1.763060] RSP: 0000:ffffc90000013b78 EFLAGS: 00010202
[ 1.764804] RAX: 0000000000000026 RBX: ffff888001ac5f00 RCX: 0000000000000000
[ 1.767155] RDX: 0000000000000001 RSI: ffffffffa649808a RDI: 00000000ffffffff
[ 1.769462] RBP: ffffc90000013ba8 R08: 0000000000000001 R09: 0000000000000000
[ 1.771934] R10: 000000006ac46bb1 R11: 00000000aee0433d R12: ffff888001a238c8
[ 1.774695] R13: ffffffffa6de6880 R14: ffff888001a51218 R15: ffff888001a50000
[ 1.777081] FS: 0000000000000000(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[ 1.779754] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1.781681] CR2: ffff888010801000 CR3: 000000000e812001 CR4: 0000000000170ef0
[ 1.784093] Call Trace:
[ 1.784880] <TASK>
[ 1.785640] msi_domain_free_msi_descs_range+0x34/0x60
[ 1.787370] msi_domain_free_locked.part.0+0x58/0x1c0
[ 1.789034] msi_domain_free_irqs_all_locked+0x6a/0x90
[ 1.790815] pci_free_msi_irqs+0xe/0x30
[ 1.792157] pci_disable_msix+0x48/0x60
[ 1.793413] igb_reset_interrupt_capability+0xa4/0xb0
[ 1.795077] igb_sw_init+0x13f/0x260
[ 1.796281] igb_probe+0x3b6/0xf00
[ 1.797421] ? _raw_spin_unlock_irqrestore+0x40/0x60
[ 1.799050] local_pci_probe+0x41/0x80
[ 1.800282] pci_call_probe+0x54/0x160
[ 1.801543] pci_device_probe+0x7c/0x100
[ 1.803393] ? driver_sysfs_add+0x71/0xd0
[ 1.804761] really_probe+0xde/0x380
[ 1.806021] ? pm_runtime_barrier+0x50/0x90
[ 1.807395] __driver_probe_device+0x78/0x170
[ 1.808877] driver_probe_device+0x1f/0x90
[ 1.810257] __driver_attach+0xd2/0x1c0
[ 1.811529] ? __pfx___driver_attach+0x10/0x10
[ 1.813251] bus_for_each_dev+0x79/0xc0
[ 1.814534] bus_add_driver+0x1b1/0x200
[ 1.816058] driver_register+0x89/0xe0
[ 1.817291] ? __pfx_igb_init_module+0x10/0x10
[ 1.818821] do_one_initcall+0x5b/0x320
[ 1.820133] ? rcu_read_lock_sched_held+0x3f/0x80
[ 1.821697] kernel_init_freeable+0x1a6/0x1ec
[ 1.823150] ? __pfx_kernel_init+0x10/0x10
[ 1.824484] kernel_init+0x16/0x130
[ 1.825990] ret_from_fork+0x2c/0x50
[ 1.827757] </TASK>
[ 1.828865] irq event stamp: 1797845
[ 1.830573] hardirqs last enabled at (1797855): [<ffffffffa514aa8e>] __up_console_sem+0x5e/0x70
[ 1.834809] hardirqs last disabled at (1797864): [<ffffffffa514aa73>] __up_console_sem+0x43/0x70
[ 1.838915] softirqs last enabled at (1797742): [<ffffffffa5d91f76>] __do_softirq+0x356/0x4da
[ 1.842442] softirqs last disabled at (1797737): [<ffffffffa50bb83d>] __irq_exit_rcu+0xdd/0x150
[ 1.846094] ---[ end trace 0000000000000000 ]---


If I zero msidesc->irq in the loop in xen_teardown_msi_irqs(), *then*
it both works and stops complaining. But I'm mostly just tampering
empirically now...

I can provide a qemu tree which will let you test this easily with just
`qemu-system-x86_64 -kernel ...` but you have to promise not to look at
the way I've fixed some qemu deadlocks just by commenting out the lock
on MSI delivery/translation :)

You'd also have to provide your own igb device as qemu doesn't emulate
those; I'm testing it in passthrough. Or hack the e1000e driver to do a
setup/teardown/setup... or perhaps just unload and reload its module?

I'll provide a SoB just in case it's actually the right way to fix it…

Signed-off-by: David Woodhouse <[email protected]>

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 790550479831..293e23b7229a 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -390,8 +390,10 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
int i;

msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
- for (i = 0; i < msidesc->nvec_used; i++)
+ for (i = 0; i < msidesc->nvec_used; i++) {
xen_destroy_irq(msidesc->irq + i);
+ msidesc->irq = 0;
+ }
}
}


diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 955267bbc2be..812e1ec1a633 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1533,9 +1533,10 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
info = domain->host_data;
ops = info->ops;

- if (ops->domain_free_irqs)
+ if (ops->domain_free_irqs) {
ops->domain_free_irqs(domain, dev);
- else
+ msi_free_msi_descs(dev);
+ } else
__msi_domain_free_irqs(dev, domain, ctrl);

if (ops->msi_post_free)



Attachments:
smime.p7s (5.83 kB)

2023-01-16 17:55:30

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

David!

On Mon, Jan 16 2023 at 09:56, David Woodhouse wrote:
> On Fri, 2022-11-25 at 00:24 +0100, Thomas Gleixner wrote:
>> +
>> +       if (ops->domain_free_irqs)
>> +               ops->domain_free_irqs(domain, dev);
>
> Do you want a call to msi_free_dev_descs(dev) here? In the case where
> the core code calls ops->domain_alloc_irqs() it *has* allocated the
> descriptors first... but it's expecting the irqdomain to free them?

No. Let me stare at it.

> However, it's not quite as simple as adding msi_free_dev_descs()...

Correct.

> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 955267bbc2be..812e1ec1a633 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -1533,9 +1533,10 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
> info = domain->host_data;
> ops = info->ops;
>
> - if (ops->domain_free_irqs)
> + if (ops->domain_free_irqs) {
> ops->domain_free_irqs(domain, dev);
> - else
> + msi_free_msi_descs(dev);

This is just wrong. I need to taxi my grandson. Will have a look
afterwards.

Thanks,

tglx

2023-01-16 18:50:03

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

David!

On Mon, Jan 16 2023 at 17:35, Thomas Gleixner wrote:
> On Mon, Jan 16 2023 at 09:56, David Woodhouse wrote:
>
> This is just wrong. I need to taxi my grandson. Will have a look
> afterwards.

There are three 'tglx missed to fixup XEN' problems:

- b2bdda205c0c ("PCI/MSI: Let the MSI core free descriptors")

This requires the MSI_FLAG_FREE_MSI_DESCS flag to be set in the XEN
MSI domain info


- 2f2940d16823 ("genirq/msi: Remove filter from msi_free_descs_free_range()")

This requires the 'desc->irq = 0' disassociation on teardown.


- ffd84485e6be ("PCI/MSI: Let the irq code handle sysfs groups")

Lacks a flag in the XEN MSI domain info as well.

Combo patch below.

Thanks,

tglx
---
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -392,6 +392,7 @@ static void xen_teardown_msi_irqs(struct
msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
for (i = 0; i < msidesc->nvec_used; i++)
xen_destroy_irq(msidesc->irq + i);
+ msidesc->irq = 0;
}
}

@@ -434,6 +435,7 @@ static struct msi_domain_ops xen_pci_msi

static struct msi_domain_info xen_pci_msi_domain_info = {
.ops = &xen_pci_msi_domain_ops,
+ .flags = MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
};

/*

2023-01-16 19:39:20

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

David!

On Mon, Jan 16 2023 at 18:58, David Woodhouse wrote:

> On Mon, 2023-01-16 at 19:11 +0100, Thomas Gleixner wrote:
>> +       .flags                  = MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
>  
> That doesn't apply on top of
> https://lore.kernel.org/all/[email protected]/
> and doesn't include the | MSI_FLAG_PCI_MSIX either.

Indeed. I saw that patch after my reply. :)

> With that remedied,
>
> Tested-by: David Woodhouse <[email protected]>
>
> Albeit only under qemu with
> https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
> and not under real Xen.

Five levels of emulation. What could possibly go wrong?

2023-01-16 19:40:15

by David Woodhouse

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

On Mon, 2023-01-16 at 19:11 +0100, Thomas Gleixner wrote:
> +       .flags                  = MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
 
That doesn't apply on top of
https://lore.kernel.org/all/[email protected]/
and doesn't include the | MSI_FLAG_PCI_MSIX either.

With that remedied,

Tested-by: David Woodhouse <[email protected]>

Albeit only under qemu with
https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
and not under real Xen.



Attachments:
smime.p7s (5.83 kB)

2023-01-16 19:45:22

by David Woodhouse

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

On Mon, 2023-01-16 at 20:22 +0100, Thomas Gleixner wrote:
> > Tested-by: David Woodhouse <[email protected]>
> >
> > Albeit only under qemu with
> > https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
> > and not under real Xen.
>
> Five levels of emulation. What could possibly go wrong?

It's the opposite — this is what happened when I threw my toys out of
the pram and said, "You're NOT doing that with nested virtualization!".

One level of emulation. We host guests that think they're running on
Xen, directly in QEMU/KVM by handling the hypercalls and event
channels, grant tables, etc.

We virtualised Xen itself :)

Now you have no more excuses for breaking Xen guest mode!


Attachments:
smime.p7s (5.83 kB)
Subject: [tip: x86/urgent] x86/pci/xen: Fixup fallout from the PCI/MSI overhaul

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 6c796996ee7033bcbbc3cd733513eb43f8160f5e
Gitweb: https://git.kernel.org/tip/6c796996ee7033bcbbc3cd733513eb43f8160f5e
Author: Thomas Gleixner <[email protected]>
AuthorDate: Mon, 16 Jan 2023 19:11:32 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Mon, 16 Jan 2023 20:40:44 +01:00

x86/pci/xen: Fixup fallout from the PCI/MSI overhaul

David reported that the recent PCI/MSI rework results in MSI descriptor
leakage under XEN.

This is caused by:

1) The missing MSI_FLAG_FREE_MSI_DESCS flag in the XEN MSI domain info,
which is required now that PCI/MSI delegates descriptor freeing to
the core MSI code.

2) Not disassociating the interrupts on teardown, by setting the
msi_desc::irq to 0. This was not required before because the teardown
was unconditional and did not check whether a MSI descriptor was still
connected to a Linux interrupt.

On further inspection it came to light that the MSI_FLAG_DEV_SYSFS is
missing in the XEN MSI domain info as well to restore the pre 6.2 status
quo.

Add the missing MSI flags and disassociate the MSI descriptor from the
Linux interrupt in the XEN specific teardown function.

Fixes: b2bdda205c0c ("PCI/MSI: Let the MSI core free descriptors")
Fixes: 2f2940d16823 ("genirq/msi: Remove filter from msi_free_descs_free_range()")
Fixes: ffd84485e6be ("PCI/MSI: Let the irq code handle sysfs groups")
Reported-by: David Woodhouse <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: David Woodhouse <[email protected]>
Link: https://lore.kernel.org/r/871qnunycr.ffs@tglx
---
arch/x86/pci/xen.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 7905504..8babce7 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -392,6 +392,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
for (i = 0; i < msidesc->nvec_used; i++)
xen_destroy_irq(msidesc->irq + i);
+ msidesc->irq = 0;
}
}

@@ -433,7 +434,7 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
};

static struct msi_domain_info xen_pci_msi_domain_info = {
- .flags = MSI_FLAG_PCI_MSIX,
+ .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
.ops = &xen_pci_msi_domain_ops,
};

2023-01-16 20:38:14

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

David!

On Mon, Jan 16 2023 at 19:28, David Woodhouse wrote:
> On Mon, 2023-01-16 at 20:22 +0100, Thomas Gleixner wrote:
>> > Tested-by: David Woodhouse <[email protected]>
>> >
>> > Albeit only under qemu with
>> > https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
>> > and not under real Xen.
>>
>> Five levels of emulation. What could possibly go wrong?
>
> It's the opposite — this is what happened when I threw my toys out of
> the pram and said, "You're NOT doing that with nested virtualization!".
>
> One level of emulation. We host guests that think they're running on
> Xen, directly in QEMU/KVM by handling the hypercalls and event
> channels, grant tables, etc.
>
> We virtualised Xen itself :)

Groan. Can we please agree on *one* hypervisor instead of growing
emulators for all other hypervisors in each of them :)

> Now you have no more excuses for breaking Xen guest mode!

No cookies, you spoilsport! :)

tglx

2023-01-17 08:41:01

by David Woodhouse

[permalink] [raw]
Subject: Re: [patch V3 16/22] genirq/msi: Provide new domain id based interfaces for freeing interrupts

On Mon, 2023-01-16 at 20:49 +0100, Thomas Gleixner wrote:
> David!
>
> On Mon, Jan 16 2023 at 19:28, David Woodhouse wrote:
> > On Mon, 2023-01-16 at 20:22 +0100, Thomas Gleixner wrote:
> > > > Tested-by: David Woodhouse <[email protected]>
> > > >
> > > > Albeit only under qemu with
> > > > https://git.infradead.org/users/dwmw2/qemu.git/shortlog/refs/heads/xenfv
> > > > and not under real Xen.
> > >
> > > Five levels of emulation. What could possibly go wrong?
> >
> > It's the opposite — this is what happened when I threw my toys out of
> > the pram and said, "You're NOT doing that with nested virtualization!".
> >
> > One level of emulation. We host guests that think they're running on
> > Xen, directly in QEMU/KVM by handling the hypercalls and event
> > channels, grant tables, etc.
> >
> > We virtualised Xen itself :)
>
> Groan. Can we please agree on *one* hypervisor instead of growing
> emulators for all other hypervisors in each of them :)

Hey, we did work across KVM, Xen and even Hyper-V to make sure the
Extended Destination ID in MSI supports 32Ki vCPUs the *same* way on
each guest. Be thankful for small mercies!

And the code to support Xen guests natively in KVM is *fairly* minimal;
we allow userspace to catch hypercalls, and do a little bit of the fast
path of IRQ delivery because we really don't want to be bouncing out to
the userspace VMM for IPIs etc.

As for qemu, emulating environments that you may not have access to in
real hardware is its raison d'être, isn't it?

And agreeing on one hypervisor — that's what we're doing. But the
*administration* is the far more important part. We're allowing people
to standardise on KVM, and to focus on the administration and security
of only Linux and KVM.

But there are still huge numbers of of virtual machine images out there
which are configured to run on Xen. Their root disk is /dev/xvda, the
network device they have configured is vif0.

In some ways it's theoretically just as easy as telling all those folks
"well, you just need to install an NVMe driver and a new network card
driver". Except it isn't really, because that often ends up being
"rebuild it on a newer kernel and/or OS". And if the intern who set
this system up left three years ago and the company now depends on it
as critical infrastructure without really knowing it yet...

It isn't practical to tell people, "screw you, you can't run that any
more".

So we host them under Linux and they mostly look like native KVM guests
to the kernel, you stop breaking Xen guest mode, and everybody wins.

> > Now you have no more excuses for breaking Xen guest mode!
>
> No cookies, you spoilsport! :)

:)


Attachments:
smime.p7s (5.83 kB)