2022-11-25 00:43:43

by Thomas Gleixner

[permalink] [raw]
Subject: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

Per device domains provide the real domain size to the core code. This
allows range checking on insertion of MSI descriptors and also paves the
way for dynamic index allocations which are required e.g. for IMS. This
avoids external mechanisms like bitmaps on the device side and just
utilizes the core internal MSI descriptor storxe for it.

Signed-off-by: Thomas Gleixner <[email protected]>
---
V3: Adopt to the new info->hwsize handling and to the new xarray split
---
kernel/irq/msi.c | 58 ++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 47 insertions(+), 11 deletions(-)

--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -40,6 +40,7 @@ struct msi_ctrl {
#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)

static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);


@@ -80,16 +81,28 @@ static void msi_free_desc(struct msi_des
kfree(desc);
}

-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc,
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;

+ hwsize = msi_domain_get_hwsize(dev, domid);
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
desc->msi_index = index;
ret = xa_insert(xa, index, desc, GFP_KERNEL);
if (ret)
- msi_free_desc(desc);
+ goto fail;
+ return 0;
+
+fail:
+ msi_free_desc(desc);
return ret;
}

@@ -117,7 +130,7 @@ int msi_domain_insert_msi_desc(struct de
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;

- return msi_insert_desc(dev->msi.data, desc, domid, init_desc->msi_index);
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}

static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_de

static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
{
+ unsigned int hwsize;
+
if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
- !dev->msi.data->__domains[ctrl->domid].domain ||
- ctrl->first > ctrl->last ||
- ctrl->first > MSI_MAX_INDEX ||
- ctrl->last > MSI_MAX_INDEX))
+ !dev->msi.data->__domains[ctrl->domid].domain))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
return false;
return true;
}
@@ -208,7 +226,7 @@ static int msi_domain_add_simple_msi_des
desc = msi_alloc_desc(dev, 1, NULL);
if (!desc)
goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, ctrl->domid, idx);
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
if (ret)
goto fail;
}
@@ -407,7 +425,10 @@ unsigned int msi_domain_get_virq(struct
if (!dev->msi.data)
return 0;

- if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ if (WARN_ON_ONCE(index >= msi_domain_get_hwsize(dev, domid)))
return 0;

/* This check is only valid for the PCI default MSI domain */
@@ -569,6 +590,20 @@ static struct irq_domain *msi_get_device
return domain;
}

+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, no size... */
+ return 0;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -1359,7 +1394,7 @@ int msi_domain_alloc_irqs_all_locked(str
struct msi_ctrl ctrl = {
.domid = domid,
.first = 0,
- .last = MSI_MAX_INDEX,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
.nirqs = nirqs,
};

@@ -1473,7 +1508,8 @@ void msi_domain_free_irqs_range(struct d
*/
void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- msi_domain_free_irqs_range_locked(dev, domid, 0, MSI_MAX_INDEX);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}

/**


Subject: [tip: irq/core] genirq/msi: Add range checking to msi_insert_desc()

The following commit has been merged into the irq/core branch of tip:

Commit-ID: 02de943b0519c5940094ed8cd10d348a63ab0646
Gitweb: https://git.kernel.org/tip/02de943b0519c5940094ed8cd10d348a63ab0646
Author: Thomas Gleixner <[email protected]>
AuthorDate: Fri, 25 Nov 2022 00:25:59 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Mon, 05 Dec 2022 19:21:02 +01:00

genirq/msi: Add range checking to msi_insert_desc()

Per device domains provide the real domain size to the core code. This
allows range checking on insertion of MSI descriptors and also paves the
way for dynamic index allocations which are required e.g. for IMS. This
avoids external mechanisms like bitmaps on the device side and just
utilizes the core internal MSI descriptor storxe for it.

Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Kevin Tian <[email protected]>
Acked-by: Marc Zyngier <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/irq/msi.c | 58 ++++++++++++++++++++++++++++++++++++++---------
1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7449998..21a7452 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -40,6 +40,7 @@ struct msi_ctrl {
#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)

static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);


@@ -80,16 +81,28 @@ static void msi_free_desc(struct msi_desc *desc)
kfree(desc);
}

-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc,
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;

+ hwsize = msi_domain_get_hwsize(dev, domid);
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
desc->msi_index = index;
ret = xa_insert(xa, index, desc, GFP_KERNEL);
if (ret)
- msi_free_desc(desc);
+ goto fail;
+ return 0;
+
+fail:
+ msi_free_desc(desc);
return ret;
}

@@ -117,7 +130,7 @@ int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;

- return msi_insert_desc(dev->msi.data, desc, domid, init_desc->msi_index);
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}

static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)

static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
{
+ unsigned int hwsize;
+
if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
- !dev->msi.data->__domains[ctrl->domid].domain ||
- ctrl->first > ctrl->last ||
- ctrl->first > MSI_MAX_INDEX ||
- ctrl->last > MSI_MAX_INDEX))
+ !dev->msi.data->__domains[ctrl->domid].domain))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
return false;
return true;
}
@@ -208,7 +226,7 @@ static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *
desc = msi_alloc_desc(dev, 1, NULL);
if (!desc)
goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, ctrl->domid, idx);
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
if (ret)
goto fail;
}
@@ -406,7 +424,10 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
if (!dev->msi.data)
return 0;

- if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ if (WARN_ON_ONCE(index >= msi_domain_get_hwsize(dev, domid)))
return 0;

/* This check is only valid for the PCI default MSI domain */
@@ -568,6 +589,20 @@ static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int
return domain;
}

+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, no size... */
+ return 0;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -1356,7 +1391,7 @@ int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int
struct msi_ctrl ctrl = {
.domid = domid,
.first = 0,
- .last = MSI_MAX_INDEX,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
.nirqs = nirqs,
};

@@ -1470,7 +1505,8 @@ void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
*/
void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- msi_domain_free_irqs_range_locked(dev, domid, 0, MSI_MAX_INDEX);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}

/**

Subject: [tip: irq/core] genirq/msi: Add range checking to msi_insert_desc()

The following commit has been merged into the irq/core branch of tip:

Commit-ID: 36db3d9003ea85217b357a658cf7b37920c2c38e
Gitweb: https://git.kernel.org/tip/36db3d9003ea85217b357a658cf7b37920c2c38e
Author: Thomas Gleixner <[email protected]>
AuthorDate: Fri, 25 Nov 2022 00:25:59 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Mon, 05 Dec 2022 22:22:32 +01:00

genirq/msi: Add range checking to msi_insert_desc()

Per device domains provide the real domain size to the core code. This
allows range checking on insertion of MSI descriptors and also paves the
way for dynamic index allocations which are required e.g. for IMS. This
avoids external mechanisms like bitmaps on the device side and just
utilizes the core internal MSI descriptor storxe for it.

Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Kevin Tian <[email protected]>
Acked-by: Marc Zyngier <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/irq/msi.c | 53 ++++++++++++++++++++++++++++++++++++++---------
1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7449998..ae58692 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -40,6 +40,7 @@ struct msi_ctrl {
#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)

static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);


@@ -80,16 +81,28 @@ static void msi_free_desc(struct msi_desc *desc)
kfree(desc);
}

-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc,
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;

+ hwsize = msi_domain_get_hwsize(dev, domid);
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
desc->msi_index = index;
ret = xa_insert(xa, index, desc, GFP_KERNEL);
if (ret)
- msi_free_desc(desc);
+ goto fail;
+ return 0;
+
+fail:
+ msi_free_desc(desc);
return ret;
}

@@ -117,7 +130,7 @@ int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;

- return msi_insert_desc(dev->msi.data, desc, domid, init_desc->msi_index);
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}

static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)

static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
{
+ unsigned int hwsize;
+
if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
- !dev->msi.data->__domains[ctrl->domid].domain ||
- ctrl->first > ctrl->last ||
- ctrl->first > MSI_MAX_INDEX ||
- ctrl->last > MSI_MAX_INDEX))
+ !dev->msi.data->__domains[ctrl->domid].domain))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
return false;
return true;
}
@@ -208,7 +226,7 @@ static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *
desc = msi_alloc_desc(dev, 1, NULL);
if (!desc)
goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, ctrl->domid, idx);
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
if (ret)
goto fail;
}
@@ -568,6 +586,20 @@ static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int
return domain;
}

+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, no size... */
+ return 0;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -1356,7 +1388,7 @@ int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int
struct msi_ctrl ctrl = {
.domid = domid,
.first = 0,
- .last = MSI_MAX_INDEX,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
.nirqs = nirqs,
};

@@ -1470,7 +1502,8 @@ void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
*/
void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- msi_domain_free_irqs_range_locked(dev, domid, 0, MSI_MAX_INDEX);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}

/**

2022-12-13 19:50:25

by Guenter Roeck

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

Hi,

On Fri, Nov 25, 2022 at 12:25:59AM +0100, Thomas Gleixner wrote:
> Per device domains provide the real domain size to the core code. This
> allows range checking on insertion of MSI descriptors and also paves the
> way for dynamic index allocations which are required e.g. for IMS. This
> avoids external mechanisms like bitmaps on the device side and just
> utilizes the core internal MSI descriptor storxe for it.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---

This patch results in various s390 qemu test failures.
There is a warning backtrace

12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0

followed by

[ 12.684333] virtio_net: probe of virtio0 failed with error -34

and Ethernet interfaces don't instantiate.

When trying to instantiate virtio-pci and booting from it, I see
the same warning backtrace followed by

[ 9.943123] virtio_blk: probe of virtio0 failed with error -34

and a crash.

A typical backtrace is

[ 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
[ 12.675108] Modules linked in:
[ 12.675346] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G N 6.1.0-03225-g764822972d64 #1
[ 12.675512] Hardware name: QEMU 8561 QEMU (KVM/Linux)
[ 12.675648] Krnl PSW : 0704c00180000000 00000000001ec4c6 (msi_ctrl_valid+0x2e/0xb0)
[ 12.675853] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[ 12.675987] Krnl GPRS: 00000000435318a9 0000000000000000 00000000035510a0 0000000000000000
[ 12.676069] 0000000000000000 000000000000ffff 0000000000000000 0000037fffb1b6c0
[ 12.676151] 0000000000000000 0000037fffb1b658 0000000000000000 0000037fffb1b658
[ 12.676232] 0000000002ae4100 00000000035510a0 0000037fffb1b568 0000037fffb1b538
[ 12.677127] Krnl Code: 00000000001ec4b8: 58303000 l %r3,0(%r3)
[ 12.677127] 00000000001ec4bc: ec3c000f017f clij %r3,1,12,00000000001ec4da
[ 12.677127] #00000000001ec4c2: af000000 mc 0,0
[ 12.677127] >00000000001ec4c6: a7280000 lhi %r2,0
[ 12.677127] 00000000001ec4ca: b9840022 llgcr %r2,%r2
[ 12.677127] 00000000001ec4ce: ebbff0a00004 lmg %r11,%r15,160(%r15)
[ 12.677127] 00000000001ec4d4: c0f400714f1a brcl 15,0000000001016308
[ 12.677127] 00000000001ec4da: b9160033 llgfr %r3,%r3
[ 12.677743] Call Trace:
[ 12.677835] [<00000000001ec4c6>] msi_ctrl_valid+0x2e/0xb0
[ 12.677943] [<00000000001ec58a>] msi_domain_free_descs+0x42/0x120
[ 12.678024] [<00000000001ecaf0>] msi_domain_free_msi_descs_range+0x38/0x48
[ 12.678103] [<00000000009db7ae>] __pci_enable_msix_range+0x44e/0x710
[ 12.678186] [<00000000009d9da4>] pci_alloc_irq_vectors_affinity+0xa4/0x120
[ 12.678268] [<00000000009f5888>] vp_request_msix_vectors+0xb8/0x208
[ 12.678348] [<00000000009f5f24>] vp_find_vqs_msix+0x254/0x2f0
[ 12.678428] [<00000000009f6016>] vp_find_vqs+0x56/0x1f8
[ 12.678508] [<00000000009f4e4e>] vp_modern_find_vqs+0x3e/0x90
[ 12.678587] [<0000000000ad8c14>] virtnet_find_vqs+0x244/0x3e8
[ 12.678669] [<0000000000ad9268>] virtnet_probe+0x4b0/0xca8
[ 12.678748] [<00000000009ed6b4>] virtio_dev_probe+0x1ec/0x418
[ 12.678826] [<0000000000a3c246>] really_probe+0xd6/0x480
[ 12.678906] [<0000000000a3c7a0>] driver_probe_device+0x40/0xf0
[ 12.678985] [<0000000000a3d0e4>] __driver_attach+0xbc/0x228
[ 12.679065] [<0000000000a396c0>] bus_for_each_dev+0x80/0xb8
[ 12.679143] [<0000000000a3b38e>] bus_add_driver+0x1d6/0x260
[ 12.679222] [<0000000000a3dc10>] driver_register+0xa8/0x170
[ 12.679312] [<00000000017b8848>] virtio_net_driver_init+0x88/0xc0

This worked fine in v6.1 and earlier kernels. Bisect log attached.

Guenter

---
# bad: [764822972d64e7f3e6792278ecc7a3b3c81087cd] Merge tag 'nfsd-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux
# good: [830b3c68c1fb1e9176028d02ef86f3cf76aa2476] Linux 6.1
git bisect start 'HEAD' 'v6.1'
# good: [01f3cbb296a9ad378167c01758c99557b5bc3208] Merge tag 'soc-dt-6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc
git bisect good 01f3cbb296a9ad378167c01758c99557b5bc3208
# bad: [e2ed78d5d9ca07a2b9d158ebac366170a2d3083d] Merge tag 'linux-kselftest-kunit-next-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest
git bisect bad e2ed78d5d9ca07a2b9d158ebac366170a2d3083d
# bad: [045e222d0a9dcec152abe0633f538cafd965b12b] Merge tag 'pm-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
git bisect bad 045e222d0a9dcec152abe0633f538cafd965b12b
# good: [f10bc40168032962ebee26894bdbdc972cde35bf] Merge tag 'core-debugobjects-2022-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect good f10bc40168032962ebee26894bdbdc972cde35bf
# bad: [9d33edb20f7e6943250d6bb96ceaf2368f674d51] Merge tag 'irq-core-2022-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
git bisect bad 9d33edb20f7e6943250d6bb96ceaf2368f674d51
# good: [c459f11f32a022d0f97694030419d16816275a9d] genirq/msi: Remove unused alloc/free interfaces
git bisect good c459f11f32a022d0f97694030419d16816275a9d
# bad: [d51a15af37ce8cf59e73de51dcdce3c9f4944974] irqchip/gic-v2m: Mark a few functions __init
git bisect bad d51a15af37ce8cf59e73de51dcdce3c9f4944974
# bad: [4d5a4ccc519ab0a62e220dc8dcd8bc1c5f8fee10] x86/apic/msi: Remove arch_create_remap_msi_irq_domain()
git bisect bad 4d5a4ccc519ab0a62e220dc8dcd8bc1c5f8fee10
# good: [26e91b75bf6108550035355c835bf0c93c885b61] genirq/msi: Provide msi_match_device_domain()
git bisect good 26e91b75bf6108550035355c835bf0c93c885b61
# bad: [15c72f824b32761696b1854500bb3dedccbbb45a] PCI/MSI: Add support for per device MSI[X] domains
git bisect bad 15c72f824b32761696b1854500bb3dedccbbb45a
# bad: [877d6c4e93f5091bfa52549bde8fb9ce71d6f7e5] PCI/MSI: Split __pci_write_msi_msg()
git bisect bad 877d6c4e93f5091bfa52549bde8fb9ce71d6f7e5
# bad: [36db3d9003ea85217b357a658cf7b37920c2c38e] genirq/msi: Add range checking to msi_insert_desc()
git bisect bad 36db3d9003ea85217b357a658cf7b37920c2c38e
# first bad commit: [36db3d9003ea85217b357a658cf7b37920c2c38e] genirq/msi: Add range checking to msi_insert_desc()

2022-12-14 10:07:27

by Niklas Schnelle

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
> Hi,
>
> On Fri, Nov 25, 2022 at 12:25:59AM +0100, Thomas Gleixner wrote:
> > Per device domains provide the real domain size to the core code. This
> > allows range checking on insertion of MSI descriptors and also paves the
> > way for dynamic index allocations which are required e.g. for IMS. This
> > avoids external mechanisms like bitmaps on the device side and just
> > utilizes the core internal MSI descriptor storxe for it.
> >
> > Signed-off-by: Thomas Gleixner <[email protected]>
> > ---
>
> This patch results in various s390 qemu test failures.
> There is a warning backtrace
>
> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
>
> followed by
>
> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
>
> and Ethernet interfaces don't instantiate.
>
> When trying to instantiate virtio-pci and booting from it, I see
> the same warning backtrace followed by
>
> [ 9.943123] virtio_blk: probe of virtio0 failed with error -34
>
> and a crash.
>
> A typical backtrace is
>
> [ 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
> [ 12.675108] Modules linked in:
> [ 12.675346] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G N 6.1.0-03225-g764822972d64 #1
> [ 12.675512] Hardware name: QEMU 8561 QEMU (KVM/Linux)
> [ 12.675648] Krnl PSW : 0704c00180000000 00000000001ec4c6 (msi_ctrl_valid+0x2e/0xb0)
> [ 12.675853] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [ 12.675987] Krnl GPRS: 00000000435318a9 0000000000000000 00000000035510a0 0000000000000000
> [ 12.676069] 0000000000000000 000000000000ffff 0000000000000000 0000037fffb1b6c0
> [ 12.676151] 0000000000000000 0000037fffb1b658 0000000000000000 0000037fffb1b658
> [ 12.676232] 0000000002ae4100 00000000035510a0 0000037fffb1b568 0000037fffb1b538
> [ 12.677127] Krnl Code: 00000000001ec4b8: 58303000 l %r3,0(%r3)
> [ 12.677127] 00000000001ec4bc: ec3c000f017f clij %r3,1,12,00000000001ec4da
> [ 12.677127] #00000000001ec4c2: af000000 mc 0,0
> [ 12.677127] >00000000001ec4c6: a7280000 lhi %r2,0
> [ 12.677127] 00000000001ec4ca: b9840022 llgcr %r2,%r2
> [ 12.677127] 00000000001ec4ce: ebbff0a00004 lmg %r11,%r15,160(%r15)
> [ 12.677127] 00000000001ec4d4: c0f400714f1a brcl 15,0000000001016308
> [ 12.677127] 00000000001ec4da: b9160033 llgfr %r3,%r3
> [ 12.677743] Call Trace:
> [ 12.677835] [<00000000001ec4c6>] msi_ctrl_valid+0x2e/0xb0
> [ 12.677943] [<00000000001ec58a>] msi_domain_free_descs+0x42/0x120
> [ 12.678024] [<00000000001ecaf0>] msi_domain_free_msi_descs_range+0x38/0x48
> [ 12.678103] [<00000000009db7ae>] __pci_enable_msix_range+0x44e/0x710
> [ 12.678186] [<00000000009d9da4>] pci_alloc_irq_vectors_affinity+0xa4/0x120
> [ 12.678268] [<00000000009f5888>] vp_request_msix_vectors+0xb8/0x208
> [ 12.678348] [<00000000009f5f24>] vp_find_vqs_msix+0x254/0x2f0
> [ 12.678428] [<00000000009f6016>] vp_find_vqs+0x56/0x1f8
> [ 12.678508] [<00000000009f4e4e>] vp_modern_find_vqs+0x3e/0x90
> [ 12.678587] [<0000000000ad8c14>] virtnet_find_vqs+0x244/0x3e8
> [ 12.678669] [<0000000000ad9268>] virtnet_probe+0x4b0/0xca8
> [ 12.678748] [<00000000009ed6b4>] virtio_dev_probe+0x1ec/0x418
> [ 12.678826] [<0000000000a3c246>] really_probe+0xd6/0x480
> [ 12.678906] [<0000000000a3c7a0>] driver_probe_device+0x40/0xf0
> [ 12.678985] [<0000000000a3d0e4>] __driver_attach+0xbc/0x228
> [ 12.679065] [<0000000000a396c0>] bus_for_each_dev+0x80/0xb8
> [ 12.679143] [<0000000000a3b38e>] bus_add_driver+0x1d6/0x260
> [ 12.679222] [<0000000000a3dc10>] driver_register+0xa8/0x170
> [ 12.679312] [<00000000017b8848>] virtio_net_driver_init+0x88/0xc0
>
> This worked fine in v6.1 and earlier kernels. Bisect log attached.
>
> Guenter

Yes, we were about to report the same issue. Currently in linux-next
PCI support is broken for both ConnectX based NICs, NVMes etc. Matthew
Rosato bisected this to the above mentioned commit on Monday and was I
believe still investigating details.


As far as I'm aware so far he tracked this down to code calling
msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
which then returns NULL leading to msi_domain_get_hwsize() returning 0.
I think this is related to the fact that we currently don't use IRQ
domains.

Thanks,
Niklas

2022-12-15 15:07:23

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Wed, Dec 14 2022 at 10:42, Niklas Schnelle wrote:
> On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
>> This patch results in various s390 qemu test failures.
>> There is a warning backtrace
>>
>> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
>>
>> followed by
>>
>> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
>>
>> and Ethernet interfaces don't instantiate.
> As far as I'm aware so far he tracked this down to code calling
> msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
> which then returns NULL leading to msi_domain_get_hwsize() returning 0.
> I think this is related to the fact that we currently don't use IRQ
> domains.

Correct and for some stupid reason I thought 0 is a good return value
here :)



diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index bd4d4dd626b4..8fb10f216dc0 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -609,8 +609,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
info = domain->host_data;
return info->hwsize;
}
- /* No domain, no size... */
- return 0;
+ /* No domain, default to MSI_MAX_INDEX */
+ return MSI_MAX_INDEX;
}

static inline void irq_chip_write_msi_msg(struct irq_data *data,

2022-12-15 17:34:44

by Matthew Rosato

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/15/22 9:49 AM, Thomas Gleixner wrote:
> On Wed, Dec 14 2022 at 10:42, Niklas Schnelle wrote:
>> On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
>>> This patch results in various s390 qemu test failures.
>>> There is a warning backtrace
>>>
>>> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
>>>
>>> followed by
>>>
>>> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
>>>
>>> and Ethernet interfaces don't instantiate.
>> As far as I'm aware so far he tracked this down to code calling
>> msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
>> which then returns NULL leading to msi_domain_get_hwsize() returning 0.
>> I think this is related to the fact that we currently don't use IRQ
>> domains.
>
> Correct and for some stupid reason I thought 0 is a good return value
> here :)
>
>
>
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index bd4d4dd626b4..8fb10f216dc0 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -609,8 +609,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
> info = domain->host_data;
> return info->hwsize;
> }
> - /* No domain, no size... */
> - return 0;
> + /* No domain, default to MSI_MAX_INDEX */
> + return MSI_MAX_INDEX;
> }
>
> static inline void irq_chip_write_msi_msg(struct irq_data *data,

Ah, that makes sense... So, with that diff applied, that fixes most of the issues I'm seeing incl. the virtio one that Guenter mentioned. But it looks like NVMe devices are still broken on s390 with a different backtrace -- the bisect for that one points to another patch in part2 of this work and looks like another issue with missing irq domain:

40742716f294 genirq/msi: Make msi_add_simple_msi_descs() device domain aware


[ 4.308861] ------------[ cut here ]------------
[ 4.308865] WARNING: CPU: 7 PID: 9 at kernel/irq/msi.c:167 msi_domain_free_msi_descs_range+0x3c/0xd0
[ 4.308877] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
[ 4.308896] CPU: 7 PID: 9 Comm: kworker/u20:0 Not tainted 6.1.0 #179
[ 4.308898] Hardware name: IBM 3931 A01 782 (KVM/Linux)
[ 4.308900] Workqueue: events_unbound async_run_entry_fn
[ 4.308905] Krnl PSW : 0704c00180000000 00000000b6426b78 (msi_domain_free_msi_descs_range+0x40/0xd0)
[ 4.308909] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[ 4.308911] Krnl GPRS: 0700000080eda0a0 0000000000000000 0000000080eda0a0 0000000000000000
[ 4.308913] 0000000000000000 0000000000000000 0000000000000cc0 0000000080eda000
[ 4.308914] 00000000b7ddc000 0000000091934aa8 000000000000ffff 0000000000000000
[ 4.308915] 0000000080344200 0000000080f2b1c0 0000037fffb5b918 0000037fffb5b8c8
[ 4.308924] Krnl Code: 00000000b6426b68: e54cf0ac0000 mvhi 172(%r15),0
[ 4.308924] 00000000b6426b6e: ec3c000b017f clij %r3,1,12,00000000b6426b84
[ 4.308924] #00000000b6426b74: af000000 mc 0,0
[ 4.308924] >00000000b6426b78: eb9ff0b00004 lmg %r9,%r15,176(%r15)
[ 4.308924] 00000000b6426b7e: 07fe bcr 15,%r14
[ 4.308924] 00000000b6426b80: 47000700 bc 0,1792
[ 4.308924] 00000000b6426b84: b90400a5 lgr %r10,%r5
[ 4.308924] 00000000b6426b88: b9040013 lgr %r1,%r3
[ 4.308935] Call Trace:
[ 4.308938] [<00000000b6426b78>] msi_domain_free_msi_descs_range+0x40/0xd0
[ 4.308945] [<00000000b6bb126e>] pci_free_msi_irqs+0x26/0x48
[ 4.308950] [<00000000b6baf4d4>] pci_disable_msix+0x6c/0x80
[ 4.308954] [<00000000b6baf716>] pci_free_irq_vectors+0x26/0x88
[ 4.308956] [<000003ff7fdfa8f4>] nvme_setup_io_queues+0x18c/0x398 [nvme]
[ 4.308968] [<000003ff7fdfbf1e>] nvme_probe+0x2e6/0x3b0 [nvme]
[ 4.308972] [<00000000b6ba44cc>] local_pci_probe+0x44/0x80
[ 4.308974] [<00000000b6ba46d8>] pci_call_probe+0x50/0x180
[ 4.308976] [<00000000b6ba5166>] pci_device_probe+0xae/0x110
[ 4.308978] [<00000000b6c0a19a>] really_probe+0xd2/0x480
[ 4.308982] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
[ 4.308984] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
[ 4.308986] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
[ 4.308987] [<00000000b63c1368>] process_one_work+0x200/0x458
[ 4.308991] [<00000000b63c1aee>] worker_thread+0x66/0x480
[ 4.308993] [<00000000b63caa00>] kthread+0x108/0x110
[ 4.308996] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
[ 4.308999] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
[ 4.309006] Last Breaking-Event-Address:
[ 4.309007] [<00000000b6426ba8>] msi_domain_free_msi_descs_range+0x70/0xd0
[ 4.309009] ---[ end trace 0000000000000000 ]---
[ 8.957187] nvme: probe of 0003:00:00.0 failed with error -22
[ 8.957216] ------------[ cut here ]------------
[ 8.957217] WARNING: CPU: 5 PID: 9 at kernel/irq/msi.c:275 msi_device_data_release+0x76/0xa0
[ 8.957229] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
[ 8.957248] CPU: 5 PID: 9 Comm: kworker/u20:0 Tainted: G W 6.1.0 #179
[ 8.957252] Hardware name: IBM 3931 A01 782 (KVM/Linux)
[ 8.957254] Workqueue: events_unbound async_run_entry_fn
[ 8.957259] Krnl PSW : 0704e00180000000 00000000b642729a (msi_device_data_release+0x7a/0xa0)
[ 8.957262] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
[ 8.957265] Krnl GPRS: a813fdc020800000 00000000928b6840 0000000091934ab8 0000000080344200
[ 8.957267] 0000000000000000 0000000091934a80 0000000080d79988 0000000000000000
[ 8.957268] 0000000080eda0a0 0000037fffb5bc10 0000000080eda0a0 0000000091934aa8
[ 8.957270] 0000000080344200 00000000800e0402 00000000b642724e 0000037fffb5bad0
[ 8.957279] Krnl Code: 00000000b642728c: f0a0000407fe srp 4(11,%r0),2046,0
[ 8.957279] 00000000b6427292: 47000700 bc 0,1792
[ 8.957279] #00000000b6427296: af000000 mc 0,0
[ 8.957279] >00000000b642729a: a7f4ffdf brc 15,00000000b6427258
[ 8.957279] 00000000b642729e: af000000 mc 0,0
[ 8.957279] 00000000b64272a2: 4120b048 la %r2,72(%r11)
[ 8.957279] 00000000b64272a6: c0e5005a0c4d brasl %r14,00000000b6f68b40
[ 8.957279] 00000000b64272ac: e548a1180000 mvghi 280(%r10),0
[ 8.957290] Call Trace:
[ 8.957292] [<00000000b642729a>] msi_device_data_release+0x7a/0xa0
[ 8.957295] ([<00000000b642724e>] msi_device_data_release+0x2e/0xa0)
[ 8.957298] [<00000000b6c0f608>] release_nodes+0x50/0xd8
[ 8.957305] [<00000000b6c111aa>] devres_release_all+0xaa/0xf0
[ 8.957308] [<00000000b6c0a2f2>] really_probe+0x22a/0x480
[ 8.957310] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
[ 8.957312] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
[ 8.957314] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
[ 8.957315] [<00000000b63c1368>] process_one_work+0x200/0x458
[ 8.957320] [<00000000b63c1aee>] worker_thread+0x66/0x480
[ 8.957322] [<00000000b63caa00>] kthread+0x108/0x110
[ 8.957325] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
[ 8.957328] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
[ 8.957336] Last Breaking-Event-Address:
[ 8.957337] [<00000000b6427254>] msi_device_data_release+0x34/0xa0
[ 8.957339] ---[ end trace 0000000000000000 ]---

The line number for the first warning points to the WARN_ON check in msi_ctrl_valid -- specifically it's the !dev->msi.data->__domains[ctrl->domid].domain check that is failing.

The second warning is the WARN_ON_ONCE(!xa_empty(&md->__domains[i].store)) check in msi_device_data_release, presumably a victim of backing out after the first error.

2022-12-15 21:56:49

by Guenter Roeck

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/15/22 08:23, Matthew Rosato wrote:
> On 12/15/22 9:49 AM, Thomas Gleixner wrote:
>> On Wed, Dec 14 2022 at 10:42, Niklas Schnelle wrote:
>>> On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
>>>> This patch results in various s390 qemu test failures.
>>>> There is a warning backtrace
>>>>
>>>> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
>>>>
>>>> followed by
>>>>
>>>> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
>>>>
>>>> and Ethernet interfaces don't instantiate.
>>> As far as I'm aware so far he tracked this down to code calling
>>> msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
>>> which then returns NULL leading to msi_domain_get_hwsize() returning 0.
>>> I think this is related to the fact that we currently don't use IRQ
>>> domains.
>>
>> Correct and for some stupid reason I thought 0 is a good return value
>> here :)
>>
>>
>>
>> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
>> index bd4d4dd626b4..8fb10f216dc0 100644
>> --- a/kernel/irq/msi.c
>> +++ b/kernel/irq/msi.c
>> @@ -609,8 +609,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
>> info = domain->host_data;
>> return info->hwsize;
>> }
>> - /* No domain, no size... */
>> - return 0;
>> + /* No domain, default to MSI_MAX_INDEX */
>> + return MSI_MAX_INDEX;
>> }
>>
>> static inline void irq_chip_write_msi_msg(struct irq_data *data,
>
> Ah, that makes sense... So, with that diff applied, that fixes most of the issues I'm seeing incl. the virtio one that Guenter mentioned. But it looks like NVMe devices are still broken on s390 with a different backtrace -- the bisect for that one points to another patch in part2 of this work and looks like another issue with missing irq domain:
>
> 40742716f294 genirq/msi: Make msi_add_simple_msi_descs() device domain aware
>
>
> [ 4.308861] ------------[ cut here ]------------
> [ 4.308865] WARNING: CPU: 7 PID: 9 at kernel/irq/msi.c:167 msi_domain_free_msi_descs_range+0x3c/0xd0
> [ 4.308877] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
> [ 4.308896] CPU: 7 PID: 9 Comm: kworker/u20:0 Not tainted 6.1.0 #179
> [ 4.308898] Hardware name: IBM 3931 A01 782 (KVM/Linux)
> [ 4.308900] Workqueue: events_unbound async_run_entry_fn
> [ 4.308905] Krnl PSW : 0704c00180000000 00000000b6426b78 (msi_domain_free_msi_descs_range+0x40/0xd0)
> [ 4.308909] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [ 4.308911] Krnl GPRS: 0700000080eda0a0 0000000000000000 0000000080eda0a0 0000000000000000
> [ 4.308913] 0000000000000000 0000000000000000 0000000000000cc0 0000000080eda000
> [ 4.308914] 00000000b7ddc000 0000000091934aa8 000000000000ffff 0000000000000000
> [ 4.308915] 0000000080344200 0000000080f2b1c0 0000037fffb5b918 0000037fffb5b8c8
> [ 4.308924] Krnl Code: 00000000b6426b68: e54cf0ac0000 mvhi 172(%r15),0
> [ 4.308924] 00000000b6426b6e: ec3c000b017f clij %r3,1,12,00000000b6426b84
> [ 4.308924] #00000000b6426b74: af000000 mc 0,0
> [ 4.308924] >00000000b6426b78: eb9ff0b00004 lmg %r9,%r15,176(%r15)
> [ 4.308924] 00000000b6426b7e: 07fe bcr 15,%r14
> [ 4.308924] 00000000b6426b80: 47000700 bc 0,1792
> [ 4.308924] 00000000b6426b84: b90400a5 lgr %r10,%r5
> [ 4.308924] 00000000b6426b88: b9040013 lgr %r1,%r3
> [ 4.308935] Call Trace:
> [ 4.308938] [<00000000b6426b78>] msi_domain_free_msi_descs_range+0x40/0xd0
> [ 4.308945] [<00000000b6bb126e>] pci_free_msi_irqs+0x26/0x48
> [ 4.308950] [<00000000b6baf4d4>] pci_disable_msix+0x6c/0x80
> [ 4.308954] [<00000000b6baf716>] pci_free_irq_vectors+0x26/0x88
> [ 4.308956] [<000003ff7fdfa8f4>] nvme_setup_io_queues+0x18c/0x398 [nvme]
> [ 4.308968] [<000003ff7fdfbf1e>] nvme_probe+0x2e6/0x3b0 [nvme]
> [ 4.308972] [<00000000b6ba44cc>] local_pci_probe+0x44/0x80
> [ 4.308974] [<00000000b6ba46d8>] pci_call_probe+0x50/0x180
> [ 4.308976] [<00000000b6ba5166>] pci_device_probe+0xae/0x110
> [ 4.308978] [<00000000b6c0a19a>] really_probe+0xd2/0x480
> [ 4.308982] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
> [ 4.308984] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
> [ 4.308986] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
> [ 4.308987] [<00000000b63c1368>] process_one_work+0x200/0x458
> [ 4.308991] [<00000000b63c1aee>] worker_thread+0x66/0x480
> [ 4.308993] [<00000000b63caa00>] kthread+0x108/0x110
> [ 4.308996] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
> [ 4.308999] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
> [ 4.309006] Last Breaking-Event-Address:
> [ 4.309007] [<00000000b6426ba8>] msi_domain_free_msi_descs_range+0x70/0xd0
> [ 4.309009] ---[ end trace 0000000000000000 ]---
> [ 8.957187] nvme: probe of 0003:00:00.0 failed with error -22

With this patch applied, I see the same error on powerpc, followed by

WARNING: CPU: 0 PID: 21 at arch/powerpc/kernel/irq.c:348 .virq_to_hw+0x1c/0x60
Modules linked in:
CPU: 0 PID: 21 Comm: kworker/u2:2 Tainted: G W N 6.1.0-10397-g8a1566869bf4 #1
Hardware name: QEMU ppce500 e5500 0x80240020 QEMU e500
Workqueue: events_unbound .async_run_entry_fn
NIP: c00000000000415c LR: c000000000004150 CTR: 0000000000000000
REGS: c0000000043c2f70 TRAP: 0700 Tainted: G W N (6.1.0-10397-g8a1566869bf4)
MSR: 0000000080029002 <CE,EE,ME> CR: 24828842 XER: 00000000
IRQMASK: 0
GPR00: c000000000004150 c0000000043c3210 c00000000169dd00 0000000000000000
GPR04: 0000000000000013 0000000000000000 0000000000000000 c000000005000368
GPR08: c0000000050002a8 0000000000000001 c000000005000360 fffffffffffffffd
GPR12: c00000000186a400 c0000000025f2000 c0000000060da000 c00000000433ca40
GPR16: 0000000000000004 c00000000617a020 0000000000000000 0000000000000000
GPR20: 0000000000000001 0000000000000000 c0000000043c3758 c00000000617a020
GPR24: 0000000000000000 0000000000000001 0000000000000001 c0000000043810b8
GPR28: c0000000043810b8 0000000000000041 fffffffffffffff0 c00000000602e4b8
NIP [c00000000000415c] .virq_to_hw+0x1c/0x60
LR [c000000000004150] .virq_to_hw+0x10/0x60
Call Trace:
[c0000000043c3210] [c000000000004150] .virq_to_hw+0x10/0x60 (unreliable)
[c0000000043c3280] [c000000000041d58] .fsl_teardown_msi_irqs+0x48/0xe0
[c0000000043c3310] [c00000000002b204] .arch_teardown_msi_irqs+0x34/0x50
[c0000000043c3380] [c0000000008d6e68] .pci_msi_legacy_teardown_msi_irqs+0x28/0x40
[c0000000043c3400] [c0000000008d66c0] .pci_msi_teardown_msi_irqs+0x30/0xa0
[c0000000043c3480] [c0000000008d590c] .__pci_enable_msix_range+0x5fc/0x990
[c0000000043c35e0] [c0000000008d3934] .pci_alloc_irq_vectors_affinity+0x154/0x1d0
[c0000000043c36c0] [c000000000a74360] .nvme_setup_io_queues+0x2b0/0x9c0
[c0000000043c3830] [c000000000a76298] .nvme_probe+0x538/0x620
[c0000000043c38d0] [c0000000008c6e84] .pci_device_probe+0xc4/0x190
[c0000000043c3960] [c0000000009a9f38] .really_probe+0x108/0x460
[c0000000043c39f0] [c0000000009aa3a4] .driver_probe_device+0x44/0x120
[c0000000043c3a80] [c0000000009aa4e4] .__driver_attach_async_helper+0x64/0x120
[c0000000043c3b10] [c000000000094ca0] .async_run_entry_fn+0x50/0x140
[c0000000043c3bb0] [c000000000081e98] .process_one_work+0x2d8/0x7b0
[c0000000043c3c90] [c000000000082408] .worker_thread+0x98/0x4f0
[c0000000043c3d70] [c00000000008f2ac] .kthread+0x13c/0x150
[c0000000043c3e10] [c0000000000005d8] .ret_from_kernel_thread+0x58/0x60
Instruction dump:
78630020 ebc1fff0 ebe1fff8 7c0803a6 4e800020 7c0802a6 f8010010 f821ff91
480fb275 60000000 7c690074 7929d182 <0b090000> 38210070 e8630008 e8010010
irq event stamp: 96256
hardirqs last enabled at (96255): [<c000000001114314>] ._raw_spin_unlock_irqrestore+0x84/0xd0
hardirqs last disabled at (96256): [<c000000000010b68>] .program_check_exception+0x38/0x120
softirqs last enabled at (96188): [<c00000000111557c>] .__do_softirq+0x60c/0x678
softirqs last disabled at (96181): [<c000000000004d30>] .do_softirq_own_stack+0x30/0x50
---[ end trace 0000000000000000 ]---
Kernel attempted to read user page (d8) - exploit attempt? (uid: 0)
BUG: Kernel NULL pointer dereference on read at 0x000000d8
Faulting instruction address: 0xc0000000000e5540
Oops: Kernel access of bad area, sig: 11 [#1]
BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
Modules linked in:
CPU: 0 PID: 21 Comm: kworker/u2:2 Tainted: G W N 6.1.0-10397-g8a1566869bf4 #1
Hardware name: QEMU ppce500 e5500 0x80240020 QEMU e500
Workqueue: events_unbound .async_run_entry_fn
NIP: c0000000000e5540 LR: c0000000000e44a4 CTR: 0000000000000000
REGS: c0000000043c2c90 TRAP: 0300 Tainted: G W N (6.1.0-10397-g8a1566869bf4)
MSR: 0000000080029002 <CE,EE,ME> CR: 44828842 XER: 00000000
DEAR: 00000000000000d8 ESR: 0000000000000000 IRQMASK: 1
GPR00: c0000000000e44a4 c0000000043c2f30 c00000000169dd00 00000000000000d8
GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000001
GPR08: 0000000000000001 0000000000000001 c00000000003f5a4 0000000000000001
GPR12: 0000000044828842 c0000000025f2000 00000000000000d8 0000000000000000
GPR16: 0000000000000004 c00000000617a020 c0000000043e8040 0000000000000000
GPR20: 0000000000000001 c0000000019e5918 c00000000182e738 0000000000000000
GPR24: 0000000000000000 0000000000000001 0000000000000000 00000000000000d8
GPR28: 0000000000000001 0000000000000000 0000000000000000 c00000000182e738
NIP [c0000000000e5540] .__lock_acquire+0x2f0/0x1e90
LR [c0000000000e44a4] .lock_acquire+0x144/0x450
Call Trace:
[c0000000043c2f30] [c0000000043c2ff0] 0xc0000000043c2ff0 (unreliable)
[c0000000043c3060] [c0000000000e44a4] .lock_acquire+0x144/0x450
[c0000000043c3160] [c000000001113ebc] ._raw_spin_lock_irqsave+0x5c/0xb0
[c0000000043c31f0] [c00000000003f5a4] .msi_bitmap_free_hwirqs+0x34/0x90
[c0000000043c3280] [c000000000041da4] .fsl_teardown_msi_irqs+0x94/0xe0
[c0000000043c3310] [c00000000002b204] .arch_teardown_msi_irqs+0x34/0x50
[c0000000043c3380] [c0000000008d6e68] .pci_msi_legacy_teardown_msi_irqs+0x28/0x40
[c0000000043c3400] [c0000000008d66c0] .pci_msi_teardown_msi_irqs+0x30/0xa0
[c0000000043c3480] [c0000000008d590c] .__pci_enable_msix_range+0x5fc/0x990
[c0000000043c35e0] [c0000000008d3934] .pci_alloc_irq_vectors_affinity+0x154/0x1d0
[c0000000043c36c0] [c000000000a74360] .nvme_setup_io_queues+0x2b0/0x9c0
[c0000000043c3830] [c000000000a76298] .nvme_probe+0x538/0x620
[c0000000043c38d0] [c0000000008c6e84] .pci_device_probe+0xc4/0x190
[c0000000043c3960] [c0000000009a9f38] .really_probe+0x108/0x460
[c0000000043c39f0] [c0000000009aa3a4] .driver_probe_device+0x44/0x120
[c0000000043c3a80] [c0000000009aa4e4] .__driver_attach_async_helper+0x64/0x120
[c0000000043c3b10] [c000000000094ca0] .async_run_entry_fn+0x50/0x140
[c0000000043c3bb0] [c000000000081e98] .process_one_work+0x2d8/0x7b0
[c0000000043c3c90] [c000000000082408] .worker_thread+0x98/0x4f0
[c0000000043c3d70] [c00000000008f2ac] .kthread+0x13c/0x150
[c0000000043c3e10] [c0000000000005d8] .ret_from_kernel_thread+0x58/0x60
Instruction dump:
2c0a0000 40c200bc 3c82ffe4 3c62ffe3 38841f48 38639c50 4bf6eeb1 60000000
0fe00000 60000000 60000000 60000000 <e9430000> 3d22006c 3929cc98 7c2a4800
---[ end trace 0000000000000000 ]---

Guenter


2022-12-16 10:11:57

by Marc Zyngier

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Thu, 15 Dec 2022 16:23:20 +0000,
Matthew Rosato <[email protected]> wrote:
>
> On 12/15/22 9:49 AM, Thomas Gleixner wrote:
> > On Wed, Dec 14 2022 at 10:42, Niklas Schnelle wrote:
> >> On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
> >>> This patch results in various s390 qemu test failures.
> >>> There is a warning backtrace
> >>>
> >>> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
> >>>
> >>> followed by
> >>>
> >>> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
> >>>
> >>> and Ethernet interfaces don't instantiate.
> >> As far as I'm aware so far he tracked this down to code calling
> >> msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
> >> which then returns NULL leading to msi_domain_get_hwsize() returning 0.
> >> I think this is related to the fact that we currently don't use IRQ
> >> domains.
> >
> > Correct and for some stupid reason I thought 0 is a good return value
> > here :)
> >
> >
> >
> > diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> > index bd4d4dd626b4..8fb10f216dc0 100644
> > --- a/kernel/irq/msi.c
> > +++ b/kernel/irq/msi.c
> > @@ -609,8 +609,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
> > info = domain->host_data;
> > return info->hwsize;
> > }
> > - /* No domain, no size... */
> > - return 0;
> > + /* No domain, default to MSI_MAX_INDEX */
> > + return MSI_MAX_INDEX;
> > }
> >
> > static inline void irq_chip_write_msi_msg(struct irq_data *data,
>
> Ah, that makes sense... So, with that diff applied, that fixes most of the issues I'm seeing incl. the virtio one that Guenter mentioned. But it looks like NVMe devices are still broken on s390 with a different backtrace -- the bisect for that one points to another patch in part2 of this work and looks like another issue with missing irq domain:
>
> 40742716f294 genirq/msi: Make msi_add_simple_msi_descs() device domain aware
>
>
> [ 4.308861] ------------[ cut here ]------------
> [ 4.308865] WARNING: CPU: 7 PID: 9 at kernel/irq/msi.c:167 msi_domain_free_msi_descs_range+0x3c/0xd0
> [ 4.308877] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
> [ 4.308896] CPU: 7 PID: 9 Comm: kworker/u20:0 Not tainted 6.1.0 #179
> [ 4.308898] Hardware name: IBM 3931 A01 782 (KVM/Linux)
> [ 4.308900] Workqueue: events_unbound async_run_entry_fn
> [ 4.308905] Krnl PSW : 0704c00180000000 00000000b6426b78 (msi_domain_free_msi_descs_range+0x40/0xd0)
> [ 4.308909] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [ 4.308911] Krnl GPRS: 0700000080eda0a0 0000000000000000 0000000080eda0a0 0000000000000000
> [ 4.308913] 0000000000000000 0000000000000000 0000000000000cc0 0000000080eda000
> [ 4.308914] 00000000b7ddc000 0000000091934aa8 000000000000ffff 0000000000000000
> [ 4.308915] 0000000080344200 0000000080f2b1c0 0000037fffb5b918 0000037fffb5b8c8
> [ 4.308924] Krnl Code: 00000000b6426b68: e54cf0ac0000 mvhi 172(%r15),0
> [ 4.308924] 00000000b6426b6e: ec3c000b017f clij %r3,1,12,00000000b6426b84
> [ 4.308924] #00000000b6426b74: af000000 mc 0,0
> [ 4.308924] >00000000b6426b78: eb9ff0b00004 lmg %r9,%r15,176(%r15)
> [ 4.308924] 00000000b6426b7e: 07fe bcr 15,%r14
> [ 4.308924] 00000000b6426b80: 47000700 bc 0,1792
> [ 4.308924] 00000000b6426b84: b90400a5 lgr %r10,%r5
> [ 4.308924] 00000000b6426b88: b9040013 lgr %r1,%r3
> [ 4.308935] Call Trace:
> [ 4.308938] [<00000000b6426b78>] msi_domain_free_msi_descs_range+0x40/0xd0
> [ 4.308945] [<00000000b6bb126e>] pci_free_msi_irqs+0x26/0x48
> [ 4.308950] [<00000000b6baf4d4>] pci_disable_msix+0x6c/0x80
> [ 4.308954] [<00000000b6baf716>] pci_free_irq_vectors+0x26/0x88
> [ 4.308956] [<000003ff7fdfa8f4>] nvme_setup_io_queues+0x18c/0x398 [nvme]
> [ 4.308968] [<000003ff7fdfbf1e>] nvme_probe+0x2e6/0x3b0 [nvme]
> [ 4.308972] [<00000000b6ba44cc>] local_pci_probe+0x44/0x80
> [ 4.308974] [<00000000b6ba46d8>] pci_call_probe+0x50/0x180
> [ 4.308976] [<00000000b6ba5166>] pci_device_probe+0xae/0x110
> [ 4.308978] [<00000000b6c0a19a>] really_probe+0xd2/0x480
> [ 4.308982] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
> [ 4.308984] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
> [ 4.308986] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
> [ 4.308987] [<00000000b63c1368>] process_one_work+0x200/0x458
> [ 4.308991] [<00000000b63c1aee>] worker_thread+0x66/0x480
> [ 4.308993] [<00000000b63caa00>] kthread+0x108/0x110
> [ 4.308996] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
> [ 4.308999] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
> [ 4.309006] Last Breaking-Event-Address:
> [ 4.309007] [<00000000b6426ba8>] msi_domain_free_msi_descs_range+0x70/0xd0
> [ 4.309009] ---[ end trace 0000000000000000 ]---
> [ 8.957187] nvme: probe of 0003:00:00.0 failed with error -22
> [ 8.957216] ------------[ cut here ]------------
> [ 8.957217] WARNING: CPU: 5 PID: 9 at kernel/irq/msi.c:275 msi_device_data_release+0x76/0xa0
> [ 8.957229] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
> [ 8.957248] CPU: 5 PID: 9 Comm: kworker/u20:0 Tainted: G W 6.1.0 #179
> [ 8.957252] Hardware name: IBM 3931 A01 782 (KVM/Linux)
> [ 8.957254] Workqueue: events_unbound async_run_entry_fn
> [ 8.957259] Krnl PSW : 0704e00180000000 00000000b642729a (msi_device_data_release+0x7a/0xa0)
> [ 8.957262] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
> [ 8.957265] Krnl GPRS: a813fdc020800000 00000000928b6840 0000000091934ab8 0000000080344200
> [ 8.957267] 0000000000000000 0000000091934a80 0000000080d79988 0000000000000000
> [ 8.957268] 0000000080eda0a0 0000037fffb5bc10 0000000080eda0a0 0000000091934aa8
> [ 8.957270] 0000000080344200 00000000800e0402 00000000b642724e 0000037fffb5bad0
> [ 8.957279] Krnl Code: 00000000b642728c: f0a0000407fe srp 4(11,%r0),2046,0
> [ 8.957279] 00000000b6427292: 47000700 bc 0,1792
> [ 8.957279] #00000000b6427296: af000000 mc 0,0
> [ 8.957279] >00000000b642729a: a7f4ffdf brc 15,00000000b6427258
> [ 8.957279] 00000000b642729e: af000000 mc 0,0
> [ 8.957279] 00000000b64272a2: 4120b048 la %r2,72(%r11)
> [ 8.957279] 00000000b64272a6: c0e5005a0c4d brasl %r14,00000000b6f68b40
> [ 8.957279] 00000000b64272ac: e548a1180000 mvghi 280(%r10),0
> [ 8.957290] Call Trace:
> [ 8.957292] [<00000000b642729a>] msi_device_data_release+0x7a/0xa0
> [ 8.957295] ([<00000000b642724e>] msi_device_data_release+0x2e/0xa0)
> [ 8.957298] [<00000000b6c0f608>] release_nodes+0x50/0xd8
> [ 8.957305] [<00000000b6c111aa>] devres_release_all+0xaa/0xf0
> [ 8.957308] [<00000000b6c0a2f2>] really_probe+0x22a/0x480
> [ 8.957310] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
> [ 8.957312] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
> [ 8.957314] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
> [ 8.957315] [<00000000b63c1368>] process_one_work+0x200/0x458
> [ 8.957320] [<00000000b63c1aee>] worker_thread+0x66/0x480
> [ 8.957322] [<00000000b63caa00>] kthread+0x108/0x110
> [ 8.957325] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
> [ 8.957328] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
> [ 8.957336] Last Breaking-Event-Address:
> [ 8.957337] [<00000000b6427254>] msi_device_data_release+0x34/0xa0
> [ 8.957339] ---[ end trace 0000000000000000 ]---
>
> The line number for the first warning points to the WARN_ON check in msi_ctrl_valid -- specifically it's the !dev->msi.data->__domains[ctrl->domid].domain check that is failing.
>
> The second warning is the WARN_ON_ONCE(!xa_empty(&md->__domains[i].store)) check in msi_device_data_release, presumably a victim of backing out after the first error.
>

Yeah, the non-irqdomain legacy path definitely wounds up here, and we
end-up leaking descriptors. If the following hack works for you, I'll
ferry the two fixes to Linus asap.

Thanks,

M.

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index bd4d4dd626b4..9921dc57f1b4 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -165,7 +165,8 @@ static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
unsigned int hwsize;

if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
- !dev->msi.data->__domains[ctrl->domid].domain))
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain))
return false;

hwsize = msi_domain_get_hwsize(dev, ctrl->domid);

--
Without deviation from the norm, progress is not possible.

2022-12-16 13:58:10

by Matthew Rosato

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/16/22 4:53 AM, Marc Zyngier wrote:
> On Thu, 15 Dec 2022 16:23:20 +0000,
> Matthew Rosato <[email protected]> wrote:
>>
>> On 12/15/22 9:49 AM, Thomas Gleixner wrote:
>>> On Wed, Dec 14 2022 at 10:42, Niklas Schnelle wrote:
>>>> On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
>>>>> This patch results in various s390 qemu test failures.
>>>>> There is a warning backtrace
>>>>>
>>>>> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
>>>>>
>>>>> followed by
>>>>>
>>>>> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
>>>>>
>>>>> and Ethernet interfaces don't instantiate.
>>>> As far as I'm aware so far he tracked this down to code calling
>>>> msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
>>>> which then returns NULL leading to msi_domain_get_hwsize() returning 0.
>>>> I think this is related to the fact that we currently don't use IRQ
>>>> domains.
>>>
>>> Correct and for some stupid reason I thought 0 is a good return value
>>> here :)
>>>
>>>
>>>
>>> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
>>> index bd4d4dd626b4..8fb10f216dc0 100644
>>> --- a/kernel/irq/msi.c
>>> +++ b/kernel/irq/msi.c
>>> @@ -609,8 +609,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
>>> info = domain->host_data;
>>> return info->hwsize;
>>> }
>>> - /* No domain, no size... */
>>> - return 0;
>>> + /* No domain, default to MSI_MAX_INDEX */
>>> + return MSI_MAX_INDEX;
>>> }
>>>
>>> static inline void irq_chip_write_msi_msg(struct irq_data *data,
>>
>> Ah, that makes sense... So, with that diff applied, that fixes most of the issues I'm seeing incl. the virtio one that Guenter mentioned. But it looks like NVMe devices are still broken on s390 with a different backtrace -- the bisect for that one points to another patch in part2 of this work and looks like another issue with missing irq domain:
>>
>> 40742716f294 genirq/msi: Make msi_add_simple_msi_descs() device domain aware
>>
>>
>> [ 4.308861] ------------[ cut here ]------------
>> [ 4.308865] WARNING: CPU: 7 PID: 9 at kernel/irq/msi.c:167 msi_domain_free_msi_descs_range+0x3c/0xd0
>> [ 4.308877] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
>> [ 4.308896] CPU: 7 PID: 9 Comm: kworker/u20:0 Not tainted 6.1.0 #179
>> [ 4.308898] Hardware name: IBM 3931 A01 782 (KVM/Linux)
>> [ 4.308900] Workqueue: events_unbound async_run_entry_fn
>> [ 4.308905] Krnl PSW : 0704c00180000000 00000000b6426b78 (msi_domain_free_msi_descs_range+0x40/0xd0)
>> [ 4.308909] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
>> [ 4.308911] Krnl GPRS: 0700000080eda0a0 0000000000000000 0000000080eda0a0 0000000000000000
>> [ 4.308913] 0000000000000000 0000000000000000 0000000000000cc0 0000000080eda000
>> [ 4.308914] 00000000b7ddc000 0000000091934aa8 000000000000ffff 0000000000000000
>> [ 4.308915] 0000000080344200 0000000080f2b1c0 0000037fffb5b918 0000037fffb5b8c8
>> [ 4.308924] Krnl Code: 00000000b6426b68: e54cf0ac0000 mvhi 172(%r15),0
>> [ 4.308924] 00000000b6426b6e: ec3c000b017f clij %r3,1,12,00000000b6426b84
>> [ 4.308924] #00000000b6426b74: af000000 mc 0,0
>> [ 4.308924] >00000000b6426b78: eb9ff0b00004 lmg %r9,%r15,176(%r15)
>> [ 4.308924] 00000000b6426b7e: 07fe bcr 15,%r14
>> [ 4.308924] 00000000b6426b80: 47000700 bc 0,1792
>> [ 4.308924] 00000000b6426b84: b90400a5 lgr %r10,%r5
>> [ 4.308924] 00000000b6426b88: b9040013 lgr %r1,%r3
>> [ 4.308935] Call Trace:
>> [ 4.308938] [<00000000b6426b78>] msi_domain_free_msi_descs_range+0x40/0xd0
>> [ 4.308945] [<00000000b6bb126e>] pci_free_msi_irqs+0x26/0x48
>> [ 4.308950] [<00000000b6baf4d4>] pci_disable_msix+0x6c/0x80
>> [ 4.308954] [<00000000b6baf716>] pci_free_irq_vectors+0x26/0x88
>> [ 4.308956] [<000003ff7fdfa8f4>] nvme_setup_io_queues+0x18c/0x398 [nvme]
>> [ 4.308968] [<000003ff7fdfbf1e>] nvme_probe+0x2e6/0x3b0 [nvme]
>> [ 4.308972] [<00000000b6ba44cc>] local_pci_probe+0x44/0x80
>> [ 4.308974] [<00000000b6ba46d8>] pci_call_probe+0x50/0x180
>> [ 4.308976] [<00000000b6ba5166>] pci_device_probe+0xae/0x110
>> [ 4.308978] [<00000000b6c0a19a>] really_probe+0xd2/0x480
>> [ 4.308982] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
>> [ 4.308984] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
>> [ 4.308986] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
>> [ 4.308987] [<00000000b63c1368>] process_one_work+0x200/0x458
>> [ 4.308991] [<00000000b63c1aee>] worker_thread+0x66/0x480
>> [ 4.308993] [<00000000b63caa00>] kthread+0x108/0x110
>> [ 4.308996] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
>> [ 4.308999] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
>> [ 4.309006] Last Breaking-Event-Address:
>> [ 4.309007] [<00000000b6426ba8>] msi_domain_free_msi_descs_range+0x70/0xd0
>> [ 4.309009] ---[ end trace 0000000000000000 ]---
>> [ 8.957187] nvme: probe of 0003:00:00.0 failed with error -22
>> [ 8.957216] ------------[ cut here ]------------
>> [ 8.957217] WARNING: CPU: 5 PID: 9 at kernel/irq/msi.c:275 msi_device_data_release+0x76/0xa0
>> [ 8.957229] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
>> [ 8.957248] CPU: 5 PID: 9 Comm: kworker/u20:0 Tainted: G W 6.1.0 #179
>> [ 8.957252] Hardware name: IBM 3931 A01 782 (KVM/Linux)
>> [ 8.957254] Workqueue: events_unbound async_run_entry_fn
>> [ 8.957259] Krnl PSW : 0704e00180000000 00000000b642729a (msi_device_data_release+0x7a/0xa0)
>> [ 8.957262] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
>> [ 8.957265] Krnl GPRS: a813fdc020800000 00000000928b6840 0000000091934ab8 0000000080344200
>> [ 8.957267] 0000000000000000 0000000091934a80 0000000080d79988 0000000000000000
>> [ 8.957268] 0000000080eda0a0 0000037fffb5bc10 0000000080eda0a0 0000000091934aa8
>> [ 8.957270] 0000000080344200 00000000800e0402 00000000b642724e 0000037fffb5bad0
>> [ 8.957279] Krnl Code: 00000000b642728c: f0a0000407fe srp 4(11,%r0),2046,0
>> [ 8.957279] 00000000b6427292: 47000700 bc 0,1792
>> [ 8.957279] #00000000b6427296: af000000 mc 0,0
>> [ 8.957279] >00000000b642729a: a7f4ffdf brc 15,00000000b6427258
>> [ 8.957279] 00000000b642729e: af000000 mc 0,0
>> [ 8.957279] 00000000b64272a2: 4120b048 la %r2,72(%r11)
>> [ 8.957279] 00000000b64272a6: c0e5005a0c4d brasl %r14,00000000b6f68b40
>> [ 8.957279] 00000000b64272ac: e548a1180000 mvghi 280(%r10),0
>> [ 8.957290] Call Trace:
>> [ 8.957292] [<00000000b642729a>] msi_device_data_release+0x7a/0xa0
>> [ 8.957295] ([<00000000b642724e>] msi_device_data_release+0x2e/0xa0)
>> [ 8.957298] [<00000000b6c0f608>] release_nodes+0x50/0xd8
>> [ 8.957305] [<00000000b6c111aa>] devres_release_all+0xaa/0xf0
>> [ 8.957308] [<00000000b6c0a2f2>] really_probe+0x22a/0x480
>> [ 8.957310] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
>> [ 8.957312] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
>> [ 8.957314] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
>> [ 8.957315] [<00000000b63c1368>] process_one_work+0x200/0x458
>> [ 8.957320] [<00000000b63c1aee>] worker_thread+0x66/0x480
>> [ 8.957322] [<00000000b63caa00>] kthread+0x108/0x110
>> [ 8.957325] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
>> [ 8.957328] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
>> [ 8.957336] Last Breaking-Event-Address:
>> [ 8.957337] [<00000000b6427254>] msi_device_data_release+0x34/0xa0
>> [ 8.957339] ---[ end trace 0000000000000000 ]---
>>
>> The line number for the first warning points to the WARN_ON check in msi_ctrl_valid -- specifically it's the !dev->msi.data->__domains[ctrl->domid].domain check that is failing.
>>
>> The second warning is the WARN_ON_ONCE(!xa_empty(&md->__domains[i].store)) check in msi_device_data_release, presumably a victim of backing out after the first error.
>>
>
> Yeah, the non-irqdomain legacy path definitely wounds up here, and we
> end-up leaking descriptors. If the following hack works for you, I'll
> ferry the two fixes to Linus asap.
>
> Thanks,
>
> M.
>
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index bd4d4dd626b4..9921dc57f1b4 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -165,7 +165,8 @@ static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
> unsigned int hwsize;
>
> if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
> - !dev->msi.data->__domains[ctrl->domid].domain))
> + (dev->msi.domain &&
> + !dev->msi.data->__domains[ctrl->domid].domain))
> return false;
>
> hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
>

Close, but I had to add an extra ) at the end that was missing :)

With both these fixes applied, it actually then leads to the very next WARN_ON failing in msi_ctrl_valid... Because ctrl->last == hwsize. I think Thomas' initial fix for msi_domain_get_hwsize has an off-by-1 error, I think we should return MSI_XA_DOMAIN_SIZE for msi_domain_get_hwsize instead.

Here's what my final squashed diff looks like, and with this applied everything seems to be working again for s390 (Guenter, can you test again on powerpc?). Thanks all!

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index bd4d4dd626b4..955267bbc2be 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -165,7 +165,8 @@ static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
unsigned int hwsize;

if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
- !dev->msi.data->__domains[ctrl->domid].domain))
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
return false;

hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
@@ -609,8 +610,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
info = domain->host_data;
return info->hwsize;
}
- /* No domain, no size... */
- return 0;
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
}

2022-12-16 14:34:08

by Marc Zyngier

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Fri, 16 Dec 2022 13:58:59 +0000,
Marc Zyngier <[email protected]> wrote:
>
> I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
> send it out.

And FWIW, the branch is at [1].

Thanks,

M.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/msi-fixes-6.2

--
Without deviation from the norm, progress is not possible.

2022-12-16 14:36:56

by Marc Zyngier

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Fri, 16 Dec 2022 13:50:59 +0000,
Matthew Rosato <[email protected]> wrote:
>
> On 12/16/22 4:53 AM, Marc Zyngier wrote:
> > On Thu, 15 Dec 2022 16:23:20 +0000,
> > Matthew Rosato <[email protected]> wrote:
> >>
> >> On 12/15/22 9:49 AM, Thomas Gleixner wrote:
> >>> On Wed, Dec 14 2022 at 10:42, Niklas Schnelle wrote:
> >>>> On Tue, 2022-12-13 at 11:04 -0800, Guenter Roeck wrote:
> >>>>> This patch results in various s390 qemu test failures.
> >>>>> There is a warning backtrace
> >>>>>
> >>>>> 12.674858] WARNING: CPU: 0 PID: 1 at kernel/irq/msi.c:167 msi_ctrl_valid+0x2a/0xb0
> >>>>>
> >>>>> followed by
> >>>>>
> >>>>> [ 12.684333] virtio_net: probe of virtio0 failed with error -34
> >>>>>
> >>>>> and Ethernet interfaces don't instantiate.
> >>>> As far as I'm aware so far he tracked this down to code calling
> >>>> msi_domain_get_hwsize() which in turn calls msi_get_device_domain()
> >>>> which then returns NULL leading to msi_domain_get_hwsize() returning 0.
> >>>> I think this is related to the fact that we currently don't use IRQ
> >>>> domains.
> >>>
> >>> Correct and for some stupid reason I thought 0 is a good return value
> >>> here :)
> >>>
> >>>
> >>>
> >>> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> >>> index bd4d4dd626b4..8fb10f216dc0 100644
> >>> --- a/kernel/irq/msi.c
> >>> +++ b/kernel/irq/msi.c
> >>> @@ -609,8 +609,8 @@ static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid
> >>> info = domain->host_data;
> >>> return info->hwsize;
> >>> }
> >>> - /* No domain, no size... */
> >>> - return 0;
> >>> + /* No domain, default to MSI_MAX_INDEX */
> >>> + return MSI_MAX_INDEX;
> >>> }
> >>>
> >>> static inline void irq_chip_write_msi_msg(struct irq_data *data,
> >>
> >> Ah, that makes sense... So, with that diff applied, that fixes most of the issues I'm seeing incl. the virtio one that Guenter mentioned. But it looks like NVMe devices are still broken on s390 with a different backtrace -- the bisect for that one points to another patch in part2 of this work and looks like another issue with missing irq domain:
> >>
> >> 40742716f294 genirq/msi: Make msi_add_simple_msi_descs() device domain aware
> >>
> >>
> >> [ 4.308861] ------------[ cut here ]------------
> >> [ 4.308865] WARNING: CPU: 7 PID: 9 at kernel/irq/msi.c:167 msi_domain_free_msi_descs_range+0x3c/0xd0
> >> [ 4.308877] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
> >> [ 4.308896] CPU: 7 PID: 9 Comm: kworker/u20:0 Not tainted 6.1.0 #179
> >> [ 4.308898] Hardware name: IBM 3931 A01 782 (KVM/Linux)
> >> [ 4.308900] Workqueue: events_unbound async_run_entry_fn
> >> [ 4.308905] Krnl PSW : 0704c00180000000 00000000b6426b78 (msi_domain_free_msi_descs_range+0x40/0xd0)
> >> [ 4.308909] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> >> [ 4.308911] Krnl GPRS: 0700000080eda0a0 0000000000000000 0000000080eda0a0 0000000000000000
> >> [ 4.308913] 0000000000000000 0000000000000000 0000000000000cc0 0000000080eda000
> >> [ 4.308914] 00000000b7ddc000 0000000091934aa8 000000000000ffff 0000000000000000
> >> [ 4.308915] 0000000080344200 0000000080f2b1c0 0000037fffb5b918 0000037fffb5b8c8
> >> [ 4.308924] Krnl Code: 00000000b6426b68: e54cf0ac0000 mvhi 172(%r15),0
> >> [ 4.308924] 00000000b6426b6e: ec3c000b017f clij %r3,1,12,00000000b6426b84
> >> [ 4.308924] #00000000b6426b74: af000000 mc 0,0
> >> [ 4.308924] >00000000b6426b78: eb9ff0b00004 lmg %r9,%r15,176(%r15)
> >> [ 4.308924] 00000000b6426b7e: 07fe bcr 15,%r14
> >> [ 4.308924] 00000000b6426b80: 47000700 bc 0,1792
> >> [ 4.308924] 00000000b6426b84: b90400a5 lgr %r10,%r5
> >> [ 4.308924] 00000000b6426b88: b9040013 lgr %r1,%r3
> >> [ 4.308935] Call Trace:
> >> [ 4.308938] [<00000000b6426b78>] msi_domain_free_msi_descs_range+0x40/0xd0
> >> [ 4.308945] [<00000000b6bb126e>] pci_free_msi_irqs+0x26/0x48
> >> [ 4.308950] [<00000000b6baf4d4>] pci_disable_msix+0x6c/0x80
> >> [ 4.308954] [<00000000b6baf716>] pci_free_irq_vectors+0x26/0x88
> >> [ 4.308956] [<000003ff7fdfa8f4>] nvme_setup_io_queues+0x18c/0x398 [nvme]
> >> [ 4.308968] [<000003ff7fdfbf1e>] nvme_probe+0x2e6/0x3b0 [nvme]
> >> [ 4.308972] [<00000000b6ba44cc>] local_pci_probe+0x44/0x80
> >> [ 4.308974] [<00000000b6ba46d8>] pci_call_probe+0x50/0x180
> >> [ 4.308976] [<00000000b6ba5166>] pci_device_probe+0xae/0x110
> >> [ 4.308978] [<00000000b6c0a19a>] really_probe+0xd2/0x480
> >> [ 4.308982] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
> >> [ 4.308984] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
> >> [ 4.308986] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
> >> [ 4.308987] [<00000000b63c1368>] process_one_work+0x200/0x458
> >> [ 4.308991] [<00000000b63c1aee>] worker_thread+0x66/0x480
> >> [ 4.308993] [<00000000b63caa00>] kthread+0x108/0x110
> >> [ 4.308996] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
> >> [ 4.308999] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
> >> [ 4.309006] Last Breaking-Event-Address:
> >> [ 4.309007] [<00000000b6426ba8>] msi_domain_free_msi_descs_range+0x70/0xd0
> >> [ 4.309009] ---[ end trace 0000000000000000 ]---
> >> [ 8.957187] nvme: probe of 0003:00:00.0 failed with error -22
> >> [ 8.957216] ------------[ cut here ]------------
> >> [ 8.957217] WARNING: CPU: 5 PID: 9 at kernel/irq/msi.c:275 msi_device_data_release+0x76/0xa0
> >> [ 8.957229] Modules linked in: mlx5_core aes_s390 nvme des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common pkey zcrypt rng_core autofs4
> >> [ 8.957248] CPU: 5 PID: 9 Comm: kworker/u20:0 Tainted: G W 6.1.0 #179
> >> [ 8.957252] Hardware name: IBM 3931 A01 782 (KVM/Linux)
> >> [ 8.957254] Workqueue: events_unbound async_run_entry_fn
> >> [ 8.957259] Krnl PSW : 0704e00180000000 00000000b642729a (msi_device_data_release+0x7a/0xa0)
> >> [ 8.957262] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
> >> [ 8.957265] Krnl GPRS: a813fdc020800000 00000000928b6840 0000000091934ab8 0000000080344200
> >> [ 8.957267] 0000000000000000 0000000091934a80 0000000080d79988 0000000000000000
> >> [ 8.957268] 0000000080eda0a0 0000037fffb5bc10 0000000080eda0a0 0000000091934aa8
> >> [ 8.957270] 0000000080344200 00000000800e0402 00000000b642724e 0000037fffb5bad0
> >> [ 8.957279] Krnl Code: 00000000b642728c: f0a0000407fe srp 4(11,%r0),2046,0
> >> [ 8.957279] 00000000b6427292: 47000700 bc 0,1792
> >> [ 8.957279] #00000000b6427296: af000000 mc 0,0
> >> [ 8.957279] >00000000b642729a: a7f4ffdf brc 15,00000000b6427258
> >> [ 8.957279] 00000000b642729e: af000000 mc 0,0
> >> [ 8.957279] 00000000b64272a2: 4120b048 la %r2,72(%r11)
> >> [ 8.957279] 00000000b64272a6: c0e5005a0c4d brasl %r14,00000000b6f68b40
> >> [ 8.957279] 00000000b64272ac: e548a1180000 mvghi 280(%r10),0
> >> [ 8.957290] Call Trace:
> >> [ 8.957292] [<00000000b642729a>] msi_device_data_release+0x7a/0xa0
> >> [ 8.957295] ([<00000000b642724e>] msi_device_data_release+0x2e/0xa0)
> >> [ 8.957298] [<00000000b6c0f608>] release_nodes+0x50/0xd8
> >> [ 8.957305] [<00000000b6c111aa>] devres_release_all+0xaa/0xf0
> >> [ 8.957308] [<00000000b6c0a2f2>] really_probe+0x22a/0x480
> >> [ 8.957310] [<00000000b6c0a6f8>] driver_probe_device+0x40/0xf0
> >> [ 8.957312] [<00000000b6c0a80e>] __driver_attach_async_helper+0x66/0xf0
> >> [ 8.957314] [<00000000b63cfb72>] async_run_entry_fn+0x4a/0x1b0
> >> [ 8.957315] [<00000000b63c1368>] process_one_work+0x200/0x458
> >> [ 8.957320] [<00000000b63c1aee>] worker_thread+0x66/0x480
> >> [ 8.957322] [<00000000b63caa00>] kthread+0x108/0x110
> >> [ 8.957325] [<00000000b634f2dc>] __ret_from_fork+0x3c/0x58
> >> [ 8.957328] [<00000000b6f8da2a>] ret_from_fork+0xa/0x40
> >> [ 8.957336] Last Breaking-Event-Address:
> >> [ 8.957337] [<00000000b6427254>] msi_device_data_release+0x34/0xa0
> >> [ 8.957339] ---[ end trace 0000000000000000 ]---
> >>
> >> The line number for the first warning points to the WARN_ON check in msi_ctrl_valid -- specifically it's the !dev->msi.data->__domains[ctrl->domid].domain check that is failing.
> >>
> >> The second warning is the WARN_ON_ONCE(!xa_empty(&md->__domains[i].store)) check in msi_device_data_release, presumably a victim of backing out after the first error.
> >>
> >
> > Yeah, the non-irqdomain legacy path definitely wounds up here, and we
> > end-up leaking descriptors. If the following hack works for you, I'll
> > ferry the two fixes to Linus asap.
> >
> > Thanks,
> >
> > M.
> >
> > diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> > index bd4d4dd626b4..9921dc57f1b4 100644
> > --- a/kernel/irq/msi.c
> > +++ b/kernel/irq/msi.c
> > @@ -165,7 +165,8 @@ static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
> > unsigned int hwsize;
> >
> > if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
> > - !dev->msi.data->__domains[ctrl->domid].domain))
> > + (dev->msi.domain &&
> > + !dev->msi.data->__domains[ctrl->domid].domain))
> > return false;
> >
> > hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
> >
>
> Close, but I had to add an extra ) at the end that was missing :)

Hey, I never said I tried to compile the thing! ;-)

>
> With both these fixes applied, it actually then leads to the very
> next WARN_ON failing in msi_ctrl_valid... Because ctrl->last ==
> hwsize. I think Thomas' initial fix for msi_domain_get_hwsize has
> an off-by-1 error, I think we should return MSI_XA_DOMAIN_SIZE for
> msi_domain_get_hwsize instead.

Yes, that's a good point, and that's consistent with what
__msi_create_irq_domain() does already, assuming MSI_XA_DOMAIN_SIZE
when info->hwsize is 0. No reason to do something else here.

I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
send it out.

M.

--
Without deviation from the norm, progress is not possible.

2022-12-16 14:37:44

by Matthew Rosato

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/16/22 9:03 AM, Marc Zyngier wrote:
> On Fri, 16 Dec 2022 13:58:59 +0000,
> Marc Zyngier <[email protected]> wrote:
>>
>> I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
>> send it out.
>
> And FWIW, the branch is at [1].
>
> Thanks,
>
> M.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/msi-fixes-6.2
>

FYI, your patch there mentions MSI_XA_DOMAIN_SIZE in the commit message but the code (and comment) is still returning MSI_MAX_INDEX

2022-12-16 16:35:42

by Guenter Roeck

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/16/22 05:58, Marc Zyngier wrote:
[ ... ]

>>
>> With both these fixes applied, it actually then leads to the very
>> next WARN_ON failing in msi_ctrl_valid... Because ctrl->last ==
>> hwsize. I think Thomas' initial fix for msi_domain_get_hwsize has
>> an off-by-1 error, I think we should return MSI_XA_DOMAIN_SIZE for
>> msi_domain_get_hwsize instead.
>
> Yes, that's a good point, and that's consistent with what
> __msi_create_irq_domain() does already, assuming MSI_XA_DOMAIN_SIZE
> when info->hwsize is 0. No reason to do something else here.
>
> I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
> send it out.
>

It wasn't just ppc; that was just the easiest to report. I applied
the two patches on top of the irq merge and will test the resulting
branch (mainline is too broken right now). I hope that will give me
useful results. It will take a while though since my testbed is
still busy testing the most recent release candidates.

Guenter

2022-12-16 17:53:18

by Marc Zyngier

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 2022-12-16 14:11, Matthew Rosato wrote:
> On 12/16/22 9:03 AM, Marc Zyngier wrote:
>> On Fri, 16 Dec 2022 13:58:59 +0000,
>> Marc Zyngier <[email protected]> wrote:
>>>
>>> I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
>>> send it out.
>>
>> And FWIW, the branch is at [1].
>>
>> Thanks,
>>
>> M.
>>
>> [1]
>> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/msi-fixes-6.2
>>
>
> FYI, your patch there mentions MSI_XA_DOMAIN_SIZE in the commit
> message but the code (and comment) is still returning MSI_MAX_INDEX

https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/commit/?h=irq/msi-fixes-6.2&id=e982ad82bd8f7931f5788a15dfa3709f7a7ee79f

That was the case initially (amended the commit message, but
didn't commit the code...), then pushed the real stuff a couple of
minutes later. It took about 20 minutes for the mirror to sync
up...

I guess the git mirroring is a bit busy at the moment...

M.
--
Jazz is not dead. It just smells funny...

2022-12-17 01:06:30

by Guenter Roeck

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/16/22 05:58, Marc Zyngier wrote:
[ ... ]

>> With both these fixes applied, it actually then leads to the very
>> next WARN_ON failing in msi_ctrl_valid... Because ctrl->last ==
>> hwsize. I think Thomas' initial fix for msi_domain_get_hwsize has
>> an off-by-1 error, I think we should return MSI_XA_DOMAIN_SIZE for
>> msi_domain_get_hwsize instead.
>
> Yes, that's a good point, and that's consistent with what
> __msi_create_irq_domain() does already, assuming MSI_XA_DOMAIN_SIZE
> when info->hwsize is 0. No reason to do something else here.
>
> I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
> send it out.
>
With

7a27b6136dcb (local/testing, testing-msi) genirq/msi: Return MSI_XA_DOMAIN_SIZE as the maximum MSI index when no domain is present
c581d525bb1d genirq/msi: Check for the presence of an irq domain when validating msi_ctrl
9d33edb20f7e Merge tag 'irq-core-2022-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

I still get the following runtime warning.

------------[ cut here ]------------
WARNING: CPU: 0 PID: 8 at kernel/irq/msi.c:196 .msi_domain_free_descs+0x144/0x170
Modules linked in:
CPU: 0 PID: 8 Comm: kworker/u2:0 Tainted: G N 6.1.0-01957-g7a27b6136dcb #1
Hardware name: QEMU ppce500 e5500 0x80240020 QEMU e500
Workqueue: nvme-reset-wq .nvme_reset_work
NIP: c000000000107d54 LR: c000000000107d44 CTR: 0000000000000000
REGS: c0000000041e74d0 TRAP: 0700 Tainted: G N (6.1.0-01957-g7a27b6136dcb)
MSR: 0000000080029002 <CE,EE,ME> CR: 44002282 XER: 20000000
IRQMASK: 0
GPR00: c000000000107d44 c0000000041e7770 c000000001629c00 c000000004e748a0
GPR04: 000000005358db0a c000000001ce7a00 c00000000423b5d0 000000004735aaa2
GPR08: 0000000000000002 0000000000000013 c00000000423acc0 c00000000214a998
GPR12: 0000000024002282 c000000002579000 c00000000008e190 c000000004173540
GPR16: 0000000000000000 c0000000043810b8 0000000000000000 0000000000000001
GPR20: c0000000060b22d8 c0000000060a70f0 0000000000000000 c000000001996800
GPR24: c0000000017df6c0 c0000000043810b8 c0000000060b2388 c0000000060b2000
GPR28: ffffffffffffffff c0000000041e7888 c000000006025ac8 c000000004e748a0
NIP [c000000000107d54] .msi_domain_free_descs+0x144/0x170
LR [c000000000107d44] .msi_domain_free_descs+0x134/0x170
Call Trace:
[c0000000041e7770] [c000000000107d44] .msi_domain_free_descs+0x134/0x170 (unreliable)
[c0000000041e7810] [c0000000001085d8] .msi_domain_free_msi_descs_range+0x38/0x70
[c0000000041e78a0] [c0000000008d000c] .pci_msi_teardown_msi_irqs+0x4c/0xa0
[c0000000041e7920] [c0000000008cf9e8] .pci_free_msi_irqs+0x18/0x50
[c0000000041e79a0] [c0000000008cd8d0] .pci_free_irq_vectors+0x80/0xb0
[c0000000041e7a20] [c000000000a6d2a0] .nvme_reset_work+0x870/0x1780
[c0000000041e7bb0] [c000000000080e68] .process_one_work+0x2d8/0x7b0
[c0000000041e7c90] [c0000000000813d8] .worker_thread+0x98/0x4f0
[c0000000041e7d70] [c00000000008e2cc] .kthread+0x13c/0x150
[c0000000041e7e10] [c0000000000005d8] .ret_from_kernel_thread+0x58/0x60
Instruction dump:
7fc3f378 48ff1ca9 60000000 7c7f1b79 41c2002c e8810070 7fc3f378 48ff3491
60000000 813f0000 2c090000 41e2ffb0 <0fe00000> 60000000 60000000 ebc10090
irq event stamp: 98168
hardirqs last enabled at (98167): [<c00000000110a274>] ._raw_spin_unlock_irqrestore+0x84/0xd0
hardirqs last disabled at (98168): [<c000000000010b58>] .program_check_exception+0x38/0x120
softirqs last enabled at (97760): [<c00000000110b4dc>] .__do_softirq+0x60c/0x678
softirqs last disabled at (97749): [<c000000000004d20>] .do_softirq_own_stack+0x30/0x50
---[ end trace 0000000000000000 ]---
nvme nvme0: 1/0/0 default/read/poll queues
nvme nvme0: Ignoring bogus Namespace Identifiers
...

The system boots fine, though. This is seen when booting the ppce500 machine with
e5500 CPU and corenet64_smp_defconfig from nvme.

Guenter

2022-12-17 11:29:06

by Marc Zyngier

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Sat, 17 Dec 2022 00:45:50 +0000,
Guenter Roeck <[email protected]> wrote:
>
> On 12/16/22 05:58, Marc Zyngier wrote:
> [ ... ]
>
> >> With both these fixes applied, it actually then leads to the very
> >> next WARN_ON failing in msi_ctrl_valid... Because ctrl->last ==
> >> hwsize. I think Thomas' initial fix for msi_domain_get_hwsize has
> >> an off-by-1 error, I think we should return MSI_XA_DOMAIN_SIZE for
> >> msi_domain_get_hwsize instead.
> >
> > Yes, that's a good point, and that's consistent with what
> > __msi_create_irq_domain() does already, assuming MSI_XA_DOMAIN_SIZE
> > when info->hwsize is 0. No reason to do something else here.
> >
> > I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
> > send it out.
> >
> With
>
> 7a27b6136dcb (local/testing, testing-msi) genirq/msi: Return MSI_XA_DOMAIN_SIZE as the maximum MSI index when no domain is present
> c581d525bb1d genirq/msi: Check for the presence of an irq domain when validating msi_ctrl
> 9d33edb20f7e Merge tag 'irq-core-2022-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
>
> I still get the following runtime warning.
>
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 8 at kernel/irq/msi.c:196 .msi_domain_free_descs+0x144/0x170
> Modules linked in:
> CPU: 0 PID: 8 Comm: kworker/u2:0 Tainted: G N 6.1.0-01957-g7a27b6136dcb #1
> Hardware name: QEMU ppce500 e5500 0x80240020 QEMU e500
> Workqueue: nvme-reset-wq .nvme_reset_work
> NIP: c000000000107d54 LR: c000000000107d44 CTR: 0000000000000000
> REGS: c0000000041e74d0 TRAP: 0700 Tainted: G N (6.1.0-01957-g7a27b6136dcb)
> MSR: 0000000080029002 <CE,EE,ME> CR: 44002282 XER: 20000000
> IRQMASK: 0
> GPR00: c000000000107d44 c0000000041e7770 c000000001629c00 c000000004e748a0
> GPR04: 000000005358db0a c000000001ce7a00 c00000000423b5d0 000000004735aaa2
> GPR08: 0000000000000002 0000000000000013 c00000000423acc0 c00000000214a998
> GPR12: 0000000024002282 c000000002579000 c00000000008e190 c000000004173540
> GPR16: 0000000000000000 c0000000043810b8 0000000000000000 0000000000000001
> GPR20: c0000000060b22d8 c0000000060a70f0 0000000000000000 c000000001996800
> GPR24: c0000000017df6c0 c0000000043810b8 c0000000060b2388 c0000000060b2000
> GPR28: ffffffffffffffff c0000000041e7888 c000000006025ac8 c000000004e748a0
> NIP [c000000000107d54] .msi_domain_free_descs+0x144/0x170
> LR [c000000000107d44] .msi_domain_free_descs+0x134/0x170
> Call Trace:
> [c0000000041e7770] [c000000000107d44] .msi_domain_free_descs+0x134/0x170 (unreliable)
> [c0000000041e7810] [c0000000001085d8] .msi_domain_free_msi_descs_range+0x38/0x70
> [c0000000041e78a0] [c0000000008d000c] .pci_msi_teardown_msi_irqs+0x4c/0xa0
> [c0000000041e7920] [c0000000008cf9e8] .pci_free_msi_irqs+0x18/0x50
> [c0000000041e79a0] [c0000000008cd8d0] .pci_free_irq_vectors+0x80/0xb0
> [c0000000041e7a20] [c000000000a6d2a0] .nvme_reset_work+0x870/0x1780
> [c0000000041e7bb0] [c000000000080e68] .process_one_work+0x2d8/0x7b0
> [c0000000041e7c90] [c0000000000813d8] .worker_thread+0x98/0x4f0
> [c0000000041e7d70] [c00000000008e2cc] .kthread+0x13c/0x150
> [c0000000041e7e10] [c0000000000005d8] .ret_from_kernel_thread+0x58/0x60
> Instruction dump:
> 7fc3f378 48ff1ca9 60000000 7c7f1b79 41c2002c e8810070 7fc3f378 48ff3491
> 60000000 813f0000 2c090000 41e2ffb0 <0fe00000> 60000000 60000000 ebc10090
> irq event stamp: 98168
> hardirqs last enabled at (98167): [<c00000000110a274>] ._raw_spin_unlock_irqrestore+0x84/0xd0
> hardirqs last disabled at (98168): [<c000000000010b58>] .program_check_exception+0x38/0x120
> softirqs last enabled at (97760): [<c00000000110b4dc>] .__do_softirq+0x60c/0x678
> softirqs last disabled at (97749): [<c000000000004d20>] .do_softirq_own_stack+0x30/0x50
> ---[ end trace 0000000000000000 ]---
> nvme nvme0: 1/0/0 default/read/poll queues
> nvme nvme0: Ignoring bogus Namespace Identifiers
> ...
>
> The system boots fine, though. This is seen when booting the ppce500
> machine with e5500 CPU and corenet64_smp_defconfig from nvme.

+PPC folks.

Thanks for the report.

I managed to reproduce this, although in a limited way (a SMP qemu
instance wouldn't boot at all). The problem is that the core MSI code
now assumes that if the arch code is in charge of breaking the
association of a MSI with a device, it is also in charge of clearing
the irq in the MSI descriptor.

This matches the s390 behaviour, but not powerpc's, hence the splat
and the leaked MSI descriptors. The minimal fix should be as follow,
which I'll add to the pile of patches.

Thanks,

M.

diff --git a/arch/powerpc/platforms/4xx/hsta_msi.c b/arch/powerpc/platforms/4xx/hsta_msi.c
index d4f7fff1fc87..e11b57a62b05 100644
--- a/arch/powerpc/platforms/4xx/hsta_msi.c
+++ b/arch/powerpc/platforms/4xx/hsta_msi.c
@@ -115,6 +115,7 @@ static void hsta_teardown_msi_irqs(struct pci_dev *dev)
msi_bitmap_free_hwirqs(&ppc4xx_hsta_msi.bmp, irq, 1);
pr_debug("%s: Teardown IRQ %u (index %u)\n", __func__,
entry->irq, irq);
+ entry->irq = 0;
}
}

diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 5b012abca773..0c11aad896c7 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -289,6 +289,7 @@ static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
msi_for_each_desc(entry, &dev->dev, MSI_DESC_ASSOCIATED) {
irq_set_msi_desc(entry->irq, NULL);
irq_dispose_mapping(entry->irq);
+ entry->irq = 0;
}
}

diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c
index dc1846660005..166c97fff16d 100644
--- a/arch/powerpc/platforms/pasemi/msi.c
+++ b/arch/powerpc/platforms/pasemi/msi.c
@@ -66,6 +66,7 @@ static void pasemi_msi_teardown_msi_irqs(struct pci_dev *pdev)
hwirq = virq_to_hw(entry->irq);
irq_set_msi_desc(entry->irq, NULL);
irq_dispose_mapping(entry->irq);
+ entry->irq = 0;
msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq, ALLOC_CHUNK);
}
}
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 73c2d70706c0..57978a44d55b 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -132,6 +132,7 @@ static void fsl_teardown_msi_irqs(struct pci_dev *pdev)
msi_data = irq_get_chip_data(entry->irq);
irq_set_msi_desc(entry->irq, NULL);
irq_dispose_mapping(entry->irq);
+ entry->irq = 0;
msi_bitmap_free_hwirqs(&msi_data->bitmap, hwirq, 1);
}
}
diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c
index 1d8cfdfdf115..492cb03c0b62 100644
--- a/arch/powerpc/sysdev/mpic_u3msi.c
+++ b/arch/powerpc/sysdev/mpic_u3msi.c
@@ -108,6 +108,7 @@ static void u3msi_teardown_msi_irqs(struct pci_dev *pdev)
hwirq = virq_to_hw(entry->irq);
irq_set_msi_desc(entry->irq, NULL);
irq_dispose_mapping(entry->irq);
+ entry->irq = 0;
msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq, 1);
}
}

--
Without deviation from the norm, progress is not possible.

2022-12-17 14:19:56

by Guenter Roeck

[permalink] [raw]
Subject: Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On 12/17/22 02:46, Marc Zyngier wrote:
> On Sat, 17 Dec 2022 00:45:50 +0000,
> Guenter Roeck <[email protected]> wrote:
>>
>> On 12/16/22 05:58, Marc Zyngier wrote:
>> [ ... ]
>>
>>>> With both these fixes applied, it actually then leads to the very
>>>> next WARN_ON failing in msi_ctrl_valid... Because ctrl->last ==
>>>> hwsize. I think Thomas' initial fix for msi_domain_get_hwsize has
>>>> an off-by-1 error, I think we should return MSI_XA_DOMAIN_SIZE for
>>>> msi_domain_get_hwsize instead.
>>>
>>> Yes, that's a good point, and that's consistent with what
>>> __msi_create_irq_domain() does already, assuming MSI_XA_DOMAIN_SIZE
>>> when info->hwsize is 0. No reason to do something else here.
>>>
>>> I'll update Thomas' patch. Once Guenter confirms that PPC is OK, I'll
>>> send it out.
>>>
>> With
>>
>> 7a27b6136dcb (local/testing, testing-msi) genirq/msi: Return MSI_XA_DOMAIN_SIZE as the maximum MSI index when no domain is present
>> c581d525bb1d genirq/msi: Check for the presence of an irq domain when validating msi_ctrl
>> 9d33edb20f7e Merge tag 'irq-core-2022-12-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
>>
>> I still get the following runtime warning.
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 8 at kernel/irq/msi.c:196 .msi_domain_free_descs+0x144/0x170
>> Modules linked in:
>> CPU: 0 PID: 8 Comm: kworker/u2:0 Tainted: G N 6.1.0-01957-g7a27b6136dcb #1
>> Hardware name: QEMU ppce500 e5500 0x80240020 QEMU e500
>> Workqueue: nvme-reset-wq .nvme_reset_work
>> NIP: c000000000107d54 LR: c000000000107d44 CTR: 0000000000000000
>> REGS: c0000000041e74d0 TRAP: 0700 Tainted: G N (6.1.0-01957-g7a27b6136dcb)
>> MSR: 0000000080029002 <CE,EE,ME> CR: 44002282 XER: 20000000
>> IRQMASK: 0
>> GPR00: c000000000107d44 c0000000041e7770 c000000001629c00 c000000004e748a0
>> GPR04: 000000005358db0a c000000001ce7a00 c00000000423b5d0 000000004735aaa2
>> GPR08: 0000000000000002 0000000000000013 c00000000423acc0 c00000000214a998
>> GPR12: 0000000024002282 c000000002579000 c00000000008e190 c000000004173540
>> GPR16: 0000000000000000 c0000000043810b8 0000000000000000 0000000000000001
>> GPR20: c0000000060b22d8 c0000000060a70f0 0000000000000000 c000000001996800
>> GPR24: c0000000017df6c0 c0000000043810b8 c0000000060b2388 c0000000060b2000
>> GPR28: ffffffffffffffff c0000000041e7888 c000000006025ac8 c000000004e748a0
>> NIP [c000000000107d54] .msi_domain_free_descs+0x144/0x170
>> LR [c000000000107d44] .msi_domain_free_descs+0x134/0x170
>> Call Trace:
>> [c0000000041e7770] [c000000000107d44] .msi_domain_free_descs+0x134/0x170 (unreliable)
>> [c0000000041e7810] [c0000000001085d8] .msi_domain_free_msi_descs_range+0x38/0x70
>> [c0000000041e78a0] [c0000000008d000c] .pci_msi_teardown_msi_irqs+0x4c/0xa0
>> [c0000000041e7920] [c0000000008cf9e8] .pci_free_msi_irqs+0x18/0x50
>> [c0000000041e79a0] [c0000000008cd8d0] .pci_free_irq_vectors+0x80/0xb0
>> [c0000000041e7a20] [c000000000a6d2a0] .nvme_reset_work+0x870/0x1780
>> [c0000000041e7bb0] [c000000000080e68] .process_one_work+0x2d8/0x7b0
>> [c0000000041e7c90] [c0000000000813d8] .worker_thread+0x98/0x4f0
>> [c0000000041e7d70] [c00000000008e2cc] .kthread+0x13c/0x150
>> [c0000000041e7e10] [c0000000000005d8] .ret_from_kernel_thread+0x58/0x60
>> Instruction dump:
>> 7fc3f378 48ff1ca9 60000000 7c7f1b79 41c2002c e8810070 7fc3f378 48ff3491
>> 60000000 813f0000 2c090000 41e2ffb0 <0fe00000> 60000000 60000000 ebc10090
>> irq event stamp: 98168
>> hardirqs last enabled at (98167): [<c00000000110a274>] ._raw_spin_unlock_irqrestore+0x84/0xd0
>> hardirqs last disabled at (98168): [<c000000000010b58>] .program_check_exception+0x38/0x120
>> softirqs last enabled at (97760): [<c00000000110b4dc>] .__do_softirq+0x60c/0x678
>> softirqs last disabled at (97749): [<c000000000004d20>] .do_softirq_own_stack+0x30/0x50
>> ---[ end trace 0000000000000000 ]---
>> nvme nvme0: 1/0/0 default/read/poll queues
>> nvme nvme0: Ignoring bogus Namespace Identifiers
>> ...
>>
>> The system boots fine, though. This is seen when booting the ppce500
>> machine with e5500 CPU and corenet64_smp_defconfig from nvme.
>
> +PPC folks.
>
> Thanks for the report.
>
> I managed to reproduce this, although in a limited way (a SMP qemu
> instance wouldn't boot at all). The problem is that the core MSI code
> now assumes that if the arch code is in charge of breaking the
> association of a MSI with a device, it is also in charge of clearing
> the irq in the MSI descriptor.
>
> This matches the s390 behaviour, but not powerpc's, hence the splat
> and the leaked MSI descriptors. The minimal fix should be as follow,
> which I'll add to the pile of patches.
>

Confirmed, the patch below fixes the ppc problem.

Thanks,
Guenter

> Thanks,
>
> M.
>
> diff --git a/arch/powerpc/platforms/4xx/hsta_msi.c b/arch/powerpc/platforms/4xx/hsta_msi.c
> index d4f7fff1fc87..e11b57a62b05 100644
> --- a/arch/powerpc/platforms/4xx/hsta_msi.c
> +++ b/arch/powerpc/platforms/4xx/hsta_msi.c
> @@ -115,6 +115,7 @@ static void hsta_teardown_msi_irqs(struct pci_dev *dev)
> msi_bitmap_free_hwirqs(&ppc4xx_hsta_msi.bmp, irq, 1);
> pr_debug("%s: Teardown IRQ %u (index %u)\n", __func__,
> entry->irq, irq);
> + entry->irq = 0;
> }
> }
>
> diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
> index 5b012abca773..0c11aad896c7 100644
> --- a/arch/powerpc/platforms/cell/axon_msi.c
> +++ b/arch/powerpc/platforms/cell/axon_msi.c
> @@ -289,6 +289,7 @@ static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
> msi_for_each_desc(entry, &dev->dev, MSI_DESC_ASSOCIATED) {
> irq_set_msi_desc(entry->irq, NULL);
> irq_dispose_mapping(entry->irq);
> + entry->irq = 0;
> }
> }
>
> diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c
> index dc1846660005..166c97fff16d 100644
> --- a/arch/powerpc/platforms/pasemi/msi.c
> +++ b/arch/powerpc/platforms/pasemi/msi.c
> @@ -66,6 +66,7 @@ static void pasemi_msi_teardown_msi_irqs(struct pci_dev *pdev)
> hwirq = virq_to_hw(entry->irq);
> irq_set_msi_desc(entry->irq, NULL);
> irq_dispose_mapping(entry->irq);
> + entry->irq = 0;
> msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq, ALLOC_CHUNK);
> }
> }
> diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
> index 73c2d70706c0..57978a44d55b 100644
> --- a/arch/powerpc/sysdev/fsl_msi.c
> +++ b/arch/powerpc/sysdev/fsl_msi.c
> @@ -132,6 +132,7 @@ static void fsl_teardown_msi_irqs(struct pci_dev *pdev)
> msi_data = irq_get_chip_data(entry->irq);
> irq_set_msi_desc(entry->irq, NULL);
> irq_dispose_mapping(entry->irq);
> + entry->irq = 0;
> msi_bitmap_free_hwirqs(&msi_data->bitmap, hwirq, 1);
> }
> }
> diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c
> index 1d8cfdfdf115..492cb03c0b62 100644
> --- a/arch/powerpc/sysdev/mpic_u3msi.c
> +++ b/arch/powerpc/sysdev/mpic_u3msi.c
> @@ -108,6 +108,7 @@ static void u3msi_teardown_msi_irqs(struct pci_dev *pdev)
> hwirq = virq_to_hw(entry->irq);
> irq_set_msi_desc(entry->irq, NULL);
> irq_dispose_mapping(entry->irq);
> + entry->irq = 0;
> msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq, 1);
> }
> }
>

2023-02-20 17:11:46

by Russell King (Oracle)

[permalink] [raw]
Subject: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Fri, Nov 25, 2022 at 12:25:59AM +0100, Thomas Gleixner wrote:
> Per device domains provide the real domain size to the core code. This
> allows range checking on insertion of MSI descriptors and also paves the
> way for dynamic index allocations which are required e.g. for IMS. This
> avoids external mechanisms like bitmaps on the device side and just
> utilizes the core internal MSI descriptor storxe for it.
>
> Signed-off-by: Thomas Gleixner <[email protected]>

Hi Thomas,

This patch appears to cause a regression on Macchiatobin, delaying the
boot by about ten seconds due to all the warnings the kernel now
produces.

> @@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_de
>
> static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
> {
> + unsigned int hwsize;
> +
> if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
> - !dev->msi.data->__domains[ctrl->domid].domain ||
> - ctrl->first > ctrl->last ||
> - ctrl->first > MSI_MAX_INDEX ||
> - ctrl->last > MSI_MAX_INDEX))
> + !dev->msi.data->__domains[ctrl->domid].domain))
> + return false;
> +
> + hwsize = msi_domain_get_hwsize(dev, ctrl->domid);

This calls msi_get_device_domain() without taking dev->msi.data->mutex,
resulting in the lockdep_assert_held() firing for what seems to be every
MSI created by the Armada 8040 ICU driver, which suggests something isn't
taking the lock as you expect. Please can you take a look and propose a
patch to fix this regression.

Thanks.

[ 0.960451] WARNING: CPU: 2 PID: 1 at kernel/irq/msi.c:588 msi_get_device_domain+0x70/0xa0
[ 0.967454] Modules linked in:
[ 0.969216] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.2.0+ #1134
[ 0.974116] Hardware name: Marvell 8040 MACCHIATOBin Single-shot (DT)
[ 0.979276] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 0.984961] pc : msi_get_device_domain+0x70/0xa0
[ 0.988292] lr : msi_get_device_domain+0x6c/0xa0
[ 0.991623] sp : ffffffc080083460
[ 0.993643] x29: ffffffc080083460 x28: 0000000000000000 x27: ffffffc041dcb6c8
[ 0.999506] x26: ffffff8101f23810 x25: ffffffc080083668 x24: ffffff8101f23080
[ 1.005370] x23: 0000000000000012 x22: ffffff81003d1000 x21: ffffff81025dfd90
[ 1.011234] x20: ffffff8101f23810 x19: 0000000000000000 x18: 00000000fffffffd
[ 1.017097] x17: 00000000cc510454 x16: 0000000000000051 x15: 0000000000000002
[ 1.022960] x14: 00000000000389cb x13: 0000000000000001 x12: 0000000000000040
[ 1.028822] x11: ffffff8100400490 x10: ffffff8100400492 x9 : 0000000000000000
[ 1.034685] x8 : 0000000000000000 x7 : ffffff81001c8858 x6 : ffffffc0402ad718
[ 1.040547] x5 : 00000000ffffffff x4 : ffffff81003d4c80 x3 : 0000000000000000
[ 1.046410] x2 : ffffffc0fed09000 x1 : 0000000000000000 x0 : 0000000000000000
[ 1.052274] Call trace:
[ 1.053422] msi_get_device_domain+0x70/0xa0
[ 1.056404] msi_ctrl_valid+0x5c/0x94
[ 1.058775] msi_domain_populate_irqs+0x64/0x1b0
[ 1.062106] platform_msi_device_domain_alloc+0x20/0x30
[ 1.066048] mvebu_icu_irq_domain_alloc+0xa0/0x1a0
[ 1.069555] __irq_domain_alloc_irqs+0xf8/0x46c
[ 1.072799] irq_create_fwspec_mapping+0x224/0x320
[ 1.076303] irq_create_of_mapping+0x68/0x90
[ 1.079284] of_irq_get+0x88/0xd0
[ 1.081308] platform_get_irq_optional+0x20/0x114
[ 1.084725] platform_get_irq+0x18/0x50
[ 1.087269] dw8250_probe+0x60/0x6e0
[ 1.089552] platform_probe+0x64/0xd0
[ 1.091923] really_probe+0xb8/0x2d4
[ 1.094207] __driver_probe_device+0x74/0xdc
[ 1.097190] driver_probe_device+0xd0/0x160
[ 1.100085] __driver_attach+0x94/0x1a0
[ 1.102631] bus_for_each_dev+0x6c/0xc0
[ 1.105176] driver_attach+0x20/0x30
[ 1.107460] bus_add_driver+0x148/0x200
[ 1.110006] driver_register+0x74/0x120
[ 1.112550] __platform_driver_register+0x24/0x30
[ 1.115966] dw8250_platform_driver_init+0x18/0x20
[ 1.119473] do_one_initcall+0x70/0x370
[ 1.122018] kernel_init_freeable+0x1d0/0x238
[ 1.125087] kernel_init+0x20/0x120
[ 1.127283] ret_from_fork+0x10/0x20
[ 1.129567] ---[ end trace 0000000000000000 ]---

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

2023-02-20 18:29:43

by Marc Zyngier

[permalink] [raw]
Subject: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Mon, 20 Feb 2023 17:11:23 +0000,
"Russell King (Oracle)" <[email protected]> wrote:
>
> On Fri, Nov 25, 2022 at 12:25:59AM +0100, Thomas Gleixner wrote:
> > Per device domains provide the real domain size to the core code. This
> > allows range checking on insertion of MSI descriptors and also paves the
> > way for dynamic index allocations which are required e.g. for IMS. This
> > avoids external mechanisms like bitmaps on the device side and just
> > utilizes the core internal MSI descriptor storxe for it.
> >
> > Signed-off-by: Thomas Gleixner <[email protected]>
>
> Hi Thomas,
>
> This patch appears to cause a regression on Macchiatobin, delaying the
> boot by about ten seconds due to all the warnings the kernel now
> produces.
>
> > @@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_de
> >
> > static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
> > {
> > + unsigned int hwsize;
> > +
> > if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
> > - !dev->msi.data->__domains[ctrl->domid].domain ||
> > - ctrl->first > ctrl->last ||
> > - ctrl->first > MSI_MAX_INDEX ||
> > - ctrl->last > MSI_MAX_INDEX))
> > + !dev->msi.data->__domains[ctrl->domid].domain))
> > + return false;
> > +
> > + hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
>
> This calls msi_get_device_domain() without taking dev->msi.data->mutex,
> resulting in the lockdep_assert_held() firing for what seems to be every
> MSI created by the Armada 8040 ICU driver, which suggests something isn't
> taking the lock as you expect. Please can you take a look and propose a
> patch to fix this regression.

Since you already worked it out, I only had to translate your words
into the patch below, which solves it for me.

Lockdep also reports[1] a possible circular locking dependency between
phy_attach_direct() and rtnetlink_rcv_msg(), which looks interesting.

Thanks,

M.

[1] https://paste.debian.net/1271454/

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 783a3e6a0b10..13d96495e6d0 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1084,10 +1084,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
struct xarray *xa;
int ret, virq;

- if (!msi_ctrl_valid(dev, &ctrl))
- return -EINVAL;
-
msi_lock_descs(dev);
+
+ if (!msi_ctrl_valid(dev, &ctrl)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
if (ret)
goto unlock;

--
Without deviation from the norm, progress is not possible.

2023-02-20 18:30:21

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Mon, Feb 20 2023 at 17:11, Russell King wrote:
> On Fri, Nov 25, 2022 at 12:25:59AM +0100, Thomas Gleixner wrote:
> Hi Thomas,
>
> This patch appears to cause a regression on Macchiatobin, delaying the
> boot by about ten seconds due to all the warnings the kernel now
> produces.
>
>> @@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_de
>>
>> static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
>> {
>> + unsigned int hwsize;
>> +
>> if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
>> - !dev->msi.data->__domains[ctrl->domid].domain ||
>> - ctrl->first > ctrl->last ||
>> - ctrl->first > MSI_MAX_INDEX ||
>> - ctrl->last > MSI_MAX_INDEX))
>> + !dev->msi.data->__domains[ctrl->domid].domain))
>> + return false;
>> +
>> + hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
>
> This calls msi_get_device_domain() without taking dev->msi.data->mutex,
> resulting in the lockdep_assert_held() firing for what seems to be every
> MSI created by the Armada 8040 ICU driver, which suggests something isn't
> taking the lock as you expect. Please can you take a look and propose a
> patch to fix this regression.

Groan. I'll have a look.

2023-02-20 18:43:31

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Mon, Feb 20 2023 at 18:29, Marc Zyngier wrote:
> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
> index 783a3e6a0b10..13d96495e6d0 100644
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -1084,10 +1084,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
> struct xarray *xa;
> int ret, virq;
>
> - if (!msi_ctrl_valid(dev, &ctrl))
> - return -EINVAL;
> -
> msi_lock_descs(dev);
> +
> + if (!msi_ctrl_valid(dev, &ctrl)) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> +
> ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
> if (ret)
> goto unlock;

Yup, you beat me by a minute :)

2023-02-20 19:02:19

by Russell King (Oracle)

[permalink] [raw]
Subject: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Mon, Feb 20, 2023 at 06:29:33PM +0000, Marc Zyngier wrote:
> On Mon, 20 Feb 2023 17:11:23 +0000,
> "Russell King (Oracle)" <[email protected]> wrote:
> >
> > On Fri, Nov 25, 2022 at 12:25:59AM +0100, Thomas Gleixner wrote:
> > > Per device domains provide the real domain size to the core code. This
> > > allows range checking on insertion of MSI descriptors and also paves the
> > > way for dynamic index allocations which are required e.g. for IMS. This
> > > avoids external mechanisms like bitmaps on the device side and just
> > > utilizes the core internal MSI descriptor storxe for it.
> > >
> > > Signed-off-by: Thomas Gleixner <[email protected]>
> >
> > Hi Thomas,
> >
> > This patch appears to cause a regression on Macchiatobin, delaying the
> > boot by about ten seconds due to all the warnings the kernel now
> > produces.
> >
> > > @@ -136,11 +149,16 @@ static bool msi_desc_match(struct msi_de
> > >
> > > static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
> > > {
> > > + unsigned int hwsize;
> > > +
> > > if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
> > > - !dev->msi.data->__domains[ctrl->domid].domain ||
> > > - ctrl->first > ctrl->last ||
> > > - ctrl->first > MSI_MAX_INDEX ||
> > > - ctrl->last > MSI_MAX_INDEX))
> > > + !dev->msi.data->__domains[ctrl->domid].domain))
> > > + return false;
> > > +
> > > + hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
> >
> > This calls msi_get_device_domain() without taking dev->msi.data->mutex,
> > resulting in the lockdep_assert_held() firing for what seems to be every
> > MSI created by the Armada 8040 ICU driver, which suggests something isn't
> > taking the lock as you expect. Please can you take a look and propose a
> > patch to fix this regression.
>
> Since you already worked it out, I only had to translate your words
> into the patch below, which solves it for me.

Thanks for making incorrect assumptions. I hadn't "worked it out",
I merely reported it and stated the bleeding obvious.

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

2023-02-20 19:17:45

by Russell King (Oracle)

[permalink] [raw]
Subject: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Mon, Feb 20, 2023 at 06:29:33PM +0000, Marc Zyngier wrote:
> Lockdep also reports[1] a possible circular locking dependency between
> phy_attach_direct() and rtnetlink_rcv_msg(), which looks interesting.
>
> [1] https://paste.debian.net/1271454/

Adding Andrew, but really this should be in a separate thread, since
this has nothing to do with MSI.

It looks like the open path takes the RTNL lock followed by the phydev
lock, whereas the PHY probe path takes the phydev lock, and then if
there's a SFP attached to the PHY, we end up taking the RTNL lock.
That's going to be utterly horrid to try and solve, and isn't going
to be quick to fix.

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

2023-02-20 19:44:02

by Andrew Lunn

[permalink] [raw]
Subject: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking to msi_insert_desc()

On Mon, Feb 20, 2023 at 07:17:11PM +0000, Russell King (Oracle) wrote:
> On Mon, Feb 20, 2023 at 06:29:33PM +0000, Marc Zyngier wrote:
> > Lockdep also reports[1] a possible circular locking dependency between
> > phy_attach_direct() and rtnetlink_rcv_msg(), which looks interesting.
> >
> > [1] https://paste.debian.net/1271454/
>
> Adding Andrew, but really this should be in a separate thread, since
> this has nothing to do with MSI.
>
> It looks like the open path takes the RTNL lock followed by the phydev
> lock, whereas the PHY probe path takes the phydev lock, and then if
> there's a SFP attached to the PHY, we end up taking the RTNL lock.
> That's going to be utterly horrid to try and solve, and isn't going
> to be quick to fix.

What are we actually trying to protect in phy_probe() when we take the
lock and call phydev->drv->probe(phydev) ?

The main purpose of the lock is to protect members of phydev, such as
link, speed, duplex, which can be inconsistent when the lock is not
held. But the PHY is not attached to a MAC yet, so a MAC cannot be
using it, and those members of phydev are not valid yet anyway.

The lock also prevents parallel operation on the device by phylib, but
i cannot think of how that could happen at this early stage in the
life of the PHY.

So maybe we can move the mutex_lock() after the call to
phydev->drv->probe()?

Andrew

2023-02-20 20:16:11

by Russell King (Oracle)

[permalink] [raw]
Subject: Re: phylib locking (was: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking) to msi_insert_desc()

[dropped most on the Cc as this has probably deviated off topic for
them... and changed the subject]

On Mon, Feb 20, 2023 at 08:43:44PM +0100, Andrew Lunn wrote:
> On Mon, Feb 20, 2023 at 07:17:11PM +0000, Russell King (Oracle) wrote:
> > On Mon, Feb 20, 2023 at 06:29:33PM +0000, Marc Zyngier wrote:
> > > Lockdep also reports[1] a possible circular locking dependency between
> > > phy_attach_direct() and rtnetlink_rcv_msg(), which looks interesting.
> > >
> > > [1] https://paste.debian.net/1271454/
> >
> > Adding Andrew, but really this should be in a separate thread, since
> > this has nothing to do with MSI.
> >
> > It looks like the open path takes the RTNL lock followed by the phydev
> > lock, whereas the PHY probe path takes the phydev lock, and then if
> > there's a SFP attached to the PHY, we end up taking the RTNL lock.
> > That's going to be utterly horrid to try and solve, and isn't going
> > to be quick to fix.
>
> What are we actually trying to protect in phy_probe() when we take the
> lock and call phydev->drv->probe(phydev) ?
>
> The main purpose of the lock is to protect members of phydev, such as
> link, speed, duplex, which can be inconsistent when the lock is not
> held. But the PHY is not attached to a MAC yet, so a MAC cannot be
> using it, and those members of phydev are not valid yet anyway.
>
> The lock also prevents parallel operation on the device by phylib, but
> i cannot think of how that could happen at this early stage in the
> life of the PHY.
>
> So maybe we can move the mutex_lock() after the call to
> phydev->drv->probe()?

That's what I've been thinking too - I dug back in the history, and
it was a spin_lock_bh(), and before that it was a spin_lock().

The patch that converted it to a spin_lock_bh() is a brilliant
example of a poor commit message "Lock debugging finds a problem"
but doesn't say _what_ the problem was! Going back further still, the
spin_lock() was there from the very beginnings of PHYLIB. So the
reasoning for having a lock here has been lost in the depths of time.

The lock certainly doesn't prevent any interaction with
phy_attach_direct(), so it seems to be utterly pointless to take
the lock in the probe() function.

So yes, I agree, we can move the lock - and I wonder whether we
could just get rid of it completely in phy_probe().

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

2023-02-21 14:57:55

by Russell King (Oracle)

[permalink] [raw]
Subject: Re: phylib locking (was: Re: [REGRESSION] Re: [patch V3 09/33] genirq/msi: Add range checking) to msi_insert_desc()

On Mon, Feb 20, 2023 at 08:15:59PM +0000, Russell King (Oracle) wrote:
> [dropped most on the Cc as this has probably deviated off topic for
> them... and changed the subject]
>
> On Mon, Feb 20, 2023 at 08:43:44PM +0100, Andrew Lunn wrote:
> > On Mon, Feb 20, 2023 at 07:17:11PM +0000, Russell King (Oracle) wrote:
> > > On Mon, Feb 20, 2023 at 06:29:33PM +0000, Marc Zyngier wrote:
> > > > Lockdep also reports[1] a possible circular locking dependency between
> > > > phy_attach_direct() and rtnetlink_rcv_msg(), which looks interesting.
> > > >
> > > > [1] https://paste.debian.net/1271454/
> > >
> > > Adding Andrew, but really this should be in a separate thread, since
> > > this has nothing to do with MSI.
> > >
> > > It looks like the open path takes the RTNL lock followed by the phydev
> > > lock, whereas the PHY probe path takes the phydev lock, and then if
> > > there's a SFP attached to the PHY, we end up taking the RTNL lock.
> > > That's going to be utterly horrid to try and solve, and isn't going
> > > to be quick to fix.
> >
> > What are we actually trying to protect in phy_probe() when we take the
> > lock and call phydev->drv->probe(phydev) ?
> >
> > The main purpose of the lock is to protect members of phydev, such as
> > link, speed, duplex, which can be inconsistent when the lock is not
> > held. But the PHY is not attached to a MAC yet, so a MAC cannot be
> > using it, and those members of phydev are not valid yet anyway.
> >
> > The lock also prevents parallel operation on the device by phylib, but
> > i cannot think of how that could happen at this early stage in the
> > life of the PHY.
> >
> > So maybe we can move the mutex_lock() after the call to
> > phydev->drv->probe()?
>
> That's what I've been thinking too - I dug back in the history, and
> it was a spin_lock_bh(), and before that it was a spin_lock().
>
> The patch that converted it to a spin_lock_bh() is a brilliant
> example of a poor commit message "Lock debugging finds a problem"
> but doesn't say _what_ the problem was! Going back further still, the
> spin_lock() was there from the very beginnings of PHYLIB. So the
> reasoning for having a lock here has been lost in the depths of time.
>
> The lock certainly doesn't prevent any interaction with
> phy_attach_direct(), so it seems to be utterly pointless to take
> the lock in the probe() function.
>
> So yes, I agree, we can move the lock - and I wonder whether we
> could just get rid of it completely in phy_probe().

Thinking about this more, I think taking phydev->lock in both
phy_probe() and phy_remove() are both entirely pointless, so I think
we should remove both and be done with this. As I note above, it does
nothing to stop a race between phy_attach_direct() and phy_probe() or
even phy_remove(). So, I think this is entirely sensible:

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 71becceb8764..b46a074b27e4 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3098,8 +3098,6 @@ static int phy_probe(struct device *dev)
if (phydrv->flags & PHY_IS_INTERNAL)
phydev->is_internal = true;

- mutex_lock(&phydev->lock);
-
/* Deassert the reset signal */
phy_device_reset(phydev, 0);

@@ -3173,8 +3171,6 @@ static int phy_probe(struct device *dev)
if (err)
phy_device_reset(phydev, 1);

- mutex_unlock(&phydev->lock);
-
return err;
}

@@ -3184,9 +3180,7 @@ static int phy_remove(struct device *dev)

cancel_delayed_work_sync(&phydev->state_queue);

- mutex_lock(&phydev->lock);
phydev->state = PHY_DOWN;
- mutex_unlock(&phydev->lock);

sfp_bus_del_upstream(phydev->sfp_bus);
phydev->sfp_bus = NULL;

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!