Originally, DMAR fault IRQ was the only source of interrupts for VT-d
itself, thus simply named 'irq'. Newer interrupt sources were added later
for page requests and perfmon with proper names, i.e. pr_irq and perf_irq.
Rename the fault IRQ to reflect its usage. This avoids confusion when
programming MSI messages for the three possible sources.
Signed-off-by: Jacob Pan <[email protected]>
---
drivers/iommu/intel/dmar.c | 16 ++++++++--------
drivers/iommu/intel/iommu.h | 2 +-
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 36d7427b1202..ab325af93f71 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1175,15 +1175,15 @@ static void free_iommu(struct intel_iommu *iommu)
free_iommu_pmu(iommu);
- if (iommu->irq) {
+ if (iommu->fault_irq) {
if (iommu->pr_irq) {
free_irq(iommu->pr_irq, iommu);
dmar_free_hwirq(iommu->pr_irq);
iommu->pr_irq = 0;
}
- free_irq(iommu->irq, iommu);
- dmar_free_hwirq(iommu->irq);
- iommu->irq = 0;
+ free_irq(iommu->fault_irq, iommu);
+ dmar_free_hwirq(iommu->fault_irq);
+ iommu->fault_irq = 0;
}
if (iommu->qi) {
@@ -1918,7 +1918,7 @@ static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
static inline int dmar_msi_reg(struct intel_iommu *iommu, int irq)
{
- if (iommu->irq == irq)
+ if (iommu->fault_irq == irq)
return DMAR_FECTL_REG;
else if (iommu->pr_irq == irq)
return DMAR_PECTL_REG;
@@ -2105,12 +2105,12 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
/*
* Check if the fault interrupt is already initialized.
*/
- if (iommu->irq)
+ if (iommu->fault_irq)
return 0;
irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
if (irq > 0) {
- iommu->irq = irq;
+ iommu->fault_irq = irq;
} else {
pr_err("No free IRQ vectors\n");
return -EINVAL;
@@ -2143,7 +2143,7 @@ int __init enable_drhd_fault_handling(void)
/*
* Clear any previous faults.
*/
- dmar_fault(iommu->irq, iommu);
+ dmar_fault(iommu->fault_irq, iommu);
fault_status = readl(iommu->reg + DMAR_FSTS_REG);
writel(fault_status, iommu->reg + DMAR_FSTS_REG);
}
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 404d2476a877..deebd4817d27 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -699,7 +699,7 @@ struct intel_iommu {
int seq_id; /* sequence id of the iommu */
int agaw; /* agaw of this iommu */
int msagaw; /* max sagaw of this iommu */
- unsigned int irq, pr_irq, perf_irq;
+ unsigned int fault_irq, pr_irq, perf_irq;
u16 segment; /* PCI segment# */
unsigned char name[13]; /* Device Name */
--
2.25.1
DMAR fault interrupt is used for per-IOMMU unrecoverable fault reporting,
it occurs only if there is a kernel programming error or serious hardware
failure. In other words, they should never occur under normal circumstances.
However, we are permanently occupying IRQ vectors per DMAR unit. On a
dual-socket Saphire Rapids system, DMAR fault interrupts can consume 16
vectors on BSP, which can lead to vector exhaustion. The effort to spread
vectors to each socket only partially alleviates the problem.
This patch leverages the shared IRQ mechanism such that only a single IRQ
vector is consumed for all the DMAR units on a system. When any DMAR faults
occur, all DMAR handlers are called to check their own fault records.
After this patch /proc/interrupts will show the list of DMAR units that share
the fault interrupt, e.g.
24 DMAR-MSI 14-edge dmar14, dmar13, dmar12, dmar11, dmar10, dmar9,
dmar8, dmar7, dmar6, dmar5, dmar4, dmar3, dmar2, dmar1, dmar0, dmar15
Link: https://lore.kernel.org/lkml/20240325115638.342716e5@jacob-builder/t/#mc08892e405456428773bcc3b0bbe8971886c5ab9
Reported-by: Dimitri Sivanich <[email protected]>
Originally-by: Thomas Gleixner <[email protected]>
Signed-off-by: Jacob Pan <[email protected]>
---
drivers/iommu/intel/dmar.c | 71 +++++++++++++++++++++++++++++--------
drivers/iommu/intel/iommu.h | 1 +
2 files changed, 57 insertions(+), 15 deletions(-)
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index ab325af93f71..cf68464b3404 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -1182,7 +1182,6 @@ static void free_iommu(struct intel_iommu *iommu)
iommu->pr_irq = 0;
}
free_irq(iommu->fault_irq, iommu);
- dmar_free_hwirq(iommu->fault_irq);
iommu->fault_irq = 0;
}
@@ -1956,9 +1955,8 @@ void dmar_msi_mask(struct irq_data *data)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
-void dmar_msi_write(int irq, struct msi_msg *msg)
+static void dmar_msi_write_msg(struct intel_iommu *iommu, int irq, struct msi_msg *msg)
{
- struct intel_iommu *iommu = irq_get_handler_data(irq);
int reg = dmar_msi_reg(iommu, irq);
unsigned long flag;
@@ -1969,6 +1967,13 @@ void dmar_msi_write(int irq, struct msi_msg *msg)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+ struct intel_iommu *iommu = irq_get_handler_data(irq);
+
+ dmar_msi_write_msg(iommu, irq, msg);
+}
+
void dmar_msi_read(int irq, struct msi_msg *msg)
{
struct intel_iommu *iommu = irq_get_handler_data(irq);
@@ -2098,27 +2103,63 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
return IRQ_HANDLED;
}
+static inline void dmar_fault_irq_unmask(struct intel_iommu *iommu)
+{
+ unsigned long flag;
+
+ raw_spin_lock_irqsave(&iommu->register_lock, flag);
+ writel(0, iommu->reg + DMAR_FECTL_REG);
+ raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
int dmar_set_interrupt(struct intel_iommu *iommu)
{
- int irq, ret;
+ static int dmar_irq;
+ int ret;
- /*
- * Check if the fault interrupt is already initialized.
- */
+ /* Don't initialize it twice for a given iommu */
if (iommu->fault_irq)
return 0;
+ /*
+ * There is one shared interrupt for all IOMMUs to prevent vector
+ * exhaustion.
+ */
+ if (!dmar_irq) {
+ int irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
- irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
- if (irq > 0) {
- iommu->fault_irq = irq;
+ if (irq <= 0) {
+ pr_err("No free IRQ vectors\n");
+ return -EINVAL;
+ }
+ dmar_irq = irq;
+ iommu->fault_irq = dmar_irq;
+ iommu->flags |= VTD_FLAG_FAULT_IRQ_OWNER;
} else {
- pr_err("No free IRQ vectors\n");
- return -EINVAL;
- }
+ struct msi_msg msg;
- ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
- if (ret)
+ /*
+ * Get the MSI message from the shared interrupt and write
+ * it to the iommu MSI registers. Must assign fault_irq to get
+ * the MSI register offset.
+ */
+ iommu->fault_irq = dmar_irq;
+ dmar_msi_read(dmar_irq, &msg);
+ dmar_msi_write_msg(iommu, dmar_irq, &msg);
+ }
+ ret = request_irq(dmar_irq, dmar_fault, IRQF_NO_THREAD | IRQF_SHARED | IRQF_NOBALANCING, iommu->name, iommu);
+ if (ret) {
pr_err("Can't request irq\n");
+ return ret;
+ }
+
+ /*
+ * Only the owner IOMMU of the shared IRQ has its fault event
+ * interrupt unmasked after request_irq(), the rest are explicitly
+ * unmasked.
+ */
+ if (!(iommu->flags & VTD_FLAG_FAULT_IRQ_OWNER))
+ dmar_fault_irq_unmask(iommu);
+
return ret;
}
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index deebd4817d27..128f6cdaebac 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -536,6 +536,7 @@ enum {
#define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0)
#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1)
#define VTD_FLAG_SVM_CAPABLE (1 << 2)
+#define VTD_FLAG_FAULT_IRQ_OWNER (1 << 3)
#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu) (sm_supported(iommu) && \
--
2.25.1
> From: Jacob Pan <[email protected]>
> Sent: Thursday, April 4, 2024 7:46 AM
>
> DMAR fault interrupt is used for per-IOMMU unrecoverable fault reporting,
> it occurs only if there is a kernel programming error or serious hardware
> failure. In other words, they should never occur under normal circumstances.
this is not accurate. When a device is assigned to a malicious guest then
it's not unusual to observe faults.
in this context you probably meant that it's not a performance path hence
sharing the vector is acceptable.
>
> @@ -1182,7 +1182,6 @@ static void free_iommu(struct intel_iommu
> *iommu)
> iommu->pr_irq = 0;
> }
> free_irq(iommu->fault_irq, iommu);
> - dmar_free_hwirq(iommu->fault_irq);
You still want to free the vector for the iommu which first gets the
vector allocated.
> @@ -1956,9 +1955,8 @@ void dmar_msi_mask(struct irq_data *data)
> raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
> }
>
> -void dmar_msi_write(int irq, struct msi_msg *msg)
> +static void dmar_msi_write_msg(struct intel_iommu *iommu, int irq, struct
> msi_msg *msg)
> {
what about iommu_msi_write_msg() to match the first parameter?
otherwise it leads to a slightly circled calltrace:
dmar_msi_write_msg()
dmar_msi_write()
dmar_msi_write_msg()
> +
> + /*
> + * Only the owner IOMMU of the shared IRQ has its fault event
> + * interrupt unmasked after request_irq(), the rest are explicitly
> + * unmasked.
> + */
> + if (!(iommu->flags & VTD_FLAG_FAULT_IRQ_OWNER))
> + dmar_fault_irq_unmask(iommu);
> +
em there is a problem in dmar_msi_mask() and dmar_msi_mask()
which only touches the owner IOMMU. With this shared vector
approach we should mask/unmask all IOMMU's together.
Hi Kevin,
On Mon, 8 Apr 2024 08:48:54 +0000, "Tian, Kevin" <[email protected]>
wrote:
> > From: Jacob Pan <[email protected]>
> > Sent: Thursday, April 4, 2024 7:46 AM
> >
> > DMAR fault interrupt is used for per-IOMMU unrecoverable fault
> > reporting, it occurs only if there is a kernel programming error or
> > serious hardware failure. In other words, they should never occur under
> > normal circumstances.
>
> this is not accurate. When a device is assigned to a malicious guest then
> it's not unusual to observe faults.
>
Right, a malicious guest kernel could cause unrecoverable faults, e.g.
wrong privilege.
> in this context you probably meant that it's not a performance path hence
> sharing the vector is acceptable.
>
Yes.
> >
> > @@ -1182,7 +1182,6 @@ static void free_iommu(struct intel_iommu
> > *iommu)
> > iommu->pr_irq = 0;
> > }
> > free_irq(iommu->fault_irq, iommu);
> > - dmar_free_hwirq(iommu->fault_irq);
>
> You still want to free the vector for the iommu which first gets the
> vector allocated.
>
I think we always want to keep this vector since the system always needs
one vector to share. We will never offline all the IOMMUs, right?
> > @@ -1956,9 +1955,8 @@ void dmar_msi_mask(struct irq_data *data)
> > raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
> > }
> >
> > -void dmar_msi_write(int irq, struct msi_msg *msg)
> > +static void dmar_msi_write_msg(struct intel_iommu *iommu, int irq,
> > struct msi_msg *msg)
> > {
>
> what about iommu_msi_write_msg() to match the first parameter?
>
> otherwise it leads to a slightly circled calltrace:
> dmar_msi_write_msg()
> dmar_msi_write()
> dmar_msi_write_msg()
>
Good point, will do.
> > +
> > + /*
> > + * Only the owner IOMMU of the shared IRQ has its fault event
> > + * interrupt unmasked after request_irq(), the rest are
> > explicitly
> > + * unmasked.
> > + */
> > + if (!(iommu->flags & VTD_FLAG_FAULT_IRQ_OWNER))
> > + dmar_fault_irq_unmask(iommu);
> > +
>
> em there is a problem in dmar_msi_mask() and dmar_msi_mask()
> which only touches the owner IOMMU. With this shared vector
> approach we should mask/unmask all IOMMU's together.
I thought about this as well, in addition to fault_irq,
dmar_msi_mask/unmask() are used for other DMAR irqs, page request and
perfmon. So we need a special case for fault_irq there, it is not pretty.
I added a special case here in this patch, thinking we never mask the
fault_irq since we need to cover the lifetime of the system. I have looked
at:
1.IOMMU suspend/resume, no mask/unmask
2.IRQ migration, added IRQF_NOBALANCING
maybe I missed some cases?
Thanks,
Jacob
Hi Jacob,
On Mon, 8 Apr 2024 09:05:56 -0700, Jacob Pan
<[email protected]> wrote:
> Hi Kevin,
>
> On Mon, 8 Apr 2024 08:48:54 +0000, "Tian, Kevin" <[email protected]>
> wrote:
>
> > > From: Jacob Pan <[email protected]>
> > > Sent: Thursday, April 4, 2024 7:46 AM
> > >
> > > DMAR fault interrupt is used for per-IOMMU unrecoverable fault
> > > reporting, it occurs only if there is a kernel programming error or
> > > serious hardware failure. In other words, they should never occur
> > > under normal circumstances.
> >
> > this is not accurate. When a device is assigned to a malicious guest
> > then it's not unusual to observe faults.
> >
> Right, a malicious guest kernel could cause unrecoverable faults, e.g.
> wrong privilege.
>
> > in this context you probably meant that it's not a performance path
> > hence sharing the vector is acceptable.
> >
> Yes.
> > >
> > > @@ -1182,7 +1182,6 @@ static void free_iommu(struct intel_iommu
> > > *iommu)
> > > iommu->pr_irq = 0;
> > > }
> > > free_irq(iommu->fault_irq, iommu);
> > > - dmar_free_hwirq(iommu->fault_irq);
> >
> > You still want to free the vector for the iommu which first gets the
> > vector allocated.
> >
> I think we always want to keep this vector since the system always needs
> one vector to share. We will never offline all the IOMMUs, right?
>
> > > @@ -1956,9 +1955,8 @@ void dmar_msi_mask(struct irq_data *data)
> > > raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
> > > }
> > >
> > > -void dmar_msi_write(int irq, struct msi_msg *msg)
> > > +static void dmar_msi_write_msg(struct intel_iommu *iommu, int irq,
> > > struct msi_msg *msg)
> > > {
> >
> > what about iommu_msi_write_msg() to match the first parameter?
> >
> > otherwise it leads to a slightly circled calltrace:
> > dmar_msi_write_msg()
> > dmar_msi_write()
> > dmar_msi_write_msg()
> >
> Good point, will do.
>
> > > +
> > > + /*
> > > + * Only the owner IOMMU of the shared IRQ has its fault event
> > > + * interrupt unmasked after request_irq(), the rest are
> > > explicitly
> > > + * unmasked.
> > > + */
> > > + if (!(iommu->flags & VTD_FLAG_FAULT_IRQ_OWNER))
> > > + dmar_fault_irq_unmask(iommu);
> > > +
> >
> > em there is a problem in dmar_msi_mask() and dmar_msi_mask()
> > which only touches the owner IOMMU. With this shared vector
> > approach we should mask/unmask all IOMMU's together.
> I thought about this as well, in addition to fault_irq,
> dmar_msi_mask/unmask() are used for other DMAR irqs, page request and
> perfmon. So we need a special case for fault_irq there, it is not pretty.
>
> I added a special case here in this patch, thinking we never mask the
> fault_irq since we need to cover the lifetime of the system. I have looked
> at:
> 1.IOMMU suspend/resume, no mask/unmask
Actually, we do call mask/unmask in suspend/unmask noirq phase.
And DMAR-MSI chip has IRQCHIP_SKIP_SET_WAKE flag.
So you are right, I am missing this case where non-owner IOMMU's fault_irqs
are not masked/unmasked.
> 2.IRQ migration, added IRQF_NOBALANCING
>
> maybe I missed some cases?
>
>
> Thanks,
>
> Jacob
Thanks,
Jacob
> From: Jacob Pan <[email protected]>
> Sent: Tuesday, April 9, 2024 1:39 AM
>
> Hi Jacob,
>
> On Mon, 8 Apr 2024 09:05:56 -0700, Jacob Pan
> <[email protected]> wrote:
>
> > Hi Kevin,
> >
> > On Mon, 8 Apr 2024 08:48:54 +0000, "Tian, Kevin" <[email protected]>
> > wrote:
> >
> > > > From: Jacob Pan <[email protected]>
> > > > Sent: Thursday, April 4, 2024 7:46 AM
> > > >
> > > > @@ -1182,7 +1182,6 @@ static void free_iommu(struct intel_iommu
> > > > *iommu)
> > > > iommu->pr_irq = 0;
> > > > }
> > > > free_irq(iommu->fault_irq, iommu);
> > > > - dmar_free_hwirq(iommu->fault_irq);
> > >
> > > You still want to free the vector for the iommu which first gets the
> > > vector allocated.
> > >
> > I think we always want to keep this vector since the system always needs
> > one vector to share. We will never offline all the IOMMUs, right?
Not about offline. Just about the common rule of cleaning up a resource
when all of its users are destroyed.
> > > > +
> > > > + /*
> > > > + * Only the owner IOMMU of the shared IRQ has its fault event
> > > > + * interrupt unmasked after request_irq(), the rest are
> > > > explicitly
> > > > + * unmasked.
> > > > + */
> > > > + if (!(iommu->flags & VTD_FLAG_FAULT_IRQ_OWNER))
> > > > + dmar_fault_irq_unmask(iommu);
> > > > +
> > >
> > > em there is a problem in dmar_msi_mask() and dmar_msi_mask()
> > > which only touches the owner IOMMU. With this shared vector
> > > approach we should mask/unmask all IOMMU's together.
> > I thought about this as well, in addition to fault_irq,
> > dmar_msi_mask/unmask() are used for other DMAR irqs, page request and
> > perfmon. So we need a special case for fault_irq there, it is not pretty.
yes, that is the part which I don't really like.
> >
> > I added a special case here in this patch, thinking we never mask the
> > fault_irq since we need to cover the lifetime of the system. I have looked
> > at:
> > 1.IOMMU suspend/resume, no mask/unmask
> Actually, we do call mask/unmask in suspend/unmask noirq phase.
> And DMAR-MSI chip has IRQCHIP_SKIP_SET_WAKE flag.
>
> So you are right, I am missing this case where non-owner IOMMU's fault_irqs
> are not masked/unmasked.
>
and it's not good to code a mask/unmask callback upon fixed assumptions
on when irq core may call mask/unmask as the latter part can change
w/o noting the broken assumption in such callback.