2020-10-24 21:38:10

by David Woodhouse

[permalink] [raw]
Subject: [PATCH v3 19/35] x86/io_apic: Cleanup trigger/polarity helpers

From: Thomas Gleixner <[email protected]>

'trigger' and 'polarity' are used throughout the I/O-APIC code for handling
the trigger type (edge/level) and the active low/high configuration. While
there are defines for initializing these variables and struct members, they
are not used consequently and the meaning of 'trigger' and 'polarity' is
opaque and confusing at best.

Rename them to 'is_level' and 'active_low' and make them boolean in various
structs so it's entirely clear what the meaning is.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: David Woodhouse <[email protected]>
---
arch/x86/include/asm/hw_irq.h | 6 +-
arch/x86/kernel/apic/io_apic.c | 244 +++++++++++++---------------
arch/x86/pci/intel_mid_pci.c | 8 +-
drivers/iommu/amd/iommu.c | 10 +-
drivers/iommu/intel/irq_remapping.c | 9 +-
5 files changed, 130 insertions(+), 147 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a4aeeaace040..517847a94dbe 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -47,9 +47,9 @@ enum irq_alloc_type {
struct ioapic_alloc_info {
int pin;
int node;
- u32 trigger : 1;
- u32 polarity : 1;
- u32 valid : 1;
+ u32 is_level : 1;
+ u32 active_low : 1;
+ u32 valid : 1;
struct IO_APIC_route_entry *entry;
};

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c6d92d2570d0..24a7bba7cbf4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,12 +89,12 @@ struct irq_pin_list {
};

struct mp_chip_data {
- struct list_head irq_2_pin;
- struct IO_APIC_route_entry entry;
- int trigger;
- int polarity;
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
u32 count;
- bool isa_irq;
};

struct mp_ioapic_gsi {
@@ -745,44 +745,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}

-#ifdef CONFIG_EISA
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < nr_legacy_irqs()) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always active high edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (IOAPIC_EDGE)
-#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always active low level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
-#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
-
-static int irq_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;

@@ -791,90 +754,139 @@ static int irq_polarity(int idx)
*/
switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
case MP_IRQPOL_DEFAULT:
- /* conforms to spec, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- return default_ISA_polarity(idx);
- else
- return default_PCI_polarity(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
case MP_IRQPOL_ACTIVE_HIGH:
- return IOAPIC_POL_HIGH;
+ return false;
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
fallthrough;
case MP_IRQPOL_ACTIVE_LOW:
default: /* Pointless default required due to do gcc stupidity */
- return IOAPIC_POL_LOW;
+ return true;
}
}

#ifdef CONFIG_EISA
-static int eisa_irq_trigger(int idx, int bus, int trigger)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
+{
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return false;
+}
+
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_PCI:
case MP_BUS_ISA:
- return trigger;
+ return level;
case MP_BUS_EISA:
- return default_EISA_trigger(idx);
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
}
pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
- return IOAPIC_LEVEL;
+ return true;
}
#else
-static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+static inline int eisa_irq_is_level(int idx, int bus, bool level)
{
- return trigger;
+ return level;
}
#endif

-static int irq_trigger(int idx)
+static bool irq_is_level(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int trigger;
+ bool level;

/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
case MP_IRQTRIG_DEFAULT:
- /* conforms to spec, ie. bus-type dependent trigger mode */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to egde, ISA to level.
+ */
+ level = test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
- return eisa_irq_trigger(idx, bus, trigger);
+ return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:
- return IOAPIC_EDGE;
+ return false;
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
fallthrough;
case MP_IRQTRIG_LEVEL:
default: /* Pointless default required due to do gcc stupidity */
- return IOAPIC_LEVEL;
+ return true;
}
}

+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
+{
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
+}
+#endif
+
void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
int trigger, int polarity)
{
init_irq_alloc_info(info, NULL);
info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
info->ioapic.node = node;
- info->ioapic.trigger = trigger;
- info->ioapic.polarity = polarity;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
info->ioapic.valid = 1;
}

-#ifndef CONFIG_ACPI
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
-#endif
-
static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
struct irq_alloc_info *src,
u32 gsi, int ioapic_idx, int pin)
{
- int trigger, polarity;
+ bool level, pol_low;

copy_irq_alloc_info(dst, src);
dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
@@ -883,20 +895,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
dst->ioapic.valid = 1;
if (src && src->ioapic.valid) {
dst->ioapic.node = src->ioapic.node;
- dst->ioapic.trigger = src->ioapic.trigger;
- dst->ioapic.polarity = src->ioapic.polarity;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
} else {
dst->ioapic.node = NUMA_NO_NODE;
- if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
- dst->ioapic.trigger = trigger;
- dst->ioapic.polarity = polarity;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
} else {
/*
* PCI interrupts are always active low level
* triggered.
*/
- dst->ioapic.trigger = IOAPIC_LEVEL;
- dst->ioapic.polarity = IOAPIC_POL_LOW;
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
}
}
@@ -906,12 +918,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}

-static void mp_register_handler(unsigned int irq, unsigned long trigger)
+static void mp_register_handler(unsigned int irq, bool level)
{
irq_flow_handler_t hdl;
bool fasteoi;

- if (trigger) {
+ if (level) {
irq_set_status_flags(irq, IRQ_LEVEL);
fasteoi = true;
} else {
@@ -933,14 +945,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
- if (info->ioapic.trigger != data->trigger)
- mp_register_handler(irq, info->ioapic.trigger);
- data->entry.trigger = data->trigger = info->ioapic.trigger;
- data->entry.polarity = data->polarity = info->ioapic.polarity;
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.trigger = data->is_level = info->ioapic.is_level;
+ data->entry.polarity = data->active_low = info->ioapic.active_low;
}

- return data->trigger == info->ioapic.trigger &&
- data->polarity == info->ioapic.polarity;
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}

static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
@@ -2179,9 +2191,9 @@ static inline void __init check_timer(void)
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
unmask_ioapic_irq(irq_get_irq_data(0));
}
irq_domain_deactivate_irq(irq_data);
@@ -2588,30 +2600,6 @@ static int io_apic_get_version(int ioapic)
return reg_01.bits.version;
}

-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
-{
- int ioapic, pin, idx;
-
- if (skip_ioapic_setup)
- return -1;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -1;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- if (pin < 0)
- return -1;
-
- idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- return -1;
-
- *trigger = irq_trigger(idx);
- *polarity = irq_polarity(idx);
- return 0;
-}
-
/*
* This function updates target affinity of IOAPIC interrupts to include
* the CPUs which came online during SMP bringup.
@@ -2935,13 +2923,13 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
struct irq_alloc_info *info)
{
if (info && info->ioapic.valid) {
- data->trigger = info->ioapic.trigger;
- data->polarity = info->ioapic.polarity;
- } else if (acpi_get_override_irq(gsi, &data->trigger,
- &data->polarity) < 0) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level,
+ &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
- data->trigger = IOAPIC_LEVEL;
- data->polarity = IOAPIC_POL_LOW;
+ data->is_level = true;
+ data->active_low = true;
}
}

@@ -2953,16 +2941,13 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
entry->dest_mode = apic->dest_mode_logical;
entry->dest = cfg->dest_apicid;
entry->vector = cfg->vector;
- entry->trigger = data->trigger;
- entry->polarity = data->polarity;
+ entry->trigger = data->is_level;
+ entry->polarity = data->active_low;
/*
* Mask level triggered irqs. Edge triggered irqs are masked
* by the irq core code in case they fire.
*/
- if (data->trigger == IOAPIC_LEVEL)
- entry->mask = IOAPIC_MASKED;
- else
- entry->mask = IOAPIC_UNMASKED;
+ entry->mask = data->is_level;
}

int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
@@ -3010,7 +2995,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
local_irq_save(flags);
if (info->ioapic.entry)
mp_setup_entry(cfg, data, info->ioapic.entry);
- mp_register_handler(virq, data->trigger);
+ mp_register_handler(virq, data->is_level);
if (virq < nr_legacy_irqs())
legacy_pic->mask(virq);
local_irq_restore(flags);
@@ -3018,7 +3003,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
apic_printk(APIC_VERBOSE, KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
- virq, data->trigger, data->polarity, cfg->dest_apicid);
+ virq, data->is_level, data->active_low,
+ cfg->dest_apicid);

return 0;
}
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 00c62115f39c..3709afae7c77 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -214,7 +214,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
struct irq_alloc_info info;
- int polarity;
+ bool polarity_low;
int ret;
u8 gsi;

@@ -229,7 +229,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)

switch (intel_mid_identify_cpu()) {
case INTEL_MID_CPU_CHIP_TANGIER:
- polarity = IOAPIC_POL_HIGH;
+ polarity_low = false;

/* Special treatment for IRQ0 */
if (gsi == 0) {
@@ -251,11 +251,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
}
break;
default:
- polarity = IOAPIC_POL_LOW;
+ polarity_low = true;
break;
}

- ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
+ ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low);

/*
* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 473de0920b64..b0e5210e53b2 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3687,13 +3687,11 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
entry = info->ioapic.entry;
info->ioapic.entry = NULL;
memset(entry, 0, sizeof(*entry));
- entry->vector = index;
- entry->mask = 0;
- entry->trigger = info->ioapic.trigger;
- entry->polarity = info->ioapic.polarity;
+ entry->vector = index;
+ entry->trigger = info->ioapic.is_level;
+ entry->polarity = info->ioapic.active_low;
/* Mask level triggered irqs. */
- if (info->ioapic.trigger)
- entry->mask = 1;
+ entry->mask = info->ioapic.is_level;
break;

case X86_IRQ_ALLOC_TYPE_HPET:
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 30269b738fa5..54ca69333445 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1306,11 +1306,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
* irq handler will do the explicit EOI to the io-apic.
*/
entry->vector = info->ioapic.pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = info->ioapic.trigger;
- entry->polarity = info->ioapic.polarity;
- if (info->ioapic.trigger)
- entry->mask = 1; /* Mask level triggered irqs. */
+ entry->trigger = info->ioapic.is_level;
+ entry->polarity = info->ioapic.active_low;
+ /* Mask level triggered irqs. */
+ entry->mask = info->ioapic.is_level;
break;

case X86_IRQ_ALLOC_TYPE_HPET:
--
2.26.2


2020-10-29 12:19:53

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

The following commit has been merged into the x86/apic branch of tip:

Commit-ID: a27dca645d2c0f31abb7858aa0e10b2fa0f2f659
Gitweb: https://git.kernel.org/tip/a27dca645d2c0f31abb7858aa0e10b2fa0f2f659
Author: Thomas Gleixner <[email protected]>
AuthorDate: Sat, 24 Oct 2020 22:35:19 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 28 Oct 2020 20:26:26 +01:00

x86/io_apic: Cleanup trigger/polarity helpers

'trigger' and 'polarity' are used throughout the I/O-APIC code for handling
the trigger type (edge/level) and the active low/high configuration. While
there are defines for initializing these variables and struct members, they
are not used consequently and the meaning of 'trigger' and 'polarity' is
opaque and confusing at best.

Rename them to 'is_level' and 'active_low' and make them boolean in various
structs so it's entirely clear what the meaning is.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
arch/x86/include/asm/hw_irq.h | 6 +-
arch/x86/kernel/apic/io_apic.c | 244 ++++++++++++---------------
arch/x86/pci/intel_mid_pci.c | 8 +-
drivers/iommu/amd/iommu.c | 10 +-
drivers/iommu/intel/irq_remapping.c | 9 +-
5 files changed, 130 insertions(+), 147 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a4aeeaa..517847a 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -47,9 +47,9 @@ enum irq_alloc_type {
struct ioapic_alloc_info {
int pin;
int node;
- u32 trigger : 1;
- u32 polarity : 1;
- u32 valid : 1;
+ u32 is_level : 1;
+ u32 active_low : 1;
+ u32 valid : 1;
struct IO_APIC_route_entry *entry;
};

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c6d92d2..24a7bba 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,12 +89,12 @@ struct irq_pin_list {
};

struct mp_chip_data {
- struct list_head irq_2_pin;
- struct IO_APIC_route_entry entry;
- int trigger;
- int polarity;
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
u32 count;
- bool isa_irq;
};

struct mp_ioapic_gsi {
@@ -745,44 +745,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}

-#ifdef CONFIG_EISA
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < nr_legacy_irqs()) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always active high edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (IOAPIC_EDGE)
-#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always active low level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
-#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
-
-static int irq_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;

@@ -791,90 +754,139 @@ static int irq_polarity(int idx)
*/
switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
case MP_IRQPOL_DEFAULT:
- /* conforms to spec, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- return default_ISA_polarity(idx);
- else
- return default_PCI_polarity(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
case MP_IRQPOL_ACTIVE_HIGH:
- return IOAPIC_POL_HIGH;
+ return false;
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
fallthrough;
case MP_IRQPOL_ACTIVE_LOW:
default: /* Pointless default required due to do gcc stupidity */
- return IOAPIC_POL_LOW;
+ return true;
}
}

#ifdef CONFIG_EISA
-static int eisa_irq_trigger(int idx, int bus, int trigger)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
+{
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return false;
+}
+
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_PCI:
case MP_BUS_ISA:
- return trigger;
+ return level;
case MP_BUS_EISA:
- return default_EISA_trigger(idx);
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
}
pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
- return IOAPIC_LEVEL;
+ return true;
}
#else
-static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+static inline int eisa_irq_is_level(int idx, int bus, bool level)
{
- return trigger;
+ return level;
}
#endif

-static int irq_trigger(int idx)
+static bool irq_is_level(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int trigger;
+ bool level;

/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
case MP_IRQTRIG_DEFAULT:
- /* conforms to spec, ie. bus-type dependent trigger mode */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to egde, ISA to level.
+ */
+ level = test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
- return eisa_irq_trigger(idx, bus, trigger);
+ return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:
- return IOAPIC_EDGE;
+ return false;
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
fallthrough;
case MP_IRQTRIG_LEVEL:
default: /* Pointless default required due to do gcc stupidity */
- return IOAPIC_LEVEL;
+ return true;
}
}

+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
+{
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
+}
+#endif
+
void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
int trigger, int polarity)
{
init_irq_alloc_info(info, NULL);
info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
info->ioapic.node = node;
- info->ioapic.trigger = trigger;
- info->ioapic.polarity = polarity;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
info->ioapic.valid = 1;
}

-#ifndef CONFIG_ACPI
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
-#endif
-
static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
struct irq_alloc_info *src,
u32 gsi, int ioapic_idx, int pin)
{
- int trigger, polarity;
+ bool level, pol_low;

copy_irq_alloc_info(dst, src);
dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
@@ -883,20 +895,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
dst->ioapic.valid = 1;
if (src && src->ioapic.valid) {
dst->ioapic.node = src->ioapic.node;
- dst->ioapic.trigger = src->ioapic.trigger;
- dst->ioapic.polarity = src->ioapic.polarity;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
} else {
dst->ioapic.node = NUMA_NO_NODE;
- if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
- dst->ioapic.trigger = trigger;
- dst->ioapic.polarity = polarity;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
} else {
/*
* PCI interrupts are always active low level
* triggered.
*/
- dst->ioapic.trigger = IOAPIC_LEVEL;
- dst->ioapic.polarity = IOAPIC_POL_LOW;
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
}
}
@@ -906,12 +918,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}

-static void mp_register_handler(unsigned int irq, unsigned long trigger)
+static void mp_register_handler(unsigned int irq, bool level)
{
irq_flow_handler_t hdl;
bool fasteoi;

- if (trigger) {
+ if (level) {
irq_set_status_flags(irq, IRQ_LEVEL);
fasteoi = true;
} else {
@@ -933,14 +945,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
- if (info->ioapic.trigger != data->trigger)
- mp_register_handler(irq, info->ioapic.trigger);
- data->entry.trigger = data->trigger = info->ioapic.trigger;
- data->entry.polarity = data->polarity = info->ioapic.polarity;
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.trigger = data->is_level = info->ioapic.is_level;
+ data->entry.polarity = data->active_low = info->ioapic.active_low;
}

- return data->trigger == info->ioapic.trigger &&
- data->polarity == info->ioapic.polarity;
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}

static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
@@ -2179,9 +2191,9 @@ static inline void __init check_timer(void)
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
unmask_ioapic_irq(irq_get_irq_data(0));
}
irq_domain_deactivate_irq(irq_data);
@@ -2588,30 +2600,6 @@ static int io_apic_get_version(int ioapic)
return reg_01.bits.version;
}

-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
-{
- int ioapic, pin, idx;
-
- if (skip_ioapic_setup)
- return -1;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -1;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- if (pin < 0)
- return -1;
-
- idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- return -1;
-
- *trigger = irq_trigger(idx);
- *polarity = irq_polarity(idx);
- return 0;
-}
-
/*
* This function updates target affinity of IOAPIC interrupts to include
* the CPUs which came online during SMP bringup.
@@ -2935,13 +2923,13 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
struct irq_alloc_info *info)
{
if (info && info->ioapic.valid) {
- data->trigger = info->ioapic.trigger;
- data->polarity = info->ioapic.polarity;
- } else if (acpi_get_override_irq(gsi, &data->trigger,
- &data->polarity) < 0) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level,
+ &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
- data->trigger = IOAPIC_LEVEL;
- data->polarity = IOAPIC_POL_LOW;
+ data->is_level = true;
+ data->active_low = true;
}
}

@@ -2953,16 +2941,13 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
entry->dest_mode = apic->dest_mode_logical;
entry->dest = cfg->dest_apicid;
entry->vector = cfg->vector;
- entry->trigger = data->trigger;
- entry->polarity = data->polarity;
+ entry->trigger = data->is_level;
+ entry->polarity = data->active_low;
/*
* Mask level triggered irqs. Edge triggered irqs are masked
* by the irq core code in case they fire.
*/
- if (data->trigger == IOAPIC_LEVEL)
- entry->mask = IOAPIC_MASKED;
- else
- entry->mask = IOAPIC_UNMASKED;
+ entry->mask = data->is_level;
}

int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
@@ -3010,7 +2995,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
local_irq_save(flags);
if (info->ioapic.entry)
mp_setup_entry(cfg, data, info->ioapic.entry);
- mp_register_handler(virq, data->trigger);
+ mp_register_handler(virq, data->is_level);
if (virq < nr_legacy_irqs())
legacy_pic->mask(virq);
local_irq_restore(flags);
@@ -3018,7 +3003,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
apic_printk(APIC_VERBOSE, KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
- virq, data->trigger, data->polarity, cfg->dest_apicid);
+ virq, data->is_level, data->active_low,
+ cfg->dest_apicid);

return 0;
}
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 24ca4ee..95e2e6b 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -215,7 +215,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
struct irq_alloc_info info;
- int polarity;
+ bool polarity_low;
int ret;
u8 gsi;

@@ -230,7 +230,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)

switch (intel_mid_identify_cpu()) {
case INTEL_MID_CPU_CHIP_TANGIER:
- polarity = IOAPIC_POL_HIGH;
+ polarity_low = false;

/* Special treatment for IRQ0 */
if (gsi == 0) {
@@ -252,11 +252,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
}
break;
default:
- polarity = IOAPIC_POL_LOW;
+ polarity_low = true;
break;
}

- ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
+ ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low);

/*
* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 473de09..b0e5210 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3687,13 +3687,11 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
entry = info->ioapic.entry;
info->ioapic.entry = NULL;
memset(entry, 0, sizeof(*entry));
- entry->vector = index;
- entry->mask = 0;
- entry->trigger = info->ioapic.trigger;
- entry->polarity = info->ioapic.polarity;
+ entry->vector = index;
+ entry->trigger = info->ioapic.is_level;
+ entry->polarity = info->ioapic.active_low;
/* Mask level triggered irqs. */
- if (info->ioapic.trigger)
- entry->mask = 1;
+ entry->mask = info->ioapic.is_level;
break;

case X86_IRQ_ALLOC_TYPE_HPET:
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 30269b7..54ca693 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1306,11 +1306,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
* irq handler will do the explicit EOI to the io-apic.
*/
entry->vector = info->ioapic.pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = info->ioapic.trigger;
- entry->polarity = info->ioapic.polarity;
- if (info->ioapic.trigger)
- entry->mask = 1; /* Mask level triggered irqs. */
+ entry->trigger = info->ioapic.is_level;
+ entry->polarity = info->ioapic.active_low;
+ /* Mask level triggered irqs. */
+ entry->mask = info->ioapic.is_level;
break;

case X86_IRQ_ALLOC_TYPE_HPET:

2020-11-09 23:17:01

by Tom Lendacky

[permalink] [raw]
Subject: Re: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 10/29/20 7:15 AM, tip-bot2 for Thomas Gleixner wrote:
> The following commit has been merged into the x86/apic branch of tip:
>
> Commit-ID: a27dca645d2c0f31abb7858aa0e10b2fa0f2f659
> Gitweb: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=a27dca645d2c0f31abb7858aa0e10b2fa0f2f659
> Author: Thomas Gleixner <[email protected]>
> AuthorDate: Sat, 24 Oct 2020 22:35:19 +01:00
> Committer: Thomas Gleixner <[email protected]>
> CommitterDate: Wed, 28 Oct 2020 20:26:26 +01:00
>
> x86/io_apic: Cleanup trigger/polarity helpers
>
> 'trigger' and 'polarity' are used throughout the I/O-APIC code for handling
> the trigger type (edge/level) and the active low/high configuration. While
> there are defines for initializing these variables and struct members, they
> are not used consequently and the meaning of 'trigger' and 'polarity' is
> opaque and confusing at best.
>
> Rename them to 'is_level' and 'active_low' and make them boolean in various
> structs so it's entirely clear what the meaning is.

Running the tip tree on my second generation EPYC system I'm seeing lots
of the following:

[ 105.325371] hpet: Lost 9601 RTC interrupts
[ 105.485766] hpet: Lost 9600 RTC interrupts
[ 105.639182] hpet: Lost 9601 RTC interrupts
[ 105.792155] hpet: Lost 9601 RTC interrupts
[ 105.947076] hpet: Lost 9601 RTC interrupts
[ 106.100876] hpet: Lost 9600 RTC interrupts
[ 106.253444] hpet: Lost 9601 RTC interrupts
[ 106.406722] hpet: Lost 9601 RTC interrupts

preventing the system from booting. I bisected it to this commit.

Additionally, I'm seeing warnings and error messages (which I haven't
bisected, yet) along these lines:

[ 12.790801] WARNING: CPU: 135 PID: 1 at arch/x86/kernel/apic/apic.c:2505 __irq_msi_compose_msg+0x79/0x80
[ 98.121716] irq 3: nobody cared (try booting with the "irqpoll" option)
[ 100.692087] irq 15: nobody cared (try booting with the "irqpoll" option)
[ 100.800217] irq 11: nobody cared (try booting with the "irqpoll" option)
[ 100.800407] irq 10: nobody cared (try booting with the "irqpoll" option)

Thanks,
Tom

2020-11-10 03:35:20

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Mon, 2020-11-09 at 17:15 -0600, Tom Lendacky wrote:
> On 10/29/20 7:15 AM, tip-bot2 for Thomas Gleixner wrote:
> > The following commit has been merged into the x86/apic branch of tip:
> >
> > Commit-ID: a27dca645d2c0f31abb7858aa0e10b2fa0f2f659
> > Gitweb: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=a27dca645d2c0f31abb7858aa0e10b2fa0f2f659
> > Author: Thomas Gleixner <[email protected]>
> > AuthorDate: Sat, 24 Oct 2020 22:35:19 +01:00
> > Committer: Thomas Gleixner <[email protected]>
> > CommitterDate: Wed, 28 Oct 2020 20:26:26 +01:00
> >
> > x86/io_apic: Cleanup trigger/polarity helpers
> >
> > 'trigger' and 'polarity' are used throughout the I/O-APIC code for handling
> > the trigger type (edge/level) and the active low/high configuration. While
> > there are defines for initializing these variables and struct members, they
> > are not used consequently and the meaning of 'trigger' and 'polarity' is
> > opaque and confusing at best.
> >
> > Rename them to 'is_level' and 'active_low' and make them boolean in various
> > structs so it's entirely clear what the meaning is.
>
> Running the tip tree on my second generation EPYC system I'm seeing lots
> of the following:
>
> [ 105.325371] hpet: Lost 9601 RTC interrupts
> [ 105.485766] hpet: Lost 9600 RTC interrupts
> [ 105.639182] hpet: Lost 9601 RTC interrupts
> [ 105.792155] hpet: Lost 9601 RTC interrupts
> [ 105.947076] hpet: Lost 9601 RTC interrupts
> [ 106.100876] hpet: Lost 9600 RTC interrupts
> [ 106.253444] hpet: Lost 9601 RTC interrupts
> [ 106.406722] hpet: Lost 9601 RTC interrupts
>
> preventing the system from booting. I bisected it to this commit.
>
> Additionally, I'm seeing warnings and error messages (which I haven't
> bisected, yet) along these lines:
>
> [ 12.790801] WARNING: CPU: 135 PID: 1 at arch/x86/kernel/apic/apic.c:2505 __irq_msi_compose_msg+0x79/0x80
> [ 98.121716] irq 3: nobody cared (try booting with the "irqpoll" option)
> [ 100.692087] irq 15: nobody cared (try booting with the "irqpoll" option)
> [ 100.800217] irq 11: nobody cared (try booting with the "irqpoll" option)
> [ 100.800407] irq 10: nobody cared (try booting with the "irqpoll" option)

Odd. The warning suggests we're asking __irq_msi_compose_msg() to
compose an MSI targeted at an APIC ID above 255, which shouldn't ever
happen.

Can we see a full boot log with apic=verbose please?


Attachments:
smime.p7s (5.05 kB)

2020-11-10 06:15:31

by Borislav Petkov

[permalink] [raw]
Subject: Re: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Mon, Nov 09, 2020 at 05:15:03PM -0600, Tom Lendacky wrote:
> [ 105.325371] hpet: Lost 9601 RTC interrupts
> [ 105.485766] hpet: Lost 9600 RTC interrupts
> [ 105.639182] hpet: Lost 9601 RTC interrupts
> [ 105.792155] hpet: Lost 9601 RTC interrupts
> [ 105.947076] hpet: Lost 9601 RTC interrupts
> [ 106.100876] hpet: Lost 9600 RTC interrupts
> [ 106.253444] hpet: Lost 9601 RTC interrupts
> [ 106.406722] hpet: Lost 9601 RTC interrupts
>
> preventing the system from booting. I bisected it to this commit.

I bisected it to the exact same thing too, on an AMD laptop, after seeing what
you're seeing.

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2020-11-10 06:36:02

by Qian Cai

[permalink] [raw]
Subject: Re: [PATCH v3 19/35] x86/io_apic: Cleanup trigger/polarity helpers

On Sat, 2020-10-24 at 22:35 +0100, David Woodhouse wrote:
> From: Thomas Gleixner <[email protected]>
>
> 'trigger' and 'polarity' are used throughout the I/O-APIC code for handling
> the trigger type (edge/level) and the active low/high configuration. While
> there are defines for initializing these variables and struct members, they
> are not used consequently and the meaning of 'trigger' and 'polarity' is
> opaque and confusing at best.
>
> Rename them to 'is_level' and 'active_low' and make them boolean in various
> structs so it's entirely clear what the meaning is.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Signed-off-by: David Woodhouse <[email protected]>
> ---
> arch/x86/include/asm/hw_irq.h | 6 +-
> arch/x86/kernel/apic/io_apic.c | 244 +++++++++++++---------------
> arch/x86/pci/intel_mid_pci.c | 8 +-
> drivers/iommu/amd/iommu.c | 10 +-
> drivers/iommu/intel/irq_remapping.c | 9 +-
> 5 files changed, 130 insertions(+), 147 deletions(-)

Reverting the rest of patchset up to this commit on next-20201109 fixed an
endless soft-lockups issue booting an AMD server below. I noticed that the
failed boots always has this IOMMU IO_PAGE_FAULT before those soft-lockups:

[ 3404.093354][ T1] AMD-Vi: Interrupt remapping enabled
[ 3404.098593][ T1] AMD-Vi: Virtual APIC enabled
[ 3404.107783][ T340] pci 0000:00:14.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0000 address=0xfffffffdf8020200 flags=0x0008]
[ 3404.120644][ T1] AMD-Vi: Lazy IO/TLB flushing enabled
[ 3404.126011][ T1] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
[ 3404.133173][ T1] software IO TLB: mapped [mem 0x0000000068dcf000-0x000000006cdcf000] (64MB)

.config (if ever matters):
https://cailca.coding.net/public/linux/mm/git/files/master/x86.config

good boot dmesg (with the commits reverted):
http://people.redhat.com/qcai/dmesg.txt

== system info ==
Dell Poweredge R6415
AMD EPYC 7401P 24-Core Processor
24576 MB memory, 239 GB disk space

[ OK ] Started Flush Journal to Persistent Storage.
[ OK ] Started udev Kernel Device Manager.
[ OK ] Started udev Coldplug all Devices.
[ OK ] Started Monitoring of LVM2 mirrors,…sing dmeventd or progress polling.
[ OK ] Reached target Local File Systems (Pre).
Mounting /boot...
[ OK ] Created slice system-lvm2\x2dpvscan.slice.
[ 3740.376500][ T1058] XFS (sda1): Mounting V5 Filesystem
[ 3740.438474][ T1058] XFS (sda1): Ending clean mount
[ 3765.159433][ C0] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [systemd:1]
[ 3765.166929][ C0] Modules linked in: acpi_cpufreq(+) ip_tables x_tables sd_mod ahci libahci tg3 bnxt_en megaraid_sas libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod
[ 3765.183576][ C0] irq event stamp: 26230104
[ 3765.187954][ C0] hardirqs last enabled at (26230103): [<ffffffff82800b9e>] asm_common_interrupt+0x1e/0x40
[ 3765.197873][ C0] hardirqs last disabled at (26230104): [<ffffffff8277e45a>] sysvec_apic_timer_interrupt+0xa/0xa0
[ 3765.208303][ C0] softirqs last enabled at (26202664): [<ffffffff82a0061b>] __do_softirq+0x61b/0x95d
[ 3765.217699][ C0] softirqs last disabled at (26202591): [<ffffffff82800ec2>] asm_call_irq_on_stack+0x12/0x20
[ 3765.227702][ C0] CPU: 0 PID: 1 Comm: systemd Not tainted 5.10.0-rc2-next-20201109+ #2
[ 3765.235793][ C0] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.9.3 06/25/2019
[ 3765.244065][ C0] RIP: 0010:lock_acquire+0x1f4/0x820
lock_acquire at kernel/locking/lockdep.c:5404
[ 3765.249211][ C0] Code: ff ff ff 48 83 c4 20 65 0f c1 05 a7 ba 9e 7e 83 f8 01 4c 8b 54 24 08 0f 85 60 04 00 00 41 52 9d 48 b8 00 00 00 00 00 fc ff df <48> 01 c3 c7 03 00 00 00 00 c7 43 08 00 00 00 00 48 8b0
[ 3765.268657][ C0] RSP: 0018:ffffc9000006f9f8 EFLAGS: 00000246
[ 3765.274587][ C0] RAX: dffffc0000000000 RBX: 1ffff9200000df42 RCX: 1ffff9200000df28
[ 3765.282420][ C0] RDX: 1ffff11020645769 RSI: 00000000ffffffff RDI: 0000000000000001
[ 3765.290256][ C0] RBP: 0000000000000001 R08: fffffbfff164cb10 R09: fffffbfff164cb10
[ 3765.298090][ C0] R10: 0000000000000246 R11: fffffbfff164cb0f R12: ffff88812be555b0
[ 3765.305922][ C0] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 3765.313750][ C0] FS: 00007f12bb8c59c0(0000) GS:ffff8881b7000000(0000) knlGS:0000000000000000
[ 3765.322537][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3765.328985][ C0] CR2: 00007f0c2d828fd0 CR3: 000000011868a000 CR4: 00000000003506f0
[ 3765.336820][ C0] Call Trace:
[ 3765.339979][ C0] ? rcu_read_unlock+0x40/0x40
[ 3765.344609][ C0] __d_move+0x2a2/0x16f0
__seqprop_spinlock_assert at include/linux/seqlock.h:277
(inlined by) __d_move at fs/dcache.c:2861
[ 3765.348711][ C0] ? d_move+0x47/0x70
[ 3765.352560][ C0] ? _raw_spin_unlock+0x1a/0x30
[ 3765.357275][ C0] d_move+0x47/0x70
write_seqcount_t_end at include/linux/seqlock.h:560
(inlined by) write_sequnlock at include/linux/seqlock.h:901
(inlined by) d_move at fs/dcache.c:2916
[ 3765.360951][ C0] ? vfs_rename+0x9ac/0x1270
[ 3765.365402][ C0] vfs_rename+0x9ac/0x1270
vfs_rename at fs/namei.c:4330
[ 3765.369687][ C0] ? vfs_link+0x830/0x830
[ 3765.373878][ C0] ? do_renameat2+0x58c/0x930
[ 3765.378419][ C0] do_renameat2+0x58c/0x930
do_renameat2 at fs/namei.c:4455
[ 3765.382790][ C0] ? __ia32_sys_link+0x80/0x80
[ 3765.387418][ C0] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 3765.392563][ C0] ? rcu_read_lock_sched_held+0x9c/0xd0
[ 3765.397965][ C0] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 3765.403112][ C0] ? lockdep_hardirqs_on_prepare+0x27c/0x3d0
[ 3765.408955][ C0] ? trace_hardirqs_on+0x1c/0x150
[ 3765.413841][ C0] ? asm_common_interrupt+0x1e/0x40
[ 3765.418905][ C0] ? strncpy_from_user+0x14c/0x330
[ 3765.423878][ C0] ? strncpy_from_user+0x6b/0x330
[ 3765.428764][ C0] ? getname_flags+0x88/0x3e0
[ 3765.433307][ C0] __x64_sys_rename+0x75/0x90
__x64_sys_rename at fs/namei.c:4504
[ 3765.437848][ C0] do_syscall_64+0x33/0x40
[ 3765.442129][ C0] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3765.447882][ C0] RIP: 0033:0x7f12b9b759eb
[ 3765.452166][ C0] Code: e8 5a 0a 08 00 85 c0 0f 95 c0 0f b6 c0 f7 d8 5b c3 66 0f 1f 44 00 00 b8 ff ff ff ff 5b c3 90 f3 0f 1e fa b8 52 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 69 e48
[ 3765.471611][ C0] RSP: 002b:00007fff76252538 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
[ 3765.479880][ C0] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f12b9b759eb
[ 3765.487715][ C0] RDX: 000055ea2997f400 RSI: 00007fff76252570 RDI: 000055ea2997f450
[ 3765.495548][ C0] RBP: 00007fff76252570 R08: 0000000000000000 R09: 0000000065732e32
[ 3765.503374][ C0] R10: 0000000000000003 R11: 0000000000000246 R12: 000055ea2996fcf8
[ 3765.511201][ C0] R13: 000055ea2996fcf8 R14: 000055ea277097f0 R15: 0000000000000000
[ 3765.939428][ C14] watchdog: BUG: soft lockup - CPU#14 stuck for 23s! [systemd-udevd:976]
[ 3765.947707][ C14] Modules linked in: acpi_cpufreq(+) ip_tables x_tables sd_mod ahci libahci tg3 bnxt_en megaraid_sas libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod
[ 3765.964357][ C14] irq event stamp: 535074
[ 3765.968564][ C14] hardirqs last enabled at (535073): [<ffffffff82800c42>] asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 3765.979266][ C14] hardirqs last disabled at (535074): [<ffffffff8277e45a>] sysvec_apic_timer_interrupt+0xa/0xa0
[ 3765.989532][ C14] softirqs last enabled at (535072): [<ffffffff82a0061b>] __do_softirq+0x61b/0x95d
[ 3765.998761][ C14] softirqs last disabled at (535067): [<ffffffff82800ec2>] asm_call_irq_on_stack+0x12/0x20
[ 3766.008590][ C14] CPU: 14 PID: 976 Comm: systemd-udevd Tainted: G L 5.10.0-rc2-next-20201109+ #2
[ 3766.018851][ C14] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.9.3 06/25/2019
[ 3766.027130][ C14] RIP: 0010:path_init+0x1dd/0x1380
__seqprop_spinlock_sequence at include/linux/seqlock.h:277
(inlined by) path_init at fs/namei.c:2219
[ 3766.032112][ C14] Code: 03 0f 8e 79 11 00 00 44 8b 3d 5f f0 6e 01 41 f6 c7 01 74 2e 48 b8 00 00 00 00 00 fc ff df 48 89 d1 48 c1 e9 03 48 01 c1 f3 90 <0f> b6 01 84 c0 74 08 3c 03 0f 8e 9a 0d 00 00 44 8b 3a1
[ 3766.051565][ C14] RSP: 0018:ffffc90006bdfa78 EFLAGS: 00000202
[ 3766.057493][ C14] RAX: 0000000000000000 RBX: ffffc90006bdfcc0 RCX: fffffbfff06426b0
[ 3766.065330][ C14] RDX: ffffffff83213580 RSI: 00000000d93ef353 RDI: ffffc90006bdfd00
[ 3766.073163][ C14] RBP: ffffc90006bdfb08 R08: fffffbfff164cb0d R09: fffffbfff164cb0d
[ 3766.080997][ C14] R10: 0000000000000246 R11: fffffbfff164cb0c R12: ffff8881159d0060
[ 3766.088831][ C14] R13: ffffc90006bdfcf8 R14: 0000000000000041 R15: 000000000000039d
[ 3766.096665][ C14] FS: 00007f0c2ec45980(0000) GS:ffff88866dec0000(0000) knlGS:0000000000000000
[ 3766.105454][ C14] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3766.111900][ C14] CR2: 00007f79350ceb84 CR3: 0000000160a84000 CR4: 00000000003506e0
[ 3766.119737][ C14] Call Trace:
[ 3766.122899][ C14] ? debug_mutex_init+0x31/0x60
[ 3766.127614][ C14] path_openat+0x154/0x1b00
path_openat at fs/namei.c:3364
[ 3766.131988][ C14] ? rcu_read_lock_held+0xb0/0xb0
[ 3766.136881][ C14] ? __lock_acquire+0x1dad/0x3890
[ 3766.141767][ C14] ? path_parentat.isra.62+0x190/0x190
[ 3766.147090][ C14] ? rcu_read_lock_sched_held+0x9c/0xd0
[ 3766.152496][ C14] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 3766.157646][ C14] do_filp_open+0x171/0x240
[ 3766.162017][ C14] ? may_open_dev+0xc0/0xc0
[ 3766.166388][ C14] ? lock_downgrade+0x700/0x700
[ 3766.171104][ C14] ? rcu_read_unlock+0x40/0x40
[ 3766.175735][ C14] ? rwlock_bug.part.1+0x90/0x90
[ 3766.180543][ C14] ? __check_object_size+0x454/0x670
[ 3766.185692][ C14] ? do_raw_spin_unlock+0x4f/0x250
[ 3766.190668][ C14] ? __alloc_fd+0x184/0x500
[ 3766.195031][ C14] do_sys_openat2+0x2db/0x5b0
[ 3766.199576][ C14] ? memset+0x1f/0x40
[ 3766.203422][ C14] ? file_open_root+0x200/0x200
[ 3766.208136][ C14] do_sys_open+0x85/0xd0
[ 3766.212245][ C14] ? filp_open+0x50/0x50
[ 3766.216353][ C14] ? syscall_trace_enter.isra.21+0x54/0x1f0
[ 3766.222107][ C14] do_syscall_64+0x33/0x40
[ 3766.226389][ C14] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3766.232145][ C14] RIP: 0033:0x7f0c2d828552
[ 3766.236426][ C14] Code: 25 00 00 41 00 3d 00 00 41 00 74 4c 48 8d 05 15 50 2d 00 8b 00 85 c0 75 6d 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d 00 f0 ff ff 0f 87 a2 00 00 00 48 8b 4c 24 28 645
[ 3766.255873][ C14] RSP: 002b:00007ffc65d36000 EFLAGS: 00000246 ORIG_RAX: 0000000000000101
[ 3766.264147][ C14] RAX: ffffffffffffffda RBX: 0000564d81e50770 RCX: 00007f0c2d828552
[ 3766.271982][ C14] RDX: 0000000000080000 RSI: 00007ffc65d36150 RDI: 00000000ffffff9c
[ 3766.279817][ C14] RBP: 0000000000000008 R08: 0000000000000008 R09: 0000000000000001
[ 3766.287651][ C14] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f0c2e733347
[ 3766.295486][ C14] R13: 00007f0c2e733347 R14: 0000000000000001 R15: 0000564d81e2d1b8
[ 3766.469430][ C28] watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [mount:1058]
[ 3766.477105][ C28] Modules linked in: acpi_cpufreq(+) ip_tables x_tables sd_mod ahci libahci tg3 bnxt_en megaraid_sas libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod
[ 3766.493753][ C28] irq event stamp: 88756
[ 3766.497868][ C28] hardirqs last enabled at (88755): [<ffffffff82800c42>] asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 3766.508481][ C28] hardirqs last disabled at (88756): [<ffffffff8277e45a>] sysvec_apic_timer_interrupt+0xa/0xa0
[ 3766.518665][ C28] softirqs last enabled at (88748): [<ffffffff82a0061b>] __do_softirq+0x61b/0x95d
[ 3766.527805][ C28] softirqs last disabled at (88743): [<ffffffff82800ec2>] asm_call_irq_on_stack+0x12/0x20
[ 3766.537550][ C28] CPU: 28 PID: 1058 Comm: mount Tainted: G L 5.10.0-rc2-next-20201109+ #2
[ 3766.547210][ C28] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.9.3 06/25/2019
[ 3766.555482][ C28] RIP: 0010:prepend_path.isra.6+0x349/0xdb0
__seqprop_spinlock_sequence at include/linux/seqlock.h:277
(inlined by) read_seqbegin at include/linux/seqlock.h:838
(inlined by) read_seqbegin_or_lock at include/linux/seqlock.h:1142
(inlined by) read_seqbegin_or_lock at include/linux/seqlock.h:1139
(inlined by) prepend_path at fs/d_path.c:99
[ 3766.561241][ C28] Code: 95 f0 fe ff ff 5a 41 54 9d 41 0f b6 55 00 84 d2 74 09 80 fa 03 0f 8e 75 09 00 00 45 8b 26 41 f6 c4 01 0f 84 5b 05 00 00 f3 90 <41> 0f b6 55 00 84 d2 74 e8 80 fa 03 7f e3 4c 89 f7 4ce
[ 3766.580687][ C28] RSP: 0018:ffffc9000670fa18 EFLAGS: 00000202
[ 3766.586618][ C28] RAX: ffff88812bd7d070 RBX: dffffc0000000000 RCX: 1ffff1102a164dc8
[ 3766.594449][ C28] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81bafabb
[ 3766.602287][ C28] RBP: ffffc9000670fb68 R08: 0000000000000001 R09: ffffffff81bb033d
[ 3766.610118][ C28] R10: ffff8881517d8340 R11: fffffbfff14b86e4 R12: 000000000000039d
[ 3766.617956][ C28] R13: fffffbfff06426b0 R14: ffffffff83213580 R15: ffffc9000670fbc0
[ 3766.625789][ C28] FS: 00007f96bb7a6080(0000) GS:ffff8881b71c0000(0000) knlGS:0000000000000000
[ 3766.634579][ C28] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3766.641026][ C28] CR2: 00007f96ba7343f0 CR3: 0000000146c76000 CR4: 00000000003506e0
[ 3766.648859][ C28] Call Trace:
[ 3766.652016][ C28] ? dentry_path_raw+0x10/0x10
[ 3766.656645][ C28] ? rcu_read_unlock+0x40/0x40
[ 3766.661276][ C28] ? memcpy+0x38/0x60
[ 3766.665128][ C28] d_path+0x3fd/0x660
[ 3766.668976][ C28] ? prepend_path.isra.6+0xdb0/0xdb0
[ 3766.674129][ C28] ? __alloc_pages_nodemask+0x650/0x760
[ 3766.679543][ C28] ? __alloc_pages_slowpath.constprop.112+0x2120/0x2120
[ 3766.686345][ C28] mnt_warn_timestamp_expiry+0x25a/0x270
mnt_warn_timestamp_expiry at fs/namespace.c:2555 (discriminator 1)
[ 3766.691839][ C28] ? get_mountpoint+0x370/0x370
[ 3766.696554][ C28] ? do_raw_spin_unlock+0x4f/0x250
[ 3766.701536][ C28] ? _raw_spin_unlock+0x1a/0x30
[ 3766.706253][ C28] ? vfs_create_mount+0x371/0x4a0
[ 3766.711150][ C28] path_mount+0xe6b/0x1720
do_new_mount_fc at fs/namespace.c:2839
(inlined by) do_new_mount at fs/namespace.c:2898
(inlined by) path_mount at fs/namespace.c:3227
[ 3766.715438][ C28] ? finish_automount+0x750/0x750
[ 3766.720328][ C28] ? getname_flags+0x88/0x3e0
[ 3766.724877][ C28] do_mount+0xc6/0xe0
[ 3766.728726][ C28] ? path_mount+0x1720/0x1720
[ 3766.733275][ C28] ? _copy_from_user+0xab/0xf0
[ 3766.737906][ C28] ? memdup_user+0x4f/0x80
[ 3766.742193][ C28] __x64_sys_mount+0x15d/0x1b0
[ 3766.746828][ C28] do_syscall_64+0x33/0x40
[ 3766.751111][ C28] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3914.819439][ C36] watchdog: BUG: soft lockup - CPU#36 stuck for 22s! [systemd-journal:902]
[ 3914.827884][ C36] Modules linked in: acpi_cpufreq(+) ip_tables x_tables sd_mod ahci libahci tg3 bnxt_en megaraid_sas libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod
[ 3914.844543][ C36] irq event stamp: 196688
[ 3914.848748][ C36] hardirqs last enabled at (196687): [<ffffffff82800c42>] asm_sysvec_apic_timer_interrupt+0x12/0x20
[ 3914.859450][ C36] hardirqs last disabled at (196688): [<ffffffff8277e45a>] sysvec_apic_timer_interrupt+0xa/0xa0
[ 3914.869709][ C36] softirqs last enabled at (196684): [<ffffffff82a0061b>] __do_softirq+0x61b/0x95d
[ 3914.878930][ C36] softirqs last disabled at (196679): [<ffffffff82800ec2>] asm_call_irq_on_stack+0x12/0x20
[ 3914.888759][ C36] CPU: 36 PID: 902 Comm: systemd-journal Tainted: G L 5.10.0-rc2-next-20201109+ #2
[ 3914.899195][ C36] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.9.3 06/25/2019
[ 3914.907471][ C36] RIP: 0010:path_init+0x1f3/0x1380
[ 3914.912448][ C36] Code: 00 00 00 00 00 fc ff df 48 89 d1 48 c1 e9 03 48 01 c1 f3 90 0f b6 01 84 c0 74 08 3c 03 0f 8e 9a 0d 00 00 44 8b 3a 41 f6 c7 01 <75> e6 48 b8 00 00 00 00 00 fc ff df 48 8d 7b 44 48 89a
[ 3914.931901][ C36] RSP: 0018:ffffc900065dfc08 EFLAGS: 00000202
[ 3914.937837][ C36] RAX: 0000000000000000 RBX: ffffc900065dfd40 RCX: fffffbfff06426b0
[ 3914.945675][ C36] RDX: ffffffff83213580 RSI: 00000000d93ef353 RDI: ffffc900065dfd80
[ 3914.953514][ C36] RBP: ffffc900065dfc98 R08: fffffbfff164cb0d R09: fffffbfff164cb0d
[ 3914.961349][ C36] R10: 0000000000000246 R11: fffffbfff164cb0c R12: ffff8881159d5920
[ 3914.969186][ C36] R13: ffffc900065dfd78 R14: 0000000000000041 R15: 000000000000039d
[ 3914.977028][ C36] FS: 00007fc5f95f9980(0000) GS:ffff8881b7240000(0000) knlGS:0000000000000000
[ 3914.985814][ C36] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3914.992262][ C36] CR2: 000055ea2994ba42 CR3: 000000013702a000 CR4: 00000000003506e0
[ 3915.000097][ C36] Call Trace:
[ 3915.003257][ C36] path_lookupat.isra.61+0x20/0x440
[ 3915.008323][ C36] ? find_held_lock+0x33/0x1c0
[ 3915.012954][ C36] filename_lookup.part.73+0x15c/0x290
[ 3915.018283][ C36] ? __might_fault+0xb5/0x160
[ 3915.022831][ C36] ? do_symlinkat+0x190/0x190
[ 3915.027375][ C36] ? strncpy_from_user+0x6b/0x330
[ 3915.032268][ C36] ? getname_flags+0x88/0x3e0
[ 3915.036809][ C36] ? kmem_cache_alloc+0x244/0x270
[ 3915.041701][ C36] do_faccessat+0xc0/0x5a0
[ 3915.045987][ C36] ? stream_open+0x60/0x60
[ 3915.050269][ C36] do_syscall_64+0x33/0x40
[ 3915.054552][ C36] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3915.060306][ C36] RIP: 0033:0x7fc5f862697b
[ 3915.064590][ C36] Code: 77 05 c3 0f 1f 40 00 48 8b 15 09 f5 2c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 0f 1f 40 00 f3 0f 1e fa b8 15 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 d9 f48
[ 3915.084044][ C36] RSP: 002b:00007ffdefba1198 EFLAGS: 00000246 ORIG_RAX: 0000000000000015
[ 3915.092322][ C36] RAX: ffffffffffffffda RBX: 00007ffdefba3cf0 RCX: 00007fc5f862697b
[ 3915.100163][ C36] RDX: 00007fc5f91032e8 RSI: 0000000000000000 RDI: 0000556a2401c0d0
[ 3915.107996][ C36] RBP: 00007ffdefba11e0 R08: 0000000000000000 R09: 0000000000000000
[ 3915.115832][ C36] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[ 3915.123667][ C36] R13: 0000000000000000 R14: 0000000000000009 R15: 0000556a254de950
[ 3917.159439][ C0] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [systemd:1]
[ 3917.166926][ C0] Modules linked in: acpi_cpufreq(+) ip_tables x_tables sd_mod ahci libahci tg3 bnxt_en megaraid_sas libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod
[ 3917.183572][ C0] irq event stamp: 94238206
[ 3917.187944][ C0] hardirqs last enabled at (94238205): [<ffffffff82800b9e>] asm_common_interrupt+0x1e/0x40
[ 3917.197857][ C0] hardirqs last disabled at (94238206): [<ffffffff8277e45a>] sysvec_apic_timer_interrupt+0xa/0xa0
[ 3917.208289][ C0] softirqs last enabled at (94219940): [<ffffffff82a0061b>] __do_softirq+0x61b/0x95d
[ 3917.217685][ C0] softirqs last disabled at (94219881): [<ffffffff82800ec2>] asm_call_irq_on_stack+0x12/0x20
[ 3917.227688][ C0] CPU: 0 PID: 1 Comm: systemd Tainted: G L 5.10.0-rc2-next-20201109+ #2
[ 3917.237167][ C0] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.9.3 06/25/2019
[ 3917.245437][ C0] RIP: 0010:lock_acquire+0x21b/0x820
[ 3917.250585][ C0] Code: fc ff df 48 01 c3 c7 03 00 00 00 00 c7 43 08 00 00 00 00 48 8b 84 24 90 00 00 00 65 48 33 04 25 28 00 00 00 0f 85 1a 05 00 00 <48> 81 c4 98 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3c
[ 3917.270033][ C0] RSP: 0018:ffffc9000006f9f8 EFLAGS: 00000246
[ 3917.275959][ C0] RAX: 0000000000000000 RBX: fffff5200000df42 RCX: 1ffff9200000df28
[ 3917.283795][ C0] RDX: 1ffff11020645769 RSI: 00000000ffffffff RDI: 0000000000000001
[ 3917.291627][ C0] RBP: 0000000000000001 R08: fffffbfff164cb10 R09: fffffbfff164cb10
[ 3917.299463][ C0] R10: 0000000000000246 R11: fffffbfff164cb0f R12: ffff88812be555b0
[ 3917.307297][ C0] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 3917.315133][ C0] FS: 00007f12bb8c59c0(0000) GS:ffff8881b7000000(0000) knlGS:0000000000000000
[ 3917.323921][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3917.330368][ C0] CR2: 00007f0c2d828fd0 CR3: 000000011868a000 CR4: 00000000003506f0
[ 3917.338202][ C0] Call Trace:
[ 3917.341360][ C0] ? rcu_read_unlock+0x40/0x40
[ 3917.345987][ C0] __d_move+0x2a2/0x16f0
[ 3917.350096][ C0] ? d_move+0x47/0x70
[ 3917.353944][ C0] ? _raw_spin_unlock+0x1a/0x30
[ 3917.358656][ C0] d_move+0x47/0x70
[ 3917.362330][ C0] ? vfs_rename+0x9ac/0x1270
[ 3917.366786][ C0] vfs_rename+0x9ac/0x1270
[ 3917.371068][ C0] ? vfs_link+0x830/0x830
[ 3917.375261][ C0] ? do_renameat2+0x58c/0x930
[ 3917.379804][ C0] do_renameat2+0x58c/0x930
[ 3917.384173][ C0] ? __ia32_sys_link+0x80/0x80
[ 3917.388800][ C0] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 3917.393948][ C0] ? rcu_read_lock_sched_held+0x9c/0xd0
[ 3917.399354][ C0] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 3917.404503][ C0] ? lockdep_hardirqs_on_prepare+0x27c/0x3d0
[ 3917.410346][ C0] ? trace_hardirqs_on+0x1c/0x150
[ 3917.415234][ C0] ? asm_common_interrupt+0x1e/0x40
[ 3917.420294][ C0] ? strncpy_from_user+0x14c/0x330
[ 3917.425269][ C0] ? strncpy_from_user+0x6b/0x330
[ 3917.430158][ C0] ? getname_flags+0x88/0x3e0
[ 3917.434699][ C0] __x64_sys_rename+0x75/0x90
[ 3917.439239][ C0] do_syscall_64+0x33/0x40
[ 3917.443520][ C0] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3917.449274][ C0] RIP: 0033:0x7f12b9b759eb
[ 3917.453557][ C0] Code: e8 5a 0a 08 00 85 c0 0f 95 c0 0f b6 c0 f7 d8 5b c3 66 0f 1f 44 00 00 b8 ff ff ff ff 5b c3 90 f3 0f 1e fa b8 52 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 69 e48
[ 3917.473003][ C0] RSP: 002b:00007fff76252538 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
[ 3917.481272][ C0] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f12b9b759eb
[ 3917.489107][ C0] RDX: 000055ea2997f400 RSI: 00007fff76252570 RDI: 000055ea2997f450
[ 3917.496942][ C0] RBP: 00007fff76252570 R08: 0000000000000000 R09: 0000000065732e32
[ 3917.504776][ C0] R10: 0000000000000003 R11: 0000000000000246 R12: 000055ea2996fcf8
[ 3917.512610][ C0] R13: 000055ea2996fcf8 R14: 000055ea277097f0 R15: 0000000000000000


[ 3634.443982][ C37] NMI backtrace for cpu 37
[ 3634.443984][ C37] CPU: 37 PID: 906 Comm: udevadm Tainted: G L 5.10.0-rc2-next-20201109+ #3
[ 3634.443986][ C37] Hardware name: Dell Inc. PowerEdge R6415/07YXFK, BIOS 1.9.3 06/25/2019
[ 3634.443987][ C37] RIP: 0010:io_serial_in+0x5c/0x90
[ 3634.443989][ C37] Code: 48 8d 7b 40 0f b6 8b f1 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 d3 e6 80 3c 02 00 75 1e 03 73 40 89 f2 ec <48> 83 c4 08 0f b6 c0 5b c3 89 74 24 04 e8 d2 90 81 ff4
[ 3634.443991][ C37] RSP: 0018:ffffc90004a38780 EFLAGS: 00000002
[ 3634.443993][ C37] RAX: dffffc0000000000 RBX: ffffffffa0b1c988 RCX: 0000000000000000
[ 3634.443995][ C37] RDX: 00000000000002fd RSI: 00000000000002fd RDI: ffffffffa0b1c9c8
[ 3634.443996][ C37] RBP: 0000000000000020 R08: 0000000000000004 R09: fffff520009470f3
[ 3634.443998][ C37] R10: 0000000000000003 R11: fffff520009470f3 R12: fffffbfff4163984
[ 3634.443999][ C37] R13: fffffbfff416393b R14: ffffffffa0b1c9d8 R15: ffffffffa0b1c988
[ 3634.444001][ C37] FS: 00007f6947084980(0000) GS:ffff88866e440000(0000) knlGS:0000000000000000
[ 3634.444002][ C37] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3634.444004][ C37] CR2: 00007fd220b94816 CR3: 000000015371e000 CR4: 00000000003506e0
[ 3634.444005][ C37] Call Trace:
[ 3634.444006][ C37] <IRQ>
[ 3634.444007][ C37] wait_for_xmitr+0x7c/0x1b0
[ 3634.444008][ C37] ? wait_for_xmitr+0x1b0/0x1b0
[ 3634.444010][ C37] serial8250_console_putchar+0x11/0x50
[ 3634.444011][ C37] uart_console_write+0x39/0xc0
[ 3634.444012][ C37] serial8250_console_write+0x74c/0x8a0
[ 3634.444013][ C37] ? serial8250_start_tx+0x810/0x810
[ 3634.444015][ C37] ? rcu_read_unlock+0x40/0x40
[ 3634.444016][ C37] ? do_raw_spin_lock+0x121/0x290
[ 3634.444017][ C37] ? rwlock_bug.part.1+0x90/0x90
[ 3634.444018][ C37] ? prb_read_valid+0x61/0x90
[ 3634.444020][ C37] ? prb_final_commit+0x10/0x10
[ 3634.444021][ C37] console_unlock+0x721/0xa20
[ 3634.444022][ C37] ? do_syslog.part.22+0x5d0/0x5d0
[ 3634.444023][ C37] ? lock_acquire+0x1c8/0x820
[ 3634.444024][ C37] ? lock_downgrade+0x700/0x700
[ 3634.444026][ C37] vprintk_emit+0x201/0x2c0
[ 3634.444027][ C37] printk+0x9a/0xc0
[ 3634.444028][ C37] ? record_print_text.cold.38+0x11/0x11
[ 3634.444029][ C37] ? __module_text_address+0xe/0x140
[ 3634.444031][ C37] ? nmi_cpu_backtrace.cold.10+0x18/0xc5
[ 3634.444032][ C37] show_trace_log_lvl+0x228/0x2bb
[ 3634.444033][ C37] ? nmi_cpu_backtrace.cold.10+0x18/0xc5
[ 3634.444035][ C37] ? nmi_cpu_backtrace.cold.10+0x18/0xc5
[ 3634.444036][ C37] ? lapic_can_unplug_cpu+0x11/0x70
[ 3634.444037][ C37] dump_stack+0x99/0xcb
[ 3634.444038][ C37] nmi_cpu_backtrace.cold.10+0x18/0xc5
[ 3634.444040][ C37] ? lapic_can_unplug_cpu+0x70/0x70
[ 3634.444041][ C37] nmi_trigger_cpumask_backtrace+0x232/0x2a0
[ 3634.444042][ C37] rcu_dump_cpu_stacks+0x1ad/0x1f9
[ 3634.444043][ C37] rcu_sched_clock_irq.cold.90+0x4f7/0x968
[ 3634.444045][ C37] ? tick_sched_do_timer+0x140/0x140
[ 3634.444046][ C37] update_process_times+0x75/0xb0
[ 3634.444047][ C37] tick_sched_timer+0xc8/0xf0
[ 3634.444048][ C37] __hrtimer_run_queues+0x529/0xbb0
[ 3634.444050][ C37] ? enqueue_hrtimer+0x330/0x330
[ 3634.444051][ C37] ? ktime_get_update_offsets_now+0xd6/0x2b0
[ 3634.444052][ C37] hrtimer_interrupt+0x2c2/0x750
[ 3634.444054][ C37] __sysvec_apic_timer_interrupt+0x13c/0x520
[ 3634.444055][ C37] asm_call_irq_on_stack+0x12/0x20
[ 3634.444056][ C37] </IRQ>
[ 3634.444057][ C37] sysvec_apic_timer_interrupt+0x85/0xa0
[ 3634.444059][ C37] asm_sysvec_apic_timer_interrupt+0x12/0x20


2020-11-10 09:01:08

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH v3 19/35] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, 2020-11-10 at 01:31 -0500, Qian Cai wrote:
> On Sat, 2020-10-24 at 22:35 +0100, David Woodhouse wrote:
> > From: Thomas Gleixner <[email protected]>
> >
> > 'trigger' and 'polarity' are used throughout the I/O-APIC code for handling
> > the trigger type (edge/level) and the active low/high configuration. While
> > there are defines for initializing these variables and struct members, they
> > are not used consequently and the meaning of 'trigger' and 'polarity' is
> > opaque and confusing at best.
> >
> > Rename them to 'is_level' and 'active_low' and make them boolean in various
> > structs so it's entirely clear what the meaning is.
> >
> > Signed-off-by: Thomas Gleixner <[email protected]>
> > Signed-off-by: David Woodhouse <[email protected]>
> > ---
> > arch/x86/include/asm/hw_irq.h | 6 +-
> > arch/x86/kernel/apic/io_apic.c | 244 +++++++++++++---------------
> > arch/x86/pci/intel_mid_pci.c | 8 +-
> > drivers/iommu/amd/iommu.c | 10 +-
> > drivers/iommu/intel/irq_remapping.c | 9 +-
> > 5 files changed, 130 insertions(+), 147 deletions(-)
>
> Reverting the rest of patchset up to this commit on next-20201109 fixed an
> endless soft-lockups issue booting an AMD server below. I noticed that the
> failed boots always has this IOMMU IO_PAGE_FAULT before those soft-lockups:

Hm, attempting to reproduce this shows something else. Ever since
commit be62dbf554c5 ("iommu/amd: Convert AMD iommu driver to the dma-
iommu api") in 5.5 the following stops working for me:

$ qemu-system-x86_64 -serial mon:stdio -kernel bzImage -machine q35,accel=kvm,kernel-irqchip=split -m 2G -device amd-iommu,intremap=off -append "console=ttyS0 apic=verbose debug" -display none

It hasn't got a hard drive but I can watch the SATA interrupts fail as
it probes the CD-ROM:

[ 7.403327] ata3.00: qc timeout (cmd 0xa1)
[ 7.405980] ata3.00: failed to IDENTIFY (I/O error, err_mask=0x4)

Adding 'iommu=off' to the kernel command line makes it work again, in
that it correctly panics at the lack of a root file system, quickly.



Attachments:
smime.p7s (5.05 kB)

2020-11-10 14:38:51

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, Nov 10 2020 at 07:10, Borislav Petkov wrote:

> On Mon, Nov 09, 2020 at 05:15:03PM -0600, Tom Lendacky wrote:
>> [ 105.325371] hpet: Lost 9601 RTC interrupts
>> [ 105.485766] hpet: Lost 9600 RTC interrupts
>> [ 105.639182] hpet: Lost 9601 RTC interrupts
>> [ 105.792155] hpet: Lost 9601 RTC interrupts
>> [ 105.947076] hpet: Lost 9601 RTC interrupts
>> [ 106.100876] hpet: Lost 9600 RTC interrupts
>> [ 106.253444] hpet: Lost 9601 RTC interrupts
>> [ 106.406722] hpet: Lost 9601 RTC interrupts
>>
>> preventing the system from booting. I bisected it to this commit.
>
> I bisected it to the exact same thing too, on an AMD laptop, after seeing what
> you're seeing.

Bah. I'm a moron.

--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -809,9 +809,9 @@ static bool irq_is_level(int idx)
case MP_IRQTRIG_DEFAULT:
/*
* Conforms to spec, ie. bus-type dependent trigger
- * mode. PCI defaults to egde, ISA to level.
+ * mode. PCI defaults to level, ISA to edge.
*/
- level = test_bit(bus, mp_bus_not_pci);
+ level = !test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:

2020-11-10 14:57:21

by Tom Lendacky

[permalink] [raw]
Subject: Re: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 11/10/20 8:34 AM, Thomas Gleixner wrote:
> On Tue, Nov 10 2020 at 07:10, Borislav Petkov wrote:
>
>> On Mon, Nov 09, 2020 at 05:15:03PM -0600, Tom Lendacky wrote:
>>> [ 105.325371] hpet: Lost 9601 RTC interrupts
>>> [ 105.485766] hpet: Lost 9600 RTC interrupts
>>> [ 105.639182] hpet: Lost 9601 RTC interrupts
>>> [ 105.792155] hpet: Lost 9601 RTC interrupts
>>> [ 105.947076] hpet: Lost 9601 RTC interrupts
>>> [ 106.100876] hpet: Lost 9600 RTC interrupts
>>> [ 106.253444] hpet: Lost 9601 RTC interrupts
>>> [ 106.406722] hpet: Lost 9601 RTC interrupts
>>>
>>> preventing the system from booting. I bisected it to this commit.
>>
>> I bisected it to the exact same thing too, on an AMD laptop, after seeing what
>> you're seeing.
>
> Bah. I'm a moron.
>
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -809,9 +809,9 @@ static bool irq_is_level(int idx)
> case MP_IRQTRIG_DEFAULT:
> /*
> * Conforms to spec, ie. bus-type dependent trigger
> - * mode. PCI defaults to egde, ISA to level.
> + * mode. PCI defaults to level, ISA to edge.
> */
> - level = test_bit(bus, mp_bus_not_pci);
> + level = !test_bit(bus, mp_bus_not_pci);
> /* Take EISA into account */
> return eisa_irq_is_level(idx, bus, level);
> case MP_IRQTRIG_EDGE:
>

I was about to send the dmesg output when I saw this. A quick test with
this change resolves the boot issue, thanks!

I'm still seeing the warning at arch/x86/kernel/apic/apic.c:2527, but I'll
start a separate thread on that.

Thanks,
Tom

2020-11-10 15:57:27

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, Nov 10 2020 at 08:55, Tom Lendacky wrote:
> On 11/10/20 8:34 AM, Thomas Gleixner wrote:
> I was about to send the dmesg output when I saw this. A quick test with
> this change resolves the boot issue, thanks!

/me feels stupid

> I'm still seeing the warning at arch/x86/kernel/apic/apic.c:2527, but I'll
> start a separate thread on that.

Can you please provide the backtrace?

Thanks,

tglx

2020-11-10 16:03:53

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, 2020-11-10 at 16:54 +0100, Thomas Gleixner wrote:
> On Tue, Nov 10 2020 at 08:55, Tom Lendacky wrote:
> > On 11/10/20 8:34 AM, Thomas Gleixner wrote:
> > I was about to send the dmesg output when I saw this. A quick test
> > with
> > this change resolves the boot issue, thanks!
>
> /me feels stupid
>
> > I'm still seeing the warning at arch/x86/kernel/apic/apic.c:2527,
> > but I'll
> > start a separate thread on that.
>
> Can you please provide the backtrace?

We'll want something like this, to stop AMD IOMMU from registering its
irqdomain even when it shouldn't. But that's the wrong way round; that
would give you remapped MSIs when it should be using the
x86_vector_domain for them. And you have the converse, it seems.

--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1601,9 +1601,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (ret)
return ret;

- ret = amd_iommu_create_irq_domain(iommu);
- if (ret)
- return ret;
+ if (amd_iommu_irq_remap) {
+ ret = amd_iommu_create_irq_domain(iommu);
+ if (ret)
+ return ret;
+ }

/*
* Make sure IOMMU is not considered to translate itself. The IVRS


Attachments:
smime.p7s (5.05 kB)

2020-11-10 16:21:01

by Tom Lendacky

[permalink] [raw]
Subject: Re: [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 11/10/20 9:54 AM, Thomas Gleixner wrote:
> On Tue, Nov 10 2020 at 08:55, Tom Lendacky wrote:
>> On 11/10/20 8:34 AM, Thomas Gleixner wrote:
>> I was about to send the dmesg output when I saw this. A quick test with
>> this change resolves the boot issue, thanks!
>
> /me feels stupid
>
>> I'm still seeing the warning at arch/x86/kernel/apic/apic.c:2527, but I'll
>> start a separate thread on that.
>
> Can you please provide the backtrace?

Yep. The warning started triggering with:
47bea873cf80 ("x86/msi: Only use high bits of MSI address for DMAR unit")

Here's the backtrace:

[ 15.611109] ------------[ cut here ]------------
[ 15.616274] WARNING: CPU: 184 PID: 1 at arch/x86/kernel/apic/apic.c:2505 __irq_msi_compose_msg+0x79/0x80
[ 15.626855] Modules linked in:
[ 15.630263] CPU: 184 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc1-sos-custom #1
[ 15.638516] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS REX1006G 01/25/2020
[ 15.647549] RIP: 0010:__irq_msi_compose_msg+0x79/0x80
[ 15.653188] Code: 0f f0 ff 09 d0 89 06 8b 47 04 c7 46 04 00 00 00 00 88 46 08 45 84 c0 74 08 8b 07 30 c0 89 46 04 c3 81 3f ff 00 00 00 77 01 c3 <0f> 0b c3 0f 1f 40 00 0f 1f 44 00 00 8b 05 81 f9 04 02 85 c0 75 16
[ 15.674140] RSP: 0018:ffffc900000c7c30 EFLAGS: 00010212
[ 15.679971] RAX: 0000000000000021 RBX: ffff888143789028 RCX: 0000000000000000
[ 15.687936] RDX: 0000000000000000 RSI: ffffc900000c7c40 RDI: ffff8881447dd1c0
[ 15.695899] RBP: ffff888143789028 R08: 0000000000000000 R09: 0000000000000005
[ 15.703861] R10: 0000000000000002 R11: 0000000000000004 R12: ffff88900e7b80c0
[ 15.711825] R13: 0000000000000000 R14: ffff88907646ba80 R15: 0000000000000000
[ 15.719789] FS: 0000000000000000(0000) GS:ffff88900f000000(0000) knlGS:0000000000000000
[ 15.728821] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 15.735234] CR2: 0000000000000000 CR3: 000000114d40a000 CR4: 0000000000350ee0
[ 15.743197] Call Trace:
[ 15.745929] irq_chip_compose_msi_msg+0x2e/0x40
[ 15.750984] msi_domain_activate+0x4b/0x90
[ 15.755556] __irq_domain_activate_irq+0x53/0x80
[ 15.760707] ? irq_set_msi_desc_off+0x5a/0x90
[ 15.765568] irq_domain_activate_irq+0x25/0x40
[ 15.770525] __msi_domain_alloc_irqs+0x16a/0x310
[ 15.775680] __pci_enable_msi_range+0x182/0x2b0
[ 15.780738] ? e820__memblock_setup+0x7d/0x7d
[ 15.785597] pci_enable_msi+0x16/0x30
[ 15.789685] iommu_init_msi+0x30/0x190
[ 15.793860] state_next+0x39d/0x665
[ 15.797754] ? e820__memblock_setup+0x7d/0x7d
[ 15.802615] iommu_go_to_state+0x24/0x28
[ 15.806993] amd_iommu_init+0x11/0x46
[ 15.811076] pci_iommu_init+0x16/0x3f
[ 15.815165] do_one_initcall+0x44/0x1d0
[ 15.819447] kernel_init_freeable+0x1e7/0x249
[ 15.824311] ? rest_init+0xb4/0xb4
[ 15.828097] kernel_init+0xa/0x10c
[ 15.831889] ret_from_fork+0x22/0x30
[ 15.835883] CPU: 184 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc1-sos-custom #1
[ 15.836876] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS REX1006G 01/25/2020
[ 15.836876] Call Trace:
[ 15.836876] dump_stack+0x6d/0x88
[ 15.836876] __warn.cold+0x24/0x3d
[ 15.836876] ? __irq_msi_compose_msg+0x79/0x80
[ 15.836876] report_bug+0xd1/0x100
[ 15.836876] handle_bug+0x35/0x80
[ 15.836876] exc_invalid_op+0x14/0x70
[ 15.836876] asm_exc_invalid_op+0x12/0x20

[ 15.836876] RIP: 0010:__irq_msi_compose_msg+0x79/0x80
[ 15.836876] Code: 0f f0 ff 09 d0 89 06 8b 47 04 c7 46 04 00 00 00 00 88 46 08 45 84 c0 74 08 8b 07 30 c0 89 46 04 c3 81 3f ff 00 00 00 77 01 c3 <0f> 0b c3 0f 1f 40 00 0f 1f 44 00 00 8b 05 81 f9 04 02 85 c0 75 16
[ 15.836876] RSP: 0018:ffffc900000c7c30 EFLAGS: 00010212
[ 15.836876] RAX: 0000000000000021 RBX: ffff888143789028 RCX: 0000000000000000
[ 15.836876] RDX: 0000000000000000 RSI: ffffc900000c7c40 RDI: ffff8881447dd1c0
[ 15.836876] RBP: ffff888143789028 R08: 0000000000000000 R09: 0000000000000005
[ 15.836876] R10: 0000000000000002 R11: 0000000000000004 R12: ffff88900e7b80c0
[ 15.836876] R13: 0000000000000000 R14: ffff88907646ba80 R15: 0000000000000000
[ 15.836876] irq_chip_compose_msi_msg+0x2e/0x40
[ 15.836876] msi_domain_activate+0x4b/0x90
[ 15.836876] __irq_domain_activate_irq+0x53/0x80
[ 15.836876] ? irq_set_msi_desc_off+0x5a/0x90
[ 15.836876] irq_domain_activate_irq+0x25/0x40
[ 15.836876] __msi_domain_alloc_irqs+0x16a/0x310
[ 15.836876] __pci_enable_msi_range+0x182/0x2b0
[ 15.836876] ? e820__memblock_setup+0x7d/0x7d
[ 15.836876] pci_enable_msi+0x16/0x30
[ 15.836876] iommu_init_msi+0x30/0x190
[ 15.836876] state_next+0x39d/0x665
[ 15.836876] ? e820__memblock_setup+0x7d/0x7d
[ 15.836876] iommu_go_to_state+0x24/0x28
[ 15.836876] amd_iommu_init+0x11/0x46
[ 15.836876] pci_iommu_init+0x16/0x3f
[ 15.836876] do_one_initcall+0x44/0x1d0
[ 15.836876] kernel_init_freeable+0x1e7/0x249
[ 15.836876] ? rest_init+0xb4/0xb4
[ 15.836876] kernel_init+0xa/0x10c
[ 15.836876] ret_from_fork+0x22/0x30
[ 16.046524] ---[ end trace 2fb272e5ca8df1c8 ]---

Thanks,
Tom

>
> Thanks,
>
> tglx
>

2020-11-10 16:28:24

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH v3 19/35] x86/io_apic: Cleanup trigger/polarity helpers

On 10/11/20 09:59, David Woodhouse wrote:
> Hm, attempting to reproduce this shows something else. Ever since
> commit be62dbf554c5 ("iommu/amd: Convert AMD iommu driver to the dma-
> iommu api") in 5.5 the following stops working for me:
>
> $ qemu-system-x86_64 -serial mon:stdio -kernel bzImage -machine q35,accel=kvm,kernel-irqchip=split -m 2G -device amd-iommu,intremap=off -append "console=ttyS0 apic=verbose debug" -display none
>
> It hasn't got a hard drive but I can watch the SATA interrupts fail as
> it probes the CD-ROM:
>
> [ 7.403327] ata3.00: qc timeout (cmd 0xa1)
> [ 7.405980] ata3.00: failed to IDENTIFY (I/O error, err_mask=0x4)
>
> Adding 'iommu=off' to the kernel command line makes it work again, in
> that it correctly panics at the lack of a root file system, quickly.

That might well be a QEMU bug though, AMD emulation is kinda experimental.

Paolo

2020-11-10 16:38:31

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, 2020-11-10 at 10:17 -0600, Tom Lendacky wrote:
> Yep. The warning started triggering with:
> 47bea873cf80 ("x86/msi: Only use high bits of MSI address for DMAR unit")
>
> Here's the backtrace:
>
> [ 15.611109] ------------[ cut here ]------------
> [ 15.616274] WARNING: CPU: 184 PID: 1 at arch/x86/kernel/apic/apic.c:2505 __irq_msi_compose_msg+0x79/0x80
> [ 15.626855] Modules linked in:
> [ 15.630263] CPU: 184 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc1-sos-custom #1
> [ 15.638516] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS REX1006G 01/25/2020
> [ 15.647549] RIP: 0010:__irq_msi_compose_msg+0x79/0x80
> [ 15.653188] Code: 0f f0 ff 09 d0 89 06 8b 47 04 c7 46 04 00 00 00 00 88 46 08 45 84 c0 74 08 8b 07 30 c0 89 46 04 c3 81 3f ff 00 00 00 77 01 c3 <0f> 0b c3 0f 1f 40 00 0f 1f 44 00 00 8b 05 81 f9 04 02 85 c0 75 16
> [ 15.674140] RSP: 0018:ffffc900000c7c30 EFLAGS: 00010212
> [ 15.679971] RAX: 0000000000000021 RBX: ffff888143789028 RCX: 0000000000000000
> [ 15.687936] RDX: 0000000000000000 RSI: ffffc900000c7c40 RDI: ffff8881447dd1c0
> [ 15.695899] RBP: ffff888143789028 R08: 0000000000000000 R09: 0000000000000005
> [ 15.703861] R10: 0000000000000002 R11: 0000000000000004 R12: ffff88900e7b80c0
> [ 15.711825] R13: 0000000000000000 R14: ffff88907646ba80 R15: 0000000000000000
> [ 15.719789] FS: 0000000000000000(0000) GS:ffff88900f000000(0000) knlGS:0000000000000000
> [ 15.728821] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 15.735234] CR2: 0000000000000000 CR3: 000000114d40a000 CR4: 0000000000350ee0
> [ 15.743197] Call Trace:
> [ 15.745929] irq_chip_compose_msi_msg+0x2e/0x40
> [ 15.750984] msi_domain_activate+0x4b/0x90
> [ 15.755556] __irq_domain_activate_irq+0x53/0x80
> [ 15.760707] ? irq_set_msi_desc_off+0x5a/0x90
> [ 15.765568] irq_domain_activate_irq+0x25/0x40
> [ 15.770525] __msi_domain_alloc_irqs+0x16a/0x310
> [ 15.775680] __pci_enable_msi_range+0x182/0x2b0
> [ 15.780738] ? e820__memblock_setup+0x7d/0x7d
> [ 15.785597] pci_enable_msi+0x16/0x30
> [ 15.789685] iommu_init_msi+0x30/0x190
> [ 15.793860] state_next+0x39d/0x665
> [ 15.797754] ? e820__memblock_setup+0x7d/0x7d
> [ 15.802615] iommu_go_to_state+0x24/0x28
> [ 15.806993] amd_iommu_init+0x11/0x46
> [ 15.811076] pci_iommu_init+0x16/0x3f

Oh joy.

It's asking the core code to generate a PCI MSI message for it and
actually program that to the PCI device (since the IOMMU itself is a
PCI device).

That isn't actually used for generating MSI, but is instead interpreted
to write the intcapxt registers which *do* generate the interrupts.

That wants fixing, preferably not to go via MSI format at all, or maybe
just to use the 'dmar' flag to __irq_msi_compose_msg(). Either way by
having an irqdomain of its own like the Intel IOMMU does.

If I could get post-5.5 kernels to boot at all with the AMD IOMMU
enabled, I'd have a go at throwing that together now...


Attachments:
smime.p7s (5.05 kB)

2020-11-10 17:08:46

by Tom Lendacky

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 11/10/20 10:33 AM, David Woodhouse wrote:
> On Tue, 2020-11-10 at 10:17 -0600, Tom Lendacky wrote:
>> Yep. The warning started triggering with:
>> 47bea873cf80 ("x86/msi: Only use high bits of MSI address for DMAR unit")
>>
>> Here's the backtrace:
>>
>> [ 15.611109] ------------[ cut here ]------------
>> [ 15.616274] WARNING: CPU: 184 PID: 1 at arch/x86/kernel/apic/apic.c:2505 __irq_msi_compose_msg+0x79/0x80
>> [ 15.626855] Modules linked in:
>> [ 15.630263] CPU: 184 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc1-sos-custom #1
>> [ 15.638516] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS REX1006G 01/25/2020
>> [ 15.647549] RIP: 0010:__irq_msi_compose_msg+0x79/0x80
>> [ 15.653188] Code: 0f f0 ff 09 d0 89 06 8b 47 04 c7 46 04 00 00 00 00 88 46 08 45 84 c0 74 08 8b 07 30 c0 89 46 04 c3 81 3f ff 00 00 00 77 01 c3 <0f> 0b c3 0f 1f 40 00 0f 1f 44 00 00 8b 05 81 f9 04 02 85 c0 75 16
>> [ 15.674140] RSP: 0018:ffffc900000c7c30 EFLAGS: 00010212
>> [ 15.679971] RAX: 0000000000000021 RBX: ffff888143789028 RCX: 0000000000000000
>> [ 15.687936] RDX: 0000000000000000 RSI: ffffc900000c7c40 RDI: ffff8881447dd1c0
>> [ 15.695899] RBP: ffff888143789028 R08: 0000000000000000 R09: 0000000000000005
>> [ 15.703861] R10: 0000000000000002 R11: 0000000000000004 R12: ffff88900e7b80c0
>> [ 15.711825] R13: 0000000000000000 R14: ffff88907646ba80 R15: 0000000000000000
>> [ 15.719789] FS: 0000000000000000(0000) GS:ffff88900f000000(0000) knlGS:0000000000000000
>> [ 15.728821] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 15.735234] CR2: 0000000000000000 CR3: 000000114d40a000 CR4: 0000000000350ee0
>> [ 15.743197] Call Trace:
>> [ 15.745929] irq_chip_compose_msi_msg+0x2e/0x40
>> [ 15.750984] msi_domain_activate+0x4b/0x90
>> [ 15.755556] __irq_domain_activate_irq+0x53/0x80
>> [ 15.760707] ? irq_set_msi_desc_off+0x5a/0x90
>> [ 15.765568] irq_domain_activate_irq+0x25/0x40
>> [ 15.770525] __msi_domain_alloc_irqs+0x16a/0x310
>> [ 15.775680] __pci_enable_msi_range+0x182/0x2b0
>> [ 15.780738] ? e820__memblock_setup+0x7d/0x7d
>> [ 15.785597] pci_enable_msi+0x16/0x30
>> [ 15.789685] iommu_init_msi+0x30/0x190
>> [ 15.793860] state_next+0x39d/0x665
>> [ 15.797754] ? e820__memblock_setup+0x7d/0x7d
>> [ 15.802615] iommu_go_to_state+0x24/0x28
>> [ 15.806993] amd_iommu_init+0x11/0x46
>> [ 15.811076] pci_iommu_init+0x16/0x3f
>
> Oh joy.
>
> It's asking the core code to generate a PCI MSI message for it and
> actually program that to the PCI device (since the IOMMU itself is a
> PCI device).
>
> That isn't actually used for generating MSI, but is instead interpreted
> to write the intcapxt registers which *do* generate the interrupts.
>
> That wants fixing, preferably not to go via MSI format at all, or maybe
> just to use the 'dmar' flag to __irq_msi_compose_msg(). Either way by
> having an irqdomain of its own like the Intel IOMMU does.
>
> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
> enabled, I'd have a go at throwing that together now...

That's odd. I haven't had an issue with or heard of any issues with
booting post-5.5 kernels with the AMD IOMMU enabled. What kind of problems
are you encountering?

Thanks,
Tom

>

2020-11-10 17:51:22

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/apic] x86/ioapic: Correct the PCI/ISA trigger type selection

The following commit has been merged into the x86/apic branch of tip:

Commit-ID: aec8da04e4d71afdd4ab3025ea34a6517435f363
Gitweb: https://git.kernel.org/tip/aec8da04e4d71afdd4ab3025ea34a6517435f363
Author: Thomas Gleixner <[email protected]>
AuthorDate: Tue, 10 Nov 2020 15:34:32 +01:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 10 Nov 2020 18:43:22 +01:00

x86/ioapic: Correct the PCI/ISA trigger type selection

PCI's default trigger type is level and ISA's is edge. The recent
refactoring made it the other way round, which went unnoticed as it seems
only to cause havoc on some AMD systems.

Make the comment and code do the right thing again.

Fixes: a27dca645d2c ("x86/io_apic: Cleanup trigger/polarity helpers")
Reported-by: Tom Lendacky <[email protected]>
Reported-by: Borislav Petkov <[email protected]>
Reported-by: Qian Cai <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Tom Lendacky <[email protected]>
Cc: David Woodhouse <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/kernel/apic/io_apic.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0602c95..089e755 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -809,9 +809,9 @@ static bool irq_is_level(int idx)
case MP_IRQTRIG_DEFAULT:
/*
* Conforms to spec, ie. bus-type dependent trigger
- * mode. PCI defaults to egde, ISA to level.
+ * mode. PCI defaults to level, ISA to edge.
*/
- level = test_bit(bus, mp_bus_not_pci);
+ level = !test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:

2020-11-10 17:53:54

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, Nov 10 2020 at 16:33, David Woodhouse wrote:
> On Tue, 2020-11-10 at 10:17 -0600, Tom Lendacky wrote:
>> Yep. The warning started triggering with:
>> 47bea873cf80 ("x86/msi: Only use high bits of MSI address for DMAR unit")
>>
>> Here's the backtrace:
>>
>> [ 15.745929] irq_chip_compose_msi_msg+0x2e/0x40
>> [ 15.750984] msi_domain_activate+0x4b/0x90
>> [ 15.755556] __irq_domain_activate_irq+0x53/0x80
>> [ 15.760707] ? irq_set_msi_desc_off+0x5a/0x90
>> [ 15.765568] irq_domain_activate_irq+0x25/0x40
>> [ 15.770525] __msi_domain_alloc_irqs+0x16a/0x310
>> [ 15.775680] __pci_enable_msi_range+0x182/0x2b0
>> [ 15.780738] ? e820__memblock_setup+0x7d/0x7d
>> [ 15.785597] pci_enable_msi+0x16/0x30
>> [ 15.789685] iommu_init_msi+0x30/0x190
>
> It's asking the core code to generate a PCI MSI message for it and
> actually program that to the PCI device (since the IOMMU itself is a
> PCI device).
>
> That isn't actually used for generating MSI, but is instead interpreted
> to write the intcapxt registers which *do* generate the interrupts.
>
> That wants fixing, preferably not to go via MSI format at all, or maybe
> just to use the 'dmar' flag to __irq_msi_compose_msg(). Either way by
> having an irqdomain of its own like the Intel IOMMU does.
>
> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
> enabled, I'd have a go at throwing that together now...

It can share the dmar domain code. Let me frob something.

2020-11-10 18:58:56

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, Nov 10 2020 at 18:50, Thomas Gleixner wrote:
> On Tue, Nov 10 2020 at 16:33, David Woodhouse wrote:
>> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
>> enabled, I'd have a go at throwing that together now...
>
> It can share the dmar domain code. Let me frob something.

Not much to share there and I can't access my AMD machine at the
moment. Something like the untested below should work.

Thanks,

tglx
---
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -523,7 +523,7 @@ struct msi_msg;
struct irq_cfg;

extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
- bool dmar);
+ bool iommu);

extern void ioapic_zap_locks(void);

--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -63,4 +63,6 @@ static inline void x86_create_pci_msi_do
#define x86_pci_msi_default_domain NULL
#endif

+struct irq_domain *x86_create_iommu_msi_domain(void);
+
#endif
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2498,7 +2498,7 @@ int hard_smp_processor_id(void)
}

void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
- bool dmar)
+ bool iommu)
{
memset(msg, 0, sizeof(*msg));

@@ -2519,7 +2519,7 @@ void __irq_msi_compose_msg(struct irq_cf
* some hypervisors allow the extended destination ID field in bits
* 5-11 to be used, giving support for 15 bits of APIC IDs in total.
*/
- if (dmar)
+ if (iommu)
msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8;
else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000)
msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8;
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -184,7 +184,8 @@ static struct msi_domain_info pci_msi_do
.handler_name = "edge",
};

-struct irq_domain * __init native_create_pci_msi_domain(void)
+static struct irq_domain *__init
+__create_pci_msi_domain(struct msi_domain_info *info, const char *name)
{
struct fwnode_handle *fn;
struct irq_domain *d;
@@ -192,21 +193,25 @@ struct irq_domain * __init native_create
if (disable_apic)
return NULL;

- fn = irq_domain_alloc_named_fwnode("PCI-MSI");
+ fn = irq_domain_alloc_named_fwnode(name);
if (!fn)
return NULL;

- d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
- x86_vector_domain);
+ d = pci_msi_create_irq_domain(fn, info, x86_vector_domain);
if (!d) {
irq_domain_free_fwnode(fn);
- pr_warn("Failed to initialize PCI-MSI irqdomain.\n");
+ pr_warn("Failed to initialize %s irqdomain.\n", name);
} else {
d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
return d;
}

+struct irq_domain * __init native_create_pci_msi_domain(void)
+{
+ return __create_pci_msi_domain(&pci_msi_domain_info, "PCI-MSI");
+}
+
void __init x86_create_pci_msi_domain(void)
{
x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
@@ -247,6 +252,43 @@ struct irq_domain *arch_create_remap_msi
}
#endif

+static void __maybe_unused iommu_msi_compose_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, true);
+}
+
+#ifdef CONFIG_AMD_IOMMU
+/*
+ * Similar to the Intel IOMMU abuse below. The resulting irq domain is
+ * associated to the IOMMU pci device, so that pci_enable_msi() works.
+ */
+static struct irq_chip iommu_msi_controller = {
+ .name = "IOMMU-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_affinity = msi_set_affinity,
+ .irq_compose_msi_msg = iommu_msi_compose_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_info iommu_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &iommu_msi_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+struct irq_domain __init *x86_create_iommu_msi_domain(void)
+{
+ return __create_pci_msi_domain(&iommu_msi_domain_info, "IOMMU-MSI");
+}
+#endif
+
#ifdef CONFIG_DMAR_TABLE
/*
* The Intel IOMMU (ab)uses the high bits of the MSI address to contain the
@@ -254,11 +296,6 @@ struct irq_domain *arch_create_remap_msi
* case for MSIs as it would be targeting real memory above 4GiB not the
* APIC.
*/
-static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
-{
- __irq_msi_compose_msg(irqd_cfg(data), msg, true);
-}
-
static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
dmar_msi_write(data->irq, msg);
@@ -271,7 +308,7 @@ static struct irq_chip dmar_msi_controll
.irq_ack = irq_chip_ack_parent,
.irq_set_affinity = msi_domain_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_compose_msi_msg = dmar_msi_compose_msg,
+ .irq_compose_msi_msg = iommu_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1759,6 +1759,7 @@ static const struct attribute_group *amd

static int __init iommu_init_pci(struct amd_iommu *iommu)
{
+ static struct irq_domain *msidom;
int cap_ptr = iommu->cap_ptr;
int ret;

@@ -1767,6 +1768,13 @@ static int __init iommu_init_pci(struct
if (!iommu->dev)
return -ENODEV;

+ if (!msidom) {
+ msidom = x86_create_iommu_msi_domain();
+ if (!msidom)
+ return -ENODEV;
+ }
+ dev_set_msi_domain(&iommu->dev->dev, msidom);
+
/* Prevent binding other PCI device drivers to IOMMU devices */
iommu->dev->match_driver = false;

2020-11-10 19:23:36

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers



On 10 November 2020 18:56:17 GMT, Thomas Gleixner <[email protected]> wrote:
>On Tue, Nov 10 2020 at 18:50, Thomas Gleixner wrote:
>> On Tue, Nov 10 2020 at 16:33, David Woodhouse wrote:
>>> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
>>> enabled, I'd have a go at throwing that together now...
>>
>> It can share the dmar domain code. Let me frob something.
>
>Not much to share there and I can't access my AMD machine at the
>moment. Something like the untested below should work.

Does it even need its own irqdomain? Can it not just allocate directly from the vector domain then program its own register directly based on the irq_cfg?


--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2020-11-10 21:05:39

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, Nov 10 2020 at 19:21, David Woodhouse wrote:

> On 10 November 2020 18:56:17 GMT, Thomas Gleixner <[email protected]> wrote:
>>On Tue, Nov 10 2020 at 18:50, Thomas Gleixner wrote:
>>> On Tue, Nov 10 2020 at 16:33, David Woodhouse wrote:
>>>> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
>>>> enabled, I'd have a go at throwing that together now...
>>>
>>> It can share the dmar domain code. Let me frob something.
>>
>>Not much to share there and I can't access my AMD machine at the
>>moment. Something like the untested below should work.
>
> Does it even need its own irqdomain? Can it not just allocate directly
> from the vector domain then program its own register directly based on
> the irq_cfg?

It uses pci_enable_msi() and I have no clue about that piece of hardware
and whether that is actually required or not. If it is, then it needs a
domain because that's what pci_enable_msi() uses.

Thanks,

tglx

2020-11-10 21:32:50

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers



On 10 November 2020 21:01:17 GMT, Thomas Gleixner <[email protected]> wrote:
>On Tue, Nov 10 2020 at 19:21, David Woodhouse wrote:
>
>> On 10 November 2020 18:56:17 GMT, Thomas Gleixner
><[email protected]> wrote:
>>>On Tue, Nov 10 2020 at 18:50, Thomas Gleixner wrote:
>>>> On Tue, Nov 10 2020 at 16:33, David Woodhouse wrote:
>>>>> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
>>>>> enabled, I'd have a go at throwing that together now...
>>>>
>>>> It can share the dmar domain code. Let me frob something.
>>>
>>>Not much to share there and I can't access my AMD machine at the
>>>moment. Something like the untested below should work.
>>
>> Does it even need its own irqdomain? Can it not just allocate
>directly
>> from the vector domain then program its own register directly based
>on
>> the irq_cfg?
>
>It uses pci_enable_msi() and I have no clue about that piece of
>hardware
>and whether that is actually required or not. If it is, then it needs a
>domain because that's what pci_enable_msi() uses.

I'd be kind of surprised if it is required, but testing on qemu is obviously not going to cut it. Tom?

--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2020-11-10 22:03:37

by Tom Lendacky

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 11/10/20 3:30 PM, David Woodhouse wrote:
>
>
> On 10 November 2020 21:01:17 GMT, Thomas Gleixner <[email protected]> wrote:
>> On Tue, Nov 10 2020 at 19:21, David Woodhouse wrote:
>>
>>> On 10 November 2020 18:56:17 GMT, Thomas Gleixner
>> <[email protected]> wrote:
>>>> On Tue, Nov 10 2020 at 18:50, Thomas Gleixner wrote:
>>>>> On Tue, Nov 10 2020 at 16:33, David Woodhouse wrote:
>>>>>> If I could get post-5.5 kernels to boot at all with the AMD IOMMU
>>>>>> enabled, I'd have a go at throwing that together now...
>>>>>
>>>>> It can share the dmar domain code. Let me frob something.
>>>>
>>>> Not much to share there and I can't access my AMD machine at the
>>>> moment. Something like the untested below should work.
>>>
>>> Does it even need its own irqdomain? Can it not just allocate
>> directly
>>> from the vector domain then program its own register directly based
>> on
>>> the irq_cfg?
>>
>> It uses pci_enable_msi() and I have no clue about that piece of
>> hardware
>> and whether that is actually required or not. If it is, then it needs a
>> domain because that's what pci_enable_msi() uses.
>
> I'd be kind of surprised if it is required, but testing on qemu is obviously not going to cut it. Tom?

Was just in the process of testing it... I still get a warning. Here's
the new backtrace:

[ 15.581115] WARNING: CPU: 6 PID: 1 at arch/x86/kernel/apic/apic.c:2527 __irq_msi_compose_msg+0x9f/0xb0
[ 15.581115] Modules linked in:
[ 15.581115] CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc3-sos-custom #1
[ 15.581115] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS REX1006G 01/25/2020
[ 15.581115] RIP: 0010:__irq_msi_compose_msg+0x9f/0xb0
[ 15.581115] Code: 01 00 74 1e 3d ff 7f 00 00 77 1f 0f b7 16 c1 e8 08 83 e0 7f c1 e0 05 66 81 e2 1f f0 09 d0 66 89 06 c3 3d ff 00 00 00 77 01 c3 <0f> 0b c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 0f 1f 44 00 00
[ 15.581115] RSP: 0018:ffffc900000c7c18 EFLAGS: 00010012
[ 15.581115] RAX: 0000000000000100 RBX: 0000000000000000 RCX: 0000000000000000
[ 15.581115] RDX: 0000000000000000 RSI: ffffc900000c7c20 RDI: ffff8881088341c0
[ 15.581115] RBP: ffff8881599e0428 R08: 0000000000000000 R09: ffffffffffffffff
[ 15.581115] R10: 0000000000000003 R11: 0000000000000004 R12: ffff8881088341c0
[ 15.581115] R13: 0000000000000000 R14: 0000000000000004 R15: ffff888108834180
[ 15.581115] FS: 0000000000000000(0000) GS:ffff88900d380000(0000) knlGS:0000000000000000
[ 15.581115] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 15.581115] CR2: 0000000000000000 CR3: 000000015240a000 CR4: 0000000000350ee0
[ 15.581115] Call Trace:
[ 15.581115] irq_msi_update_msg+0x4d/0x80
[ 15.581115] msi_set_affinity+0x160/0x190
[ 15.581115] irq_do_set_affinity+0x52/0x190
[ 15.581115] irq_setup_affinity+0xd7/0x170
[ 15.581115] irq_startup+0x5d/0xf0
[ 15.581115] __setup_irq+0x6b9/0x700
[ 15.581115] request_threaded_irq+0xf8/0x160
[ 15.581115] ? irq_remapping_alloc+0x4d0/0x4d0
[ 15.581115] ? e820__memblock_setup+0x7d/0x7d
[ 15.581115] iommu_init_msi+0x60/0x190
[ 15.581115] state_next+0x39d/0x665
[ 15.581115] ? e820__memblock_setup+0x7d/0x7d
[ 15.581115] iommu_go_to_state+0x24/0x28
[ 15.581115] amd_iommu_init+0x11/0x46
[ 15.581115] pci_iommu_init+0x16/0x3f
[ 15.581115] do_one_initcall+0x44/0x1d0
[ 15.581115] kernel_init_freeable+0x1e7/0x249
[ 15.581115] ? rest_init+0xb4/0xb4
[ 15.581115] kernel_init+0xa/0x10c
[ 15.581115] ret_from_fork+0x22/0x30
[ 15.581115] CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc3-sos-custom #1
[ 15.581115] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS REX1006G 01/25/2020
[ 15.581115] Call Trace:
[ 15.581115] dump_stack+0x6d/0x88
[ 15.581115] __warn.cold+0x24/0x3d
[ 15.581115] ? __irq_msi_compose_msg+0x9f/0xb0
[ 15.581115] report_bug+0xd1/0x100
[ 15.581115] handle_bug+0x35/0x80
[ 15.581115] exc_invalid_op+0x14/0x70
[ 15.581115] asm_exc_invalid_op+0x12/0x20
[ 15.581115] RIP: 0010:__irq_msi_compose_msg+0x9f/0xb0
[ 15.581115] Code: 01 00 74 1e 3d ff 7f 00 00 77 1f 0f b7 16 c1 e8 08 83 e0 7f c1 e0 05 66 81 e2 1f f0 09 d0 66 89 06 c3 3d ff 00 00 00 77 01 c3 <0f> 0b c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 0f 1f 44 00 00
[ 15.581115] RSP: 0018:ffffc900000c7c18 EFLAGS: 00010012
[ 15.581115] RAX: 0000000000000100 RBX: 0000000000000000 RCX: 0000000000000000
[ 15.581115] RDX: 0000000000000000 RSI: ffffc900000c7c20 RDI: ffff8881088341c0
[ 15.581115] RBP: ffff8881599e0428 R08: 0000000000000000 R09: ffffffffffffffff
[ 15.581115] R10: 0000000000000003 R11: 0000000000000004 R12: ffff8881088341c0
[ 15.581115] R13: 0000000000000000 R14: 0000000000000004 R15: ffff888108834180
[ 15.581115] irq_msi_update_msg+0x4d/0x80
[ 15.581115] msi_set_affinity+0x160/0x190
[ 15.581115] irq_do_set_affinity+0x52/0x190
[ 15.581115] irq_setup_affinity+0xd7/0x170
[ 15.581115] irq_startup+0x5d/0xf0
[ 15.581115] __setup_irq+0x6b9/0x700
[ 15.581115] request_threaded_irq+0xf8/0x160
[ 15.581115] ? irq_remapping_alloc+0x4d0/0x4d0
[ 15.581115] ? e820__memblock_setup+0x7d/0x7d
[ 15.581115] iommu_init_msi+0x60/0x190
[ 15.581115] state_next+0x39d/0x665
[ 15.581115] ? e820__memblock_setup+0x7d/0x7d
[ 15.581115] iommu_go_to_state+0x24/0x28
[ 15.581115] amd_iommu_init+0x11/0x46
[ 15.581115] pci_iommu_init+0x16/0x3f
[ 15.581115] do_one_initcall+0x44/0x1d0
[ 15.581115] kernel_init_freeable+0x1e7/0x249
[ 15.581115] ? rest_init+0xb4/0xb4
[ 15.581115] kernel_init+0xa/0x10c
[ 15.581115] ret_from_fork+0x22/0x30
[ 15.581115] ---[ end trace 05c9465e30ba6a20 ]---


Thanks,
Tom

>

2020-11-10 22:50:07

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, Nov 10 2020 at 16:00, Tom Lendacky wrote:
> On 11/10/20 3:30 PM, David Woodhouse wrote:
> [ 15.581115] WARNING: CPU: 6 PID: 1 at arch/x86/kernel/apic/apic.c:2527 __irq_msi_compose_msg+0x9f/0xb0
> [ 15.581115] Call Trace:
> [ 15.581115] irq_msi_update_msg+0x4d/0x80
> [ 15.581115] msi_set_affinity+0x160/0x190

Duh. Yes, that want's some love as well. Delta patch below.

Thanks,

tglx
---
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -24,10 +24,11 @@ struct irq_domain *x86_pci_msi_default_d

static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
{
+ struct irq_chip *chip = irq_data_get_irq_chip(irqd);
struct msi_msg msg[2] = { [1] = { }, };

- __irq_msi_compose_msg(cfg, msg, false);
- irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+ __irq_msi_compose_msg(cfg, msg, chip->flags & IRQCHIP_MSI_EXTID);
+ chip->irq_write_msi_msg(irqd, msg);
}

static int
@@ -271,7 +272,7 @@ static struct irq_chip iommu_msi_control
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_set_affinity = msi_set_affinity,
.irq_compose_msi_msg = iommu_msi_compose_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MSI_EXTID,
};

static struct msi_domain_info iommu_msi_domain_info = {
@@ -310,7 +311,7 @@ static struct irq_chip dmar_msi_controll
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = iommu_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MSI_EXTID,
};

static int dmar_msi_init(struct irq_domain *domain,
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -567,6 +567,8 @@ struct irq_chip {
* IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips
* IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs
* in the suspend path if they are in disabled state
+ * IRQCHIP_MSI_EXTID The MSI message created for this chip can
+ * have an otherwise forbidden extended ID
*/
enum {
IRQCHIP_SET_TYPE_MASKED = (1 << 0),
@@ -579,6 +581,7 @@ enum {
IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7),
IRQCHIP_SUPPORTS_NMI = (1 << 8),
IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9),
+ IRQCHIP_MSI_EXTID = (1 << 10),
};

#include <linux/irqdesc.h>

2020-11-10 23:08:51

by Tom Lendacky

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 11/10/20 4:48 PM, Thomas Gleixner wrote:
> On Tue, Nov 10 2020 at 16:00, Tom Lendacky wrote:
>> On 11/10/20 3:30 PM, David Woodhouse wrote:
>> [ 15.581115] WARNING: CPU: 6 PID: 1 at arch/x86/kernel/apic/apic.c:2527 __irq_msi_compose_msg+0x9f/0xb0
>> [ 15.581115] Call Trace:
>> [ 15.581115] irq_msi_update_msg+0x4d/0x80
>> [ 15.581115] msi_set_affinity+0x160/0x190
>
> Duh. Yes, that want's some love as well. Delta patch below.
>
> Thanks,
>
> tglx

That fixed it.

Thanks,
Tom

> ---
> --- a/arch/x86/kernel/apic/msi.c
> +++ b/arch/x86/kernel/apic/msi.c
> @@ -24,10 +24,11 @@ struct irq_domain *x86_pci_msi_default_d
>
> static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
> {
> + struct irq_chip *chip = irq_data_get_irq_chip(irqd);
> struct msi_msg msg[2] = { [1] = { }, };
>
> - __irq_msi_compose_msg(cfg, msg, false);
> - irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
> + __irq_msi_compose_msg(cfg, msg, chip->flags & IRQCHIP_MSI_EXTID);
> + chip->irq_write_msi_msg(irqd, msg);
> }
>
> static int
> @@ -271,7 +272,7 @@ static struct irq_chip iommu_msi_control
> .irq_retrigger = irq_chip_retrigger_hierarchy,
> .irq_set_affinity = msi_set_affinity,
> .irq_compose_msi_msg = iommu_msi_compose_msg,
> - .flags = IRQCHIP_SKIP_SET_WAKE,
> + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MSI_EXTID,
> };
>
> static struct msi_domain_info iommu_msi_domain_info = {
> @@ -310,7 +311,7 @@ static struct irq_chip dmar_msi_controll
> .irq_retrigger = irq_chip_retrigger_hierarchy,
> .irq_compose_msi_msg = iommu_msi_compose_msg,
> .irq_write_msi_msg = dmar_msi_write_msg,
> - .flags = IRQCHIP_SKIP_SET_WAKE,
> + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MSI_EXTID,
> };
>
> static int dmar_msi_init(struct irq_domain *domain,
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -567,6 +567,8 @@ struct irq_chip {
> * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips
> * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs
> * in the suspend path if they are in disabled state
> + * IRQCHIP_MSI_EXTID The MSI message created for this chip can
> + * have an otherwise forbidden extended ID
> */
> enum {
> IRQCHIP_SET_TYPE_MASKED = (1 << 0),
> @@ -579,6 +581,7 @@ enum {
> IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7),
> IRQCHIP_SUPPORTS_NMI = (1 << 8),
> IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9),
> + IRQCHIP_MSI_EXTID = (1 << 10),
> };
>
> #include <linux/irqdesc.h>
>

2020-11-11 08:19:05

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Tue, 2020-11-10 at 23:48 +0100, Thomas Gleixner wrote:
> + * IRQCHIP_MSI_EXTID The MSI message created for this chip can
> + * have an otherwise forbidden extended ID

If we're going to do that then we could ditch the separate
iommu_compose_msi_msg() function too, right?

But actually I'd be more inclined to fix this differently, in a way
that doesn't still leave AMD's iommu_init_intcapxt() having to set use
irq_set_affinity_notifier() to update its own registers. That's icky.

Given that this is *its* irqdomain in the first place, it should just
sit at ->set_affinity() for itself, and call its parent as usual
without having to use a notifier.

We should also leave it using the basic PCI MSI support in the case
where the IOMMU doesn't have XTSUP support. It doesn't need its own
irqdomain for that.


Attachments:
smime.p7s (5.05 kB)

2020-11-11 09:51:09

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, Nov 11 2020 at 08:16, David Woodhouse wrote:
> On Tue, 2020-11-10 at 23:48 +0100, Thomas Gleixner wrote:
>> + * IRQCHIP_MSI_EXTID The MSI message created for this chip can
>> + * have an otherwise forbidden extended ID
>
> If we're going to do that then we could ditch the separate
> iommu_compose_msi_msg() function too, right?
>
> But actually I'd be more inclined to fix this differently, in a way
> that doesn't still leave AMD's iommu_init_intcapxt() having to set use
> irq_set_affinity_notifier() to update its own registers. That's icky.
>
> Given that this is *its* irqdomain in the first place, it should just
> sit at ->set_affinity() for itself, and call its parent as usual
> without having to use a notifier.
>
> We should also leave it using the basic PCI MSI support in the case
> where the IOMMU doesn't have XTSUP support. It doesn't need its own
> irqdomain for that.

Looking at it now with brain awake, the XTSUP stuff is pretty much
the same as DMAR, which I didn't realize yesterday. The affinity
notifier muck is not needed when we have a write_msg() function which
twiddles the bits into those other locations.

Thanks,

tglx


2020-11-11 10:39:02

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, 2020-11-11 at 10:46 +0100, Thomas Gleixner wrote:
> Looking at it now with brain awake, the XTSUP stuff is pretty much
> the same as DMAR, which I didn't realize yesterday. The affinity
> notifier muck is not needed when we have a write_msg() function which
> twiddles the bits into those other locations.

I kind of hate the fact that it's swizzling those bits through invalid
MSI messages, so I did it as its own irqdomain using
irq_domain_create_hierarchy() directly instead of
msi_create_irq_domain().

Looks something like this, although I believe I'm missing a call to
irq_domain_set_info() somewhere which means that the irq_chip
'intcapxt_controller' isn't being used at all...


diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 90a8add186e0..b68fe10aa67d 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
+#include <linux/irq.h>
#include <linux/amd-iommu.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
@@ -1563,8 +1564,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
* for MSI address lo/hi/data, we need to check both
* EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support.
*/
- if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) &&
- (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT)))
+ if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT))
amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
break;
default:
@@ -1978,28 +1978,27 @@ union intcapxt {
destid_24_31 : 8;
} __attribute__ ((packed));

-/*
- * Setup the IntCapXT registers with interrupt routing information
- * based on the PCI MSI capability block registers, accessed via
- * MMIO MSI address low/hi and MSI data registers.
- */
-static void iommu_update_intcapxt(struct amd_iommu *iommu)
+/* We weren't actually doing anything before. Should we? */
+static void intcapxt_unmask_irq(struct irq_data *data)
{
- struct msi_msg msg;
- union intcapxt xt;
- u32 destid;
+}
+static void intcapxt_mask_irq(struct irq_data *data)
+{
+}

- msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET);
- msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET);
- msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET);

- destid = x86_msi_msg_get_destid(&msg, x2apic_enabled());
+static int intcapxt_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irqd, bool reserve)
+{
+ struct amd_iommu *iommu = domain->host_data;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+ union intcapxt xt;

xt.capxt = 0ULL;
- xt.dest_mode_logical = msg.arch_data.dest_mode_logical;
- xt.vector = msg.arch_data.vector;
- xt.destid_0_23 = destid & GENMASK(23, 0);
- xt.destid_24_31 = destid >> 24;
+ xt.dest_mode_logical = apic->dest_mode_logical;
+ xt.vector = cfg->vector;
+ xt.destid_0_23 = cfg->dest_apicid & GENMASK(23, 0);
+ xt.destid_24_31 = cfg->dest_apicid >> 24;

/**
* Current IOMMU implemtation uses the same IRQ for all
@@ -2008,64 +2007,117 @@ static void iommu_update_intcapxt(struct amd_iommu *iommu)
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET);
+
+ return 0;
}

-static void _irq_notifier_notify(struct irq_affinity_notify *notify,
- const cpumask_t *mask)
+static void intcapxt_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irqd)
{
- struct amd_iommu *iommu;
+ intcapxt_mask_irq(irqd);
+}

- for_each_iommu(iommu) {
- if (iommu->dev->irq == notify->irq) {
- iommu_update_intcapxt(iommu);
- break;
- }
- }
+
+static int intcapxt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ return irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
}

-static void _irq_notifier_release(struct kref *ref)
+static void intcapxt_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}

-static int iommu_init_intcapxt(struct amd_iommu *iommu)
+static int intcapxt_set_affinity(struct irq_data *irqd,
+ const struct cpumask *mask, bool force)
{
+ struct irq_data *parent = irqd->parent_data;
int ret;
- struct irq_affinity_notify *notify = &iommu->intcapxt_notify;

- /**
- * IntCapXT requires XTSup=1 and MsiCapMmioSup=1,
- * which can be inferred from amd_iommu_xt_mode.
- */
- if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE)
- return 0;
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;

- /**
- * Also, we need to setup notifier to update the IntCapXT registers
- * whenever the irq affinity is changed from user-space.
- */
- notify->irq = iommu->dev->irq;
- notify->notify = _irq_notifier_notify,
- notify->release = _irq_notifier_release,
- ret = irq_set_affinity_notifier(iommu->dev->irq, notify);
+ return intcapxt_irqdomain_activate(irqd->domain, irqd, false);
+}
+
+static struct irq_chip intcapxt_controller = {
+ .name = "IOMMU-MSI",
+ .irq_unmask = intcapxt_unmask_irq,
+ .irq_mask = intcapxt_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_affinity = intcapxt_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static const struct irq_domain_ops intcapxt_domain_ops = {
+ .alloc = intcapxt_irqdomain_alloc,
+ .free = intcapxt_irqdomain_free,
+ .activate = intcapxt_irqdomain_activate,
+ .deactivate = intcapxt_irqdomain_deactivate,
+};
+
+static struct irq_domain *iommu_get_irqdomain(struct amd_iommu *iommu)
+{
+ struct irq_domain *domain = NULL;
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_id_fwnode("AMD-Vi-MSI", iommu->index);
+ if (!fn)
+ return NULL;
+
+ domain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0,
+ fn, &intcapxt_domain_ops,
+ iommu);
+ if (!domain)
+ irq_domain_free_fwnode(fn);
+
+ return domain;
+}
+
+static int iommu_setup_intcapxt(struct amd_iommu *iommu)
+{
+ struct irq_domain *domain;
+ int irq, ret;
+
+ domain = iommu_get_irqdomain(iommu);
+ if (!domain)
+ return -ENXIO;
+
+ irq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, NULL);
+ if (irq < 0) {
+ irq_domain_remove(domain);
+ return irq;
+ }
+
+ ret = request_threaded_irq(irq,
+ amd_iommu_int_handler,
+ amd_iommu_int_thread,
+ 0, "AMD-Vi",
+ iommu);
if (ret) {
- pr_err("Failed to register irq affinity notifier (devid=%#x, irq %d)\n",
- iommu->devid, iommu->dev->irq);
+ irq_domain_free_irqs(irq, 1);
+ irq_domain_remove(domain);
return ret;
}

- iommu_update_intcapxt(iommu);
iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN);
- return ret;
+ return 0;
}

-static int iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_irq(struct amd_iommu *iommu)
{
int ret;

if (iommu->int_enabled)
goto enable_faults;

- if (iommu->dev->msi_cap)
+ if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+ ret = iommu_setup_intcapxt(iommu);
+ else if (iommu->dev->msi_cap)
ret = iommu_setup_msi(iommu);
else
ret = -ENODEV;
@@ -2074,10 +2126,6 @@ static int iommu_init_msi(struct amd_iommu *iommu)
return ret;

enable_faults:
- ret = iommu_init_intcapxt(iommu);
- if (ret)
- return ret;
-
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);

if (iommu->ppr_log != NULL)
@@ -2700,7 +2748,7 @@ static int amd_iommu_enable_interrupts(void)
int ret = 0;

for_each_iommu(iommu) {
- ret = iommu_init_msi(iommu);
+ ret = iommu_init_irq(iommu);
if (ret)
goto out;
}


Attachments:
smime.p7s (5.05 kB)

2020-11-11 12:34:19

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, 2020-11-11 at 10:36 +0000, David Woodhouse wrote:
> On Wed, 2020-11-11 at 10:46 +0100, Thomas Gleixner wrote:
> > Looking at it now with brain awake, the XTSUP stuff is pretty much
> > the same as DMAR, which I didn't realize yesterday. The affinity
> > notifier muck is not needed when we have a write_msg() function which
> > twiddles the bits into those other locations.
>
> I kind of hate the fact that it's swizzling those bits through invalid
> MSI messages, so I did it as its own irqdomain using
> irq_domain_create_hierarchy() directly instead of
> msi_create_irq_domain().

Please give this a spin:

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/amdvi


Attachments:
smime.p7s (5.05 kB)

2020-11-11 14:45:33

by David Woodhouse

[permalink] [raw]
Subject: [PATCH 1/3] iommu/amd: Don't register interrupt remapping irqdomain when IR is disabled

From: David Woodhouse <[email protected]>

This was potentially allowing I/OAPIC and MSI interrupts to be parented
in the IOMMU IR domain even when IR was disabled. Don't do that.

Signed-off-by: David Woodhouse <[email protected]>
---
drivers/iommu/amd/init.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 263670d36fed..90a8add186e0 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1601,9 +1601,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (ret)
return ret;

- ret = amd_iommu_create_irq_domain(iommu);
- if (ret)
- return ret;
+ if (amd_iommu_irq_remap) {
+ ret = amd_iommu_create_irq_domain(iommu);
+ if (ret)
+ return ret;
+ }

/*
* Make sure IOMMU is not considered to translate itself. The IVRS
--
2.26.2

2020-11-11 14:45:33

by David Woodhouse

[permalink] [raw]
Subject: [PATCH 2/3] iommu/amd: Fix union of bitfields in intcapxt support

From: David Woodhouse <[email protected]>

All the bitfields in here are overlaid on top of each other since
they're a union. Change the second u64 to be in a struct so it does
the intended thing.

Fixes: b5c3786ee3704 ("iommu/amd: Use msi_msg shadow structs")
Signed-off-by: David Woodhouse <[email protected]>
---
drivers/iommu/amd/init.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 90a8add186e0..a94b96f1e13a 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1969,13 +1969,15 @@ static int iommu_setup_msi(struct amd_iommu *iommu)

union intcapxt {
u64 capxt;
- u64 reserved_0 : 2,
- dest_mode_logical : 1,
- reserved_1 : 5,
- destid_0_23 : 24,
- vector : 8,
- reserved_2 : 16,
- destid_24_31 : 8;
+ struct {
+ u64 reserved_0 : 2,
+ dest_mode_logical : 1,
+ reserved_1 : 5,
+ destid_0_23 : 24,
+ vector : 8,
+ reserved_2 : 16,
+ destid_24_31 : 8;
+ };
} __attribute__ ((packed));

/*
--
2.26.2

2020-11-11 14:46:43

by David Woodhouse

[permalink] [raw]
Subject: [PATCH 3/3] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode

From: David Woodhouse <[email protected]>

The AMD IOMMU has two modes for generating its own interrupts.

The first is very much based on PCI MSI, and can be configured by Linux
precisely that way. But like legacy unmapped PCI MSI it's limited to
8 bits of APIC ID.

The second method does not use PCI MSI at all in hardawre, and instead
configures the INTCAPXT registers in the IOMMU directly with the APIC ID
and vector.

In the latter case, the IOMMU driver would still use pci_enable_msi(),
read back (through MMIO) the MSI message that Linux wrote to the PCI MSI
table, then swizzle those bits into the appropriate register.

Historically, this worked because__irq_compose_msi_msg() would silently
generate an invalid MSI message with the high bits of the APIC ID in the
high bits of the MSI address. That hack was intended only for the Intel
IOMMU, and I recently enforced that, introducing a warning in
__irq_msi_compose_msg() if it was invoked with an APIC ID above 255.

Fix the AMD IOMMU not to depend on that hack any more, by having its own
irqdomain and directly putting the bits from the irq_cfg into the right
place in its ->activate() method.

Fixes: 47bea873cf80 "x86/msi: Only use high bits of MSI address for DMAR unit")
Signed-off-by: David Woodhouse <[email protected]>
---
arch/x86/include/asm/hw_irq.h | 1 +
drivers/iommu/amd/init.c | 190 +++++++++++++++++++++++-----------
2 files changed, 132 insertions(+), 59 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 458f5a676402..d465ece58151 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -39,6 +39,7 @@ enum irq_alloc_type {
X86_IRQ_ALLOC_TYPE_PCI_MSI,
X86_IRQ_ALLOC_TYPE_PCI_MSIX,
X86_IRQ_ALLOC_TYPE_DMAR,
+ X86_IRQ_ALLOC_TYPE_AMDVI,
X86_IRQ_ALLOC_TYPE_UV,
};

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index a94b96f1e13a..a57400a73abb 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
+#include <linux/irq.h>
#include <linux/amd-iommu.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
@@ -1557,14 +1558,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
break;
}

- /*
- * Note: Since iommu_update_intcapxt() leverages
- * the IOMMU MMIO access to MSI capability block registers
- * for MSI address lo/hi/data, we need to check both
- * EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support.
- */
- if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) &&
- (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT)))
+ if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT))
amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
break;
default:
@@ -1981,27 +1975,32 @@ union intcapxt {
} __attribute__ ((packed));

/*
- * Setup the IntCapXT registers with interrupt routing information
- * based on the PCI MSI capability block registers, accessed via
- * MMIO MSI address low/hi and MSI data registers.
+ * There isn't really any need to mask/unmask at the irqchip level because
+ * the 64-bit INTCAPXT registers can be updated atomically without tearing
+ * when the affinity is being updated.
*/
-static void iommu_update_intcapxt(struct amd_iommu *iommu)
+static void intcapxt_unmask_irq(struct irq_data *data)
{
- struct msi_msg msg;
- union intcapxt xt;
- u32 destid;
+}

- msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET);
- msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET);
- msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET);
+static void intcapxt_mask_irq(struct irq_data *data)
+{
+}

- destid = x86_msi_msg_get_destid(&msg, x2apic_enabled());
+static struct irq_chip intcapxt_controller;
+
+static int intcapxt_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irqd, bool reserve)
+{
+ struct amd_iommu *iommu = irqd->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+ union intcapxt xt;

xt.capxt = 0ULL;
- xt.dest_mode_logical = msg.arch_data.dest_mode_logical;
- xt.vector = msg.arch_data.vector;
- xt.destid_0_23 = destid & GENMASK(23, 0);
- xt.destid_24_31 = destid >> 24;
+ xt.dest_mode_logical = apic->dest_mode_logical;
+ xt.vector = cfg->vector;
+ xt.destid_0_23 = cfg->dest_apicid & GENMASK(23, 0);
+ xt.destid_24_31 = cfg->dest_apicid >> 24;

/**
* Current IOMMU implemtation uses the same IRQ for all
@@ -2010,64 +2009,141 @@ static void iommu_update_intcapxt(struct amd_iommu *iommu)
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET);
+ return 0;
}

-static void _irq_notifier_notify(struct irq_affinity_notify *notify,
- const cpumask_t *mask)
+static void intcapxt_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irqd)
{
- struct amd_iommu *iommu;
+ intcapxt_mask_irq(irqd);
+}

- for_each_iommu(iommu) {
- if (iommu->dev->irq == notify->irq) {
- iommu_update_intcapxt(iommu);
- break;
- }
+
+static int intcapxt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ int i, ret;
+
+ if (!info || info->type != X86_IRQ_ALLOC_TYPE_AMDVI)
+ return -EINVAL;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = virq; i < virq + nr_irqs; i++) {
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, i);
+
+ irqd->chip = &intcapxt_controller;
+ irqd->chip_data = info->data;
}
+
+ return ret;
}

-static void _irq_notifier_release(struct kref *ref)
+static void intcapxt_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}

-static int iommu_init_intcapxt(struct amd_iommu *iommu)
+static int intcapxt_set_affinity(struct irq_data *irqd,
+ const struct cpumask *mask, bool force)
{
+ struct irq_data *parent = irqd->parent_data;
int ret;
- struct irq_affinity_notify *notify = &iommu->intcapxt_notify;

- /**
- * IntCapXT requires XTSup=1 and MsiCapMmioSup=1,
- * which can be inferred from amd_iommu_xt_mode.
- */
- if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE)
- return 0;
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;

- /**
- * Also, we need to setup notifier to update the IntCapXT registers
- * whenever the irq affinity is changed from user-space.
- */
- notify->irq = iommu->dev->irq;
- notify->notify = _irq_notifier_notify,
- notify->release = _irq_notifier_release,
- ret = irq_set_affinity_notifier(iommu->dev->irq, notify);
+ return intcapxt_irqdomain_activate(irqd->domain, irqd, false);
+}
+
+static struct irq_chip intcapxt_controller = {
+ .name = "IOMMU-MSI",
+ .irq_unmask = intcapxt_unmask_irq,
+ .irq_mask = intcapxt_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_affinity = intcapxt_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static const struct irq_domain_ops intcapxt_domain_ops = {
+ .alloc = intcapxt_irqdomain_alloc,
+ .free = intcapxt_irqdomain_free,
+ .activate = intcapxt_irqdomain_activate,
+ .deactivate = intcapxt_irqdomain_deactivate,
+};
+
+
+static struct irq_domain *iommu_irqdomain;
+
+static struct irq_domain *iommu_get_irqdomain(void)
+{
+ struct fwnode_handle *fn;
+
+ /* No need for locking here (yet) as the init is single-threaded */
+ if (iommu_irqdomain)
+ return iommu_irqdomain;
+
+ fn = irq_domain_alloc_named_fwnode("AMD-Vi-MSI");
+ if (!fn)
+ return NULL;
+
+ iommu_irqdomain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0,
+ fn, &intcapxt_domain_ops,
+ NULL);
+ if (!iommu_irqdomain)
+ irq_domain_free_fwnode(fn);
+
+ return iommu_irqdomain;
+}
+
+static int iommu_setup_intcapxt(struct amd_iommu *iommu)
+{
+ struct irq_domain *domain;
+ struct irq_alloc_info info;
+ int irq, ret;
+
+ domain = iommu_get_irqdomain();
+ if (!domain)
+ return -ENXIO;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_AMDVI;
+ info.data = iommu;
+
+ irq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+ if (irq < 0) {
+ irq_domain_remove(domain);
+ return irq;
+ }
+
+ ret = request_threaded_irq(irq, amd_iommu_int_handler,
+ amd_iommu_int_thread, 0, "AMD-Vi", iommu);
if (ret) {
- pr_err("Failed to register irq affinity notifier (devid=%#x, irq %d)\n",
- iommu->devid, iommu->dev->irq);
+ irq_domain_free_irqs(irq, 1);
+ irq_domain_remove(domain);
return ret;
}

- iommu_update_intcapxt(iommu);
iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN);
- return ret;
+ return 0;
}

-static int iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_irq(struct amd_iommu *iommu)
{
int ret;

if (iommu->int_enabled)
goto enable_faults;

- if (iommu->dev->msi_cap)
+ if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+ ret = iommu_setup_intcapxt(iommu);
+ else if (iommu->dev->msi_cap)
ret = iommu_setup_msi(iommu);
else
ret = -ENODEV;
@@ -2076,10 +2152,6 @@ static int iommu_init_msi(struct amd_iommu *iommu)
return ret;

enable_faults:
- ret = iommu_init_intcapxt(iommu);
- if (ret)
- return ret;
-
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);

if (iommu->ppr_log != NULL)
@@ -2702,7 +2774,7 @@ static int amd_iommu_enable_interrupts(void)
int ret = 0;

for_each_iommu(iommu) {
- ret = iommu_init_msi(iommu);
+ ret = iommu_init_irq(iommu);
if (ret)
goto out;
}
--
2.26.2

2020-11-11 20:32:52

by Tom Lendacky

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On 11/11/20 6:32 AM, David Woodhouse wrote:
> On Wed, 2020-11-11 at 10:36 +0000, David Woodhouse wrote:
>> On Wed, 2020-11-11 at 10:46 +0100, Thomas Gleixner wrote:
>>> Looking at it now with brain awake, the XTSUP stuff is pretty much
>>> the same as DMAR, which I didn't realize yesterday. The affinity
>>> notifier muck is not needed when we have a write_msg() function which
>>> twiddles the bits into those other locations.
>>
>> I kind of hate the fact that it's swizzling those bits through invalid
>> MSI messages, so I did it as its own irqdomain using
>> irq_domain_create_hierarchy() directly instead of
>> msi_create_irq_domain().
>
> Please give this a spin:
>
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/amdvi

I had trouble cloning your tree for some reason, so just took the top
three patches and applied them to the tip tree. This all appears to be
working. I'll let the IOMMU experts take a closer look (adding Suravee).

Thanks,
Tom

>

2020-11-11 21:21:48

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, 2020-11-11 at 14:30 -0600, Tom Lendacky wrote:
> On 11/11/20 6:32 AM, David Woodhouse wrote:
> > On Wed, 2020-11-11 at 10:36 +0000, David Woodhouse wrote:
> > > On Wed, 2020-11-11 at 10:46 +0100, Thomas Gleixner wrote:
> > > > Looking at it now with brain awake, the XTSUP stuff is pretty much
> > > > the same as DMAR, which I didn't realize yesterday. The affinity
> > > > notifier muck is not needed when we have a write_msg() function which
> > > > twiddles the bits into those other locations.
> > >
> > > I kind of hate the fact that it's swizzling those bits through invalid
> > > MSI messages, so I did it as its own irqdomain using
> > > irq_domain_create_hierarchy() directly instead of
> > > msi_create_irq_domain().
> >
> > Please give this a spin:
> >
> > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/amdvi
>
> I had trouble cloning your tree for some reason, so just took the top
> three patches and applied them to the tip tree. This all appears to be
> working. I'll let the IOMMU experts take a closer look (adding Suravee).

Thanks. Exercising it by actually triggering faults and observing the
IOMMU interrupt really working would be quite useful — on hardware with
both INTCAPXT and the older MSI delivery.


Attachments:
smime.p7s (5.05 kB)

2020-11-11 22:08:37

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/apic] iommu/amd: Fix union of bitfields in intcapxt support

The following commit has been merged into the x86/apic branch of tip:

Commit-ID: 2fb6acf3edfeb904505f9ba3fd01166866062591
Gitweb: https://git.kernel.org/tip/2fb6acf3edfeb904505f9ba3fd01166866062591
Author: David Woodhouse <[email protected]>
AuthorDate: Wed, 11 Nov 2020 14:43:21
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 11 Nov 2020 23:01:57 +01:00

iommu/amd: Fix union of bitfields in intcapxt support

All the bitfields in here are overlaid on top of each other since
they're a union. Change the second u64 to be in a struct so it does
the intended thing.

Fixes: b5c3786ee370 ("iommu/amd: Use msi_msg shadow structs")
Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
drivers/iommu/amd/init.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 263670d..c2769f2 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1967,13 +1967,15 @@ static int iommu_setup_msi(struct amd_iommu *iommu)

union intcapxt {
u64 capxt;
- u64 reserved_0 : 2,
- dest_mode_logical : 1,
- reserved_1 : 5,
- destid_0_23 : 24,
- vector : 8,
- reserved_2 : 16,
- destid_24_31 : 8;
+ struct {
+ u64 reserved_0 : 2,
+ dest_mode_logical : 1,
+ reserved_1 : 5,
+ destid_0_23 : 24,
+ vector : 8,
+ reserved_2 : 16,
+ destid_24_31 : 8;
+ };
} __attribute__ ((packed));

/*

2020-11-11 22:08:37

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/apic] iommu/amd: Don't register interrupt remapping irqdomain when IR is disabled

The following commit has been merged into the x86/apic branch of tip:

Commit-ID: 2df985f5e44c43f5d29d8cc3aaa8e8ac697e9de6
Gitweb: https://git.kernel.org/tip/2df985f5e44c43f5d29d8cc3aaa8e8ac697e9de6
Author: David Woodhouse <[email protected]>
AuthorDate: Wed, 11 Nov 2020 14:43:20
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 11 Nov 2020 23:01:58 +01:00

iommu/amd: Don't register interrupt remapping irqdomain when IR is disabled

Registering the remapping irq domain unconditionally is potentially
allowing I/O-APIC and MSI interrupts to be parented in the IOMMU IR domain
even when IR is disabled. Don't do that.

Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
drivers/iommu/amd/init.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index c2769f2..a94b96f 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1601,9 +1601,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (ret)
return ret;

- ret = amd_iommu_create_irq_domain(iommu);
- if (ret)
- return ret;
+ if (amd_iommu_irq_remap) {
+ ret = amd_iommu_create_irq_domain(iommu);
+ if (ret)
+ return ret;
+ }

/*
* Make sure IOMMU is not considered to translate itself. The IVRS

2020-11-13 15:17:00

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, 2020-11-11 at 14:30 -0600, Tom Lendacky wrote:
> I had trouble cloning your tree for some reason, so just took the top
> three patches and applied them to the tip tree. This all appears to be
> working. I'll let the IOMMU experts take a closer look (adding Suravee).

Thanks. I see Thomas has taken the first two into the tip.git x86/apic
branch already, so we're just looking for an ack on the third. Which is
this one...

From 49ee4fa51b8c06d14b7c4c74d15a7d76f865a8ea Mon Sep 17 00:00:00 2001
From: David Woodhouse <[email protected]>
Date: Wed, 11 Nov 2020 12:09:01 +0000
Subject: [PATCH] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode

The AMD IOMMU has two modes for generating its own interrupts.

The first is very much based on PCI MSI, and can be configured by Linux
precisely that way. But like legacy unmapped PCI MSI it's limited to
8 bits of APIC ID.

The second method does not use PCI MSI at all in hardawre, and instead
configures the INTCAPXT registers in the IOMMU directly with the APIC ID
and vector.

In the latter case, the IOMMU driver would still use pci_enable_msi(),
read back (through MMIO) the MSI message that Linux wrote to the PCI MSI
table, then swizzle those bits into the appropriate register.

Historically, this worked because__irq_compose_msi_msg() would silently
generate an invalid MSI message with the high bits of the APIC ID in the
high bits of the MSI address. That hack was intended only for the Intel
IOMMU, and I recently enforced that, introducing a warning in
__irq_msi_compose_msg() if it was invoked with an APIC ID above 255.

Fix the AMD IOMMU not to depend on that hack any more, by having its own
irqdomain and directly putting the bits from the irq_cfg into the right
place in its ->activate() method.

Fixes: 47bea873cf80 "x86/msi: Only use high bits of MSI address for DMAR unit")
Signed-off-by: David Woodhouse <[email protected]>
---
arch/x86/include/asm/hw_irq.h | 1 +
drivers/iommu/amd/init.c | 190 +++++++++++++++++++++++-----------
2 files changed, 132 insertions(+), 59 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 458f5a676402..d465ece58151 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -39,6 +39,7 @@ enum irq_alloc_type {
X86_IRQ_ALLOC_TYPE_PCI_MSI,
X86_IRQ_ALLOC_TYPE_PCI_MSIX,
X86_IRQ_ALLOC_TYPE_DMAR,
+ X86_IRQ_ALLOC_TYPE_AMDVI,
X86_IRQ_ALLOC_TYPE_UV,
};

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index a94b96f1e13a..a57400a73abb 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
+#include <linux/irq.h>
#include <linux/amd-iommu.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
@@ -1557,14 +1558,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
break;
}

- /*
- * Note: Since iommu_update_intcapxt() leverages
- * the IOMMU MMIO access to MSI capability block registers
- * for MSI address lo/hi/data, we need to check both
- * EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support.
- */
- if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) &&
- (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT)))
+ if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT))
amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
break;
default:
@@ -1981,27 +1975,32 @@ union intcapxt {
} __attribute__ ((packed));

/*
- * Setup the IntCapXT registers with interrupt routing information
- * based on the PCI MSI capability block registers, accessed via
- * MMIO MSI address low/hi and MSI data registers.
+ * There isn't really any need to mask/unmask at the irqchip level because
+ * the 64-bit INTCAPXT registers can be updated atomically without tearing
+ * when the affinity is being updated.
*/
-static void iommu_update_intcapxt(struct amd_iommu *iommu)
+static void intcapxt_unmask_irq(struct irq_data *data)
{
- struct msi_msg msg;
- union intcapxt xt;
- u32 destid;
+}

- msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET);
- msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET);
- msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET);
+static void intcapxt_mask_irq(struct irq_data *data)
+{
+}

- destid = x86_msi_msg_get_destid(&msg, x2apic_enabled());
+static struct irq_chip intcapxt_controller;
+
+static int intcapxt_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irqd, bool reserve)
+{
+ struct amd_iommu *iommu = irqd->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+ union intcapxt xt;

xt.capxt = 0ULL;
- xt.dest_mode_logical = msg.arch_data.dest_mode_logical;
- xt.vector = msg.arch_data.vector;
- xt.destid_0_23 = destid & GENMASK(23, 0);
- xt.destid_24_31 = destid >> 24;
+ xt.dest_mode_logical = apic->dest_mode_logical;
+ xt.vector = cfg->vector;
+ xt.destid_0_23 = cfg->dest_apicid & GENMASK(23, 0);
+ xt.destid_24_31 = cfg->dest_apicid >> 24;

/**
* Current IOMMU implemtation uses the same IRQ for all
@@ -2010,64 +2009,141 @@ static void iommu_update_intcapxt(struct amd_iommu *iommu)
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET);
+ return 0;
}

-static void _irq_notifier_notify(struct irq_affinity_notify *notify,
- const cpumask_t *mask)
+static void intcapxt_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irqd)
{
- struct amd_iommu *iommu;
+ intcapxt_mask_irq(irqd);
+}

- for_each_iommu(iommu) {
- if (iommu->dev->irq == notify->irq) {
- iommu_update_intcapxt(iommu);
- break;
- }
+
+static int intcapxt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ int i, ret;
+
+ if (!info || info->type != X86_IRQ_ALLOC_TYPE_AMDVI)
+ return -EINVAL;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = virq; i < virq + nr_irqs; i++) {
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, i);
+
+ irqd->chip = &intcapxt_controller;
+ irqd->chip_data = info->data;
}
+
+ return ret;
}

-static void _irq_notifier_release(struct kref *ref)
+static void intcapxt_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}

-static int iommu_init_intcapxt(struct amd_iommu *iommu)
+static int intcapxt_set_affinity(struct irq_data *irqd,
+ const struct cpumask *mask, bool force)
{
+ struct irq_data *parent = irqd->parent_data;
int ret;
- struct irq_affinity_notify *notify = &iommu->intcapxt_notify;

- /**
- * IntCapXT requires XTSup=1 and MsiCapMmioSup=1,
- * which can be inferred from amd_iommu_xt_mode.
- */
- if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE)
- return 0;
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;

- /**
- * Also, we need to setup notifier to update the IntCapXT registers
- * whenever the irq affinity is changed from user-space.
- */
- notify->irq = iommu->dev->irq;
- notify->notify = _irq_notifier_notify,
- notify->release = _irq_notifier_release,
- ret = irq_set_affinity_notifier(iommu->dev->irq, notify);
+ return intcapxt_irqdomain_activate(irqd->domain, irqd, false);
+}
+
+static struct irq_chip intcapxt_controller = {
+ .name = "IOMMU-MSI",
+ .irq_unmask = intcapxt_unmask_irq,
+ .irq_mask = intcapxt_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_affinity = intcapxt_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static const struct irq_domain_ops intcapxt_domain_ops = {
+ .alloc = intcapxt_irqdomain_alloc,
+ .free = intcapxt_irqdomain_free,
+ .activate = intcapxt_irqdomain_activate,
+ .deactivate = intcapxt_irqdomain_deactivate,
+};
+
+
+static struct irq_domain *iommu_irqdomain;
+
+static struct irq_domain *iommu_get_irqdomain(void)
+{
+ struct fwnode_handle *fn;
+
+ /* No need for locking here (yet) as the init is single-threaded */
+ if (iommu_irqdomain)
+ return iommu_irqdomain;
+
+ fn = irq_domain_alloc_named_fwnode("AMD-Vi-MSI");
+ if (!fn)
+ return NULL;
+
+ iommu_irqdomain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0,
+ fn, &intcapxt_domain_ops,
+ NULL);
+ if (!iommu_irqdomain)
+ irq_domain_free_fwnode(fn);
+
+ return iommu_irqdomain;
+}
+
+static int iommu_setup_intcapxt(struct amd_iommu *iommu)
+{
+ struct irq_domain *domain;
+ struct irq_alloc_info info;
+ int irq, ret;
+
+ domain = iommu_get_irqdomain();
+ if (!domain)
+ return -ENXIO;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_AMDVI;
+ info.data = iommu;
+
+ irq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+ if (irq < 0) {
+ irq_domain_remove(domain);
+ return irq;
+ }
+
+ ret = request_threaded_irq(irq, amd_iommu_int_handler,
+ amd_iommu_int_thread, 0, "AMD-Vi", iommu);
if (ret) {
- pr_err("Failed to register irq affinity notifier (devid=%#x, irq %d)\n",
- iommu->devid, iommu->dev->irq);
+ irq_domain_free_irqs(irq, 1);
+ irq_domain_remove(domain);
return ret;
}

- iommu_update_intcapxt(iommu);
iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN);
- return ret;
+ return 0;
}

-static int iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_irq(struct amd_iommu *iommu)
{
int ret;

if (iommu->int_enabled)
goto enable_faults;

- if (iommu->dev->msi_cap)
+ if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+ ret = iommu_setup_intcapxt(iommu);
+ else if (iommu->dev->msi_cap)
ret = iommu_setup_msi(iommu);
else
ret = -ENODEV;
@@ -2076,10 +2152,6 @@ static int iommu_init_msi(struct amd_iommu *iommu)
return ret;

enable_faults:
- ret = iommu_init_intcapxt(iommu);
- if (ret)
- return ret;
-
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);

if (iommu->ppr_log != NULL)
@@ -2702,7 +2774,7 @@ static int amd_iommu_enable_interrupts(void)
int ret = 0;

for_each_iommu(iommu) {
- ret = iommu_init_msi(iommu);
+ ret = iommu_init_irq(iommu);
if (ret)
goto out;
}
--
2.17.1



Attachments:
smime.p7s (5.05 kB)

2020-11-16 18:06:10

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Fri, 2020-11-13 at 15:14 +0000, David Woodhouse wrote:
> On Wed, 2020-11-11 at 14:30 -0600, Tom Lendacky wrote:
> > I had trouble cloning your tree for some reason, so just took the top
> > three patches and applied them to the tip tree. This all appears to be
> > working. I'll let the IOMMU experts take a closer look (adding Suravee).
>
> Thanks. I see Thomas has taken the first two into the tip.git x86/apic
> branch already, so we're just looking for an ack on the third. Which is
> this one...
>
> From 49ee4fa51b8c06d14b7c4c74d15a7d76f865a8ea Mon Sep 17 00:00:00 2001
> From: David Woodhouse <[email protected]>
> Date: Wed, 11 Nov 2020 12:09:01 +0000
> Subject: [PATCH] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode

Ping?


Attachments:
smime.p7s (5.05 kB)

2020-11-17 02:42:14

by Suthikulpanit, Suravee

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

David,

On 11/13/20 10:14 PM, David Woodhouse wrote:
> On Wed, 2020-11-11 at 14:30 -0600, Tom Lendacky wrote:
>> I had trouble cloning your tree for some reason, so just took the top
>> three patches and applied them to the tip tree. This all appears to be
>> working. I'll let the IOMMU experts take a closer look (adding Suravee).
>
> Thanks. I see Thomas has taken the first two into the tip.git x86/apic
> branch already, so we're just looking for an ack on the third. Which is
> this one...
>
> From 49ee4fa51b8c06d14b7c4c74d15a7d76f865a8ea Mon Sep 17 00:00:00 2001
> From: David Woodhouse <[email protected]>
> Date: Wed, 11 Nov 2020 12:09:01 +0000
> Subject: [PATCH] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode
>
> The AMD IOMMU has two modes for generating its own interrupts.
>
> The first is very much based on PCI MSI, and can be configured by Linux
> precisely that way. But like legacy unmapped PCI MSI it's limited to
> 8 bits of APIC ID.
>
> The second method does not use PCI MSI at all in hardawre, and instead
> configures the INTCAPXT registers in the IOMMU directly with the APIC ID
> and vector.
>
> In the latter case, the IOMMU driver would still use pci_enable_msi(),
> read back (through MMIO) the MSI message that Linux wrote to the PCI MSI
> table, then swizzle those bits into the appropriate register.
>
> Historically, this worked because__irq_compose_msi_msg() would silently
> generate an invalid MSI message with the high bits of the APIC ID in the
> high bits of the MSI address. That hack was intended only for the Intel
> IOMMU, and I recently enforced that, introducing a warning in
> __irq_msi_compose_msg() if it was invoked with an APIC ID above 255.
>
> Fix the AMD IOMMU not to depend on that hack any more, by having its own
> irqdomain and directly putting the bits from the irq_cfg into the right
> place in its ->activate() method.
>
> Fixes: 47bea873cf80 "x86/msi: Only use high bits of MSI address for DMAR unit")
> Signed-off-by: David Woodhouse <[email protected]>

I'm still working on testing this series using IO_PAGE_FAULT injection to trigger the IOMMU interrupts. I am still
debugging some issues, and I'll keep you updated on the findings.

Thanks,
Suravee

2020-11-18 10:33:44

by Suthikulpanit, Suravee

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

David

On 11/17/20 9:00 AM, Suravee Suthikulpanit wrote:
> David,
>
> On 11/13/20 10:14 PM, David Woodhouse wrote:
>> On Wed, 2020-11-11 at 14:30 -0600, Tom Lendacky wrote:
>>> I had trouble cloning your tree for some reason, so just took the top
>>> three patches and applied them to the tip tree. This all appears to be
>>> working. I'll let the IOMMU experts take a closer look (adding Suravee).
>>
>> Thanks. I see Thomas has taken the first two into the tip.git x86/apic
>> branch already, so we're just looking for an ack on the third. Which is
>> this one...
>>
>>  From 49ee4fa51b8c06d14b7c4c74d15a7d76f865a8ea Mon Sep 17 00:00:00 2001
>> From: David Woodhouse <[email protected]>
>> Date: Wed, 11 Nov 2020 12:09:01 +0000
>> Subject: [PATCH] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode
>>
>> The AMD IOMMU has two modes for generating its own interrupts.
>>
>> The first is very much based on PCI MSI, and can be configured by Linux
>> precisely that way. But like legacy unmapped PCI MSI it's limited to
>> 8 bits of APIC ID.
>>
>> The second method does not use PCI MSI at all in hardawre, and instead
>> configures the INTCAPXT registers in the IOMMU directly with the APIC ID
>> and vector.
>>
>> In the latter case, the IOMMU driver would still use pci_enable_msi(),
>> read back (through MMIO) the MSI message that Linux wrote to the PCI MSI
>> table, then swizzle those bits into the appropriate register.
>>
>> Historically, this worked because__irq_compose_msi_msg() would silently
>> generate an invalid MSI message with the high bits of the APIC ID in the
>> high bits of the MSI address. That hack was intended only for the Intel
>> IOMMU, and I recently enforced that, introducing a warning in
>> __irq_msi_compose_msg() if it was invoked with an APIC ID above 255.
>>
>> Fix the AMD IOMMU not to depend on that hack any more, by having its own
>> irqdomain and directly putting the bits from the irq_cfg into the right
>> place in its ->activate() method.
>>
>> Fixes: 47bea873cf80 "x86/msi: Only use high bits of MSI address for DMAR unit")
>> Signed-off-by: David Woodhouse <[email protected]>
>
> I'm still working on testing this series using IO_PAGE_FAULT injection to trigger the IOMMU interrupts. I am still
> debugging some issues, and I'll keep you updated on the findings.
>
> Thanks,
> Suravee

I might need your help debugging this issue. I'm seeing the following error:

[ 14.005937] irq 29, desc: 00000000d200500b, depth: 0, count: 0, unhandled: 0
[ 14.006234] ->handle_irq(): 00000000eab4b6eb, handle_bad_irq+0x0/0x230
[ 14.006234] ->irq_data.chip(): 000000001cce6d6b, intcapxt_controller+0x0/0x120
[ 14.006234] ->action(): 0000000083bfd734
[ 14.006234] ->action->handler(): 0000000094806345, amd_iommu_int_handler+0x0/0x10
[ 14.006234] unexpected IRQ trap at vector 1d

Do you have any idea what might have gone wrong here?

Thanks,
Suravee

2020-11-18 10:54:31

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, 2020-11-18 at 17:29 +0700, Suravee Suthikulpanit wrote:
> I might need your help debugging this issue. I'm seeing the following error:
>
> [ 14.005937] irq 29, desc: 00000000d200500b, depth: 0, count: 0, unhandled: 0
> [ 14.006234] ->handle_irq(): 00000000eab4b6eb, handle_bad_irq+0x0/0x230

Where's that line coming from?

> [ 14.006234] ->irq_data.chip(): 000000001cce6d6b, intcapxt_controller+0x0/0x120
> [ 14.006234] ->action(): 0000000083bfd734
> [ 14.006234] ->action->handler(): 0000000094806345, amd_iommu_int_handler+0x0/0x10
> [ 14.006234] unexpected IRQ trap at vector 1d
>
> Do you have any idea what might have gone wrong here?

Hm, vector 0x1d is vector 29. It's also IRQ 29. I wonder if that's a coincidence.

Can you make intcapxt_irqdomain_activate() print the 64-bit value that
it's writing to the intcapxt registers?


Attachments:
smime.p7s (5.05 kB)

2020-11-18 14:08:53

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

Suravee,

On Wed, Nov 18 2020 at 17:29, Suravee Suthikulpanit wrote:
> On 11/17/20 9:00 AM, Suravee Suthikulpanit wrote:
>
> I might need your help debugging this issue. I'm seeing the following error:
>
> [ 14.005937] irq 29, desc: 00000000d200500b, depth: 0, count: 0, unhandled: 0
> [ 14.006234] ->handle_irq(): 00000000eab4b6eb, handle_bad_irq+0x0/0x230
> [ 14.006234] ->irq_data.chip(): 000000001cce6d6b, intcapxt_controller+0x0/0x120
> [ 14.006234] ->action(): 0000000083bfd734
> [ 14.006234] ->action->handler(): 0000000094806345, amd_iommu_int_handler+0x0/0x10
> [ 14.006234] unexpected IRQ trap at vector 1d
>
> Do you have any idea what might have gone wrong here?

Yes. This lacks setting up the low level flow handler. Delta patch
below.

Thanks,

tglx
---
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -2033,6 +2033,7 @@ static int intcapxt_irqdomain_alloc(stru

irqd->chip = &intcapxt_controller;
irqd->chip_data = info->data;
+ __irq_set_handler(i, handle_edge_irq, 0, "edge");
}

return ret;

2020-11-18 16:56:13

by Suthikulpanit, Suravee

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

Tglx,

On 11/18/20 9:06 PM, Thomas Gleixner wrote:
> Suravee,
>
> On Wed, Nov 18 2020 at 17:29, Suravee Suthikulpanit wrote:
>> On 11/17/20 9:00 AM, Suravee Suthikulpanit wrote:
>>
>> I might need your help debugging this issue. I'm seeing the following error:
>>
>> [ 14.005937] irq 29, desc: 00000000d200500b, depth: 0, count: 0, unhandled: 0
>> [ 14.006234] ->handle_irq(): 00000000eab4b6eb, handle_bad_irq+0x0/0x230
>> [ 14.006234] ->irq_data.chip(): 000000001cce6d6b, intcapxt_controller+0x0/0x120
>> [ 14.006234] ->action(): 0000000083bfd734
>> [ 14.006234] ->action->handler(): 0000000094806345, amd_iommu_int_handler+0x0/0x10
>> [ 14.006234] unexpected IRQ trap at vector 1d
>>
>> Do you have any idea what might have gone wrong here?
>
> Yes. This lacks setting up the low level flow handler. Delta patch
> below.
>
> Thanks,
>
> tglx
> ---
> --- a/drivers/iommu/amd/init.c
> +++ b/drivers/iommu/amd/init.c
> @@ -2033,6 +2033,7 @@ static int intcapxt_irqdomain_alloc(stru
>
> irqd->chip = &intcapxt_controller;
> irqd->chip_data = info->data;
> + __irq_set_handler(i, handle_edge_irq, 0, "edge");
> }
>
> return ret;
>

Yes, this fixes the issue. Now I can receive the IOMMU event log interrupts for IO_PAGE_FAULT event, which is triggered
using the injection interface via debugfs.

Thanks,
Suravee

2020-11-18 17:10:25

by David Woodhouse

[permalink] [raw]
Subject: Re: [EXTERNAL] [tip: x86/apic] x86/io_apic: Cleanup trigger/polarity helpers

On Wed, 2020-11-18 at 23:51 +0700, Suravee Suthikulpanit wrote:
> Yes, this fixes the issue. Now I can receive the IOMMU event log
> interrupts for IO_PAGE_FAULT event, which is triggered
> using the injection interface via debugfs.

Thanks, Suravee.


Attachments:
smime.p7s (5.05 kB)

2020-11-18 20:19:34

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/apic] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode

The following commit has been merged into the x86/apic branch of tip:

Commit-ID: d1adcfbb520c43c10fc22fcdccdd4204e014fb53
Gitweb: https://git.kernel.org/tip/d1adcfbb520c43c10fc22fcdccdd4204e014fb53
Author: David Woodhouse <[email protected]>
AuthorDate: Wed, 11 Nov 2020 12:09:01
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 18 Nov 2020 20:55:59 +01:00

iommu/amd: Fix IOMMU interrupt generation in X2APIC mode

The AMD IOMMU has two modes for generating its own interrupts.

The first is very much based on PCI MSI, and can be configured by Linux
precisely that way. But like legacy unmapped PCI MSI it's limited to
8 bits of APIC ID.

The second method does not use PCI MSI at all in hardawre, and instead
configures the INTCAPXT registers in the IOMMU directly with the APIC ID
and vector.

In the latter case, the IOMMU driver would still use pci_enable_msi(),
read back (through MMIO) the MSI message that Linux wrote to the PCI MSI
table, then swizzle those bits into the appropriate register.

Historically, this worked because__irq_compose_msi_msg() would silently
generate an invalid MSI message with the high bits of the APIC ID in the
high bits of the MSI address. That hack was intended only for the Intel
IOMMU, and I recently enforced that, introducing a warning in
__irq_msi_compose_msg() if it was invoked with an APIC ID above 255.

Fix the AMD IOMMU not to depend on that hack any more, by having its own
irqdomain and directly putting the bits from the irq_cfg into the right
place in its ->activate() method.

Fixes: 47bea873cf80 "x86/msi: Only use high bits of MSI address for DMAR unit")
Signed-off-by: David Woodhouse <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Suravee Suthikulpanit <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/hw_irq.h | 1 +-
drivers/iommu/amd/init.c | 191 ++++++++++++++++++++++-----------
2 files changed, 133 insertions(+), 59 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 458f5a6..d465ece 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -39,6 +39,7 @@ enum irq_alloc_type {
X86_IRQ_ALLOC_TYPE_PCI_MSI,
X86_IRQ_ALLOC_TYPE_PCI_MSIX,
X86_IRQ_ALLOC_TYPE_DMAR,
+ X86_IRQ_ALLOC_TYPE_AMDVI,
X86_IRQ_ALLOC_TYPE_UV,
};

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index a94b96f..07d1f99 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
+#include <linux/irq.h>
#include <linux/amd-iommu.h>
#include <linux/export.h>
#include <linux/kmemleak.h>
@@ -1557,14 +1558,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
break;
}

- /*
- * Note: Since iommu_update_intcapxt() leverages
- * the IOMMU MMIO access to MSI capability block registers
- * for MSI address lo/hi/data, we need to check both
- * EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support.
- */
- if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) &&
- (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT)))
+ if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT))
amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE;
break;
default:
@@ -1981,27 +1975,32 @@ union intcapxt {
} __attribute__ ((packed));

/*
- * Setup the IntCapXT registers with interrupt routing information
- * based on the PCI MSI capability block registers, accessed via
- * MMIO MSI address low/hi and MSI data registers.
+ * There isn't really any need to mask/unmask at the irqchip level because
+ * the 64-bit INTCAPXT registers can be updated atomically without tearing
+ * when the affinity is being updated.
*/
-static void iommu_update_intcapxt(struct amd_iommu *iommu)
+static void intcapxt_unmask_irq(struct irq_data *data)
{
- struct msi_msg msg;
- union intcapxt xt;
- u32 destid;
+}

- msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET);
- msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET);
- msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET);
+static void intcapxt_mask_irq(struct irq_data *data)
+{
+}

- destid = x86_msi_msg_get_destid(&msg, x2apic_enabled());
+static struct irq_chip intcapxt_controller;
+
+static int intcapxt_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irqd, bool reserve)
+{
+ struct amd_iommu *iommu = irqd->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+ union intcapxt xt;

xt.capxt = 0ULL;
- xt.dest_mode_logical = msg.arch_data.dest_mode_logical;
- xt.vector = msg.arch_data.vector;
- xt.destid_0_23 = destid & GENMASK(23, 0);
- xt.destid_24_31 = destid >> 24;
+ xt.dest_mode_logical = apic->dest_mode_logical;
+ xt.vector = cfg->vector;
+ xt.destid_0_23 = cfg->dest_apicid & GENMASK(23, 0);
+ xt.destid_24_31 = cfg->dest_apicid >> 24;

/**
* Current IOMMU implemtation uses the same IRQ for all
@@ -2010,64 +2009,142 @@ static void iommu_update_intcapxt(struct amd_iommu *iommu)
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET);
writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET);
+ return 0;
}

-static void _irq_notifier_notify(struct irq_affinity_notify *notify,
- const cpumask_t *mask)
+static void intcapxt_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irqd)
{
- struct amd_iommu *iommu;
+ intcapxt_mask_irq(irqd);
+}

- for_each_iommu(iommu) {
- if (iommu->dev->irq == notify->irq) {
- iommu_update_intcapxt(iommu);
- break;
- }
+
+static int intcapxt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ int i, ret;
+
+ if (!info || info->type != X86_IRQ_ALLOC_TYPE_AMDVI)
+ return -EINVAL;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = virq; i < virq + nr_irqs; i++) {
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, i);
+
+ irqd->chip = &intcapxt_controller;
+ irqd->chip_data = info->data;
+ __irq_set_handler(i, handle_edge_irq, 0, "edge");
}
+
+ return ret;
}

-static void _irq_notifier_release(struct kref *ref)
+static void intcapxt_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
{
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}

-static int iommu_init_intcapxt(struct amd_iommu *iommu)
+static int intcapxt_set_affinity(struct irq_data *irqd,
+ const struct cpumask *mask, bool force)
{
+ struct irq_data *parent = irqd->parent_data;
int ret;
- struct irq_affinity_notify *notify = &iommu->intcapxt_notify;

- /**
- * IntCapXT requires XTSup=1 and MsiCapMmioSup=1,
- * which can be inferred from amd_iommu_xt_mode.
- */
- if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE)
- return 0;
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;

- /**
- * Also, we need to setup notifier to update the IntCapXT registers
- * whenever the irq affinity is changed from user-space.
- */
- notify->irq = iommu->dev->irq;
- notify->notify = _irq_notifier_notify,
- notify->release = _irq_notifier_release,
- ret = irq_set_affinity_notifier(iommu->dev->irq, notify);
+ return intcapxt_irqdomain_activate(irqd->domain, irqd, false);
+}
+
+static struct irq_chip intcapxt_controller = {
+ .name = "IOMMU-MSI",
+ .irq_unmask = intcapxt_unmask_irq,
+ .irq_mask = intcapxt_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_affinity = intcapxt_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static const struct irq_domain_ops intcapxt_domain_ops = {
+ .alloc = intcapxt_irqdomain_alloc,
+ .free = intcapxt_irqdomain_free,
+ .activate = intcapxt_irqdomain_activate,
+ .deactivate = intcapxt_irqdomain_deactivate,
+};
+
+
+static struct irq_domain *iommu_irqdomain;
+
+static struct irq_domain *iommu_get_irqdomain(void)
+{
+ struct fwnode_handle *fn;
+
+ /* No need for locking here (yet) as the init is single-threaded */
+ if (iommu_irqdomain)
+ return iommu_irqdomain;
+
+ fn = irq_domain_alloc_named_fwnode("AMD-Vi-MSI");
+ if (!fn)
+ return NULL;
+
+ iommu_irqdomain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0,
+ fn, &intcapxt_domain_ops,
+ NULL);
+ if (!iommu_irqdomain)
+ irq_domain_free_fwnode(fn);
+
+ return iommu_irqdomain;
+}
+
+static int iommu_setup_intcapxt(struct amd_iommu *iommu)
+{
+ struct irq_domain *domain;
+ struct irq_alloc_info info;
+ int irq, ret;
+
+ domain = iommu_get_irqdomain();
+ if (!domain)
+ return -ENXIO;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_AMDVI;
+ info.data = iommu;
+
+ irq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+ if (irq < 0) {
+ irq_domain_remove(domain);
+ return irq;
+ }
+
+ ret = request_threaded_irq(irq, amd_iommu_int_handler,
+ amd_iommu_int_thread, 0, "AMD-Vi", iommu);
if (ret) {
- pr_err("Failed to register irq affinity notifier (devid=%#x, irq %d)\n",
- iommu->devid, iommu->dev->irq);
+ irq_domain_free_irqs(irq, 1);
+ irq_domain_remove(domain);
return ret;
}

- iommu_update_intcapxt(iommu);
iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN);
- return ret;
+ return 0;
}

-static int iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_irq(struct amd_iommu *iommu)
{
int ret;

if (iommu->int_enabled)
goto enable_faults;

- if (iommu->dev->msi_cap)
+ if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE)
+ ret = iommu_setup_intcapxt(iommu);
+ else if (iommu->dev->msi_cap)
ret = iommu_setup_msi(iommu);
else
ret = -ENODEV;
@@ -2076,10 +2153,6 @@ static int iommu_init_msi(struct amd_iommu *iommu)
return ret;

enable_faults:
- ret = iommu_init_intcapxt(iommu);
- if (ret)
- return ret;
-
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);

if (iommu->ppr_log != NULL)
@@ -2702,7 +2775,7 @@ static int amd_iommu_enable_interrupts(void)
int ret = 0;

for_each_iommu(iommu) {
- ret = iommu_init_msi(iommu);
+ ret = iommu_init_irq(iommu);
if (ret)
goto out;
}