For IOMMU AVIC, the IOMMU driver needs to track vCPU scheduling
changes and update the interrupt remapping table entry (IRTE)
accordingly. The IRTE is normally cached by the hardware, which
requires the IOMMU driver to issue an IOMMU IRT invalidation command
and wait for its completion every time it updates the table.
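
As a rough sketch (using the driver's existing helpers; simplified,
not the exact code), the per-update flush sequence is:

	/* after writing the updated IRTE for @devid into the table ... */
	iommu_flush_irt(iommu, devid);	/* queue INVALIDATE_INTERRUPT_TABLE */
	iommu_completion_wait(iommu);	/* queue COMPLETION_WAIT and wait */
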
On a large-scale system with many vCPUs and VFIO pass-through devices
running interrupt-intensive workloads, enabling IOMMU AVIC can result
in a high IRT invalidation rate. In such cases, the overhead of IRT
invalidation can outweigh the benefit of IRTE caching.
Therefore, introduce a new AMD IOMMU driver option "amd_iommu=irtcachedis"
to allow disabling IRTE caching and thereby avoid the need for IRT
invalidation.
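
For example, booting with:

	amd_iommu=irtcachedis

on the kernel command line sets the IRTCacheDis bit in each IOMMU
Control register, after which the driver skips the per-update IRT
invalidation.
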
Patches 1-2 prepare the AMD IOMMU driver to support IRT cache disabling.
Patches 3-4 introduce IRT cache disabling support.
Patch 5 improves the code path in the IOMMU driver for updating vCPU
scheduling for AVIC.
Thank you,
Suravee
Changes from V1
(https://lore.kernel.org/lkml/[email protected]/T/)
  * Patch 3: Add logic to clean up the IRTE cache disabling bit
    and handle the kdump code path (per Alejandro)
Joao Martins (1):
iommu/amd: Switch amd_iommu_update_ga() to use modify_irte_ga()
Suravee Suthikulpanit (4):
iommu/amd: Remove the unused struct amd_ir_data.ref
iommu/amd: Introduce Disable IRTE Caching Support
iommu/amd: Do not Invalidate IRT when disable IRTE caching
iommu/amd: Improving Interrupt Remapping Table Invalidation
.../admin-guide/kernel-parameters.txt | 1 +
drivers/iommu/amd/amd_iommu_types.h | 7 +-
drivers/iommu/amd/init.c | 38 +++++++-
drivers/iommu/amd/iommu.c | 97 ++++++++++---------
4 files changed, 94 insertions(+), 49 deletions(-)
--
2.31.1
With the Interrupt Remapping Table cache disabled, there is no need to
issue the IRT invalidation command and wait for its completion.
Therefore, add logic to bypass the operation.
Suggested-by: Joao Martins <[email protected]>
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
drivers/iommu/amd/iommu.c | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 0c4a2796bb0a..51c2b018433d 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1273,12 +1273,24 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
u32 devid;
u16 last_bdf = iommu->pci_seg->last_bdf;
+ if (iommu->irtcachedis_enabled)
+ return;
+
for (devid = 0; devid <= last_bdf; devid++)
iommu_flush_irt(iommu, devid);
iommu_completion_wait(iommu);
}
+static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
+{
+ if (iommu->irtcachedis_enabled)
+ return;
+
+ iommu_flush_irt(iommu, devid);
+ iommu_completion_wait(iommu);
+}
+
void iommu_flush_all_caches(struct amd_iommu *iommu)
{
if (iommu_feature(iommu, FEATURE_IA)) {
@@ -3028,8 +3040,7 @@ static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
raw_spin_unlock_irqrestore(&table->lock, flags);
- iommu_flush_irt(iommu, devid);
- iommu_completion_wait(iommu);
+ iommu_flush_irt_and_complete(iommu, devid);
return 0;
}
@@ -3048,8 +3059,7 @@ static int modify_irte(struct amd_iommu *iommu,
table->table[index] = irte->val;
raw_spin_unlock_irqrestore(&table->lock, flags);
- iommu_flush_irt(iommu, devid);
- iommu_completion_wait(iommu);
+ iommu_flush_irt_and_complete(iommu, devid);
return 0;
}
@@ -3067,8 +3077,7 @@ static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
iommu->irte_ops->clear_allocated(table, index);
raw_spin_unlock_irqrestore(&table->lock, flags);
- iommu_flush_irt(iommu, devid);
- iommu_completion_wait(iommu);
+ iommu_flush_irt_and_complete(iommu, devid);
}
static void irte_prepare(void *entry,
--
2.31.1
From: Joao Martins <[email protected]>
The modify_irte_ga() helper uses cmpxchg_double() to update the IRTE in
one shot, which is necessary when adding IRTE cache disabling support,
since the driver no longer needs to flush the IRT for the update to
take effect in hardware.

Please note that there is a functional change: the IsRun and
Destination bits of the IRTE are now cached in struct amd_ir_data.entry.
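
For reference, the update in modify_irte_ga() boils down to a 128-bit
compare-and-exchange on the live table entry, roughly like the
following sketch (simplified from the driver; table locking and the
caller's flush are omitted):

	struct irte_ga *entry = &((struct irte_ga *)table->table)[index];

	/*
	 * Replace both 64-bit halves of the IRTE in one shot so the
	 * hardware never observes a half-updated entry, even when no
	 * IRT flush follows (i.e. with IRTE caching disabled).
	 */
	bool ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
				  entry->lo.val, entry->hi.val,
				  irte->lo.val, irte->hi.val);
	WARN_ON(!ret);
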
Signed-off-by: Joao Martins <[email protected]>
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
drivers/iommu/amd/iommu.c | 38 ++++++++++----------------------------
1 file changed, 10 insertions(+), 28 deletions(-)
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index ebb155bfef15..4a3a7346ab21 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3700,44 +3700,26 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
int amd_iommu_update_ga(int cpu, bool is_run, void *data)
{
- unsigned long flags;
- struct amd_iommu *iommu;
- struct irq_remap_table *table;
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
- int devid = ir_data->irq_2_irte.devid;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
- struct irte_ga *ref = (struct irte_ga *) ir_data->ref;
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
- !ref || !entry || !entry->lo.fields_vapic.guest_mode)
+ !entry || !entry->lo.fields_vapic.guest_mode)
return 0;
- iommu = ir_data->iommu;
- if (!iommu)
- return -ENODEV;
-
- table = get_irq_table(iommu, devid);
- if (!table)
+ if (!ir_data->iommu)
return -ENODEV;
- raw_spin_lock_irqsave(&table->lock, flags);
-
- if (ref->lo.fields_vapic.guest_mode) {
- if (cpu >= 0) {
- ref->lo.fields_vapic.destination =
- APICID_TO_IRTE_DEST_LO(cpu);
- ref->hi.fields.destination =
- APICID_TO_IRTE_DEST_HI(cpu);
- }
- ref->lo.fields_vapic.is_run = is_run;
- barrier();
+ if (cpu >= 0) {
+ entry->lo.fields_vapic.destination =
+ APICID_TO_IRTE_DEST_LO(cpu);
+ entry->hi.fields.destination =
+ APICID_TO_IRTE_DEST_HI(cpu);
}
+ entry->lo.fields_vapic.is_run = is_run;
- raw_spin_unlock_irqrestore(&table->lock, flags);
-
- iommu_flush_irt(iommu, devid);
- iommu_completion_wait(iommu);
- return 0;
+ return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry, ir_data);
}
EXPORT_SYMBOL(amd_iommu_update_ga);
#endif
--
2.31.1
Invalidating the Interrupt Remapping Table (IRT) requires the AMD IOMMU
driver to issue INVALIDATE_INTERRUPT_TABLE and COMPLETION_WAIT commands.
Currently, the driver issues the two commands separately, which requires
calling raw_spin_lock_irqsave() twice. In addition, the COMPLETION_WAIT
could potentially be interleaved with other commands, delaying the
COMPLETION_WAIT command.

Therefore, combine issuing of the two commands under one spin lock, and
change struct amd_iommu.cmd_sem_val to atomic64_t to minimize locking.
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
drivers/iommu/amd/amd_iommu_types.h | 2 +-
drivers/iommu/amd/init.c | 2 +-
drivers/iommu/amd/iommu.c | 27 ++++++++++++++++++++++-----
3 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 486a052e37ca..2fa65da2a9a5 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -744,7 +744,7 @@ struct amd_iommu {
u32 flags;
volatile u64 *cmd_sem;
- u64 cmd_sem_val;
+ atomic64_t cmd_sem_val;
#ifdef CONFIG_AMD_IOMMU_DEBUGFS
/* DebugFS Info */
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index fc0392d706db..16737819f79a 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1750,7 +1750,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h,
iommu->pci_seg = pci_seg;
raw_spin_lock_init(&iommu->lock);
- iommu->cmd_sem_val = 0;
+ atomic64_set(&iommu->cmd_sem_val, 0);
/* Add IOMMU to internal data structures */
list_add_tail(&iommu->list, &amd_iommu_list);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 51c2b018433d..57ae4a8072d3 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1182,11 +1182,11 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
if (!iommu->need_sync)
return 0;
- raw_spin_lock_irqsave(&iommu->lock, flags);
-
- data = ++iommu->cmd_sem_val;
+ data = atomic64_add_return(1, &iommu->cmd_sem_val);
build_completion_wait(&cmd, iommu, data);
+ raw_spin_lock_irqsave(&iommu->lock, flags);
+
ret = __iommu_queue_command_sync(iommu, &cmd, false);
if (ret)
goto out_unlock;
@@ -1284,11 +1284,28 @@ static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
{
+ int ret;
+ u64 data;
+ unsigned long flags;
+ struct iommu_cmd cmd, cmd2;
+
if (iommu->irtcachedis_enabled)
return;
- iommu_flush_irt(iommu, devid);
- iommu_completion_wait(iommu);
+ build_inv_irt(&cmd, devid);
+ data = atomic64_add_return(1, &iommu->cmd_sem_val);
+ build_completion_wait(&cmd2, iommu, data);
+
+ raw_spin_lock_irqsave(&iommu->lock, flags);
+ ret = __iommu_queue_command_sync(iommu, &cmd, true);
+ if (ret)
+ goto out;
+ ret = __iommu_queue_command_sync(iommu, &cmd2, false);
+ if (ret)
+ goto out;
+ wait_on_sem(iommu, data);
+out:
+ raw_spin_unlock_irqrestore(&iommu->lock, flags);
}
void iommu_flush_all_caches(struct amd_iommu *iommu)
--
2.31.1
Since amd_iommu_update_ga() has been switched to use the
modify_irte_ga() helper function to update the IRTE, the struct
amd_ir_data parameter (previously used to stash a pointer to the live
IRTE in amd_ir_data.ref) is no longer needed.
Suggested-by: Vasant Hegde <[email protected]>
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
drivers/iommu/amd/amd_iommu_types.h | 1 -
drivers/iommu/amd/iommu.c | 17 +++++++----------
2 files changed, 7 insertions(+), 11 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 3d684190b4d5..a0ff1e852efc 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -993,7 +993,6 @@ struct amd_ir_data {
struct irq_2_irte irq_2_irte;
struct msi_msg msi_entry;
void *entry; /* Pointer to union irte or struct irte_ga */
- void *ref; /* Pointer to the actual irte */
/**
* Store information for activate/de-activate
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 4a3a7346ab21..0c4a2796bb0a 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2999,7 +2999,7 @@ static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
}
static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
- struct irte_ga *irte, struct amd_ir_data *data)
+ struct irte_ga *irte)
{
bool ret;
struct irq_remap_table *table;
@@ -3026,9 +3026,6 @@ static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
*/
WARN_ON(!ret);
- if (data)
- data->ref = entry;
-
raw_spin_unlock_irqrestore(&table->lock, flags);
iommu_flush_irt(iommu, devid);
@@ -3117,7 +3114,7 @@ static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u1
struct irte_ga *irte = (struct irte_ga *) entry;
irte->lo.fields_remap.valid = 1;
- modify_irte_ga(iommu, devid, index, irte, NULL);
+ modify_irte_ga(iommu, devid, index, irte);
}
static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
@@ -3133,7 +3130,7 @@ static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid,
struct irte_ga *irte = (struct irte_ga *) entry;
irte->lo.fields_remap.valid = 0;
- modify_irte_ga(iommu, devid, index, irte, NULL);
+ modify_irte_ga(iommu, devid, index, irte);
}
static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
@@ -3157,7 +3154,7 @@ static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid
APICID_TO_IRTE_DEST_LO(dest_apicid);
irte->hi.fields.destination =
APICID_TO_IRTE_DEST_HI(dest_apicid);
- modify_irte_ga(iommu, devid, index, irte, NULL);
+ modify_irte_ga(iommu, devid, index, irte);
}
}
@@ -3508,7 +3505,7 @@ int amd_iommu_activate_guest_mode(void *data)
entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
- ir_data->irq_2_irte.index, entry, ir_data);
+ ir_data->irq_2_irte.index, entry);
}
EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
@@ -3538,7 +3535,7 @@ int amd_iommu_deactivate_guest_mode(void *data)
APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
- ir_data->irq_2_irte.index, entry, ir_data);
+ ir_data->irq_2_irte.index, entry);
}
EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
@@ -3719,7 +3716,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
entry->lo.fields_vapic.is_run = is_run;
return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
- ir_data->irq_2_irte.index, entry, ir_data);
+ ir_data->irq_2_irte.index, entry);
}
EXPORT_SYMBOL(amd_iommu_update_ga);
#endif
--
2.31.1
An Interrupt Remapping Table (IRT) stores interrupt remapping configuration
for each device. In normal operation, the AMD IOMMU caches the table
to optimize subsequent data accesses. This requires the IOMMU driver to
invalidate the IRT whenever it updates the table. The invalidation process
includes issuing an INVALIDATE_INTERRUPT_TABLE command followed by
a COMPLETION_WAIT command.

However, there are cases in which the IRT is updated at a high rate.
For example, for IOMMU AVIC, the IRTE[IsRun] bit is updated on every
vCPU scheduling change (i.e. amd_iommu_update_ga()). On systems with a
large number of vCPUs and VFIO PCI pass-through devices, the invalidation
process could potentially become a performance bottleneck.

Introduce a new kernel boot option:

    amd_iommu=irtcachedis

which disables IRTE caching by setting the IRTCacheDis bit in each IOMMU
Control register, and bypasses the IRT invalidation process.
Co-developed-by: Alejandro Jimenez <[email protected]>
[Awaiting sign-off by Alejandro]
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 1 +
drivers/iommu/amd/amd_iommu_types.h | 4 +++
drivers/iommu/amd/init.c | 36 +++++++++++++++++++
3 files changed, 41 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6221a1d057dd..f29dee600faf 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -323,6 +323,7 @@
option with care.
pgtbl_v1 - Use v1 page table for DMA-API (Default).
pgtbl_v2 - Use v2 page table for DMA-API.
+ irtcachedis - Disable Interrupt Remapping Table (IRT) caching.
amd_iommu_dump= [HW,X86-64]
Enable AMD IOMMU driver option to dump the ACPI table
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a0ff1e852efc..486a052e37ca 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -172,6 +172,7 @@
#define CONTROL_GAINT_EN 29
#define CONTROL_XT_EN 50
#define CONTROL_INTCAPXT_EN 51
+#define CONTROL_IRTCACHEDIS 59
#define CONTROL_SNPAVIC_EN 61
#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT)
@@ -708,6 +709,9 @@ struct amd_iommu {
/* if one, we need to send a completion wait command */
bool need_sync;
+	/* true if IRTE caching is disabled */
+ bool irtcachedis_enabled;
+
/* Handle for IOMMU core code */
struct iommu_device iommu;
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index fd487c33b28a..fc0392d706db 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -160,6 +160,7 @@ static int amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE;
static bool amd_iommu_detected;
static bool amd_iommu_disabled __initdata;
static bool amd_iommu_force_enable __initdata;
+static bool amd_iommu_irtcachedis;
static int amd_iommu_target_ivhd_type;
/* Global EFR and EFR2 registers */
@@ -477,6 +478,9 @@ static void iommu_disable(struct amd_iommu *iommu)
/* Disable IOMMU hardware itself */
iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
+
+ /* Clear IRTE cache disabling bit */
+ iommu_feature_disable(iommu, CONTROL_IRTCACHEDIS);
}
/*
@@ -2700,6 +2704,33 @@ static void iommu_enable_ga(struct amd_iommu *iommu)
#endif
}
+static void iommu_disable_irtcachedis(struct amd_iommu *iommu)
+{
+ iommu_feature_disable(iommu, CONTROL_IRTCACHEDIS);
+}
+
+static void iommu_enable_irtcachedis(struct amd_iommu *iommu)
+{
+ u64 ctrl;
+
+ if (!amd_iommu_irtcachedis)
+ return;
+
+ /*
+ * Note:
+	 * The support for the IRTCacheDis feature is determined by
+	 * checking whether the bit is writable.
+ */
+ iommu_feature_enable(iommu, CONTROL_IRTCACHEDIS);
+ ctrl = readq(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= (1ULL << CONTROL_IRTCACHEDIS);
+ if (ctrl)
+ iommu->irtcachedis_enabled = true;
+ pr_info("iommu%d (%#06x) : IRT cache is %s\n",
+ iommu->index, iommu->devid,
+ iommu->irtcachedis_enabled ? "disabled" : "enabled");
+}
+
static void early_enable_iommu(struct amd_iommu *iommu)
{
iommu_disable(iommu);
@@ -2710,6 +2741,7 @@ static void early_enable_iommu(struct amd_iommu *iommu)
iommu_set_exclusion_range(iommu);
iommu_enable_ga(iommu);
iommu_enable_xt(iommu);
+ iommu_enable_irtcachedis(iommu);
iommu_enable(iommu);
iommu_flush_all_caches(iommu);
}
@@ -2760,10 +2792,12 @@ static void early_enable_iommus(void)
for_each_iommu(iommu) {
iommu_disable_command_buffer(iommu);
iommu_disable_event_buffer(iommu);
+ iommu_disable_irtcachedis(iommu);
iommu_enable_command_buffer(iommu);
iommu_enable_event_buffer(iommu);
iommu_enable_ga(iommu);
iommu_enable_xt(iommu);
+ iommu_enable_irtcachedis(iommu);
iommu_set_device_table(iommu);
iommu_flush_all_caches(iommu);
}
@@ -3411,6 +3445,8 @@ static int __init parse_amd_iommu_options(char *str)
amd_iommu_pgtable = AMD_IOMMU_V1;
} else if (strncmp(str, "pgtbl_v2", 8) == 0) {
amd_iommu_pgtable = AMD_IOMMU_V2;
+ } else if (strncmp(str, "irtcachedis", 11) == 0) {
+ amd_iommu_irtcachedis = true;
} else {
pr_notice("Unknown option - '%s'\n", str);
}
--
2.31.1
Hi Suravee,
On 5/18/2023 8:55 PM, Suravee Suthikulpanit wrote:
> An Interrupt Remapping Table (IRT) stores interrupt remapping configuration
> for each device. In normal operation, the AMD IOMMU caches the table
> to optimize subsequent data accesses. This requires the IOMMU driver to
> invalidate the IRT whenever it updates the table. The invalidation process
> includes issuing an INVALIDATE_INTERRUPT_TABLE command followed by
> a COMPLETION_WAIT command.
>
> However, there are cases in which the IRT is updated at a high rate.
> For example, for IOMMU AVIC, the IRTE[IsRun] bit is updated on every
> vCPU scheduling change (i.e. amd_iommu_update_ga()). On systems with a
> large number of vCPUs and VFIO PCI pass-through devices, the invalidation
> process could potentially become a performance bottleneck.
>
> Introduce a new kernel boot option:
>
>     amd_iommu=irtcachedis
>
> which disables IRTE caching by setting the IRTCacheDis bit in each IOMMU
> Control register, and bypasses the IRT invalidation process.
>
> Co-developed-by: Alejandro Jimenez <[email protected]>
> [Awaiting sign-off by Alejandro]
Signed-off-by: Alejandro Jimenez <[email protected]>
I sanity tested the kdump mechanism, and confirmed that
CONTROL_IRTCACHEDIS is set appropriately based on the irtcachedis kernel
parameter.
Also, I have not observed any errors during multiple rounds of testing
with large vCPU counts and VFIO pass-through devices under an
interrupt-intensive workload that causes heavy vCPU scheduling activity
and exercises the relevant code path.
Thank you,
Alejandro
On Thu, May 18, 2023 at 08:55:29PM -0400, Suravee Suthikulpanit wrote:
> Invalidating the Interrupt Remapping Table (IRT) requires the AMD IOMMU
> driver to issue INVALIDATE_INTERRUPT_TABLE and COMPLETION_WAIT commands.
> Currently, the driver issues the two commands separately, which requires
> calling raw_spin_lock_irqsave() twice. In addition, the COMPLETION_WAIT
> could potentially be interleaved with other commands, delaying the
> COMPLETION_WAIT command.
>
> Therefore, combine issuing of the two commands under one spin lock, and
> change struct amd_iommu.cmd_sem_val to atomic64_t to minimize locking.
>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
Reviewed-by: Jerry Snitselaar <[email protected]>
On Thu, May 18, 2023 at 08:55:27PM -0400, Suravee Suthikulpanit wrote:
> An Interrupt Remapping Table (IRT) stores interrupt remapping configuration
> for each device. In normal operation, the AMD IOMMU caches the table
> to optimize subsequent data accesses. This requires the IOMMU driver to
> invalidate the IRT whenever it updates the table. The invalidation process
> includes issuing an INVALIDATE_INTERRUPT_TABLE command followed by
> a COMPLETION_WAIT command.
>
> However, there are cases in which the IRT is updated at a high rate.
> For example, for IOMMU AVIC, the IRTE[IsRun] bit is updated on every
> vCPU scheduling change (i.e. amd_iommu_update_ga()). On systems with a
> large number of vCPUs and VFIO PCI pass-through devices, the invalidation
> process could potentially become a performance bottleneck.
>
> Introduce a new kernel boot option:
>
>     amd_iommu=irtcachedis
>
> which disables IRTE caching by setting the IRTCacheDis bit in each IOMMU
> Control register, and bypasses the IRT invalidation process.
>
> Co-developed-by: Alejandro Jimenez <[email protected]>
> [Awaiting sign-off by Alejandro]
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
Reviewed-by: Jerry Snitselaar <[email protected]>
On Thu, May 18, 2023 at 08:55:28PM -0400, Suravee Suthikulpanit wrote:
> With the Interrupt Remapping Table cache disabled, there is no need to
> issue the IRT invalidation command and wait for its completion.
> Therefore, add logic to bypass the operation.
>
> Suggested-by: Joao Martins <[email protected]>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
Would it be clearer for the summary to be "iommu/amd: Do not
Invalidate IRT when IRTE caching is disabled"?
Reviewed-by: Jerry Snitselaar <[email protected]>
On Thu, May 18, 2023 at 08:55:25PM -0400, Suravee Suthikulpanit wrote:
> From: Joao Martins <[email protected]>
>
> The modify_irte_ga() helper uses cmpxchg_double() to update the IRTE in
> one shot, which is necessary when adding IRTE cache disabling support,
> since the driver no longer needs to flush the IRT for the update to
> take effect in hardware.
>
> Please note that there is a functional change: the IsRun and
> Destination bits of the IRTE are now cached in struct amd_ir_data.entry.
>
> Signed-off-by: Joao Martins <[email protected]>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
Reviewed-by: Jerry Snitselaar <[email protected]>
On Thu, May 18, 2023 at 08:55:26PM -0400, Suravee Suthikulpanit wrote:
> Since amd_iommu_update_ga() has been switched to use the
> modify_irte_ga() helper function to update the IRTE, the struct
> amd_ir_data parameter (previously used to stash a pointer to the live
> IRTE in amd_ir_data.ref) is no longer needed.
>
> Suggested-by: Vasant Hegde <[email protected]>
> Signed-off-by: Suravee Suthikulpanit <[email protected]>
Reviewed-by: Jerry Snitselaar <[email protected]>
On 5/23/2023 7:17 AM, [email protected] wrote:
> Would it be clearer for the summary to be "iommu/amd: Do not
> Invalidate IRT when IRTE caching is disabled"?
I have no objection to an updated summary.
Joerg, please let me know if you would like me to send a v3 with the
updated summary.
Thank you,
Suravee
Hi Suravee,
On Tue, May 23, 2023 at 10:38:59AM +0700, Suthikulpanit, Suravee wrote:
> Joerg, please let me know if you would like me to send a v3 with the
> updated summary.
Yes, please send a v3 with all updates included.
Thanks,
Joerg