LinuxLists.cc - [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

2013-04-15 07:06:55

Subject: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

From: Suravee Suthikulpanit <[email protected]>

The IOMMU interrupt handling in bottom half must clear the PPR log interrupt
and event log interrupt bits to re-enable the interrupt. This is done by
writing 1 to the memory mapped register to clear the bit. Due to hardware bug,
if the driver tries to clear this bit while the IOMMU hardware also setting
this bit, the conflict will result with the bit being set. If the interrupt
handling code does not make sure to clear this bit, subsequent changes in the
event/PPR logs will no longer generating interrupts, and would result if
buffer overflow. After clearing the bits, the driver must read back
the register to verify.

Signed-off-by: Suravee Suthikulpanit <[email protected]>
---
drivers/iommu/amd_iommu.c | 145 +++++++++++++++++++++++++++++----------------
1 file changed, 93 insertions(+), 52 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index e5db3e5..419af1d 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -803,22 +803,43 @@ retry:
static void iommu_poll_events(struct amd_iommu *iommu)
{
u32 head, tail;
+ u32 status;
unsigned long flags;

- /* enable event interrupts again */
- writel(MMIO_STATUS_EVT_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
spin_lock_irqsave(&iommu->lock, flags);

- head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);

- while (head != tail) {
- iommu_handle_event(iommu, iommu->evt_buf + head);
- head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
- }
+ while (status & MMIO_STATUS_EVT_INT_MASK) {
+
+ /* enable event interrupts again */
+ writel(MMIO_STATUS_EVT_INT_MASK,
+ iommu->mmio_base + MMIO_STATUS_OFFSET);

- writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+ while (head != tail) {
+ iommu_handle_event(iommu, iommu->evt_buf + head);
+ head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
+ }
+
+ writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+
+ /*
+ * Hardware bug: When re-enabling interrupt (by writing 1
+ * to clear the bit), the hardware might also try to set
+ * the interrupt bit in the event status register.
+ * In this scenario, the bit will be set, and disable
+ * subsequent interrupts.
+ *
+ * Workaround: The IOMMU driver should read back the
+ * status register and check if the interrupt bits are cleared.
+ * If not, driver will need to go through the interrupt handler
+ * again and re-clear the bits
+ */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ }

spin_unlock_irqrestore(&iommu->lock, flags);
}
@@ -846,65 +867,85 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
static void iommu_poll_ppr_log(struct amd_iommu *iommu)
{
unsigned long flags;
+ u32 status;
u32 head, tail;

if (iommu->ppr_log == NULL)
return;

- /* enable ppr interrupts again */
- writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
spin_lock_irqsave(&iommu->lock, flags);

- head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);

- while (head != tail) {
- volatile u64 *raw;
- u64 entry[2];
- int i;
+ while (status & MMIO_STATUS_PPR_INT_MASK) {
+ /* enable ppr interrupts again */
+ writel(MMIO_STATUS_PPR_INT_MASK,
+ iommu->mmio_base + MMIO_STATUS_OFFSET);

- raw = (u64 *)(iommu->ppr_log + head);
+ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);

- /*
- * Hardware bug: Interrupt may arrive before the entry is
- * written to memory. If this happens we need to wait for the
- * entry to arrive.
- */
- for (i = 0; i < LOOP_TIMEOUT; ++i) {
- if (PPR_REQ_TYPE(raw[0]) != 0)
- break;
- udelay(1);
- }
+ while (head != tail) {
+ volatile u64 *raw;
+ u64 entry[2];
+ int i;

- /* Avoid memcpy function-call overhead */
- entry[0] = raw[0];
- entry[1] = raw[1];
+ raw = (u64 *)(iommu->ppr_log + head);

- /*
- * To detect the hardware bug we need to clear the entry
- * back to zero.
- */
- raw[0] = raw[1] = 0UL;
+ /*
+ * Hardware bug: Interrupt may arrive before the entry
+ * is written to memory. If this happens we need to wait
+ * for the entry to arrive.
+ */
+ for (i = 0; i < LOOP_TIMEOUT; ++i) {
+ if (PPR_REQ_TYPE(raw[0]) != 0)
+ break;
+ udelay(1);
+ }

- /* Update head pointer of hardware ring-buffer */
- head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
- writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ /* Avoid memcpy function-call overhead */
+ entry[0] = raw[0];
+ entry[1] = raw[1];

- /*
- * Release iommu->lock because ppr-handling might need to
- * re-acquire it
- */
- spin_unlock_irqrestore(&iommu->lock, flags);
+ /*
+ * To detect the hardware bug we need to clear the entry
+ * back to zero.
+ */
+ raw[0] = raw[1] = 0UL;
+
+ /* Update head pointer of hardware ring-buffer */
+ head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE;
+ writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);

- /* Handle PPR entry */
- iommu_handle_ppr_entry(iommu, entry);
+ /*
+ * Release iommu->lock because ppr-handling might need
+ * to re-acquire it
+ */
+ spin_unlock_irqrestore(&iommu->lock, flags);

- spin_lock_irqsave(&iommu->lock, flags);
+ /* Handle PPR entry */
+ iommu_handle_ppr_entry(iommu, entry);

- /* Refresh ring-buffer information */
- head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+ spin_lock_irqsave(&iommu->lock, flags);
+
+ /* Refresh ring-buffer information */
+ head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
+ }
+
+ /*
+ * Hardware bug: When re-enabling interrupt (by writing 1
+ * to clear the bit), the hardware might also try to set
+ * the interrupt bit in the event status register.
+ * In this scenario, the bit will be set, and disable
+ * subsequent interrupts.
+ *
+ * Workaround: The IOMMU driver should read back the
+ * status register and check if the interrupt bits are cleared.
+ * If not, driver will need to go through the interrupt handler
+ * again and re-clear the bits
+ */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
}

spin_unlock_irqrestore(&iommu->lock, flags);
--
1.7.10.4

2013-04-18 16:02:29

by Joerg Roedel

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On Mon, Apr 15, 2013 at 02:07:46AM -0500, [email protected] wrote:
> drivers/iommu/amd_iommu.c | 145 +++++++++++++++++++++++++++++----------------

That is way too much for a simple erratum workaround, and too much for a
stable backport. I queued the patch below instead, which has a much
smaller diff and does the same. Please rebase your second patch on-top
of this one and send it again.

>From 4ba052102863da02db79c03d2483b6ad905737ad Mon Sep 17 00:00:00 2001
From: Joerg Roedel <[email protected]>
Date: Thu, 18 Apr 2013 17:55:04 +0200
Subject: [PATCH] iommu/amd: Workaround for ERBT1312

Work around an IOMMU hardware bug where clearing the
EVT_INT bit in the status register may race with the
hardware trying to set it again. When not handled the bit
might not be cleared and we lose all future event
interrupts.

Reported-by: Suravee Suthikulpanit <[email protected]>
Cc: [email protected]
Signed-off-by: Joerg Roedel <[email protected]>
---
drivers/iommu/amd_iommu.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index f42793d..de5ae4b 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -700,14 +700,23 @@ retry:

static void iommu_poll_events(struct amd_iommu *iommu)
{
- u32 head, tail;
+ u32 head, tail, status;
unsigned long flags;

- /* enable event interrupts again */
- writel(MMIO_STATUS_EVT_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
spin_lock_irqsave(&iommu->lock, flags);

+ /* enable event interrupts again */
+ do {
+ /*
+ * Workaround for Erratum ERBT1312
+ * Clearing the EVT_INT bit may race in the hardware, so read
+ * it again and make sure it was really cleared
+ */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ writel(MMIO_STATUS_EVT_INT_MASK,
+ iommu->mmio_base + MMIO_STATUS_OFFSET);
+ } while (status & MMIO_STATUS_EVT_INT_MASK);
+
head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

--
1.7.9.5

2013-04-18 16:13:31

by Suthikulpanit, Suravee

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

Joerg,

This workaround is required for both event log and ppr log. Your patch
is only taking care of the event log.

Suravee

On 4/18/2013 11:02 AM, Joerg Roedel wrote:
> On Mon, Apr 15, 2013 at 02:07:46AM -0500, [email protected] wrote:
>> drivers/iommu/amd_iommu.c | 145 +++++++++++++++++++++++++++++----------------
> That is way too much for a simple erratum workaround, and too much for a
> stable backport. I queued the patch below instead, which has a much
> smaller diff and does the same. Please rebase your second patch on-top
> of this one and send it again.
>
> From 4ba052102863da02db79c03d2483b6ad905737ad Mon Sep 17 00:00:00 2001
> From: Joerg Roedel <[email protected]>
> Date: Thu, 18 Apr 2013 17:55:04 +0200
> Subject: [PATCH] iommu/amd: Workaround for ERBT1312
>
> Work around an IOMMU hardware bug where clearing the
> EVT_INT bit in the status register may race with the
> hardware trying to set it again. When not handled the bit
> might not be cleared and we lose all future event
> interrupts.
>
> Reported-by: Suravee Suthikulpanit <[email protected]>
> Cc: [email protected]
> Signed-off-by: Joerg Roedel <[email protected]>
> ---
> drivers/iommu/amd_iommu.c | 17 +++++++++++++----
> 1 file changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index f42793d..de5ae4b 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -700,14 +700,23 @@ retry:
>
> static void iommu_poll_events(struct amd_iommu *iommu)
> {
> - u32 head, tail;
> + u32 head, tail, status;
> unsigned long flags;
>
> - /* enable event interrupts again */
> - writel(MMIO_STATUS_EVT_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
> -
> spin_lock_irqsave(&iommu->lock, flags);
>
> + /* enable event interrupts again */
> + do {
> + /*
> + * Workaround for Erratum ERBT1312
> + * Clearing the EVT_INT bit may race in the hardware, so read
> + * it again and make sure it was really cleared
> + */
> + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
> + writel(MMIO_STATUS_EVT_INT_MASK,
> + iommu->mmio_base + MMIO_STATUS_OFFSET);
> + } while (status & MMIO_STATUS_EVT_INT_MASK);
> +
> head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
> tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
>

2013-04-18 16:29:03

by Joerg Roedel

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On Thu, Apr 18, 2013 at 11:13:19AM -0500, Suravee Suthikulanit wrote:
> This workaround is required for both event log and ppr log. Your
> patch is only taking care of the event log.

Right, thanks for the notice. Here is the updated patch.

>From cebe04596989c4b9001e2c1571c4fb219ea37b99 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <[email protected]>
Date: Thu, 18 Apr 2013 17:55:04 +0200
Subject: [PATCH] iommu/amd: Workaround for ERBT1312

Work around an IOMMU hardware bug where clearing the
EVT_INT or PPR_INT bit in the status register may race with
the hardware trying to set it again. When not handled the
bit might not be cleared and we lose all future event or ppr
interrupts.

Reported-by: Suravee Suthikulpanit <[email protected]>
Cc: [email protected]
Signed-off-by: Joerg Roedel <[email protected]>
---
drivers/iommu/amd_iommu.c | 34 ++++++++++++++++++++++++++--------
1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index f42793d..27792f8 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -700,14 +700,23 @@ retry:

static void iommu_poll_events(struct amd_iommu *iommu)
{
- u32 head, tail;
+ u32 head, tail, status;
unsigned long flags;

- /* enable event interrupts again */
- writel(MMIO_STATUS_EVT_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
spin_lock_irqsave(&iommu->lock, flags);

+ /* enable event interrupts again */
+ do {
+ /*
+ * Workaround for Erratum ERBT1312
+ * Clearing the EVT_INT bit may race in the hardware, so read
+ * it again and make sure it was really cleared
+ */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ writel(MMIO_STATUS_EVT_INT_MASK,
+ iommu->mmio_base + MMIO_STATUS_OFFSET);
+ } while (status & MMIO_STATUS_EVT_INT_MASK);
+
head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);

@@ -744,16 +753,25 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
static void iommu_poll_ppr_log(struct amd_iommu *iommu)
{
unsigned long flags;
- u32 head, tail;
+ u32 head, tail, status;

if (iommu->ppr_log == NULL)
return;

- /* enable ppr interrupts again */
- writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
spin_lock_irqsave(&iommu->lock, flags);

+ /* enable ppr interrupts again */
+ do {
+ /*
+ * Workaround for Erratum ERBT1312
+ * Clearing the PPR_INT bit may race in the hardware, so read
+ * it again and make sure it was really cleared
+ */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ writel(MMIO_STATUS_PPR_INT_MASK,
+ iommu->mmio_base + MMIO_STATUS_OFFSET);
+ } while (status & MMIO_STATUS_PPR_INT_MASK);
+
head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);

--
1.7.9.5

2013-04-18 17:00:14

by Suthikulpanit, Suravee

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

Joerg,

One last concern I have for this patch is the case when we re-enable the
interrupt, then another interrupt happens while we processing the log
and set the bit. If the interrupt thread doesn't check this right
before the thread exits the handler. We could still end up leaving the
interrupt disabled.

Suravee

On 4/18/2013 11:28 AM, Joerg Roedel wrote:
> On Thu, Apr 18, 2013 at 11:13:19AM -0500, Suravee Suthikulanit wrote:
>> This workaround is required for both event log and ppr log. Your
>> patch is only taking care of the event log.
> Right, thanks for the notice. Here is the updated patch.
>
> From cebe04596989c4b9001e2c1571c4fb219ea37b99 Mon Sep 17 00:00:00 2001
> From: Joerg Roedel <[email protected]>
> Date: Thu, 18 Apr 2013 17:55:04 +0200
> Subject: [PATCH] iommu/amd: Workaround for ERBT1312
>
> Work around an IOMMU hardware bug where clearing the
> EVT_INT or PPR_INT bit in the status register may race with
> the hardware trying to set it again. When not handled the
> bit might not be cleared and we lose all future event or ppr
> interrupts.
>
> Reported-by: Suravee Suthikulpanit <[email protected]>
> Cc: [email protected]
> Signed-off-by: Joerg Roedel <[email protected]>
> ---
> drivers/iommu/amd_iommu.c | 34 ++++++++++++++++++++++++++--------
> 1 file changed, 26 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index f42793d..27792f8 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -700,14 +700,23 @@ retry:
>
> static void iommu_poll_events(struct amd_iommu *iommu)
> {
> - u32 head, tail;
> + u32 head, tail, status;
> unsigned long flags;
>
> - /* enable event interrupts again */
> - writel(MMIO_STATUS_EVT_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
> -
> spin_lock_irqsave(&iommu->lock, flags);
>
> + /* enable event interrupts again */
> + do {
> + /*
> + * Workaround for Erratum ERBT1312
> + * Clearing the EVT_INT bit may race in the hardware, so read
> + * it again and make sure it was really cleared
> + */
> + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
> + writel(MMIO_STATUS_EVT_INT_MASK,
> + iommu->mmio_base + MMIO_STATUS_OFFSET);
> + } while (status & MMIO_STATUS_EVT_INT_MASK);
> +
> head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
> tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
>
> @@ -744,16 +753,25 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
> static void iommu_poll_ppr_log(struct amd_iommu *iommu)
> {
> unsigned long flags;
> - u32 head, tail;
> + u32 head, tail, status;
>
> if (iommu->ppr_log == NULL)
> return;
>
> - /* enable ppr interrupts again */
> - writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
> -
> spin_lock_irqsave(&iommu->lock, flags);
>
> + /* enable ppr interrupts again */
> + do {
> + /*
> + * Workaround for Erratum ERBT1312
> + * Clearing the PPR_INT bit may race in the hardware, so read
> + * it again and make sure it was really cleared
> + */
> + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
> + writel(MMIO_STATUS_PPR_INT_MASK,
> + iommu->mmio_base + MMIO_STATUS_OFFSET);
> + } while (status & MMIO_STATUS_PPR_INT_MASK);
> +
> head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
> tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
>

2013-04-18 18:35:41

by Joerg Roedel

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On Thu, Apr 18, 2013 at 11:59:58AM -0500, Suthikulpanit, Suravee wrote:
> One last concern I have for this patch is the case when we re-enable
> the interrupt, then another interrupt happens while we processing
> the log and set the bit. If the interrupt thread doesn't check this
> right before the thread exits the handler. We could still end up
> leaving the interrupt disabled.

That can't happen, the patch checks whether the bit is really 0 and then
it processes the event/ppr-log entries. If any new entry is queued while
we process the logs another interrupt will be fired and the irq-thread
will run again. So we will not miss any log entry.

Joerg

2013-04-18 18:58:39

by Suthikulpanit, Suravee

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On 4/18/2013 1:35 PM, Joerg Roedel wrote:
> On Thu, Apr 18, 2013 at 11:59:58AM -0500, Suthikulpanit, Suravee wrote:
>> One last concern I have for this patch is the case when we re-enable
>> the interrupt, then another interrupt happens while we processing
>> the log and set the bit. If the interrupt thread doesn't check this
>> right before the thread exits the handler. We could still end up
>> leaving the interrupt disabled.
> That can't happen, the patch checks whether the bit is really 0 and then
> it processes the event/ppr-log entries. If any new entry is queued while
> we process the logs another interrupt will be fired and the irq-thread
> will run again. So we will not miss any log entry.
According to the "kernel/irq/handle.c:irq_wake_thread()", I thought that
for the threaded IRQ, if the system getting a new interrupt from the
device while the thread is running, it will just return and do nothing.

Suravee

> Joerg
>
>
>

2013-04-18 20:06:17

by Joerg Roedel

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On Thu, Apr 18, 2013 at 01:56:42PM -0500, Suthikulpanit, Suravee wrote:
> On 4/18/2013 1:35 PM, Joerg Roedel wrote:

> According to the "kernel/irq/handle.c:irq_wake_thread()", I thought
> that for the threaded IRQ, if the system getting a new interrupt
> from the device while the thread is running, it will just return and
> do nothing.

Yes, but the irq-thread function itself executes the handler function
repeatedly until the IRQTF_RUNTHREAD bit is cleared. And every new
interrupt will set this bit again. So when there is a new interrupt
while our handler function runs the handler will be called again by the
irq-thread.

Joerg

2013-04-22 21:44:03

by Suthikulpanit, Suravee

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On 4/18/2013 3:06 PM, Joerg Roedel wrote:
> Yes, but the irq-thread function itself executes the handler function
> repeatedly until the IRQTF_RUNTHREAD bit is cleared. And every new
> interrupt will set this bit again. So when there is a new interrupt
> while our handler function runs the handler will be called again by the
> irq-thread.
>
>
> Joerg
Thank you for the clarification. I have submitted a new patch based on
the new workaround code.
(http://lists.linuxfoundation.org/pipermail/iommu/2013-April/005574.html)

Suravee

2013-04-23 13:22:53

by Donald Dutile

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On 04/18/2013 12:28 PM, Joerg Roedel wrote:
> On Thu, Apr 18, 2013 at 11:13:19AM -0500, Suravee Suthikulanit wrote:
>> This workaround is required for both event log and ppr log. Your
>> patch is only taking care of the event log.
>
> Right, thanks for the notice. Here is the updated patch.
>
> From cebe04596989c4b9001e2c1571c4fb219ea37b99 Mon Sep 17 00:00:00 2001
> From: Joerg Roedel<[email protected]>
> Date: Thu, 18 Apr 2013 17:55:04 +0200
> Subject: [PATCH] iommu/amd: Workaround for ERBT1312
>
> Work around an IOMMU hardware bug where clearing the
> EVT_INT or PPR_INT bit in the status register may race with
> the hardware trying to set it again. When not handled the
> bit might not be cleared and we lose all future event or ppr
> interrupts.
>
> Reported-by: Suravee Suthikulpanit<[email protected]>
> Cc: [email protected]
> Signed-off-by: Joerg Roedel<[email protected]>
> ---
> drivers/iommu/amd_iommu.c | 34 ++++++++++++++++++++++++++--------
> 1 file changed, 26 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index f42793d..27792f8 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -700,14 +700,23 @@ retry:
>
> static void iommu_poll_events(struct amd_iommu *iommu)
> {
> - u32 head, tail;
> + u32 head, tail, status;
> unsigned long flags;
>
> - /* enable event interrupts again */
> - writel(MMIO_STATUS_EVT_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
> -
> spin_lock_irqsave(&iommu->lock, flags);
>
> + /* enable event interrupts again */
> + do {
> + /*
> + * Workaround for Erratum ERBT1312
> + * Clearing the EVT_INT bit may race in the hardware, so read
> + * it again and make sure it was really cleared
> + */
> + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
> + writel(MMIO_STATUS_EVT_INT_MASK,
> + iommu->mmio_base + MMIO_STATUS_OFFSET);
> + } while (status& MMIO_STATUS_EVT_INT_MASK);
> +
> head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
> tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
>
> @@ -744,16 +753,25 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
> static void iommu_poll_ppr_log(struct amd_iommu *iommu)
> {
> unsigned long flags;
> - u32 head, tail;
> + u32 head, tail, status;
>
> if (iommu->ppr_log == NULL)
> return;
>
> - /* enable ppr interrupts again */
> - writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET);
> -
> spin_lock_irqsave(&iommu->lock, flags);
>
> + /* enable ppr interrupts again */
> + do {
> + /*
> + * Workaround for Erratum ERBT1312
> + * Clearing the PPR_INT bit may race in the hardware, so read
> + * it again and make sure it was really cleared
> + */
> + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
> + writel(MMIO_STATUS_PPR_INT_MASK,
> + iommu->mmio_base + MMIO_STATUS_OFFSET);
> + } while (status& MMIO_STATUS_PPR_INT_MASK);
> +
> head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET);
> tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET);
>
Given other threads on this mail list (and I've seen crashes with same problem)
where this type of logging during a flood of IOMMU errors will lock up the machine,
is there something that can be done to break the do-while loop after n iterations
have been exec'd, so the kernel can progress during a crash ?

2013-04-24 10:46:20

by Joerg Roedel

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On Tue, Apr 23, 2013 at 09:22:45AM -0400, Don Dutile wrote:
> Given other threads on this mail list (and I've seen crashes with same problem)
> where this type of logging during a flood of IOMMU errors will lock up the machine,
> is there something that can be done to break the do-while loop after n iterations
> have been exec'd, so the kernel can progress during a crash ?

In the case of an IOMMU error flood this loop will only run until the
event-log/ppr-log overflows. So it should not turn into an endless loop.

Joerg

2013-04-24 13:52:47

by Donald Dutile

[permalink] [raw]

Subject: Re: [PATCH 1/2 V2] iommu/amd: Add workaround for ERBT1312

On 04/24/2013 06:46 AM, Joerg Roedel wrote:
> On Tue, Apr 23, 2013 at 09:22:45AM -0400, Don Dutile wrote:
>> Given other threads on this mail list (and I've seen crashes with same problem)
>> where this type of logging during a flood of IOMMU errors will lock up the machine,
>> is there something that can be done to break the do-while loop after n iterations
>> have been exec'd, so the kernel can progress during a crash ?
>
> In the case of an IOMMU error flood this loop will only run until the
> event-log/ppr-log overflows. So it should not turn into an endless loop.
>
>
> Joerg
>
>
Thanks for verification.