2023-05-30 18:01:45

by Lino Sanfilippo

[permalink] [raw]
Subject: [PATCH v2] tpm,tpm_tis: Handle interrupt storm

From: Lino Sanfilippo <[email protected]>

After activation of interrupts for TPM TIS drivers 0-day reports an
interrupt storm on an Inspur NF5180M6/NF5180M6 server.

Fix this by detecting the storm and falling back to polling:
Count the number of unhandled interrupts within a 10 ms time interval. In
case that more than 1000 were unhandled deactivate interrupts entirely,
deregister the handler and use polling instead.

The storm detection logic equals the implementation in note_interrupt()
which uses timestamps and counters stored in struct irq_desc. Since this
structure is private to the generic interrupt core the TPM TIS core uses
its own timestamps and counters. Furthermore the TPM interrupt handler
always returns IRQ_HANDLED to prevent the generic interrupt core from
processing the interrupt storm.

Since the interrupt deregistration function devm_free_irq() waits for all
interrupt handlers to finish, only trigger a worker in the interrupt
handler and do the unregistration in the worker to avoid a deadlock.

Reported-by: kernel test robot <[email protected]>
Closes: https://lore.kernel.org/oe-lkp/[email protected]/
Suggested-by: Lukas Wunner <[email protected]>
Signed-off-by: Lino Sanfilippo <[email protected]>
---
drivers/char/tpm/tpm_tis_core.c | 93 ++++++++++++++++++++++++++++-----
drivers/char/tpm/tpm_tis_core.h | 4 ++
2 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index 558144fa707a..7ae8228e803f 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -468,25 +468,32 @@ static int tpm_tis_send_data(struct tpm_chip *chip, const u8 *buf, size_t len)
return rc;
}

+static void __tpm_tis_disable_interrupts(struct tpm_chip *chip)
+{
+ struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
+ u32 intmask = 0;
+
+ tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
+ intmask &= ~TPM_GLOBAL_INT_ENABLE;
+
+ tpm_tis_request_locality(chip, 0);
+ tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
+ tpm_tis_relinquish_locality(chip, 0);
+
+ chip->flags &= ~TPM_CHIP_FLAG_IRQ;
+}
+
static void disable_interrupts(struct tpm_chip *chip)
{
struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
- u32 intmask;
- int rc;

if (priv->irq == 0)
return;

- rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
- if (rc < 0)
- intmask = 0;
-
- intmask &= ~TPM_GLOBAL_INT_ENABLE;
- rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
+ __tpm_tis_disable_interrupts(chip);

devm_free_irq(chip->dev.parent, priv->irq, chip);
priv->irq = 0;
- chip->flags &= ~TPM_CHIP_FLAG_IRQ;
}

/*
@@ -752,6 +759,53 @@ static bool tpm_tis_req_canceled(struct tpm_chip *chip, u8 status)
return status == TPM_STS_COMMAND_READY;
}

+static void tpm_tis_reenable_polling(struct tpm_chip *chip)
+{
+ struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
+
+ dev_warn(&chip->dev, FW_BUG
+ "TPM interrupt storm detected, polling instead\n");
+
+ __tpm_tis_disable_interrupts(chip);
+
+ /*
+ * devm_free_irq() must not be called from within the interrupt handler,
+ * since this function waits for running handlers to finish and thus it
+ * would deadlock. Instead trigger a worker that takes care of the
+ * unregistration.
+ */
+ schedule_work(&priv->free_irq_work);
+}
+
+static irqreturn_t tpm_tis_check_for_interrupt_storm(struct tpm_chip *chip)
+{
+ struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
+ const unsigned int MAX_UNHANDLED_IRQS = 1000;
+
+ /*
+ * The worker to free the TPM interrupt (free_irq_work) may already
+ * be scheduled, so make sure it is not scheduled again.
+ */
+ if (!(chip->flags & TPM_CHIP_FLAG_IRQ))
+ return IRQ_HANDLED;
+
+ if (time_after(jiffies, priv->last_unhandled_irq + HZ/10))
+ priv->unhandled_irqs = 1;
+ else
+ priv->unhandled_irqs++;
+
+ priv->last_unhandled_irq = jiffies;
+
+ if (priv->unhandled_irqs > MAX_UNHANDLED_IRQS)
+ tpm_tis_reenable_polling(chip);
+
+ /*
+ * Prevent the genirq code from starting its own interrupt storm
+ * handling by always reporting that the interrupt was handled.
+ */
+ return IRQ_HANDLED;
+}
+
static irqreturn_t tis_int_handler(int dummy, void *dev_id)
{
struct tpm_chip *chip = dev_id;
@@ -761,10 +815,10 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id)

rc = tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt);
if (rc < 0)
- return IRQ_NONE;
+ goto unhandled;

if (interrupt == 0)
- return IRQ_NONE;
+ goto unhandled;

set_bit(TPM_TIS_IRQ_TESTED, &priv->flags);
if (interrupt & TPM_INTF_DATA_AVAIL_INT)
@@ -780,10 +834,13 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id)
rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), interrupt);
tpm_tis_relinquish_locality(chip, 0);
if (rc < 0)
- return IRQ_NONE;
+ goto unhandled;

tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt);
return IRQ_HANDLED;
+
+unhandled:
+ return tpm_tis_check_for_interrupt_storm(chip);
}

static void tpm_tis_gen_interrupt(struct tpm_chip *chip)
@@ -804,6 +861,15 @@ static void tpm_tis_gen_interrupt(struct tpm_chip *chip)
chip->flags &= ~TPM_CHIP_FLAG_IRQ;
}

+static void tpm_tis_free_irq_func(struct work_struct *work)
+{
+ struct tpm_tis_data *priv = container_of(work, typeof(*priv), free_irq_work);
+ struct tpm_chip *chip = priv->chip;
+
+ devm_free_irq(chip->dev.parent, priv->irq, chip);
+ priv->irq = 0;
+}
+
/* Register the IRQ and issue a command that will cause an interrupt. If an
* irq is seen then leave the chip setup for IRQ operation, otherwise reverse
* everything and leave in polling mode. Returns 0 on success.
@@ -816,6 +882,7 @@ static int tpm_tis_probe_irq_single(struct tpm_chip *chip, u32 intmask,
int rc;
u32 int_status;

+ INIT_WORK(&priv->free_irq_work, tpm_tis_free_irq_func);

rc = devm_request_threaded_irq(chip->dev.parent, irq, NULL,
tis_int_handler, IRQF_ONESHOT | flags,
@@ -918,6 +985,7 @@ void tpm_tis_remove(struct tpm_chip *chip)
interrupt = 0;

tpm_tis_write32(priv, reg, ~TPM_GLOBAL_INT_ENABLE & interrupt);
+ flush_work(&priv->free_irq_work);

tpm_tis_clkrun_enable(chip, false);

@@ -1021,6 +1089,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
chip->timeout_b = msecs_to_jiffies(TIS_TIMEOUT_B_MAX);
chip->timeout_c = msecs_to_jiffies(TIS_TIMEOUT_C_MAX);
chip->timeout_d = msecs_to_jiffies(TIS_TIMEOUT_D_MAX);
+ priv->chip = chip;
priv->timeout_min = TPM_TIMEOUT_USECS_MIN;
priv->timeout_max = TPM_TIMEOUT_USECS_MAX;
priv->phy_ops = phy_ops;
diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h
index e978f457fd4d..b1fa42367052 100644
--- a/drivers/char/tpm/tpm_tis_core.h
+++ b/drivers/char/tpm/tpm_tis_core.h
@@ -91,11 +91,15 @@ enum tpm_tis_flags {
};

struct tpm_tis_data {
+ struct tpm_chip *chip;
u16 manufacturer_id;
struct mutex locality_count_mutex;
unsigned int locality_count;
int locality;
int irq;
+ struct work_struct free_irq_work;
+ unsigned long last_unhandled_irq;
+ unsigned int unhandled_irqs;
unsigned int int_mask;
unsigned long flags;
void __iomem *ilb_base_addr;

base-commit: 7877cb91f1081754a1487c144d85dc0d2e2e7fc4
--
2.40.1


2023-06-09 14:55:46

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v2] tpm,tpm_tis: Handle interrupt storm

Short summary: tpm_tis: Disable interrupts after 1000 unhandled IRQs

I.e. the exact thing the commit changes. Handle means absolutely
nothing.

BR, Jarkko

2023-06-09 14:57:05

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v2] tpm,tpm_tis: Handle interrupt storm

On Tue May 30, 2023 at 8:47 PM EEST, Lino Sanfilippo wrote:
> From: Lino Sanfilippo <[email protected]>
>
> After activation of interrupts for TPM TIS drivers 0-day reports an
> interrupt storm on an Inspur NF5180M6/NF5180M6 server.
>
> Fix this by detecting the storm and falling back to polling:
> Count the number of unhandled interrupts within a 10 ms time interval. In
> case that more than 1000 were unhandled deactivate interrupts entirely,
> deregister the handler and use polling instead.
>
> The storm detection logic equals the implementation in note_interrupt()
> which uses timestamps and counters stored in struct irq_desc. Since this
> structure is private to the generic interrupt core the TPM TIS core uses
> its own timestamps and counters. Furthermore the TPM interrupt handler
> always returns IRQ_HANDLED to prevent the generic interrupt core from
> processing the interrupt storm.
>
> Since the interrupt deregistration function devm_free_irq() waits for all
> interrupt handlers to finish, only trigger a worker in the interrupt
> handler and do the unregistration in the worker to avoid a deadlock.
>
> Reported-by: kernel test robot <[email protected]>
> Closes: https://lore.kernel.org/oe-lkp/[email protected]/
> Suggested-by: Lukas Wunner <[email protected]>
> Signed-off-by: Lino Sanfilippo <[email protected]>
> ---

Sorry for the latency. I've moved home office to a new location,
which has caused ~2 week lag. Unfortunate timing.

> drivers/char/tpm/tpm_tis_core.c | 93 ++++++++++++++++++++++++++++-----
> drivers/char/tpm/tpm_tis_core.h | 4 ++
> 2 files changed, 85 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
> index 558144fa707a..7ae8228e803f 100644
> --- a/drivers/char/tpm/tpm_tis_core.c
> +++ b/drivers/char/tpm/tpm_tis_core.c
> @@ -468,25 +468,32 @@ static int tpm_tis_send_data(struct tpm_chip *chip, const u8 *buf, size_t len)
> return rc;
> }
>
> +static void __tpm_tis_disable_interrupts(struct tpm_chip *chip)
> +{
> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> + u32 intmask = 0;
> +
> + tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
> + intmask &= ~TPM_GLOBAL_INT_ENABLE;
> +
> + tpm_tis_request_locality(chip, 0);
> + tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
> + tpm_tis_relinquish_locality(chip, 0);
> +
> + chip->flags &= ~TPM_CHIP_FLAG_IRQ;
> +}
> +
> static void disable_interrupts(struct tpm_chip *chip)

Add tpm_ prefix here too. It makes tracing/grepping/etc so much nicer.

> {
> struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> - u32 intmask;

int_mask is more readable

> - int rc;
>
> if (priv->irq == 0)
> return;
>
> - rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
> - if (rc < 0)
> - intmask = 0;
> -
> - intmask &= ~TPM_GLOBAL_INT_ENABLE;
> - rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
> + __tpm_tis_disable_interrupts(chip);
>
> devm_free_irq(chip->dev.parent, priv->irq, chip);
> priv->irq = 0;
> - chip->flags &= ~TPM_CHIP_FLAG_IRQ;
> }
>
> /*
> @@ -752,6 +759,53 @@ static bool tpm_tis_req_canceled(struct tpm_chip *chip, u8 status)
> return status == TPM_STS_COMMAND_READY;
> }
>
> +static void tpm_tis_reenable_polling(struct tpm_chip *chip)
> +{
> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> +
> + dev_warn(&chip->dev, FW_BUG
> + "TPM interrupt storm detected, polling instead\n");
> +
> + __tpm_tis_disable_interrupts(chip);
> +
> + /*
> + * devm_free_irq() must not be called from within the interrupt handler,
> + * since this function waits for running handlers to finish and thus it
> + * would deadlock. Instead trigger a worker that takes care of the
> + * unregistration.
> + */
> + schedule_work(&priv->free_irq_work);
> +}
> +
> +static irqreturn_t tpm_tis_check_for_interrupt_storm(struct tpm_chip *chip)
> +{
> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> + const unsigned int MAX_UNHANDLED_IRQS = 1000;

Please declare this in the beginning of file because it is non-empirical
tuning parameter. I do not want it to be buried here. It is now as good
as a magic number.

Or perhaps even tpm_tis_core.h?

Why MAX_UNHANDLED_IRQS is exactly 1000 and not 1? I would rollback eagerly.

> +
> + /*
> + * The worker to free the TPM interrupt (free_irq_work) may already
> + * be scheduled, so make sure it is not scheduled again.
> + */
> + if (!(chip->flags & TPM_CHIP_FLAG_IRQ))
> + return IRQ_HANDLED;
> +
> + if (time_after(jiffies, priv->last_unhandled_irq + HZ/10))
> + priv->unhandled_irqs = 1;
> + else
> + priv->unhandled_irqs++;
> +
> + priv->last_unhandled_irq = jiffies;
> +
> + if (priv->unhandled_irqs > MAX_UNHANDLED_IRQS)
> + tpm_tis_reenable_polling(chip);
> +
> + /*
> + * Prevent the genirq code from starting its own interrupt storm
> + * handling by always reporting that the interrupt was handled.
> + */
> + return IRQ_HANDLED;
> +}
> +
> static irqreturn_t tis_int_handler(int dummy, void *dev_id)
> {
> struct tpm_chip *chip = dev_id;
> @@ -761,10 +815,10 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id)
>
> rc = tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt);
> if (rc < 0)
> - return IRQ_NONE;
> + goto unhandled;
>
> if (interrupt == 0)
> - return IRQ_NONE;
> + goto unhandled;
>
> set_bit(TPM_TIS_IRQ_TESTED, &priv->flags);
> if (interrupt & TPM_INTF_DATA_AVAIL_INT)
> @@ -780,10 +834,13 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id)
> rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), interrupt);
> tpm_tis_relinquish_locality(chip, 0);
> if (rc < 0)
> - return IRQ_NONE;
> + goto unhandled;
>
> tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt);
> return IRQ_HANDLED;
> +
> +unhandled:
> + return tpm_tis_check_for_interrupt_storm(chip);
> }
>
> static void tpm_tis_gen_interrupt(struct tpm_chip *chip)
> @@ -804,6 +861,15 @@ static void tpm_tis_gen_interrupt(struct tpm_chip *chip)
> chip->flags &= ~TPM_CHIP_FLAG_IRQ;
> }
>
> +static void tpm_tis_free_irq_func(struct work_struct *work)
> +{
> + struct tpm_tis_data *priv = container_of(work, typeof(*priv), free_irq_work);
> + struct tpm_chip *chip = priv->chip;
> +
> + devm_free_irq(chip->dev.parent, priv->irq, chip);
> + priv->irq = 0;
> +}
> +
> /* Register the IRQ and issue a command that will cause an interrupt. If an
> * irq is seen then leave the chip setup for IRQ operation, otherwise reverse
> * everything and leave in polling mode. Returns 0 on success.
> @@ -816,6 +882,7 @@ static int tpm_tis_probe_irq_single(struct tpm_chip *chip, u32 intmask,
> int rc;
> u32 int_status;
>
> + INIT_WORK(&priv->free_irq_work, tpm_tis_free_irq_func);
>
> rc = devm_request_threaded_irq(chip->dev.parent, irq, NULL,
> tis_int_handler, IRQF_ONESHOT | flags,
> @@ -918,6 +985,7 @@ void tpm_tis_remove(struct tpm_chip *chip)
> interrupt = 0;
>
> tpm_tis_write32(priv, reg, ~TPM_GLOBAL_INT_ENABLE & interrupt);
> + flush_work(&priv->free_irq_work);
>
> tpm_tis_clkrun_enable(chip, false);
>
> @@ -1021,6 +1089,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq,
> chip->timeout_b = msecs_to_jiffies(TIS_TIMEOUT_B_MAX);
> chip->timeout_c = msecs_to_jiffies(TIS_TIMEOUT_C_MAX);
> chip->timeout_d = msecs_to_jiffies(TIS_TIMEOUT_D_MAX);
> + priv->chip = chip;
> priv->timeout_min = TPM_TIMEOUT_USECS_MIN;
> priv->timeout_max = TPM_TIMEOUT_USECS_MAX;
> priv->phy_ops = phy_ops;
> diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h
> index e978f457fd4d..b1fa42367052 100644
> --- a/drivers/char/tpm/tpm_tis_core.h
> +++ b/drivers/char/tpm/tpm_tis_core.h
> @@ -91,11 +91,15 @@ enum tpm_tis_flags {
> };
>
> struct tpm_tis_data {
> + struct tpm_chip *chip;
> u16 manufacturer_id;
> struct mutex locality_count_mutex;
> unsigned int locality_count;
> int locality;
> int irq;
> + struct work_struct free_irq_work;
> + unsigned long last_unhandled_irq;
> + unsigned int unhandled_irqs;
> unsigned int int_mask;
> unsigned long flags;
> void __iomem *ilb_base_addr;
>
> base-commit: 7877cb91f1081754a1487c144d85dc0d2e2e7fc4
> --
> 2.40.1


BR, Jarkko

2023-06-09 16:16:39

by Lino Sanfilippo

[permalink] [raw]
Subject: Re: [PATCH v2] tpm,tpm_tis: Handle interrupt storm


Hi,

On 09.06.23 16:33, Jarkko Sakkinen wrote:

>
> On Tue May 30, 2023 at 8:47 PM EEST, Lino Sanfilippo wrote:
>> From: Lino Sanfilippo <[email protected]>
>>
>> After activation of interrupts for TPM TIS drivers 0-day reports an
>> interrupt storm on an Inspur NF5180M6/NF5180M6 server.
>>
>> Fix this by detecting the storm and falling back to polling:
>> Count the number of unhandled interrupts within a 10 ms time interval. In
>> case that more than 1000 were unhandled deactivate interrupts entirely,
>> deregister the handler and use polling instead.
>>
>> The storm detection logic equals the implementation in note_interrupt()
>> which uses timestamps and counters stored in struct irq_desc. Since this
>> structure is private to the generic interrupt core the TPM TIS core uses
>> its own timestamps and counters. Furthermore the TPM interrupt handler
>> always returns IRQ_HANDLED to prevent the generic interrupt core from
>> processing the interrupt storm.
>>
>> Since the interrupt deregistration function devm_free_irq() waits for all
>> interrupt handlers to finish, only trigger a worker in the interrupt
>> handler and do the unregistration in the worker to avoid a deadlock.
>>
>> Reported-by: kernel test robot <[email protected]>
>> Closes: https://lore.kernel.org/oe-lkp/[email protected]/
>> Suggested-by: Lukas Wunner <[email protected]>
>> Signed-off-by: Lino Sanfilippo <[email protected]>
>> ---
>
> Sorry for the latency. I've moved home office to a new location,
> which has caused ~2 week lag. Unfortunate timing.
>


No prob :)


>> drivers/char/tpm/tpm_tis_core.c | 93 ++++++++++++++++++++++++++++-----
>> drivers/char/tpm/tpm_tis_core.h | 4 ++
>> 2 files changed, 85 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
>> index 558144fa707a..7ae8228e803f 100644
>> --- a/drivers/char/tpm/tpm_tis_core.c
>> +++ b/drivers/char/tpm/tpm_tis_core.c
>> @@ -468,25 +468,32 @@ static int tpm_tis_send_data(struct tpm_chip *chip, const u8 *buf, size_t len)
>> return rc;
>> }
>>
>> +static void __tpm_tis_disable_interrupts(struct tpm_chip *chip)
>> +{
>> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
>> + u32 intmask = 0;
>> +
>> + tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
>> + intmask &= ~TPM_GLOBAL_INT_ENABLE;
>> +
>> + tpm_tis_request_locality(chip, 0);
>> + tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
>> + tpm_tis_relinquish_locality(chip, 0);
>> +
>> + chip->flags &= ~TPM_CHIP_FLAG_IRQ;
>> +}
>> +
>> static void disable_interrupts(struct tpm_chip *chip)
>
> Add tpm_ prefix here too. It makes tracing/grepping/etc so much nicer.

Ok.

>
>> {
>> struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
>> - u32 intmask;
>
> int_mask is more readable

Ok.

>
>> - int rc;
>>
>> if (priv->irq == 0)
>> return;
>>
>> - rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
>> - if (rc < 0)
>> - intmask = 0;
>> -
>> - intmask &= ~TPM_GLOBAL_INT_ENABLE;
>> - rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
>> + __tpm_tis_disable_interrupts(chip);
>>
>> devm_free_irq(chip->dev.parent, priv->irq, chip);
>> priv->irq = 0;
>> - chip->flags &= ~TPM_CHIP_FLAG_IRQ;
>> }
>>
>> /*
>> @@ -752,6 +759,53 @@ static bool tpm_tis_req_canceled(struct tpm_chip *chip, u8 status)
>> return status == TPM_STS_COMMAND_READY;
>> }
>>
>> +static void tpm_tis_reenable_polling(struct tpm_chip *chip)
>> +{
>> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
>> +
>> + dev_warn(&chip->dev, FW_BUG
>> + "TPM interrupt storm detected, polling instead\n");
>> +
>> + __tpm_tis_disable_interrupts(chip);
>> +
>> + /*
>> + * devm_free_irq() must not be called from within the interrupt handler,
>> + * since this function waits for running handlers to finish and thus it
>> + * would deadlock. Instead trigger a worker that takes care of the
>> + * unregistration.
>> + */
>> + schedule_work(&priv->free_irq_work);
>> +}
>> +
>> +static irqreturn_t tpm_tis_check_for_interrupt_storm(struct tpm_chip *chip)
>> +{
>> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
>> + const unsigned int MAX_UNHANDLED_IRQS = 1000;
>
> Please declare this in the beginning of file because it is non-empirical
> tuning parameter. I do not want it to be buried here. It is now as good
> as a magic number.
>
> Or perhaps even tpm_tis_core.h?
>

For now that constant is only used in tpm_tis_core.c. So I would favor to define it there.

> Why MAX_UNHANDLED_IRQS is exactly 1000 and not 1? I would rollback eagerly.


Because the IRQ line may be shared with another device which has raised the
interrupt instead of the TPM. So unhandled interrupts may be legit.

Regards,
Lino





2023-06-09 16:40:39

by Jarkko Sakkinen

[permalink] [raw]
Subject: Re: [PATCH v2] tpm,tpm_tis: Handle interrupt storm

On Fri Jun 9, 2023 at 7:03 PM EEST, Lino Sanfilippo wrote:
>
> Hi,
>
> On 09.06.23 16:33, Jarkko Sakkinen wrote:
>
> >
> > On Tue May 30, 2023 at 8:47 PM EEST, Lino Sanfilippo wrote:
> >> From: Lino Sanfilippo <[email protected]>
> >>
> >> After activation of interrupts for TPM TIS drivers 0-day reports an
> >> interrupt storm on an Inspur NF5180M6/NF5180M6 server.
> >>
> >> Fix this by detecting the storm and falling back to polling:
> >> Count the number of unhandled interrupts within a 10 ms time interval. In
> >> case that more than 1000 were unhandled deactivate interrupts entirely,
> >> deregister the handler and use polling instead.
> >>
> >> The storm detection logic equals the implementation in note_interrupt()
> >> which uses timestamps and counters stored in struct irq_desc. Since this
> >> structure is private to the generic interrupt core the TPM TIS core uses
> >> its own timestamps and counters. Furthermore the TPM interrupt handler
> >> always returns IRQ_HANDLED to prevent the generic interrupt core from
> >> processing the interrupt storm.
> >>
> >> Since the interrupt deregistration function devm_free_irq() waits for all
> >> interrupt handlers to finish, only trigger a worker in the interrupt
> >> handler and do the unregistration in the worker to avoid a deadlock.
> >>
> >> Reported-by: kernel test robot <[email protected]>
> >> Closes: https://lore.kernel.org/oe-lkp/[email protected]/
> >> Suggested-by: Lukas Wunner <[email protected]>
> >> Signed-off-by: Lino Sanfilippo <[email protected]>
> >> ---
> >
> > Sorry for the latency. I've moved home office to a new location,
> > which has caused ~2 week lag. Unfortunate timing.
> >
>
>
> No prob :)
>
>
> >> drivers/char/tpm/tpm_tis_core.c | 93 ++++++++++++++++++++++++++++-----
> >> drivers/char/tpm/tpm_tis_core.h | 4 ++
> >> 2 files changed, 85 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
> >> index 558144fa707a..7ae8228e803f 100644
> >> --- a/drivers/char/tpm/tpm_tis_core.c
> >> +++ b/drivers/char/tpm/tpm_tis_core.c
> >> @@ -468,25 +468,32 @@ static int tpm_tis_send_data(struct tpm_chip *chip, const u8 *buf, size_t len)
> >> return rc;
> >> }
> >>
> >> +static void __tpm_tis_disable_interrupts(struct tpm_chip *chip)
> >> +{
> >> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> >> + u32 intmask = 0;
> >> +
> >> + tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
> >> + intmask &= ~TPM_GLOBAL_INT_ENABLE;
> >> +
> >> + tpm_tis_request_locality(chip, 0);
> >> + tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
> >> + tpm_tis_relinquish_locality(chip, 0);
> >> +
> >> + chip->flags &= ~TPM_CHIP_FLAG_IRQ;
> >> +}
> >> +
> >> static void disable_interrupts(struct tpm_chip *chip)
> >
> > Add tpm_ prefix here too. It makes tracing/grepping/etc so much nicer.
>
> Ok.
>
> >
> >> {
> >> struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> >> - u32 intmask;
> >
> > int_mask is more readable
>
> Ok.
>
> >
> >> - int rc;
> >>
> >> if (priv->irq == 0)
> >> return;
> >>
> >> - rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask);
> >> - if (rc < 0)
> >> - intmask = 0;
> >> -
> >> - intmask &= ~TPM_GLOBAL_INT_ENABLE;
> >> - rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask);
> >> + __tpm_tis_disable_interrupts(chip);
> >>
> >> devm_free_irq(chip->dev.parent, priv->irq, chip);
> >> priv->irq = 0;
> >> - chip->flags &= ~TPM_CHIP_FLAG_IRQ;
> >> }
> >>
> >> /*
> >> @@ -752,6 +759,53 @@ static bool tpm_tis_req_canceled(struct tpm_chip *chip, u8 status)
> >> return status == TPM_STS_COMMAND_READY;
> >> }
> >>
> >> +static void tpm_tis_reenable_polling(struct tpm_chip *chip)
> >> +{
> >> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> >> +
> >> + dev_warn(&chip->dev, FW_BUG
> >> + "TPM interrupt storm detected, polling instead\n");
> >> +
> >> + __tpm_tis_disable_interrupts(chip);
> >> +
> >> + /*
> >> + * devm_free_irq() must not be called from within the interrupt handler,
> >> + * since this function waits for running handlers to finish and thus it
> >> + * would deadlock. Instead trigger a worker that takes care of the
> >> + * unregistration.
> >> + */
> >> + schedule_work(&priv->free_irq_work);
> >> +}
> >> +
> >> +static irqreturn_t tpm_tis_check_for_interrupt_storm(struct tpm_chip *chip)
> >> +{
> >> + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> >> + const unsigned int MAX_UNHANDLED_IRQS = 1000;
> >
> > Please declare this in the beginning of file because it is non-empirical
> > tuning parameter. I do not want it to be buried here. It is now as good
> > as a magic number.
> >
> > Or perhaps even tpm_tis_core.h?
> >
>
> For now that constant is only used in tpm_tis_core.c. So I would favor to define it there.
>
> > Why MAX_UNHANDLED_IRQS is exactly 1000 and not 1? I would rollback eagerly.
>
>
> Because the IRQ line may be shared with another device which has raised the
> interrupt instead of the TPM. So unhandled interrupts may be legit.

I understand that being exact here is impossible. So let's stick to this
but please move the constant to the tpm_tis_core.c with the TPM_ prefix
because it is an essential tuning parameter.

BR, Jarkko