2023-06-06 04:08:19

by Grant Grundler

[permalink] [raw]
Subject: [PATCH 1/2] PCI/AER: correctable error message as KERN_INFO

Since correctable errors have been corrected (and counted), the dmesg output
should not be reported as a warning, but rather as "informational".

Otherwise, when a certain well-known vendor's PCIe parts are used in a USB4
docking station, the dmesg buffer can be spammed with correctable errors:
717 bytes per instance, potentially many MB per day.

Given the "WARN" priority, these messages have already confused the typical
user who stumbles across them, support staff (triaging feedback reports),
and more than a few Linux kernel devs. Changing to INFO will hide these
messages from most audiences.

Signed-off-by: Grant Grundler <[email protected]>
---
drivers/pci/pcie/aer.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index f6c24ded134c..d7bfc6070ddb 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -692,7 +692,7 @@ static void __aer_print_error(struct pci_dev *dev,

if (info->severity == AER_CORRECTABLE) {
strings = aer_correctable_error_string;
- level = KERN_WARNING;
+ level = KERN_INFO;
} else {
strings = aer_uncorrectable_error_string;
level = KERN_ERR;
@@ -724,7 +724,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);

- level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
+ level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;

pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
aer_error_severity_string[info->severity],
@@ -797,14 +797,22 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
info.mask = mask;
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);

- pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+ if (aer_severity == AER_CORRECTABLE)
+ pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+ else
+ pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+
__aer_print_error(dev, &info);
- pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
- aer_error_layer[layer], aer_agent_string[agent]);

- if (aer_severity != AER_CORRECTABLE)
+ if (aer_severity == AER_CORRECTABLE) {
+ pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
+ aer_error_layer[layer], aer_agent_string[agent]);
+ } else {
+ pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
+ aer_error_layer[layer], aer_agent_string[agent]);
pci_err(dev, "aer_uncor_severity: 0x%08x\n",
aer->uncor_severity);
+ }

if (tlp_header_valid)
__print_tlp_header(dev, &aer->header_log);
--
2.41.0.rc0.172.g3f132b7071-goog



2023-06-06 04:25:14

by Grant Grundler

[permalink] [raw]
Subject: [PATCH 2/2] PCI/AER: Rate limit the reporting of the correctable errors

From: Rajat Khandelwal <[email protected]>

There are many instances where correctable errors tend to inundate
the message buffer. We observe such instances during Thunderbolt PCIe
tunneling.

It's true that they are mitigated by the hardware and are non-fatal,
but we shouldn't be spamming the logs with such correctable errors, as it
confuses other kernel developers less familiar with PCI errors, support
staff, and users who happen to look at the logs. Hence, rate limit them.

A typical example log inside an HP TBT4 dock:
[54912.661142] pcieport 0000:00:07.0: AER: Multiple Corrected error received: 0000:2b:00.0
[54912.661194] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[54912.661203] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001100/00002000
[54912.661211] igc 0000:2b:00.0: [ 8] Rollover
[54912.661219] igc 0000:2b:00.0: [12] Timeout
[54982.838760] pcieport 0000:00:07.0: AER: Corrected error received: 0000:2b:00.0
[54982.838798] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID)
[54982.838808] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001000/00002000
[54982.838817] igc 0000:2b:00.0: [12] Timeout

This gets repeated continuously, thus inundating the buffer.

Signed-off-by: Rajat Khandelwal <[email protected]>
Signed-off-by: Grant Grundler <[email protected]>
---
drivers/pci/pcie/aer.c | 80 +++++++++++++++++++++++++++---------------
1 file changed, 51 insertions(+), 29 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d7bfc6070ddb..830f5a1261c9 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -686,26 +686,36 @@ static void __aer_print_error(struct pci_dev *dev,
struct aer_err_info *info)
{
const char **strings;
+ char aer_msg[512];
unsigned long status = info->status & ~info->mask;
- const char *level, *errmsg;
int i;

- if (info->severity == AER_CORRECTABLE) {
- strings = aer_correctable_error_string;
- level = KERN_INFO;
- } else {
- strings = aer_uncorrectable_error_string;
- level = KERN_ERR;
- }
+ memset(aer_msg, 0, sizeof(aer_msg));
+ snprintf(aer_msg, sizeof(aer_msg), "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+ info->status, info->mask);
+
+ strings = (info->severity == AER_CORRECTABLE) ?
+ aer_correctable_error_string : aer_uncorrectable_error_string;

for_each_set_bit(i, &status, 32) {
- errmsg = strings[i];
+ const char *errmsg = strings[i];
+ char bitmsg[64];
+ memset(bitmsg, 0, sizeof(bitmsg));
+
if (!errmsg)
errmsg = "Unknown Error Bit";

- pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
- info->first_error == i ? " (First)" : "");
+ snprintf(bitmsg, sizeof(bitmsg), " [%2d] %-22s%s\n", i, errmsg,
+ info->first_error == i ? " (First)" : "");
+
+ strlcat(aer_msg, bitmsg, sizeof(aer_msg));
}
+ /* Print the accumulated status line and per-bit lines with a single call. */
+ if (info->severity == AER_CORRECTABLE)
+ pci_info_ratelimited(dev, "%s", aer_msg);
+ else
+ pci_err(dev, "%s", aer_msg);
+
pci_dev_aer_stats_incr(dev, info);
}

@@ -713,7 +723,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
int layer, agent;
int id = ((dev->bus->number << 8) | dev->devfn);
- const char *level;

if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -724,14 +733,19 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);

- level = (info->severity == AER_CORRECTABLE) ? KERN_INFO : KERN_ERR;
-
- pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
- aer_error_severity_string[info->severity],
- aer_error_layer[layer], aer_agent_string[agent]);
-
- pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
- dev->vendor, dev->device, info->status, info->mask);
+ if (info->severity == AER_CORRECTABLE) {
+ pci_info_ratelimited(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n"
+ " device [%04x:%04x] error status/mask=%08x/%08x\n",
+ aer_error_severity_string[info->severity],
+ aer_error_layer[layer], aer_agent_string[agent],
+ dev->vendor, dev->device, info->status, info->mask);
+ } else {
+ pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n"
+ " device [%04x:%04x] error status/mask=%08x/%08x\n",
+ aer_error_severity_string[info->severity],
+ aer_error_layer[layer], aer_agent_string[agent],
+ dev->vendor, dev->device, info->status, info->mask);
+ }

__aer_print_error(dev, info);

@@ -751,11 +765,19 @@ static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
u8 bus = info->id >> 8;
u8 devfn = info->id & 0xff;

- pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
- info->multi_error_valid ? "Multiple " : "",
- aer_error_severity_string[info->severity],
- pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
- PCI_FUNC(devfn));
+ if (info->severity == AER_CORRECTABLE)
+ pci_info_ratelimited(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
+ info->multi_error_valid ? "Multiple " : "",
+ aer_error_severity_string[info->severity],
+ pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
+ else
+ pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n",
+ info->multi_error_valid ? "Multiple " : "",
+ aer_error_severity_string[info->severity],
+ pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
+
}

#ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -798,7 +820,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);

if (aer_severity == AER_CORRECTABLE)
- pci_info(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+ pci_info_ratelimited(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
else
pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);

@@ -808,9 +830,9 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
pci_info(dev, "aer_layer=%s, aer_agent=%s\n",
aer_error_layer[layer], aer_agent_string[agent]);
} else {
- pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
- aer_error_layer[layer], aer_agent_string[agent]);
- pci_err(dev, "aer_uncor_severity: 0x%08x\n",
+ pci_err(dev, "aer_layer=%s, aer_agent=%s,"
+ " aer_uncor_severity=0x%08x\n",
+ aer_error_layer[layer], aer_agent_string[agent],
aer->uncor_severity);
}

--
2.41.0.rc0.172.g3f132b7071-goog