Currently the AER driver uses cper_print_bits() to print the AER status
string. This causes the status string to not include the proper PCI device
name prefix that the other AER prints include. Also, it has a different
print level than all the other AER prints, and there is a potential to
have multiple status prints based on string lengths.
Update the AER driver to print the AER status string with the proper string
prefix and proper print level, and abreviate the status strings similar to
lspci -vv prints so they can be printed on the same line.
Previous log example:
e1000e 0003:01:00.1: aer_status: 0x00000041, aer_mask: 0x00000000
Receiver Error, Bad TLP
e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID
pcieport 0003:00:00.0: aer_status: 0x00001000, aer_mask: 0x0000e000
Replay Timer Timeout
pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID
New log:
e1000e 0003:01:00.1: aer_status: 0x00000041, aer_mask: 0x00000000
e1000e 0003:01:00.1: RxErr, BadTLP
e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID
pcieport 0003:00:00.0: aer_status: 0x00001000, aer_mask: 0x0000e000
pcieport 0003:00:00.0: Timeout
pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID
Signed-off-by: Tyler Baicar <[email protected]>
---
drivers/pci/pcie/aer/aerdrv_errprint.c | 71 ++++++++++++++++++++++------------
1 file changed, 47 insertions(+), 24 deletions(-)
diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 6a352e6..bb68dd4 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -72,22 +72,22 @@
};
static const char *aer_correctable_error_string[] = {
- "Receiver Error", /* Bit Position 0 */
+ "RxErr", /* Bit Position 0 */
NULL,
NULL,
NULL,
NULL,
NULL,
- "Bad TLP", /* Bit Position 6 */
- "Bad DLLP", /* Bit Position 7 */
- "RELAY_NUM Rollover", /* Bit Position 8 */
+ "BadTLP", /* Bit Position 6 */
+ "BadDLLP", /* Bit Position 7 */
+ "Rollover", /* Bit Position 8 */
NULL,
NULL,
NULL,
- "Replay Timer Timeout", /* Bit Position 12 */
- "Advisory Non-Fatal", /* Bit Position 13 */
- "Corrected Internal Error", /* Bit Position 14 */
- "Header Log Overflow", /* Bit Position 15 */
+ "Timeout", /* Bit Position 12 */
+ "NonFatalErr", /* Bit Position 13 */
+ "CorrIntErr", /* Bit Position 14 */
+ "HeaderOF", /* Bit Position 15 */
};
static const char *aer_uncorrectable_error_string[] = {
@@ -95,28 +95,28 @@
NULL,
NULL,
NULL,
- "Data Link Protocol", /* Bit Position 4 */
- "Surprise Down Error", /* Bit Position 5 */
+ "DLP", /* Bit Position 4 */
+ "SDES", /* Bit Position 5 */
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
- "Poisoned TLP", /* Bit Position 12 */
- "Flow Control Protocol", /* Bit Position 13 */
- "Completion Timeout", /* Bit Position 14 */
- "Completer Abort", /* Bit Position 15 */
- "Unexpected Completion", /* Bit Position 16 */
- "Receiver Overflow", /* Bit Position 17 */
- "Malformed TLP", /* Bit Position 18 */
+ "TLP", /* Bit Position 12 */
+ "FCP", /* Bit Position 13 */
+ "CmpltTO", /* Bit Position 14 */
+ "CmpltAbrt", /* Bit Position 15 */
+ "UnxCmplt", /* Bit Position 16 */
+ "RxOF", /* Bit Position 17 */
+ "MalfTLP", /* Bit Position 18 */
"ECRC", /* Bit Position 19 */
- "Unsupported Request", /* Bit Position 20 */
- "ACS Violation", /* Bit Position 21 */
- "Uncorrectable Internal Error", /* Bit Position 22 */
- "MC Blocked TLP", /* Bit Position 23 */
- "AtomicOp Egress Blocked", /* Bit Position 24 */
- "TLP Prefix Blocked Error", /* Bit Position 25 */
+ "UnsupReq", /* Bit Position 20 */
+ "ACSViol", /* Bit Position 21 */
+ "UncorrIntErr", /* Bit Position 22 */
+ "BlockedTLP", /* Bit Position 23 */
+ "AtomicOpBlocked", /* Bit Position 24 */
+ "TLPBlockedErr", /* Bit Position 25 */
};
static const char *aer_agent_string[] = {
@@ -203,6 +203,29 @@ void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
+
+#define MAX_PRINT_LENGTH 120
+
+void dev_print_bits(struct pci_dev *dev, unsigned int bits,
+ const char * const strs[], unsigned int strs_size)
+{
+ unsigned int i;
+ char errs[MAX_PRINT_LENGTH];
+
+ errs[0] = '\0';
+
+ for (i = 0; i < strs_size; i++) {
+ if (!(bits & (1U << i)))
+ continue;
+ if (strs[i]) {
+ if (strlen(errs))
+ strlcat(errs, ", ", MAX_PRINT_LENGTH);
+ strlcat(errs, strs[i], MAX_PRINT_LENGTH);
+ }
+ }
+ dev_err(&dev->dev, "%s\n", errs);
+}
+
int cper_severity_to_aer(int cper_severity)
{
switch (cper_severity) {
@@ -240,7 +263,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
agent = AER_GET_AGENT(aer_severity, status);
pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
- cper_print_bits("", status, status_strs, status_strs_size);
+ dev_print_bits(dev, status, status_strs, status_strs_size);
pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
aer_error_layer[layer], aer_agent_string[agent]);
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
Hello Bjorn,
On 2/7/2018 3:11 PM, Tyler Baicar wrote:
> Currently the AER driver uses cper_print_bits() to print the AER status
> string. This causes the status string to not include the proper PCI device
> name prefix that the other AER prints include. Also, it has a different
> print level than all the other AER prints, and there is a potential to
> have multiple status prints based on string lengths.
>
> Update the AER driver to print the AER status string with the proper string
> prefix and proper print level, and abreviate the status strings similar to
> lspci -vv prints so they can be printed on the same line.
>
> Previous log example:
>
> e1000e 0003:01:00.1: aer_status: 0x00000041, aer_mask: 0x00000000
> Receiver Error, Bad TLP
> e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID
> pcieport 0003:00:00.0: aer_status: 0x00001000, aer_mask: 0x0000e000
> Replay Timer Timeout
> pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID
>
> New log:
>
> e1000e 0003:01:00.1: aer_status: 0x00000041, aer_mask: 0x00000000
> e1000e 0003:01:00.1: RxErr, BadTLP
> e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID
> pcieport 0003:00:00.0: aer_status: 0x00001000, aer_mask: 0x0000e000
> pcieport 0003:00:00.0: Timeout
> pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID
Do you think this is ready for merge now?
Thanks,
Tyler
>
> Signed-off-by: Tyler Baicar <[email protected]>
> ---
> drivers/pci/pcie/aer/aerdrv_errprint.c | 71 ++++++++++++++++++++++------------
> 1 file changed, 47 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index 6a352e6..bb68dd4 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -72,22 +72,22 @@
> };
>
> static const char *aer_correctable_error_string[] = {
> - "Receiver Error", /* Bit Position 0 */
> + "RxErr", /* Bit Position 0 */
> NULL,
> NULL,
> NULL,
> NULL,
> NULL,
> - "Bad TLP", /* Bit Position 6 */
> - "Bad DLLP", /* Bit Position 7 */
> - "RELAY_NUM Rollover", /* Bit Position 8 */
> + "BadTLP", /* Bit Position 6 */
> + "BadDLLP", /* Bit Position 7 */
> + "Rollover", /* Bit Position 8 */
> NULL,
> NULL,
> NULL,
> - "Replay Timer Timeout", /* Bit Position 12 */
> - "Advisory Non-Fatal", /* Bit Position 13 */
> - "Corrected Internal Error", /* Bit Position 14 */
> - "Header Log Overflow", /* Bit Position 15 */
> + "Timeout", /* Bit Position 12 */
> + "NonFatalErr", /* Bit Position 13 */
> + "CorrIntErr", /* Bit Position 14 */
> + "HeaderOF", /* Bit Position 15 */
> };
>
> static const char *aer_uncorrectable_error_string[] = {
> @@ -95,28 +95,28 @@
> NULL,
> NULL,
> NULL,
> - "Data Link Protocol", /* Bit Position 4 */
> - "Surprise Down Error", /* Bit Position 5 */
> + "DLP", /* Bit Position 4 */
> + "SDES", /* Bit Position 5 */
> NULL,
> NULL,
> NULL,
> NULL,
> NULL,
> NULL,
> - "Poisoned TLP", /* Bit Position 12 */
> - "Flow Control Protocol", /* Bit Position 13 */
> - "Completion Timeout", /* Bit Position 14 */
> - "Completer Abort", /* Bit Position 15 */
> - "Unexpected Completion", /* Bit Position 16 */
> - "Receiver Overflow", /* Bit Position 17 */
> - "Malformed TLP", /* Bit Position 18 */
> + "TLP", /* Bit Position 12 */
> + "FCP", /* Bit Position 13 */
> + "CmpltTO", /* Bit Position 14 */
> + "CmpltAbrt", /* Bit Position 15 */
> + "UnxCmplt", /* Bit Position 16 */
> + "RxOF", /* Bit Position 17 */
> + "MalfTLP", /* Bit Position 18 */
> "ECRC", /* Bit Position 19 */
> - "Unsupported Request", /* Bit Position 20 */
> - "ACS Violation", /* Bit Position 21 */
> - "Uncorrectable Internal Error", /* Bit Position 22 */
> - "MC Blocked TLP", /* Bit Position 23 */
> - "AtomicOp Egress Blocked", /* Bit Position 24 */
> - "TLP Prefix Blocked Error", /* Bit Position 25 */
> + "UnsupReq", /* Bit Position 20 */
> + "ACSViol", /* Bit Position 21 */
> + "UncorrIntErr", /* Bit Position 22 */
> + "BlockedTLP", /* Bit Position 23 */
> + "AtomicOpBlocked", /* Bit Position 24 */
> + "TLPBlockedErr", /* Bit Position 25 */
> };
>
> static const char *aer_agent_string[] = {
> @@ -203,6 +203,29 @@ void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
> }
>
> #ifdef CONFIG_ACPI_APEI_PCIEAER
> +
> +#define MAX_PRINT_LENGTH 120
> +
> +void dev_print_bits(struct pci_dev *dev, unsigned int bits,
> + const char * const strs[], unsigned int strs_size)
> +{
> + unsigned int i;
> + char errs[MAX_PRINT_LENGTH];
> +
> + errs[0] = '\0';
> +
> + for (i = 0; i < strs_size; i++) {
> + if (!(bits & (1U << i)))
> + continue;
> + if (strs[i]) {
> + if (strlen(errs))
> + strlcat(errs, ", ", MAX_PRINT_LENGTH);
> + strlcat(errs, strs[i], MAX_PRINT_LENGTH);
> + }
> + }
> + dev_err(&dev->dev, "%s\n", errs);
> +}
> +
> int cper_severity_to_aer(int cper_severity)
> {
> switch (cper_severity) {
> @@ -240,7 +263,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
> agent = AER_GET_AGENT(aer_severity, status);
>
> pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> - cper_print_bits("", status, status_strs, status_strs_size);
> + dev_print_bits(dev, status, status_strs, status_strs_size);
> pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> aer_error_layer[layer], aer_agent_string[agent]);
>
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
On Wed, Feb 07, 2018 at 03:11:25PM -0500, Tyler Baicar wrote:
> Currently the AER driver uses cper_print_bits() to print the AER status
> string. This causes the status string to not include the proper PCI device
> name prefix that the other AER prints include. Also, it has a different
> print level than all the other AER prints, and there is a potential to
> have multiple status prints based on string lengths.
>
> Update the AER driver to print the AER status string with the proper string
> prefix and proper print level, and abreviate the status strings similar to
> lspci -vv prints so they can be printed on the same line.
>
> Previous log example:
>
> e1000e 0003:01:00.1: aer_status: 0x00000041, aer_mask: 0x00000000
> Receiver Error, Bad TLP
> e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID
> pcieport 0003:00:00.0: aer_status: 0x00001000, aer_mask: 0x0000e000
> Replay Timer Timeout
> pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID
>
> New log:
>
> e1000e 0003:01:00.1: aer_status: 0x00000041, aer_mask: 0x00000000
> e1000e 0003:01:00.1: RxErr, BadTLP
> e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID
> pcieport 0003:00:00.0: aer_status: 0x00001000, aer_mask: 0x0000e000
> pcieport 0003:00:00.0: Timeout
> pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID
This is awesome, much better than before.
But it only changes the output via the APEI/GHES path. I think errors
reported via the "native" path, i.e., aer_print_error(), should look
the same. Since this patch changes the way cper_print_aer() decodes
the status bits, can you make the aer_print_error() status bit
decoding match it?
Both paths (cper_print_aer() and aer_print_error()) also print the
raw "status" and "mask" values. But these can be from either the
Uncorrectable Error registers or the Correctable Errors registers.
I don't think cper_print_aer() prints any clue about which is the
source. Can you include something like what aer_print_error() does,
e.g., with aer_error_severity_string[]?
I would suggest splitting this into a few patches:
- abbreviate the *_error_string[] values
- change cper_print_aer() to use dev_print_bits() instead of
cper_print_bits()
- change cper_print_aer() to print severity/type/id in the same
format aer_print_error() uses
- change aer_print_error() to use dev_print_bits() instead of
__aer_print_error()
> Signed-off-by: Tyler Baicar <[email protected]>
> ---
> drivers/pci/pcie/aer/aerdrv_errprint.c | 71 ++++++++++++++++++++++------------
> 1 file changed, 47 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index 6a352e6..bb68dd4 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -72,22 +72,22 @@
> };
>
> static const char *aer_correctable_error_string[] = {
> - "Receiver Error", /* Bit Position 0 */
> + "RxErr", /* Bit Position 0 */
> NULL,
> NULL,
> NULL,
> NULL,
> NULL,
> - "Bad TLP", /* Bit Position 6 */
> - "Bad DLLP", /* Bit Position 7 */
> - "RELAY_NUM Rollover", /* Bit Position 8 */
> + "BadTLP", /* Bit Position 6 */
> + "BadDLLP", /* Bit Position 7 */
> + "Rollover", /* Bit Position 8 */
> NULL,
> NULL,
> NULL,
> - "Replay Timer Timeout", /* Bit Position 12 */
> - "Advisory Non-Fatal", /* Bit Position 13 */
> - "Corrected Internal Error", /* Bit Position 14 */
> - "Header Log Overflow", /* Bit Position 15 */
> + "Timeout", /* Bit Position 12 */
> + "NonFatalErr", /* Bit Position 13 */
> + "CorrIntErr", /* Bit Position 14 */
> + "HeaderOF", /* Bit Position 15 */
> };
>
> static const char *aer_uncorrectable_error_string[] = {
> @@ -95,28 +95,28 @@
> NULL,
> NULL,
> NULL,
> - "Data Link Protocol", /* Bit Position 4 */
> - "Surprise Down Error", /* Bit Position 5 */
> + "DLP", /* Bit Position 4 */
> + "SDES", /* Bit Position 5 */
> NULL,
> NULL,
> NULL,
> NULL,
> NULL,
> NULL,
> - "Poisoned TLP", /* Bit Position 12 */
> - "Flow Control Protocol", /* Bit Position 13 */
> - "Completion Timeout", /* Bit Position 14 */
> - "Completer Abort", /* Bit Position 15 */
> - "Unexpected Completion", /* Bit Position 16 */
> - "Receiver Overflow", /* Bit Position 17 */
> - "Malformed TLP", /* Bit Position 18 */
> + "TLP", /* Bit Position 12 */
> + "FCP", /* Bit Position 13 */
> + "CmpltTO", /* Bit Position 14 */
> + "CmpltAbrt", /* Bit Position 15 */
> + "UnxCmplt", /* Bit Position 16 */
> + "RxOF", /* Bit Position 17 */
> + "MalfTLP", /* Bit Position 18 */
> "ECRC", /* Bit Position 19 */
> - "Unsupported Request", /* Bit Position 20 */
> - "ACS Violation", /* Bit Position 21 */
> - "Uncorrectable Internal Error", /* Bit Position 22 */
> - "MC Blocked TLP", /* Bit Position 23 */
> - "AtomicOp Egress Blocked", /* Bit Position 24 */
> - "TLP Prefix Blocked Error", /* Bit Position 25 */
> + "UnsupReq", /* Bit Position 20 */
> + "ACSViol", /* Bit Position 21 */
> + "UncorrIntErr", /* Bit Position 22 */
> + "BlockedTLP", /* Bit Position 23 */
> + "AtomicOpBlocked", /* Bit Position 24 */
> + "TLPBlockedErr", /* Bit Position 25 */
> };
>
> static const char *aer_agent_string[] = {
> @@ -203,6 +203,29 @@ void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
> }
>
> #ifdef CONFIG_ACPI_APEI_PCIEAER
> +
> +#define MAX_PRINT_LENGTH 120
> +
> +void dev_print_bits(struct pci_dev *dev, unsigned int bits,
> + const char * const strs[], unsigned int strs_size)
> +{
> + unsigned int i;
> + char errs[MAX_PRINT_LENGTH];
> +
> + errs[0] = '\0';
> +
> + for (i = 0; i < strs_size; i++) {
> + if (!(bits & (1U << i)))
> + continue;
> + if (strs[i]) {
> + if (strlen(errs))
> + strlcat(errs, ", ", MAX_PRINT_LENGTH);
> + strlcat(errs, strs[i], MAX_PRINT_LENGTH);
> + }
> + }
> + dev_err(&dev->dev, "%s\n", errs);
> +}
> +
> int cper_severity_to_aer(int cper_severity)
> {
> switch (cper_severity) {
> @@ -240,7 +263,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
> agent = AER_GET_AGENT(aer_severity, status);
>
> pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> - cper_print_bits("", status, status_strs, status_strs_size);
> + dev_print_bits(dev, status, status_strs, status_strs_size);
> pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> aer_error_layer[layer], aer_agent_string[agent]);
>
> --
> Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project.
>