2012-11-30 21:38:34

by Lance Ortiz

[permalink] [raw]
Subject: [PATCH v3 1/3] aerdrv: Trace Event for AER

This header file will define a new trace event that will be triggered when
a AER event occurs. The following data will be provided to the trace
event.

char * name - String containing the device path

u32 status - Either the correctable or uncorrectable register
indicating what error or errors have been see.

u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED

The trace event will also provide a trace string that may look like:

"0000:05:00.0 PCIe Bus Error:severity=Uncorrected (Non-Fatal), Poisoned
TLP"

v1-v2 Move header from include/ras/aer_event.h to
include/trace/events/ras.h

Signed-off-by: Lance Ortiz <[email protected]>
---

include/trace/events/ras.h | 77 ++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 77 insertions(+), 0 deletions(-)
create mode 100644 include/trace/events/ras.h

diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h
new file mode 100644
index 0000000..f77d009
--- /dev/null
+++ b/include/trace/events/ras.h
@@ -0,0 +1,77 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM aer_event
+#define TRACE_INCLUDE_FILE ras
+
+#if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_AER_H
+
+#include <linux/tracepoint.h>
+#include <linux/edac.h>
+
+
+/*
+ * Anhance Error Reporting (AER) PCIE Report Error
+ *
+ * These events are generated when hardware detects a corrected or
+ * uncorrected event on a pci express device and reports
+ * errors. The event reports the following data.
+ *
+ * char * dev_name - String containing the device identification
+ * u32 status - Either the correctable or uncorrectable register
+ * indicating what error or errors have been seen
+ * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED
+ */
+
+#define correctable_error_string \
+ {BIT(0), "Receiver Error"}, \
+ {BIT(6), "Bad TLP"}, \
+ {BIT(7), "Bad DLLP"}, \
+ {BIT(8), "RELAY_NUM Rollover"}, \
+ {BIT(12), "Replay Timer Timeout"}, \
+ {BIT(13), "Advisory Non-Fatal"}
+
+#define uncorrectable_error_string \
+ {BIT(4), "Data Link Protocol"}, \
+ {BIT(12), "Poisoned TLP"}, \
+ {BIT(13), "Flow Control Protocol"}, \
+ {BIT(14), "Completion Timeout"}, \
+ {BIT(15), "Completer Abort"}, \
+ {BIT(16), "Unexpected Completion"}, \
+ {BIT(17), "Receiver Overflow"}, \
+ {BIT(18), "Malformed TLP"}, \
+ {BIT(19), "ECRC"}, \
+ {BIT(20), "Unsupported Request"}
+
+TRACE_EVENT(aer_event,
+ TP_PROTO(const char *dev_name,
+ const u32 status,
+ const u8 severity),
+
+ TP_ARGS(dev_name, status, severity),
+
+ TP_STRUCT__entry(
+ __string( dev_name, dev_name )
+ __field( u32, status )
+ __field( u8, severity )
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev_name, dev_name);
+ __entry->status = status;
+ __entry->severity = severity;
+ ),
+
+ TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
+ __get_str(dev_name),
+ (__entry->severity == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
+ ((__entry->severity == HW_EVENT_ERR_FATAL) ?
+ "Fatal" : "Uncorrected"),
+ __entry->severity == HW_EVENT_ERR_CORRECTED ?
+ __print_flags(__entry->status, "|", correctable_error_string) :
+ __print_flags(__entry->status, "|", uncorrectable_error_string))
+);
+
+#endif /* _TRACE_AER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>


2012-11-30 21:38:43

by Lance Ortiz

[permalink] [raw]
Subject: [PATCH v3 2/3] aerdrv: Enhanced AER logging

This patch will provide a more reliable and easy way for user-space
applications to have access to AER logs rather than reading them from the
message buffer. It also provides a way to notify user-space when an AER
event occurs.

The aer driver is updated to generate a trace event of function 'aer_event'
when an AER occurs. The trace event was added to both the interrupt
based aer path and the firmware first path

v1-v2 fix compile errors in ifdefs.
v2-v3 Update to new location of trace header. Update print to remove
warning.

Signed-off-by: Lance Ortiz <[email protected]>
---

drivers/acpi/apei/cper.c | 17 +++++++++++++++--
drivers/pci/pcie/aer/aerdrv_errprint.c | 10 +++++++++-
include/linux/aer.h | 2 +-
3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
index e6defd8..e9d6062 100644
--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -29,6 +29,7 @@
#include <linux/time.h>
#include <linux/cper.h>
#include <linux/acpi.h>
+#include <linux/pci.h>
#include <linux/aer.h>

/*
@@ -249,6 +250,10 @@ static const char *cper_pcie_port_type_strs[] = {
static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
const struct acpi_hest_generic_data *gdata)
{
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+ struct pci_dev *dev;
+#endif
+
if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ?
@@ -281,9 +286,17 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
#ifdef CONFIG_ACPI_APEI_PCIEAER
- if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) {
+ dev = pci_get_domain_bus_and_slot(pcie->device_id.segment,
+ pcie->device_id.bus, pcie->device_id.function);
+ if (!dev)
+ pr_info("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n",
+ pcie->device_id.segment, pcie->device_id.bus,
+ pcie->device_id.slot, pcie->device_id.function);
+
+ if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO && dev) {
struct aer_capability_regs *aer_regs = (void *)pcie->aer_info;
- cper_print_aer(pfx, gdata->error_severity, aer_regs);
+ cper_print_aer(dev, gdata->error_severity, aer_regs);
+ pci_dev_put(dev);
}
#endif
}
diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 3ea5173..34d96e4 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -23,6 +23,9 @@

#include "aerdrv.h"

+#define CREATE_TRACE_POINTS
+#include <trace/events/ras.h>
+
#define AER_AGENT_RECEIVER 0
#define AER_AGENT_REQUESTER 1
#define AER_AGENT_COMPLETER 2
@@ -194,6 +197,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
if (info->id && info->error_dev_num > 1 && info->id == id)
printk("%s"" Error of this Agent(%04x) is reported first\n",
prefix, id);
+ trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+ info->severity);
}

void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -217,12 +222,13 @@ int cper_severity_to_aer(int cper_severity)
}
EXPORT_SYMBOL_GPL(cper_severity_to_aer);

-void cper_print_aer(const char *prefix, int cper_severity,
+void cper_print_aer(struct pci_dev *dev, int cper_severity,
struct aer_capability_regs *aer)
{
int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0;
u32 status, mask;
const char **status_strs;
+ char *prefix = NULL;

aer_severity = cper_severity_to_aer(cper_severity);
if (aer_severity == AER_CORRECTABLE) {
@@ -259,5 +265,7 @@ void cper_print_aer(const char *prefix, int cper_severity,
*(tlp + 8), *(tlp + 15), *(tlp + 14),
*(tlp + 13), *(tlp + 12));
}
+ trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+ aer_severity);
}
#endif
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 544abdb..7b86dc6 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -49,7 +49,7 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
}
#endif

-extern void cper_print_aer(const char *prefix, int cper_severity,
+extern void cper_print_aer(struct pci_dev *dev, int cper_severity,
struct aer_capability_regs *aer);
extern int cper_severity_to_aer(int cper_severity);
extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,

2012-11-30 21:38:49

by Lance Ortiz

[permalink] [raw]
Subject: [PATCH v3 3/3] aerdrv: Cleanup log output for CPER based AER

These changes make cper_print_aer more consistent with aer_print_error
which is called in the AER interrupt case. The string in the variable
'prefix' is printed at the beginning of each print statement in
cper_print_aer(). The prefix is a string containing the driver name
and the device's path. Looking up the call path, the value of prefix
is never assigned and is NULL, so when cper_print_aer prints data the
initial string does not get printed. This string is important because
it identifies the device that the error is on. This patch adds code to
create the prefix and print it in the cper_print_aer function. This
patch also calculates the device's agent id so it can be printed.

v1-v2 fix some compile errors withinn the #ifdef

Signed-off-by: Lance Ortiz <[email protected]>
---

drivers/pci/pcie/aer/aerdrv_errprint.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 34d96e4..5050891 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -228,9 +228,15 @@ void cper_print_aer(struct pci_dev *dev, int cper_severity,
int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0;
u32 status, mask;
const char **status_strs;
- char *prefix = NULL;
+ int id = ((dev->bus->number << 8) | dev->devfn);
+ char prefix[44];

aer_severity = cper_severity_to_aer(cper_severity);
+ snprintf(prefix, sizeof(prefix), "%s%s %s: ",
+ (aer_severity == AER_CORRECTABLE) ?
+ KERN_WARNING : KERN_ERR,
+ dev_driver_string(&dev->dev), dev_name(&dev->dev));
+
if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
mask = aer->cor_mask;
@@ -248,8 +254,8 @@ void cper_print_aer(struct pci_dev *dev, int cper_severity,
printk("%s""aer_status: 0x%08x, aer_mask: 0x%08x\n",
prefix, status, mask);
cper_print_bits(prefix, status, status_strs, status_strs_size);
- printk("%s""aer_layer=%s, aer_agent=%s\n", prefix,
- aer_error_layer[layer], aer_agent_string[agent]);
+ printk("%s""aer_layer=%s, id=%04x(%s)\n", prefix,
+ aer_error_layer[layer], id, aer_agent_string[agent]);
if (aer_severity != AER_CORRECTABLE)
printk("%s""aer_uncor_severity: 0x%08x\n",
prefix, aer->uncor_severity);