Hello.
Here is v3 of this patch series, which contains some fixes and optimizations of
AER API usage. v1 and v2 can be found on the mailing list.
v3:
- Changes made in response to comments from Sathyanarayanan. Remove the
pci_aer_clear_nonfatal_status() call in NTB and improve the commit log.
v2:
- Changes made in response to comments from Bjorn. Split the patch into more
obvious parts.
Zhuo Chen (9):
PCI/AER: Add pci_aer_clear_uncorrect_error_status() to PCI core
PCI/DPC: Use pci_aer_clear_uncorrect_error_status() to clear
uncorrectable error status
NTB: Remove pci_aer_clear_nonfatal_status() call
scsi: lpfc: Change to use pci_aer_clear_uncorrect_error_status()
PCI/AER: Unexport pci_aer_clear_nonfatal_status()
PCI/AER: Move check inside pcie_clear_device_status().
PCI/AER: Use pcie_aer_is_native() to judge whether OS owns AER
PCI/ERR: Clear fatal error status when pci_channel_io_frozen
PCI/AER: Refine status clearing process with api
drivers/ntb/hw/idt/ntb_hw_idt.c | 2 --
drivers/pci/pci.c | 7 +++--
drivers/pci/pci.h | 2 ++
drivers/pci/pcie/aer.c | 45 +++++++++++++++++++--------------
drivers/pci/pcie/dpc.c | 3 +--
drivers/pci/pcie/err.c | 15 ++++-------
drivers/pci/pcie/portdrv_core.c | 3 +--
drivers/scsi/lpfc/lpfc_attr.c | 4 +--
include/linux/aer.h | 4 +--
9 files changed, 44 insertions(+), 41 deletions(-)
--
2.30.1 (Apple Git-130)
Since pci_aer_clear_nonfatal_status() is used only internally, move
its declaration to the PCI internal header file. Also, no caller uses
the return value of pci_aer_clear_nonfatal_status(), so make it void.
Signed-off-by: Zhuo Chen <[email protected]>
---
drivers/pci/pci.h | 2 ++
drivers/pci/pcie/aer.c | 7 ++-----
include/linux/aer.h | 5 -----
3 files changed, 4 insertions(+), 10 deletions(-)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 785f31086313..a114175d08e4 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -684,6 +684,7 @@ void pci_aer_init(struct pci_dev *dev);
void pci_aer_exit(struct pci_dev *dev);
extern const struct attribute_group aer_stats_attr_group;
void pci_aer_clear_fatal_status(struct pci_dev *dev);
+void pci_aer_clear_nonfatal_status(struct pci_dev *dev);
int pci_aer_clear_status(struct pci_dev *dev);
int pci_aer_raw_clear_status(struct pci_dev *dev);
#else
@@ -691,6 +692,7 @@ static inline void pci_no_aer(void) { }
static inline void pci_aer_init(struct pci_dev *d) { }
static inline void pci_aer_exit(struct pci_dev *d) { }
static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
+static inline void pci_aer_clear_nonfatal_status(struct pci_dev *dev) { }
static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; }
static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
#endif
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 4e637121be23..e2ebd108339d 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -251,13 +251,13 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
-int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
+void pci_aer_clear_nonfatal_status(struct pci_dev *dev)
{
int aer = dev->aer_cap;
u32 status, sev;
if (!pcie_aer_is_native(dev))
- return -EIO;
+ return;
/* Clear status bits for ERR_NONFATAL errors only */
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, &status);
@@ -265,10 +265,7 @@ int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
status &= ~sev;
if (status)
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status);
-
- return 0;
}
-EXPORT_SYMBOL_GPL(pci_aer_clear_nonfatal_status);
void pci_aer_clear_fatal_status(struct pci_dev *dev)
{
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 154690c278cb..f638ad955deb 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -44,7 +44,6 @@ struct aer_capability_regs {
/* PCIe port driver needs this function to enable AER */
int pci_enable_pcie_error_reporting(struct pci_dev *dev);
int pci_disable_pcie_error_reporting(struct pci_dev *dev);
-int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
int pci_aer_clear_uncorrect_error_status(struct pci_dev *dev);
void pci_save_aer_state(struct pci_dev *dev);
void pci_restore_aer_state(struct pci_dev *dev);
@@ -57,10 +56,6 @@ static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev)
{
return -EINVAL;
}
-static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
-{
- return -EINVAL;
-}
static inline int pci_aer_clear_uncorrect_error_status(struct pci_dev *dev)
{
return -EINVAL;
--
2.30.1 (Apple Git-130)
pci_aer_clear_uncorrect_error_status() clears both fatal and non-fatal
errors. So use it in place of pci_aer_clear_nonfatal_status()
and pci_aer_clear_fatal_status().
Signed-off-by: Zhuo Chen <[email protected]>
---
drivers/pci/pcie/dpc.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 3e9afee02e8d..7942073fbb34 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -288,8 +288,7 @@ void dpc_process_error(struct pci_dev *pdev)
dpc_get_aer_uncorrect_severity(pdev, &info) &&
aer_get_device_error_info(pdev, &info)) {
aer_print_error(pdev, &info);
- pci_aer_clear_nonfatal_status(pdev);
- pci_aer_clear_fatal_status(pdev);
+ pci_aer_clear_uncorrect_error_status(pdev);
}
}
--
2.30.1 (Apple Git-130)
lpfc_aer_cleanup_state() requires clearing both fatal and non-fatal
uncorrectable error status. But using pci_aer_clear_nonfatal_status()
will only clear non-fatal error status. To clear both fatal and
non-fatal error status, use pci_aer_clear_uncorrect_error_status().
Signed-off-by: Zhuo Chen <[email protected]>
---
drivers/scsi/lpfc/lpfc_attr.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index 09cf2cd0ae60..d835cc0ba153 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -4689,7 +4689,7 @@ static DEVICE_ATTR_RW(lpfc_aer_support);
* Description:
* If the @buf contains 1 and the device currently has the AER support
* enabled, then invokes the kernel AER helper routine
- * pci_aer_clear_nonfatal_status() to clean up the uncorrectable
+ * pci_aer_clear_uncorrect_error_status() to clean up the uncorrectable
* error status register.
*
* Notes:
@@ -4715,7 +4715,7 @@ lpfc_aer_cleanup_state(struct device *dev, struct device_attribute *attr,
return -EINVAL;
if (phba->hba_flag & HBA_AER_ENABLED)
- rc = pci_aer_clear_nonfatal_status(phba->pcidev);
+ rc = pci_aer_clear_uncorrect_error_status(phba->pcidev);
if (rc == 0)
return strlen(buf);
--
2.30.1 (Apple Git-130)
pcie_clear_device_status() doesn't check for pcie_aer_is_native()
internally, but after commit 068c29a248b6 ("PCI/ERR: Clear PCIe Device
Status errors only if OS owns AER") and commit aa344bc8b727 ("PCI/ERR:
Clear AER status only when we control AER"), both callers check before
calling it. So move the check inside pcie_clear_device_status().
pcie_clear_device_status() and pci_aer_clear_nonfatal_status() both now
perform the check internally, so remove the check from their callers.
Signed-off-by: Zhuo Chen <[email protected]>
---
drivers/pci/pci.c | 7 +++++--
drivers/pci/pcie/aer.c | 4 ++--
drivers/pci/pcie/err.c | 14 +++-----------
3 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 95bc329e74c0..8caf4a5529a1 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2282,9 +2282,12 @@ EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state);
void pcie_clear_device_status(struct pci_dev *dev)
{
u16 sta;
+ struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
- pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
- pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
+ if (host->native_aer || pcie_ports_native) {
+ pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
+ pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
+ }
}
#endif
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e2ebd108339d..e2320ab27a31 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -971,11 +971,11 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
* Correctable error does not need software intervention.
* No need to go through error recovery process.
*/
- if (aer)
+ if (aer) {
pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
info->status);
- if (pcie_aer_is_native(dev))
pcie_clear_device_status(dev);
+ }
} else if (info->severity == AER_NONFATAL)
pcie_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
else if (info->severity == AER_FATAL)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 59c90d04a609..f80b21244ef1 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -188,7 +188,6 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
int type = pci_pcie_type(dev);
struct pci_dev *bridge;
pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
- struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
/*
* If the error was detected by a Root Port, Downstream Port, RCEC,
@@ -241,16 +240,9 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
pci_dbg(bridge, "broadcast resume message\n");
pci_walk_bridge(bridge, report_resume, &status);
- /*
- * If we have native control of AER, clear error status in the device
- * that detected the error. If the platform retained control of AER,
- * it is responsible for clearing this status. In that case, the
- * signaling device may not even be visible to the OS.
- */
- if (host->native_aer || pcie_ports_native) {
- pcie_clear_device_status(dev);
- pci_aer_clear_nonfatal_status(dev);
- }
+ pcie_clear_device_status(dev);
+ pci_aer_clear_nonfatal_status(dev);
+
pci_info(bridge, "device recovery successful\n");
return status;
--
2.30.1 (Apple Git-130)
On Wed, Sep 28, 2022 at 06:59:37PM +0800, Zhuo Chen wrote:
> Hello.
>
> Here comes patch v3, which contains some fixes and optimizations of
> aer api usage. The v1 and v2 can be found on the mailing list.
>
> v3:
> - Modifications to comments proposed by Sathyanarayanan.
> Remove
> pci_aer_clear_nonfatal_status() call in NTB and improve commit log.
Failed to see who has requested that...
-Sergey
>
> v2:
> - Modifications to comments proposed by Bjorn. Split patch into more
> obvious parts.
>
> Zhuo Chen (9):
> PCI/AER: Add pci_aer_clear_uncorrect_error_status() to PCI core
> PCI/DPC: Use pci_aer_clear_uncorrect_error_status() to clear
> uncorrectable error status
> NTB: Remove pci_aer_clear_nonfatal_status() call
> scsi: lpfc: Change to use pci_aer_clear_uncorrect_error_status()
> PCI/AER: Unexport pci_aer_clear_nonfatal_status()
> PCI/AER: Move check inside pcie_clear_device_status().
> PCI/AER: Use pcie_aer_is_native() to judge whether OS owns AER
> PCI/ERR: Clear fatal error status when pci_channel_io_frozen
> PCI/AER: Refine status clearing process with api
>
> drivers/ntb/hw/idt/ntb_hw_idt.c | 2 --
> drivers/pci/pci.c | 7 +++--
> drivers/pci/pci.h | 2 ++
> drivers/pci/pcie/aer.c | 45 +++++++++++++++++++--------------
> drivers/pci/pcie/dpc.c | 3 +--
> drivers/pci/pcie/err.c | 15 ++++-------
> drivers/pci/pcie/portdrv_core.c | 3 +--
> drivers/scsi/lpfc/lpfc_attr.c | 4 +--
> include/linux/aer.h | 4 +--
> 9 files changed, 44 insertions(+), 41 deletions(-)
>
> --
> 2.30.1 (Apple Git-130)
>
Use pcie_aer_is_native() in place of "host->native_aer ||
pcie_ports_native" to judge whether the OS owns AER in aer_root_reset().
Replace "dev->aer_cap && (pcie_ports_native || host->native_aer)" in
get_port_device_capability() with pcie_aer_is_native(). This replacement
makes no functional change.
Signed-off-by: Zhuo Chen <[email protected]>
---
drivers/pci/pcie/aer.c | 5 ++---
drivers/pci/pcie/portdrv_core.c | 3 +--
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e2320ab27a31..a6d29269ccf2 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1403,7 +1403,6 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
int type = pci_pcie_type(dev);
struct pci_dev *root;
int aer;
- struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
u32 reg32;
int rc;
@@ -1424,7 +1423,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
*/
aer = root ? root->aer_cap : 0;
- if ((host->native_aer || pcie_ports_native) && aer) {
+ if (aer && pcie_aer_is_native(root)) {
/* Disable Root's interrupt in response to error messages */
pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, &reg32);
reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK;
@@ -1443,7 +1442,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
pci_is_root_bus(dev->bus) ? "Root" : "Downstream", rc);
}
- if ((host->native_aer || pcie_ports_native) && aer) {
+ if (aer && pcie_aer_is_native(root)) {
/* Clear Root Error Status */
pci_read_config_dword(root, aer + PCI_ERR_ROOT_STATUS, &reg32);
pci_write_config_dword(root, aer + PCI_ERR_ROOT_STATUS, reg32);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 1ac7fec47d6f..844297c0c85e 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -221,8 +221,7 @@ static int get_port_device_capability(struct pci_dev *dev)
}
#ifdef CONFIG_PCIEAER
- if (dev->aer_cap && pci_aer_available() &&
- (pcie_ports_native || host->native_aer))
+ if (pcie_aer_is_native(dev) && pci_aer_available())
services |= PCIE_PORT_SERVICE_AER;
#endif
--
2.30.1 (Apple Git-130)
When state is pci_channel_io_frozen in pcie_do_recovery(), the
severity is fatal and fatal error status should be cleared.
So add pci_aer_clear_fatal_status().
Signed-off-by: Zhuo Chen <[email protected]>
---
drivers/pci/pcie/err.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f80b21244ef1..b46f1d36c090 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -241,7 +241,10 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
pci_walk_bridge(bridge, report_resume, &status);
pcie_clear_device_status(dev);
- pci_aer_clear_nonfatal_status(dev);
+ if (state == pci_channel_io_frozen)
+ pci_aer_clear_fatal_status(dev);
+ else
+ pci_aer_clear_nonfatal_status(dev);
pci_info(bridge, "device recovery successful\n");
return status;
--
2.30.1 (Apple Git-130)
On 9/28/22 7:06 PM, Serge Semin wrote:
> On Wed, Sep 28, 2022 at 06:59:37PM +0800, Zhuo Chen wrote:
>> Hello.
>>
>> Here comes patch v3, which contains some fixes and optimizations of
>> aer api usage. The v1 and v2 can be found on the mailing list.
>>
>> v3:
>> - Modifications to comments proposed by Sathyanarayanan.
>
>> Remove
>> pci_aer_clear_nonfatal_status() call in NTB and improve commit log.
>
> Failed to see who has requested that...
>
> -Sergey
>
Hi, Sergey
Currently, other vendor drivers do not clear error status in their own
init code, and I don't know of any special reason for the ntb driver to
clear error status during its init code.
As evidence, in pci_aer_init() the PCI core already calls
pci_aer_clear_status() and pci_enable_pcie_error_reporting() as part of the
common process, so vendor drivers don't need to do it again.
However, I don't know why many vendor drivers still keep their
pci_enable_pcie_error_reporting() calls after commit f26e58bf6f54 ("PCI/AER:
Enable error reporting when AER is native"). Do those calls need to be removed?
Could Bjorn and Sathyanarayanan help look into it? Thanks a lot.
Thanks.
>>
>> v2:
>> - Modifications to comments proposed by Bjorn. Split patch into more
>> obvious parts.
>>
>> Zhuo Chen (9):
>> PCI/AER: Add pci_aer_clear_uncorrect_error_status() to PCI core
>> PCI/DPC: Use pci_aer_clear_uncorrect_error_status() to clear
>> uncorrectable error status
>> NTB: Remove pci_aer_clear_nonfatal_status() call
>> scsi: lpfc: Change to use pci_aer_clear_uncorrect_error_status()
>> PCI/AER: Unexport pci_aer_clear_nonfatal_status()
>> PCI/AER: Move check inside pcie_clear_device_status().
>> PCI/AER: Use pcie_aer_is_native() to judge whether OS owns AER
>> PCI/ERR: Clear fatal error status when pci_channel_io_frozen
>> PCI/AER: Refine status clearing process with api
>>
>> drivers/ntb/hw/idt/ntb_hw_idt.c | 2 --
>> drivers/pci/pci.c | 7 +++--
>> drivers/pci/pci.h | 2 ++
>> drivers/pci/pcie/aer.c | 45 +++++++++++++++++++--------------
>> drivers/pci/pcie/dpc.c | 3 +--
>> drivers/pci/pcie/err.c | 15 ++++-------
>> drivers/pci/pcie/portdrv_core.c | 3 +--
>> drivers/scsi/lpfc/lpfc_attr.c | 4 +--
>> include/linux/aer.h | 4 +--
>> 9 files changed, 44 insertions(+), 41 deletions(-)
>>
>> --
>> 2.30.1 (Apple Git-130)
>>
--
Zhuo Chen
Hi Bjorn, a gentle reminder.
Thanks and regards.
On 9/28/22 6:59 PM, Zhuo Chen wrote:
> Hello.
>
> Here comes patch v3, which contains some fixes and optimizations of
> aer api usage. The v1 and v2 can be found on the mailing list.
>
> v3:
> - Modifications to comments proposed by Sathyanarayanan. Remove
> pci_aer_clear_nonfatal_status() call in NTB and improve commit log.
>
> v2:
> - Modifications to comments proposed by Bjorn. Split patch into more
> obvious parts.
>
> Zhuo Chen (9):
> PCI/AER: Add pci_aer_clear_uncorrect_error_status() to PCI core
> PCI/DPC: Use pci_aer_clear_uncorrect_error_status() to clear
> uncorrectable error status
> NTB: Remove pci_aer_clear_nonfatal_status() call
> scsi: lpfc: Change to use pci_aer_clear_uncorrect_error_status()
> PCI/AER: Unexport pci_aer_clear_nonfatal_status()
> PCI/AER: Move check inside pcie_clear_device_status().
> PCI/AER: Use pcie_aer_is_native() to judge whether OS owns AER
> PCI/ERR: Clear fatal error status when pci_channel_io_frozen
> PCI/AER: Refine status clearing process with api
>
> drivers/ntb/hw/idt/ntb_hw_idt.c | 2 --
> drivers/pci/pci.c | 7 +++--
> drivers/pci/pci.h | 2 ++
> drivers/pci/pcie/aer.c | 45 +++++++++++++++++++--------------
> drivers/pci/pcie/dpc.c | 3 +--
> drivers/pci/pcie/err.c | 15 ++++-------
> drivers/pci/pcie/portdrv_core.c | 3 +--
> drivers/scsi/lpfc/lpfc_attr.c | 4 +--
> include/linux/aer.h | 4 +--
> 9 files changed, 44 insertions(+), 41 deletions(-)
>
--
Zhuo Chen
Ping. Gentle reminder
On 11/5/22 1:20 AM, Zhuo Chen wrote:
> Hi Bjorn, a gentle reminder.
>
> Thanks and regards.
>
> On 9/28/22 6:59 PM, Zhuo Chen wrote:
>> Hello.
>>
>> Here comes patch v3, which contains some fixes and optimizations of
>> aer api usage. The v1 and v2 can be found on the mailing list.
>>
>> v3:
>> - Modifications to comments proposed by Sathyanarayanan. Remove
>> pci_aer_clear_nonfatal_status() call in NTB and improve commit log.
>>
>> v2:
>> - Modifications to comments proposed by Bjorn. Split patch into more
>> obvious parts.
>>
>> Zhuo Chen (9):
>> PCI/AER: Add pci_aer_clear_uncorrect_error_status() to PCI core
>> PCI/DPC: Use pci_aer_clear_uncorrect_error_status() to clear
>> uncorrectable error status
>> NTB: Remove pci_aer_clear_nonfatal_status() call
>> scsi: lpfc: Change to use pci_aer_clear_uncorrect_error_status()
>> PCI/AER: Unexport pci_aer_clear_nonfatal_status()
>> PCI/AER: Move check inside pcie_clear_device_status().
>> PCI/AER: Use pcie_aer_is_native() to judge whether OS owns AER
>> PCI/ERR: Clear fatal error status when pci_channel_io_frozen
>> PCI/AER: Refine status clearing process with api
>>
>> drivers/ntb/hw/idt/ntb_hw_idt.c | 2 --
>> drivers/pci/pci.c | 7 +++--
>> drivers/pci/pci.h | 2 ++
>> drivers/pci/pcie/aer.c | 45 +++++++++++++++++++--------------
>> drivers/pci/pcie/dpc.c | 3 +--
>> drivers/pci/pcie/err.c | 15 ++++-------
>> drivers/pci/pcie/portdrv_core.c | 3 +--
>> drivers/scsi/lpfc/lpfc_attr.c | 4 +--
>> include/linux/aer.h | 4 +--
>> 9 files changed, 44 insertions(+), 41 deletions(-)
>>
>
--
Zhuo Chen
Hi Zhuo,
On Wed, Sep 28, 2022 at 06:59:45PM +0800, Zhuo Chen wrote:
> When state is pci_channel_io_frozen in pcie_do_recovery(), the
> severity is fatal and fatal error status should be cleared.
> So add pci_aer_clear_fatal_status().
>
> Signed-off-by: Zhuo Chen <[email protected]>
> ---
> drivers/pci/pcie/err.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index f80b21244ef1..b46f1d36c090 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -241,7 +241,10 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> pci_walk_bridge(bridge, report_resume, &status);
>
> pcie_clear_device_status(dev);
> - pci_aer_clear_nonfatal_status(dev);
> + if (state == pci_channel_io_frozen)
> + pci_aer_clear_fatal_status(dev);
> + else
> + pci_aer_clear_nonfatal_status(dev);
I'm confused. It seems like we certainly need to clear fatal errors
after they occur *somewhere*, and if we don't, surely this would be a
very obvious issue. But you didn't mention this being a bug fix, so I
assume it's more of a cleanup.
If it *is* a bug fix, please say that and give a hint about what the
bug looks like, e.g., what sort of messages a user might see.
If it's not a bug fix, I don't understand how AER fatal errors get
cleared today. The PCI_ERR_UNCOR_STATUS bits are sticky, so they're
not cleared by a reset. In the current tree, these are the only
places I see that clear AER fatal errors:
pci_init_capabilities
pci_aer_init # once at device enumeration
pci_aer_clear_status
pci_aer_raw_clear_status
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status)
aer_probe
aer_enable_rootport # once at Root Port enumeration
pci_write_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, reg32)
dpc_process_error # after DPC triggered
pci_aer_clear_fatal_status
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status)
edr_handle_event # after EDR event
pci_aer_raw_clear_status
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status)
pci_restore_state # after reset or PM sleep/resume
pci_aer_clear_status
pci_aer_raw_clear_status
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status)
The only one that could clear errors after an AER error (not DPC or
EDR), would be the pci_restore_state() in the reset path. If the
current code relies on that, I'd say that's a pretty non-obvious
dependency.
> pci_info(bridge, "device recovery successful\n");
> return status;
> --
> 2.30.1 (Apple Git-130)
>
[moved James, Dick, LPFC supporters to "to"]
On Wed, Sep 28, 2022 at 06:59:41PM +0800, Zhuo Chen wrote:
> lpfc_aer_cleanup_state() requires clearing both fatal and non-fatal
> uncorrectable error status.
I don't know what the point of lpfc_aer_cleanup_state() is. AER
errors should be handled and cleared by the PCI core, not by
individual drivers. Only lpfc, liquidio, and sky2 touch
PCI_ERR_UNCOR_STATUS.
But lpfc_aer_cleanup_state() is visible in the
"lpfc_aer_state_cleanup" sysfs file, so removing it would break any
userspace that uses it.
If we can rely on the PCI core to clean up AER errors itself
(admittedly, that might be a big "if"), maybe lpfc_aer_cleanup_state()
could just become a no-op?
Any comment from the LPFC folks?
Ideally, I would rather not export pci_aer_clear_nonfatal_status() or
pci_aer_clear_uncorrect_error_status() outside the PCI core at all.
> But using pci_aer_clear_nonfatal_status()
> will only clear non-fatal error status. To clear both fatal and
> non-fatal error status, use pci_aer_clear_uncorrect_error_status().
>
> Signed-off-by: Zhuo Chen <[email protected]>
> ---
> drivers/scsi/lpfc/lpfc_attr.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
> index 09cf2cd0ae60..d835cc0ba153 100644
> --- a/drivers/scsi/lpfc/lpfc_attr.c
> +++ b/drivers/scsi/lpfc/lpfc_attr.c
> @@ -4689,7 +4689,7 @@ static DEVICE_ATTR_RW(lpfc_aer_support);
> * Description:
> * If the @buf contains 1 and the device currently has the AER support
> * enabled, then invokes the kernel AER helper routine
> - * pci_aer_clear_nonfatal_status() to clean up the uncorrectable
> + * pci_aer_clear_uncorrect_error_status() to clean up the uncorrectable
> * error status register.
> *
> * Notes:
> @@ -4715,7 +4715,7 @@ lpfc_aer_cleanup_state(struct device *dev, struct device_attribute *attr,
> return -EINVAL;
>
> if (phba->hba_flag & HBA_AER_ENABLED)
> - rc = pci_aer_clear_nonfatal_status(phba->pcidev);
> + rc = pci_aer_clear_uncorrect_error_status(phba->pcidev);
>
> if (rc == 0)
> return strlen(buf);
> --
> 2.30.1 (Apple Git-130)
>
On Tue, Dec 06, 2022 at 04:13:35PM -0600, Bjorn Helgaas wrote:
> On Wed, Sep 28, 2022 at 06:59:41PM +0800, Zhuo Chen wrote:
> > lpfc_aer_cleanup_state() requires clearing both fatal and non-fatal
> > uncorrectable error status.
>
> I don't know what the point of lpfc_aer_cleanup_state() is. AER
> errors should be handled and cleared by the PCI core, not by
> individual drivers. Only lpfc, liquidio, and sky2 touch
> PCI_ERR_UNCOR_STATUS.
>
> But lpfc_aer_cleanup_state() is visible in the
> "lpfc_aer_state_cleanup" sysfs file, so removing it would break any
> userspace that uses it.
>
> If we can rely on the PCI core to clean up AER errors itself
> (admittedly, that might be a big "if"), maybe lpfc_aer_cleanup_state()
> could just become a no-op?
>
> Any comment from the LPFC folks?
>
> Ideally, I would rather not export pci_aer_clear_nonfatal_status() or
> pci_aer_clear_uncorrect_error_status() outside the PCI core at all.
Resurrecting this old thread. Zhuo, can you figure out where the PCI
core clears these errors, include that in the commit log, and propose
a patch that makes lpfc_aer_cleanup_state() a no-op, by removing the
pci_aer_clear_nonfatal_status() call completely?
Such a patch could be sent to the SCSI maintainers since it doesn't
involve the PCI core.
If it turns out that the PCI core *doesn't* clear these errors, we
should figure out *why* it doesn't and try to change the PCI core so
it does.
> > But using pci_aer_clear_nonfatal_status()
> > will only clear non-fatal error status. To clear both fatal and
> > non-fatal error status, use pci_aer_clear_uncorrect_error_status().
> >
> > Signed-off-by: Zhuo Chen <[email protected]>
> > ---
> > drivers/scsi/lpfc/lpfc_attr.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
> > index 09cf2cd0ae60..d835cc0ba153 100644
> > --- a/drivers/scsi/lpfc/lpfc_attr.c
> > +++ b/drivers/scsi/lpfc/lpfc_attr.c
> > @@ -4689,7 +4689,7 @@ static DEVICE_ATTR_RW(lpfc_aer_support);
> > * Description:
> > * If the @buf contains 1 and the device currently has the AER support
> > * enabled, then invokes the kernel AER helper routine
> > - * pci_aer_clear_nonfatal_status() to clean up the uncorrectable
> > + * pci_aer_clear_uncorrect_error_status() to clean up the uncorrectable
> > * error status register.
> > *
> > * Notes:
> > @@ -4715,7 +4715,7 @@ lpfc_aer_cleanup_state(struct device *dev, struct device_attribute *attr,
> > return -EINVAL;
> >
> > if (phba->hba_flag & HBA_AER_ENABLED)
> > - rc = pci_aer_clear_nonfatal_status(phba->pcidev);
> > + rc = pci_aer_clear_uncorrect_error_status(phba->pcidev);
> >
> > if (rc == 0)
> > return strlen(buf);
> > --
> > 2.30.1 (Apple Git-130)
> >
Hi Bjorn,
> But lpfc_aer_cleanup_state() is visible in the
> "lpfc_aer_state_cleanup" sysfs file, so removing it would break any
> userspace that uses it.
>
> If we can rely on the PCI core to clean up AER errors itself
> (admittedly, that might be a big "if"), maybe lpfc_aer_cleanup_state()
> could just become a no-op?
>
> Any comment from the LPFC folks?
We have notified all users of the lpfc_aer_cleanup_state sysfs entry,
and Broadcom LPFC is okay to no-op.
Regards,
Justin
On Wed, Mar 15, 2023 at 2:39 PM Bjorn Helgaas <[email protected]> wrote:
>
> On Tue, Dec 06, 2022 at 04:13:35PM -0600, Bjorn Helgaas wrote:
> > On Wed, Sep 28, 2022 at 06:59:41PM +0800, Zhuo Chen wrote:
> > > lpfc_aer_cleanup_state() requires clearing both fatal and non-fatal
> > > uncorrectable error status.
> >
> > I don't know what the point of lpfc_aer_cleanup_state() is. AER
> > errors should be handled and cleared by the PCI core, not by
> > individual drivers. Only lpfc, liquidio, and sky2 touch
> > PCI_ERR_UNCOR_STATUS.
> >
> > But lpfc_aer_cleanup_state() is visible in the
> > "lpfc_aer_state_cleanup" sysfs file, so removing it would break any
> > userspace that uses it.
> >
> > If we can rely on the PCI core to clean up AER errors itself
> > (admittedly, that might be a big "if"), maybe lpfc_aer_cleanup_state()
> > could just become a no-op?
> >
> > Any comment from the LPFC folks?
> >
> > Ideally, I would rather not export pci_aer_clear_nonfatal_status() or
> > pci_aer_clear_uncorrect_error_status() outside the PCI core at all.
>
> Resurrecting this old thread. Zhuo, can you figure out where the PCI
> core clears these errors, include that in the commit log, and propose
> a patch that makes lpfc_aer_cleanup_state() a no-op, by removing the
> pci_aer_clear_nonfatal_status() call completely?
>
> Such a patch could be sent to the SCSI maintainers since it doesn't
> involve the PCI core.
>
> If it turns out that the PCI core *doesn't* clear these errors, we
> should figure out *why* it doesn't and try to change the PCI core so
> it does.
>
> > > But using pci_aer_clear_nonfatal_status()
> > > will only clear non-fatal error status. To clear both fatal and
> > > non-fatal error status, use pci_aer_clear_uncorrect_error_status().
> > >
> > > Signed-off-by: Zhuo Chen <[email protected]>
> > > ---
> > > drivers/scsi/lpfc/lpfc_attr.c | 4 ++--
> > > 1 file changed, 2 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
> > > index 09cf2cd0ae60..d835cc0ba153 100644
> > > --- a/drivers/scsi/lpfc/lpfc_attr.c
> > > +++ b/drivers/scsi/lpfc/lpfc_attr.c
> > > @@ -4689,7 +4689,7 @@ static DEVICE_ATTR_RW(lpfc_aer_support);
> > > * Description:
> > > * If the @buf contains 1 and the device currently has the AER support
> > > * enabled, then invokes the kernel AER helper routine
> > > - * pci_aer_clear_nonfatal_status() to clean up the uncorrectable
> > > + * pci_aer_clear_uncorrect_error_status() to clean up the uncorrectable
> > > * error status register.
> > > *
> > > * Notes:
> > > @@ -4715,7 +4715,7 @@ lpfc_aer_cleanup_state(struct device *dev, struct device_attribute *attr,
> > > return -EINVAL;
> > >
> > > if (phba->hba_flag & HBA_AER_ENABLED)
> > > - rc = pci_aer_clear_nonfatal_status(phba->pcidev);
> > > + rc = pci_aer_clear_uncorrect_error_status(phba->pcidev);
> > >
> > > if (rc == 0)
> > > return strlen(buf);
> > > --
> > > 2.30.1 (Apple Git-130)
> > >