This patchset adds a RAS compatibility adaptation solution for new devices.
Jiaran Zhang (4):
net: hns3: add the RAS compatibility adaptation solution
net: hns3: add support for imp-handle ras capability
net: hns3: update error recovery module and type
net: hns3: add error handling compatibility during initialization
Yufeng Mo (1):
net: hns3: add support for handling all errors through MSI-X
drivers/net/ethernet/hisilicon/hns3/hnae3.h | 4 +
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 3 +
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 5 +-
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 3 +
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 410 +++++++++++++++++++--
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h | 89 +++++
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 87 +++--
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 +
8 files changed, 546 insertions(+), 56 deletions(-)
--
2.8.1
From: Jiaran Zhang <[email protected]>
During initialization, the driver logs and clears the hw errors that
already occurred. For devices that support the imp-handle RAS
capability, the driver needs to handle a different error status;
otherwise it may trigger a wrong reset. So fix it by adding a new
processing branch.
Signed-off-by: Jiaran Zhang <[email protected]>
Signed-off-by: Guangbin Huang <[email protected]>
---
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 22 ++++++++++++++++++++++
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h | 2 ++
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 21 ++++++++++-----------
3 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index 0e942d11dbf3..bad9fda19398 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -2152,6 +2152,28 @@ void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev)
kfree(desc);
}
+bool hclge_find_error_source(struct hclge_dev *hdev)
+{
+ u32 msix_src_flag, hw_err_src_flag;
+
+ msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) &
+ HCLGE_VECTOR0_REG_MSIX_MASK;
+
+ hw_err_src_flag = hclge_read_dev(&hdev->hw,
+ HCLGE_RAS_PF_OTHER_INT_STS_REG) &
+ HCLGE_RAS_REG_ERR_MASK;
+
+ return msix_src_flag || hw_err_src_flag;
+}
+
+void hclge_handle_occurred_error(struct hclge_dev *hdev)
+{
+ struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
+
+ if (hclge_find_error_source(hdev))
+ hclge_handle_error_info_log(ae_dev);
+}
+
static void
hclge_handle_error_type_reg_log(struct device *dev,
struct hclge_mod_err_info *mod_info,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
index ce4c96bbef8e..07987fb8332e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h
@@ -215,6 +215,8 @@ int hclge_config_mac_tnl_int(struct hclge_dev *hdev, bool en);
int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state);
int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en);
void hclge_handle_all_hns_hw_errors(struct hnae3_ae_dev *ae_dev);
+bool hclge_find_error_source(struct hclge_dev *hdev);
+void hclge_handle_occurred_error(struct hclge_dev *hdev);
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
unsigned long *reset_requests);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 9ff4210f6477..d960e08850ae 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4255,18 +4255,11 @@ static void hclge_handle_err_reset_request(struct hclge_dev *hdev)
static void hclge_handle_err_recovery(struct hclge_dev *hdev)
{
- u32 mask_val = HCLGE_RAS_REG_NFE_MASK | HCLGE_RAS_REG_ROCEE_ERR_MASK;
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
- u32 msix_src_flag, hw_err_src_flag;
- msix_src_flag = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS) &
- HCLGE_VECTOR0_REG_MSIX_MASK;
+ ae_dev->hw_err_reset_req = 0;
- hw_err_src_flag = hclge_read_dev(&hdev->hw,
- HCLGE_RAS_PF_OTHER_INT_STS_REG) &
- mask_val;
-
- if (msix_src_flag || hw_err_src_flag) {
+ if (hclge_find_error_source(hdev)) {
hclge_handle_error_info_log(ae_dev);
hclge_handle_mac_tnl(hdev);
}
@@ -11558,7 +11551,10 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_clear_resetting_state(hdev);
/* Log and clear the hw errors those already occurred */
- hclge_handle_all_hns_hw_errors(ae_dev);
+ if (hnae3_dev_ras_imp_supported(hdev))
+ hclge_handle_occurred_error(hdev);
+ else
+ hclge_handle_all_hns_hw_errors(ae_dev);
/* request delayed reset for the error recovery because an immediate
* global reset on a PF affecting pending initialization of other PFs
@@ -11911,7 +11907,10 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
}
/* Log and clear the hw errors those already occurred */
- hclge_handle_all_hns_hw_errors(ae_dev);
+ if (hnae3_dev_ras_imp_supported(hdev))
+ hclge_handle_occurred_error(hdev);
+ else
+ hclge_handle_all_hns_hw_errors(ae_dev);
/* Re-enable the hw error interrupts because
* the interrupts get disabled on global reset.
--
2.8.1
From: Yufeng Mo <[email protected]>
Currently, hardware errors can be reported through AER or MSI-X mode.
However, the AER mode is intended to handle only bus errors, not
hardware errors. In addition, virtual machines cannot handle AER
errors: when an AER error is reported, virtual machines will be
suspended. So add support for handling all these hardware errors
through MSI-X mode, which depends on a newer version of firmware,
and keep the handler of the AER mode for compatibility.
Signed-off-by: Yufeng Mo <[email protected]>
Signed-off-by: Jiaran Zhang <[email protected]>
Signed-off-by: Guangbin Huang <[email protected]>
---
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 16 ++++++++
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 47 +++++++++++-----------
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 1 +
3 files changed, 41 insertions(+), 23 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
index f125aa425872..540dd15d7771 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
@@ -1611,11 +1611,27 @@ static const struct hclge_hw_blk hw_blk[] = {
{ /* sentinel */ }
};
+static void hclge_config_all_msix_error(struct hclge_dev *hdev, bool enable)
+{
+ u32 reg_val;
+
+ reg_val = hclge_read_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG);
+
+ if (enable)
+ reg_val |= BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B);
+ else
+ reg_val &= ~BIT(HCLGE_VECTOR0_ALL_MSIX_ERR_B);
+
+ hclge_write_dev(&hdev->hw, HCLGE_PF_OTHER_INT_REG, reg_val);
+}
+
int hclge_config_nic_hw_error(struct hclge_dev *hdev, bool state)
{
const struct hclge_hw_blk *module = hw_blk;
int ret = 0;
+ hclge_config_all_msix_error(hdev, state);
+
while (module->name) {
if (module->config_err_int) {
ret = module->config_err_int(hdev, state);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 45102681bd2a..d5be3bc50b5c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3307,11 +3307,13 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf,
static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
{
- u32 cmdq_src_reg, msix_src_reg;
+ u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg;
/* fetch the events from their corresponding regs */
cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
msix_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
+ hw_err_src_reg = hclge_read_dev(&hdev->hw,
+ HCLGE_RAS_PF_OTHER_INT_STS_REG);
/* Assumption: If by any chance reset and mailbox events are reported
* together then we will only process reset event in this go and will
@@ -3339,11 +3341,11 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
return HCLGE_VECTOR0_EVENT_RST;
}
- /* check for vector0 msix event source */
- if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
- *clearval = msix_src_reg;
+ /* check for vector0 msix event and hardware error event source */
+ if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK ||
+ hw_err_src_reg & HCLGE_RAS_REG_NFE_MASK ||
+ hw_err_src_reg & HCLGE_RAS_REG_ROCEE_ERR_MASK)
return HCLGE_VECTOR0_EVENT_ERR;
- }
/* check for vector0 mailbox(=CMDQ RX) event source */
if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
@@ -3354,9 +3356,8 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
/* print other vector0 event source */
dev_info(&hdev->pdev->dev,
- "CMDQ INT status:0x%x, other INT status:0x%x\n",
- cmdq_src_reg, msix_src_reg);
- *clearval = msix_src_reg;
+ "INT status: CMDQ(%#x) HW errors(%#x) other(%#x)\n",
+ cmdq_src_reg, hw_err_src_reg, msix_src_reg);
return HCLGE_VECTOR0_EVENT_OTHER;
}
@@ -3427,15 +3428,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
hclge_clear_event_cause(hdev, event_cause, clearval);
- /* Enable interrupt if it is not cause by reset. And when
- * clearval equal to 0, it means interrupt status may be
- * cleared by hardware before driver reads status register.
- * For this case, vector0 interrupt also should be enabled.
- */
- if (!clearval ||
- event_cause == HCLGE_VECTOR0_EVENT_MBX) {
+ /* Enable interrupt if it is not caused by reset event or error event */
+ if (event_cause == HCLGE_VECTOR0_EVENT_MBX ||
+ event_cause == HCLGE_VECTOR0_EVENT_OTHER)
hclge_enable_vector(&hdev->misc_vector, true);
- }
return IRQ_HANDLED;
}
@@ -4244,22 +4240,27 @@ static void hclge_misc_err_recovery(struct hclge_dev *hdev)
{
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
struct device *dev = &hdev->pdev->dev;
+ enum hnae3_reset_type reset_type;
u32 msix_sts_reg;
msix_sts_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
-
if (msix_sts_reg & HCLGE_VECTOR0_REG_MSIX_MASK) {
- if (hclge_handle_hw_msix_error(hdev,
- &hdev->default_reset_request))
+ if (hclge_handle_hw_msix_error
+ (hdev, &hdev->default_reset_request))
dev_info(dev, "received msix interrupt 0x%x\n",
msix_sts_reg);
+ }
+ hclge_enable_vector(&hdev->misc_vector, true);
- if (hdev->default_reset_request)
- if (ae_dev->ops->reset_event)
- ae_dev->ops->reset_event(hdev->pdev, NULL);
+ hclge_handle_hw_ras_error(ae_dev);
+ if (ae_dev->hw_err_reset_req) {
+ reset_type = hclge_get_reset_level(ae_dev,
+ &ae_dev->hw_err_reset_req);
+ hclge_set_def_reset_request(ae_dev, reset_type);
}
- hclge_enable_vector(&hdev->misc_vector, true);
+ if (hdev->default_reset_request && ae_dev->ops->reset_event)
+ ae_dev->ops->reset_event(hdev->pdev, NULL);
}
static void hclge_errhand_service_task(struct hclge_dev *hdev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 9b8abb5d7a8e..582972a6f60e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -190,6 +190,7 @@ enum HLCGE_PORT_TYPE {
#define HCLGE_VECTOR0_IMP_RESET_INT_B 1
#define HCLGE_VECTOR0_IMP_CMDQ_ERR_B 4U
#define HCLGE_VECTOR0_IMP_RD_POISON_B 5U
+#define HCLGE_VECTOR0_ALL_MSIX_ERR_B 6U
#define HCLGE_MAC_DEFAULT_FRAME \
(ETH_HLEN + ETH_FCS_LEN + 2 * VLAN_HLEN + ETH_DATA_LEN)
--
2.8.1