2023-07-24 09:06:55

by Mahesh Salgaonkar

Subject: [PATCH v7 1/2] powerpc/rtas: Rename rtas_error_rc to rtas_generic_errno

The rtas_generic_errno() function converts the generic RTAS return codes
into errnos. Also, #define descriptive names for the RTAS return codes and
use them instead of the numeric values.

Signed-off-by: Mahesh Salgaonkar <[email protected]>
---

- Until v6 there was only one patch, with the subject "PCI hotplug: rpaphp:
Error out on busy status from get-sensor-state". Starting from v7, this new
patch is added to introduce rtas_generic_errno() to handle generic RTAS
error codes.
https://lore.kernel.org/all/20220429162545.GA79541@bhelgaas/
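
Illustrative aside, not part of the patch: a stand-alone user-space sketch
of the rc -> errno mapping this patch centralizes. The #define values mirror
the ones added to asm/rtas.h in the diff below; everything else here is
demonstration scaffolding only.

	#include <errno.h>
	#include <stdio.h>

	#define RTAS_HARDWARE_ERROR		(-1)
	#define RTAS_INVALID_PARAMETER		(-3)
	#define RTAS_SLOT_UNISOLATED		(-9000)
	#define RTAS_SLOT_NOT_UNISOLATED	(-9001)
	#define RTAS_SLOT_NOT_USABLE		(-9002)

	/* Same shape as the kernel's rtas_generic_errno() in the diff below */
	static int rtas_generic_errno(int rtas_rc)
	{
		switch (rtas_rc) {
		case RTAS_HARDWARE_ERROR:	return -EIO;
		case RTAS_INVALID_PARAMETER:	return -EINVAL;
		case RTAS_SLOT_UNISOLATED:	return -EFAULT;
		case RTAS_SLOT_NOT_UNISOLATED:	return -EEXIST;
		case RTAS_SLOT_NOT_USABLE:	return -ENODEV;
		default:			return -ERANGE;
		}
	}

	int main(void)
	{
		/* e.g. an unusable slot maps to -ENODEV (-19 on Linux) */
		printf("%d -> %d\n", RTAS_SLOT_NOT_USABLE,
		       rtas_generic_errno(RTAS_SLOT_NOT_USABLE));
		return 0;
	}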
---
arch/powerpc/include/asm/rtas.h | 10 +++++++
arch/powerpc/kernel/rtas.c | 53 ++++++++++++++++++++-------------------
2 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 3abe15ac79db1..5572a0a2f6e18 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -202,7 +202,9 @@ typedef struct {
#define RTAS_USER_REGION_SIZE (64 * 1024)

/* RTAS return status codes */
-#define RTAS_BUSY -2 /* RTAS Busy */
+#define RTAS_HARDWARE_ERROR (-1) /* Hardware Error */
+#define RTAS_BUSY (-2) /* RTAS Busy */
+#define RTAS_INVALID_PARAMETER (-3) /* Invalid indicator/domain/sensor etc. */
#define RTAS_EXTENDED_DELAY_MIN 9900
#define RTAS_EXTENDED_DELAY_MAX 9905

@@ -212,6 +214,11 @@ typedef struct {
#define RTAS_THREADS_ACTIVE -9005 /* Multiple processor threads active */
#define RTAS_OUTSTANDING_COPROC -9006 /* Outstanding coprocessor operations */

+/* statuses specific to get-sensor-state */
+#define RTAS_SLOT_UNISOLATED (-9000)
+#define RTAS_SLOT_NOT_UNISOLATED (-9001)
+#define RTAS_SLOT_NOT_USABLE (-9002)
+
/* RTAS event classes */
#define RTAS_INTERNAL_ERROR 0x80000000 /* set bit 0 */
#define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */
@@ -425,6 +432,7 @@ extern int rtas_set_indicator(int indicator, int index, int new_value);
extern int rtas_set_indicator_fast(int indicator, int index, int new_value);
extern void rtas_progress(char *s, unsigned short hex);
int rtas_ibm_suspend_me(int *fw_status);
+int rtas_generic_errno(int rtas_rc);

struct rtc_time;
extern time64_t rtas_get_boot_time(void);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index c087eeee320ff..80b6099e8ce20 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1330,33 +1330,34 @@ bool __ref rtas_busy_delay(int status)
}
EXPORT_SYMBOL_GPL(rtas_busy_delay);

-static int rtas_error_rc(int rtas_rc)
+int rtas_generic_errno(int rtas_rc)
{
int rc;

switch (rtas_rc) {
- case -1: /* Hardware Error */
- rc = -EIO;
- break;
- case -3: /* Bad indicator/domain/etc */
- rc = -EINVAL;
- break;
- case -9000: /* Isolation error */
- rc = -EFAULT;
- break;
- case -9001: /* Outstanding TCE/PTE */
- rc = -EEXIST;
- break;
- case -9002: /* No usable slot */
- rc = -ENODEV;
- break;
- default:
- pr_err("%s: unexpected error %d\n", __func__, rtas_rc);
- rc = -ERANGE;
- break;
+ case RTAS_HARDWARE_ERROR: /* Hardware Error */
+ rc = -EIO;
+ break;
+ case RTAS_INVALID_PARAMETER: /* Bad indicator/domain/etc */
+ rc = -EINVAL;
+ break;
+ case RTAS_SLOT_UNISOLATED: /* Isolation error */
+ rc = -EFAULT;
+ break;
+ case RTAS_SLOT_NOT_UNISOLATED: /* Outstanding TCE/PTE */
+ rc = -EEXIST;
+ break;
+ case RTAS_SLOT_NOT_USABLE: /* No usable slot */
+ rc = -ENODEV;
+ break;
+ default:
+ pr_err("%s: unexpected error %d\n", __func__, rtas_rc);
+ rc = -ERANGE;
+ break;
}
return rc;
}
+EXPORT_SYMBOL(rtas_generic_errno);

int rtas_get_power_level(int powerdomain, int *level)
{
@@ -1370,7 +1371,7 @@ int rtas_get_power_level(int powerdomain, int *level)
udelay(1);

if (rc < 0)
- return rtas_error_rc(rc);
+ return rtas_generic_errno(rc);
return rc;
}
EXPORT_SYMBOL_GPL(rtas_get_power_level);
@@ -1388,7 +1389,7 @@ int rtas_set_power_level(int powerdomain, int level, int *setlevel)
} while (rtas_busy_delay(rc));

if (rc < 0)
- return rtas_error_rc(rc);
+ return rtas_generic_errno(rc);
return rc;
}
EXPORT_SYMBOL_GPL(rtas_set_power_level);
@@ -1406,7 +1407,7 @@ int rtas_get_sensor(int sensor, int index, int *state)
} while (rtas_busy_delay(rc));

if (rc < 0)
- return rtas_error_rc(rc);
+ return rtas_generic_errno(rc);
return rc;
}
EXPORT_SYMBOL_GPL(rtas_get_sensor);
@@ -1424,7 +1425,7 @@ int rtas_get_sensor_fast(int sensor, int index, int *state)
rc <= RTAS_EXTENDED_DELAY_MAX));

if (rc < 0)
- return rtas_error_rc(rc);
+ return rtas_generic_errno(rc);
return rc;
}

@@ -1466,7 +1467,7 @@ int rtas_set_indicator(int indicator, int index, int new_value)
} while (rtas_busy_delay(rc));

if (rc < 0)
- return rtas_error_rc(rc);
+ return rtas_generic_errno(rc);
return rc;
}
EXPORT_SYMBOL_GPL(rtas_set_indicator);
@@ -1488,7 +1489,7 @@ int rtas_set_indicator_fast(int indicator, int index, int new_value)
rc <= RTAS_EXTENDED_DELAY_MAX));

if (rc < 0)
- return rtas_error_rc(rc);
+ return rtas_generic_errno(rc);

return rc;
}




2023-07-24 09:18:51

by Mahesh Salgaonkar

Subject: [PATCH v7 2/2] PCI: rpaphp: Error out on busy status from get-sensor-state

When a certain PHB HW failure causes pHyp to recover the PHB, it marks the
PE state as temporarily unavailable until recovery is complete. This also
triggers an EEH handler in Linux, which needs to notify drivers and perform
recovery. But before notifying the driver about the PCI error, it uses the
get_adapter_state()->get-sensor-state() operation of the hotplug_slot to
determine if the slot contains a device or not. if the slot is empty, the
recovery is skipped entirely.

However on certain PHB failures, the RTAS call get-sensor-state() returns
extended busy error (9902) until PHB is recovered by pHyp. Once PHB is
recovered, the get-sensor-state() returns success with correct presence
status. The RTAS call interface rtas_get_sensor() loops over the RTAS call
on extended delay return code (9902) until the return value is either
success (0) or error (-1). This causes the EEH handler to get stuck for ~6
seconds before it could notify that the PCI error has been detected and
stop any active operations. Hence, with I/O traffic running, during these 6
seconds the network driver continues its operation and hits a timeout
(netdev watchdog).

------------
[52732.244731] DEBUG: ibm_read_slot_reset_state2()
[52732.244762] DEBUG: ret = 0, rets[0]=5, rets[1]=1, rets[2]=4000, rets[3]=>
[52732.244798] DEBUG: in eeh_slot_presence_check
[52732.244804] DEBUG: error state check
[52732.244807] DEBUG: Is slot hotpluggable
[52732.244810] DEBUG: hotpluggable ops ?
[52732.244953] DEBUG: Calling ops->get_adapter_status
[52732.244958] DEBUG: calling rpaphp_get_sensor_state
[52736.564262] ------------[ cut here ]------------
[52736.564299] NETDEV WATCHDOG: enP64p1s0f3 (tg3): transmit queue 0 timed o>
[52736.564324] WARNING: CPU: 1442 PID: 0 at net/sched/sch_generic.c:478 dev>
[...]
[52736.564505] NIP [c000000000c32368] dev_watchdog+0x438/0x440
[52736.564513] LR [c000000000c32364] dev_watchdog+0x434/0x440
------------
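
For context on the stall: rtas_get_sensor() retries the RTAS call for as
long as firmware keeps returning busy/extended-delay codes. A simplified
sketch of its shape (the converted tail is visible in patch 1 of this
series; the rest is paraphrased from arch/powerpc/kernel/rtas.c):

	int rtas_get_sensor(int sensor, int index, int *state)
	{
		int token = rtas_token("get-sensor-state");
		int rc;

		if (token == RTAS_UNKNOWN_SERVICE)
			return -ENOENT;

		/* Loops, sleeping as instructed, while firmware returns
		 * RTAS_BUSY or an extended delay code such as 9902. */
		do {
			rc = rtas_call(token, 2, 2, state, sensor, index);
		} while (rtas_busy_delay(rc));

		if (rc < 0)
			return rtas_generic_errno(rc);
		return rc;
	}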

On timeouts, the network driver starts dumping debug information to the
console (e.g. the bnx2x driver calls bnx2x_panic_dump()) and goes into its
recovery path while pHyp is still recovering the PHB. As part of recovery,
the driver tries to reset the device, and it keeps failing since every PCI
read/write returns ff's. And when EEH recovery kicks in, the driver is
unable to recover the device. This impacts the ssh connection and leads to
the system being inaccessible. To get the NIC working again, a reboot or
re-assignment of the I/O adapter from the HMC is needed.

[ 9531.168587] EEH: Beginning: 'slot_reset'
[ 9531.168601] PCI 0013:01:00.0#10000: EEH: Invoking bnx2x->slot_reset()
[...]
[ 9614.110094] bnx2x: [bnx2x_func_stop:9129(enP19p1s0f0)]FUNC_STOP ramrod failed. Running a dry transaction
[ 9614.110300] bnx2x: [bnx2x_igu_int_disable:902(enP19p1s0f0)]BUG! Proper val not read from IGU!
[ 9629.178067] bnx2x: [bnx2x_fw_command:3055(enP19p1s0f0)]FW failed to respond!
[ 9629.178085] bnx2x 0013:01:00.0 enP19p1s0f0: bc 7.10.4
[ 9629.178091] bnx2x: [bnx2x_fw_dump_lvl:789(enP19p1s0f0)]Cannot dump MCP info while in PCI error
[ 9644.241813] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f0)]IO slot reset --> driver unload
[...]
[ 9644.241819] PCI 0013:01:00.0#10000: EEH: bnx2x driver reports: 'disconnect'
[ 9644.241823] PCI 0013:01:00.1#10000: EEH: Invoking bnx2x->slot_reset()
[ 9644.241827] bnx2x: [bnx2x_io_slot_reset:14229(enP19p1s0f1)]IO slot reset initializing...
[ 9644.241916] bnx2x 0013:01:00.1: enabling device (0140 -> 0142)
[ 9644.258604] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f1)]IO slot reset --> driver unload
[ 9644.258612] PCI 0013:01:00.1#10000: EEH: bnx2x driver reports: 'disconnect'
[ 9644.258615] EEH: Finished:'slot_reset' with aggregate recovery state:'disconnect'
[ 9644.258620] EEH: Unable to recover from failure from PHB#13-PE#10000.
[ 9644.261811] EEH: Beginning: 'error_detected(permanent failure)'
[...]
[ 9644.261823] EEH: Finished:'error_detected(permanent failure)'

Hence, it becomes important to inform the driver about the PCI error
detection as early as possible, so that the driver is aware of the PCI
error and waits for the EEH handler's next action for successful recovery.

Current implementation uses rtas_get_sensor() API which blocks the slot
check state until RTAS call returns success. To avoid this, fix the PCI
hotplug driver (rpaphp) to return an error (-EBUSY) if the slot presence
state can not be detected immediately while PE is in EEH recovery state.
Change rpaphp_get_sensor_state() to invoke rtas_call(get-sensor-state)
directly only if the respective PE is in EEH recovery state, and take
actions based on RTAS return status. This way EEH handler will not be
blocked on rpaphp_get_sensor_state() and can immediately notify driver
about the PCI error and stop any active operations.

In normal cases (non-EEH case) rpaphp_get_sensor_state() will continue to
invoke rtas_get_sensor() as it was earlier with no change in existing
behavior.
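
Condensed, the helper added below behaves like this (an elided excerpt;
the full version is in the diff that follows):

	static int __rpaphp_get_sensor_state(struct slot *slot, int *state)
	{
		...
		pe = eeh_dev_to_pe(pdn->edev);
		if (pe && (pe->state & EEH_PE_RECOVERING)) {
			/* PE under EEH recovery: one non-blocking RTAS call;
			 * busy/990x codes are mapped to -EBUSY. */
			rc = rtas_call(token, 2, 2, state, DR_ENTITY_SENSE,
				       slot->index);
			return rtas_get_sensor_errno(rc);
		}
		...
		/* Normal case: existing blocking behavior is unchanged. */
		return rtas_get_sensor(DR_ENTITY_SENSE, slot->index, state);
	}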

Signed-off-by: Mahesh Salgaonkar <[email protected]>
Reviewed-by: Nathan Lynch <[email protected]>
---
Change in v7:
- Modified patch description to explain the effect of the timeout on NIC
functioning.
- Fix a few nits requested in previous review at
https://lore.kernel.org/all/[email protected]/
- Add additional patch before this to introduce rtas_generic_errno() to
handle generic rtas error codes.
https://lore.kernel.org/all/20220429162545.GA79541@bhelgaas/

Change in v6:
- Fixed typos in the patch description as per review comments.

Change in v5:
- Fixup #define macros with parentheses around the values.

Change in V4:
- Error out on sensor busy only if the PE is going through EEH recovery,
instead of always erroring out.

Change in V3:
- Invoke rtas_call(get-sensor-state) directly from
rpaphp_get_sensor_state() and do special handling.
- See v2 at
https://lore.kernel.org/all/163817631601.2016996.16085383012429651821.stgit@jupiter/

Change in V2:
- Alternate approach to fix the EEH issue instead of delaying slot presence
check proposed at
https://lore.kernel.org/all/163767273634.1368569.7327743414665595326.stgit@jupiter/

Also refer:
https://lore.kernel.org/all/[email protected]/
---
drivers/pci/hotplug/rpaphp_pci.c | 82 +++++++++++++++++++++++++++++++++++++-
1 file changed, 79 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c
index 630f77057c23d..c8a4699d1f029 100644
--- a/drivers/pci/hotplug/rpaphp_pci.c
+++ b/drivers/pci/hotplug/rpaphp_pci.c
@@ -19,12 +19,89 @@
#include "../pci.h" /* for pci_add_new_bus */
#include "rpaphp.h"

+/*
+ * RTAS call get-sensor-state(DR_ENTITY_SENSE) return values as per PAPR:
+ * -- generic return codes ---
+ * -1: Hardware Error
+ * -2: RTAS_BUSY
+ * -3: Invalid sensor. RTAS Parameter Error.
+ * -- rtas_get_sensor function specific return codes ---
+ * -9000: Need DR entity to be powered up and unisolated before RTAS call
+ * -9001: Need DR entity to be powered up, but not unisolated, before RTAS call
+ * -9002: DR entity unusable
+ * 990x: Extended delay - where x is a number in the range of 0-5
+ */
+static int rtas_get_sensor_errno(int rtas_rc)
+{
+ switch (rtas_rc) {
+ case 0:
+ /* Success case */
+ return 0;
+ case RTAS_SLOT_UNISOLATED:
+ case RTAS_SLOT_NOT_UNISOLATED:
+ return -EFAULT;
+ case RTAS_SLOT_NOT_USABLE:
+ return -ENODEV;
+ case RTAS_BUSY:
+ case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX:
+ return -EBUSY;
+ default:
+ return rtas_generic_errno(rtas_rc);
+ }
+}
+
+/*
+ * get_adapter_status() can be called by the EEH handler during EEH recovery.
+ * On certain PHB failures, the RTAS call get-seHsor-state() returns extended
+ * busy error (9902) until PHB is recovered by pHyp. The RTAS call interface
+ * rtas_get_sensor() loops over the RTAS call on extended delay return code
+ * (9902) until the return value is either success (0) or error (-1). This
+ * causes the EEH handler to get stuck for ~6 seconds before it could notify
+ * that the PCI error has been detected and stop any active operations. This
+ * sometimes causes EEH recovery to fail. To avoid this issue, invoke
+ * rtas_call(get-sensor-state) directly if the respective PE is in EEH recovery
+ * state and return -EBUSY error based on RTAS return status. This will help
+ * the EEH handler to notify the driver about the PCI error immediately and
+ * successfully proceed with EEH recovery steps.
+ */
+static int __rpaphp_get_sensor_state(struct slot *slot, int *state)
+{
+#ifdef CONFIG_EEH
+ int rc;
+ int token = rtas_token("get-sensor-state");
+ struct pci_dn *pdn;
+ struct eeh_pe *pe;
+ struct pci_controller *phb = PCI_DN(slot->dn)->phb;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ return -ENOENT;
+
+ /*
+ * Fallback to existing method for empty slot or PE isn't in EEH
+ * recovery.
+ */
+ pdn = list_first_entry_or_null(&PCI_DN(phb->dn)->child_list,
+ struct pci_dn, list);
+ if (!pdn)
+ goto fallback;
+
+ pe = eeh_dev_to_pe(pdn->edev);
+ if (pe && (pe->state & EEH_PE_RECOVERING)) {
+ rc = rtas_call(token, 2, 2, state, DR_ENTITY_SENSE,
+ slot->index);
+ return rtas_get_sensor_errno(rc);
+ }
+fallback:
+#endif
+ return rtas_get_sensor(DR_ENTITY_SENSE, slot->index, state);
+}
+
int rpaphp_get_sensor_state(struct slot *slot, int *state)
{
int rc;
int setlevel;

- rc = rtas_get_sensor(DR_ENTITY_SENSE, slot->index, state);
+ rc = __rpaphp_get_sensor_state(slot, state);

if (rc < 0) {
if (rc == -EFAULT || rc == -EEXIST) {
@@ -40,8 +117,7 @@ int rpaphp_get_sensor_state(struct slot *slot, int *state)
dbg("%s: power on slot[%s] failed rc=%d.\n",
__func__, slot->name, rc);
} else {
- rc = rtas_get_sensor(DR_ENTITY_SENSE,
- slot->index, state);
+ rc = __rpaphp_get_sensor_state(slot, state);
}
} else if (rc == -ENODEV)
info("%s: slot is unusable\n", __func__);



2023-08-01 22:06:12

by Bjorn Helgaas

Subject: Re: [PATCH v7 2/2] PCI: rpaphp: Error out on busy status from get-sensor-state

On Mon, Jul 24, 2023 at 02:25:19PM +0530, Mahesh Salgaonkar wrote:
> When a certain PHB HW failure causes pHyp to recover the PHB, it marks the
> PE state as temporarily unavailable until recovery is complete. This also
> triggers an EEH handler in Linux, which needs to notify drivers and perform
> recovery. But before notifying the driver about the PCI error, it uses the
> get_adapter_state()->get-sensor-state() operation of the hotplug_slot to
> determine if the slot contains a device or not. if the slot is empty, the
> recovery is skipped entirely.

It's helpful to use the exact function name so it's greppable; I think
get_adapter_status() or rpaphp_get_sensor_state()?

s/if the slot is empty,/If the slot is empty,/

> However on certain PHB failures, the RTAS call get-sensor-state() returns
> extended busy error (9902) until PHB is recovered by pHyp. Once PHB is
> recovered, the get-sensor-state() returns success with correct presence
> status. The RTAS call interface rtas_get_sensor() loops over the RTAS call
> on extended delay return code (9902) until the return value is either
> success (0) or error (-1). This causes the EEH handler to get stuck for ~6
> seconds before it could notify that the PCI error has been detected and
> stop any active operations. Hence, with I/O traffic running, during these 6
> seconds the network driver continues its operation and hits a timeout
> (netdev watchdog).
>
> ------------
> [52732.244731] DEBUG: ibm_read_slot_reset_state2()
> [52732.244762] DEBUG: ret = 0, rets[0]=5, rets[1]=1, rets[2]=4000, rets[3]=>
> [52732.244798] DEBUG: in eeh_slot_presence_check
> [52732.244804] DEBUG: error state check
> [52732.244807] DEBUG: Is slot hotpluggable
> [52732.244810] DEBUG: hotpluggable ops ?
> [52732.244953] DEBUG: Calling ops->get_adapter_status
> [52732.244958] DEBUG: calling rpaphp_get_sensor_state
> [52736.564262] ------------[ cut here ]------------
> [52736.564299] NETDEV WATCHDOG: enP64p1s0f3 (tg3): transmit queue 0 timed o>
> [52736.564324] WARNING: CPU: 1442 PID: 0 at net/sched/sch_generic.c:478 dev>
> [...]
> [52736.564505] NIP [c000000000c32368] dev_watchdog+0x438/0x440
> [52736.564513] LR [c000000000c32364] dev_watchdog+0x434/0x440
> ------------
>
> On timeouts, the network driver starts dumping debug information to the
> console (e.g. the bnx2x driver calls bnx2x_panic_dump()) and goes into its
> recovery path while pHyp is still recovering the PHB. As part of recovery,
> the driver tries to reset the device, and it keeps failing since every PCI
> read/write returns ff's. And when EEH recovery kicks in, the driver is
> unable to recover the device. This impacts the ssh connection and leads to
> the system being inaccessible. To get the NIC working again, a reboot or
> re-assignment of the I/O adapter from the HMC is needed.
>
> [ 9531.168587] EEH: Beginning: 'slot_reset'
> [ 9531.168601] PCI 0013:01:00.0#10000: EEH: Invoking bnx2x->slot_reset()
> [...]
> [ 9614.110094] bnx2x: [bnx2x_func_stop:9129(enP19p1s0f0)]FUNC_STOP ramrod failed. Running a dry transaction
> [ 9614.110300] bnx2x: [bnx2x_igu_int_disable:902(enP19p1s0f0)]BUG! Proper val not read from IGU!
> [ 9629.178067] bnx2x: [bnx2x_fw_command:3055(enP19p1s0f0)]FW failed to respond!
> [ 9629.178085] bnx2x 0013:01:00.0 enP19p1s0f0: bc 7.10.4
> [ 9629.178091] bnx2x: [bnx2x_fw_dump_lvl:789(enP19p1s0f0)]Cannot dump MCP info while in PCI error
> [ 9644.241813] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f0)]IO slot reset --> driver unload
> [...]
> [ 9644.241819] PCI 0013:01:00.0#10000: EEH: bnx2x driver reports: 'disconnect'
> [ 9644.241823] PCI 0013:01:00.1#10000: EEH: Invoking bnx2x->slot_reset()
> [ 9644.241827] bnx2x: [bnx2x_io_slot_reset:14229(enP19p1s0f1)]IO slot reset initializing...
> [ 9644.241916] bnx2x 0013:01:00.1: enabling device (0140 -> 0142)
> [ 9644.258604] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f1)]IO slot reset --> driver unload
> [ 9644.258612] PCI 0013:01:00.1#10000: EEH: bnx2x driver reports: 'disconnect'
> [ 9644.258615] EEH: Finished:'slot_reset' with aggregate recovery state:'disconnect'
> [ 9644.258620] EEH: Unable to recover from failure from PHB#13-PE#10000.
> [ 9644.261811] EEH: Beginning: 'error_detected(permanent failure)'
> [...]
> [ 9644.261823] EEH: Finished:'error_detected(permanent failure)'
>
> Hence, it becomes important to inform the driver about the PCI error
> detection as early as possible, so that the driver is aware of the PCI
> error and waits for the EEH handler's next action for successful recovery.

I don't really understand the connection between EEH and
get_adapter_status(), but I guess this probably refers to
arch/powerpc/kernel/eeh_driver.c, not the PCI core aer.c and err.c?

> Current implementation uses rtas_get_sensor() API which blocks the slot
> check state until RTAS call returns success. To avoid this, fix the PCI
> hotplug driver (rpaphp) to return an error (-EBUSY) if the slot presence
> state can not be detected immediately while PE is in EEH recovery state.
> Change rpaphp_get_sensor_state() to invoke rtas_call(get-sensor-state)
> directly only if the respective PE is in EEH recovery state, and take
> actions based on RTAS return status. This way EEH handler will not be
> blocked on rpaphp_get_sensor_state() and can immediately notify driver
> about the PCI error and stop any active operations.
>
> In normal cases (non-EEH case) rpaphp_get_sensor_state() will continue to
> invoke rtas_get_sensor() as it was earlier with no change in existing
> behavior.
>
> Signed-off-by: Mahesh Salgaonkar <[email protected]>
> Reviewed-by: Nathan Lynch <[email protected]>

Seems like maybe both patches could go via a ppc tree since they seem
very ppc-specific? A couple minor comments below.

Acked-by: Bjorn Helgaas <[email protected]>

> + * get_adapter_status() can be called by the EEH handler during EEH recovery.
> + * On certain PHB failures, the RTAS call get-seHsor-state() returns extended

Looks like a typo in "get-seHsor-state"?

> +static int __rpaphp_get_sensor_state(struct slot *slot, int *state)
> +{
> +#ifdef CONFIG_EEH

Is this #ifdef redundant? It looks like this file is only compiled
if CONFIG_EEH is set:

config HOTPLUG_PCI_RPA
	tristate "RPA PCI Hotplug driver"
	depends on PPC_PSERIES && EEH

obj-$(CONFIG_HOTPLUG_PCI_RPA)	+= rpaphp.o

rpaphp-objs	:=	rpaphp_core.o	\
			rpaphp_pci.o	\
			rpaphp_slot.o

> + int rc;
> + int token = rtas_token("get-sensor-state");
> + struct pci_dn *pdn;
> + struct eeh_pe *pe;
> + struct pci_controller *phb = PCI_DN(slot->dn)->phb;
> +
> + if (token == RTAS_UNKNOWN_SERVICE)
> + return -ENOENT;
> +
> + /*
> + * Fallback to existing method for empty slot or PE isn't in EEH
> + * recovery.
> + */
> + pdn = list_first_entry_or_null(&PCI_DN(phb->dn)->child_list,
> + struct pci_dn, list);
> + if (!pdn)
> + goto fallback;
> +
> + pe = eeh_dev_to_pe(pdn->edev);
> + if (pe && (pe->state & EEH_PE_RECOVERING)) {
> + rc = rtas_call(token, 2, 2, state, DR_ENTITY_SENSE,
> + slot->index);
> + return rtas_get_sensor_errno(rc);
> + }
> +fallback:
> +#endif
> + return rtas_get_sensor(DR_ENTITY_SENSE, slot->index, state);
> +}
> +
> int rpaphp_get_sensor_state(struct slot *slot, int *state)
> {
> int rc;
> int setlevel;
>
> - rc = rtas_get_sensor(DR_ENTITY_SENSE, slot->index, state);
> + rc = __rpaphp_get_sensor_state(slot, state);
>
> if (rc < 0) {
> if (rc == -EFAULT || rc == -EEXIST) {
> @@ -40,8 +117,7 @@ int rpaphp_get_sensor_state(struct slot *slot, int *state)
> dbg("%s: power on slot[%s] failed rc=%d.\n",
> __func__, slot->name, rc);
> } else {
> - rc = rtas_get_sensor(DR_ENTITY_SENSE,
> - slot->index, state);
> + rc = __rpaphp_get_sensor_state(slot, state);
> }
> } else if (rc == -ENODEV)
> info("%s: slot is unusable\n", __func__);
>
>

2023-08-02 17:46:23

by Mahesh Salgaonkar

Subject: Re: [PATCH v7 2/2] PCI: rpaphp: Error out on busy status from get-sensor-state

On 2023-08-01 16:38:08 Tue, Bjorn Helgaas wrote:
> On Mon, Jul 24, 2023 at 02:25:19PM +0530, Mahesh Salgaonkar wrote:
> > When a certain PHB HW failure causes pHyp to recover the PHB, it marks the
> > PE state as temporarily unavailable until recovery is complete. This also
> > triggers an EEH handler in Linux, which needs to notify drivers and perform
> > recovery. But before notifying the driver about the PCI error, it uses the
> > get_adapter_state()->get-sensor-state() operation of the hotplug_slot to
> > determine if the slot contains a device or not. if the slot is empty, the
> > recovery is skipped entirely.
>
> It's helpful to use the exact function name so it's greppable; I think
> get_adapter_status() or rpaphp_get_sensor_state()?
>
> s/if the slot is empty,/If the slot is empty,/

Sure, will correct it in next revision.

>
> > However on certain PHB failures, the RTAS call get-sensor-state() returns
> > extended busy error (9902) until PHB is recovered by pHyp. Once PHB is
> > recovered, the get-sensor-state() returns success with correct presence
> > status. The RTAS call interface rtas_get_sensor() loops over the RTAS call
> > on extended delay return code (9902) until the return value is either
> > success (0) or error (-1). This causes the EEH handler to get stuck for ~6
> > seconds before it could notify that the PCI error has been detected and
> > stop any active operations. Hence, with I/O traffic running, during these 6
> > seconds the network driver continues its operation and hits a timeout
> > (netdev watchdog).
> >
> > ------------
> > [52732.244731] DEBUG: ibm_read_slot_reset_state2()
> > [52732.244762] DEBUG: ret = 0, rets[0]=5, rets[1]=1, rets[2]=4000, rets[3]=>
> > [52732.244798] DEBUG: in eeh_slot_presence_check
> > [52732.244804] DEBUG: error state check
> > [52732.244807] DEBUG: Is slot hotpluggable
> > [52732.244810] DEBUG: hotpluggable ops ?
> > [52732.244953] DEBUG: Calling ops->get_adapter_status
> > [52732.244958] DEBUG: calling rpaphp_get_sensor_state
> > [52736.564262] ------------[ cut here ]------------
> > [52736.564299] NETDEV WATCHDOG: enP64p1s0f3 (tg3): transmit queue 0 timed o>
> > [52736.564324] WARNING: CPU: 1442 PID: 0 at net/sched/sch_generic.c:478 dev>
> > [...]
> > [52736.564505] NIP [c000000000c32368] dev_watchdog+0x438/0x440
> > [52736.564513] LR [c000000000c32364] dev_watchdog+0x434/0x440
> > ------------
> >
> > On timeouts, the network driver starts dumping debug information to the
> > console (e.g. the bnx2x driver calls bnx2x_panic_dump()) and goes into its
> > recovery path while pHyp is still recovering the PHB. As part of recovery,
> > the driver tries to reset the device, and it keeps failing since every PCI
> > read/write returns ff's. And when EEH recovery kicks in, the driver is
> > unable to recover the device. This impacts the ssh connection and leads to
> > the system being inaccessible. To get the NIC working again, a reboot or
> > re-assignment of the I/O adapter from the HMC is needed.
> >
> > [ 9531.168587] EEH: Beginning: 'slot_reset'
> > [ 9531.168601] PCI 0013:01:00.0#10000: EEH: Invoking bnx2x->slot_reset()
> > [...]
> > [ 9614.110094] bnx2x: [bnx2x_func_stop:9129(enP19p1s0f0)]FUNC_STOP ramrod failed. Running a dry transaction
> > [ 9614.110300] bnx2x: [bnx2x_igu_int_disable:902(enP19p1s0f0)]BUG! Proper val not read from IGU!
> > [ 9629.178067] bnx2x: [bnx2x_fw_command:3055(enP19p1s0f0)]FW failed to respond!
> > [ 9629.178085] bnx2x 0013:01:00.0 enP19p1s0f0: bc 7.10.4
> > [ 9629.178091] bnx2x: [bnx2x_fw_dump_lvl:789(enP19p1s0f0)]Cannot dump MCP info while in PCI error
> > [ 9644.241813] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f0)]IO slot reset --> driver unload
> > [...]
> > [ 9644.241819] PCI 0013:01:00.0#10000: EEH: bnx2x driver reports: 'disconnect'
> > [ 9644.241823] PCI 0013:01:00.1#10000: EEH: Invoking bnx2x->slot_reset()
> > [ 9644.241827] bnx2x: [bnx2x_io_slot_reset:14229(enP19p1s0f1)]IO slot reset initializing...
> > [ 9644.241916] bnx2x 0013:01:00.1: enabling device (0140 -> 0142)
> > [ 9644.258604] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f1)]IO slot reset --> driver unload
> > [ 9644.258612] PCI 0013:01:00.1#10000: EEH: bnx2x driver reports: 'disconnect'
> > [ 9644.258615] EEH: Finished:'slot_reset' with aggregate recovery state:'disconnect'
> > [ 9644.258620] EEH: Unable to recover from failure from PHB#13-PE#10000.
> > [ 9644.261811] EEH: Beginning: 'error_detected(permanent failure)'
> > [...]
> > [ 9644.261823] EEH: Finished:'error_detected(permanent failure)'
> >
> > Hence, it becomes important to inform the driver about the PCI error
> > detection as early as possible, so that the driver is aware of the PCI
> > error and waits for the EEH handler's next action for successful recovery.
>
> I don't really understand the connection between EEH and
> get_adapter_status(), but I guess this probably refers to
> arch/powerpc/kernel/eeh_driver.c, not the PCI core aer.c and err.c?

Yup, EEH is an I/O error recovery mechanism on powerpc. On an error
during I/O operations on the PCI bus, the EEH handler makes sure that the
error is not due to a hotplug remove before proceeding with recovery.
Depending on whether the device is present or hot removed,
get_adapter_status() returns the state as true or false respectively. Only
in certain failure cases does it return a BUSY error.

eeh_event_handler()
  -> eeh_handle_normal_event()
     -> eeh_slot_presence_check()
        -> get_adapter_status(&state)
           skip recovery if state == false
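
In code terms, the consumer is roughly this (paraphrased from
eeh_slot_presence_check() in arch/powerpc/kernel/eeh_driver.c; details
elided, so treat it as a sketch rather than the verbatim function):

	static bool eeh_slot_presence_check(struct pci_dev *pdev)
	{
		u8 state;
		int rc;
		...
		rc = ops->get_adapter_status(slot->hotplug, &state);
		if (rc)
			return true;	/* status unknown: don't skip recovery */

		return !!state;		/* false => slot empty, skip recovery */
	}

So with this patch, an immediate -EBUSY from rpaphp_get_sensor_state()
makes the presence check conclude quickly and lets recovery proceed,
instead of blocking here for ~6 seconds.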

>
> > Current implementation uses rtas_get_sensor() API which blocks the slot
> > check state until RTAS call returns success. To avoid this, fix the PCI
> > hotplug driver (rpaphp) to return an error (-EBUSY) if the slot presence
> > state can not be detected immediately while PE is in EEH recovery state.
> > Change rpaphp_get_sensor_state() to invoke rtas_call(get-sensor-state)
> > directly only if the respective PE is in EEH recovery state, and take
> > actions based on RTAS return status. This way EEH handler will not be
> > blocked on rpaphp_get_sensor_state() and can immediately notify driver
> > about the PCI error and stop any active operations.
> >
> > In normal cases (non-EEH case) rpaphp_get_sensor_state() will continue to
> > invoke rtas_get_sensor() as it was earlier with no change in existing
> > behavior.
> >
> > Signed-off-by: Mahesh Salgaonkar <[email protected]>
> > Reviewed-by: Nathan Lynch <[email protected]>
>
> Seems like maybe both patches could go via a ppc tree since they seem
> very ppc-specific? A couple minor comments below.
>
> Acked-by: Bjorn Helgaas <[email protected]>
>
> > + * get_adapter_status() can be called by the EEH handler during EEH recovery.
> > + * On certain PHB failures, the RTAS call get-seHsor-state() returns extended
>
> Looks like a typo in "get-seHsor-state"?

ouch. Will fix it.

>
> > +static int __rpaphp_get_sensor_state(struct slot *slot, int *state)
> > +{
> > +#ifdef CONFIG_EEH
>
> Is this #ifdef redundant? It looks like this file is only compiled
> if CONFIG_EEH is set:

Thanks for pointing this out. I missed this; yes, it is redundant. Will
remove it.

>
> config HOTPLUG_PCI_RPA
> 	tristate "RPA PCI Hotplug driver"
> 	depends on PPC_PSERIES && EEH
>
> obj-$(CONFIG_HOTPLUG_PCI_RPA)	+= rpaphp.o
>
> rpaphp-objs	:=	rpaphp_core.o	\
> 			rpaphp_pci.o	\
> 			rpaphp_slot.o

Thanks for your review.
-Mahesh