2023-11-15 16:40:24

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 01/10] accel/habanalabs/gaudi2: assume hard-reset by FW upon PCIe AXI drain

From: Tomer Tayar <[email protected]>

When a PCIe AXI drain event happens, it is possible that the driver
cannot access the device through PCIe, and therefore cannot send a
hard-reset request to FW.
Starting from FW version 1.13, FW will initiate a hard-reset in such
a case without waiting for a reset request from the driver.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/habanalabs.h | 8 ++++++++
drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 ++
2 files changed, 10 insertions(+)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 1655c101c705..5c69a482b8de 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3594,6 +3594,14 @@ static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major
return false;
}

+static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major,
+ u32 fw_sw_minor)
+{
+ return (hdev->fw_sw_major_ver > fw_sw_major ||
+ (hdev->fw_sw_major_ver == fw_sw_major &&
+ hdev->fw_sw_minor_ver >= fw_sw_minor));
+}
+
/*
* Kernel module functions that can be accessed by entire module
*/
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 819660c684cf..b739078c2d87 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -10007,6 +10007,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+ if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13))
+ is_critical = true;
break;

case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN:
--
2.34.1


2023-11-15 16:40:32

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 02/10] accel/habanalabs: add log when eq event is not received

From: Farah Kassabri <[email protected]>

Add error log when no eq event is received from FW,
to cover a scenario when FW is stuck for some reason.
In such case driver will not receive neither the eq error interrupt
or the eq heartbeat event, and will just initiate a reset without
indication in the dmesg about the reason.

Signed-off-by: Farah Kassabri <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 9711e8fc979d..d95a981b2906 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1049,10 +1049,12 @@ static void hl_device_eq_heartbeat(struct hl_device *hdev)
if (!prop->cpucp_info.eq_health_check_supported)
return;

- if (hdev->eq_heartbeat_received)
+ if (hdev->eq_heartbeat_received) {
hdev->eq_heartbeat_received = false;
- else
+ } else {
+ dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+ }
}

static void hl_device_heartbeat(struct work_struct *work)
--
2.34.1

2023-11-15 16:40:47

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 04/10] accel/habanalabs: fix EQ heartbeat mechanism

From: Farah Kassabri <[email protected]>

Stop rescheduling another heartbeat check when EQ heartbeat check fails
as it generates confusing logs in dmesg that the heartbeat fails.

Signed-off-by: Farah Kassabri <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index d9447aeb3937..6bf5f1d0d005 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1044,20 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (vendor_id == PCI_VENDOR_ID_HABANALABS);
}

-static void hl_device_eq_heartbeat(struct hl_device *hdev)
+static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
{
- u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
struct asic_fixed_properties *prop = &hdev->asic_prop;

if (!prop->cpucp_info.eq_health_check_supported)
- return;
+ return 0;

if (hdev->eq_heartbeat_received) {
hdev->eq_heartbeat_received = false;
} else {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
- hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+ return -EIO;
}
+
+ return 0;
}

static void hl_device_heartbeat(struct work_struct *work)
@@ -1074,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work)
/*
* For EQ health check need to check if driver received the heartbeat eq event
* in order to validate the eq is working.
+ * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
*/
- hl_device_eq_heartbeat(hdev);
-
- if (!hdev->asic_funcs->send_heartbeat(hdev))
+ if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev)))
goto reschedule;

if (hl_device_operational(hdev, NULL))
--
2.34.1

2023-11-15 16:41:19

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 10/10] accel/habanalabs: expose module id through sysfs

From: Dani Liberman <[email protected]>

Module ID exposes the physical location of the device in the server,
from the pov of the devices in regard to how they are connected by
internal fabric.

This information is already exposed in our INFO ioctl, but there are
utilities and scripts running in data-center which are already
accessing sysfs for topology information and it is easier for them
to continue getting that information from sysfs instead of opening
a file descriptor.

Signed-off-by: Dani Liberman <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
Documentation/ABI/testing/sysfs-driver-habanalabs | 6 ++++++
drivers/accel/habanalabs/common/sysfs.c | 10 ++++++++++
2 files changed, 16 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs
index c63ca1ad500d..89fe3b09d4ad 100644
--- a/Documentation/ABI/testing/sysfs-driver-habanalabs
+++ b/Documentation/ABI/testing/sysfs-driver-habanalabs
@@ -149,6 +149,12 @@ Contact: [email protected]
Description: Displays the current clock frequency, in Hz, of the MME compute
engine. This property is valid only for the Goya ASIC family

+What: /sys/class/accel/accel<n>/device/module_id
+Date: Nov 2023
+KernelVersion: not yet upstreamed
+Contact: [email protected]
+Description: Displays the device's module id
+
What: /sys/class/accel/accel<n>/device/pci_addr
Date: Jan 2019
KernelVersion: 5.1
diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c
index 278606373055..8d2164691d81 100644
--- a/drivers/accel/habanalabs/common/sysfs.c
+++ b/drivers/accel/habanalabs/common/sysfs.c
@@ -386,6 +386,14 @@ static ssize_t security_enabled_show(struct device *dev,
return sprintf(buf, "%d\n", hdev->asic_prop.fw_security_enabled);
}

+static ssize_t module_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hl_device *hdev = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location));
+}
+
static DEVICE_ATTR_RO(armcp_kernel_ver);
static DEVICE_ATTR_RO(armcp_ver);
static DEVICE_ATTR_RO(cpld_ver);
@@ -405,6 +413,7 @@ static DEVICE_ATTR_RO(thermal_ver);
static DEVICE_ATTR_RO(uboot_ver);
static DEVICE_ATTR_RO(fw_os_ver);
static DEVICE_ATTR_RO(security_enabled);
+static DEVICE_ATTR_RO(module_id);

static struct bin_attribute bin_attr_eeprom = {
.attr = {.name = "eeprom", .mode = (0444)},
@@ -430,6 +439,7 @@ static struct attribute *hl_dev_attrs[] = {
&dev_attr_uboot_ver.attr,
&dev_attr_fw_os_ver.attr,
&dev_attr_security_enabled.attr,
+ &dev_attr_module_id.attr,
NULL,
};

--
2.34.1

2023-11-15 16:41:22

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 09/10] accel/habanalabs: print error code when mapping fails

From: Dani Liberman <[email protected]>

Failure to map is considered a non-trivial error and we need to notify
the user about it.

Signed-off-by: Dani Liberman <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/memory.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
index 0b8689fe0b64..3348ad12c237 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -955,8 +955,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
(i + 1) == phys_pg_pack->npages);
if (rc) {
dev_err(hdev->dev,
- "map failed for handle %u, npages: %llu, mapped: %llu",
- phys_pg_pack->handle, phys_pg_pack->npages,
+ "map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",
+ rc, phys_pg_pack->handle, phys_pg_pack->npages,
mapped_pg_cnt);
goto err;
}
@@ -1186,7 +1186,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device

rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
if (rc) {
- dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle);
+ dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",
+ rc, handle);
mutex_unlock(&hdev->mmu_lock);
goto map_err;
}
--
2.34.1