2024-05-27 11:50:04

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 1/9] accel/habanalabs: add device name to error print

From: Dani Liberman <[email protected]>

The extra info improves traceability and makes debugging easier.

Signed-off-by: Dani Liberman <[email protected]>
Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 8f92445c5a90..a381ece25592 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -439,16 +439,19 @@ static void print_idle_status_mask(struct hl_device *hdev, const char *message,
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
{
if (idle_mask[3])
- dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx_%016llx)\n",
- message, idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
+ dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n",
+ dev_name(&hdev->pdev->dev), message,
+ idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
else if (idle_mask[2])
- dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx)\n",
- message, idle_mask[2], idle_mask[1], idle_mask[0]);
+ dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n",
+ dev_name(&hdev->pdev->dev), message,
+ idle_mask[2], idle_mask[1], idle_mask[0]);
else if (idle_mask[1])
- dev_err(hdev->dev, "%s (mask %#llx_%016llx)\n",
- message, idle_mask[1], idle_mask[0]);
+ dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n",
+ dev_name(&hdev->pdev->dev), message, idle_mask[1], idle_mask[0]);
else
- dev_err(hdev->dev, "%s (mask %#llx)\n", message, idle_mask[0]);
+ dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message,
+ idle_mask[0]);
}

static void hpriv_release(struct kref *ref)
--
2.34.1



2024-05-27 11:50:20

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 9/9] accel/habanalabs: add heartbeat debug info

From: Farah Kassabri <[email protected]>

It is hard to debug the reason for heartbeat check failures.
To ease this task, print more information when such a failure happens.
The heartbeat checks the communication with the FW, so printing the
CPU queue PI/CI, the event queue CI and previous entry index, and the
number of heartbeat events received so far helps in debugging the issue.

This patch also increases the heartbeat period from 5 to 10 seconds.

Signed-off-by: Farah Kassabri <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 12 ++++++++++++
drivers/accel/habanalabs/common/habanalabs.h | 15 ++++++++++++++-
drivers/accel/habanalabs/gaudi2/gaudi2.c | 3 +++
3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index bb3f44392908..35502e938b5d 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1052,12 +1052,22 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 cpu_q_id;

if (!prop->cpucp_info.eq_health_check_supported)
return true;

if (!hdev->eq_heartbeat_received) {
+ cpu_q_id = hdev->heartbeat_debug_info.cpu_queue_id;
+
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
+
+ dev_err(hdev->dev, "Heartbeat events counter: %u, Q_PI: %u, Q_CI: %u, EQ CI: %u, EQ prev: %u\n",
+ hdev->heartbeat_debug_info.heartbeat_event_counter,
+ hdev->kernel_queues[cpu_q_id].pi,
+ atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
+ hdev->event_queue.ci,
+ hdev->event_queue.prev_eqe_index);
return false;
}

@@ -1138,6 +1148,8 @@ static int device_late_init(struct hl_device *hdev)
hdev->high_pll = hdev->asic_prop.high_pll;

if (hdev->heartbeat) {
+ hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+
/*
* Before scheduling the heartbeat driver will check if eq event has received.
* for the first schedule we need to set the indication as true then for the next
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 55495861f432..5e9f54ca336a 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -71,7 +71,7 @@ struct hl_fpriv;

#define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */

-#define HL_HEARTBEAT_PER_USEC 5000000 /* 5 s */
+#define HL_HEARTBEAT_PER_USEC 10000000 /* 10 s */

#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */

@@ -3174,6 +3174,16 @@ struct hl_reset_info {
u8 watchdog_active;
};

+/**
+ * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure.
+ * @heartbeat_event_counter: number of heartbeat events received.
+ * @cpu_queue_id: used to read the queue pi/ci
+ */
+struct eq_heartbeat_debug_info {
+ u32 heartbeat_event_counter;
+ u32 cpu_queue_id;
+};
+
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -3262,6 +3272,7 @@ struct hl_reset_info {
* @clk_throttling: holds information about current/previous clock throttling events
* @captured_err_info: holds information about errors.
* @reset_info: holds current device reset information.
+ * @heartbeat_debug_info: counters used to debug heartbeat failures.
* @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling.
* @stream_master_qid_arr: pointer to array with QIDs of master streams.
* @fw_inner_major_ver: the major of current loaded preboot inner version.
@@ -3452,6 +3463,8 @@ struct hl_device {

struct hl_reset_info reset_info;

+ struct eq_heartbeat_debug_info heartbeat_debug_info;
+
cpumask_t irq_affinity_mask;

u32 *stream_master_qid_arr;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 962b7fcd4318..08276f03c80f 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -3796,6 +3796,8 @@ static int gaudi2_sw_init(struct hl_device *hdev)
if (rc)
goto special_blocks_free;

+ hdev->heartbeat_debug_info.cpu_queue_id = GAUDI2_QUEUE_ID_CPU_PQ;
+
return 0;

special_blocks_free:
@@ -9777,6 +9779,7 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)

static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
{
+ hdev->heartbeat_debug_info.heartbeat_event_counter++;
hdev->eq_heartbeat_received = true;
}

--
2.34.1


2024-05-27 11:50:46

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 2/9] accel/habanalabs/gaudi2: update interrupts related headers

From: Farah Kassabri <[email protected]>

Align the interrupt-related headers with the latest release.

Signed-off-by: Farah Kassabri <[email protected]>
Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
.../gaudi2/gaudi2_async_ids_map_extended.h | 94 +++++++++----------
1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
index b2dbe1f64430..1db73923de62 100644
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
@@ -330,9 +330,9 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
{ .fc_id = 149, .cpu_id = 48, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "EDMA7_ECC_SERR" },
{ .fc_id = 150, .cpu_id = 48, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
- .name = "HDMA4_ECC_SERR" },
+ .name = "EDMA4_ECC_SERR" },
{ .fc_id = 151, .cpu_id = 48, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
- .name = "HDMA5_ECC_SERR" },
+ .name = "EDMA5_ECC_SERR" },
{ .fc_id = 152, .cpu_id = 49, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "EDMA2_ECC_DERR" },
{ .fc_id = 153, .cpu_id = 49, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
@@ -965,73 +965,73 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
.name = "MME3_CTRL_AXI_ERROR_RESPONSE" },
{ .fc_id = 467, .cpu_id = 91, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
.name = "MME3_QMAN_SW_ERROR" },
- { .fc_id = 468, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 468, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "PSOC_MME_PLL_LOCK_ERR" },
- { .fc_id = 469, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 469, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "PSOC_CPU_PLL_LOCK_ERR" },
- { .fc_id = 470, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 470, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_TPC_PLL_LOCK_ERR" },
- { .fc_id = 471, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 471, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_NIC_PLL_LOCK_ERR" },
- { .fc_id = 472, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 472, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_XBAR_MMU_PLL_LOCK_ERR" },
- { .fc_id = 473, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 473, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_XBAR_DMA_PLL_LOCK_ERR" },
- { .fc_id = 474, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 474, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_XBAR_IF_PLL_LOCK_ERR" },
- { .fc_id = 475, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 475, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_XBAR_BANK_PLL_LOCK_ERR" },
- { .fc_id = 476, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 476, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_XBAR_MMU_PLL_LOCK_ERR" },
- { .fc_id = 477, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 477, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_XBAR_DMA_PLL_LOCK_ERR" },
- { .fc_id = 478, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 478, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_XBAR_IF_PLL_LOCK_ERR" },
- { .fc_id = 479, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 479, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_XBAR_MESH_PLL_LOCK_ERR" },
- { .fc_id = 480, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 480, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_TPC_PLL_LOCK_ERR" },
- { .fc_id = 481, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 481, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_NIC_PLL_LOCK_ERR" },
- { .fc_id = 482, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 482, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "PMMU_MME_PLL_LOCK_ERR" },
- { .fc_id = 483, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 483, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_TPC_PLL_LOCK_ERR" },
- { .fc_id = 484, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 484, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_PCI_PLL_LOCK_ERR" },
- { .fc_id = 485, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 485, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_XBAR_MMU_PLL_LOCK_ERR" },
- { .fc_id = 486, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 486, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_XBAR_DMA_PLL_LOCK_ERR" },
- { .fc_id = 487, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 487, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_XBAR_IF_PLL_LOCK_ERR" },
- { .fc_id = 488, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 488, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_XBAR_MESH_PLL_LOCK_ERR" },
- { .fc_id = 489, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 489, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_XBAR_MMU_PLL_LOCK_ERR" },
- { .fc_id = 490, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 490, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_XBAR_DMA_PLL_LOCK_ERR" },
- { .fc_id = 491, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 491, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_XBAR_IF_PLL_LOCK_ERR" },
- { .fc_id = 492, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 492, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_XBAR_BANK_PLL_LOCK_ERR" },
- { .fc_id = 493, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 493, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_TPC_PLL_LOCK_ERR" },
- { .fc_id = 494, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 494, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "PSOC_VID_PLL_LOCK_ERR" },
- { .fc_id = 495, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 495, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "PMMU_VID_PLL_LOCK_ERR" },
- { .fc_id = 496, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 496, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE3_HBM_PLL_LOCK_ERR" },
- { .fc_id = 497, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 497, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_XBAR_HBM_PLL_LOCK_ERR" },
- { .fc_id = 498, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 498, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE1_HBM_PLL_LOCK_ERR" },
- { .fc_id = 499, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 499, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE0_HBM_PLL_LOCK_ERR" },
- { .fc_id = 500, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 500, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_XBAR_HBM_PLL_LOCK_ERR" },
- { .fc_id = 501, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ { .fc_id = 501, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "DCORE2_HBM_PLL_LOCK_ERR" },
{ .fc_id = 502, .cpu_id = 93, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "CPU_AXI_ERR_RSP" },
@@ -1827,8 +1827,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
.name = "DEC0_BMON_SPMU" },
{ .fc_id = 898, .cpu_id = 330, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
.name = "DEC1_SPI" },
- { .fc_id = 899, .cpu_id = 330, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "DEC1_SPI" },
+ { .fc_id = 899, .cpu_id = 330, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
+ .name = "DEC1_BMON_SPMU" },
{ .fc_id = 900, .cpu_id = 331, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
.name = "DEC2_SPI" },
{ .fc_id = 901, .cpu_id = 331, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
@@ -2377,8 +2377,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
.name = "" },
{ .fc_id = 1173, .cpu_id = 479, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "" },
- { .fc_id = 1174, .cpu_id = 480, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
- .name = "" },
+ { .fc_id = 1174, .cpu_id = 480, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
+ .name = "PSOC_DMA_QM" },
{ .fc_id = 1175, .cpu_id = 481, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "" },
{ .fc_id = 1176, .cpu_id = 482, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
@@ -2674,19 +2674,19 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
{ .fc_id = 1321, .cpu_id = 627, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_HARD,
.name = "DEV_RESET_REQ" },
{ .fc_id = 1322, .cpu_id = 628, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_PWR_BRK_ENTRY" },
+ .name = "PWR_BRK_ENTRY" },
{ .fc_id = 1323, .cpu_id = 629, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_PWR_BRK_EXT" },
+ .name = "PWR_BRK_EXT" },
{ .fc_id = 1324, .cpu_id = 630, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_PWR_RD_MODE0" },
+ .name = "PWR_RD_MODE0" },
{ .fc_id = 1325, .cpu_id = 631, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_PWR_RD_MODE1" },
+ .name = "PWR_RD_MODE1" },
{ .fc_id = 1326, .cpu_id = 632, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_PWR_RD_MODE2" },
+ .name = "PWR_RD_MODE2" },
{ .fc_id = 1327, .cpu_id = 633, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_PWR_RD_MODE3" },
+ .name = "PWR_RD_MODE3" },
{ .fc_id = 1328, .cpu_id = 634, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
- .name = "ARC_EQ_HEARTBEAT" },
+ .name = "EQ_HEARTBEAT" },
};

#endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */
--
2.34.1


2024-05-27 11:54:25

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 8/9] accel/habanalabs: add device name to invalidation failure msg

From: Ohad Sharabi <[email protected]>

This addition helps log parsers identify the error without having to
go back and search for the device name in earlier log lines.

Signed-off-by: Ohad Sharabi <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/mmu/mmu.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index 166c7da8b937..a9813ffcde14 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -645,7 +645,8 @@ int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags)
rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags);
if (rc)
dev_err_ratelimited(hdev->dev,
- "%s cache invalidation failed, rc=%d\n",
+ "%s: %s cache invalidation failed, rc=%d\n",
+ dev_name(&hdev->pdev->dev),
flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc);

return rc;
@@ -660,8 +661,9 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
asid, va, size);
if (rc)
dev_err_ratelimited(hdev->dev,
- "%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d",
- flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc);
+ "%s: %s cache range invalidation failed: va=%#llx, size=%llu, rc=%d",
+ dev_name(&hdev->pdev->dev), flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU",
+ va, size, rc);

return rc;
}
--
2.34.1


2024-05-27 11:58:24

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 6/9] accel/habanalabs: use parent device for trace events

From: Tomer Tayar <[email protected]>

Trace events might still be recorded after the accel device is released,
at which point its device name is no longer available.
Modify the trace functions to use the parent device instead, which is
still available at that point and whose name is just as informative.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 25 ++++++++++---------
drivers/accel/habanalabs/common/firmware_if.c | 10 +++++---
drivers/accel/habanalabs/common/mmu/mmu.c | 5 ++--
drivers/accel/habanalabs/common/pci/pci.c | 4 +--
4 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 087bbb1778e5..bb3f44392908 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -130,8 +130,8 @@ static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t
}

if (trace_habanalabs_dma_alloc_enabled() && !ZERO_OR_NULL_PTR(ptr))
- trace_habanalabs_dma_alloc(hdev->dev, (u64) (uintptr_t) ptr, *dma_handle, size,
- caller);
+ trace_habanalabs_dma_alloc(&(hdev)->pdev->dev, (u64) (uintptr_t) ptr, *dma_handle,
+ size, caller);

return ptr;
}
@@ -152,7 +152,7 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c
break;
}

- trace_habanalabs_dma_free(hdev->dev, store_cpu_addr, dma_handle, size, caller);
+ trace_habanalabs_dma_free(&(hdev)->pdev->dev, store_cpu_addr, dma_handle, size, caller);
}

void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle,
@@ -204,15 +204,15 @@ int hl_dma_map_sgtable_caller(struct hl_device *hdev, struct sg_table *sgt,
return 0;

for_each_sgtable_dma_sg(sgt, sg, i)
- trace_habanalabs_dma_map_page(hdev->dev,
- page_to_phys(sg_page(sg)),
- sg->dma_address - prop->device_dma_offset_for_host_access,
+ trace_habanalabs_dma_map_page(&(hdev)->pdev->dev,
+ page_to_phys(sg_page(sg)),
+ sg->dma_address - prop->device_dma_offset_for_host_access,
#ifdef CONFIG_NEED_SG_DMA_LENGTH
- sg->dma_length,
+ sg->dma_length,
#else
- sg->length,
+ sg->length,
#endif
- dir, caller);
+ dir, caller);

return 0;
}
@@ -247,7 +247,8 @@ void hl_dma_unmap_sgtable_caller(struct hl_device *hdev, struct sg_table *sgt,

if (trace_habanalabs_dma_unmap_page_enabled()) {
for_each_sgtable_dma_sg(sgt, sg, i)
- trace_habanalabs_dma_unmap_page(hdev->dev, page_to_phys(sg_page(sg)),
+ trace_habanalabs_dma_unmap_page(&(hdev)->pdev->dev,
+ page_to_phys(sg_page(sg)),
sg->dma_address - prop->device_dma_offset_for_host_access,
#ifdef CONFIG_NEED_SG_DMA_LENGTH
sg->dma_length,
@@ -2593,7 +2594,7 @@ inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
u32 val = readl(hdev->rmmio + reg);

if (unlikely(trace_habanalabs_rreg32_enabled()))
- trace_habanalabs_rreg32(hdev->dev, reg, val);
+ trace_habanalabs_rreg32(&(hdev)->pdev->dev, reg, val);

return val;
}
@@ -2611,7 +2612,7 @@ inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
{
if (unlikely(trace_habanalabs_wreg32_enabled()))
- trace_habanalabs_wreg32(hdev->dev, reg, val);
+ trace_habanalabs_wreg32(&(hdev)->pdev->dev, reg, val);

writel(val, hdev->rmmio + reg);
}
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index b5b960ce4ebd..d1a1d601bde9 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -8,6 +8,7 @@
#include "habanalabs.h"
#include <linux/habanalabs/hl_boot_if.h>

+#include <linux/pci.h>
#include <linux/firmware.h>
#include <linux/crc32.h>
#include <linux/slab.h>
@@ -1803,7 +1804,7 @@ static void hl_fw_dynamic_send_cmd(struct hl_device *hdev,
val = FIELD_PREP(COMMS_COMMAND_CMD_MASK, cmd);
val |= FIELD_PREP(COMMS_COMMAND_SIZE_MASK, size);

- trace_habanalabs_comms_send_cmd(hdev->dev, comms_cmd_str_arr[cmd]);
+ trace_habanalabs_comms_send_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]);
WREG32(le32_to_cpu(dyn_regs->kmd_msg_to_cpu), val);
}

@@ -1861,7 +1862,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev,

dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;

- trace_habanalabs_comms_wait_status(hdev->dev, comms_sts_str_arr[expected_status]);
+ trace_habanalabs_comms_wait_status(&hdev->pdev->dev, comms_sts_str_arr[expected_status]);

/* Wait for expected status */
rc = hl_poll_timeout(
@@ -1878,7 +1879,8 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev,
return -EIO;
}

- trace_habanalabs_comms_wait_status_done(hdev->dev, comms_sts_str_arr[expected_status]);
+ trace_habanalabs_comms_wait_status_done(&hdev->pdev->dev,
+ comms_sts_str_arr[expected_status]);

/*
* skip storing FW response for NOOP to preserve the actual desired
@@ -1952,7 +1954,7 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
{
int rc;

- trace_habanalabs_comms_protocol_cmd(hdev->dev, comms_cmd_str_arr[cmd]);
+ trace_habanalabs_comms_protocol_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]);

/* first send clear command to clean former commands */
rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader);
diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index d3eaab908457..166c7da8b937 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -6,6 +6,7 @@
*/

#include <linux/slab.h>
+#include <linux/pci.h>

#include "../habanalabs.h"

@@ -262,7 +263,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, bool flu
mmu_funcs->flush(ctx);

if (trace_habanalabs_mmu_unmap_enabled() && !rc)
- trace_habanalabs_mmu_unmap(hdev->dev, virt_addr, 0, page_size, flush_pte);
+ trace_habanalabs_mmu_unmap(&hdev->pdev->dev, virt_addr, 0, page_size, flush_pte);

return rc;
}
@@ -349,7 +350,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_s
if (flush_pte)
mmu_funcs->flush(ctx);

- trace_habanalabs_mmu_map(hdev->dev, virt_addr, phys_addr, page_size, flush_pte);
+ trace_habanalabs_mmu_map(&hdev->pdev->dev, virt_addr, phys_addr, page_size, flush_pte);

return 0;

diff --git a/drivers/accel/habanalabs/common/pci/pci.c b/drivers/accel/habanalabs/common/pci/pci.c
index 191e0e3cf3a5..81cbd8697d4c 100644
--- a/drivers/accel/habanalabs/common/pci/pci.c
+++ b/drivers/accel/habanalabs/common/pci/pci.c
@@ -123,7 +123,7 @@ int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data)
pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data);

if (unlikely(trace_habanalabs_elbi_read_enabled()))
- trace_habanalabs_elbi_read(hdev->dev, (u32) addr, val);
+ trace_habanalabs_elbi_read(&hdev->pdev->dev, (u32) addr, val);

return 0;
}
@@ -186,7 +186,7 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data)

if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) {
if (unlikely(trace_habanalabs_elbi_write_enabled()))
- trace_habanalabs_elbi_write(hdev->dev, (u32) addr, val);
+ trace_habanalabs_elbi_write(&hdev->pdev->dev, (u32) addr, val);
return 0;
}

--
2.34.1


2024-05-27 12:07:53

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 5/9] accel/habanalabs: no CPUCP prints on heartbeat failure

From: Ohad Sharabi <[email protected]>

If a heartbeat failure is detected while some background daemon keeps
sending CPUCP messages (via the driver interface), dmesg will be flooded
with error prints.

To avoid this, a slight refactor in hl_fw_send_cpu_message() makes it
return -EAGAIN when, and only when, the device CPU is disabled (i.e. on
heartbeat failure).

With that, all calling functions that may be invoked from user space
issue error prints only if the error code is not -EAGAIN.

Signed-off-by: Ohad Sharabi <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/debugfs.c | 17 +--
drivers/accel/habanalabs/common/device.c | 4 +-
drivers/accel/habanalabs/common/firmware_if.c | 142 +++++++++++-------
drivers/accel/habanalabs/common/hwmon.c | 60 ++++----
drivers/accel/habanalabs/gaudi/gaudi.c | 12 +-
drivers/accel/habanalabs/gaudi2/gaudi2.c | 12 +-
drivers/accel/habanalabs/goya/goya.c | 13 +-
7 files changed, 128 insertions(+), 132 deletions(-)

diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index b1c88d1837d9..de3ae2e47ec4 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -42,9 +42,8 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
pkt.i2c_reg = i2c_reg;
pkt.i2c_len = i2c_len;

- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- 0, val);
- if (rc)
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, val);
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);

return rc;
@@ -75,10 +74,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
pkt.i2c_len = i2c_len;
pkt.value = cpu_to_le64(val);

- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- 0, NULL);
-
- if (rc)
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);

return rc;
@@ -99,10 +96,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
pkt.led_index = cpu_to_le32(led);
pkt.value = cpu_to_le64(state);

- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- 0, NULL);
-
- if (rc)
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
}

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index eee41c367bd1..087bbb1778e5 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1498,10 +1498,8 @@ static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
* of heartbeat, the device CPU is marked as disable
* so this message won't be sent
*/
- if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
- dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+ if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
return;
- }

/* verify that last EQs are handled before disabled is set */
if (hdev->cpu_queues_enable)
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 348418643709..b5b960ce4ebd 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -370,43 +370,63 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value)
{
struct cpucp_packet pkt = {};
+ int rc;

pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(value);

- return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ if (rc)
+ dev_err(hdev->dev, "Failed to disable FW's PCI access\n");
+
+ return rc;
}

+/**
+ * hl_fw_send_cpu_message() - send CPU message to the device.
+ *
+ * @hdev: pointer to hl_device structure.
+ * @hw_queue_id: HW queue ID
+ * @msg: raw data of the message/packet
+ * @size: size of @msg in bytes
+ * @timeout_us: timeout in usec to wait for CPU reply on the message
+ * @result: return code reported by FW
+ *
+ * send message to the device CPU.
+ *
+ * Return: 0 on success, non-zero for failure.
+ * -ENOMEM: memory allocation failure
+ * -EAGAIN: CPU is disabled (try again when enabled)
+ * -ETIMEDOUT: timeout waiting for FW response
+ * -EIO: protocol error
+ */
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
- u16 len, u32 timeout, u64 *result)
+ u16 size, u32 timeout_us, u64 *result)
{
struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 tmp, expected_ack_val, pi, opcode;
struct cpucp_packet *pkt;
dma_addr_t pkt_dma_addr;
struct hl_bd *sent_bd;
- u32 tmp, expected_ack_val, pi, opcode;
- int rc;
+ int rc = 0, fw_rc;

- pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr);
+ pkt = hl_cpu_accessible_dma_pool_alloc(hdev, size, &pkt_dma_addr);
if (!pkt) {
- dev_err(hdev->dev,
- "Failed to allocate DMA memory for packet to CPU\n");
+ dev_err(hdev->dev, "Failed to allocate DMA memory for packet to CPU\n");
return -ENOMEM;
}

- memcpy(pkt, msg, len);
+ memcpy(pkt, msg, size);

mutex_lock(&hdev->send_cpu_message_lock);

/* CPU-CP messages can be sent during soft-reset */
- if (hdev->disabled && !hdev->reset_info.in_compute_reset) {
- rc = 0;
+ if (hdev->disabled && !hdev->reset_info.in_compute_reset)
goto out;
- }

if (hdev->device_cpu_disabled) {
- rc = -EIO;
+ rc = -EAGAIN;
goto out;
}

@@ -422,7 +442,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
* Which means that we don't need to lock the access to the entire H/W
* queues module when submitting a JOB to the CPU queue.
*/
- hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr);
+ hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), size, pkt_dma_addr);

if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
expected_ack_val = queue->pi;
@@ -431,7 +451,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,

rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == expected_ack_val), 1000,
- timeout, true);
+ timeout_us, true);

hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);

@@ -450,8 +470,8 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,

tmp = le32_to_cpu(pkt->ctl);

- rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
- if (rc) {
+ fw_rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
+ if (fw_rc) {
opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT;

if (!prop->supports_advanced_cpucp_rc) {
@@ -460,7 +480,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
goto scrub_descriptor;
}

- switch (rc) {
+ switch (fw_rc) {
case cpucp_packet_invalid:
dev_err(hdev->dev,
"CPU packet %d is not supported by F/W\n", opcode);
@@ -485,7 +505,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,

/* propagate the return code from the f/w to the callers who want to check it */
if (result)
- *result = rc;
+ *result = fw_rc;

rc = -EIO;

@@ -505,7 +525,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
out:
mutex_unlock(&hdev->send_cpu_message_lock);

- hl_cpu_accessible_dma_pool_free(hdev, len, pkt);
+ hl_cpu_accessible_dma_pool_free(hdev, size, pkt);

return rc;
}
@@ -575,7 +595,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
int hl_fw_test_cpu_queue(struct hl_device *hdev)
{
struct cpucp_packet test_pkt = {};
- u64 result;
+ u64 result = 0;
int rc;

test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
@@ -648,7 +668,7 @@ int hl_fw_send_device_activity(struct hl_device *hdev, bool open)
int hl_fw_send_heartbeat(struct hl_device *hdev)
{
struct cpucp_packet hb_pkt;
- u64 result;
+ u64 result = 0;
int rc;

memset(&hb_pkt, 0, sizeof(hb_pkt));
@@ -910,7 +930,7 @@ static int hl_fw_send_msi_info_msg(struct hl_device *hdev)
{
struct cpucp_array_data_packet *pkt;
size_t total_pkt_size, data_size;
- u64 result;
+ u64 result = 0;
int rc;

/* skip sending this info for unsupported ASICs */
@@ -1001,11 +1021,10 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);
-
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP EEPROM packet, error %d\n",
- rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP EEPROM packet, error %d\n", rc);
goto out;
}

@@ -1046,7 +1065,9 @@ int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
goto out;
}

@@ -1080,8 +1101,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->rx_throughput = result;
@@ -1095,8 +1117,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->tx_throughput = result;
@@ -1109,8 +1132,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->replay_cnt = (u32) result;
@@ -1130,9 +1154,9 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CpuCP total energy pkt, error %d\n",
- rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CpuCP total energy pkt, error %d\n", rc);
return rc;
}

@@ -1208,7 +1232,8 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
return rc;
}

@@ -1235,7 +1260,8 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
return rc;
}

@@ -1272,8 +1298,9 @@ int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
goto out;
}

@@ -1298,7 +1325,8 @@ int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num)

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
if (rc) {
- dev_err(hdev->dev,
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
"Failed to handle CPU-CP pending rows info pkt, error %d\n", rc);
goto out;
}
@@ -3147,10 +3175,10 @@ long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
pkt.pll_index = cpu_to_le32((u32)used_pll_idx);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
-
if (rc) {
- dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n",
- used_pll_idx, rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n",
+ used_pll_idx, rc);
return rc;
}

@@ -3174,8 +3202,7 @@ void hl_fw_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
pkt.value = cpu_to_le64(freq);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to set frequency to PLL %d, error %d\n",
used_pll_idx, rc);
}
@@ -3191,9 +3218,9 @@ long hl_fw_get_max_power(struct hl_device *hdev)
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
-
if (rc) {
- dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
return rc;
}

@@ -3215,8 +3242,7 @@ void hl_fw_set_max_power(struct hl_device *hdev)
pkt.value = cpu_to_le64(hdev->max_power);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
}

@@ -3242,11 +3268,11 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void
pkt.data_max_size = cpu_to_le32(size);
pkt.nonce = cpu_to_le32(nonce);

- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- timeout, NULL);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), timeout, NULL);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
goto out;
}

@@ -3288,10 +3314,12 @@ int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)&pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
- if (rc)
- dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n");
- else
+ if (rc) {
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n");
+ } else {
dev_dbg(hdev->dev, "generic pkt was successful, result: 0x%llx\n", result);
+ }

*size = (u32)result;

diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c
index 36b951b5f503..52d1e6bf10dc 100644
--- a/drivers/accel/habanalabs/common/hwmon.c
+++ b/drivers/accel/habanalabs/common/hwmon.c
@@ -585,9 +585,10 @@ int hl_get_temperature(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err_ratelimited(hdev->dev,
- "Failed to get temperature from sensor %d, error %d\n",
- sensor_index, rc);
+ if (rc != -EAGAIN)
+ dev_err_ratelimited(hdev->dev,
+ "Failed to get temperature from sensor %d, error %d\n",
+ sensor_index, rc);
*value = 0;
}

@@ -610,8 +611,7 @@ int hl_set_temperature(struct hl_device *hdev,

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err_ratelimited(hdev->dev,
"Failed to set temperature of sensor %d, error %d\n",
sensor_index, rc);
@@ -639,9 +639,10 @@ int hl_get_voltage(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err_ratelimited(hdev->dev,
- "Failed to get voltage from sensor %d, error %d\n",
- sensor_index, rc);
+ if (rc != -EAGAIN)
+ dev_err_ratelimited(hdev->dev,
+ "Failed to get voltage from sensor %d, error %d\n",
+ sensor_index, rc);
*value = 0;
}

@@ -668,9 +669,10 @@ int hl_get_current(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err_ratelimited(hdev->dev,
- "Failed to get current from sensor %d, error %d\n",
- sensor_index, rc);
+ if (rc != -EAGAIN)
+ dev_err_ratelimited(hdev->dev,
+ "Failed to get current from sensor %d, error %d\n",
+ sensor_index, rc);
*value = 0;
}

@@ -697,9 +699,10 @@ int hl_get_fan_speed(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err_ratelimited(hdev->dev,
- "Failed to get fan speed from sensor %d, error %d\n",
- sensor_index, rc);
+ if (rc != -EAGAIN)
+ dev_err_ratelimited(hdev->dev,
+ "Failed to get fan speed from sensor %d, error %d\n",
+ sensor_index, rc);
*value = 0;
}

@@ -726,9 +729,10 @@ int hl_get_pwm_info(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err_ratelimited(hdev->dev,
- "Failed to get pwm info from sensor %d, error %d\n",
- sensor_index, rc);
+ if (rc != -EAGAIN)
+ dev_err_ratelimited(hdev->dev,
+ "Failed to get pwm info from sensor %d, error %d\n",
+ sensor_index, rc);
*value = 0;
}

@@ -751,8 +755,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err_ratelimited(hdev->dev,
"Failed to set pwm info to sensor %d, error %d\n",
sensor_index, rc);
@@ -774,8 +777,7 @@ int hl_set_voltage(struct hl_device *hdev,

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err_ratelimited(hdev->dev,
"Failed to set voltage of sensor %d, error %d\n",
sensor_index, rc);
@@ -797,10 +799,8 @@ int hl_set_current(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);
pkt.value = __cpu_to_le64(value);

- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- 0, NULL);
-
- if (rc)
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ if (rc && rc != -EAGAIN)
dev_err_ratelimited(hdev->dev,
"Failed to set current of sensor %d, error %d\n",
sensor_index, rc);
@@ -830,8 +830,7 @@ int hl_set_power(struct hl_device *hdev,

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err_ratelimited(hdev->dev,
"Failed to set power of sensor %d, error %d\n",
sensor_index, rc);
@@ -859,9 +858,10 @@ int hl_get_power(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err_ratelimited(hdev->dev,
- "Failed to get power of sensor %d, error %d\n",
- sensor_index, rc);
+ if (rc != -EAGAIN)
+ dev_err_ratelimited(hdev->dev,
+ "Failed to get power of sensor %d, error %d\n",
+ sensor_index, rc);
*value = 0;
}

diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index f2b04ffb0ecb..fa893a9b826e 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -1639,10 +1639,8 @@ static int gaudi_late_init(struct hl_device *hdev)
}

rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0);
- if (rc) {
- dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
+ if (rc)
return rc;
- }

/* Scrub both SRAM and DRAM */
rc = hdev->asic_funcs->scrub_device_mem(hdev);
@@ -4154,13 +4152,7 @@ static int gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)

static int gaudi_suspend(struct hl_device *hdev)
{
- int rc;
-
- rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
- if (rc)
- dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
-
- return rc;
+ return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
}

static int gaudi_resume(struct hl_device *hdev)
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index ba1518f2bf5c..962b7fcd4318 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -3312,10 +3312,8 @@ static int gaudi2_late_init(struct hl_device *hdev)

rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
gaudi2->virt_msix_db_dma_addr);
- if (rc) {
- dev_err(hdev->dev, "Failed to enable PCI access from CPU\n");
+ if (rc)
return rc;
- }

rc = gaudi2_fetch_psoc_frequency(hdev);
if (rc) {
@@ -6467,13 +6465,7 @@ static int gaudi2_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset

static int gaudi2_suspend(struct hl_device *hdev)
{
- int rc;
-
- rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
- if (rc)
- dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
-
- return rc;
+ return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
}

static int gaudi2_resume(struct hl_device *hdev)
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 5a359c3bdc78..84768e306269 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -893,11 +893,8 @@ int goya_late_init(struct hl_device *hdev)
WREG32(mmMMU_LOG2_DDR_SIZE, ilog2(prop->dram_size));

rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0);
- if (rc) {
- dev_err(hdev->dev,
- "Failed to enable PCI access from CPU %d\n", rc);
+ if (rc)
return rc;
- }

/* force setting to low frequency */
goya->curr_pll_profile = PLL_LOW;
@@ -2864,13 +2861,7 @@ static int goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)

int goya_suspend(struct hl_device *hdev)
{
- int rc;
-
- rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
- if (rc)
- dev_err(hdev->dev, "Failed to disable PCI access from CPU\n");
-
- return rc;
+ return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0);
}

int goya_resume(struct hl_device *hdev)
--
2.34.1


2024-05-27 12:10:09

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 3/9] accel/habanalabs: restructure function that checks heartbeat received

From: Ohad Sharabi <[email protected]>

The function returns an error code that is neither propagated up the stack
nor printed.

The return value is only checked for == 0 or != 0, which implies a bool
return value.

The function signature is updated accordingly, renamed, and slightly
refactored.

Signed-off-by: Ohad Sharabi <[email protected]>
Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index a381ece25592..eee41c367bd1 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1048,21 +1048,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (device_id == hdev->pdev->device);
}

-static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
+static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;

if (!prop->cpucp_info.eq_health_check_supported)
- return 0;
+ return true;

- if (hdev->eq_heartbeat_received) {
- hdev->eq_heartbeat_received = false;
- } else {
+ if (!hdev->eq_heartbeat_received) {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
- return -EIO;
+ return false;
}

- return 0;
+ hdev->eq_heartbeat_received = false;
+
+ return true;
}

static void hl_device_heartbeat(struct work_struct *work)
@@ -1081,7 +1081,7 @@ static void hl_device_heartbeat(struct work_struct *work)
* in order to validate the eq is working.
* Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
*/
- if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev)))
+ if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev)))
goto reschedule;

if (hl_device_operational(hdev, NULL))
--
2.34.1


2024-05-27 12:11:45

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 4/9] accel/habanalabs/gaudi2: align embedded specs headers

Align embedded headers to latest release.

Signed-off-by: Ofir Bitton <[email protected]>
---
.../habanalabs/include/gaudi2/gaudi2_fw_if.h | 27 +++++------------
.../include/gaudi2/gaudi2_reg_map.h | 8 +++++
include/linux/habanalabs/cpucp_if.h | 10 +++++--
include/linux/habanalabs/hl_boot_if.h | 29 ++++++++++++++-----
4 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
index 18ca147b1c86..6ea936c9594e 100644
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
@@ -45,6 +45,13 @@
#define GAUDI2_ARM_RX_MB_OFFSET (GAUDI2_ARM_RX_MB_ADDR - \
GAUDI2_SP_SRAM_BASE_ADDR)

+#define POWER_MODE_LEVELS { \
+ 150000, /* 00 */ \
+ 250000, /* 01 */ \
+ 400000, /* 10 */ \
+ /* 11: Normal mode */ \
+}
+
enum gaudi2_fw_status {
GAUDI2_PID_STATUS_UP = 0x1, /* PID on ARC0 is up */
GAUDI2_ARM_STATUS_UP = 0x2, /* ARM Linux Boot complete */
@@ -52,26 +59,6 @@ enum gaudi2_fw_status {
GAUDI2_STATUS_LAST = 0xFF
};

-struct gaudi2_cold_rst_data {
- union {
- struct {
- u32 recovery_flag: 1;
- u32 validation_flag: 1;
- u32 efuse_read_flag: 1;
- u32 spsram_init_done : 1;
- u32 fake_security_enable : 1;
- u32 fake_sig_validation_en : 1;
- u32 bist_skip_enable : 1;
- u32 reserved1 : 1;
- u32 fake_bis_compliant : 1;
- u32 wd_rst_cause_arm : 1;
- u32 wd_rst_cause_arcpid : 1;
- u32 reserved : 21;
- };
- __le32 data;
- };
-};
-
enum gaudi2_rst_src {
HL_COLD_RST = 1,
HL_MANUAL_RST = 2,
diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h
index f3eaeb6d9b7e..1e9c056e437d 100644
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h
@@ -58,4 +58,12 @@
#define mmWD_GPIO_DATAOUT_REG mmPSOC_GPIO3_DATAOUT
#define mmSTM_PROFILER_SPE_REG mmPSOC_STM_STMSPER

+/* Registers below are used to pass the boot_if data between ARM and ARC1 */
+#define mmARM_MSG_BOOT_ERR_SET mmCPU_IF_SPECIAL_GLBL_SPARE_0
+#define mmARM_MSG_BOOT_ERR_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_1
+#define mmARM_MSG_BOOT_DEV_STS_SET mmCPU_IF_SPECIAL_GLBL_SPARE_2
+#define mmARM_MSG_BOOT_DEV_STS_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_3
+#define mmMGMT_MSG_BOOT_ERR mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_0
+#define mmMGMT_MSG_BOOT_DEV_STS mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_1
+
#endif /* GAUDI2_REG_MAP_H_ */
diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index f316c8d0f3fc..1ac1d68193e3 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -42,6 +42,12 @@ enum eq_event_id {
EQ_EVENT_PWR_BRK_ENTRY,
EQ_EVENT_PWR_BRK_EXIT,
EQ_EVENT_HEARTBEAT,
+ EQ_EVENT_CPLD_RESET_REASON,
+ EQ_EVENT_CPLD_SHUTDOWN,
+ EQ_EVENT_POWER_EVT_START,
+ EQ_EVENT_POWER_EVT_END,
+ EQ_EVENT_THERMAL_EVT_START,
+ EQ_EVENT_THERMAL_EVT_END,
};

/*
@@ -1165,7 +1171,7 @@ struct cpucp_security_info {
struct cpucp_info {
struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
__u8 kernel_version[VERSION_MAX_LEN];
- __le32 reserved;
+ __le32 reserved1;
__le32 card_type;
__le32 card_location;
__le32 cpld_version;
@@ -1187,7 +1193,7 @@ struct cpucp_info {
__u8 substrate_version;
__u8 eq_health_check_supported;
struct cpucp_security_info sec_info;
- __le32 fw_hbm_region_size;
+ __le32 reserved2;
__u8 pll_map[PLL_MAP_LEN];
__le64 mme_binning_mask;
__u8 fw_os_version[VERSION_MAX_LEN];
diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h
index 93366d5621fd..d2a9fc96424b 100644
--- a/include/linux/habanalabs/hl_boot_if.h
+++ b/include/linux/habanalabs/hl_boot_if.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0
*
- * Copyright 2018-2020 HabanaLabs, Ltd.
+ * Copyright 2018-2023 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
@@ -49,7 +49,6 @@ enum cpu_boot_err {
#define CPU_BOOT_ERR_FATAL_MASK \
((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) | \
(1 << CPU_BOOT_ERR_PLL_FAIL) | \
- (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) | \
(1 << CPU_BOOT_ERR_BINNING_FAIL) | \
(1 << CPU_BOOT_ERR_DRAM_SKIPPED) | \
(1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) | \
@@ -194,6 +193,8 @@ enum cpu_boot_dev_sts {
CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24,
CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25,
CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26,
+ CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN = 27,
+ CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN = 28,
CPU_BOOT_DEV_STS_ENABLED = 31,
CPU_BOOT_DEV_STS_SCND_EN = 63,
CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */
@@ -331,6 +332,17 @@ enum cpu_boot_dev_sts {
* HWMON enum mapping to cpucp enums.
* Initialized in: linux
*
+ * CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN
+ * If set, means f/w supports nic hbm memory clear and
+ * tmr,txs hbm memory init.
+ * Initialized in: zephyr-mgmt
+ *
+ * CPU_BOOT_DEV_STS0_MMU_PGTBL_DRAM_EN
+ * MMU page tables are located in DRAM.
+ * F/W initializes security settings for MMU
+ * page tables to reside in DRAM.
+ * Initialized in: zephyr-mgmt
+ *
* CPU_BOOT_DEV_STS0_ENABLED Device status register enabled.
* This is a main indication that the
* running FW populates the device status
@@ -367,6 +379,8 @@ enum cpu_boot_dev_sts {
#define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN (1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN)
#define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN (1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN)
#define CPU_BOOT_DEV_STS0_MAP_HWMON_EN (1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN)
+#define CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN (1 << CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN)
+#define CPU_BOOT_DEV_STS0_MMU_PGTBL_DRAM_EN (1 << CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN)
#define CPU_BOOT_DEV_STS0_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED)
#define CPU_BOOT_DEV_STS1_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED)

@@ -450,11 +464,11 @@ struct cpu_dyn_regs {
__le32 gic_dma_core_irq_ctrl;
__le32 gic_host_halt_irq;
__le32 gic_host_ints_irq;
- __le32 gic_host_soft_rst_irq;
+ __le32 reserved0;
__le32 gic_rot_qm_irq_ctrl;
- __le32 cpu_rst_status;
+ __le32 reserved1;
__le32 eng_arc_irq_ctrl;
- __le32 reserved1[20]; /* reserve for future use */
+ __le32 reserved2[20]; /* reserve for future use */
};

/* TODO: remove the desc magic after the code is updated to use message */
@@ -551,8 +565,9 @@ enum lkd_fw_ascii_msg_lvls {
LKD_FW_ASCII_MSG_DBG = 3,
};

-#define LKD_FW_ASCII_MSG_MAX_LEN 128
-#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */
+#define LKD_FW_ASCII_MSG_MAX_LEN 128
+#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */
+#define LKD_FW_ASCII_MSG_MIN_DESC_VERSION 3

struct lkd_fw_ascii_msg {
__u8 valid;
--
2.34.1


2024-05-27 12:13:26

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 7/9] accel/habanalabs: expose server type in debugfs

From: Tal Risin <[email protected]>

Expose the server type through debugfs to enable easier access via
scripts.

Signed-off-by: Tal Risin <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
Documentation/ABI/testing/debugfs-driver-habanalabs | 6 ++++++
drivers/accel/habanalabs/common/debugfs.c | 5 +++++
2 files changed, 11 insertions(+)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index a7a432dc4015..efbb78bedb8b 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -253,6 +253,12 @@ Description: Triggers dump of monitor data. The value to trigger the operatio
When the write is finished, the user can read the "monitor_dump"
blob

+What: /sys/kernel/debug/accel/<parent_device>/server_type
+Date: Feb 2024
+KernelVersion: 6.11
+Contact: [email protected]
+Description: Exposes the device's server type, maps to enum hl_server_type.
+
What: /sys/kernel/debug/accel/<parent_device>/set_power_state
Date: Jan 2019
KernelVersion: 5.1
diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index de3ae2e47ec4..ca7677293a55 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -1717,6 +1717,11 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
root,
&hdev->device_release_watchdog_timeout_sec);

+ debugfs_create_u16("server_type",
+ 0444,
+ root,
+ &hdev->asic_prop.server_type);
+
for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
0644,
--
2.34.1


2024-05-27 15:49:04

by Tomer Tayar

[permalink] [raw]
Subject: Re: [PATCH 4/9] accel/habanalabs/gaudi2: align embedded specs headers

On 27/05/2024 14:47, Ofir Bitton wrote:
> Align embedded headers to latest release.
>
> Signed-off-by: Ofir Bitton <[email protected]>

Reviewed-by: Tomer Tayar <[email protected]>

> ---
> .../habanalabs/include/gaudi2/gaudi2_fw_if.h | 27 +++++------------
> .../include/gaudi2/gaudi2_reg_map.h | 8 +++++
> include/linux/habanalabs/cpucp_if.h | 10 +++++--
> include/linux/habanalabs/hl_boot_if.h | 29 ++++++++++++++-----
> 4 files changed, 45 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
> index 18ca147b1c86..6ea936c9594e 100644
> --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
> +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h
> @@ -45,6 +45,13 @@
> #define GAUDI2_ARM_RX_MB_OFFSET (GAUDI2_ARM_RX_MB_ADDR - \
> GAUDI2_SP_SRAM_BASE_ADDR)
>
> +#define POWER_MODE_LEVELS { \
> + 150000, /* 00 */ \
> + 250000, /* 01 */ \
> + 400000, /* 10 */ \
> + /* 11: Normal mode */ \
> +}
> +
> enum gaudi2_fw_status {
> GAUDI2_PID_STATUS_UP = 0x1, /* PID on ARC0 is up */
> GAUDI2_ARM_STATUS_UP = 0x2, /* ARM Linux Boot complete */
> @@ -52,26 +59,6 @@ enum gaudi2_fw_status {
> GAUDI2_STATUS_LAST = 0xFF
> };
>
> -struct gaudi2_cold_rst_data {
> - union {
> - struct {
> - u32 recovery_flag: 1;
> - u32 validation_flag: 1;
> - u32 efuse_read_flag: 1;
> - u32 spsram_init_done : 1;
> - u32 fake_security_enable : 1;
> - u32 fake_sig_validation_en : 1;
> - u32 bist_skip_enable : 1;
> - u32 reserved1 : 1;
> - u32 fake_bis_compliant : 1;
> - u32 wd_rst_cause_arm : 1;
> - u32 wd_rst_cause_arcpid : 1;
> - u32 reserved : 21;
> - };
> - __le32 data;
> - };
> -};
> -
> enum gaudi2_rst_src {
> HL_COLD_RST = 1,
> HL_MANUAL_RST = 2,
> diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h
> index f3eaeb6d9b7e..1e9c056e437d 100644
> --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h
> +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h
> @@ -58,4 +58,12 @@
> #define mmWD_GPIO_DATAOUT_REG mmPSOC_GPIO3_DATAOUT
> #define mmSTM_PROFILER_SPE_REG mmPSOC_STM_STMSPER
>
> +/* Registers below are used to pass the boot_if data between ARM and ARC1 */
> +#define mmARM_MSG_BOOT_ERR_SET mmCPU_IF_SPECIAL_GLBL_SPARE_0
> +#define mmARM_MSG_BOOT_ERR_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_1
> +#define mmARM_MSG_BOOT_DEV_STS_SET mmCPU_IF_SPECIAL_GLBL_SPARE_2
> +#define mmARM_MSG_BOOT_DEV_STS_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_3
> +#define mmMGMT_MSG_BOOT_ERR mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_0
> +#define mmMGMT_MSG_BOOT_DEV_STS mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_1
> +
> #endif /* GAUDI2_REG_MAP_H_ */
> diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
> index f316c8d0f3fc..1ac1d68193e3 100644
> --- a/include/linux/habanalabs/cpucp_if.h
> +++ b/include/linux/habanalabs/cpucp_if.h
> @@ -42,6 +42,12 @@ enum eq_event_id {
> EQ_EVENT_PWR_BRK_ENTRY,
> EQ_EVENT_PWR_BRK_EXIT,
> EQ_EVENT_HEARTBEAT,
> + EQ_EVENT_CPLD_RESET_REASON,
> + EQ_EVENT_CPLD_SHUTDOWN,
> + EQ_EVENT_POWER_EVT_START,
> + EQ_EVENT_POWER_EVT_END,
> + EQ_EVENT_THERMAL_EVT_START,
> + EQ_EVENT_THERMAL_EVT_END,
> };
>
> /*
> @@ -1165,7 +1171,7 @@ struct cpucp_security_info {
> struct cpucp_info {
> struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
> __u8 kernel_version[VERSION_MAX_LEN];
> - __le32 reserved;
> + __le32 reserved1;
> __le32 card_type;
> __le32 card_location;
> __le32 cpld_version;
> @@ -1187,7 +1193,7 @@ struct cpucp_info {
> __u8 substrate_version;
> __u8 eq_health_check_supported;
> struct cpucp_security_info sec_info;
> - __le32 fw_hbm_region_size;
> + __le32 reserved2;
> __u8 pll_map[PLL_MAP_LEN];
> __le64 mme_binning_mask;
> __u8 fw_os_version[VERSION_MAX_LEN];
> diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h
> index 93366d5621fd..d2a9fc96424b 100644
> --- a/include/linux/habanalabs/hl_boot_if.h
> +++ b/include/linux/habanalabs/hl_boot_if.h
> @@ -1,6 +1,6 @@
> /* SPDX-License-Identifier: GPL-2.0
> *
> - * Copyright 2018-2020 HabanaLabs, Ltd.
> + * Copyright 2018-2023 HabanaLabs, Ltd.
> * All Rights Reserved.
> *
> */
> @@ -49,7 +49,6 @@ enum cpu_boot_err {
> #define CPU_BOOT_ERR_FATAL_MASK \
> ((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) | \
> (1 << CPU_BOOT_ERR_PLL_FAIL) | \
> - (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) | \
> (1 << CPU_BOOT_ERR_BINNING_FAIL) | \
> (1 << CPU_BOOT_ERR_DRAM_SKIPPED) | \
> (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) | \
> @@ -194,6 +193,8 @@ enum cpu_boot_dev_sts {
> CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24,
> CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25,
> CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26,
> + CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN = 27,
> + CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN = 28,
> CPU_BOOT_DEV_STS_ENABLED = 31,
> CPU_BOOT_DEV_STS_SCND_EN = 63,
> CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */
> @@ -331,6 +332,17 @@ enum cpu_boot_dev_sts {
> * HWMON enum mapping to cpucp enums.
> * Initialized in: linux
> *
> + * CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN
> + * If set, F/W supports NIC HBM memory clear and
> + * TMR, TXS HBM memory init.
> + * Initialized in: zephyr-mgmt
> + *
> + * CPU_BOOT_DEV_STS0_MMU_PGTBL_DRAM_EN
> + * MMU page tables are located in DRAM.
> + * F/W initializes security settings for MMU
> + * page tables to reside in DRAM.
> + * Initialized in: zephyr-mgmt
> + *
> * CPU_BOOT_DEV_STS0_ENABLED Device status register enabled.
> * This is a main indication that the
> * running FW populates the device status
> @@ -367,6 +379,8 @@ enum cpu_boot_dev_sts {
> #define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN (1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN)
> #define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN (1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN)
> #define CPU_BOOT_DEV_STS0_MAP_HWMON_EN (1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN)
> +#define CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN (1 << CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN)
> +#define CPU_BOOT_DEV_STS0_MMU_PGTBL_DRAM_EN (1 << CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN)
> #define CPU_BOOT_DEV_STS0_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED)
> #define CPU_BOOT_DEV_STS1_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED)
>
> @@ -450,11 +464,11 @@ struct cpu_dyn_regs {
> __le32 gic_dma_core_irq_ctrl;
> __le32 gic_host_halt_irq;
> __le32 gic_host_ints_irq;
> - __le32 gic_host_soft_rst_irq;
> + __le32 reserved0;
> __le32 gic_rot_qm_irq_ctrl;
> - __le32 cpu_rst_status;
> + __le32 reserved1;
> __le32 eng_arc_irq_ctrl;
> - __le32 reserved1[20]; /* reserve for future use */
> + __le32 reserved2[20]; /* reserve for future use */
> };
>
> /* TODO: remove the desc magic after the code is updated to use message */
> @@ -551,8 +565,9 @@ enum lkd_fw_ascii_msg_lvls {
> LKD_FW_ASCII_MSG_DBG = 3,
> };
>
> -#define LKD_FW_ASCII_MSG_MAX_LEN 128
> -#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */
> +#define LKD_FW_ASCII_MSG_MAX_LEN 128
> +#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */
> +#define LKD_FW_ASCII_MSG_MIN_DESC_VERSION 3
>
> struct lkd_fw_ascii_msg {
> __u8 valid;