2024-05-27 12:13:24

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 1/8] accel/habanalabs: use msg_header instead of desc_header

From: Igal Zeltser <[email protected]>

Struct comms_desc_header is deprecated and replaced by struct
comms_msg_header. As a preparation for removing comms_desc_header
from FW, all it's usage in code is replaced by comms_msg_header.

Signed-off-by: Igal Zeltser <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index d1a1d601bde9..886b3c07503d 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2093,7 +2093,7 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev,
* note that no alignment/stride address issues here as all structures
* are 64 bit padded.
*/
- data_ptr = (u8 *)fw_desc + sizeof(struct comms_desc_header);
+ data_ptr = (u8 *)fw_desc + sizeof(struct comms_msg_header);
data_size = le16_to_cpu(fw_desc->header.size);

data_crc32 = hl_fw_compat_crc32(data_ptr, data_size);
@@ -2247,11 +2247,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
fw_data_size = le16_to_cpu(fw_desc->header.size);

- temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size);
+ temp_fw_desc = vzalloc(sizeof(struct comms_msg_header) + fw_data_size);
if (!temp_fw_desc)
return -ENOMEM;

- memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size);
+ memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_msg_header) + fw_data_size);

rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader,
(struct lkd_fw_comms_desc *) temp_fw_desc);
--
2.34.1



2024-05-27 12:13:46

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 6/8] accel/habanalabs: separate nonce from max_size in cpucp_packet struct

From: Dani Liberman <[email protected]>

In struct cpucp_packet both nonce and data_max_size members are in an
union overlapping each other. This is a problem as they both are used
in attestation and info_signed packets.
The solution here is to move the nonce member to a different union under
the same structure.

Signed-off-by: Dani Liberman <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
include/linux/habanalabs/cpucp_if.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h
index 1ac1d68193e3..0913415243e8 100644
--- a/include/linux/habanalabs/cpucp_if.h
+++ b/include/linux/habanalabs/cpucp_if.h
@@ -859,9 +859,6 @@ struct cpucp_packet {
* result cannot be used to hold general purpose data.
*/
__le32 status_mask;
-
- /* random, used once number, for security packets */
- __le32 nonce;
};

union {
@@ -870,6 +867,9 @@ struct cpucp_packet {

/* For Generic packet sub index */
__le32 pkt_subidx;
+
+ /* random, used once number, for security packets */
+ __le32 nonce;
};
};

--
2.34.1


2024-05-27 12:13:47

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 2/8] accel/habanalabs: check for errors after preboot is ready

From: Farah Kassabri <[email protected]>

Driver should check and report any fatal errors detected by preboot,
before it attempts to load the boot fit.
Some errors may cause the driver to stop the boot process and mark
the device as unusable.
This check will allow the driver to fail and print the error reported
by preboot and skip the time wasting attempt of trying to load the
boot fit, which will fail due to the error.

Signed-off-by: Farah Kassabri <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 24 +++++++++----------
1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 886b3c07503d..6f0c40b12072 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -1482,7 +1482,7 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev)
{
struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
u32 status = 0, timeout;
- int rc, tries = 1;
+ int rc, tries = 1, fw_err = 0;
bool preboot_still_runs;

/* Need to check two possible scenarios:
@@ -1522,18 +1522,18 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev)
}
}

- if (rc) {
+ /* If we read all FF, then something is totally wrong, no point
+ * of reading specific errors
+ */
+ if (status != -1)
+ fw_err = fw_read_errors(hdev, pre_fw_load->boot_err0_reg,
+ pre_fw_load->boot_err1_reg,
+ pre_fw_load->sts_boot_dev_sts0_reg,
+ pre_fw_load->sts_boot_dev_sts1_reg);
+ if (rc || fw_err) {
detect_cpu_boot_status(hdev, status);
- dev_err(hdev->dev, "CPU boot ready timeout (status = %d)\n", status);
-
- /* If we read all FF, then something is totally wrong, no point
- * of reading specific errors
- */
- if (status != -1)
- fw_read_errors(hdev, pre_fw_load->boot_err0_reg,
- pre_fw_load->boot_err1_reg,
- pre_fw_load->sts_boot_dev_sts0_reg,
- pre_fw_load->sts_boot_dev_sts1_reg);
+ dev_err(hdev->dev, "CPU boot %s (status = %d)\n",
+ fw_err ? "failed due to an error" : "ready timeout", status);
return -EIO;
}

--
2.34.1


2024-05-27 12:14:23

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 8/8] accel/habanalabs: move hl_eq_heartbeat_event_handle() to common code

From: Tomer Tayar <[email protected]>

hl_eq_heartbeat_event_handle() doesn't have ASIC specific code, and
therefore can be moved from Gaudi2-only code to common code, and
possibly used for other ASICs.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 6 ++++++
drivers/accel/habanalabs/common/habanalabs.h | 1 +
drivers/accel/habanalabs/gaudi2/gaudi2.c | 6 ------
3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 35502e938b5d..f9b8601c4396 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2850,3 +2850,9 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq)
if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask))
dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq);
}
+
+void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
+{
+ hdev->heartbeat_debug_info.heartbeat_event_counter++;
+ hdev->eq_heartbeat_received = true;
+}
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 8d0df685e627..057087dc8592 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -4059,6 +4059,7 @@ void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_coun
void hl_enable_err_info_capture(struct hl_error_info *captured_err_info);
void hl_init_cpu_for_irq(struct hl_device *hdev);
void hl_set_irq_affinity(struct hl_device *hdev, int irq);
+void hl_eq_heartbeat_event_handle(struct hl_device *hdev);

#ifdef CONFIG_DEBUG_FS

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 4791582d157c..a38b88baadf2 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9777,12 +9777,6 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
return U16_MAX;
}

-static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
-{
- hdev->heartbeat_debug_info.heartbeat_event_counter++;
- hdev->eq_heartbeat_received = true;
-}
-
static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{
struct gaudi2_device *gaudi2 = hdev->asic_specific;
--
2.34.1


2024-05-27 12:14:39

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 3/8] accel/habanalabs/gaudi2: align interrupt names to table

From: Ariel Suller <[email protected]>

when reporting tpc events, the dcore and tpc in dcore should
be reported and propagated, and not the generatl tpc number

Signed-off-by: Ariel Suller <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
.../gaudi2/gaudi2_async_ids_map_extended.h | 150 +++++++++---------
1 file changed, 75 insertions(+), 75 deletions(-)

diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
index 1db73923de62..82d639990cca 100644
--- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
+++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h
@@ -856,55 +856,55 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
{ .fc_id = 412, .cpu_id = 84, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "PCIE_ADDR_DEC_ERR" },
{ .fc_id = 413, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC0_AXI_ERR_RSP" },
+ .name = "DCORE0_TPC0_AXI_ERR_RSP" },
{ .fc_id = 414, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC1_AXI_ERR_RSP" },
+ .name = "DCORE0_TPC1_AXI_ERR_RSP" },
{ .fc_id = 415, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC2_AXI_ERR_RSP" },
+ .name = "DCORE0_TPC2_AXI_ERR_RSP" },
{ .fc_id = 416, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC3_AXI_ERR_RSP" },
+ .name = "DCORE0_TPC3_AXI_ERR_RSP" },
{ .fc_id = 417, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC4_AXI_ERR_RSP" },
+ .name = "DCORE0_TPC4_AXI_ERR_RSP" },
{ .fc_id = 418, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC5_AXI_ERR_RSP" },
+ .name = "DCORE0_TPC5_AXI_ERR_RSP" },
{ .fc_id = 419, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC6_AXI_ERR_RSP" },
+ .name = "DCORE1_TPC0_AXI_ERR_RSP" },
{ .fc_id = 420, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC7_AXI_ERR_RSP" },
+ .name = "DCORE1_TPC1_AXI_ERR_RSP" },
{ .fc_id = 421, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC8_AXI_ERR_RSP" },
+ .name = "DCORE1_TPC2_AXI_ERR_RSP" },
{ .fc_id = 422, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC9_AXI_ERR_RSP" },
+ .name = "DCORE1_TPC3_AXI_ERR_RSP" },
{ .fc_id = 423, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC10_AXI_ERR_RSP" },
+ .name = "DCORE1_TPC4_AXI_ERR_RSP" },
{ .fc_id = 424, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC11_AXI_ERR_RSP" },
+ .name = "DCORE1_TPC5_AXI_ERR_RSP" },
{ .fc_id = 425, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC12_AXI_ERR_RSP" },
+ .name = "DCORE2_TPC0_AXI_ERR_RSP" },
{ .fc_id = 426, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC13_AXI_ERR_RSP" },
+ .name = "DCORE2_TPC1_AXI_ERR_RSP" },
{ .fc_id = 427, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC14_AXI_ERR_RSP" },
+ .name = "DCORE2_TPC2_AXI_ERR_RSP" },
{ .fc_id = 428, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC15_AXI_ERR_RSP" },
+ .name = "DCORE2_TPC3_AXI_ERR_RSP" },
{ .fc_id = 429, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC16_AXI_ERR_RSP" },
+ .name = "DCORE2_TPC4_AXI_ERR_RSP" },
{ .fc_id = 430, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC17_AXI_ERR_RSP" },
+ .name = "DCORE2_TPC5_AXI_ERR_RSP" },
{ .fc_id = 431, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC18_AXI_ERR_RSP" },
+ .name = "DCORE3_TPC0_AXI_ERR_RSP" },
{ .fc_id = 432, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC19_AXI_ERR_RSP" },
+ .name = "DCORE3_TPC1_AXI_ERR_RSP" },
{ .fc_id = 433, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC20_AXI_ERR_RSP" },
+ .name = "DCORE3_TPC2_AXI_ERR_RSP" },
{ .fc_id = 434, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC21_AXI_ERR_RSP" },
+ .name = "DCORE3_TPC3_AXI_ERR_RSP" },
{ .fc_id = 435, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC22_AXI_ERR_RSP" },
+ .name = "DCORE3_TPC4_AXI_ERR_RSP" },
{ .fc_id = 436, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC23_AXI_ERR_RSP" },
+ .name = "DCORE3_TPC5_AXI_ERR_RSP" },
{ .fc_id = 437, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC24_AXI_ERR_RSP" },
+ .name = "DCORE4_TPC0_AXI_ERR_RSP" },
{ .fc_id = 438, .cpu_id = 86, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
.name = "AXI_ECC" },
{ .fc_id = 439, .cpu_id = 87, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD,
@@ -1298,103 +1298,103 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
{ .fc_id = 633, .cpu_id = 130, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC0_BMON_SPMU" },
{ .fc_id = 634, .cpu_id = 131, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC0_KERNEL_ERR" },
+ .name = "DCORE0_TPC0_KERNEL_ERR" },
{ .fc_id = 635, .cpu_id = 132, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC1_BMON_SPMU" },
{ .fc_id = 636, .cpu_id = 133, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC1_KERNEL_ERR" },
+ .name = "DCORE0_TPC1_KERNEL_ERR" },
{ .fc_id = 637, .cpu_id = 134, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC2_BMON_SPMU" },
{ .fc_id = 638, .cpu_id = 135, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC2_KERNEL_ERR" },
+ .name = "DCORE0_TPC2_KERNEL_ERR" },
{ .fc_id = 639, .cpu_id = 136, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC3_BMON_SPMU" },
{ .fc_id = 640, .cpu_id = 137, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC3_KERNEL_ERR" },
+ .name = "DCORE0_TPC3_KERNEL_ERR" },
{ .fc_id = 641, .cpu_id = 138, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC4_BMON_SPMU" },
{ .fc_id = 642, .cpu_id = 139, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC4_KERNEL_ERR" },
+ .name = "DCORE0_TPC4_KERNEL_ERR" },
{ .fc_id = 643, .cpu_id = 140, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC5_BMON_SPMU" },
{ .fc_id = 644, .cpu_id = 141, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC5_KERNEL_ERR" },
+ .name = "DCORE0_TPC5_KERNEL_ERR" },
{ .fc_id = 645, .cpu_id = 150, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC6_BMON_SPMU" },
{ .fc_id = 646, .cpu_id = 151, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC6_KERNEL_ERR" },
+ .name = "DCORE1_TPC0_KERNEL_ERR" },
{ .fc_id = 647, .cpu_id = 152, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC7_BMON_SPMU" },
{ .fc_id = 648, .cpu_id = 153, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC7_KERNEL_ERR" },
+ .name = "DCORE1_TPC1_KERNEL_ERR" },
{ .fc_id = 649, .cpu_id = 146, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC8_BMON_SPMU" },
{ .fc_id = 650, .cpu_id = 147, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC8_KERNEL_ERR" },
+ .name = "DCORE1_TPC2_KERNEL_ERR" },
{ .fc_id = 651, .cpu_id = 148, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC9_BMON_SPMU" },
{ .fc_id = 652, .cpu_id = 149, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC9_KERNEL_ERR" },
+ .name = "DCORE1_TPC3_KERNEL_ERR" },
{ .fc_id = 653, .cpu_id = 142, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC10_BMON_SPMU" },
{ .fc_id = 654, .cpu_id = 143, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC10_KERNEL_ERR" },
+ .name = "DCORE1_TPC4_KERNEL_ERR" },
{ .fc_id = 655, .cpu_id = 144, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC11_BMON_SPMU" },
{ .fc_id = 656, .cpu_id = 145, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC11_KERNEL_ERR" },
+ .name = "DCORE1_TPC5_KERNEL_ERR" },
{ .fc_id = 657, .cpu_id = 162, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC12_BMON_SPMU" },
{ .fc_id = 658, .cpu_id = 163, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC12_KERNEL_ERR" },
+ .name = "DCORE2_TPC0_KERNEL_ERR" },
{ .fc_id = 659, .cpu_id = 164, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC13_BMON_SPMU" },
{ .fc_id = 660, .cpu_id = 165, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC13_KERNEL_ERR" },
+ .name = "DCORE2_TPC1_KERNEL_ERR" },
{ .fc_id = 661, .cpu_id = 158, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC14_BMON_SPMU" },
{ .fc_id = 662, .cpu_id = 159, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC14_KERNEL_ERR" },
+ .name = "DCORE2_TPC2_KERNEL_ERR" },
{ .fc_id = 663, .cpu_id = 160, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC15_BMON_SPMU" },
{ .fc_id = 664, .cpu_id = 161, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC15_KERNEL_ERR" },
+ .name = "DCORE2_TPC3_KERNEL_ERR" },
{ .fc_id = 665, .cpu_id = 154, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC16_BMON_SPMU" },
{ .fc_id = 666, .cpu_id = 155, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC16_KERNEL_ERR" },
+ .name = "DCORE2_TPC4_KERNEL_ERR" },
{ .fc_id = 667, .cpu_id = 156, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC17_BMON_SPMU" },
{ .fc_id = 668, .cpu_id = 157, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC17_KERNEL_ERR" },
+ .name = "DCORE2_TPC5_KERNEL_ERR" },
{ .fc_id = 669, .cpu_id = 166, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC18_BMON_SPMU" },
{ .fc_id = 670, .cpu_id = 167, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC18_KERNEL_ERR" },
+ .name = "DCORE3_TPC0_KERNEL_ERR" },
{ .fc_id = 671, .cpu_id = 168, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC19_BMON_SPMU" },
{ .fc_id = 672, .cpu_id = 169, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC19_KERNEL_ERR" },
+ .name = "DCORE3_TPC1_KERNEL_ERR" },
{ .fc_id = 673, .cpu_id = 170, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC20_BMON_SPMU" },
{ .fc_id = 674, .cpu_id = 171, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC20_KERNEL_ERR" },
+ .name = "DCORE3_TPC2_KERNEL_ERR" },
{ .fc_id = 675, .cpu_id = 172, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC21_BMON_SPMU" },
{ .fc_id = 676, .cpu_id = 173, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC21_KERNEL_ERR" },
+ .name = "DCORE3_TPC3_KERNEL_ERR" },
{ .fc_id = 677, .cpu_id = 174, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC22_BMON_SPMU" },
{ .fc_id = 678, .cpu_id = 175, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC22_KERNEL_ERR" },
+ .name = "DCORE3_TPC4_KERNEL_ERR" },
{ .fc_id = 679, .cpu_id = 176, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC23_BMON_SPMU" },
{ .fc_id = 680, .cpu_id = 177, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC23_KERNEL_ERR" },
+ .name = "DCORE3_TPC5_KERNEL_ERR" },
{ .fc_id = 681, .cpu_id = 178, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "TPC24_BMON_SPMU" },
{ .fc_id = 682, .cpu_id = 179, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC24_KERNEL_ERR" },
+ .name = "DCORE4_TPC0_KERNEL_ERR" },
{ .fc_id = 683, .cpu_id = 180, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "" },
{ .fc_id = 684, .cpu_id = 180, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
@@ -2442,55 +2442,55 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = {
{ .fc_id = 1205, .cpu_id = 511, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE,
.name = "" },
{ .fc_id = 1206, .cpu_id = 512, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC0_QM" },
+ .name = "DCORE0_TPC0_QM" },
{ .fc_id = 1207, .cpu_id = 513, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC1_QM" },
+ .name = "DCORE0_TPC1_QM" },
{ .fc_id = 1208, .cpu_id = 514, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC2_QM" },
+ .name = "DCORE0_TPC2_QM" },
{ .fc_id = 1209, .cpu_id = 515, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC3_QM" },
+ .name = "DCORE0_TPC3_QM" },
{ .fc_id = 1210, .cpu_id = 516, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC4_QM" },
+ .name = "DCORE0_TPC4_QM" },
{ .fc_id = 1211, .cpu_id = 517, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC5_QM" },
+ .name = "DCORE0_TPC5_QM" },
{ .fc_id = 1212, .cpu_id = 518, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC6_QM" },
+ .name = "DCORE1_TPC0_QM" },
{ .fc_id = 1213, .cpu_id = 519, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC7_QM" },
+ .name = "DCORE1_TPC1_QM" },
{ .fc_id = 1214, .cpu_id = 520, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC8_QM" },
+ .name = "DCORE1_TPC2_QM" },
{ .fc_id = 1215, .cpu_id = 521, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC9_QM" },
+ .name = "DCORE1_TPC3_QM" },
{ .fc_id = 1216, .cpu_id = 522, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC10_QM" },
+ .name = "DCORE1_TPC4_QM" },
{ .fc_id = 1217, .cpu_id = 523, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC11_QM" },
+ .name = "DCORE1_TPC5_QM" },
{ .fc_id = 1218, .cpu_id = 524, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC12_QM" },
+ .name = "DCORE2_TPC0_QM" },
{ .fc_id = 1219, .cpu_id = 525, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC13_QM" },
+ .name = "DCORE2_TPC1_QM" },
{ .fc_id = 1220, .cpu_id = 526, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC14_QM" },
+ .name = "DCORE2_TPC2_QM" },
{ .fc_id = 1221, .cpu_id = 527, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC15_QM" },
+ .name = "DCORE2_TPC3_QM" },
{ .fc_id = 1222, .cpu_id = 528, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC16_QM" },
+ .name = "DCORE2_TPC4_QM" },
{ .fc_id = 1223, .cpu_id = 529, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC17_QM" },
+ .name = "DCORE2_TPC5_QM" },
{ .fc_id = 1224, .cpu_id = 530, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC18_QM" },
+ .name = "DCORE3_TPC0_QM" },
{ .fc_id = 1225, .cpu_id = 531, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC19_QM" },
+ .name = "DCORE3_TPC1_QM" },
{ .fc_id = 1226, .cpu_id = 532, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC20_QM" },
+ .name = "DCORE3_TPC2_QM" },
{ .fc_id = 1227, .cpu_id = 533, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC21_QM" },
+ .name = "DCORE3_TPC3_QM" },
{ .fc_id = 1228, .cpu_id = 534, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC22_QM" },
+ .name = "DCORE3_TPC4_QM" },
{ .fc_id = 1229, .cpu_id = 535, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC23_QM" },
+ .name = "DCORE3_TPC5_QM" },
{ .fc_id = 1230, .cpu_id = 536, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
- .name = "TPC24_QM" },
+ .name = "DCORE4_TPC0_QM" },
{ .fc_id = 1231, .cpu_id = 537, .valid = 0, .msg = 1, .reset = EVENT_RESET_TYPE_NONE,
.name = "" },
{ .fc_id = 1232, .cpu_id = 538, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE,
--
2.34.1


2024-05-27 12:19:09

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 7/8] accel/habanalabs: add an EQ size ASIC property

From: Tomer Tayar <[email protected]>

Future supported ASICs might use the dynamic EQ mechanism with the
firmware, and in that case the EQ size won't be equal to the default
HL_EQ_SIZE_IN_BYTES value.
Add an ASIC property to enable overriding this value.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/common/habanalabs.h | 5 +++++
drivers/accel/habanalabs/common/irq.c | 8 +++++---
2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 5e9f54ca336a..8d0df685e627 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -651,6 +651,8 @@ struct hl_hints_range {
* @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
* not supported.
* @reserved_fw_mem_size: size of dram memory reserved for FW.
+ * @fw_event_queue_size: queue size for events from CPU-CP.
+ * A value of 0 means using the default HL_EQ_SIZE_IN_BYTES value.
* @collective_first_sob: first sync object available for collective use
* @collective_first_mon: first monitor available for collective use
* @sync_stream_first_sob: first sync object available for sync stream use
@@ -782,6 +784,7 @@ struct asic_fixed_properties {
u32 glbl_err_max_cause_num;
u32 hbw_flush_reg;
u32 reserved_fw_mem_size;
+ u32 fw_event_queue_size;
u16 collective_first_sob;
u16 collective_first_mon;
u16 sync_stream_first_sob;
@@ -1229,6 +1232,7 @@ struct hl_user_pending_interrupt {
* @hdev: pointer to the device structure
* @kernel_address: holds the queue's kernel virtual address
* @bus_address: holds the queue's DMA address
+ * @size: the event queue size
* @ci: ci inside the queue
* @prev_eqe_index: the index of the previous event queue entry. The index of
* the current entry's index must be +1 of the previous one.
@@ -1240,6 +1244,7 @@ struct hl_eq {
struct hl_device *hdev;
void *kernel_address;
dma_addr_t bus_address;
+ u32 size;
u32 ci;
u32 prev_eqe_index;
bool check_eqe_index;
diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c
index 978b7f4d5eeb..2caf2df4de08 100644
--- a/drivers/accel/habanalabs/common/irq.c
+++ b/drivers/accel/habanalabs/common/irq.c
@@ -652,14 +652,16 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q)
*/
int hl_eq_init(struct hl_device *hdev, struct hl_eq *q)
{
+ u32 size = hdev->asic_prop.fw_event_queue_size ? : HL_EQ_SIZE_IN_BYTES;
void *p;

- p = hl_cpu_accessible_dma_pool_alloc(hdev, HL_EQ_SIZE_IN_BYTES, &q->bus_address);
+ p = hl_cpu_accessible_dma_pool_alloc(hdev, size, &q->bus_address);
if (!p)
return -ENOMEM;

q->hdev = hdev;
q->kernel_address = p;
+ q->size = size;
q->ci = 0;
q->prev_eqe_index = 0;

@@ -678,7 +680,7 @@ void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q)
{
flush_workqueue(hdev->eq_wq);

- hl_cpu_accessible_dma_pool_free(hdev, HL_EQ_SIZE_IN_BYTES, q->kernel_address);
+ hl_cpu_accessible_dma_pool_free(hdev, q->size, q->kernel_address);
}

void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q)
@@ -693,5 +695,5 @@ void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q)
* when the device is operational again
*/

- memset(q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES);
+ memset(q->kernel_address, 0, q->size);
}
--
2.34.1


2024-05-27 12:20:05

by Ofir Bitton

[permalink] [raw]
Subject: [PATCH 4/8] accel/habanalabs/gaudi2: revise return value handling in gaudi2_hbm_sei_handle_read_err()

From: Tomer Tayar <[email protected]>

The return value in gaudi2_hbm_sei_handle_read_err() is boolean and not
a bitmask, so there is need for "|= true".
In addition, rename the 'rc' variable, as no "return code" is returned
here but an indication if a hard reset is required.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Ofir Bitton <[email protected]>
---
drivers/accel/habanalabs/gaudi2/gaudi2.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 08276f03c80f..18cc7b773650 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9263,8 +9263,8 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type,
static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev,
struct hl_eq_hbm_sei_read_err_intr_info *rd_err_data, u32 err_cnt)
{
+ bool require_hard_reset = false;
u32 addr, beat, beat_shift;
- bool rc = false;

dev_err_ratelimited(hdev->dev,
"READ ERROR count: ECC SERR: %d, ECC DERR: %d, RD_PARITY: %d\n",
@@ -9296,7 +9296,7 @@ static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev,
beat,
le32_to_cpu(rd_err_data->dbg_rd_err_dm),
le32_to_cpu(rd_err_data->dbg_rd_err_syndrome));
- rc |= true;
+ require_hard_reset = true;
}

beat_shift = beat * HBM_RD_ERR_BEAT_SHIFT;
@@ -9309,7 +9309,7 @@ static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev,
(le32_to_cpu(rd_err_data->dbg_rd_err_misc) &
(HBM_RD_ERR_PAR_DATA_BEAT0_MASK << beat_shift)) >>
(HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT + beat_shift));
- rc |= true;
+ require_hard_reset = true;
}

dev_err_ratelimited(hdev->dev, "Beat%d DQ data:\n", beat);
@@ -9319,7 +9319,7 @@ static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev,
le32_to_cpu(rd_err_data->dbg_rd_err_data[beat * 2 + 1]));
}

- return rc;
+ return require_hard_reset;
}

static void gaudi2_hbm_sei_print_wr_par_info(struct hl_device *hdev,
--
2.34.1