2024-02-20 16:01:41

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions

From: Ohad Sharabi <[email protected]>

Currently, the code contains 2 types of FW version comparison functions:
- hl_is_fw_sw_ver_[below/equal_or_greater]()
- gaudi2 specific function of the type
gaudi2_is_fw_ver_[below/above]x_y_z()

Moreover, some functions use the inner FW version which should be only
stage during development but not version dependencies.

Finally, some tests are done to deprecated FW version to which LKD
should hold no compatibility.

This commit aligns all APIs to a single function that just compares the
version and return an integers indicator (similar in some way to
strcmp()).

In addition, this generic function now considers also the sub-minor FW
version and also remove dead code resulting in deprecated FW versions
compatibility.

Signed-off-by: Ohad Sharabi <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++
drivers/accel/habanalabs/common/habanalabs.h | 20 +------
drivers/accel/habanalabs/gaudi2/gaudi2.c | 57 +++----------------
3 files changed, 34 insertions(+), 68 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 3558a6a8e192..e7dcf2fe6552 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -40,6 +40,31 @@ static char *comms_sts_str_arr[COMMS_STS_INVLD_LAST] = {
[COMMS_STS_TIMEOUT_ERR] = __stringify(COMMS_STS_TIMEOUT_ERR),
};

+/**
+ * hl_fw_version_cmp() - compares the FW version to a specific version
+ *
+ * @hdev: pointer to hl_device structure
+ * @major: major number of a reference version
+ * @minor: minor number of a reference version
+ * @subminor: sub-minor number of a reference version
+ *
+ * Return 1 if FW version greater than the reference version, -1 if it's
+ * smaller and 0 if versions are identical.
+ */
+int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor)
+{
+ if (hdev->fw_sw_major_ver != major)
+ return (hdev->fw_sw_major_ver > major) ? 1 : -1;
+
+ if (hdev->fw_sw_minor_ver != minor)
+ return (hdev->fw_sw_minor_ver > minor) ? 1 : -1;
+
+ if (hdev->fw_sw_sub_minor_ver != subminor)
+ return (hdev->fw_sw_sub_minor_ver > subminor) ? 1 : -1;
+
+ return 0;
+}
+
static char *extract_fw_ver_from_str(const char *fw_str)
{
char *str, *fw_ver, *whitespace;
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 7397ce86b7f0..634a470efe27 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3600,25 +3600,6 @@ struct hl_ioctl_desc {
hl_ioctl_t *func;
};

-static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major, u32 fw_sw_minor)
-{
- if (hdev->fw_sw_major_ver < fw_sw_major)
- return true;
- if (hdev->fw_sw_major_ver > fw_sw_major)
- return false;
- if (hdev->fw_sw_minor_ver < fw_sw_minor)
- return true;
- return false;
-}
-
-static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major,
- u32 fw_sw_minor)
-{
- return (hdev->fw_sw_major_ver > fw_sw_major ||
- (hdev->fw_sw_major_ver == fw_sw_major &&
- hdev->fw_sw_minor_ver >= fw_sw_minor));
-}
-
/*
* Kernel module functions that can be accessed by entire module
*/
@@ -3923,6 +3904,7 @@ void hl_mmu_dr_flush(struct hl_ctx *ctx);
int hl_mmu_dr_init(struct hl_device *hdev);
void hl_mmu_dr_fini(struct hl_device *hdev);

+int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor);
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
void __iomem *dst, u32 src_offset, u32 size);
int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value);
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 1f061209ae21..4a0917aa4dd7 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)

prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;

+ prop->supports_advanced_cpucp_rc = true;
+
return 0;

free_qprops:
@@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device *hdev)
struct gaudi2_device *gaudi2 = hdev->asic_specific;
int rc;

- hdev->asic_prop.supports_advanced_cpucp_rc = true;
-
rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
gaudi2->virt_msix_db_dma_addr);
if (rc) {
@@ -3783,7 +3783,7 @@ static int gaudi2_sw_init(struct hl_device *hdev)
prop->supports_compute_reset = true;

/* Event queue sanity check added in FW version 1.11 */
- if (hl_is_fw_sw_ver_below(hdev, 1, 11))
+ if (hl_fw_version_cmp(hdev, 1, 11, 0) < 0)
hdev->event_queue.check_eqe_index = false;
else
hdev->event_queue.check_eqe_index = true;
@@ -6314,26 +6314,6 @@ static void gaudi2_execute_hard_reset(struct hl_device *hdev)
WREG32(mmPSOC_RESET_CONF_SW_ALL_RST, 1);
}

-static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_timeout_us)
-{
- int i, rc = 0;
- u32 reg_val;
-
- for (i = 0 ; i < GAUDI2_RESET_POLL_CNT ; i++)
- rc = hl_poll_timeout(
- hdev,
- mmCPU_RST_STATUS_TO_HOST,
- reg_val,
- reg_val == CPU_RST_STATUS_SOFT_RST_DONE,
- 1000,
- poll_timeout_us);
-
- if (rc)
- dev_err(hdev->dev, "Timeout while waiting for FW to complete soft reset (0x%x)\n",
- reg_val);
- return rc;
-}
-
/**
* gaudi2_execute_soft_reset - execute soft reset by driver/FW
*
@@ -6346,23 +6326,8 @@ static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_
static int gaudi2_execute_soft_reset(struct hl_device *hdev, bool driver_performs_reset,
u32 poll_timeout_us)
{
- int rc;
-
- if (!driver_performs_reset) {
- if (hl_is_fw_sw_ver_below(hdev, 1, 10)) {
- /* set SP to indicate reset request sent to FW */
- WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA);
-
- WREG32(mmGIC_HOST_SOFT_RST_IRQ_POLL_REG,
- gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id);
-
- /* wait for f/w response */
- rc = gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us);
- } else {
- rc = hl_fw_send_soft_reset(hdev);
- }
- return rc;
- }
+ if (!driver_performs_reset)
+ return hl_fw_send_soft_reset(hdev);

/* Block access to engines, QMANs and SM during reset, these
* RRs will be reconfigured after soft reset.
@@ -7914,7 +7879,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
bool has_block_id = false;
u16 block_id;

- if (!hl_is_fw_sw_ver_below(hdev, 1, 12))
+ if (hl_fw_version_cmp(hdev, 1, 12, 0) >= 0)
has_block_id = true;

ecc_address = le64_to_cpu(ecc_data->ecc_address);
@@ -8165,13 +8130,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
}

hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx];
-
- if (hl_is_fw_sw_ver_below(hdev, 1, 9) &&
- !hdev->asic_prop.fw_security_enabled &&
- ((module_idx == 0) || (module_idx == 1)))
- lbw_rtr_id = DCORE0_RTR0;
- else
- lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx];
+ lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx];
break;
case RAZWI_MME:
sprintf(initiator_name, "MME_%u", module_idx);
@@ -10080,7 +10039,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
- if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13))
+ if (hl_fw_version_cmp(hdev, 1, 13, 0) >= 0)
is_critical = true;
break;

--
2.34.1



2024-02-20 16:02:01

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log

From: Tomer Tayar <[email protected]>

Skip loading a linux FW image into the device with the current supported
ASICs is done for test purposes only.
Moreover, for future supported ASICs it is possible that there won't be
a need to load such an image.
The print in such a case is therefore not needed in most cases, so
replace the used dev_info() with dev_dbg().

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index e7dcf2fe6552..364d292c76fa 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2820,7 +2820,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->asic_funcs->init_cpu_scrambler_dram(hdev);

if (!(hdev->fw_components & FW_TYPE_LINUX)) {
- dev_info(hdev->dev, "Skip loading Linux F/W\n");
+ dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
return 0;
}

--
2.34.1


2024-02-20 16:02:12

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info

From: Tomer Tayar <[email protected]>

The FW interrupt info for a PCIe addr_dec event is set correctly, so
check for either global errors or razwi according to the indications
there.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/gaudi2/gaudi2.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 26975179763a..671241735a6e 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -8942,9 +8942,6 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ
u32 error_count = 0;
int i;

- gaudi2_print_event(hdev, event_type, true,
- "intr_cause_data: %#llx", intr_cause_data);
-
for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE ; i++) {
if (!(intr_cause_data & BIT_ULL(i)))
continue;
@@ -8953,15 +8950,16 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ
"err cause: %s", gaudi2_pcie_addr_dec_error_cause[i]);
error_count++;

- /*
- * Always check for LBW and HBW additional info as the indication itself is
- * sometimes missing
- */
+ switch (intr_cause_data & BIT_ULL(i)) {
+ case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK:
+ hl_check_for_glbl_errors(hdev);
+ break;
+ case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK:
+ gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask);
+ break;
+ }
}

- hl_check_for_glbl_errors(hdev);
- gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask);
-
return error_count;
}

--
2.34.1


2024-02-20 16:02:32

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 05/13] accel/habanalabs: fix glbl error cause handling

From: Tomer Tayar <[email protected]>

The glbl error cause handling has a wrong assumption that all error
bits are consecutive.
Fix the handling to check all relevant error bits per ASIC.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/habanalabs.h | 4 +--
drivers/accel/habanalabs/common/security.c | 33 +++++++++++++++-----
drivers/accel/habanalabs/common/security.h | 3 +-
drivers/accel/habanalabs/gaudi2/gaudi2.c | 10 +++---
drivers/accel/habanalabs/gaudi2/gaudi2P.h | 3 +-
5 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index c85849aefba6..40107a4eba93 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -647,7 +647,7 @@ struct hl_hints_range {
* @num_engine_cores: number of engine cpu cores.
* @max_num_of_engines: maximum number of all engines in the ASIC.
* @num_of_special_blocks: special_blocks array size.
- * @glbl_err_cause_num: global err cause number.
+ * @glbl_err_max_cause_num: global err max cause number.
* @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
* not supported.
* @reserved_fw_mem_size: size in MB of dram memory reserved for FW.
@@ -779,7 +779,7 @@ struct asic_fixed_properties {
u32 num_engine_cores;
u32 max_num_of_engines;
u32 num_of_special_blocks;
- u32 glbl_err_cause_num;
+ u32 glbl_err_max_cause_num;
u32 hbw_flush_reg;
u32 reserved_fw_mem_size;
u16 collective_first_sob;
diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c
index fe913965dbad..5402a3cd0491 100644
--- a/drivers/accel/habanalabs/common/security.c
+++ b/drivers/accel/habanalabs/common/security.c
@@ -7,15 +7,31 @@

#include "habanalabs.h"

-static const char * const hl_glbl_error_cause[HL_MAX_NUM_OF_GLBL_ERR_CAUSE] = {
+static const char * const hl_glbl_error_cause[] = {
"Error due to un-priv read",
"Error due to un-secure read",
"Error due to read from unmapped reg",
"Error due to un-priv write",
"Error due to un-secure write",
"Error due to write to unmapped reg",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
"External I/F write sec violation",
"External I/F write to un-mapped reg",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
+ "N/A",
"Read to write only",
"Write to read only"
};
@@ -671,10 +687,11 @@ static bool hl_check_block_range_exclusion(struct hl_device *hdev,
static int hl_read_glbl_errors(struct hl_device *hdev,
u32 blk_idx, u32 major, u32 minor, u32 sub_minor, void *data)
{
- struct hl_special_block_info *special_blocks = hdev->asic_prop.special_blocks;
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+ struct hl_special_block_info *special_blocks = prop->special_blocks;
struct hl_special_block_info *current_block = &special_blocks[blk_idx];
u32 glbl_err_addr, glbl_err_cause, addr_val, cause_val, block_base,
- base = current_block->base_addr - lower_32_bits(hdev->asic_prop.cfg_base_address);
+ base = current_block->base_addr - lower_32_bits(prop->cfg_base_address);
int i;

block_base = base + major * current_block->major_offset +
@@ -689,13 +706,13 @@ static int hl_read_glbl_errors(struct hl_device *hdev,
glbl_err_addr = block_base + HL_GLBL_ERR_ADDR_OFFSET;
addr_val = RREG32(glbl_err_addr);

- for (i = 0 ; i < hdev->asic_prop.glbl_err_cause_num ; i++) {
+ for (i = 0 ; i <= prop->glbl_err_max_cause_num ; i++) {
if (cause_val & BIT(i))
dev_err_ratelimited(hdev->dev,
- "%s, addr %#llx\n",
- hl_glbl_error_cause[i],
- hdev->asic_prop.cfg_base_address + block_base +
- FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val));
+ "%s, addr %#llx\n",
+ hl_glbl_error_cause[i],
+ prop->cfg_base_address + block_base +
+ FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val));
}

WREG32(glbl_err_cause, cause_val);
diff --git a/drivers/accel/habanalabs/common/security.h b/drivers/accel/habanalabs/common/security.h
index d7a3b3e82ea4..476f70687c09 100644
--- a/drivers/accel/habanalabs/common/security.h
+++ b/drivers/accel/habanalabs/common/security.h
@@ -13,8 +13,7 @@
struct hl_device;

/* special blocks */
-#define HL_MAX_NUM_OF_GLBL_ERR_CAUSE 10
-#define HL_GLBL_ERR_ADDRESS_MASK GENMASK(11, 0)
+#define HL_GLBL_ERR_ADDRESS_MASK GENMASK(11, 0)
/* GLBL_ERR_ADDR register offset from the start of the block */
#define HL_GLBL_ERR_ADDR_OFFSET 0xF44
/* GLBL_ERR_CAUSE register offset from the start of the block */
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 671241735a6e..189d8da6a624 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -158,11 +158,13 @@
#define RAZWI_INITIATOR_ID_X_Y(xl, yl, xh) \
(RAZWI_INITIATOR_ID_X_Y_LOW(xl, yl) | RAZWI_INITIATOR_ID_X_HIGH(xh))

-#define PSOC_RAZWI_ENG_STR_SIZE 128
-#define PSOC_RAZWI_MAX_ENG_PER_RTR 5
+#define PSOC_RAZWI_ENG_STR_SIZE 128
+#define PSOC_RAZWI_MAX_ENG_PER_RTR 5

/* HW scrambles only bits 0-25 */
-#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26)
+#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26)
+
+#define GAUDI2_GLBL_ERR_MAX_CAUSE_NUM 17

struct gaudi2_razwi_info {
u32 axuser_xy;
@@ -3587,7 +3589,7 @@ static int gaudi2_special_blocks_config(struct hl_device *hdev)
int i, rc;

/* Configure Special blocks */
- prop->glbl_err_cause_num = GAUDI2_NUM_OF_GLBL_ERR_CAUSE;
+ prop->glbl_err_max_cause_num = GAUDI2_GLBL_ERR_MAX_CAUSE_NUM;
prop->num_of_special_blocks = ARRAY_SIZE(gaudi2_special_blocks);
prop->special_blocks = kmalloc_array(prop->num_of_special_blocks,
sizeof(*prop->special_blocks), GFP_KERNEL);
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
index bc508c9cee5c..eee41387b269 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h
@@ -237,9 +237,8 @@
#define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \
FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1))

-#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)
+#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0)

-#define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8

enum gaudi2_reserved_sob_id {
GAUDI2_RESERVED_SOB_CS_COMPLETION_FIRST,
--
2.34.1


2024-02-20 16:02:57

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables

From: Tal Risin <[email protected]>

Prevent static analysis warning.

Signed-off-by: Tal Risin <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/debugfs.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index ab0fe74b49d0..b1c88d1837d9 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -484,7 +484,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
- char kbuf[MMU_KBUF_SIZE];
+ char kbuf[MMU_KBUF_SIZE] = {0};
char *c;
ssize_t rc;

@@ -546,7 +546,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file,
struct hl_debugfs_entry *entry = s->private;
struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
struct hl_device *hdev = dev_entry->hdev;
- char kbuf[MMU_KBUF_SIZE];
+ char kbuf[MMU_KBUF_SIZE] = {0};
ssize_t rc;

if (count > sizeof(kbuf) - 1)
--
2.34.1


2024-02-20 16:03:09

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 08/13] accel/habanalabs: fix error print

From: Dani Liberman <[email protected]>

The unmasking is for event and it can be other event than RAZWI.

Signed-off-by: Dani Liberman <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 364d292c76fa..a3df7cf162d8 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -526,7 +526,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
0, &result);

if (rc)
- dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
+ dev_err(hdev->dev, "failed to unmask event %d", event_type);

return rc;
}
@@ -565,7 +565,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
total_pkt_size, 0, &result);

if (rc)
- dev_err(hdev->dev, "failed to unmask IRQ array\n");
+ dev_err(hdev->dev, "failed to unmask event array\n");

kfree(pkt);

--
2.34.1


2024-02-20 16:03:33

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate

From: Ofir Bitton <[email protected]>

Fetching sensor data can fail due to various reasons. In order
not to pollute the kernel log, those error prints must be
rate limited.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/hwmon.c | 29 +++++++++++++------------
1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c
index 1ee2ee07e9ed..36b951b5f503 100644
--- a/drivers/accel/habanalabs/common/hwmon.c
+++ b/drivers/accel/habanalabs/common/hwmon.c
@@ -46,7 +46,7 @@ static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types
break;

default:
- dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type);
+ dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type);
flags = cpucp_flags;
break;
}
@@ -134,7 +134,7 @@ static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types ty
break;

default:
- dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type);
+ dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type);
flags = cpucp_flags;
break;
}
@@ -162,7 +162,8 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sen
break;

if (type >= HWMON_NR_SENSOR_TYPES) {
- dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type);
+ dev_err_ratelimited(hdev->dev,
+ "Got wrong sensor type %d from device\n", type);
return -EINVAL;
}

@@ -584,7 +585,7 @@ int hl_get_temperature(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to get temperature from sensor %d, error %d\n",
sensor_index, rc);
*value = 0;
@@ -611,7 +612,7 @@ int hl_set_temperature(struct hl_device *hdev,
0, NULL);

if (rc)
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to set temperature of sensor %d, error %d\n",
sensor_index, rc);

@@ -638,7 +639,7 @@ int hl_get_voltage(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to get voltage from sensor %d, error %d\n",
sensor_index, rc);
*value = 0;
@@ -667,7 +668,7 @@ int hl_get_current(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to get current from sensor %d, error %d\n",
sensor_index, rc);
*value = 0;
@@ -696,7 +697,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to get fan speed from sensor %d, error %d\n",
sensor_index, rc);
*value = 0;
@@ -725,7 +726,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to get pwm info from sensor %d, error %d\n",
sensor_index, rc);
*value = 0;
@@ -752,7 +753,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
0, NULL);

if (rc)
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to set pwm info to sensor %d, error %d\n",
sensor_index, rc);
}
@@ -775,7 +776,7 @@ int hl_set_voltage(struct hl_device *hdev,
0, NULL);

if (rc)
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to set voltage of sensor %d, error %d\n",
sensor_index, rc);

@@ -800,7 +801,7 @@ int hl_set_current(struct hl_device *hdev,
0, NULL);

if (rc)
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to set current of sensor %d, error %d\n",
sensor_index, rc);

@@ -831,7 +832,7 @@ int hl_set_power(struct hl_device *hdev,
0, NULL);

if (rc)
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to set power of sensor %d, error %d\n",
sensor_index, rc);

@@ -858,7 +859,7 @@ int hl_get_power(struct hl_device *hdev,
*value = (long) result;

if (rc) {
- dev_err(hdev->dev,
+ dev_err_ratelimited(hdev->dev,
"Failed to get power of sensor %d, error %d\n",
sensor_index, rc);
*value = 0;
--
2.34.1


2024-02-20 16:04:03

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW

From: Tomer Tayar <[email protected]>

The reserved memory for FW is currently saved in an ASIC property in
units of MB, just like the value that comes from FW.
Except the fact that it is not clear from the property's name, it means
also that a calculation to actual size is required everywhere that it is
used.
Modify the property to hold the size in bytes.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 2 +-
drivers/accel/habanalabs/common/habanalabs.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index 4246162b6807..348418643709 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2749,7 +2749,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,

if (hdev->asic_prop.support_dynamic_resereved_fw_size)
hdev->asic_prop.reserved_fw_mem_size =
- le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
+ le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;

if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
struct lkd_fw_binning_info *binning_info;
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 40107a4eba93..55495861f432 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -650,7 +650,7 @@ struct hl_hints_range {
* @glbl_err_max_cause_num: global err max cause number.
* @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is
* not supported.
- * @reserved_fw_mem_size: size in MB of dram memory reserved for FW.
+ * @reserved_fw_mem_size: size of dram memory reserved for FW.
* @collective_first_sob: first sync object available for collective use
* @collective_first_mon: first monitor available for collective use
* @sync_stream_first_sob: first sync object available for sync stream use
--
2.34.1


2024-02-20 16:04:15

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 13/13] accel/habanalabs: modify pci health check

From: Ofir Bitton <[email protected]>

Today we read PCI VENDOR-ID in order to make sure PCI link is
healthy. Apparently the VENDOR-ID might be stored on host and
hence, when we read it we might not access the PCI bus.
In order to make sure PCI health check is reliable, we will start
checking the DEVICE-ID instead.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 3b9e8a21d7df..8f92445c5a90 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1035,14 +1035,14 @@ static void device_early_fini(struct hl_device *hdev)

static bool is_pci_link_healthy(struct hl_device *hdev)
{
- u16 vendor_id;
+ u16 device_id;

if (!hdev->pdev)
return false;

- pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
+ pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id);

- return (vendor_id == PCI_VENDOR_ID_HABANALABS);
+ return (device_id == hdev->pdev->device);
}

static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
--
2.34.1


2024-02-20 16:05:12

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 02/13] accel/habanalabs: remove hop size from asic properties

From: Farah Kassabri <[email protected]>

The hop size related properties is a MMU properties and not
asic properties.
As for PMMU and HMMU we could have different sizes.

Signed-off-by: Farah Kassabri <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/habanalabs.h | 4 ----
drivers/accel/habanalabs/common/mmu/mmu.c | 22 ++++++++---------
.../accel/habanalabs/common/mmu/mmu_v2_hr.c | 24 +++++++++----------
drivers/accel/habanalabs/gaudi/gaudi.c | 8 +++----
drivers/accel/habanalabs/gaudi2/gaudi2.c | 12 ++++------
drivers/accel/habanalabs/goya/goya.c | 12 ++++------
6 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 634a470efe27..c85849aefba6 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -594,8 +594,6 @@ struct hl_hints_range {
* we display to the user
* @mmu_pgt_size: MMU page tables total size.
* @mmu_pte_size: PTE size in MMU page tables.
- * @mmu_hop_table_size: MMU hop table size.
- * @mmu_hop0_tables_total_size: total size of MMU hop0 tables.
* @dram_page_size: The DRAM physical page size.
* @cfg_size: configuration space size on SRAM.
* @sram_size: total size of SRAM.
@@ -747,8 +745,6 @@ struct asic_fixed_properties {
u32 clk_pll_index;
u32 mmu_pgt_size;
u32 mmu_pte_size;
- u32 mmu_hop_table_size;
- u32 mmu_hop0_tables_total_size;
u32 dram_page_size;
u32 cfg_size;
u32 sram_size;
diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c
index fa7919dba783..d3eaab908457 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu.c
@@ -1236,7 +1236,7 @@ void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
struct hl_device *hdev = ctx->hdev;

gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
- hdev->asic_prop.mmu_hop_table_size);
+ hdev->asic_prop.dmmu.hop_table_size);
hash_del(&pgt_info->node);
kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
kfree(pgt_info);
@@ -1245,18 +1245,18 @@ void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info)
u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx)
{
return ctx->hdev->asic_prop.mmu_pgt_addr +
- (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+ (ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size);
}

u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx)
{
return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
- (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+ (ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size);
}

u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
{
- u64 page_mask = ctx->hdev->asic_prop.mmu_hop_table_size - 1;
+ u64 page_mask = ctx->hdev->asic_prop.dmmu.hop_table_size - 1;
u64 shadow_hop_addr = shadow_addr & (~page_mask);
u64 pte_offset = shadow_addr & page_mask;
u64 phys_hop_addr;
@@ -1326,13 +1326,13 @@ u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)
return ULLONG_MAX;

phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool,
- prop->mmu_hop_table_size);
+ prop->dmmu.hop_table_size);
if (!phys_addr) {
dev_err(hdev->dev, "failed to allocate page\n");
goto pool_add_err;
}

- shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
+ shadow_addr = (u64) (uintptr_t) kzalloc(prop->dmmu.hop_table_size,
GFP_KERNEL);
if (!shadow_addr)
goto shadow_err;
@@ -1347,7 +1347,7 @@ u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx)

shadow_err:
gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool,
- phys_addr, prop->mmu_hop_table_size);
+ phys_addr, prop->dmmu.hop_table_size);
pool_add_err:
kfree(pgt_info);

@@ -1379,7 +1379,7 @@ int hl_mmu_dr_init(struct hl_device *hdev)
int rc;

hdev->mmu_priv.dr.mmu_pgt_pool =
- gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
+ gen_pool_create(__ffs(prop->dmmu.hop_table_size), -1);

if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
dev_err(hdev->dev, "Failed to create page gen pool\n");
@@ -1387,8 +1387,8 @@ int hl_mmu_dr_init(struct hl_device *hdev)
}

rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
- prop->mmu_hop0_tables_total_size,
- prop->dmmu.pgt_size - prop->mmu_hop0_tables_total_size,
+ prop->dmmu.hop0_tables_total_size,
+ prop->dmmu.pgt_size - prop->dmmu.hop0_tables_total_size,
-1);
if (rc) {
dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
@@ -1396,7 +1396,7 @@ int hl_mmu_dr_init(struct hl_device *hdev)
}

hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid,
- prop->mmu_hop_table_size, GFP_KERNEL);
+ prop->dmmu.hop_table_size, GFP_KERNEL);
if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
rc = -ENOMEM;
goto err_pool_add;
diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c
index afe7ef964f82..31507b2a431b 100644
--- a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c
+++ b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c
@@ -47,7 +47,7 @@ static inline int hl_mmu_v2_hr_init(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;

- return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size,
+ return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size,
prop->mmu_pgt_size);
}

@@ -65,7 +65,7 @@ static inline void hl_mmu_v2_hr_fini(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;

- hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size);
+ hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size);
}

/**
@@ -108,7 +108,7 @@ static void hl_mmu_v2_hr_ctx_fini(struct hl_ctx *ctx)
"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
hl_mmu_hr_free_hop_remove_pgt(pgt_info, &ctx->hdev->mmu_priv.hr,
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);
}
}

@@ -150,7 +150,7 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx,

curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i],
hop_pte_phys_addr[i],
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);

if ((i < hop_last) && (curr_pte & mmu_prop->last_mask)) {
hop_last = i;
@@ -169,14 +169,14 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx,

for (i = hop_last ; i > 0 ; i--) {
hl_mmu_hr_clear_pte(ctx, hops_pgt_info[i], hop_pte_phys_addr[i],
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);

if (hl_mmu_hr_put_pte(ctx, hops_pgt_info[i], &ctx->hdev->mmu_priv.hr,
- ctx->hdev->asic_prop.mmu_hop_table_size))
+ ctx->hdev->asic_prop.pmmu.hop_table_size))
goto mapped;
}
hl_mmu_hr_clear_pte(ctx, hops_pgt_info[0], hop_pte_phys_addr[0],
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);

mapped:
return 0;
@@ -255,7 +255,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
scrambled_virt_addr);
curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i],
hop_pte_phys_addr[i],
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);
}

if (curr_pte & PAGE_PRESENT_MASK) {
@@ -268,7 +268,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
*(u64 *) (uintptr_t)
hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i],
hop_pte_phys_addr[i],
- ctx->hdev->asic_prop.mmu_hop_table_size),
+ ctx->hdev->asic_prop.pmmu.hop_table_size),
hop_pte_phys_addr[i]);
rc = -EINVAL;
goto err;
@@ -279,7 +279,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,

/* Write the PTEs */
hl_mmu_hr_write_pte(ctx, hops_pgt_info[hop_last], hop_pte_phys_addr[hop_last], curr_pte,
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);

/* for each new hop, add its address to the table of previous-hop */
for (i = 1 ; i <= hop_last ; i++) {
@@ -287,7 +287,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
curr_pte = (hops_pgt_info[i]->phys_addr & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK;
hl_mmu_hr_write_pte(ctx, hops_pgt_info[i - 1], hop_pte_phys_addr[i - 1],
- curr_pte, ctx->hdev->asic_prop.mmu_hop_table_size);
+ curr_pte, ctx->hdev->asic_prop.pmmu.hop_table_size);
if (i - 1)
hl_mmu_hr_get_pte(ctx, &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs,
hops_pgt_info[i - 1]->phys_addr);
@@ -303,7 +303,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx,
for (i = 1 ; i <= hop_last ; i++)
if (hop_new[i] && hops_pgt_info[i])
hl_mmu_hr_free_hop_remove_pgt(hops_pgt_info[i], &ctx->hdev->mmu_priv.hr,
- ctx->hdev->asic_prop.mmu_hop_table_size);
+ ctx->hdev->asic_prop.pmmu.hop_table_size);

return rc;
}
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index dde3839fe0e0..f2b04ffb0ecb 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -614,8 +614,6 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
else
prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
prop->mmu_pte_size = HL_PTE_SIZE;
- prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
- prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;
prop->device_mem_alloc_default_page_size = prop->dram_page_size;
prop->dram_supports_virtual_memory = false;
@@ -637,8 +635,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
prop->pmmu.last_mask = LAST_MASK;
/* TODO: will be duplicated until implementing per-MMU props */
- prop->pmmu.hop_table_size = prop->mmu_hop_table_size;
- prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+ prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+ prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;

/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -3653,7 +3651,7 @@ static int gaudi_mmu_init(struct hl_device *hdev)

for (i = 0 ; i < prop->max_asid ; i++) {
hop0_addr = prop->mmu_pgt_addr +
- (i * prop->mmu_hop_table_size);
+ (i * prop->dmmu.hop_table_size);

rc = gaudi_mmu_update_asid_hop0_addr(hdev, i, hop0_addr);
if (rc) {
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 4a0917aa4dd7..26975179763a 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2467,8 +2467,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)

prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE;
prop->mmu_pte_size = HL_PTE_SIZE;
- prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
- prop->mmu_hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;

prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT;
prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT;
@@ -2482,8 +2480,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
prop->dmmu.num_hops = MMU_ARCH_4_HOPS;
prop->dmmu.last_mask = LAST_MASK;
prop->dmmu.host_resident = 0;
- prop->dmmu.hop_table_size = prop->mmu_hop_table_size;
- prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+ prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+ prop->dmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;

/* As we need to set the pgt address in dram for HMMU init so we cannot
* wait to the fw cpucp info to set the dram props as mmu init comes before
@@ -2500,8 +2498,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
prop->pmmu.host_resident = 1;
prop->pmmu.num_hops = MMU_ARCH_6_HOPS;
prop->pmmu.last_mask = LAST_MASK;
- prop->pmmu.hop_table_size = prop->mmu_hop_table_size;
- prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+ prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+ prop->pmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid;

prop->hints_host_reserved_va_range.start_addr = RESERVED_VA_FOR_VIRTUAL_MSIX_DOORBELL_START;
prop->hints_host_reserved_va_range.end_addr = RESERVED_VA_RANGE_FOR_ARC_ON_HOST_END;
@@ -5934,7 +5932,7 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base,
if (host_resident_pgt)
hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr;
else
- hop0_addr = prop->mmu_pgt_addr + (asid * prop->mmu_hop_table_size);
+ hop0_addr = prop->mmu_pgt_addr + (asid * prop->dmmu.hop_table_size);

rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr);
if (rc) {
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 1322cb330c57..5a359c3bdc78 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -413,8 +413,6 @@ int goya_set_fixed_properties(struct hl_device *hdev)
else
prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
prop->mmu_pte_size = HL_PTE_SIZE;
- prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
- prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;
prop->device_mem_alloc_default_page_size = prop->dram_page_size;
prop->dram_supports_virtual_memory = true;
@@ -435,8 +433,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
prop->dmmu.num_hops = MMU_ARCH_5_HOPS;
prop->dmmu.last_mask = LAST_MASK;
/* TODO: will be duplicated until implementing per-MMU props */
- prop->dmmu.hop_table_size = prop->mmu_hop_table_size;
- prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+ prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+ prop->dmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;

/* shifts and masks are the same in PMMU and DMMU */
memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
@@ -446,8 +444,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
prop->pmmu.last_mask = LAST_MASK;
/* TODO: will be duplicated until implementing per-MMU props */
- prop->pmmu.hop_table_size = prop->mmu_hop_table_size;
- prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size;
+ prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE;
+ prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;

/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -2678,7 +2676,7 @@ int goya_mmu_init(struct hl_device *hdev)

for (i = 0 ; i < prop->max_asid ; i++) {
hop0_addr = prop->mmu_pgt_addr +
- (i * prop->mmu_hop_table_size);
+ (i * prop->dmmu.hop_table_size);

rc = goya_mmu_update_asid_hop0_addr(hdev, i, hop0_addr);
if (rc) {
--
2.34.1


2024-02-20 16:06:26

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 06/13] accel/habanalabs: fix debugfs files permissions

From: Avri Kehat <[email protected]>

debugfs files are created with permissions that don't align
with the access requirements.

Signed-off-by: Avri Kehat <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/debugfs.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c
index 01f071d52570..ab0fe74b49d0 100644
--- a/drivers/accel/habanalabs/common/debugfs.c
+++ b/drivers/accel/habanalabs/common/debugfs.c
@@ -1643,19 +1643,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
&hl_data64b_fops);

debugfs_create_file("set_power_state",
- 0200,
+ 0644,
root,
dev_entry,
&hl_power_fops);

debugfs_create_file("device",
- 0200,
+ 0644,
root,
dev_entry,
&hl_device_fops);

debugfs_create_file("clk_gate",
- 0200,
+ 0644,
root,
dev_entry,
&hl_clk_gate_fops);
@@ -1667,13 +1667,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
&hl_stop_on_err_fops);

debugfs_create_file("dump_security_violations",
- 0644,
+ 0400,
root,
dev_entry,
&hl_security_violations_fops);

debugfs_create_file("dump_razwi_events",
- 0644,
+ 0400,
root,
dev_entry,
&hl_razwi_check_fops);
@@ -1706,7 +1706,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent
&hdev->reset_info.skip_reset_on_timeout);

debugfs_create_file("state_dump",
- 0600,
+ 0644,
root,
dev_entry,
&hl_state_dump_fops);
@@ -1724,7 +1724,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent

for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
debugfs_create_file(hl_debugfs_list[i].name,
- 0444,
+ 0644,
root,
entry,
&hl_debugfs_fops);
--
2.34.1


2024-02-20 16:07:24

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication

From: Ofir Bitton <[email protected]>

Due to a H/W issue, AXI drain event does not include a read/write
indication, hence we remove this print.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 189d8da6a624..ba1518f2bf5c 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -9548,25 +9548,17 @@ static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev, u16 event_type)
static int gaudi2_handle_pcie_drain(struct hl_device *hdev,
struct hl_eq_pcie_drain_ind_data *drain_data)
{
- u64 lbw_rd, lbw_wr, hbw_rd, hbw_wr, cause, error_count = 0;
+ u64 cause, error_count = 0;

cause = le64_to_cpu(drain_data->intr_cause.intr_cause_data);
- lbw_rd = le64_to_cpu(drain_data->drain_rd_addr_lbw);
- lbw_wr = le64_to_cpu(drain_data->drain_wr_addr_lbw);
- hbw_rd = le64_to_cpu(drain_data->drain_rd_addr_hbw);
- hbw_wr = le64_to_cpu(drain_data->drain_wr_addr_hbw);

if (cause & BIT_ULL(0)) {
- dev_err_ratelimited(hdev->dev,
- "PCIE AXI drain LBW completed, read_err %u, write_err %u\n",
- !!lbw_rd, !!lbw_wr);
+ dev_err_ratelimited(hdev->dev, "PCIE AXI drain LBW completed\n");
error_count++;
}

if (cause & BIT_ULL(1)) {
- dev_err_ratelimited(hdev->dev,
- "PCIE AXI drain HBW completed, raddr %#llx, waddr %#llx\n",
- hbw_rd, hbw_wr);
+ dev_err_ratelimited(hdev->dev, "PCIE AXI drain HBW completed\n");
error_count++;
}

--
2.34.1


2024-02-20 16:08:06

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW

From: Tomer Tayar <[email protected]>

Currently the reserved memory request from FW is handled when running
with preboot only, but this request is relevant also when running with
full FW.
Modify to always handle this reservation request.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 19 ++++++++-----------
1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index a3df7cf162d8..4246162b6807 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2743,18 +2743,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
}

+ rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms));
+ if (rc)
+ goto protocol_err;
+
+ if (hdev->asic_prop.support_dynamic_resereved_fw_size)
+ hdev->asic_prop.reserved_fw_mem_size =
+ le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
+
if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
struct lkd_fw_binning_info *binning_info;

- rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader,
- sizeof(struct lkd_msg_comms));
- if (rc)
- goto protocol_err;
-
/* read preboot version */
rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
-
if (rc)
return rc;

@@ -2781,11 +2783,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->decoder_binning, hdev->rotator_binning);
}

- if (hdev->asic_prop.support_dynamic_resereved_fw_size) {
- hdev->asic_prop.reserved_fw_mem_size =
- le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb);
- }
-
return 0;
}

--
2.34.1


2024-02-23 22:38:26

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions

On 2/20/2024 8:01 AM, Oded Gabbay wrote:> From: Ohad Sharabi
<[email protected]>
>
> Currently, the code contains 2 types of FW version comparison functions:
> - hl_is_fw_sw_ver_[below/equal_or_greater]()
> - gaudi2 specific function of the type
> gaudi2_is_fw_ver_[below/above]x_y_z()
>
> Moreover, some functions use the inner FW version which should be only
> stage during development but not version dependencies.
>
> Finally, some tests are done to deprecated FW version to which LKD
> should hold no compatibility.
>
> This commit aligns all APIs to a single function that just compares the
> version and return an integers indicator (similar in some way to
> strcmp()).
>
> In addition, this generic function now considers also the sub-minor FW
> version and also remove dead code resulting in deprecated FW versions
> compatibility.
>
> Signed-off-by: Ohad Sharabi <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>
> ---
> drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++
> drivers/accel/habanalabs/common/habanalabs.h | 20 +------
> drivers/accel/habanalabs/gaudi2/gaudi2.c | 57 +++----------------
> 3 files changed, 34 insertions(+), 68 deletions(-)
>
..
> diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c
b/drivers/accel/habanalabs/gaudi2/gaudi2.c
> index 1f061209ae21..4a0917aa4dd7 100644
> --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
> +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
> @@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct
hl_device *hdev)
>
> prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;
>
> + prop->supports_advanced_cpucp_rc = true;
> +
> return 0;
>
> free_qprops:
> @@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device *hdev)
> struct gaudi2_device *gaudi2 = hdev->asic_specific;
> int rc;
>
> - hdev->asic_prop.supports_advanced_cpucp_rc = true;
> -
> rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
> gaudi2->virt_msix_db_dma_addr);
> if (rc) {

Is this change in support of the others in this patch? Feels like this
should be more than one patch (adding new version_cmp, removing old checks).

-Carl V.

2024-02-23 22:42:58

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 02/13] accel/habanalabs: remove hop size from asic properties


On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Farah Kassabri <[email protected]>
>
> The hop size related properties is a MMU properties and not
> asic properties.
> As for PMMU and HMMU we could have different sizes.
>
> Signed-off-by: Farah Kassabri <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 22:45:57

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 03/13] accel/habanalabs: modify print for skip loading linux FW to debug log


On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <[email protected]>
>
> Skip loading a linux FW image into the device with the current supported
> ASICs is done for test purposes only.
> Moreover, for future supported ASICs it is possible that there won't be
> a need to load such an image.
> The print in such a case is therefore not needed in most cases, so
> replace the used dev_info() with dev_dbg().
>
> Signed-off-by: Tomer Tayar <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 22:49:52

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 04/13] accel/habanalabs/gaudi2: check extended errors according to PCIe addr_dec interrupt info

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <[email protected]>
>
> The FW interrupt info for a PCIe addr_dec event is set correctly, so
> check for either global errors or razwi according to the indications
> there.
>
> Signed-off-by: Tomer Tayar <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:01:57

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 05/13] accel/habanalabs: fix glbl error cause handling

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <[email protected]>
>
> The glbl error cause handling has a wrong assumption that all error
> bits are consecutive.
> Fix the handling to check all relevant error bits per ASIC.
>
> Signed-off-by: Tomer Tayar <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:10:21

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 06/13] accel/habanalabs: fix debugfs files permissions

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Avri Kehat <[email protected]>
>
> debugfs files are created with permissions that don't align
> with the access requirements.
>
> Signed-off-by: Avri Kehat <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:13:13

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 07/13] accel/habanalabs: initialize maybe-uninitialized variables

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tal Risin <[email protected]>
>
> Prevent static analysis warning.
>
> Signed-off-by: Tal Risin <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:15:56

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 08/13] accel/habanalabs: fix error print

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Dani Liberman <[email protected]>
>
> The unmasking is for event and it can be other event than RAZWI.
>
> Signed-off-by: Dani Liberman <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:17:41

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 09/13] accel/habanalabs/gaudi2: drain event lacks rd/wr indication

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Ofir Bitton <[email protected]>
>
> Due to a H/W issue, AXI drain event does not include a read/write
> indication, hence we remove this print.
>
> Signed-off-by: Ofir Bitton <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:21:40

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 10/13] accel/habanalabs/hwmon: rate limit errors user can generate

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Ofir Bitton <[email protected]>
>
> Fetching sensor data can fail due to various reasons. In order
> not to pollute the kernel log, those error prints must be
> rate limited.
>
> Signed-off-by: Ofir Bitton <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:24:53

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 11/13] accel/habanalabs: handle reserved memory request when working with full FW

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <[email protected]>
>
> Currently the reserved memory request from FW is handled when running
> with preboot only, but this request is relevant also when running with
> full FW.
> Modify to always handle this reservation request.
>
> Signed-off-by: Tomer Tayar <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:27:26

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 12/13] accel/habanalabs: keep explicit size of reserved memory for FW

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Tomer Tayar <[email protected]>
>
> The reserved memory for FW is currently saved in an ASIC property in
> units of MB, just like the value that comes from FW.
> Except the fact that it is not clear from the property's name, it means
> also that a calculation to actual size is required everywhere that it is
> used.
> Modify the property to hold the size in bytes.
>
> Signed-off-by: Tomer Tayar <[email protected]>
> Reviewed-by: Oded Gabbay <[email protected]>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Carl Vanderlip <[email protected]>

2024-02-23 23:32:14

by Carl Vanderlip

[permalink] [raw]
Subject: Re: [PATCH 13/13] accel/habanalabs: modify pci health check

On 2/20/2024 8:01 AM, Oded Gabbay wrote:
> From: Ofir Bitton <[email protected]>
>
> Today we read PCI VENDOR-ID in order to make sure PCI link is
> healthy. Apparently the VENDOR-ID might be stored on host and
> hence, when we read it we might not access the PCI bus.
> In order to make sure PCI health check is reliable, we will start
> checking the DEVICE-ID instead.

What's keeping some system from caching that as well?

Since this is checking for PCIe link health, it will be 0xFF when bad.
Checking some part of Config Space that is writable would be more reliable.

-Carl V.

2024-02-24 20:02:15

by Ofir Bitton

[permalink] [raw]
Subject: Re: [PATCH 13/13] accel/habanalabs: modify pci health check

On 24/02/2024 1:32, Carl Vanderlip wrote:
>
> On 2/20/2024 8:01 AM, Oded Gabbay wrote:
>> From: Ofir Bitton <[email protected]>
>>
>> Today we read PCI VENDOR-ID in order to make sure PCI link is
>> healthy. Apparently the VENDOR-ID might be stored on host and
>> hence, when we read it we might not access the PCI bus.
>> In order to make sure PCI health check is reliable, we will start
>> checking the DEVICE-ID instead.
>
> What's keeping some system from caching that as well?

The PCI Controllers/switches we use in Gaudi family products might cache
only the VENDOR-ID and not the DEVICE-ID.

>
> Since this is checking for PCIe link health, it will be 0xFF when bad.
> Checking some part of Config Space that is writable would be more reliable.

Generally speaking I agree but there is no product in the Gaudi family
with a 0xFF DEVICE-ID (nor there will be), so I think this approach is
good enough for our use-case.

--
Ofir

>
> -Carl V.

2024-02-25 11:44:17

by Ohad Sharabi

[permalink] [raw]
Subject: Re: [PATCH 01/13] accel/habanalabs/gaudi2: use single function to compare FW versions

On 24/02/2024 0:38, Carl Vanderlip wrote:
> [You don't often get email from [email protected]. Learn why this
> is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On 2/20/2024 8:01 AM, Oded Gabbay wrote:> From: Ohad Sharabi
> <[email protected]>
> >
> > Currently, the code contains 2 types of FW version comparison
> functions:
> > - hl_is_fw_sw_ver_[below/equal_or_greater]()
> > - gaudi2 specific function of the type
> >    gaudi2_is_fw_ver_[below/above]x_y_z()
> >
> > Moreover, some functions use the inner FW version which should be only
> > stage during development but not version dependencies.
> >
> > Finally, some tests are done to deprecated FW version to which LKD
> > should hold no compatibility.
> >
> > This commit aligns all APIs to a single function that just compares the
> > version and return an integers indicator (similar in some way to
> > strcmp()).
> >
> > In addition, this generic function now considers also the sub-minor FW
> > version and also remove dead code resulting in deprecated FW versions
> > compatibility.
> >
> > Signed-off-by: Ohad Sharabi <[email protected]>
> > Reviewed-by: Oded Gabbay <[email protected]>
> > Signed-off-by: Oded Gabbay <[email protected]>
> > ---
> >   drivers/accel/habanalabs/common/firmware_if.c | 25 ++++++++
> >   drivers/accel/habanalabs/common/habanalabs.h  | 20 +------
> >   drivers/accel/habanalabs/gaudi2/gaudi2.c      | 57
> +++----------------
> >   3 files changed, 34 insertions(+), 68 deletions(-)
> >
> ...
> > diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c
> b/drivers/accel/habanalabs/gaudi2/gaudi2.c
> > index 1f061209ae21..4a0917aa4dd7 100644
> > --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
> > +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
> > @@ -2601,6 +2601,8 @@ static int gaudi2_set_fixed_properties(struct
> hl_device *hdev)
> >
> >      prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0;
> >
> > +    prop->supports_advanced_cpucp_rc = true;
> > +
> >      return 0;
> >
> >   free_qprops:
> > @@ -3308,8 +3310,6 @@ static int gaudi2_late_init(struct hl_device
> *hdev)
> >      struct gaudi2_device *gaudi2 = hdev->asic_specific;
> >      int rc;
> >
> > -    hdev->asic_prop.supports_advanced_cpucp_rc = true;
> > -
> >      rc = hl_fw_send_pci_access_msg(hdev,
> CPUCP_PACKET_ENABLE_PCI_ACCESS,
> > gaudi2->virt_msix_db_dma_addr);
> >      if (rc) {

Carl,

Sure, we'll split the patches

Ohad

>
> Is this change in support of the others in this patch? Feels like this
> should be more than one patch (adding new version_cmp, removing old
> checks).
>
> -Carl V.