2023-01-08 17:27:03

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings'

From: Koby Elbaz <[email protected]>

When HL_INFO_USER_MAPPINGS IOCTL is called, we copy_to_user from
a dynamically allocated memory - 'user_mappings'.
Since freeing/allocating it happens in runtime (upon a page fault),
it not unlikely to access it even before being initially allocated
(i.e., accessing a NULL pointer).

The solution is to simply mark the spot when the err info has been
collected, and that way to know whether err info (either page fault
or RAZWI) is available to be read.

Signed-off-by: Koby Elbaz <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 4 +++
drivers/accel/habanalabs/common/habanalabs.h | 4 +++
.../accel/habanalabs/common/habanalabs_drv.c | 2 ++
.../habanalabs/common/habanalabs_ioctl.c | 36 ++++++++++++-------
4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index e1b5a2c34986..6a05ab3fda23 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2441,6 +2441,8 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0],
num_of_engines * sizeof(u16));
razwi_info->razwi.flags = flags;
+
+ razwi_info->razwi_info_available = true;
}

void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
@@ -2526,6 +2528,8 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is
pgf_info->page_fault.addr = addr;
pgf_info->page_fault.engine_id = eng_id;
hl_capture_user_mappings(hdev, is_pmmu);
+
+ pgf_info->page_fault_info_available = true;
}

void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index e578645acba9..cd474422163d 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -2984,12 +2984,14 @@ struct undefined_opcode_info {
* Since we're looking for the page-fault's root cause,
* we don't care of the others that might follow it-
* so once changed to 1, it will remain that way.
+ * @page_fault_info_available: indicates that a page fault info is now available.
*/
struct page_fault_info {
struct hl_page_fault_info page_fault;
struct hl_user_mapping *user_mappings;
u64 num_of_user_mappings;
atomic_t page_fault_detected;
+ bool page_fault_info_available;
};

/**
@@ -3000,10 +3002,12 @@ struct page_fault_info {
* Since we're looking for the RAZWI's root cause,
* we don't care of the others that might follow it-
* so once changed to 1, it will remain that way.
+ * @razwi_info_available: indicates that a RAZWI info is now available.
*/
struct razwi_info {
struct hl_info_razwi_event razwi;
atomic_t razwi_detected;
+ bool razwi_info_available;
};

/**
diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c
index d7fe0af33bca..03dae57dc838 100644
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -225,6 +225,8 @@ int hl_device_open(struct inode *inode, struct file *filp)
atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0);
atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0);
hdev->captured_err_info.undef_opcode.write_enable = true;
+ hdev->captured_err_info.razwi_info.razwi_info_available = false;
+ hdev->captured_err_info.page_fault_info.page_fault_info_available = false;

hdev->open_counter++;
hdev->last_successful_open_jif = jiffies;
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index 949d38527160..72493bf94ba3 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -607,16 +607,20 @@ static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args)

static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
struct hl_device *hdev = hpriv->hdev;
u32 max_size = args->return_size;
- struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi_info.razwi;
- void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+ struct razwi_info *razwi_info;

if ((!max_size) || (!out))
return -EINVAL;

- return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event)))
- ? -EFAULT : 0;
+ razwi_info = &hdev->captured_err_info.razwi_info;
+ if (!razwi_info->razwi_info_available)
+ return 0;
+
+ return copy_to_user(out, &razwi_info->razwi,
+ min_t(size_t, max_size, sizeof(struct hl_info_razwi_event))) ? -EFAULT : 0;
}

static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
@@ -786,16 +790,20 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args)

static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
struct hl_device *hdev = hpriv->hdev;
u32 max_size = args->return_size;
- struct hl_page_fault_info *info = &hdev->captured_err_info.page_fault_info.page_fault;
- void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+ struct page_fault_info *pgf_info;

if ((!max_size) || (!out))
return -EINVAL;

- return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info)))
- ? -EFAULT : 0;
+ pgf_info = &hdev->captured_err_info.page_fault_info;
+ if (!pgf_info->page_fault_info_available)
+ return 0;
+
+ return copy_to_user(out, &pgf_info->page_fault,
+ min_t(size_t, max_size, sizeof(struct hl_page_fault_info))) ? -EFAULT : 0;
}

static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
@@ -806,18 +814,20 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
struct page_fault_info *pgf_info;
u64 actual_size;

- pgf_info = &hdev->captured_err_info.page_fault_info;
- args->array_size = pgf_info->num_of_user_mappings;
-
if (!out)
return -EINVAL;

+ pgf_info = &hdev->captured_err_info.page_fault_info;
+ if (!pgf_info->page_fault_info_available)
+ return 0;
+
+ args->array_size = pgf_info->num_of_user_mappings;
+
actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping);
if (user_buf_size < actual_size)
return -ENOMEM;

- return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size))
- ? -EFAULT : 0;
+ return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0;
}

static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
--
2.25.1


2023-01-08 17:28:26

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 4/7] habanalabs: define events to trace PCI LBW access

From: Ohad Sharabi <[email protected]>

There are cases where it may be useful to dump the whole LBW configs.
Yet, doing so while spamming the kernel log will probably shade other
important messages since the LBW access is done in sheer volume.
To answer this we add trace events for those too.

Signed-off-by: Ohad Sharabi <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
include/trace/events/habanalabs.h | 39 +++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)

diff --git a/include/trace/events/habanalabs.h b/include/trace/events/habanalabs.h
index 10233e13cee4..951643e6a7a9 100644
--- a/include/trace/events/habanalabs.h
+++ b/include/trace/events/habanalabs.h
@@ -123,6 +123,45 @@ DEFINE_EVENT(habanalabs_comms_template, habanalabs_comms_wait_status_done,
TP_PROTO(struct device *dev, char *op_str),
TP_ARGS(dev, op_str));

+DECLARE_EVENT_CLASS(habanalabs_reg_access_template,
+ TP_PROTO(struct device *dev, u32 addr, u32 val),
+
+ TP_ARGS(dev, addr, val),
+
+ TP_STRUCT__entry(
+ __string(dname, dev_name(dev))
+ __field(u32, addr)
+ __field(u32, val)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dname, dev_name(dev));
+ __entry->addr = addr;
+ __entry->val = val;
+ ),
+
+ TP_printk("%s: addr: %#x, val: %#x",
+ __get_str(dname),
+ __entry->addr,
+ __entry->val)
+);
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_rreg32,
+ TP_PROTO(struct device *dev, u32 addr, u32 val),
+ TP_ARGS(dev, addr, val));
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_wreg32,
+ TP_PROTO(struct device *dev, u32 addr, u32 val),
+ TP_ARGS(dev, addr, val));
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_elbi_read,
+ TP_PROTO(struct device *dev, u32 addr, u32 val),
+ TP_ARGS(dev, addr, val));
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_elbi_write,
+ TP_PROTO(struct device *dev, u32 addr, u32 val),
+ TP_ARGS(dev, addr, val));
+
#endif /* if !defined(_TRACE_HABANALABS_H) || defined(TRACE_HEADER_MULTI_READ) */

/* This part must be outside protection */
--
2.25.1

2023-01-08 17:30:02

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 2/7] habanalabs: add set engines masks ASIC function

From: Ohad Sharabi <[email protected]>

This function shall be used whenever components enable/binning masks
should be updated.

Usage is in one of the below cases:
- update user (or default) component masks
- update when getting the masks from FW (either CPUCP or COMMS)

Signed-off-by: Ohad Sharabi <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/firmware_if.c | 12 +++++---
drivers/accel/habanalabs/common/habanalabs.h | 2 ++
drivers/accel/habanalabs/gaudi/gaudi.c | 6 ++++
drivers/accel/habanalabs/gaudi2/gaudi2.c | 30 +++++++++++++------
drivers/accel/habanalabs/goya/goya.c | 6 ++++
5 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index eb000e035026..ef228087ef55 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2647,7 +2647,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
fw_loader->dynamic_loader.comm_desc.cur_fw_ver);

if (rc)
- goto out;
+ return rc;

/* read binning info from preboot */
if (hdev->support_preboot_binning) {
@@ -2660,15 +2660,19 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,

rc = hdev->asic_funcs->set_dram_properties(hdev);
if (rc)
- goto out;
+ return rc;
+
+ rc = hdev->asic_funcs->set_binning_masks(hdev);
+ if (rc)
+ return rc;

dev_dbg(hdev->dev,
"Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x, rot:0x%x\n",
hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning,
hdev->decoder_binning, hdev->rotator_binning);
}
-out:
- return rc;
+
+ return 0;
}

/* load boot fit to FW */
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index cd474422163d..0b7fe4afd92d 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -1549,6 +1549,7 @@ struct engines_data {
* @set_engine_cores: set a config command to engine cores
* @send_device_activity: indication to FW about device availability
* @set_dram_properties: set DRAM related properties.
+ * @set_binning_masks: set binning/enable masks for all relevant components.
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -1687,6 +1688,7 @@ struct hl_asic_funcs {
u32 num_cores, u32 core_command);
int (*send_device_activity)(struct hl_device *hdev, bool open);
int (*set_dram_properties)(struct hl_device *hdev);
+ int (*set_binning_masks)(struct hl_device *hdev);
};


diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 733916f38752..71debe862c86 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -9135,6 +9135,11 @@ static int gaudi_set_dram_properties(struct hl_device *hdev)
return 0;
}

+static int gaudi_set_binning_masks(struct hl_device *hdev)
+{
+ return 0;
+}
+
static void gaudi_check_if_razwi_happened(struct hl_device *hdev)
{
}
@@ -9262,6 +9267,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
.set_dram_bar_base = gaudi_set_hbm_bar_base,
.send_device_activity = gaudi_send_device_activity,
.set_dram_properties = gaudi_set_dram_properties,
+ .set_binning_masks = gaudi_set_binning_masks,
};

/**
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 4529a64d49b6..0f3e690041af 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2437,6 +2437,25 @@ static int gaudi2_set_cluster_binning_masks(struct hl_device *hdev)
return 0;
}

+static int gaudi2_set_binning_masks(struct hl_device *hdev)
+{
+ int rc;
+
+ rc = gaudi2_set_cluster_binning_masks(hdev);
+ if (rc)
+ return rc;
+
+ rc = gaudi2_set_tpc_binning_masks(hdev);
+ if (rc)
+ return rc;
+
+ rc = gaudi2_set_dec_binning_masks(hdev);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
static int gaudi2_cpucp_info_get(struct hl_device *hdev)
{
struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -2492,15 +2511,7 @@ static int gaudi2_cpucp_info_get(struct hl_device *hdev)
if (rc)
return rc;

- rc = gaudi2_set_cluster_binning_masks(hdev);
- if (rc)
- return rc;
-
- rc = gaudi2_set_tpc_binning_masks(hdev);
- if (rc)
- return rc;
-
- rc = gaudi2_set_dec_binning_masks(hdev);
+ rc = hdev->asic_funcs->set_binning_masks(hdev);
if (rc)
return rc;

@@ -10597,6 +10608,7 @@ static const struct hl_asic_funcs gaudi2_funcs = {
.set_engine_cores = gaudi2_set_engine_cores,
.send_device_activity = gaudi2_send_device_activity,
.set_dram_properties = gaudi2_set_dram_properties,
+ .set_binning_masks = gaudi2_set_binning_masks,
};

void gaudi2_set_asic_funcs(struct hl_device *hdev)
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index ee0c7db16270..2b135e856607 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -5425,6 +5425,11 @@ static int goya_set_dram_properties(struct hl_device *hdev)
return 0;
}

+static int goya_set_binning_masks(struct hl_device *hdev)
+{
+ return 0;
+}
+
static int goya_send_device_activity(struct hl_device *hdev, bool open)
{
return 0;
@@ -5524,6 +5529,7 @@ static const struct hl_asic_funcs goya_funcs = {
.set_dram_bar_base = goya_set_ddr_bar_base,
.send_device_activity = goya_send_device_activity,
.set_dram_properties = goya_set_dram_properties,
+ .set_binning_masks = goya_set_binning_masks,
};

/*
--
2.25.1

2023-01-08 17:40:06

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 7/7] habanalabs: extend fatal messages to contain PCI info

From: Moti Haimovski <[email protected]>

This commit attaches the PCI device address to driver fatal messages
in order to ease debugging in multi-device setups.

Signed-off-by: Moti Haimovski <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/common/device.c | 38 ++++++++++++++++--------
1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 722a5beb0974..2b6971463f12 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1563,7 +1563,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
if (rc == -EBUSY) {
if (hdev->device_fini_pending) {
dev_crit(hdev->dev,
- "Failed to kill all open processes, stopping hard reset\n");
+ "%s Failed to kill all open processes, stopping hard reset\n",
+ dev_name(&(hdev)->pdev->dev));
goto out_err;
}

@@ -1573,7 +1574,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

if (rc) {
dev_crit(hdev->dev,
- "Failed to kill all open processes, stopping hard reset\n");
+ "%s Failed to kill all open processes, stopping hard reset\n",
+ dev_name(&(hdev)->pdev->dev));
goto out_err;
}

@@ -1624,14 +1626,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
* ensure driver puts the driver in a unusable state
*/
dev_crit(hdev->dev,
- "Consecutive FW fatal errors received, stopping hard reset\n");
+ "%s Consecutive FW fatal errors received, stopping hard reset\n",
+ dev_name(&(hdev)->pdev->dev));
rc = -EIO;
goto out_err;
}

if (hdev->kernel_ctx) {
dev_crit(hdev->dev,
- "kernel ctx was alive during hard reset, something is terribly wrong\n");
+ "%s kernel ctx was alive during hard reset, something is terribly wrong\n",
+ dev_name(&(hdev)->pdev->dev));
rc = -EBUSY;
goto out_err;
}
@@ -1749,9 +1753,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->reset_info.needs_reset = false;

if (hard_reset)
- dev_info(hdev->dev, "Successfully finished resetting the device\n");
+ dev_info(hdev->dev,
+ "Successfully finished resetting the %s device\n",
+ dev_name(&(hdev)->pdev->dev));
else
- dev_dbg(hdev->dev, "Successfully finished resetting the device\n");
+ dev_dbg(hdev->dev,
+ "Successfully finished resetting the %s device\n",
+ dev_name(&(hdev)->pdev->dev));

if (hard_reset) {
hdev->reset_info.hard_reset_cnt++;
@@ -1786,7 +1794,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->reset_info.in_compute_reset = 0;

if (hard_reset) {
- dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
+ dev_err(hdev->dev,
+ "%s Failed to reset! Device is NOT usable\n",
+ dev_name(&(hdev)->pdev->dev));
hdev->reset_info.hard_reset_cnt++;
} else if (reset_upon_device_release) {
spin_unlock(&hdev->reset_info.lock);
@@ -2185,7 +2195,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
}

dev_notice(hdev->dev,
- "Successfully added device to habanalabs driver\n");
+ "Successfully added device %s to habanalabs driver\n",
+ dev_name(&(hdev)->pdev->dev));

hdev->init_done = true;

@@ -2234,11 +2245,11 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
device_cdev_sysfs_add(hdev);
if (hdev->pdev)
dev_err(&hdev->pdev->dev,
- "Failed to initialize hl%d. Device is NOT usable !\n",
- hdev->cdev_idx);
+ "Failed to initialize hl%d. Device %s is NOT usable !\n",
+ hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));
else
- pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
- hdev->cdev_idx);
+ pr_err("Failed to initialize hl%d. Device %s is NOT usable !\n",
+ hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));

return rc;
}
@@ -2294,7 +2305,8 @@ void hl_device_fini(struct hl_device *hdev)

if (ktime_compare(ktime_get(), timeout) > 0) {
dev_crit(hdev->dev,
- "Failed to remove device because reset function did not finish\n");
+ "%s Failed to remove device because reset function did not finish\n",
+ dev_name(&(hdev)->pdev->dev));
return;
}
}
--
2.25.1

2023-01-08 17:52:33

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 3/7] habanalabs/gaudi2: fix log for sob value overflow/underflow

From: Carmit Carmel <[email protected]>

The value in SM_SEI_CAUSE includes the SOB index and not the SOB group
index.
Remove usage of log_mask in sm_sei_cause structure as it was never
used.

Signed-off-by: Carmit Carmel <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/accel/habanalabs/gaudi2/gaudi2.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 0f3e690041af..503a52db203f 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -676,14 +676,13 @@ static const char * const gaudi2_kdma_core_interrupts_cause[GAUDI2_NUM_OF_DMA_CO
struct gaudi2_sm_sei_cause_data {
const char *cause_name;
const char *log_name;
- u32 log_mask;
};

static const struct gaudi2_sm_sei_cause_data
gaudi2_sm_sei_cause[GAUDI2_NUM_OF_SM_SEI_ERR_CAUSE] = {
- {"calculated SO value overflow/underflow", "SOB group ID", 0x7FF},
- {"payload address of monitor is not aligned to 4B", "monitor addr", 0xFFFF},
- {"armed monitor write got BRESP (SLVERR or DECERR)", "AXI id", 0xFFFF},
+ {"calculated SO value overflow/underflow", "SOB ID"},
+ {"payload address of monitor is not aligned to 4B", "monitor addr"},
+ {"armed monitor write got BRESP (SLVERR or DECERR)", "AXI id"},
};

static const char * const
@@ -8418,7 +8417,7 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_in
"err cause: %s. %s: 0x%X\n",
gaudi2_sm_sei_cause[i].cause_name,
gaudi2_sm_sei_cause[i].log_name,
- sei_cause_log & gaudi2_sm_sei_cause[i].log_mask);
+ sei_cause_log);
error_count++;
break;
}
--
2.25.1