2022-11-23 15:09:03

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 1/8] habanalabs: fix rc when new CPUCP opcodes are not supported

From: Tomer Tayar <[email protected]>

When the new CPUCP opcodes are not supported and a CPUCP packet fails,
the return value is the F/W error resposone which is a positive value.
If this packet is sent from IOCTL and the positive value is used, the
ICOTL will not be considered as unsuccessful.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/firmware_if.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index c0909d76d6eb..cf8147e43833 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -324,6 +324,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,

if (!prop->supports_advanced_cpucp_rc) {
dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode);
+ rc = -EIO;
goto scrub_descriptor;
}

--
2.25.1


2022-11-23 15:09:47

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 7/8] habanalabs: fail driver load if EEPROM errors detected

From: Ofir Bitton <[email protected]>

In case EEPROM is not burned, firmware sets default EEPROM values.
As this is not valid in production, driver should fail load upon any
EEPROM error reported by firmware.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/firmware_if.c | 23 ++++++++++----------
1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index cf8147e43833..228b92278e48 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -617,16 +617,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);

- /* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
- dev_warn(hdev->dev,
- "Device boot warning - EEPROM failure detected, default settings applied\n");
- /* This is a warning so we don't want it to disable the
- * device
- */
- err_val &= ~CPU_BOOT_ERR0_EEPROM_FAIL;
+ dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
+ err_exists = true;
}

+ /* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n");
@@ -2532,7 +2528,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
struct fw_load_mgr *fw_loader)
{
struct cpu_dyn_regs *dyn_regs;
- int rc;
+ int rc, fw_error_rc;

dev_info(hdev->dev,
"Loading %sfirmware to device, may take some time...\n",
@@ -2632,14 +2628,17 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,

hl_fw_dynamic_update_linux_interrupt_if(hdev);

- return 0;
-
protocol_err:
- if (fw_loader->dynamic_loader.fw_desc_valid)
- fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
+ if (fw_loader->dynamic_loader.fw_desc_valid) {
+ fw_error_rc = fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
le32_to_cpu(dyn_regs->cpu_boot_err1),
le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
+
+ if (fw_error_rc)
+ return fw_error_rc;
+ }
+
return rc;
}

--
2.25.1

2022-11-23 15:21:25

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 4/8] habanalabs: don't put context in hl_encaps_handle_do_release_sob()

From: Tomer Tayar <[email protected]>

hl_encaps_handle_do_release_sob() can be called only when the last
reference to the context object is released and hl_ctx_do_release() is
initiated, and therefore it shouldn't call hl_ctx_put().

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/context.c | 1 -
1 file changed, 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 2f4620b7990c..ba6675960203 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -39,7 +39,6 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref)
idr_remove(&mgr->handles, handle->id);
spin_unlock(&mgr->lock);

- hl_ctx_put(handle->ctx);
kfree(handle);
}

--
2.25.1

2022-11-23 15:22:50

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 2/8] habanalabs: add RMWREG32_SHIFTED to set a val within a mask

From: Dafna Hirschfeld <[email protected]>

This is similar to RMWREG32, but the given 'val' is already shifted
according to the mask.
This allows several 'ORed' vals and masks to be set at once
The patch also fixes wrong usage of RMWREG32 by replacing
it with RMWREG32_SHIFTED

Signed-off-by: Dafna Hirschfeld <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/habanalabs.h | 10 +++-------
drivers/misc/habanalabs/gaudi2/gaudi2.c | 6 +++---
2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index e7f89868428d..0329a0980bb7 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2498,13 +2498,9 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
#define WREG32_AND(reg, and) WREG32_P(reg, 0, and)
#define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or))

-#define RMWREG32(reg, val, mask) \
- do { \
- u32 tmp_ = RREG32(reg); \
- tmp_ &= ~(mask); \
- tmp_ |= ((val) << __ffs(mask)); \
- WREG32(reg, tmp_); \
- } while (0)
+#define RMWREG32_SHIFTED(reg, val, mask) WREG32_P(reg, val, ~(mask))
+
+#define RMWREG32(reg, val, mask) RMWREG32_SHIFTED(reg, (val) << __ffs(mask), mask)

#define RREG32_MASK(reg, mask) ((RREG32(reg) & mask) >> __ffs(mask))

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index a33a9072fca4..e793fb2bdcbe 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -5052,7 +5052,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev)
mmu_base = mmPMMU_HBW_MMU_BASE;
stlb_base = mmPMMU_HBW_STLB_BASE;

- RMWREG32(stlb_base + STLB_HOP_CONFIGURATION_OFFSET,
+ RMWREG32_SHIFTED(stlb_base + STLB_HOP_CONFIGURATION_OFFSET,
(0 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_HOP_SHIFT) |
(5 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_SMALL_P_SHIFT) |
(4 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_LARGE_P_SHIFT) |
@@ -5068,7 +5068,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev)

if (PAGE_SIZE == SZ_64K) {
/* Set page sizes to 64K on hop5 and 16M on hop4 + enable 8 bit hops */
- RMWREG32(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET,
+ RMWREG32_SHIFTED(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET,
FIELD_PREP(DCORE0_HMMU0_MMU_STATIC_MULTI_PAGE_SIZE_HOP5_PAGE_SIZE_MASK, 4) |
FIELD_PREP(DCORE0_HMMU0_MMU_STATIC_MULTI_PAGE_SIZE_HOP4_PAGE_SIZE_MASK, 3) |
FIELD_PREP(
@@ -5116,7 +5116,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id,
RMWREG32(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, 5 /* 64MB */,
MMU_STATIC_MULTI_PAGE_SIZE_HOP4_PAGE_SIZE_MASK);

- RMWREG32(stlb_base + STLB_HOP_CONFIGURATION_OFFSET,
+ RMWREG32_SHIFTED(stlb_base + STLB_HOP_CONFIGURATION_OFFSET,
FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_HOP_MASK, 0) |
FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_SMALL_P_MASK, 3) |
FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_LARGE_P_MASK, 3) |
--
2.25.1

2022-11-23 15:25:17

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 5/8] habanalabs: clear non-released encapsulated signals

From: Tomer Tayar <[email protected]>

Reserved encapsulated signals which were not released hold the context
refcount, leading to a failure when killing the user process on device
reset or device fini.
Add the release of these left signals in the CS roll-back process.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
.../habanalabs/common/command_submission.c | 46 ++++++++++++----
drivers/misc/habanalabs/common/context.c | 53 +++++++++++--------
drivers/misc/habanalabs/common/habanalabs.h | 3 +-
3 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index f1c69c8ed74a..ea0e5101c10e 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref)
*/
if (hl_cs_cmpl->encaps_signals)
kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
- hl_encaps_handle_do_release);
+ hl_encaps_release_handle_and_put_ctx);
}

- if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
- && cs->encaps_signals)
- kref_put(&cs->encaps_sig_hdl->refcount,
- hl_encaps_handle_do_release);
+ if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
+ kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);

out:
/* Must be called before hl_ctx_put because inside we use ctx to get
@@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
hl_complete_job(hdev, job);
}

+/*
+ * release_reserved_encaps_signals() - release reserved encapsulated signals.
+ * @hdev: pointer to habanalabs device structure
+ *
+ * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
+ * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
+ * For these signals need also to put the refcount of the H/W SOB which was taken at the
+ * reservation.
+ */
+static void release_reserved_encaps_signals(struct hl_device *hdev)
+{
+ struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
+ struct hl_cs_encaps_sig_handle *handle;
+ struct hl_encaps_signals_mgr *mgr;
+ u32 id;
+
+ if (!ctx)
+ return;
+
+ mgr = &ctx->sig_mgr;
+
+ idr_for_each_entry(&mgr->handles, handle, id)
+ if (handle->cs_seq == ULLONG_MAX)
+ kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);
+
+ hl_ctx_put(ctx);
+}
+
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
int i;
@@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
}

force_complete_multi_cs(hdev);
+
+ release_reserved_encaps_signals(hdev);
}

static void
@@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
*/
handle->pre_sob_val = prop->next_sob_val - handle->count;

+ handle->cs_seq = ULLONG_MAX;
+
*signals_count = prop->next_sob_val;
hdev->asic_funcs->hw_queues_unlock(hdev);

@@ -2350,10 +2380,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
/* We finished with the CS in this function, so put the ref */
cs_put(cs);
free_cs_chunk_array:
- if (!wait_cs_submitted && cs_encaps_signals && handle_found &&
- is_wait_cs)
- kref_put(&encaps_sig_hdl->refcount,
- hl_encaps_handle_do_release);
+ if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
+ kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
kfree(cs_chunk_array);
out:
return rc;
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index ba6675960203..9c8b1b37b510 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -9,37 +9,46 @@

#include <linux/slab.h>

-void hl_encaps_handle_do_release(struct kref *ref)
+static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob,
+ bool put_ctx)
{
- struct hl_cs_encaps_sig_handle *handle =
- container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;

+ if (put_hw_sob)
+ hw_sob_put(handle->hw_sob);
+
spin_lock(&mgr->lock);
idr_remove(&mgr->handles, handle->id);
spin_unlock(&mgr->lock);

- hl_ctx_put(handle->ctx);
+ if (put_ctx)
+ hl_ctx_put(handle->ctx);
+
kfree(handle);
}

-static void hl_encaps_handle_do_release_sob(struct kref *ref)
+void hl_encaps_release_handle_and_put_ctx(struct kref *ref)
{
struct hl_cs_encaps_sig_handle *handle =
- container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
- struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
+ container_of(ref, struct hl_cs_encaps_sig_handle, refcount);

- /* if we're here, then there was a signals reservation but cs with
- * encaps signals wasn't submitted, so need to put refcount
- * to hw_sob taken at the reservation.
- */
- hw_sob_put(handle->hw_sob);
+ encaps_handle_do_release(handle, false, true);
+}

- spin_lock(&mgr->lock);
- idr_remove(&mgr->handles, handle->id);
- spin_unlock(&mgr->lock);
+static void hl_encaps_release_handle_and_put_sob(struct kref *ref)
+{
+ struct hl_cs_encaps_sig_handle *handle =
+ container_of(ref, struct hl_cs_encaps_sig_handle, refcount);

- kfree(handle);
+ encaps_handle_do_release(handle, true, false);
+}
+
+void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref)
+{
+ struct hl_cs_encaps_sig_handle *handle =
+ container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
+
+ encaps_handle_do_release(handle, true, true);
}

static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
@@ -48,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr)
idr_init(&mgr->handles);
}

-static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,
- struct hl_encaps_signals_mgr *mgr)
+static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr)
{
struct hl_cs_encaps_sig_handle *handle;
struct idr *idp;
@@ -57,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev,

idp = &mgr->handles;

+ /* The IDR is expected to be empty at this stage, because any left signal should have been
+ * released as part of CS roll-back.
+ */
if (!idr_is_empty(idp)) {
- dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n");
+ dev_warn(hdev->dev,
+ "device released while some encaps signals handles are still allocated\n");
idr_for_each_entry(idp, handle, id)
- kref_put(&handle->refcount,
- hl_encaps_handle_do_release_sob);
+ kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob);
}

idr_destroy(&mgr->handles);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 0329a0980bb7..e2527d976ee0 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -3775,7 +3775,8 @@ void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *d

void hw_sob_get(struct hl_hw_sob *hw_sob);
void hw_sob_put(struct hl_hw_sob *hw_sob);
-void hl_encaps_handle_do_release(struct kref *ref);
+void hl_encaps_release_handle_and_put_ctx(struct kref *ref);
+void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref);
void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
struct hl_cs *cs, struct hl_cs_job *job,
struct hl_cs_compl *cs_cmpl);
--
2.25.1

2022-11-23 15:40:14

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 3/8] habanalabs: print context refcount value if hard reset fails

From: Tomer Tayar <[email protected]>

Failing to kill a user process during a hard reset can be due to a
reference to the user context which isn't released.
To make it easier to understand if this the reason for the failure and
not something else, add a print of the context refcount value.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index f5864893237c..926f230def56 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -696,10 +696,22 @@ static void device_hard_reset_pending(struct work_struct *work)
flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR;

rc = hl_device_reset(hdev, flags);
+
if ((rc == -EBUSY) && !hdev->device_fini_pending) {
- dev_info(hdev->dev,
- "Could not reset device. will try again in %u seconds",
- HL_PENDING_RESET_PER_SEC);
+ struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
+
+ if (ctx) {
+ /* The read refcount value should subtracted by one, because the read is
+ * protected with hl_get_compute_ctx().
+ */
+ dev_info(hdev->dev,
+ "Could not reset device (compute_ctx refcount %u). will try again in %u seconds",
+ kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC);
+ hl_ctx_put(ctx);
+ } else {
+ dev_info(hdev->dev, "Could not reset device. will try again in %u seconds",
+ HL_PENDING_RESET_PER_SEC);
+ }

queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work,
msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
--
2.25.1

2022-11-23 15:50:57

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 6/8] habanalabs: make print of engines idle mask more readable

From: Tomer Tayar <[email protected]>

The engines idle mask was increased to be an array of 4 u64 entries.
To make the print of this mask more readable, remove the "0x" prefix,
and zero-pad each u64 to 16 bytes if either it isn't zero or if any of
the higher-order u64's is not zero.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 27 +++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 926f230def56..87ab329e65d4 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -386,6 +386,23 @@ bool hl_ctrl_device_operational(struct hl_device *hdev,
}
}

+static void print_idle_status_mask(struct hl_device *hdev, const char *message,
+ u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
+{
+ u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {};
+
+ BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4);
+
+ pad_width[3] = idle_mask[3] ? 16 : 0;
+ pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0;
+ pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0;
+ pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0;
+
+ dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n",
+ message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2],
+ pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]);
+}
+
static void hpriv_release(struct kref *ref)
{
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
@@ -416,9 +433,8 @@ static void hpriv_release(struct kref *ref)
device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
if (!device_is_idle) {
- dev_err(hdev->dev,
- "device not idle after user context is closed (0x%llx_%llx_%llx_%llx)\n",
- idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
+ print_idle_status_mask(hdev, "device is not idle after user context is closed",
+ idle_mask);
reset_device = true;
}

@@ -1673,9 +1689,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

/* If device is not idle fail the reset process */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
- HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
- dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx_%llx_%llx) after reset\n",
- idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
+ HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
+ print_idle_status_mask(hdev, "device is not idle after reset", idle_mask);
rc = -EIO;
goto out_err;
}
--
2.25.1