2022-11-17 16:53:23

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 01/20] habanalabs/gaudi2: add PCI revision 2 support

From: Ofir Bitton <[email protected]>

Add support for Gaudi2 Device with PCI revision 2.
Functionality is exactly the same as revision 1, the only difference
is device name exposed to user.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 4 +++
drivers/misc/habanalabs/common/habanalabs.h | 2 ++
.../misc/habanalabs/common/habanalabs_drv.c | 26 +++++++++++++------
.../misc/habanalabs/common/habanalabs_ioctl.c | 6 +++--
drivers/misc/habanalabs/common/mmu/mmu.c | 1 +
drivers/misc/habanalabs/common/sysfs.c | 2 ++
drivers/misc/habanalabs/gaudi2/gaudi2.c | 6 +----
drivers/misc/habanalabs/gaudi2/gaudi2P.h | 2 --
.../include/hw_ip/pci/pci_general.h | 7 +++++
include/uapi/misc/habanalabs.h | 7 +++++
10 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 3ea1ee1ec8ef..35ed494fcfdf 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -748,6 +748,10 @@ static int device_early_init(struct hl_device *hdev)
gaudi2_set_asic_funcs(hdev);
strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name));
break;
+ case ASIC_GAUDI2B:
+ gaudi2_set_asic_funcs(hdev);
+ strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
+ break;
break;
default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 7d191f388953..e391e7951fb7 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1192,6 +1192,7 @@ struct hl_dec {
* @ASIC_GAUDI: Gaudi device (HL-2000).
* @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000).
* @ASIC_GAUDI2: Gaudi2 device.
+ * @ASIC_GAUDI2B: Gaudi2B device.
*/
enum hl_asic_type {
ASIC_INVALID,
@@ -1199,6 +1200,7 @@ enum hl_asic_type {
ASIC_GAUDI,
ASIC_GAUDI_SEC,
ASIC_GAUDI2,
+ ASIC_GAUDI2B,
};

struct hl_cs_parser;
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index e82af8989700..7815c60df54e 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -9,6 +9,7 @@
#define pr_fmt(fmt) "habanalabs: " fmt

#include "habanalabs.h"
+#include "../include/hw_ip/pci/pci_general.h"

#include <linux/pci.h>
#include <linux/aer.h>
@@ -74,16 +75,17 @@ MODULE_DEVICE_TABLE(pci, ids);
/*
* get_asic_type - translate device id to asic type
*
- * @device: id of the PCI device
+ * @hdev: pointer to habanalabs device structure.
*
- * Translate device id to asic type.
+ * Translate device id and revision id to asic type.
* In case of unidentified device, return -1
*/
-static enum hl_asic_type get_asic_type(u16 device)
+static enum hl_asic_type get_asic_type(struct hl_device *hdev)
{
- enum hl_asic_type asic_type;
+ struct pci_dev *pdev = hdev->pdev;
+ enum hl_asic_type asic_type = ASIC_INVALID;

- switch (device) {
+ switch (pdev->device) {
case PCI_IDS_GOYA:
asic_type = ASIC_GOYA;
break;
@@ -94,10 +96,18 @@ static enum hl_asic_type get_asic_type(u16 device)
asic_type = ASIC_GAUDI_SEC;
break;
case PCI_IDS_GAUDI2:
- asic_type = ASIC_GAUDI2;
+ switch (pdev->revision) {
+ case REV_ID_A:
+ asic_type = ASIC_GAUDI2;
+ break;
+ case REV_ID_B:
+ asic_type = ASIC_GAUDI2B;
+ break;
+ default:
+ break;
+ }
break;
default:
- asic_type = ASIC_INVALID;
break;
}

@@ -416,7 +426,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
/* First, we must find out which ASIC are we handling. This is needed
* to configure the behavior of the driver (kernel parameters)
*/
- hdev->asic_type = get_asic_type(pdev->device);
+ hdev->asic_type = get_asic_type(hdev);
if (hdev->asic_type == ASIC_INVALID) {
dev_err(&pdev->dev, "Unsupported ASIC\n");
rc = -ENODEV;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 5ce5c42e2731..ee43017eb563 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -10,10 +10,11 @@
#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"

-#include <linux/kernel.h>
#include <linux/fs.h>
-#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>

static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
@@ -105,6 +106,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.edma_enabled_mask = prop->edma_enabled_mask;
hw_ip.server_type = prop->server_type;
hw_ip.security_enabled = prop->fw_security_enabled;
+ hw_ip.revision_id = hdev->pdev->revision;

return copy_to_user(out, &hw_ip,
min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c
index 67d3e70cf571..2c1005f74cf4 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu.c
@@ -635,6 +635,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
break;
case ASIC_GAUDI2:
+ case ASIC_GAUDI2B:
/* MMUs in Gaudi2 are always host resident */
hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
break;
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index c924fc994bd9..735d8bed0066 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -248,6 +248,8 @@ static ssize_t device_type_show(struct device *dev,
case ASIC_GAUDI2:
str = "GAUDI2";
break;
+ case ASIC_GAUDI2B:
+ str = "GAUDI2B";
break;
default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 03f8cf9bb136..f21b68be6d20 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -3968,11 +3968,7 @@ static void gaudi2_init_firmware_loader(struct hl_device *hdev)
fw_loader->skip_bmc = false;
fw_loader->sram_bar_id = SRAM_CFG_BAR_ID;
fw_loader->dram_bar_id = DRAM_BAR_ID;
-
- if (hdev->asic_type == ASIC_GAUDI2)
- fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC;
- else /* ASIC_GAUDI2_FPGA */
- fw_loader->cpu_timeout = GAUDI2_FPGA_CPU_TIMEOUT;
+ fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC;

/* here we update initial values for few specific dynamic regs (as
* before reading the first descriptor from FW those value has to be
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2P.h b/drivers/misc/habanalabs/gaudi2/gaudi2P.h
index a99c348bbf39..b4383c199bbb 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2P.h
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2P.h
@@ -23,8 +23,6 @@

#define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */

-#define GAUDI2_FPGA_CPU_TIMEOUT 100000000 /* 100s */
-
#define NUMBER_OF_PDMA_QUEUES 2
#define NUMBER_OF_EDMA_QUEUES 8
#define NUMBER_OF_MME_QUEUES 4
diff --git a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h
index d232081d4e0f..f5d497dc9bdc 100644
--- a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h
+++ b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h
@@ -20,4 +20,11 @@
#define PCI_CONFIG_ELBI_STS_MASK (PCI_CONFIG_ELBI_STS_ERR | \
PCI_CONFIG_ELBI_STS_DONE)

+enum hl_revision_id {
+ /* PCI revision ID 0 is not legal */
+ REV_ID_INVALID = 0x00,
+ REV_ID_A = 0x01,
+ REV_ID_B = 0x02,
+};
+
#endif /* INCLUDE_PCI_GENERAL_H_ */
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index a4ceee681898..58343998bd63 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -868,6 +868,7 @@ enum hl_server_type {
* @number_of_user_interrupts: The number of interrupts that are available to the userspace
* application to use. Relevant for Gaudi2 and later.
* @device_mem_alloc_default_page_size: default page size used in device memory allocation.
+ * @revision_id: PCI revision ID of the ASIC.
*/
struct hl_info_hw_ip_info {
__u64 sram_base_address;
@@ -898,6 +899,12 @@ struct hl_info_hw_ip_info {
__u16 pad2;
__u64 reserved4;
__u64 device_mem_alloc_default_page_size;
+ __u64 reserved5;
+ __u64 reserved6;
+ __u32 reserved7;
+ __u8 reserved8;
+ __u8 revision_id;
+ __u8 pad[2];
};

struct hl_info_dram_usage {
--
2.25.1



2022-11-17 16:55:02

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 08/20] habanalabs: skip events info ioctl if not supported

From: Ohad Sharabi <[email protected]>

Some ASICs haven't yet implemented this functionality and so the
ioctl call should fail and the user should be notified of the reason.

Signed-off-by: Ohad Sharabi <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/habanalabs_ioctl.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index ee43017eb563..b6abfa7761a7 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -123,6 +123,10 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
return -EINVAL;

arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size);
+ if (!arr) {
+ dev_err(hdev->dev, "Events info not supported\n");
+ return -EOPNOTSUPP;
+ }

return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
}
--
2.25.1


2022-11-17 17:16:18

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 17/20] habanalabs: check schedule_hard_reset correctly

schedule_hard_reset can be true only if we didn't do hard-reset.
Therefore, no point of checking it in case hard_reset is true.

Signed-off-by: Oded Gabbay <[email protected]>
Reviewed-by: Tomer Tayar <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 49640c8ca910..0650e511a0f5 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1737,18 +1737,19 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
* the device will be operational although it shouldn't be
*/
hdev->asic_funcs->enable_events_from_fw(hdev);
- } else if (!reset_upon_device_release) {
- hdev->reset_info.compute_reset_cnt++;
- }
-
- if (schedule_hard_reset) {
- dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
- flags = hdev->reset_info.hard_reset_schedule_flags;
- hdev->reset_info.hard_reset_schedule_flags = 0;
- hdev->disabled = true;
- hard_reset = true;
- handle_reset_trigger(hdev, flags);
- goto again;
+ } else {
+ if (!reset_upon_device_release)
+ hdev->reset_info.compute_reset_cnt++;
+
+ if (schedule_hard_reset) {
+ dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
+ flags = hdev->reset_info.hard_reset_schedule_flags;
+ hdev->reset_info.hard_reset_schedule_flags = 0;
+ hdev->disabled = true;
+ hard_reset = true;
+ handle_reset_trigger(hdev, flags);
+ goto again;
+ }
}

return 0;
--
2.25.1


2022-11-17 17:16:43

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 03/20] habanalabs: use single threaded WQ for event handling

From: Dani Liberman <[email protected]>

Creating event queue workqueue using alloc_workqueue made it run in
multi threaded mode, which caused parallel dumping of events as well as
parallel events notifying to user, causing logs with multiple
events to be out of order.

Fixed by creating event queue workqueue as single threaded work queue.

Signed-off-by: Dani Liberman <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index d1a609589558..65bb40f81901 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -787,7 +787,7 @@ static int device_early_init(struct hl_device *hdev)
}
}

- hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
+ hdev->eq_wq = create_singlethread_workqueue("hl-events");
if (hdev->eq_wq == NULL) {
dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
rc = -ENOMEM;
--
2.25.1


2022-11-17 17:16:50

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 14/20] habanalabs/gaudi2: don't enable entries in the MSIX_GW table

From: Tomer Tayar <[email protected]>

User should use the virtual MSI-X doorbell to generate interrupts from
the device, so there is no need to enable entries in the MSIX_GW table.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/gaudi2/gaudi2.c | 26 -------------------------
1 file changed, 26 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 36f0ea1100bb..d5efec347bc1 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -4695,30 +4695,6 @@ static void gaudi2_init_dec(struct hl_device *hdev)
}
}

-static void gaudi2_init_msix_gw_table(struct hl_device *hdev)
-{
- u32 first_reg_offset, last_reg_offset, msix_gw_table_base;
- u8 first_bit, last_bit;
- int i;
-
- msix_gw_table_base = mmPCIE_WRAP_MSIX_GW_TABLE_0;
- first_reg_offset = (GAUDI2_IRQ_NUM_USER_FIRST >> 5) << 2;
- first_bit = GAUDI2_IRQ_NUM_USER_FIRST % 32;
- last_reg_offset = (GAUDI2_IRQ_NUM_USER_LAST >> 5) << 2;
- last_bit = GAUDI2_IRQ_NUM_USER_LAST % 32;
-
- if (first_reg_offset == last_reg_offset) {
- WREG32(msix_gw_table_base + first_reg_offset, GENMASK(last_bit, first_bit));
- return;
- }
-
- WREG32(msix_gw_table_base + first_reg_offset, GENMASK(31, first_bit));
- WREG32(msix_gw_table_base + last_reg_offset, GENMASK(last_bit, 0));
-
- for (i = first_reg_offset + 4; i < last_reg_offset ; i += 4)
- WREG32(msix_gw_table_base + i, 0xFFFFFFFF);
-}
-
static int gaudi2_mmu_update_asid_hop0_addr(struct hl_device *hdev,
u32 stlb_base, u32 asid, u64 phys_addr)
{
@@ -5232,8 +5208,6 @@ static int gaudi2_hw_init(struct hl_device *hdev)
return rc;
}

- gaudi2_init_msix_gw_table(hdev);
-
gaudi2_init_scrambler_hbm(hdev);
gaudi2_init_kdma(hdev);

--
2.25.1


2022-11-17 17:17:22

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 09/20] habanalabs/gaudi2: classify power/thermal events as info

From: Ofir Bitton <[email protected]>

As power and thermal envelope events are pure informative and not
indicating an error, we reduce the print level to info only.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 59940c8df2d2..61960fa059e0 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -6828,6 +6828,7 @@ static inline bool is_info_event(u32 event)
{
switch (event) {
case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_CAUSE:
+ case GAUDI2_EVENT_CPU_FIX_POWER_ENV_S ... GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
return true;
default:
return false;
--
2.25.1


2022-11-17 17:18:13

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 07/20] habanalabs: fix firmware descriptor copy operation

From: farah kassabri <[email protected]>

This is needed to allow adding more data to the lkd_fw_comms_desc
structure.

Signed-off-by: farah kassabri <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/firmware_if.c | 26 ++++++++++++++++++--
1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index f18e53bbba6b..01c4ffba6e97 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -12,6 +12,7 @@
#include <linux/crc32.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/vmalloc.h>

#define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */

@@ -1988,10 +1989,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
struct fw_load_mgr *fw_loader)
{
struct lkd_fw_comms_desc *fw_desc;
+ void __iomem *src, *temp_fw_desc;
struct pci_mem_region *region;
struct fw_response *response;
+ u16 fw_data_size;
enum pci_region region_id;
- void __iomem *src;
int rc;

fw_desc = &fw_loader->dynamic_loader.comm_desc;
@@ -2018,9 +2020,29 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
fw_loader->dynamic_loader.fw_desc_valid = false;
src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
response->ram_offset;
+
+ /*
+ * We do the copy of the fw descriptor in 2 phases:
+ * 1. copy the header + data info according to our lkd_fw_comms_desc definition.
+ * then we're able to read the actual data size provided by fw.
+ * this is needed for cases where data in descriptor was changed(add/remove)
+ * in embedded specs header file before updating lkd copy of the header file
+ * 2. copy descriptor to temporary buffer with aligned size and send it to validation
+ */
memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
+ fw_data_size = le16_to_cpu(fw_desc->header.size);
+
+ temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size);
+ if (!temp_fw_desc)
+ return -ENOMEM;
+
+ memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size);

- return hl_fw_dynamic_validate_descriptor(hdev, fw_loader, fw_desc);
+ rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader,
+ (struct lkd_fw_comms_desc *) temp_fw_desc);
+ vfree(temp_fw_desc);
+
+ return rc;
}

/**
--
2.25.1


2022-11-17 17:18:17

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 11/20] habanalabs: fix print for out-of-sync and pkt-failure events

From: Tomer Tayar <[email protected]>

Add missing le32_to_cpu() conversions, and use %d for the value
returned from atomic_read().

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++--
drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 ++++----
drivers/misc/habanalabs/goya/goya.c | 4 ++--
3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index cbe1daf5a793..7b93f0d26dd0 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7347,8 +7347,8 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
{
struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ];

- dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
- sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+ le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
}

static void gaudi_print_fw_alive_info(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 65c9b535aa69..bdb5782afb7e 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8684,8 +8684,8 @@ static void gaudi2_print_out_of_sync_info(struct hl_device *hdev,
{
struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ];

- dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
- sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+ le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
}

static void gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev)
@@ -8751,8 +8751,8 @@ static void gaudi2_print_cpu_pkt_failure_info(struct hl_device *hdev,
struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ];

dev_warn(hdev->dev,
- "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
- sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+ "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+ le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
}

static void hl_arc_event_handle(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 5ef9e3ca97a6..0f083fcf81a6 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4475,8 +4475,8 @@ static void goya_print_out_of_sync_info(struct hl_device *hdev,
{
struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];

- dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
- sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n",
+ le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci));
}

static void goya_print_irq_info(struct hl_device *hdev, u16 event_type,
--
2.25.1


2022-11-17 17:18:34

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 20/20] habanalabs: increase the size of busy engines mask

From: Tomer Tayar <[email protected]>

Increase the size of the busy engines mask in 'struct hl_info_hw_idle',
for future ASICs with more than 128 engines.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 9 +++++----
include/uapi/misc/habanalabs.h | 2 +-
2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 63d0cb7087e8..f5864893237c 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -416,8 +416,9 @@ static void hpriv_release(struct kref *ref)
device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL);
if (!device_is_idle) {
- dev_err(hdev->dev, "device not idle after user context is closed (0x%llx_%llx)\n",
- idle_mask[1], idle_mask[0]);
+ dev_err(hdev->dev,
+ "device not idle after user context is closed (0x%llx_%llx_%llx_%llx)\n",
+ idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
reset_device = true;
}

@@ -1661,8 +1662,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
/* If device is not idle fail the reset process */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
- dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
- idle_mask[1], idle_mask[0]);
+ dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx_%llx_%llx) after reset\n",
+ idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
}
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index e50cb71df081..3b995e841eb8 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -916,7 +916,7 @@ struct hl_info_dram_usage {
__u64 ctx_dram_mem;
};

-#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2
+#define HL_BUSY_ENGINES_MASK_EXT_SIZE 4

struct hl_info_hw_idle {
__u32 is_idle;
--
2.25.1


2022-11-17 17:21:03

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 16/20] habanalabs: reset device if still in use when released

From: Tomer Tayar <[email protected]>

If the device file is released while a context is still held, it won't
be possible to reopen it until the context is eventually released.
If that doesn't happen, only a device reset will revert it back to an
operational state, i.e. need to wait for a CS timeout or an error, or to
wait for an external intervention of injecting a reset via sysfs.

At this stage, after the device was released by user, context is held
either because of CS which were left running on the device and are not
relevant anymore, or due to missing cleanup steps from user side.

All of this is in any case handled in the device reset flow, so initiate
the reset at this point instead of waiting for it.

Signed-off-by: Tomer Tayar <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/common/device.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 708db0f48ee0..49640c8ca910 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -504,9 +504,10 @@ static int hl_device_release(struct inode *inode, struct file *filp)

hdev->compute_ctx_in_release = 1;

- if (!hl_hpriv_put(hpriv))
- dev_notice(hdev->dev,
- "User process closed FD but device still in use\n");
+ if (!hl_hpriv_put(hpriv)) {
+ dev_notice(hdev->dev, "User process closed FD but device still in use\n");
+ hl_device_reset(hdev, HL_DRV_RESET_HARD);
+ }

hdev->last_open_session_duration_jif =
jiffies - hdev->last_successful_open_jif;
--
2.25.1