2020-06-16 06:15:46

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 1/7] habanalabs: remove rate limiters from GAUDI

We no longer need to initialize the rate limiters in GAUDI A1.

Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 126 +----------------------
drivers/misc/habanalabs/habanalabs_drv.c | 1 -
2 files changed, 1 insertion(+), 126 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 61f88e9884ce..4d69727bb53b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1638,8 +1638,8 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev)
uint32_t hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd;

hbm0_wr = 0x33333333;
- hbm1_wr = 0x33333333;
hbm0_rd = 0x77777777;
+ hbm1_wr = 0x55555555;
hbm1_rd = 0xDDDDDDDD;

WREG32(mmDMA_IF_E_N_HBM0_WR_CRED_CNT, hbm0_wr);
@@ -1689,125 +1689,6 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev)
(1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT));
}

-static void gaudi_init_rate_limiter(struct hl_device *hdev)
-{
- u32 nr, nf, od, sat, rst, timeout;
- u64 freq;
-
- nr = RREG32(mmPSOC_HBM_PLL_NR);
- nf = RREG32(mmPSOC_HBM_PLL_NF);
- od = RREG32(mmPSOC_HBM_PLL_OD);
- freq = (50 * (nf + 1)) / ((nr + 1) * (od + 1));
-
- dev_dbg(hdev->dev, "HBM frequency is %lluMHz\n", freq);
-
- /* Configuration is for five (5) DDMA channels */
- if (freq == 800) {
- sat = 4;
- rst = 11;
- timeout = 15;
- } else if (freq == 900) {
- sat = 4;
- rst = 15;
- timeout = 16;
- } else if (freq == 950) {
- sat = 4;
- rst = 15;
- timeout = 15;
- } else {
- dev_warn(hdev->dev,
- "unsupported HBM frequency %lluMHz, no rate-limiters\n",
- freq);
- return;
- }
-
- WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_1, 0x111);
- WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_1, 0x111);
- WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_1, 0x111);
- WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_1, 0x111);
-
- if (!hdev->rl_enable) {
- dev_info(hdev->dev, "Rate limiters disabled\n");
- return;
- }
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_SAT, sat);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_RST, rst);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_EN, 1);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_SAT, sat);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_RST, rst);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_EN, 1);
-}
-
static void gaudi_init_golden_registers(struct hl_device *hdev)
{
u32 tpc_offset;
@@ -1817,8 +1698,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)

gaudi_init_hbm_cred(hdev);

- gaudi_init_rate_limiter(hdev);
-
gaudi_disable_clock_gating(hdev);

for (tpc_id = 0, tpc_offset = 0;
@@ -1839,9 +1718,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)
WREG32(mmMME1_CTRL_EUS_ROLLUP_CNT_ADD, 3);
WREG32(mmMME2_CTRL_EUS_ROLLUP_CNT_ADD, 3);
WREG32(mmMME3_CTRL_EUS_ROLLUP_CNT_ADD, 3);
-
- /* WA for H3-2081 */
- WREG32(mmPCIE_WRAP_MAX_OUTSTAND, 0x10ff);
}

static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id,
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index 8652c7e5d7f1..f38664b03865 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -238,7 +238,6 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
hdev->axi_drain = 0;
hdev->sram_scrambler_enable = 1;
hdev->dram_scrambler_enable = 1;
- hdev->rl_enable = 1;
hdev->bmc_enable = 1;
hdev->hard_reset_on_fw_events = 1;
}
--
2.17.1


2020-06-16 06:15:51

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 2/7] habanalabs: Use pending CS amount per ASIC

From: Ofir Bitton <[email protected]>

Training schemes requires much more concurrent command submissions than
inference does. In addition, training command submissions can be completed
in a non serialized manner. Hence, we add support in which each ASIC will
be able to configure the amount of concurrent pending command submissions,
rather than use a predefined amount. This change will enhance performance
by allowing the user to add more concurrent work without waiting for the
previous work to be completed.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/command_submission.c | 6 ++++--
drivers/misc/habanalabs/context.c | 14 +++++++++++---
drivers/misc/habanalabs/gaudi/gaudi.c | 2 ++
drivers/misc/habanalabs/gaudi/gaudiP.h | 6 ++++++
drivers/misc/habanalabs/goya/goya.c | 2 ++
drivers/misc/habanalabs/goya/goyaP.h | 6 ++++++
drivers/misc/habanalabs/habanalabs.h | 9 +++++----
drivers/misc/habanalabs/hw_queue.c | 2 +-
8 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f82974a916c3..e156803f4a99 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -405,7 +405,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
spin_lock(&ctx->cs_lock);

cs_cmpl->cs_seq = ctx->cs_sequence;
- other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
+ other = ctx->cs_pending[cs_cmpl->cs_seq &
+ (hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
spin_unlock(&ctx->cs_lock);
dev_dbg(hdev->dev,
@@ -419,7 +420,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,

cs->sequence = cs_cmpl->cs_seq;

- ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+ ctx->cs_pending[cs_cmpl->cs_seq &
+ (hdev->asic_prop.max_pending_cs - 1)] =
&cs_cmpl->base_fence;
ctx->cs_sequence++;

diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index ec92b3506b1f..1b96fefa4a65 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
* to this function unless the ref count is 0
*/

- for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+ for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]);

+ kfree(ctx->cs_pending);
+
if (ctx->asid != HL_KERNEL_ASID_ID) {
/* The engines are stopped as there is no executing CS, but the
* Coresight might be still working by accessing addresses
@@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
spin_lock_init(&ctx->cs_lock);
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
+ ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
+ sizeof(struct dma_fence *),
+ GFP_KERNEL);
+ if (!ctx->cs_pending)
+ return -ENOMEM;

if (is_kernel_ctx) {
ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
@@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx)

struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
+ struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence;

spin_lock(&ctx->cs_lock);
@@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return ERR_PTR(-EINVAL);
}

- if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+ if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
spin_unlock(&ctx->cs_lock);
return NULL;
}

fence = dma_fence_get(
- ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+ ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
spin_unlock(&ctx->cs_lock);

return fence;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4d69727bb53b..35e9080f6976 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);

+ prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
+
return 0;
}

diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index a46530d375fa..76c3f840e05a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -57,6 +57,12 @@

#define GAUDI_DEFAULT_CARD_NAME "HL2000"

+#define GAUDI_MAX_PENDING_CS 1024
+
+#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
+#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
#define PCI_DMA_NUMBER_OF_CHNLS 3
#define HBM_DMA_NUMBER_OF_CHNLS 5
#define DMA_NUMBER_OF_CHNLS (PCI_DMA_NUMBER_OF_CHNLS + \
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0d2952bb58df..e872099a3f7a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev)

strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
+
+ prop->max_pending_cs = GOYA_MAX_PENDING_CS;
}

/*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index d36f8d90c9c9..9d8a1761252d 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -57,6 +57,12 @@

#define GOYA_DEFAULT_CARD_NAME "HL1000"

+#define GOYA_MAX_PENDING_CS 64
+
+#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS)
+#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
/* DRAM Memory Map */

#define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 1ecdcf8b763a..64d9b2dd3e19 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -42,9 +42,6 @@

#define HL_MAX_QUEUES 128

-/* MUST BE POWER OF 2 and larger than 1 */
-#define HL_MAX_PENDING_CS 64
-
#define HL_IDLE_BUSY_TS_ARR_SIZE 4096

/* Memory */
@@ -61,6 +58,9 @@

#define HL_MAX_SOB_VAL (1 << 15)

+#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0))
+#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1))
+
/**
* struct pgt_info - MMU hop page info.
* @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -285,6 +285,7 @@ struct asic_fixed_properties {
u32 high_pll;
u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size;
+ u32 max_pending_cs;
u8 tpc_enabled_mask;
u8 completion_queues_count;
};
@@ -782,7 +783,7 @@ struct hl_ctx {
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
- struct dma_fence *cs_pending[HL_MAX_PENDING_CS];
+ struct dma_fence **cs_pending;
struct hl_va_range *host_va_range;
struct hl_va_range *host_huge_va_range;
struct hl_va_range *dram_va_range;
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index f4434b39ef1b..29b96d24edc2 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
* write address offset in the SM block (QMAN LBW message).
* The write address offset is calculated as "COMP_OFFSET << 2".
*/
- offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+ offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);

--
2.17.1

2020-06-16 06:16:35

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 7/7] habanalabs: calculate trace frequency from PLL

From: Adam Aharon <[email protected]>

The profiler needs to know the PLL values for correctly showing the
profiling data. Because our firmware can use different PLL configurations,
we need to read the PLL values from the ASIC to pass them to the profiler.

Signed-off-by: Adam Aharon <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 33 ++++-
.../misc/habanalabs/gaudi/gaudi_coresight.c | 6 +-
drivers/misc/habanalabs/goya/goya.c | 33 ++++-
drivers/misc/habanalabs/goya/goya_coresight.c | 6 +-
drivers/misc/habanalabs/habanalabs.h | 11 ++
.../include/gaudi/asic_reg/gaudi_regs.h | 1 +
.../gaudi/asic_reg/psoc_cpu_pll_regs.h | 114 ++++++++++++++++++
7 files changed, 194 insertions(+), 10 deletions(-)
create mode 100644 drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1ac7eb2498ef..1bde7e6eeb9b 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -555,11 +555,36 @@ static int gaudi_early_fini(struct hl_device *hdev)
static void gaudi_fetch_psoc_frequency(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 trace_freq = 0;
+ u32 pll_clk = 0;
+ u32 div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2);
+ u32 div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2);
+ u32 nr = RREG32(mmPSOC_CPU_PLL_NR);
+ u32 nf = RREG32(mmPSOC_CPU_PLL_NF);
+ u32 od = RREG32(mmPSOC_CPU_PLL_OD);
+
+ if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) {
+ if (div_sel == DIV_SEL_REF_CLK)
+ trace_freq = PLL_REF_CLK;
+ else
+ trace_freq = PLL_REF_CLK / (div_fctr + 1);
+ } else if (div_sel == DIV_SEL_PLL_CLK ||
+ div_sel == DIV_SEL_DIVIDED_PLL) {
+ pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1));
+ if (div_sel == DIV_SEL_PLL_CLK)
+ trace_freq = pll_clk;
+ else
+ trace_freq = pll_clk / (div_fctr + 1);
+ } else {
+ dev_warn(hdev->dev,
+ "Received invalid div select value: %d", div_sel);
+ }

- prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR);
- prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF);
- prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD);
- prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
+ prop->psoc_timestamp_frequency = trace_freq;
+ prop->psoc_pci_pll_nr = nr;
+ prop->psoc_pci_pll_nf = nf;
+ prop->psoc_pci_pll_od = od;
+ prop->psoc_pci_pll_div_factor = div_fctr;
}

static int _gaudi_init_tpc_mem(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
index bf0e062d7b87..c32322cb1728 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
@@ -392,6 +392,7 @@ static int gaudi_config_stm(struct hl_device *hdev,
{
struct hl_debug_params_stm *input;
u64 base_reg;
+ u32 frequency;
int rc;

if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) {
@@ -420,7 +421,10 @@ static int gaudi_config_stm(struct hl_device *hdev,
WREG32(base_reg + 0xE00, lower_32_bits(input->sp_mask));
WREG32(base_reg + 0xEF4, input->id);
WREG32(base_reg + 0xDF4, 0x80);
- WREG32(base_reg + 0xE8C, input->frequency);
+ frequency = hdev->asic_prop.psoc_timestamp_frequency;
+ if (frequency == 0)
+ frequency = input->frequency;
+ WREG32(base_reg + 0xE8C, frequency);
WREG32(base_reg + 0xE90, 0x7FF);

/* SW-2176 - SW WA for HW bug */
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 547fd766667a..6816f6217ee1 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -594,11 +594,36 @@ static void goya_qman0_set_security(struct hl_device *hdev, bool secure)
static void goya_fetch_psoc_frequency(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 trace_freq = 0;
+ u32 pll_clk = 0;
+ u32 div_fctr = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
+ u32 div_sel = RREG32(mmPSOC_PCI_PLL_DIV_SEL_1);
+ u32 nr = RREG32(mmPSOC_PCI_PLL_NR);
+ u32 nf = RREG32(mmPSOC_PCI_PLL_NF);
+ u32 od = RREG32(mmPSOC_PCI_PLL_OD);
+
+ if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) {
+ if (div_sel == DIV_SEL_REF_CLK)
+ trace_freq = PLL_REF_CLK;
+ else
+ trace_freq = PLL_REF_CLK / (div_fctr + 1);
+ } else if (div_sel == DIV_SEL_PLL_CLK ||
+ div_sel == DIV_SEL_DIVIDED_PLL) {
+ pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1));
+ if (div_sel == DIV_SEL_PLL_CLK)
+ trace_freq = pll_clk;
+ else
+ trace_freq = pll_clk / (div_fctr + 1);
+ } else {
+ dev_warn(hdev->dev,
+ "Received invalid div select value: %d", div_sel);
+ }

- prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR);
- prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF);
- prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD);
- prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
+ prop->psoc_timestamp_frequency = trace_freq;
+ prop->psoc_pci_pll_nr = nr;
+ prop->psoc_pci_pll_nf = nf;
+ prop->psoc_pci_pll_od = od;
+ prop->psoc_pci_pll_div_factor = div_fctr;
}

int goya_late_init(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index 1258724ea510..472246e3f4cd 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -232,6 +232,7 @@ static int goya_config_stm(struct hl_device *hdev,
{
struct hl_debug_params_stm *input;
u64 base_reg;
+ u32 frequency;
int rc;

if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) {
@@ -264,7 +265,10 @@ static int goya_config_stm(struct hl_device *hdev,
WREG32(base_reg + 0xE20, 0xFFFFFFFF);
WREG32(base_reg + 0xEF4, input->id);
WREG32(base_reg + 0xDF4, 0x80);
- WREG32(base_reg + 0xE8C, input->frequency);
+ frequency = hdev->asic_prop.psoc_timestamp_frequency;
+ if (frequency == 0)
+ frequency = input->frequency;
+ WREG32(base_reg + 0xE8C, frequency);
WREG32(base_reg + 0xE90, 0x7FF);
WREG32(base_reg + 0xE80, 0x27 | (input->id << 16));
} else {
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 8cd4b55d0608..4e68a41cce77 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -247,6 +247,7 @@ struct hl_mmu_properties {
* @psoc_pci_pll_nf: PCI PLL NF value.
* @psoc_pci_pll_od: PCI PLL OD value.
* @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value.
+ * @psoc_timestamp_frequency: frequency of the psoc timestamp clock.
* @high_pll: high PLL frequency used by the device.
* @cb_pool_cb_cnt: number of CBs in the CB pool.
* @cb_pool_cb_size: size of each CB in the CB pool.
@@ -291,6 +292,7 @@ struct asic_fixed_properties {
u32 psoc_pci_pll_nf;
u32 psoc_pci_pll_od;
u32 psoc_pci_pll_div_factor;
+ u32 psoc_timestamp_frequency;
u32 high_pll;
u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size;
@@ -533,6 +535,15 @@ enum hl_pll_frequency {
PLL_LAST
};

+#define PLL_REF_CLK 50
+
+enum div_select_defs {
+ DIV_SEL_REF_CLK = 0,
+ DIV_SEL_PLL_CLK = 1,
+ DIV_SEL_DIVIDED_REF = 2,
+ DIV_SEL_DIVIDED_PLL = 3,
+};
+
/**
* struct hl_asic_funcs - ASIC specific functions that are can be called from
* common code.
diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
index 85e3b5148595..62078077aee5 100644
--- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
+++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
@@ -91,6 +91,7 @@

#include "psoc_pci_pll_regs.h"
#include "psoc_hbm_pll_regs.h"
+#include "psoc_cpu_pll_regs.h"

#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18
#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C
diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h
new file mode 100644
index 000000000000..2585c70f59ef
--- /dev/null
+++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+/************************************
+ ** This is an auto-generated file **
+ ** DO NOT EDIT BELOW **
+ ************************************/
+
+#ifndef ASIC_REG_PSOC_CPU_PLL_REGS_H_
+#define ASIC_REG_PSOC_CPU_PLL_REGS_H_
+
+/*
+ *****************************************
+ * PSOC_CPU_PLL (Prototype: PLL)
+ *****************************************
+ */
+
+#define mmPSOC_CPU_PLL_NR 0xC70100
+
+#define mmPSOC_CPU_PLL_NF 0xC70104
+
+#define mmPSOC_CPU_PLL_OD 0xC70108
+
+#define mmPSOC_CPU_PLL_NB 0xC7010C
+
+#define mmPSOC_CPU_PLL_CFG 0xC70110
+
+#define mmPSOC_CPU_PLL_LOSE_MASK 0xC70120
+
+#define mmPSOC_CPU_PLL_LOCK_INTR 0xC70128
+
+#define mmPSOC_CPU_PLL_LOCK_BYPASS 0xC7012C
+
+#define mmPSOC_CPU_PLL_DATA_CHNG 0xC70130
+
+#define mmPSOC_CPU_PLL_RST 0xC70134
+
+#define mmPSOC_CPU_PLL_SLIP_WD_CNTR 0xC70150
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_0 0xC70200
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_1 0xC70204
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_2 0xC70208
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_3 0xC7020C
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_0 0xC70220
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_1 0xC70224
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_2 0xC70228
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_3 0xC7022C
+
+#define mmPSOC_CPU_PLL_DIV_SEL_0 0xC70280
+
+#define mmPSOC_CPU_PLL_DIV_SEL_1 0xC70284
+
+#define mmPSOC_CPU_PLL_DIV_SEL_2 0xC70288
+
+#define mmPSOC_CPU_PLL_DIV_SEL_3 0xC7028C
+
+#define mmPSOC_CPU_PLL_DIV_EN_0 0xC702A0
+
+#define mmPSOC_CPU_PLL_DIV_EN_1 0xC702A4
+
+#define mmPSOC_CPU_PLL_DIV_EN_2 0xC702A8
+
+#define mmPSOC_CPU_PLL_DIV_EN_3 0xC702AC
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_0 0xC702C0
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_1 0xC702C4
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_2 0xC702C8
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_3 0xC702CC
+
+#define mmPSOC_CPU_PLL_CLK_GATER 0xC70300
+
+#define mmPSOC_CPU_PLL_CLK_RLX_0 0xC70310
+
+#define mmPSOC_CPU_PLL_CLK_RLX_1 0xC70314
+
+#define mmPSOC_CPU_PLL_CLK_RLX_2 0xC70318
+
+#define mmPSOC_CPU_PLL_CLK_RLX_3 0xC7031C
+
+#define mmPSOC_CPU_PLL_REF_CNTR_PERIOD 0xC70400
+
+#define mmPSOC_CPU_PLL_REF_LOW_THRESHOLD 0xC70410
+
+#define mmPSOC_CPU_PLL_REF_HIGH_THRESHOLD 0xC70420
+
+#define mmPSOC_CPU_PLL_PLL_NOT_STABLE 0xC70430
+
+#define mmPSOC_CPU_PLL_FREQ_CALC_EN 0xC70440
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_CFG 0xC70500
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_0 0xC70510
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_1 0xC70514
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_2 0xC70518
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_3 0xC7051C
+
+#endif /* ASIC_REG_PSOC_CPU_PLL_REGS_H_ */
--
2.17.1

2020-06-16 06:17:02

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 3/7] habanalabs: sync stream generic functionality

From: Ofir Bitton <[email protected]>

Currently sync stream is limited only for external queues. We want to
remove this constraint by adding a new queue property dedicated for sync
stream. In addition we move the initialization and reset methods to the
common code since we can re-use them with slight changes.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/command_submission.c | 6 ++-
drivers/misc/habanalabs/gaudi/gaudi.c | 46 ++-----------------
drivers/misc/habanalabs/gaudi/gaudiP.h | 2 -
drivers/misc/habanalabs/goya/goya.c | 12 -----
drivers/misc/habanalabs/habanalabs.h | 19 ++++++--
drivers/misc/habanalabs/hw_queue.c | 48 ++++++++++++++++++--
6 files changed, 67 insertions(+), 66 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index e156803f4a99..7b410bff72e0 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -727,6 +727,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
+ enum hl_queue_type q_type;
u64 *signal_seq_arr = NULL, signal_seq;
u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size;
int rc;
@@ -759,9 +760,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
chunk = &cs_chunk_array[0];
q_idx = chunk->queue_index;
hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
+ q_type = hw_queue_prop->type;

if ((q_idx >= HL_MAX_QUEUES) ||
- (hw_queue_prop->type != QUEUE_TYPE_EXT)) {
+ (!hw_queue_prop->supports_sync_stream)) {
dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
rc = -EINVAL;
goto free_cs_chunk_array;
@@ -858,7 +860,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,

*cs_seq = cs->sequence;

- job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
+ job = hl_cs_allocate_job(hdev, q_type, true);
if (!job) {
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 35e9080f6976..9ce032466243 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -345,10 +345,12 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 1;
+ prop->hw_queues_props[i].supports_sync_stream = 1;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
prop->hw_queues_props[i].requires_kernel_cb = 0;
+ prop->hw_queues_props[i].supports_sync_stream = 0;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) {
prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
prop->hw_queues_props[i].driver_only = 0;
@@ -357,6 +359,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 0;
+ prop->hw_queues_props[i].supports_sync_stream = 0;
}
}

@@ -364,7 +367,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].type = QUEUE_TYPE_NA;

prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
-
+ prop->sync_stream_first_sob = 0;
+ prop->sync_stream_first_mon = 0;
prop->dram_base_address = DRAM_PHYS_BASE;
prop->dram_size = GAUDI_HBM_SIZE_32GB;
prop->dram_end_address = prop->dram_base_address +
@@ -6263,44 +6267,6 @@ static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
return gaudi_cq_assignment[cq_idx];
}

-static void gaudi_ext_queue_init(struct hl_device *hdev, u32 q_idx)
-{
- struct gaudi_device *gaudi = hdev->asic_specific;
- struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
- struct hl_hw_sob *hw_sob;
- int sob, ext_idx = gaudi->ext_queue_idx++;
-
- /*
- * The external queues might not sit sequentially, hence use the
- * real external queue index for the SOB/MON base id.
- */
- hw_queue->base_sob_id = ext_idx * HL_RSVD_SOBS;
- hw_queue->base_mon_id = ext_idx * HL_RSVD_MONS;
- hw_queue->next_sob_val = 1;
- hw_queue->curr_sob_offset = 0;
-
- for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
- hw_sob = &hw_queue->hw_sob[sob];
- hw_sob->hdev = hdev;
- hw_sob->sob_id = hw_queue->base_sob_id + sob;
- hw_sob->q_idx = q_idx;
- kref_init(&hw_sob->kref);
- }
-}
-
-static void gaudi_ext_queue_reset(struct hl_device *hdev, u32 q_idx)
-{
- struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
-
- /*
- * In case we got here due to a stuck CS, the refcnt might be bigger
- * than 1 and therefore we reset it.
- */
- kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref);
- hw_queue->curr_sob_offset = 0;
- hw_queue->next_sob_val = 1;
-}
-
static u32 gaudi_get_signal_cb_size(struct hl_device *hdev)
{
return sizeof(struct packet_msg_short) +
@@ -6603,8 +6569,6 @@ static const struct hl_asic_funcs gaudi_funcs = {
.read_device_fw_version = gaudi_read_device_fw_version,
.load_firmware_to_device = gaudi_load_firmware_to_device,
.load_boot_fit_to_device = gaudi_load_boot_fit_to_device,
- .ext_queue_init = gaudi_ext_queue_init,
- .ext_queue_reset = gaudi_ext_queue_reset,
.get_signal_cb_size = gaudi_get_signal_cb_size,
.get_wait_cb_size = gaudi_get_wait_cb_size,
.gen_signal_cb = gaudi_gen_signal_cb,
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 76c3f840e05a..8c654711d543 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -234,7 +234,6 @@ struct gaudi_internal_qman_info {
* engine.
* @multi_msi_mode: whether we are working in multi MSI single MSI mode.
* Multi MSI is possible only with IOMMU enabled.
- * @ext_queue_idx: helper index for external queues initialization.
*/
struct gaudi_device {
int (*armcp_info_get)(struct hl_device *hdev);
@@ -253,7 +252,6 @@ struct gaudi_device {
u32 events_stat_aggregate[GAUDI_EVENT_SIZE];
u32 hw_cap_initialized;
u8 multi_msi_mode;
- u8 ext_queue_idx;
};

void gaudi_init_security(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index e872099a3f7a..547fd766667a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5160,16 +5160,6 @@ u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
return cq_idx;
}

-static void goya_ext_queue_init(struct hl_device *hdev, u32 q_idx)
-{
-
-}
-
-static void goya_ext_queue_reset(struct hl_device *hdev, u32 q_idx)
-{
-
-}
-
static u32 goya_get_signal_cb_size(struct hl_device *hdev)
{
return 0;
@@ -5283,8 +5273,6 @@ static const struct hl_asic_funcs goya_funcs = {
.read_device_fw_version = goya_read_device_fw_version,
.load_firmware_to_device = goya_load_firmware_to_device,
.load_boot_fit_to_device = goya_load_boot_fit_to_device,
- .ext_queue_init = goya_ext_queue_init,
- .ext_queue_reset = goya_ext_queue_reset,
.get_signal_cb_size = goya_get_signal_cb_size,
.get_wait_cb_size = goya_get_wait_cb_size,
.gen_signal_cb = goya_gen_signal_cb,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 64d9b2dd3e19..8cd4b55d0608 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -50,6 +50,10 @@
/* MMU */
#define MMU_HASH_TABLE_BITS 7 /* 1 << 7 buckets */

+/*
+ * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream
+ * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream
+ */
#define HL_RSVD_SOBS 4
#define HL_RSVD_MONS 2

@@ -141,11 +145,13 @@ struct hl_hw_sob {
* false otherwise.
* @requires_kernel_cb: true if a CB handle must be provided for jobs on this
* queue, false otherwise (a CB address must be provided).
+ * @supports_sync_stream: True if queue supports sync stream
*/
struct hw_queue_properties {
enum hl_queue_type type;
u8 driver_only;
u8 requires_kernel_cb;
+ u8 supports_sync_stream;
};

/**
@@ -245,6 +251,9 @@ struct hl_mmu_properties {
* @cb_pool_cb_cnt: number of CBs in the CB pool.
* @cb_pool_cb_size: size of each CB in the CB pool.
* @tpc_enabled_mask: which TPCs are enabled.
+ * @sync_stream_first_sob: first sync object available for sync stream use
+ * @sync_stream_first_mon: first monitor available for sync stream use
+ * @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
*/
struct asic_fixed_properties {
@@ -286,6 +295,8 @@ struct asic_fixed_properties {
u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size;
u32 max_pending_cs;
+ u16 sync_stream_first_sob;
+ u16 sync_stream_first_mon;
u8 tpc_enabled_mask;
u8 completion_queues_count;
};
@@ -423,6 +434,7 @@ struct hl_cs_job;
* exist).
* @curr_sob_offset: the id offset to the currently used SOB from the
* HL_RSVD_SOBS that are being used by this queue.
+ * @supports_sync_stream: True if queue supports sync stream
*/
struct hl_hw_queue {
struct hl_hw_sob hw_sob[HL_RSVD_SOBS];
@@ -441,6 +453,7 @@ struct hl_hw_queue {
u16 base_mon_id;
u8 valid;
u8 curr_sob_offset;
+ u8 supports_sync_stream;
};

/**
@@ -603,8 +616,6 @@ enum hl_pll_frequency {
* contained in registers
* @load_firmware_to_device: load the firmware to the device's memory
* @load_boot_fit_to_device: load boot fit to device's memory
- * @ext_queue_init: Initialize the given external queue.
- * @ext_queue_reset: Reset the given external queue.
* @get_signal_cb_size: Get signal CB size.
* @get_wait_cb_size: Get wait CB size.
* @gen_signal_cb: Generate a signal CB.
@@ -707,8 +718,6 @@ struct hl_asic_funcs {
enum hl_fw_component fwc);
int (*load_firmware_to_device)(struct hl_device *hdev);
int (*load_boot_fit_to_device)(struct hl_device *hdev);
- void (*ext_queue_init)(struct hl_device *hdev, u32 hw_queue_id);
- void (*ext_queue_reset)(struct hl_device *hdev, u32 hw_queue_id);
u32 (*get_signal_cb_size)(struct hl_device *hdev);
u32 (*get_wait_cb_size)(struct hl_device *hdev);
void (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id);
@@ -1436,6 +1445,7 @@ struct hl_device_idle_busy_ts {
* @cdev_sysfs_created: were char devices and sysfs nodes created.
* @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported.
+ * @sync_stream_queue_idx: helper index for sync stream queues initialization.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
*/
@@ -1523,6 +1533,7 @@ struct hl_device {
u8 cdev_sysfs_created;
u8 stop_on_err;
u8 supports_sync_stream;
+ u8 sync_stream_queue_idx;
u8 supports_coresight;
u8 supports_soft_reset;

diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 29b96d24edc2..a8fa8021c617 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -663,9 +663,6 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
q->ci = 0;
q->pi = 0;

- if (!is_cpu_queue)
- hdev->asic_funcs->ext_queue_init(hdev, q->hw_queue_id);
-
return 0;

free_queue:
@@ -732,6 +729,42 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
return 0;
}

+static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
+{
+ struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+ struct hl_hw_sob *hw_sob;
+ int sob, queue_idx = hdev->sync_stream_queue_idx++;
+
+ hw_queue->base_sob_id =
+ prop->sync_stream_first_sob + queue_idx * HL_RSVD_SOBS;
+ hw_queue->base_mon_id =
+ prop->sync_stream_first_mon + queue_idx * HL_RSVD_MONS;
+ hw_queue->next_sob_val = 1;
+ hw_queue->curr_sob_offset = 0;
+
+ for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
+ hw_sob = &hw_queue->hw_sob[sob];
+ hw_sob->hdev = hdev;
+ hw_sob->sob_id = hw_queue->base_sob_id + sob;
+ hw_sob->q_idx = q_idx;
+ kref_init(&hw_sob->kref);
+ }
+}
+
+void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx)
+{
+ struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
+
+ /*
+ * In case we got here due to a stuck CS, the refcnt might be bigger
+ * than 1 and therefore we reset it.
+ */
+ kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref);
+ hw_queue->curr_sob_offset = 0;
+ hw_queue->next_sob_val = 1;
+}
+
/*
* queue_init - main initialization function for H/W queue object
*
@@ -774,6 +807,9 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
break;
}

+ if (q->supports_sync_stream)
+ sync_stream_queue_init(hdev, q->hw_queue_id);
+
if (rc)
return rc;

@@ -848,6 +884,8 @@ int hl_hw_queues_create(struct hl_device *hdev)
i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {

q->queue_type = asic->hw_queues_props[i].type;
+ q->supports_sync_stream =
+ asic->hw_queues_props[i].supports_sync_stream;
rc = queue_init(hdev, q, i);
if (rc) {
dev_err(hdev->dev,
@@ -889,7 +927,7 @@ void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
continue;
q->pi = q->ci = 0;

- if (q->queue_type == QUEUE_TYPE_EXT)
- hdev->asic_funcs->ext_queue_reset(hdev, q->hw_queue_id);
+ if (q->supports_sync_stream)
+ sync_stream_queue_reset(hdev, q->hw_queue_id);
}
}
--
2.17.1

2020-06-16 06:17:08

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 5/7] uapi/habanalabs: fix some comments

MAP/UNMAP are done also for device memory.

Signed-off-by: Oded Gabbay <[email protected]>
---
include/uapi/misc/habanalabs.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index f6267a8d7416..f218d1c62c62 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -530,13 +530,13 @@ union hl_wait_cs_args {
struct hl_wait_cs_out out;
};

-/* Opcode to alloc device memory */
+/* Opcode to allocate device memory */
#define HL_MEM_OP_ALLOC 0
/* Opcode to free previously allocated device memory */
#define HL_MEM_OP_FREE 1
-/* Opcode to map host memory */
+/* Opcode to map host and device memory */
#define HL_MEM_OP_MAP 2
-/* Opcode to unmap previously mapped host memory */
+/* Opcode to unmap previously mapped host and device memory */
#define HL_MEM_OP_UNMAP 3

/* Memory flags */
--
2.17.1

2020-06-16 06:17:27

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 4/7] habanalabs: Use mask instead of shift in sync stream registers

From: Ofir Bitton <[email protected]>

Use proper bitfield masks instead of shifting values when configuring
packets sent to device.

Signed-off-by: Ofir Bitton <[email protected]>
Reviewed-by: Oded Gabbay <[email protected]>
Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/gaudi/gaudi.c | 70 ++++++++++---------
.../habanalabs/include/gaudi/gaudi_packets.h | 4 +-
2 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 9ce032466243..1ac7eb2498ef 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -21,6 +21,7 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/iommu.h>
#include <linux/seq_file.h>
+#include <linux/bitfield.h>

/*
* Gaudi security scheme:
@@ -6289,16 +6290,17 @@ static void gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id)
pkt = (struct packet_msg_short *) (uintptr_t) cb->kernel_address;
memset(pkt, 0, sizeof(*pkt));

- value = 1 << GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT; /* inc by 1 */
- value |= 1 << GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT; /* add mode */
+ /* Inc by 1, Mode ADD */
+ value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK, 1);
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK, 1);

- ctl = (sob_id * 4) << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; /* SOB id */
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */
- ctl |= 3 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S SOB base */
- ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT;
+ ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, sob_id * 4);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 3); /* W_S SOB base */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1);

pkt->value = cpu_to_le32(value);
pkt->ctl = cpu_to_le32(ctl);
@@ -6311,12 +6313,12 @@ static u32 gaudi_add_mon_msg_short(struct packet_msg_short *pkt, u32 value,

memset(pkt, 0, pkt_size);

- ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT;
- ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */
- ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; /* only last pkt needs MB */
+ ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 0); /* last pkt MB */

pkt->value = cpu_to_le32(value);
pkt->ctl = cpu_to_le32(ctl);
@@ -6332,18 +6334,19 @@ static u32 gaudi_add_arm_monitor_pkt(struct packet_msg_short *pkt, u16 sob_id,

memset(pkt, 0, pkt_size);

- value = (sob_id / 8) << GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_SHIFT;
- value |= sob_val << GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_SHIFT;
- value |= 0 << GAUDI_PKT_SHORT_VAL_MON_MODE_SHIFT; /* GREATER_OR_EQUAL */
- value |= mask << GAUDI_PKT_SHORT_VAL_MON_MASK_SHIFT;
+ value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_MASK, sob_id / 8);
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_MASK, sob_val);
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MODE_MASK,
+ 0); /* GREATER OR EQUAL*/
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MASK_MASK, mask);

- ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */
- ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */
- ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT;
+ ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1);

pkt->value = cpu_to_le32(value);
pkt->ctl = cpu_to_le32(ctl);
@@ -6357,15 +6360,14 @@ static u32 gaudi_add_fence_pkt(struct packet_fence *pkt)

memset(pkt, 0, pkt_size);

- cfg = 1 << GAUDI_PKT_FENCE_CFG_DEC_VAL_SHIFT;
- cfg |= 1 << GAUDI_PKT_FENCE_CFG_TARGET_VAL_SHIFT;
- cfg |= 2 << GAUDI_PKT_FENCE_CFG_ID_SHIFT;
+ cfg = FIELD_PREP(GAUDI_PKT_FENCE_CFG_DEC_VAL_MASK, 1);
+ cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK, 1);
+ cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_ID_MASK, 2);

- ctl = 0 << GAUDI_PKT_FENCE_CTL_PRED_SHIFT;
- ctl |= PACKET_FENCE << GAUDI_PKT_FENCE_CTL_OPCODE_SHIFT;
- ctl |= 0 << GAUDI_PKT_FENCE_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_FENCE_CTL_RB_SHIFT;
- ctl |= 1 << GAUDI_PKT_FENCE_CTL_MB_SHIFT;
+ ctl = FIELD_PREP(GAUDI_PKT_FENCE_CTL_OPCODE_MASK, PACKET_FENCE);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1);

pkt->cfg = cpu_to_le32(cfg);
pkt->ctl = cpu_to_le32(ctl);
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h
index 9a5800b0086b..02d62bc53f7f 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h
@@ -85,7 +85,7 @@ struct packet_msg_long {
};

#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT 0
-#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x0000EFFF
+#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x00007FFF

#define GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT 31
#define GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK 0x80000000
@@ -141,7 +141,7 @@ struct packet_msg_prot {
#define GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK 0x00FF0000

#define GAUDI_PKT_FENCE_CFG_ID_SHIFT 30
-#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC000000
+#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC0000000

#define GAUDI_PKT_FENCE_CTL_PRED_SHIFT 0
#define GAUDI_PKT_FENCE_CTL_PRED_MASK 0x0000001F
--
2.17.1

2020-06-16 06:18:54

by Oded Gabbay

[permalink] [raw]
Subject: [PATCH 6/7] habanalabs: align armcp_packet structure to 8 bytes

Once there is a 64-bit field in a structure, GCC compiler for ARM aligns
the structure to 8 bytes. In order to avoid confusion when these
structures are being passed between CPUs from different architectures, we
explicitly align the structure to 8 bytes.

Signed-off-by: Oded Gabbay <[email protected]>
---
drivers/misc/habanalabs/include/armcp_if.h | 2 ++
1 file changed, 2 insertions(+)

diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index a34fc39ad87e..dea7c90faafa 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -276,6 +276,8 @@ struct armcp_packet {
/* For get Armcp info/EEPROM data */
__le32 data_max_size;
};
+
+ __le32 reserved;
};

struct armcp_unmask_irq_arr_packet {
--
2.17.1

2020-06-18 10:55:18

by Omer Shpigelman

[permalink] [raw]
Subject: RE: [PATCH 1/7] habanalabs: remove rate limiters from GAUDI

On Tue, Jun 16, 2020 at 9:13 AM, Oded Gabbay <[email protected]> wrote:
> We no longer need to initialize the rate limiters in GAUDI A1.
>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Omer Shpigelman <[email protected]>

2020-06-18 11:02:25

by Omer Shpigelman

[permalink] [raw]
Subject: RE: [PATCH 6/7] habanalabs: align armcp_packet structure to 8 bytes

On Tue, Jun 13, 2020 at 09:13 AM, Oded Gabbay <[email protected]> wrote:
> Once there is a 64-bit field in a structure, GCC compiler for ARM aligns the
> structure to 8 bytes. In order to avoid confusion when these structures are
> being passed between CPUs from different architectures, we explicitly align
> the structure to 8 bytes.
>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Omer Shpigelman <[email protected]>

2020-06-18 11:58:31

by Omer Shpigelman

[permalink] [raw]
Subject: RE: [PATCH 5/7] uapi/habanalabs: fix some comments

On Tue, Jun 16, 2020 at 09:13 AM, Oded Gabbay <[email protected]> wrote:
> MAP/UNMAP are done also for device memory.
>
> Signed-off-by: Oded Gabbay <[email protected]>

Reviewed-by: Omer Shpigelman <[email protected]>