Jobs on some queues must be provided with a handle to a driver command
buffer object, while jobs on other queues must be provided with the
address of a command buffer.
Currently this distinction is made based on the queue type, which is
not flexible enough when the same queue type behaves differently on
different ASIC types.
This patch adds a new queue property for this purpose, which is
configured per queue type for each ASIC type.
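For illustration only (this is not part of the patch), a minimal
stand-alone sketch of the idea: the interpretation of the user's CB
handle is driven by a per-queue flag rather than by the queue type.
The types below are simplified stand-ins for the driver structures:

  /*
   * Stand-alone illustration, not driver code: the per-queue
   * "requires_kernel_cb" flag, not the queue type, decides whether the
   * user-supplied handle is a kernel CB handle or a raw address.
   */
  #include <stdbool.h>
  #include <stdio.h>

  enum queue_type { QUEUE_TYPE_EXT, QUEUE_TYPE_INT, QUEUE_TYPE_CPU };

  struct queue_props {
      enum queue_type type;
      bool driver_only;
      bool requires_kernel_cb;
  };

  static const char *cb_source(const struct queue_props *p)
  {
      return p->requires_kernel_cb ? "kernel-allocated CB handle"
                                   : "raw command buffer address";
  }

  int main(void)
  {
      struct queue_props ext = { QUEUE_TYPE_EXT, false, true };
      struct queue_props internal = { QUEUE_TYPE_INT, false, false };

      printf("EXT queue expects a %s\n", cb_source(&ext));
      printf("INT queue expects a %s\n", cb_source(&internal));
      return 0;
  }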
Signed-off-by: Tomer Tayar <[email protected]>
---
drivers/misc/habanalabs/command_submission.c | 4 +++-
drivers/misc/habanalabs/goya/goya.c | 3 +++
drivers/misc/habanalabs/habanalabs.h | 3 +++
3 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index a9ac045dcfde..f44205540520 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -414,7 +414,9 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
"Queue index %d is restricted for the kernel driver\n",
chunk->queue_index);
return NULL;
- } else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
+ }
+
+ if (!hw_queue_prop->requires_kernel_cb) {
*ext_queue = false;
return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
}
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 09caef7642fd..71693fcffb16 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -337,17 +337,20 @@ void goya_get_fixed_properties(struct hl_device *hdev)
for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
+ prop->hw_queues_props[i].requires_kernel_cb = 1;
}
for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
+ prop->hw_queues_props[i].requires_kernel_cb = 0;
}
for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
NUMBER_OF_INT_HW_QUEUES; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
prop->hw_queues_props[i].driver_only = 0;
+ prop->hw_queues_props[i].requires_kernel_cb = 0;
}
for (; i < HL_MAX_QUEUES; i++)
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index c3d24ffad9fa..f47f4b22cb6b 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -98,10 +98,13 @@ enum hl_queue_type {
* @type: queue type.
* @driver_only: true if only the driver is allowed to send a job to this queue,
* false otherwise.
+ * @requires_kernel_cb: true if a CB handle must be provided for jobs on this
+ * queue, false otherwise (a CB address must be provided).
*/
struct hw_queue_properties {
enum hl_queue_type type;
u8 driver_only;
+ u8 requires_kernel_cb;
};
/**
--
2.17.1
This patch adds support for a new H/W queue type.
This type of queue is for DMA and compute engine jobs, for which
completion notifications are sent by the H/W.
A command buffer for this queue type can be created either through the
CB IOCTL, using the retrieved CB handle, or by preparing a buffer on
the host or on the device SRAM/DRAM, and using the device address of
that buffer.
The patch includes the handling of both options, as well as the
initialization of the H/W queue and the scheduling of its jobs.
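As a rough, stand-alone illustration of how the two options are
resolved at submission time (mirroring the selection logic of
hw_queue_schedule_job() in the diff below; the structures here are
simplified assumptions, not the driver's):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct fake_cb {
      uint64_t bus_address;           /* DMA address of a kernel CB */
  };

  struct fake_job {
      struct fake_cb *patched_cb;     /* set when the driver patched the CB */
      struct fake_cb *user_cb;        /* CB object from the CB IOCTL handle */
      bool is_kernel_allocated_cb;
      uint64_t raw_address;           /* SRAM/DRAM/host address from the user */
  };

  /* Pick the address to place in the queue's buffer descriptor. */
  static uint64_t job_cb_ptr(const struct fake_job *job)
  {
      if (job->patched_cb)
          return job->patched_cb->bus_address;
      if (job->is_kernel_allocated_cb)
          return job->user_cb->bus_address;
      return job->raw_address;        /* the driver keeps this in user_cb itself */
  }

  int main(void)
  {
      struct fake_cb cb = { .bus_address = 0x1000 };
      struct fake_job with_handle = { .user_cb = &cb,
                                      .is_kernel_allocated_cb = true };
      struct fake_job with_address = { .raw_address = 0x2000 };

      printf("handle-based job  -> 0x%llx\n",
             (unsigned long long)job_cb_ptr(&with_handle));
      printf("address-based job -> 0x%llx\n",
             (unsigned long long)job_cb_ptr(&with_address));
      return 0;
  }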
Signed-off-by: Tomer Tayar <[email protected]>
---
drivers/misc/habanalabs/command_submission.c | 120 ++++++---
drivers/misc/habanalabs/goya/goya.c | 4 +-
drivers/misc/habanalabs/habanalabs.h | 24 +-
drivers/misc/habanalabs/hw_queue.c | 247 +++++++++++++++----
drivers/misc/habanalabs/include/qman_if.h | 12 +
5 files changed, 307 insertions(+), 100 deletions(-)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f44205540520..776ddafc47fb 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
kref_put(&cs->refcount, cs_do_release);
}
+static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
+{
+ /*
+ * Patched CB is created for external queues jobs, and for H/W queues
+ * jobs if the user CB was allocated by driver and MMU is disabled.
+ */
+ return (job->queue_type == QUEUE_TYPE_EXT ||
+ (job->queue_type == QUEUE_TYPE_HW &&
+ job->is_kernel_allocated_cb &&
+ !hdev->mmu_enable));
+}
+
/*
* cs_parser - parse the user command submission
*
@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
parser.patched_cb = NULL;
parser.user_cb = job->user_cb;
parser.user_cb_size = job->user_cb_size;
- parser.ext_queue = job->ext_queue;
+ parser.queue_type = job->queue_type;
+ parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
job->patched_cb = NULL;
rc = hdev->asic_funcs->cs_parser(hdev, &parser);
- if (job->ext_queue) {
+
+ if (is_cb_patched(hdev, job)) {
if (!rc) {
job->patched_cb = parser.patched_cb;
job->job_cb_size = parser.patched_cb_size;
@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
{
struct hl_cs *cs = job->cs;
- if (job->ext_queue) {
+ if (is_cb_patched(hdev, job)) {
hl_userptr_delete_list(hdev, &job->userptr_list);
/*
@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
}
}
+ /* For H/W queue jobs, if a user CB was allocated by driver and MMU is
+ * enabled, the user CB isn't released in cs_parser() and thus should be
+ * released here.
+ */
+ if (job->queue_type == QUEUE_TYPE_HW &&
+ job->is_kernel_allocated_cb && hdev->mmu_enable) {
+ spin_lock(&job->user_cb->lock);
+ job->user_cb->cs_cnt--;
+ spin_unlock(&job->user_cb->lock);
+
+ hl_cb_put(job->user_cb);
+ }
+
/*
* This is the only place where there can be multiple threads
* modifying the list at the same time
@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
hl_debugfs_remove_job(hdev, job);
- if (job->ext_queue)
+ if (job->queue_type == QUEUE_TYPE_EXT ||
+ job->queue_type == QUEUE_TYPE_HW)
cs_put(cs);
kfree(job);
@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
free_job(hdev, job);
}
-static struct hl_cb *validate_queue_index(struct hl_device *hdev,
- struct hl_cb_mgr *cb_mgr,
- struct hl_cs_chunk *chunk,
- bool *ext_queue)
+static int validate_queue_index(struct hl_device *hdev,
+ struct hl_cs_chunk *chunk,
+ enum hl_queue_type *queue_type,
+ bool *is_kernel_allocated_cb)
{
struct asic_fixed_properties *asic = &hdev->asic_prop;
struct hw_queue_properties *hw_queue_prop;
- u32 cb_handle;
- struct hl_cb *cb;
-
- /* Assume external queue */
- *ext_queue = true;
hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
@@ -406,22 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
(hw_queue_prop->type == QUEUE_TYPE_NA)) {
dev_err(hdev->dev, "Queue index %d is invalid\n",
chunk->queue_index);
- return NULL;
+ return -EINVAL;
}
if (hw_queue_prop->driver_only) {
dev_err(hdev->dev,
"Queue index %d is restricted for the kernel driver\n",
chunk->queue_index);
- return NULL;
+ return -EINVAL;
}
- if (!hw_queue_prop->requires_kernel_cb) {
- *ext_queue = false;
- return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
- }
+ *queue_type = hw_queue_prop->type;
+ *is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+
+ return 0;
+}
+
+static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
+ struct hl_cb_mgr *cb_mgr,
+ struct hl_cs_chunk *chunk)
+{
+ struct hl_cb *cb;
+ u32 cb_handle;
- /* Retrieve CB object */
cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
cb = hl_cb_get(hdev, cb_mgr, cb_handle);
@@ -446,7 +476,8 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
return NULL;
}
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+ enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
{
struct hl_cs_job *job;
@@ -454,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
if (!job)
return NULL;
- job->ext_queue = ext_queue;
+ job->queue_type = queue_type;
+ job->is_kernel_allocated_cb = is_kernel_allocated_cb;
- if (job->ext_queue) {
+ if (is_cb_patched(hdev, job))
INIT_LIST_HEAD(&job->userptr_list);
+
+ if (job->queue_type == QUEUE_TYPE_EXT)
INIT_WORK(&job->finish_work, job_wq_completion);
- }
return job;
}
@@ -472,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
- bool ext_queue_present = false;
+ bool int_queues_only = true;
u32 size_to_copy;
int rc, i, parse_cnt;
@@ -516,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
/* Validate ALL the CS chunks before submitting the CS */
for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
struct hl_cs_chunk *chunk = &cs_chunk_array[i];
- bool ext_queue;
+ enum hl_queue_type queue_type;
+ bool is_kernel_allocated_cb;
- cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
- &ext_queue);
- if (ext_queue) {
- ext_queue_present = true;
+ rc = validate_queue_index(hdev, chunk, &queue_type,
+ &is_kernel_allocated_cb);
+ if (rc)
+ goto free_cs_object;
+
+ if (is_kernel_allocated_cb) {
+ cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
if (!cb) {
rc = -EINVAL;
goto free_cs_object;
}
+ } else {
+ cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
}
- job = hl_cs_allocate_job(hdev, ext_queue);
+ if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+ int_queues_only = false;
+
+ job = hl_cs_allocate_job(hdev, queue_type,
+ is_kernel_allocated_cb);
if (!job) {
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
- if (ext_queue)
+ if (is_kernel_allocated_cb)
goto release_cb;
else
goto free_cs_object;
@@ -542,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
job->cs = cs;
job->user_cb = cb;
job->user_cb_size = chunk->cb_size;
- if (job->ext_queue)
+ if (is_kernel_allocated_cb)
job->job_cb_size = cb->size;
else
job->job_cb_size = chunk->cb_size;
@@ -555,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
/*
* Increment CS reference. When CS reference is 0, CS is
* done and can be signaled to user and free all its resources
- * Only increment for JOB on external queues, because only
- * for those JOBs we get completion
+ * Only increment for JOB on external or H/W queues, because
+ * only for those JOBs we get completion
*/
- if (job->ext_queue)
+ if (job->queue_type == QUEUE_TYPE_EXT ||
+ job->queue_type == QUEUE_TYPE_HW)
cs_get(cs);
hl_debugfs_add_job(hdev, job);
@@ -572,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
}
}
- if (!ext_queue_present) {
+ if (int_queues_only) {
dev_err(hdev->dev,
- "Reject CS %d.%llu because no external queues jobs\n",
+ "Reject CS %d.%llu because only internal queues jobs are present\n",
cs->ctx->asid, cs->sequence);
rc = -EINVAL;
goto free_cs_object;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 71693fcffb16..0b40915bede2 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -3943,7 +3943,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
{
struct goya_device *goya = hdev->asic_specific;
- if (!parser->ext_queue)
+ if (parser->queue_type == QUEUE_TYPE_INT)
return goya_parse_cb_no_ext_queue(hdev, parser);
if (goya->hw_cap_initialized & HW_CAP_MMU)
@@ -4614,7 +4614,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
lin_dma_pkt++;
} while (--lin_dma_pkts_cnt);
- job = hl_cs_allocate_job(hdev, true);
+ job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
if (!job) {
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index f47f4b22cb6b..371d1ec15697 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -85,12 +85,15 @@ struct hl_fpriv;
* @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
* memories and/or operates the compute engines.
* @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
+ * @QUEUE_TYPE_HW: queue of DMA and compute engines jobs, for which completion
+ * notifications are sent by H/W.
*/
enum hl_queue_type {
QUEUE_TYPE_NA,
QUEUE_TYPE_EXT,
QUEUE_TYPE_INT,
- QUEUE_TYPE_CPU
+ QUEUE_TYPE_CPU,
+ QUEUE_TYPE_HW
};
/**
@@ -755,11 +758,14 @@ struct hl_cs {
* @userptr_list: linked-list of userptr mappings that belong to this job and
* wait for completion.
* @debugfs_list: node in debugfs list of command submission jobs.
+ * @queue_type: the type of the H/W queue this job is submitted to.
* @id: the id of this job inside a CS.
* @hw_queue_id: the id of the H/W queue this job is submitted to.
* @user_cb_size: the actual size of the CB we got from the user.
* @job_cb_size: the actual size of the CB that we put on the queue.
- * @ext_queue: whether the job is for external queue or internal queue.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ * handle to a kernel-allocated CB object, false
+ * otherwise (SRAM/DRAM/host address).
*/
struct hl_cs_job {
struct list_head cs_node;
@@ -769,11 +775,12 @@ struct hl_cs_job {
struct work_struct finish_work;
struct list_head userptr_list;
struct list_head debugfs_list;
+ enum hl_queue_type queue_type;
u32 id;
u32 hw_queue_id;
u32 user_cb_size;
u32 job_cb_size;
- u8 ext_queue;
+ u8 is_kernel_allocated_cb;
};
/**
@@ -784,24 +791,28 @@ struct hl_cs_job {
* @job_userptr_list: linked-list of userptr mappings that belong to the related
* job and wait for completion.
* @cs_sequence: the sequence number of the related CS.
+ * @queue_type: the type of the H/W queue this job is submitted to.
* @ctx_id: the ID of the context the related CS belongs to.
* @hw_queue_id: the id of the H/W queue this job is submitted to.
* @user_cb_size: the actual size of the CB we got from the user.
* @patched_cb_size: the size of the CB after parsing.
- * @ext_queue: whether the job is for external queue or internal queue.
* @job_id: the id of the related job inside the related CS.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ * handle to a kernel-allocated CB object, false
+ * otherwise (SRAM/DRAM/host address).
*/
struct hl_cs_parser {
struct hl_cb *user_cb;
struct hl_cb *patched_cb;
struct list_head *job_userptr_list;
u64 cs_sequence;
+ enum hl_queue_type queue_type;
u32 ctx_id;
u32 hw_queue_id;
u32 user_cb_size;
u32 patched_cb_size;
- u8 ext_queue;
u8 job_id;
+ u8 is_kernel_allocated_cb;
};
@@ -1504,7 +1515,8 @@ int hl_cb_pool_init(struct hl_device *hdev);
int hl_cb_pool_fini(struct hl_device *hdev);
void hl_cs_rollback_all(struct hl_device *hdev);
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+ enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void goya_set_asic_funcs(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index f733b534f738..185628b98d7e 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -58,8 +58,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
}
/*
- * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
- *
+ * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
+ * H/W queue.
* @hdev: pointer to habanalabs device structure
* @q: pointer to habanalabs queue structure
* @ctl: BD's control word
@@ -73,8 +73,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
* This function must be called when the scheduler mutex is taken
*
*/
-static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
- u32 ctl, u32 len, u64 ptr)
+static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
+ struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
{
struct hl_bd *bd;
@@ -173,6 +173,45 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
return 0;
}
+/*
+ * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
+ * @hdev: Pointer to hl_device structure.
+ * @q: Pointer to hl_hw_queue structure.
+ * @num_of_entries: How many entries to check for space.
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the completion queue.
+ * This check also ensures that there is enough space in the h/w queue, as
+ * both queues are of the same size.
+ * - Reserve space in the completion queue (needs to be reversed if there
+ * is a failure down the road before the actual submission of work).
+ *
+ * Both operations are done using the "free_slots_cnt" field of the completion
+ * queue. The CI counters of the queue and the completion queue are not
+ * needed/used for the H/W queue type.
+ */
+static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
+ int num_of_entries)
+{
+ atomic_t *free_slots =
+ &hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
+
+ /*
+ * Check we have enough space in the completion queue.
+ * Add -1 to counter (decrement) unless counter was already 0.
+ * In that case, CQ is full so we can't submit a new CB.
+ * atomic_add_unless will return 0 if counter was already 0.
+ */
+ if (atomic_add_negative(num_of_entries * -1, free_slots)) {
+ dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
+ num_of_entries, q->hw_queue_id);
+ atomic_add(num_of_entries, free_slots);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
/*
* hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
*
@@ -206,11 +245,18 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
goto out;
}
- rc = ext_queue_sanity_checks(hdev, q, 1, false);
- if (rc)
- goto out;
+ /*
+ * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
+ * type only on init phase, when the queues are empty and being tested,
+ * so there is no need for sanity checks.
+ */
+ if (q->queue_type != QUEUE_TYPE_HW) {
+ rc = ext_queue_sanity_checks(hdev, q, 1, false);
+ if (rc)
+ goto out;
+ }
- ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+ ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
out:
if (q->queue_type != QUEUE_TYPE_CPU)
@@ -220,14 +266,14 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
}
/*
- * ext_hw_queue_schedule_job - submit a JOB to an external queue
+ * ext_queue_schedule_job - submit a JOB to an external queue
*
* @job: pointer to the job that needs to be submitted to the queue
*
* This function must be called when the scheduler mutex is taken
*
*/
-static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+static void ext_queue_schedule_job(struct hl_cs_job *job)
{
struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -260,7 +306,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
* H/W queues is done under the scheduler mutex
*
* No need to check if CQ is full because it was already
- * checked in hl_queue_sanity_checks
+ * checked in ext_queue_sanity_checks
*/
cq = &hdev->completion_queue[q->hw_queue_id];
cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
@@ -274,18 +320,18 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
cq->pi = hl_cq_inc_ptr(cq->pi);
- ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+ ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}
/*
- * int_hw_queue_schedule_job - submit a JOB to an internal queue
+ * int_queue_schedule_job - submit a JOB to an internal queue
*
* @job: pointer to the job that needs to be submitted to the queue
*
* This function must be called when the scheduler mutex is taken
*
*/
-static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+static void int_queue_schedule_job(struct hl_cs_job *job)
{
struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -307,6 +353,60 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job)
hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}
+/*
+ * hw_queue_schedule_job - submit a JOB to a H/W queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void hw_queue_schedule_job(struct hl_cs_job *job)
+{
+ struct hl_device *hdev = job->cs->ctx->hdev;
+ struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+ struct hl_cq *cq;
+ u64 ptr;
+ u32 offset, ctl, len;
+
+ /*
+ * Upon PQE completion, COMP_DATA is used as the write data to the
+ * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
+ * write address offset in the SM block (QMAN LBW message).
+ * The write address offset is calculated as "COMP_OFFSET << 2".
+ */
+ offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+ ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
+ ((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
+
+ len = job->job_cb_size;
+
+ /*
+ * A patched CB is created only if a user CB was allocated by driver and
+ * MMU is disabled. If MMU is enabled, the user CB should be used
+ * instead. If the user CB wasn't allocated by driver, assume that it
+ * holds an address.
+ */
+ if (job->patched_cb)
+ ptr = job->patched_cb->bus_address;
+ else if (job->is_kernel_allocated_cb)
+ ptr = job->user_cb->bus_address;
+ else
+ ptr = (u64) (uintptr_t) job->user_cb;
+
+ /*
+ * No need to protect pi_offset because scheduling to the
+ * H/W queues is done under the scheduler mutex
+ *
+ * No need to check if CQ is full because it was already
+ * checked in hw_queue_sanity_checks
+ */
+ cq = &hdev->completion_queue[q->hw_queue_id];
+ cq->pi = hl_cq_inc_ptr(cq->pi);
+
+ ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
/*
* hl_hw_queue_schedule_cs - schedule a command submission
*
@@ -330,23 +430,34 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
}
q = &hdev->kernel_queues[0];
- /* This loop assumes all external queues are consecutive */
for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
- if (q->queue_type == QUEUE_TYPE_EXT) {
- if (cs->jobs_in_queue_cnt[i]) {
+ if (cs->jobs_in_queue_cnt[i]) {
+ switch (q->queue_type) {
+ case QUEUE_TYPE_EXT:
rc = ext_queue_sanity_checks(hdev, q,
- cs->jobs_in_queue_cnt[i], true);
- if (rc)
- goto unroll_cq_resv;
- cq_cnt++;
- }
- } else if (q->queue_type == QUEUE_TYPE_INT) {
- if (cs->jobs_in_queue_cnt[i]) {
+ cs->jobs_in_queue_cnt[i], true);
+ break;
+ case QUEUE_TYPE_INT:
rc = int_queue_sanity_checks(hdev, q,
- cs->jobs_in_queue_cnt[i]);
- if (rc)
- goto unroll_cq_resv;
+ cs->jobs_in_queue_cnt[i]);
+ break;
+ case QUEUE_TYPE_HW:
+ rc = hw_queue_sanity_checks(hdev, q,
+ cs->jobs_in_queue_cnt[i]);
+ break;
+ default:
+ dev_err(hdev->dev, "Queue type %d is invalid\n",
+ q->queue_type);
+ rc = -EINVAL;
+ break;
}
+
+ if (rc)
+ goto unroll_cq_resv;
+
+ if (q->queue_type == QUEUE_TYPE_EXT ||
+ q->queue_type == QUEUE_TYPE_HW)
+ cq_cnt++;
}
}
@@ -373,21 +484,30 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
}
list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
- if (job->ext_queue)
- ext_hw_queue_schedule_job(job);
- else
- int_hw_queue_schedule_job(job);
+ switch (job->queue_type) {
+ case QUEUE_TYPE_EXT:
+ ext_queue_schedule_job(job);
+ break;
+ case QUEUE_TYPE_INT:
+ int_queue_schedule_job(job);
+ break;
+ case QUEUE_TYPE_HW:
+ hw_queue_schedule_job(job);
+ break;
+ default:
+ break;
+ }
cs->submitted = true;
goto out;
unroll_cq_resv:
- /* This loop assumes all external queues are consecutive */
q = &hdev->kernel_queues[0];
for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
- if ((q->queue_type == QUEUE_TYPE_EXT) &&
- (cs->jobs_in_queue_cnt[i])) {
+ if ((q->queue_type == QUEUE_TYPE_EXT ||
+ q->queue_type == QUEUE_TYPE_HW) &&
+ cs->jobs_in_queue_cnt[i]) {
atomic_t *free_slots =
&hdev->completion_queue[i].free_slots_cnt;
atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
@@ -414,8 +534,8 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
q->ci = hl_queue_inc_ptr(q->ci);
}
-static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
- struct hl_hw_queue *q, bool is_cpu_queue)
+static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+ bool is_cpu_queue)
{
void *p;
int rc;
@@ -465,7 +585,7 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
return rc;
}
-static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
void *p;
@@ -485,18 +605,38 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
return 0;
}
-static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
- return ext_and_cpu_hw_queue_init(hdev, q, true);
+ return ext_and_cpu_queue_init(hdev, q, true);
}
-static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
- return ext_and_cpu_hw_queue_init(hdev, q, false);
+ return ext_and_cpu_queue_init(hdev, q, false);
+}
+
+static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+ void *p;
+
+ p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
+ HL_QUEUE_SIZE_IN_BYTES,
+ &q->bus_address,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!p)
+ return -ENOMEM;
+
+ q->kernel_address = (u64) (uintptr_t) p;
+
+ /* Make sure read/write pointers are initialized to start of queue */
+ q->ci = 0;
+ q->pi = 0;
+
+ return 0;
}
/*
- * hw_queue_init - main initialization function for H/W queue object
+ * queue_init - main initialization function for H/W queue object
*
* @hdev: pointer to hl_device device structure
* @q: pointer to hl_hw_queue queue structure
@@ -505,7 +645,7 @@ static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
* Allocate dma-able memory for the queue and initialize fields
* Returns 0 on success
*/
-static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
u32 hw_queue_id)
{
int rc;
@@ -516,21 +656,20 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
switch (q->queue_type) {
case QUEUE_TYPE_EXT:
- rc = ext_hw_queue_init(hdev, q);
+ rc = ext_queue_init(hdev, q);
break;
-
case QUEUE_TYPE_INT:
- rc = int_hw_queue_init(hdev, q);
+ rc = int_queue_init(hdev, q);
break;
-
case QUEUE_TYPE_CPU:
- rc = cpu_hw_queue_init(hdev, q);
+ rc = cpu_queue_init(hdev, q);
+ break;
+ case QUEUE_TYPE_HW:
+ rc = hw_queue_init(hdev, q);
break;
-
case QUEUE_TYPE_NA:
q->valid = 0;
return 0;
-
default:
dev_crit(hdev->dev, "wrong queue type %d during init\n",
q->queue_type);
@@ -554,7 +693,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
*
* Free the queue memory
*/
-static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
+static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
{
if (!q->valid)
return;
@@ -612,7 +751,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
q->queue_type = asic->hw_queues_props[i].type;
- rc = hw_queue_init(hdev, q, i);
+ rc = queue_init(hdev, q, i);
if (rc) {
dev_err(hdev->dev,
"failed to initialize queue %d\n", i);
@@ -624,7 +763,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
release_queues:
for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
- hw_queue_fini(hdev, q);
+ queue_fini(hdev, q);
kfree(hdev->kernel_queues);
@@ -637,7 +776,7 @@ void hl_hw_queues_destroy(struct hl_device *hdev)
int i;
for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
- hw_queue_fini(hdev, q);
+ queue_fini(hdev, q);
kfree(hdev->kernel_queues);
}
diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/qman_if.h
index bf59bbe27fdc..0fdb49188ed7 100644
--- a/drivers/misc/habanalabs/include/qman_if.h
+++ b/drivers/misc/habanalabs/include/qman_if.h
@@ -23,6 +23,8 @@ struct hl_bd {
#define HL_BD_SIZE sizeof(struct hl_bd)
/*
+ * S/W CTL FIELDS.
+ *
* BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
* valid. 1 means the repeat field is valid, 0 means not-valid,
* i.e. repeat == 1
@@ -33,6 +35,16 @@ struct hl_bd {
#define BD_CTL_SHADOW_INDEX_SHIFT 0
#define BD_CTL_SHADOW_INDEX_MASK 0x00000FFF
+/*
+ * H/W CTL FIELDS
+ */
+
+#define BD_CTL_COMP_OFFSET_SHIFT 16
+#define BD_CTL_COMP_OFFSET_MASK 0x00FF0000
+
+#define BD_CTL_COMP_DATA_SHIFT 0
+#define BD_CTL_COMP_DATA_MASK 0x0000FFFF
+
/*
* COMPLETION QUEUE
*/
--
2.17.1
The patch adds a pre-CS-scheduling ASIC function for operations that
need to run just before placing the CS jobs on the queues.
For H/W queues, it is used to configure a dedicated sync object and
monitor for the CS. The sync object value is increased when each of
the jobs is completed. When it reaches the number of jobs, the monitor
generates an interrupt towards the host to inform the driver about the
completion of the CS.
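Below is a stand-alone model of that mechanism, purely for illustration
(goya_pre_schedule_cs() in this patch is still a stub): a counter that
each completed job increments, and a monitor that signals the host once
the counter reaches the CS job count:

  #include <stdbool.h>
  #include <stdio.h>

  struct model_monitor {
      unsigned int sob_value;   /* sync object value */
      unsigned int target;      /* number of jobs in the CS */
      bool fired;               /* interrupt raised towards the host */
  };

  static void model_arm(struct model_monitor *m, unsigned int jobs_cnt)
  {
      m->sob_value = 0;
      m->target = jobs_cnt;
      m->fired = false;
  }

  static void model_job_done(struct model_monitor *m)
  {
      if (++m->sob_value == m->target)
          m->fired = true;      /* monitor generates the CS-completion IRQ */
  }

  int main(void)
  {
      struct model_monitor m;

      model_arm(&m, 3);
      for (int i = 0; i < 3; i++)
          model_job_done(&m);

      printf("CS completion interrupt fired: %s\n", m.fired ? "yes" : "no");
      return 0;
  }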
Signed-off-by: Tomer Tayar <[email protected]>
---
drivers/misc/habanalabs/command_submission.c | 1 +
drivers/misc/habanalabs/goya/goya.c | 8 +++++++-
drivers/misc/habanalabs/goya/goyaP.h | 2 ++
drivers/misc/habanalabs/habanalabs.h | 6 ++++++
drivers/misc/habanalabs/hw_queue.c | 8 ++++++++
5 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 776ddafc47fb..25dc7308da19 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -592,6 +592,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
job->hw_queue_id = chunk->queue_index;
cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+ cs->jobs_cnt++;
list_add_tail(&job->cs_node, &cs->job_list);
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0b40915bede2..d0d4c1a38dbe 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -5089,6 +5089,11 @@ static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev)
return RREG32(mmHW_STATE);
}
+int goya_pre_schedule_cs(struct hl_cs *cs)
+{
+ return 0;
+}
+
static const struct hl_asic_funcs goya_funcs = {
.early_init = goya_early_init,
.early_fini = goya_early_fini,
@@ -5145,7 +5150,8 @@ static const struct hl_asic_funcs goya_funcs = {
.init_iatu = goya_init_iatu,
.rreg = hl_rreg,
.wreg = hl_wreg,
- .halt_coresight = goya_halt_coresight
+ .halt_coresight = goya_halt_coresight,
+ .pre_schedule_cs = goya_pre_schedule_cs
};
/*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index 89b6574f8e4f..346e70d3afa8 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -233,4 +233,6 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev);
+int goya_pre_schedule_cs(struct hl_cs *cs);
+
#endif /* GOYAP_H_ */
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 371d1ec15697..c1af83f96415 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -433,6 +433,8 @@ enum hl_pll_frequency {
PLL_LAST
};
+struct hl_cs;
+
/**
* struct hl_asic_funcs - ASIC specific functions that are can be called from
* common code.
@@ -508,6 +510,7 @@ enum hl_pll_frequency {
* @rreg: Read a register. Needed for simulator support.
* @wreg: Write a register. Needed for simulator support.
* @halt_coresight: stop the ETF and ETR traces.
+ * @pre_schedule_cs: Perform pre-CS-scheduling operations.
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -590,6 +593,7 @@ struct hl_asic_funcs {
u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
void (*halt_coresight)(struct hl_device *hdev);
+ int (*pre_schedule_cs)(struct hl_cs *cs);
};
@@ -722,6 +726,7 @@ struct hl_userptr {
* @mirror_node : node in device mirror list of command submissions.
* @debugfs_list: node in debugfs list of command submissions.
* @sequence: the sequence number of this CS.
+ * @jobs_cnt: counter of submitted jobs on all queues.
* @submitted: true if CS was submitted to H/W.
* @completed: true if CS was completed by device.
* @timedout : true if CS was timedout.
@@ -740,6 +745,7 @@ struct hl_cs {
struct list_head mirror_node;
struct list_head debugfs_list;
u64 sequence;
+ u32 jobs_cnt;
u8 submitted;
u8 completed;
u8 timedout;
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 185628b98d7e..a1205ae47250 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -461,6 +461,14 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
}
}
+ rc = hdev->asic_funcs->pre_schedule_cs(cs);
+ if (rc) {
+ dev_err(hdev->dev,
+ "Failed in pre-submission operations of CS %d.%llu\n",
+ cs->ctx->asid, cs->sequence);
+ goto unroll_cq_resv;
+ }
+
spin_lock(&hdev->hw_queues_mirror_lock);
list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
--
2.17.1
This patch adds an IRQ handler for completions of CSs whose jobs are
sent to H/W queues.
The patch adds a CS shadow queue, from which the handler retrieves the
CS, and a dedicated workqueue, on which the handler queues a work item
to free the CS jobs.
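As a hedged sketch only, not part of this series, an ASIC-specific init
path might wire the new handler up roughly as follows; the interrupt
numbers, their count and the helper name are placeholders:

  /*
   * Hypothetical wiring, not part of this series: request one
   * CS-completion interrupt per relative index and pass the handler an
   * hl_cs_irq_info so it can locate the CS in the shadow queue.
   */
  #include <linux/interrupt.h>
  #include "habanalabs.h"

  static int example_request_cs_cmplt_irqs(struct hl_device *hdev,
                      struct hl_cs_irq_info *infos,
                      int first_irq, int nr_irqs)
  {
      int i, rc;

      for (i = 0 ; i < nr_irqs ; i++) {
          infos[i].hdev = hdev;
          infos[i].relative_idx = i;

          rc = request_irq(first_irq + i, hl_irq_handler_cs_cmplt, 0,
                  "hl-cs-completion", &infos[i]);
          if (rc)
              return rc;
      }

      return 0;
  }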
Signed-off-by: Tomer Tayar <[email protected]>
---
drivers/misc/habanalabs/command_submission.c | 16 +++++++
drivers/misc/habanalabs/device.c | 27 +++++++++++-
drivers/misc/habanalabs/habanalabs.h | 18 ++++++++
drivers/misc/habanalabs/hw_queue.c | 2 +
drivers/misc/habanalabs/irq.c | 46 ++++++++++++++++++++
5 files changed, 107 insertions(+), 2 deletions(-)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 25dc7308da19..b995a02a31dd 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -267,6 +267,8 @@ static void cs_do_release(struct kref *ref)
hl_ctx_put(cs->ctx);
+ hdev->shadow_cs_queue[cs->sequence & (HL_MAX_PENDING_CS - 1)] = NULL;
+
if (cs->timedout)
dma_fence_set_error(cs->fence, -ETIMEDOUT);
else if (cs->aborted)
@@ -391,6 +393,7 @@ void hl_cs_rollback_all(struct hl_device *hdev)
/* flush all completions */
flush_workqueue(hdev->cq_wq);
+ flush_workqueue(hdev->cs_cmplt_wq);
/* Make sure we don't have leftovers in the H/W queues mirror list */
list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
@@ -415,6 +418,16 @@ static void job_wq_completion(struct work_struct *work)
free_job(hdev, job);
}
+static void cs_completion(struct work_struct *work)
+{
+ struct hl_cs *cs = container_of(work, struct hl_cs, finish_work);
+ struct hl_device *hdev = cs->ctx->hdev;
+ struct hl_cs_job *job, *tmp;
+
+ list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
+ free_job(hdev, job);
+}
+
static int validate_queue_index(struct hl_device *hdev,
struct hl_cs_chunk *chunk,
enum hl_queue_type *queue_type,
@@ -625,6 +638,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
goto free_cs_object;
}
+ if (job->queue_type == QUEUE_TYPE_HW)
+ INIT_WORK(&cs->finish_work, cs_completion);
+
rc = hl_hw_queue_schedule_cs(cs);
if (rc) {
dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 2f5a4da707e7..6c13f05c3120 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -284,11 +284,19 @@ static int device_early_init(struct hl_device *hdev)
goto free_cq_wq;
}
+ hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
+ if (!hdev->cs_cmplt_wq) {
+ dev_err(hdev->dev,
+ "Failed to allocate CS completions workqueue\n");
+ rc = -ENOMEM;
+ goto free_eq_wq;
+ }
+
hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
GFP_KERNEL);
if (!hdev->hl_chip_info) {
rc = -ENOMEM;
- goto free_eq_wq;
+ goto free_cs_cmplt_wq;
}
hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
@@ -314,6 +322,8 @@ static int device_early_init(struct hl_device *hdev)
free_chip_info:
kfree(hdev->hl_chip_info);
+free_cs_cmplt_wq:
+ destroy_workqueue(hdev->cs_cmplt_wq);
free_eq_wq:
destroy_workqueue(hdev->eq_wq);
free_cq_wq:
@@ -346,6 +356,7 @@ static void device_early_fini(struct hl_device *hdev)
kfree(hdev->idle_busy_ts_arr);
kfree(hdev->hl_chip_info);
+ destroy_workqueue(hdev->cs_cmplt_wq);
destroy_workqueue(hdev->eq_wq);
destroy_workqueue(hdev->cq_wq);
@@ -1138,6 +1149,14 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
}
}
+ hdev->shadow_cs_queue = kmalloc_array(HL_MAX_PENDING_CS,
+ sizeof(*hdev->shadow_cs_queue),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!hdev->shadow_cs_queue) {
+ rc = -ENOMEM;
+ goto cq_fini;
+ }
+
/*
* Initialize the event queue. Must be done before hw_init,
* because there the address of the event queue is being
@@ -1146,7 +1165,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
rc = hl_eq_init(hdev, &hdev->event_queue);
if (rc) {
dev_err(hdev->dev, "failed to initialize event queue\n");
- goto cq_fini;
+ goto free_shadow_cs_queue;
}
/* MMU S/W must be initialized before kernel context is created */
@@ -1269,6 +1288,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
hl_mmu_fini(hdev);
eq_fini:
hl_eq_fini(hdev, &hdev->event_queue);
+free_shadow_cs_queue:
+ kfree(hdev->shadow_cs_queue);
cq_fini:
for (i = 0 ; i < cq_ready_cnt ; i++)
hl_cq_fini(hdev, &hdev->completion_queue[i]);
@@ -1383,6 +1404,8 @@ void hl_device_fini(struct hl_device *hdev)
hl_eq_fini(hdev, &hdev->event_queue);
+ kfree(hdev->shadow_cs_queue);
+
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
hl_cq_fini(hdev, &hdev->completion_queue[i]);
kfree(hdev->completion_queue);
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index c1af83f96415..2efb5e1e62cb 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -722,6 +722,7 @@ struct hl_userptr {
* @job_lock: spinlock for the CS's jobs list. Needed for free_job.
* @refcount: reference counter for usage of the CS.
* @fence: pointer to the fence object of this CS.
+ * @finish_work: workqueue object to run when CS is completed by H/W.
* @work_tdr: delayed work node for TDR.
* @mirror_node : node in device mirror list of command submissions.
* @debugfs_list: node in debugfs list of command submissions.
@@ -741,6 +742,7 @@ struct hl_cs {
spinlock_t job_lock;
struct kref refcount;
struct dma_fence *fence;
+ struct work_struct finish_work;
struct delayed_work work_tdr;
struct list_head mirror_node;
struct list_head debugfs_list;
@@ -1203,8 +1205,12 @@ struct hl_device_idle_busy_ts {
* @asic_name: ASIC specific nmae.
* @asic_type: ASIC specific type.
* @completion_queue: array of hl_cq.
+ * @shadow_cs_queue: pointer to a shadow queue that holds pointers to
+ * outstanding command submissions.
* @cq_wq: work queue of completion queues for executing work in process context
* @eq_wq: work queue of event queue for executing work in process context.
+ * @cs_cmplt_wq: work queue of CS completions for executing work in process
+ * context.
* @kernel_ctx: Kernel driver context structure.
* @kernel_queues: array of hl_hw_queue.
* @hw_queues_mirror_list: CS mirror list for TDR.
@@ -1284,8 +1290,10 @@ struct hl_device {
char asic_name[16];
enum hl_asic_type asic_type;
struct hl_cq *completion_queue;
+ struct hl_cs **shadow_cs_queue;
struct workqueue_struct *cq_wq;
struct workqueue_struct *eq_wq;
+ struct workqueue_struct *cs_cmplt_wq;
struct hl_ctx *kernel_ctx;
struct hl_hw_queue *kernel_queues;
struct list_head hw_queues_mirror_list;
@@ -1359,6 +1367,15 @@ struct hl_device {
u8 pldm;
};
+/**
+ * struct hl_cs_irq_info - IRQ info structure for CS completion interrupt.
+ * @hdev: pointer to habanalabs device structure.
+ * @relative_idx: CS completion relative interrupt index (0-based).
+ */
+struct hl_cs_irq_info {
+ struct hl_device *hdev;
+ int relative_idx;
+};
/*
* IOCTLs
@@ -1470,6 +1487,7 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q);
void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q);
irqreturn_t hl_irq_handler_cq(int irq, void *arg);
irqreturn_t hl_irq_handler_eq(int irq, void *arg);
+irqreturn_t hl_irq_handler_cs_cmplt(int irq, void *arg);
u32 hl_cq_inc_ptr(u32 ptr);
int hl_asid_init(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index a1205ae47250..7b80e571a27c 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -469,6 +469,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
goto unroll_cq_resv;
}
+ hdev->shadow_cs_queue[cs->sequence & (HL_MAX_PENDING_CS - 1)] = cs;
+
spin_lock(&hdev->hw_queues_mirror_lock);
list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c
index fac65fbd70e8..93fa13218dd4 100644
--- a/drivers/misc/habanalabs/irq.c
+++ b/drivers/misc/habanalabs/irq.c
@@ -205,6 +205,52 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
return IRQ_HANDLED;
}
+/*
+ * hl_irq_handler_cs_cmplt() - irq handler for CS completions.
+ * @irq: IRQ number
+ * @arg: pointer to hl_device structure.
+ */
+irqreturn_t hl_irq_handler_cs_cmplt(int irq, void *arg)
+{
+ struct hl_cs_irq_info *cs_irq_info = arg;
+ struct hl_device *hdev = cs_irq_info->hdev;
+ struct hl_cs *cs;
+ struct hl_cs_job *job;
+ struct hl_cq *cq;
+ int relative_idx = cs_irq_info->relative_idx;
+
+ if (hdev->disabled) {
+ dev_dbg(hdev->dev,
+ "Device disabled but received IRQ %d for CS completion\n",
+ irq);
+ goto out;
+ }
+
+ cs = hdev->shadow_cs_queue[relative_idx & (HL_MAX_PENDING_CS - 1)];
+ if (!cs) {
+ dev_warn(hdev->dev,
+ "No pointer to CS in shadow array at index %d\n",
+ relative_idx);
+ goto out;
+ }
+
+ queue_work(hdev->cs_cmplt_wq, &cs->finish_work);
+
+ /*
+ * The same CQs can be accessed from parallel IRQ handlers that handle
+ * the completion of different CSs. However, locking is not needed
+ * because the "free_slots_cnt" variable is atomic.
+ * There is no need to update the CI counters of the queues/CQs, as they
+ * are not needed/used for the H/W queue type.
+ */
+ list_for_each_entry(job, &cs->job_list, cs_node) {
+ cq = &hdev->completion_queue[job->hw_queue_id];
+ atomic_inc(&cq->free_slots_cnt);
+ }
+out:
+ return IRQ_HANDLED;
+}
+
/*
* hl_cq_init - main initialization function for an cq object
*
--
2.17.1