This patch series enables userspace utilities like gputop and nvtop to
query a render context's fdinfo file and figure out rates of engine
and memory utilisation.
Previous discussion can be found at
https://lore.kernel.org/dri-devel/[email protected]/
Changelog:
v3:
- Fixed some nits and removed useless bounds check in panthor_sched.c
- Added support for sysfs profiling knob and optional job accounting
- Added new patches for calculating the size of internal BOs
v2:
- Split original first patch in two, one for FW CS cycle and timestamp
calculations and job accounting memory management, and a second one
that enables fdinfo.
- Moved NUM_INSTRS_PER_SLOT to the file prelude
- Removed nelem variable from the group's struct definition.
- Precompute size of group's syncobj BO to avoid code duplication.
- Some minor nits.
Adrián Larumbe (7):
drm/panthor: introduce job cycle and timestamp accounting
drm/panthor: add DRM fdinfo support
drm/panthor: enable fdinfo for memory stats
drm/panthor: add sysfs knob for enabling job profiling
drm/panthor: support job accounting
drm/drm_file: add display of driver's internal memory size
drm/panthor: register size of internal objects through fdinfo
Documentation/gpu/drm-usage-stats.rst | 4 +
drivers/gpu/drm/drm_file.c | 9 +-
drivers/gpu/drm/msm/msm_drv.c | 2 +-
drivers/gpu/drm/panfrost/panfrost_drv.c | 2 +-
drivers/gpu/drm/panthor/panthor_devfreq.c | 10 +
drivers/gpu/drm/panthor/panthor_device.c | 2 +
drivers/gpu/drm/panthor/panthor_device.h | 21 ++
drivers/gpu/drm/panthor/panthor_drv.c | 83 +++++-
drivers/gpu/drm/panthor/panthor_fw.c | 16 +-
drivers/gpu/drm/panthor/panthor_fw.h | 5 +-
drivers/gpu/drm/panthor/panthor_gem.c | 67 ++++-
drivers/gpu/drm/panthor/panthor_gem.h | 16 +-
drivers/gpu/drm/panthor/panthor_heap.c | 23 +-
drivers/gpu/drm/panthor/panthor_heap.h | 6 +-
drivers/gpu/drm/panthor/panthor_mmu.c | 8 +-
drivers/gpu/drm/panthor/panthor_mmu.h | 3 +-
drivers/gpu/drm/panthor/panthor_sched.c | 304 +++++++++++++++++++---
include/drm/drm_file.h | 7 +-
18 files changed, 522 insertions(+), 66 deletions(-)
base-commit: 310ec03841a36e3f45fb528f0dfdfe5b9e84b037
--
2.45.1
Enable calculation of job execution times in clock cycles and wall
time. This is done by expanding the boilerplate command stream when running
a job to include instructions that compute said times right before and
after the user CS.
Those numbers are stored in the queue's group's sync objects BO, right
after the sync objects themselves. Because the queues in a group might have
different numbers of slots, one must keep track of the overall slot tally
when computing the offset of a queue's time sample structs, one per slot.
NUM_INSTRS_PER_SLOT had to be increased to 32 because new FW instructions
were added for sampling and storing the cycle counter and timestamp
registers, and it must always remain a power of two.
This commit is done in preparation for enabling DRM fdinfo support in the
Panthor driver, which depends on the numbers calculated herein.
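For illustration, here is a minimal pseudo-C sketch of the offset
arithmetic this patch implements (queue_count, ringbuf_size[] and
syncobjs_bo_gpuva stand in for the corresponding driver values):

  /* Job time samples live right after the per-queue sync objects. */
  job_times_offset = queue_count * sizeof(struct panthor_syncobj_64b);

  /* Each queue's sample area begins after those of all previous queues. */
  for (i = 0, slots_so_far = 0; i < queue_count; i++) {
          queue[i].time_offset = job_times_offset +
                  slots_so_far * sizeof(struct panthor_job_times);
          slots_so_far += ringbuf_size[i] / SLOTSIZE;
  }

  /* A job's sample is then indexed by its ring buffer slot. */
  times_addr = syncobjs_bo_gpuva + queue->time_offset +
               ringbuf_index * sizeof(struct panthor_job_times);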
Signed-off-by: Adrián Larumbe <[email protected]>
Reviewed-by: Liviu Dudau <[email protected]>
---
drivers/gpu/drm/panthor/panthor_sched.c | 156 ++++++++++++++++++++----
1 file changed, 132 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 79ffcbc41d78..62a67d6bd37a 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -93,6 +93,9 @@
#define MIN_CSGS 3
#define MAX_CSG_PRIO 0xf
+#define NUM_INSTRS_PER_SLOT 32
+#define SLOTSIZE (NUM_INSTRS_PER_SLOT * sizeof(u64))
+
struct panthor_group;
/**
@@ -466,6 +469,9 @@ struct panthor_queue {
*/
struct list_head in_flight_jobs;
} fence_ctx;
+
+ /** @time_offset: Offset of panthor_job_times structs in group's syncobj bo. */
+ unsigned long time_offset;
};
/**
@@ -592,7 +598,17 @@ struct panthor_group {
* One sync object per queue. The position of the sync object is
* determined by the queue index.
*/
- struct panthor_kernel_bo *syncobjs;
+
+ struct {
+ /** @bo: Kernel BO holding the sync objects. */
+ struct panthor_kernel_bo *bo;
+
+ /**
+ * @job_times_offset: Beginning of panthor_job_times struct samples after
+ * the group's array of sync objects.
+ */
+ size_t job_times_offset;
+ } syncobjs;
/** @state: Group state. */
enum panthor_group_state state;
@@ -651,6 +667,18 @@ struct panthor_group {
struct list_head wait_node;
};
+struct panthor_job_times {
+ struct {
+ u64 before;
+ u64 after;
+ } cycles;
+
+ struct {
+ u64 before;
+ u64 after;
+ } time;
+};
+
/**
* group_queue_work() - Queue a group work
* @group: Group to queue the work for.
@@ -730,6 +758,9 @@ struct panthor_job {
/** @queue_idx: Index of the queue inside @group. */
u32 queue_idx;
+ /** @ringbuf_idx: Index of the ringbuffer inside @queue. */
+ u32 ringbuf_idx;
+
/** @call_info: Information about the userspace command stream call. */
struct {
/** @start: GPU address of the userspace command stream. */
@@ -844,7 +875,7 @@ static void group_release_work(struct work_struct *work)
panthor_kernel_bo_destroy(group->suspend_buf);
panthor_kernel_bo_destroy(group->protm_suspend_buf);
- panthor_kernel_bo_destroy(group->syncobjs);
+ panthor_kernel_bo_destroy(group->syncobjs.bo);
panthor_vm_put(group->vm);
kfree(group);
@@ -1969,8 +2000,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
}
}
-#define NUM_INSTRS_PER_SLOT 16
-
static void
group_term_post_processing(struct panthor_group *group)
{
@@ -2007,7 +2036,7 @@ group_term_post_processing(struct panthor_group *group)
spin_unlock(&queue->fence_ctx.lock);
/* Manually update the syncobj seqno to unblock waiters. */
- syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj));
+ syncobj = group->syncobjs.bo->kmap + (i * sizeof(*syncobj));
syncobj->status = ~0;
syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno);
sched_queue_work(group->ptdev->scheduler, sync_upd);
@@ -2780,7 +2809,7 @@ static void group_sync_upd_work(struct work_struct *work)
if (!queue)
continue;
- syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj));
+ syncobj = group->syncobjs.bo->kmap + (queue_idx * sizeof(*syncobj));
spin_lock(&queue->fence_ctx.lock);
list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) {
@@ -2815,11 +2844,17 @@ queue_run_job(struct drm_sched_job *sched_job)
struct panthor_scheduler *sched = ptdev->scheduler;
u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
+ u32 ringbuf_index = ringbuf_insert / (SLOTSIZE);
u64 addr_reg = ptdev->csif_info.cs_reg_count -
ptdev->csif_info.unpreserved_cs_reg_count;
u64 val_reg = addr_reg + 2;
- u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
- job->queue_idx * sizeof(struct panthor_syncobj_64b);
+ u64 cycle_reg = addr_reg;
+ u64 time_reg = val_reg;
+ u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) +
+ job->queue_idx * sizeof(struct panthor_syncobj_64b);
+ u64 times_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) + queue->time_offset +
+ (ringbuf_index * sizeof(struct panthor_job_times));
+
u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
struct dma_fence *done_fence;
int ret;
@@ -2831,6 +2866,18 @@ queue_run_job(struct drm_sched_job *sched_job)
/* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
(36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
+ /* MOV48 rX:rX+1, cycles_offset */
+ (1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.before)),
+
+ /* MOV48 rX:rX+1, time_offset */
+ (1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.before)),
+
+ /* STORE_STATE cycles */
+ (40ull << 56) | (cycle_reg << 40) | (1ll << 32),
+
+ /* STORE_STATE timer */
+ (40ull << 56) | (time_reg << 40) | (0ll << 32),
+
/* MOV48 rX:rX+1, cs.start */
(1ull << 56) | (addr_reg << 48) | job->call_info.start,
@@ -2843,6 +2890,18 @@ queue_run_job(struct drm_sched_job *sched_job)
/* CALL rX:rX+1, rX+2 */
(32ull << 56) | (addr_reg << 40) | (val_reg << 32),
+ /* MOV48 rX:rX+1, cycles_offset */
+ (1ull << 56) | (cycle_reg << 48) | (times_addr + offsetof(struct panthor_job_times, cycles.after)),
+
+ /* MOV48 rX:rX+1, time_offset */
+ (1ull << 56) | (time_reg << 48) | (times_addr + offsetof(struct panthor_job_times, time.after)),
+
+ /* STORE_STATE cycles */
+ (40ull << 56) | (cycle_reg << 40) | (1ll << 32),
+
+ /* STORE_STATE timer */
+ (40ull << 56) | (time_reg << 40) | (0ll << 32),
+
/* MOV48 rX:rX+1, sync_addr */
(1ull << 56) | (addr_reg << 48) | sync_addr,
@@ -2897,6 +2956,7 @@ queue_run_job(struct drm_sched_job *sched_job)
job->ringbuf.start = queue->iface.input->insert;
job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
+ job->ringbuf_idx = ringbuf_index;
/* Make sure the ring buffer is updated before the INSERT
* register.
@@ -2987,7 +3047,8 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
static struct panthor_queue *
group_create_queue(struct panthor_group *group,
- const struct drm_panthor_queue_create *args)
+ const struct drm_panthor_queue_create *args,
+ unsigned int slots_so_far)
{
struct drm_gpu_scheduler *drm_sched;
struct panthor_queue *queue;
@@ -3038,9 +3099,12 @@ group_create_queue(struct panthor_group *group,
goto err_free_queue;
}
+ queue->time_offset = group->syncobjs.job_times_offset +
+ (slots_so_far * sizeof(struct panthor_job_times));
+
ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
group->ptdev->scheduler->wq, 1,
- args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
+ args->ringbuf_size / SLOTSIZE,
0, msecs_to_jiffies(JOB_TIMEOUT_MS),
group->ptdev->reset.wq,
NULL, "panthor-queue", group->ptdev->base.dev);
@@ -3068,7 +3132,9 @@ int panthor_group_create(struct panthor_file *pfile,
struct panthor_scheduler *sched = ptdev->scheduler;
struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0);
struct panthor_group *group = NULL;
+ unsigned int total_slots;
u32 gid, i, suspend_size;
+ size_t syncobj_bo_size;
int ret;
if (group_args->pad)
@@ -3134,33 +3200,75 @@ int panthor_group_create(struct panthor_file *pfile,
goto err_put_group;
}
- group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm,
- group_args->queues.count *
- sizeof(struct panthor_syncobj_64b),
- DRM_PANTHOR_BO_NO_MMAP,
- DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
- DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
- PANTHOR_VM_KERNEL_AUTO_VA);
- if (IS_ERR(group->syncobjs)) {
- ret = PTR_ERR(group->syncobjs);
+ /*
+ * Need to add size for the panthor_job_times structs, as many as the sum
+ * of the number of job slots for every single queue ringbuffer.
+ */
+ for (i = 0, total_slots = 0; i < group_args->queues.count; i++)
+ total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE));
+
+ syncobj_bo_size = (group_args->queues.count * sizeof(struct panthor_syncobj_64b))
+ + (total_slots * sizeof(struct panthor_job_times));
+
+ /*
+ * Memory layout of group's syncobjs BO
+ * group->syncobjs.bo {
+ * struct panthor_syncobj_64b sync1;
+ * struct panthor_syncobj_64b sync2;
+ * ...
+ * As many as group_args->queues.count
+ * ...
+ * struct panthor_syncobj_64b syncn;
+ * struct panthor_job_times queue1_slot1
+ * struct panthor_job_times queue1_slot2
+ * ...
+ * As many as queue[i].ringbuf_size / SLOTSIZE
+ * ...
+ * struct panthor_job_times queue1_slotP
+ * ...
+ * As many as group_args->queues.count
+ * ...
+ * struct panthor_job_times queueN_slot1
+ * struct panthor_job_times queueN_slot2
+ * ...
+ * As many as queue[n].ringbuf_size / SLOTSIZE
+ * struct panthor_job_times queueN_slotQ
+ *
+ * Linearly, group->syncobjs.bo = {syncobj1,..,syncobjN,
+ * {queue1 = {js1,..,jsP},..,queueN = {js1,..,jsQ}}}
+ * }
+ *
+ */
+
+ group->syncobjs.bo = panthor_kernel_bo_create(ptdev, group->vm,
+ syncobj_bo_size,
+ DRM_PANTHOR_BO_NO_MMAP,
+ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+ PANTHOR_VM_KERNEL_AUTO_VA);
+ if (IS_ERR(group->syncobjs.bo)) {
+ ret = PTR_ERR(group->syncobjs.bo);
goto err_put_group;
}
- ret = panthor_kernel_bo_vmap(group->syncobjs);
+ ret = panthor_kernel_bo_vmap(group->syncobjs.bo);
if (ret)
goto err_put_group;
- memset(group->syncobjs->kmap, 0,
- group_args->queues.count * sizeof(struct panthor_syncobj_64b));
+ memset(group->syncobjs.bo->kmap, 0, syncobj_bo_size);
+
+ group->syncobjs.job_times_offset =
+ group_args->queues.count * sizeof(struct panthor_syncobj_64b);
- for (i = 0; i < group_args->queues.count; i++) {
- group->queues[i] = group_create_queue(group, &queue_args[i]);
+ for (i = 0, total_slots = 0; i < group_args->queues.count; i++) {
+ group->queues[i] = group_create_queue(group, &queue_args[i], total_slots);
if (IS_ERR(group->queues[i])) {
ret = PTR_ERR(group->queues[i]);
group->queues[i] = NULL;
goto err_put_group;
}
+ total_slots += (queue_args[i].ringbuf_size / (SLOTSIZE));
group->queue_count++;
}
--
2.45.1
Drawing from the FW-calculated cycle and timestamp values introduced in the
previous commit, accumulate per-open-file totals by collecting the samples
of finished jobs when updating their group's synchronisation objects.
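With this in place, reading a render context's fdinfo file yields engine
and frequency stats along the lines of the following (values made up for
illustration; the drm-engine-panthor line is only emitted when
CONFIG_ARM_ARCH_TIMER is available):

  drm-engine-panthor:	988689 ns
  drm-cycles-panthor:	1153728
  drm-maxfreq-panthor:	1000000000 Hz
  drm-curfreq-panthor:	400000000 Hz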
Signed-off-by: Adrián Larumbe <[email protected]>
---
drivers/gpu/drm/panthor/panthor_devfreq.c | 10 +++++
drivers/gpu/drm/panthor/panthor_device.h | 11 ++++++
drivers/gpu/drm/panthor/panthor_drv.c | 31 +++++++++++++++
drivers/gpu/drm/panthor/panthor_sched.c | 46 +++++++++++++++++++++++
4 files changed, 98 insertions(+)
diff --git a/drivers/gpu/drm/panthor/panthor_devfreq.c b/drivers/gpu/drm/panthor/panthor_devfreq.c
index c6d3c327cc24..5eededaeade7 100644
--- a/drivers/gpu/drm/panthor/panthor_devfreq.c
+++ b/drivers/gpu/drm/panthor/panthor_devfreq.c
@@ -91,6 +91,7 @@ static int panthor_devfreq_get_dev_status(struct device *dev,
spin_lock_irqsave(&pdevfreq->lock, irqflags);
panthor_devfreq_update_utilization(pdevfreq);
+ ptdev->current_frequency = status->current_frequency;
status->total_time = ktime_to_ns(ktime_add(pdevfreq->busy_time,
pdevfreq->idle_time));
@@ -130,6 +131,7 @@ int panthor_devfreq_init(struct panthor_device *ptdev)
struct panthor_devfreq *pdevfreq;
struct dev_pm_opp *opp;
unsigned long cur_freq;
+ unsigned long freq = ULONG_MAX;
int ret;
pdevfreq = drmm_kzalloc(&ptdev->base, sizeof(*ptdev->devfreq), GFP_KERNEL);
@@ -204,6 +206,14 @@ int panthor_devfreq_init(struct panthor_device *ptdev)
dev_pm_opp_put(opp);
+ /* Find the fastest defined rate */
+ opp = dev_pm_opp_find_freq_floor(dev, &freq);
+ if (IS_ERR(opp))
+ return PTR_ERR(opp);
+ ptdev->fast_rate = freq;
+
+ dev_pm_opp_put(opp);
+
/*
* Setup default thresholds for the simple_ondemand governor.
* The values are chosen based on experiments.
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index e388c0472ba7..8a0260a7b90a 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -162,6 +162,14 @@ struct panthor_device {
*/
struct page *dummy_latest_flush;
} pm;
+
+ unsigned long current_frequency;
+ unsigned long fast_rate;
+};
+
+struct panthor_gpu_usage {
+ u64 time;
+ u64 cycles;
};
/**
@@ -176,6 +184,9 @@ struct panthor_file {
/** @groups: Scheduling group pool attached to this file. */
struct panthor_group_pool *groups;
+
+ /** @stats: cycle and timestamp measures for job execution. */
+ struct panthor_gpu_usage stats;
};
int panthor_device_init(struct panthor_device *ptdev);
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index b8a84f26b3ef..6d25385e02a1 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -3,12 +3,17 @@
/* Copyright 2019 Linaro, Ltd., Rob Herring <[email protected]> */
/* Copyright 2019 Collabora ltd. */
+#ifdef CONFIG_ARM_ARCH_TIMER
+#include <asm/arch_timer.h>
+#endif
+
#include <linux/list.h>
#include <linux/module.h>
#include <linux/of_platform.h>
#include <linux/pagemap.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
+#include <linux/time64.h>
#include <drm/drm_debugfs.h>
#include <drm/drm_drv.h>
@@ -1351,6 +1356,30 @@ static int panthor_mmap(struct file *filp, struct vm_area_struct *vma)
return ret;
}
+static void panthor_gpu_show_fdinfo(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
+ struct drm_printer *p)
+{
+#ifdef CONFIG_ARM_ARCH_TIMER
+ drm_printf(p, "drm-engine-panthor:\t%llu ns\n",
+ DIV_ROUND_UP_ULL((pfile->stats.time * NSEC_PER_SEC),
+ arch_timer_get_cntfrq()));
+#endif
+ drm_printf(p, "drm-cycles-panthor:\t%llu\n", pfile->stats.cycles);
+ drm_printf(p, "drm-maxfreq-panthor:\t%lu Hz\n", ptdev->fast_rate);
+ drm_printf(p, "drm-curfreq-panthor:\t%lu Hz\n", ptdev->current_frequency);
+}
+
+static void panthor_show_fdinfo(struct drm_printer *p, struct drm_file *file)
+{
+ struct drm_device *dev = file->minor->dev;
+ struct panthor_device *ptdev = container_of(dev, struct panthor_device, base);
+
+ panthor_gpu_show_fdinfo(ptdev, file->driver_priv, p);
+
+ drm_show_memory_stats(p, file);
+}
+
static const struct file_operations panthor_drm_driver_fops = {
.open = drm_open,
.release = drm_release,
@@ -1360,6 +1389,7 @@ static const struct file_operations panthor_drm_driver_fops = {
.read = drm_read,
.llseek = noop_llseek,
.mmap = panthor_mmap,
+ .show_fdinfo = drm_show_fdinfo,
};
#ifdef CONFIG_DEBUG_FS
@@ -1378,6 +1408,7 @@ static const struct drm_driver panthor_drm_driver = {
DRIVER_SYNCOBJ_TIMELINE | DRIVER_GEM_GPUVA,
.open = panthor_open,
.postclose = panthor_postclose,
+ .show_fdinfo = panthor_show_fdinfo,
.ioctls = panthor_drm_driver_ioctls,
.num_ioctls = ARRAY_SIZE(panthor_drm_driver_ioctls),
.fops = &panthor_drm_driver_fops,
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 62a67d6bd37a..bbd20db40e7b 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -610,6 +610,18 @@ struct panthor_group {
size_t job_times_offset;
} syncobjs;
+ /** @fdinfo: Per-file total cycle and timestamp values reference. */
+ struct {
+ /** @data: Pointer to actual per-file sample data. */
+ struct panthor_gpu_usage *data;
+
+ /**
+ * @lock: Mutex to govern concurrent access from drm file's fdinfo callback
+ * and job post-completion processing function
+ */
+ struct mutex lock;
+ } fdinfo;
+
/** @state: Group state. */
enum panthor_group_state state;
@@ -870,6 +882,8 @@ static void group_release_work(struct work_struct *work)
release_work);
u32 i;
+ mutex_destroy(&group->fdinfo.lock);
+
for (i = 0; i < group->queue_count; i++)
group_free_queue(group, group->queues[i]);
@@ -2792,6 +2806,30 @@ void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed)
}
}
+static void update_fdinfo_stats(struct panthor_job *job)
+{
+ struct panthor_group *group = job->group;
+ struct panthor_queue *queue = group->queues[job->queue_idx];
+ struct panthor_device *ptdev = group->ptdev;
+ struct panthor_gpu_usage *fdinfo;
+ struct panthor_job_times *times;
+
+ drm_WARN_ON(&ptdev->base, job->ringbuf_idx >=
+ panthor_kernel_bo_size(queue->ringbuf) / (SLOTSIZE));
+
+ times = (struct panthor_job_times *)
+ ((unsigned long)group->syncobjs.bo->kmap + queue->time_offset +
+ (job->ringbuf_idx * sizeof(struct panthor_job_times)));
+
+ mutex_lock(&group->fdinfo.lock);
+ if ((group->fdinfo.data)) {
+ fdinfo = group->fdinfo.data;
+ fdinfo->cycles += times->cycles.after - times->cycles.before;
+ fdinfo->time += times->time.after - times->time.before;
+ }
+ mutex_unlock(&group->fdinfo.lock);
+}
+
static void group_sync_upd_work(struct work_struct *work)
{
struct panthor_group *group =
@@ -2827,6 +2865,7 @@ static void group_sync_upd_work(struct work_struct *work)
dma_fence_end_signalling(cookie);
list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
+ update_fdinfo_stats(job);
list_del_init(&job->node);
panthor_job_put(&job->base);
}
@@ -3289,6 +3328,9 @@ int panthor_group_create(struct panthor_file *pfile,
}
mutex_unlock(&sched->reset.lock);
+ group->fdinfo.data = &pfile->stats;
+ mutex_init(&group->fdinfo.lock);
+
return gid;
err_put_group:
@@ -3328,6 +3370,10 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle)
mutex_unlock(&sched->lock);
mutex_unlock(&sched->reset.lock);
+ mutex_lock(&group->fdinfo.lock);
+ group->fdinfo.data = NULL;
+ mutex_unlock(&group->fdinfo.lock);
+
group_put(group);
return 0;
}
--
2.45.1
Implement the DRM GEM object status callback so that memory stats can
report residency.
Also, consider a PRIME-imported BO to be resident if its matching dma_buf
has an open attachment, which means its backing storage has already been
allocated.
Signed-off-by: Adrián Larumbe <[email protected]>
Reviewed-by: Liviu Dudau <[email protected]>
---
drivers/gpu/drm/panthor/panthor_gem.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 38f560864879..c60b599665d8 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -145,6 +145,17 @@ panthor_gem_prime_export(struct drm_gem_object *obj, int flags)
return drm_gem_prime_export(obj, flags);
}
+static enum drm_gem_object_status panthor_gem_status(struct drm_gem_object *obj)
+{
+ struct panthor_gem_object *bo = to_panthor_bo(obj);
+ enum drm_gem_object_status res = 0;
+
+ if (bo->base.base.import_attach || bo->base.pages)
+ res |= DRM_GEM_OBJECT_RESIDENT;
+
+ return res;
+}
+
static const struct drm_gem_object_funcs panthor_gem_funcs = {
.free = panthor_gem_free_object,
.print_info = drm_gem_shmem_object_print_info,
@@ -154,6 +165,7 @@ static const struct drm_gem_object_funcs panthor_gem_funcs = {
.vmap = drm_gem_shmem_object_vmap,
.vunmap = drm_gem_shmem_object_vunmap,
.mmap = panthor_gem_mmap,
+ .status = panthor_gem_status,
.export = panthor_gem_prime_export,
.vm_ops = &drm_gem_shmem_vm_ops,
};
--
2.45.1
This includes DRM objects created to support queues, groups and heaps, as
well as objects whose pages are shared between the GPU and the MCU.
However, this doesn't include objects that hold the firmware's binary
regions, since these aren't owned by a render context and are allocated
only once, at driver initialisation time.
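With internal BOs accounted for, the memory section of a render context's
fdinfo ends up looking something like this (sizes made up for illustration;
the drm-internal-memory key is the one added by the drm_file patch in this
series):

  drm-total-memory:	10176 KiB
  drm-shared-memory:	2048 KiB
  drm-active-memory:	4096 KiB
  drm-resident-memory:	10176 KiB
  drm-internal-memory:	1824 KiB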
Signed-off-by: Adrián Larumbe <[email protected]>
---
drivers/gpu/drm/panthor/panthor_device.c | 2 +
drivers/gpu/drm/panthor/panthor_device.h | 13 +++++-
drivers/gpu/drm/panthor/panthor_drv.c | 20 ++++++---
drivers/gpu/drm/panthor/panthor_fw.c | 16 +++++--
drivers/gpu/drm/panthor/panthor_fw.h | 5 ++-
drivers/gpu/drm/panthor/panthor_gem.c | 55 ++++++++++++++++++++++--
drivers/gpu/drm/panthor/panthor_gem.h | 16 +++++--
drivers/gpu/drm/panthor/panthor_heap.c | 23 +++++++---
drivers/gpu/drm/panthor/panthor_heap.h | 6 ++-
drivers/gpu/drm/panthor/panthor_mmu.c | 8 +++-
drivers/gpu/drm/panthor/panthor_mmu.h | 3 +-
drivers/gpu/drm/panthor/panthor_sched.c | 19 ++++----
12 files changed, 147 insertions(+), 39 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index 4082c8f2951d..868fa9aba570 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -179,6 +179,8 @@ int panthor_device_init(struct panthor_device *ptdev)
if (ret)
return ret;
+ drmm_mutex_init(&ptdev->base, &ptdev->private_obj_list_lock);
+
/*
* Set the dummy page holding the latest flush to 1. This will cause the
* flush to avoided as we know it isn't necessary if the submission
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index c3ec1e31f8b7..d3abf9700887 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -166,6 +166,9 @@ struct panthor_device {
bool profile_mode;
unsigned long current_frequency;
unsigned long fast_rate;
+
+ /** @private_obj_list_lock: Lock around per-file lists of internal GEM objects */
+ struct mutex private_obj_list_lock;
};
struct panthor_gpu_usage {
@@ -186,8 +189,14 @@ struct panthor_file {
/** @groups: Scheduling group pool attached to this file. */
struct panthor_group_pool *groups;
- /** @stats: cycle and timestamp measures for job execution. */
- struct panthor_gpu_usage stats;
+ /** @fdinfo: Open file tracking information */
+ struct {
+ /** @stats: cycle and timestamp measures for job execution. */
+ struct panthor_gpu_usage stats;
+
+ /** @private_file_list: File's list of private GEM objects. */
+ struct list_head private_file_list;
+ } fdinfo;
};
int panthor_device_init(struct panthor_device *ptdev);
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index a2876310856f..20a1add84014 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1048,13 +1048,13 @@ static int panthor_ioctl_tiler_heap_create(struct drm_device *ddev, void *data,
if (!vm)
return -EINVAL;
- pool = panthor_vm_get_heap_pool(vm, true);
+ pool = panthor_vm_get_heap_pool(vm, true, pfile);
if (IS_ERR(pool)) {
ret = PTR_ERR(pool);
goto out_put_vm;
}
- ret = panthor_heap_create(pool,
+ ret = panthor_heap_create(pool, pfile,
args->initial_chunk_count,
args->chunk_size,
args->max_chunks,
@@ -1094,7 +1094,7 @@ static int panthor_ioctl_tiler_heap_destroy(struct drm_device *ddev, void *data,
if (!vm)
return -EINVAL;
- pool = panthor_vm_get_heap_pool(vm, false);
+ pool = panthor_vm_get_heap_pool(vm, false, NULL);
if (IS_ERR(pool)) {
ret = PTR_ERR(pool);
goto out_put_vm;
@@ -1268,6 +1268,8 @@ panthor_open(struct drm_device *ddev, struct drm_file *file)
pfile->ptdev = ptdev;
+ INIT_LIST_HEAD(&pfile->fdinfo.private_file_list);
+
ret = panthor_vm_pool_create(pfile);
if (ret)
goto err_free_file;
@@ -1295,6 +1297,12 @@ panthor_postclose(struct drm_device *ddev, struct drm_file *file)
{
struct panthor_file *pfile = file->driver_priv;
+ /*
+ * A group's internal BOs are destroyed asynchronously in a separate worker thread,
+ * so there's a chance by the time BO release happens, the file is already gone.
+ */
+ panthor_gem_dettach_internal_bos(pfile);
+
panthor_group_pool_destroy(pfile);
panthor_vm_pool_destroy(pfile);
@@ -1363,10 +1371,10 @@ static void panthor_gpu_show_fdinfo(struct panthor_device *ptdev,
if (ptdev->profile_mode) {
#ifdef CONFIG_ARM_ARCH_TIMER
drm_printf(p, "drm-engine-panthor:\t%llu ns\n",
- DIV_ROUND_UP_ULL((pfile->stats.time * NSEC_PER_SEC),
+ DIV_ROUND_UP_ULL((pfile->fdinfo.stats.time * NSEC_PER_SEC),
arch_timer_get_cntfrq()));
#endif
- drm_printf(p, "drm-cycles-panthor:\t%llu\n", pfile->stats.cycles);
+ drm_printf(p, "drm-cycles-panthor:\t%llu\n", pfile->fdinfo.stats.cycles);
}
drm_printf(p, "drm-maxfreq-panthor:\t%lu Hz\n", ptdev->fast_rate);
drm_printf(p, "drm-curfreq-panthor:\t%lu Hz\n", ptdev->current_frequency);
@@ -1379,7 +1387,7 @@ static void panthor_show_fdinfo(struct drm_printer *p, struct drm_file *file)
panthor_gpu_show_fdinfo(ptdev, file->driver_priv, p);
- drm_show_memory_stats(p, file);
+ drm_show_memory_stats(p, file, panthor_internal_bos);
}
static const struct file_operations panthor_drm_driver_fops = {
diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
index 857f3f11258a..a4f24e6c25e0 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.c
+++ b/drivers/gpu/drm/panthor/panthor_fw.c
@@ -420,6 +420,7 @@ static void panthor_fw_init_section_mem(struct panthor_device *ptdev,
/**
* panthor_fw_alloc_queue_iface_mem() - Allocate a ring-buffer interfaces.
* @ptdev: Device.
+ * @pfile: File.
* @input: Pointer holding the input interface on success.
* Should be ignored on failure.
* @output: Pointer holding the output interface on success.
@@ -436,6 +437,7 @@ static void panthor_fw_init_section_mem(struct panthor_device *ptdev,
*/
struct panthor_kernel_bo *
panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
struct panthor_fw_ringbuf_input_iface **input,
const struct panthor_fw_ringbuf_output_iface **output,
u32 *input_fw_va, u32 *output_fw_va)
@@ -443,11 +445,12 @@ panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev,
struct panthor_kernel_bo *mem;
int ret;
- mem = panthor_kernel_bo_create(ptdev, ptdev->fw->vm, SZ_8K,
+ mem = panthor_kernel_bo_create(ptdev, pfile, ptdev->fw->vm, SZ_8K,
DRM_PANTHOR_BO_NO_MMAP,
DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
PANTHOR_VM_KERNEL_AUTO_VA);
+
if (IS_ERR(mem))
return mem;
@@ -469,14 +472,18 @@ panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev,
/**
* panthor_fw_alloc_suspend_buf_mem() - Allocate a suspend buffer for a command stream group.
* @ptdev: Device.
+ * @pfile: File.
* @size: Size of the suspend buffer.
*
* Return: A valid pointer in case of success, an ERR_PTR() otherwise.
*/
struct panthor_kernel_bo *
-panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size)
+panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
+ size_t size)
{
- return panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev), size,
+ return panthor_kernel_bo_create(ptdev, pfile,
+ panthor_fw_vm(ptdev), size,
DRM_PANTHOR_BO_NO_MMAP,
DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
PANTHOR_VM_KERNEL_AUTO_VA);
@@ -596,7 +603,8 @@ static int panthor_fw_load_section_entry(struct panthor_device *ptdev,
if (cache_mode != CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED)
vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED;
- section->mem = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev),
+ section->mem = panthor_kernel_bo_create(ptdev, NULL,
+ panthor_fw_vm(ptdev),
section_size,
DRM_PANTHOR_BO_NO_MMAP,
vm_map_flags, va);
diff --git a/drivers/gpu/drm/panthor/panthor_fw.h b/drivers/gpu/drm/panthor/panthor_fw.h
index 22448abde992..ea4746ab381f 100644
--- a/drivers/gpu/drm/panthor/panthor_fw.h
+++ b/drivers/gpu/drm/panthor/panthor_fw.h
@@ -476,11 +476,14 @@ void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_slot);
struct panthor_kernel_bo *
panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
struct panthor_fw_ringbuf_input_iface **input,
const struct panthor_fw_ringbuf_output_iface **output,
u32 *input_fw_va, u32 *output_fw_va);
struct panthor_kernel_bo *
-panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size);
+panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
+ size_t size);
struct panthor_vm *panthor_fw_vm(struct panthor_device *ptdev);
diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index c60b599665d8..5e2cc8d46643 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0 or MIT
/* Copyright 2019 Linaro, Ltd, Rob Herring <[email protected]> */
+
/* Copyright 2023 Collabora ltd. */
#include <linux/dma-buf.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/mutex.h>
#include <drm/panthor_drm.h>
@@ -24,6 +26,20 @@ static void panthor_gem_free_object(struct drm_gem_object *obj)
drm_gem_object_put(vm_root_gem);
}
+void panthor_gem_dettach_internal_bos(struct panthor_file *pfile)
+{
+ struct panthor_kernel_bo *kbo, *tmp;
+
+ mutex_lock(&pfile->ptdev->private_obj_list_lock);
+ list_for_each_entry_safe(kbo, tmp,
+ &pfile->fdinfo.private_file_list,
+ private_obj) {
+ list_del(&kbo->private_obj);
+ INIT_LIST_HEAD(&kbo->private_obj);
+ }
+ mutex_unlock(&pfile->ptdev->private_obj_list_lock);
+}
+
/**
* panthor_kernel_bo_destroy() - Destroy a kernel buffer object
* @bo: Kernel buffer object to destroy. If NULL or an ERR_PTR(), the destruction
@@ -31,6 +47,8 @@ static void panthor_gem_free_object(struct drm_gem_object *obj)
*/
void panthor_kernel_bo_destroy(struct panthor_kernel_bo *bo)
{
+ struct panthor_device *ptdev =
+ container_of(bo->obj->dev, struct panthor_device, base);
struct panthor_vm *vm;
int ret;
@@ -40,6 +58,13 @@ void panthor_kernel_bo_destroy(struct panthor_kernel_bo *bo)
vm = bo->vm;
panthor_kernel_bo_vunmap(bo);
+ mutex_lock(&ptdev->private_obj_list_lock);
+ if (!list_empty(&bo->private_obj)) {
+ list_del(&bo->private_obj);
+ INIT_LIST_HEAD(&bo->private_obj);
+ }
+ mutex_unlock(&ptdev->private_obj_list_lock);
+
if (drm_WARN_ON(bo->obj->dev,
to_panthor_bo(bo->obj)->exclusive_vm_root_gem != panthor_vm_root_gem(vm)))
goto out_free_bo;
@@ -57,6 +82,20 @@ void panthor_kernel_bo_destroy(struct panthor_kernel_bo *bo)
kfree(bo);
}
+void panthor_internal_bos(struct drm_memory_stats *status,
+ struct drm_file *file)
+{
+ struct panthor_file *pfile = file->driver_priv;
+ struct panthor_kernel_bo *kbo;
+
+ mutex_lock(&pfile->ptdev->private_obj_list_lock);
+ list_for_each_entry(kbo, &pfile->fdinfo.private_file_list, private_obj) {
+ status->resident += kbo->obj->size;
+ status->internal += kbo->obj->size;
+ }
+ mutex_unlock(&pfile->ptdev->private_obj_list_lock);
+}
+
/**
* panthor_kernel_bo_create() - Create and map a GEM object to a VM
* @ptdev: Device.
@@ -72,9 +111,9 @@ void panthor_kernel_bo_destroy(struct panthor_kernel_bo *bo)
* Return: A valid pointer in case of success, an ERR_PTR() otherwise.
*/
struct panthor_kernel_bo *
-panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
- size_t size, u32 bo_flags, u32 vm_map_flags,
- u64 gpu_va)
+panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_file *pfile,
+ struct panthor_vm *vm, size_t size, u32 bo_flags,
+ u32 vm_map_flags, u64 gpu_va)
{
struct drm_gem_shmem_object *obj;
struct panthor_kernel_bo *kbo;
@@ -111,6 +150,16 @@ panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
bo->exclusive_vm_root_gem = panthor_vm_root_gem(vm);
drm_gem_object_get(bo->exclusive_vm_root_gem);
bo->base.base.resv = bo->exclusive_vm_root_gem->resv;
+
+ INIT_LIST_HEAD(&kbo->private_obj);
+
+ /* Only FW regions are not bound to an open file */
+ if (pfile) {
+ mutex_lock(&ptdev->private_obj_list_lock);
+ list_add(&kbo->private_obj, &pfile->fdinfo.private_file_list);
+ mutex_unlock(&ptdev->private_obj_list_lock);
+ }
+
return kbo;
err_free_va:
diff --git a/drivers/gpu/drm/panthor/panthor_gem.h b/drivers/gpu/drm/panthor/panthor_gem.h
index e43021cf6d45..007970fea43a 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.h
+++ b/drivers/gpu/drm/panthor/panthor_gem.h
@@ -12,6 +12,8 @@
#include <linux/rwsem.h>
struct panthor_vm;
+struct panthor_file;
+struct panthor_device;
/**
* struct panthor_gem_object - Driver specific GEM object.
@@ -75,8 +77,14 @@ struct panthor_kernel_bo {
* @kmap: Kernel CPU mapping of @gem.
*/
void *kmap;
+
+ /** @private_obj: Link to the open file's list of private GEM objects. */
+ struct list_head private_obj;
};
+void panthor_internal_bos(struct drm_memory_stats *status,
+ struct drm_file *file);
+
static inline
struct panthor_gem_object *to_panthor_bo(struct drm_gem_object *obj)
{
@@ -137,10 +145,12 @@ panthor_kernel_bo_vunmap(struct panthor_kernel_bo *bo)
}
struct panthor_kernel_bo *
-panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
- size_t size, u32 bo_flags, u32 vm_map_flags,
- u64 gpu_va);
+panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_file *pfile,
+ struct panthor_vm *vm, size_t size, u32 bo_flags,
+ u32 vm_map_flags, u64 gpu_va);
void panthor_kernel_bo_destroy(struct panthor_kernel_bo *bo);
+void panthor_gem_dettach_internal_bos(struct panthor_file *pfile);
+
#endif /* __PANTHOR_GEM_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_heap.c b/drivers/gpu/drm/panthor/panthor_heap.c
index 3796a9eb22af..cffd259aae12 100644
--- a/drivers/gpu/drm/panthor/panthor_heap.c
+++ b/drivers/gpu/drm/panthor/panthor_heap.c
@@ -86,6 +86,9 @@ struct panthor_heap_pool {
/** @ptdev: Device. */
struct panthor_device *ptdev;
+ /** @pfile: Pointer to the owning panthor_file struct */
+ struct panthor_file *pfile;
+
/** @vm: VM this pool is bound to. */
struct panthor_vm *vm;
@@ -132,6 +135,7 @@ static void panthor_free_heap_chunk(struct panthor_vm *vm,
}
static int panthor_alloc_heap_chunk(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
struct panthor_vm *vm,
struct panthor_heap *heap,
bool initial_chunk)
@@ -144,7 +148,7 @@ static int panthor_alloc_heap_chunk(struct panthor_device *ptdev,
if (!chunk)
return -ENOMEM;
- chunk->bo = panthor_kernel_bo_create(ptdev, vm, heap->chunk_size,
+ chunk->bo = panthor_kernel_bo_create(ptdev, pfile, vm, heap->chunk_size,
DRM_PANTHOR_BO_NO_MMAP,
DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
PANTHOR_VM_KERNEL_AUTO_VA);
@@ -201,6 +205,7 @@ static void panthor_free_heap_chunks(struct panthor_vm *vm,
}
static int panthor_alloc_heap_chunks(struct panthor_device *ptdev,
+ struct panthor_file *pfile,
struct panthor_vm *vm,
struct panthor_heap *heap,
u32 chunk_count)
@@ -209,7 +214,7 @@ static int panthor_alloc_heap_chunks(struct panthor_device *ptdev,
u32 i;
for (i = 0; i < chunk_count; i++) {
- ret = panthor_alloc_heap_chunk(ptdev, vm, heap, true);
+ ret = panthor_alloc_heap_chunk(ptdev, pfile, vm, heap, true);
if (ret)
return ret;
}
@@ -251,6 +256,7 @@ int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle)
/**
* panthor_heap_create() - Create a heap context
* @pool: Pool to instantiate the heap context from.
+ * @pfile: File.
* @initial_chunk_count: Number of chunk allocated at initialization time.
* Must be at least 1.
* @chunk_size: The size of each chunk. Must be page-aligned and lie in the
@@ -265,6 +271,7 @@ int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle)
* Return: a positive handle on success, a negative error otherwise.
*/
int panthor_heap_create(struct panthor_heap_pool *pool,
+ struct panthor_file *pfile,
u32 initial_chunk_count,
u32 chunk_size,
u32 max_chunks,
@@ -308,8 +315,8 @@ int panthor_heap_create(struct panthor_heap_pool *pool,
heap->max_chunks = max_chunks;
heap->target_in_flight = target_in_flight;
- ret = panthor_alloc_heap_chunks(pool->ptdev, vm, heap,
- initial_chunk_count);
+ ret = panthor_alloc_heap_chunks(pool->ptdev, pfile, vm,
+ heap, initial_chunk_count);
if (ret)
goto err_free_heap;
@@ -466,7 +473,7 @@ int panthor_heap_grow(struct panthor_heap_pool *pool,
* further jobs in this queue fail immediately instead of having to
* wait for the job timeout.
*/
- ret = panthor_alloc_heap_chunk(pool->ptdev, pool->vm, heap, false);
+ ret = panthor_alloc_heap_chunk(pool->ptdev, pool->pfile, pool->vm, heap, false);
if (ret)
goto out_unlock;
@@ -526,7 +533,9 @@ panthor_heap_pool_get(struct panthor_heap_pool *pool)
* Return: A valid pointer on success, a negative error code otherwise.
*/
struct panthor_heap_pool *
-panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm)
+panthor_heap_pool_create(struct panthor_device *ptdev,
+ struct panthor_vm *vm,
+ struct panthor_file *pfile)
{
size_t bosize = ALIGN(MAX_HEAPS_PER_POOL *
panthor_heap_ctx_stride(ptdev),
@@ -547,7 +556,7 @@ panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm)
xa_init_flags(&pool->xa, XA_FLAGS_ALLOC);
kref_init(&pool->refcount);
- pool->gpu_contexts = panthor_kernel_bo_create(ptdev, vm, bosize,
+ pool->gpu_contexts = panthor_kernel_bo_create(ptdev, pfile, vm, bosize,
DRM_PANTHOR_BO_NO_MMAP,
DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
PANTHOR_VM_KERNEL_AUTO_VA);
diff --git a/drivers/gpu/drm/panthor/panthor_heap.h b/drivers/gpu/drm/panthor/panthor_heap.h
index 25a5f2bba445..1d1b409064e3 100644
--- a/drivers/gpu/drm/panthor/panthor_heap.h
+++ b/drivers/gpu/drm/panthor/panthor_heap.h
@@ -9,8 +9,10 @@
struct panthor_device;
struct panthor_heap_pool;
struct panthor_vm;
+struct panthor_file;
int panthor_heap_create(struct panthor_heap_pool *pool,
+ struct panthor_file *pfile,
u32 initial_chunk_count,
u32 chunk_size,
u32 max_chunks,
@@ -20,7 +22,9 @@ int panthor_heap_create(struct panthor_heap_pool *pool,
int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle);
struct panthor_heap_pool *
-panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm);
+panthor_heap_pool_create(struct panthor_device *ptdev,
+ struct panthor_vm *vm,
+ struct panthor_file *pfile);
void panthor_heap_pool_destroy(struct panthor_heap_pool *pool);
struct panthor_heap_pool *
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index fa0a002b1016..15977779292b 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -1872,6 +1872,7 @@ struct panthor_vm *panthor_vm_get(struct panthor_vm *vm)
* panthor_vm_get_heap_pool() - Get the heap pool attached to a VM
* @vm: VM to query the heap pool on.
* @create: True if the heap pool should be created when it doesn't exist.
+ * @pfile: File.
*
* Heap pools are per-VM. This function allows one to retrieve the heap pool
* attached to a VM.
@@ -1882,16 +1883,19 @@ struct panthor_vm *panthor_vm_get(struct panthor_vm *vm)
*
* Return: A valid pointer on success, an ERR_PTR() otherwise.
*/
-struct panthor_heap_pool *panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create)
+struct panthor_heap_pool *panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create,
+ struct panthor_file *pfile)
{
struct panthor_heap_pool *pool;
+ drm_WARN_ON(&vm->ptdev->base, (!create && pfile));
+
mutex_lock(&vm->heaps.lock);
if (!vm->heaps.pool && create) {
if (vm->destroyed)
pool = ERR_PTR(-EINVAL);
else
- pool = panthor_heap_pool_create(vm->ptdev, vm);
+ pool = panthor_heap_pool_create(vm->ptdev, vm, pfile);
if (!IS_ERR(pool))
vm->heaps.pool = panthor_heap_pool_get(pool);
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
index f3c1ed19f973..d950128b7b50 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.h
+++ b/drivers/gpu/drm/panthor/panthor_mmu.h
@@ -14,6 +14,7 @@ struct panthor_heap_pool;
struct panthor_vm;
struct panthor_vma;
struct panthor_mmu;
+struct panthor_file;
int panthor_mmu_init(struct panthor_device *ptdev);
void panthor_mmu_unplug(struct panthor_device *ptdev);
@@ -33,7 +34,7 @@ void panthor_vm_idle(struct panthor_vm *vm);
int panthor_vm_as(struct panthor_vm *vm);
struct panthor_heap_pool *
-panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create);
+panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create, struct panthor_file *pfile);
struct panthor_vm *panthor_vm_get(struct panthor_vm *vm);
void panthor_vm_put(struct panthor_vm *vm);
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 4fb6fc5c2314..c24b09933bfa 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -1414,7 +1414,7 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
struct panthor_fw_cs_iface *cs_iface;
cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
- heaps = panthor_vm_get_heap_pool(group->vm, false);
+ heaps = panthor_vm_get_heap_pool(group->vm, false, NULL);
heap_address = cs_iface->output->heap_address;
vt_start = cs_iface->output->heap_vt_start;
vt_end = cs_iface->output->heap_vt_end;
@@ -3154,7 +3154,8 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
};
static struct panthor_queue *
-group_create_queue(struct panthor_group *group,
+group_create_queue(struct panthor_file *pfile,
+ struct panthor_group *group,
const struct drm_panthor_queue_create *args,
unsigned int slots_so_far)
{
@@ -3182,7 +3183,7 @@ group_create_queue(struct panthor_group *group,
queue->priority = args->priority;
- queue->ringbuf = panthor_kernel_bo_create(group->ptdev, group->vm,
+ queue->ringbuf = panthor_kernel_bo_create(group->ptdev, pfile, group->vm,
args->ringbuf_size,
DRM_PANTHOR_BO_NO_MMAP,
DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
@@ -3197,7 +3198,7 @@ group_create_queue(struct panthor_group *group,
if (ret)
goto err_free_queue;
- queue->iface.mem = panthor_fw_alloc_queue_iface_mem(group->ptdev,
+ queue->iface.mem = panthor_fw_alloc_queue_iface_mem(group->ptdev, pfile,
&queue->iface.input,
&queue->iface.output,
&queue->iface.input_fw_va,
@@ -3298,7 +3299,7 @@ int panthor_group_create(struct panthor_file *pfile,
}
suspend_size = csg_iface->control->suspend_size;
- group->suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size);
+ group->suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, pfile, suspend_size);
if (IS_ERR(group->suspend_buf)) {
ret = PTR_ERR(group->suspend_buf);
group->suspend_buf = NULL;
@@ -3306,7 +3307,7 @@ int panthor_group_create(struct panthor_file *pfile,
}
suspend_size = csg_iface->control->protm_suspend_size;
- group->protm_suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size);
+ group->protm_suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, pfile, suspend_size);
if (IS_ERR(group->protm_suspend_buf)) {
ret = PTR_ERR(group->protm_suspend_buf);
group->protm_suspend_buf = NULL;
@@ -3353,7 +3354,7 @@ int panthor_group_create(struct panthor_file *pfile,
*
*/
- group->syncobjs.bo = panthor_kernel_bo_create(ptdev, group->vm,
+ group->syncobjs.bo = panthor_kernel_bo_create(ptdev, pfile, group->vm,
syncobj_bo_size,
DRM_PANTHOR_BO_NO_MMAP,
DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
@@ -3374,7 +3375,7 @@ int panthor_group_create(struct panthor_file *pfile,
group_args->queues.count * sizeof(struct panthor_syncobj_64b);
for (i = 0, total_slots = 0; i < group_args->queues.count; i++) {
- group->queues[i] = group_create_queue(group, &queue_args[i], total_slots);
+ group->queues[i] = group_create_queue(pfile, group, &queue_args[i], total_slots);
if (IS_ERR(group->queues[i])) {
ret = PTR_ERR(group->queues[i]);
group->queues[i] = NULL;
@@ -3402,7 +3403,7 @@ int panthor_group_create(struct panthor_file *pfile,
}
mutex_unlock(&sched->reset.lock);
- group->fdinfo.data = &pfile->stats;
+ group->fdinfo.data = &pfile->fdinfo.stats;
mutex_init(&group->fdinfo.lock);
return gid;
--
2.45.1
Just like Panfrost already does, this commit introduces a DRM device sysfs
file that lets user space control the driver's job accounting status.
The present commit only brings in the sysfs knob and hides the cycles and
engine fdinfo tags when profiling is disabled; leveraging the knob for job
accounting is the subject of a later commit.
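For instance, with this knob in place profiling can be toggled and queried
from user space roughly as follows (the exact device path depends on the
GPU's DT node name):

  echo 1 > /sys/bus/platform/devices/<gpu-node>/profiling
  cat /sys/bus/platform/devices/<gpu-node>/profiling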
Signed-off-by: Adrián Larumbe <[email protected]>
---
drivers/gpu/drm/panthor/panthor_device.h | 1 +
drivers/gpu/drm/panthor/panthor_drv.c | 46 +++++++++++++++++++++---
2 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 8a0260a7b90a..c3ec1e31f8b7 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -163,6 +163,7 @@ struct panthor_device {
struct page *dummy_latest_flush;
} pm;
+ bool profile_mode;
unsigned long current_frequency;
unsigned long fast_rate;
};
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index 6d25385e02a1..a2876310856f 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -1360,12 +1360,14 @@ static void panthor_gpu_show_fdinfo(struct panthor_device *ptdev,
struct panthor_file *pfile,
struct drm_printer *p)
{
+ if (ptdev->profile_mode) {
#ifdef CONFIG_ARM_ARCH_TIMER
- drm_printf(p, "drm-engine-panthor:\t%llu ns\n",
- DIV_ROUND_UP_ULL((pfile->stats.time * NSEC_PER_SEC),
- arch_timer_get_cntfrq()));
+ drm_printf(p, "drm-engine-panthor:\t%llu ns\n",
+ DIV_ROUND_UP_ULL((pfile->stats.time * NSEC_PER_SEC),
+ arch_timer_get_cntfrq()));
#endif
- drm_printf(p, "drm-cycles-panthor:\t%llu\n", pfile->stats.cycles);
+ drm_printf(p, "drm-cycles-panthor:\t%llu\n", pfile->stats.cycles);
+ }
drm_printf(p, "drm-maxfreq-panthor:\t%lu Hz\n", ptdev->fast_rate);
drm_printf(p, "drm-curfreq-panthor:\t%lu Hz\n", ptdev->current_frequency);
}
@@ -1446,6 +1448,41 @@ static void panthor_remove(struct platform_device *pdev)
panthor_device_unplug(ptdev);
}
+static ssize_t profiling_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct panthor_device *ptdev = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%d\n", ptdev->profile_mode);
+}
+
+static ssize_t profiling_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct panthor_device *ptdev = dev_get_drvdata(dev);
+ bool value;
+ int err;
+
+ err = kstrtobool(buf, &value);
+ if (err)
+ return err;
+
+ ptdev->profile_mode = value;
+
+ return len;
+}
+
+static DEVICE_ATTR_RW(profiling);
+
+static struct attribute *panthor_attrs[] = {
+ &dev_attr_profiling.attr,
+ NULL,
+};
+
+ATTRIBUTE_GROUPS(panthor);
+
static const struct of_device_id dt_match[] = {
{ .compatible = "rockchip,rk3588-mali" },
{ .compatible = "arm,mali-valhall-csf" },
@@ -1465,6 +1502,7 @@ static struct platform_driver panthor_driver = {
.name = "panthor",
.pm = pm_ptr(&panthor_pm_ops),
.of_match_table = dt_match,
+ .dev_groups = panthor_groups,
},
};
--
2.45.1
A previous commit brought in a sysfs knob to control the driver's profiling
status. This changeset flags jobs as profiled according to the driver's
global profiling status, and picks one of two call instruction arrays to
insert into the ring buffer. One of them includes FW logic to sample the
timestamp and cycle counter registers and write them into the job's sample
slot in the group's syncobjs BO, and the other does not.
A profiled job's call sequence takes up two ring buffer slots, and this is
reflected when initialising the DRM scheduler for each queue, with a
profiled job contributing twice as many credits.
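As a worked example (numbers purely illustrative): with a 16 KiB ring
buffer and 8-byte CS instructions, drm_sched_init() is now handed
16384 / 8 = 2048 credits; a non-profiled job then consumes
NUM_INSTRS_PER_SLOT = 16 credits (one slot), whereas a profiled one
consumes 32 (two slots).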
Signed-off-by: Adrián Larumbe <[email protected]>
---
drivers/gpu/drm/panthor/panthor_sched.c | 95 ++++++++++++++++++++++---
1 file changed, 86 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index bbd20db40e7b..4fb6fc5c2314 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -93,7 +93,7 @@
#define MIN_CSGS 3
#define MAX_CSG_PRIO 0xf
-#define NUM_INSTRS_PER_SLOT 32
+#define NUM_INSTRS_PER_SLOT 16
#define SLOTSIZE (NUM_INSTRS_PER_SLOT * sizeof(u64))
struct panthor_group;
@@ -807,6 +807,9 @@ struct panthor_job {
/** @done_fence: Fence signaled when the job is finished or cancelled. */
struct dma_fence *done_fence;
+
+ /** @is_profiled: Whether timestamp and cycle numbers were gathered for this job */
+ bool is_profiled;
};
static void
@@ -2865,7 +2868,8 @@ static void group_sync_upd_work(struct work_struct *work)
dma_fence_end_signalling(cookie);
list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
- update_fdinfo_stats(job);
+ if (job->is_profiled)
+ update_fdinfo_stats(job);
list_del_init(&job->node);
panthor_job_put(&job->base);
}
@@ -2884,6 +2888,8 @@ queue_run_job(struct drm_sched_job *sched_job)
u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
u32 ringbuf_index = ringbuf_insert / (SLOTSIZE);
+ bool ringbuf_wraparound =
+ job->is_profiled && ((ringbuf_size/SLOTSIZE) == ringbuf_index + 1);
u64 addr_reg = ptdev->csif_info.cs_reg_count -
ptdev->csif_info.unpreserved_cs_reg_count;
u64 val_reg = addr_reg + 2;
@@ -2893,12 +2899,51 @@ queue_run_job(struct drm_sched_job *sched_job)
job->queue_idx * sizeof(struct panthor_syncobj_64b);
u64 times_addr = panthor_kernel_bo_gpuva(group->syncobjs.bo) + queue->time_offset +
(ringbuf_index * sizeof(struct panthor_job_times));
+ size_t call_insrt_size;
+ u64 *call_instrs;
u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
struct dma_fence *done_fence;
int ret;
- u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
+ u64 call_instrs_simple[NUM_INSTRS_PER_SLOT] = {
+ /* MOV32 rX+2, cs.latest_flush */
+ (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
+
+ /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
+ (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
+
+ /* MOV48 rX:rX+1, cs.start */
+ (1ull << 56) | (addr_reg << 48) | job->call_info.start,
+
+ /* MOV32 rX+2, cs.size */
+ (2ull << 56) | (val_reg << 48) | job->call_info.size,
+
+ /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
+ (3ull << 56) | (1 << 16),
+
+ /* CALL rX:rX+1, rX+2 */
+ (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
+
+ /* MOV48 rX:rX+1, sync_addr */
+ (1ull << 56) | (addr_reg << 48) | sync_addr,
+
+ /* MOV48 rX+2, #1 */
+ (1ull << 56) | (val_reg << 48) | 1,
+
+ /* WAIT(all) */
+ (3ull << 56) | (waitall_mask << 16),
+
+ /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
+ (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
+
+ /* ERROR_BARRIER, so we can recover from faults at job
+ * boundaries.
+ */
+ (47ull << 56),
+ };
+
+ u64 call_instrs_profile[NUM_INSTRS_PER_SLOT*2] = {
/* MOV32 rX+2, cs.latest_flush */
(2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
@@ -2960,9 +3005,18 @@ queue_run_job(struct drm_sched_job *sched_job)
};
/* Need to be cacheline aligned to please the prefetcher. */
- static_assert(sizeof(call_instrs) % 64 == 0,
+ static_assert(sizeof(call_instrs_simple) % 64 == 0 && sizeof(call_instrs_profile) % 64 == 0,
"call_instrs is not aligned on a cacheline");
+ if (job->is_profiled) {
+ call_instrs = call_instrs_profile;
+ call_insrt_size = sizeof(call_instrs_profile);
+
+ } else {
+ call_instrs = call_instrs_simple;
+ call_insrt_size = sizeof(call_instrs_simple);
+ }
+
/* Stream size is zero, nothing to do => return a NULL fence and let
* drm_sched signal the parent.
*/
@@ -2985,8 +3039,23 @@ queue_run_job(struct drm_sched_job *sched_job)
queue->fence_ctx.id,
atomic64_inc_return(&queue->fence_ctx.seqno));
- memcpy(queue->ringbuf->kmap + ringbuf_insert,
- call_instrs, sizeof(call_instrs));
+ /*
+ * Need to handle the wrap-around case when copying profiled instructions
+ * from an odd-indexed slot. The reason this can happen is user space is
+ * able to control the profiling status of the driver through a sysfs
+ * knob, so this might lead to a timestamp and cycles-profiling call
+ * instruction stream beginning at an odd-number slot. The GPU should
+ * be able to gracefully handle this.
+ */
+ if (!ringbuf_wraparound) {
+ memcpy(queue->ringbuf->kmap + ringbuf_insert,
+ call_instrs, call_insrt_size);
+ } else {
+ memcpy(queue->ringbuf->kmap + ringbuf_insert,
+ call_instrs, call_insrt_size/2);
+ memcpy(queue->ringbuf->kmap, call_instrs +
+ NUM_INSTRS_PER_SLOT, call_insrt_size/2);
+ }
panthor_job_get(&job->base);
spin_lock(&queue->fence_ctx.lock);
@@ -2994,7 +3063,7 @@ queue_run_job(struct drm_sched_job *sched_job)
spin_unlock(&queue->fence_ctx.lock);
job->ringbuf.start = queue->iface.input->insert;
- job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
+ job->ringbuf.end = job->ringbuf.start + call_insrt_size;
job->ringbuf_idx = ringbuf_index;
/* Make sure the ring buffer is updated before the INSERT
@@ -3141,9 +3210,14 @@ group_create_queue(struct panthor_group *group,
queue->time_offset = group->syncobjs.job_times_offset +
(slots_so_far * sizeof(struct panthor_job_times));
+ /*
+ * Credit limit argument tells us the total number of instructions
+ * across all CS slots in the ringbuffer, with some jobs requiring
+ * twice as many as others, depending on their profiling status.
+ */
ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
group->ptdev->scheduler->wq, 1,
- args->ringbuf_size / SLOTSIZE,
+ args->ringbuf_size / sizeof(u64),
0, msecs_to_jiffies(JOB_TIMEOUT_MS),
group->ptdev->reset.wq,
NULL, "panthor-queue", group->ptdev->base.dev);
@@ -3538,9 +3612,12 @@ panthor_job_create(struct panthor_file *pfile,
goto err_put_job;
}
+ job->is_profiled = pfile->ptdev->profile_mode;
+
ret = drm_sched_job_init(&job->base,
&job->group->queues[job->queue_idx]->entity,
- 1, job->group);
+ job->is_profiled ? NUM_INSTRS_PER_SLOT * 2 :
+ NUM_INSTRS_PER_SLOT, job->group);
if (ret)
goto err_put_job;
--
2.45.1
Some drivers must allocate a considerable amount of memory for bookkeeping
structures and GPU's MCU-kernel shared communication regions. These are
often created as a result of the invocation of the driver's ioctl()
interface functions, so it is sensible to consider them as being owned by
the render context associated with an open drm file.
However, at the moment drm_show_memory_stats only traverses the
user-space-exposed GEM objects for which a handle exists. Private driver
objects and memory regions, though connected to a render context, are
unaccounted for in its fdinfo numbers.
Add a new drm_memory_stats 'internal' memory category.
Because deciding what constitutes an 'internal' object and where to find
these is driver-dependent, the size calculation must be done through a
driver-provided function pointer, which becomes the third argument of
drm_show_memory_stats. Drivers that have no interest in exposing the size
of internal memory objects can keep passing NULL for unaltered behaviour.
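As a rough sketch of how a driver would plug into this (the mydrv_* names
and the private BO list are made up for illustration; only the callback
shape, which matches the internal_bos function pointer type added here, and
the new third argument of drm_show_memory_stats() come from this patch):

  static void mydrv_count_internal_bos(struct drm_memory_stats *status,
                                       struct drm_file *file)
  {
          struct mydrv_file *priv = file->driver_priv;
          struct mydrv_kernel_bo *kbo;

          /* Walk the driver's private list of kernel-only BOs. */
          list_for_each_entry(kbo, &priv->internal_bos, node) {
                  status->internal += kbo->obj->size;
                  status->resident += kbo->obj->size;
          }
  }

The driver's show_fdinfo implementation then passes it as the new third
argument, drm_show_memory_stats(p, file, mydrv_count_internal_bos), while
existing callers that pass NULL keep the old behaviour.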
Signed-off-by: Adrián Larumbe <[email protected]>
---
Documentation/gpu/drm-usage-stats.rst | 4 ++++
drivers/gpu/drm/drm_file.c | 9 +++++++--
drivers/gpu/drm/msm/msm_drv.c | 2 +-
drivers/gpu/drm/panfrost/panfrost_drv.c | 2 +-
include/drm/drm_file.h | 7 ++++++-
5 files changed, 19 insertions(+), 5 deletions(-)
diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
index 6dc299343b48..0da5ebecd232 100644
--- a/Documentation/gpu/drm-usage-stats.rst
+++ b/Documentation/gpu/drm-usage-stats.rst
@@ -157,6 +157,10 @@ The total size of buffers that are purgeable.
The total size of buffers that are active on one or more engines.
+- drm-internal-<region>: <uint> [KiB|MiB]
+
+The total size of GEM objects that aren't exposed to user space.
+
Implementation Details
======================
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 638ffa4444f5..d1c13eed8d34 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -874,9 +874,10 @@ void drm_print_memory_stats(struct drm_printer *p,
enum drm_gem_object_status supported_status,
const char *region)
{
- print_size(p, "total", region, stats->private + stats->shared);
+ print_size(p, "total", region, stats->private + stats->shared + stats->internal);
print_size(p, "shared", region, stats->shared);
print_size(p, "active", region, stats->active);
+ print_size(p, "internal", region, stats->internal);
if (supported_status & DRM_GEM_OBJECT_RESIDENT)
print_size(p, "resident", region, stats->resident);
@@ -890,11 +891,12 @@ EXPORT_SYMBOL(drm_print_memory_stats);
* drm_show_memory_stats - Helper to collect and show standard fdinfo memory stats
* @p: the printer to print output to
* @file: the DRM file
+ * @func: driver-specific function pointer to count the size of internal objects
*
* Helper to iterate over GEM objects with a handle allocated in the specified
* file.
*/
-void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file)
+void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file, internal_bos func)
{
struct drm_gem_object *obj;
struct drm_memory_stats status = {};
@@ -940,6 +942,9 @@ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file)
}
spin_unlock(&file->table_lock);
+ if (func)
+ func(&status, file);
+
drm_print_memory_stats(p, &status, supported_status, "memory");
}
EXPORT_SYMBOL(drm_show_memory_stats);
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 9c33f4e3f822..f97d3cdc4f50 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -880,7 +880,7 @@ static void msm_show_fdinfo(struct drm_printer *p, struct drm_file *file)
msm_gpu_show_fdinfo(priv->gpu, file->driver_priv, p);
- drm_show_memory_stats(p, file);
+ drm_show_memory_stats(p, file, NULL);
}
static const struct file_operations fops = {
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index ef9f6c0716d5..53640ac44e42 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -570,7 +570,7 @@ static void panfrost_show_fdinfo(struct drm_printer *p, struct drm_file *file)
panfrost_gpu_show_fdinfo(pfdev, file->driver_priv, p);
- drm_show_memory_stats(p, file);
+ drm_show_memory_stats(p, file, NULL);
}
static const struct file_operations panfrost_drm_driver_fops = {
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index ab230d3af138..d71a5ac50ea9 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -464,6 +464,7 @@ void drm_send_event_timestamp_locked(struct drm_device *dev,
* @resident: Total size of GEM objects backing pages
* @purgeable: Total size of GEM objects that can be purged (resident and not active)
* @active: Total size of GEM objects active on one or more engines
+ * @internal: Total size of GEM objects that aren't exposed to user space
*
* Used by drm_print_memory_stats()
*/
@@ -473,16 +474,20 @@ struct drm_memory_stats {
u64 resident;
u64 purgeable;
u64 active;
+ u64 internal;
};
enum drm_gem_object_status;
+typedef void (*internal_bos)(struct drm_memory_stats *status,
+ struct drm_file *file);
+
void drm_print_memory_stats(struct drm_printer *p,
const struct drm_memory_stats *stats,
enum drm_gem_object_status supported_status,
const char *region);
-void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file);
+void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file, internal_bos func);
void drm_show_fdinfo(struct seq_file *m, struct file *f);
struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags);
--
2.45.1
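With a driver making use of the new callback, the per-region block in fdinfo
gains one extra key. A hypothetical excerpt for the 'memory' region (sizes
made up for illustration):

	drm-total-memory:	8192 KiB
	drm-shared-memory:	0
	drm-active-memory:	2048 KiB
	drm-internal-memory:	1024 KiB
	drm-resident-memory:	7168 KiB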
On 06/06/2024 02:49, Adrián Larumbe wrote:
> Some drivers must allocate a considerable amount of memory for bookkeeping
> structures and GPU's MCU-kernel shared communication regions. These are
> often created as a result of the invocation of the driver's ioctl()
> interface functions, so it is sensible to consider them as being owned by
> the render context associated with an open drm file.
>
> However, at the moment drm_show_memory_stats only traverses the UM-exposed
> drm objects for which a handle exists. Private driver objects and memory
> regions, though connected to a render context, are unaccounted for in their
> fdinfo numbers.
>
> Add a new drm_memory_stats 'internal' memory category.
>
> Because deciding what constitutes an 'internal' object and where to find
> these are driver-dependent, calculation of this size must be done through a
> driver-provided function pointer, which becomes the third argument of
> drm_show_memory_stats. Drivers which have no interest in exposing the size
> of internal memory objects can keep passing NULL for unaltered behaviour.
>
> Signed-off-by: Adrián Larumbe <[email protected]>
Please Cc people who were previously involved in defining
drm-usage-stats.rst. I added Rob, but off the top of my head I am not sure
whether I have forgotten someone.
Internal as a category sounds potentially useful. One reservation I have,
though, is that it does not necessarily fit with the others, being
semantically different from them.
In i915 I had a similar desire to account for internal objects, and
approached it by likewise tracking them outside the DRM idr but counting
them under the existing respective categories and memory regions. I.e.
internal objects can also be purgeable or not, etc., and can be backed by
either system memory or device-local memory.
The advantage is that this is more accurate in those respects and does not
require adding a new category.
The downside is that 'internal' gets bunched in with the explicit userspace
objects, so it is perhaps less accurate in that other respect.
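Roughly, as a sketch of that alternative (hypothetical foo_* names and
fields, not the actual i915 code), the driver would distribute its internal
BOs across the existing counters instead of a dedicated 'internal' one:

	static void foo_account_internal_bos(struct drm_memory_stats *status,
					     struct foo_file *ffile)
	{
		struct foo_gem_object *bo;

		list_for_each_entry(bo, &ffile->internal_bos, internal_node) {
			/* No userspace handle, so it counts as private. */
			status->private += bo->base.size;
			if (bo->pages)
				status->resident += bo->base.size;
			if (bo->madv == FOO_MADV_DONTNEED)
				status->purgeable += bo->base.size;
		}
	}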
Regards,
Tvrtko
> ---
> Documentation/gpu/drm-usage-stats.rst | 4 ++++
> drivers/gpu/drm/drm_file.c | 9 +++++++--
> drivers/gpu/drm/msm/msm_drv.c | 2 +-
> drivers/gpu/drm/panfrost/panfrost_drv.c | 2 +-
> include/drm/drm_file.h | 7 ++++++-
> 5 files changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> index 6dc299343b48..0da5ebecd232 100644
> --- a/Documentation/gpu/drm-usage-stats.rst
> +++ b/Documentation/gpu/drm-usage-stats.rst
> @@ -157,6 +157,10 @@ The total size of buffers that are purgeable.
>
> The total size of buffers that are active on one or more engines.
>
> +- drm-internal-<region>: <uint> [KiB|MiB]
> +
> +The total size of GEM objects that aren't exposed to user space.
> +
> Implementation Details
> ======================
>
> diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
> index 638ffa4444f5..d1c13eed8d34 100644
> --- a/drivers/gpu/drm/drm_file.c
> +++ b/drivers/gpu/drm/drm_file.c
> @@ -874,9 +874,10 @@ void drm_print_memory_stats(struct drm_printer *p,
> enum drm_gem_object_status supported_status,
> const char *region)
> {
> - print_size(p, "total", region, stats->private + stats->shared);
> + print_size(p, "total", region, stats->private + stats->shared + stats->internal);
> print_size(p, "shared", region, stats->shared);
> print_size(p, "active", region, stats->active);
> + print_size(p, "internal", region, stats->internal);
>
> if (supported_status & DRM_GEM_OBJECT_RESIDENT)
> print_size(p, "resident", region, stats->resident);
> @@ -890,11 +891,12 @@ EXPORT_SYMBOL(drm_print_memory_stats);
> * drm_show_memory_stats - Helper to collect and show standard fdinfo memory stats
> * @p: the printer to print output to
> * @file: the DRM file
> + * @func: driver-specific function pointer to count the size of internal objects
> *
> * Helper to iterate over GEM objects with a handle allocated in the specified
> * file.
> */
> -void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file)
> +void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file, internal_bos func)
> {
> struct drm_gem_object *obj;
> struct drm_memory_stats status = {};
> @@ -940,6 +942,9 @@ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file)
> }
> spin_unlock(&file->table_lock);
>
> + if (func)
> + func(&status, file);
> +
> drm_print_memory_stats(p, &status, supported_status, "memory");
> }
> EXPORT_SYMBOL(drm_show_memory_stats);
> diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
> index 9c33f4e3f822..f97d3cdc4f50 100644
> --- a/drivers/gpu/drm/msm/msm_drv.c
> +++ b/drivers/gpu/drm/msm/msm_drv.c
> @@ -880,7 +880,7 @@ static void msm_show_fdinfo(struct drm_printer *p, struct drm_file *file)
>
> msm_gpu_show_fdinfo(priv->gpu, file->driver_priv, p);
>
> - drm_show_memory_stats(p, file);
> + drm_show_memory_stats(p, file, NULL);
> }
>
> static const struct file_operations fops = {
> diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
> index ef9f6c0716d5..53640ac44e42 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_drv.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
> @@ -570,7 +570,7 @@ static void panfrost_show_fdinfo(struct drm_printer *p, struct drm_file *file)
>
> panfrost_gpu_show_fdinfo(pfdev, file->driver_priv, p);
>
> - drm_show_memory_stats(p, file);
> + drm_show_memory_stats(p, file, NULL);
> }
>
> static const struct file_operations panfrost_drm_driver_fops = {
> diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
> index ab230d3af138..d71a5ac50ea9 100644
> --- a/include/drm/drm_file.h
> +++ b/include/drm/drm_file.h
> @@ -464,6 +464,7 @@ void drm_send_event_timestamp_locked(struct drm_device *dev,
> * @resident: Total size of GEM objects backing pages
> * @purgeable: Total size of GEM objects that can be purged (resident and not active)
> * @active: Total size of GEM objects active on one or more engines
> + * @internal: Total size of GEM objects that aren't exposed to user space
> *
> * Used by drm_print_memory_stats()
> */
> @@ -473,16 +474,20 @@ struct drm_memory_stats {
> u64 resident;
> u64 purgeable;
> u64 active;
> + u64 internal;
> };
>
> enum drm_gem_object_status;
>
> +typedef void (*internal_bos)(struct drm_memory_stats *status,
> + struct drm_file *file);
> +
> void drm_print_memory_stats(struct drm_printer *p,
> const struct drm_memory_stats *stats,
> enum drm_gem_object_status supported_status,
> const char *region);
>
> -void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file);
> +void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file, internal_bos func);
> void drm_show_fdinfo(struct seq_file *m, struct file *f);
>
> struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags);
On 06/06/2024 01:49, Adrián Larumbe wrote:
> This patch series enables userspace utilities like gputop and nvtop to
> query a render context's fdinfo file and figure out rates of engine
> and memory utilisation.
>
> Previous discussion can be found at
> https://lore.kernel.org/dri-devel/[email protected]/
>
> Changelog:
> v3:
> - Fixed some nits and removed useless bounds check in panthor_sched.c
> - Added support for sysfs profiling knob and optional job accounting
> - Added new patches for calculating size of internal BO's
> v2:
> - Split original first patch in two, one for FW CS cycle and timestamp
> calculations and job accounting memory management, and a second one
> that enables fdinfo.
> - Moved NUM_INSTRS_PER_SLOT to the file prelude
> - Removed nelem variable from the group's struct definition.
> - Precompute size of group's syncobj BO to avoid code duplication.
> - Some minor nits.
>
>
> Adrián Larumbe (7):
> drm/panthor: introduce job cycle and timestamp accounting
> drm/panthor: add DRM fdinfo support
> drm/panthor: enable fdinfo for memory stats
> drm/panthor: add sysfs knob for enabling job profiling
> drm/panthor: support job accounting
> drm/drm_file: add display of driver's internal memory size
> drm/panthor: register size of internal objects through fdinfo
The general shape of what you end up with looks correct, but these
patches are now in a bit of a mess. It's confusing to review when the
accounting is added unconditionally and then a sysfs knob is added which
changes it all to be conditional. Equally, that last patch (register size
of internal objects through fdinfo) includes a massive amount of churn
moving everything into an 'fdinfo' struct, which really should be done in a
separate patch.
Ideally this needs to be reworked into a logical series of patches with
knowledge of what's coming next. E.g. the first patch could introduce
the code for cycle/timestamp accounting but leave it disabled to be then
enabled by the sysfs knob patch.
One thing I did notice, though, is that I wasn't seeing the GPU frequency
change; looking more closely, it seems there's something dodgy going on with
the devfreq code. From what I can make out, I often end up in a situation
where all contexts are idle every time tick_work() is called - I think this
is simply because tick_work() is scheduled with a delay, and by the time the
delay expires the work is already complete. Nothing to do with this series,
but something that needs looking into. I'm on holiday for a week but I'll
try to look at this when I'm back.
Steve
> Documentation/gpu/drm-usage-stats.rst | 4 +
> drivers/gpu/drm/drm_file.c | 9 +-
> drivers/gpu/drm/msm/msm_drv.c | 2 +-
> drivers/gpu/drm/panfrost/panfrost_drv.c | 2 +-
> drivers/gpu/drm/panthor/panthor_devfreq.c | 10 +
> drivers/gpu/drm/panthor/panthor_device.c | 2 +
> drivers/gpu/drm/panthor/panthor_device.h | 21 ++
> drivers/gpu/drm/panthor/panthor_drv.c | 83 +++++-
> drivers/gpu/drm/panthor/panthor_fw.c | 16 +-
> drivers/gpu/drm/panthor/panthor_fw.h | 5 +-
> drivers/gpu/drm/panthor/panthor_gem.c | 67 ++++-
> drivers/gpu/drm/panthor/panthor_gem.h | 16 +-
> drivers/gpu/drm/panthor/panthor_heap.c | 23 +-
> drivers/gpu/drm/panthor/panthor_heap.h | 6 +-
> drivers/gpu/drm/panthor/panthor_mmu.c | 8 +-
> drivers/gpu/drm/panthor/panthor_mmu.h | 3 +-
> drivers/gpu/drm/panthor/panthor_sched.c | 304 +++++++++++++++++++---
> include/drm/drm_file.h | 7 +-
> 18 files changed, 522 insertions(+), 66 deletions(-)
>
>
> base-commit: 310ec03841a36e3f45fb528f0dfdfe5b9e84b037