From: Eric Anholt <eric@anholt.net>
To: dri-devel@lists.freedesktop.org
Cc: linux-kernel@vger.kernel.org, Eric Anholt <eric@anholt.net>
Subject: [PATCH] drm/vc4: Allow using more than 256MB of CMA memory.
Date: Mon, 27 Mar 2017 16:10:25 -0700
Message-Id: <20170327231025.19391-1-eric@anholt.net>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 16000
Lines: 484

Until now, we've had to limit Raspberry Pi to 256MB of CMA memory to
keep from triggering the hardware addressing bug between of the tile
binner of the tile alloc memory (where the top 4 bits come from the
tile state data array's address).

To work around that and allow more memory to be reserved for graphics,
allocate a single BO to store tile state data arrays and tile
alloc/overflow memory while the GPU is active, and make sure that that
one BO doesn't happen to cross a 256MB boundary.  With that in place,
we can allocate textures and shaders anywhere in system memory (still
contiguous, of course).

Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/vc4/vc4_drv.h       |  28 +++++--
 drivers/gpu/drm/vc4/vc4_gem.c       |  12 ++-
 drivers/gpu/drm/vc4/vc4_irq.c       |  61 +++++++--------
 drivers/gpu/drm/vc4/vc4_render_cl.c |   3 +-
 drivers/gpu/drm/vc4/vc4_v3d.c       | 150 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_validate.c  |  54 ++++++-------
 6 files changed, 234 insertions(+), 74 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index dffce6293d87..5d9532163cbe 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -95,12 +95,23 @@ struct vc4_dev {
 	 */
 	struct list_head seqno_cb_list;
 
-	/* The binner overflow memory that's currently set up in
-	 * BPOA/BPOS registers.  When overflow occurs and a new one is
-	 * allocated, the previous one will be moved to
-	 * vc4->current_exec's free list.
+	/* The memory used for storing binner tile alloc, tile state,
+	 * and overflow memory allocations.  This is freed when V3D
+	 * powers down.
 	 */
-	struct vc4_bo *overflow_mem;
+	struct vc4_bo *bin_bo;
+
+	/* Size of blocks allocated within bin_bo. */
+	uint32_t bin_alloc_size;
+
+	/* Bitmask of the bin_alloc_size chunks in bin_bo that are
+	 * used.
+	 */
+	uint32_t bin_alloc_used;
+
+	/* Bitmask of the current bin_alloc used for overflow memory. */
+	uint32_t bin_alloc_overflow;
+
 	struct work_struct overflow_mem_work;
 
 	int power_refcount;
@@ -293,8 +304,12 @@ struct vc4_exec_info {
 	bool found_increment_semaphore_packet;
 	bool found_flush;
 	uint8_t bin_tiles_x, bin_tiles_y;
-	struct drm_gem_cma_object *tile_bo;
+	/* Physical address of the start of the tile alloc array
+	 * (where each tile's binned CL will start)
+	 */
 	uint32_t tile_alloc_offset;
+	/* Bitmask of which binner slots are freed when this job completes. */
+	uint32_t bin_slots;
 
 	/**
 	 * Computed addresses pointing into exec_bo where we start the
@@ -522,6 +537,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
 
 /* vc4_validate.c */
 int
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index e9c381c42139..3ecd1ba7af75 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -705,6 +705,7 @@ static void
 vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
 	unsigned i;
 
 	if (exec->bo) {
@@ -720,6 +721,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 		drm_gem_object_unreference_unlocked(&bo->base.base);
 	}
 
+	/* Free up the allocation of any bin slots we used. */
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	vc4->bin_alloc_used &= ~exec->bin_slots;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
 	mutex_lock(&vc4->power_lock);
 	if (--vc4->power_refcount == 0) {
 		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -968,9 +974,9 @@ vc4_gem_destroy(struct drm_device *dev)
 	/* V3D should already have disabled its interrupt and cleared
 	 * the overflow allocation registers.  Now free the object.
 	 */
-	if (vc4->overflow_mem) {
-		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
-		vc4->overflow_mem = NULL;
+	if (vc4->bin_bo) {
+		drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+		vc4->bin_bo = NULL;
 	}
 
 	if (vc4->hang_state)
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index cdc6e6760705..47bbec962f45 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
 {
 	struct vc4_dev *vc4 =
 		container_of(work, struct vc4_dev, overflow_mem_work);
-	struct drm_device *dev = vc4->dev;
-	struct vc4_bo *bo;
+	struct vc4_bo *bo = vc4->bin_bo;
+	int bin_bo_slot;
+	struct vc4_exec_info *exec;
+	unsigned long irqflags;
 
-	bo = vc4_bo_create(dev, 256 * 1024, true);
-	if (IS_ERR(bo)) {
+	bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
+	if (bin_bo_slot < 0) {
 		DRM_ERROR("Couldn't allocate binner overflow mem\n");
 		return;
 	}
 
-	/* If there's a job executing currently, then our previous
-	 * overflow allocation is getting used in that job and we need
-	 * to queue it to be released when the job is done.  But if no
-	 * job is executing at all, then we can free the old overflow
-	 * object direcctly.
-	 *
-	 * No lock necessary for this pointer since we're the only
-	 * ones that update the pointer, and our workqueue won't
-	 * reenter.
-	 */
-	if (vc4->overflow_mem) {
-		struct vc4_exec_info *current_exec;
-		unsigned long irqflags;
-
-		spin_lock_irqsave(&vc4->job_lock, irqflags);
-		current_exec = vc4_first_bin_job(vc4);
-		if (!current_exec)
-			current_exec = vc4_last_render_job(vc4);
-		if (current_exec) {
-			vc4->overflow_mem->seqno = current_exec->seqno;
-			list_add_tail(&vc4->overflow_mem->unref_head,
-				      &current_exec->unref_list);
-			vc4->overflow_mem = NULL;
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+
+	if (vc4->bin_alloc_overflow) {
+		/* If we had overflow memory allocated previously,
+		 * then that chunk will free when the current bin job
+		 * is done.  If we don't have a bin job running, then
+		 * the chunk will be done whenever the list of render
+		 * jobs has drained.
+		 */
+		exec = vc4_first_bin_job(vc4);
+		if (!exec)
+			exec = vc4_last_render_job(vc4);
+		if (exec) {
+			exec->bin_slots |= vc4->bin_alloc_overflow;
+		} else {
+			/* There's nothing queued in the hardware, so
+			 * the old slot is free immediately.
+			 */
+			vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
 		}
-		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 	}
+	vc4->bin_alloc_overflow = BIT(bin_bo_slot);
 
-	if (vc4->overflow_mem)
-		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
-	vc4->overflow_mem = bo;
-
-	V3D_WRITE(V3D_BPOA, bo->base.paddr);
+	V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
 	V3D_WRITE(V3D_BPOS, bo->base.base.size);
 	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
 	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 }
 
 static void
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
index 4339471f517f..5dc19429d4ae 100644
--- a/drivers/gpu/drm/vc4/vc4_render_cl.c
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
 
 	if (has_bin) {
 		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
-		rcl_u32(setup, (exec->tile_bo->paddr +
-				exec->tile_alloc_offset +
+		rcl_u32(setup, (exec->tile_alloc_offset +
 				(y * exec->bin_tiles_x + x) * 32));
 	}
 
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 7cc346ad9b0b..a88078d7c9d1 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
 	V3D_WRITE(V3D_VPMBASE, 0);
 }
 
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
+{
+	struct drm_device *dev = vc4->dev;
+	unsigned long irqflags;
+	int slot;
+	uint64_t seqno = 0;
+	struct vc4_exec_info *exec;
+
+try_again:
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	slot = ffs(~vc4->bin_alloc_used);
+	if (slot != 0) {
+		/* Switch from ffs() bit index to a 0-based index. */
+		slot--;
+		vc4->bin_alloc_used |= BIT(slot);
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return slot;
+	}
+
+	/* Couldn't find an open slot.  Wait for render to complete
+	 * and try again.
+	 */
+	exec = vc4_last_render_job(vc4);
+	if (exec)
+		seqno = exec->seqno;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	if (seqno) {
+		int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
+
+		if (ret == 0)
+			goto try_again;
+
+		return ret;
+	}
+
+	return -ENOMEM;
+}
+
+/**
+ * vc4_allocate_bin_bo() - allocates the memory that will be used for
+ * tile binning.
+ *
+ * The binner has a limitation that the addresses in the tile state
+ * buffer that point into the tile alloc buffer or binner overflow
+ * memory only have 28 bits (256MB), and the top 4 on the bus for
+ * tile alloc references end up coming from the tile state buffer's
+ * address.
+ *
+ * To work around this, we allocate a single large buffer while V3D is
+ * in use, make sure that it has the top 4 bits constant across its
+ * entire extent, and then put the tile state, tile alloc, and binner
+ * overflow memory inside that buffer.
+ *
+ * This creates a limitation where we may not be able to execute a job
+ * if it doesn't fit within the buffer that we allocated up front.
+ * However, it turns out that 16MB is "enough for anybody", and
+ * real-world applications run into allocation failures from the
+ * overall CMA pool before they make scenes complicated enough to run
+ * out of bin space.
+ */
+int
+vc4_allocate_bin_bo(struct drm_device *drm)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(drm);
+	struct vc4_v3d *v3d = vc4->v3d;
+	uint32_t size = 16 * 1024 * 1024;
+	int ret = 0;
+	struct list_head list;
+
+	/* We may need to try allocating more than once to get a BO
+	 * that doesn't cross 256MB.  Track the ones we've allocated
+	 * that failed so far, so that we can free them when we've got
+	 * one that succeeded (if we freed them right away, our next
+	 * allocation would probably be the same chunk of memory).
+	 */
+	INIT_LIST_HEAD(&list);
+
+	while (true) {
+		struct vc4_bo *bo = vc4_bo_create(drm, size, true);
+
+		if (IS_ERR(bo)) {
+			ret = PTR_ERR(bo);
+
+			dev_err(&v3d->pdev->dev,
+				"Failed to allocate memory for tile binning: "
+				"%d. You may need to enable CMA or give it "
+				"more memory.",
+				ret);
+			break;
+		}
+
+		/* Check if this BO won't trigger the addressing bug. */
+		if ((bo->base.paddr & 0xf0000000) ==
+		    ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
+			vc4->bin_bo = bo;
+
+			/* Set up for allocating 512KB chunks of
+			 * binner memory.  The biggest allocation we
+			 * need to do is for the initial tile alloc +
+			 * tile state buffer.  We can render to a
+			 * maximum of ((2048*2048) / (32*32) = 4096
+			 * tiles in a frame (until we do floating
+			 * point rendering, at which point it would be
+			 * 8192).  Tile state is 48b/tile (rounded to
+			 * a page), and tile alloc is 32b/tile
+			 * (rounded to a page), plus a page of extra,
+			 * for a total of 320kb for our worst-case.
+			 * We choose 512kb so that it divides evenly
+			 * into our 16MB, and the rest of the 512kb
+			 * will be used as storage for the overflow
+			 * from the initial 32b CL per bin.
+			 */
+			vc4->bin_alloc_size = 512 * 1024;
+			vc4->bin_alloc_used = 0;
+			vc4->bin_alloc_overflow = 0;
+			WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
+				     bo->base.base.size / vc4->bin_alloc_size);
+
+			break;
+		}
+
+		/* Put it on the list to free later, and try again. */
+		list_add(&bo->unref_head, &list);
+	}
+
+	/* Free all the BOs we allocated but didn't choose. */
+	while (!list_empty(&list)) {
+		struct vc4_bo *bo = list_last_entry(&list,
+						    struct vc4_bo, unref_head);
+
+		list_del(&bo->unref_head);
+		drm_gem_object_put_unlocked(&bo->base.base);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_PM
 static int vc4_v3d_runtime_suspend(struct device *dev)
 {
@@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
 
 	vc4_irq_uninstall(vc4->dev);
 
+	drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+	vc4->bin_bo = NULL;
+
 	return 0;
 }
 
@@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
 {
 	struct vc4_v3d *v3d = dev_get_drvdata(dev);
 	struct vc4_dev *vc4 = v3d->vc4;
+	int ret;
+
+	ret = vc4_allocate_bin_bo(vc4->dev);
+	if (ret)
+		return ret;
 
 	vc4_v3d_init_hw(vc4->dev);
 	vc4_irq_postinstall(vc4->dev);
@@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return -EINVAL;
 	}
 
+	ret = vc4_allocate_bin_bo(drm);
+	if (ret)
+		return ret;
+
 	/* Reset the binner overflow address/size at setup, to be sure
 	 * we don't reuse an old one.
 	 */
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index da6f1e138e8d..3de8f11595c0 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -348,10 +348,11 @@ static int
 validate_tile_binning_config(VALIDATE_ARGS)
 {
 	struct drm_device *dev = exec->exec_bo->base.dev;
-	struct vc4_bo *tile_bo;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	uint8_t flags;
-	uint32_t tile_state_size, tile_alloc_size;
-	uint32_t tile_count;
+	uint32_t tile_state_size;
+	uint32_t tile_count, bin_addr;
+	int bin_slot;
 
 	if (exec->found_tile_binning_mode_config_packet) {
 		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
@@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
 		return -EINVAL;
 	}
 
+	bin_slot = vc4_v3d_get_bin_slot(vc4);
+	if (bin_slot < 0) {
+		if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
+			DRM_ERROR("Failed to allocate binner memory: %d\n",
+				  bin_slot);
+		}
+		return bin_slot;
+	}
+
+	/* The slot we allocated will only be used by this job, and is
+	 * free when the job completes rendering.
+	 */
+	exec->bin_slots |= BIT(bin_slot);
+	bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
+
 	/* The tile state data array is 48 bytes per tile, and we put it at
 	 * the start of a BO containing both it and the tile alloc.
 	 */
 	tile_state_size = 48 * tile_count;
 
 	/* Since the tile alloc array will follow us, align. */
-	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
+	exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
 
 	*(uint8_t *)(validated + 14) =
 		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
@@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
 		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
 			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
 
-	/* Initial block size. */
-	tile_alloc_size = 32 * tile_count;
-
-	/*
-	 * The initial allocation gets rounded to the next 256 bytes before
-	 * the hardware starts fulfilling further allocations.
-	 */
-	tile_alloc_size = roundup(tile_alloc_size, 256);
-
-	/* Add space for the extra allocations.  This is what gets used first,
-	 * before overflow memory.  It must have at least 4096 bytes, but we
-	 * want to avoid overflow memory usage if possible.
-	 */
-	tile_alloc_size += 1024 * 1024;
-
-	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
-				true);
-	exec->tile_bo = &tile_bo->base;
-	if (IS_ERR(exec->tile_bo))
-		return PTR_ERR(exec->tile_bo);
-	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
-
 	/* tile alloc address. */
-	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
-					exec->tile_alloc_offset);
+	*(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
 	/* tile alloc size. */
-	*(uint32_t *)(validated + 4) = tile_alloc_size;
+	*(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
+					exec->tile_alloc_offset);
 	/* tile state address. */
-	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
+	*(uint32_t *)(validated + 8) = bin_addr;
 
 	return 0;
 }
-- 
2.11.0