Date: Fri, 14 Apr 2017 10:08:25 +0200
From: Boris Brezillon
To: Eric Anholt
Cc: dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH] drm/vc4: Allow using more than 256MB of CMA memory.
Message-ID: <20170414100825.254b8181@bbrezillon>
In-Reply-To: <20170327231025.19391-1-eric@anholt.net>
References: <20170327231025.19391-1-eric@anholt.net>

On Mon, 27 Mar 2017 16:10:25 -0700
Eric Anholt wrote:

> Until now, we've had to limit Raspberry Pi to 256MB of CMA memory to
> keep from triggering the hardware addressing bug between the tile
> binner and the tile alloc memory (where the top 4 bits come from the
> tile state data array's address).
>
> To work around that and allow more memory to be reserved for graphics,
> allocate a single BO to store tile state data arrays and tile
> alloc/overflow memory while the GPU is active, and make sure that that
> one BO doesn't happen to cross a 256MB boundary. With that in place,
> we can allocate textures and shaders anywhere in system memory (still
> contiguous, of course).

It's hard to review something I don't quite understand, but I didn't
spot any problems, and the code seems to follow what the commit message
says: make sure the memory used by the tile binner (I still have to
look up what a tile binner is :-)) does not cross a 256MB boundary.

So, not sure my review has real value here, but

Reviewed-by: Boris Brezillon

>
> Signed-off-by: Eric Anholt
> ---
>  drivers/gpu/drm/vc4/vc4_drv.h       |  28 +++++--
>  drivers/gpu/drm/vc4/vc4_gem.c       |  12 ++-
>  drivers/gpu/drm/vc4/vc4_irq.c       |  61 +++++++--------
>  drivers/gpu/drm/vc4/vc4_render_cl.c |   3 +-
>  drivers/gpu/drm/vc4/vc4_v3d.c       | 150 ++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_validate.c  |  54 ++++++-------
>  6 files changed, 234 insertions(+), 74 deletions(-)
>
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index dffce6293d87..5d9532163cbe 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -95,12 +95,23 @@ struct vc4_dev {
>  	 */
>  	struct list_head seqno_cb_list;
>
> -	/* The binner overflow memory that's currently set up in
> -	 * BPOA/BPOS registers. When overflow occurs and a new one is
> -	 * allocated, the previous one will be moved to
> -	 * vc4->current_exec's free list.
> +	/* The memory used for storing binner tile alloc, tile state,
> +	 * and overflow memory allocations. This is freed when V3D
> +	 * powers down.
>  	 */
> -	struct vc4_bo *overflow_mem;
> +	struct vc4_bo *bin_bo;
> +
> +	/* Size of blocks allocated within bin_bo. */
> +	uint32_t bin_alloc_size;
> +
> +	/* Bitmask of the bin_alloc_size chunks in bin_bo that are
> +	 * used.
> +	 */
> +	uint32_t bin_alloc_used;
> +
> +	/* Bitmask of the current bin_alloc used for overflow memory.
> +	 */
> +	uint32_t bin_alloc_overflow;
> +
>  	struct work_struct overflow_mem_work;
>
>  	int power_refcount;
> @@ -293,8 +304,12 @@ struct vc4_exec_info {
>  	bool found_increment_semaphore_packet;
>  	bool found_flush;
>  	uint8_t bin_tiles_x, bin_tiles_y;
> -	struct drm_gem_cma_object *tile_bo;
> +	/* Physical address of the start of the tile alloc array
> +	 * (where each tile's binned CL will start)
> +	 */
>  	uint32_t tile_alloc_offset;
> +	/* Bitmask of which binner slots are freed when this job completes. */
> +	uint32_t bin_slots;
>
>  	/**
>  	 * Computed addresses pointing into exec_bo where we start the
> @@ -522,6 +537,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
>  extern struct platform_driver vc4_v3d_driver;
>  int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
>  int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
>
>  /* vc4_validate.c */
>  int
> diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> index e9c381c42139..3ecd1ba7af75 100644
> --- a/drivers/gpu/drm/vc4/vc4_gem.c
> +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> @@ -705,6 +705,7 @@ static void
>  vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
>  {
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
> +	unsigned long irqflags;
>  	unsigned i;
>
>  	if (exec->bo) {
> @@ -720,6 +721,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
>  		drm_gem_object_unreference_unlocked(&bo->base.base);
>  	}
>
> +	/* Free up the allocation of any bin slots we used. */
> +	spin_lock_irqsave(&vc4->job_lock, irqflags);
> +	vc4->bin_alloc_used &= ~exec->bin_slots;
> +	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
>  	mutex_lock(&vc4->power_lock);
>  	if (--vc4->power_refcount == 0) {
>  		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
> @@ -968,9 +974,9 @@ vc4_gem_destroy(struct drm_device *dev)
>  	/* V3D should already have disabled its interrupt and cleared
>  	 * the overflow allocation registers. Now free the object.
>  	 */
> -	if (vc4->overflow_mem) {
> -		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> -		vc4->overflow_mem = NULL;
> +	if (vc4->bin_bo) {
> +		drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> +		vc4->bin_bo = NULL;
>  	}
>
>  	if (vc4->hang_state)
> diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
> index cdc6e6760705..47bbec962f45 100644
> --- a/drivers/gpu/drm/vc4/vc4_irq.c
> +++ b/drivers/gpu/drm/vc4/vc4_irq.c
> @@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
>  {
>  	struct vc4_dev *vc4 =
>  		container_of(work, struct vc4_dev, overflow_mem_work);
> -	struct drm_device *dev = vc4->dev;
> -	struct vc4_bo *bo;
> +	struct vc4_bo *bo = vc4->bin_bo;
> +	int bin_bo_slot;
> +	struct vc4_exec_info *exec;
> +	unsigned long irqflags;
>
> -	bo = vc4_bo_create(dev, 256 * 1024, true);
> -	if (IS_ERR(bo)) {
> +	bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
> +	if (bin_bo_slot < 0) {
>  		DRM_ERROR("Couldn't allocate binner overflow mem\n");
>  		return;
>  	}
>
> -	/* If there's a job executing currently, then our previous
> -	 * overflow allocation is getting used in that job and we need
> -	 * to queue it to be released when the job is done. But if no
> -	 * job is executing at all, then we can free the old overflow
> -	 * object direcctly.
> -	 *
> -	 * No lock necessary for this pointer since we're the only
> -	 * ones that update the pointer, and our workqueue won't
> -	 * reenter.
> -	 */
> -	if (vc4->overflow_mem) {
> -		struct vc4_exec_info *current_exec;
> -		unsigned long irqflags;
> -
> -		spin_lock_irqsave(&vc4->job_lock, irqflags);
> -		current_exec = vc4_first_bin_job(vc4);
> -		if (!current_exec)
> -			current_exec = vc4_last_render_job(vc4);
> -		if (current_exec) {
> -			vc4->overflow_mem->seqno = current_exec->seqno;
> -			list_add_tail(&vc4->overflow_mem->unref_head,
> -				      &current_exec->unref_list);
> -			vc4->overflow_mem = NULL;
> +	spin_lock_irqsave(&vc4->job_lock, irqflags);
> +
> +	if (vc4->bin_alloc_overflow) {
> +		/* If we had overflow memory allocated previously,
> +		 * then that chunk will free when the current bin job
> +		 * is done. If we don't have a bin job running, then
> +		 * the chunk will be done whenever the list of render
> +		 * jobs has drained.
> +		 */
> +		exec = vc4_first_bin_job(vc4);
> +		if (!exec)
> +			exec = vc4_last_render_job(vc4);
> +		if (exec) {
> +			exec->bin_slots |= vc4->bin_alloc_overflow;
> +		} else {
> +			/* There's nothing queued in the hardware, so
> +			 * the old slot is free immediately.
> +			 */
> +			vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
>  		}
> -		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>  	}
> +	vc4->bin_alloc_overflow = BIT(bin_bo_slot);
>
> -	if (vc4->overflow_mem)
> -		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
> -	vc4->overflow_mem = bo;
> -
> -	V3D_WRITE(V3D_BPOA, bo->base.paddr);
> +	V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
>  	V3D_WRITE(V3D_BPOS, bo->base.base.size);
>  	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
>  	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
> +	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>  }
>
>  static void
> diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
> index 4339471f517f..5dc19429d4ae 100644
> --- a/drivers/gpu/drm/vc4/vc4_render_cl.c
> +++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
> @@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
>
>  	if (has_bin) {
>  		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
> -		rcl_u32(setup, (exec->tile_bo->paddr +
> -				exec->tile_alloc_offset +
> +		rcl_u32(setup, (exec->tile_alloc_offset +
>  				(y * exec->bin_tiles_x + x) * 32));
>  	}
>
> diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
> index 7cc346ad9b0b..a88078d7c9d1 100644
> --- a/drivers/gpu/drm/vc4/vc4_v3d.c
> +++ b/drivers/gpu/drm/vc4/vc4_v3d.c
> @@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
>  	V3D_WRITE(V3D_VPMBASE, 0);
>  }
>
> +int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
> +{
> +	struct drm_device *dev = vc4->dev;
> +	unsigned long irqflags;
> +	int slot;
> +	uint64_t seqno = 0;
> +	struct vc4_exec_info *exec;
> +
> +try_again:
> +	spin_lock_irqsave(&vc4->job_lock, irqflags);
> +	slot = ffs(~vc4->bin_alloc_used);
> +	if (slot != 0) {
> +		/* Switch from ffs() bit index to a 0-based index. */
> +		slot--;
> +		vc4->bin_alloc_used |= BIT(slot);
> +		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +		return slot;
> +	}
> +
> +	/* Couldn't find an open slot. Wait for render to complete
> +	 * and try again.
> +	 */
> +	exec = vc4_last_render_job(vc4);
> +	if (exec)
> +		seqno = exec->seqno;
> +	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
> +
> +	if (seqno) {
> +		int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
> +
> +		if (ret == 0)
> +			goto try_again;
> +
> +		return ret;
> +	}
> +
> +	return -ENOMEM;
> +}
> +
> +/**
> + * vc4_allocate_bin_bo() - allocates the memory that will be used for
> + * tile binning.
> + *
> + * The binner has a limitation that the addresses in the tile state
> + * buffer that point into the tile alloc buffer or binner overflow
> + * memory only have 28 bits (256MB), and the top 4 on the bus for
> + * tile alloc references end up coming from the tile state buffer's
> + * address.
> + *
> + * To work around this, we allocate a single large buffer while V3D is
> + * in use, make sure that it has the top 4 bits constant across its
> + * entire extent, and then put the tile state, tile alloc, and binner
> + * overflow memory inside that buffer.
> + *
> + * This creates a limitation where we may not be able to execute a job
> + * if it doesn't fit within the buffer that we allocated up front.
> + * However, it turns out that 16MB is "enough for anybody", and
> + * real-world applications run into allocation failures from the
> + * overall CMA pool before they make scenes complicated enough to run
> + * out of bin space.
> + */
> +int
> +vc4_allocate_bin_bo(struct drm_device *drm)
> +{
> +	struct vc4_dev *vc4 = to_vc4_dev(drm);
> +	struct vc4_v3d *v3d = vc4->v3d;
> +	uint32_t size = 16 * 1024 * 1024;
> +	int ret = 0;
> +	struct list_head list;
> +
> +	/* We may need to try allocating more than once to get a BO
> +	 * that doesn't cross 256MB. Track the ones we've allocated
> +	 * that failed so far, so that we can free them when we've got
> +	 * one that succeeded (if we freed them right away, our next
> +	 * allocation would probably be the same chunk of memory).
> +	 */
> +	INIT_LIST_HEAD(&list);
> +
> +	while (true) {
> +		struct vc4_bo *bo = vc4_bo_create(drm, size, true);
> +
> +		if (IS_ERR(bo)) {
> +			ret = PTR_ERR(bo);
> +
> +			dev_err(&v3d->pdev->dev,
> +				"Failed to allocate memory for tile binning: "
> +				"%d. You may need to enable CMA or give it "
> +				"more memory.",
> +				ret);
> +			break;
> +		}
> +
> +		/* Check if this BO won't trigger the addressing bug. */
> +		if ((bo->base.paddr & 0xf0000000) ==
> +		    ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
> +			vc4->bin_bo = bo;
> +
> +			/* Set up for allocating 512KB chunks of
> +			 * binner memory. The biggest allocation we
> +			 * need to do is for the initial tile alloc +
> +			 * tile state buffer. We can render to a
> +			 * maximum of ((2048*2048) / (32*32) = 4096
> +			 * tiles in a frame (until we do floating
> +			 * point rendering, at which point it would be
> +			 * 8192). Tile state is 48b/tile (rounded to
> +			 * a page), and tile alloc is 32b/tile
> +			 * (rounded to a page), plus a page of extra,
> +			 * for a total of 320kb for our worst-case.
> +			 * We choose 512kb so that it divides evenly
> +			 * into our 16MB, and the rest of the 512kb
> +			 * will be used as storage for the overflow
> +			 * from the initial 32b CL per bin.
> +			 */
> +			vc4->bin_alloc_size = 512 * 1024;
> +			vc4->bin_alloc_used = 0;
> +			vc4->bin_alloc_overflow = 0;
> +			WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
> +				     bo->base.base.size / vc4->bin_alloc_size);
> +
> +			break;
> +		}
> +
> +		/* Put it on the list to free later, and try again. */
> +		list_add(&bo->unref_head, &list);
> +	}
> +
> +	/* Free all the BOs we allocated but didn't choose.
> +	 */
> +	while (!list_empty(&list)) {
> +		struct vc4_bo *bo = list_last_entry(&list,
> +						    struct vc4_bo, unref_head);
> +
> +		list_del(&bo->unref_head);
> +		drm_gem_object_put_unlocked(&bo->base.base);
> +	}
> +
> +	return ret;
> +}
> +
>  #ifdef CONFIG_PM
>  static int vc4_v3d_runtime_suspend(struct device *dev)
>  {
> @@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
>
>  	vc4_irq_uninstall(vc4->dev);
>
> +	drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
> +	vc4->bin_bo = NULL;
> +
>  	return 0;
>  }
>
> @@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
>  {
>  	struct vc4_v3d *v3d = dev_get_drvdata(dev);
>  	struct vc4_dev *vc4 = v3d->vc4;
> +	int ret;
> +
> +	ret = vc4_allocate_bin_bo(vc4->dev);
> +	if (ret)
> +		return ret;
>
>  	vc4_v3d_init_hw(vc4->dev);
>  	vc4_irq_postinstall(vc4->dev);
> @@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
>  		return -EINVAL;
>  	}
>
> +	ret = vc4_allocate_bin_bo(drm);
> +	if (ret)
> +		return ret;
> +
>  	/* Reset the binner overflow address/size at setup, to be sure
>  	 * we don't reuse an old one.
>  	 */
> diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
> index da6f1e138e8d..3de8f11595c0 100644
> --- a/drivers/gpu/drm/vc4/vc4_validate.c
> +++ b/drivers/gpu/drm/vc4/vc4_validate.c
> @@ -348,10 +348,11 @@ static int
>  validate_tile_binning_config(VALIDATE_ARGS)
>  {
>  	struct drm_device *dev = exec->exec_bo->base.dev;
> -	struct vc4_bo *tile_bo;
> +	struct vc4_dev *vc4 = to_vc4_dev(dev);
>  	uint8_t flags;
> -	uint32_t tile_state_size, tile_alloc_size;
> -	uint32_t tile_count;
> +	uint32_t tile_state_size;
> +	uint32_t tile_count, bin_addr;
> +	int bin_slot;
>
>  	if (exec->found_tile_binning_mode_config_packet) {
>  		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
> @@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
>  		return -EINVAL;
>  	}
>
> +	bin_slot = vc4_v3d_get_bin_slot(vc4);
> +	if (bin_slot < 0) {
> +		if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
> +			DRM_ERROR("Failed to allocate binner memory: %d\n",
> +				  bin_slot);
> +		}
> +		return bin_slot;
> +	}
> +
> +	/* The slot we allocated will only be used by this job, and is
> +	 * free when the job completes rendering.
> +	 */
> +	exec->bin_slots |= BIT(bin_slot);
> +	bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
> +
>  	/* The tile state data array is 48 bytes per tile, and we put it at
>  	 * the start of a BO containing both it and the tile alloc.
>  	 */
>  	tile_state_size = 48 * tile_count;
>
>  	/* Since the tile alloc array will follow us, align. */
> -	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
> +	exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
>
>  	*(uint8_t *)(validated + 14) =
>  		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
> @@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
>  		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
>  			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
>
> -	/* Initial block size. */
> -	tile_alloc_size = 32 * tile_count;
> -
> -	/*
> -	 * The initial allocation gets rounded to the next 256 bytes before
> -	 * the hardware starts fulfilling further allocations.
> -	 */
> -	tile_alloc_size = roundup(tile_alloc_size, 256);
> -
> -	/* Add space for the extra allocations. This is what gets used first,
> -	 * before overflow memory. It must have at least 4096 bytes, but we
> -	 * want to avoid overflow memory usage if possible.
> -	 */
> -	tile_alloc_size += 1024 * 1024;
> -
> -	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
> -				true);
> -	exec->tile_bo = &tile_bo->base;
> -	if (IS_ERR(exec->tile_bo))
> -		return PTR_ERR(exec->tile_bo);
> -	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
> -
>  	/* tile alloc address. */
> -	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
> -					exec->tile_alloc_offset);
> +	*(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
>  	/* tile alloc size. */
> -	*(uint32_t *)(validated + 4) = tile_alloc_size;
> +	*(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
> +					exec->tile_alloc_offset);
>  	/* tile state address. */
> -	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
> +	*(uint32_t *)(validated + 8) = bin_addr;
>
>  	return 0;
>  }
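
In case it helps other readers of the thread, below is a small stand-alone
sketch of the two ideas the series hinges on. It is purely illustrative
(plain userspace C, made-up names, not code from the patch): a candidate
buffer is only acceptable when its first and last bytes share the same top
4 physical-address bits, i.e. it does not straddle a 256MB boundary, and
the 512KB slots carved out of the 16MB bin BO are then handed out from a
32-bit bitmask with ffs(), which is roughly what vc4_allocate_bin_bo() and
vc4_v3d_get_bin_slot() do under the job lock.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define BIN_BO_SIZE	(16 * 1024 * 1024)	/* one bin BO */
#define BIN_SLOT_SIZE	(512 * 1024)		/* chunk handed to a job */

/* True if [paddr, paddr + size) straddles a 256MB boundary, i.e. the
 * top 4 address bits differ between the first and the last byte.
 */
static int crosses_256mb(uint32_t paddr, uint32_t size)
{
	return (paddr & 0xf0000000) != ((paddr + size - 1) & 0xf0000000);
}

/* Hand out the first free slot from a 32-bit "used" bitmask, or -1. */
static int get_bin_slot(uint32_t *used)
{
	int slot = ffs((int)~*used);	/* ffs() returns a 1-based index */

	if (slot == 0)
		return -1;		/* all 32 slots taken */
	*used |= 1u << (slot - 1);
	return slot - 1;
}

int main(void)
{
	uint32_t used = 0;
	unsigned int bad = 0x0ff00000;	/* hypothetical CMA addresses */
	unsigned int good = 0x20000000;
	int a, b;

	printf("%d slots per bin BO\n", BIN_BO_SIZE / BIN_SLOT_SIZE);
	printf("0x%08x crosses 256MB: %d, 0x%08x crosses 256MB: %d\n",
	       bad, crosses_256mb(bad, BIN_BO_SIZE),
	       good, crosses_256mb(good, BIN_BO_SIZE));

	a = get_bin_slot(&used);
	b = get_bin_slot(&used);
	printf("slots handed out: %d, %d (mask now 0x%08x)\n", a, b, used);
	return 0;
}

With 16MB split into 512KB chunks you get exactly 32 slots, which is why a
single 32-bit bitmask is enough and why the patch has the WARN_ON_ONCE()
cross-checking sizeof(bin_alloc_used) * 8 against the number of chunks.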