summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/drm/vc4/vc4_drv.h28
-rw-r--r--drivers/gpu/drm/vc4/vc4_gem.c12
-rw-r--r--drivers/gpu/drm/vc4/vc4_irq.c61
-rw-r--r--drivers/gpu/drm/vc4/vc4_render_cl.c3
-rw-r--r--drivers/gpu/drm/vc4/vc4_v3d.c150
-rw-r--r--drivers/gpu/drm/vc4/vc4_validate.c54
6 files changed, 234 insertions, 74 deletions
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 81d2bc08e766..b0967e2f7e88 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -99,12 +99,23 @@ struct vc4_dev {
*/
struct list_head seqno_cb_list;
- /* The binner overflow memory that's currently set up in
- * BPOA/BPOS registers. When overflow occurs and a new one is
- * allocated, the previous one will be moved to
- * vc4->current_exec's free list.
+ /* The memory used for storing binner tile alloc, tile state,
+ * and overflow memory allocations. This is freed when V3D
+ * powers down.
*/
- struct vc4_bo *overflow_mem;
+ struct vc4_bo *bin_bo;
+
+ /* Size of blocks allocated within bin_bo. */
+ uint32_t bin_alloc_size;
+
+ /* Bitmask of the bin_alloc_size chunks in bin_bo that are
+ * used.
+ */
+ uint32_t bin_alloc_used;
+
+ /* Bitmask of the current bin_alloc used for overflow memory. */
+ uint32_t bin_alloc_overflow;
+
struct work_struct overflow_mem_work;
int power_refcount;
@@ -316,8 +327,12 @@ struct vc4_exec_info {
bool found_increment_semaphore_packet;
bool found_flush;
uint8_t bin_tiles_x, bin_tiles_y;
- struct drm_gem_cma_object *tile_bo;
+ /* Physical address of the start of the tile alloc array
+ * (where each tile's binned CL will start)
+ */
uint32_t tile_alloc_offset;
+ /* Bitmask of which binner slots are freed when this job completes. */
+ uint32_t bin_slots;
/**
* Computed addresses pointing into exec_bo where we start the
@@ -552,6 +567,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
extern struct platform_driver vc4_v3d_driver;
int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
/* vc4_validate.c */
int
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 12fb70ef3170..735412e3725a 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -820,6 +820,7 @@ static void
vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
{
struct vc4_dev *vc4 = to_vc4_dev(dev);
+ unsigned long irqflags;
unsigned i;
/* If we got force-completed because of GPU reset rather than
@@ -841,6 +842,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
drm_gem_object_unreference_unlocked(&bo->base.base);
}
+ /* Free up the allocation of any bin slots we used. */
+ spin_lock_irqsave(&vc4->job_lock, irqflags);
+ vc4->bin_alloc_used &= ~exec->bin_slots;
+ spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
mutex_lock(&vc4->power_lock);
if (--vc4->power_refcount == 0) {
pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1101,9 +1107,9 @@ vc4_gem_destroy(struct drm_device *dev)
/* V3D should already have disabled its interrupt and cleared
* the overflow allocation registers. Now free the object.
*/
- if (vc4->overflow_mem) {
- drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
- vc4->overflow_mem = NULL;
+ if (vc4->bin_bo) {
+ drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+ vc4->bin_bo = NULL;
}
if (vc4->hang_state)
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index 1384af9fc987..7d7af3a93d94 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
{
struct vc4_dev *vc4 =
container_of(work, struct vc4_dev, overflow_mem_work);
- struct drm_device *dev = vc4->dev;
- struct vc4_bo *bo;
+ struct vc4_bo *bo = vc4->bin_bo;
+ int bin_bo_slot;
+ struct vc4_exec_info *exec;
+ unsigned long irqflags;
- bo = vc4_bo_create(dev, 256 * 1024, true);
- if (IS_ERR(bo)) {
+ bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
+ if (bin_bo_slot < 0) {
DRM_ERROR("Couldn't allocate binner overflow mem\n");
return;
}
- /* If there's a job executing currently, then our previous
- * overflow allocation is getting used in that job and we need
- * to queue it to be released when the job is done. But if no
- * job is executing at all, then we can free the old overflow
- * object direcctly.
- *
- * No lock necessary for this pointer since we're the only
- * ones that update the pointer, and our workqueue won't
- * reenter.
- */
- if (vc4->overflow_mem) {
- struct vc4_exec_info *current_exec;
- unsigned long irqflags;
-
- spin_lock_irqsave(&vc4->job_lock, irqflags);
- current_exec = vc4_first_bin_job(vc4);
- if (!current_exec)
- current_exec = vc4_last_render_job(vc4);
- if (current_exec) {
- vc4->overflow_mem->seqno = current_exec->seqno;
- list_add_tail(&vc4->overflow_mem->unref_head,
- &current_exec->unref_list);
- vc4->overflow_mem = NULL;
+ spin_lock_irqsave(&vc4->job_lock, irqflags);
+
+ if (vc4->bin_alloc_overflow) {
+ /* If we had overflow memory allocated previously,
+ * then that chunk will free when the current bin job
+ * is done. If we don't have a bin job running, then
+ * the chunk will be done whenever the list of render
+ * jobs has drained.
+ */
+ exec = vc4_first_bin_job(vc4);
+ if (!exec)
+ exec = vc4_last_render_job(vc4);
+ if (exec) {
+ exec->bin_slots |= vc4->bin_alloc_overflow;
+ } else {
+ /* There's nothing queued in the hardware, so
+ * the old slot is free immediately.
+ */
+ vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
}
- spin_unlock_irqrestore(&vc4->job_lock, irqflags);
}
+ vc4->bin_alloc_overflow = BIT(bin_bo_slot);
- if (vc4->overflow_mem)
- drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
- vc4->overflow_mem = bo;
-
- V3D_WRITE(V3D_BPOA, bo->base.paddr);
+ V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
V3D_WRITE(V3D_BPOS, bo->base.base.size);
V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
+ spin_unlock_irqrestore(&vc4->job_lock, irqflags);
}
static void
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
index 4339471f517f..5dc19429d4ae 100644
--- a/drivers/gpu/drm/vc4/vc4_render_cl.c
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
if (has_bin) {
rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
- rcl_u32(setup, (exec->tile_bo->paddr +
- exec->tile_alloc_offset +
+ rcl_u32(setup, (exec->tile_alloc_offset +
(y * exec->bin_tiles_x + x) * 32));
}
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 7cc346ad9b0b..a88078d7c9d1 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
V3D_WRITE(V3D_VPMBASE, 0);
}
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
+{
+ struct drm_device *dev = vc4->dev;
+ unsigned long irqflags;
+ int slot;
+ uint64_t seqno = 0;
+ struct vc4_exec_info *exec;
+
+try_again:
+ spin_lock_irqsave(&vc4->job_lock, irqflags);
+ slot = ffs(~vc4->bin_alloc_used);
+ if (slot != 0) {
+ /* Switch from ffs() bit index to a 0-based index. */
+ slot--;
+ vc4->bin_alloc_used |= BIT(slot);
+ spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+ return slot;
+ }
+
+ /* Couldn't find an open slot. Wait for render to complete
+ * and try again.
+ */
+ exec = vc4_last_render_job(vc4);
+ if (exec)
+ seqno = exec->seqno;
+ spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+ if (seqno) {
+ int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
+
+ if (ret == 0)
+ goto try_again;
+
+ return ret;
+ }
+
+ return -ENOMEM;
+}
+
+/**
+ * vc4_allocate_bin_bo() - allocates the memory that will be used for
+ * tile binning.
+ *
+ * The binner has a limitation that the addresses in the tile state
+ * buffer that point into the tile alloc buffer or binner overflow
+ * memory only have 28 bits (256MB), and the top 4 on the bus for
+ * tile alloc references end up coming from the tile state buffer's
+ * address.
+ *
+ * To work around this, we allocate a single large buffer while V3D is
+ * in use, make sure that it has the top 4 bits constant across its
+ * entire extent, and then put the tile state, tile alloc, and binner
+ * overflow memory inside that buffer.
+ *
+ * This creates a limitation where we may not be able to execute a job
+ * if it doesn't fit within the buffer that we allocated up front.
+ * However, it turns out that 16MB is "enough for anybody", and
+ * real-world applications run into allocation failures from the
+ * overall CMA pool before they make scenes complicated enough to run
+ * out of bin space.
+ */
+int
+vc4_allocate_bin_bo(struct drm_device *drm)
+{
+ struct vc4_dev *vc4 = to_vc4_dev(drm);
+ struct vc4_v3d *v3d = vc4->v3d;
+ uint32_t size = 16 * 1024 * 1024;
+ int ret = 0;
+ struct list_head list;
+
+ /* We may need to try allocating more than once to get a BO
+ * that doesn't cross 256MB. Track the ones we've allocated
+ * that failed so far, so that we can free them when we've got
+ * one that succeeded (if we freed them right away, our next
+ * allocation would probably be the same chunk of memory).
+ */
+ INIT_LIST_HEAD(&list);
+
+ while (true) {
+ struct vc4_bo *bo = vc4_bo_create(drm, size, true);
+
+ if (IS_ERR(bo)) {
+ ret = PTR_ERR(bo);
+
+ dev_err(&v3d->pdev->dev,
+ "Failed to allocate memory for tile binning: "
+ "%d. You may need to enable CMA or give it "
+ "more memory.",
+ ret);
+ break;
+ }
+
+ /* Check if this BO won't trigger the addressing bug. */
+ if ((bo->base.paddr & 0xf0000000) ==
+ ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
+ vc4->bin_bo = bo;
+
+ /* Set up for allocating 512KB chunks of
+ * binner memory. The biggest allocation we
+ * need to do is for the initial tile alloc +
+ * tile state buffer. We can render to a
+ * maximum of ((2048*2048) / (32*32) = 4096
+ * tiles in a frame (until we do floating
+ * point rendering, at which point it would be
+ * 8192). Tile state is 48b/tile (rounded to
+ * a page), and tile alloc is 32b/tile
+ * (rounded to a page), plus a page of extra,
+ * for a total of 320kb for our worst-case.
+ * We choose 512kb so that it divides evenly
+ * into our 16MB, and the rest of the 512kb
+ * will be used as storage for the overflow
+ * from the initial 32b CL per bin.
+ */
+ vc4->bin_alloc_size = 512 * 1024;
+ vc4->bin_alloc_used = 0;
+ vc4->bin_alloc_overflow = 0;
+ WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
+ bo->base.base.size / vc4->bin_alloc_size);
+
+ break;
+ }
+
+ /* Put it on the list to free later, and try again. */
+ list_add(&bo->unref_head, &list);
+ }
+
+ /* Free all the BOs we allocated but didn't choose. */
+ while (!list_empty(&list)) {
+ struct vc4_bo *bo = list_last_entry(&list,
+ struct vc4_bo, unref_head);
+
+ list_del(&bo->unref_head);
+ drm_gem_object_put_unlocked(&bo->base.base);
+ }
+
+ return ret;
+}
+
#ifdef CONFIG_PM
static int vc4_v3d_runtime_suspend(struct device *dev)
{
@@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
vc4_irq_uninstall(vc4->dev);
+ drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+ vc4->bin_bo = NULL;
+
return 0;
}
@@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
{
struct vc4_v3d *v3d = dev_get_drvdata(dev);
struct vc4_dev *vc4 = v3d->vc4;
+ int ret;
+
+ ret = vc4_allocate_bin_bo(vc4->dev);
+ if (ret)
+ return ret;
vc4_v3d_init_hw(vc4->dev);
vc4_irq_postinstall(vc4->dev);
@@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
return -EINVAL;
}
+ ret = vc4_allocate_bin_bo(drm);
+ if (ret)
+ return ret;
+
/* Reset the binner overflow address/size at setup, to be sure
* we don't reuse an old one.
*/
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index da6f1e138e8d..3de8f11595c0 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -348,10 +348,11 @@ static int
validate_tile_binning_config(VALIDATE_ARGS)
{
struct drm_device *dev = exec->exec_bo->base.dev;
- struct vc4_bo *tile_bo;
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
uint8_t flags;
- uint32_t tile_state_size, tile_alloc_size;
- uint32_t tile_count;
+ uint32_t tile_state_size;
+ uint32_t tile_count, bin_addr;
+ int bin_slot;
if (exec->found_tile_binning_mode_config_packet) {
DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
@@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
return -EINVAL;
}
+ bin_slot = vc4_v3d_get_bin_slot(vc4);
+ if (bin_slot < 0) {
+ if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
+ DRM_ERROR("Failed to allocate binner memory: %d\n",
+ bin_slot);
+ }
+ return bin_slot;
+ }
+
+ /* The slot we allocated will only be used by this job, and is
+ * free when the job completes rendering.
+ */
+ exec->bin_slots |= BIT(bin_slot);
+ bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
+
/* The tile state data array is 48 bytes per tile, and we put it at
* the start of a BO containing both it and the tile alloc.
*/
tile_state_size = 48 * tile_count;
/* Since the tile alloc array will follow us, align. */
- exec->tile_alloc_offset = roundup(tile_state_size, 4096);
+ exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
*(uint8_t *)(validated + 14) =
((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
@@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
- /* Initial block size. */
- tile_alloc_size = 32 * tile_count;
-
- /*
- * The initial allocation gets rounded to the next 256 bytes before
- * the hardware starts fulfilling further allocations.
- */
- tile_alloc_size = roundup(tile_alloc_size, 256);
-
- /* Add space for the extra allocations. This is what gets used first,
- * before overflow memory. It must have at least 4096 bytes, but we
- * want to avoid overflow memory usage if possible.
- */
- tile_alloc_size += 1024 * 1024;
-
- tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
- true);
- exec->tile_bo = &tile_bo->base;
- if (IS_ERR(exec->tile_bo))
- return PTR_ERR(exec->tile_bo);
- list_add_tail(&tile_bo->unref_head, &exec->unref_list);
-
/* tile alloc address. */
- *(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
- exec->tile_alloc_offset);
+ *(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
/* tile alloc size. */
- *(uint32_t *)(validated + 4) = tile_alloc_size;
+ *(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
+ exec->tile_alloc_offset);
/* tile state address. */
- *(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
+ *(uint32_t *)(validated + 8) = bin_addr;
return 0;
}