6 files changed, 234 insertions, 74 deletions
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 81d2bc08e766..b0967e2f7e88 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -99,12 +99,23 @@ struct vc4_dev {
 	 */
 	struct list_head seqno_cb_list;
 
-	/* The binner overflow memory that's currently set up in
-	 * BPOA/BPOS registers.  When overflow occurs and a new one is
-	 * allocated, the previous one will be moved to
-	 * vc4->current_exec's free list.
+	/* The memory used for storing binner tile alloc, tile state,
+	 * and overflow memory allocations.  This is freed when V3D
+	 * powers down.
 	 */
-	struct vc4_bo *overflow_mem;
+	struct vc4_bo *bin_bo;
+
+	/* Size of blocks allocated within bin_bo. */
+	uint32_t bin_alloc_size;
+
+	/* Bitmask of the bin_alloc_size chunks in bin_bo that are
+	 * used.
+	 */
+	uint32_t bin_alloc_used;
+
+	/* Bitmask of the current bin_alloc used for overflow memory. */
+	uint32_t bin_alloc_overflow;
+
 	struct work_struct overflow_mem_work;
 
 	int power_refcount;
@@ -316,8 +327,12 @@ struct vc4_exec_info {
 	bool found_increment_semaphore_packet;
 	bool found_flush;
 	uint8_t bin_tiles_x, bin_tiles_y;
-	struct drm_gem_cma_object *tile_bo;
+	/* Physical address of the start of the tile alloc array
+	 * (where each tile's binned CL will start)
+	 */
 	uint32_t tile_alloc_offset;
+	/* Bitmask of which binner slots are freed when this job completes. */
+	uint32_t bin_slots;
 
 	/**
 	 * Computed addresses pointing into exec_bo where we start the
@@ -552,6 +567,7 @@ void vc4_plane_async_set_fb(struct drm_plane *plane,
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
 
 /* vc4_validate.c */
 int
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 12fb70ef3170..735412e3725a 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -820,6 +820,7 @@ static void
 vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
 	unsigned i;
 
 	/* If we got force-completed because of GPU reset rather than
@@ -841,6 +842,11 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 		drm_gem_object_unreference_unlocked(&bo->base.base);
 	}
 
+	/* Free up the allocation of any bin slots we used. */
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	vc4->bin_alloc_used &= ~exec->bin_slots;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
 	mutex_lock(&vc4->power_lock);
 	if (--vc4->power_refcount == 0) {
 		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1101,9 +1107,9 @@ vc4_gem_destroy(struct drm_device *dev)
 	/* V3D should already have disabled its interrupt and cleared
 	 * the overflow allocation registers.  Now free the object.
 	 */
-	if (vc4->overflow_mem) {
-		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
-		vc4->overflow_mem = NULL;
+	if (vc4->bin_bo) {
+		drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+		vc4->bin_bo = NULL;
 	}
 
 	if (vc4->hang_state)
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index 1384af9fc987..7d7af3a93d94 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -59,50 +59,45 @@ vc4_overflow_mem_work(struct work_struct *work)
 {
 	struct vc4_dev *vc4 =
 		container_of(work, struct vc4_dev, overflow_mem_work);
-	struct drm_device *dev = vc4->dev;
-	struct vc4_bo *bo;
+	struct vc4_bo *bo = vc4->bin_bo;
+	int bin_bo_slot;
+	struct vc4_exec_info *exec;
+	unsigned long irqflags;
 
-	bo = vc4_bo_create(dev, 256 * 1024, true);
-	if (IS_ERR(bo)) {
+	bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
+	if (bin_bo_slot < 0) {
 		DRM_ERROR("Couldn't allocate binner overflow mem\n");
 		return;
 	}
 
-	/* If there's a job executing currently, then our previous
-	 * overflow allocation is getting used in that job and we need
-	 * to queue it to be released when the job is done.  But if no
-	 * job is executing at all, then we can free the old overflow
-	 * object direcctly.
-	 *
-	 * No lock necessary for this pointer since we're the only
-	 * ones that update the pointer, and our workqueue won't
-	 * reenter.
-	 */
-	if (vc4->overflow_mem) {
-		struct vc4_exec_info *current_exec;
-		unsigned long irqflags;
-
-		spin_lock_irqsave(&vc4->job_lock, irqflags);
-		current_exec = vc4_first_bin_job(vc4);
-		if (!current_exec)
-			current_exec = vc4_last_render_job(vc4);
-		if (current_exec) {
-			vc4->overflow_mem->seqno = current_exec->seqno;
-			list_add_tail(&vc4->overflow_mem->unref_head,
-				      &current_exec->unref_list);
-			vc4->overflow_mem = NULL;
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+
+	if (vc4->bin_alloc_overflow) {
+		/* If we had overflow memory allocated previously,
+		 * then that chunk will free when the current bin job
+		 * is done.  If we don't have a bin job running, then
+		 * the chunk will be done whenever the list of render
+		 * jobs has drained.
+		 */
+		exec = vc4_first_bin_job(vc4);
+		if (!exec)
+			exec = vc4_last_render_job(vc4);
+		if (exec) {
+			exec->bin_slots |= vc4->bin_alloc_overflow;
+		} else {
+			/* There's nothing queued in the hardware, so
+			 * the old slot is free immediately.
+			 */
+			vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
 		}
-		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 	}
+	vc4->bin_alloc_overflow = BIT(bin_bo_slot);
 
-	if (vc4->overflow_mem)
-		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
-	vc4->overflow_mem = bo;
-
-	V3D_WRITE(V3D_BPOA, bo->base.paddr);
+	V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
 	V3D_WRITE(V3D_BPOS, bo->base.base.size);
 	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
 	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 }
 
 static void
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
index 4339471f517f..5dc19429d4ae 100644
--- a/drivers/gpu/drm/vc4/vc4_render_cl.c
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -182,8 +182,7 @@ static void emit_tile(struct vc4_exec_info *exec,
 
 	if (has_bin) {
 		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
-		rcl_u32(setup, (exec->tile_bo->paddr +
-				exec->tile_alloc_offset +
+		rcl_u32(setup, (exec->tile_alloc_offset +
 				(y * exec->bin_tiles_x + x) * 32));
 	}
 
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 7cc346ad9b0b..a88078d7c9d1 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -156,6 +156,144 @@ static void vc4_v3d_init_hw(struct drm_device *dev)
 	V3D_WRITE(V3D_VPMBASE, 0);
 }
 
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
+{
+	struct drm_device *dev = vc4->dev;
+	unsigned long irqflags;
+	int slot;
+	uint64_t seqno = 0;
+	struct vc4_exec_info *exec;
+
+try_again:
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	slot = ffs(~vc4->bin_alloc_used);
+	if (slot != 0) {
+		/* Switch from ffs() bit index to a 0-based index. */
+		slot--;
+		vc4->bin_alloc_used |= BIT(slot);
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return slot;
+	}
+
+	/* Couldn't find an open slot.  Wait for render to complete
+	 * and try again.
+	 */
+	exec = vc4_last_render_job(vc4);
+	if (exec)
+		seqno = exec->seqno;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	if (seqno) {
+		int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
+
+		if (ret == 0)
+			goto try_again;
+
+		return ret;
+	}
+
+	return -ENOMEM;
+}
+
+/**
+ * vc4_allocate_bin_bo() - allocates the memory that will be used for
+ * tile binning.
+ *
+ * The binner has a limitation that the addresses in the tile state
+ * buffer that point into the tile alloc buffer or binner overflow
+ * memory only have 28 bits (256MB), and the top 4 on the bus for
+ * tile alloc references end up coming from the tile state buffer's
+ * address.
+ *
+ * To work around this, we allocate a single large buffer while V3D is
+ * in use, make sure that it has the top 4 bits constant across its
+ * entire extent, and then put the tile state, tile alloc, and binner
+ * overflow memory inside that buffer.
+ *
+ * This creates a limitation where we may not be able to execute a job
+ * if it doesn't fit within the buffer that we allocated up front.
+ * However, it turns out that 16MB is "enough for anybody", and
+ * real-world applications run into allocation failures from the
+ * overall CMA pool before they make scenes complicated enough to run
+ * out of bin space.
+ */
+int
+vc4_allocate_bin_bo(struct drm_device *drm)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(drm);
+	struct vc4_v3d *v3d = vc4->v3d;
+	uint32_t size = 16 * 1024 * 1024;
+	int ret = 0;
+	struct list_head list;
+
+	/* We may need to try allocating more than once to get a BO
+	 * that doesn't cross 256MB.  Track the ones we've allocated
+	 * that failed so far, so that we can free them when we've got
+	 * one that succeeded (if we freed them right away, our next
+	 * allocation would probably be the same chunk of memory).
+	 */
+	INIT_LIST_HEAD(&list);
+
+	while (true) {
+		struct vc4_bo *bo = vc4_bo_create(drm, size, true);
+
+		if (IS_ERR(bo)) {
+			ret = PTR_ERR(bo);
+
+			dev_err(&v3d->pdev->dev,
+				"Failed to allocate memory for tile binning: "
+				"%d. You may need to enable CMA or give it "
+				"more memory.",
+				ret);
+			break;
+		}
+
+		/* Check if this BO won't trigger the addressing bug. */
+		if ((bo->base.paddr & 0xf0000000) ==
+		    ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
+			vc4->bin_bo = bo;
+
+			/* Set up for allocating 512KB chunks of
+			 * binner memory.  The biggest allocation we
+			 * need to do is for the initial tile alloc +
+			 * tile state buffer.  We can render to a
+			 * maximum of ((2048*2048) / (32*32) = 4096
+			 * tiles in a frame (until we do floating
+			 * point rendering, at which point it would be
+			 * 8192).  Tile state is 48b/tile (rounded to
+			 * a page), and tile alloc is 32b/tile
+			 * (rounded to a page), plus a page of extra,
+			 * for a total of 320kb for our worst-case.
+			 * We choose 512kb so that it divides evenly
+			 * into our 16MB, and the rest of the 512kb
+			 * will be used as storage for the overflow
+			 * from the initial 32b CL per bin.
+			 */
+			vc4->bin_alloc_size = 512 * 1024;
+			vc4->bin_alloc_used = 0;
+			vc4->bin_alloc_overflow = 0;
+			WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
+				     bo->base.base.size / vc4->bin_alloc_size);
+
+			break;
+		}
+
+		/* Put it on the list to free later, and try again. */
+		list_add(&bo->unref_head, &list);
+	}
+
+	/* Free all the BOs we allocated but didn't choose. */
+	while (!list_empty(&list)) {
+		struct vc4_bo *bo = list_last_entry(&list,
+						    struct vc4_bo, unref_head);
+
+		list_del(&bo->unref_head);
+		drm_gem_object_put_unlocked(&bo->base.base);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_PM
 static int vc4_v3d_runtime_suspend(struct device *dev)
 {
@@ -164,6 +302,9 @@ static int vc4_v3d_runtime_suspend(struct device *dev)
 
 	vc4_irq_uninstall(vc4->dev);
 
+	drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+	vc4->bin_bo = NULL;
+
 	return 0;
 }
 
@@ -171,6 +312,11 @@ static int vc4_v3d_runtime_resume(struct device *dev)
 {
 	struct vc4_v3d *v3d = dev_get_drvdata(dev);
 	struct vc4_dev *vc4 = v3d->vc4;
+	int ret;
+
+	ret = vc4_allocate_bin_bo(vc4->dev);
+	if (ret)
+		return ret;
 
 	vc4_v3d_init_hw(vc4->dev);
 	vc4_irq_postinstall(vc4->dev);
@@ -208,6 +354,10 @@ static int vc4_v3d_bind(struct device *dev, struct device *master, void *data)
 		return -EINVAL;
 	}
 
+	ret = vc4_allocate_bin_bo(drm);
+	if (ret)
+		return ret;
+
 	/* Reset the binner overflow address/size at setup, to be sure
 	 * we don't reuse an old one.
 	 */
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index da6f1e138e8d..3de8f11595c0 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -348,10 +348,11 @@ static int
 validate_tile_binning_config(VALIDATE_ARGS)
 {
 	struct drm_device *dev = exec->exec_bo->base.dev;
-	struct vc4_bo *tile_bo;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	uint8_t flags;
-	uint32_t tile_state_size, tile_alloc_size;
-	uint32_t tile_count;
+	uint32_t tile_state_size;
+	uint32_t tile_count, bin_addr;
+	int bin_slot;
 
 	if (exec->found_tile_binning_mode_config_packet) {
 		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
@@ -377,13 +378,28 @@ validate_tile_binning_config(VALIDATE_ARGS)
 		return -EINVAL;
 	}
 
+	bin_slot = vc4_v3d_get_bin_slot(vc4);
+	if (bin_slot < 0) {
+		if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
+			DRM_ERROR("Failed to allocate binner memory: %d\n",
+				  bin_slot);
+		}
+		return bin_slot;
+	}
+
+	/* The slot we allocated will only be used by this job, and is
+	 * free when the job completes rendering.
+	 */
+	exec->bin_slots |= BIT(bin_slot);
+	bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
+
 	/* The tile state data array is 48 bytes per tile, and we put it at
 	 * the start of a BO containing both it and the tile alloc.
 	 */
 	tile_state_size = 48 * tile_count;
 
 	/* Since the tile alloc array will follow us, align. */
-	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
+	exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
 
 	*(uint8_t *)(validated + 14) =
 		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
@@ -394,35 +410,13 @@ validate_tile_binning_config(VALIDATE_ARGS)
 		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
 			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
 
-	/* Initial block size. */
-	tile_alloc_size = 32 * tile_count;
-
-	/*
-	 * The initial allocation gets rounded to the next 256 bytes before
-	 * the hardware starts fulfilling further allocations.
-	 */
-	tile_alloc_size = roundup(tile_alloc_size, 256);
-
-	/* Add space for the extra allocations.  This is what gets used first,
-	 * before overflow memory.  It must have at least 4096 bytes, but we
-	 * want to avoid overflow memory usage if possible.
-	 */
-	tile_alloc_size += 1024 * 1024;
-
-	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
-				true);
-	exec->tile_bo = &tile_bo->base;
-	if (IS_ERR(exec->tile_bo))
-		return PTR_ERR(exec->tile_bo);
-	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
-
 	/* tile alloc address. */
-	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
-					exec->tile_alloc_offset);
+	*(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
 	/* tile alloc size. */
-	*(uint32_t *)(validated + 4) = tile_alloc_size;
+	*(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
+					exec->tile_alloc_offset);
 	/* tile state address. */
-	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
+	*(uint32_t *)(validated + 8) = bin_addr;
 
 	return 0;
 }