drm/i915/tgl: Suspend pre-parser across GTT invalidations

Before we execute a batch, we must first issue any and all TLB invalidations so that batch picks up the new page table entries. Tigerlake's preparser is weakening our post-sync CS_STALL inside the invalidate pipe-control and allowing the loading of the batch buffer before we have setup its page table (and so it loads the wrong page and executes indefinitely). The igt_cs_tlb indicates that this issue can only be observed on rcs, even though the preparser is common to all engines. Alternatively, we could do TLB shootdown via mmio on updating the GTT. By inserting the pre-parser disable inside EMIT_INVALIDATE, we will also accidentally fixup execution that writes into subsequent batches, such as gem_exec_whisper and even relocations performed on the GPU. We should be careful not to allow this disable to become baked into the uABI! The issue is that if userspace relies on our disabling of the HW optimisation, when we are ready to enable that optimisation, userspace will then be broken... Testcase: igt/i915_selftests/live_gtt/igt_cs_tlb Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=111753 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190919151811.9526-1-chris@chris-wilson.co.uk
author: Chris Wilson <chris@chris-wilson.co.uk> 2019-09-19 16:18:11 +0100
committer: Chris Wilson <chris@chris-wilson.co.uk> 2019-09-20 09:47:52 +0100
commit: c45e788d95b470e9f68fabe1f3cb44beb5dd7840 (patch)
tree: 2081a90d41d917b680514bc3f40fb75c1591ed9e /drivers/gpu/drm
parent: 2d20411e25a3bf3d2914a2219f47ed48dc57aed5 (diff)
1 files changed, 73 insertions, 1 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index a99166a2d2eb..49ed9230a2cb 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2807,6 +2807,78 @@ static int gen11_emit_flush_render(struct i915_request *request,
 	return 0;
 }
 
+static u32 preparser_disable(bool state)
+{
+	return MI_ARB_CHECK | 1 << 8 | state;
+}
+
+static int gen12_emit_flush_render(struct i915_request *request,
+				   u32 mode)
+{
+	const u32 scratch_addr =
+		intel_gt_scratch_offset(request->engine->gt,
+					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
+
+	if (mode & EMIT_FLUSH) {
+		u32 flags = 0;
+		u32 *cs;
+
+		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
+		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+		flags |= PIPE_CONTROL_FLUSH_ENABLE;
+
+		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+		flags |= PIPE_CONTROL_QW_WRITE;
+
+		flags |= PIPE_CONTROL_CS_STALL;
+
+		cs = intel_ring_begin(request, 6);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
+		intel_ring_advance(request, cs);
+	}
+
+	if (mode & EMIT_INVALIDATE) {
+		u32 flags = 0;
+		u32 *cs;
+
+		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_TLB_INVALIDATE;
+		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+
+		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
+		flags |= PIPE_CONTROL_QW_WRITE;
+
+		flags |= PIPE_CONTROL_CS_STALL;
+
+		cs = intel_ring_begin(request, 8);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		/*
+		 * Prevent the pre-parser from skipping past the TLB
+		 * invalidate and loading a stale page for the batch
+		 * buffer / request payload.
+		 */
+		*cs++ = preparser_disable(true);
+
+		cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
+
+		*cs++ = preparser_disable(false);
+		intel_ring_advance(request, cs);
+	}
+
+	return 0;
+}
+
 /*
  * Reserve space for 2 NOOPs at the end of each request to be
  * used as a workaround for not being allowed to do lite
@@ -3072,7 +3144,7 @@ static void rcs_submission_override(struct intel_engine_cs *engine)
 {
 	switch (INTEL_GEN(engine->i915)) {
 	case 12:
-		engine->emit_flush = gen11_emit_flush_render;
+		engine->emit_flush = gen12_emit_flush_render;
 		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
 		break;
 	case 11:
author	Chris Wilson <chris@chris-wilson.co.uk>	2019-09-19 16:18:11 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	2019-09-20 09:47:52 +0100
commit	c45e788d95b470e9f68fabe1f3cb44beb5dd7840 (patch)
tree	2081a90d41d917b680514bc3f40fb75c1591ed9e /drivers/gpu/drm
parent	2d20411e25a3bf3d2914a2219f47ed48dc57aed5 (diff)