summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/gt/intel_lrc.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_lrc.c')
-rw-r--r--drivers/gpu/drm/i915/gt/intel_lrc.c597
1 files changed, 489 insertions, 108 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index a27bac0a4bfb..e8927ad49142 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -3,19 +3,49 @@
* Copyright © 2014 Intel Corporation
*/
+#include <drm/drm_print.h>
+
#include "gem/i915_gem_lmem.h"
#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
+#include "i915_reg.h"
+#include "intel_context.h"
#include "intel_engine.h"
+#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
+#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"
+/*
+ * The per-platform tables are u8-encoded in @data. Decode @data and set the
+ * addresses' offset and commands in @regs. The following encoding is used
+ * for each byte. There are 2 steps: decoding commands and decoding addresses.
+ *
+ * Commands:
+ * [7]: create NOPs - number of NOPs are set in lower bits
+ * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
+ * MI_LRI_FORCE_POSTED
+ * [5:0]: Number of NOPs or registers to set values to in case of
+ * MI_LOAD_REGISTER_IMM
+ *
+ * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
+ * number of registers. They are set by using the REG/REG16 macros: the former
+ * is used for offsets smaller than 0x200 while the latter is for values bigger
+ * than that. Those macros already set all the bits documented below correctly:
+ *
+ * [7]: When a register offset needs more than 6 bits, use additional bytes, to
+ * follow, for the lower bits
+ * [6:0]: Register offset, without considering the engine base.
+ *
+ * This function only tweaks the commands and register offsets. Values are not
+ * filled out.
+ */
static void set_offsets(u32 *regs,
const u8 *data,
const struct intel_engine_cs *engine,
@@ -70,7 +100,7 @@ static void set_offsets(u32 *regs,
if (close) {
/* Close the batch; used mainly by live_lrc_layout() */
*regs = MI_BATCH_BUFFER_END;
- if (GRAPHICS_VER(engine->i915) >= 10)
+ if (GRAPHICS_VER(engine->i915) >= 11)
*regs |= BIT(0);
}
}
@@ -226,6 +256,40 @@ static const u8 gen12_xcs_offsets[] = {
END
};
+static const u8 dg2_xcs_offsets[] = {
+ NOP(1),
+ LRI(15, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+ REG(0x180),
+ REG16(0x2b4),
+ REG(0x120),
+ REG(0x124),
+
+ NOP(1),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ END
+};
+
static const u8 gen8_rcs_offsets[] = {
NOP(1),
LRI(14, POSTED),
@@ -484,6 +548,92 @@ static const u8 gen12_rcs_offsets[] = {
END
};
+static const u8 dg2_rcs_offsets[] = {
+ NOP(1),
+ LRI(15, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+ REG(0x180),
+ REG16(0x2b4),
+ REG(0x120),
+ REG(0x124),
+
+ NOP(1),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ LRI(3, POSTED),
+ REG(0x1b0),
+ REG16(0x5a8),
+ REG16(0x5ac),
+
+ NOP(6),
+ LRI(1, 0),
+ REG(0x0c8),
+
+ END
+};
+
+static const u8 mtl_rcs_offsets[] = {
+ NOP(1),
+ LRI(15, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+ REG(0x180),
+ REG16(0x2b4),
+ REG(0x120),
+ REG(0x124),
+
+ NOP(1),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ NOP(2),
+ LRI(2, POSTED),
+ REG16(0x5a8),
+ REG16(0x5ac),
+
+ NOP(6),
+ LRI(1, 0),
+ REG(0x0c8),
+
+ END
+};
+
#undef END
#undef REG16
#undef REG
@@ -501,8 +651,12 @@ static const u8 *reg_offsets(const struct intel_engine_cs *engine)
GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
!intel_engine_has_relative_mmio(engine));
- if (engine->class == RENDER_CLASS) {
- if (GRAPHICS_VER(engine->i915) >= 12)
+ if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
+ if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
+ return mtl_rcs_offsets;
+ else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+ return dg2_rcs_offsets;
+ else if (GRAPHICS_VER(engine->i915) >= 12)
return gen12_rcs_offsets;
else if (GRAPHICS_VER(engine->i915) >= 11)
return gen11_rcs_offsets;
@@ -511,7 +665,9 @@ static const u8 *reg_offsets(const struct intel_engine_cs *engine)
else
return gen8_rcs_offsets;
} else {
- if (GRAPHICS_VER(engine->i915) >= 12)
+ if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+ return dg2_xcs_offsets;
+ else if (GRAPHICS_VER(engine->i915) >= 12)
return gen12_xcs_offsets;
else if (GRAPHICS_VER(engine->i915) >= 9)
return gen9_xcs_offsets;
@@ -522,7 +678,9 @@ static const u8 *reg_offsets(const struct intel_engine_cs *engine)
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
- if (GRAPHICS_VER(engine->i915) >= 12)
+ if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+ return 0x70;
+ else if (GRAPHICS_VER(engine->i915) >= 12)
return 0x60;
else if (GRAPHICS_VER(engine->i915) >= 9)
return 0x54;
@@ -532,9 +690,26 @@ static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
return -1;
}
+static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
+{
+ if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+ return 0x80;
+ else if (GRAPHICS_VER(engine->i915) >= 12)
+ return 0x70;
+ else if (GRAPHICS_VER(engine->i915) >= 9)
+ return 0x64;
+ else if (GRAPHICS_VER(engine->i915) >= 8 &&
+ engine->class == RENDER_CLASS)
+ return 0xc4;
+ else
+ return -1;
+}
+
static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
- if (GRAPHICS_VER(engine->i915) >= 12)
+ if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+ return 0x84;
+ else if (GRAPHICS_VER(engine->i915) >= 12)
return 0x74;
else if (GRAPHICS_VER(engine->i915) >= 9)
return 0x68;
@@ -578,10 +753,15 @@ static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
- if (engine->class != RENDER_CLASS)
+ if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+ /*
+ * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
+ * simply to match the RCS context image layout.
+ */
+ return 0xc6;
+ else if (engine->class != RENDER_CLASS)
return -1;
-
- if (GRAPHICS_VER(engine->i915) >= 12)
+ else if (GRAPHICS_VER(engine->i915) >= 12)
return 0xb6;
else if (GRAPHICS_VER(engine->i915) >= 11)
return 0xaa;
@@ -592,21 +772,30 @@ static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
- switch (GRAPHICS_VER(engine->i915)) {
- default:
- MISSING_CASE(GRAPHICS_VER(engine->i915));
- fallthrough;
- case 12:
+ if (GRAPHICS_VER(engine->i915) >= 12)
return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- case 11:
+ else if (GRAPHICS_VER(engine->i915) >= 11)
return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- case 10:
- return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- case 9:
+ else if (GRAPHICS_VER(engine->i915) >= 9)
return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- case 8:
+ else if (GRAPHICS_VER(engine->i915) >= 8)
return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- }
+
+ GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
+
+ return 0;
+}
+
+static void
+lrc_setup_bb_per_ctx(u32 *regs,
+ const struct intel_engine_cs *engine,
+ u32 ctx_bb_ggtt_addr)
+{
+ GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
+ regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
+ ctx_bb_ggtt_addr |
+ PER_CTX_BB_FORCE |
+ PER_CTX_BB_VALID;
}
static void
@@ -626,12 +815,36 @@ lrc_setup_indirect_ctx(u32 *regs,
lrc_ring_indirect_offset_default(engine) << 6;
}
+static bool ctx_needs_runalone(const struct intel_context *ce)
+{
+ struct i915_gem_context *gem_ctx;
+ bool ctx_is_protected = false;
+
+ /*
+ * Wa_14019159160 - Case 2.
+ * On some platforms, protected contexts require setting
+ * the LRC run-alone bit or else the encryption/decryption will not happen.
+ * NOTE: Case 2 only applies to PXP use-case of said workaround.
+ */
+ if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
+ (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
+ rcu_read_lock();
+ gem_ctx = rcu_dereference(ce->gem_context);
+ if (gem_ctx)
+ ctx_is_protected = gem_ctx->uses_protected_content;
+ rcu_read_unlock();
+ }
+
+ return ctx_is_protected;
+}
+
static void init_common_regs(u32 * const regs,
const struct intel_context *ce,
const struct intel_engine_cs *engine,
bool inhibit)
{
u32 ctl;
+ int loc;
ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
@@ -640,9 +853,16 @@ static void init_common_regs(u32 * const regs,
if (GRAPHICS_VER(engine->i915) < 11)
ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
CTX_CTRL_RS_CTX_ENABLE);
+ /* Wa_14019159160 - Case 2.*/
+ if (ctx_needs_runalone(ce))
+ ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
regs[CTX_CONTEXT_CONTROL] = ctl;
- regs[CTX_TIMESTAMP] = ce->runtime.last;
+ regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
+
+ loc = lrc_ring_bb_offset(engine);
+ if (loc != -1)
+ regs[loc + 1] = 0;
}
static void init_wa_bb_regs(u32 * const regs,
@@ -768,6 +988,31 @@ check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
engine->name);
}
+static u32 context_wa_bb_offset(const struct intel_context *ce)
+{
+ return PAGE_SIZE * ce->wa_bb_page;
+}
+
+/*
+ * per_ctx below determines which WABB section is used.
+ * When true, the function returns the location of the
+ * PER_CTX_BB. When false, the function returns the
+ * location of the INDIRECT_CTX.
+ */
+static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
+{
+ void *ptr;
+
+ GEM_BUG_ON(!ce->wa_bb_page);
+
+ ptr = ce->lrc_reg_state;
+ ptr -= LRC_STATE_OFFSET; /* back to start of context image */
+ ptr += context_wa_bb_offset(ce);
+ ptr += per_ctx ? PAGE_SIZE : 0;
+
+ return ptr;
+}
+
void lrc_init_state(struct intel_context *ce,
struct intel_engine_cs *engine,
void *state)
@@ -776,9 +1021,8 @@ void lrc_init_state(struct intel_context *ce,
set_redzone(state, engine);
- if (engine->default_state) {
- shmem_read(engine->default_state, 0,
- state, engine->context_size);
+ if (ce->default_state) {
+ shmem_read(ce->default_state, 0, state, engine->context_size);
__set_bit(CONTEXT_VALID_BIT, &ce->flags);
inhibit = false;
}
@@ -786,6 +1030,10 @@ void lrc_init_state(struct intel_context *ce,
/* Clear the ppHWSP (inc. per-context counters) */
memset(state, 0, PAGE_SIZE);
+ /* Clear the indirect wa and storage */
+ if (ce->wa_bb_page)
+ memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
+
/*
* The second page of the context object contains some registers which
* must be set up prior to the first execution.
@@ -793,6 +1041,35 @@ void lrc_init_state(struct intel_context *ce,
__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}
+u32 lrc_indirect_bb(const struct intel_context *ce)
+{
+ return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
+}
+
+static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
+{
+ /* If predication is active, this will be noop'ed */
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
+ *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
+ *cs++ = 0;
+ *cs++ = 0; /* No predication */
+
+ /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
+ *cs++ = MI_BATCH_BUFFER_END | BIT(15);
+ *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
+
+ /* Instructions are no longer predicated (disabled), we can proceed */
+ *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
+ *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
+ *cs++ = 0;
+ *cs++ = 1; /* enable predication before the next BB */
+
+ *cs++ = MI_BATCH_BUFFER_END;
+ GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
+
+ return cs;
+}
+
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
@@ -805,16 +1082,32 @@ __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
context_size += I915_GTT_PAGE_SIZE; /* for redzone */
- if (GRAPHICS_VER(engine->i915) == 12) {
+ if (GRAPHICS_VER(engine->i915) >= 12) {
ce->wa_bb_page = context_size / PAGE_SIZE;
- context_size += PAGE_SIZE;
+ /* INDIRECT_CTX and PER_CTX_BB need separate pages. */
+ context_size += PAGE_SIZE * 2;
}
- obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
- if (IS_ERR(obj))
+ if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
+ ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
+ context_size += PARENT_SCRATCH_SIZE;
+ }
+
+ obj = i915_gem_object_create_lmem(engine->i915, context_size,
+ I915_BO_ALLOC_PM_VOLATILE);
+ if (IS_ERR(obj)) {
obj = i915_gem_object_create_shmem(engine->i915, context_size);
- if (IS_ERR(obj))
- return ERR_CAST(obj);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+
+ /*
+ * Wa_22016122933: For Media version 13.0, all Media GT shared
+ * memory needs to be mapped as WC on CPU side and UC (PAT
+ * index 2) on GPU side.
+ */
+ if (intel_gt_needs_wa_22016122933(engine->gt))
+ i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
+ }
vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
if (IS_ERR(vma)) {
@@ -841,11 +1134,14 @@ int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
GEM_BUG_ON(ce->state);
+ if (!intel_context_has_own_state(ce))
+ ce->default_state = engine->default_state;
+
vma = __lrc_alloc_state(ce, engine);
if (IS_ERR(vma))
return PTR_ERR(vma);
- ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
+ ring = intel_engine_create_ring(engine, ce->ring_size);
if (IS_ERR(ring)) {
err = PTR_ERR(ring);
goto err_vma;
@@ -903,9 +1199,9 @@ lrc_pre_pin(struct intel_context *ce,
GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
*vaddr = i915_gem_object_pin_map(ce->state->obj,
- i915_coherent_map_type(ce->engine->i915,
- ce->state->obj,
- false) |
+ intel_gt_coherent_map_type(ce->engine->gt,
+ ce->state->obj,
+ false) |
I915_MAP_OVERRIDE);
return PTR_ERR_OR_ZERO(*vaddr);
@@ -927,6 +1223,10 @@ lrc_pin(struct intel_context *ce,
void lrc_unpin(struct intel_context *ce)
{
+ if (unlikely(ce->parallel.last_rq)) {
+ i915_request_put(ce->parallel.last_rq);
+ ce->parallel.last_rq = NULL;
+ }
check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
ce->engine);
}
@@ -1022,6 +1322,32 @@ gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
return cs;
}
+/*
+ * The bspec's tuning guide asks us to program a vertical watermark value of
+ * 0x3FF. However this register is not saved/restored properly by the
+ * hardware, so we're required to apply the desired value via INDIRECT_CTX
+ * batch buffer to ensure the value takes effect properly. All other bits
+ * in this register should remain at 0 (the hardware default).
+ */
+static u32 *
+dg2_emit_draw_watermark_setting(u32 *cs)
+{
+ *cs++ = MI_LOAD_REGISTER_IMM(1);
+ *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
+ *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
+
+ return cs;
+}
+
+static u32 *
+gen12_invalidate_state_cache(u32 *cs)
+{
+ *cs++ = MI_LOAD_REGISTER_IMM(1);
+ *cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
+ *cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
+ return cs;
+}
+
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
@@ -1029,6 +1355,22 @@ gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
cs = gen12_emit_cmd_buf_wa(ce, cs);
cs = gen12_emit_restore_scratch(ce, cs);
+ /* Wa_16013000631:dg2 */
+ if (IS_DG2_G11(ce->engine->i915))
+ cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
+
+ cs = gen12_emit_aux_table_inv(ce->engine, cs);
+
+ /* Wa_18022495364 */
+ if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
+ cs = gen12_invalidate_state_cache(cs);
+
+ /* Wa_16014892111 */
+ if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
+ IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
+ IS_DG2(ce->engine->i915))
+ cs = dg2_emit_draw_watermark_setting(cs);
+
return cs;
}
@@ -1038,25 +1380,87 @@ gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
cs = gen12_emit_timestamp_wa(ce, cs);
cs = gen12_emit_restore_scratch(ce, cs);
+ /* Wa_16013000631:dg2 */
+ if (IS_DG2_G11(ce->engine->i915))
+ if (ce->engine->class == COMPUTE_CLASS)
+ cs = gen8_emit_pipe_control(cs,
+ PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
+ 0);
+
+ return gen12_emit_aux_table_inv(ce->engine, cs);
+}
+
+static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
+{
+ struct intel_gt *gt = ce->engine->gt;
+ int mocs = gt->mocs.uc_index << 1;
+
+ /**
+ * Wa_16018031267 / Wa_16018063123 requires that SW forces the
+ * main copy engine arbitration into round robin mode. We
+ * additionally need to submit the following WABB blt command
+ * to produce 4 subblits with each subblit generating 0 byte
+ * write requests as WABB:
+ *
+ * XY_FASTCOLOR_BLT
+ * BG0 -> 5100000E
+ * BG1 -> 0000003F (Dest pitch)
+ * BG2 -> 00000000 (X1, Y1) = (0, 0)
+ * BG3 -> 00040001 (X2, Y2) = (1, 4)
+ * BG4 -> scratch
+ * BG5 -> scratch
+ * BG6-12 -> 00000000
+ * BG13 -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
+ * BG14 -> 00000010 (Qpitch = 4)
+ * BG15 -> 00000000
+ */
+ *cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
+ *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
+ *cs++ = 0;
+ *cs++ = 4 << 16 | 1;
+ *cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
+ *cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = 0x20004004;
+ *cs++ = 0x10;
+ *cs++ = 0;
+
return cs;
}
-static u32 context_wa_bb_offset(const struct intel_context *ce)
+static u32 *
+xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
{
- return PAGE_SIZE * ce->wa_bb_page;
+ /* Wa_16018031267, Wa_16018063123 */
+ if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
+ cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
+
+ return cs;
}
-static u32 *context_indirect_bb(const struct intel_context *ce)
+static void
+setup_per_ctx_bb(const struct intel_context *ce,
+ const struct intel_engine_cs *engine,
+ u32 *(*emit)(const struct intel_context *, u32 *))
{
- void *ptr;
+ /* Place PER_CTX_BB on next page after INDIRECT_CTX */
+ u32 * const start = context_wabb(ce, true);
+ u32 *cs;
- GEM_BUG_ON(!ce->wa_bb_page);
+ cs = emit(ce, start);
- ptr = ce->lrc_reg_state;
- ptr -= LRC_STATE_OFFSET; /* back to start of context image */
- ptr += context_wa_bb_offset(ce);
+ /* PER_CTX_BB must manually terminate */
+ *cs++ = MI_BATCH_BUFFER_END;
- return ptr;
+ GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
+ lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
+ lrc_indirect_bb(ce) + PAGE_SIZE);
}
static void
@@ -1064,7 +1468,7 @@ setup_indirect_ctx_bb(const struct intel_context *ce,
const struct intel_engine_cs *engine,
u32 *(*emit)(const struct intel_context *, u32 *))
{
- u32 * const start = context_indirect_bb(ce);
+ u32 * const start = context_wabb(ce, false);
u32 *cs;
cs = emit(ce, start);
@@ -1072,9 +1476,11 @@ setup_indirect_ctx_bb(const struct intel_context *ce,
while ((unsigned long)cs % CACHELINE_BYTES)
*cs++ = MI_NOOP;
+ GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
+ setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
+
lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
- i915_ggtt_offset(ce->state) +
- context_wa_bb_offset(ce),
+ lrc_indirect_bb(ce),
(cs - start) * sizeof(*cs));
}
@@ -1101,6 +1507,14 @@ setup_indirect_ctx_bb(const struct intel_context *ce,
* bits 55-60: SW counter
* bits 61-63: engine class
*
+ * On Xe_HP, the upper dword of the descriptor has a new format:
+ *
+ * bits 32-37: virtual function number
+ * bit 38: mbz, reserved for use by hardware
+ * bits 39-54: SW context ID
+ * bits 55-57: reserved
+ * bits 58-63: SW counter
+ *
* engine info, SW context ID and SW counter need to form a unique number
* (Context ID) per lrc.
*/
@@ -1153,6 +1567,7 @@ u32 lrc_update_regs(const struct intel_context *ce,
/* Mutually exclusive wrt to global indirect bb */
GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
setup_indirect_ctx_bb(ce, engine, fn);
+ setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
}
return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
@@ -1387,40 +1802,6 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
return batch;
}
-static u32 *
-gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- int i;
-
- /*
- * WaPipeControlBefore3DStateSamplePattern: cnl
- *
- * Ensure the engine is idle prior to programming a
- * 3DSTATE_SAMPLE_PATTERN during a context restore.
- */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_CS_STALL,
- 0);
- /*
- * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
- * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
- * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
- * confusing. Since gen8_emit_pipe_control() already advances the
- * batch by 6 dwords, we advance the other 10 here, completing a
- * cacheline. It's not clear if the workaround requires this padding
- * before other commands, or if it's just the regular padding we would
- * already have for the workaround bb, so leave it here for now.
- */
- for (i = 0; i < 10; i++)
- *batch++ = MI_NOOP;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
#define CTX_WA_BB_SIZE (PAGE_SIZE)
static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
@@ -1466,28 +1847,16 @@ void lrc_init_wa_ctx(struct intel_engine_cs *engine)
unsigned int i;
int err;
- if (engine->class != RENDER_CLASS)
+ if (GRAPHICS_VER(engine->i915) >= 11 ||
+ !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
return;
- switch (GRAPHICS_VER(engine->i915)) {
- case 12:
- case 11:
- return;
- case 10:
- wa_bb_fn[0] = gen10_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- case 9:
+ if (GRAPHICS_VER(engine->i915) == 9) {
wa_bb_fn[0] = gen9_init_indirectctx_bb;
wa_bb_fn[1] = NULL;
- break;
- case 8:
+ } else if (GRAPHICS_VER(engine->i915) == 8) {
wa_bb_fn[0] = gen8_init_indirectctx_bb;
wa_bb_fn[1] = NULL;
- break;
- default:
- MISSING_CASE(GRAPHICS_VER(engine->i915));
- return;
}
err = lrc_create_wa_ctx(engine);
@@ -1565,35 +1934,47 @@ err:
}
}
-static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
+static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
- ce->runtime.num_underflow++;
- ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
+ stats->runtime.num_underflow++;
+ stats->runtime.max_underflow =
+ max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}
+static u32 lrc_get_runtime(const struct intel_context *ce)
+{
+ /*
+ * We can use either ppHWSP[16] which is recorded before the context
+ * switch (and so excludes the cost of context switches) or use the
+ * value from the context image itself, which is saved/restored earlier
+ * and so includes the cost of the save.
+ */
+ return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
+}
+
void lrc_update_runtime(struct intel_context *ce)
{
+ struct intel_context_stats *stats = &ce->stats;
u32 old;
s32 dt;
- if (intel_context_is_barrier(ce))
+ old = stats->runtime.last;
+ stats->runtime.last = lrc_get_runtime(ce);
+ dt = stats->runtime.last - old;
+ if (!dt)
return;
- old = ce->runtime.last;
- ce->runtime.last = lrc_get_runtime(ce);
- dt = ce->runtime.last - old;
-
if (unlikely(dt < 0)) {
CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
- old, ce->runtime.last, dt);
- st_update_runtime_underflow(ce, dt);
+ old, stats->runtime.last, dt);
+ st_runtime_underflow(stats, dt);
return;
}
- ewma_runtime_add(&ce->runtime.avg, dt);
- ce->runtime.total += dt;
+ ewma_runtime_add(&stats->runtime.avg, dt);
+ stats->runtime.total += dt;
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)