Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_lrc.c')
-rw-r--r--	drivers/gpu/drm/i915/gt/intel_lrc.c | 597
1 file changed, 489 insertions(+), 108 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index a27bac0a4bfb..e8927ad49142 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -3,19 +3,49 @@
  * Copyright © 2014 Intel Corporation
  */
 
+#include <drm/drm_print.h>
+
 #include "gem/i915_gem_lmem.h"
 
 #include "gen8_engine_cs.h"
 #include "i915_drv.h"
 #include "i915_perf.h"
+#include "i915_reg.h"
+#include "intel_context.h"
 #include "intel_engine.h"
+#include "intel_engine_regs.h"
 #include "intel_gpu_commands.h"
 #include "intel_gt.h"
+#include "intel_gt_regs.h"
 #include "intel_lrc.h"
 #include "intel_lrc_reg.h"
 #include "intel_ring.h"
 #include "shmem_utils.h"
 
+/*
+ * The per-platform tables are u8-encoded in @data. Decode @data and set the
+ * addresses' offset and commands in @regs. The following encoding is used
+ * for each byte. There are 2 steps: decoding commands and decoding addresses.
+ *
+ * Commands:
+ * [7]: create NOPs - number of NOPs are set in lower bits
+ * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
+ *      MI_LRI_FORCE_POSTED
+ * [5:0]: Number of NOPs or registers to set values to in case of
+ *        MI_LOAD_REGISTER_IMM
+ *
+ * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
+ * number of registers. They are set by using the REG/REG16 macros: the former
+ * is used for offsets smaller than 0x200 while the latter is for values bigger
+ * than that. Those macros already set all the bits documented below correctly:
+ *
+ * [7]: When a register offset needs more than 6 bits, use additional bytes, to
+ *      follow, for the lower bits
+ * [6:0]: Register offset, without considering the engine base.
+ *
+ * This function only tweaks the commands and register offsets. Values are not
+ * filled out.
+ */
 static void set_offsets(u32 *regs,
 			const u8 *data,
 			const struct intel_engine_cs *engine,
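[Editor's note: the offset tables that follow are built with a small set of encoder macros implementing exactly the byte format documented above. A minimal sketch of how they could look (hedged: the in-tree definitions may differ in detail, e.g. they also carry compile-time bounds checks via BUILD_BUG_ON_ZERO):

	/* Sketch of the table encoders implied by the format above. */
	#define NOP(x)	(BIT(7) | (x))		/* bit 7 set: emit x MI_NOOPs */
	#define POSTED	BIT(0)
	#define LRI(count, flags) \
		(((flags) << 6) | (count))	/* bit 6: MI_LRI_FORCE_POSTED */
	#define REG(x)	((x) >> 2)		/* offsets < 0x200 fit one byte */
	#define REG16(x) \
		(((x) >> 9) | BIT(7)),		/* bit 7: low bits follow... */ \
		(((x) >> 2) & 0x7f)		/* ...in a second byte */
	#define END	0

Offsets are stored in dword units ((x) >> 2), so the one-byte REG form covers engine-relative offsets up to 0x1fc, and the two-byte REG16 form covers the rest.]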
@@ -70,7 +100,7 @@ static void set_offsets(u32 *regs,
 	if (close) {
 		/* Close the batch; used mainly by live_lrc_layout() */
 		*regs = MI_BATCH_BUFFER_END;
-		if (GRAPHICS_VER(engine->i915) >= 10)
+		if (GRAPHICS_VER(engine->i915) >= 11)
 			*regs |= BIT(0);
 	}
 }
@@ -226,6 +256,40 @@ static const u8 gen12_xcs_offsets[] = {
 	END
 };
 
+static const u8 dg2_xcs_offsets[] = {
+	NOP(1),
+	LRI(15, POSTED),
+	REG16(0x244),
+	REG(0x034),
+	REG(0x030),
+	REG(0x038),
+	REG(0x03c),
+	REG(0x168),
+	REG(0x140),
+	REG(0x110),
+	REG(0x1c0),
+	REG(0x1c4),
+	REG(0x1c8),
+	REG(0x180),
+	REG16(0x2b4),
+	REG(0x120),
+	REG(0x124),
+
+	NOP(1),
+	LRI(9, POSTED),
+	REG16(0x3a8),
+	REG16(0x28c),
+	REG16(0x288),
+	REG16(0x284),
+	REG16(0x280),
+	REG16(0x27c),
+	REG16(0x278),
+	REG16(0x274),
+	REG16(0x270),
+
+	END
+};
+
 static const u8 gen8_rcs_offsets[] = {
 	NOP(1),
 	LRI(14, POSTED),
@@ -484,6 +548,92 @@ static const u8 gen12_rcs_offsets[] = {
 	END
 };
 
+static const u8 dg2_rcs_offsets[] = {
+	NOP(1),
+	LRI(15, POSTED),
+	REG16(0x244),
+	REG(0x034),
+	REG(0x030),
+	REG(0x038),
+	REG(0x03c),
+	REG(0x168),
+	REG(0x140),
+	REG(0x110),
+	REG(0x1c0),
+	REG(0x1c4),
+	REG(0x1c8),
+	REG(0x180),
+	REG16(0x2b4),
+	REG(0x120),
+	REG(0x124),
+
+	NOP(1),
+	LRI(9, POSTED),
+	REG16(0x3a8),
+	REG16(0x28c),
+	REG16(0x288),
+	REG16(0x284),
+	REG16(0x280),
+	REG16(0x27c),
+	REG16(0x278),
+	REG16(0x274),
+	REG16(0x270),
+
+	LRI(3, POSTED),
+	REG(0x1b0),
+	REG16(0x5a8),
+	REG16(0x5ac),
+
+	NOP(6),
+	LRI(1, 0),
+	REG(0x0c8),
+
+	END
+};
+
+static const u8 mtl_rcs_offsets[] = {
+	NOP(1),
+	LRI(15, POSTED),
+	REG16(0x244),
+	REG(0x034),
+	REG(0x030),
+	REG(0x038),
+	REG(0x03c),
+	REG(0x168),
+	REG(0x140),
+	REG(0x110),
+	REG(0x1c0),
+	REG(0x1c4),
+	REG(0x1c8),
+	REG(0x180),
+	REG16(0x2b4),
+	REG(0x120),
+	REG(0x124),
+
+	NOP(1),
+	LRI(9, POSTED),
+	REG16(0x3a8),
+	REG16(0x28c),
+	REG16(0x288),
+	REG16(0x284),
+	REG16(0x280),
+	REG16(0x27c),
+	REG16(0x278),
+	REG16(0x274),
+	REG16(0x270),
+
+	NOP(2),
+	LRI(2, POSTED),
+	REG16(0x5a8),
+	REG16(0x5ac),
+
+	NOP(6),
+	LRI(1, 0),
+	REG(0x0c8),
+
+	END
+};
+
 #undef END
 #undef REG16
 #undef REG
@@ -501,8 +651,12 @@ static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
 		   !intel_engine_has_relative_mmio(engine));
 
-	if (engine->class == RENDER_CLASS) {
-		if (GRAPHICS_VER(engine->i915) >= 12)
+	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
+		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
+			return mtl_rcs_offsets;
+		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+			return dg2_rcs_offsets;
+		else if (GRAPHICS_VER(engine->i915) >= 12)
 			return gen12_rcs_offsets;
 		else if (GRAPHICS_VER(engine->i915) >= 11)
 			return gen11_rcs_offsets;
@@ -511,7 +665,9 @@ static const u8 *reg_offsets(const struct intel_engine_cs *engine)
 		else
 			return gen8_rcs_offsets;
 	} else {
-		if (GRAPHICS_VER(engine->i915) >= 12)
+		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+			return dg2_xcs_offsets;
+		else if (GRAPHICS_VER(engine->i915) >= 12)
 			return gen12_xcs_offsets;
 		else if (GRAPHICS_VER(engine->i915) >= 9)
 			return gen9_xcs_offsets;
@@ -522,7 +678,9 @@
 
 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 {
-	if (GRAPHICS_VER(engine->i915) >= 12)
+	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+		return 0x70;
+	else if (GRAPHICS_VER(engine->i915) >= 12)
 		return 0x60;
 	else if (GRAPHICS_VER(engine->i915) >= 9)
 		return 0x54;
@@ -532,9 +690,26 @@ static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 	return -1;
 }
 
+static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
+{
+	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+		return 0x80;
+	else if (GRAPHICS_VER(engine->i915) >= 12)
+		return 0x70;
+	else if (GRAPHICS_VER(engine->i915) >= 9)
+		return 0x64;
+	else if (GRAPHICS_VER(engine->i915) >= 8 &&
+		 engine->class == RENDER_CLASS)
+		return 0xc4;
+	else
+		return -1;
+}
+
 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 {
-	if (GRAPHICS_VER(engine->i915) >= 12)
+	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+		return 0x84;
+	else if (GRAPHICS_VER(engine->i915) >= 12)
 		return 0x74;
 	else if (GRAPHICS_VER(engine->i915) >= 9)
 		return 0x68;
@@ -578,10 +753,15 @@ static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 
 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 {
-	if (engine->class != RENDER_CLASS)
+	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
+		/*
+		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
+		 * simply to match the RCS context image layout.
+		 */
+		return 0xc6;
+	else if (engine->class != RENDER_CLASS)
 		return -1;
-
-	if (GRAPHICS_VER(engine->i915) >= 12)
+	else if (GRAPHICS_VER(engine->i915) >= 12)
 		return 0xb6;
 	else if (GRAPHICS_VER(engine->i915) >= 11)
 		return 0xaa;
@@ -592,21 +772,30 @@ static u32
 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 {
-	switch (GRAPHICS_VER(engine->i915)) {
-	default:
-		MISSING_CASE(GRAPHICS_VER(engine->i915));
-		fallthrough;
-	case 12:
+	if (GRAPHICS_VER(engine->i915) >= 12)
 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-	case 11:
+	else if (GRAPHICS_VER(engine->i915) >= 11)
 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-	case 10:
-		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-	case 9:
+	else if (GRAPHICS_VER(engine->i915) >= 9)
 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-	case 8:
+	else if (GRAPHICS_VER(engine->i915) >= 8)
 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
-	}
+
+	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
+
+	return 0;
+}
+
+static void
+lrc_setup_bb_per_ctx(u32 *regs,
+		     const struct intel_engine_cs *engine,
+		     u32 ctx_bb_ggtt_addr)
+{
+	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
+	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
+		ctx_bb_ggtt_addr |
+		PER_CTX_BB_FORCE |
+		PER_CTX_BB_VALID;
 }
 
 static void
@@ -626,12 +815,36 @@ lrc_setup_indirect_ctx(u32 *regs,
 		lrc_ring_indirect_offset_default(engine) << 6;
 }
 
+static bool ctx_needs_runalone(const struct intel_context *ce)
+{
+	struct i915_gem_context *gem_ctx;
+	bool ctx_is_protected = false;
+
+	/*
+	 * Wa_14019159160 - Case 2.
+	 * On some platforms, protected contexts require setting
+	 * the LRC run-alone bit or else the encryption/decryption will not happen.
+	 * NOTE: Case 2 only applies to PXP use-case of said workaround.
+	 */
+	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
+	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
+		rcu_read_lock();
+		gem_ctx = rcu_dereference(ce->gem_context);
+		if (gem_ctx)
+			ctx_is_protected = gem_ctx->uses_protected_content;
+		rcu_read_unlock();
+	}
+
+	return ctx_is_protected;
+}
+
 static void init_common_regs(u32 * const regs,
 			     const struct intel_context *ce,
 			     const struct intel_engine_cs *engine,
 			     bool inhibit)
 {
 	u32 ctl;
+	int loc;
 
 	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
 	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
@@ -640,9 +853,16 @@ static void init_common_regs(u32 * const regs,
 	if (GRAPHICS_VER(engine->i915) < 11)
 		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
 					   CTX_CTRL_RS_CTX_ENABLE);
+	/* Wa_14019159160 - Case 2.*/
+	if (ctx_needs_runalone(ce))
+		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
 	regs[CTX_CONTEXT_CONTROL] = ctl;
 
-	regs[CTX_TIMESTAMP] = ce->runtime.last;
+	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
+
+	loc = lrc_ring_bb_offset(engine);
+	if (loc != -1)
+		regs[loc + 1] = 0;
 }
 
 static void init_wa_bb_regs(u32 * const regs,
@@ -768,6 +988,31 @@ check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
 		 engine->name);
 }
 
+static u32 context_wa_bb_offset(const struct intel_context *ce)
+{
+	return PAGE_SIZE * ce->wa_bb_page;
+}
+
+/*
+ * per_ctx below determines which WABB section is used.
+ * When true, the function returns the location of the
+ * PER_CTX_BB. When false, the function returns the
+ * location of the INDIRECT_CTX.
+ */
+static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
+{
+	void *ptr;
+
+	GEM_BUG_ON(!ce->wa_bb_page);
+
+	ptr = ce->lrc_reg_state;
+	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
+	ptr += context_wa_bb_offset(ce);
+	ptr += per_ctx ? PAGE_SIZE : 0;
+
+	return ptr;
+}
+
 void lrc_init_state(struct intel_context *ce,
 		    struct intel_engine_cs *engine,
 		    void *state)
@@ -776,9 +1021,8 @@ void lrc_init_state(struct intel_context *ce,
 
 	set_redzone(state, engine);
 
-	if (engine->default_state) {
-		shmem_read(engine->default_state, 0,
-			   state, engine->context_size);
+	if (ce->default_state) {
+		shmem_read(ce->default_state, 0, state, engine->context_size);
 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
 		inhibit = false;
 	}
@@ -786,6 +1030,10 @@ void lrc_init_state(struct intel_context *ce,
 	/* Clear the ppHWSP (inc. per-context counters) */
 	memset(state, 0, PAGE_SIZE);
 
+	/* Clear the indirect wa and storage */
+	if (ce->wa_bb_page)
+		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
+
 	/*
 	 * The second page of the context object contains some registers which
 	 * must be set up prior to the first execution.
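[Editor's note: taken together, context_wa_bb_offset() and context_wabb() pin down where the two workaround batches live inside the context object, and lrc_indirect_bb() in the next hunk translates that into a GGTT address. A worked sketch of the resulting placement (illustrative variable names, not from the patch):

	/* Illustrative: GGTT placement of the two WA batch buffers. */
	u32 indirect_bb = i915_ggtt_offset(ce->state) +
			  PAGE_SIZE * ce->wa_bb_page;	/* INDIRECT_CTX page */
	u32 per_ctx_bb  = indirect_bb + PAGE_SIZE;	/* PER_CTX_BB, next page */
]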
@@ -793,6 +1041,35 @@
 	 */
 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
 }
 
+u32 lrc_indirect_bb(const struct intel_context *ce)
+{
+	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
+}
+
+static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
+{
+	/* If predication is active, this will be noop'ed */
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
+	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
+	*cs++ = 0;
+	*cs++ = 0; /* No predication */
+
+	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
+	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
+	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
+
+	/* Instructions are no longer predicated (disabled), we can proceed */
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
+	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
+	*cs++ = 0;
+	*cs++ = 1; /* enable predication before the next BB */
+
+	*cs++ = MI_BATCH_BUFFER_END;
+	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
+
+	return cs;
+}
+
 static struct i915_vma *
 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
 {
@@ -805,16 +1082,32 @@
 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 
-	if (GRAPHICS_VER(engine->i915) == 12) {
+	if (GRAPHICS_VER(engine->i915) >= 12) {
 		ce->wa_bb_page = context_size / PAGE_SIZE;
-		context_size += PAGE_SIZE;
+		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
+		context_size += PAGE_SIZE * 2;
 	}
 
-	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
-	if (IS_ERR(obj))
+	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
+		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
+		context_size += PARENT_SCRATCH_SIZE;
+	}
+
+	obj = i915_gem_object_create_lmem(engine->i915, context_size,
+					  I915_BO_ALLOC_PM_VOLATILE);
+	if (IS_ERR(obj)) {
 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
-	if (IS_ERR(obj))
-		return ERR_CAST(obj);
+		if (IS_ERR(obj))
+			return ERR_CAST(obj);
+
+		/*
+		 * Wa_22016122933: For Media version 13.0, all Media GT shared
+		 * memory needs to be mapped as WC on CPU side and UC (PAT
+		 * index 2) on GPU side.
+		 */
+		if (intel_gt_needs_wa_22016122933(engine->gt))
+			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
+	}
 
 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
 	if (IS_ERR(vma)) {
@@ -841,11 +1134,14 @@ int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
 
 	GEM_BUG_ON(ce->state);
 
+	if (!intel_context_has_own_state(ce))
+		ce->default_state = engine->default_state;
+
 	vma = __lrc_alloc_state(ce, engine);
 	if (IS_ERR(vma))
 		return PTR_ERR(vma);
 
-	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
+	ring = intel_engine_create_ring(engine, ce->ring_size);
 	if (IS_ERR(ring)) {
 		err = PTR_ERR(ring);
 		goto err_vma;
@@ -903,9 +1199,9 @@ lrc_pre_pin(struct intel_context *ce,
 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
 
 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
-					 i915_coherent_map_type(ce->engine->i915,
-								ce->state->obj,
-								false) |
+					 intel_gt_coherent_map_type(ce->engine->gt,
+								    ce->state->obj,
+								    false) |
 					 I915_MAP_OVERRIDE);
 
 	return PTR_ERR_OR_ZERO(*vaddr);
@@ -927,6 +1223,10 @@ lrc_pin(struct intel_context *ce,
 
 void lrc_unpin(struct intel_context *ce)
 {
+	if (unlikely(ce->parallel.last_rq)) {
+		i915_request_put(ce->parallel.last_rq);
+		ce->parallel.last_rq = NULL;
+	}
 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
 		      ce->engine);
 }
@@ -1022,6 +1322,32 @@ gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
 	return cs;
 }
 
+/*
+ * The bspec's tuning guide asks us to program a vertical watermark value of
+ * 0x3FF. However this register is not saved/restored properly by the
+ * hardware, so we're required to apply the desired value via INDIRECT_CTX
+ * batch buffer to ensure the value takes effect properly. All other bits
+ * in this register should remain at 0 (the hardware default).
+ */
+static u32 *
+dg2_emit_draw_watermark_setting(u32 *cs)
+{
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
+	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
+
+	return cs;
+}
+
+static u32 *
+gen12_invalidate_state_cache(u32 *cs)
+{
+	*cs++ = MI_LOAD_REGISTER_IMM(1);
+	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
+	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
+	return cs;
+}
+
 static u32 *
 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
 {
@@ -1029,6 +1355,22 @@ gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
 	cs = gen12_emit_cmd_buf_wa(ce, cs);
 	cs = gen12_emit_restore_scratch(ce, cs);
 
+	/* Wa_16013000631:dg2 */
+	if (IS_DG2_G11(ce->engine->i915))
+		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
+
+	cs = gen12_emit_aux_table_inv(ce->engine, cs);
+
+	/* Wa_18022495364 */
+	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
+		cs = gen12_invalidate_state_cache(cs);
+
+	/* Wa_16014892111 */
+	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
+	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
+	    IS_DG2(ce->engine->i915))
+		cs = dg2_emit_draw_watermark_setting(cs);
+
 	return cs;
 }
 
@@ -1038,25 +1380,87 @@ gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
 	cs = gen12_emit_timestamp_wa(ce, cs);
 	cs = gen12_emit_restore_scratch(ce, cs);
 
+	/* Wa_16013000631:dg2 */
+	if (IS_DG2_G11(ce->engine->i915))
+		if (ce->engine->class == COMPUTE_CLASS)
+			cs = gen8_emit_pipe_control(cs,
+						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
+						    0);
+
+	return gen12_emit_aux_table_inv(ce->engine, cs);
+}
+
+static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
+{
+	struct intel_gt *gt = ce->engine->gt;
+	int mocs = gt->mocs.uc_index << 1;
+
+	/**
+	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
+	 * main copy engine arbitration into round robin mode. We
+	 * additionally need to submit the following WABB blt command
+	 * to produce 4 subblits with each subblit generating 0 byte
+	 * write requests as WABB:
+	 *
+	 * XY_FASTCOLOR_BLT
+	 *  BG0    -> 5100000E
+	 *  BG1    -> 0000003F (Dest pitch)
+	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
+	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
+	 *  BG4    -> scratch
+	 *  BG5    -> scratch
+	 *  BG6-12 -> 00000000
+	 *  BG13   -> 20004004 (Surf. Width = 2, Surf. Height = 5)
+	 *  BG14   -> 00000010 (Qpitch = 4)
+	 *  BG15   -> 00000000
+	 */
+	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
+	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
+	*cs++ = 0;
+	*cs++ = 4 << 16 | 1;
+	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
+	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = 0x20004004;
+	*cs++ = 0x10;
+	*cs++ = 0;
+
+	return cs;
+}
+
-static u32 context_wa_bb_offset(const struct intel_context *ce)
+static u32 *
+xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
 {
-	return PAGE_SIZE * ce->wa_bb_page;
+	/* Wa_16018031267, Wa_16018063123 */
+	if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
+		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
+
+	return cs;
 }
 
-static u32 *context_indirect_bb(const struct intel_context *ce)
+static void
+setup_per_ctx_bb(const struct intel_context *ce,
+		 const struct intel_engine_cs *engine,
+		 u32 *(*emit)(const struct intel_context *, u32 *))
 {
-	void *ptr;
+	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
+	u32 * const start = context_wabb(ce, true);
+	u32 *cs;
 
-	GEM_BUG_ON(!ce->wa_bb_page);
+	cs = emit(ce, start);
 
-	ptr = ce->lrc_reg_state;
-	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
-	ptr += context_wa_bb_offset(ce);
+	/* PER_CTX_BB must manually terminate */
+	*cs++ = MI_BATCH_BUFFER_END;
 
-	return ptr;
+	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
+	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
+			     lrc_indirect_bb(ce) + PAGE_SIZE);
 }
 
 static void
@@ -1064,7 +1468,7 @@ setup_indirect_ctx_bb(const struct intel_context *ce,
 		      const struct intel_engine_cs *engine,
 		      u32 *(*emit)(const struct intel_context *, u32 *))
 {
-	u32 * const start = context_indirect_bb(ce);
+	u32 * const start = context_wabb(ce, false);
 	u32 *cs;
 
 	cs = emit(ce, start);
@@ -1072,9 +1476,11 @@ setup_indirect_ctx_bb(const struct intel_context *ce,
 	while ((unsigned long)cs % CACHELINE_BYTES)
 		*cs++ = MI_NOOP;
 
+	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
+	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
+
 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
-			       i915_ggtt_offset(ce->state) +
-			       context_wa_bb_offset(ce),
+			       lrc_indirect_bb(ce),
 			       (cs - start) * sizeof(*cs));
 }
 
@@ -1101,6 +1507,14 @@ setup_indirect_ctx_bb(const struct intel_context *ce,
  * bits 55-60: SW counter
  * bits 61-63: engine class
  *
+ * On Xe_HP, the upper dword of the descriptor has a new format:
+ *
+ * bits 32-37: virtual function number
+ * bit 38: mbz, reserved for use by hardware
+ * bits 39-54: SW context ID
+ * bits 55-57: reserved
+ * bits 58-63: SW counter
+ *
  * engine info, SW context ID and SW counter need to form a unique number
  * (Context ID) per lrc.
  */
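[Editor's note: as a worked example of the Xe_HP layout just described, the upper dword could be packed like this. Hedged sketch only; the function and parameter names are illustrative, not the driver's:

	/* Sketch: packing the Xe_HP upper dword of the context descriptor. */
	static u32 xehp_upper_dword(u32 vf, u32 sw_ctx_id, u32 sw_counter)
	{
		return (vf & 0x3f) |			/* bits 32-37: VF number */
							/* bit 38: mbz */
		       ((sw_ctx_id & 0xffff) << 7) |	/* bits 39-54: SW context ID */
							/* bits 55-57: reserved */
		       ((sw_counter & 0x3f) << 26);	/* bits 58-63: SW counter */
	}
]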
@@ -1153,6 +1567,7 @@ u32 lrc_update_regs(const struct intel_context *ce,
 		/* Mutually exclusive wrt to global indirect bb */
 		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
 		setup_indirect_ctx_bb(ce, engine, fn);
+		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
 	}
 
 	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
@@ -1387,40 +1802,6 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 	return batch;
 }
 
-static u32 *
-gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
-	int i;
-
-	/*
-	 * WaPipeControlBefore3DStateSamplePattern: cnl
-	 *
-	 * Ensure the engine is idle prior to programming a
-	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
-	 */
-	batch = gen8_emit_pipe_control(batch,
-				       PIPE_CONTROL_CS_STALL,
-				       0);
-	/*
-	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
-	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
-	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
-	 * confusing. Since gen8_emit_pipe_control() already advances the
-	 * batch by 6 dwords, we advance the other 10 here, completing a
-	 * cacheline. It's not clear if the workaround requires this padding
-	 * before other commands, or if it's just the regular padding we would
-	 * already have for the workaround bb, so leave it here for now.
-	 */
-	for (i = 0; i < 10; i++)
-		*batch++ = MI_NOOP;
-
-	/* Pad to end of cacheline */
-	while ((unsigned long)batch % CACHELINE_BYTES)
-		*batch++ = MI_NOOP;
-
-	return batch;
-}
-
 #define CTX_WA_BB_SIZE (PAGE_SIZE)
 
 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
@@ -1466,28 +1847,16 @@ void lrc_init_wa_ctx(struct intel_engine_cs *engine)
 	unsigned int i;
 	int err;
 
-	if (engine->class != RENDER_CLASS)
+	if (GRAPHICS_VER(engine->i915) >= 11 ||
+	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
 		return;
 
-	switch (GRAPHICS_VER(engine->i915)) {
-	case 12:
-	case 11:
-		return;
-	case 10:
-		wa_bb_fn[0] = gen10_init_indirectctx_bb;
-		wa_bb_fn[1] = NULL;
-		break;
-	case 9:
+	if (GRAPHICS_VER(engine->i915) == 9) {
 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
 		wa_bb_fn[1] = NULL;
-		break;
-	case 8:
+	} else if (GRAPHICS_VER(engine->i915) == 8) {
 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
 		wa_bb_fn[1] = NULL;
-		break;
-	default:
-		MISSING_CASE(GRAPHICS_VER(engine->i915));
-		return;
 	}
 
 	err = lrc_create_wa_ctx(engine);
@@ -1565,35 +1934,47 @@ err:
 	}
 }
 
-static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
+static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
 {
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
-	ce->runtime.num_underflow++;
-	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
+	stats->runtime.num_underflow++;
+	stats->runtime.max_underflow =
+		max_t(u32, stats->runtime.max_underflow, -dt);
 #endif
 }
 
+static u32 lrc_get_runtime(const struct intel_context *ce)
+{
+	/*
+	 * We can use either ppHWSP[16] which is recorded before the context
+	 * switch (and so excludes the cost of context switches) or use the
+	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
+	 */
+	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
+}
+
 void lrc_update_runtime(struct intel_context *ce)
 {
+	struct intel_context_stats *stats = &ce->stats;
 	u32 old;
 	s32 dt;
 
-	if (intel_context_is_barrier(ce))
+	old = stats->runtime.last;
+	stats->runtime.last = lrc_get_runtime(ce);
+	dt = stats->runtime.last - old;
+	if (!dt)
 		return;
 
-	old = ce->runtime.last;
-	ce->runtime.last = lrc_get_runtime(ce);
-	dt = ce->runtime.last - old;
-
 	if (unlikely(dt < 0)) {
 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
-			 old, ce->runtime.last, dt);
-		st_update_runtime_underflow(ce, dt);
+			 old, stats->runtime.last, dt);
+		st_runtime_underflow(stats, dt);
 		return;
 	}
 
-	ewma_runtime_add(&ce->runtime.avg, dt);
-	ce->runtime.total += dt;
+	ewma_runtime_add(&stats->runtime.avg, dt);
+	stats->runtime.total += dt;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
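[Editor's note: one subtlety in lrc_update_runtime() above is that CTX_TIMESTAMP is a free-running 32-bit counter, so the delta is computed in u32 (which stays correct across wraparound) and only then inspected as s32 to catch genuine underflows. A small illustration with made-up values:

	u32 old = 0xffffff00;	/* last sampled CTX_TIMESTAMP */
	u32 cur = 0x00000020;	/* counter has since wrapped */
	s32 dt  = cur - old;	/* 0x120 = 288 ticks: wrap handled correctly */
	/* A genuinely stale sample would instead surface as dt < 0 (underflow). */
]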
