summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/gt
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/gt')
-rw-r--r--drivers/gpu/drm/i915/gt/intel_context.c12
-rw-r--r--drivers/gpu/drm/i915/gt/intel_engine_cs.c4
-rw-r--r--drivers/gpu/drm/i915/gt/intel_lrc.c71
-rw-r--r--drivers/gpu/drm/i915/gt/intel_ring.c4
-rw-r--r--drivers/gpu/drm/i915/gt/intel_workarounds.c241
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_hangcheck.c25
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_lrc.c185
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_mocs.c18
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_ring.c110
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_rps.c77
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_timeline.c15
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_workarounds.c2
-rw-r--r--drivers/gpu/drm/i915/gt/shaders/README46
-rw-r--r--drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm119
-rw-r--r--drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm117
15 files changed, 902 insertions, 144 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index e4aece20bc80..52db2bde44a3 100644
--- a/drivers/gpu/drm/i915/gt/intel_context.c
+++ b/drivers/gpu/drm/i915/gt/intel_context.c
@@ -204,25 +204,25 @@ static int __ring_active(struct intel_ring *ring)
{
int err;
- err = i915_active_acquire(&ring->vma->active);
+ err = intel_ring_pin(ring);
if (err)
return err;
- err = intel_ring_pin(ring);
+ err = i915_active_acquire(&ring->vma->active);
if (err)
- goto err_active;
+ goto err_pin;
return 0;
-err_active:
- i915_active_release(&ring->vma->active);
+err_pin:
+ intel_ring_unpin(ring);
return err;
}
static void __ring_retire(struct intel_ring *ring)
{
- intel_ring_unpin(ring);
i915_active_release(&ring->vma->active);
+ intel_ring_unpin(ring);
}
__i915_active_call
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index da5b61085257..8691eb61e185 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -646,7 +646,7 @@ static int engine_setup_common(struct intel_engine_cs *engine)
struct measure_breadcrumb {
struct i915_request rq;
struct intel_ring ring;
- u32 cs[1024];
+ u32 cs[2048];
};
static int measure_breadcrumb_dw(struct intel_context *ce)
@@ -668,6 +668,8 @@ static int measure_breadcrumb_dw(struct intel_context *ce)
frame->ring.vaddr = frame->cs;
frame->ring.size = sizeof(frame->cs);
+ frame->ring.wrap =
+ BITS_PER_TYPE(frame->ring.size) - ilog2(frame->ring.size);
frame->ring.effective_size = frame->ring.size;
intel_ring_update_space(&frame->ring);
frame->rq.ring = &frame->ring;
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 87e6c5bdd2dc..cb07e1d2a353 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1134,6 +1134,13 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
list_move(&rq->sched.link, pl);
set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
+ /* Check in case we rollback so far we wrap [size/2] */
+ if (intel_ring_direction(rq->ring,
+ intel_ring_wrap(rq->ring,
+ rq->tail),
+ rq->ring->tail) > 0)
+ rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
+
active = rq;
} else {
struct intel_engine_cs *owner = rq->context->engine;
@@ -1498,8 +1505,9 @@ static u64 execlists_update_context(struct i915_request *rq)
* HW has a tendency to ignore us rewinding the TAIL to the end of
* an earlier request.
*/
+ GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
+ prev = rq->ring->tail;
tail = intel_ring_set_tail(rq->ring, rq->tail);
- prev = ce->lrc_reg_state[CTX_RING_TAIL];
if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
desc |= CTX_DESC_FORCE_RESTORE;
ce->lrc_reg_state[CTX_RING_TAIL] = tail;
@@ -1895,7 +1903,8 @@ static void defer_active(struct intel_engine_cs *engine)
static bool
need_timeslice(const struct intel_engine_cs *engine,
- const struct i915_request *rq)
+ const struct i915_request *rq,
+ const struct rb_node *rb)
{
int hint;
@@ -1903,9 +1912,28 @@ need_timeslice(const struct intel_engine_cs *engine,
return false;
hint = engine->execlists.queue_priority_hint;
+
+ if (rb) {
+ const struct virtual_engine *ve =
+ rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+ const struct intel_engine_cs *inflight =
+ intel_context_inflight(&ve->context);
+
+ if (!inflight || inflight == engine) {
+ struct i915_request *next;
+
+ rcu_read_lock();
+ next = READ_ONCE(ve->request);
+ if (next)
+ hint = max(hint, rq_prio(next));
+ rcu_read_unlock();
+ }
+ }
+
if (!list_is_last(&rq->sched.link, &engine->active.requests))
hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
+ GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
return hint >= effective_prio(rq);
}
@@ -1977,10 +2005,9 @@ static void set_timeslice(struct intel_engine_cs *engine)
set_timer_ms(&engine->execlists.timer, duration);
}
-static void start_timeslice(struct intel_engine_cs *engine)
+static void start_timeslice(struct intel_engine_cs *engine, int prio)
{
struct intel_engine_execlists *execlists = &engine->execlists;
- const int prio = queue_prio(execlists);
unsigned long duration;
if (!intel_engine_has_timeslices(engine))
@@ -2140,7 +2167,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
__unwind_incomplete_requests(engine);
last = NULL;
- } else if (need_timeslice(engine, last) &&
+ } else if (need_timeslice(engine, last, rb) &&
timeslice_expired(execlists, last)) {
if (i915_request_completed(last)) {
tasklet_hi_schedule(&execlists->tasklet);
@@ -2188,7 +2215,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* Even if ELSP[1] is occupied and not worthy
* of timeslices, our queue might be.
*/
- start_timeslice(engine);
+ start_timeslice(engine, queue_prio(execlists));
return;
}
}
@@ -2223,7 +2250,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
if (last && !can_merge_rq(last, rq)) {
spin_unlock(&ve->base.active.lock);
- start_timeslice(engine);
+ start_timeslice(engine, rq_prio(rq));
return; /* leave this for another sibling */
}
@@ -4739,6 +4766,14 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode)
return 0;
}
+static void assert_request_valid(struct i915_request *rq)
+{
+ struct intel_ring *ring __maybe_unused = rq->ring;
+
+ /* Can we unwind this request without appearing to go forwards? */
+ GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
+}
+
/*
* Reserve space for 2 NOOPs at the end of each request to be
* used as a workaround for not being allowed to do lite
@@ -4751,6 +4786,9 @@ static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
*cs++ = MI_NOOP;
request->wa_tail = intel_ring_offset(request, cs);
+ /* Check that entire request is less than half the ring */
+ assert_request_valid(request);
+
return cs;
}
@@ -5358,13 +5396,8 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
* typically be the first we inspect for submission.
*/
swp = prandom_u32_max(ve->num_siblings);
- if (!swp)
- return;
-
- swap(ve->siblings[swp], ve->siblings[0]);
- if (!intel_engine_has_relative_mmio(ve->siblings[0]))
- virtual_update_register_offsets(ve->context.lrc_reg_state,
- ve->siblings[0]);
+ if (swp)
+ swap(ve->siblings[swp], ve->siblings[0]);
}
static int virtual_context_alloc(struct intel_context *ce)
@@ -5377,15 +5410,9 @@ static int virtual_context_alloc(struct intel_context *ce)
static int virtual_context_pin(struct intel_context *ce)
{
struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
- int err;
/* Note: we must use a real engine class for setting up reg state */
- err = __execlists_context_pin(ce, ve->siblings[0]);
- if (err)
- return err;
-
- virtual_engine_initial_hint(ve);
- return 0;
+ return __execlists_context_pin(ce, ve->siblings[0]);
}
static void virtual_context_enter(struct intel_context *ce)
@@ -5650,6 +5677,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings,
intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
intel_engine_init_breadcrumbs(&ve->base);
intel_engine_init_execlists(&ve->base);
+ ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */
ve->base.cops = &virtual_context_ops;
ve->base.request_alloc = execlists_request_alloc;
@@ -5731,6 +5759,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings,
ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
+ virtual_engine_initial_hint(ve);
return &ve->context;
err_put:
diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c
index 8cda1b7e17ba..bdb324167ef3 100644
--- a/drivers/gpu/drm/i915/gt/intel_ring.c
+++ b/drivers/gpu/drm/i915/gt/intel_ring.c
@@ -315,3 +315,7 @@ int intel_ring_cacheline_align(struct i915_request *rq)
GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1));
return 0;
}
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_ring.c"
+#endif
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 90a2b9e399b0..85d2bef51524 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -179,6 +179,12 @@ wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
}
static void
+wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
+{
+ wa_write_masked_or(wal, reg, clr, 0);
+}
+
+static void
wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
{
wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val);
@@ -687,6 +693,227 @@ int intel_engine_emit_ctx_wa(struct i915_request *rq)
}
static void
+gen4_gt_workarounds_init(struct drm_i915_private *i915,
+ struct i915_wa_list *wal)
+{
+ /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
+ wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
+}
+
+static void
+g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+ gen4_gt_workarounds_init(i915, wal);
+
+ /* WaDisableRenderCachePipelinedFlush:g4x,ilk */
+ wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
+}
+
+static void
+ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+ g4x_gt_workarounds_init(i915, wal);
+
+ wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
+}
+
+static void
+snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+ /* WaDisableHiZPlanesWhenMSAAEnabled:snb */
+ wa_masked_en(wal,
+ _3D_CHICKEN,
+ _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
+
+ /* WaDisable_RenderCache_OperationalFlush:snb */
+ wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
+
+ /*
+ * BSpec recommends 8x4 when MSAA is used,
+ * however in practice 16x4 seems fastest.
+ *
+ * Note that PS/WM thread counts depend on the WIZ hashing
+ * disable bit, which we don't touch here, but it's good
+ * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+ */
+ wa_add(wal,
+ GEN6_GT_MODE, 0,
+ _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+ GEN6_WIZ_HASHING_16x4);
+
+ wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB);
+
+ wa_masked_en(wal,
+ _3D_CHICKEN3,
+ /* WaStripsFansDisableFastClipPerformanceFix:snb */
+ _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
+ /*
+ * Bspec says:
+ * "This bit must be set if 3DSTATE_CLIP clip mode is set
+ * to normal and 3DSTATE_SF number of SF output attributes
+ * is more than 16."
+ */
+ _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
+}
+
+static void
+ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+ /* WaDisableEarlyCull:ivb */
+ wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
+
+ /* WaDisablePSDDualDispatchEnable:ivb */
+ if (IS_IVB_GT1(i915))
+ wa_masked_en(wal,
+ GEN7_HALF_SLICE_CHICKEN1,
+ GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+
+ /* WaDisable_RenderCache_OperationalFlush:ivb */
+ wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
+
+ /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
+ wa_masked_dis(wal,
+ GEN7_COMMON_SLICE_CHICKEN1,
+ GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
+
+ /* WaApplyL3ControlAndL3ChickenMode:ivb */
+ wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
+ wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
+
+ /* WaForceL3Serialization:ivb */
+ wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
+
+ /*
+ * WaVSThreadDispatchOverride:ivb,vlv
+ *
+ * This actually overrides the dispatch
+ * mode for all thread types.
+ */
+ wa_write_masked_or(wal, GEN7_FF_THREAD_MODE,
+ GEN7_FF_SCHED_MASK,
+ GEN7_FF_TS_SCHED_HW |
+ GEN7_FF_VS_SCHED_HW |
+ GEN7_FF_DS_SCHED_HW);
+
+ if (0) { /* causes HiZ corruption on ivb:gt1 */
+ /* enable HiZ Raw Stall Optimization */
+ wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
+ }
+
+ /* WaDisable4x2SubspanOptimization:ivb */
+ wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+ /*
+ * BSpec recommends 8x4 when MSAA is used,
+ * however in practice 16x4 seems fastest.
+ *
+ * Note that PS/WM thread counts depend on the WIZ hashing
+ * disable bit, which we don't touch here, but it's good
+ * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+ */
+ wa_add(wal, GEN7_GT_MODE, 0,
+ _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+ GEN6_WIZ_HASHING_16x4);
+}
+
+static void
+vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+ /* WaDisableEarlyCull:vlv */
+ wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
+
+ /* WaPsdDispatchEnable:vlv */
+ /* WaDisablePSDDualDispatchEnable:vlv */
+ wa_masked_en(wal,
+ GEN7_HALF_SLICE_CHICKEN1,
+ GEN7_MAX_PS_THREAD_DEP |
+ GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+
+ /* WaDisable_RenderCache_OperationalFlush:vlv */
+ wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
+
+ /* WaForceL3Serialization:vlv */
+ wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
+
+ /*
+ * WaVSThreadDispatchOverride:ivb,vlv
+ *
+ * This actually overrides the dispatch
+ * mode for all thread types.
+ */
+ wa_write_masked_or(wal,
+ GEN7_FF_THREAD_MODE,
+ GEN7_FF_SCHED_MASK,
+ GEN7_FF_TS_SCHED_HW |
+ GEN7_FF_VS_SCHED_HW |
+ GEN7_FF_DS_SCHED_HW);
+
+ /*
+ * BSpec says this must be set, even though
+ * WaDisable4x2SubspanOptimization isn't listed for VLV.
+ */
+ wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+ /*
+ * BSpec recommends 8x4 when MSAA is used,
+ * however in practice 16x4 seems fastest.
+ *
+ * Note that PS/WM thread counts depend on the WIZ hashing
+ * disable bit, which we don't touch here, but it's good
+ * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+ */
+ wa_add(wal, GEN7_GT_MODE, 0,
+ _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+ GEN6_WIZ_HASHING_16x4);
+
+ /*
+ * WaIncreaseL3CreditsForVLVB0:vlv
+ * This is the hardware default actually.
+ */
+ wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
+}
+
+static void
+hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+ /* L3 caching of data atomics doesn't work -- disable it. */
+ wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
+
+ wa_add(wal,
+ HSW_ROW_CHICKEN3, 0,
+ _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
+ 0 /* XXX does this reg exist? */);
+
+ /* WaVSRefCountFullforceMissDisable:hsw */
+ wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
+
+ wa_masked_dis(wal,
+ CACHE_MODE_0_GEN7,
+ /* WaDisable_RenderCache_OperationalFlush:hsw */
+ RC_OP_FLUSH_ENABLE |
+ /* enable HiZ Raw Stall Optimization */
+ HIZ_RAW_STALL_OPT_DISABLE);
+
+ /* WaDisable4x2SubspanOptimization:hsw */
+ wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+ /*
+ * BSpec recommends 8x4 when MSAA is used,
+ * however in practice 16x4 seems fastest.
+ *
+ * Note that PS/WM thread counts depend on the WIZ hashing
+ * disable bit, which we don't touch here, but it's good
+ * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+ */
+ wa_add(wal, GEN7_GT_MODE, 0,
+ _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+ GEN6_WIZ_HASHING_16x4);
+
+ /* WaSampleCChickenBitEnable:hsw */
+ wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
+}
+
+static void
gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
{
/* WaDisableKillLogic:bxt,skl,kbl */
@@ -963,6 +1190,20 @@ gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal)
bxt_gt_workarounds_init(i915, wal);
else if (IS_SKYLAKE(i915))
skl_gt_workarounds_init(i915, wal);
+ else if (IS_HASWELL(i915))
+ hsw_gt_workarounds_init(i915, wal);
+ else if (IS_VALLEYVIEW(i915))
+ vlv_gt_workarounds_init(i915, wal);
+ else if (IS_IVYBRIDGE(i915))
+ ivb_gt_workarounds_init(i915, wal);
+ else if (IS_GEN(i915, 6))
+ snb_gt_workarounds_init(i915, wal);
+ else if (IS_GEN(i915, 5))
+ ilk_gt_workarounds_init(i915, wal);
+ else if (IS_G4X(i915))
+ g4x_gt_workarounds_init(i915, wal);
+ else if (IS_GEN(i915, 4))
+ gen4_gt_workarounds_init(i915, wal);
else if (INTEL_GEN(i915) <= 8)
return;
else
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 2b2efff6e19d..4aa4cc917d8b 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -310,22 +310,20 @@ static bool wait_until_running(struct hang *h, struct i915_request *rq)
1000));
}
-static void engine_heartbeat_disable(struct intel_engine_cs *engine,
- unsigned long *saved)
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
{
- *saved = engine->props.heartbeat_interval_ms;
engine->props.heartbeat_interval_ms = 0;
intel_engine_pm_get(engine);
intel_engine_park_heartbeat(engine);
}
-static void engine_heartbeat_enable(struct intel_engine_cs *engine,
- unsigned long saved)
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
{
intel_engine_pm_put(engine);
- engine->props.heartbeat_interval_ms = saved;
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
}
static int igt_hang_sanitycheck(void *arg)
@@ -473,7 +471,6 @@ static int igt_reset_nop_engine(void *arg)
for_each_engine(engine, gt, id) {
unsigned int reset_count, reset_engine_count, count;
struct intel_context *ce;
- unsigned long heartbeat;
IGT_TIMEOUT(end_time);
int err;
@@ -485,7 +482,7 @@ static int igt_reset_nop_engine(void *arg)
reset_engine_count = i915_reset_engine_count(global, engine);
count = 0;
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
do {
int i;
@@ -529,7 +526,7 @@ static int igt_reset_nop_engine(void *arg)
}
} while (time_before(jiffies, end_time));
clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
@@ -564,7 +561,6 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
for_each_engine(engine, gt, id) {
unsigned int reset_count, reset_engine_count;
- unsigned long heartbeat;
IGT_TIMEOUT(end_time);
if (active && !intel_engine_can_store_dword(engine))
@@ -580,7 +576,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
reset_count = i915_reset_count(global);
reset_engine_count = i915_reset_engine_count(global, engine);
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
do {
if (active) {
@@ -632,7 +628,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
}
} while (time_before(jiffies, end_time));
clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
if (err)
break;
@@ -789,7 +785,6 @@ static int __igt_reset_engines(struct intel_gt *gt,
struct active_engine threads[I915_NUM_ENGINES] = {};
unsigned long device = i915_reset_count(global);
unsigned long count = 0, reported;
- unsigned long heartbeat;
IGT_TIMEOUT(end_time);
if (flags & TEST_ACTIVE &&
@@ -832,7 +827,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
yield(); /* start all threads before we begin */
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
do {
struct i915_request *rq = NULL;
@@ -906,7 +901,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
}
} while (time_before(jiffies, end_time));
clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
pr_info("i915_reset_engine(%s:%s): %lu resets\n",
engine->name, test_name, count);
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 824f99c4cc7c..924bc01ef526 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -51,22 +51,20 @@ static struct i915_vma *create_scratch(struct intel_gt *gt)
return vma;
}
-static void engine_heartbeat_disable(struct intel_engine_cs *engine,
- unsigned long *saved)
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
{
- *saved = engine->props.heartbeat_interval_ms;
engine->props.heartbeat_interval_ms = 0;
intel_engine_pm_get(engine);
intel_engine_park_heartbeat(engine);
}
-static void engine_heartbeat_enable(struct intel_engine_cs *engine,
- unsigned long saved)
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
{
intel_engine_pm_put(engine);
- engine->props.heartbeat_interval_ms = saved;
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
}
static bool is_active(struct i915_request *rq)
@@ -224,7 +222,6 @@ static int live_unlite_restore(struct intel_gt *gt, int prio)
struct intel_context *ce[2] = {};
struct i915_request *rq[2];
struct igt_live_test t;
- unsigned long saved;
int n;
if (prio && !intel_engine_has_preemption(engine))
@@ -237,7 +234,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio)
err = -EIO;
break;
}
- engine_heartbeat_disable(engine, &saved);
+ engine_heartbeat_disable(engine);
for (n = 0; n < ARRAY_SIZE(ce); n++) {
struct intel_context *tmp;
@@ -345,7 +342,7 @@ err_ce:
intel_context_put(ce[n]);
}
- engine_heartbeat_enable(engine, saved);
+ engine_heartbeat_enable(engine);
if (igt_live_test_end(&t))
err = -EIO;
if (err)
@@ -466,7 +463,6 @@ static int live_hold_reset(void *arg)
for_each_engine(engine, gt, id) {
struct intel_context *ce;
- unsigned long heartbeat;
struct i915_request *rq;
ce = intel_context_create(engine);
@@ -475,7 +471,7 @@ static int live_hold_reset(void *arg)
break;
}
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
if (IS_ERR(rq)) {
@@ -535,7 +531,7 @@ static int live_hold_reset(void *arg)
i915_request_put(rq);
out:
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
intel_context_put(ce);
if (err)
break;
@@ -580,10 +576,9 @@ static int live_error_interrupt(void *arg)
for_each_engine(engine, gt, id) {
const struct error_phase *p;
- unsigned long heartbeat;
int err = 0;
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
for (p = phases; p->error[0] != GOOD; p++) {
struct i915_request *client[ARRAY_SIZE(phases->error)];
@@ -682,7 +677,7 @@ out:
}
}
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
if (err) {
intel_gt_set_wedged(gt);
return err;
@@ -828,7 +823,7 @@ slice_semaphore_queue(struct intel_engine_cs *outer,
}
}
- err = release_queue(outer, vma, n, INT_MAX);
+ err = release_queue(outer, vma, n, I915_PRIORITY_BARRIER);
if (err)
goto out;
@@ -895,16 +890,14 @@ static int live_timeslice_preempt(void *arg)
enum intel_engine_id id;
for_each_engine(engine, gt, id) {
- unsigned long saved;
-
if (!intel_engine_has_preemption(engine))
continue;
memset(vaddr, 0, PAGE_SIZE);
- engine_heartbeat_disable(engine, &saved);
+ engine_heartbeat_disable(engine);
err = slice_semaphore_queue(engine, vma, count);
- engine_heartbeat_enable(engine, saved);
+ engine_heartbeat_enable(engine);
if (err)
goto err_pin;
@@ -1009,7 +1002,6 @@ static int live_timeslice_rewind(void *arg)
enum { X = 1, Z, Y };
struct i915_request *rq[3] = {};
struct intel_context *ce;
- unsigned long heartbeat;
unsigned long timeslice;
int i, err = 0;
u32 *slot;
@@ -1028,7 +1020,7 @@ static int live_timeslice_rewind(void *arg)
* Expect execution/evaluation order XZY
*/
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
timeslice = xchg(&engine->props.timeslice_duration_ms, 1);
slot = memset32(engine->status_page.addr + 1000, 0, 4);
@@ -1122,7 +1114,7 @@ err:
wmb();
engine->props.timeslice_duration_ms = timeslice;
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
for (i = 0; i < 3; i++)
i915_request_put(rq[i]);
if (igt_flush_test(gt->i915))
@@ -1202,12 +1194,11 @@ static int live_timeslice_queue(void *arg)
.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
};
struct i915_request *rq, *nop;
- unsigned long saved;
if (!intel_engine_has_preemption(engine))
continue;
- engine_heartbeat_disable(engine, &saved);
+ engine_heartbeat_disable(engine);
memset(vaddr, 0, PAGE_SIZE);
/* ELSP[0]: semaphore wait */
@@ -1284,7 +1275,7 @@ static int live_timeslice_queue(void *arg)
err_rq:
i915_request_put(rq);
err_heartbeat:
- engine_heartbeat_enable(engine, saved);
+ engine_heartbeat_enable(engine);
if (err)
break;
}
@@ -1298,6 +1289,121 @@ err_obj:
return err;
}
+static int live_timeslice_nopreempt(void *arg)
+{
+ struct intel_gt *gt = arg;
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ struct igt_spinner spin;
+ int err = 0;
+
+ /*
+ * We should not timeslice into a request that is marked with
+ * I915_REQUEST_NOPREEMPT.
+ */
+ if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
+ return 0;
+
+ if (igt_spinner_init(&spin, gt))
+ return -ENOMEM;
+
+ for_each_engine(engine, gt, id) {
+ struct intel_context *ce;
+ struct i915_request *rq;
+ unsigned long timeslice;
+
+ if (!intel_engine_has_preemption(engine))
+ continue;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce)) {
+ err = PTR_ERR(ce);
+ break;
+ }
+
+ engine_heartbeat_disable(engine);
+ timeslice = xchg(&engine->props.timeslice_duration_ms, 1);
+
+ /* Create an unpreemptible spinner */
+
+ rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
+ intel_context_put(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto out_heartbeat;
+ }
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+
+ if (!igt_wait_for_spinner(&spin, rq)) {
+ i915_request_put(rq);
+ err = -ETIME;
+ goto out_spin;
+ }
+
+ set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
+ i915_request_put(rq);
+
+ /* Followed by a maximum priority barrier (heartbeat) */
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce)) {
+ err = PTR_ERR(rq);
+ goto out_spin;
+ }
+
+ rq = intel_context_create_request(ce);
+ intel_context_put(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto out_spin;
+ }
+
+ rq->sched.attr.priority = I915_PRIORITY_BARRIER;
+ i915_request_get(rq);
+ i915_request_add(rq);
+
+ /*
+ * Wait until the barrier is in ELSP, and we know timeslicing
+ * will have been activated.
+ */
+ if (wait_for_submit(engine, rq, HZ / 2)) {
+ i915_request_put(rq);
+ err = -ETIME;
+ goto out_spin;
+ }
+
+ /*
+ * Since the ELSP[0] request is unpreemptible, it should not
+ * allow the maximum priority barrier through. Wait long
+ * enough to see if it is timesliced in by mistake.
+ */
+ if (i915_request_wait(rq, 0, timeslice_threshold(engine)) >= 0) {
+ pr_err("%s: I915_PRIORITY_BARRIER request completed, bypassing no-preempt request\n",
+ engine->name);
+ err = -EINVAL;
+ }
+ i915_request_put(rq);
+
+out_spin:
+ igt_spinner_end(&spin);
+out_heartbeat:
+ xchg(&engine->props.timeslice_duration_ms, timeslice);
+ engine_heartbeat_enable(engine);
+ if (err)
+ break;
+
+ if (igt_flush_test(gt->i915)) {
+ err = -EIO;
+ break;
+ }
+ }
+
+ igt_spinner_fini(&spin);
+ return err;
+}
+
static int live_busywait_preempt(void *arg)
{
struct intel_gt *gt = arg;
@@ -4153,7 +4259,6 @@ static int reset_virtual_engine(struct intel_gt *gt,
{
struct intel_engine_cs *engine;
struct intel_context *ve;
- unsigned long *heartbeat;
struct igt_spinner spin;
struct i915_request *rq;
unsigned int n;
@@ -4165,15 +4270,9 @@ static int reset_virtual_engine(struct intel_gt *gt,
* descendents are not executed while the capture is in progress.
*/
- heartbeat = kmalloc_array(nsibling, sizeof(*heartbeat), GFP_KERNEL);
- if (!heartbeat)
+ if (igt_spinner_init(&spin, gt))
return -ENOMEM;
- if (igt_spinner_init(&spin, gt)) {
- err = -ENOMEM;
- goto out_free;
- }
-
ve = intel_execlists_create_virtual(siblings, nsibling);
if (IS_ERR(ve)) {
err = PTR_ERR(ve);
@@ -4181,7 +4280,7 @@ static int reset_virtual_engine(struct intel_gt *gt,
}
for (n = 0; n < nsibling; n++)
- engine_heartbeat_disable(siblings[n], &heartbeat[n]);
+ engine_heartbeat_disable(siblings[n]);
rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK);
if (IS_ERR(rq)) {
@@ -4252,13 +4351,11 @@ out_rq:
i915_request_put(rq);
out_heartbeat:
for (n = 0; n < nsibling; n++)
- engine_heartbeat_enable(siblings[n], heartbeat[n]);
+ engine_heartbeat_enable(siblings[n]);
intel_context_put(ve);
out_spin:
igt_spinner_fini(&spin);
-out_free:
- kfree(heartbeat);
return err;
}
@@ -4314,6 +4411,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
SUBTEST(live_timeslice_preempt),
SUBTEST(live_timeslice_rewind),
SUBTEST(live_timeslice_queue),
+ SUBTEST(live_timeslice_nopreempt),
SUBTEST(live_busywait_preempt),
SUBTEST(live_preempt),
SUBTEST(live_late_preempt),
@@ -4932,9 +5030,7 @@ static int live_lrc_gpr(void *arg)
return PTR_ERR(scratch);
for_each_engine(engine, gt, id) {
- unsigned long heartbeat;
-
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
err = __live_lrc_gpr(engine, scratch, false);
if (err)
@@ -4945,7 +5041,7 @@ static int live_lrc_gpr(void *arg)
goto err;
err:
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
if (igt_flush_test(gt->i915))
err = -EIO;
if (err)
@@ -5092,10 +5188,9 @@ static int live_lrc_timestamp(void *arg)
*/
for_each_engine(data.engine, gt, id) {
- unsigned long heartbeat;
int i, err = 0;
- engine_heartbeat_disable(data.engine, &heartbeat);
+ engine_heartbeat_disable(data.engine);
for (i = 0; i < ARRAY_SIZE(data.ce); i++) {
struct intel_context *tmp;
@@ -5128,7 +5223,7 @@ static int live_lrc_timestamp(void *arg)
}
err:
- engine_heartbeat_enable(data.engine, heartbeat);
+ engine_heartbeat_enable(data.engine);
for (i = 0; i < ARRAY_SIZE(data.ce); i++) {
if (!data.ce[i])
break;
diff --git a/drivers/gpu/drm/i915/gt/selftest_mocs.c b/drivers/gpu/drm/i915/gt/selftest_mocs.c
index 8831ffee2061..63f87d8608c3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_mocs.c
+++ b/drivers/gpu/drm/i915/gt/selftest_mocs.c
@@ -18,6 +18,20 @@ struct live_mocs {
void *vaddr;
};
+static struct intel_context *mocs_context_create(struct intel_engine_cs *engine)
+{
+ struct intel_context *ce;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce))
+ return ce;
+
+ /* We build large requests to read the registers from the ring */
+ ce->ring = __intel_context_ring_size(SZ_16K);
+
+ return ce;
+}
+
static int request_add_sync(struct i915_request *rq, int err)
{
i915_request_get(rq);
@@ -301,7 +315,7 @@ static int live_mocs_clean(void *arg)
for_each_engine(engine, gt, id) {
struct intel_context *ce;
- ce = intel_context_create(engine);
+ ce = mocs_context_create(engine);
if (IS_ERR(ce)) {
err = PTR_ERR(ce);
break;
@@ -395,7 +409,7 @@ static int live_mocs_reset(void *arg)
for_each_engine(engine, gt, id) {
struct intel_context *ce;
- ce = intel_context_create(engine);
+ ce = mocs_context_create(engine);
if (IS_ERR(ce)) {
err = PTR_ERR(ce);
break;
diff --git a/drivers/gpu/drm/i915/gt/selftest_ring.c b/drivers/gpu/drm/i915/gt/selftest_ring.c
new file mode 100644
index 000000000000..2a8c534dc125
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_ring.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+static struct intel_ring *mock_ring(unsigned long sz)
+{
+ struct intel_ring *ring;
+
+ ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
+ if (!ring)
+ return NULL;
+
+ kref_init(&ring->ref);
+ ring->size = sz;
+ ring->wrap = BITS_PER_TYPE(ring->size) - ilog2(sz);
+ ring->effective_size = sz;
+ ring->vaddr = (void *)(ring + 1);
+ atomic_set(&ring->pin_count, 1);
+
+ intel_ring_update_space(ring);
+
+ return ring;
+}
+
+static void mock_ring_free(struct intel_ring *ring)
+{
+ kfree(ring);
+}
+
+static int check_ring_direction(struct intel_ring *ring,
+ u32 next, u32 prev,
+ int expected)
+{
+ int result;
+
+ result = intel_ring_direction(ring, next, prev);
+ if (result < 0)
+ result = -1;
+ else if (result > 0)
+ result = 1;
+
+ if (result != expected) {
+ pr_err("intel_ring_direction(%u, %u):%d != %d\n",
+ next, prev, result, expected);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int check_ring_step(struct intel_ring *ring, u32 x, u32 step)
+{
+ u32 prev = x, next = intel_ring_wrap(ring, x + step);
+ int err = 0;
+
+ err |= check_ring_direction(ring, next, next, 0);
+ err |= check_ring_direction(ring, prev, prev, 0);
+ err |= check_ring_direction(ring, next, prev, 1);
+ err |= check_ring_direction(ring, prev, next, -1);
+
+ return err;
+}
+
+static int check_ring_offset(struct intel_ring *ring, u32 x, u32 step)
+{
+ int err = 0;
+
+ err |= check_ring_step(ring, x, step);
+ err |= check_ring_step(ring, intel_ring_wrap(ring, x + 1), step);
+ err |= check_ring_step(ring, intel_ring_wrap(ring, x - 1), step);
+
+ return err;
+}
+
+static int igt_ring_direction(void *dummy)
+{
+ struct intel_ring *ring;
+ unsigned int half = 2048;
+ int step, err = 0;
+
+ ring = mock_ring(2 * half);
+ if (!ring)
+ return -ENOMEM;
+
+ GEM_BUG_ON(ring->size != 2 * half);
+
+ /* Precision of wrap detection is limited to ring->size / 2 */
+ for (step = 1; step < half; step <<= 1) {
+ err |= check_ring_offset(ring, 0, step);
+ err |= check_ring_offset(ring, half, step);
+ }
+ err |= check_ring_step(ring, 0, half - 64);
+
+ /* And check unwrapped handling for good measure */
+ err |= check_ring_offset(ring, 0, 2 * half + 64);
+ err |= check_ring_offset(ring, 3 * half, 1);
+
+ mock_ring_free(ring);
+ return err;
+}
+
+int intel_ring_mock_selftests(void)
+{
+ static const struct i915_subtest tests[] = {
+ SUBTEST(igt_ring_direction),
+ };
+
+ return i915_subtests(tests, NULL);
+}
diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c
index 6275d69aa9cc..c91981e75ebf 100644
--- a/drivers/gpu/drm/i915/gt/selftest_rps.c
+++ b/drivers/gpu/drm/i915/gt/selftest_rps.c
@@ -20,24 +20,20 @@
/* Try to isolate the impact of cstates from determing frequency response */
#define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */
-static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine)
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
{
- unsigned long old;
-
- old = fetch_and_zero(&engine->props.heartbeat_interval_ms);
+ engine->props.heartbeat_interval_ms = 0;
intel_engine_pm_get(engine);
intel_engine_park_heartbeat(engine);
-
- return old;
}
-static void engine_heartbeat_enable(struct intel_engine_cs *engine,
- unsigned long saved)
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
{
intel_engine_pm_put(engine);
- engine->props.heartbeat_interval_ms = saved;
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
}
static void dummy_rps_work(struct work_struct *wrk)
@@ -48,9 +44,9 @@ static int cmp_u64(const void *A, const void *B)
{
const u64 *a = A, *b = B;
- if (a < b)
+ if (*a < *b)
return -1;
- else if (a > b)
+ else if (*a > *b)
return 1;
else
return 0;
@@ -60,9 +56,9 @@ static int cmp_u32(const void *A, const void *B)
{
const u32 *a = A, *b = B;
- if (a < b)
+ if (*a < *b)
return -1;
- else if (a > b)
+ else if (*a > *b)
return 1;
else
return 0;
@@ -246,7 +242,6 @@ int live_rps_clock_interval(void *arg)
intel_gt_check_clock_frequency(gt);
for_each_engine(engine, gt, id) {
- unsigned long saved_heartbeat;
struct i915_request *rq;
u32 cycles;
u64 dt;
@@ -254,13 +249,13 @@ int live_rps_clock_interval(void *arg)
if (!intel_engine_can_store_dword(engine))
continue;
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
rq = igt_spinner_create_request(&spin,
engine->kernel_context,
MI_NOOP);
if (IS_ERR(rq)) {
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
err = PTR_ERR(rq);
break;
}
@@ -271,7 +266,7 @@ int live_rps_clock_interval(void *arg)
pr_err("%s: RPS spinner did not start\n",
engine->name);
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
intel_gt_set_wedged(engine->gt);
err = -EIO;
break;
@@ -327,7 +322,7 @@ int live_rps_clock_interval(void *arg)
intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
if (err == 0) {
u64 time = intel_gt_pm_interval_to_ns(gt, cycles);
@@ -405,7 +400,6 @@ int live_rps_control(void *arg)
intel_gt_pm_get(gt);
for_each_engine(engine, gt, id) {
- unsigned long saved_heartbeat;
struct i915_request *rq;
ktime_t min_dt, max_dt;
int f, limit;
@@ -414,7 +408,7 @@ int live_rps_control(void *arg)
if (!intel_engine_can_store_dword(engine))
continue;
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
rq = igt_spinner_create_request(&spin,
engine->kernel_context,
@@ -430,7 +424,7 @@ int live_rps_control(void *arg)
pr_err("%s: RPS spinner did not start\n",
engine->name);
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
intel_gt_set_wedged(engine->gt);
err = -EIO;
break;
@@ -440,7 +434,7 @@ int live_rps_control(void *arg)
pr_err("%s: could not set minimum frequency [%x], only %x!\n",
engine->name, rps->min_freq, read_cagf(rps));
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
show_pstate_limits(rps);
err = -EINVAL;
break;
@@ -457,7 +451,7 @@ int live_rps_control(void *arg)
pr_err("%s: could not restore minimum frequency [%x], only %x!\n",
engine->name, rps->min_freq, read_cagf(rps));
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
show_pstate_limits(rps);
err = -EINVAL;
break;
@@ -472,7 +466,7 @@ int live_rps_control(void *arg)
min_dt = ktime_sub(ktime_get(), min_dt);
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n",
engine->name,
@@ -635,7 +629,6 @@ int live_rps_frequency_cs(void *arg)
rps->work.func = dummy_rps_work;
for_each_engine(engine, gt, id) {
- unsigned long saved_heartbeat;
struct i915_request *rq;
struct i915_vma *vma;
u32 *cancel, *cntr;
@@ -644,14 +637,14 @@ int live_rps_frequency_cs(void *arg)
int freq;
} min, max;
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
vma = create_spin_counter(engine,
engine->kernel_context->vm, false,
&cancel, &cntr);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
break;
}
@@ -732,7 +725,7 @@ err_vma:
i915_vma_unpin(vma);
i915_vma_put(vma);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
if (igt_flush_test(gt->i915))
err = -EIO;
if (err)
@@ -778,7 +771,6 @@ int live_rps_frequency_srm(void *arg)
rps->work.func = dummy_rps_work;
for_each_engine(engine, gt, id) {
- unsigned long saved_heartbeat;
struct i915_request *rq;
struct i915_vma *vma;
u32 *cancel, *cntr;
@@ -787,14 +779,14 @@ int live_rps_frequency_srm(void *arg)
int freq;
} min, max;
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
vma = create_spin_counter(engine,
engine->kernel_context->vm, true,
&cancel, &cntr);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
break;
}
@@ -874,7 +866,7 @@ err_vma:
i915_vma_unpin(vma);
i915_vma_put(vma);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
if (igt_flush_test(gt->i915))
err = -EIO;
if (err)
@@ -1066,16 +1058,14 @@ int live_rps_interrupt(void *arg)
for_each_engine(engine, gt, id) {
/* Keep the engine busy with a spinner; expect an UP! */
if (pm_events & GEN6_PM_RP_UP_THRESHOLD) {
- unsigned long saved_heartbeat;
-
intel_gt_pm_wait_for_idle(engine->gt);
GEM_BUG_ON(intel_rps_is_active(rps));
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
err = __rps_up_interrupt(rps, engine, &spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
if (err)
goto out;
@@ -1084,15 +1074,13 @@ int live_rps_interrupt(void *arg)
/* Keep the engine awake but idle and check for DOWN */
if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) {
- unsigned long saved_heartbeat;
-
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
intel_rc6_disable(&gt->rc6);
err = __rps_down_interrupt(rps, engine);
intel_rc6_enable(&gt->rc6);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
if (err)
goto out;
}
@@ -1168,7 +1156,6 @@ int live_rps_power(void *arg)
rps->work.func = dummy_rps_work;
for_each_engine(engine, gt, id) {
- unsigned long saved_heartbeat;
struct i915_request *rq;
struct {
u64 power;
@@ -1178,13 +1165,13 @@ int live_rps_power(void *arg)
if (!intel_engine_can_store_dword(engine))
continue;
- saved_heartbeat = engine_heartbeat_disable(engine);
+ engine_heartbeat_disable(engine);
rq = igt_spinner_create_request(&spin,
engine->kernel_context,
MI_NOOP);
if (IS_ERR(rq)) {
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
err = PTR_ERR(rq);
break;
}
@@ -1195,7 +1182,7 @@ int live_rps_power(void *arg)
pr_err("%s: RPS spinner did not start\n",
engine->name);
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
intel_gt_set_wedged(engine->gt);
err = -EIO;
break;
@@ -1208,7 +1195,7 @@ int live_rps_power(void *arg)
min.power = measure_power_at(rps, &min.freq);
igt_spinner_end(&spin);
- engine_heartbeat_enable(engine, saved_heartbeat);
+ engine_heartbeat_enable(engine);
pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n",
engine->name,
diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c
index c2578a0f2f14..ef1c35073dc0 100644
--- a/drivers/gpu/drm/i915/gt/selftest_timeline.c
+++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c
@@ -751,22 +751,20 @@ out_free:
return err;
}
-static void engine_heartbeat_disable(struct intel_engine_cs *engine,
- unsigned long *saved)
+static void engine_heartbeat_disable(struct intel_engine_cs *engine)
{
- *saved = engine->props.heartbeat_interval_ms;
engine->props.heartbeat_interval_ms = 0;
intel_engine_pm_get(engine);
intel_engine_park_heartbeat(engine);
}
-static void engine_heartbeat_enable(struct intel_engine_cs *engine,
- unsigned long saved)
+static void engine_heartbeat_enable(struct intel_engine_cs *engine)
{
intel_engine_pm_put(engine);
- engine->props.heartbeat_interval_ms = saved;
+ engine->props.heartbeat_interval_ms =
+ engine->defaults.heartbeat_interval_ms;
}
static int live_hwsp_rollover_kernel(void *arg)
@@ -785,10 +783,9 @@ static int live_hwsp_rollover_kernel(void *arg)
struct intel_context *ce = engine->kernel_context;
struct intel_timeline *tl = ce->timeline;
struct i915_request *rq[3] = {};
- unsigned long heartbeat;
int i;
- engine_heartbeat_disable(engine, &heartbeat);
+ engine_heartbeat_disable(engine);
if (intel_gt_wait_for_idle(gt, HZ / 2)) {
err = -EIO;
goto out;
@@ -839,7 +836,7 @@ static int live_hwsp_rollover_kernel(void *arg)
out:
for (i = 0; i < ARRAY_SIZE(rq); i++)
i915_request_put(rq[i]);
- engine_heartbeat_enable(engine, heartbeat);
+ engine_heartbeat_enable(engine);
if (err)
break;
}
diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c
index 5ed323254ee1..32785463ec9e 100644
--- a/drivers/gpu/drm/i915/gt/selftest_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c
@@ -623,6 +623,8 @@ err_request:
err = -EINVAL;
goto out_unpin;
}
+ } else {
+ rsvd = 0;
}
expect = results[0];
diff --git a/drivers/gpu/drm/i915/gt/shaders/README b/drivers/gpu/drm/i915/gt/shaders/README
new file mode 100644
index 000000000000..e7e96d7073c7
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/shaders/README
@@ -0,0 +1,46 @@
+ASM sources for auto generated shaders
+======================================
+
+The i915/gt/hsw_clear_kernel.c and i915/gt/ivb_clear_kernel.c files contain
+pre-compiled batch chunks that will clear any residual render cache during
+context switch.
+
+They are generated from their respective platform ASM files present on
+i915/gt/shaders/clear_kernel directory.
+
+The generated .c files should never be modified directly. Instead, any modification
+needs to be done on the on their respective ASM files and build instructions below
+needes to be followed.
+
+Building
+========
+
+Environment
+-----------
+
+IGT GPU tool scripts and the Mesa's i965 instruction assembler tool are used
+on building.
+
+Please make sure your Mesa tool is compiled with "-Dtools=intel" and
+"-Ddri-drivers=i965", and run this script from IGT source root directory"
+
+The instructions bellow assume:
+ * IGT gpu tools source code is located on your home directory (~) as ~/igt
+ * Mesa source code is located on your home directory (~) as ~/mesa
+ and built under the ~/mesa/build directory
+ * Linux kernel source code is under your home directory (~) as ~/linux
+
+Instructions
+------------
+
+~ $ cp ~/linux/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm \
+ ~/igt/lib/i915/shaders/clear_kernel/ivb.asm
+~ $ cd ~/igt
+igt $ ./scripts/generate_clear_kernel.sh -g ivb \
+ -m ~/mesa/build/src/intel/tools/i965_asm
+
+~ $ cp ~/linux/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm \
+ ~/igt/lib/i915/shaders/clear_kernel/hsw.asm
+~ $ cd ~/igt
+igt $ ./scripts/generate_clear_kernel.sh -g hsw \
+ -m ~/mesa/build/src/intel/tools/i965_asm \ No newline at end of file
diff --git a/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm
new file mode 100644
index 000000000000..5fdf384bb621
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+/*
+ * Kernel for PAVP buffer clear.
+ *
+ * 1. Clear all 64 GRF registers assigned to the kernel with designated value;
+ * 2. Write 32x16 block of all "0" to render target buffer which indirectly clears
+ * 512 bytes of Render Cache.
+ */
+
+/* Store designated "clear GRF" value */
+mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N };
+
+/**
+ * Curbe Format
+ *
+ * DW 1.0 - Block Offset to write Render Cache
+ * DW 1.1 [15:0] - Clear Word
+ * DW 1.2 - Delay iterations
+ * DW 1.3 - Enable Instrumentation (only for debug)
+ * DW 1.4 - Rsvd (intended for context ID)
+ * DW 1.5 - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount
+ * DW 1.6 - Rsvd MBZ (intended for Enable Wait on Total Thread Count)
+ * DW 1.7 - Rsvd MBZ (inteded for Total Thread Count)
+ *
+ * Binding Table
+ *
+ * BTI 0: 2D Surface to help clear L3 (Render/Data Cache)
+ * BTI 1: Wait/Instrumentation Buffer
+ * Size : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows * (16 threads/EU) cols (Format R32_UINT)
+ * Expected to be initialized to 0 by driver/another kernel
+ * Layout:
+ * RowN: Histogram for EU-N: (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID [assume max 16 EUs / SS]
+ * Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N
+ */
+add(1) g1.2<1>UD g1.2<0,1,0>UD 0x00000001UD { align1 1N }; /* Loop count to delay kernel: Init to (g1.2 + 1) */
+cmp.z.f0.0(1) null<1>UD g1.3<0,1,0>UD 0x00000000UD { align1 1N };
+(+f0.0) jmpi(1) 352D { align1 WE_all 1N };
+
+/**
+ * State Register has info on where this thread is running
+ * IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID
+ * HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID
+ */
+mov(8) g3<1>UD 0x00000000UD { align1 1Q };
+shr(1) g3<1>D sr0<0,1,0>D 12D { align1 1N };
+and(1) g3<1>D g3<0,1,0>D 1D { align1 1N }; /* g3 has HSID */
+shr(1) g3.1<1>D sr0<0,1,0>D 13D { align1 1N };
+and(1) g3.1<1>D g3.1<0,1,0>D 3D { align1 1N }; /* g3.1 has sliceID */
+mul(1) g3.5<1>D g3.1<0,1,0>D g1.10<0,1,0>UW { align1 1N };
+add(1) g3<1>D g3<0,1,0>D g3.5<0,1,0>D { align1 1N }; /* g3 = sliceID * SubSlicePerSliceCount + HSID */
+shr(1) g3.2<1>D sr0<0,1,0>D 8D { align1 1N };
+and(1) g3.2<1>D g3.2<0,1,0>D 15D { align1 1N }; /* g3.2 = EUID */
+mul(1) g3.4<1>D g3<0,1,0>D 16D { align1 1N };
+add(1) g3.2<1>D g3.2<0,1,0>D g3.4<0,1,0>D { align1 1N }; /* g3.2 now points to EU row number (Y-pixel = V address ) in instrumentation surf */
+
+mov(8) g5<1>UD 0x00000000UD { align1 1Q };
+and(1) g3.3<1>D sr0<0,1,0>D 7D { align1 1N };
+mul(1) g3.3<1>D g3.3<0,1,0>D 4D { align1 1N };
+
+mov(8) g4<1>UD g0<8,8,1>UD { align1 1Q }; /* Initialize message header with g0 */
+mov(1) g4<1>UD g3.3<0,1,0>UD { align1 1N }; /* Block offset */
+mov(1) g4.1<1>UD g3.2<0,1,0>UD { align1 1N }; /* Block offset */
+mov(1) g4.2<1>UD 0x00000003UD { align1 1N }; /* Block size (1 row x 4 bytes) */
+and(1) g4.3<1>UD g4.3<0,1,0>UW 0xffffffffUD { align1 1N };
+
+/* Media block read to fetch current value at specified location in instrumentation buffer */
+sendc(8) g5<1>UD g4<8,8,1>F 0x02190001
+
+ render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q };
+add(1) g5<1>D g5<0,1,0>D 1D { align1 1N };
+
+/* Media block write for updated value at specified location in instrumentation buffer */
+sendc(8) g5<1>UD g4<8,8,1>F 0x040a8001
+ render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q };
+
+/* Delay thread for specified parameter */
+add.nz.f0.0(1) g1.2<1>UD g1.2<0,1,0>UD -1D { align1 1N };
+(+f0.0) jmpi(1) -32D { align1 WE_all 1N };
+
+/* Store designated "clear GRF" value */
+mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N };
+
+/* Initialize looping parameters */
+mov(1) a0<1>D 0D { align1 1N }; /* Initialize a0.0:w=0 */
+mov(1) a0.4<1>W 127W { align1 1N }; /* Loop count. Each loop contains 16 GRF's */
+
+/* Write 32x16 all "0" block */
+mov(8) g2<1>UD g0<8,8,1>UD { align1 1Q };
+mov(8) g127<1>UD g0<8,8,1>UD { align1 1Q };
+mov(2) g2<1>UD g1<2,2,1>UW { align1 1N };
+mov(1) g2.2<1>UD 0x000f000fUD { align1 1N }; /* Block size (16x16) */
+and(1) g2.3<1>UD g2.3<0,1,0>UW 0xffffffefUD { align1 1N };
+mov(16) g3<1>UD 0x00000000UD { align1 1H };
+mov(16) g4<1>UD 0x00000000UD { align1 1H };
+mov(16) g5<1>UD 0x00000000UD { align1 1H };
+mov(16) g6<1>UD 0x00000000UD { align1 1H };
+mov(16) g7<1>UD 0x00000000UD { align1 1H };
+mov(16) g8<1>UD 0x00000000UD { align1 1H };
+mov(16) g9<1>UD 0x00000000UD { align1 1H };
+mov(16) g10<1>UD 0x00000000UD { align1 1H };
+sendc(8) null<1>UD g2<8,8,1>F 0x120a8000
+ render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q };
+add(1) g2<1>UD g1<0,1,0>UW 0x0010UW { align1 1N };
+sendc(8) null<1>UD g2<8,8,1>F 0x120a8000
+ render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q };
+
+/* Now, clear all GRF registers */
+add.nz.f0.0(1) a0.4<1>W a0.4<0,1,0>W -1W { align1 1N };
+mov(16) g[a0]<1>UW f0.1<0,1,0>UW { align1 1H };
+add(1) a0<1>D a0<0,1,0>D 32D { align1 1N };
+(+f0.0) jmpi(1) -64D { align1 WE_all 1N };
+
+/* Terminante the thread */
+sendc(8) null<1>UD g127<8,8,1>F 0x82000010
+ thread_spawner MsgDesc: mlen 1 rlen 0 { align1 1Q EOT };
diff --git a/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm
new file mode 100644
index 000000000000..97c7ac9e3854
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+/*
+ * Kernel for PAVP buffer clear.
+ *
+ * 1. Clear all 64 GRF registers assigned to the kernel with designated value;
+ * 2. Write 32x16 block of all "0" to render target buffer which indirectly clears
+ * 512 bytes of Render Cache.
+ */
+
+/* Store designated "clear GRF" value */
+mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N };
+
+/**
+ * Curbe Format
+ *
+ * DW 1.0 - Block Offset to write Render Cache
+ * DW 1.1 [15:0] - Clear Word
+ * DW 1.2 - Delay iterations
+ * DW 1.3 - Enable Instrumentation (only for debug)
+ * DW 1.4 - Rsvd (intended for context ID)
+ * DW 1.5 - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount
+ * DW 1.6 - Rsvd MBZ (intended for Enable Wait on Total Thread Count)
+ * DW 1.7 - Rsvd MBZ (inteded for Total Thread Count)
+ *
+ * Binding Table
+ *
+ * BTI 0: 2D Surface to help clear L3 (Render/Data Cache)
+ * BTI 1: Wait/Instrumentation Buffer
+ * Size : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows * (16 threads/EU) cols (Format R32_UINT)
+ * Expected to be initialized to 0 by driver/another kernel
+ * Layout :
+ * RowN: Histogram for EU-N: (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID [assume max 16 EUs / SS]
+ * Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N
+ */
+add(1) g1.2<1>UD g1.2<0,1,0>UD 0x00000001UD { align1 1N }; /* Loop count to delay kernel: Init to (g1.2 + 1) */
+cmp.z.f0.0(1) null<1>UD g1.3<0,1,0>UD 0x00000000UD { align1 1N };
+(+f0.0) jmpi(1) 44D { align1 WE_all 1N };
+
+/**
+ * State Register has info on where this thread is running
+ * IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID
+ * HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID
+ */
+mov(8) g3<1>UD 0x00000000UD { align1 1Q };
+shr(1) g3<1>D sr0<0,1,0>D 12D { align1 1N };
+and(1) g3<1>D g3<0,1,0>D 1D { align1 1N }; /* g3 has HSID */
+shr(1) g3.1<1>D sr0<0,1,0>D 13D { align1 1N };
+and(1) g3.1<1>D g3.1<0,1,0>D 3D { align1 1N }; /* g3.1 has sliceID */
+mul(1) g3.5<1>D g3.1<0,1,0>D g1.10<0,1,0>UW { align1 1N };
+add(1) g3<1>D g3<0,1,0>D g3.5<0,1,0>D { align1 1N }; /* g3 = sliceID * SubSlicePerSliceCount + HSID */
+shr(1) g3.2<1>D sr0<0,1,0>D 8D { align1 1N };
+and(1) g3.2<1>D g3.2<0,1,0>D 15D { align1 1N }; /* g3.2 = EUID */
+mul(1) g3.4<1>D g3<0,1,0>D 16D { align1 1N };
+add(1) g3.2<1>D g3.2<0,1,0>D g3.4<0,1,0>D { align1 1N }; /* g3.2 now points to EU row number (Y-pixel = V address ) in instrumentation surf */
+
+mov(8) g5<1>UD 0x00000000UD { align1 1Q };
+and(1) g3.3<1>D sr0<0,1,0>D 7D { align1 1N };
+mul(1) g3.3<1>D g3.3<0,1,0>D 4D { align1 1N };
+
+mov(8) g4<1>UD g0<8,8,1>UD { align1 1Q }; /* Initialize message header with g0 */
+mov(1) g4<1>UD g3.3<0,1,0>UD { align1 1N }; /* Block offset */
+mov(1) g4.1<1>UD g3.2<0,1,0>UD { align1 1N }; /* Block offset */
+mov(1) g4.2<1>UD 0x00000003UD { align1 1N }; /* Block size (1 row x 4 bytes) */
+and(1) g4.3<1>UD g4.3<0,1,0>UW 0xffffffffUD { align1 1N };
+
+/* Media block read to fetch current value at specified location in instrumentation buffer */
+sendc(8) g5<1>UD g4<8,8,1>F 0x02190001
+ render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q };
+add(1) g5<1>D g5<0,1,0>D 1D { align1 1N };
+
+/* Media block write for updated value at specified location in instrumentation buffer */
+sendc(8) g5<1>UD g4<8,8,1>F 0x040a8001
+ render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q };
+/* Delay thread for specified parameter */
+add.nz.f0.0(1) g1.2<1>UD g1.2<0,1,0>UD -1D { align1 1N };
+(+f0.0) jmpi(1) -4D { align1 WE_all 1N };
+
+/* Store designated "clear GRF" value */
+mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N };
+
+/* Initialize looping parameters */
+mov(1) a0<1>D 0D { align1 1N }; /* Initialize a0.0:w=0 */
+mov(1) a0.4<1>W 127W { align1 1N }; /* Loop count. Each loop contains 16 GRF's */
+
+/* Write 32x16 all "0" block */
+mov(8) g2<1>UD g0<8,8,1>UD { align1 1Q };
+mov(8) g127<1>UD g0<8,8,1>UD { align1 1Q };
+mov(2) g2<1>UD g1<2,2,1>UW { align1 1N };
+mov(1) g2.2<1>UD 0x000f000fUD { align1 1N }; /* Block size (16x16) */
+and(1) g2.3<1>UD g2.3<0,1,0>UW 0xffffffefUD { align1 1N };
+mov(16) g3<1>UD 0x00000000UD { align1 1H };
+mov(16) g4<1>UD 0x00000000UD { align1 1H };
+mov(16) g5<1>UD 0x00000000UD { align1 1H };
+mov(16) g6<1>UD 0x00000000UD { align1 1H };
+mov(16) g7<1>UD 0x00000000UD { align1 1H };
+mov(16) g8<1>UD 0x00000000UD { align1 1H };
+mov(16) g9<1>UD 0x00000000UD { align1 1H };
+mov(16) g10<1>UD 0x00000000UD { align1 1H };
+sendc(8) null<1>UD g2<8,8,1>F 0x120a8000
+ render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q };
+add(1) g2<1>UD g1<0,1,0>UW 0x0010UW { align1 1N };
+sendc(8) null<1>UD g2<8,8,1>F 0x120a8000
+ render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q };
+
+/* Now, clear all GRF registers */
+add.nz.f0.0(1) a0.4<1>W a0.4<0,1,0>W -1W { align1 1N };
+mov(16) g[a0]<1>UW f0.1<0,1,0>UW { align1 1H };
+add(1) a0<1>D a0<0,1,0>D 32D { align1 1N };
+(+f0.0) jmpi(1) -8D { align1 WE_all 1N };
+
+/* Terminante the thread */
+sendc(8) null<1>UD g127<8,8,1>F 0x82000010
+ thread_spawner MsgDesc: mlen 1 rlen 0 { align1 1Q EOT };