summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/gt/selftest_hangcheck.c')
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_hangcheck.c476
1 files changed, 327 insertions, 149 deletions
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 853246fad05f..4f252f704975 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -6,7 +6,9 @@
#include <linux/kthread.h>
#include "gem/i915_gem_context.h"
+#include "gem/i915_gem_internal.h"
+#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
@@ -17,6 +19,8 @@
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
+#include "selftests/igt_spinner.h"
+#include "selftests/intel_scheduler_helpers.h"
#include "selftests/mock_drm.h"
@@ -42,7 +46,7 @@ static int hang_init(struct hang *h, struct intel_gt *gt)
memset(h, 0, sizeof(*h));
h->gt = gt;
- h->ctx = kernel_context(gt->i915);
+ h->ctx = kernel_context(gt->i915, NULL);
if (IS_ERR(h->ctx))
return PTR_ERR(h->ctx);
@@ -69,7 +73,7 @@ static int hang_init(struct hang *h, struct intel_gt *gt)
h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
vaddr = i915_gem_object_pin_map_unlocked(h->obj,
- i915_coherent_map_type(gt->i915, h->obj, false));
+ intel_gt_coherent_map_type(gt, h->obj, false));
if (IS_ERR(vaddr)) {
err = PTR_ERR(vaddr);
goto err_unpin_hws;
@@ -92,30 +96,15 @@ err_ctx:
static u64 hws_address(const struct i915_vma *hws,
const struct i915_request *rq)
{
- return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
-}
-
-static int move_to_active(struct i915_vma *vma,
- struct i915_request *rq,
- unsigned int flags)
-{
- int err;
-
- i915_vma_lock(vma);
- err = i915_request_await_object(rq, vma->obj,
- flags & EXEC_OBJECT_WRITE);
- if (err == 0)
- err = i915_vma_move_to_active(vma, rq, flags);
- i915_vma_unlock(vma);
-
- return err;
+ return i915_vma_offset(hws) +
+ offset_in_page(sizeof(u32) * rq->fence.context);
}
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
struct intel_gt *gt = h->gt;
- struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
+ struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
struct drm_i915_gem_object *obj;
struct i915_request *rq = NULL;
struct i915_vma *hws, *vma;
@@ -130,7 +119,7 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
return ERR_CAST(obj);
}
- vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
+ vaddr = i915_gem_object_pin_map_unlocked(obj, intel_gt_coherent_map_type(gt, obj, false));
if (IS_ERR(vaddr)) {
i915_gem_object_put(obj);
i915_vm_put(vm);
@@ -171,11 +160,11 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
goto unpin_hws;
}
- err = move_to_active(vma, rq, 0);
+ err = igt_vma_move_to_active_unlocked(vma, rq, 0);
if (err)
goto cancel_rq;
- err = move_to_active(hws, rq, 0);
+ err = igt_vma_move_to_active_unlocked(hws, rq, 0);
if (err)
goto cancel_rq;
@@ -192,8 +181,8 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
*batch++ = MI_NOOP;
*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
- *batch++ = lower_32_bits(vma->node.start);
- *batch++ = upper_32_bits(vma->node.start);
+ *batch++ = lower_32_bits(i915_vma_offset(vma));
+ *batch++ = upper_32_bits(i915_vma_offset(vma));
} else if (GRAPHICS_VER(gt->i915) >= 6) {
*batch++ = MI_STORE_DWORD_IMM_GEN4;
*batch++ = 0;
@@ -206,7 +195,7 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
*batch++ = MI_NOOP;
*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
- *batch++ = lower_32_bits(vma->node.start);
+ *batch++ = lower_32_bits(i915_vma_offset(vma));
} else if (GRAPHICS_VER(gt->i915) >= 4) {
*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
*batch++ = 0;
@@ -219,7 +208,7 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
*batch++ = MI_NOOP;
*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
- *batch++ = lower_32_bits(vma->node.start);
+ *batch++ = lower_32_bits(i915_vma_offset(vma));
} else {
*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
*batch++ = lower_32_bits(hws_address(hws, rq));
@@ -231,7 +220,7 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
*batch++ = MI_NOOP;
*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
- *batch++ = lower_32_bits(vma->node.start);
+ *batch++ = lower_32_bits(i915_vma_offset(vma));
}
*batch++ = MI_BATCH_BUFFER_END; /* not reached */
intel_gt_chipset_flush(engine->gt);
@@ -246,7 +235,7 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
if (GRAPHICS_VER(gt->i915) <= 5)
flags |= I915_DISPATCH_SECURE;
- err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
+ err = rq->engine->emit_bb_start(rq, i915_vma_offset(vma), PAGE_SIZE, flags);
cancel_rq:
if (err) {
@@ -330,7 +319,7 @@ static int igt_hang_sanitycheck(void *arg)
i915_request_add(rq);
timeout = 0;
- intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
+ intel_wedge_on_timeout(&w, gt, HZ / 5 /* 200ms */)
timeout = i915_request_wait(rq, 0,
MAX_SCHEDULE_TIMEOUT);
if (intel_gt_is_wedged(gt))
@@ -378,6 +367,7 @@ static int igt_reset_nop(void *arg)
ce = intel_context_create(engine);
if (IS_ERR(ce)) {
err = PTR_ERR(ce);
+ pr_err("[%s] Create context failed: %d!\n", engine->name, err);
break;
}
@@ -387,6 +377,8 @@ static int igt_reset_nop(void *arg)
rq = intel_context_create_request(ce);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
+ pr_err("[%s] Create request failed: %d!\n",
+ engine->name, err);
break;
}
@@ -401,24 +393,31 @@ static int igt_reset_nop(void *arg)
igt_global_reset_unlock(gt);
if (intel_gt_is_wedged(gt)) {
+ pr_err("[%s] GT is wedged!\n", engine->name);
err = -EIO;
break;
}
if (i915_reset_count(global) != reset_count + ++count) {
- pr_err("Full GPU reset not recorded!\n");
+ pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
+ engine->name, i915_reset_count(global), reset_count, count);
err = -EINVAL;
break;
}
err = igt_flush_test(gt->i915);
- if (err)
+ if (err) {
+ pr_err("[%s] Flush failed: %d!\n", engine->name, err);
break;
+ }
} while (time_before(jiffies, end_time));
pr_info("%s: %d resets\n", __func__, count);
- if (igt_flush_test(gt->i915))
+ if (igt_flush_test(gt->i915)) {
+ pr_err("Post flush failed: %d!\n", err);
err = -EIO;
+ }
+
return err;
}
@@ -440,16 +439,27 @@ static int igt_reset_nop_engine(void *arg)
IGT_TIMEOUT(end_time);
int err;
+ if (intel_engine_uses_guc(engine)) {
+ /* Engine level resets are triggered by GuC when a hang
+ * is detected. They can't be triggered by the KMD any
+ * more. Thus a nop batch cannot be used as a reset test
+ */
+ continue;
+ }
+
ce = intel_context_create(engine);
- if (IS_ERR(ce))
+ if (IS_ERR(ce)) {
+ pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
return PTR_ERR(ce);
+ }
reset_count = i915_reset_count(global);
reset_engine_count = i915_reset_engine_count(global, engine);
count = 0;
st_engine_heartbeat_disable(engine);
- set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
+ &gt->reset.flags));
do {
int i;
@@ -506,7 +516,7 @@ static int igt_reset_nop_engine(void *arg)
break;
}
} while (time_before(jiffies, end_time));
- clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
st_engine_heartbeat_enable(engine);
pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
@@ -538,7 +548,7 @@ static int igt_reset_fail_engine(void *arg)
struct intel_engine_cs *engine;
enum intel_engine_id id;
- /* Check that we can recover from engine-reset failues */
+ /* Check that we can recover from engine-reset failures */
if (!intel_has_reset_engine(gt))
return 0;
@@ -549,12 +559,19 @@ static int igt_reset_fail_engine(void *arg)
IGT_TIMEOUT(end_time);
int err;
+ /* Can't manually break the reset if i915 doesn't perform it */
+ if (intel_engine_uses_guc(engine))
+ continue;
+
ce = intel_context_create(engine);
- if (IS_ERR(ce))
+ if (IS_ERR(ce)) {
+ pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
return PTR_ERR(ce);
+ }
st_engine_heartbeat_disable(engine);
- set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
+ &gt->reset.flags));
force_reset_timeout(engine);
err = intel_engine_reset(engine, NULL);
@@ -651,7 +668,7 @@ static int igt_reset_fail_engine(void *arg)
out:
pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
- clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
st_engine_heartbeat_enable(engine);
intel_context_put(ce);
@@ -686,8 +703,12 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
for_each_engine(engine, gt, id) {
unsigned int reset_count, reset_engine_count;
unsigned long count;
+ bool using_guc = intel_engine_uses_guc(engine);
IGT_TIMEOUT(end_time);
+ if (using_guc && !active)
+ continue;
+
if (active && !intel_engine_can_store_dword(engine))
continue;
@@ -702,16 +723,28 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
reset_engine_count = i915_reset_engine_count(global, engine);
st_engine_heartbeat_disable(engine);
- set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
+ &gt->reset.flags));
count = 0;
do {
- if (active) {
- struct i915_request *rq;
+ struct i915_request *rq = NULL;
+ struct intel_selftest_saved_policy saved;
+ int err2;
+
+ err = intel_selftest_modify_policy(engine, &saved,
+ SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
+ if (err) {
+ pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
+ break;
+ }
+ if (active) {
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
- break;
+ pr_err("[%s] Create hang request failed: %d!\n",
+ engine->name, err);
+ goto restore;
}
i915_request_get(rq);
@@ -727,36 +760,61 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
i915_request_put(rq);
err = -EIO;
- break;
+ goto restore;
}
+ }
- i915_request_put(rq);
+ if (!using_guc) {
+ err = intel_engine_reset(engine, NULL);
+ if (err) {
+ pr_err("intel_engine_reset(%s) failed, err:%d\n",
+ engine->name, err);
+ goto skip;
+ }
}
- err = intel_engine_reset(engine, NULL);
- if (err) {
- pr_err("intel_engine_reset(%s) failed, err:%d\n",
- engine->name, err);
- break;
+ if (rq) {
+ /* Ensure the reset happens and kills the engine */
+ err = intel_selftest_wait_for_rq(rq);
+ if (err)
+ pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
+ engine->name, rq->fence.context,
+ rq->fence.seqno, rq->context->guc_id.id, err);
}
+skip:
+ if (rq)
+ i915_request_put(rq);
+
if (i915_reset_count(global) != reset_count) {
pr_err("Full GPU reset recorded! (engine reset expected)\n");
err = -EINVAL;
- break;
+ goto restore;
}
- if (i915_reset_engine_count(global, engine) !=
- ++reset_engine_count) {
- pr_err("%s engine reset not recorded!\n",
- engine->name);
- err = -EINVAL;
- break;
+ /* GuC based resets are not logged per engine */
+ if (!using_guc) {
+ if (i915_reset_engine_count(global, engine) !=
+ ++reset_engine_count) {
+ pr_err("%s engine reset not recorded!\n",
+ engine->name);
+ err = -EINVAL;
+ goto restore;
+ }
}
count++;
+
+restore:
+ err2 = intel_selftest_restore_policy(engine, &saved);
+ if (err2)
+ pr_err("[%s] Restore policy failed: %d!\n", engine->name, err);
+ if (err == 0)
+ err = err2;
+ if (err)
+ break;
} while (time_before(jiffies, end_time));
- clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
st_engine_heartbeat_enable(engine);
pr_info("%s: Completed %lu %s resets\n",
engine->name, count, active ? "active" : "idle");
@@ -765,12 +823,16 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
break;
err = igt_flush_test(gt->i915);
- if (err)
+ if (err) {
+ pr_err("[%s] Flush failed: %d!\n", engine->name, err);
break;
+ }
}
- if (intel_gt_is_wedged(gt))
+ if (intel_gt_is_wedged(gt)) {
+ pr_err("GT is wedged!\n");
err = -EIO;
+ }
if (active)
hang_fini(&h);
@@ -789,10 +851,13 @@ static int igt_reset_active_engine(void *arg)
}
struct active_engine {
- struct task_struct *task;
+ struct kthread_worker *worker;
+ struct kthread_work work;
struct intel_engine_cs *engine;
unsigned long resets;
unsigned int flags;
+ bool stop;
+ int result;
};
#define TEST_ACTIVE BIT(0)
@@ -807,7 +872,7 @@ static int active_request_put(struct i915_request *rq)
if (!rq)
return 0;
- if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
+ if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
rq->engine->name,
rq->fence.context,
@@ -823,10 +888,10 @@ static int active_request_put(struct i915_request *rq)
return err;
}
-static int active_engine(void *data)
+static void active_engine(struct kthread_work *work)
{
I915_RND_STATE(prng);
- struct active_engine *arg = data;
+ struct active_engine *arg = container_of(work, typeof(*arg), work);
struct intel_engine_cs *engine = arg->engine;
struct i915_request *rq[8] = {};
struct intel_context *ce[ARRAY_SIZE(rq)];
@@ -836,15 +901,17 @@ static int active_engine(void *data)
for (count = 0; count < ARRAY_SIZE(ce); count++) {
ce[count] = intel_context_create(engine);
if (IS_ERR(ce[count])) {
- err = PTR_ERR(ce[count]);
- while (--count)
+ arg->result = PTR_ERR(ce[count]);
+ pr_err("[%s] Create context #%ld failed: %d!\n",
+ engine->name, count, arg->result);
+ while (count--)
intel_context_put(ce[count]);
- return err;
+ return;
}
}
count = 0;
- while (!kthread_should_stop()) {
+ while (!READ_ONCE(arg->stop)) {
unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
struct i915_request *old = rq[idx];
struct i915_request *new;
@@ -852,23 +919,26 @@ static int active_engine(void *data)
new = intel_context_create_request(ce[idx]);
if (IS_ERR(new)) {
err = PTR_ERR(new);
+ pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
break;
}
rq[idx] = i915_request_get(new);
i915_request_add(new);
- if (engine->schedule && arg->flags & TEST_PRIORITY) {
+ if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
struct i915_sched_attr attr = {
.priority =
i915_prandom_u32_max_state(512, &prng),
};
- engine->schedule(rq[idx], &attr);
+ engine->sched_engine->schedule(rq[idx], &attr);
}
err = active_request_put(old);
- if (err)
+ if (err) {
+ pr_err("[%s] Request put failed: %d!\n", engine->name, err);
break;
+ }
cond_resched();
}
@@ -876,6 +946,9 @@ static int active_engine(void *data)
for (count = 0; count < ARRAY_SIZE(rq); count++) {
int err__ = active_request_put(rq[count]);
+ if (err)
+ pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err);
+
/* Keep the first error */
if (!err)
err = err__;
@@ -883,7 +956,7 @@ static int active_engine(void *data)
intel_context_put(ce[count]);
}
- return err;
+ arg->result = err;
}
static int __igt_reset_engines(struct intel_gt *gt,
@@ -892,6 +965,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
{
struct i915_gpu_error *global = &gt->i915->gpu_error;
struct intel_engine_cs *engine, *other;
+ struct active_engine *threads;
enum intel_engine_id id, tmp;
struct hang h;
int err = 0;
@@ -912,14 +986,20 @@ static int __igt_reset_engines(struct intel_gt *gt,
h.ctx->sched.priority = 1024;
}
+ threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
+ if (!threads)
+ return -ENOMEM;
+
for_each_engine(engine, gt, id) {
- struct active_engine threads[I915_NUM_ENGINES] = {};
unsigned long device = i915_reset_count(global);
unsigned long count = 0, reported;
+ bool using_guc = intel_engine_uses_guc(engine);
IGT_TIMEOUT(end_time);
- if (flags & TEST_ACTIVE &&
- !intel_engine_can_store_dword(engine))
+ if (flags & TEST_ACTIVE) {
+ if (!intel_engine_can_store_dword(engine))
+ continue;
+ } else if (using_guc)
continue;
if (!wait_for_idle(engine)) {
@@ -929,9 +1009,9 @@ static int __igt_reset_engines(struct intel_gt *gt,
break;
}
- memset(threads, 0, sizeof(threads));
+ memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
for_each_engine(other, gt, tmp) {
- struct task_struct *tsk;
+ struct kthread_worker *worker;
threads[tmp].resets =
i915_reset_engine_count(global, other);
@@ -945,29 +1025,44 @@ static int __igt_reset_engines(struct intel_gt *gt,
threads[tmp].engine = other;
threads[tmp].flags = flags;
- tsk = kthread_run(active_engine, &threads[tmp],
- "igt/%s", other->name);
- if (IS_ERR(tsk)) {
- err = PTR_ERR(tsk);
+ worker = kthread_run_worker(0, "igt/%s",
+ other->name);
+ if (IS_ERR(worker)) {
+ err = PTR_ERR(worker);
+ pr_err("[%s] Worker create failed: %d!\n",
+ engine->name, err);
goto unwind;
}
- threads[tmp].task = tsk;
- get_task_struct(tsk);
- }
+ threads[tmp].worker = worker;
- yield(); /* start all threads before we begin */
+ kthread_init_work(&threads[tmp].work, active_engine);
+ kthread_queue_work(threads[tmp].worker,
+ &threads[tmp].work);
+ }
- st_engine_heartbeat_disable(engine);
- set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ st_engine_heartbeat_disable_no_pm(engine);
+ GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
+ &gt->reset.flags));
do {
struct i915_request *rq = NULL;
+ struct intel_selftest_saved_policy saved;
+ int err2;
+
+ err = intel_selftest_modify_policy(engine, &saved,
+ SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
+ if (err) {
+ pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
+ break;
+ }
if (flags & TEST_ACTIVE) {
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
- break;
+ pr_err("[%s] Create hang request failed: %d!\n",
+ engine->name, err);
+ goto restore;
}
i915_request_get(rq);
@@ -983,32 +1078,44 @@ static int __igt_reset_engines(struct intel_gt *gt,
i915_request_put(rq);
err = -EIO;
- break;
+ goto restore;
}
+ } else {
+ intel_engine_pm_get(engine);
}
- err = intel_engine_reset(engine, NULL);
- if (err) {
- pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
- engine->name, test_name, err);
- break;
+ if (!using_guc) {
+ err = intel_engine_reset(engine, NULL);
+ if (err) {
+ pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
+ engine->name, test_name, err);
+ goto restore;
+ }
+ }
+
+ if (rq) {
+ /* Ensure the reset happens and kills the engine */
+ err = intel_selftest_wait_for_rq(rq);
+ if (err)
+ pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
+ engine->name, rq->fence.context,
+ rq->fence.seqno, rq->context->guc_id.id, err);
}
count++;
if (rq) {
if (rq->fence.error != -EIO) {
- pr_err("i915_reset_engine(%s:%s):"
- " failed to reset request %llx:%lld\n",
+ pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
engine->name, test_name,
rq->fence.context,
- rq->fence.seqno);
+ rq->fence.seqno, rq->context->guc_id.id);
i915_request_put(rq);
GEM_TRACE_DUMP();
intel_gt_set_wedged(gt);
err = -EIO;
- break;
+ goto restore;
}
if (i915_request_wait(rq, 0, HZ / 5) < 0) {
@@ -1027,12 +1134,15 @@ static int __igt_reset_engines(struct intel_gt *gt,
GEM_TRACE_DUMP();
intel_gt_set_wedged(gt);
err = -EIO;
- break;
+ goto restore;
}
i915_request_put(rq);
}
+ if (!(flags & TEST_ACTIVE))
+ intel_engine_pm_put(engine);
+
if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
struct drm_printer p =
drm_info_printer(gt->i915->drm.dev);
@@ -1044,49 +1154,67 @@ static int __igt_reset_engines(struct intel_gt *gt,
"%s\n", engine->name);
err = -EIO;
- break;
+ goto restore;
}
+
+restore:
+ err2 = intel_selftest_restore_policy(engine, &saved);
+ if (err2)
+ pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
+ if (err == 0)
+ err = err2;
+ if (err)
+ break;
} while (time_before(jiffies, end_time));
- clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
- st_engine_heartbeat_enable(engine);
+ clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
+ st_engine_heartbeat_enable_no_pm(engine);
pr_info("i915_reset_engine(%s:%s): %lu resets\n",
engine->name, test_name, count);
- reported = i915_reset_engine_count(global, engine);
- reported -= threads[engine->id].resets;
- if (reported != count) {
- pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
- engine->name, test_name, count, reported);
- if (!err)
- err = -EINVAL;
+ /* GuC based resets are not logged per engine */
+ if (!using_guc) {
+ reported = i915_reset_engine_count(global, engine);
+ reported -= threads[engine->id].resets;
+ if (reported != count) {
+ pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+ engine->name, test_name, count, reported);
+ if (!err)
+ err = -EINVAL;
+ }
}
unwind:
for_each_engine(other, gt, tmp) {
int ret;
- if (!threads[tmp].task)
+ if (!threads[tmp].worker)
continue;
- ret = kthread_stop(threads[tmp].task);
+ WRITE_ONCE(threads[tmp].stop, true);
+ kthread_flush_work(&threads[tmp].work);
+ ret = READ_ONCE(threads[tmp].result);
if (ret) {
pr_err("kthread for other engine %s failed, err=%d\n",
other->name, ret);
if (!err)
err = ret;
}
- put_task_struct(threads[tmp].task);
-
- if (other->uabi_class != engine->uabi_class &&
- threads[tmp].resets !=
- i915_reset_engine_count(global, other)) {
- pr_err("Innocent engine %s was reset (count=%ld)\n",
- other->name,
- i915_reset_engine_count(global, other) -
- threads[tmp].resets);
- if (!err)
- err = -EINVAL;
+
+ kthread_destroy_worker(threads[tmp].worker);
+
+ /* GuC based resets are not logged per engine */
+ if (!using_guc) {
+ if (other->uabi_class != engine->uabi_class &&
+ threads[tmp].resets !=
+ i915_reset_engine_count(global, other)) {
+ pr_err("Innocent engine %s was reset (count=%ld)\n",
+ other->name,
+ i915_reset_engine_count(global, other) -
+ threads[tmp].resets);
+ if (!err)
+ err = -EINVAL;
+ }
}
}
@@ -1101,9 +1229,12 @@ unwind:
break;
err = igt_flush_test(gt->i915);
- if (err)
+ if (err) {
+ pr_err("[%s] Flush failed: %d!\n", engine->name, err);
break;
+ }
}
+ kfree(threads);
if (intel_gt_is_wedged(gt))
err = -EIO;
@@ -1165,13 +1296,15 @@ static int igt_reset_wait(void *arg)
{
struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
- struct intel_engine_cs *engine = gt->engine[RCS0];
+ struct intel_engine_cs *engine;
struct i915_request *rq;
unsigned int reset_count;
struct hang h;
long timeout;
int err;
+ engine = intel_selftest_find_any_engine(gt);
+
if (!engine || !intel_engine_can_store_dword(engine))
return 0;
@@ -1180,12 +1313,15 @@ static int igt_reset_wait(void *arg)
igt_global_reset_lock(gt);
err = hang_init(&h, gt);
- if (err)
+ if (err) {
+ pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
goto unlock;
+ }
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
+ pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
goto fini;
}
@@ -1249,7 +1385,7 @@ static int evict_vma(void *data)
complete(&arg->completion);
mutex_lock(&vm->mutex);
- err = i915_gem_evict_for_node(vm, &evict, 0);
+ err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
mutex_unlock(&vm->mutex);
return err;
@@ -1292,7 +1428,7 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
int (*fn)(void *),
unsigned int flags)
{
- struct intel_engine_cs *engine = gt->engine[RCS0];
+ struct intel_engine_cs *engine;
struct drm_i915_gem_object *obj;
struct task_struct *tsk = NULL;
struct i915_request *rq;
@@ -1304,18 +1440,23 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
return 0;
+ engine = intel_selftest_find_any_engine(gt);
+
if (!engine || !intel_engine_can_store_dword(engine))
return 0;
/* Check that we can recover an unbind stuck on a hanging request */
err = hang_init(&h, gt);
- if (err)
+ if (err) {
+ pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
return err;
+ }
obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
+ pr_err("[%s] Create object failed: %d!\n", engine->name, err);
goto fini;
}
@@ -1330,12 +1471,14 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
arg.vma = i915_vma_instance(obj, vm, NULL);
if (IS_ERR(arg.vma)) {
err = PTR_ERR(arg.vma);
+ pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
goto out_obj;
}
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
+ pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
goto out_obj;
}
@@ -1347,6 +1490,7 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
if (err) {
i915_request_add(rq);
+ pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
goto out_obj;
}
@@ -1360,12 +1504,9 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
}
}
- i915_vma_lock(arg.vma);
- err = i915_request_await_object(rq, arg.vma->obj,
- flags & EXEC_OBJECT_WRITE);
- if (err == 0)
- err = i915_vma_move_to_active(arg.vma, rq, flags);
- i915_vma_unlock(arg.vma);
+ err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags);
+ if (err)
+ pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
if (flags & EXEC_OBJECT_NEEDS_FENCE)
i915_vma_unpin_fence(arg.vma);
@@ -1392,6 +1533,7 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
tsk = kthread_run(fn, &arg, "igt/evict_vma");
if (IS_ERR(tsk)) {
err = PTR_ERR(tsk);
+ pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
tsk = NULL;
goto out_reset;
}
@@ -1454,7 +1596,7 @@ static int igt_reset_evict_ppgtt(void *arg)
if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
return 0;
- ppgtt = i915_ppgtt_create(gt);
+ ppgtt = i915_ppgtt_create(gt, 0);
if (IS_ERR(ppgtt))
return PTR_ERR(ppgtt);
@@ -1508,17 +1650,29 @@ static int igt_reset_queue(void *arg)
goto unlock;
for_each_engine(engine, gt, id) {
+ struct intel_selftest_saved_policy saved;
struct i915_request *prev;
IGT_TIMEOUT(end_time);
unsigned int count;
+ bool using_guc = intel_engine_uses_guc(engine);
if (!intel_engine_can_store_dword(engine))
continue;
+ if (using_guc) {
+ err = intel_selftest_modify_policy(engine, &saved,
+ SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
+ if (err) {
+ pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
+ goto fini;
+ }
+ }
+
prev = hang_create_request(&h, engine);
if (IS_ERR(prev)) {
err = PTR_ERR(prev);
- goto fini;
+ pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
+ goto restore;
}
i915_request_get(prev);
@@ -1532,7 +1686,8 @@ static int igt_reset_queue(void *arg)
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
- goto fini;
+ pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
+ goto restore;
}
i915_request_get(rq);
@@ -1557,7 +1712,7 @@ static int igt_reset_queue(void *arg)
GEM_TRACE_DUMP();
intel_gt_set_wedged(gt);
- goto fini;
+ goto restore;
}
if (!wait_until_running(&h, prev)) {
@@ -1575,7 +1730,7 @@ static int igt_reset_queue(void *arg)
intel_gt_set_wedged(gt);
err = -EIO;
- goto fini;
+ goto restore;
}
reset_count = fake_hangcheck(gt, BIT(id));
@@ -1586,7 +1741,7 @@ static int igt_reset_queue(void *arg)
i915_request_put(rq);
i915_request_put(prev);
err = -EINVAL;
- goto fini;
+ goto restore;
}
if (rq->fence.error) {
@@ -1595,7 +1750,7 @@ static int igt_reset_queue(void *arg)
i915_request_put(rq);
i915_request_put(prev);
err = -EINVAL;
- goto fini;
+ goto restore;
}
if (i915_reset_count(global) == reset_count) {
@@ -1603,7 +1758,7 @@ static int igt_reset_queue(void *arg)
i915_request_put(rq);
i915_request_put(prev);
err = -EINVAL;
- goto fini;
+ goto restore;
}
i915_request_put(prev);
@@ -1618,9 +1773,24 @@ static int igt_reset_queue(void *arg)
i915_request_put(prev);
- err = igt_flush_test(gt->i915);
+restore:
+ if (using_guc) {
+ int err2 = intel_selftest_restore_policy(engine, &saved);
+
+ if (err2)
+ pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
+ __func__, __LINE__, engine->name, err2);
+ if (err == 0)
+ err = err2;
+ }
if (err)
+ goto fini;
+
+ err = igt_flush_test(gt->i915);
+ if (err) {
+ pr_err("[%s] Flush failed: %d!\n", engine->name, err);
break;
+ }
}
fini:
@@ -1638,12 +1808,14 @@ static int igt_handle_error(void *arg)
{
struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
- struct intel_engine_cs *engine = gt->engine[RCS0];
+ struct intel_engine_cs *engine;
struct hang h;
struct i915_request *rq;
struct i915_gpu_coredump *error;
int err;
+ engine = intel_selftest_find_any_engine(gt);
+
/* Check that we can issue a global GPU and engine reset */
if (!intel_has_reset_engine(gt))
@@ -1653,12 +1825,15 @@ static int igt_handle_error(void *arg)
return 0;
err = hang_init(&h, gt);
- if (err)
+ if (err) {
+ pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
return err;
+ }
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
+ pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
goto err_fini;
}
@@ -1702,7 +1877,7 @@ static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
const struct igt_atomic_section *p,
const char *mode)
{
- struct tasklet_struct * const t = &engine->execlists.tasklet;
+ struct tasklet_struct * const t = &engine->sched_engine->tasklet;
int err;
GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
@@ -1743,12 +1918,15 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
return err;
err = hang_init(&h, engine->gt);
- if (err)
+ if (err) {
+ pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
return err;
+ }
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
+ pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
goto out;
}
@@ -1838,7 +2016,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
SUBTEST(igt_reset_evict_fence),
SUBTEST(igt_handle_error),
};
- struct intel_gt *gt = &i915->gt;
+ struct intel_gt *gt = to_gt(i915);
intel_wakeref_t wakeref;
int err;