summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/drm/i915/i915_debugfs.c38
-rw-r--r--drivers/gpu/drm/i915/i915_drv.h5
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c18
-rw-r--r--drivers/gpu/drm/i915/i915_gem_fence_reg.h1
-rw-r--r--drivers/gpu/drm/i915/i915_gem_gtt.h1
-rw-r--r--drivers/gpu/drm/i915/i915_gpu_error.c104
-rw-r--r--drivers/gpu/drm/i915/i915_gpu_error.h28
-rw-r--r--drivers/gpu/drm/i915/i915_request.c47
-rw-r--r--drivers/gpu/drm/i915/i915_reset.c392
-rw-r--r--drivers/gpu/drm/i915/i915_reset.h3
-rw-r--r--drivers/gpu/drm/i915/intel_engine_cs.c6
-rw-r--r--drivers/gpu/drm/i915/intel_guc_submission.c5
-rw-r--r--drivers/gpu/drm/i915/intel_hangcheck.c28
-rw-r--r--drivers/gpu/drm/i915/intel_lrc.c91
-rw-r--r--drivers/gpu/drm/i915/intel_overlay.c2
-rw-r--r--drivers/gpu/drm/i915/intel_ringbuffer.c91
-rw-r--r--drivers/gpu/drm/i915/intel_ringbuffer.h17
-rw-r--r--drivers/gpu/drm/i915/selftests/intel_hangcheck.c57
-rw-r--r--drivers/gpu/drm/i915/selftests/intel_workarounds.c3
-rw-r--r--drivers/gpu/drm/i915/selftests/mock_gem_device.c4
20 files changed, 404 insertions, 537 deletions
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 9a9e1da496dc..76dea0572f3e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
seq_puts(m, "Wedged\n");
if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
seq_puts(m, "Reset in progress: struct_mutex backoff\n");
- if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
- seq_puts(m, "Reset in progress: reset handoff to waiter\n");
if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
seq_puts(m, "Waiter holding struct mutex\n");
if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
struct rb_node *rb;
seq_printf(m, "%s:\n", engine->name);
- seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+ seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
engine->hangcheck.seqno, seqno[id],
- intel_engine_last_submit(engine));
- seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+ intel_engine_last_submit(engine),
+ jiffies_to_msecs(jiffies -
+ engine->hangcheck.action_timestamp));
+ seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
yesno(intel_engine_has_waiter(engine)),
yesno(test_bit(engine->id,
- &dev_priv->gpu_error.missed_irq_rings)),
- yesno(engine->hangcheck.stalled),
- yesno(engine->hangcheck.wedged));
+ &dev_priv->gpu_error.missed_irq_rings)));
spin_lock_irq(&b->rb_lock);
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
(long long)engine->hangcheck.acthd,
(long long)acthd[id]);
- seq_printf(m, "\taction = %s(%d) %d ms ago\n",
- hangcheck_action_to_str(engine->hangcheck.action),
- engine->hangcheck.action,
- jiffies_to_msecs(jiffies -
- engine->hangcheck.action_timestamp));
if (engine->id == RCS) {
seq_puts(m, "\tinstdone read =\n");
@@ -3911,8 +3904,6 @@ static int
i915_wedged_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
- struct intel_engine_cs *engine;
- unsigned int tmp;
/*
* There is no safeguard against this debugfs entry colliding
@@ -3925,18 +3916,8 @@ i915_wedged_set(void *data, u64 val)
if (i915_reset_backoff(&i915->gpu_error))
return -EAGAIN;
- for_each_engine_masked(engine, i915, val, tmp) {
- engine->hangcheck.seqno = intel_engine_get_seqno(engine);
- engine->hangcheck.stalled = true;
- }
-
i915_handle_error(i915, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
-
- wait_on_bit(&i915->gpu_error.flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE);
-
return 0;
}
@@ -4091,13 +4072,8 @@ i915_drop_caches_set(void *data, u64 val)
mutex_unlock(&i915->drm.struct_mutex);
}
- if (val & DROP_RESET_ACTIVE &&
- i915_terminally_wedged(&i915->gpu_error)) {
+ if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
i915_handle_error(i915, ALL_ENGINES, 0, NULL);
- wait_on_bit(&i915->gpu_error.flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE);
- }
fs_reclaim_acquire(GFP_KERNEL);
if (val & DROP_BOUND)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3c111ad09922..0133d1da3d3c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
}
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
- return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_WEDGED, &error->flags));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b359390ba22c..d20b42386c3c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
struct intel_rps_client *rps_client)
{
might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
- GEM_BUG_ON(debug_locks &&
- !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
- !!(flags & I915_WAIT_LOCKED));
-#endif
GEM_BUG_ON(timeout < 0);
timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
GEM_TRACE("\n");
- mutex_lock(&i915->drm.struct_mutex);
-
wakeref = intel_runtime_pm_get(i915);
intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
@@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
intel_runtime_pm_put(i915, wakeref);
+ mutex_lock(&i915->drm.struct_mutex);
i915_gem_contexts_lost(i915);
mutex_unlock(&i915->drm.struct_mutex);
}
@@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
wakeref = intel_runtime_pm_get(i915);
intel_suspend_gt_powersave(i915);
+ flush_workqueue(i915->wq);
+
mutex_lock(&i915->drm.struct_mutex);
/*
@@ -4563,11 +4559,9 @@ int i915_gem_suspend(struct drm_i915_private *i915)
i915_retire_requests(i915); /* ensure we flush after wedging */
mutex_unlock(&i915->drm.struct_mutex);
+ i915_reset_flush(i915);
- intel_uc_suspend(i915);
-
- cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
- cancel_delayed_work_sync(&i915->gt.retire_work);
+ drain_delayed_work(&i915->gt.retire_work);
/*
* As the idle_work is rearming if it detects a race, play safe and
@@ -4575,6 +4569,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
*/
drain_delayed_work(&i915->gt.idle_work);
+ intel_uc_suspend(i915);
+
/*
* Assert that we successfully flushed all the work and
* reset the GPU back to its idle, low power state.
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 99a31ded4dfd..09dcaf14121b 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
};
#endif
-
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 9229b03d629b..a0039ea97cdc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -39,6 +39,7 @@
#include <linux/pagevec.h>
#include "i915_request.h"
+#include "i915_reset.h"
#include "i915_selftest.h"
#include "i915_timeline.h"
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1f8e80e31b49..4eef0462489c 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
- err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
- err_printf(m, " hangcheck action: %s\n",
- hangcheck_action_to_str(ee->hangcheck_action));
- err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n",
+ err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
ee->hangcheck_timestamp,
ee->hangcheck_timestamp == epoch ? "; epoch" : "");
@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
jiffies_to_msecs(error->capture - error->epoch));
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- if (error->engine[i].hangcheck_stalled &&
- error->engine[i].context.pid) {
- err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
- engine_name(m->i915, i),
- error->engine[i].context.comm,
- error->engine[i].context.pid,
- error->engine[i].context.ban_score,
- bannable(&error->engine[i].context));
- }
+ if (!error->engine[i].context.pid)
+ continue;
+
+ err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
+ engine_name(m->i915, i),
+ error->engine[i].context.comm,
+ error->engine[i].context.pid,
+ error->engine[i].context.ban_score,
+ bannable(&error->engine[i].context));
}
err_printf(m, "Reset count: %u\n", error->reset_count);
err_printf(m, "Suspend count: %u\n", error->suspend_count);
@@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
return i;
}
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
+/*
+ * Generate a semi-unique error code. The code is not meant to have meaning, The
* code's only purpose is to try to prevent false duplicated bug reports by
* grossly estimating a GPU error state.
*
@@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
*
* It's only a small step better than a random number in its current form.
*/
-static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
- struct i915_gpu_state *error,
- int *engine_id)
+static u32 i915_error_generate_code(struct i915_gpu_state *error,
+ unsigned long engine_mask)
{
- u32 error_code = 0;
- int i;
-
- /* IPEHR would be an ideal way to detect errors, as it's the gross
+ /*
+ * IPEHR would be an ideal way to detect errors, as it's the gross
* measure of "the command that hung." However, has some very common
* synchronization commands which almost always appear in the case
* strictly a client bug. Use instdone to differentiate those some.
*/
- for (i = 0; i < I915_NUM_ENGINES; i++) {
- if (error->engine[i].hangcheck_stalled) {
- if (engine_id)
- *engine_id = i;
+ if (engine_mask) {
+ struct drm_i915_error_engine *ee =
+ &error->engine[ffs(engine_mask)];
- return error->engine[i].ipehr ^
- error->engine[i].instdone.instdone;
- }
+ return ee->ipehr ^ ee->instdone.instdone;
}
- return error_code;
+ return 0;
}
static void gem_record_fences(struct i915_gpu_state *error)
@@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
}
ee->idle = intel_engine_is_idle(engine);
- ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
- ee->hangcheck_action = engine->hangcheck.action;
- ee->hangcheck_stalled = engine->hangcheck.stalled;
+ if (!ee->idle)
+ ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
engine);
@@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
error->pgtbl_er = I915_READ(PGTBL_ER);
}
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
- struct i915_gpu_state *error,
- u32 engine_mask,
- const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
{
- u32 ecode;
- int engine_id = -1, len;
+ int len;
+ int i;
- ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+ for (i = 0; i < ARRAY_SIZE(error->engine); i++)
+ if (!error->engine[i].context.pid)
+ engines &= ~BIT(i);
len = scnprintf(error->error_msg, sizeof(error->error_msg),
- "GPU HANG: ecode %d:%d:0x%08x",
- INTEL_GEN(dev_priv), engine_id, ecode);
-
- if (engine_id != -1 && error->engine[engine_id].context.pid)
+ "GPU HANG: ecode %d:%lx:0x%08x",
+ INTEL_GEN(error->i915), engines,
+ i915_error_generate_code(error, engines));
+ if (engines) {
+ /* Just show the first executing process, more is confusing */
+ i = ffs(engines);
len += scnprintf(error->error_msg + len,
sizeof(error->error_msg) - len,
", in %s [%d]",
- error->engine[engine_id].context.comm,
- error->engine[engine_id].context.pid);
+ error->engine[i].context.comm,
+ error->engine[i].context.pid);
+ }
+ if (msg)
+ len += scnprintf(error->error_msg + len,
+ sizeof(error->error_msg) - len,
+ ", %s", msg);
- scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
- ", reason: %s, action: %s",
- error_msg,
- engine_mask ? "reset" : "continue");
+ return error->error_msg;
}
static void capture_gen_state(struct i915_gpu_state *error)
@@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
const struct drm_i915_error_engine *ee = &error->engine[i];
- if (ee->hangcheck_stalled &&
+ if (ee->hangcheck_timestamp &&
time_before(ee->hangcheck_timestamp, epoch))
epoch = ee->hangcheck_timestamp;
}
@@ -1921,7 +1916,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* i915_capture_error_state - capture an error record for later analysis
* @i915: i915 device
* @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: a message to insert into the error capture header
*
* Should be called when an error is detected (either a hang or an error
* interrupt) to capture error state from the time of the error. Fills
@@ -1929,8 +1924,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *i915,
- u32 engine_mask,
- const char *error_msg)
+ unsigned long engine_mask,
+ const char *msg)
{
static bool warned;
struct i915_gpu_state *error;
@@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
if (IS_ERR(error))
return;
- i915_error_capture_msg(i915, error, engine_mask, error_msg);
- DRM_INFO("%s\n", error->error_msg);
+ dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
if (!error->simulated) {
spin_lock_irqsave(&i915->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 604291f7762d..231173786eae 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -85,8 +85,6 @@ struct i915_gpu_state {
bool waiting;
int num_waiters;
unsigned long hangcheck_timestamp;
- bool hangcheck_stalled;
- enum intel_engine_hangcheck_action hangcheck_action;
struct i915_address_space *vm;
int num_requests;
u32 reset_count;
@@ -197,6 +195,8 @@ struct i915_gpu_state {
struct scatterlist *sgl, *fit;
};
+struct i915_gpu_restart;
+
struct i915_gpu_error {
/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
@@ -247,15 +247,6 @@ struct i915_gpu_error {
* i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
* secondary role in preventing two concurrent global reset attempts.
*
- * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
- * struct_mutex. We try to acquire the struct_mutex in the reset worker,
- * but it may be held by some long running waiter (that we cannot
- * interrupt without causing trouble). Once we are ready to do the GPU
- * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
- * they already hold the struct_mutex and want to participate they can
- * inspect the bit and do the reset directly, otherwise the worker
- * waits for the struct_mutex.
- *
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset attempts in the same engine.
@@ -269,20 +260,13 @@ struct i915_gpu_error {
*/
unsigned long flags;
#define I915_RESET_BACKOFF 0
-#define I915_RESET_HANDOFF 1
-#define I915_RESET_MODESET 2
-#define I915_RESET_ENGINE 3
+#define I915_RESET_MODESET 1
+#define I915_RESET_ENGINE 2
#define I915_WEDGED (BITS_PER_LONG - 1)
/** Number of times an engine has been reset */
u32 reset_engine_count[I915_NUM_ENGINES];
- /** Set of stalled engines with guilty requests, in the current reset */
- u32 stalled_mask;
-
- /** Reason for the current *global* reset */
- const char *reason;
-
struct mutex wedge_mutex; /* serialises wedging/unwedging */
/**
@@ -299,6 +283,8 @@ struct i915_gpu_error {
/* For missed irq/seqno simulation. */
unsigned long test_irq_rings;
+
+ struct i915_gpu_restart *restart;
};
struct drm_i915_error_state_buf {
@@ -320,7 +306,7 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
- u32 engine_mask,
+ unsigned long engine_mask,
const char *error_msg);
static inline struct i915_gpu_state *
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index ddc35e9dc0c0..f4241a17e2ad 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1083,18 +1083,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
return false;
}
-static bool __i915_wait_request_check_and_reset(struct i915_request *request)
-{
- struct i915_gpu_error *error = &request->i915->gpu_error;
-
- if (likely(!i915_reset_handoff(error)))
- return false;
-
- __set_current_state(TASK_RUNNING);
- i915_reset(request->i915, error->stalled_mask, error->reason);
- return true;
-}
-
/**
* i915_request_wait - wait until execution of request has finished
* @rq: the request to wait upon
@@ -1120,17 +1108,10 @@ long i915_request_wait(struct i915_request *rq,
{
const int state = flags & I915_WAIT_INTERRUPTIBLE ?
TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
- wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
- DEFINE_WAIT_FUNC(reset, default_wake_function);
DEFINE_WAIT_FUNC(exec, default_wake_function);
struct intel_wait wait;
might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
- GEM_BUG_ON(debug_locks &&
- !!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
- !!(flags & I915_WAIT_LOCKED));
-#endif
GEM_BUG_ON(timeout < 0);
if (i915_request_completed(rq))
@@ -1140,11 +1121,7 @@ long i915_request_wait(struct i915_request *rq,
return -ETIME;
trace_i915_request_wait_begin(rq, flags);
-
add_wait_queue(&rq->execute, &exec);
- if (flags & I915_WAIT_LOCKED)
- add_wait_queue(errq, &reset);
-
intel_wait_init(&wait);
if (flags & I915_WAIT_PRIORITY)
i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
@@ -1155,10 +1132,6 @@ restart:
if (intel_wait_update_request(&wait, rq))
break;
- if (flags & I915_WAIT_LOCKED &&
- __i915_wait_request_check_and_reset(rq))
- continue;
-
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
goto complete;
@@ -1188,9 +1161,6 @@ restart:
*/
goto wakeup;
- if (flags & I915_WAIT_LOCKED)
- __i915_wait_request_check_and_reset(rq);
-
for (;;) {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
@@ -1214,21 +1184,6 @@ wakeup:
if (i915_request_completed(rq))
break;
- /*
- * If the GPU is hung, and we hold the lock, reset the GPU
- * and then check for completion. On a full reset, the engine's
- * HW seqno will be advanced passed us and we are complete.
- * If we do a partial reset, we have to wait for the GPU to
- * resume and update the breadcrumb.
- *
- * If we don't hold the mutex, we can just wait for the worker
- * to come along and update the breadcrumb (either directly
- * itself, or indirectly by recovering the GPU).
- */
- if (flags & I915_WAIT_LOCKED &&
- __i915_wait_request_check_and_reset(rq))
- continue;
-
/* Only spin if we know the GPU is processing this request */
if (__i915_spin_request(rq, wait.seqno, state, 2))
break;
@@ -1242,8 +1197,6 @@ wakeup:
intel_engine_remove_wait(rq->engine, &wait);
complete:
__set_current_state(TASK_RUNNING);
- if (flags & I915_WAIT_LOCKED)
- remove_wait_queue(errq, &reset);
remove_wait_queue(&rq->execute, &exec);
trace_i915_request_wait_end(rq);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 33408c4e6358..68af017ee548 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -5,6 +5,7 @@
*/
#include <linux/sched/mm.h>
+#include <linux/stop_machine.h>
#include "i915_drv.h"
#include "i915_gpu_error.h"
@@ -14,27 +15,31 @@
#define RESET_MAX_RETRIES 3
+/* XXX How to handle concurrent GGTT updates using tiling registers? */
+#define RESET_UNDER_STOP_MACHINE 0
+
static void engine_skip_context(struct i915_request *rq)
{
struct intel_engine_cs *engine = rq->engine;
struct i915_gem_context *hung_ctx = rq->gem_context;
struct i915_timeline *timeline = rq->timeline;
- unsigned long flags;
+ lockdep_assert_held(&engine->timeline.lock);
GEM_BUG_ON(timeline == &engine->timeline);
- spin_lock_irqsave(&engine->timeline.lock, flags);
spin_lock(&timeline->lock);
- list_for_each_entry_continue(rq, &engine->timeline.requests, link)
- if (rq->gem_context == hung_ctx)
- i915_request_skip(rq, -EIO);
+ if (rq->global_seqno) {
+ list_for_each_entry_continue(rq,
+ &engine->timeline.requests, link)
+ if (rq->gem_context == hung_ctx)
+ i915_request_skip(rq, -EIO);
+ }
list_for_each_entry(rq, &timeline->requests, link)
i915_request_skip(rq, -EIO);
spin_unlock(&timeline->lock);
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
@@ -61,7 +66,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
}
}
-static void context_mark_guilty(struct i915_gem_context *ctx)
+static bool context_mark_guilty(struct i915_gem_context *ctx)
{
unsigned int score;
bool banned, bannable;
@@ -74,7 +79,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
/* Cool contexts don't accumulate client ban score */
if (!bannable)
- return;
+ return false;
if (banned) {
DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
@@ -85,6 +90,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
if (!IS_ERR_OR_NULL(ctx->file_priv))
client_mark_guilty(ctx->file_priv, ctx);
+
+ return banned;
}
static void context_mark_innocent(struct i915_gem_context *ctx)
@@ -92,6 +99,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
atomic_inc(&ctx->active_count);
}
+void i915_reset_request(struct i915_request *rq, bool guilty)
+{
+ lockdep_assert_held(&rq->engine->timeline.lock);
+ GEM_BUG_ON(i915_request_completed(rq));
+
+ if (guilty) {
+ i915_request_skip(rq, -EIO);
+ if (context_mark_guilty(rq->gem_context))
+ engine_skip_context(rq);
+ } else {
+ dma_fence_set_error(&rq->fence, -EAGAIN);
+ context_mark_innocent(rq->gem_context);
+ }
+}
+
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
@@ -604,11 +626,8 @@ int intel_reset_guc(struct drm_i915_private *i915)
* Ensure irq handler finishes, and not run again.
* Also return the active request so that we only search for it once.
*/
-static struct i915_request *
-reset_prepare_engine(struct intel_engine_cs *engine)
+static void reset_prepare_engine(struct intel_engine_cs *engine)
{
- struct i915_request *rq;
-
/*
* During the reset sequence, we must prevent the engine from
* entering RC6. As the context state is undefined until we restart
@@ -617,162 +636,85 @@ reset_prepare_engine(struct intel_engine_cs *engine)
* GPU state upon resume, i.e. fail to restart after a reset.
*/
intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
- rq = engine->reset.prepare(engine);
- if (rq && rq->fence.error == -EIO)
- rq = ERR_PTR(-EIO); /* Previous reset failed! */
-
- return rq;
+ engine->reset.prepare(engine);
}
-static int reset_prepare(struct drm_i915_private *i915)
+static void reset_prepare(struct drm_i915_private *i915)
{
struct intel_engine_cs *engine;
- struct i915_request *rq;
enum intel_engine_id id;
- int err = 0;
- for_each_engine(engine, i915, id) {
- rq = reset_prepare_engine(engine);
- if (IS_ERR(rq)) {
- err = PTR_ERR(rq);
- continue;
- }
-
- engine->hangcheck.active_request = rq;
- }
+ for_each_engine(engine, i915, id)
+ reset_prepare_engine(engine);
- i915_gem_revoke_fences(i915);
intel_uc_sanitize(i915);
-
- return err;
}
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-reset_request(struct intel_engine_cs *engine,
- struct i915_request *rq,
- bool stalled)
+static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ int err;
+
/*
- * The guilty request will get skipped on a hung engine.
- *
- * Users of client default contexts do not rely on logical
- * state preserved between batches so it is safe to execute
- * queued requests following the hang. Non default contexts
- * rely on preserved state, so skipping a batch loses the
- * evolution of the state and it needs to be considered corrupted.
- * Executing more queued batches on top of corrupted state is
- * risky. But we take the risk by trying to advance through
- * the queued requests in order to make the client behaviour
- * more predictable around resets, by not throwing away random
- * amount of batches it has prepared for execution. Sophisticated
- * clients can use gem_reset_stats_ioctl and dma fence status
- * (exported via sync_file info ioctl on explicit fences) to observe
- * when it loses the context state and should rebuild accordingly.
- *
- * The context ban, and ultimately the client ban, mechanism are safety
- * valves if client submission ends up resulting in nothing more than
- * subsequent hangs.
+ * Everything depends on having the GTT running, so we need to start
+ * there.
*/
+ err = i915_ggtt_enable_hw(i915);
+ if (err)
+ return err;
- if (i915_request_completed(rq)) {
- GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
- engine->name, rq->global_seqno,
- rq->fence.context, rq->fence.seqno,
- intel_engine_get_seqno(engine));
- stalled = false;
- }
-
- if (stalled) {
- context_mark_guilty(rq->gem_context);
- i915_request_skip(rq, -EIO);
+ for_each_engine(engine, i915, id)
+ intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
- /* If this context is now banned, skip all pending requests. */
- if (i915_gem_context_is_banned(rq->gem_context))
- engine_skip_context(rq);
- } else {
- /*
- * Since this is not the hung engine, it may have advanced
- * since the hang declaration. Double check by refinding
- * the active request at the time of the reset.
- */
- rq = i915_gem_find_active_request(engine);
- if (rq) {
- unsigned long flags;
-
- context_mark_innocent(rq->gem_context);
- dma_fence_set_error(&rq->fence, -EAGAIN);
-
- /* Rewind the engine to replay the incomplete rq */
- spin_lock_irqsave(&engine->timeline.lock, flags);
- rq = list_prev_entry(rq, link);
- if (&rq->link == &engine->timeline.requests)
- rq = NULL;
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
- }
- }
+ i915_gem_restore_fences(i915);
- return rq;
+ return err;
}
-static void reset_engine(struct intel_engine_cs *engine,
- struct i915_request *rq,
- bool stalled)
+static void reset_finish_engine(struct intel_engine_cs *engine)
{
- if (rq)
- rq = reset_request(engine, rq, stalled);
-
- /* Setup the CS to resume from the breadcrumb of the hung request */
- engine->reset.reset(engine, rq);
+ engine->reset.finish(engine);
+ intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
}
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+struct i915_gpu_restart {
+ struct work_struct work;
+ struct drm_i915_private *i915;
+};
+
+static void restart_work(struct work_struct *work)
{
+ struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
+ struct drm_i915_private *i915 = arg->i915;
struct intel_engine_cs *engine;
enum intel_engine_id id;
+ intel_wakeref_t wakeref;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
- i915_retire_requests(i915);
+ wakeref = intel_runtime_pm_get(i915);
+ mutex_lock(&i915->drm.struct_mutex);
+ WRITE_ONCE(i915->gpu_error.restart, NULL);
for_each_engine(engine, i915, id) {
- struct intel_context *ce;
-
- reset_engine(engine,
- engine->hangcheck.active_request,
- stalled_mask & ENGINE_MASK(id));
- ce = fetch_and_zero(&engine->last_retired_context);
- if (ce)
- intel_context_unpin(ce);
+ struct i915_request *rq;
/*
* Ostensibily, we always want a context loaded for powersaving,
* so if the engine is idle after the reset, send a request
* to load our scratch kernel_context.
- *
- * More mysteriously, if we leave the engine idle after a reset,
- * the next userspace batch may hang, with what appears to be
- * an incoherent read by the CS (presumably stale TLB). An
- * empty request appears sufficient to paper over the glitch.
*/
- if (intel_engine_is_idle(engine)) {
- struct i915_request *rq;
+ if (!intel_engine_is_idle(engine))
+ continue;
- rq = i915_request_alloc(engine, i915->kernel_context);
- if (!IS_ERR(rq))
- i915_request_add(rq);
- }
+ rq = i915_request_alloc(engine, i915->kernel_context);
+ if (!IS_ERR(rq))
+ i915_request_add(rq);
}
- i915_gem_restore_fences(i915);
-}
-
-static void reset_finish_engine(struct intel_engine_cs *engine)
-{
- engine->reset.finish(engine);
+ mutex_unlock(&i915->drm.struct_mutex);
+ intel_runtime_pm_put(i915, wakeref);
- intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+ kfree(arg);
}
static void reset_finish(struct drm_i915_private *i915)
@@ -780,11 +722,30 @@ static void reset_finish(struct drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
- for_each_engine(engine, i915, id) {
- engine->hangcheck.active_request = NULL;
+ for_each_engine(engine, i915, id)
reset_finish_engine(engine);
+}
+
+static void reset_restart(struct drm_i915_private *i915)
+{
+ struct i915_gpu_restart *arg;
+
+ /*
+ * Following the reset, ensure that we always reload context for
+ * powersaving, and to correct engine->last_retired_context. Since
+ * this requires us to submit a request, queue a worker to do that
+ * task for us to evade any locking here.
+ */
+ if (READ_ONCE(i915->gpu_error.restart))
+ return;
+
+ arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+ if (arg) {
+ arg->i915 = i915;
+ INIT_WORK(&arg->work, restart_work);
+
+ WRITE_ONCE(i915->gpu_error.restart, arg);
+ queue_work(i915->wq, &arg->work);
}
}
@@ -873,8 +834,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
struct i915_timeline *tl;
bool ret = false;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
if (!test_bit(I915_WEDGED, &error->flags))
return true;
@@ -897,9 +856,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
*/
list_for_each_entry(tl, &i915->gt.timelines, link) {
struct i915_request *rq;
+ long timeout;
- rq = i915_gem_active_peek(&tl->last_request,
- &i915->drm.struct_mutex);
+ rq = i915_gem_active_get_unlocked(&tl->last_request);
if (!rq)
continue;
@@ -914,12 +873,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
* and when the seqno passes the fence, the signaler
* then signals the fence waking us up).
*/
- if (dma_fence_default_wait(&rq->fence, true,
- MAX_SCHEDULE_TIMEOUT) < 0)
+ timeout = dma_fence_default_wait(&rq->fence, true,
+ MAX_SCHEDULE_TIMEOUT);
+ i915_request_put(rq);
+ if (timeout < 0)
goto unlock;
}
- i915_retire_requests(i915);
- GEM_BUG_ON(i915->gt.active_requests);
intel_engines_sanitize(i915, false);
@@ -933,7 +892,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
* context and do not require stop_machine().
*/
intel_engines_reset_default_submission(i915);
- i915_gem_contexts_lost(i915);
GEM_TRACE("end\n");
@@ -946,6 +904,52 @@ unlock:
return ret;
}
+struct __i915_reset {
+ struct drm_i915_private *i915;
+ unsigned int stalled_mask;
+};
+
+static int __i915_reset__BKL(void *data)
+{
+ struct __i915_reset *arg = data;
+ int err;
+
+ err = intel_gpu_reset(arg->i915, ALL_ENGINES);
+ if (err)
+ return err;
+
+ return gt_reset(arg->i915, arg->stalled_mask);
+}
+
+#if RESET_UNDER_STOP_MACHINE
+/*
+ * XXX An alternative to using stop_machine would be to park only the
+ * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
+ * we should be able to prevent their memmory accesses via the lost fence
+ * registers over the course of the reset without the potential recursive
+ * of mutexes between the pagefault handler and reset.
+ *
+ * See igt/gem_mmap_gtt/hang
+ */
+#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
+#else
+#define __do_reset(fn, arg) fn(arg)
+#endif
+
+static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+ struct __i915_reset arg = { i915, stalled_mask };
+ int err, i;
+
+ err = __do_reset(__i915_reset__BKL, &arg);
+ for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
+ msleep(100);
+ err = __do_reset(__i915_reset__BKL, &arg);
+ }
+
+ return err;
+}
+
/**
* i915_reset - reset chip after a hang
* @i915: #drm_i915_private to reset
@@ -971,31 +975,22 @@ void i915_reset(struct drm_i915_private *i915,
{
struct i915_gpu_error *error = &i915->gpu_error;
int ret;
- int i;
GEM_TRACE("flags=%lx\n", error->flags);
might_sleep();
- lockdep_assert_held(&i915->drm.struct_mutex);
assert_rpm_wakelock_held(i915);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
- if (!test_bit(I915_RESET_HANDOFF, &error->flags))
- return;
-
/* Clear any previous failed attempts at recovery. Time to try again. */
if (!i915_gem_unset_wedged(i915))
- goto wakeup;
+ return;
if (reason)
dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
error->reset_count++;
- ret = reset_prepare(i915);
- if (ret) {
- dev_err(i915->drm.dev, "GPU recovery failed\n");
- goto taint;
- }
+ reset_prepare(i915);
if (!intel_has_gpu_reset(i915)) {
if (i915_modparams.reset)
@@ -1005,32 +1000,11 @@ void i915_reset(struct drm_i915_private *i915,
goto error;
}
- for (i = 0; i < RESET_MAX_RETRIES; i++) {
- ret = intel_gpu_reset(i915, ALL_ENGINES);
- if (ret == 0)
- break;
-
- msleep(100);
- }
- if (ret) {
+ if (do_reset(i915, stalled_mask)) {
dev_err(i915->drm.dev, "Failed to reset chip\n");
goto taint;
}
- /* Ok, now get things going again... */
-
- /*
- * Everything depends on having the GTT running, so we need to start
- * there.
- */
- ret = i915_ggtt_enable_hw(i915);
- if (ret) {
- DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
- ret);
- goto error;
- }
-
- gt_reset(i915, stalled_mask);
intel_overlay_reset(i915);
/*
@@ -1052,9 +1026,8 @@ void i915_reset(struct drm_i915_private *i915,
finish:
reset_finish(i915);
-wakeup:
- clear_bit(I915_RESET_HANDOFF, &error->flags);
- wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+ if (!i915_terminally_wedged(error))
+ reset_restart(i915);
return;
taint:
@@ -1073,7 +1046,6 @@ taint:
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
i915_gem_set_wedged(i915);
- i915_retire_requests(i915);
goto finish;
}
@@ -1099,18 +1071,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
struct i915_gpu_error *error = &engine->i915->gpu_error;
- struct i915_request *active_request;
int ret;
GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
- active_request = reset_prepare_engine(engine);
- if (IS_ERR_OR_NULL(active_request)) {
- /* Either the previous reset failed, or we pardon the reset. */
- ret = PTR_ERR(active_request);
- goto out;
- }
+ if (i915_seqno_passed(intel_engine_get_seqno(engine),
+ intel_engine_last_submit(engine)))
+ return 0;
+
+ reset_prepare_engine(engine);
if (msg)
dev_notice(engine->i915->drm.dev,
@@ -1134,7 +1104,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
* active request and can drop it, adjust head to skip the offending
* request to resume executing remaining requests in the queue.
*/
- reset_engine(engine, active_request, true);
+ intel_engine_reset(engine, true);
/*
* The engine and its registers (and workarounds in case of render)
@@ -1171,30 +1141,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
i915_wedge_on_timeout(&w, i915, 5 * HZ) {
intel_prepare_reset(i915);
- error->reason = reason;
- error->stalled_mask = engine_mask;
-
- /* Signal that locked waiters should reset the GPU */
- smp_mb__before_atomic();
- set_bit(I915_RESET_HANDOFF, &error->flags);
- wake_up_all(&error->wait_queue);
-
- /*
- * Wait for anyone holding the lock to wakeup, without
- * blocking indefinitely on struct_mutex.
- */
- do {
- if (mutex_trylock(&i915->drm.struct_mutex)) {
- i915_reset(i915, engine_mask, reason);
- mutex_unlock(&i915->drm.struct_mutex);
- }
- } while (wait_on_bit_timeout(&error->flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE,
- 1));
-
- error->stalled_mask = 0;
- error->reason = NULL;
+ i915_reset(i915, engine_mask, reason);
intel_finish_reset(i915);
}
@@ -1350,6 +1297,25 @@ out:
intel_runtime_pm_put(i915, wakeref);
}
+bool i915_reset_flush(struct drm_i915_private *i915)
+{
+ int err;
+
+ cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
+ flush_workqueue(i915->wq);
+ GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED |
+ I915_WAIT_FOR_IDLE_BOOST,
+ MAX_SCHEDULE_TIMEOUT);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ return !err;
+}
+
static void i915_wedge_me(struct work_struct *work)
{
struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index b6a519bde67d..f2d347f319df 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -29,6 +29,9 @@ void i915_reset(struct drm_i915_private *i915,
int i915_reset_engine(struct intel_engine_cs *engine,
const char *reason);
+void i915_reset_request(struct i915_request *rq, bool guilty);
+bool i915_reset_flush(struct drm_i915_private *i915);
+
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index ef4c8c50a4ba..1a5c163b98d6 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1119,10 +1119,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force)
if (!reset_engines(i915) && !force)
return;
- for_each_engine(engine, i915, id) {
- if (engine->reset.reset)
- engine->reset.reset(engine, NULL);
- }
+ for_each_engine(engine, i915, id)
+ intel_engine_reset(engine, false);
}
/**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 349ae5844f24..45e2db683fe5 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -834,8 +834,7 @@ static void guc_submission_tasklet(unsigned long data)
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
-static struct i915_request *
-guc_reset_prepare(struct intel_engine_cs *engine)
+static void guc_reset_prepare(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -861,8 +860,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
*/
if (engine->i915->guc.preempt_wq)
flush_workqueue(engine->i915->guc.preempt_wq);
-
- return i915_gem_find_active_request(engine);
}
/*
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 741441daae32..5662d6fed523 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -25,6 +25,17 @@
#include "i915_drv.h"
#include "i915_reset.h"
+struct hangcheck {
+ u64 acthd;
+ u32 seqno;
+ enum intel_engine_hangcheck_action action;
+ unsigned long action_timestamp;
+ int deadlock;
+ struct intel_instdone instdone;
+ bool wedged:1;
+ bool stalled:1;
+};
+
static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
{
u32 tmp = current_instdone | *old_instdone;
@@ -119,25 +130,22 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
}
static void hangcheck_load_sample(struct intel_engine_cs *engine,
- struct intel_engine_hangcheck *hc)
+ struct hangcheck *hc)
{
hc->acthd = intel_engine_get_active_head(engine);
hc->seqno = intel_engine_get_seqno(engine);
}
static void hangcheck_store_sample(struct intel_engine_cs *engine,
- const struct intel_engine_hangcheck *hc)
+ const struct hangcheck *hc)
{
engine->hangcheck.acthd = hc->acthd;
engine->hangcheck.seqno = hc->seqno;
- engine->hangcheck.action = hc->action;
- engine->hangcheck.stalled = hc->stalled;
- engine->hangcheck.wedged = hc->wedged;
}
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
- const struct intel_engine_hangcheck *hc)
+ const struct hangcheck *hc)
{
if (engine->hangcheck.seqno != hc->seqno)
return ENGINE_ACTIVE_SEQNO;
@@ -149,7 +157,7 @@ hangcheck_get_action(struct intel_engine_cs *engine,
}
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
- struct intel_engine_hangcheck *hc)
+ struct hangcheck *hc)
{
unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
@@ -265,19 +273,19 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine(engine, dev_priv, id) {
- struct intel_engine_hangcheck hc;
+ struct hangcheck hc;
hangcheck_load_sample(engine, &hc);
hangcheck_accumulate_sample(engine, &hc);
hangcheck_store_sample(engine, &hc);
- if (engine->hangcheck.stalled) {
+ if (hc.stalled) {
hung |= intel_engine_flag(engine);
if (hc.action != ENGINE_DEAD)
stuck |= intel_engine_flag(engine);
}
- if (engine->hangcheck.wedged)
+ if (hc.wedged)
wedged |= intel_engine_flag(engine);
}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5551dd2ec0e6..185867106c14 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -136,6 +136,7 @@
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_render_state.h"
+#include "i915_reset.h"
#include "i915_vgpu.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
@@ -264,7 +265,8 @@ static void unwind_wa_tail(struct i915_request *rq)
assert_ring_tail_valid(rq->ring, rq->tail);
}
-static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
+static struct i915_request *
+__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
struct i915_request *rq, *rn, *active = NULL;
struct list_head *uninitialized_var(pl);
@@ -306,6 +308,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
list_move_tail(&active->sched.link,
i915_sched_lookup_priolist(engine, prio));
}
+
+ return active;
}
void
@@ -1732,11 +1736,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
return 0;
}
-static struct i915_request *
-execlists_reset_prepare(struct intel_engine_cs *engine)
+static void execlists_reset_prepare(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
- struct i915_request *request, *active;
unsigned long flags;
GEM_TRACE("%s: depth<-%d\n", engine->name,
@@ -1752,59 +1754,21 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
* prevents the race.
*/
__tasklet_disable_sync_once(&execlists->tasklet);
+ GEM_BUG_ON(!reset_in_progress(execlists));
+ /* And flush any current direct submission. */
spin_lock_irqsave(&engine->timeline.lock, flags);
-
- /*
- * We want to flush the pending context switches, having disabled
- * the tasklet above, we can assume exclusive access to the execlists.
- * For this allows us to catch up with an inflight preemption event,
- * and avoid blaming an innocent request if the stall was due to the
- * preemption itself.
- */
- process_csb(engine);
-
- /*
- * The last active request can then be no later than the last request
- * now in ELSP[0]. So search backwards from there, so that if the GPU
- * has advanced beyond the last CSB update, it will be pardoned.
- */
- active = NULL;
- request = port_request(execlists->port);
- if (request) {
- /*
- * Prevent the breadcrumb from advancing before we decide
- * which request is currently active.
- */
- intel_engine_stop_cs(engine);
-
- list_for_each_entry_from_reverse(request,
- &engine->timeline.requests,
- link) {
- if (__i915_request_completed(request,
- request->global_seqno))
- break;
-
- active = request;
- }
- }
-
+ process_csb(engine); /* drain preemption events */
spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
- return active;
}
-static void execlists_reset(struct intel_engine_cs *engine,
- struct i915_request *request)
+static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
+ struct i915_request *rq;
unsigned long flags;
u32 *regs;
- GEM_TRACE("%s request global=%d, current=%d\n",
- engine->name, request ? request->global_seqno : 0,
- intel_engine_get_seqno(engine));
-
spin_lock_irqsave(&engine->timeline.lock, flags);
/*
@@ -1819,12 +1783,18 @@ static void execlists_reset(struct intel_engine_cs *engine,
execlists_cancel_port_requests(execlists);
/* Push back any incomplete requests for replay after the reset. */
- __unwind_incomplete_requests(engine);
+ rq = __unwind_incomplete_requests(engine);
/* Following the reset, we need to reload the CSB read/write pointers */
reset_csb_pointers(&engine->execlists);
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
+ GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+ engine->name,
+ rq ? rq->global_seqno : 0,
+ intel_engine_get_seqno(engine),
+ yesno(stalled));
+ if (!rq)
+ goto out_unlock;
/*
* If the request was innocent, we leave the request in the ELSP
@@ -1837,8 +1807,9 @@ static void execlists_reset(struct intel_engine_cs *engine,
* and have to at least restore the RING register in the context
* image back to the expected values to skip over the guilty request.
*/
- if (!request || request->fence.error != -EIO)
- return;
+ i915_reset_request(rq, stalled);
+ if (!stalled)
+ goto out_unlock;
/*
* We want a simple context + ring to execute the breadcrumb update.
@@ -1848,7 +1819,7 @@ static void execlists_reset(struct intel_engine_cs *engine,
* future request will be after userspace has had the opportunity
* to recreate its own state.
*/
- regs = request->hw_context->lrc_reg_state;
+ regs = rq->hw_context->lrc_reg_state;
if (engine->pinned_default_state) {
memcpy(regs, /* skip restoring the vanilla PPHWSP */
engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
@@ -1856,17 +1827,14 @@ static void execlists_reset(struct intel_engine_cs *engine,
}
/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
- request->ring->head = intel_ring_wrap(request->ring, request->postfix);
+ rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+ intel_ring_update_space(rq->ring);
- execlists_init_reg_state(regs, request->gem_context, engine,
- request->ring);
+ execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
+ __execlists_update_reg_state(engine, rq->hw_context);
- __execlists_update_reg_state(engine, request->hw_context);
-
- intel_ring_update_space(request->ring);
-
- /* Reset WaIdleLiteRestore:bdw,skl as well */
- unwind_wa_tail(request);
+out_unlock:
+ spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
static void execlists_reset_finish(struct intel_engine_cs *engine)
@@ -1879,6 +1847,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
* to sleep before we restart and reload a context.
*
*/
+ GEM_BUG_ON(!reset_in_progress(execlists));
if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
execlists->tasklet.func(execlists->tasklet.data);
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index c81db81e4416..f68c7975006c 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -478,8 +478,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
if (!overlay)
return;
- intel_overlay_release_old_vid(overlay);
-
overlay->old_xscale = 0;
overlay->old_yscale = 0;
overlay->crtc = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 09c90475168a..a9efc8c71254 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -33,6 +33,7 @@
#include "i915_drv.h"
#include "i915_gem_render_state.h"
+#include "i915_reset.h"
#include "i915_trace.h"
#include "intel_drv.h"
#include "intel_workarounds.h"
@@ -711,52 +712,80 @@ out:
return ret;
}
-static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
+static void reset_prepare(struct intel_engine_cs *engine)
{
intel_engine_stop_cs(engine);
- return i915_gem_find_active_request(engine);
}
-static void skip_request(struct i915_request *rq)
+static void reset_ring(struct intel_engine_cs *engine, bool stalled)
{
- void *vaddr = rq->ring->vaddr;
+ struct i915_timeline *tl = &engine->timeline;
+ struct i915_request *pos, *rq;
+ unsigned long flags;
u32 head;
- head = rq->infix;
- if (rq->postfix < head) {
- memset32(vaddr + head, MI_NOOP,
- (rq->ring->size - head) / sizeof(u32));
- head = 0;
+ rq = NULL;
+ spin_lock_irqsave(&tl->lock, flags);
+ list_for_each_entry(pos, &tl->requests, link) {
+ if (!__i915_request_completed(pos, pos->global_seqno)) {
+ rq = pos;
+ break;
+ }
}
- memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
-}
-
-static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
-{
- GEM_TRACE("%s request global=%d, current=%d\n",
- engine->name, rq ? rq->global_seqno : 0,
- intel_engine_get_seqno(engine));
+ GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+ engine->name,
+ rq ? rq->global_seqno : 0,
+ intel_engine_get_seqno(engine),
+ yesno(stalled));
/*
- * Try to restore the logical GPU state to match the continuation
- * of the request queue. If we skip the context/PD restore, then
- * the next request may try to execute assuming that its context
- * is valid and loaded on the GPU and so may try to access invalid
- * memory, prompting repeated GPU hangs.
+ * The guilty request will get skipped on a hung engine.
*
- * If the request was guilty, we still restore the logical state
- * in case the next request requires it (e.g. the aliasing ppgtt),
- * but skip over the hung batch.
+ * Users of client default contexts do not rely on logical
+ * state preserved between batches so it is safe to execute
+ * queued requests following the hang. Non default contexts
+ * rely on preserved state, so skipping a batch loses the
+ * evolution of the state and it needs to be considered corrupted.
+ * Executing more queued batches on top of corrupted state is
+ * risky. But we take the risk by trying to advance through
+ * the queued requests in order to make the client behaviour
+ * more predictable around resets, by not throwing away random
+ * amount of batches it has prepared for execution. Sophisticated
+ * clients can use gem_reset_stats_ioctl and dma fence status
+ * (exported via sync_file info ioctl on explicit fences) to observe
+ * when it loses the context state and should rebuild accordingly.
*
- * If the request was innocent, we try to replay the request with
- * the restored context.
+ * The context ban, and ultimately the client ban, mechanism are safety
+ * valves if client submission ends up resulting in nothing more than
+ * subsequent hangs.
*/
+
if (rq) {
- /* If the rq hung, jump to its breadcrumb and skip the batch */
- rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
- if (rq->fence.error == -EIO)
- skip_request(rq);
+ /*
+ * Try to restore the logical GPU state to match the
+ * continuation of the request queue. If we skip the
+ * context/PD restore, then the next request may try to execute
+ * assuming that its context is valid and loaded on the GPU and
+ * so may try to access invalid memory, prompting repeated GPU
+ * hangs.
+ *
+ * If the request was guilty, we still restore the logical
+ * state in case the next request requires it (e.g. the
+ * aliasing ppgtt), but skip over the hung batch.
+ *
+ * If the request was innocent, we try to replay the request
+ * with the restored context.
+ */
+ i915_reset_request(rq, stalled);
+
+ GEM_BUG_ON(rq->ring != engine->buffer);
+ head = rq->head;
+ } else {
+ head = engine->buffer->tail;
}
+ engine->buffer->head = intel_ring_wrap(engine->buffer, head);
+
+ spin_unlock_irqrestore(&tl->lock, flags);
}
static void reset_finish(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5ad46c2fbc0f..f2effd001540 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -120,13 +120,8 @@ struct intel_instdone {
struct intel_engine_hangcheck {
u64 acthd;
u32 seqno;
- enum intel_engine_hangcheck_action action;
unsigned long action_timestamp;
- int deadlock;
struct intel_instdone instdone;
- struct i915_request *active_request;
- bool stalled:1;
- bool wedged:1;
};
struct intel_ring {
@@ -444,9 +439,8 @@ struct intel_engine_cs {
int (*init_hw)(struct intel_engine_cs *engine);
struct {
- struct i915_request *(*prepare)(struct intel_engine_cs *engine);
- void (*reset)(struct intel_engine_cs *engine,
- struct i915_request *rq);
+ void (*prepare)(struct intel_engine_cs *engine);
+ void (*reset)(struct intel_engine_cs *engine, bool stalled);
void (*finish)(struct intel_engine_cs *engine);
} reset;
@@ -1018,6 +1012,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
return cs;
}
+static inline void intel_engine_reset(struct intel_engine_cs *engine,
+ bool stalled)
+{
+ if (engine->reset.reset)
+ engine->reset.reset(engine, stalled);
+}
+
void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
bool intel_engine_is_idle(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 12550b55c42f..67431355cd6e 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -363,9 +363,7 @@ static int igt_global_reset(void *arg)
/* Check that we can issue a global GPU reset */
igt_global_reset_lock(i915);
- set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
- mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error);
i915_reset(i915, ALL_ENGINES, NULL);
@@ -374,9 +372,7 @@ static int igt_global_reset(void *arg)
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
}
- mutex_unlock(&i915->drm.struct_mutex);
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
igt_global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
@@ -399,9 +395,7 @@ static int igt_wedged_reset(void *arg)
i915_gem_set_wedged(i915);
GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
- set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
i915_reset(i915, ALL_ENGINES, NULL);
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
intel_runtime_pm_put(i915, wakeref);
mutex_unlock(&i915->drm.struct_mutex);
@@ -511,7 +505,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
- if (!wait_for_idle(engine)) {
+ if (!i915_reset_flush(i915)) {
struct drm_printer p =
drm_info_printer(i915->drm.dev);
@@ -903,20 +897,13 @@ static int igt_reset_engines(void *arg)
return 0;
}
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
{
- struct i915_gpu_error *error = &rq->i915->gpu_error;
- u32 reset_count = i915_reset_count(error);
-
- error->stalled_mask = mask;
-
- /* set_bit() must be after we have setup the backchannel (mask) */
- smp_mb__before_atomic();
- set_bit(I915_RESET_HANDOFF, &error->flags);
+ u32 count = i915_reset_count(&i915->gpu_error);
- wake_up_all(&error->wait_queue);
+ i915_reset(i915, mask, NULL);
- return reset_count;
+ return count;
}
static int igt_reset_wait(void *arg)
@@ -962,7 +949,7 @@ static int igt_reset_wait(void *arg)
goto out_rq;
}
- reset_count = fake_hangcheck(rq, ALL_ENGINES);
+ reset_count = fake_hangcheck(i915, ALL_ENGINES);
timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
if (timeout < 0) {
@@ -972,7 +959,6 @@ static int igt_reset_wait(void *arg)
goto out_rq;
}
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
@@ -1162,7 +1148,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
}
out_reset:
- fake_hangcheck(rq, intel_engine_flag(rq->engine));
+ fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
if (tsk) {
struct igt_wedge_me w;
@@ -1341,12 +1327,7 @@ static int igt_reset_queue(void *arg)
goto fini;
}
- reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
- i915_reset(i915, ENGINE_MASK(id), NULL);
-
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
- &i915->gpu_error.flags));
+ reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
if (prev->fence.error != -EIO) {
pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
@@ -1565,6 +1546,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
pr_err("%s(%s): Failed to start request %llx, at %x\n",
__func__, engine->name,
rq->fence.seqno, hws_seqno(&h, rq));
+ i915_gem_set_wedged(i915);
err = -EIO;
}
@@ -1588,7 +1570,6 @@ out:
static void force_reset(struct drm_i915_private *i915)
{
i915_gem_set_wedged(i915);
- set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
i915_reset(i915, 0, NULL);
}
@@ -1618,6 +1599,26 @@ static int igt_atomic_reset(void *arg)
if (i915_terminally_wedged(&i915->gpu_error))
goto unlock;
+ if (intel_has_gpu_reset(i915)) {
+ const typeof(*phases) *p;
+
+ for (p = phases; p->name; p++) {
+ GEM_TRACE("intel_gpu_reset under %s\n", p->name);
+
+ p->critical_section_begin();
+ err = intel_gpu_reset(i915, ALL_ENGINES);
+ p->critical_section_end();
+
+ if (err) {
+ pr_err("intel_gpu_reset failed under %s\n",
+ p->name);
+ goto out;
+ }
+ }
+
+ force_reset(i915);
+ }
+
if (intel_has_reset_engine(i915)) {
struct intel_engine_cs *engine;
enum intel_engine_id id;
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index a8cac56be835..b15c4f26c593 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -214,7 +214,6 @@ out_put:
static int do_device_reset(struct intel_engine_cs *engine)
{
- set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags);
i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds");
return 0;
}
@@ -394,7 +393,6 @@ static int
live_gpu_reset_gt_engine_workarounds(void *arg)
{
struct drm_i915_private *i915 = arg;
- struct i915_gpu_error *error = &i915->gpu_error;
intel_wakeref_t wakeref;
struct wa_lists lists;
bool ok;
@@ -413,7 +411,6 @@ live_gpu_reset_gt_engine_workarounds(void *arg)
if (!ok)
goto out;
- set_bit(I915_RESET_HANDOFF, &error->flags);
i915_reset(i915, ALL_ENGINES, "live_workarounds");
ok = verify_gt_engine_wa(i915, &lists, "after reset");
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 5477ad4a7e7d..8ab5a2688a0c 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev)
i915_gem_contexts_lost(i915);
mutex_unlock(&i915->drm.struct_mutex);
- cancel_delayed_work_sync(&i915->gt.retire_work);
- cancel_delayed_work_sync(&i915->gt.idle_work);
+ drain_delayed_work(&i915->gt.retire_work);
+ drain_delayed_work(&i915->gt.idle_work);
i915_gem_drain_workqueue(i915);
mutex_lock(&i915->drm.struct_mutex);