diff options
Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_reset.c')
| -rw-r--r-- | drivers/gpu/drm/i915/gt/intel_reset.c | 857 |
1 files changed, 585 insertions, 272 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 39070b514e65..41b5036dc538 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1,59 +1,38 @@ +// SPDX-License-Identifier: MIT /* - * SPDX-License-Identifier: MIT - * * Copyright © 2008-2018 Intel Corporation */ #include <linux/sched/mm.h> #include <linux/stop_machine.h> +#include <linux/string_helpers.h> -#include "display/intel_display_types.h" +#include "display/intel_display_reset.h" #include "display/intel_overlay.h" - #include "gem/i915_gem_context.h" +#include "gt/intel_gt_regs.h" +#include "gt/uc/intel_gsc_fw.h" +#include "uc/intel_guc.h" #include "i915_drv.h" +#include "i915_file_private.h" #include "i915_gpu_error.h" #include "i915_irq.h" +#include "i915_reg.h" +#include "i915_wait_util.h" +#include "intel_breadcrumbs.h" #include "intel_engine_pm.h" +#include "intel_engine_regs.h" #include "intel_gt.h" #include "intel_gt_pm.h" +#include "intel_gt_print.h" +#include "intel_gt_requests.h" +#include "intel_mchbar_regs.h" +#include "intel_pci_config.h" #include "intel_reset.h" -#include "uc/intel_guc.h" -#include "uc/intel_guc_submission.h" - #define RESET_MAX_RETRIES 3 -/* XXX How to handle concurrent GGTT updates using tiling registers? */ -#define RESET_UNDER_STOP_MACHINE 0 - -static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) -{ - intel_uncore_rmw_fw(uncore, reg, 0, set); -} - -static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) -{ - intel_uncore_rmw_fw(uncore, reg, clr, 0); -} - -static void engine_skip_context(struct i915_request *rq) -{ - struct intel_engine_cs *engine = rq->engine; - struct intel_context *hung_ctx = rq->context; - - if (!i915_request_is_active(rq)) - return; - - lockdep_assert_held(&engine->active.lock); - list_for_each_entry_continue(rq, &engine->active.requests, sched.link) - if (rq->context == hung_ctx) { - i915_request_set_error_once(rq, -EIO); - __i915_request_skip(rq); - } -} - static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) { struct drm_i915_file_private *file_priv = ctx->file_priv; @@ -88,10 +67,8 @@ static bool mark_guilty(struct i915_request *rq) bool banned; int i; - if (intel_context_is_closed(rq->context)) { - intel_context_set_banned(rq->context); + if (intel_context_is_closed(rq->context)) return true; - } rcu_read_lock(); ctx = rcu_dereference(rq->context->gem_context); @@ -123,11 +100,9 @@ static bool mark_guilty(struct i915_request *rq) banned = !i915_gem_context_is_recoverable(ctx); if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) banned = true; - if (banned) { + if (banned) drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", ctx->name, atomic_read(&ctx->guilty_count)); - intel_context_set_banned(rq->context); - } client_mark_guilty(ctx, banned); @@ -149,21 +124,24 @@ static void mark_innocent(struct i915_request *rq) void __i915_request_reset(struct i915_request *rq, bool guilty) { - RQ_TRACE(rq, "guilty? %s\n", yesno(guilty)); + bool banned = false; - GEM_BUG_ON(i915_request_completed(rq)); + RQ_TRACE(rq, "guilty? %s\n", str_yes_no(guilty)); + GEM_BUG_ON(__i915_request_is_complete(rq)); rcu_read_lock(); /* protect the GEM context */ if (guilty) { i915_request_set_error_once(rq, -EIO); __i915_request_skip(rq); - if (mark_guilty(rq)) - engine_skip_context(rq); + banned = mark_guilty(rq); } else { i915_request_set_error_once(rq, -EAGAIN); mark_innocent(rq); } rcu_read_unlock(); + + if (banned) + intel_context_ban(rq->context, rq); } static bool i915_in_reset(struct pci_dev *pdev) @@ -178,19 +156,19 @@ static int i915_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, unsigned int retry) { - struct pci_dev *pdev = gt->i915->drm.pdev; + struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); int err; - /* Assert reset for at least 20 usec, and wait for acknowledgement. */ + /* Assert reset for at least 50 usec, and wait for acknowledgement. */ pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); udelay(50); - err = wait_for_atomic(i915_in_reset(pdev), 50); + err = _wait_for_atomic(i915_in_reset(pdev), 50000, 0); /* Clear the reset request. */ pci_write_config_byte(pdev, I915_GDRST, 0); udelay(50); if (!err) - err = wait_for_atomic(!i915_in_reset(pdev), 50); + err = _wait_for_atomic(!i915_in_reset(pdev), 50000, 0); return err; } @@ -207,44 +185,44 @@ static int g33_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, unsigned int retry) { - struct pci_dev *pdev = gt->i915->drm.pdev; + struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); - return wait_for_atomic(g4x_reset_complete(pdev), 50); + return _wait_for_atomic(g4x_reset_complete(pdev), 50000, 0); } static int g4x_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, unsigned int retry) { - struct pci_dev *pdev = gt->i915->drm.pdev; + struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); struct intel_uncore *uncore = gt->uncore; int ret; /* WaVcpClkGateDisableForMediaReset:ctg,elk */ - rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); + intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, 0, VCP_UNIT_CLOCK_GATE_DISABLE); intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); pci_write_config_byte(pdev, I915_GDRST, GRDOM_MEDIA | GRDOM_RESET_ENABLE); - ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + ret = _wait_for_atomic(g4x_reset_complete(pdev), 50000, 0); if (ret) { - drm_dbg(>->i915->drm, "Wait for media reset failed\n"); + GT_TRACE(gt, "Wait for media reset failed\n"); goto out; } pci_write_config_byte(pdev, I915_GDRST, GRDOM_RENDER | GRDOM_RESET_ENABLE); - ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + ret = _wait_for_atomic(g4x_reset_complete(pdev), 50000, 0); if (ret) { - drm_dbg(>->i915->drm, "Wait for render reset failed\n"); + GT_TRACE(gt, "Wait for render reset failed\n"); goto out; } out: pci_write_config_byte(pdev, I915_GDRST, 0); - rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); + intel_uncore_rmw_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE, 0); intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); return ret; @@ -263,7 +241,7 @@ static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 5000, 0, NULL); if (ret) { - drm_dbg(>->i915->drm, "Wait for render reset failed\n"); + GT_TRACE(gt, "Wait for render reset failed\n"); goto out; } @@ -274,7 +252,7 @@ static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 5000, 0, NULL); if (ret) { - drm_dbg(>->i915->drm, "Wait for media reset failed\n"); + GT_TRACE(gt, "Wait for media reset failed\n"); goto out; } @@ -288,39 +266,59 @@ out: static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) { struct intel_uncore *uncore = gt->uncore; + int loops; int err; /* + * On some platforms, e.g. Jasperlake, we see that the engine register + * state is not cleared until shortly after GDRST reports completion, + * causing a failure as we try to immediately resume while the internal + * state is still in flux. If we immediately repeat the reset, the + * second reset appears to serialise with the first, and since it is a + * no-op, the registers should retain their reset value. However, there + * is still a concern that upon leaving the second reset, the internal + * engine state is still in flux and not ready for resuming. + * + * Starting on MTL, there are some prep steps that we need to do when + * resetting some engines that need to be applied every time we write to + * GEN6_GDRST. As those are time consuming (tens of ms), we don't want + * to perform that twice, so, since the Jasperlake issue hasn't been + * observed on MTL, we avoid repeating the reset on newer platforms. + */ + loops = GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 70) ? 2 : 1; + + /* * GEN6_GDRST is not in the gt power well, no need to check * for fifo space for the write or forcewake the chip for * the read */ - intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); - - /* Wait for the device to ack the reset requests */ - err = __intel_wait_for_register_fw(uncore, - GEN6_GDRST, hw_domain_mask, 0, - 500, 0, - NULL); + do { + intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); + + /* Wait for the device to ack the reset requests. */ + err = __intel_wait_for_register_fw(uncore, GEN6_GDRST, + hw_domain_mask, 0, + 2000, 0, + NULL); + } while (err == 0 && --loops); if (err) - drm_dbg(>->i915->drm, - "Wait for 0x%08x engines reset failed\n", - hw_domain_mask); + GT_TRACE(gt, + "Wait for 0x%08x engines reset failed\n", + hw_domain_mask); + + /* + * As we have observed that the engine state is still volatile + * after GDRST is acked, impose a small delay to let everything settle. + */ + udelay(50); return err; } -static int gen6_reset_engines(struct intel_gt *gt, - intel_engine_mask_t engine_mask, - unsigned int retry) +static int __gen6_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) { - static const u32 hw_engine_mask[] = { - [RCS0] = GEN6_GRDOM_RENDER, - [BCS0] = GEN6_GRDOM_BLT, - [VCS0] = GEN6_GRDOM_MEDIA, - [VCS1] = GEN8_GRDOM_MEDIA2, - [VECS0] = GEN6_GRDOM_VECS, - }; struct intel_engine_cs *engine; u32 hw_mask; @@ -331,150 +329,215 @@ static int gen6_reset_engines(struct intel_gt *gt, hw_mask = 0; for_each_engine_masked(engine, gt, engine_mask, tmp) { - GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); - hw_mask |= hw_engine_mask[engine->id]; + hw_mask |= engine->reset_domain; } } return gen6_hw_domain_reset(gt, hw_mask); } -static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask) +static int gen6_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) { - struct intel_uncore *uncore = engine->uncore; - u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; - i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; - u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; - i915_reg_t sfc_usage; - u32 sfc_usage_bit; - u32 sfc_reset_bit; + unsigned long flags; int ret; + spin_lock_irqsave(>->uncore->lock, flags); + ret = __gen6_reset_engines(gt, engine_mask, retry); + spin_unlock_irqrestore(>->uncore->lock, flags); + + return ret; +} + +static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine) +{ + int vecs_id; + + GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS); + + vecs_id = _VECS((engine->instance) / 2); + + return engine->gt->engine[vecs_id]; +} + +struct sfc_lock_data { + i915_reg_t lock_reg; + i915_reg_t ack_reg; + i915_reg_t usage_reg; + u32 lock_bit; + u32 ack_bit; + u32 usage_bit; + u32 reset_bit; +}; + +static void get_sfc_forced_lock_data(struct intel_engine_cs *engine, + struct sfc_lock_data *sfc_lock) +{ switch (engine->class) { + default: + MISSING_CASE(engine->class); + fallthrough; case VIDEO_DECODE_CLASS: - if ((BIT(engine->instance) & vdbox_sfc_access) == 0) - return 0; + sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine->mmio_base); + sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; - sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); - sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; + sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base); + sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; - sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); - sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; + sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine->mmio_base); + sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT; + sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); - sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); - sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; - sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); break; - case VIDEO_ENHANCEMENT_CLASS: - sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); - sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; + sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine->mmio_base); + sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; + + sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine->mmio_base); + sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; - sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); - sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; + sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine->mmio_base); + sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT; + sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); - sfc_usage = GEN11_VECS_SFC_USAGE(engine); - sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; - sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); break; + } +} + +static int gen11_lock_sfc(struct intel_engine_cs *engine, + u32 *reset_mask, + u32 *unlock_mask) +{ + struct intel_uncore *uncore = engine->uncore; + u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; + struct sfc_lock_data sfc_lock; + bool lock_obtained, lock_to_other = false; + int ret; + switch (engine->class) { + case VIDEO_DECODE_CLASS: + if ((BIT(engine->instance) & vdbox_sfc_access) == 0) + return 0; + + fallthrough; + case VIDEO_ENHANCEMENT_CLASS: + get_sfc_forced_lock_data(engine, &sfc_lock); + + break; default: return 0; } + if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) { + struct intel_engine_cs *paired_vecs; + + if (engine->class != VIDEO_DECODE_CLASS || + GRAPHICS_VER(engine->i915) != 12) + return 0; + + /* + * Wa_14010733141 + * + * If the VCS-MFX isn't using the SFC, we also need to check + * whether VCS-HCP is using it. If so, we need to issue a *VE* + * forced lock on the VE engine that shares the same SFC. + */ + if (!(intel_uncore_read_fw(uncore, + GEN12_HCP_SFC_LOCK_STATUS(engine->mmio_base)) & + GEN12_HCP_SFC_USAGE_BIT)) + return 0; + + paired_vecs = find_sfc_paired_vecs_engine(engine); + get_sfc_forced_lock_data(paired_vecs, &sfc_lock); + lock_to_other = true; + *unlock_mask |= paired_vecs->mask; + } else { + *unlock_mask |= engine->mask; + } + /* - * If the engine is using a SFC, tell the engine that a software reset + * If the engine is using an SFC, tell the engine that a software reset * is going to happen. The engine will then try to force lock the SFC. * If SFC ends up being locked to the engine we want to reset, we have * to reset it as well (we will unlock it once the reset sequence is * completed). */ - if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) - return 0; - - rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); + intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, 0, sfc_lock.lock_bit); ret = __intel_wait_for_register_fw(uncore, - sfc_forced_lock_ack, - sfc_forced_lock_ack_bit, - sfc_forced_lock_ack_bit, + sfc_lock.ack_reg, + sfc_lock.ack_bit, + sfc_lock.ack_bit, 1000, 0, NULL); - /* Was the SFC released while we were trying to lock it? */ - if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)) + /* + * Was the SFC released while we were trying to lock it? + * + * We should reset both the engine and the SFC if: + * - We were locking the SFC to this engine and the lock succeeded + * OR + * - We were locking the SFC to a different engine (Wa_14010733141) + * but the SFC was released before the lock was obtained. + * + * Otherwise we need only reset the engine by itself and we can + * leave the SFC alone. + */ + lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & + sfc_lock.usage_bit) != 0; + if (lock_obtained == lock_to_other) return 0; if (ret) { - drm_dbg(&engine->i915->drm, - "Wait for SFC forced lock ack failed\n"); + ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n"); return ret; } - *hw_mask |= sfc_reset_bit; + *reset_mask |= sfc_lock.reset_bit; return 0; } static void gen11_unlock_sfc(struct intel_engine_cs *engine) { struct intel_uncore *uncore = engine->uncore; - u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; - i915_reg_t sfc_forced_lock; - u32 sfc_forced_lock_bit; - - switch (engine->class) { - case VIDEO_DECODE_CLASS: - if ((BIT(engine->instance) & vdbox_sfc_access) == 0) - return; - - sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); - sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; - break; + u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; + struct sfc_lock_data sfc_lock = {}; - case VIDEO_ENHANCEMENT_CLASS: - sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); - sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; - break; + if (engine->class != VIDEO_DECODE_CLASS && + engine->class != VIDEO_ENHANCEMENT_CLASS) + return; - default: + if (engine->class == VIDEO_DECODE_CLASS && + (BIT(engine->instance) & vdbox_sfc_access) == 0) return; - } - rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); + get_sfc_forced_lock_data(engine, &sfc_lock); + + intel_uncore_rmw_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit, 0); } -static int gen11_reset_engines(struct intel_gt *gt, - intel_engine_mask_t engine_mask, - unsigned int retry) +static int __gen11_reset_engines(struct intel_gt *gt, + intel_engine_mask_t engine_mask, + unsigned int retry) { - static const u32 hw_engine_mask[] = { - [RCS0] = GEN11_GRDOM_RENDER, - [BCS0] = GEN11_GRDOM_BLT, - [VCS0] = GEN11_GRDOM_MEDIA, - [VCS1] = GEN11_GRDOM_MEDIA2, - [VCS2] = GEN11_GRDOM_MEDIA3, - [VCS3] = GEN11_GRDOM_MEDIA4, - [VECS0] = GEN11_GRDOM_VECS, - [VECS1] = GEN11_GRDOM_VECS2, - }; struct intel_engine_cs *engine; intel_engine_mask_t tmp; - u32 hw_mask; + u32 reset_mask, unlock_mask = 0; int ret; if (engine_mask == ALL_ENGINES) { - hw_mask = GEN11_GRDOM_FULL; + reset_mask = GEN11_GRDOM_FULL; } else { - hw_mask = 0; + reset_mask = 0; for_each_engine_masked(engine, gt, engine_mask, tmp) { - GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); - hw_mask |= hw_engine_mask[engine->id]; - ret = gen11_lock_sfc(engine, &hw_mask); + reset_mask |= engine->reset_domain; + ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask); if (ret) goto sfc_unlock; } } - ret = gen6_hw_domain_reset(gt, hw_mask); + ret = gen6_hw_domain_reset(gt, reset_mask); sfc_unlock: /* @@ -482,10 +545,14 @@ sfc_unlock: * gen11_lock_sfc to make sure that we clean properly if something * wrong happened during the lock (e.g. lock acquired after timeout * expiration). + * + * Due to Wa_14010733141, we may have locked an SFC to an engine that + * wasn't being reset. So instead of calling gen11_unlock_sfc() + * on engine_mask, we instead call it on the mask of engines that our + * gen11_lock_sfc() calls told us actually had locks attempted. */ - if (engine_mask != ALL_ENGINES) - for_each_engine_masked(engine, gt, engine_mask, tmp) - gen11_unlock_sfc(engine); + for_each_engine_masked(engine, gt, unlock_mask, tmp) + gen11_unlock_sfc(engine); return ret; } @@ -497,6 +564,9 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) u32 request, mask, ack; int ret; + if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1))) + return -ETIMEDOUT; + ack = intel_uncore_read_fw(uncore, reg); if (ack & RESET_CTL_CAT_ERROR) { /* @@ -520,10 +590,10 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 700, 0, NULL); if (ret) - drm_err(&engine->i915->drm, - "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", - engine->name, request, - intel_uncore_read_fw(uncore, reg)); + gt_err(engine->gt, + "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", + engine->name, request, + intel_uncore_read_fw(uncore, reg)); return ret; } @@ -542,8 +612,11 @@ static int gen8_reset_engines(struct intel_gt *gt, struct intel_engine_cs *engine; const bool reset_non_ready = retry >= 1; intel_engine_mask_t tmp; + unsigned long flags; int ret; + spin_lock_irqsave(>->uncore->lock, flags); + for_each_engine_masked(engine, gt, engine_mask, tmp) { ret = gen8_engine_reset_prepare(engine); if (ret && !reset_non_ready) @@ -564,15 +637,26 @@ static int gen8_reset_engines(struct intel_gt *gt, */ } - if (INTEL_GEN(gt->i915) >= 11) - ret = gen11_reset_engines(gt, engine_mask, retry); + /* + * Wa_22011100796:dg2, whenever Full soft reset is required, + * reset all individual engines firstly, and then do a full soft reset. + * + * This is best effort, so ignore any error from the initial reset. + */ + if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES) + __gen11_reset_engines(gt, gt->info.engine_mask, 0); + + if (GRAPHICS_VER(gt->i915) >= 11) + ret = __gen11_reset_engines(gt, engine_mask, retry); else - ret = gen6_reset_engines(gt, engine_mask, retry); + ret = __gen6_reset_engines(gt, engine_mask, retry); skip_reset: for_each_engine_masked(engine, gt, engine_mask, tmp) gen8_engine_reset_cancel(engine); + spin_unlock_irqrestore(>->uncore->lock, flags); + return ret; } @@ -593,23 +677,91 @@ static reset_func intel_get_gpu_reset(const struct intel_gt *gt) if (is_mock_gt(gt)) return mock_reset; - else if (INTEL_GEN(i915) >= 8) + else if (GRAPHICS_VER(i915) >= 8) return gen8_reset_engines; - else if (INTEL_GEN(i915) >= 6) + else if (GRAPHICS_VER(i915) >= 6) return gen6_reset_engines; - else if (INTEL_GEN(i915) >= 5) + else if (GRAPHICS_VER(i915) >= 5) return ilk_do_reset; else if (IS_G4X(i915)) return g4x_do_reset; else if (IS_G33(i915) || IS_PINEVIEW(i915)) return g33_do_reset; - else if (INTEL_GEN(i915) >= 3) + else if (GRAPHICS_VER(i915) >= 3) return i915_do_reset; else return NULL; } -int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) +static int __reset_guc(struct intel_gt *gt) +{ + u32 guc_domain = + GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; + + return gen6_hw_domain_reset(gt, guc_domain); +} + +static bool needs_wa_14015076503(struct intel_gt *gt, intel_engine_mask_t engine_mask) +{ + if (MEDIA_VER_FULL(gt->i915) != IP_VER(13, 0) || !HAS_ENGINE(gt, GSC0)) + return false; + + if (!__HAS_ENGINE(engine_mask, GSC0)) + return false; + + return intel_gsc_uc_fw_init_done(>->uc.gsc); +} + +static intel_engine_mask_t +wa_14015076503_start(struct intel_gt *gt, intel_engine_mask_t engine_mask, bool first) +{ + if (!needs_wa_14015076503(gt, engine_mask)) + return engine_mask; + + /* + * wa_14015076503: if the GSC FW is loaded, we need to alert it that + * we're going to do a GSC engine reset and then wait for 200ms for the + * FW to get ready for it. However, if this is the first ALL_ENGINES + * reset attempt and the GSC is not busy, we can try to instead reset + * the GuC and all the other engines individually to avoid the 200ms + * wait. + * Skipping the GSC engine is safe because, differently from other + * engines, the GSCCS only role is to forward the commands to the GSC + * FW, so it doesn't have any HW outside of the CS itself and therefore + * it has no state that we don't explicitly re-init on resume or on + * context switch LRC or power context). The HW for the GSC uC is + * managed by the GSC FW so we don't need to care about that. + */ + if (engine_mask == ALL_ENGINES && first && intel_engine_is_idle(gt->engine[GSC0])) { + __reset_guc(gt); + engine_mask = gt->info.engine_mask & ~BIT(GSC0); + } else { + intel_uncore_rmw(gt->uncore, + HECI_H_GS1(MTL_GSC_HECI2_BASE), + 0, HECI_H_GS1_ER_PREP); + + /* make sure the reset bit is clear when writing the CSR reg */ + intel_uncore_rmw(gt->uncore, + HECI_H_CSR(MTL_GSC_HECI2_BASE), + HECI_H_CSR_RST, HECI_H_CSR_IG); + msleep(200); + } + + return engine_mask; +} + +static void +wa_14015076503_end(struct intel_gt *gt, intel_engine_mask_t engine_mask) +{ + if (!needs_wa_14015076503(gt, engine_mask)) + return; + + intel_uncore_rmw(gt->uncore, + HECI_H_GS1(MTL_GSC_HECI2_BASE), + HECI_H_GS1_ER_PREP, 0); +} + +static int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) { const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; reset_func reset; @@ -626,10 +778,14 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) */ intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { - GT_TRACE(gt, "engine_mask=%x\n", engine_mask); - preempt_disable(); - ret = reset(gt, engine_mask, retry); - preempt_enable(); + intel_engine_mask_t reset_mask; + + reset_mask = wa_14015076503_start(gt, engine_mask, !retry); + + GT_TRACE(gt, "engine_mask=%x\n", reset_mask); + ret = reset(gt, reset_mask, retry); + + wa_14015076503_end(gt, reset_mask); } intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); @@ -638,7 +794,7 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) bool intel_has_gpu_reset(const struct intel_gt *gt) { - if (!i915_modparams.reset) + if (!gt->i915->params.reset) return NULL; return intel_get_gpu_reset(gt); @@ -646,7 +802,7 @@ bool intel_has_gpu_reset(const struct intel_gt *gt) bool intel_has_reset_engine(const struct intel_gt *gt) { - if (i915_modparams.reset < 2) + if (gt->i915->params.reset < 2) return false; return INTEL_INFO(gt->i915)->has_reset_engine; @@ -654,14 +810,12 @@ bool intel_has_reset_engine(const struct intel_gt *gt) int intel_reset_guc(struct intel_gt *gt) { - u32 guc_domain = - INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; int ret; GEM_BUG_ON(!HAS_GT_UC(gt->i915)); intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); - ret = gen6_hw_domain_reset(gt, guc_domain); + ret = __reset_guc(gt); intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); return ret; @@ -707,7 +861,7 @@ static void revoke_mmaps(struct intel_gt *gt) continue; node = &vma->mmo->vma_node; - vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; + vma_offset = vma->gtt_view.partial.offset << PAGE_SHIFT; unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping, drm_vma_node_offset_addr(node) + vma_offset, @@ -722,14 +876,24 @@ static intel_engine_mask_t reset_prepare(struct intel_gt *gt) intel_engine_mask_t awake = 0; enum intel_engine_id id; + /** + * For GuC mode with submission enabled, ensure submission + * is disabled before stopping ring. + * + * For GuC mode with submission disabled, ensure that GuC is not + * sanitized, do that after engine reset. reset_prepare() + * is followed by engine reset which in this mode requires GuC to + * process any CSB FIFO entries generated by the resets. + */ + if (intel_uc_uses_guc_submission(>->uc)) + intel_uc_reset_prepare(>->uc); + for_each_engine(engine, gt, id) { if (intel_engine_pm_get_if_awake(engine)) awake |= engine->mask; reset_prepare_engine(engine); } - intel_uc_reset_prepare(>->uc); - return awake; } @@ -752,8 +916,12 @@ static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) if (err) return err; + local_bh_disable(); for_each_engine(engine, gt, id) __intel_engine_reset(engine, stalled_mask & engine->mask); + local_bh_enable(); + + intel_uc_reset(>->uc, ALL_ENGINES); intel_ggtt_restore_fences(gt->ggtt); @@ -779,22 +947,21 @@ static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake) if (awake & engine->mask) intel_engine_pm_put(engine); } + + intel_uc_reset_finish(>->uc); } static void nop_submit_request(struct i915_request *request) { - struct intel_engine_cs *engine = request->engine; - unsigned long flags; - RQ_TRACE(request, "-EIO\n"); - i915_request_set_error_once(request, -EIO); - spin_lock_irqsave(&engine->active.lock, flags); - __i915_request_submit(request); - i915_request_mark_complete(request); - spin_unlock_irqrestore(&engine->active.lock, flags); + request = i915_request_mark_eio(request); + if (request) { + i915_request_submit(request); + intel_engine_signal_breadcrumbs(request->engine); - intel_engine_signal_breadcrumbs(engine); + i915_request_put(request); + } } static void __intel_gt_set_wedged(struct intel_gt *gt) @@ -816,8 +983,8 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) awake = reset_prepare(gt); /* Even if the GPU reset fails, it should still stop the engines */ - if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - __intel_gt_reset(gt, ALL_ENGINES); + if (!intel_gt_gpu_reset_clobbers_display(gt)) + intel_gt_reset_all_engines(gt); for_each_engine(engine, gt, id) engine->submit_request = nop_submit_request; @@ -831,15 +998,27 @@ static void __intel_gt_set_wedged(struct intel_gt *gt) set_bit(I915_WEDGED, >->reset.flags); /* Mark all executing requests as skipped */ + local_bh_disable(); for_each_engine(engine, gt, id) if (engine->reset.cancel) engine->reset.cancel(engine); + intel_uc_cancel_requests(>->uc); + local_bh_enable(); reset_finish(gt, awake); GT_TRACE(gt, "end\n"); } +static void set_wedged_work(struct work_struct *w) +{ + struct intel_gt *gt = container_of(w, struct intel_gt, wedge); + intel_wakeref_t wf; + + with_intel_runtime_pm(gt->uncore->rpm, wf) + __intel_gt_set_wedged(gt); +} + void intel_gt_set_wedged(struct intel_gt *gt) { intel_wakeref_t wakeref; @@ -851,7 +1030,8 @@ void intel_gt_set_wedged(struct intel_gt *gt) mutex_lock(>->reset.mutex); if (GEM_SHOW_DEBUG()) { - struct drm_printer p = drm_debug_printer(__func__); + struct drm_printer p = drm_dbg_printer(>->i915->drm, + DRM_UT_DRIVER, NULL); struct intel_engine_cs *engine; enum intel_engine_id id; @@ -880,7 +1060,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt) return true; /* Never fully initialised, recovery impossible */ - if (test_bit(I915_WEDGED_ON_INIT, >->reset.flags)) + if (intel_gt_has_unrecoverable_error(gt)) return false; GT_TRACE(gt, "start\n"); @@ -915,7 +1095,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt) dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT); dma_fence_put(fence); - /* Restart iteration after droping lock */ + /* Restart iteration after dropping lock */ spin_lock(&timelines->lock); tl = list_entry(&timelines->active_list, typeof(*tl), link); } @@ -923,14 +1103,14 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt) /* We must reset pending GPU events before restoring our submission */ ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */ - if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - ok = __intel_gt_reset(gt, ALL_ENGINES) == 0; + if (!intel_gt_gpu_reset_clobbers_display(gt)) + ok = intel_gt_reset_all_engines(gt) == 0; if (!ok) { /* * Warn CI about the unrecoverable wedged condition. * Time for a reboot. */ - add_taint_for_CI(TAINT_WARN); + add_taint_for_CI(gt->i915, TAINT_WARN); return false; } @@ -968,12 +1148,10 @@ static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) { int err, i; - gt_revoke(gt); - - err = __intel_gt_reset(gt, ALL_ENGINES); + err = intel_gt_reset_all_engines(gt); for (i = 0; err && i < RESET_MAX_RETRIES; i++) { msleep(10 * (i + 1)); - err = __intel_gt_reset(gt, ALL_ENGINES); + err = intel_gt_reset_all_engines(gt); } if (err) return err; @@ -996,6 +1174,13 @@ static int resume(struct intel_gt *gt) return 0; } +bool intel_gt_gpu_reset_clobbers_display(struct intel_gt *gt) +{ + struct drm_i915_private *i915 = gt->i915; + + return INTEL_INFO(i915)->gpu_reset_clobbers_display; +} + /** * intel_gt_reset - reset chip after a hang * @gt: #intel_gt to reset @@ -1017,6 +1202,7 @@ void intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask, const char *reason) { + struct intel_display *display = gt->i915->display; intel_engine_mask_t awake; int ret; @@ -1024,6 +1210,13 @@ void intel_gt_reset(struct intel_gt *gt, might_sleep(); GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags)); + + /* + * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence + * critical section like gpu reset. + */ + gt_revoke(gt); + mutex_lock(>->reset.mutex); /* Clear any previous failed attempts at recovery. Time to try again. */ @@ -1031,33 +1224,35 @@ void intel_gt_reset(struct intel_gt *gt, goto unlock; if (reason) - drm_notice(>->i915->drm, - "Resetting chip for %s\n", reason); + gt_notice(gt, "Resetting chip for %s\n", reason); atomic_inc(>->i915->gpu_error.reset_count); awake = reset_prepare(gt); if (!intel_has_gpu_reset(gt)) { - if (i915_modparams.reset) - drm_err(>->i915->drm, "GPU reset not supported\n"); + if (gt->i915->params.reset) + gt_err(gt, "GPU reset not supported\n"); else - drm_dbg(>->i915->drm, "GPU reset disabled\n"); + gt_dbg(gt, "GPU reset disabled\n"); goto error; } - if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - intel_runtime_pm_disable_interrupts(gt->i915); + if (intel_gt_gpu_reset_clobbers_display(gt)) + intel_irq_suspend(gt->i915); if (do_reset(gt, stalled_mask)) { - drm_err(>->i915->drm, "Failed to reset chip\n"); + gt_err(gt, "Failed to reset chip\n"); goto taint; } - if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) - intel_runtime_pm_enable_interrupts(gt->i915); + if (intel_gt_gpu_reset_clobbers_display(gt)) + intel_irq_resume(gt->i915); - intel_overlay_reset(gt->i915); + intel_overlay_reset(display); + /* sanitize uC after engine reset */ + if (!intel_uc_uses_guc_submission(>->uc)) + intel_uc_reset_prepare(>->uc); /* * Next we need to restore the context, but we don't use those * yet either... @@ -1068,9 +1263,7 @@ void intel_gt_reset(struct intel_gt *gt, */ ret = intel_gt_init_hw(gt); if (ret) { - drm_err(>->i915->drm, - "Failed to initialise HW following reset (%d)\n", - ret); + gt_err(gt, "Failed to initialise HW following reset (%d)\n", ret); goto taint; } @@ -1097,39 +1290,51 @@ taint: * rather than continue on into oblivion. For everyone else, * the system should still plod along, but they have been warned! */ - add_taint_for_CI(TAINT_WARN); + add_taint_for_CI(gt->i915, TAINT_WARN); error: __intel_gt_set_wedged(gt); goto finish; } -static inline int intel_gt_reset_engine(struct intel_engine_cs *engine) +/** + * intel_gt_reset_all_engines() - Reset all engines in the given gt. + * @gt: the GT to reset all engines for. + * + * This function resets all engines within the given gt. + * + * Returns: + * Zero on success, negative error code on failure. + */ +int intel_gt_reset_all_engines(struct intel_gt *gt) { - return __intel_gt_reset(engine->gt, engine->mask); + return __intel_gt_reset(gt, ALL_ENGINES); } /** - * intel_engine_reset - reset GPU engine to recover from a hang - * @engine: engine to reset - * @msg: reason for GPU reset; or NULL for no drm_notice() + * intel_gt_reset_engine() - Reset a specific engine within a gt. + * @engine: engine to be reset. * - * Reset a specific GPU engine. Useful if a hang is detected. - * Returns zero on successful reset or otherwise an error code. + * This function resets the specified engine within a gt. * - * Procedure is: - * - identifies the request that caused the hang and it is dropped - * - reset engine (which will force the engine to idle) - * - re-init/configure engine + * Returns: + * Zero on success, negative error code on failure. */ -int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) +int intel_gt_reset_engine(struct intel_engine_cs *engine) +{ + return __intel_gt_reset(engine->gt, engine->mask); +} + +int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) { struct intel_gt *gt = engine->gt; - bool uses_guc = intel_engine_in_guc_submission_mode(engine); int ret; ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags); GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, >->reset.flags)); + if (intel_engine_uses_guc(engine)) + return -ENODEV; + if (!intel_engine_pm_get_if_awake(engine)) return 0; @@ -1138,16 +1343,12 @@ int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) if (msg) drm_notice(&engine->i915->drm, "Resetting %s for %s\n", engine->name, msg); - atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]); + i915_increase_reset_engine_count(&engine->i915->gpu_error, engine); - if (!uses_guc) - ret = intel_gt_reset_engine(engine); - else - ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine); + ret = intel_gt_reset_engine(engine); if (ret) { /* If we fail here, we expect to fallback to a global reset */ - drm_dbg(>->i915->drm, "%sFailed to reset %s, ret=%d\n", - uses_guc ? "GuC " : "", engine->name, ret); + ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret); goto out; } @@ -1172,6 +1373,35 @@ out: return ret; } +/** + * intel_engine_reset - reset GPU engine to recover from a hang + * @engine: engine to reset + * @msg: reason for GPU reset; or NULL for no drm_notice() + * + * Reset a specific GPU engine. Useful if a hang is detected. + * Returns zero on successful reset or otherwise an error code. + * + * Procedure is: + * - identifies the request that caused the hang and it is dropped + * - reset engine (which will force the engine to idle) + * - re-init/configure engine + */ +int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) +{ + int err; + + local_bh_disable(); + err = __intel_engine_reset_bh(engine, msg); + local_bh_enable(); + + return err; +} + +static void display_reset_modeset_stuck(void *gt) +{ + intel_gt_set_wedged(gt); +} + static void intel_gt_reset_global(struct intel_gt *gt, u32 engine_mask, const char *reason) @@ -1184,23 +1414,39 @@ static void intel_gt_reset_global(struct intel_gt *gt, kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); - drm_dbg(>->i915->drm, "resetting chip, engines=%x\n", engine_mask); + GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask); kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); /* Use a watchdog to ensure that our reset completes */ - intel_wedge_on_timeout(&w, gt, 5 * HZ) { - intel_prepare_reset(gt->i915); + intel_wedge_on_timeout(&w, gt, 60 * HZ) { + struct drm_i915_private *i915 = gt->i915; + struct intel_display *display = i915->display; + bool need_display_reset; + bool reset_display; - /* Flush everyone using a resource about to be clobbered */ - synchronize_srcu_expedited(>->reset.backoff_srcu); + need_display_reset = intel_gt_gpu_reset_clobbers_display(gt) && + intel_has_gpu_reset(gt); + + reset_display = intel_display_reset_test(display) || + need_display_reset; + + if (reset_display) + reset_display = intel_display_reset_prepare(display, + display_reset_modeset_stuck, + gt); intel_gt_reset(gt, engine_mask, reason); - intel_finish_reset(gt->i915); + if (reset_display) + intel_display_reset_finish(display, !need_display_reset); } if (!test_bit(I915_WEDGED, >->reset.flags)) kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); + else + drm_dev_wedged_event(>->i915->drm, + DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET, + NULL); } /** @@ -1246,10 +1492,10 @@ void intel_gt_handle_error(struct intel_gt *gt, */ wakeref = intel_runtime_pm_get(gt->uncore->rpm); - engine_mask &= INTEL_INFO(gt->i915)->engine_mask; + engine_mask &= gt->info.engine_mask; if (flags & I915_ERROR_CAPTURE) { - i915_capture_error_state(gt->i915); + i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE); intel_gt_clear_error_registers(gt, engine_mask); } @@ -1257,19 +1503,22 @@ void intel_gt_handle_error(struct intel_gt *gt, * Try engine reset when available. We fall back to full reset if * single reset fails. */ - if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) { + if (!intel_uc_uses_guc_submission(>->uc) && + intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) { + local_bh_disable(); for_each_engine_masked(engine, gt, engine_mask, tmp) { - BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); + BUILD_BUG_ON(I915_RESET_BACKOFF >= I915_RESET_ENGINE); if (test_and_set_bit(I915_RESET_ENGINE + engine->id, >->reset.flags)) continue; - if (intel_engine_reset(engine, msg) == 0) + if (__intel_engine_reset_bh(engine, msg) == 0) engine_mask &= ~engine->mask; clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, >->reset.flags); } + local_bh_enable(); } if (!engine_mask) @@ -1285,20 +1534,30 @@ void intel_gt_handle_error(struct intel_gt *gt, /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ synchronize_rcu_expedited(); - /* Prevent any other reset-engine attempt. */ - for_each_engine(engine, gt, tmp) { - while (test_and_set_bit(I915_RESET_ENGINE + engine->id, - >->reset.flags)) - wait_on_bit(>->reset.flags, - I915_RESET_ENGINE + engine->id, - TASK_UNINTERRUPTIBLE); + /* + * Prevent any other reset-engine attempt. We don't do this for GuC + * submission the GuC owns the per-engine reset, not the i915. + */ + if (!intel_uc_uses_guc_submission(>->uc)) { + for_each_engine(engine, gt, tmp) { + while (test_and_set_bit(I915_RESET_ENGINE + engine->id, + >->reset.flags)) + wait_on_bit(>->reset.flags, + I915_RESET_ENGINE + engine->id, + TASK_UNINTERRUPTIBLE); + } } + /* Flush everyone using a resource about to be clobbered */ + synchronize_srcu_expedited(>->reset.backoff_srcu); + intel_gt_reset_global(gt, engine_mask, msg); - for_each_engine(engine, gt, tmp) - clear_bit_unlock(I915_RESET_ENGINE + engine->id, - >->reset.flags); + if (!intel_uc_uses_guc_submission(>->uc)) { + for_each_engine(engine, gt, tmp) + clear_bit_unlock(I915_RESET_ENGINE + engine->id, + >->reset.flags); + } clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags); smp_mb__after_atomic(); wake_up_all(>->reset.queue); @@ -1307,15 +1566,19 @@ out: intel_runtime_pm_put(gt->uncore->rpm, wakeref); } -int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) +static int _intel_gt_reset_lock(struct intel_gt *gt, int *srcu, bool retry) { might_lock(>->reset.backoff_srcu); - might_sleep(); + if (retry) + might_sleep(); rcu_read_lock(); while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { rcu_read_unlock(); + if (!retry) + return -EBUSY; + if (wait_event_interruptible(gt->reset.queue, !test_bit(I915_RESET_BACKOFF, >->reset.flags))) @@ -1329,6 +1592,16 @@ int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) return 0; } +int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) +{ + return _intel_gt_reset_lock(gt, srcu, false); +} + +int intel_gt_reset_lock_interruptible(struct intel_gt *gt, int *srcu) +{ + return _intel_gt_reset_lock(gt, srcu, true); +} + void intel_gt_reset_unlock(struct intel_gt *gt, int tag) __releases(>->reset.backoff_srcu) { @@ -1342,7 +1615,7 @@ int intel_gt_terminally_wedged(struct intel_gt *gt) if (!intel_gt_is_wedged(gt)) return 0; - if (intel_gt_has_init_error(gt)) + if (intel_gt_has_unrecoverable_error(gt)) return -EIO; /* Reset still in progress? Maybe we will recover? */ @@ -1359,7 +1632,19 @@ void intel_gt_set_wedged_on_init(struct intel_gt *gt) BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > I915_WEDGED_ON_INIT); intel_gt_set_wedged(gt); + i915_disable_error_state(gt->i915, -ENODEV); set_bit(I915_WEDGED_ON_INIT, >->reset.flags); + + /* Wedged on init is non-recoverable */ + add_taint_for_CI(gt->i915, TAINT_WARN); +} + +void intel_gt_set_wedged_on_fini(struct intel_gt *gt) +{ + intel_gt_set_wedged(gt); + i915_disable_error_state(gt->i915, -ENODEV); + set_bit(I915_WEDGED_ON_FINI, >->reset.flags); + intel_gt_retire_requests(gt); /* cleanup any wedged requests */ } void intel_gt_init_reset(struct intel_gt *gt) @@ -1367,6 +1652,18 @@ void intel_gt_init_reset(struct intel_gt *gt) init_waitqueue_head(>->reset.queue); mutex_init(>->reset.mutex); init_srcu_struct(>->reset.backoff_srcu); + INIT_WORK(>->wedge, set_wedged_work); + + /* + * While undesirable to wait inside the shrinker, complain anyway. + * + * If we have to wait during shrinking, we guarantee forward progress + * by forcing the reset. Therefore during the reset we must not + * re-enter the shrinker. By declaring that we take the reset mutex + * within the shrinker, we forbid ourselves from performing any + * fs-reclaim or taking related locks during reset. + */ + i915_gem_shrinker_taints_mutex(gt->i915, >->reset.mutex); /* no GPU until we are ready! */ __set_bit(I915_WEDGED, >->reset.flags); @@ -1381,10 +1678,8 @@ static void intel_wedge_me(struct work_struct *work) { struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); - drm_err(&w->gt->i915->drm, - "%s timed out, cancelling all in-flight rendering.\n", - w->name); - intel_gt_set_wedged(w->gt); + gt_err(w->gt, "%s timed out, cancelling all in-flight rendering.\n", w->name); + set_wedged_work(&w->gt->wedge); } void __intel_init_wedge(struct intel_wedge_me *w, @@ -1396,7 +1691,7 @@ void __intel_init_wedge(struct intel_wedge_me *w, w->name = name; INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); - schedule_delayed_work(&w->work, timeout); + queue_delayed_work(gt->i915->unordered_wq, &w->work, timeout); } void __intel_fini_wedge(struct intel_wedge_me *w) @@ -1406,6 +1701,24 @@ void __intel_fini_wedge(struct intel_wedge_me *w) w->gt = NULL; } +/* + * Wa_22011802037 requires that we (or the GuC) ensure that no command + * streamers are executing MI_FORCE_WAKE while an engine reset is initiated. + */ +bool intel_engine_reset_needs_wa_22011802037(struct intel_gt *gt) +{ + if (GRAPHICS_VER(gt->i915) < 11) + return false; + + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0)) + return true; + + if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) + return false; + + return true; +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_reset.c" #include "selftest_hangcheck.c" |
