diff options
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
| -rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.c | 210 |
1 files changed, 138 insertions, 72 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 625b3c024540..7582ef34bf3f 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -40,8 +40,7 @@ #include <drm/drm_cache.h> #include <drm/drm_print.h> -#include "display/intel_dmc.h" -#include "display/intel_overlay.h" +#include "display/intel_display_snapshot.h" #include "gem/i915_gem_context.h" #include "gem/i915_gem_lmem.h" @@ -651,8 +650,6 @@ static void err_print_capabilities(struct drm_i915_error_state_buf *m, struct drm_printer p = i915_error_printer(m); intel_device_info_print(&error->device_info, &error->runtime_info, &p); - intel_display_device_info_print(&error->display_device_info, - &error->display_runtime_info, &p); intel_driver_caps_print(&error->driver_caps, &p); } @@ -662,7 +659,6 @@ static void err_print_params(struct drm_i915_error_state_buf *m, struct drm_printer p = i915_error_printer(m); i915_params_dump(params, &p); - intel_display_params_dump(m->i915, &p); } static void err_print_pciid(struct drm_i915_error_state_buf *m, @@ -689,6 +685,74 @@ static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); } +/* This list includes registers that are useful in debugging GuC hangs. 
*/ +const struct { + u32 start; + u32 count; +} guc_hw_reg_state[] = { + { 0xc0b0, 2 }, + { 0xc000, 65 }, + { 0xc140, 1 }, + { 0xc180, 16 }, + { 0xc1dc, 10 }, + { 0xc300, 79 }, + { 0xc4b4, 47 }, + { 0xc574, 1 }, + { 0xc57c, 1 }, + { 0xc584, 11 }, + { 0xc5c0, 8 }, + { 0xc5e4, 1 }, + { 0xc5ec, 103 }, + { 0xc7c0, 1 }, + { 0xc0b0, 2 } +}; + +static u32 print_range_line(struct drm_i915_error_state_buf *m, u32 start, u32 *dump, u32 count) +{ + if (count >= 8) { + err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n", + start, dump[0], dump[1], dump[2], dump[3], + dump[4], dump[5], dump[6], dump[7]); + return 8; + } else if (count >= 4) { + err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x\n", + start, dump[0], dump[1], dump[2], dump[3]); + return 4; + } else if (count >= 2) { + err_printf(m, "[0x%04x] 0x%08x 0x%08x\n", start, dump[0], dump[1]); + return 2; + } + + err_printf(m, "[0x%04x] 0x%08x\n", start, dump[0]); + return 1; +} + +static void err_print_guc_hw_state(struct drm_i915_error_state_buf *m, u32 *hw_state) +{ + u32 total = 0; + int i; + + if (!hw_state) + return; + + err_printf(m, "GuC Register State:\n"); + + for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) { + u32 entry = 0; + + while (entry < guc_hw_reg_state[i].count) { + u32 start = guc_hw_reg_state[i].start + entry * sizeof(u32); + u32 count = guc_hw_reg_state[i].count - entry; + u32 *values = hw_state + total + entry; + + entry += print_range_line(m, start, values, count); + } + + GEM_BUG_ON(entry != guc_hw_reg_state[i].count); + total += entry; + } +} + static void err_print_uc(struct drm_i915_error_state_buf *m, const struct intel_uc_coredump *error_uc) { @@ -697,6 +761,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, intel_uc_fw_dump(&error_uc->guc_fw, &p); intel_uc_fw_dump(&error_uc->huc_fw, &p); err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp); + err_print_guc_hw_state(m, error_uc->guc.hw_state); intel_gpu_error_print_vma(m, NULL, 
error_uc->guc.vma_log); err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence); err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0); @@ -730,13 +795,6 @@ static void err_print_gt_info(struct drm_i915_error_state_buf *m, intel_sseu_print_topology(gt->_gt->i915, &gt->info.sseu, &p); } -static void err_print_gt_display(struct drm_i915_error_state_buf *m, - struct intel_gt_coredump *gt) -{ - err_printf(m, "IER: 0x%08x\n", gt->ier); - err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr); -} - static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m, struct intel_gt_coredump *gt) { @@ -836,6 +894,7 @@ static void err_print_gt_engines(struct drm_i915_error_state_buf *m, static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, struct i915_gpu_coredump *error) { + struct drm_printer p = i915_error_printer(m); const struct intel_engine_coredump *ee; struct timespec64 ts; @@ -844,7 +903,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, "Kernel: %s %s\n", init_utsname()->release, init_utsname()->machine); - err_printf(m, "Driver: %s\n", DRIVER_DATE); ts = ktime_to_timespec64(error->time); err_printf(m, "Time: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); @@ -873,8 +931,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, "IOMMU enabled?: %d\n", error->iommu); - intel_dmc_print_error_state(m, m->i915); - err_printf(m, "RPM wakelock: %s\n", str_yes_no(error->wakelock)); err_printf(m, "PM suspended: %s\n", str_yes_no(error->suspended)); @@ -884,7 +940,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (error->gt->uc && error->gt->uc->guc.is_guc_capture) print_guc_capture = true; - err_print_gt_display(m, error->gt); err_print_gt_global_nonguc(m, error->gt); err_print_gt_fences(m, error->gt); @@ -903,11 +958,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_print_gt_info(m, error->gt); } - if (error->overlay) - 
intel_overlay_print_error_state(m, error->overlay); - err_print_capabilities(m, error); err_print_params(m, &error->params); + + intel_display_snapshot_print(error->display_snapshot, &p); } static int err_print_to_sgl(struct i915_gpu_coredump *error) @@ -1030,7 +1084,6 @@ static void i915_vma_coredump_free(struct i915_vma_coredump *vma) static void cleanup_params(struct i915_gpu_coredump *error) { i915_params_free(&error->params); - intel_display_params_free(&error->display_params); } static void cleanup_uc(struct intel_uc_coredump *uc) @@ -1041,6 +1094,7 @@ static void cleanup_uc(struct intel_uc_coredump *uc) kfree(uc->huc_fw.file_wanted.path); i915_vma_coredump_free(uc->guc.vma_log); i915_vma_coredump_free(uc->guc.vma_ctb); + kfree(uc->guc.hw_state); kfree(uc); } @@ -1075,7 +1129,7 @@ void __i915_gpu_coredump_free(struct kref *error_ref) cleanup_gt(gt); } - kfree(error->overlay); + intel_display_snapshot_free(error->display_snapshot); cleanup_params(error); @@ -1111,7 +1165,7 @@ i915_vma_coredump_create(const struct intel_gt *gt, } INIT_LIST_HEAD(&dst->page_list); - strcpy(dst->name, name); + strscpy(dst->name, name); dst->next = NULL; dst->gtt_offset = vma_res->start; @@ -1411,7 +1465,7 @@ static bool record_context(struct i915_gem_context_coredump *e, rcu_read_lock(); task = pid_task(ctx->pid, PIDTYPE_PID); if (task) { - strcpy(e->comm, task->comm); + strscpy(e->comm, task->comm); e->pid = task->pid; } rcu_read_unlock(); @@ -1457,7 +1511,7 @@ capture_vma_snapshot(struct intel_engine_capture_vma *next, return next; } - strcpy(c->name, name); + strscpy(c->name, name); c->vma_res = i915_vma_resource_get(vma_res); c->next = next; @@ -1650,9 +1704,21 @@ capture_engine(struct intel_engine_cs *engine, return NULL; intel_engine_get_hung_entity(engine, &ce, &rq); - if (rq && !i915_request_started(rq)) - drm_info(&engine->gt->i915->drm, "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", - engine->name, rq->fence.context, rq->fence.seqno, 
ce->guc_id.id); + if (rq && !i915_request_started(rq)) { + /* + * We want to know also what is the guc_id of the context, + * but if we don't have the context reference, then skip + * printing it. + */ + if (ce) + drm_info(&engine->gt->i915->drm, + "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", + engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id); + else + drm_info(&engine->gt->i915->drm, + "Got hung context on %s with active request %lld:%lld not yet started\n", + engine->name, rq->fence.context, rq->fence.seqno); + } if (rq) { capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); @@ -1725,6 +1791,37 @@ static void gt_record_guc_ctb(struct intel_ctb_coredump *saved, saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr; } +static u32 read_guc_state_reg(struct intel_uncore *uncore, int range, int count) +{ + GEM_BUG_ON(range >= ARRAY_SIZE(guc_hw_reg_state)); + GEM_BUG_ON(count >= guc_hw_reg_state[range].count); + + return intel_uncore_read(uncore, + _MMIO(guc_hw_reg_state[range].start + count * sizeof(u32))); +} + +static void gt_record_guc_hw_state(struct intel_uncore *uncore, + struct intel_uc_coredump *error_uc) +{ + u32 *hw_state; + u32 count = 0; + int i, j; + + for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) + count += guc_hw_reg_state[i].count; + + hw_state = kcalloc(count, sizeof(u32), ALLOW_FAIL); + if (!hw_state) + return; + + count = 0; + for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) + for (j = 0; j < guc_hw_reg_state[i].count; j++) + hw_state[count++] = read_guc_state_reg(uncore, i, j); + + error_uc->guc.hw_state = hw_state; +} + static struct intel_uc_coredump * gt_record_uc(struct intel_gt_coredump *gt, struct i915_vma_compress *compress) @@ -1759,31 +1856,11 @@ gt_record_uc(struct intel_gt_coredump *gt, uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv, uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); 
+ gt_record_guc_hw_state(gt->_gt->uncore, error_uc); return error_uc; } -/* Capture display registers. */ -static void gt_record_display_regs(struct intel_gt_coredump *gt) -{ - struct intel_uncore *uncore = gt->_gt->uncore; - struct drm_i915_private *i915 = uncore->i915; - - if (DISPLAY_VER(i915) >= 6 && DISPLAY_VER(i915) < 20) - gt->derrmr = intel_uncore_read(uncore, DERRMR); - - if (GRAPHICS_VER(i915) >= 8) - gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); - else if (IS_VALLEYVIEW(i915)) - gt->ier = intel_uncore_read(uncore, VLV_IER); - else if (HAS_PCH_SPLIT(i915)) - gt->ier = intel_uncore_read(uncore, DEIER); - else if (GRAPHICS_VER(i915) == 2) - gt->ier = intel_uncore_read16(uncore, GEN2_IER); - else - gt->ier = intel_uncore_read(uncore, GEN2_IER); -} - /* Capture all other registers that GuC doesn't capture. */ static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt) { @@ -1817,9 +1894,12 @@ static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt) gt->gtier[i] = intel_uncore_read(uncore, GEN8_GT_IER(i)); gt->ngtier = 4; - } else if (HAS_PCH_SPLIT(i915)) { + } else if (GRAPHICS_VER(i915) >= 5) { gt->gtier[0] = intel_uncore_read(uncore, GTIER); gt->ngtier = 1; + } else { + gt->gtier[0] = intel_uncore_read(uncore, GEN2_IER); + gt->ngtier = 1; } gt->eir = intel_uncore_read(uncore, EIR); @@ -1991,17 +2071,12 @@ static void capture_gen(struct i915_gpu_coredump *error) error->suspend_count = i915->suspend_count; i915_params_copy(&error->params, &i915->params); - intel_display_params_copy(&error->display_params); memcpy(&error->device_info, INTEL_INFO(i915), sizeof(error->device_info)); memcpy(&error->runtime_info, RUNTIME_INFO(i915), sizeof(error->runtime_info)); - memcpy(&error->display_device_info, DISPLAY_INFO(i915), - sizeof(error->display_device_info)); - memcpy(&error->display_runtime_info, DISPLAY_RUNTIME_INFO(i915), - sizeof(error->display_runtime_info)); error->driver_caps = i915->caps; } @@ -2044,7 +2119,6 @@ 
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags) gc->_gt = gt; gc->awake = intel_gt_pm_is_awake(gt); - gt_record_display_regs(gc); gt_record_global_nonguc_regs(gc); /* @@ -2095,6 +2169,7 @@ static struct i915_gpu_coredump * __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags) { struct drm_i915_private *i915 = gt->i915; + struct intel_display *display = i915->display; struct i915_gpu_coredump *error; /* Check if GPU capture has been disabled */ @@ -2136,7 +2211,7 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 du error->simulated |= error->gt->simulated; } - error->overlay = intel_overlay_capture_error_state(i915); + error->display_snapshot = intel_display_snapshot_capture(display); return error; } @@ -2160,7 +2235,6 @@ i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump void i915_error_state_store(struct i915_gpu_coredump *error) { struct drm_i915_private *i915; - static bool warned; if (IS_ERR_OR_NULL(error)) return; @@ -2174,16 +2248,8 @@ void i915_error_state_store(struct i915_gpu_coredump *error) i915_gpu_coredump_get(error); - if (!xchg(&warned, true) && - ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { - pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); - pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n"); - pr_info("Please see https://drm.pages.freedesktop.org/intel-docs/how-to-file-i915-bugs.html for details.\n"); - pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); - pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); - pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", - i915->drm.primary->index); - } + drm_info(&i915->drm, "GPU error state saved to /sys/class/drm/card%d/error\n", + i915->drm.primary->index); } 
/** @@ -2481,16 +2547,16 @@ static const struct file_operations i915_error_state_fops = { void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) { - struct drm_minor *minor = i915->drm.primary; + struct dentry *debugfs_root = i915->drm.debugfs_root; - debugfs_create_file("i915_error_state", 0644, minor->debugfs_root, i915, + debugfs_create_file("i915_error_state", 0644, debugfs_root, i915, &i915_error_state_fops); - debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, + debugfs_create_file("i915_gpu_info", 0644, debugfs_root, i915, &i915_gpu_info_fops); } static ssize_t error_state_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -2526,7 +2592,7 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj, } static ssize_t error_state_write(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *kdev = kobj_to_dev(kobj); |
