diff options
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
| -rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.c | 612 |
1 files changed, 511 insertions, 101 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 904f21e1380c..7582ef34bf3f 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -28,6 +28,7 @@ */ #include <linux/ascii85.h> +#include <linux/debugfs.h> #include <linux/highmem.h> #include <linux/nmi.h> #include <linux/pagevec.h> @@ -39,8 +40,7 @@ #include <drm/drm_cache.h> #include <drm/drm_print.h> -#include "display/intel_dmc.h" -#include "display/intel_overlay.h" +#include "display/intel_display_snapshot.h" #include "gem/i915_gem_context.h" #include "gem/i915_gem_lmem.h" @@ -57,6 +57,7 @@ #include "i915_memcpy.h" #include "i915_reg.h" #include "i915_scatterlist.h" +#include "i915_sysfs.h" #include "i915_utils.h" #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) @@ -187,64 +188,64 @@ i915_error_printer(struct drm_i915_error_state_buf *e) } /* single threaded page allocator with a reserved stash for emergencies */ -static void pool_fini(struct pagevec *pv) +static void pool_fini(struct folio_batch *fbatch) { - pagevec_release(pv); + folio_batch_release(fbatch); } -static int pool_refill(struct pagevec *pv, gfp_t gfp) +static int pool_refill(struct folio_batch *fbatch, gfp_t gfp) { - while (pagevec_space(pv)) { - struct page *p; + while (folio_batch_space(fbatch)) { + struct folio *folio; - p = alloc_page(gfp); - if (!p) + folio = folio_alloc(gfp, 0); + if (!folio) return -ENOMEM; - pagevec_add(pv, p); + folio_batch_add(fbatch, folio); } return 0; } -static int pool_init(struct pagevec *pv, gfp_t gfp) +static int pool_init(struct folio_batch *fbatch, gfp_t gfp) { int err; - pagevec_init(pv); + folio_batch_init(fbatch); - err = pool_refill(pv, gfp); + err = pool_refill(fbatch, gfp); if (err) - pool_fini(pv); + pool_fini(fbatch); return err; } -static void *pool_alloc(struct pagevec *pv, gfp_t gfp) +static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp) { - struct page *p; + struct folio *folio; - p = alloc_page(gfp); - if (!p && pagevec_count(pv)) - p = pv->pages[--pv->nr]; + folio = folio_alloc(gfp, 0); + if (!folio && folio_batch_count(fbatch)) + folio = fbatch->folios[--fbatch->nr]; - return p ? page_address(p) : NULL; + return folio ? folio_address(folio) : NULL; } -static void pool_free(struct pagevec *pv, void *addr) +static void pool_free(struct folio_batch *fbatch, void *addr) { - struct page *p = virt_to_page(addr); + struct folio *folio = virt_to_folio(addr); - if (pagevec_space(pv)) - pagevec_add(pv, p); + if (folio_batch_space(fbatch)) + folio_batch_add(fbatch, folio); else - __free_page(p); + folio_put(folio); } #ifdef CONFIG_DRM_I915_COMPRESS_ERROR struct i915_vma_compress { - struct pagevec pool; + struct folio_batch pool; struct z_stream_s zstream; void *tmp; }; @@ -381,7 +382,7 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m) #else struct i915_vma_compress { - struct pagevec pool; + struct folio_batch pool; }; static bool compress_init(struct i915_vma_compress *c) @@ -505,6 +506,7 @@ static void error_print_context(struct drm_i915_error_state_buf *m, header, ctx->comm, ctx->pid, ctx->sched_attr.priority, ctx->guilty, ctx->active, ctx->total_runtime, ctx->avg_runtime); + err_printf(m, " context timeline seqno %u\n", ctx->hwsp_seqno); } static struct i915_vma_coredump * @@ -519,7 +521,7 @@ __find_vma(struct i915_vma_coredump *vma, const char *name) return NULL; } -struct i915_vma_coredump * +static struct i915_vma_coredump * intel_gpu_error_find_batch(const struct intel_engine_coredump *ee) { return __find_vma(ee->vma, "batch"); @@ -608,9 +610,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) va_end(args); } -void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m, - const struct intel_engine_cs *engine, - const struct i915_vma_coredump *vma) +static void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m, + const struct intel_engine_cs *engine, + const struct i915_vma_coredump *vma) { char out[ASCII85_BUFSZ]; struct page *page; @@ -683,6 +685,74 @@ static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); } +/* This list includes registers that are useful in debugging GuC hangs. */ +const struct { + u32 start; + u32 count; +} guc_hw_reg_state[] = { + { 0xc0b0, 2 }, + { 0xc000, 65 }, + { 0xc140, 1 }, + { 0xc180, 16 }, + { 0xc1dc, 10 }, + { 0xc300, 79 }, + { 0xc4b4, 47 }, + { 0xc574, 1 }, + { 0xc57c, 1 }, + { 0xc584, 11 }, + { 0xc5c0, 8 }, + { 0xc5e4, 1 }, + { 0xc5ec, 103 }, + { 0xc7c0, 1 }, + { 0xc0b0, 2 } +}; + +static u32 print_range_line(struct drm_i915_error_state_buf *m, u32 start, u32 *dump, u32 count) +{ + if (count >= 8) { + err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n", + start, dump[0], dump[1], dump[2], dump[3], + dump[4], dump[5], dump[6], dump[7]); + return 8; + } else if (count >= 4) { + err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x\n", + start, dump[0], dump[1], dump[2], dump[3]); + return 4; + } else if (count >= 2) { + err_printf(m, "[0x%04x] 0x%08x 0x%08x\n", start, dump[0], dump[1]); + return 2; + } + + err_printf(m, "[0x%04x] 0x%08x\n", start, dump[0]); + return 1; +} + +static void err_print_guc_hw_state(struct drm_i915_error_state_buf *m, u32 *hw_state) +{ + u32 total = 0; + int i; + + if (!hw_state) + return; + + err_printf(m, "GuC Register State:\n"); + + for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) { + u32 entry = 0; + + while (entry < guc_hw_reg_state[i].count) { + u32 start = guc_hw_reg_state[i].start + entry * sizeof(u32); + u32 count = guc_hw_reg_state[i].count - entry; + u32 *values = hw_state + total + entry; + + entry += print_range_line(m, start, values, count); + } + + GEM_BUG_ON(entry != guc_hw_reg_state[i].count); + total += entry; + } +} + static void err_print_uc(struct drm_i915_error_state_buf *m, const struct intel_uc_coredump *error_uc) { @@ -691,6 +761,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, intel_uc_fw_dump(&error_uc->guc_fw, &p); intel_uc_fw_dump(&error_uc->huc_fw, &p); err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp); + err_print_guc_hw_state(m, error_uc->guc.hw_state); intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log); err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence); err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0); @@ -724,13 +795,6 @@ static void err_print_gt_info(struct drm_i915_error_state_buf *m, intel_sseu_print_topology(gt->_gt->i915, >->info.sseu, &p); } -static void err_print_gt_display(struct drm_i915_error_state_buf *m, - struct intel_gt_coredump *gt) -{ - err_printf(m, "IER: 0x%08x\n", gt->ier); - err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr); -} - static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m, struct intel_gt_coredump *gt) { @@ -807,10 +871,15 @@ static void err_print_gt_engines(struct drm_i915_error_state_buf *m, for (ee = gt->engine; ee; ee = ee->next) { const struct i915_vma_coredump *vma; - if (ee->guc_capture_node) - intel_guc_capture_print_engine_node(m, ee); - else + if (gt->uc && gt->uc->guc.is_guc_capture) { + if (ee->guc_capture_node) + intel_guc_capture_print_engine_node(m, ee); + else + err_printf(m, " Missing GuC capture node for %s\n", + ee->engine->name); + } else { error_print_engine(m, ee); + } err_printf(m, " hung: %u\n", ee->hung); err_printf(m, " engine reset count: %u\n", ee->reset_count); @@ -825,6 +894,7 @@ static void err_print_gt_engines(struct drm_i915_error_state_buf *m, static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, struct i915_gpu_coredump *error) { + struct drm_printer p = i915_error_printer(m); const struct intel_engine_coredump *ee; struct timespec64 ts; @@ -833,7 +903,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, "Kernel: %s %s\n", init_utsname()->release, init_utsname()->machine); - err_printf(m, "Driver: %s\n", DRIVER_DATE); ts = ktime_to_timespec64(error->time); err_printf(m, "Time: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); @@ -862,8 +931,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, "IOMMU enabled?: %d\n", error->iommu); - intel_dmc_print_error_state(m, m->i915); - err_printf(m, "RPM wakelock: %s\n", str_yes_no(error->wakelock)); err_printf(m, "PM suspended: %s\n", str_yes_no(error->suspended)); @@ -873,7 +940,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (error->gt->uc && error->gt->uc->guc.is_guc_capture) print_guc_capture = true; - err_print_gt_display(m, error->gt); err_print_gt_global_nonguc(m, error->gt); err_print_gt_fences(m, error->gt); @@ -892,11 +958,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_print_gt_info(m, error->gt); } - if (error->overlay) - intel_overlay_print_error_state(m, error->overlay); - err_print_capabilities(m, error); err_print_params(m, &error->params); + + intel_display_snapshot_print(error->display_snapshot, &p); } static int err_print_to_sgl(struct i915_gpu_coredump *error) @@ -1029,6 +1094,7 @@ static void cleanup_uc(struct intel_uc_coredump *uc) kfree(uc->huc_fw.file_wanted.path); i915_vma_coredump_free(uc->guc.vma_log); i915_vma_coredump_free(uc->guc.vma_ctb); + kfree(uc->guc.hw_state); kfree(uc); } @@ -1063,7 +1129,7 @@ void __i915_gpu_coredump_free(struct kref *error_ref) cleanup_gt(gt); } - kfree(error->overlay); + intel_display_snapshot_free(error->display_snapshot); cleanup_params(error); @@ -1099,7 +1165,7 @@ i915_vma_coredump_create(const struct intel_gt *gt, } INIT_LIST_HEAD(&dst->page_list); - strcpy(dst->name, name); + strscpy(dst->name, name); dst->next = NULL; dst->gtt_offset = vma_res->start; @@ -1116,10 +1182,14 @@ i915_vma_coredump_create(const struct intel_gt *gt, mutex_lock(&ggtt->error_mutex); if (ggtt->vm.raw_insert_page) ggtt->vm.raw_insert_page(&ggtt->vm, dma, slot, - I915_CACHE_NONE, 0); + i915_gem_get_pat_index(gt->i915, + I915_CACHE_NONE), + 0); else ggtt->vm.insert_page(&ggtt->vm, dma, slot, - I915_CACHE_NONE, 0); + i915_gem_get_pat_index(gt->i915, + I915_CACHE_NONE), + 0); mb(); s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE); @@ -1142,7 +1212,7 @@ i915_vma_coredump_create(const struct intel_gt *gt, dma_addr_t offset = dma - mem->region.start; void __iomem *s; - if (offset + PAGE_SIZE > mem->io_size) { + if (offset + PAGE_SIZE > resource_size(&mem->io)) { ret = -EINVAL; break; } @@ -1163,9 +1233,9 @@ i915_vma_coredump_create(const struct intel_gt *gt, drm_clflush_pages(&page, 1); - s = kmap(page); + s = kmap_local_page(page); ret = compress_page(compress, s, dst, false); - kunmap(page); + kunmap_local(s); drm_clflush_pages(&page, 1); @@ -1222,7 +1292,15 @@ static void engine_record_registers(struct intel_engine_coredump *ee) if (GRAPHICS_VER(i915) >= 6) { ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) + /* + * For the media GT, this ring fault register is not replicated, + * so don't do multicast/replicated register read/write + * operation on it. + */ + if (MEDIA_VER(i915) >= 13 && engine->gt->type == GT_MEDIA) + ee->fault_reg = intel_uncore_read(engine->uncore, + XELPMP_RING_FAULT_REG); + else if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55)) ee->fault_reg = intel_gt_mcr_read_any(engine->gt, XEHP_RING_FAULT_REG); else if (GRAPHICS_VER(i915) >= 12) @@ -1387,7 +1465,7 @@ static bool record_context(struct i915_gem_context_coredump *e, rcu_read_lock(); task = pid_task(ctx->pid, PIDTYPE_PID); if (task) { - strcpy(e->comm, task->comm); + strscpy(e->comm, task->comm); e->pid = task->pid; } rcu_read_unlock(); @@ -1395,6 +1473,8 @@ static bool record_context(struct i915_gem_context_coredump *e, e->sched_attr = ctx->sched; e->guilty = atomic_read(&ctx->guilty_count); e->active = atomic_read(&ctx->active_count); + e->hwsp_seqno = (ce->timeline && ce->timeline->hwsp_seqno) ? + *ce->timeline->hwsp_seqno : ~0U; e->total_runtime = intel_context_get_total_runtime_ns(ce); e->avg_runtime = intel_context_get_avg_runtime_ns(ce); @@ -1431,7 +1511,7 @@ capture_vma_snapshot(struct intel_engine_capture_vma *next, return next; } - strcpy(c->name, name); + strscpy(c->name, name); c->vma_res = i915_vma_resource_get(vma_res); c->next = next; @@ -1624,9 +1704,21 @@ capture_engine(struct intel_engine_cs *engine, return NULL; intel_engine_get_hung_entity(engine, &ce, &rq); - if (rq && !i915_request_started(rq)) - drm_info(&engine->gt->i915->drm, "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", - engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id); + if (rq && !i915_request_started(rq)) { + /* + * We want to know also what is the guc_id of the context, + * but if we don't have the context reference, then skip + * printing it. + */ + if (ce) + drm_info(&engine->gt->i915->drm, + "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n", + engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id); + else + drm_info(&engine->gt->i915->drm, + "Got hung context on %s with active request %lld:%lld not yet started\n", + engine->name, rq->fence.context, rq->fence.seqno); + } if (rq) { capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); @@ -1699,6 +1791,37 @@ static void gt_record_guc_ctb(struct intel_ctb_coredump *saved, saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr; } +static u32 read_guc_state_reg(struct intel_uncore *uncore, int range, int count) +{ + GEM_BUG_ON(range >= ARRAY_SIZE(guc_hw_reg_state)); + GEM_BUG_ON(count >= guc_hw_reg_state[range].count); + + return intel_uncore_read(uncore, + _MMIO(guc_hw_reg_state[range].start + count * sizeof(u32))); +} + +static void gt_record_guc_hw_state(struct intel_uncore *uncore, + struct intel_uc_coredump *error_uc) +{ + u32 *hw_state; + u32 count = 0; + int i, j; + + for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) + count += guc_hw_reg_state[i].count; + + hw_state = kcalloc(count, sizeof(u32), ALLOW_FAIL); + if (!hw_state) + return; + + count = 0; + for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++) + for (j = 0; j < guc_hw_reg_state[i].count; j++) + hw_state[count++] = read_guc_state_reg(uncore, i, j); + + error_uc->guc.hw_state = hw_state; +} + static struct intel_uc_coredump * gt_record_uc(struct intel_gt_coredump *gt, struct i915_vma_compress *compress) @@ -1733,31 +1856,11 @@ gt_record_uc(struct intel_gt_coredump *gt, uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv, uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc); + gt_record_guc_hw_state(gt->_gt->uncore, error_uc); return error_uc; } -/* Capture display registers. */ -static void gt_record_display_regs(struct intel_gt_coredump *gt) -{ - struct intel_uncore *uncore = gt->_gt->uncore; - struct drm_i915_private *i915 = uncore->i915; - - if (GRAPHICS_VER(i915) >= 6) - gt->derrmr = intel_uncore_read(uncore, DERRMR); - - if (GRAPHICS_VER(i915) >= 8) - gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); - else if (IS_VALLEYVIEW(i915)) - gt->ier = intel_uncore_read(uncore, VLV_IER); - else if (HAS_PCH_SPLIT(i915)) - gt->ier = intel_uncore_read(uncore, DEIER); - else if (GRAPHICS_VER(i915) == 2) - gt->ier = intel_uncore_read16(uncore, GEN2_IER); - else - gt->ier = intel_uncore_read(uncore, GEN2_IER); -} - /* Capture all other registers that GuC doesn't capture. */ static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt) { @@ -1791,9 +1894,12 @@ static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt) gt->gtier[i] = intel_uncore_read(uncore, GEN8_GT_IER(i)); gt->ngtier = 4; - } else if (HAS_PCH_SPLIT(i915)) { + } else if (GRAPHICS_VER(i915) >= 5) { gt->gtier[0] = intel_uncore_read(uncore, GTIER); gt->ngtier = 1; + } else { + gt->gtier[0] = intel_uncore_read(uncore, GEN2_IER); + gt->ngtier = 1; } gt->eir = intel_uncore_read(uncore, EIR); @@ -1826,7 +1932,7 @@ static void gt_record_global_regs(struct intel_gt_coredump *gt) if (GRAPHICS_VER(i915) == 7) gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { + if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55)) { gt->fault_data0 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt, XEHP_FAULT_TLB_DATA0); gt->fault_data1 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt, @@ -1958,7 +2064,7 @@ static void capture_gen(struct i915_gpu_coredump *error) struct drm_i915_private *i915 = error->i915; error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); - error->suspended = i915->runtime_pm.suspended; + error->suspended = pm_runtime_suspended(i915->drm.dev); error->iommu = i915_vtd_active(i915); error->reset_count = i915_reset_count(&i915->gpu_error); @@ -2013,7 +2119,6 @@ intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags) gc->_gt = gt; gc->awake = intel_gt_pm_is_awake(gt); - gt_record_display_regs(gc); gt_record_global_nonguc_regs(gc); /* @@ -2064,6 +2169,7 @@ static struct i915_gpu_coredump * __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags) { struct drm_i915_private *i915 = gt->i915; + struct intel_display *display = i915->display; struct i915_gpu_coredump *error; /* Check if GPU capture has been disabled */ @@ -2105,12 +2211,12 @@ __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 du error->simulated |= error->gt->simulated; } - error->overlay = intel_overlay_capture_error_state(i915); + error->display_snapshot = intel_display_snapshot_capture(display); return error; } -struct i915_gpu_coredump * +static struct i915_gpu_coredump * i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags) { static DEFINE_MUTEX(capture_mutex); @@ -2129,7 +2235,6 @@ i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump void i915_error_state_store(struct i915_gpu_coredump *error) { struct drm_i915_private *i915; - static bool warned; if (IS_ERR_OR_NULL(error)) return; @@ -2143,23 +2248,15 @@ void i915_error_state_store(struct i915_gpu_coredump *error) i915_gpu_coredump_get(error); - if (!xchg(&warned, true) && - ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { - pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); - pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n"); - pr_info("Please see https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs for details.\n"); - pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); - pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); - pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", - i915->drm.primary->index); - } + drm_info(&i915->drm, "GPU error state saved to /sys/class/drm/card%d/error\n", + i915->drm.primary->index); } /** * i915_capture_error_state - capture an error record for later analysis * @gt: intel_gt which originated the hang * @engine_mask: hung engines - * + * @dump_flags: dump flags * * Should be called when an error is detected (either a hang or an error * interrupt) to capture error state from the time of the error. Fills @@ -2181,7 +2278,7 @@ void i915_capture_error_state(struct intel_gt *gt, i915_gpu_coredump_put(error); } -struct i915_gpu_coredump * +static struct i915_gpu_coredump * i915_first_error_state(struct drm_i915_private *i915) { struct i915_gpu_coredump *error; @@ -2216,3 +2313,316 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err) i915->gpu_error.first_error = ERR_PTR(err); spin_unlock_irq(&i915->gpu_error.lock); } + +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) +void intel_klog_error_capture(struct intel_gt *gt, + intel_engine_mask_t engine_mask) +{ + static int g_count; + struct drm_i915_private *i915 = gt->i915; + struct i915_gpu_coredump *error; + intel_wakeref_t wakeref; + size_t buf_size = PAGE_SIZE * 128; + size_t pos_err; + char *buf, *ptr, *next; + int l_count = g_count++; + int line = 0; + + /* Can't allocate memory during a reset */ + if (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { + drm_err(>->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n", + l_count, line++); + return; + } + + error = READ_ONCE(i915->gpu_error.first_error); + if (error) { + drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n", + l_count, line++); + i915_reset_error_state(i915); + } + + with_intel_runtime_pm(&i915->runtime_pm, wakeref) + error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE); + + if (IS_ERR(error)) { + drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n", + l_count, line++, PTR_ERR(error)); + return; + } + + buf = kvmalloc(buf_size, GFP_KERNEL); + if (!buf) { + drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n", + l_count, line++); + i915_gpu_coredump_put(error); + return; + } + + drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n", + l_count, line++, __builtin_return_address(0)); + + /* Largest string length safe to print via dmesg */ +# define MAX_CHUNK 800 + + pos_err = 0; + while (1) { + ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1); + + if (got <= 0) + break; + + buf[got] = 0; + pos_err += got; + + ptr = buf; + while (got > 0) { + size_t count; + char tag[2]; + + next = strnchr(ptr, got, '\n'); + if (next) { + count = next - ptr; + *next = 0; + tag[0] = '>'; + tag[1] = '<'; + } else { + count = got; + tag[0] = '}'; + tag[1] = '{'; + } + + if (count > MAX_CHUNK) { + size_t pos; + char *ptr2 = ptr; + + for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) { + char chr = ptr[pos]; + + ptr[pos] = 0; + drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n", + l_count, line++, ptr2); + ptr[pos] = chr; + ptr2 = ptr + pos; + + /* + * If spewing large amounts of data via a serial console, + * this can be a very slow process. So be friendly and try + * not to cause 'softlockup on CPU' problems. + */ + cond_resched(); + } + + if (ptr2 < (ptr + count)) + drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n", + l_count, line++, tag[0], ptr2, tag[1]); + else if (tag[0] == '>') + drm_info(&i915->drm, "[Capture/%d.%d] ><\n", + l_count, line++); + } else { + drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n", + l_count, line++, tag[0], ptr, tag[1]); + } + + ptr = next; + got -= count; + if (next) { + ptr++; + got--; + } + + /* As above. */ + cond_resched(); + } + + if (got) + drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n", + l_count, line++, got); + } + + kvfree(buf); + + drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err); +} +#endif + +static ssize_t gpu_state_read(struct file *file, char __user *ubuf, + size_t count, loff_t *pos) +{ + struct i915_gpu_coredump *error; + ssize_t ret; + void *buf; + + error = file->private_data; + if (!error) + return 0; + + /* Bounce buffer required because of kernfs __user API convenience. */ + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count); + if (ret <= 0) + goto out; + + if (!copy_to_user(ubuf, buf, ret)) + *pos += ret; + else + ret = -EFAULT; + +out: + kfree(buf); + return ret; +} + +static int gpu_state_release(struct inode *inode, struct file *file) +{ + i915_gpu_coredump_put(file->private_data); + return 0; +} + +static int i915_gpu_info_open(struct inode *inode, struct file *file) +{ + struct drm_i915_private *i915 = inode->i_private; + struct i915_gpu_coredump *gpu; + intel_wakeref_t wakeref; + + gpu = NULL; + with_intel_runtime_pm(&i915->runtime_pm, wakeref) + gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES, CORE_DUMP_FLAG_NONE); + + if (IS_ERR(gpu)) + return PTR_ERR(gpu); + + file->private_data = gpu; + return 0; +} + +static const struct file_operations i915_gpu_info_fops = { + .owner = THIS_MODULE, + .open = i915_gpu_info_open, + .read = gpu_state_read, + .llseek = default_llseek, + .release = gpu_state_release, +}; + +static ssize_t +i915_error_state_write(struct file *filp, + const char __user *ubuf, + size_t cnt, + loff_t *ppos) +{ + struct i915_gpu_coredump *error = filp->private_data; + + if (!error) + return 0; + + drm_dbg(&error->i915->drm, "Resetting error state\n"); + i915_reset_error_state(error->i915); + + return cnt; +} + +static int i915_error_state_open(struct inode *inode, struct file *file) +{ + struct i915_gpu_coredump *error; + + error = i915_first_error_state(inode->i_private); + if (IS_ERR(error)) + return PTR_ERR(error); + + file->private_data = error; + return 0; +} + +static const struct file_operations i915_error_state_fops = { + .owner = THIS_MODULE, + .open = i915_error_state_open, + .read = gpu_state_read, + .write = i915_error_state_write, + .llseek = default_llseek, + .release = gpu_state_release, +}; + +void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) +{ + struct dentry *debugfs_root = i915->drm.debugfs_root; + + debugfs_create_file("i915_error_state", 0644, debugfs_root, i915, + &i915_error_state_fops); + debugfs_create_file("i915_gpu_info", 0644, debugfs_root, i915, + &i915_gpu_info_fops); +} + +static ssize_t error_state_read(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); + struct i915_gpu_coredump *gpu; + ssize_t ret = 0; + + /* + * FIXME: Concurrent clients triggering resets and reading + clearing + * dumps can cause inconsistent sysfs reads when a user calls in with a + * non-zero offset to complete a prior partial read but the + * gpu_coredump has been cleared or replaced. + */ + + gpu = i915_first_error_state(i915); + if (IS_ERR(gpu)) { + ret = PTR_ERR(gpu); + } else if (gpu) { + ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count); + i915_gpu_coredump_put(gpu); + } else { + const char *str = "No error state collected\n"; + size_t len = strlen(str); + + if (off < len) { + ret = min_t(size_t, count, len - off); + memcpy(buf, str + off, ret); + } + } + + return ret; +} + +static ssize_t error_state_write(struct file *file, struct kobject *kobj, + const struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); + + drm_dbg(&dev_priv->drm, "Resetting error state\n"); + i915_reset_error_state(dev_priv); + + return count; +} + +static const struct bin_attribute error_state_attr = { + .attr.name = "error", + .attr.mode = S_IRUSR | S_IWUSR, + .size = 0, + .read = error_state_read, + .write = error_state_write, +}; + +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr)) + drm_err(&i915->drm, "error_state sysfs setup failed\n"); +} + +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); +} |
