Diffstat (limited to 'drivers/gpu/drm/xe/xe_guc_submit.c')
-rw-r--r-- | drivers/gpu/drm/xe/xe_guc_submit.c | 77
1 file changed, 55 insertions, 22 deletions
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 6d84a52b660a..cafb47711e9b 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -498,6 +498,15 @@ static void __register_mlrc_exec_queue(struct xe_guc *guc,
 		action[len++] = upper_32_bits(xe_lrc_descriptor(lrc));
 	}
 
+	/* explicitly checks some fields that we might fixup later */
+	xe_gt_assert(guc_to_gt(guc), info->wq_desc_lo ==
+		     action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_5_WQ_DESC_ADDR_LOWER]);
+	xe_gt_assert(guc_to_gt(guc), info->wq_base_lo ==
+		     action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_7_WQ_BUF_BASE_LOWER]);
+	xe_gt_assert(guc_to_gt(guc), q->width ==
+		     action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_10_NUM_CTXS]);
+	xe_gt_assert(guc_to_gt(guc), info->hwlrca_lo ==
+		     action[XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_11_HW_LRC_ADDR]);
 	xe_gt_assert(guc_to_gt(guc), len <= MAX_MLRC_REG_SIZE);
 #undef MAX_MLRC_REG_SIZE
 
@@ -522,6 +531,14 @@ static void __register_exec_queue(struct xe_guc *guc,
 		info->hwlrca_hi,
 	};
 
+	/* explicitly checks some fields that we might fixup later */
+	xe_gt_assert(guc_to_gt(guc), info->wq_desc_lo ==
+		     action[XE_GUC_REGISTER_CONTEXT_DATA_5_WQ_DESC_ADDR_LOWER]);
+	xe_gt_assert(guc_to_gt(guc), info->wq_base_lo ==
+		     action[XE_GUC_REGISTER_CONTEXT_DATA_7_WQ_BUF_BASE_LOWER]);
+	xe_gt_assert(guc_to_gt(guc), info->hwlrca_lo ==
+		     action[XE_GUC_REGISTER_CONTEXT_DATA_10_HW_LRC_ADDR]);
+
 	xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0);
 }
 
@@ -891,12 +908,13 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
 	struct xe_exec_queue *q = ge->q;
 	struct xe_guc *guc = exec_queue_to_guc(q);
 	struct xe_gpu_scheduler *sched = &ge->sched;
-	bool wedged;
+	bool wedged = false;
 
 	xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
 	trace_xe_exec_queue_lr_cleanup(q);
 
-	wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
+	if (!exec_queue_killed(q))
+		wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
 
 	/* Kill the run_job / process_msg entry points */
 	xe_sched_submission_stop(sched);
@@ -970,10 +988,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
 	 */
 	xe_gt_assert(gt, timeout_ms < 100 * MSEC_PER_SEC);
 
-	if (ctx_timestamp < ctx_job_timestamp)
-		diff = ctx_timestamp + U32_MAX - ctx_job_timestamp;
-	else
-		diff = ctx_timestamp - ctx_job_timestamp;
+	diff = ctx_timestamp - ctx_job_timestamp;
 
 	/*
 	 * Ensure timeout is within 5% to account for a GuC scheduling latency
@@ -1070,7 +1085,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	int err = -ETIME;
 	pid_t pid = -1;
 	int i = 0;
-	bool wedged, skip_timeout_check;
+	bool wedged = false, skip_timeout_check;
 
 	/*
 	 * TDR has fired before free job worker. Common if exec queue
@@ -1078,12 +1093,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	 * list so job can be freed and kick scheduler ensuring free job is not
 	 * lost.
 	 */
-	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
-		xe_sched_add_pending_job(sched, job);
-		xe_sched_submission_start(sched);
-
-		return DRM_GPU_SCHED_STAT_NOMINAL;
-	}
+	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags))
+		return DRM_GPU_SCHED_STAT_NO_HANG;
 
 	/* Kill the run_job entry point */
 	xe_sched_submission_stop(sched);
@@ -1116,7 +1127,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
 	 * doesn't work for SRIOV. For now assuming timeouts in wedged mode are
 	 * genuine timeouts.
 	 */
-	wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
+	if (!exec_queue_killed(q))
+		wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
 
 	/* Engine state now stable, disable scheduling to check timestamp */
 	if (!wedged && exec_queue_registered(q)) {
@@ -1251,7 +1263,7 @@ trigger_reset:
 	/* Start fence signaling */
 	xe_hw_fence_irq_start(q->fence_irq);
 
-	return DRM_GPU_SCHED_STAT_NOMINAL;
+	return DRM_GPU_SCHED_STAT_RESET;
 
 sched_enable:
 	enable_scheduling(q);
@@ -1261,10 +1273,8 @@ rearm:
 	 * but there is not currently an easy way to do in DRM scheduler. With
 	 * some thought, do this in a follow up.
 	 */
-	xe_sched_add_pending_job(sched, job);
 	xe_sched_submission_start(sched);
-
-	return DRM_GPU_SCHED_STAT_NOMINAL;
+	return DRM_GPU_SCHED_STAT_NO_HANG;
 }
 
 static void __guc_exec_queue_fini_async(struct work_struct *w)
@@ -1285,7 +1295,11 @@ static void __guc_exec_queue_fini_async(struct work_struct *w)
 	xe_sched_entity_fini(&ge->entity);
 	xe_sched_fini(&ge->sched);
 
-	kfree(ge);
+	/*
+	 * RCU free due to the sched being exported via DRM scheduler fences
+	 * (timeline name).
+	 */
+	kfree_rcu(ge, rcu);
 	xe_exec_queue_fini(q);
 	xe_pm_runtime_put(guc_to_xe(guc));
 }
@@ -1468,6 +1482,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
 
 	q->guc = ge;
 	ge->q = q;
+	init_rcu_head(&ge->rcu);
 	init_waitqueue_head(&ge->suspend_wait);
 
 	for (i = 0; i < MAX_STATIC_MSG_TYPE; ++i)
@@ -1762,6 +1777,9 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
 {
 	int ret;
 
+	if (!guc->submission_state.initialized)
+		return 0;
+
 	/*
 	 * Using an atomic here rather than submission_state.lock as this
 	 * function can be called while holding the CT lock (engine reset
@@ -2068,12 +2086,16 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
 	struct xe_gt *gt = guc_to_gt(guc);
 	struct xe_exec_queue *q;
 	u32 guc_id;
+	u32 type = XE_GUC_CAT_ERR_TYPE_INVALID;
 
-	if (unlikely(len < 1))
+	if (unlikely(!len || len > 2))
 		return -EPROTO;
 
 	guc_id = msg[0];
 
+	if (len == 2)
+		type = msg[1];
+
 	if (guc_id == GUC_ID_UNKNOWN) {
 		/*
 		 * GuC uses GUC_ID_UNKNOWN if it can not map the CAT fault to any PF/VF
@@ -2087,8 +2109,19 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
 	if (unlikely(!q))
 		return -EPROTO;
 
-	xe_gt_dbg(gt, "Engine memory cat error: engine_class=%s, logical_mask: 0x%x, guc_id=%d",
-		  xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
+	/*
+	 * The type is HW-defined and changes based on platform, so we don't
+	 * decode it in the kernel and only check if it is valid.
+	 * See bspec 54047 and 72187 for details.
+	 */
+	if (type != XE_GUC_CAT_ERR_TYPE_INVALID)
+		xe_gt_dbg(gt,
+			  "Engine memory CAT error [%u]: class=%s, logical_mask: 0x%x, guc_id=%d",
+			  type, xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
+	else
+		xe_gt_dbg(gt,
			  "Engine memory CAT error: class=%s, logical_mask: 0x%x, guc_id=%d",
			  xe_hw_engine_class_to_str(q->class), q->logical_mask, guc_id);
 
 	trace_xe_exec_queue_memory_cat_error(q);
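A note on the check_timeout() hunk: the explicit wraparound branch is redundant because u32 subtraction is already modular, and it was also off by one, adding U32_MAX where the modulus is U32_MAX + 1 (2^32). A minimal standalone sketch of the property (plain userspace C, not driver code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Job timestamp sampled just before the 32-bit counter wraps... */
	uint32_t ctx_job_timestamp = UINT32_MAX - 10;
	/* ...and the current timestamp sampled just after the wrap. */
	uint32_t ctx_timestamp = 5;

	/*
	 * Unsigned subtraction is defined modulo 2^32, so the single
	 * expression yields the true elapsed ticks (16) across the wrap.
	 */
	assert(ctx_timestamp - ctx_job_timestamp == 16);

	/* The removed branch computed one tick short across a wrap. */
	assert(ctx_timestamp + UINT32_MAX - ctx_job_timestamp == 15);

	return 0;
}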
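The return-code changes track the DRM scheduler's reworked timeout statuses: DRM_GPU_SCHED_STAT_RESET replaces DRM_GPU_SCHED_STAT_NOMINAL for a handled hang, while DRM_GPU_SCHED_STAT_NO_HANG reports a false positive (here, a job whose fence already signaled), in which case the scheduler core itself re-adds the job to the pending list and restarts submission. That is why the manual xe_sched_add_pending_job()/xe_sched_submission_start() calls disappear. A skeletal timedout_job handler under those semantics (the example_ name is hypothetical; fence lookup details vary per driver):

#include <drm/gpu_scheduler.h>
#include <linux/dma-fence.h>

static enum drm_gpu_sched_stat
example_timedout_job(struct drm_sched_job *sched_job)
{
	/*
	 * False positive: the job completed between the timeout firing
	 * and this handler running. NO_HANG makes the scheduler core
	 * re-add the job to the pending list and restart submission.
	 */
	if (sched_job->s_fence->parent &&
	    dma_fence_is_signaled(sched_job->s_fence->parent))
		return DRM_GPU_SCHED_STAT_NO_HANG;

	/* Genuine hang: driver-specific engine/queue reset goes here. */

	return DRM_GPU_SCHED_STAT_RESET;
}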
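The kfree() to kfree_rcu() switch in __guc_exec_queue_fini_async() defers the free until an RCU grace period elapses, so readers that still dereference the scheduler's exported fence data (the timeline name) under rcu_read_lock() never touch freed memory; kfree_rcu() needs an rcu_head embedded in the structure, hence the new init_rcu_head(&ge->rcu) in guc_exec_queue_init(). A generic sketch of the pattern (struct foo and its helpers are illustrative, not from the driver):

#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	const char *timeline_name;	/* dereferenced under rcu_read_lock() */
	struct rcu_head rcu;		/* storage for the deferred free */
};

/* Reader: safe against a concurrent foo_release(). */
static void foo_print(struct foo __rcu **slot)
{
	struct foo *f;

	rcu_read_lock();
	f = rcu_dereference(*slot);
	if (f)
		pr_info("%s\n", f->timeline_name);
	rcu_read_unlock();
}

/* Writer: unpublish first, free only after all readers are done. */
static void foo_release(struct foo __rcu **slot, struct foo *f)
{
	RCU_INIT_POINTER(*slot, NULL);
	kfree_rcu(f, rcu);	/* actual kfree() runs after the grace period */
}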
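Finally, the CAT-error handler now accepts either the legacy one-dword G2H payload (guc_id only) or the extended two-dword form that appends a platform-defined fault type, and rejects any other length. The parse reduces to the sketch below (the example_ names and the sentinel value are made up; the real sentinel is XE_GUC_CAT_ERR_TYPE_INVALID):

#include <linux/errno.h>
#include <linux/types.h>

/* Hypothetical sentinel meaning: no fault type was reported. */
#define EXAMPLE_CAT_ERR_TYPE_INVALID 0xdeadbeef

static int example_parse_cat_error(const u32 *msg, u32 len,
				   u32 *guc_id, u32 *type)
{
	/* Accept the 1-dword legacy form or the 2-dword extended form. */
	if (!len || len > 2)
		return -EPROTO;

	*guc_id = msg[0];
	*type = (len == 2) ? msg[1] : EXAMPLE_CAT_ERR_TYPE_INVALID;

	return 0;
}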