From 80b6ef8ae25ade45e6418df3ddf699a5a10a7ca4 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 21 Feb 2025 10:50:34 +0000 Subject: drm/amdgpu: Pop jobs from the queue more robustly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace a copy of DRM scheduler's to_drm_sched_job with a copy of a newly added drm_sched_entity_queue_pop. This allows breaking the hidden dependency that queue_node has to be the first element in struct drm_sched_job. A comment is also added with a reference to the mailing list discussion explaining the copied helper will be removed when the whole broken amdgpu_job_stop_all_jobs_on_sched is removed. Signed-off-by: Tvrtko Ursulin Cc: Christian König Cc: Danilo Krummrich Cc: Matthew Brost Cc: Philipp Stanner Cc: Zhang, Hawking Reviewed-by: Christian König Signed-off-by: Philipp Stanner Link: https://patchwork.freedesktop.org/patch/msgid/20250221105038.79665-3-tvrtko.ursulin@igalia.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 100f04475943..1899c601c95c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -411,8 +411,24 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) return fence; } -#define to_drm_sched_job(sched_job) \ - container_of((sched_job), struct drm_sched_job, queue_node) +/* + * This is a duplicate function from DRM scheduler sched_internal.h. + * Plan is to remove it when amdgpu_job_stop_all_jobs_on_sched is removed, due + * latter being incorrect and racy. + * + * See https://lore.kernel.org/amd-gfx/44edde63-7181-44fb-a4f7-94e50514f539@amd.com/ + */ +static struct drm_sched_job * +drm_sched_entity_queue_pop(struct drm_sched_entity *entity) +{ + struct spsc_node *node; + + node = spsc_queue_pop(&entity->job_queue); + if (!node) + return NULL; + + return container_of(node, struct drm_sched_job, queue_node); +} void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) { @@ -425,7 +441,7 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) struct drm_sched_rq *rq = sched->sched_rq[i]; spin_lock(&rq->lock); list_for_each_entry(s_entity, &rq->entities, list) { - while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { + while ((s_job = drm_sched_entity_queue_pop(s_entity))) { struct drm_sched_fence *s_fence = s_job->s_fence; dma_fence_signal(&s_fence->scheduled); -- cgit From c94943b0863ef3b8e88769f0805f715c8247b2bf Mon Sep 17 00:00:00 2001 From: "Jesse.zhang@amd.com" Date: Fri, 21 Feb 2025 10:26:52 +0800 Subject: drm/amdgpu: Update amdgpu_job_timedout to check if the ring is guilty This patch updates the `amdgpu_job_timedout` function to check if the ring is actually guilty of causing the timeout. If not, it skips error handling and fence completion. 
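The is_guilty check described above is an optional per-ring callback; rings that do not provide one are simply assumed guilty. As a rough, hypothetical sketch of what a backend could wire into amdgpu_ring_funcs::is_guilty — the heuristic below is an assumption made for illustration and is not taken from any real amdgpu IP block, though amdgpu_fence_count_emitted() itself does exist in the driver:

/* Illustrative only; assumes the usual amdgpu headers (amdgpu.h, amdgpu_ring.h). */
static bool example_ring_is_guilty(struct amdgpu_ring *ring)
{
	/*
	 * Made-up heuristic: if this ring still has emitted but unsignalled
	 * fences it may well be the queue that hung; if everything it emitted
	 * has in fact signalled by now, the timeout raced with completion (or
	 * came from a shared-engine reset) and the queue should not be marked
	 * guilty.
	 */
	return amdgpu_fence_count_emitted(ring) != 0;
}

/* hooked up next to the existing per-ring .reset callback:
 *	.reset     = <IP-specific reset handler>,
 *	.is_guilty = example_ring_is_guilty,
 */
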
v2: move the is_guilty check down into the queue reset area (Alex) v3: need to call is_guilty before reset (Alex) v4: squash in is_guilty logic fixes (Alex) Signed-off-by: Alex Deucher Signed-off-by: Jesse Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 100f04475943..abfbbc6babe7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -130,29 +130,45 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) amdgpu_vm_put_task_info(ti); } - dma_fence_set_error(&s_job->s_fence->finished, -ETIME); - /* attempt a per ring reset */ if (amdgpu_gpu_recovery && ring->funcs->reset) { + bool is_guilty; + dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); /* stop the scheduler, but don't mess with the * bad job yet because if ring reset fails * we'll fall back to full GPU reset. */ drm_sched_wqueue_stop(&ring->sched); + + /* for engine resets, we need to reset the engine, + * but individual queues may be unaffected. + * check here to make sure the accounting is correct. + */ + if (ring->funcs->is_guilty) + is_guilty = ring->funcs->is_guilty(ring); + else + is_guilty = true; + + if (is_guilty) + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); + r = amdgpu_ring_reset(ring, job->vmid); if (!r) { if (amdgpu_ring_sched_ready(ring)) drm_sched_stop(&ring->sched, s_job); - atomic_inc(&ring->adev->gpu_reset_counter); - amdgpu_fence_driver_force_completion(ring); + if (is_guilty) { + atomic_inc(&ring->adev->gpu_reset_counter); + amdgpu_fence_driver_force_completion(ring); + } if (amdgpu_ring_sched_ready(ring)) drm_sched_start(&ring->sched, 0); goto exit; } dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); } + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); if (amdgpu_device_should_recover_gpu(ring->adev)) { struct amdgpu_reset_context reset_context; -- cgit From b7fd6528b5ad80eea66df6240f2399602d9fd388 Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 20 Feb 2025 13:27:49 -0300 Subject: drm/amdgpu: Log after a successful ring reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a ring reset happens, the kernel log shows only "amdgpu: Starting ring reset", but when it finishes nothing appears in the log. Explicitly write in the log that the reset has finished correctly. 
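An aside on the first patch in this log: the hidden dependency it removes exists because container_of() only maps a NULL pointer back to NULL when the embedded member sits at offset zero, which is exactly what "queue_node must be the first element" amounted to. A self-contained toy illustration of that pitfall — the structures and pop_job() helper below are invented for the example and are not kernel code:

#include <stddef.h>

struct node { struct node *next; };

struct job {
	struct node queue_node;		/* the old macro only worked while this stayed first */
	int payload;
};

/* Old-style conversion: container_of() open-coded on the popped node. */
#define to_job(n) \
	((struct job *)((char *)(n) - offsetof(struct job, queue_node)))

static struct job *pop_job(struct node **head)
{
	struct node *n = *head;

	/*
	 * Without this explicit check, to_job(NULL) only happens to return
	 * NULL because queue_node sits at offset zero; move the member and a
	 * while (to_job(pop(...))) loop tests a small bogus pointer instead
	 * and then dereferences garbage.
	 */
	if (!n)
		return NULL;

	*head = n->next;
	return to_job(n);
}
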
Reviewed-by: Christian König Signed-off-by: André Almeida Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index abfbbc6babe7..c37bc683253a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -164,6 +164,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } if (amdgpu_ring_sched_ready(ring)) drm_sched_start(&ring->sched, 0); + dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); goto exit; } dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); -- cgit From 9c696cc57c1a6dab6da6b51f4b30a7d16e233cbc Mon Sep 17 00:00:00 2001 From: André Almeida Date: Wed, 26 Feb 2025 10:11:18 -0300 Subject: drm/amdgpu: Create a debug option to disable ring reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to the addition of ring reset, the debug option `debug_disable_soft_recovery` could be used to force a full device reset. Now that we have ring reset, create a debug option to disable them in amdgpu, forcing the driver to go with the full device reset path again when both options are combined. This option is useful for testing and debugging purposes when one wants to test the full reset from userspace. Signed-off-by: André Almeida Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index c37bc683253a..5537c8bfd227 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -131,8 +131,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } /* attempt a per ring reset */ - if (amdgpu_gpu_recovery && - ring->funcs->reset) { + if (unlikely(adev->debug_disable_gpu_ring_reset)) { + dev_err(adev->dev, "Ring reset disabled by debug mask\n"); + } else if (amdgpu_gpu_recovery && ring->funcs->reset) { bool is_guilty; dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); -- cgit From 099f273eff9c4927be47e337ecf9b10df88a99ad Mon Sep 17 00:00:00 2001 From: André Almeida Date: Mon, 24 Feb 2025 22:02:21 -0300 Subject: drm/amdgpu: Trigger a wedged event for ring reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of only triggering a wedged event for complete GPU resets, trigger for ring resets. Regardless of the reset, it's useful for userspace to know that it happened because the kernel will reject further submissions from that app. 
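From userspace the wedged event surfaces as a uevent on the DRM device node. The property name (WEDGED) and the "none" recovery value below follow the DRM device-wedged uAPI as understood here and should be treated as assumptions; error handling is omitted. A minimal libudev listener sketch:

#include <libudev.h>
#include <poll.h>
#include <stdio.h>

int main(void)
{
	struct udev *udev = udev_new();
	struct udev_monitor *mon = udev_monitor_new_from_netlink(udev, "udev");
	struct pollfd pfd;

	udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
	udev_monitor_enable_receiving(mon);
	pfd.fd = udev_monitor_get_fd(mon);
	pfd.events = POLLIN;

	while (poll(&pfd, 1, -1) > 0) {
		struct udev_device *dev = udev_monitor_receive_device(mon);
		const char *wedged;

		if (!dev)
			continue;
		/* assumed property name; "none" is expected for a plain ring reset */
		wedged = udev_device_get_property_value(dev, "WEDGED");
		if (wedged)
			printf("%s: device wedged, recovery=%s\n",
			       udev_device_get_sysname(dev), wedged);
		udev_device_unref(dev);
	}

	udev_unref(udev);
	return 0;
}

Built with e.g. cc wedged.c -ludev; a compositor or test harness would normally fold this into its existing event loop rather than run it standalone.
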
Reviewed-by: Christian König Signed-off-by: André Almeida Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 1d26be3c6d9d..935df2cdcc16 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -166,6 +166,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) if (amdgpu_ring_sched_ready(ring)) drm_sched_start(&ring->sched, 0); dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); goto exit; } dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); -- cgit From bd22e44ad415ac22e3a4f9a983d2a085f6cb4427 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 15 Jan 2025 13:44:26 +0100 Subject: drm/amdgpu: rework how isolation is enforced v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Limiting the number of available VMIDs to enforce isolation causes some issues with gang submit and applying certain HW workarounds which require multiple VMIDs to work correctly. So instead start to track all submissions to the relevant engines in a per partition data structure and use the dma_fences of the submissions to enforce isolation similar to what a VMID limit does. v2: use ~0l for jobs without isolation to distinct it from kernel submissions which uses NULL for the owner. Add some warning when we are OOM. Signed-off-by: Christian König Acked-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 935df2cdcc16..acb21fc8b3ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -361,17 +361,24 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job, { struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); struct amdgpu_job *job = to_amdgpu_job(sched_job); - struct dma_fence *fence = NULL; + struct dma_fence *fence; int r; r = drm_sched_entity_error(s_entity); if (r) goto error; - if (job->gang_submit) + if (job->gang_submit) { fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit); + if (fence) + return fence; + } + + fence = amdgpu_device_enforce_isolation(ring->adev, ring, job); + if (fence) + return fence; - if (!fence && job->vm && !job->vmid) { + if (job->vm && !job->vmid) { r = amdgpu_vmid_grab(job->vm, ring, job, &fence); if (r) { dev_err(ring->adev->dev, "Error getting VM ID (%d)\n", r); @@ -384,9 +391,10 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job, */ if (!fence) job->vm = NULL; + return fence; } - return fence; + return NULL; error: dma_fence_set_error(&job->base.s_fence->finished, r); -- cgit
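The early returns introduced above rely on the drm_sched contract that the prepare_job hook is invoked repeatedly, with the scheduler waiting on each returned fence before asking again, until a NULL return signals that every dependency is met. A deliberately simplified sketch of that contract — the blocking wait stands in for the scheduler's real fence-callback machinery, and the one-argument function pointer is a simplification of the actual hook signature:

/* Sketch only; assumes <linux/dma-fence.h> and <drm/gpu_scheduler.h>. */
static void run_job_when_ready(struct drm_sched_job *job,
			       struct dma_fence *(*prepare)(struct drm_sched_job *))
{
	struct dma_fence *dep;

	/*
	 * Keep asking for the next unmet dependency; amdgpu_job_prepare_job
	 * above may hand back the gang-switch fence, then the isolation
	 * fence, then the VMID grab fence on successive calls.
	 */
	while ((dep = prepare(job))) {
		dma_fence_wait(dep, false);	/* real scheduler: adds a callback, never blocks here */
		dma_fence_put(dep);
	}

	/* all dependencies resolved, the job can be pushed to the hardware ring */
}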