diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 71 |
1 files changed, 26 insertions, 45 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index ddb9d3269357..e6061d45f142 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -89,10 +89,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) { struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); struct amdgpu_job *job = to_amdgpu_job(s_job); - struct amdgpu_task_info *ti; + struct drm_wedge_task_info *info = NULL; + struct amdgpu_task_info *ti = NULL; struct amdgpu_device *adev = ring->adev; - int idx; - int r; + int idx, r; if (!drm_dev_enter(adev_to_drm(adev), &idx)) { dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s", @@ -112,6 +112,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) amdgpu_job_core_dump(adev, job); if (amdgpu_gpu_recovery && + amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_SOFT_RESET) && amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { dev_err(adev->dev, "ring %s timeout, but soft recovered\n", s_job->sched->name); @@ -124,53 +125,30 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); if (ti) { - dev_err(adev->dev, - "Process information: process %s pid %d thread %s pid %d\n", - ti->process_name, ti->tgid, ti->task_name, ti->pid); - amdgpu_vm_put_task_info(ti); + amdgpu_vm_print_task_info(adev, ti); + info = &ti->task; } /* attempt a per ring reset */ if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "Ring reset disabled by debug mask\n"); - } else if (amdgpu_gpu_recovery && ring->funcs->reset) { - bool is_guilty; - - dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); - /* stop the scheduler, but don't mess with the - * bad job yet because if ring reset fails - * we'll fall back to full GPU reset. - */ - drm_sched_wqueue_stop(&ring->sched); - - /* for engine resets, we need to reset the engine, - * but individual queues may be unaffected. - * check here to make sure the accounting is correct. - */ - if (ring->funcs->is_guilty) - is_guilty = ring->funcs->is_guilty(ring); - else - is_guilty = true; - - if (is_guilty) - dma_fence_set_error(&s_job->s_fence->finished, -ETIME); - - r = amdgpu_ring_reset(ring, job->vmid); + } else if (amdgpu_gpu_recovery && + amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_PER_QUEUE) && + ring->funcs->reset) { + dev_err(adev->dev, "Starting %s ring reset\n", + s_job->sched->name); + r = amdgpu_ring_reset(ring, job->vmid, &job->hw_fence); if (!r) { - if (amdgpu_ring_sched_ready(ring)) - drm_sched_stop(&ring->sched, s_job); - if (is_guilty) { - atomic_inc(&ring->adev->gpu_reset_counter); - amdgpu_fence_driver_force_completion(ring); - } - if (amdgpu_ring_sched_ready(ring)) - drm_sched_start(&ring->sched, 0); - dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); + atomic_inc(&ring->adev->gpu_reset_counter); + dev_err(adev->dev, "Ring %s reset succeeded\n", + ring->sched.name); + drm_dev_wedged_event(adev_to_drm(adev), + DRM_WEDGE_RECOVERY_NONE, info); goto exit; } - dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); + dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name); } + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); if (amdgpu_device_should_recover_gpu(ring->adev)) { @@ -198,13 +176,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } exit: + amdgpu_vm_put_task_info(ti); drm_dev_exit(idx); - return DRM_GPU_SCHED_STAT_NOMINAL; + return DRM_GPU_SCHED_STAT_RESET; } int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct drm_sched_entity *entity, void *owner, - unsigned int num_ibs, struct amdgpu_job **job) + unsigned int num_ibs, struct amdgpu_job **job, + u64 drm_client_id) { if (num_ibs == 0) return -EINVAL; @@ -222,7 +202,8 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (!entity) return 0; - return drm_sched_job_init(&(*job)->base, entity, 1, owner); + return drm_sched_job_init(&(*job)->base, entity, 1, owner, + drm_client_id); } int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, @@ -232,7 +213,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, { int r; - r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job); + r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job, 0); if (r) return r; |