 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 66 ++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 9f26da7e7e34..42a7b86e41c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -475,6 +475,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
         if (!funcs)
                 return;
 
+        /* Ensure the reset_callback_list is initialized */
+        if (!adev->sdma.reset_callback_list.next) {
+                INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+        }
         /* Initialize the list node in the callback structure */
         INIT_LIST_HEAD(&funcs->list);
 
@@ -517,7 +521,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
          */
         if (!amdgpu_ring_sched_ready(gfx_ring)) {
                 drm_sched_wqueue_stop(&gfx_ring->sched);
-                gfx_sched_stopped = true;;
+                gfx_sched_stopped = true;
         }
 
         if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index dd4bf2e103e5..965169320065 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -126,6 +126,9 @@ struct amdgpu_sdma {
         uint32_t                *ip_dump;
         uint32_t                supported_reset;
         struct list_head        reset_callback_list;
+        /* track guilty state of GFX and PAGE queues */
+        bool                    gfx_guilty;
+        bool                    page_guilty;
 };
 
 /*
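The new guard in amdgpu_sdma_register_on_reset_callbacks() relies on a property of the kernel's struct list_head: a zero-initialized head has next == NULL, while an initialized head points back at itself. A minimal userspace sketch of that idea, using stand-in types rather than the real amdgpu structures:

#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

/* Mirrors the kernel's INIT_LIST_HEAD(): an empty list points at itself */
static void init_list_head(struct list_head *list)
{
        list->next = list;
        list->prev = list;
}

int main(void)
{
        struct list_head cb_list = {0}; /* zeroed, like a freshly allocated adev */

        /* The guard from the patch: initialize only if never initialized */
        if (!cb_list.next)
                init_list_head(&cb_list);

        printf("initialized: %s\n", cb_list.next == &cb_list ? "yes" : "no");
        return 0;
}

With this guard, registering a callback no longer depends on sw_init having run first, which is what allows the unconditional INIT_LIST_HEAD() call to be dropped from sdma_v4_4_2_sw_init() later in this patch.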
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 4ef828df8e56..0f76e2f955eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -671,11 +671,12 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: used to restore wptr when restart
+ * @guilty: whether this queue is the guilty one (caused the timeout/error)
  *
  * Set up the gfx DMA ring buffers and enable them.
  * Returns 0 for success, error for failure.
  */
-static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
+static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
 {
         struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
         u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -683,6 +684,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
         u32 doorbell;
         u32 doorbell_offset;
         u64 wptr_gpu_addr;
+        u64 rwptr;
 
         wb_offset = (ring->rptr_offs * 4);
 
@@ -708,12 +710,20 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
         /* before programing wptr to a less value, need set minor_ptr_update first */
         WREG32_SDMA(i, regSDMA_GFX_MINOR_PTR_UPDATE, 1);
 
+        /* For the guilty queue, set RPTR to the current wptr to skip bad commands;
+         * if it is not the guilty queue, restore the cached rptr and continue execution.
+         */
+        if (guilty)
+                rwptr = ring->wptr;
+        else
+                rwptr = ring->cached_rptr;
+
         /* Initialize the ring buffer's read and write pointers */
         if (restore) {
-                WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
-                WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
-                WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
-                WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
+                WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(rwptr << 2));
+                WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(rwptr << 2));
+                WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(rwptr << 2));
+                WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(rwptr << 2));
         } else {
                 WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, 0);
                 WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, 0);
@@ -768,11 +778,12 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: boolean to say restore needed or not
+ * @guilty: whether this queue is the guilty one (caused the timeout/error)
  *
  * Set up the page DMA ring buffers and enable them.
  * Returns 0 for success, error for failure.
  */
-static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
+static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
 {
         struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
         u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -780,6 +791,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
         u32 doorbell;
         u32 doorbell_offset;
         u64 wptr_gpu_addr;
+        u64 rwptr;
 
         wb_offset = (ring->rptr_offs * 4);
 
@@ -787,12 +799,20 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
         rb_cntl = sdma_v4_4_2_rb_cntl(ring, rb_cntl);
         WREG32_SDMA(i, regSDMA_PAGE_RB_CNTL, rb_cntl);
 
+        /* For the guilty queue, set RPTR to the current wptr to skip bad commands;
+         * if it is not the guilty queue, restore the cached rptr and continue execution.
+         */
+        if (guilty)
+                rwptr = ring->wptr;
+        else
+                rwptr = ring->cached_rptr;
+
         /* Initialize the ring buffer's read and write pointers */
         if (restore) {
-                WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
-                WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
-                WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
-                WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
+                WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, lower_32_bits(rwptr << 2));
+                WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, upper_32_bits(rwptr << 2));
+                WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR, lower_32_bits(rwptr << 2));
+                WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR_HI, upper_32_bits(rwptr << 2));
         } else {
                 WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, 0);
                 WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, 0);
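Both resume paths now make the same decision: a guilty queue restarts at its write pointer, dropping the stale commands between rptr and wptr, while an innocent queue restarts from the read pointer cached before the reset so its outstanding work is replayed. A standalone sketch of just that selection (fake_ring is a simplified stand-in for struct amdgpu_ring; the real registers take byte offsets, hence the << 2 dword-to-byte shift in the patch):

#include <stdint.h>
#include <stdio.h>

struct fake_ring {
        uint64_t wptr;        /* current write pointer, in dwords */
        uint64_t cached_rptr; /* read pointer saved before the reset */
};

/* The same choice made in sdma_v4_4_2_gfx_resume()/_page_resume() */
static uint64_t pick_restart_rptr(const struct fake_ring *ring, int guilty)
{
        return guilty ? ring->wptr : ring->cached_rptr;
}

int main(void)
{
        struct fake_ring ring = { .wptr = 0x140, .cached_rptr = 0x100 };

        /* Guilty: rptr == wptr, so the ring is empty and the bad work is skipped */
        printf("guilty:   rptr = 0x%llx\n",
               (unsigned long long)pick_restart_rptr(&ring, 1));
        /* Innocent: resume from 0x100 and replay the outstanding commands */
        printf("innocent: rptr = 0x%llx\n",
               (unsigned long long)pick_restart_rptr(&ring, 0));
        return 0;
}

Note that the page-queue hunk above also corrects the restore path to program the regSDMA_PAGE_* registers instead of the regSDMA_GFX_* registers it previously wrote.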
@@ -968,9 +988,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
                 uint32_t temp;
 
                 WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
-                sdma_v4_4_2_gfx_resume(adev, i, restore);
+                sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
                 if (adev->sdma.has_page_queue)
-                        sdma_v4_4_2_page_resume(adev, i, restore);
+                        sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
 
                 /* set utc l1 enable flag always to 1 */
                 temp = RREG32_SDMA(i, regSDMA_CNTL);
@@ -1480,7 +1500,9 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
         if (r)
                 return r;
-        INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+        /* Initialize guilty flags for GFX and PAGE queues */
+        adev->sdma.gfx_guilty = false;
+        adev->sdma.page_guilty = false;
 
         return r;
 }
@@ -1644,11 +1666,29 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
 static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id)
 {
         u32 inst_mask;
+        uint64_t rptr;
         struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
 
         if (amdgpu_sriov_vf(adev))
                 return -EINVAL;
 
+        /* Check if this queue is the guilty one */
+        adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
+        if (adev->sdma.has_page_queue)
+                adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
+
+        /* Cache the rptr before reset; after the reset,
+         * all of the registers will be reset to 0
+         */
+        rptr = amdgpu_ring_get_rptr(ring);
+        ring->cached_rptr = rptr;
+        /* Cache the rptr for the page queue if it exists */
+        if (adev->sdma.has_page_queue) {
+                struct amdgpu_ring *page_ring = &adev->sdma.instance[instance_id].page;
+                rptr = amdgpu_ring_get_rptr(page_ring);
+                page_ring->cached_rptr = rptr;
+        }
+
         /* stop queue */
         inst_mask = 1 << ring->me;
         sdma_v4_4_2_inst_gfx_stop(adev, inst_mask);
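The ordering in sdma_v4_4_2_stop_queue() is the point of the last hunk: both the guilty determination and the rptr snapshots must happen before sdma_v4_4_2_inst_gfx_stop(), because the engine reset clears the queue registers to zero, and cached_rptr in the host-memory ring structure is then the only surviving copy of the pre-reset read pointer. A condensed sketch of that constraint (stand-in types and a fake reset, not driver code):

#include <stdint.h>
#include <stdio.h>

struct fake_ring {
        uint64_t hw_rptr;     /* stands in for the regSDMA_*_RB_RPTR register */
        uint64_t cached_rptr; /* host-memory copy that survives the reset */
};

/* A fake engine reset: the real one clears every queue register */
static void fake_engine_reset(struct fake_ring *ring)
{
        ring->hw_rptr = 0;
}

int main(void)
{
        struct fake_ring ring = { .hw_rptr = 0x100 };

        /* Order enforced by the patch: snapshot first, reset second */
        ring.cached_rptr = ring.hw_rptr;
        fake_engine_reset(&ring);

        /* 0x100 is still available to seed RB_RPTR in the resume path */
        printf("hw_rptr = 0x%llx, cached_rptr = 0x%llx\n",
               (unsigned long long)ring.hw_rptr,
               (unsigned long long)ring.cached_rptr);
        return 0;
}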