summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c66
3 files changed, 61 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 9f26da7e7e34..42a7b86e41c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -475,6 +475,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
if (!funcs)
return;
+ /* Ensure the reset_callback_list is initialized */
+ if (!adev->sdma.reset_callback_list.next) {
+ INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+ }
/* Initialize the list node in the callback structure */
INIT_LIST_HEAD(&funcs->list);
@@ -517,7 +521,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
*/
if (!amdgpu_ring_sched_ready(gfx_ring)) {
drm_sched_wqueue_stop(&gfx_ring->sched);
- gfx_sched_stopped = true;;
+ gfx_sched_stopped = true;
}
if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index dd4bf2e103e5..965169320065 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -126,6 +126,9 @@ struct amdgpu_sdma {
uint32_t *ip_dump;
uint32_t supported_reset;
struct list_head reset_callback_list;
+ /* track guilty state of GFX and PAGE queues */
+ bool gfx_guilty;
+ bool page_guilty;
};
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 4ef828df8e56..0f76e2f955eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -671,11 +671,12 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
* @adev: amdgpu_device pointer
* @i: instance to resume
* @restore: used to restore wptr when restart
+ * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
*
* Set up the gfx DMA ring buffers and enable them.
* Returns 0 for success, error for failure.
*/
-static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
+static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
{
struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -683,6 +684,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
u32 doorbell;
u32 doorbell_offset;
u64 wptr_gpu_addr;
+ u64 rwptr;
wb_offset = (ring->rptr_offs * 4);
@@ -708,12 +710,20 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
/* before programing wptr to a less value, need set minor_ptr_update first */
WREG32_SDMA(i, regSDMA_GFX_MINOR_PTR_UPDATE, 1);
+ /* For the guilty queue, set RPTR to the current wptr to skip bad commands,
+ * It is not a guilty queue, restore cache_rptr and continue execution.
+ */
+ if (guilty)
+ rwptr = ring->wptr;
+ else
+ rwptr = ring->cached_rptr;
+
/* Initialize the ring buffer's read and write pointers */
if (restore) {
- WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
- WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
- WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
- WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
+ WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(rwptr << 2));
+ WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(rwptr << 2));
+ WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(rwptr << 2));
+ WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(rwptr << 2));
} else {
WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, 0);
WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, 0);
@@ -768,11 +778,12 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
* @adev: amdgpu_device pointer
* @i: instance to resume
* @restore: boolean to say restore needed or not
+ * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
*
* Set up the page DMA ring buffers and enable them.
* Returns 0 for success, error for failure.
*/
-static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
+static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
{
struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -780,6 +791,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
u32 doorbell;
u32 doorbell_offset;
u64 wptr_gpu_addr;
+ u64 rwptr;
wb_offset = (ring->rptr_offs * 4);
@@ -787,12 +799,20 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
rb_cntl = sdma_v4_4_2_rb_cntl(ring, rb_cntl);
WREG32_SDMA(i, regSDMA_PAGE_RB_CNTL, rb_cntl);
+ /* For the guilty queue, set RPTR to the current wptr to skip bad commands,
+ * It is not a guilty queue, restore cache_rptr and continue execution.
+ */
+ if (guilty)
+ rwptr = ring->wptr;
+ else
+ rwptr = ring->cached_rptr;
+
/* Initialize the ring buffer's read and write pointers */
if (restore) {
- WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
- WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
- WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
- WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
+ WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, lower_32_bits(rwptr << 2));
+ WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, upper_32_bits(rwptr << 2));
+ WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR, lower_32_bits(rwptr << 2));
+ WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR_HI, upper_32_bits(rwptr << 2));
} else {
WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, 0);
WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, 0);
@@ -968,9 +988,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
uint32_t temp;
WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
- sdma_v4_4_2_gfx_resume(adev, i, restore);
+ sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
if (adev->sdma.has_page_queue)
- sdma_v4_4_2_page_resume(adev, i, restore);
+ sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
/* set utc l1 enable flag always to 1 */
temp = RREG32_SDMA(i, regSDMA_CNTL);
@@ -1480,7 +1500,9 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
r = amdgpu_sdma_sysfs_reset_mask_init(adev);
if (r)
return r;
- INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
+ /* Initialize guilty flags for GFX and PAGE queues */
+ adev->sdma.gfx_guilty = false;
+ adev->sdma.page_guilty = false;
return r;
}
@@ -1644,11 +1666,29 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id)
{
u32 inst_mask;
+ uint64_t rptr;
struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
if (amdgpu_sriov_vf(adev))
return -EINVAL;
+ /* Check if this queue is the guilty one */
+ adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
+ if (adev->sdma.has_page_queue)
+ adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
+
+ /* Cache the rptr before reset, after the reset,
+ * all of the registers will be reset to 0
+ */
+ rptr = amdgpu_ring_get_rptr(ring);
+ ring->cached_rptr = rptr;
+ /* Cache the rptr for the page queue if it exists */
+ if (adev->sdma.has_page_queue) {
+ struct amdgpu_ring *page_ring = &adev->sdma.instance[instance_id].page;
+ rptr = amdgpu_ring_get_rptr(page_ring);
+ page_ring->cached_rptr = rptr;
+ }
+
/* stop queue */
inst_mask = 1 << ring->me;
sdma_v4_4_2_inst_gfx_stop(adev, inst_mask);