author		Jesse.zhang@amd.com <Jesse.zhang@amd.com>	2025-02-20 14:25:47 +0800
committer	Alex Deucher <alexander.deucher@amd.com>	2025-02-25 11:43:59 -0500
commit		4c02f730165765ad412a1ce8de6ea0d7abc7a333 (patch)
tree		1b60b37aaf99745b2ec2d112c9767d33212b6101 /drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
parent		0ca57515606d3a8462abe8dfa83f23c39e5e69a9 (diff)
drm/amdgpu: Introduce conditional user queue suspension for SDMA resets
- Modify the `amdgpu_sdma_reset_engine` function to accept a `suspend_user_queues` parameter.
- This parameter allows the function to conditionally suspend and resume user queues during SDMA resets.
- Ensure that user queues are suspended only when necessary, to avoid unnecessary overhead and potential deadlocks.
- Restart the scheduler's work queue for the GFX and page rings after the reset so that new tasks can be submitted.

This change improves synchronization between the KGD and the KFD during SDMA resets, ensuring proper handling of user queues and avoiding race conditions.

V2: Replace the ring_lock with the existing scheduler locks for the queues (ring->sched) on the SDMA engine. (Alex)

v3: Call drm_sched_wqueue_stop() rather than taking job_list_lock. If a GPU ring reset was already initiated for one ring at amdgpu_job_timedout, skip resetting that ring and call drm_sched_wqueue_stop() for the other rings. (Alex)
    Replace the common lock (sdma_reset_lock) with the DQM lock to resolve reset races between the two driver sections during KFD eviction. (Jon)
    Rename the caller to reset_src and change AMDGPU_RESET_SRC_SDMA_KGD/KFD to AMDGPU_RESET_SRC_SDMA_HWS/RING. (Jon)

v4: Restart the wqueue if the reset was successful, or fall back to a full adapter reset. (Alex)
    Move the definition of the reset source to the AMDGPU_RESET_SRCS enumeration, and check the reset src in amdgpu_sdma_reset_instance. (Jon)

v5: Call amdgpu_amdkfd_suspend/resume at the start/end of the reset function respectively, under !SRC_HWS conditions only. (Jon)

v6: Replace the parameter src with a bool suspend_user_queues; remove the parameter src in the pre/post functions. (Jon)

Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Suggested-by: Jiadong Zhu <Jiadong.Zhu@amd.com>
Suggested-by: Jonathan Kim <Jonathan.Kim@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
Acked-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
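For orientation, a minimal sketch of how the two reset sources described above might invoke the updated entry point. The wrapper names below are hypothetical; only amdgpu_sdma_reset_engine() and its new parameter come from this patch.

/* Hypothetical KGD/ring path (e.g. a per-ring reset after a job timeout):
 * user queues are still live, so ask the helper to suspend KFD around
 * the reset.
 */
static int example_ring_sdma_reset(struct amdgpu_device *adev, uint32_t instance_id)
{
	return amdgpu_sdma_reset_engine(adev, instance_id, true);
}

/* Hypothetical KFD/HWS path: per the v5 note, the caller has already
 * suspended the user queues under the DQM lock, so skip the suspend.
 */
static int example_hws_sdma_reset(struct amdgpu_device *adev, uint32_t instance_id)
{
	return amdgpu_sdma_reset_engine(adev, instance_id, false);
}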
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 56
1 file changed, 50 insertions(+), 6 deletions(-)
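The heart of the patch is a stop/record/restart bracket around the hardware reset. The condensed single-ring sketch below isolates that pattern under stated assumptions: do_hw_reset_stub() is a stand-in for amdgpu_dpm_reset_sdma() plus the registered pre/post reset callbacks, and the error unwinding is collapsed into straight-line code.

/* Stand-in for the reset body in the real function. */
static int do_hw_reset_stub(struct amdgpu_device *adev)
{
	return 0;
}

static int sdma_reset_bracket_sketch(struct amdgpu_device *adev,
				     struct amdgpu_ring *ring,
				     bool suspend_user_queues)
{
	bool stopped = false;
	int ret;

	if (suspend_user_queues)		/* evict KFD user queues first */
		amdgpu_amdkfd_suspend(adev, false);

	if (amdgpu_ring_sched_ready(ring)) {	/* stop only a running wqueue */
		drm_sched_wqueue_stop(&ring->sched);
		stopped = true;			/* remember that we stopped it */
	}

	ret = do_hw_reset_stub(adev);

	/* Restart only what this function stopped, and only on success;
	 * a failed reset falls back to a full adapter reset instead (v4).
	 */
	if (!ret && stopped && amdgpu_ring_sched_ready(ring))
		drm_sched_wqueue_start(&ring->sched);

	if (suspend_user_queues)
		amdgpu_amdkfd_resume(adev, false);

	return ret;
}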
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index fe39198307ec..9f26da7e7e34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -25,6 +25,7 @@
#include "amdgpu.h"
#include "amdgpu_sdma.h"
#include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
#define AMDGPU_CSA_SDMA_SIZE 64
/* SDMA CSA reside in the 3rd page of CSA */
@@ -485,6 +486,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
* amdgpu_sdma_reset_engine - Reset a specific SDMA engine
* @adev: Pointer to the AMDGPU device
* @instance_id: ID of the SDMA engine instance to reset
+ * @suspend_user_queues: whether to suspend and resume user queues during the reset.
*
* This function performs the following steps:
* 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
@@ -493,10 +495,35 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
*
* Returns: 0 on success, or a negative error code on failure.
*/
-int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
+int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues)
{
struct sdma_on_reset_funcs *funcs;
- int ret;
+ int ret = 0;
+ struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
+ struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
+ struct amdgpu_ring *page_ring = &sdma_instance->page;
+ bool gfx_sched_stopped = false, page_sched_stopped = false;
+
+ /* Suspend KFD if suspend_user_queues is true.
+ * This prevents the destruction of in-flight healthy user queue packets and
+ * avoids race conditions between KFD and KGD during the reset process.
+ */
+ if (suspend_user_queues)
+ amdgpu_amdkfd_suspend(adev, false);
+
+ /* Stop the scheduler's work queue for the GFX and page rings if they are running.
+ * This ensures that no new tasks are submitted to the queues while
+ * the reset is in progress.
+ */
+ if (amdgpu_ring_sched_ready(gfx_ring)) {
+ drm_sched_wqueue_stop(&gfx_ring->sched);
+ gfx_sched_stopped = true;
+ }
+
+ if (adev->sdma.has_page_queue && amdgpu_ring_sched_ready(page_ring)) {
+ drm_sched_wqueue_stop(&page_ring->sched);
+ page_sched_stopped = true;
+ }
/* Invoke all registered pre_reset callbacks */
list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
@@ -506,7 +533,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
dev_err(adev->dev,
"beforeReset callback failed for instance %u: %d\n",
instance_id, ret);
- return ret;
+ goto exit;
}
}
}
@@ -515,7 +542,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
if (ret) {
dev_err(adev->dev, "Failed to reset SDMA instance %u\n", instance_id);
- return ret;
+ goto exit;
}
/* Invoke all registered post_reset callbacks */
@@ -526,10 +553,27 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
dev_err(adev->dev,
"afterReset callback failed for instance %u: %d\n",
instance_id, ret);
- return ret;
+ goto exit;
}
}
}
- return 0;
+exit:
+ /* Restart the scheduler's work queue for the GFX and page rings
+ * if they were stopped by this function. This allows new tasks
+ * to be submitted to the queues after the reset is complete.
+ */
+ if (!ret) {
+ if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
+ drm_sched_wqueue_start(&gfx_ring->sched);
+ }
+ if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
+ drm_sched_wqueue_start(&page_ring->sched);
+ }
+ }
+
+ if (suspend_user_queues)
+ amdgpu_amdkfd_resume(adev, false);
+
+ return ret;
}
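For completeness, a sketch of how a client such as KFD might hook into the pre/post reset callback list this function walks. The pre_reset/post_reset field names follow the kernel-doc above; the exact struct layout and the example bodies are assumptions, not the verbatim header.

static int example_pre_reset(struct amdgpu_device *adev, uint32_t instance_id)
{
	/* save per-queue state before the engine is reset */
	return 0;
}

static int example_post_reset(struct amdgpu_device *adev, uint32_t instance_id)
{
	/* restore state and resume the queues afterwards */
	return 0;
}

static struct sdma_on_reset_funcs example_reset_funcs = {
	.pre_reset  = example_pre_reset,
	.post_reset = example_post_reset,
};

/* Registered once at init time:
 *	amdgpu_sdma_register_on_reset_callbacks(adev, &example_reset_funcs);
 */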