summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/ras
diff options
context:
space:
mode:
authorYiPeng Chai <YiPeng.Chai@amd.com>2025-10-28 16:18:31 +0800
committerAlex Deucher <alexander.deucher@amd.com>2025-11-04 11:53:59 -0500
commitd95ca7f515cfd2e721de07e86aa79adb17575a52 (patch)
tree726550907388bb73193175e1da23f00e7e8a80ea /drivers/gpu/drm/amd/ras
parentd4432f16d3393180e8f0b344b21839e553f7938b (diff)
drm/amdgpu: suspend ras module before gpu reset
During gpu reset, all GPU-related resources are inaccessible. To avoid affecting ras functionality, suspend ras module before gpu reset and resume it after gpu reset is complete. V2: Rename functions to avoid misunderstanding. V3: Move flush_delayed_work to amdgpu_ras_process_pause, Move schedule_delayed_work to amdgpu_ras_process_unpause. V4: Rename functions. V5: Move the function to amdgpu_ras.c. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Acked-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/ras')
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c22
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h5
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c64
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h4
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c6
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras.h2
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_process.c7
7 files changed, 110 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
index adb01bdee003..afe8135b6258 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
@@ -624,3 +624,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
return ret;
}
+
+int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
+{
+ if (!amdgpu_ras_mgr_is_ready(adev)) {
+ RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
+ return -EPERM;
+ }
+
+ amdgpu_ras_process_pre_reset(adev);
+ return 0;
+}
+
+int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
+{
+ if (!amdgpu_ras_mgr_is_ready(adev)) {
+ RAS_DEV_ERR(adev, "Invalid ras resume!\n");
+ return -EPERM;
+ }
+
+ amdgpu_ras_process_post_reset(adev);
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
index 42f190a8feb9..8fb7eb4b8f13 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
@@ -52,6 +52,9 @@ struct amdgpu_ras_mgr {
struct ras_event_manager ras_event_mgr;
uint64_t last_poison_consumption_seqno;
bool ras_is_ready;
+
+ bool is_paused;
+ struct completion ras_event_done;
};
extern const struct amdgpu_ip_block_version ras_v1_0_ip_block;
@@ -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev);
int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
uint32_t cmd_id, void *input, uint32_t input_size,
void *output, uint32_t out_size);
+int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c
index 6727fc9a2b9b..5782c007de71 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c
@@ -29,6 +29,7 @@
#include "amdgpu_ras_process.h"
#define RAS_MGR_RETIRE_PAGE_INTERVAL 100
+#define RAS_EVENT_PROCESS_TIMEOUT 1200
static void ras_process_retire_page_dwork(struct work_struct *work)
{
@@ -57,6 +58,9 @@ int amdgpu_ras_process_init(struct amdgpu_device *adev)
{
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ ras_mgr->is_paused = false;
+ init_completion(&ras_mgr->ras_event_done);
+
INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork);
return 0;
@@ -66,6 +70,7 @@ int amdgpu_ras_process_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ ras_mgr->is_paused = false;
/* Save all cached bad pages to eeprom */
flush_delayed_work(&ras_mgr->retire_page_dwork);
cancel_delayed_work_sync(&ras_mgr->retire_page_dwork);
@@ -124,3 +129,62 @@ int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false);
}
+
+int amdgpu_ras_process_begin(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (ras_mgr->is_paused)
+ return -EAGAIN;
+
+ reinit_completion(&ras_mgr->ras_event_done);
+ return 0;
+}
+
+int amdgpu_ras_process_end(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ complete(&ras_mgr->ras_event_done);
+ return 0;
+}
+
+int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ long rc;
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ if (!ras_mgr->ras_core->is_initialized)
+ return -EPERM;
+
+ ras_mgr->is_paused = true;
+
+ /* Wait for RAS event processing to complete */
+ rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done,
+ msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT));
+ if (rc <= 0)
+ RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n",
+ rc ? "interrupted" : "timeout");
+
+ flush_delayed_work(&ras_mgr->retire_page_dwork);
+ return 0;
+}
+
+int amdgpu_ras_process_post_reset(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ if (!ras_mgr->ras_core->is_initialized)
+ return -EPERM;
+
+ ras_mgr->is_paused = false;
+
+ schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h
index b9502bd21beb..d55cdaeac441 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h
@@ -34,4 +34,8 @@ int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev,
void *data);
int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
void *data);
+int amdgpu_ras_process_begin(struct amdgpu_device *adev);
+int amdgpu_ras_process_end(struct amdgpu_device *adev);
+int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev);
+int amdgpu_ras_process_post_reset(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
index f21cd55a25be..45ed8c3b5563 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -142,6 +142,12 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
case RAS_EVENT_ID__RESET_GPU:
ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
break;
+ case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN:
+ ret = amdgpu_ras_process_begin(ras_core->dev);
+ break;
+ case RAS_EVENT_ID__RAS_EVENT_PROC_END:
+ ret = amdgpu_ras_process_end(ras_core->dev);
+ break;
default:
RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
break;
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h
index fa224b36e3f2..3396b2e0949d 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -115,6 +115,8 @@ enum ras_notify_event {
RAS_EVENT_ID__FATAL_ERROR_DETECTED,
RAS_EVENT_ID__RESET_GPU,
RAS_EVENT_ID__RESET_VF,
+ RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN,
+ RAS_EVENT_ID__RAS_EVENT_PROC_END,
};
enum ras_gpu_status {
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c b/drivers/gpu/drm/amd/ras/rascore/ras_process.c
index 02f0657f78a3..3267dcdb169c 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_process.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c
@@ -162,6 +162,11 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core)
uint32_t umc_event_count;
int ret;
+ ret = ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
+ if (ret)
+ return ret;
+
ras_aca_clear_fatal_flag(ras_core);
ras_umc_log_pending_bad_bank(ras_core);
@@ -185,6 +190,8 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core)
atomic_set(&ras_proc->umc_interrupt_count, 0);
}
+ ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
return ret;
}