summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorganglxie <ganglxie@amd.com>2025-06-26 17:00:45 +0800
committerAlex Deucher <alexander.deucher@amd.com>2025-06-30 12:08:00 -0400
commitcfce8f4fa7686657e119392e03964f401efea765 (patch)
tree4f220c6bd4cd7112584c4405050bee89f0c5c1ac
parent0b7f13551e4c87e51a330ce296222780b9512293 (diff)
drm/amdgpu: refine ras error injection when eeprom initialization failed
when eeprom initialization failed, we still support ras error injection, and reserve bad pages, but do not save bad pages to eeprom Signed-off-by: ganglxie <ganglxie@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c22
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h2
2 files changed, 18 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 69b8733a2b2d..662cddc35fd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3006,6 +3006,15 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
return 0;
}
+ if (!con->eeprom_control.is_eeprom_valid) {
+ dev_warn(adev->dev,
+ "Failed to save EEPROM table data because of EEPROM data corruption!");
+ if (new_cnt)
+ *new_cnt = 0;
+
+ return 0;
+ }
+
mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
@@ -3491,8 +3500,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
control = &con->eeprom_control;
ret = amdgpu_ras_eeprom_init(control);
- if (ret)
- return ret;
+ control->is_eeprom_valid = !ret;
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
control->ras_num_pa_recs = control->ras_num_recs;
@@ -3501,10 +3509,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev->umc.ras->get_retire_flip_bits)
adev->umc.ras->get_retire_flip_bits(adev);
- if (control->ras_num_recs) {
+ if (control->ras_num_recs && control->is_eeprom_valid) {
ret = amdgpu_ras_load_bad_pages(adev);
- if (ret)
- return ret;
+ if (ret) {
+ control->is_eeprom_valid = false;
+ return 0;
+ }
amdgpu_dpm_send_hbm_bad_pages_num(
adev, control->ras_num_bad_pages);
@@ -3523,7 +3533,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
- return ret;
+ return 0;
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index ec6d7ea37ad0..35c69ac3dbeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control {
/* Record channel info which occurred bad pages
*/
u32 bad_channel_bitmap;
+
+ bool is_eeprom_valid;
};
/*