diff options
author | Guchun Chen <guchun.chen@amd.com> | 2020-07-23 15:42:19 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2020-08-04 17:26:31 -0400 |
commit | b82e65a93510465cb4c203c938245f137a4e95dc (patch) | |
tree | 82187ca76db2ab385bf2369fe162b4c3f020be59 /drivers/gpu/drm | |
parent | 1d6a9d122d2c7817ad1bae0e59c8a29450c2b14d (diff) |
drm/amdgpu: break driver init process when it's bad GPU(v5)
When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.
v2: Fix spelling typo, correct the condition to detect
bad gpu tag and refine error message.
v3: Refine function argument name.
v4: Fix missing check of returning value of i2c
initialization error case.
v5: Use dev_err to print PCI information in dmesg instead
of DRM_ERROR.
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 |
4 files changed, 36 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 40caa7437ce2..905c5ab486a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2055,13 +2055,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. - * recovery_init may fail, but it can free all resources allocated by - * itself and its failure should not stop amdgpu init process. + * + * amdgpu_ras_recovery_init may fail, but the upper only cares the + * failure from bad gpu situation and stop amdgpu init process + * accordingly. For other failed cases, it will still release all + * the resource and print error message, rather than returning one + * negative value to upper level. * * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6660094a1063..1a1652ea76b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; uint32_t max_eeprom_records_len = 0; + bool exc_err_limit = false; int ret; if (con) @@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); - ret = amdgpu_ras_eeprom_init(&con->eeprom_control); - if (ret) + ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); + /* + * This calling fails when exc_err_limit is true or + * ret != 0. + */ + if (exc_err_limit || ret) goto free; if (con->eeprom_control.num_recs) { @@ -1867,6 +1872,15 @@ free: out: dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + /* + * Except error threshold exceeding case, other failure cases in this + * function would not fail amdgpu driver init. + */ + if (!exc_err_limit) + ret = 0; + else + ret = -EINVAL; + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 35c0c849d49b..da8b35a5b41c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) { int ret = 0; struct amdgpu_device *adev = to_amdgpu_device(control); @@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) .buf = buff, }; + *exceed_err_limit = false; + /* Verify i2c adapter is initialized */ if (!adev->pm.smu_i2c.algo) return -ENOENT; @@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", control->num_recs); + } else if ((hdr->header == EEPROM_TABLE_HDR_BAD) && + (amdgpu_bad_page_threshold != 0)) { + *exceed_err_limit = true; + dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, " + "disabling the GPU.\n"); } else { DRM_INFO("Creating new EEPROM table"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index e285e8cafd48..9839b4ea5e18 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -76,7 +76,8 @@ struct eeprom_table_record { unsigned char mcumc_id; }__attribute__((__packed__)); -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control); +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, |