From b75efe88b20c2be28b67e2821a794cc183e32374 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 4 May 2023 17:09:39 +0800 Subject: drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation An intentional delay is added on soft ctf triggered. Then there will be a double check for the GPU temperature before taking further action. This can avoid unintended shutdown due to temperature momentary fluctuation. Signed-off-by: Evan Quan Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c | 48 ++++++++++++++++++++++ .../gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c | 27 ++++-------- drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h | 2 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +++++++++++++++ drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 + drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 9 +--- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 9 +--- 7 files changed, 99 insertions(+), 32 deletions(-) (limited to 'drivers/gpu/drm/amd/pm') diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c index 11b7b4cffaae..ff360c699171 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "amd_shared.h" #include "amd_powerplay.h" #include "power_state.h" @@ -91,6 +92,45 @@ static int pp_early_init(void *handle) return 0; } +static void pp_swctf_delayed_work_handler(struct work_struct *work) +{ + struct pp_hwmgr *hwmgr = + container_of(work, struct pp_hwmgr, swctf_delayed_work.work); + struct amdgpu_device *adev = hwmgr->adev; + struct amdgpu_dpm_thermal *range = + &adev->pm.dpm.thermal; + uint32_t gpu_temperature, size; + int ret; + + /* + * If the hotspot/edge temperature is confirmed as below SW CTF setting point + * after the delay enforced, nothing will be done. + * Otherwise, a graceful shutdown will be performed to prevent further damage. + */ + if (range->sw_ctf_threshold && + hwmgr->hwmgr_func->read_sensor) { + ret = hwmgr->hwmgr_func->read_sensor(hwmgr, + AMDGPU_PP_SENSOR_HOTSPOT_TEMP, + &gpu_temperature, + &size); + /* + * For some legacy ASICs, hotspot temperature retrieving might be not + * supported. Check the edge temperature instead then. + */ + if (ret == -EOPNOTSUPP) + ret = hwmgr->hwmgr_func->read_sensor(hwmgr, + AMDGPU_PP_SENSOR_EDGE_TEMP, + &gpu_temperature, + &size); + if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold) + return; + } + + dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); + orderly_poweroff(true); +} + static int pp_sw_init(void *handle) { struct amdgpu_device *adev = handle; @@ -101,6 +141,10 @@ static int pp_sw_init(void *handle) pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully"); + if (!ret) + INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work, + pp_swctf_delayed_work_handler); + return ret; } @@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle) struct amdgpu_device *adev = handle; struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; + cancel_delayed_work_sync(&hwmgr->swctf_delayed_work); + hwmgr_hw_fini(hwmgr); return 0; @@ -221,6 +267,8 @@ static int pp_suspend(void *handle) struct amdgpu_device *adev = handle; struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; + cancel_delayed_work_sync(&hwmgr->swctf_delayed_work); + return hwmgr_suspend(hwmgr); } diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c index bfe80ac0ad8c..d0b1ab6c4523 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c @@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { + struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; uint32_t client_id = entry->client_id; uint32_t src_id = entry->src_id; if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) { if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) { - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); - } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) + schedule_delayed_work(&hwmgr->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); + } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) { dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); - else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { + } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n"); /* * HW CTF just occurred. Shutdown to prevent further damage. @@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev, orderly_poweroff(true); } } else if (client_id == SOC15_IH_CLIENTID_THM) { - if (src_id == 0) { - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); - } else + if (src_id == 0) + schedule_delayed_work(&hwmgr->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); + else dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) { dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n"); diff --git a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h index f1580a26a850..612d66aeaab9 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h +++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h @@ -811,6 +811,8 @@ struct pp_hwmgr { bool gfxoff_state_changed_by_workload; uint32_t pstate_sclk_peak; uint32_t pstate_mclk_peak; + + struct delayed_work swctf_delayed_work; }; int hwmgr_early_init(struct pp_hwmgr *hwmgr); diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 4dea79a0c5b5..ce41a8309582 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -24,6 +24,7 @@ #include #include +#include #include "amdgpu.h" #include "amdgpu_smu.h" @@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work) smu->ppt_funcs->interrupt_work(smu); } +static void smu_swctf_delayed_work_handler(struct work_struct *work) +{ + struct smu_context *smu = + container_of(work, struct smu_context, swctf_delayed_work.work); + struct smu_temperature_range *range = + &smu->thermal_range; + struct amdgpu_device *adev = smu->adev; + uint32_t hotspot_tmp, size; + + /* + * If the hotspot temperature is confirmed as below SW CTF setting point + * after the delay enforced, nothing will be done. + * Otherwise, a graceful shutdown will be performed to prevent further damage. + */ + if (range->software_shutdown_temp && + smu->ppt_funcs->read_sensor && + !smu->ppt_funcs->read_sensor(smu, + AMDGPU_PP_SENSOR_HOTSPOT_TEMP, + &hotspot_tmp, + &size) && + hotspot_tmp / 1000 < range->software_shutdown_temp) + return; + + dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); + orderly_poweroff(true); +} + static int smu_sw_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle) smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO; smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO; + INIT_DELAYED_WORK(&smu->swctf_delayed_work, + smu_swctf_delayed_work_handler); + ret = smu_smc_table_sw_init(smu); if (ret) { dev_err(adev->dev, "Failed to sw init smc table!\n"); @@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) return ret; } + cancel_delayed_work_sync(&smu->swctf_delayed_work); + ret = smu_disable_dpms(smu); if (ret) { dev_err(adev->dev, "Fail to disable dpm features!\n"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index 09469c750a96..6e2069dcb6b9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -573,6 +573,8 @@ struct smu_context u32 debug_param_reg; u32 debug_msg_reg; u32 debug_resp_reg; + + struct delayed_work swctf_delayed_work; }; struct i2c_adapter; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index e1ef88ee1ed3..aa4a5498a12f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, if (client_id == SOC15_IH_CLIENTID_THM) { switch (src_id) { case THM_11_0__SRCID__THM_DIG_THERM_L2H: - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); + schedule_delayed_work(&smu->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); break; case THM_11_0__SRCID__THM_DIG_THERM_H2L: dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index e52c563f0dac..3856da6c3f3d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1353,13 +1353,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev, if (client_id == SOC15_IH_CLIENTID_THM) { switch (src_id) { case THM_11_0__SRCID__THM_DIG_THERM_L2H: - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); + schedule_delayed_work(&smu->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); break; case THM_11_0__SRCID__THM_DIG_THERM_H2L: dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); -- cgit