summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
diff options
context:
space:
mode:
authorEvan Quan <evan.quan@amd.com>2023-05-04 17:09:39 +0800
committerAlex Deucher <alexander.deucher@amd.com>2023-06-30 13:12:16 -0400
commitb75efe88b20c2be28b67e2821a794cc183e32374 (patch)
tree4e52b79d8f0c9ac5b01f55d8b65b3fb6620e8978 /drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
parent064329c595da56eff6d7a7e7760660c726433139 (diff)
drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation
An intentional delay is added on soft ctf triggered. Then there will be a double check for the GPU temperature before taking further action. This can avoid unintended shutdown due to temperature momentary fluctuation. Signed-off-by: Evan Quan <evan.quan@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c')
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c34
1 files changed, 34 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 4dea79a0c5b5..ce41a8309582 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
#include <linux/firmware.h>
#include <linux/pci.h>
+#include <linux/reboot.h>
#include "amdgpu.h"
#include "amdgpu_smu.h"
@@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
smu->ppt_funcs->interrupt_work(smu);
}
+static void smu_swctf_delayed_work_handler(struct work_struct *work)
+{
+ struct smu_context *smu =
+ container_of(work, struct smu_context, swctf_delayed_work.work);
+ struct smu_temperature_range *range =
+ &smu->thermal_range;
+ struct amdgpu_device *adev = smu->adev;
+ uint32_t hotspot_tmp, size;
+
+ /*
+ * If the hotspot temperature is confirmed as below SW CTF setting point
+ * after the delay enforced, nothing will be done.
+ * Otherwise, a graceful shutdown will be performed to prevent further damage.
+ */
+ if (range->software_shutdown_temp &&
+ smu->ppt_funcs->read_sensor &&
+ !smu->ppt_funcs->read_sensor(smu,
+ AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+ &hotspot_tmp,
+ &size) &&
+ hotspot_tmp / 1000 < range->software_shutdown_temp)
+ return;
+
+ dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+ dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+ orderly_poweroff(true);
+}
+
static int smu_sw_init(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle)
smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
+ INIT_DELAYED_WORK(&smu->swctf_delayed_work,
+ smu_swctf_delayed_work_handler);
+
ret = smu_smc_table_sw_init(smu);
if (ret) {
dev_err(adev->dev, "Failed to sw init smc table!\n");
@@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
return ret;
}
+ cancel_delayed_work_sync(&smu->swctf_delayed_work);
+
ret = smu_disable_dpms(smu);
if (ret) {
dev_err(adev->dev, "Fail to disable dpm features!\n");