drm/amdgpu: update athub interrupt harvesting handle

GCEA/MMHUB EA error should not result to DF freeze, this is fixed in next generation, but for some reasons the GCEA/MMHUB EA error will result to DF freeze in previous generation, diver should avoid to indicate GCEA/MMHUB EA error as hw fatal error in kernel message by read GCEA/MMHUB err status registers. Changed from V1: make query_ras_error_status function more general make read mmhub er status register more friendly Changed from V2: move ras error status query function into do_recovery workqueue Changed from V3: remove useless code from V2, print GCEA error status instance number Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Stanley.Yang <Stanley.Yang@amd.com> 2020-09-15 16:15:05 +0800
committer: Alex Deucher <alexander.deucher@amd.com> 2020-09-22 17:37:38 -0400
commit: 3f975d0f71d384825f47c3598d1d5358e40a57f5 (patch)
tree: 1daf6289c4aa95790897b0cf2ba2b450708b06a2 /drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
parent: d117413f5e1be245b7b3e0cd6afda402609b2572 (diff)
1 files changed, 29 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
index 6c6ad529c65c..c2ef8142136e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
@@ -1624,6 +1624,34 @@ static void mmhub_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
 	}
 }
 
+static const struct soc15_reg_entry mmhub_v9_4_err_status_regs[] = {
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA3_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA4_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA5_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA6_ERR_STATUS), 0, 0, 0 },
+	{ SOC15_REG_ENTRY(MMHUB, 0, mmMMEA7_ERR_STATUS), 0, 0, 0 },
+};
+
+static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev)
+{
+	int i;
+	uint32_t reg_value;
+
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) {
+		reg_value =
+			RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i]));
+		if (reg_value)
+			dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n",
+					i, reg_value);
+	}
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
 	.ras_late_init = amdgpu_mmhub_ras_late_init,
 	.query_ras_error_count = mmhub_v9_4_query_ras_error_count,
@@ -1636,4 +1664,5 @@ const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = {
 	.set_clockgating = mmhub_v9_4_set_clockgating,
 	.get_clockgating = mmhub_v9_4_get_clockgating,
 	.setup_vm_pt_regs = mmhub_v9_4_setup_vm_pt_regs,
+	.query_ras_error_status = mmhub_v9_4_query_ras_error_status,
 };
author	Stanley.Yang <Stanley.Yang@amd.com>	2020-09-15 16:15:05 +0800
committer	Alex Deucher <alexander.deucher@amd.com>	2020-09-22 17:37:38 -0400
commit	3f975d0f71d384825f47c3598d1d5358e40a57f5 (patch)
tree	1daf6289c4aa95790897b0cf2ba2b450708b06a2 /drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c
parent	d117413f5e1be245b7b3e0cd6afda402609b2572 (diff)