drm/amdgpu: update athub interrupt harvesting handle

GCEA/MMHUB EA error should not result to DF freeze, this is fixed in next generation, but for some reasons the GCEA/MMHUB EA error will result to DF freeze in previous generation, diver should avoid to indicate GCEA/MMHUB EA error as hw fatal error in kernel message by read GCEA/MMHUB err status registers. Changed from V1: make query_ras_error_status function more general make read mmhub er status register more friendly Changed from V2: move ras error status query function into do_recovery workqueue Changed from V3: remove useless code from V2, print GCEA error status instance number Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Stanley.Yang <Stanley.Yang@amd.com> 2020-09-15 16:15:05 +0800
committer: Alex Deucher <alexander.deucher@amd.com> 2020-09-22 17:37:38 -0400
commit: 3f975d0f71d384825f47c3598d1d5358e40a57f5 (patch)
tree: 1daf6289c4aa95790897b0cf2ba2b450708b06a2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent: d117413f5e1be245b7b3e0cd6afda402609b2572 (diff)
1 files changed, 42 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e5ea14774c0c..40614ac9a111 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1498,6 +1498,45 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 	}
 }
 
+/* Parse RdRspStatus and WrRspStatus */
+void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+		struct ras_query_if *info)
+{
+	/*
+	 * Only two block need to query read/write
+	 * RspStatus at current state
+	 */
+	switch (info->head.block) {
+	case AMDGPU_RAS_BLOCK__GFX:
+		if (adev->gfx.funcs->query_ras_error_status)
+			adev->gfx.funcs->query_ras_error_status(adev);
+		break;
+	case AMDGPU_RAS_BLOCK__MMHUB:
+		if (adev->mmhub.funcs->query_ras_error_status)
+			adev->mmhub.funcs->query_ras_error_status(adev);
+		break;
+	default:
+		break;
+	}
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_manager *obj;
+
+	if (!con)
+		return;
+
+	list_for_each_entry(obj, &con->head, node) {
+		struct ras_query_if info = {
+			.head = obj->head,
+		};
+
+		amdgpu_ras_error_status_query(adev, &info);
+	}
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1568,8 +1607,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		}
 
 		list_for_each_entry(remote_adev,
-				device_list_handle, gmc.xgmi.head)
+				device_list_handle, gmc.xgmi.head) {
+			amdgpu_ras_query_err_status(remote_adev);
 			amdgpu_ras_log_on_err_counter(remote_adev);
+		}
 
 		amdgpu_put_xgmi_hive(hive);
 	}
author	Stanley.Yang <Stanley.Yang@amd.com>	2020-09-15 16:15:05 +0800
committer	Alex Deucher <alexander.deucher@amd.com>	2020-09-22 17:37:38 -0400
commit	3f975d0f71d384825f47c3598d1d5358e40a57f5 (patch)
tree	1daf6289c4aa95790897b0cf2ba2b450708b06a2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent	d117413f5e1be245b7b3e0cd6afda402609b2572 (diff)