summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2023-10-19 16:01:07 +0800
committerAlex Deucher <alexander.deucher@amd.com>2023-11-07 12:03:31 -0500
commit20238a2cc9a6a926f9f47ae4ae9edd1bc98f278c (patch)
treececfad7fbf79156b9e116a76ccf2995da6f9e566 /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
parent61fe5536d06cf485d387c894d2083de883c81ad7 (diff)
drm/amdgpu: add RAS reset/query operations for XGMI v6_4
Reset/query RAS error status and count. v2: use XGMI IP version instead of WAFL version. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c46
1 files changed, 43 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 9d5d742ee9d3..713e17cca071 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -103,6 +103,16 @@ static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
};
+static const int xgmi3x16_pcs_err_status_reg_v6_4[] = {
+ smnPCS_XGMI3X16_PCS_ERROR_STATUS,
+ smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000
+};
+
+static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
+};
+
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
{"XGMI PCS DataLossErr",
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -952,6 +962,16 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
default:
break;
}
+
+ switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+ case IP_VERSION(6, 4, 0):
+ for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
+ pcs_clear_status(adev,
+ xgmi3x16_pcs_err_status_reg_v6_4[i]);
+ break;
+ default:
+ break;
+ }
}
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
@@ -969,7 +989,9 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
if (is_xgmi_pcs) {
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
- IP_VERSION(6, 1, 0)) {
+ IP_VERSION(6, 1, 0) ||
+ amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
+ IP_VERSION(6, 4, 0)) {
pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
} else {
@@ -1007,7 +1029,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- int i;
+ int i, supported = 1;
uint32_t data, mask_data = 0;
uint32_t ue_cnt = 0, ce_cnt = 0;
@@ -1071,7 +1093,25 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
}
break;
default:
- dev_warn(adev->dev, "XGMI RAS error query not supported");
+ supported = 0;
+ break;
+ }
+
+ switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+ case IP_VERSION(6, 4, 0):
+ /* check xgmi3x16 pcs error */
+ for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) {
+ data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
+ mask_data =
+ RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]);
+ if (data)
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, true, true);
+ }
+ break;
+ default:
+ if (!supported)
+ dev_warn(adev->dev, "XGMI RAS error query not supported");
break;
}