summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c166
1 files changed, 129 insertions, 37 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 4b9e7b050ccd..4340d08f7607 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -29,13 +29,16 @@
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
+#include "xgmi/xgmi_6_1_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"
#include "amdgpu_reset.h"
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
+#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
+#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218
static DEFINE_MUTEX(xgmi_mutex);
@@ -79,11 +82,27 @@ static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};
+static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000,
+ smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000
+};
+
static const int walf_pcs_err_status_reg_aldebaran[] = {
smnPCS_GOPX1_PCS_ERROR_STATUS,
smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};
+static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
+ smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK,
+ smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
+};
+
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
{"XGMI PCS DataLossErr",
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -162,6 +181,67 @@ static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
+static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
+ {"XGMI3X16 PCS DataLossErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)},
+ {"XGMI3X16 PCS TrainingErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)},
+ {"XGMI3X16 PCS FlowCtrlAckErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)},
+ {"XGMI3X16 PCS RxFifoUnderflowErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)},
+ {"XGMI3X16 PCS RxFifoOverflowErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)},
+ {"XGMI3X16 PCS CRCErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)},
+ {"XGMI3X16 PCS BERExceededErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)},
+ {"XGMI3X16 PCS TxVcidDataErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)},
+ {"XGMI3X16 PCS ReplayBufParityErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)},
+ {"XGMI3X16 PCS DataParityErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)},
+ {"XGMI3X16 PCS ReplayFifoOverflowErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
+ {"XGMI3X16 PCS ReplayFifoUnderflowErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
+ {"XGMI3X16 PCS ElasticFifoOverflowErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
+ {"XGMI3X16 PCS DeskewErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)},
+ {"XGMI3X16 PCS FlowCtrlCRCErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)},
+ {"XGMI3X16 PCS DataStartupLimitErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)},
+ {"XGMI3X16 PCS FCInitTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
+ {"XGMI3X16 PCS RecoveryTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
+ {"XGMI3X16 PCS ReadySerialTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
+ {"XGMI3X16 PCS ReadySerialAttemptErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
+ {"XGMI3X16 PCS RecoveryAttemptErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
+ {"XGMI3X16 PCS RecoveryRelockAttemptErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
+ {"XGMI3X16 PCS ReplayAttemptErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)},
+ {"XGMI3X16 PCS SyncHdrErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)},
+ {"XGMI3X16 PCS TxReplayTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)},
+ {"XGMI3X16 PCS RxReplayTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)},
+ {"XGMI3X16 PCS LinkSubTxTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)},
+ {"XGMI3X16 PCS LinkSubRxTimeoutErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)},
+ {"XGMI3X16 PCS RxCMDPktErr",
+ SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
+};
+
/**
* DOC: AMDGPU XGMI Support
*
@@ -809,39 +889,47 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
uint32_t value,
+ uint32_t mask_value,
uint32_t *ue_count,
uint32_t *ce_count,
- bool is_xgmi_pcs)
+ bool is_xgmi_pcs,
+ bool check_mask)
{
int i;
- int ue_cnt;
+ int ue_cnt = 0;
+ const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL;
+ uint32_t field_array_size = 0;
if (is_xgmi_pcs) {
- /* query xgmi pcs error status,
- * only ue is supported */
- for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
- ue_cnt = (value &
- xgmi_pcs_ras_fields[i].pcs_err_mask) >>
- xgmi_pcs_ras_fields[i].pcs_err_shift;
- if (ue_cnt) {
- dev_info(adev->dev, "%s detected\n",
- xgmi_pcs_ras_fields[i].err_name);
- *ue_count += ue_cnt;
- }
+ if (adev->ip_versions[XGMI_HWIP][0] == IP_VERSION(6, 1, 0)) {
+ pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
+ field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
+ } else {
+ pcs_ras_fields = &xgmi_pcs_ras_fields[0];
+ field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields);
}
} else {
- /* query wafl pcs error status,
- * only ue is supported */
- for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
- ue_cnt = (value &
- wafl_pcs_ras_fields[i].pcs_err_mask) >>
- wafl_pcs_ras_fields[i].pcs_err_shift;
- if (ue_cnt) {
- dev_info(adev->dev, "%s detected\n",
- wafl_pcs_ras_fields[i].err_name);
- *ue_count += ue_cnt;
- }
+ pcs_ras_fields = &wafl_pcs_ras_fields[0];
+ field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields);
+ }
+
+ if (check_mask)
+ value = value & ~mask_value;
+
+ /* query xgmi/walf pcs error status,
+ * only ue is supported */
+ for (i = 0; value && i < field_array_size; i++) {
+ ue_cnt = (value &
+ pcs_ras_fields[i].pcs_err_mask) >>
+ pcs_ras_fields[i].pcs_err_shift;
+ if (ue_cnt) {
+ dev_info(adev->dev, "%s detected\n",
+ pcs_ras_fields[i].err_name);
+ *ue_count += ue_cnt;
}
+
+ /* reset bit value if the bit is checked */
+ value &= ~(pcs_ras_fields[i].pcs_err_mask);
}
return 0;
@@ -852,7 +940,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
int i;
- uint32_t data;
+ uint32_t data, mask_data = 0;
uint32_t ue_cnt = 0, ce_cnt = 0;
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
@@ -867,15 +955,15 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, true);
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, true, false);
}
/* check wafl pcs error */
for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, false);
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, false, false);
}
break;
case CHIP_VEGA20:
@@ -883,31 +971,35 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, true);
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, true, false);
}
/* check wafl pcs error */
for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, false);
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, false, false);
}
break;
case CHIP_ALDEBARAN:
/* check xgmi3x16 pcs error */
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
+ mask_data =
+ RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, true);
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, true, true);
}
/* check wafl pcs error */
for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
+ mask_data =
+ RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]);
if (data)
- amdgpu_xgmi_query_pcs_error_status(adev,
- data, &ue_cnt, &ce_cnt, false);
+ amdgpu_xgmi_query_pcs_error_status(adev, data,
+ mask_data, &ue_cnt, &ce_cnt, false, true);
}
break;
default: