diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/umc_v8_10.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 311 |
1 files changed, 203 insertions, 108 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c index da394bc06bba..a32f87992f20 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c @@ -76,10 +76,13 @@ static inline uint32_t get_umc_v8_10_reg_offset(struct amdgpu_device *adev, UMC_8_NODE_DIST * node_inst; } -static void umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, - uint32_t umc_reg_offset) +static int umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) { uint32_t ecc_err_cnt_addr; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); ecc_err_cnt_addr = SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt); @@ -87,24 +90,14 @@ static void umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, /* clear error count */ WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_10_CE_CNT_INIT); + + return 0; } static void umc_v8_10_clear_error_count(struct amdgpu_device *adev) { - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_clear_error_count_per_channel(adev, - umc_reg_offset); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_clear_error_count_per_channel, NULL); } static void umc_v8_10_query_correctable_error_count(struct amdgpu_device *adev, @@ -147,29 +140,29 @@ static void umc_v8_10_query_uncorrectable_error_count(struct amdgpu_device *adev *error_count += 1; } +static int umc_v8_10_query_ecc_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) +{ + struct ras_err_data *err_data = (struct ras_err_data *)data; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); + + umc_v8_10_query_correctable_error_count(adev, + umc_reg_offset, + &(err_data->ce_count)); + umc_v8_10_query_uncorrectable_error_count(adev, + umc_reg_offset, + &(err_data->ue_count)); + + return 0; +} + static void umc_v8_10_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_query_correctable_error_count(adev, - umc_reg_offset, - &(err_data->ce_count)); - umc_v8_10_query_uncorrectable_error_count(adev, - umc_reg_offset, - &(err_data->ue_count)); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_query_ecc_error_count, ras_error_status); umc_v8_10_clear_error_count(adev); } @@ -209,39 +202,69 @@ static int umc_v8_10_swizzle_mode_na_to_pa(struct amdgpu_device *adev, return 0; } -static void umc_v8_10_query_error_address(struct amdgpu_device *adev, - struct ras_err_data *err_data, - uint32_t umc_reg_offset, - uint32_t node_inst, - uint32_t ch_inst, - uint32_t umc_inst) +static void umc_v8_10_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst, + uint32_t node_inst, uint64_t mc_umc_status) { - uint64_t mc_umc_status_addr; - uint64_t mc_umc_status, err_addr; - uint64_t mc_umc_addrt0, na_err_addr_base; + uint64_t na_err_addr_base; uint64_t na_err_addr, retired_page_addr; uint32_t channel_index, addr_lsb, col = 0; int ret = 0; + channel_index = + adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst]; + + /* the lowest lsb bits should be ignored */ + addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb); + err_addr &= ~((0x1ULL << addr_lsb) - 1); + na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT); + + /* loop for all possibilities of [C6 C5] in normal address. */ + for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) { + na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT); + + /* Mapping normal error address to retired soc physical address. */ + ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index, + na_err_addr, &retired_page_addr); + if (ret) { + dev_err(adev->dev, "Failed to map pa from umc na.\n"); + break; + } + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", + retired_page_addr); + amdgpu_umc_fill_error_record(err_data, na_err_addr, + retired_page_addr, channel_index, umc_inst); + } +} + +static int umc_v8_10_query_error_address(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) +{ + uint64_t mc_umc_status_addr; + uint64_t mc_umc_status, err_addr; + uint64_t mc_umc_addrt0; + struct ras_err_data *err_data = (struct ras_err_data *)data; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); + mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4); if (mc_umc_status == 0) - return; + return 0; if (!err_data->err_addr) { /* clear umc status */ WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); - return; + return 0; } - channel_index = - adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num * - adev->umc.channel_inst_num + - umc_inst * adev->umc.channel_inst_num + - ch_inst]; - /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && @@ -251,62 +274,31 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - /* the lowest lsb bits should be ignored */ - addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb); - err_addr &= ~((0x1ULL << addr_lsb) - 1); - na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT); - - /* loop for all possibilities of [C6 C5] in normal address. */ - for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) { - na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT); - - /* Mapping normal error address to retired soc physical address. */ - ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index, - na_err_addr, &retired_page_addr); - if (ret) { - dev_err(adev->dev, "Failed to map pa from umc na.\n"); - break; - } - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", - retired_page_addr); - amdgpu_umc_fill_error_record(err_data, na_err_addr, - retired_page_addr, channel_index, umc_inst); - } + umc_v8_10_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst, node_inst, mc_umc_status); } /* clear umc status */ WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); + + return 0; } static void umc_v8_10_query_ras_error_address(struct amdgpu_device *adev, void *ras_error_status) { - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_query_error_address(adev, - err_data, - umc_reg_offset, - node_inst, - ch_inst, - umc_inst); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_query_error_address, ras_error_status); } -static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, - uint32_t umc_reg_offset) +static int umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) { uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; uint32_t ecc_err_cnt_addr; + uint32_t umc_reg_offset = + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); ecc_err_cnt_sel_addr = SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCntSel); @@ -321,23 +313,14 @@ static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel); /* set error count to initial value */ WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_10_CE_CNT_INIT); + + return 0; } static void umc_v8_10_err_cnt_init(struct amdgpu_device *adev) { - uint32_t node_inst = 0; - uint32_t umc_inst = 0; - uint32_t ch_inst = 0; - uint32_t umc_reg_offset = 0; - - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { - umc_reg_offset = get_umc_v8_10_reg_offset(adev, - node_inst, - umc_inst, - ch_inst); - - umc_v8_10_err_cnt_init_per_channel(adev, umc_reg_offset); - } + amdgpu_umc_loop_channels(adev, + umc_v8_10_err_cnt_init_per_channel, NULL); } static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev) @@ -349,6 +332,116 @@ static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev) return true; } +static void umc_v8_10_ecc_info_query_correctable_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, + unsigned long *error_count) +{ + uint16_t ecc_ce_cnt; + uint32_t eccinfo_table_idx; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst; + + /* Retrieve CE count */ + ecc_ce_cnt = ras->umc_ecc.ecc[eccinfo_table_idx].ce_count_lo_chip; + if (ecc_ce_cnt) + *error_count += ecc_ce_cnt; +} + +static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, + unsigned long *error_count) +{ + uint64_t mc_umc_status; + uint32_t eccinfo_table_idx; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst; + + /* check the MCUMC_STATUS */ + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; + if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) { + *error_count += 1; + } +} + +static int umc_v8_10_ecc_info_query_ecc_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) +{ + struct ras_err_data *err_data = (struct ras_err_data *)data; + + umc_v8_10_ecc_info_query_correctable_error_count(adev, + node_inst, umc_inst, ch_inst, + &(err_data->ce_count)); + umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, + node_inst, umc_inst, ch_inst, + &(err_data->ue_count)); + return 0; +} + +static void umc_v8_10_ecc_info_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + amdgpu_umc_loop_channels(adev, + umc_v8_10_ecc_info_query_ecc_error_count, ras_error_status); +} + +static int umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, + uint32_t ch_inst, void *data) +{ + uint32_t eccinfo_table_idx; + uint64_t mc_umc_status, err_addr; + struct ras_err_data *err_data = (struct ras_err_data *)data; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst; + + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; + + if (mc_umc_status == 0) + return 0; + + if (!err_data->err_addr) + return 0; + + /* calculate error address if ue error is detected */ + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)) { + + err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; + err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + + umc_v8_10_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst, node_inst, mc_umc_status); + } + + return 0; +} + +static void umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *adev, + void *ras_error_status) +{ + amdgpu_umc_loop_channels(adev, + umc_v8_10_ecc_info_query_error_address, ras_error_status); +} + const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = { .query_ras_error_count = umc_v8_10_query_ras_error_count, .query_ras_error_address = umc_v8_10_query_ras_error_address, @@ -360,4 +453,6 @@ struct amdgpu_umc_ras umc_v8_10_ras = { }, .err_cnt_init = umc_v8_10_err_cnt_init, .query_ras_poison_mode = umc_v8_10_query_ras_poison_mode, + .ecc_info_query_ras_error_count = umc_v8_10_ecc_info_query_ras_error_count, + .ecc_info_query_ras_error_address = umc_v8_10_ecc_info_query_ras_error_address, }; |
