Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 212
1 file changed, 155 insertions, 57 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index b12808c0c331..2c58e09e56f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -58,7 +58,7 @@
 #define EEPROM_I2C_MADDR_4      0x40000
 
 /*
- * The 2 macros bellow represent the actual size in bytes that
+ * The 2 macros below represent the actual size in bytes that
  * those entities occupy in the EEPROM memory.
  * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
  * uses uint64 to store 6b fields such as retired_page.
@@ -161,6 +161,8 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
         case IP_VERSION(13, 0, 10):
                 return true;
         case IP_VERSION(13, 0, 6):
+        case IP_VERSION(13, 0, 12):
+        case IP_VERSION(13, 0, 14):
                 return (adev->gmc.is_app_apu) ? false : true;
         default:
                 return false;
@@ -176,7 +178,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
         if (!control)
                 return false;
 
-        if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
+        if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
                 /* The address given by VBIOS is an 8-bit, wire-format
                  * address, i.e. the most significant byte.
                  *
@@ -222,6 +224,8 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
                 return true;
         case IP_VERSION(13, 0, 6):
         case IP_VERSION(13, 0, 10):
+        case IP_VERSION(13, 0, 12):
+        case IP_VERSION(13, 0, 14):
                 control->i2c_address = EEPROM_I2C_MADDR_4;
                 return true;
         default:
@@ -404,6 +408,25 @@ static int amdgpu_ras_eeprom_correct_header_tag(
         return res;
 }
 
+static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
+{
+        struct amdgpu_device *adev = to_amdgpu_device(control);
+        struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+        switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+        case IP_VERSION(8, 10, 0):
+                hdr->version = RAS_TABLE_VER_V2_1;
+                return;
+        case IP_VERSION(12, 0, 0):
+        case IP_VERSION(12, 5, 0):
+                hdr->version = RAS_TABLE_VER_V3;
+                return;
+        default:
+                hdr->version = RAS_TABLE_VER_V1;
+                return;
+        }
+}
+
 /**
  * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
  * @control: pointer to control structure
@@ -423,13 +446,9 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
         mutex_lock(&control->ras_tbl_mutex);
 
         hdr->header = RAS_TABLE_HDR_VAL;
-        if (adev->umc.ras &&
-            adev->umc.ras->set_eeprom_table_version)
-                adev->umc.ras->set_eeprom_table_version(hdr);
-        else
-                hdr->version = RAS_TABLE_VER_V1;
+        amdgpu_ras_set_eeprom_table_version(control);
 
-        if (hdr->version == RAS_TABLE_VER_V2_1) {
+        if (hdr->version >= RAS_TABLE_VER_V2_1) {
                 hdr->first_rec_offset = RAS_RECORD_START_V2_1;
                 hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
                                 RAS_TABLE_V2_1_INFO_SIZE;
@@ -447,7 +466,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
         }
 
         csum = __calc_hdr_byte_sum(control);
-        if (hdr->version == RAS_TABLE_VER_V2_1)
+        if (hdr->version >= RAS_TABLE_VER_V2_1)
                 csum += __calc_ras_info_byte_sum(control);
         csum = -csum;
         hdr->checksum = csum;
@@ -456,9 +475,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
                 res = __write_table_ras_info(control);
 
         control->ras_num_recs = 0;
+        control->ras_num_bad_pages = 0;
         control->ras_fri = 0;
 
-        amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
+        amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
 
         control->bad_channel_bitmap = 0;
         amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
@@ -543,16 +563,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
                 return false;
 
         if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
-                if (amdgpu_bad_page_threshold == -1) {
+                if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
                         dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
-                                 con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+                                 con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
+                if ((amdgpu_bad_page_threshold == -1) ||
+                    (amdgpu_bad_page_threshold == -2)) {
                         dev_warn(adev->dev,
-                                 "But GPU can be operated due to bad_page_threshold = -1.\n");
+                                 "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
                         return false;
                 } else {
-                        dev_warn(adev->dev, "This GPU is in BAD status.");
-                        dev_warn(adev->dev, "Please retire it or set a larger "
-                                 "threshold value when reloading driver.\n");
+                        dev_warn(adev->dev,
+                                 "Please consider adjusting the customized threshold.\n");
                         return true;
                 }
         }
@@ -607,6 +628,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
                 const u32 num)
 {
         struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
+        struct amdgpu_device *adev = to_amdgpu_device(control);
         u32 a, b, i;
         u8 *buf, *pp;
         int res;
@@ -709,6 +731,15 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
         control->ras_num_recs = 1 + (control->ras_max_record_count + b
                                      - control->ras_fri)
                 % control->ras_max_record_count;
+
+        /*old asics only save pa to eeprom like before*/
+        if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12)
+                control->ras_num_pa_recs += num;
+        else
+                control->ras_num_mca_recs += num;
+
+        control->ras_num_bad_pages = control->ras_num_pa_recs +
+                        control->ras_num_mca_recs * adev->umc.retire_unit;
 Out:
         kfree(buf);
         return res;
@@ -726,21 +757,25 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
         /* Modify the header if it exceeds.
          */
         if (amdgpu_bad_page_threshold != 0 &&
-            control->ras_num_recs >= ras->bad_page_cnt_threshold) {
+            control->ras_num_bad_pages > ras->bad_page_cnt_threshold) {
                 dev_warn(adev->dev,
                         "Saved bad pages %d reaches threshold value %d\n",
-                        control->ras_num_recs, ras->bad_page_cnt_threshold);
+                        control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
                 control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
-                if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
+                if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
                         control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
                         control->tbl_rai.health_percent = 0;
                 }
 
+                if ((amdgpu_bad_page_threshold != -1) &&
+                    (amdgpu_bad_page_threshold != -2))
+                        ras->is_rma = true;
+
                 /* ignore the -ENOTSUPP return value */
                 amdgpu_dpm_send_rma_reason(adev);
         }
 
-        if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+        if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                 control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
                                             RAS_TABLE_V2_1_INFO_SIZE +
                                             control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -780,10 +815,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
          * now calculate gpu health percent
          */
         if (amdgpu_bad_page_threshold != 0 &&
-            control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
-            control->ras_num_recs < ras->bad_page_cnt_threshold)
+            control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
+            control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
                 control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
-                                                   control->ras_num_recs) * 100) /
+                                                   control->ras_num_bad_pages) * 100) /
                                                    ras->bad_page_cnt_threshold;
 
         /* Recalc the checksum.
@@ -793,7 +828,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                 csum += *pp;
 
         csum += __calc_hdr_byte_sum(control);
-        if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+        if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                 csum += __calc_ras_info_byte_sum(control);
         /* avoid sign extension when assigning to "checksum" */
         csum = -csum;
@@ -824,7 +859,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
                 const u32 num)
 {
         struct amdgpu_device *adev = to_amdgpu_device(control);
-        int res;
+        int res, i;
+        uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
 
         if (!__is_ras_eeprom_supported(adev))
                 return 0;
@@ -838,6 +874,13 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
                 return -EINVAL;
         }
 
+        if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+                nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+
+        /* set the new channel index flag */
+        for (i = 0; i < num; i++)
+                record[i].retired_page |= (nps << UMC_NPS_SHIFT);
+
         mutex_lock(&control->ras_tbl_mutex);
 
         res = amdgpu_ras_eeprom_append_table(control, record, num);
@@ -847,6 +890,11 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
                 amdgpu_ras_debugfs_set_ret_size(control);
 
         mutex_unlock(&control->ras_tbl_mutex);
+
+        /* clear channel index flag, the flag is only saved on eeprom */
+        for (i = 0; i < num; i++)
+                record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
+
         return res;
 }
 
@@ -994,7 +1042,10 @@ Out:
 
 uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
 {
-        if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+        /* get available eeprom table version first before eeprom table init */
+        amdgpu_ras_set_eeprom_table_version(control);
+
+        if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                 return RAS_MAX_RECORD_COUNT_V2_1;
         else
                 return RAS_MAX_RECORD_COUNT;
@@ -1239,7 +1290,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
         int buf_size, res;
         u8  csum, *buf, *pp;
 
-        if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+        if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                 buf_size = RAS_TABLE_HEADER_SIZE +
                            RAS_TABLE_V2_1_INFO_SIZE +
                            control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -1307,8 +1358,7 @@ Out:
         return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-                           bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 {
         struct amdgpu_device *adev = to_amdgpu_device(control);
         unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1316,7 +1366,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
         int res;
 
-        *exceed_err_limit = false;
+        ras->is_rma = false;
 
         if (!__is_ras_eeprom_supported(adev))
                 return 0;
@@ -1343,22 +1393,69 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 
         __decode_table_header_from_buf(hdr, buf);
 
-        if (hdr->version == RAS_TABLE_VER_V2_1) {
+        if (hdr->header != RAS_TABLE_HDR_VAL &&
+            hdr->header != RAS_TABLE_HDR_BAD) {
+                dev_info(adev->dev, "Creating a new EEPROM table");
+                return amdgpu_ras_eeprom_reset_table(control);
+        }
+
+        switch (hdr->version) {
+        case RAS_TABLE_VER_V2_1:
+        case RAS_TABLE_VER_V3:
                 control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
                 control->ras_record_offset = RAS_RECORD_START_V2_1;
                 control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
-        } else {
+                break;
+        case RAS_TABLE_VER_V1:
                 control->ras_num_recs = RAS_NUM_RECS(hdr);
                 control->ras_record_offset = RAS_RECORD_START;
                 control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+                break;
+        default:
+                dev_err(adev->dev,
+                        "RAS header invalid, unsupported version: %u",
+                        hdr->version);
+                return -EINVAL;
+        }
+
+        if (control->ras_num_recs > control->ras_max_record_count) {
+                dev_err(adev->dev,
+                        "RAS header invalid, records in header: %u max allowed :%u",
+                        control->ras_num_recs, control->ras_max_record_count);
+                return -EINVAL;
         }
+
         control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+        control->ras_num_mca_recs = 0;
+        control->ras_num_pa_recs = 0;
+        return 0;
+}
+
+int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
+{
+        struct amdgpu_device *adev = to_amdgpu_device(control);
+        struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+        int res = 0;
+
+        if (!__is_ras_eeprom_supported(adev))
+                return 0;
+
+        /* Verify i2c adapter is initialized */
+        if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
+                return -ENOENT;
+
+        if (!__get_eeprom_i2c_addr(adev, control))
+                return -EINVAL;
+
+        control->ras_num_bad_pages = control->ras_num_pa_recs +
+                        control->ras_num_mca_recs * adev->umc.retire_unit;
 
         if (hdr->header == RAS_TABLE_HDR_VAL) {
                 DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
-                                 control->ras_num_recs);
+                                 control->ras_num_bad_pages);
 
-                if (hdr->version == RAS_TABLE_VER_V2_1) {
+                if (hdr->version >= RAS_TABLE_VER_V2_1) {
                         res = __read_table_ras_info(control);
                         if (res)
                                 return res;
@@ -1366,28 +1463,32 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
                 res = __verify_ras_table_checksum(control);
                 if (res)
-                        DRM_ERROR("RAS table incorrect checksum or error:%d\n",
-                                  res);
+                        dev_err(adev->dev,
+                                "RAS table incorrect checksum or error:%d\n",
+                                res);
 
                 /* Warn if we are at 90% of the threshold or above
                  */
-                if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
+                if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
                         dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
-                                 control->ras_num_recs,
+                                 control->ras_num_bad_pages,
                                  ras->bad_page_cnt_threshold);
         } else if (hdr->header == RAS_TABLE_HDR_BAD &&
                    amdgpu_bad_page_threshold != 0) {
-                if (hdr->version == RAS_TABLE_VER_V2_1) {
+                if (hdr->version >= RAS_TABLE_VER_V2_1) {
                         res = __read_table_ras_info(control);
                         if (res)
                                 return res;
                 }
 
                 res = __verify_ras_table_checksum(control);
-                if (res)
-                        DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
-                                  res);
-                if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
+                if (res) {
+                        dev_err(adev->dev,
+                                "RAS Table incorrect checksum or error:%d\n",
+                                res);
+                        return -EINVAL;
+                }
+                if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) {
                         /* This means that, the threshold was increased since
                          * the last time the system was booted, and now,
                          * ras->bad_page_cnt_threshold - control->num_recs > 0,
@@ -1397,28 +1498,25 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
                         dev_info(adev->dev,
                                  "records:%d threshold:%d, resetting "
                                  "RAS table header signature",
-                                 control->ras_num_recs,
+                                 control->ras_num_bad_pages,
                                  ras->bad_page_cnt_threshold);
                         res = amdgpu_ras_eeprom_correct_header_tag(control,
                                                                    RAS_TABLE_HDR_VAL);
                 } else {
-                        dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
-                                control->ras_num_recs, ras->bad_page_cnt_threshold);
-                        if (amdgpu_bad_page_threshold == -1) {
-                                dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
+                        dev_warn(adev->dev,
+                                 "RAS records:%d exceed threshold:%d\n",
+                                 control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+                        if ((amdgpu_bad_page_threshold == -1) ||
+                            (amdgpu_bad_page_threshold == -2)) {
                                 res = 0;
+                                dev_warn(adev->dev,
+                                         "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
                         } else {
-                                *exceed_err_limit = true;
-                                dev_err(adev->dev,
-                                        "RAS records:%d exceed threshold:%d, "
-                                        "GPU will not be initialized. Replace this GPU or increase the threshold",
-                                        control->ras_num_recs, ras->bad_page_cnt_threshold);
+                                ras->is_rma = true;
+                                dev_warn(adev->dev,
+                                         "User defined threshold is set, runtime service will be halt when threshold is reached\n");
                         }
                 }
-        } else {
-                DRM_INFO("Creating a new EEPROM table");
-
-                res = amdgpu_ras_eeprom_reset_table(control);
         }
 
         return res < 0 ? res : 0;
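The arithmetic behind the new ras_num_bad_pages counter is the one part of the change that is easy to misread in diff form: physical-address (PA) records still map to one retired page each, MCA-address records each stand for adev->umc.retire_unit pages, and the memory-partition (NPS) mode is packed into the high bits of retired_page only for the copy written to EEPROM and stripped again afterwards. The standalone C sketch below illustrates that accounting; the struct, the UMC_NPS_SHIFT value, and the retire_unit figure are simplified stand-ins for illustration, not the driver's actual definitions.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the driver state this patch touches; the real
 * definitions live in the amdgpu headers. The shift value is assumed. */
#define UMC_NPS_SHIFT 58

struct eeprom_counters {
        uint32_t ras_num_pa_recs;   /* records stored by physical address */
        uint32_t ras_num_mca_recs;  /* records stored by MCA address */
        uint32_t ras_num_bad_pages; /* pages counted against the threshold */
};

/* One bad page per PA record, retire_unit pages per MCA record. */
static void update_bad_pages(struct eeprom_counters *c, uint32_t retire_unit)
{
        c->ras_num_bad_pages = c->ras_num_pa_recs +
                               c->ras_num_mca_recs * retire_unit;
}

/* Tag retired_page with the NPS mode before it is written to EEPROM,
 * and strip the tag again so the in-memory record keeps the plain page. */
static uint64_t tag_nps(uint64_t retired_page, uint64_t nps)
{
        return retired_page | (nps << UMC_NPS_SHIFT);
}

static uint64_t untag_nps(uint64_t retired_page, uint64_t nps)
{
        return retired_page & ~(nps << UMC_NPS_SHIFT);
}

int main(void)
{
        struct eeprom_counters c = { .ras_num_pa_recs = 4, .ras_num_mca_recs = 2 };
        uint32_t retire_unit = 8;        /* assumed pages retired per MCA record */
        uint64_t page = 0x1234, nps = 1; /* NPS1-style partition mode */

        update_bad_pages(&c, retire_unit);
        printf("bad pages: %u\n", c.ras_num_bad_pages); /* 4 + 2 * 8 = 20 */

        uint64_t on_eeprom = tag_nps(page, nps);
        printf("tagged for eeprom:  0x%llx\n", (unsigned long long)on_eeprom);
        printf("restored in memory: 0x%llx\n",
               (unsigned long long)untag_nps(on_eeprom, nps));
        return 0;
}

Built with any C99 compiler, this prints 20 bad pages for 4 PA records plus 2 MCA records at a retire unit of 8, mirroring the ras_num_pa_recs + ras_num_mca_recs * adev->umc.retire_unit computation the patch performs in amdgpu_ras_eeprom_append_table() and amdgpu_ras_eeprom_check().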