summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c212
1 files changed, 155 insertions, 57 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index b12808c0c331..2c58e09e56f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -58,7 +58,7 @@
#define EEPROM_I2C_MADDR_4 0x40000
/*
- * The 2 macros bellow represent the actual size in bytes that
+ * The 2 macros below represent the actual size in bytes that
* those entities occupy in the EEPROM memory.
* RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
* uses uint64 to store 6b fields such as retired_page.
@@ -161,6 +161,8 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 10):
return true;
case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 12):
+ case IP_VERSION(13, 0, 14):
return (adev->gmc.is_app_apu) ? false : true;
default:
return false;
@@ -176,7 +178,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
if (!control)
return false;
- if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
+ if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
/* The address given by VBIOS is an 8-bit, wire-format
* address, i.e. the most significant byte.
*
@@ -222,6 +224,8 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
return true;
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
+ case IP_VERSION(13, 0, 12):
+ case IP_VERSION(13, 0, 14):
control->i2c_address = EEPROM_I2C_MADDR_4;
return true;
default:
@@ -404,6 +408,25 @@ static int amdgpu_ras_eeprom_correct_header_tag(
return res;
}
+static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+ switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+ case IP_VERSION(8, 10, 0):
+ hdr->version = RAS_TABLE_VER_V2_1;
+ return;
+ case IP_VERSION(12, 0, 0):
+ case IP_VERSION(12, 5, 0):
+ hdr->version = RAS_TABLE_VER_V3;
+ return;
+ default:
+ hdr->version = RAS_TABLE_VER_V1;
+ return;
+ }
+}
+
/**
* amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
* @control: pointer to control structure
@@ -423,13 +446,9 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
mutex_lock(&control->ras_tbl_mutex);
hdr->header = RAS_TABLE_HDR_VAL;
- if (adev->umc.ras &&
- adev->umc.ras->set_eeprom_table_version)
- adev->umc.ras->set_eeprom_table_version(hdr);
- else
- hdr->version = RAS_TABLE_VER_V1;
+ amdgpu_ras_set_eeprom_table_version(control);
- if (hdr->version == RAS_TABLE_VER_V2_1) {
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE;
@@ -447,7 +466,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
}
csum = __calc_hdr_byte_sum(control);
- if (hdr->version == RAS_TABLE_VER_V2_1)
+ if (hdr->version >= RAS_TABLE_VER_V2_1)
csum += __calc_ras_info_byte_sum(control);
csum = -csum;
hdr->checksum = csum;
@@ -456,9 +475,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
res = __write_table_ras_info(control);
control->ras_num_recs = 0;
+ control->ras_num_bad_pages = 0;
control->ras_fri = 0;
- amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
+ amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
@@ -543,16 +563,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
return false;
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
- if (amdgpu_bad_page_threshold == -1) {
+ if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
- con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+ con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
+ if ((amdgpu_bad_page_threshold == -1) ||
+ (amdgpu_bad_page_threshold == -2)) {
dev_warn(adev->dev,
- "But GPU can be operated due to bad_page_threshold = -1.\n");
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
return false;
} else {
- dev_warn(adev->dev, "This GPU is in BAD status.");
- dev_warn(adev->dev, "Please retire it or set a larger "
- "threshold value when reloading driver.\n");
+ dev_warn(adev->dev,
+ "Please consider adjusting the customized threshold.\n");
return true;
}
}
@@ -607,6 +628,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
+ struct amdgpu_device *adev = to_amdgpu_device(control);
u32 a, b, i;
u8 *buf, *pp;
int res;
@@ -709,6 +731,15 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
control->ras_num_recs = 1 + (control->ras_max_record_count + b
- control->ras_fri)
% control->ras_max_record_count;
+
+ /*old asics only save pa to eeprom like before*/
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12)
+ control->ras_num_pa_recs += num;
+ else
+ control->ras_num_mca_recs += num;
+
+ control->ras_num_bad_pages = control->ras_num_pa_recs +
+ control->ras_num_mca_recs * adev->umc.retire_unit;
Out:
kfree(buf);
return res;
@@ -726,21 +757,25 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
/* Modify the header if it exceeds.
*/
if (amdgpu_bad_page_threshold != 0 &&
- control->ras_num_recs >= ras->bad_page_cnt_threshold) {
+ control->ras_num_bad_pages > ras->bad_page_cnt_threshold) {
dev_warn(adev->dev,
"Saved bad pages %d reaches threshold value %d\n",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
- if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
control->tbl_rai.health_percent = 0;
}
+ if ((amdgpu_bad_page_threshold != -1) &&
+ (amdgpu_bad_page_threshold != -2))
+ ras->is_rma = true;
+
/* ignore the -ENOTSUPP return value */
amdgpu_dpm_send_rma_reason(adev);
}
- if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE +
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -780,10 +815,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
* now calculate gpu health percent
*/
if (amdgpu_bad_page_threshold != 0 &&
- control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
- control->ras_num_recs < ras->bad_page_cnt_threshold)
+ control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
+ control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
- control->ras_num_recs) * 100) /
+ control->ras_num_bad_pages) * 100) /
ras->bad_page_cnt_threshold;
/* Recalc the checksum.
@@ -793,7 +828,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
csum += *pp;
csum += __calc_hdr_byte_sum(control);
- if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
csum += __calc_ras_info_byte_sum(control);
/* avoid sign extension when assigning to "checksum" */
csum = -csum;
@@ -824,7 +859,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
- int res;
+ int res, i;
+ uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -838,6 +874,13 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
return -EINVAL;
}
+ if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+ nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+
+ /* set the new channel index flag */
+ for (i = 0; i < num; i++)
+ record[i].retired_page |= (nps << UMC_NPS_SHIFT);
+
mutex_lock(&control->ras_tbl_mutex);
res = amdgpu_ras_eeprom_append_table(control, record, num);
@@ -847,6 +890,11 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
amdgpu_ras_debugfs_set_ret_size(control);
mutex_unlock(&control->ras_tbl_mutex);
+
+ /* clear channel index flag, the flag is only saved on eeprom */
+ for (i = 0; i < num; i++)
+ record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
+
return res;
}
@@ -994,7 +1042,10 @@ Out:
uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
{
- if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+ /* get available eeprom table version first before eeprom table init */
+ amdgpu_ras_set_eeprom_table_version(control);
+
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
return RAS_MAX_RECORD_COUNT_V2_1;
else
return RAS_MAX_RECORD_COUNT;
@@ -1239,7 +1290,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
int buf_size, res;
u8 csum, *buf, *pp;
- if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
buf_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE +
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -1307,8 +1358,7 @@ Out:
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
}
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
- bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1316,7 +1366,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int res;
- *exceed_err_limit = false;
+ ras->is_rma = false;
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1343,22 +1393,69 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
__decode_table_header_from_buf(hdr, buf);
- if (hdr->version == RAS_TABLE_VER_V2_1) {
+ if (hdr->header != RAS_TABLE_HDR_VAL &&
+ hdr->header != RAS_TABLE_HDR_BAD) {
+ dev_info(adev->dev, "Creating a new EEPROM table");
+ return amdgpu_ras_eeprom_reset_table(control);
+ }
+
+ switch (hdr->version) {
+ case RAS_TABLE_VER_V2_1:
+ case RAS_TABLE_VER_V3:
control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
control->ras_record_offset = RAS_RECORD_START_V2_1;
control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
- } else {
+ break;
+ case RAS_TABLE_VER_V1:
control->ras_num_recs = RAS_NUM_RECS(hdr);
control->ras_record_offset = RAS_RECORD_START;
control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ break;
+ default:
+ dev_err(adev->dev,
+ "RAS header invalid, unsupported version: %u",
+ hdr->version);
+ return -EINVAL;
+ }
+
+ if (control->ras_num_recs > control->ras_max_record_count) {
+ dev_err(adev->dev,
+ "RAS header invalid, records in header: %u max allowed :%u",
+ control->ras_num_recs, control->ras_max_record_count);
+ return -EINVAL;
}
+
control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+ control->ras_num_mca_recs = 0;
+ control->ras_num_pa_recs = 0;
+ return 0;
+}
+
+int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ int res = 0;
+
+ if (!__is_ras_eeprom_supported(adev))
+ return 0;
+
+ /* Verify i2c adapter is initialized */
+ if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
+ return -ENOENT;
+
+ if (!__get_eeprom_i2c_addr(adev, control))
+ return -EINVAL;
+
+ control->ras_num_bad_pages = control->ras_num_pa_recs +
+ control->ras_num_mca_recs * adev->umc.retire_unit;
if (hdr->header == RAS_TABLE_HDR_VAL) {
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
- control->ras_num_recs);
+ control->ras_num_bad_pages);
- if (hdr->version == RAS_TABLE_VER_V2_1) {
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
if (res)
return res;
@@ -1366,28 +1463,32 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
res = __verify_ras_table_checksum(control);
if (res)
- DRM_ERROR("RAS table incorrect checksum or error:%d\n",
- res);
+ dev_err(adev->dev,
+ "RAS table incorrect checksum or error:%d\n",
+ res);
/* Warn if we are at 90% of the threshold or above
*/
- if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
+ if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
- control->ras_num_recs,
+ control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
- if (hdr->version == RAS_TABLE_VER_V2_1) {
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
if (res)
return res;
}
res = __verify_ras_table_checksum(control);
- if (res)
- DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
- res);
- if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
+ if (res) {
+ dev_err(adev->dev,
+ "RAS Table incorrect checksum or error:%d\n",
+ res);
+ return -EINVAL;
+ }
+ if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) {
/* This means that, the threshold was increased since
* the last time the system was booted, and now,
* ras->bad_page_cnt_threshold - control->num_recs > 0,
@@ -1397,28 +1498,25 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
dev_info(adev->dev,
"records:%d threshold:%d, resetting "
"RAS table header signature",
- control->ras_num_recs,
+ control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
- dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
- if (amdgpu_bad_page_threshold == -1) {
- dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
+ dev_warn(adev->dev,
+ "RAS records:%d exceed threshold:%d\n",
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+ if ((amdgpu_bad_page_threshold == -1) ||
+ (amdgpu_bad_page_threshold == -2)) {
res = 0;
+ dev_warn(adev->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
} else {
- *exceed_err_limit = true;
- dev_err(adev->dev,
- "RAS records:%d exceed threshold:%d, "
- "GPU will not be initialized. Replace this GPU or increase the threshold",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
+ ras->is_rma = true;
+ dev_warn(adev->dev,
+ "User defined threshold is set, runtime service will be halt when threshold is reached\n");
}
}
- } else {
- DRM_INFO("Creating a new EEPROM table");
-
- res = amdgpu_ras_eeprom_reset_table(control);
}
return res < 0 ? res : 0;