summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c1048
1 files changed, 907 insertions, 141 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 84c241b9a2a1..64dd7a81bff5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -32,16 +32,34 @@
#include <linux/uaccess.h>
#include "amdgpu_reset.h"
+#include "amdgpu_ras_mgr.h"
-#define EEPROM_I2C_MADDR_VEGA20 0x0
-#define EEPROM_I2C_MADDR_ARCTURUS 0x40000
-#define EEPROM_I2C_MADDR_ARCTURUS_D342 0x0
-#define EEPROM_I2C_MADDR_SIENNA_CICHLID 0x0
-#define EEPROM_I2C_MADDR_ALDEBARAN 0x0
-#define EEPROM_I2C_MADDR_SMU_13_0_0 (0x54UL << 16)
+/* These are memory addresses as would be seen by one or more EEPROM
+ * chips strung on the I2C bus, usually by manipulating pins 1-3 of a
+ * set of EEPROM devices. They form a continuous memory space.
+ *
+ * The I2C device address includes the device type identifier, 1010b,
+ * which is a reserved value and indicates that this is an I2C EEPROM
+ * device. It also includes the top 3 bits of the 19 bit EEPROM memory
+ * address, namely bits 18, 17, and 16. This makes up the 7 bit
+ * address sent on the I2C bus with bit 0 being the direction bit,
+ * which is not represented here, and sent by the hardware directly.
+ *
+ * For instance,
+ * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
+ * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
+ * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
+ * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
+ * address memory in a device or a device on the I2C bus, depending on
+ * the status of pins 1-3. See top of amdgpu_eeprom.c.
+ *
+ * The RAS table lives either at address 0 or address 40000h of EEPROM.
+ */
+#define EEPROM_I2C_MADDR_0 0x0
+#define EEPROM_I2C_MADDR_4 0x40000
/*
- * The 2 macros bellow represent the actual size in bytes that
+ * The 2 macros below represent the actual size in bytes that
* those entities occupy in the EEPROM memory.
* RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
* uses uint64 to store 6b fields such as retired_page.
@@ -51,11 +69,24 @@
/* Table hdr is 'AMDR' */
#define RAS_TABLE_HDR_VAL 0x414d4452
-#define RAS_TABLE_VER 0x00010000
/* Bad GPU tag ‘BADG’ */
#define RAS_TABLE_HDR_BAD 0x42414447
+/*
+ * EEPROM Table structure v1
+ * ---------------------------------
+ * | |
+ * | EEPROM TABLE HEADER |
+ * | ( size 20 Bytes ) |
+ * | |
+ * ---------------------------------
+ * | |
+ * | BAD PAGE RECORD AREA |
+ * | |
+ * ---------------------------------
+ */
+
/* Assume 2-Mbit size EEPROM and take up the whole space. */
#define RAS_TBL_SIZE_BYTES (256 * 1024)
#define RAS_TABLE_START 0
@@ -64,6 +95,37 @@
#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
/ RAS_TABLE_RECORD_SIZE)
+/*
+ * EEPROM Table structrue v2.1
+ * ---------------------------------
+ * | |
+ * | EEPROM TABLE HEADER |
+ * | ( size 20 Bytes ) |
+ * | |
+ * ---------------------------------
+ * | |
+ * | EEPROM TABLE RAS INFO |
+ * | (available info size 4 Bytes) |
+ * | ( reserved size 252 Bytes ) |
+ * | |
+ * ---------------------------------
+ * | |
+ * | BAD PAGE RECORD AREA |
+ * | |
+ * ---------------------------------
+ */
+
+/* EEPROM Table V2_1 */
+#define RAS_TABLE_V2_1_INFO_SIZE 256
+#define RAS_TABLE_V2_1_INFO_START RAS_TABLE_HEADER_SIZE
+#define RAS_RECORD_START_V2_1 (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \
+ RAS_TABLE_V2_1_INFO_SIZE)
+#define RAS_MAX_RECORD_COUNT_V2_1 ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \
+ RAS_TABLE_V2_1_INFO_SIZE) \
+ / RAS_TABLE_RECORD_SIZE)
+
+#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */
+
/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
* offset off of RAS_TABLE_START. That is, this is something you can
* add to control->i2c_address, and then tell I2C layer to read
@@ -86,43 +148,40 @@
#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
-#define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
+#define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
+ RAS_TABLE_HEADER_SIZE - \
+ RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE)
-static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
-{
- return adev->asic_type == CHIP_VEGA20 ||
- adev->asic_type == CHIP_ARCTURUS ||
- adev->asic_type == CHIP_SIENNA_CICHLID ||
- adev->asic_type == CHIP_ALDEBARAN;
-}
+#define to_amdgpu_device(x) ((container_of(x, struct amdgpu_ras, eeprom_control))->adev)
-static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
- struct amdgpu_ras_eeprom_control *control)
+static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
{
- struct atom_context *atom_ctx = adev->mode_info.atom_context;
-
- if (!control || !atom_ctx)
+ switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+ case IP_VERSION(11, 0, 2): /* VEGA20 and ARCTURUS */
+ case IP_VERSION(11, 0, 7): /* Sienna cichlid */
+ case IP_VERSION(13, 0, 0):
+ case IP_VERSION(13, 0, 2): /* Aldebaran */
+ case IP_VERSION(13, 0, 10):
+ return true;
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 12):
+ case IP_VERSION(13, 0, 14):
+ return (adev->gmc.is_app_apu) ? false : true;
+ default:
return false;
-
- if (strnstr(atom_ctx->vbios_version,
- "D342",
- sizeof(atom_ctx->vbios_version)))
- control->i2c_address = EEPROM_I2C_MADDR_ARCTURUS_D342;
- else
- control->i2c_address = EEPROM_I2C_MADDR_ARCTURUS;
-
- return true;
+ }
}
static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
struct amdgpu_ras_eeprom_control *control)
{
+ struct atom_context *atom_ctx = adev->mode_info.atom_context;
u8 i2c_addr;
if (!control)
return false;
- if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
+ if (adev->bios && amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
/* The address given by VBIOS is an 8-bit, wire-format
* address, i.e. the most significant byte.
*
@@ -137,36 +196,44 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
return true;
}
- switch (adev->asic_type) {
- case CHIP_VEGA20:
- control->i2c_address = EEPROM_I2C_MADDR_VEGA20;
- break;
-
- case CHIP_ARCTURUS:
- return __get_eeprom_i2c_addr_arct(adev, control);
-
- case CHIP_SIENNA_CICHLID:
- control->i2c_address = EEPROM_I2C_MADDR_SIENNA_CICHLID;
- break;
-
- case CHIP_ALDEBARAN:
- control->i2c_address = EEPROM_I2C_MADDR_ALDEBARAN;
- break;
-
- default:
- return false;
- }
-
- switch (adev->ip_versions[MP1_HWIP][0]) {
+ switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+ case IP_VERSION(11, 0, 2):
+ /* VEGA20 and ARCTURUS */
+ if (adev->asic_type == CHIP_VEGA20)
+ control->i2c_address = EEPROM_I2C_MADDR_0;
+ else if (strnstr(atom_ctx->vbios_pn,
+ "D342",
+ sizeof(atom_ctx->vbios_pn)))
+ control->i2c_address = EEPROM_I2C_MADDR_0;
+ else
+ control->i2c_address = EEPROM_I2C_MADDR_4;
+ return true;
+ case IP_VERSION(11, 0, 7):
+ control->i2c_address = EEPROM_I2C_MADDR_0;
+ return true;
+ case IP_VERSION(13, 0, 2):
+ if (strnstr(atom_ctx->vbios_pn, "D673",
+ sizeof(atom_ctx->vbios_pn)))
+ control->i2c_address = EEPROM_I2C_MADDR_4;
+ else
+ control->i2c_address = EEPROM_I2C_MADDR_0;
+ return true;
case IP_VERSION(13, 0, 0):
- control->i2c_address = EEPROM_I2C_MADDR_SMU_13_0_0;
- break;
-
+ if (strnstr(atom_ctx->vbios_pn, "D707",
+ sizeof(atom_ctx->vbios_pn)))
+ control->i2c_address = EEPROM_I2C_MADDR_0;
+ else
+ control->i2c_address = EEPROM_I2C_MADDR_4;
+ return true;
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 10):
+ case IP_VERSION(13, 0, 12):
+ case IP_VERSION(13, 0, 14):
+ control->i2c_address = EEPROM_I2C_MADDR_4;
+ return true;
default:
- break;
+ return false;
}
-
- return true;
}
static void
@@ -213,15 +280,81 @@ static int __write_table_header(struct amdgpu_ras_eeprom_control *control)
up_read(&adev->reset_domain->sem);
if (res < 0) {
- DRM_ERROR("Failed to write EEPROM table header:%d", res);
+ dev_err(adev->dev, "Failed to write EEPROM table header:%d",
+ res);
} else if (res < RAS_TABLE_HEADER_SIZE) {
- DRM_ERROR("Short write:%d out of %d\n",
- res, RAS_TABLE_HEADER_SIZE);
+ dev_err(adev->dev, "Short write:%d out of %d\n", res,
+ RAS_TABLE_HEADER_SIZE);
+ res = -EIO;
+ } else {
+ res = 0;
+ }
+
+ return res;
+}
+
+static void
+__encode_table_ras_info_to_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
+ unsigned char *buf)
+{
+ u32 *pp = (uint32_t *)buf;
+ u32 tmp;
+
+ tmp = ((uint32_t)(rai->rma_status) & 0xFF) |
+ (((uint32_t)(rai->health_percent) << 8) & 0xFF00) |
+ (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000);
+ pp[0] = cpu_to_le32(tmp);
+}
+
+static void
+__decode_table_ras_info_from_buf(struct amdgpu_ras_eeprom_table_ras_info *rai,
+ unsigned char *buf)
+{
+ u32 *pp = (uint32_t *)buf;
+ u32 tmp;
+
+ tmp = le32_to_cpu(pp[0]);
+ rai->rma_status = tmp & 0xFF;
+ rai->health_percent = (tmp >> 8) & 0xFF;
+ rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF;
+}
+
+static int __write_table_ras_info(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ u8 *buf;
+ int res;
+
+ buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
+ if (!buf) {
+ dev_err(adev->dev,
+ "Failed to alloc buf to write table ras info\n");
+ return -ENOMEM;
+ }
+
+ __encode_table_ras_info_to_buf(&control->tbl_rai, buf);
+
+ /* i2c may be unstable in gpu reset */
+ down_read(&adev->reset_domain->sem);
+ res = amdgpu_eeprom_write(adev->pm.ras_eeprom_i2c_bus,
+ control->i2c_address +
+ control->ras_info_offset,
+ buf, RAS_TABLE_V2_1_INFO_SIZE);
+ up_read(&adev->reset_domain->sem);
+
+ if (res < 0) {
+ dev_err(adev->dev, "Failed to write EEPROM table ras info:%d",
+ res);
+ } else if (res < RAS_TABLE_V2_1_INFO_SIZE) {
+ dev_err(adev->dev, "Short write:%d out of %d\n", res,
+ RAS_TABLE_V2_1_INFO_SIZE);
res = -EIO;
} else {
res = 0;
}
+ kfree(buf);
+
return res;
}
@@ -241,6 +374,21 @@ static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control)
return csum;
}
+static u8 __calc_ras_info_byte_sum(const struct amdgpu_ras_eeprom_control *control)
+{
+ int ii;
+ u8 *pp, csum;
+ size_t sz;
+
+ sz = sizeof(control->tbl_rai);
+ pp = (u8 *) &control->tbl_rai;
+ csum = 0;
+ for (ii = 0; ii < sz; ii++, pp++)
+ csum += *pp;
+
+ return csum;
+}
+
static int amdgpu_ras_eeprom_correct_header_tag(
struct amdgpu_ras_eeprom_control *control,
uint32_t header)
@@ -266,6 +414,25 @@ static int amdgpu_ras_eeprom_correct_header_tag(
return res;
}
+static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+ switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+ case IP_VERSION(8, 10, 0):
+ hdr->version = RAS_TABLE_VER_V2_1;
+ return;
+ case IP_VERSION(12, 0, 0):
+ case IP_VERSION(12, 5, 0):
+ hdr->version = RAS_TABLE_VER_V3;
+ return;
+ default:
+ hdr->version = RAS_TABLE_VER_V1;
+ return;
+ }
+}
+
/**
* amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
* @control: pointer to control structure
@@ -277,26 +444,66 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ u32 erase_res = 0;
u8 csum;
int res;
mutex_lock(&control->ras_tbl_mutex);
- hdr->header = RAS_TABLE_HDR_VAL;
- hdr->version = RAS_TABLE_VER;
- hdr->first_rec_offset = RAS_RECORD_START;
- hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
+ if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+ hdr->header = RAS_TABLE_HDR_VAL;
+ amdgpu_ras_set_eeprom_table_version(control);
+
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ hdr->first_rec_offset = RAS_RECORD_START_V2_1;
+ hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE;
+ rai->rma_status = GPU_HEALTH_USABLE;
+
+ control->ras_record_offset = RAS_RECORD_START_V2_1;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+ /**
+ * GPU health represented as a percentage.
+ * 0 means worst health, 100 means fully health.
+ */
+ rai->health_percent = 100;
+ /* ecc_page_threshold = 0 means disable bad page retirement */
+ rai->ecc_page_threshold = con->bad_page_cnt_threshold;
+ } else {
+ hdr->first_rec_offset = RAS_RECORD_START;
+ hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
- csum = __calc_hdr_byte_sum(control);
- csum = -csum;
- hdr->checksum = csum;
- res = __write_table_header(control);
+ control->ras_record_offset = RAS_RECORD_START;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ }
+
+ csum = __calc_hdr_byte_sum(control);
+ if (hdr->version >= RAS_TABLE_VER_V2_1)
+ csum += __calc_ras_info_byte_sum(control);
+ csum = -csum;
+ hdr->checksum = csum;
+ res = __write_table_header(control);
+ if (!res && hdr->version > RAS_TABLE_VER_V1)
+ res = __write_table_ras_info(control);
+ } else {
+ res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res);
+ if (res || erase_res) {
+ dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d",
+ res, erase_res);
+ if (!res)
+ res = -EIO;
+ }
+ }
control->ras_num_recs = 0;
+ control->ras_num_bad_pages = 0;
+ control->ras_num_mca_recs = 0;
+ control->ras_num_pa_recs = 0;
control->ras_fri = 0;
- amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
+ amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
control->bad_channel_bitmap = 0;
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
@@ -369,7 +576,11 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!__is_ras_eeprom_supported(adev))
+ if (amdgpu_uniras_enabled(adev))
+ return amdgpu_ras_mgr_check_eeprom_safety_watermark(adev);
+
+ if (!__is_ras_eeprom_supported(adev) ||
+ !amdgpu_bad_page_threshold)
return false;
/* skip check eeprom table for VEGA20 Gaming */
@@ -380,10 +591,19 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
return false;
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
- dev_warn(adev->dev, "This GPU is in BAD status.");
- dev_warn(adev->dev, "Please retire it or set a larger "
- "threshold value when reloading driver.\n");
- return true;
+ if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
+ dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
+ con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
+ if ((amdgpu_bad_page_threshold == -1) ||
+ (amdgpu_bad_page_threshold == -2)) {
+ dev_warn(adev->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
+ return false;
+ } else {
+ dev_warn(adev->dev,
+ "Please consider adjusting the customized threshold.\n");
+ return true;
+ }
}
return false;
@@ -415,13 +635,13 @@ static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
buf, buf_size);
up_read(&adev->reset_domain->sem);
if (res < 0) {
- DRM_ERROR("Writing %d EEPROM table records error:%d",
- num, res);
+ dev_err(adev->dev, "Writing %d EEPROM table records error:%d",
+ num, res);
} else if (res < buf_size) {
/* Short write, return error.
*/
- DRM_ERROR("Wrote %d records out of %d",
- res / RAS_TABLE_RECORD_SIZE, num);
+ dev_err(adev->dev, "Wrote %d records out of %d",
+ res / RAS_TABLE_RECORD_SIZE, num);
res = -EIO;
} else {
res = 0;
@@ -436,6 +656,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
+ struct amdgpu_device *adev = to_amdgpu_device(control);
u32 a, b, i;
u8 *buf, *pp;
int res;
@@ -451,7 +672,8 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
__encode_table_record_to_buf(control, &record[i], pp);
/* update bad channel bitmap */
- if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+ if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+ !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
@@ -537,6 +759,14 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
control->ras_num_recs = 1 + (control->ras_max_record_count + b
- control->ras_fri)
% control->ras_max_record_count;
+
+ /*old asics only save pa to eeprom like before*/
+ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12)
+ control->ras_num_pa_recs += num;
+ else
+ control->ras_num_mca_recs += num;
+
+ control->ras_num_bad_pages = con->bad_page_num;
Out:
kfree(buf);
return res;
@@ -554,23 +784,44 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
/* Modify the header if it exceeds.
*/
if (amdgpu_bad_page_threshold != 0 &&
- control->ras_num_recs >= ras->bad_page_cnt_threshold) {
+ control->ras_num_bad_pages > ras->bad_page_cnt_threshold) {
dev_warn(adev->dev,
"Saved bad pages %d reaches threshold value %d\n",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
- control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+
+ if (adev->cper.enabled && !amdgpu_uniras_enabled(adev) &&
+ amdgpu_cper_generate_bp_threshold_record(adev))
+ dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
+
+ if ((amdgpu_bad_page_threshold != -1) &&
+ (amdgpu_bad_page_threshold != -2)) {
+ control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
+ control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
+ control->tbl_rai.health_percent = 0;
+ }
+ ras->is_rma = true;
+ }
+
+ /* ignore the -ENOTSUPP return value */
+ amdgpu_dpm_send_rma_reason(adev);
}
- control->tbl_hdr.version = RAS_TABLE_VER;
- control->tbl_hdr.first_rec_offset = RAS_INDEX_TO_OFFSET(control, control->ras_fri);
- control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ else
+ control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
control->tbl_hdr.checksum = 0;
buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
if (!buf) {
- DRM_ERROR("allocating memory for table of size %d bytes failed\n",
- control->tbl_hdr.tbl_size);
+ dev_err(adev->dev,
+ "allocating memory for table of size %d bytes failed\n",
+ control->tbl_hdr.tbl_size);
res = -ENOMEM;
goto Out;
}
@@ -582,16 +833,26 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
buf, buf_size);
up_read(&adev->reset_domain->sem);
if (res < 0) {
- DRM_ERROR("EEPROM failed reading records:%d\n",
- res);
+ dev_err(adev->dev, "EEPROM failed reading records:%d\n", res);
goto Out;
} else if (res < buf_size) {
- DRM_ERROR("EEPROM read %d out of %d bytes\n",
- res, buf_size);
+ dev_err(adev->dev, "EEPROM read %d out of %d bytes\n", res,
+ buf_size);
res = -EIO;
goto Out;
}
+ /**
+ * bad page records have been stored in eeprom,
+ * now calculate gpu health percent
+ */
+ if (amdgpu_bad_page_threshold != 0 &&
+ control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
+ control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
+ control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
+ control->ras_num_bad_pages) * 100) /
+ ras->bad_page_cnt_threshold;
+
/* Recalc the checksum.
*/
csum = 0;
@@ -599,15 +860,84 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
csum += *pp;
csum += __calc_hdr_byte_sum(control);
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ csum += __calc_ras_info_byte_sum(control);
/* avoid sign extension when assigning to "checksum" */
csum = -csum;
control->tbl_hdr.checksum = csum;
res = __write_table_header(control);
+ if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1)
+ res = __write_table_ras_info(control);
Out:
kfree(buf);
return res;
}
+int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ int ret, retry = 20;
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return 0;
+
+ control->ras_num_recs_old = control->ras_num_recs;
+
+ do {
+ /* 1000ms timeout is long enough, smu_get_badpage_count won't
+ * return -EBUSY before timeout.
+ */
+ ret = amdgpu_ras_smu_get_badpage_count(adev,
+ &(control->ras_num_recs), RAS_SMU_MESSAGE_TIMEOUT_MS);
+ if (!ret &&
+ (control->ras_num_recs_old == control->ras_num_recs)) {
+ /* record number update in PMFW needs some time,
+ * smu_get_badpage_count may return immediately without
+ * count update, sleep for a while and retry again.
+ */
+ msleep(50);
+ retry--;
+ } else {
+ break;
+ }
+ } while (retry);
+
+ /* no update of record number is not a real failure,
+ * don't print warning here
+ */
+ if (!ret && (control->ras_num_recs_old == control->ras_num_recs))
+ ret = -EINVAL;
+
+ return ret;
+}
+
+static int amdgpu_ras_smu_eeprom_append(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev) || !con)
+ return 0;
+
+ control->ras_num_bad_pages = con->bad_page_num;
+
+ if (amdgpu_bad_page_threshold != 0 &&
+ control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
+ dev_warn(adev->dev,
+ "Saved bad pages %d reaches threshold value %d\n",
+ control->ras_num_bad_pages, con->bad_page_cnt_threshold);
+
+ if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev))
+ dev_warn(adev->dev, "fail to generate bad page threshold cper records\n");
+
+ if ((amdgpu_bad_page_threshold != -1) &&
+ (amdgpu_bad_page_threshold != -2))
+ con->is_rma = true;
+ }
+
+ return 0;
+}
+
/**
* amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
* @control: pointer to control structure
@@ -626,20 +956,32 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
- int res;
+ int res, i;
+ uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
if (!__is_ras_eeprom_supported(adev))
return 0;
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ return amdgpu_ras_smu_eeprom_append(control);
+
if (num == 0) {
- DRM_ERROR("will not append 0 records\n");
+ dev_err(adev->dev, "will not append 0 records\n");
return -EINVAL;
} else if (num > control->ras_max_record_count) {
- DRM_ERROR("cannot append %d records than the size of table %d\n",
- num, control->ras_max_record_count);
+ dev_err(adev->dev,
+ "cannot append %d records than the size of table %d\n",
+ num, control->ras_max_record_count);
return -EINVAL;
}
+ if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+ nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+
+ /* set the new channel index flag */
+ for (i = 0; i < num; i++)
+ record[i].retired_page |= (nps << UMC_NPS_SHIFT);
+
mutex_lock(&control->ras_tbl_mutex);
res = amdgpu_ras_eeprom_append_table(control, record, num);
@@ -649,6 +991,11 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
amdgpu_ras_debugfs_set_ret_size(control);
mutex_unlock(&control->ras_tbl_mutex);
+
+ /* clear channel index flag, the flag is only saved on eeprom */
+ for (i = 0; i < num; i++)
+ record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
+
return res;
}
@@ -678,13 +1025,13 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
buf, buf_size);
up_read(&adev->reset_domain->sem);
if (res < 0) {
- DRM_ERROR("Reading %d EEPROM table records error:%d",
- num, res);
+ dev_err(adev->dev, "Reading %d EEPROM table records error:%d",
+ num, res);
} else if (res < buf_size) {
/* Short read, return error.
*/
- DRM_ERROR("Read %d records out of %d",
- res / RAS_TABLE_RECORD_SIZE, num);
+ dev_err(adev->dev, "Read %d records out of %d",
+ res / RAS_TABLE_RECORD_SIZE, num);
res = -EIO;
} else {
res = 0;
@@ -693,6 +1040,50 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
return res;
}
+int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
+ struct eeprom_table_record *record, u32 rec_idx,
+ const u32 num)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ uint64_t ts, end_idx;
+ int i, ret;
+ u64 mca, ipid;
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return 0;
+
+ if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse)
+ return -EOPNOTSUPP;
+
+ end_idx = rec_idx + num;
+ for (i = rec_idx; i < end_idx; i++) {
+ ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts);
+ if (ret)
+ return ret;
+
+ record[i - rec_idx].address = mca;
+ /* retired_page (pa) is unused now */
+ record[i - rec_idx].retired_page = 0x1ULL;
+ record[i - rec_idx].ts = ts;
+ record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+
+ adev->umc.ras->mca_ipid_parse(adev, ipid,
+ (uint32_t *)&(record[i - rec_idx].cu),
+ (uint32_t *)&(record[i - rec_idx].mem_channel),
+ (uint32_t *)&(record[i - rec_idx].mcumc_id), NULL);
+ }
+
+ return 0;
+}
+
/**
* amdgpu_ras_eeprom_read -- read EEPROM
* @control: pointer to control structure
@@ -714,15 +1105,18 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
u8 *buf, *pp;
u32 g0, g1;
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ return amdgpu_ras_eeprom_read_idx(control, record, 0, num);
+
if (!__is_ras_eeprom_supported(adev))
return 0;
if (num == 0) {
- DRM_ERROR("will not read 0 records\n");
+ dev_err(adev->dev, "will not read 0 records\n");
return -EINVAL;
} else if (num > control->ras_num_recs) {
- DRM_ERROR("too many records to read:%d available:%d\n",
- num, control->ras_num_recs);
+ dev_err(adev->dev, "too many records to read:%d available:%d\n",
+ num, control->ras_num_recs);
return -EINVAL;
}
@@ -781,7 +1175,8 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
__decode_table_record_from_buf(control, &record[i], pp);
/* update bad channel bitmap */
- if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+ if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+ !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
con->update_channel_flag = true;
}
@@ -793,9 +1188,15 @@ Out:
return res;
}
-uint32_t amdgpu_ras_eeprom_max_record_count(void)
+uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
{
- return RAS_MAX_RECORD_COUNT;
+ /* get available eeprom table version first before eeprom table init */
+ amdgpu_ras_set_eeprom_table_version(control);
+
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ return RAS_MAX_RECORD_COUNT_V2_1;
+ else
+ return RAS_MAX_RECORD_COUNT;
}
static ssize_t
@@ -878,6 +1279,10 @@ static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
int res = -EFAULT;
size_t data_len;
+ /* pmfw manages eeprom data by itself */
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ return 0;
+
mutex_lock(&control->ras_tbl_mutex);
/* We want *pos - data_len > 0, which means there's
@@ -1037,11 +1442,18 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
int buf_size, res;
u8 csum, *buf, *pp;
- buf_size = RAS_TABLE_HEADER_SIZE +
- control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ buf_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ else
+ buf_size = RAS_TABLE_HEADER_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+
buf = kzalloc(buf_size, GFP_KERNEL);
if (!buf) {
- DRM_ERROR("Out of memory checking RAS table checksum.\n");
+ dev_err(adev->dev,
+ "Out of memory checking RAS table checksum.\n");
return -ENOMEM;
}
@@ -1050,7 +1462,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
control->ras_header_offset,
buf, buf_size);
if (res < buf_size) {
- DRM_ERROR("Partial read for checksum, res:%d\n", res);
+ dev_err(adev->dev, "Partial read for checksum, res:%d\n", res);
/* On partial reads, return -EIO.
*/
if (res >= 0)
@@ -1066,8 +1478,78 @@ Out:
return res < 0 ? res : csum;
}
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
- bool *exceed_err_limit)
+static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ unsigned char *buf;
+ int res;
+
+ buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
+ if (!buf) {
+ dev_err(adev->dev,
+ "Failed to alloc buf to read EEPROM table ras info\n");
+ return -ENOMEM;
+ }
+
+ /**
+ * EEPROM table V2_1 supports ras info,
+ * read EEPROM table ras info
+ */
+ res = amdgpu_eeprom_read(adev->pm.ras_eeprom_i2c_bus,
+ control->i2c_address + control->ras_info_offset,
+ buf, RAS_TABLE_V2_1_INFO_SIZE);
+ if (res < RAS_TABLE_V2_1_INFO_SIZE) {
+ dev_err(adev->dev,
+ "Failed to read EEPROM table ras info, res:%d", res);
+ res = res >= 0 ? -EIO : res;
+ goto Out;
+ }
+
+ __decode_table_ras_info_from_buf(rai, buf);
+
+Out:
+ kfree(buf);
+ return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
+}
+
+static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ uint64_t local_time;
+ int res;
+
+ ras->is_rma = false;
+
+ if (!__is_ras_eeprom_supported(adev))
+ return 0;
+ mutex_init(&control->ras_tbl_mutex);
+
+ res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version));
+ if (res)
+ return res;
+
+ res = amdgpu_ras_smu_get_badpage_count(adev,
+ &(control->ras_num_recs), 100);
+ if (res)
+ return res;
+
+ local_time = (uint64_t)ktime_get_real_seconds();
+ res = amdgpu_ras_smu_set_timestamp(adev, local_time);
+ if (res)
+ return res;
+
+ control->ras_max_record_count = 4000;
+
+ control->ras_num_mca_recs = 0;
+ control->ras_num_pa_recs = 0;
+
+ return 0;
+}
+
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1075,7 +1557,10 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int res;
- *exceed_err_limit = false;
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ return amdgpu_ras_smu_eeprom_init(control);
+
+ ras->is_rma = false;
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1088,8 +1573,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
return -EINVAL;
control->ras_header_offset = RAS_HDR_START;
- control->ras_record_offset = RAS_RECORD_START;
- control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ control->ras_info_offset = RAS_TABLE_V2_1_INFO_START;
mutex_init(&control->ras_tbl_mutex);
/* Read the table header from EEPROM address */
@@ -1097,36 +1581,153 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
control->i2c_address + control->ras_header_offset,
buf, RAS_TABLE_HEADER_SIZE);
if (res < RAS_TABLE_HEADER_SIZE) {
- DRM_ERROR("Failed to read EEPROM table header, res:%d", res);
+ dev_err(adev->dev, "Failed to read EEPROM table header, res:%d",
+ res);
return res >= 0 ? -EIO : res;
}
__decode_table_header_from_buf(hdr, buf);
- control->ras_num_recs = RAS_NUM_RECS(hdr);
+ if (hdr->header != RAS_TABLE_HDR_VAL &&
+ hdr->header != RAS_TABLE_HDR_BAD) {
+ dev_info(adev->dev, "Creating a new EEPROM table");
+ return amdgpu_ras_eeprom_reset_table(control);
+ }
+
+ switch (hdr->version) {
+ case RAS_TABLE_VER_V2_1:
+ case RAS_TABLE_VER_V3:
+ control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
+ control->ras_record_offset = RAS_RECORD_START_V2_1;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+ break;
+ case RAS_TABLE_VER_V1:
+ control->ras_num_recs = RAS_NUM_RECS(hdr);
+ control->ras_record_offset = RAS_RECORD_START;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ break;
+ default:
+ dev_err(adev->dev,
+ "RAS header invalid, unsupported version: %u",
+ hdr->version);
+ return -EINVAL;
+ }
+
+ if (control->ras_num_recs > control->ras_max_record_count) {
+ dev_err(adev->dev,
+ "RAS header invalid, records in header: %u max allowed :%u",
+ control->ras_num_recs, control->ras_max_record_count);
+ return -EINVAL;
+ }
+
control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+ control->ras_num_mca_recs = 0;
+ control->ras_num_pa_recs = 0;
+ return 0;
+}
+
+static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ if (!__is_ras_eeprom_supported(adev))
+ return 0;
+
+ control->ras_num_bad_pages = ras->bad_page_num;
+
+ if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) &&
+ amdgpu_bad_page_threshold != 0) {
+ dev_warn(adev->dev,
+ "RAS records:%d exceed threshold:%d\n",
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+ if ((amdgpu_bad_page_threshold == -1) ||
+ (amdgpu_bad_page_threshold == -2)) {
+ dev_warn(adev->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
+ } else {
+ ras->is_rma = true;
+ dev_warn(adev->dev,
+ "User defined threshold is set, runtime service will be halt when threshold is reached\n");
+ }
+
+ return 0;
+ }
+
+ dev_dbg(adev->dev,
+ "Found existing EEPROM table with %d records",
+ control->ras_num_bad_pages);
+
+ /* Warn if we are at 90% of the threshold or above
+ */
+ if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
+ dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
+ control->ras_num_bad_pages,
+ ras->bad_page_cnt_threshold);
+ return 0;
+}
+
+int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
+{
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ int res = 0;
+
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ return amdgpu_ras_smu_eeprom_check(control);
+
+ if (!__is_ras_eeprom_supported(adev))
+ return 0;
+
+ /* Verify i2c adapter is initialized */
+ if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
+ return -ENOENT;
+
+ if (!__get_eeprom_i2c_addr(adev, control))
+ return -EINVAL;
+
+ control->ras_num_bad_pages = ras->bad_page_num;
if (hdr->header == RAS_TABLE_HDR_VAL) {
- DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
- control->ras_num_recs);
+ dev_dbg(adev->dev,
+ "Found existing EEPROM table with %d records",
+ control->ras_num_bad_pages);
+
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ res = __read_table_ras_info(control);
+ if (res)
+ return res;
+ }
+
res = __verify_ras_table_checksum(control);
if (res)
- DRM_ERROR("RAS table incorrect checksum or error:%d\n",
- res);
+ dev_err(adev->dev,
+ "RAS table incorrect checksum or error:%d\n",
+ res);
/* Warn if we are at 90% of the threshold or above
*/
- if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
+ if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
- control->ras_num_recs,
+ control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ res = __read_table_ras_info(control);
+ if (res)
+ return res;
+ }
+
res = __verify_ras_table_checksum(control);
- if (res)
- DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
- res);
- if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
+ if (res) {
+ dev_err(adev->dev,
+ "RAS Table incorrect checksum or error:%d\n",
+ res);
+ return -EINVAL;
+ }
+ if (ras->bad_page_cnt_threshold >= control->ras_num_bad_pages) {
/* This means that, the threshold was increased since
* the last time the system was booted, and now,
* ras->bad_page_cnt_threshold - control->num_recs > 0,
@@ -1136,29 +1737,194 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
dev_info(adev->dev,
"records:%d threshold:%d, resetting "
"RAS table header signature",
- control->ras_num_recs,
+ control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
- dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
- if (amdgpu_bad_page_threshold == -2) {
- dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
+ dev_warn(adev->dev,
+ "RAS records:%d exceed threshold:%d\n",
+ control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
+ if ((amdgpu_bad_page_threshold == -1) ||
+ (amdgpu_bad_page_threshold == -2)) {
res = 0;
+ dev_warn(adev->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
} else {
- *exceed_err_limit = true;
- dev_err(adev->dev,
- "RAS records:%d exceed threshold:%d, "
- "GPU will not be initialized. Replace this GPU or increase the threshold",
- control->ras_num_recs, ras->bad_page_cnt_threshold);
+ ras->is_rma = true;
+ dev_warn(adev->dev,
+ "User defined threshold is set, runtime service will be halt when threshold is reached\n");
}
}
- } else {
- DRM_INFO("Creating a new EEPROM table");
-
- res = amdgpu_ras_eeprom_reset_table(control);
}
return res < 0 ? res : 0;
}
+
+void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_eeprom_control *control;
+ int res;
+
+ if (!__is_ras_eeprom_supported(adev) || !ras ||
+ amdgpu_ras_smu_eeprom_supported(adev))
+ return;
+ control = &ras->eeprom_control;
+ if (!control->is_eeprom_valid)
+ return;
+ res = __verify_ras_table_checksum(control);
+ if (res) {
+ dev_warn(adev->dev,
+ "RAS table incorrect checksum or error:%d, try to recover\n",
+ res);
+ if (!amdgpu_ras_eeprom_reset_table(control))
+ if (!amdgpu_ras_save_bad_pages(adev, NULL))
+ if (!__verify_ras_table_checksum(control)) {
+ dev_info(adev->dev, "RAS table recovery succeed\n");
+ return;
+ }
+ dev_err(adev->dev, "RAS table recovery failed\n");
+ control->is_eeprom_valid = false;
+ }
+ return;
+}
+
+static const struct ras_smu_drv *amdgpu_ras_get_smu_ras_drv(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+ if (!ras)
+ return NULL;
+
+ return ras->ras_smu_drv;
+}
+
+static uint64_t amdgpu_ras_smu_get_feature_flags(struct amdgpu_device *adev)
+{
+ const struct ras_smu_drv *ras_smu_drv = amdgpu_ras_get_smu_ras_drv(adev);
+ uint64_t flags = 0ULL;
+
+ if (!ras_smu_drv)
+ goto out;
+
+ if (ras_smu_drv->ras_smu_feature_flags)
+ ras_smu_drv->ras_smu_feature_flags(adev, &flags);
+
+out:
+ return flags;
+}
+
+bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+ uint64_t flags = 0ULL;
+
+ if (!__is_ras_eeprom_supported(adev) || !smu_ras_drv)
+ return false;
+
+ if (!smu_ras_drv->smu_eeprom_funcs)
+ return false;
+
+ flags = amdgpu_ras_smu_get_feature_flags(adev);
+
+ return !!(flags & RAS_SMU_FEATURE_BIT__RAS_EEPROM);
+}
+
+int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev,
+ uint32_t *table_version)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->get_ras_table_version)
+ return smu_ras_drv->smu_eeprom_funcs->get_ras_table_version(adev,
+ table_version);
+ return -EOPNOTSUPP;
+}
+
+int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev,
+ uint32_t *count, uint32_t timeout)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->get_badpage_count)
+ return smu_ras_drv->smu_eeprom_funcs->get_badpage_count(adev,
+ count, timeout);
+ return -EOPNOTSUPP;
+}
+
+int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev,
+ uint16_t index, uint64_t *mca_addr)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr)
+ return smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr(adev,
+ index, mca_addr);
+ return -EOPNOTSUPP;
+}
+
+int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev,
+ uint64_t timestamp)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->set_timestamp)
+ return smu_ras_drv->smu_eeprom_funcs->set_timestamp(adev,
+ timestamp);
+ return -EOPNOTSUPP;
+}
+
+int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev,
+ uint16_t index, uint64_t *timestamp)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->get_timestamp)
+ return smu_ras_drv->smu_eeprom_funcs->get_timestamp(adev,
+ index, timestamp);
+ return -EOPNOTSUPP;
+}
+
+int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev,
+ uint16_t index, uint64_t *ipid)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid)
+ return smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid(adev,
+ index, ipid);
+ return -EOPNOTSUPP;
+}
+
+int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
+ uint32_t *result)
+{
+ const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return -EOPNOTSUPP;
+
+ if (smu_ras_drv->smu_eeprom_funcs->erase_ras_table)
+ return smu_ras_drv->smu_eeprom_funcs->erase_ras_table(adev,
+ result);
+ return -EOPNOTSUPP;
+}