diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 275 |
1 files changed, 182 insertions, 93 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 493982f94649..3835f2592914 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -28,17 +28,7 @@ #define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype} -typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data); - -struct aca_banks { - int nr_banks; - struct list_head list; -}; - -struct aca_hwip { - int hwid; - int mcatype; -}; +typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = { ACA_BANK_HWID(SMU, 0x01, 0x01), @@ -80,13 +70,16 @@ static void aca_banks_release(struct aca_banks *banks) { struct aca_bank_node *node, *tmp; + if (list_empty(&banks->list)) + return; + list_for_each_entry_safe(node, tmp, &banks->list, node) { list_del(&node->node); kvfree(node); } } -static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count) +static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count) { struct amdgpu_aca *aca = &adev->aca; const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -108,7 +101,7 @@ static struct aca_regs_dump { {"STATUS", ACA_REG_IDX_STATUS}, {"ADDR", ACA_REG_IDX_ADDR}, {"MISC", ACA_REG_IDX_MISC0}, - {"CONFIG", ACA_REG_IDX_CONFG}, + {"CONFIG", ACA_REG_IDX_CONFIG}, {"IPID", ACA_REG_IDX_IPID}, {"SYND", ACA_REG_IDX_SYND}, {"DESTAT", ACA_REG_IDX_DESTAT}, @@ -116,20 +109,25 @@ static struct aca_regs_dump { {"CONTROL_MASK", ACA_REG_IDX_CTL_MASK}, }; -static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank) +static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank, + struct ras_query_context *qctx) { + u64 event_id = qctx ? qctx->evid.event_id : RAS_EVENT_INVALID_ID; int i; - dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n"); + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ for (i = 0; i < ARRAY_SIZE(aca_regs); i++) - dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", - idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); + RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", + idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); + + if (ACA_REG__STATUS__SCRUB(bank->regs[ACA_REG_IDX_STATUS])) + RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n"); } -static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type, +static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, int start, int count, - struct aca_banks *banks) + struct aca_banks *banks, struct ras_query_context *qctx) { struct amdgpu_aca *aca = &adev->aca; const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -143,18 +141,17 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro return -EOPNOTSUPP; switch (type) { - case ACA_ERROR_TYPE_UE: + case ACA_SMU_TYPE_UE: max_count = smu_funcs->max_ue_bank_count; break; - case ACA_ERROR_TYPE_CE: + case ACA_SMU_TYPE_CE: max_count = smu_funcs->max_ce_bank_count; break; - case ACA_ERROR_TYPE_DEFERRED: default: return -EINVAL; } - if (start + count >= max_count) + if (start + count > max_count) return -EINVAL; count = min_t(int, count, max_count); @@ -164,7 +161,9 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro if (ret) return ret; - aca_smu_bank_dump(adev, i, count, &bank); + bank.smu_err_type = type; + + aca_smu_bank_dump(adev, i, count, &bank, qctx); ret = aca_banks_add_bank(banks, &bank); if (ret) @@ -195,10 +194,14 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t return hwip->hwid == hwid && hwip->mcatype == mcatype; } -static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type) +static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) { const struct aca_bank_ops *bank_ops = handle->bank_ops; + /* Parse all deferred errors with UMC aca handle */ + if (ACA_BANK_ERR_IS_DEFFERED(bank)) + return handle->hwip == ACA_HWIP_TYPE_UMC; + if (!aca_bank_hwip_is_matched(bank, handle->hwip)) return false; @@ -273,59 +276,49 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_ return new_bank_error(aerr, info); } -static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type, - struct aca_bank_report *report) +int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, + enum aca_error_type type, u64 count) { struct aca_error_cache *error_cache = &handle->error_cache; struct aca_bank_error *bank_error; struct aca_error *aerr; - if (!handle || !report) + if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT) return -EINVAL; - if (!report->count[type]) + if (!count) return 0; aerr = &error_cache->errors[type]; - bank_error = get_bank_error(aerr, &report->info); + bank_error = get_bank_error(aerr, info); if (!bank_error) return -ENOMEM; - bank_error->count[type] += report->count[type]; + bank_error->count += count; return 0; } -static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, struct aca_bank_report *report) +static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) { const struct aca_bank_ops *bank_ops = handle->bank_ops; - if (!bank || !report) + if (!bank) return -EINVAL; - if (!bank_ops->aca_bank_generate_report) + if (!bank_ops->aca_bank_parser) return -EOPNOTSUPP; - memset(report, 0, sizeof(*report)); - return bank_ops->aca_bank_generate_report(handle, bank, type, - report, handle->data); + return bank_ops->aca_bank_parser(handle, bank, type, + handle->data); } static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type type, void *data) { - struct aca_bank_report report; int ret; - ret = aca_generate_bank_report(handle, bank, type, &report); - if (ret) - return ret; - - if (!report.count[type]) - return 0; - - ret = aca_log_errors(handle, type, &report); + ret = aca_bank_parser(handle, bank, type); if (ret) return ret; @@ -333,7 +326,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank } static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, - enum aca_error_type type, bank_handler_t handler, void *data) + enum aca_smu_type type, bank_handler_t handler, void *data) { struct aca_handle *handle; int ret; @@ -354,7 +347,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba } static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, - enum aca_error_type type, bank_handler_t handler, void *data) + enum aca_smu_type type, bank_handler_t handler, void *data) { struct aca_bank_node *node; struct aca_bank *bank; @@ -378,8 +371,78 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks * return 0; } -static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type, - bank_handler_t handler, void *data) +static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) +{ + struct amdgpu_aca *aca = &adev->aca; + bool ret = true; + + /* + * Because the UE Valid MCA count will only be cleared after reset, + * in order to avoid repeated counting of the error count, + * the aca bank is only updated once during the gpu recovery stage. + */ + if (type == ACA_SMU_TYPE_UE) { + if (amdgpu_ras_intr_triggered()) + ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0; + else + atomic_set(&aca->ue_update_flag, 0); + } + + return ret; +} + +static void aca_banks_generate_cper(struct amdgpu_device *adev, + enum aca_smu_type type, + struct aca_banks *banks, + int count) +{ + struct aca_bank_node *node; + struct aca_bank *bank; + int r; + + if (!adev->cper.enabled) + return; + + if (!banks || !count) { + dev_warn(adev->dev, "fail to generate cper records\n"); + return; + } + + /* UEs must be encoded into separate CPER entries */ + if (type == ACA_SMU_TYPE_UE) { + struct aca_banks de_banks; + + aca_banks_init(&de_banks); + list_for_each_entry(node, &banks->list, node) { + bank = &node->bank; + if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) { + r = aca_banks_add_bank(&de_banks, bank); + if (r) + dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r); + } else { + if (amdgpu_cper_generate_ue_record(adev, bank)) + dev_warn(adev->dev, "fail to generate ue cper records\n"); + } + } + + if (!list_empty(&de_banks.list)) { + if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks)) + dev_warn(adev->dev, "fail to generate de cper records\n"); + } + + aca_banks_release(&de_banks); + } else { + /* + * SMU_TYPE_CE banks are combined into 1 CPER entries, + * they could be CEs or DEs or both + */ + if (amdgpu_cper_generate_ce_records(adev, banks, count)) + dev_warn(adev->dev, "fail to generate ce cper records\n"); + } +} + +static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, + bank_handler_t handler, struct ras_query_context *qctx, void *data) { struct amdgpu_aca *aca = &adev->aca; struct aca_banks banks; @@ -389,9 +452,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type if (list_empty(&aca->mgr.list)) return 0; - /* NOTE: pmfw is only support UE and CE */ - if (type == ACA_ERROR_TYPE_DEFERRED) - type = ACA_ERROR_TYPE_CE; + if (!aca_bank_should_update(adev, type)) + return 0; ret = aca_smu_get_valid_aca_count(adev, type, &count); if (ret) @@ -402,7 +464,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type aca_banks_init(&banks); - ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks); + ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx); if (ret) goto err_release_banks; @@ -416,6 +478,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type if (ret) goto err_release_banks; + aca_banks_generate_cper(adev, type, &banks, count); + err_release_banks: aca_banks_release(&banks); @@ -431,7 +495,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er if (type >= ACA_ERROR_TYPE_COUNT) return -EINVAL; - count = bank_error->count[type]; + count = bank_error->count; if (!count) return 0; @@ -441,12 +505,14 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er switch (type) { case ACA_ERROR_TYPE_UE: - amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, NULL, count); + amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, count); break; case ACA_ERROR_TYPE_CE: - amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count); + amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, count); break; case ACA_ERROR_TYPE_DEFERRED: + amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, count); + break; default: break; } @@ -477,15 +543,32 @@ out_unlock: } static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, - struct ras_err_data *err_data) + struct ras_err_data *err_data, struct ras_query_context *qctx) { + enum aca_smu_type smu_type; int ret; - /* udpate aca bank to aca source error_cache first */ - ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL); + switch (type) { + case ACA_ERROR_TYPE_UE: + smu_type = ACA_SMU_TYPE_UE; + break; + case ACA_ERROR_TYPE_CE: + case ACA_ERROR_TYPE_DEFERRED: + smu_type = ACA_SMU_TYPE_CE; + break; + default: + return -EINVAL; + } + + /* update aca bank to aca source error_cache first */ + ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL); if (ret) return ret; + /* DEs may contain in CEs or UEs */ + if (type != ACA_ERROR_TYPE_DEFERRED) + aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data); + return aca_log_aca_error(handle, type, err_data); } @@ -498,20 +581,19 @@ static bool aca_handle_is_valid(struct aca_handle *handle) } int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, - enum aca_error_type type, void *data) + enum aca_error_type type, struct ras_err_data *err_data, + struct ras_query_context *qctx) { - struct ras_err_data *err_data = (struct ras_err_data *)data; - if (!handle || !err_data) return -EINVAL; if (aca_handle_is_valid(handle)) return -EOPNOTSUPP; - if (!(BIT(type) & handle->mask)) + if ((type < 0) || (!(BIT(type) & handle->mask))) return 0; - return __aca_get_error_data(adev, handle, type, err_data); + return __aca_get_error_data(adev, handle, type, err_data, qctx); } static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) @@ -536,9 +618,13 @@ static void aca_error_fini(struct aca_error *aerr) struct aca_bank_error *bank_error, *tmp; mutex_lock(&aerr->lock); + if (list_empty(&aerr->list)) + goto out_unlock; + list_for_each_entry_safe(bank_error, tmp, &aerr->list, node) aca_bank_error_remove(aerr, bank_error); +out_unlock: mutex_destroy(&aerr->lock); } @@ -654,13 +740,17 @@ static void aca_manager_fini(struct aca_handle_manager *mgr) { struct aca_handle *handle, *tmp; + if (list_empty(&mgr->list)) + return; + list_for_each_entry_safe(handle, tmp, &mgr->list, node) amdgpu_aca_remove_handle(handle); } bool amdgpu_aca_is_enabled(struct amdgpu_device *adev) { - return adev->aca.is_enabled; + return (adev->aca.is_enabled || + adev->debug_enable_ras_aca); } int amdgpu_aca_init(struct amdgpu_device *adev) @@ -668,6 +758,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev) struct amdgpu_aca *aca = &adev->aca; int ret; + atomic_set(&aca->ue_update_flag, 0); + ret = aca_manager_init(&aca->mgr); if (ret) return ret; @@ -680,13 +772,17 @@ void amdgpu_aca_fini(struct amdgpu_device *adev) struct amdgpu_aca *aca = &adev->aca; aca_manager_fini(&aca->mgr); + + atomic_set(&aca->ue_update_flag, 0); } int amdgpu_aca_reset(struct amdgpu_device *adev) { - amdgpu_aca_fini(adev); + struct amdgpu_aca *aca = &adev->aca; - return amdgpu_aca_init(adev); + atomic_set(&aca->ue_update_flag, 0); + + return 0; } void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs) @@ -723,23 +819,13 @@ int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info) static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank) { - int error_code; - - switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { - case IP_VERSION(13, 0, 6): - if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) { - error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]); - return error_code & 0xff; - } - break; - default: - break; - } + struct amdgpu_aca *aca = &adev->aca; + const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; - /* NOTE: the true error code is encoded in status.errorcode[0:7] */ - error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]); + if (!smu_funcs || !smu_funcs->parse_error_code) + return -EOPNOTSUPP; - return error_code & 0xff; + return smu_funcs->parse_error_code(adev, bank); } int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size) @@ -750,6 +836,9 @@ int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank return -EINVAL; error_code = aca_bank_get_error_code(adev, bank); + if (error_code < 0) + return error_code; + for (i = 0; i < size; i++) { if (err_codes[i] == error_code) return 0; @@ -784,7 +873,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val) return 0; } -static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx) +static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx) { struct aca_bank_info info; int i, ret; @@ -793,7 +882,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e if (ret) return; - seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_ERROR_TYPE_UE ? "UE" : "CE"); + seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE"); seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n", idx, info.socket_id, info.die_id, info.hwid, info.mcatype); @@ -807,7 +896,7 @@ struct aca_dump_context { }; static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank, - enum aca_error_type type, void *data) + enum aca_smu_type type, void *data) { struct aca_dump_context *ctx = (struct aca_dump_context *)data; @@ -816,7 +905,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban return handler_aca_log_bank_error(handle, bank, type, NULL); } -static int aca_dump_show(struct seq_file *m, enum aca_error_type type) +static int aca_dump_show(struct seq_file *m, enum aca_smu_type type) { struct amdgpu_device *adev = (struct amdgpu_device *)m->private; struct aca_dump_context context = { @@ -824,12 +913,12 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type) .idx = 0, }; - return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context); + return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context); } static int aca_dump_ce_show(struct seq_file *m, void *unused) { - return aca_dump_show(m, ACA_ERROR_TYPE_CE); + return aca_dump_show(m, ACA_SMU_TYPE_CE); } static int aca_dump_ce_open(struct inode *inode, struct file *file) @@ -847,7 +936,7 @@ static const struct file_operations aca_ce_dump_debug_fops = { static int aca_dump_ue_show(struct seq_file *m, void *unused) { - return aca_dump_show(m, ACA_ERROR_TYPE_UE); + return aca_dump_show(m, ACA_SMU_TYPE_UE); } static int aca_dump_ue_open(struct inode *inode, struct file *file) @@ -869,7 +958,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(aca_debug_mode_fops, NULL, amdgpu_aca_smu_debug_mode_se void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) { #if defined(CONFIG_DEBUG_FS) - if (!root || adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 6)) + if (!root) return; debugfs_create_file("aca_debug_mode", 0200, root, adev, &aca_debug_mode_fops); |