diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 225 |
1 files changed, 195 insertions, 30 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6d1f974e2987..d0307c55da50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms +#define MAX_FLUSH_RETIRE_DWORK_TIMES 100 + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1055,7 +1057,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info; struct ras_err_node *err_node; struct ras_err_info *err_info; - u64 event_id = qctx->event_id; + u64 event_id = qctx->evid.event_id; if (is_ue) { for_each_ras_error(err_node, err_data) { @@ -1140,7 +1142,7 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, { struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); const char *blk_name = get_ras_block_str(&query_if->head); - u64 event_id = qctx->event_id; + u64 event_id = qctx->evid.event_id; if (err_data->ce_count) { if (err_data_has_source_info(err_data)) { @@ -1295,6 +1297,9 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a .head = obj->head, }; + if (!amdgpu_ras_get_error_query_ready(obj->adev)) + return sysfs_emit(buf, "Query currently inaccessible\n"); + if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; @@ -1363,7 +1368,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } /* query/inject/cure begin */ -int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) +static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev, + struct ras_query_if *info, + enum ras_event_type type) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data; @@ -1382,8 +1389,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i return -EINVAL; memset(&qctx, 0, sizeof(qctx)); - qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? - RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); + qctx.evid.type = type; + qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type); if (!down_read_trylock(&adev->reset_domain->sem)) { ret = -EIO; @@ -1412,6 +1419,11 @@ out_fini_err_data: return ret; } +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) +{ + return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID); +} + int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { @@ -1721,6 +1733,39 @@ static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, return sysfs_emit(buf, "schema: 0x%x\n", con->schema); } +static struct { + enum ras_event_type type; + const char *name; +} dump_event[] = { + {RAS_EVENT_TYPE_FATAL, "Fatal Error"}, + {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"}, + {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, +}; + +static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct amdgpu_ras *con = + container_of(attr, struct amdgpu_ras, event_state_attr); + struct ras_event_manager *event_mgr = con->event_mgr; + struct ras_event_state *event_state; + int i, size = 0; + + if (!event_mgr) + return -EINVAL; + + size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); + for (i = 0; i < ARRAY_SIZE(dump_event); i++) { + event_state = &event_mgr->event_state[dump_event[i].type]; + size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n", + dump_event[i].name, + atomic64_read(&event_state->count), + event_state->last_seqno); + } + + return (ssize_t)size; +} + static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1738,6 +1783,7 @@ static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev) &con->features_attr.attr, &con->version_attr.attr, &con->schema_attr.attr, + &con->event_state_attr.attr, NULL }; struct attribute_group group = { @@ -1970,6 +2016,8 @@ static DEVICE_ATTR(version, 0444, amdgpu_ras_sysfs_version_show, NULL); static DEVICE_ATTR(schema, 0444, amdgpu_ras_sysfs_schema_show, NULL); +static DEVICE_ATTR(event_state, 0444, + amdgpu_ras_sysfs_event_state_show, NULL); static int amdgpu_ras_fs_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1980,6 +2028,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) &con->features_attr.attr, &con->version_attr.attr, &con->schema_attr.attr, + &con->event_state_attr.attr, NULL }; struct bin_attribute *bin_attrs[] = { @@ -2002,6 +2051,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) con->schema_attr = dev_attr_schema; sysfs_attr_init(attrs[2]); + /* add event_state entry */ + con->event_state_attr = dev_attr_event_state; + sysfs_attr_init(attrs[3]); + if (amdgpu_bad_page_threshold != 0) { /* add bad_page_features entry */ bin_attr_gpu_vram_bad_pages.private = NULL; @@ -2066,10 +2119,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + u64 event_id; + int ret; if (!block_obj || !con) return; + ret = amdgpu_ras_mark_ras_event(adev, type); + if (ret) + return; + /* both query_poison_status and handle_poison_consumption are optional, * but at least one of them should be implemented if we need poison * consumption handler @@ -2094,8 +2154,10 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. */ if (poison_stat && !con->is_rma) { - dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", - block_obj->ras_comm.name); + event_id = amdgpu_ras_acquire_event_id(adev, type); + RAS_EVENT_LOG(adev, event_id, + "GPU reset for %s RAS poison consumption is issued!\n", + block_obj->ras_comm.name); amdgpu_ras_reset_gpu(adev); } @@ -2106,8 +2168,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, struct amdgpu_iv_entry *entry) { - dev_info(obj->adev->dev, - "Poison is created\n"); + struct amdgpu_device *adev = obj->adev; + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; + u64 event_id; + int ret; + + ret = amdgpu_ras_mark_ras_event(adev, type); + if (ret) + return; + + event_id = amdgpu_ras_acquire_event_id(adev, type); + RAS_EVENT_LOG(adev, event_id, "Poison is created\n"); if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); @@ -2302,7 +2373,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) /* ih end */ /* traversal all IPs except NBIO to query error counter */ -static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) +static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; @@ -2335,7 +2406,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) IP_VERSION(13, 0, 2))) continue; - amdgpu_ras_query_error_status(adev, &info); + amdgpu_ras_query_error_status_with_event(adev, &info, type); if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && @@ -2474,6 +2545,14 @@ bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) return false; } +static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev) +{ + if (amdgpu_ras_intr_triggered()) + return RAS_EVENT_TYPE_FATAL; + else + return RAS_EVENT_TYPE_POISON_CONSUMPTION; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -2482,6 +2561,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + enum ras_event_type type; if (hive) { atomic_set(&hive->ras_recovery, 1); @@ -2509,10 +2589,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) device_list_handle = &device_list; } + type = amdgpu_ras_get_fatal_error_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { amdgpu_ras_query_err_status(remote_adev); - amdgpu_ras_log_on_err_counter(remote_adev); + amdgpu_ras_log_on_err_counter(remote_adev, type); } } @@ -2828,6 +2909,23 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) ecc_log->prev_de_queried_count = 0; } +static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, + uint32_t delayed_ms) +{ + int ret; + + mutex_lock(&con->umc_ecc_log.lock); + ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, + UMC_ECC_NEW_DETECTED_TAG); + mutex_unlock(&con->umc_ecc_log.lock); + + if (ret) + schedule_delayed_work(&con->page_retirement_dwork, + msecs_to_jiffies(delayed_ms)); + + return ret ? true : false; +} + static void amdgpu_ras_do_page_retirement(struct work_struct *work) { struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, @@ -2836,8 +2934,12 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) struct ras_err_data err_data; unsigned long err_cnt; - if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) + /* If gpu reset is ongoing, delay retiring the bad pages */ + if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { + amdgpu_ras_schedule_retirement_dwork(con, + AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3); return; + } amdgpu_ras_error_data_init(&err_data); @@ -2849,12 +2951,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) if (err_cnt && con->is_rma) amdgpu_ras_reset_gpu(adev); - mutex_lock(&con->umc_ecc_log.lock); - if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, - UMC_ECC_NEW_DETECTED_TAG)) - schedule_delayed_work(&con->page_retirement_dwork, - msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL)); - mutex_unlock(&con->umc_ecc_log.lock); + amdgpu_ras_schedule_retirement_dwork(con, + AMDGPU_RAS_RETIRE_PAGE_INTERVAL); } static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, @@ -2869,6 +2967,7 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, uint32_t new_detect_count, total_detect_count; uint32_t need_query_count = poison_creation_count; bool query_data_timeout = false; + enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; @@ -2876,7 +2975,7 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, ecc_log = &ras->umc_ecc_log; total_detect_count = 0; do { - ret = amdgpu_ras_query_error_status(adev, &info); + ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; @@ -3157,11 +3256,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data = con->eh_data; + int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES; + bool ret; /* recovery_init failed to init it, fini is useless */ if (!data) return 0; + /* Save all cached bad pages to eeprom */ + do { + flush_delayed_work(&con->page_retirement_dwork); + ret = amdgpu_ras_schedule_retirement_dwork(con, 0); + } while (ret && max_flush_timeout--); + if (con->page_retirement_thread) kthread_stop(con->page_retirement_thread); @@ -3401,10 +3508,17 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev) static void ras_event_mgr_init(struct ras_event_manager *mgr) { + struct ras_event_state *event_state; int i; - for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++) - atomic64_set(&mgr->seqnos[i], 0); + memset(mgr, 0, sizeof(*mgr)); + atomic64_set(&mgr->seqno, 0); + + for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { + event_state = &mgr->event_state[i]; + event_state->last_seqno = RAS_EVENT_INVALID_ID; + atomic64_set(&event_state->count, 0); + } } static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) @@ -3904,23 +4018,68 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) atomic_set(&ras->fed, !!status); } -bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) +static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); + if (!ras) + return NULL; + + return ras->event_mgr; +} + +int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, + const void *caller) { - return !(id & BIT_ULL(63)); + struct ras_event_manager *event_mgr; + struct ras_event_state *event_state; + int ret = 0; + + if (type >= RAS_EVENT_TYPE_COUNT) { + ret = -EINVAL; + goto out; + } + + event_mgr = __get_ras_event_mgr(adev); + if (!event_mgr) { + ret = -EINVAL; + goto out; + } + + event_state = &event_mgr->event_state[type]; + event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); + atomic64_inc(&event_state->count); + +out: + if (ret && caller) + dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", + (int)type, caller, ret); + + return ret; } u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type) { - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + struct ras_event_manager *event_mgr; u64 id; + if (type >= RAS_EVENT_TYPE_COUNT) + return RAS_EVENT_INVALID_ID; + switch (type) { - case RAS_EVENT_TYPE_ISR: - id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]); + case RAS_EVENT_TYPE_FATAL: + case RAS_EVENT_TYPE_POISON_CREATION: + case RAS_EVENT_TYPE_POISON_CONSUMPTION: + event_mgr = __get_ras_event_mgr(adev); + if (!event_mgr) + return RAS_EVENT_INVALID_ID; + + id = event_mgr->event_state[type].last_seqno; break; case RAS_EVENT_TYPE_INVALID: default: - id = BIT_ULL(63) | 0ULL; + id = RAS_EVENT_INVALID_ID; break; } @@ -3931,7 +4090,13 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]); + enum ras_event_type type = RAS_EVENT_TYPE_FATAL; + u64 event_id; + + if (amdgpu_ras_mark_ras_event(adev, type)) + return; + + event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); @@ -4665,7 +4830,7 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, vaf.fmt = fmt; vaf.va = &args; - if (amdgpu_ras_event_id_is_valid(adev, event_id)) + if (RAS_EVENT_ID_IS_VALID(event_id)) dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); else dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); |