Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  287
1 files changed, 267 insertions, 20 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index a6c88f2fe6e5..74b4349e345a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,6 +40,11 @@
 #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
 #define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218
 
+#define XGMI_STATE_DISABLE 0xD1
+#define XGMI_STATE_LS0 0x81
+#define XGMI_LINK_ACTIVE 1
+#define XGMI_LINK_INACTIVE 0
+
 static DEFINE_MUTEX(xgmi_mutex);
 
 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
@@ -289,6 +294,42 @@ static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
 	 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
 };
 
+static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int global_link_num)
+{
+	const u32 smnpcs_xgmi3x16_pcs_state_hist1 = 0x11a00070;
+	const int xgmi_inst = 2;
+	u32 link_inst;
+	u64 addr;
+
+	link_inst = global_link_num % xgmi_inst;
+
+	addr = (smnpcs_xgmi3x16_pcs_state_hist1 | (link_inst << 20)) +
+		adev->asic_funcs->encode_ext_smn_addressing(global_link_num / xgmi_inst);
+
+	return RREG32_PCIE_EXT(addr);
+}
+
+int amdgpu_get_xgmi_link_status(struct amdgpu_device *adev, int global_link_num)
+{
+	u32 xgmi_state_reg_val;
+
+	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+	case IP_VERSION(6, 4, 0):
+		xgmi_state_reg_val = xgmi_v6_4_get_link_status(adev, global_link_num);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_DISABLE)
+		return -ENOLINK;
+
+	if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_LS0)
+		return XGMI_LINK_ACTIVE;
+
+	return XGMI_LINK_INACTIVE;
+}
+
 /**
  * DOC: AMDGPU XGMI Support
  *
@@ -434,6 +475,9 @@ static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev,
 		}
 	}
 
+	if (i == top->num_nodes)
+		return -EINVAL;
+
 	for (i = 0; i < top->num_nodes; i++) {
 		for (j = 0; j < top->nodes[i].num_links; j++)
 			/* node id in sysfs starts from 1 rather than 0 so +1 here */
@@ -664,6 +708,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 	task_barrier_init(&hive->tb);
 	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
 	hive->hi_req_gpu = NULL;
+	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
 
 	/*
 	 * hive pstate on boot is high in vega20 so we have to go to low
@@ -797,6 +842,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
 	return -EINVAL;
 }
 
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+					struct amdgpu_device *peer_adev)
+{
+	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
+	int i;
+
+	/* Sharing should always be enabled for non-SRIOV. */
+	if (!amdgpu_sriov_vf(adev))
+		return true;
+
+	for (i = 0 ; i < top->num_nodes; ++i)
+		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
+			return !!top->nodes[i].is_sharing_enabled;
+
+	return false;
+}
+
 /*
  * Devices that support extended data require the entire hive to initialize with
  * the shared memory buffer flag set.
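For reference, the link-state decode added above is small enough to show on its own: xgmi_v6_4_get_link_status() reads the PCS state-history register through the extended SMN aperture ((0x11a00070 | (link_inst << 20)) plus the encoded extended-SMN offset for global_link_num / 2), and amdgpu_get_xgmi_link_status() only inspects the low byte of that value: 0xD1 means the link is disabled (-ENOLINK), 0x81 (LS0) means active, anything else is reported inactive; unsupported XGMI IP versions return -EOPNOTSUPP before the decode. Below is a minimal standalone sketch of that decode, not kernel code; the decode_link_state() helper and the register values fed to it are hypothetical.

/* Standalone sketch mirroring the state decode done by
 * amdgpu_get_xgmi_link_status() above; constants are taken from the patch,
 * decode_link_state() and the sample values are illustrative only.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define XGMI_STATE_DISABLE  0xD1
#define XGMI_STATE_LS0      0x81
#define XGMI_LINK_ACTIVE    1
#define XGMI_LINK_INACTIVE  0

/* Only the lowest byte of the state-history readout carries the link state. */
static int decode_link_state(uint32_t state_hist1)
{
	uint8_t state = state_hist1 & 0xFF;

	if (state == XGMI_STATE_DISABLE)
		return -ENOLINK;		/* link is disabled */
	if (state == XGMI_STATE_LS0)
		return XGMI_LINK_ACTIVE;	/* link trained and in LS0 */
	return XGMI_LINK_INACTIVE;		/* any other state: not active */
}

int main(void)
{
	/* Hypothetical register readouts. */
	printf("%d\n", decode_link_state(0x000000D1));	/* -ENOLINK */
	printf("%d\n", decode_link_state(0x00000081));	/* 1: active */
	printf("%d\n", decode_link_state(0x00000011));	/* 0: inactive */
	return 0;
}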
@@ -857,8 +919,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (!adev->gmc.xgmi.pending_reset &&
-	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp, false, true);
 		if (ret) {
 			dev_err(adev->dev,
@@ -904,8 +965,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (!adev->gmc.xgmi.pending_reset &&
-	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
@@ -982,7 +1042,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		}
 	}
 
-	if (!ret && !adev->gmc.xgmi.pending_reset)
+	if (!ret)
 		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
 exit_unlock:
@@ -1035,15 +1095,88 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 	return 0;
 }
 
+static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
+				       enum aca_smu_type type, void *data)
+{
+	struct amdgpu_device *adev = handle->adev;
+	struct aca_bank_info info;
+	const char *error_str;
+	u64 status, count;
+	int ret, ext_error_code;
+
+	ret = aca_bank_info_decode(bank, &info);
+	if (ret)
+		return ret;
+
+	status = bank->regs[ACA_REG_IDX_STATUS];
+	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
+
+	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
+		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
+	if (error_str)
+		dev_info(adev->dev, "%s detected\n", error_str);
+
+	count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+
+	switch (type) {
+	case ACA_SMU_TYPE_UE:
+		if (ext_error_code != 0 && ext_error_code != 9)
+			count = 0ULL;
+
+		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
+		break;
+	case ACA_SMU_TYPE_CE:
+		count = ext_error_code == 6 ? count : 0ULL;
+		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
+	.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
+};
+
+static const struct aca_info xgmi_v6_4_0_aca_info = {
+	.hwip = ACA_HWIP_TYPE_PCS_XGMI,
+	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
+	.bank_ops = &xgmi_v6_4_0_aca_bank_ops,
+};
+
 static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
 {
+	int r;
+
 	if (!adev->gmc.xgmi.supported ||
 	    adev->gmc.xgmi.num_physical_nodes == 0)
 		return 0;
 
 	amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
 
-	return amdgpu_ras_block_late_init(adev, ras_block);
+	r = amdgpu_ras_block_late_init(adev, ras_block);
+	if (r)
+		return r;
+
+	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+	case IP_VERSION(6, 4, 0):
+		r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
+					&xgmi_v6_4_0_aca_info, NULL);
+		if (r)
+			goto late_fini;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+
+late_fini:
+	amdgpu_ras_block_late_fini(adev, ras_block);
+
+	return r;
 }
 
 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
@@ -1099,7 +1232,7 @@ static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
 
 static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
 {
-	WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
+	WREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS, 0ULL);
 }
 
 static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
@@ -1277,12 +1410,12 @@ static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
 	err_data->ce_count += ce_cnt;
 }
 
-static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
+static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
 {
 	const char *error_str;
 	int ext_error_code;
 
-	ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);
+	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);
 
 	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
 		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
@@ -1291,9 +1424,9 @@ static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdg
 
 	switch (ext_error_code) {
 	case 0:
-		return AMDGPU_MCA_ERROR_TYPE_UE;
+		return ACA_ERROR_TYPE_UE;
 	case 6:
-		return AMDGPU_MCA_ERROR_TYPE_CE;
+		return ACA_ERROR_TYPE_CE;
 	default:
 		return -EINVAL;
 	}
@@ -1307,22 +1440,22 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a
 	int xgmi_inst = mcm_info->die_id;
 	u64 status = 0;
 
-	status = RREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS);
-	if (!MCA_REG__STATUS__VAL(status))
+	status = RREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS);
+	if (!ACA_REG__STATUS__VAL(status))
 		return;
 
 	switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
-	case AMDGPU_MCA_ERROR_TYPE_UE:
-		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
+	case ACA_ERROR_TYPE_UE:
+		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
 		break;
-	case AMDGPU_MCA_ERROR_TYPE_CE:
-		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
+	case ACA_ERROR_TYPE_CE:
+		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
 		break;
 	default:
 		break;
 	}
 
-	WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
+	WREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS, 0ULL);
 }
 
 static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
@@ -1370,7 +1503,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
 		dev_warn(adev->dev, "Failed to disallow df cstate");
 
-	ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW);
+	ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DISALLOW);
 	if (ret1 && ret1 != -EOPNOTSUPP)
 		dev_warn(adev->dev, "Failed to disallow XGMI power down");
 
@@ -1379,7 +1512,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
 	if (amdgpu_ras_intr_triggered())
 		return ret2;
 
-	ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT);
+	ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DEFAULT);
 	if (ret1 && ret1 != -EOPNOTSUPP)
 		dev_warn(adev->dev, "Failed to allow XGMI power down");
 
@@ -1424,3 +1557,117 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
 
 	return 0;
 }
+
+static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
+{
+	struct amdgpu_hive_info *hive =
+		container_of(work, struct amdgpu_hive_info, reset_on_init_work);
+	struct amdgpu_reset_context reset_context;
+	struct amdgpu_device *tmp_adev;
+	struct list_head device_list;
+	int r;
+
+	mutex_lock(&hive->hive_lock);
+
+	INIT_LIST_HEAD(&device_list);
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+		list_add_tail(&tmp_adev->reset_list, &device_list);
+
+	tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
+				    reset_list);
+	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+
+	reset_context.method = AMD_RESET_METHOD_ON_INIT;
+	reset_context.reset_req_dev = tmp_adev;
+	reset_context.hive = hive;
+	reset_context.reset_device_list = &device_list;
+	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
+	amdgpu_reset_do_xgmi_reset_on_init(&reset_context);
+	mutex_unlock(&hive->hive_lock);
+	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		r = amdgpu_ras_init_badpage_info(tmp_adev);
+		if (r && r != -EHWPOISON)
+			dev_err(tmp_adev->dev,
+				"error during bad page data initialization");
+	}
+}
+
+static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
+{
+	INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work);
+	amdgpu_reset_domain_schedule(hive->reset_domain,
+				     &hive->reset_on_init_work);
+}
+
+int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_hive_info *hive;
+	bool reset_scheduled;
+	int num_devs;
+
+	hive = amdgpu_get_xgmi_hive(adev);
+	if (!hive)
+		return -EINVAL;
+
+	mutex_lock(&hive->hive_lock);
+	num_devs = atomic_read(&hive->number_devices);
+	reset_scheduled = false;
+	if (num_devs == adev->gmc.xgmi.num_physical_nodes) {
+		amdgpu_xgmi_schedule_reset_on_init(hive);
+		reset_scheduled = true;
+	}
+
+	mutex_unlock(&hive->hive_lock);
+	amdgpu_put_xgmi_hive(hive);
+
+	if (reset_scheduled)
+		flush_work(&hive->reset_on_init_work);
+
+	return 0;
+}
+
+int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
+				   struct amdgpu_hive_info *hive,
+				   int req_nps_mode)
+{
+	struct amdgpu_device *tmp_adev;
+	int cur_nps_mode, r;
+
+	/* This is expected to be called only during unload of driver. The
+	 * request needs to be placed only once for all devices in the hive. If
+	 * one of them fail, revert the request for previous successful devices.
+	 * After placing the request, make hive mode as UNKNOWN so that other
+	 * devices don't request anymore.
+	 */
+	mutex_lock(&hive->hive_lock);
+	if (atomic_read(&hive->requested_nps_mode) ==
+	    UNKNOWN_MEMORY_PARTITION_MODE) {
+		dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
+		mutex_unlock(&hive->hive_lock);
+		return 0;
+	}
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		r = adev->gmc.gmc_funcs->request_mem_partition_mode(
+			tmp_adev, req_nps_mode);
+		if (r)
+			break;
+	}
+	if (r) {
+		/* Request back current mode if one of the requests failed */
+		cur_nps_mode =
+			adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
+		list_for_each_entry_continue_reverse(
+			tmp_adev, &hive->device_list, gmc.xgmi.head)
+			adev->gmc.gmc_funcs->request_mem_partition_mode(
+				tmp_adev, cur_nps_mode);
+	}
+	/* Set to UNKNOWN so that other devices don't request anymore */
+	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
+	mutex_unlock(&hive->hive_lock);
+
+	return r;
+}
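The comment in amdgpu_xgmi_request_nps_change() above describes an all-or-revert pattern: the NPS request is placed on every device in the hive, and if any device fails, list_for_each_entry_continue_reverse() walks back over the devices that already accepted the request and restores the current mode; requested_nps_mode is then set back to UNKNOWN so later callers return early instead of re-issuing the request. Below is a minimal standalone sketch of the same pattern, not kernel code; an array stands in for the hive's device list and the apply()/revert() helpers are hypothetical.

/* Standalone sketch of the apply-to-all, roll-back-on-failure pattern used
 * by amdgpu_xgmi_request_nps_change() above.
 */
#include <stdio.h>

static int apply(int dev, int mode)
{
	/* Pretend device 2 rejects the new mode. */
	if (dev == 2)
		return -1;
	printf("device %d: requested mode %d\n", dev, mode);
	return 0;
}

static void revert(int dev, int mode)
{
	printf("device %d: reverted to mode %d\n", dev, mode);
}

int main(void)
{
	int devices[] = { 0, 1, 2, 3 };
	int ndev = sizeof(devices) / sizeof(devices[0]);
	int req_mode = 4, cur_mode = 1;
	int i, r = 0;

	for (i = 0; i < ndev; i++) {
		r = apply(devices[i], req_mode);
		if (r)
			break;
	}

	if (r) {
		/* Same idea as list_for_each_entry_continue_reverse(): undo
		 * only the entries that succeeded before the failing one. */
		while (--i >= 0)
			revert(devices[i], cur_mode);
	}

	return r ? 1 : 0;
}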