Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  351
1 files changed, 326 insertions, 25 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 20d51f6c9bb8..d9ad37711c3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,6 +40,11 @@
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218
+#define XGMI_STATE_DISABLE 0xD1
+#define XGMI_STATE_LS0 0x81
+#define XGMI_LINK_ACTIVE 1
+#define XGMI_LINK_INACTIVE 0
+
static DEFINE_MUTEX(xgmi_mutex);
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
@@ -289,6 +294,72 @@ static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = {
SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)},
};
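+/* Translate a logical XGMI link number into the extended link index used by
+ * XGMI 6.4.x parts via a fixed remap table; other IP versions and
+ * out-of-range link numbers return -EINVAL.
+ */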
+int amdgpu_xgmi_get_ext_link(struct amdgpu_device *adev, int link_num)
+{
+ int link_map_6_4_x[8] = { 0, 3, 1, 2, 7, 6, 4, 5 };
+
+ switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+ case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
+ if (link_num < ARRAY_SIZE(link_map_6_4_x))
+ return link_map_6_4_x[link_num];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return -EINVAL;
+}
+
+static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int global_link_num)
+{
+ const u32 smn_xgmi_6_4_pcs_state_hist1[2] = { 0x11a00070, 0x11b00070 };
+ const u32 smn_xgmi_6_4_1_pcs_state_hist1[2] = { 0x12100070,
+ 0x11b00070 };
+ u32 i, n;
+ u64 addr;
+
+ switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+ case IP_VERSION(6, 4, 0):
+ n = ARRAY_SIZE(smn_xgmi_6_4_pcs_state_hist1);
+ addr = smn_xgmi_6_4_pcs_state_hist1[global_link_num % n];
+ break;
+ case IP_VERSION(6, 4, 1):
+ n = ARRAY_SIZE(smn_xgmi_6_4_1_pcs_state_hist1);
+ addr = smn_xgmi_6_4_1_pcs_state_hist1[global_link_num % n];
+ break;
+ default:
+ return U32_MAX;
+ }
+
+ i = global_link_num / n;
+ addr += adev->asic_funcs->encode_ext_smn_addressing(i);
+
+ return RREG32_PCIE_EXT(addr);
+}
+
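+/* Report the state of a global XGMI link: XGMI_LINK_ACTIVE when the PCS state
+ * reads LS0, XGMI_LINK_INACTIVE for any other state, -ENOLINK when the link is
+ * disabled and -EOPNOTSUPP on unsupported IP versions.
+ */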
+int amdgpu_get_xgmi_link_status(struct amdgpu_device *adev, int global_link_num)
+{
+ u32 xgmi_state_reg_val;
+
+ switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+ case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
+ xgmi_state_reg_val = xgmi_v6_4_get_link_status(adev, global_link_num);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_DISABLE)
+ return -ENOLINK;
+
+ if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_LS0)
+ return XGMI_LINK_ACTIVE;
+
+ return XGMI_LINK_INACTIVE;
+}
+
/**
* DOC: AMDGPU XGMI Support
*
@@ -434,6 +505,9 @@ static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev,
}
}
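+ /* The search above found no matching node in the topology. */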
+ if (i == top->num_nodes)
+ return -EINVAL;
+
for (i = 0; i < top->num_nodes; i++) {
for (j = 0; j < top->nodes[i].num_links; j++)
/* node id in sysfs starts from 1 rather than 0 so +1 here */
@@ -664,6 +738,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
task_barrier_init(&hive->tb);
hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
hive->hi_req_gpu = NULL;
+ atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
/*
* hive pstate on boot is high in vega20 so we have to go to low
@@ -773,28 +848,88 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
* num_hops[2:0] = number of hops
*/
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
- struct amdgpu_device *peer_adev)
+ struct amdgpu_device *peer_adev)
{
struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
uint8_t num_hops_mask = 0x7;
int i;
+ if (!adev->gmc.xgmi.supported)
+ return 0;
+
for (i = 0 ; i < top->num_nodes; ++i)
if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
return top->nodes[i].num_hops & num_hops_mask;
- return -EINVAL;
+
+ dev_err(adev->dev, "Failed to get xgmi hops count for peer %d.\n",
+ peer_adev->gmc.xgmi.physical_node_id);
+
+ return 0;
+}
+
+int amdgpu_xgmi_get_bandwidth(struct amdgpu_device *adev, struct amdgpu_device *peer_adev,
+ enum amdgpu_xgmi_bw_mode bw_mode, enum amdgpu_xgmi_bw_unit bw_unit,
+ uint32_t *min_bw, uint32_t *max_bw)
+{
+ bool peer_mode = bw_mode == AMDGPU_XGMI_BW_MODE_PER_PEER;
+ int unit_scale = bw_unit == AMDGPU_XGMI_BW_UNIT_MBYTES ? 1000 : 1;
+ int num_lanes = adev->gmc.xgmi.max_width;
+ int speed = adev->gmc.xgmi.max_speed;
+ int num_links = !peer_mode ? 1 : -1;
+
+ if (!(min_bw && max_bw))
+ return -EINVAL;
+
+ *min_bw = 0;
+ *max_bw = 0;
+
+ if (!adev->gmc.xgmi.supported)
+ return -ENODATA;
+
+ if (peer_mode && !peer_adev)
+ return -EINVAL;
+
+ if (peer_mode) {
+ struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
+ int i;
+
+ for (i = 0 ; i < top->num_nodes; ++i) {
+ if (top->nodes[i].node_id != peer_adev->gmc.xgmi.node_id)
+ continue;
+
+ num_links = top->nodes[i].num_links;
+ break;
+ }
+ }
+
+ if (num_links == -1) {
+ dev_err(adev->dev, "Failed to get number of xgmi links for peer %d.\n",
+ peer_adev->gmc.xgmi.physical_node_id);
+ } else if (num_links) {
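+ /* Per-link bandwidth: max_speed (GT/s) times lane width, converted to
+ * GB/s (or MB/s when AMDGPU_XGMI_BW_UNIT_MBYTES is requested).
+ */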
+ int per_link_bw = (speed * num_lanes * unit_scale)/BITS_PER_BYTE;
+
+ *min_bw = per_link_bw;
+ *max_bw = num_links * per_link_bw;
+ }
+
+ return 0;
}
-int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
- struct amdgpu_device *peer_adev)
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+ struct amdgpu_device *peer_adev)
{
struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
int i;
+ /* Sharing should always be enabled for non-SRIOV. */
+ if (!amdgpu_sriov_vf(adev))
+ return true;
+
for (i = 0 ; i < top->num_nodes; ++i)
if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
- return top->nodes[i].num_links;
- return -EINVAL;
+ return !!top->nodes[i].is_sharing_enabled;
+
+ return false;
}
/*
@@ -857,8 +992,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported)
return 0;
- if (!adev->gmc.xgmi.pending_reset &&
- amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+ if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
ret = psp_xgmi_initialize(&adev->psp, false, true);
if (ret) {
dev_err(adev->dev,
@@ -904,8 +1038,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
task_barrier_add_task(&hive->tb);
- if (!adev->gmc.xgmi.pending_reset &&
- amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+ if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
/* update node list for other device in the hive */
if (tmp_adev != adev) {
@@ -982,7 +1115,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
}
}
- if (!ret && !adev->gmc.xgmi.pending_reset)
+ if (!ret)
ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
exit_unlock:
@@ -1035,15 +1168,16 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
return 0;
}
-static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
- struct aca_bank_report *report, void *data)
+static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
+ enum aca_smu_type type, void *data)
{
struct amdgpu_device *adev = handle->adev;
+ struct aca_bank_info info;
const char *error_str;
- u64 status;
+ u64 status, count;
int ret, ext_error_code;
- ret = aca_bank_info_decode(bank, &report->info);
+ ret = aca_bank_info_decode(bank, &info);
if (ret)
return ret;
@@ -1055,15 +1189,30 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc
if (error_str)
dev_info(adev->dev, "%s detected\n", error_str);
- if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
- (type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
- report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
+ count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
- return 0;
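+ /* Only specific extended error codes represent countable XGMI errors:
+ * 0 or 9 for UE banks and 6 for CE banks; other codes are logged with a
+ * zero count.
+ */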
+ switch (type) {
+ case ACA_SMU_TYPE_UE:
+ if (ext_error_code != 0 && ext_error_code != 9)
+ count = 0ULL;
+
+ bank->aca_err_type = ACA_ERROR_TYPE_UE;
+ ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
+ break;
+ case ACA_SMU_TYPE_CE:
+ count = ext_error_code == 6 ? count : 0ULL;
+ bank->aca_err_type = ACA_ERROR_TYPE_CE;
+ ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
}
static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
- .aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
+ .aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
};
static const struct aca_info xgmi_v6_4_0_aca_info = {
@@ -1088,6 +1237,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
&xgmi_v6_4_0_aca_info, NULL);
if (r)
@@ -1147,6 +1297,7 @@ static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
pcs_clear_status(adev,
xgmi3x16_pcs_err_status_reg_v6_4[i]);
@@ -1181,6 +1332,7 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
xgmi_v6_4_0_reset_ras_error_count(adev);
break;
default:
@@ -1206,7 +1358,9 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
IP_VERSION(6, 1, 0) ||
amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
- IP_VERSION(6, 4, 0)) {
+ IP_VERSION(6, 4, 0) ||
+ amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
+ IP_VERSION(6, 4, 1)) {
pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
} else {
@@ -1314,6 +1468,7 @@ static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
/* check xgmi3x16 pcs error */
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) {
data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
@@ -1372,10 +1527,10 @@ static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct a
switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
case ACA_ERROR_TYPE_UE:
- amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, NULL, 1ULL);
+ amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
break;
case ACA_ERROR_TYPE_CE:
- amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, NULL, 1ULL);
+ amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
break;
default:
break;
@@ -1410,6 +1565,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
{
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
case IP_VERSION(6, 4, 0):
+ case IP_VERSION(6, 4, 1):
xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
break;
default:
@@ -1429,7 +1585,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");
- ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW);
+ ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DISALLOW);
if (ret1 && ret1 != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to disallow XGMI power down");
@@ -1438,7 +1594,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
if (amdgpu_ras_intr_triggered())
return ret2;
- ret1 = amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT);
+ ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DEFAULT);
if (ret1 && ret1 != -EOPNOTSUPP)
dev_warn(adev->dev, "Failed to allow XGMI power down");
@@ -1483,3 +1639,148 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
return 0;
}
+
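+/* Deferred reset-on-init work: with the hive and its reset domain locked,
+ * perform a full reset across every device in the hive, then re-initialize
+ * bad page data on each device afterwards.
+ */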
+static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
+{
+ struct amdgpu_hive_info *hive =
+ container_of(work, struct amdgpu_hive_info, reset_on_init_work);
+ struct amdgpu_reset_context reset_context;
+ struct amdgpu_device *tmp_adev;
+ struct list_head device_list;
+ int r;
+
+ mutex_lock(&hive->hive_lock);
+
+ INIT_LIST_HEAD(&device_list);
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+
+ tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
+ reset_list);
+ amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+
+ reset_context.method = AMD_RESET_METHOD_ON_INIT;
+ reset_context.reset_req_dev = tmp_adev;
+ reset_context.hive = hive;
+ reset_context.reset_device_list = &device_list;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
+ amdgpu_reset_do_xgmi_reset_on_init(&reset_context);
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ r = amdgpu_ras_init_badpage_info(tmp_adev);
+ if (r && r != -EHWPOISON)
+ dev_err(tmp_adev->dev,
+ "error during bad page data initialization");
+ }
+}
+
+static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
+{
+ INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work);
+ amdgpu_reset_domain_schedule(hive->reset_domain,
+ &hive->reset_on_init_work);
+}
+
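+/* Schedule the hive-wide reset-on-init once all expected devices in the hive
+ * have registered, and wait for the work to finish before returning.
+ */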
+int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
+{
+ struct amdgpu_hive_info *hive;
+ bool reset_scheduled;
+ int num_devs;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (!hive)
+ return -EINVAL;
+
+ mutex_lock(&hive->hive_lock);
+ num_devs = atomic_read(&hive->number_devices);
+ reset_scheduled = false;
+ if (num_devs == adev->gmc.xgmi.num_physical_nodes) {
+ amdgpu_xgmi_schedule_reset_on_init(hive);
+ reset_scheduled = true;
+ }
+
+ mutex_unlock(&hive->hive_lock);
+ amdgpu_put_xgmi_hive(hive);
+
+ if (reset_scheduled)
+ flush_work(&hive->reset_on_init_work);
+
+ return 0;
+}
+
+int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
+ struct amdgpu_hive_info *hive,
+ int req_nps_mode)
+{
+ struct amdgpu_device *tmp_adev;
+ int cur_nps_mode, r;
+
+ /* This is expected to be called only during unload of driver. The
+ * request needs to be placed only once for all devices in the hive. If
+ * one of them fails, revert the request for the previously successful devices.
+ * After placing the request, make hive mode as UNKNOWN so that other
+ * devices don't request anymore.
+ */
+ mutex_lock(&hive->hive_lock);
+ if (atomic_read(&hive->requested_nps_mode) ==
+ UNKNOWN_MEMORY_PARTITION_MODE) {
+ dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
+ mutex_unlock(&hive->hive_lock);
+ return 0;
+ }
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ r = adev->gmc.gmc_funcs->request_mem_partition_mode(
+ tmp_adev, req_nps_mode);
+ if (r)
+ break;
+ }
+ if (r) {
+ /* Request back current mode if one of the requests failed */
+ cur_nps_mode =
+ adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
+ list_for_each_entry_continue_reverse(
+ tmp_adev, &hive->device_list, gmc.xgmi.head)
+ adev->gmc.gmc_funcs->request_mem_partition_mode(
+ tmp_adev, cur_nps_mode);
+ }
+ /* Set to UNKNOWN so that other devices don't request anymore */
+ atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
+ mutex_unlock(&hive->hive_lock);
+
+ return r;
+}
+
+bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
+ struct amdgpu_device *bo_adev)
+{
+ return (amdgpu_use_xgmi_p2p && adev != bo_adev &&
+ adev->gmc.xgmi.hive_id &&
+ adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
+}
+
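+/* Record the per-ASIC maximum XGMI link speed and width consumed by the
+ * bandwidth helpers above.
+ */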
+void amdgpu_xgmi_early_init(struct amdgpu_device *adev)
+{
+ if (!adev->gmc.xgmi.supported)
+ return;
+
+ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+ case IP_VERSION(9, 4, 0):
+ case IP_VERSION(9, 4, 1):
+ case IP_VERSION(9, 4, 2):
+ adev->gmc.xgmi.max_speed = XGMI_SPEED_25GT;
+ adev->gmc.xgmi.max_width = 16;
+ break;
+ case IP_VERSION(9, 4, 3):
+ case IP_VERSION(9, 4, 4):
+ case IP_VERSION(9, 5, 0):
+ adev->gmc.xgmi.max_speed = XGMI_SPEED_32GT;
+ adev->gmc.xgmi.max_width = 16;
+ break;
+ default:
+ break;
+ }
+}