summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c557
1 files changed, 354 insertions, 203 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e1bab6a96cb6..6f93473436be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -232,7 +232,7 @@ static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
int ret = 0;
- if (!amdgpu_sriov_vf(adev))
+ if (amdgpu_nbio_is_replay_cnt_supported(adev))
ret = sysfs_create_file(&adev->dev->kobj,
&dev_attr_pcie_replay_count.attr);
@@ -241,7 +241,7 @@ static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
- if (!amdgpu_sriov_vf(adev))
+ if (amdgpu_nbio_is_replay_cnt_supported(adev))
sysfs_remove_file(&adev->dev->kobj,
&dev_attr_pcie_replay_count.attr);
}
@@ -411,19 +411,16 @@ static const struct attribute_group amdgpu_board_attrs_group = {
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
-
/**
* amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
*
- * @dev: drm_device pointer
+ * @adev: amdgpu device pointer
*
* Returns true if the device is a dGPU with ATPX power control,
* otherwise return false.
*/
-bool amdgpu_device_supports_px(struct drm_device *dev)
+bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
- struct amdgpu_device *adev = drm_to_adev(dev);
-
if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
return true;
return false;
@@ -432,15 +429,13 @@ bool amdgpu_device_supports_px(struct drm_device *dev)
/**
* amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
*
- * @dev: drm_device pointer
+ * @adev: amdgpu device pointer
*
* Returns true if the device is a dGPU with ACPI power control,
* otherwise return false.
*/
-bool amdgpu_device_supports_boco(struct drm_device *dev)
+bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
- struct amdgpu_device *adev = drm_to_adev(dev);
-
if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
return false;
@@ -453,29 +448,24 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
/**
* amdgpu_device_supports_baco - Does the device support BACO
*
- * @dev: drm_device pointer
+ * @adev: amdgpu device pointer
*
* Return:
* 1 if the device supports BACO;
* 3 if the device supports MACO (only works if BACO is supported)
* otherwise return 0.
*/
-int amdgpu_device_supports_baco(struct drm_device *dev)
+int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
- struct amdgpu_device *adev = drm_to_adev(dev);
-
return amdgpu_asic_supports_baco(adev);
}
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
- struct drm_device *dev;
int bamaco_support;
- dev = adev_to_drm(adev);
-
adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
- bamaco_support = amdgpu_device_supports_baco(dev);
+ bamaco_support = amdgpu_device_supports_baco(adev);
switch (amdgpu_runtime_pm) {
case 2:
@@ -495,10 +485,12 @@ void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
break;
case -1:
case -2:
- if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
+ if (amdgpu_device_supports_px(adev)) {
+ /* enable PX as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
dev_info(adev->dev, "Using ATPX for runtime pm\n");
- } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
+ } else if (amdgpu_device_supports_boco(adev)) {
+ /* enable boco as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else {
@@ -547,14 +539,14 @@ no_runtime_pm:
* amdgpu_device_supports_smart_shift - Is the device dGPU with
* smart shift support
*
- * @dev: drm_device pointer
+ * @adev: amdgpu device pointer
*
* Returns true if the device is a dGPU with Smart Shift support,
* otherwise returns false.
*/
-bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
+bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
- return (amdgpu_device_supports_boco(dev) &&
+ return (amdgpu_device_supports_boco(adev) &&
amdgpu_acpi_is_power_shift_control_supported());
}
@@ -1288,14 +1280,14 @@ u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
*/
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
- DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
+ dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
BUG();
return 0;
}
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
- DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
+ dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
BUG();
return 0;
}
@@ -1312,15 +1304,17 @@ static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg
*/
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
- DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
- reg, v);
+ dev_err(adev->dev,
+ "Invalid callback to write register 0x%04X with 0x%08X\n", reg,
+ v);
BUG();
}
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
- DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
- reg, v);
+ dev_err(adev->dev,
+ "Invalid callback to write register 0x%llX with 0x%08X\n", reg,
+ v);
BUG();
}
@@ -1336,14 +1330,15 @@ static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, ui
*/
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
- DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
+ dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
+ reg);
BUG();
return 0;
}
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
- DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
+ dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
BUG();
return 0;
}
@@ -1360,15 +1355,17 @@ static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t r
*/
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
- DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
- reg, v);
+ dev_err(adev->dev,
+ "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
+ reg, v);
BUG();
}
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
- DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
- reg, v);
+ dev_err(adev->dev,
+ "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
+ reg, v);
BUG();
}
@@ -1386,8 +1383,9 @@ static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg,
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
uint32_t block, uint32_t reg)
{
- DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
- reg, block);
+ dev_err(adev->dev,
+ "Invalid callback to read register 0x%04X in block 0x%04X\n",
+ reg, block);
BUG();
return 0;
}
@@ -1407,8 +1405,9 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
uint32_t block,
uint32_t reg, uint32_t v)
{
- DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
- reg, block, v);
+ dev_err(adev->dev,
+ "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
+ reg, block, v);
BUG();
}
@@ -1694,7 +1693,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
- DRM_WARN("System can't access extended configuration space, please check!!\n");
+ dev_warn(
+ adev->dev,
+ "System can't access extended configuration space, please check!!\n");
/* skip if the bios has already enabled large BAR */
if (adev->gmc.real_vram_size &&
@@ -1734,9 +1735,10 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
r = pci_resize_resource(adev->pdev, 0, rbar_size);
if (r == -ENOSPC)
- DRM_INFO("Not enough PCI address space for a large BAR.");
+ dev_info(adev->dev,
+ "Not enough PCI address space for a large BAR.");
else if (r && r != -ENOTSUPP)
- DRM_ERROR("Problem resizing BAR0 (%d).", r);
+ dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);
pci_assign_unassigned_bus_resources(adev->pdev->bus);
@@ -1838,8 +1840,8 @@ bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
case 0:
return false;
default:
- DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
- amdgpu_seamless);
+ dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
+ amdgpu_seamless);
return false;
}
@@ -2015,7 +2017,7 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
return;
if (!is_os_64) {
- DRM_WARN("Not 64-bit OS, feature not supported\n");
+ dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n");
goto def_value;
}
si_meminfo(&si);
@@ -2030,7 +2032,7 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
if (total_memory < dram_size_seven_GB)
goto def_value1;
} else {
- DRM_WARN("Smu memory pool size not supported\n");
+ dev_warn(adev->dev, "Smu memory pool size not supported\n");
goto def_value;
}
adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
@@ -2038,7 +2040,7 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
return;
def_value1:
- DRM_WARN("No enough system memory\n");
+ dev_warn(adev->dev, "No enough system memory\n");
def_value:
adev->pm.smu_prv_buffer_size = 0;
}
@@ -2190,7 +2192,8 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
struct drm_device *dev = pci_get_drvdata(pdev);
int r;
- if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
+ if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
+ state == VGA_SWITCHEROO_OFF)
return;
if (state == VGA_SWITCHEROO_ON) {
@@ -2202,12 +2205,13 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
amdgpu_device_load_pci_state(pdev);
r = pci_enable_device(pdev);
if (r)
- DRM_WARN("pci_enable_device failed (%d)\n", r);
+ dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
+ r);
amdgpu_device_resume(dev, true);
dev->switch_power_state = DRM_SWITCH_POWER_ON;
} else {
- pr_info("switched off\n");
+ dev_info(&pdev->dev, "switched off\n");
dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
amdgpu_device_prepare(dev);
amdgpu_device_suspend(dev, true);
@@ -2274,8 +2278,9 @@ int amdgpu_device_ip_set_clockgating_state(void *dev,
r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
&adev->ip_blocks[i], state);
if (r)
- DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "set_clockgating_state of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
}
return r;
}
@@ -2308,8 +2313,9 @@ int amdgpu_device_ip_set_powergating_state(void *dev,
r = adev->ip_blocks[i].version->funcs->set_powergating_state(
&adev->ip_blocks[i], state);
if (r)
- DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "set_powergating_state of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
}
return r;
}
@@ -2525,9 +2531,11 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
}
}
- DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
- amdgpu_virtual_display, pci_address_name,
- adev->enable_virtual_display, adev->mode_info.num_crtc);
+ dev_info(
+ adev->dev,
+ "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
+ amdgpu_virtual_display, pci_address_name,
+ adev->enable_virtual_display, adev->mode_info.num_crtc);
kfree(pciaddstr);
}
@@ -2538,8 +2546,9 @@ void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
adev->mode_info.num_crtc = 1;
adev->enable_virtual_display = true;
- DRM_INFO("virtual_display:%d, num_crtc:%d\n",
- adev->enable_virtual_display, adev->mode_info.num_crtc);
+ dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
+ adev->enable_virtual_display,
+ adev->mode_info.num_crtc);
}
}
@@ -2773,21 +2782,29 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
+ adev->virt.is_xgmi_node_migrate_enabled = false;
+ if (amdgpu_sriov_vf(adev)) {
+ adev->virt.is_xgmi_node_migrate_enabled =
+ amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
+ }
+
total = true;
for (i = 0; i < adev->num_ip_blocks; i++) {
ip_block = &adev->ip_blocks[i];
if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
- DRM_WARN("disabled ip block: %d <%s>\n",
- i, adev->ip_blocks[i].version->funcs->name);
+ dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i,
+ adev->ip_blocks[i].version->funcs->name);
adev->ip_blocks[i].status.valid = false;
} else if (ip_block->version->funcs->early_init) {
r = ip_block->version->funcs->early_init(ip_block);
if (r == -ENOENT) {
adev->ip_blocks[i].status.valid = false;
} else if (r) {
- DRM_ERROR("early_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "early_init of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
total = false;
} else {
adev->ip_blocks[i].status.valid = true;
@@ -2868,8 +2885,10 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("hw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "hw_init of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
return r;
}
adev->ip_blocks[i].status.hw = true;
@@ -2893,8 +2912,9 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
continue;
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("hw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "hw_init of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
return r;
}
adev->ip_blocks[i].status.hw = true;
@@ -2932,8 +2952,11 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
} else {
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("hw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "hw_init of IP block <%s> failed %d\n",
+ adev->ip_blocks[i]
+ .version->funcs->name,
+ r);
return r;
}
adev->ip_blocks[i].status.hw = true;
@@ -2988,25 +3011,29 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
r = drm_sched_init(&ring->sched, &args);
if (r) {
- DRM_ERROR("Failed to create scheduler on ring %s.\n",
- ring->name);
+ dev_err(adev->dev,
+ "Failed to create scheduler on ring %s.\n",
+ ring->name);
return r;
}
r = amdgpu_uvd_entity_init(adev, ring);
if (r) {
- DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
- ring->name);
+ dev_err(adev->dev,
+ "Failed to create UVD scheduling entity on ring %s.\n",
+ ring->name);
return r;
}
r = amdgpu_vce_entity_init(adev, ring);
if (r) {
- DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
- ring->name);
+ dev_err(adev->dev,
+ "Failed to create VCE scheduling entity on ring %s.\n",
+ ring->name);
return r;
}
}
- amdgpu_xcp_update_partition_sched_list(adev);
+ if (adev->xcp_mgr)
+ amdgpu_xcp_update_partition_sched_list(adev);
return 0;
}
@@ -3038,8 +3065,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->funcs->sw_init) {
r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("sw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "sw_init of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
goto init_failed;
}
}
@@ -3053,7 +3082,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
/* need to do common hw init early so everything is set up for gmc */
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("hw_init %d failed %d\n", i, r);
+ dev_err(adev->dev, "hw_init %d failed %d\n", i,
+ r);
goto init_failed;
}
adev->ip_blocks[i].status.hw = true;
@@ -3065,17 +3095,21 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
r = amdgpu_device_mem_scratch_init(adev);
if (r) {
- DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
+ dev_err(adev->dev,
+ "amdgpu_mem_scratch_init failed %d\n",
+ r);
goto init_failed;
}
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("hw_init %d failed %d\n", i, r);
+ dev_err(adev->dev, "hw_init %d failed %d\n", i,
+ r);
goto init_failed;
}
r = amdgpu_device_wb_init(adev);
if (r) {
- DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
+ dev_err(adev->dev,
+ "amdgpu_device_wb_init failed %d\n", r);
goto init_failed;
}
adev->ip_blocks[i].status.hw = true;
@@ -3087,14 +3121,16 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
AMDGPU_GEM_DOMAIN_GTT,
AMDGPU_CSA_SIZE);
if (r) {
- DRM_ERROR("allocate CSA failed %d\n", r);
+ dev_err(adev->dev,
+ "allocate CSA failed %d\n", r);
goto init_failed;
}
}
r = amdgpu_seq64_init(adev);
if (r) {
- DRM_ERROR("allocate seq64 failed %d\n", r);
+ dev_err(adev->dev, "allocate seq64 failed %d\n",
+ r);
goto init_failed;
}
}
@@ -3284,8 +3320,10 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
state);
if (r) {
- DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "set_clockgating_state(gate) of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
return r;
}
}
@@ -3321,8 +3359,10 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
state);
if (r) {
- DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "set_powergating_state(gate) of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
return r;
}
}
@@ -3388,8 +3428,10 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->funcs->late_init) {
r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
if (r) {
- DRM_ERROR("late_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_err(adev->dev,
+ "late_init of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
return r;
}
}
@@ -3398,7 +3440,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
r = amdgpu_ras_late_init(adev);
if (r) {
- DRM_ERROR("amdgpu_ras_late_init failed %d", r);
+ dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
return r;
}
@@ -3412,7 +3454,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
r = amdgpu_device_enable_mgpu_fan_boost();
if (r)
- DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
+ dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);
/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
if (amdgpu_passthrough(adev) &&
@@ -3445,7 +3487,9 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
AMDGPU_XGMI_PSTATE_MIN);
if (r) {
- DRM_ERROR("pstate setting failed (%d).\n", r);
+ dev_err(adev->dev,
+ "pstate setting failed (%d).\n",
+ r);
break;
}
}
@@ -3459,17 +3503,19 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
{
+ struct amdgpu_device *adev = ip_block->adev;
int r;
if (!ip_block->version->funcs->hw_fini) {
- DRM_ERROR("hw_fini of IP block <%s> not defined\n",
- ip_block->version->funcs->name);
+ dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n",
+ ip_block->version->funcs->name);
} else {
r = ip_block->version->funcs->hw_fini(ip_block);
/* XXX handle errors */
if (r) {
- DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
- ip_block->version->funcs->name, r);
+ dev_dbg(adev->dev,
+ "hw_fini of IP block <%s> failed %d\n",
+ ip_block->version->funcs->name, r);
}
}
@@ -3510,15 +3556,16 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
if (r) {
- DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_dbg(adev->dev,
+ "early_fini of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
}
}
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
- amdgpu_amdkfd_suspend(adev, false);
+ amdgpu_amdkfd_suspend(adev, true);
amdgpu_userq_suspend(adev);
/* Workaround for ASICs need to disable SMC first */
@@ -3533,7 +3580,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev)) {
if (amdgpu_virt_release_full_gpu(adev, false))
- DRM_ERROR("failed to release exclusive mode on fini\n");
+ dev_err(adev->dev,
+ "failed to release exclusive mode on fini\n");
}
return 0;
@@ -3581,8 +3629,10 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
/* XXX handle errors */
if (r) {
- DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ dev_dbg(adev->dev,
+ "sw_fini of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name,
+ r);
}
}
adev->ip_blocks[i].status.sw = false;
@@ -3615,7 +3665,7 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
r = amdgpu_ib_ring_tests(adev);
if (r)
- DRM_ERROR("ib ring test failed (%d).\n", r);
+ dev_err(adev->dev, "ib ring test failed (%d).\n", r);
}
static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
@@ -3756,8 +3806,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
if (r) {
- DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
- adev->mp1_state, r);
+ dev_err(adev->dev,
+ "SMC failed to set mp1 state %d, %d\n",
+ adev->mp1_state, r);
return r;
}
}
@@ -4041,12 +4092,14 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
/**
* amdgpu_device_asic_has_dc_support - determine if DC supports the asic
*
+ * @pdev : pci device context
* @asic_type: AMD asic type
*
* Check if there is DC (new modesetting infrastructre) support for an asic.
* returns true if DC has support, false if not.
*/
-bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
+bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
+ enum amd_asic_type asic_type)
{
switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
@@ -4089,7 +4142,9 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
#else
default:
if (amdgpu_dc > 0)
- DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
+ dev_info_once(
+ &pdev->dev,
+ "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
return false;
#endif
}
@@ -4108,7 +4163,7 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
(adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
return false;
- return amdgpu_device_asic_has_dc_support(adev->asic_type);
+ return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type);
}
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
@@ -4130,13 +4185,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
task_barrier_enter(&hive->tb);
- adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev);
if (adev->asic_reset_res)
goto fail;
task_barrier_exit(&hive->tb);
- adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev);
if (adev->asic_reset_res)
goto fail;
@@ -4150,7 +4205,8 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
fail:
if (adev->asic_reset_res)
- DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
+ dev_warn(adev->dev,
+ "ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev_to_drm(adev)->unique);
amdgpu_put_xgmi_hive(hive);
}
@@ -4164,18 +4220,10 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
int ret = 0;
/*
- * By default timeout for non compute jobs is 10000
- * and 60000 for compute jobs.
- * In SR-IOV or passthrough mode, timeout for compute
- * jobs are 60000 by default.
+ * By default timeout for jobs is 10 sec
*/
- adev->gfx_timeout = msecs_to_jiffies(10000);
+ adev->compute_timeout = adev->gfx_timeout = msecs_to_jiffies(10000);
adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
- if (amdgpu_sriov_vf(adev))
- adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
- msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
- else
- adev->compute_timeout = msecs_to_jiffies(60000);
if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
while ((timeout_setting = strsep(&input, ",")) &&
@@ -4274,7 +4322,7 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
adev->gfx.mcbp = true;
if (adev->gfx.mcbp)
- DRM_INFO("MCBP is enabled\n");
+ dev_info(adev->dev, "MCBP is enabled\n");
}
/**
@@ -4290,7 +4338,6 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
int amdgpu_device_init(struct amdgpu_device *adev,
uint32_t flags)
{
- struct drm_device *ddev = adev_to_drm(adev);
struct pci_dev *pdev = adev->pdev;
int r, i;
bool px = false;
@@ -4342,9 +4389,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
- DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
- amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
- pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
+ dev_info(
+ adev->dev,
+ "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
+ amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
+ pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
/* mutex initialization are all done here so we
* can recall function without having locking issues
@@ -4461,8 +4510,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (!adev->rmmio)
return -ENOMEM;
- DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
- DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
+ dev_info(adev->dev, "register mmio base: 0x%08X\n",
+ (uint32_t)adev->rmmio_base);
+ dev_info(adev->dev, "register mmio size: %u\n",
+ (unsigned int)adev->rmmio_size);
/*
* Reset domain needs to be present early, before XGMI hive discovered
@@ -4599,7 +4650,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
r = -EINVAL;
goto failed;
}
- DRM_INFO("GPU posting now...\n");
+ dev_info(adev->dev, "GPU posting now...\n");
r = amdgpu_device_asic_init(adev);
if (r) {
dev_err(adev->dev, "gpu post error!\n");
@@ -4709,12 +4760,12 @@ fence_driver_init:
r = amdgpu_pm_sysfs_init(adev);
if (r)
- DRM_ERROR("registering pm sysfs failed (%d).\n", r);
+ dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
r = amdgpu_ucode_sysfs_init(adev);
if (r) {
adev->ucode_sysfs_en = false;
- DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
+ dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
} else
adev->ucode_sysfs_en = true;
@@ -4747,7 +4798,7 @@ fence_driver_init:
if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
- px = amdgpu_device_supports_px(ddev);
+ px = amdgpu_device_supports_px(adev);
if (px || (!dev_is_removable(&adev->pdev->dev) &&
apple_gmux_detect(NULL, NULL)))
@@ -4913,7 +4964,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
kfree(adev->xcp_mgr);
adev->xcp_mgr = NULL;
- px = amdgpu_device_supports_px(adev_to_drm(adev));
+ px = amdgpu_device_supports_px(adev);
if (px || (!dev_is_removable(&adev->pdev->dev) &&
apple_gmux_detect(NULL, NULL)))
@@ -4962,8 +5013,16 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
return 0;
ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
- if (ret)
- DRM_WARN("evicting device resources failed\n");
+ if (ret) {
+ dev_warn(adev->dev, "evicting device resources failed\n");
+ return ret;
+ }
+
+ if (adev->in_s4) {
+ ret = ttm_device_prepare_hibernation(&adev->mman.bdev);
+ if (ret)
+ dev_err(adev->dev, "prepare hibernation failed, %d\n", ret);
+ }
return ret;
}
@@ -5035,6 +5094,28 @@ int amdgpu_device_prepare(struct drm_device *dev)
}
/**
+ * amdgpu_device_complete - complete power state transition
+ *
+ * @dev: drm dev pointer
+ *
+ * Undo the changes from amdgpu_device_prepare. This will be
+ * called on all resume transitions, including those that failed.
+ */
+void amdgpu_device_complete(struct drm_device *dev)
+{
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ int i;
+
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (!adev->ip_blocks[i].status.valid)
+ continue;
+ if (!adev->ip_blocks[i].version->funcs->complete)
+ continue;
+ adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]);
+ }
+}
+
+/**
* amdgpu_device_suspend - initiate device suspend
*
* @dev: drm dev pointer
@@ -5055,14 +5136,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
adev->in_suspend = true;
if (amdgpu_sriov_vf(adev)) {
+ if (!adev->in_s0ix && !adev->in_runpm)
+ amdgpu_amdkfd_suspend_process(adev);
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_virt_request_full_gpu(adev, false);
if (r)
return r;
}
- if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
- DRM_WARN("smart shift update failed\n");
+ if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
+ dev_warn(adev->dev, "smart shift update failed\n");
if (notify_clients)
drm_client_dev_suspend(adev_to_drm(adev), false);
@@ -5074,7 +5157,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
amdgpu_device_ip_suspend_phase1(adev);
if (!adev->in_s0ix) {
- amdgpu_amdkfd_suspend(adev, adev->in_runpm);
+ amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
amdgpu_userq_suspend(adev);
}
@@ -5098,6 +5181,32 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
return 0;
}
+static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
+{
+ int r;
+ unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
+
+ /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
+ * may not work. The access could be blocked by nBIF protection as VF isn't in
+ * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
+ * so that QEMU reprograms MSIX table.
+ */
+ amdgpu_restore_msix(adev);
+
+ r = adev->gfxhub.funcs->get_xgmi_info(adev);
+ if (r)
+ return r;
+
+ dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
+ prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
+
+ adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
+ adev->vm_manager.vram_base_offset +=
+ adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
+
+ return 0;
+}
+
/**
* amdgpu_device_resume - initiate device resume
*
@@ -5119,6 +5228,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
return r;
}
+ if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
+ r = amdgpu_virt_resume(adev);
+ if (r)
+ goto exit;
+ }
+
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
return 0;
@@ -5140,7 +5255,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
}
if (!adev->in_s0ix) {
- r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
+ r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
if (r)
goto exit;
@@ -5159,6 +5274,9 @@ exit:
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_init_data_exchange(adev);
amdgpu_virt_release_full_gpu(adev, true);
+
+ if (!adev->in_s0ix && !r && !adev->in_runpm)
+ r = amdgpu_amdkfd_resume_process(adev);
}
if (r)
@@ -5193,10 +5311,12 @@ exit:
dev->dev->power.disable_depth--;
#endif
}
+
+ amdgpu_vram_mgr_clear_reset_blocks(adev);
adev->in_suspend = false;
- if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
- DRM_WARN("smart shift update failed\n");
+ if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
+ dev_warn(adev->dev, "smart shift update failed\n");
return 0;
}
@@ -5727,7 +5847,9 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
if (vram_lost) {
- DRM_INFO("VRAM is lost due to GPU reset!\n");
+ dev_info(
+ tmp_adev->dev,
+ "VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
}
@@ -6006,29 +6128,20 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
struct amdgpu_device *tmp_adev;
int ret = 0;
- u32 status;
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
- pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
- if (PCI_POSSIBLE_ERROR(status)) {
- dev_err(tmp_adev->dev, "device lost from bus!");
- ret = -ENODEV;
- }
+ ret |= amdgpu_device_bus_status_check(tmp_adev);
}
return ret;
}
-static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
- struct amdgpu_job *job,
- struct amdgpu_reset_context *reset_context,
- struct list_head *device_list,
- struct amdgpu_hive_info *hive,
- bool need_emergency_restart)
+static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+ struct list_head *device_list,
+ struct amdgpu_hive_info *hive)
{
- struct list_head *device_list_handle = NULL;
struct amdgpu_device *tmp_adev = NULL;
- int i, r = 0;
+ int r;
/*
* Build list of devices to reset.
@@ -6045,26 +6158,55 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
}
if (!list_is_first(&adev->reset_list, device_list))
list_rotate_to_front(&adev->reset_list, device_list);
- device_list_handle = device_list;
} else {
list_add_tail(&adev->reset_list, device_list);
- device_list_handle = device_list;
}
if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
- r = amdgpu_device_health_check(device_list_handle);
+ r = amdgpu_device_health_check(device_list);
if (r)
return r;
}
- /* We need to lock reset domain only once both for XGMI and single device */
- tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
- reset_list);
+ return 0;
+}
+
+static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
+ struct list_head *device_list)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ if (list_empty(device_list))
+ return;
+ tmp_adev =
+ list_first_entry(device_list, struct amdgpu_device, reset_list);
amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+}
- /* block all schedulers and reset given job's ring */
- list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
+ struct list_head *device_list)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ if (list_empty(device_list))
+ return;
+ tmp_adev =
+ list_first_entry(device_list, struct amdgpu_device, reset_list);
+ amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+}
+static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
+ struct amdgpu_job *job,
+ struct amdgpu_reset_context *reset_context,
+ struct list_head *device_list,
+ struct amdgpu_hive_info *hive,
+ bool need_emergency_restart)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+ int i;
+
+ /* block all schedulers and reset given job's ring */
+ list_for_each_entry(tmp_adev, device_list, reset_list) {
amdgpu_device_set_mp1_state(tmp_adev);
/*
@@ -6113,8 +6255,6 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
}
atomic_inc(&tmp_adev->gpu_reset_counter);
}
-
- return r;
}
static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
@@ -6219,8 +6359,10 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
} else {
dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
- if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
- DRM_WARN("smart shift update failed\n");
+ if (amdgpu_acpi_smart_shift_update(tmp_adev,
+ AMDGPU_SS_DEV_D0))
+ dev_warn(tmp_adev->dev,
+ "smart shift update failed\n");
}
}
@@ -6252,11 +6394,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
amdgpu_ras_set_error_query_ready(tmp_adev, true);
}
-
- tmp_adev = list_first_entry(device_list, struct amdgpu_device,
- reset_list);
- amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
-
}
@@ -6306,7 +6443,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*/
if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
amdgpu_ras_get_context(adev)->reboot) {
- DRM_WARN("Emergency reboot.");
+ dev_warn(adev->dev, "Emergency reboot.");
ksys_sync_helper();
emergency_restart();
@@ -6324,11 +6461,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->hive = hive;
INIT_LIST_HEAD(&device_list);
- r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
- hive, need_emergency_restart);
- if (r)
+ if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
goto end_reset;
+ /* We need to lock reset domain only once both for XGMI and single device */
+ amdgpu_device_recovery_get_reset_lock(adev, &device_list);
+
+ amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
+ hive, need_emergency_restart);
if (need_emergency_restart)
goto skip_sched_resume;
/*
@@ -6337,7 +6477,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*
* job->base holds a reference to parent fence
*/
- if (job && dma_fence_is_signaled(&job->hw_fence)) {
+ if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
job_signaled = true;
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset;
@@ -6345,13 +6485,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
if (r)
- goto end_reset;
+ goto reset_unlock;
skip_hw_reset:
r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
if (r)
- goto end_reset;
+ goto reset_unlock;
skip_sched_resume:
amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
+reset_unlock:
+ amdgpu_device_recovery_put_reset_lock(adev, &device_list);
end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
@@ -6363,8 +6505,17 @@ end_reset:
atomic_set(&adev->reset_domain->reset_res, r);
- if (!r)
- drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+ if (!r) {
+ struct amdgpu_task_info *ti = NULL;
+
+ if (job)
+ ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
+
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
+ ti ? &ti->task : NULL);
+
+ amdgpu_vm_put_task_info(ti);
+ }
return r;
}
@@ -6683,12 +6834,11 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
#endif
}
-int amdgpu_device_baco_enter(struct drm_device *dev)
+int amdgpu_device_baco_enter(struct amdgpu_device *adev)
{
- struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- if (!amdgpu_device_supports_baco(dev))
+ if (!amdgpu_device_supports_baco(adev))
return -ENOTSUPP;
if (ras && adev->ras_enabled &&
@@ -6698,13 +6848,12 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
return amdgpu_dpm_baco_enter(adev);
}
-int amdgpu_device_baco_exit(struct drm_device *dev)
+int amdgpu_device_baco_exit(struct amdgpu_device *adev)
{
- struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int ret = 0;
- if (!amdgpu_device_supports_baco(dev))
+ if (!amdgpu_device_supports_baco(adev))
return -ENOTSUPP;
ret = amdgpu_dpm_baco_exit(adev);
@@ -6738,7 +6887,6 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
struct amdgpu_reset_context reset_context;
struct list_head device_list;
- int r = 0;
dev_info(adev->dev, "PCI error: detected callback!!\n");
@@ -6763,14 +6911,14 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
memset(&reset_context, 0, sizeof(reset_context));
INIT_LIST_HEAD(&device_list);
- r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
- hive, false);
+ amdgpu_device_recovery_prepare(adev, &device_list, hive);
+ amdgpu_device_recovery_get_reset_lock(adev, &device_list);
+ amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
+ hive, false);
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
}
- if (r)
- return PCI_ERS_RESULT_DISCONNECT;
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
@@ -6880,8 +7028,8 @@ out:
if (hive) {
list_for_each_entry(tmp_adev, &device_list, reset_list)
amdgpu_device_unset_mp1_state(tmp_adev);
- amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
+ amdgpu_device_recovery_put_reset_lock(adev, &device_list);
}
if (hive) {
@@ -6927,6 +7075,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
amdgpu_device_sched_resume(&device_list, NULL, NULL);
amdgpu_device_gpu_resume(adev, &device_list, false);
+ amdgpu_device_recovery_put_reset_lock(adev, &device_list);
adev->pcie_reset_ctx.occurs_dpc = false;
if (hive) {
@@ -6951,11 +7100,11 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
adev->pci_state = pci_store_saved_state(pdev);
if (!adev->pci_state) {
- DRM_ERROR("Failed to store PCI saved state");
+ dev_err(adev->dev, "Failed to store PCI saved state");
return false;
}
} else {
- DRM_WARN("Failed to save PCI state, err:%d\n", r);
+ dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
return false;
}
@@ -6976,7 +7125,7 @@ bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
if (!r) {
pci_restore_state(pdev);
} else {
- DRM_WARN("Failed to load PCI state, err:%d\n", r);
+ dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
return false;
}
@@ -7222,7 +7371,7 @@ struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
if (r)
- DRM_WARN("OOM tracking isolation\n");
+ dev_warn(adev->dev, "OOM tracking isolation\n");
out_grab_ref:
dma_fence_get(dep);
@@ -7290,9 +7439,11 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
tmp_ = RREG32(reg_addr);
loop--;
if (!loop) {
- DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
- inst, reg_name, (uint32_t)expected_value,
- (uint32_t)(tmp_ & (mask)));
+ dev_warn(
+ adev->dev,
+ "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
+ inst, reg_name, (uint32_t)expected_value,
+ (uint32_t)(tmp_ & (mask)));
ret = -ETIMEDOUT;
break;
}