summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c122
1 files changed, 100 insertions, 22 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 25e1f5ed7ead..ab8f970b2849 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2459,19 +2459,21 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
*/
if (adev->gmc.xgmi.num_physical_nodes > 1) {
if (amdgpu_xgmi_add_device(adev) == 0) {
- struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ if (!amdgpu_sriov_vf(adev)) {
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
- if (!hive->reset_domain ||
- !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
- r = -ENOENT;
+ if (!hive->reset_domain ||
+ !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+ r = -ENOENT;
+ amdgpu_put_xgmi_hive(hive);
+ goto init_failed;
+ }
+
+ /* Drop the early temporary reset domain we created for device */
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = hive->reset_domain;
amdgpu_put_xgmi_hive(hive);
- goto init_failed;
}
-
- /* Drop the early temporary reset domain we created for device */
- amdgpu_reset_put_reset_domain(adev->reset_domain);
- adev->reset_domain = hive->reset_domain;
- amdgpu_put_xgmi_hive(hive);
}
}
@@ -3510,6 +3512,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->gmc.gart_size = 512 * 1024 * 1024;
adev->accel_working = false;
adev->num_rings = 0;
+ RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
adev->mman.buffer_funcs = NULL;
adev->mman.buffer_funcs_ring = NULL;
adev->vm_manager.vm_pte_funcs = NULL;
@@ -3588,6 +3591,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
adev->gfx.gfx_off_req_count = 1;
+ adev->gfx.gfx_off_residency = 0;
+ adev->gfx.gfx_off_entrycount = 0;
adev->pm.ac_power = power_supply_is_system_supplied() > 0;
atomic_set(&adev->throttling_logging_enabled, 1);
@@ -3976,8 +3981,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_gart_dummy_page_fini(adev);
- if (drm_dev_is_unplugged(adev_to_drm(adev)))
- amdgpu_device_unmap_mmio(adev);
+ amdgpu_device_unmap_mmio(adev);
}
@@ -3990,6 +3994,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
release_firmware(adev->firmware.gpu_info_fw);
adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false;
+ dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
amdgpu_reset_fini(adev);
@@ -4542,14 +4547,15 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
*/
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
- if (!amdgpu_device_ip_check_soft_reset(adev)) {
- dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
- return false;
- }
if (amdgpu_gpu_recovery == 0)
goto disabled;
+ if (!amdgpu_device_ip_check_soft_reset(adev)) {
+ dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
+ return false;
+ }
+
if (amdgpu_sriov_vf(adev))
return true;
@@ -4674,7 +4680,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!need_full_reset)
need_full_reset = amdgpu_device_ip_need_full_reset(adev);
- if (!need_full_reset) {
+ if (!need_full_reset && amdgpu_gpu_recovery) {
amdgpu_device_ip_pre_soft_reset(adev);
r = amdgpu_device_ip_soft_reset(adev);
amdgpu_device_ip_post_soft_reset(adev);
@@ -4770,6 +4776,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
+ bool gpu_reset_for_dev_remove = 0;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -4789,6 +4796,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
+ gpu_reset_for_dev_remove =
+ test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
@@ -4833,6 +4844,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
amdgpu_ras_intr_cleared();
}
+ /* Since the mode1 reset affects base ip blocks, the
+ * phase1 ip blocks need to be resumed. Otherwise there
+ * will be a BIOS signature error and the psp bootloader
+ * can't load kdb on the next amdgpu install.
+ */
+ if (gpu_reset_for_dev_remove) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+ amdgpu_device_ip_resume_phase1(tmp_adev);
+
+ goto end;
+ }
+
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
@@ -5072,6 +5095,7 @@ static void amdgpu_device_recheck_guilty_jobs(
/* set guilty */
drm_sched_increase_karma(s_job);
+ amdgpu_reset_prepare_hwcontext(adev, reset_context);
retry:
/* do hw reset */
if (amdgpu_sriov_vf(adev)) {
@@ -5154,6 +5178,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
bool need_emergency_restart = false;
bool audio_suspended = false;
int tmp_vram_lost_counter;
+ bool gpu_reset_for_dev_remove = false;
+
+ gpu_reset_for_dev_remove =
+ test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
/*
* Special case: RAS triggered and full reset isn't supported
@@ -5181,6 +5210,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->job = job;
reset_context->hive = hive;
+
/*
* Build list of devices to reset.
* In case we are in XGMI hive mode, resort the device list
@@ -5188,8 +5218,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*/
INIT_LIST_HEAD(&device_list);
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (gpu_reset_for_dev_remove && adev->shutdown)
+ tmp_adev->shutdown = true;
+ }
if (!list_is_first(&adev->reset_list, &device_list))
list_rotate_to_front(&adev->reset_list, &device_list);
device_list_handle = &device_list;
@@ -5272,6 +5305,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ if (gpu_reset_for_dev_remove) {
+ /* Workaroud for ASICs need to disable SMC first */
+ amdgpu_device_smu_fini_early(tmp_adev);
+ }
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
/*TODO Should we stop ?*/
if (r) {
@@ -5300,8 +5337,14 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
amdgpu_ras_resume(adev);
} else {
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
- if (r && r == -EAGAIN)
+ if (r && r == -EAGAIN) {
+ set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
+ adev->asic_reset_res = 0;
goto retry;
+ }
+
+ if (!r && gpu_reset_for_dev_remove)
+ goto recover_end;
}
skip_hw_reset:
@@ -5375,6 +5418,7 @@ skip_sched_resume:
amdgpu_device_unset_mp1_state(tmp_adev);
}
+recover_end:
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
@@ -5557,9 +5601,9 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
resource_size_t aper_limit =
adev->gmc.aper_base + adev->gmc.aper_size - 1;
- bool p2p_access = !adev->gmc.xgmi.connected_to_cpu &&
- !(pci_p2pdma_distance_many(adev->pdev,
- &peer_adev->dev, 1, true) < 0);
+ bool p2p_access =
+ !adev->gmc.xgmi.connected_to_cpu &&
+ !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
@@ -5733,6 +5777,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+ set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
adev->no_hw_access = true;
r = amdgpu_device_pre_asic_reset(adev, &reset_context);
@@ -5942,3 +5987,36 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
(void)RREG32(data);
spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
+
+/**
+ * amdgpu_device_switch_gang - switch to a new gang
+ * @adev: amdgpu_device pointer
+ * @gang: the gang to switch to
+ *
+ * Try to switch to a new gang.
+ * Returns: NULL if we switched to the new gang or a reference to the current
+ * gang leader.
+ */
+struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
+ struct dma_fence *gang)
+{
+ struct dma_fence *old = NULL;
+
+ do {
+ dma_fence_put(old);
+ rcu_read_lock();
+ old = dma_fence_get_rcu_safe(&adev->gang_submit);
+ rcu_read_unlock();
+
+ if (old == gang)
+ break;
+
+ if (!dma_fence_is_signaled(old))
+ return old;
+
+ } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
+ old, gang) != old);
+
+ dma_fence_put(old);
+ return NULL;
+}