diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 122 |
1 files changed, 100 insertions, 22 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 25e1f5ed7ead..ab8f970b2849 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2459,19 +2459,21 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) */ if (adev->gmc.xgmi.num_physical_nodes > 1) { if (amdgpu_xgmi_add_device(adev) == 0) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + if (!amdgpu_sriov_vf(adev)) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); - if (!hive->reset_domain || - !amdgpu_reset_get_reset_domain(hive->reset_domain)) { - r = -ENOENT; + if (!hive->reset_domain || + !amdgpu_reset_get_reset_domain(hive->reset_domain)) { + r = -ENOENT; + amdgpu_put_xgmi_hive(hive); + goto init_failed; + } + + /* Drop the early temporary reset domain we created for device */ + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = hive->reset_domain; amdgpu_put_xgmi_hive(hive); - goto init_failed; } - - /* Drop the early temporary reset domain we created for device */ - amdgpu_reset_put_reset_domain(adev->reset_domain); - adev->reset_domain = hive->reset_domain; - amdgpu_put_xgmi_hive(hive); } } @@ -3510,6 +3512,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->gmc.gart_size = 512 * 1024 * 1024; adev->accel_working = false; adev->num_rings = 0; + RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); adev->mman.buffer_funcs = NULL; adev->mman.buffer_funcs_ring = NULL; adev->vm_manager.vm_pte_funcs = NULL; @@ -3588,6 +3591,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); adev->gfx.gfx_off_req_count = 1; + adev->gfx.gfx_off_residency = 0; + adev->gfx.gfx_off_entrycount = 0; adev->pm.ac_power = power_supply_is_system_supplied() > 0; atomic_set(&adev->throttling_logging_enabled, 1); @@ -3976,8 +3981,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_gart_dummy_page_fini(adev); - if (drm_dev_is_unplugged(adev_to_drm(adev))) - amdgpu_device_unmap_mmio(adev); + amdgpu_device_unmap_mmio(adev); } @@ -3990,6 +3994,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) release_firmware(adev->firmware.gpu_info_fw); adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; + dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); amdgpu_reset_fini(adev); @@ -4542,14 +4547,15 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev) */ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) { - if (!amdgpu_device_ip_check_soft_reset(adev)) { - dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); - return false; - } if (amdgpu_gpu_recovery == 0) goto disabled; + if (!amdgpu_device_ip_check_soft_reset(adev)) { + dev_info(adev->dev,"Timeout, but no hardware hang detected.\n"); + return false; + } + if (amdgpu_sriov_vf(adev)) return true; @@ -4674,7 +4680,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!need_full_reset) need_full_reset = amdgpu_device_ip_need_full_reset(adev); - if (!need_full_reset) { + if (!need_full_reset && amdgpu_gpu_recovery) { amdgpu_device_ip_pre_soft_reset(adev); r = amdgpu_device_ip_soft_reset(adev); amdgpu_device_ip_post_soft_reset(adev); @@ -4770,6 +4776,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_device *tmp_adev = NULL; bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; + bool gpu_reset_for_dev_remove = 0; /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, @@ -4789,6 +4796,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -4833,6 +4844,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, amdgpu_ras_intr_cleared(); } + /* Since the mode1 reset affects base ip blocks, the + * phase1 ip blocks need to be resumed. Otherwise there + * will be a BIOS signature error and the psp bootloader + * can't load kdb on the next amdgpu install. + */ + if (gpu_reset_for_dev_remove) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) + amdgpu_device_ip_resume_phase1(tmp_adev); + + goto end; + } + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ @@ -5072,6 +5095,7 @@ static void amdgpu_device_recheck_guilty_jobs( /* set guilty */ drm_sched_increase_karma(s_job); + amdgpu_reset_prepare_hwcontext(adev, reset_context); retry: /* do hw reset */ if (amdgpu_sriov_vf(adev)) { @@ -5154,6 +5178,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; int tmp_vram_lost_counter; + bool gpu_reset_for_dev_remove = false; + + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); /* * Special case: RAS triggered and full reset isn't supported @@ -5181,6 +5210,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_context->job = job; reset_context->hive = hive; + /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list @@ -5188,8 +5218,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ INIT_LIST_HEAD(&device_list); if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_add_tail(&tmp_adev->reset_list, &device_list); + if (gpu_reset_for_dev_remove && adev->shutdown) + tmp_adev->shutdown = true; + } if (!list_is_first(&adev->reset_list, &device_list)) list_rotate_to_front(&adev->reset_list, &device_list); device_list_handle = &device_list; @@ -5272,6 +5305,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + if (gpu_reset_for_dev_remove) { + /* Workaroud for ASICs need to disable SMC first */ + amdgpu_device_smu_fini_early(tmp_adev); + } r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -5300,8 +5337,14 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ amdgpu_ras_resume(adev); } else { r = amdgpu_do_asic_reset(device_list_handle, reset_context); - if (r && r == -EAGAIN) + if (r && r == -EAGAIN) { + set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags); + adev->asic_reset_res = 0; goto retry; + } + + if (!r && gpu_reset_for_dev_remove) + goto recover_end; } skip_hw_reset: @@ -5375,6 +5418,7 @@ skip_sched_resume: amdgpu_device_unset_mp1_state(tmp_adev); } +recover_end: tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); @@ -5557,9 +5601,9 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); resource_size_t aper_limit = adev->gmc.aper_base + adev->gmc.aper_size - 1; - bool p2p_access = !adev->gmc.xgmi.connected_to_cpu && - !(pci_p2pdma_distance_many(adev->pdev, - &peer_adev->dev, 1, true) < 0); + bool p2p_access = + !adev->gmc.xgmi.connected_to_cpu && + !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && adev->gmc.real_vram_size == adev->gmc.visible_vram_size && @@ -5733,6 +5777,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); adev->no_hw_access = true; r = amdgpu_device_pre_asic_reset(adev, &reset_context); @@ -5942,3 +5987,36 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, (void)RREG32(data); spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); } + +/** + * amdgpu_device_switch_gang - switch to a new gang + * @adev: amdgpu_device pointer + * @gang: the gang to switch to + * + * Try to switch to a new gang. + * Returns: NULL if we switched to the new gang or a reference to the current + * gang leader. + */ +struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, + struct dma_fence *gang) +{ + struct dma_fence *old = NULL; + + do { + dma_fence_put(old); + rcu_read_lock(); + old = dma_fence_get_rcu_safe(&adev->gang_submit); + rcu_read_unlock(); + + if (old == gang) + break; + + if (!dma_fence_is_signaled(old)) + return old; + + } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, + old, gang) != old); + + dma_fence_put(old); + return NULL; +} |